From 1b55bc4db9e2815c421257d496af04bdb5bda8a5 Mon Sep 17 00:00:00 2001 From: lidezhu Date: Tue, 15 Nov 2022 17:47:25 +0800 Subject: [PATCH 001/115] alpha version --- components/tikv_util/src/config.rs | 8 - engine_store_ffi/src/interfaces.rs | 97 +++ engine_store_ffi/src/lib.rs | 192 ++++++ engine_store_ffi/src/ps_engine.rs | 601 ++++++++++++++++++ proxy_server/src/run.rs | 33 +- .../ffi/src/RaftStoreProxyFFI/ProxyFFI.h | 24 + 6 files changed, 944 insertions(+), 11 deletions(-) create mode 100644 engine_store_ffi/src/ps_engine.rs diff --git a/components/tikv_util/src/config.rs b/components/tikv_util/src/config.rs index e11a4799bc0..828bf1cb3ba 100644 --- a/components/tikv_util/src/config.rs +++ b/components/tikv_util/src/config.rs @@ -1451,14 +1451,6 @@ impl RaftDataStateMachine { self.target.display() )); } - let exists = Self::data_exists(&self.source) || Self::data_exists(&self.target); - if exists != should_exist { - if should_exist { - return Err("Cannot find raft data set.".to_owned()); - } else { - return Err("Found raft data set when it should not exist.".to_owned()); - } - } Ok(()) } diff --git a/engine_store_ffi/src/interfaces.rs b/engine_store_ffi/src/interfaces.rs index c7633f6010c..5a39fb1f155 100644 --- a/engine_store_ffi/src/interfaces.rs +++ b/engine_store_ffi/src/interfaces.rs @@ -136,6 +136,24 @@ pub mod root { pub inner: root::DB::RawCppPtr, pub view: root::DB::BaseBuffView, } + #[repr(C)] + #[derive(Debug)] + pub struct PageWithView { + pub inner: root::DB::RawCppPtr, + pub view: root::DB::BaseBuffView, + } + #[repr(C)] + #[derive(Debug)] + pub struct PageWithViewVec { + pub inner: * mut PageWithView, + pub len: u64, + } + #[repr(C)] + #[derive(Debug)] + pub struct CppStrWithViewVec { + pub inner: * const CppStrWithView, + pub len: u64, + } #[repr(u8)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub enum HttpRequestStatus { @@ -368,6 +386,85 @@ pub mod root { arg5: u64, ) -> u8, >, + pub fn_create_write_batch: ::std::option::Option< + 
unsafe extern "C" fn( + ) -> root::DB::RawCppPtr, + >, + pub fn_write_batch_put_page: ::std::option::Option< + unsafe extern "C" fn( + arg1: root::DB::RawVoidPtr, + arg2: root::DB::BaseBuffView, + arg3: root::DB::BaseBuffView, + ), + >, + pub fn_write_batch_del_page: ::std::option::Option< + unsafe extern "C" fn( + arg1: root::DB::RawVoidPtr, + arg2: root::DB::BaseBuffView, + ), + >, + pub fn_write_batch_size: ::std::option::Option< + unsafe extern "C" fn( + arg1: root::DB::RawVoidPtr, + ) -> u64, + >, + pub fn_write_batch_is_empty: ::std::option::Option< + unsafe extern "C" fn( + arg1: root::DB::RawVoidPtr, + ) -> u8, + >, + pub fn_write_batch_merge: ::std::option::Option< + unsafe extern "C" fn( + arg1: root::DB::RawVoidPtr, + arg2: root::DB::RawVoidPtr, + ), + >, + pub fn_write_batch_clear: ::std::option::Option< + unsafe extern "C" fn( + arg1: root::DB::RawVoidPtr, + ), + >, + pub fn_consume_write_batch: ::std::option::Option< + unsafe extern "C" fn( + arg1: *const root::DB::EngineStoreServerWrap, + arg2: root::DB::RawVoidPtr, + ), + >, + pub fn_handle_read_page: ::std::option::Option< + unsafe extern "C" fn( + arg1: *const root::DB::EngineStoreServerWrap, + arg2: root::DB::BaseBuffView, + ) -> root::DB::PageWithView, + >, + pub fn_handle_scan_page: ::std::option::Option< + unsafe extern "C" fn( + arg1: *const root::DB::EngineStoreServerWrap, + arg2: root::DB::BaseBuffView, + arg3: root::DB::BaseBuffView, + ) -> root::DB::PageWithViewVec, + >, + pub fn_gc_page_with_view_vec: ::std::option::Option< + unsafe extern "C" fn( + arg1: * mut PageWithView, + arg2: u64, + ), + >, + pub fn_handle_purge_pagestorage: ::std::option::Option< + unsafe extern "C" fn( + arg1: *const root::DB::EngineStoreServerWrap, + ), + >, + pub fn_handle_seek_ps_key: ::std::option::Option< + unsafe extern "C" fn( + arg1: *const root::DB::EngineStoreServerWrap, + arg2: root::DB::BaseBuffView, + ) -> root::DB::CppStrWithView, + >, + pub fn_is_ps_empty: ::std::option::Option< + unsafe extern 
"C" fn( + arg1: *const root::DB::EngineStoreServerWrap, + ) -> u8, + >, pub fn_atomic_update_proxy: ::std::option::Option< unsafe extern "C" fn( arg1: *mut root::DB::EngineStoreServerWrap, diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index 3456d1d6c86..cad6017ffed 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -8,6 +8,7 @@ mod lock_cf_reader; pub mod observer; mod read_index_helper; mod utils; +pub mod ps_engine; use std::{ cell::RefCell, @@ -35,6 +36,7 @@ pub use self::interfaces::root::DB::{ EngineStoreServerStatus, FileEncryptionRes, FsStats, HttpRequestRes, HttpRequestStatus, KVGetStatus, RaftCmdHeader, RaftProxyStatus, RaftStoreProxyFFIHelper, RawCppPtr, RawCppStringPtr, RawVoidPtr, SSTReaderPtr, StoreStats, WriteCmdType, WriteCmdsView, + CppStrWithView, CppStrWithViewVec, PageWithView, PageWithViewVec, }; use self::interfaces::root::DB::{ ConstRawVoidPtr, FileEncryptionInfoRaw, RaftStoreProxyPtr, RawCppPtrType, RawRustPtr, @@ -856,6 +858,17 @@ impl Drop for RawCppPtr { } } +impl Drop for PageWithViewVec { + fn drop(&mut self) { + if self.inner != std::ptr::null_mut() { + let helper = get_engine_store_server_helper(); + helper.gc_page_with_view_vec(self.inner, self.len); + self.inner = std::ptr::null_mut(); + self.len = 0; + } + } +} + static mut ENGINE_STORE_SERVER_HELPER_PTR: isize = 0; pub fn get_engine_store_server_helper_ptr() -> isize { @@ -1008,6 +1021,185 @@ impl EngineStoreServerHelper { } } + pub fn create_write_batch( + &self, + ) -> RawCppPtr { + debug_assert!(self.fn_create_write_batch.is_some()); + unsafe { + (self.fn_create_write_batch.into_inner())() + } + } + + pub fn write_batch_put_page( + &self, + wb: RawVoidPtr, + page_id: BaseBuffView, + page: BaseBuffView, + ) { + debug_assert!(self.fn_write_batch_put_page.is_some()); + unsafe { + (self.fn_write_batch_put_page.into_inner())( + wb, + page_id, + page, + ) + } + } + + pub fn write_batch_del_page( + &self, + wb: RawVoidPtr, + page_id: 
BaseBuffView, + ) { + debug_assert!(self.fn_write_batch_del_page.is_some()); + unsafe { + (self.fn_write_batch_del_page.into_inner())( + wb, + page_id, + ) + } + } + + pub fn write_batch_size( + &self, + wb: RawVoidPtr, + ) -> u64 { + debug_assert!(self.fn_write_batch_size.is_some()); + unsafe { + (self.fn_write_batch_size.into_inner())( + wb, + ) + } + } + + pub fn write_batch_is_empty( + &self, + wb: RawVoidPtr, + ) -> u8 { + debug_assert!(self.fn_write_batch_is_empty.is_some()); + unsafe { + (self.fn_write_batch_is_empty.into_inner())( + wb, + ) + } + } + + pub fn write_batch_merge( + &self, + lwb: RawVoidPtr, + rwb: RawVoidPtr, + ) { + debug_assert!(self.fn_write_batch_merge.is_some()); + unsafe { + (self.fn_write_batch_merge.into_inner())( + lwb, + rwb, + ) + } + } + + pub fn write_batch_clear( + &self, + wb: RawVoidPtr, + ) { + debug_assert!(self.fn_write_batch_clear.is_some()); + unsafe { + (self.fn_write_batch_clear.into_inner())( + wb, + ) + } + } + + pub fn consume_write_batch( + &self, + wb: RawVoidPtr, + ) { + debug_assert!(self.fn_consume_write_batch.is_some()); + unsafe { + (self.fn_consume_write_batch.into_inner())( + self.inner, + wb, + ) + } + } + + pub fn read_page( + &self, + page_id: BaseBuffView, + ) -> PageWithView { + debug_assert!(self.fn_handle_read_page.is_some()); + unsafe { + (self.fn_handle_read_page.into_inner())( + self.inner, + page_id, + ) + } + } + + pub fn scan_page( + &self, + start_page_id: BaseBuffView, + end_page_id: BaseBuffView, + ) -> PageWithViewVec { + debug_assert!(self.fn_handle_scan_page.is_some()); + unsafe { + (self.fn_handle_scan_page.into_inner())( + self.inner, + start_page_id, + end_page_id, + ) + } + } + + pub fn gc_page_with_view_vec( + &self, + arg1: * mut PageWithView, + arg2: u64, + ) { + debug_assert!(self.fn_gc_page_with_view_vec.is_some()); + unsafe { + (self.fn_gc_page_with_view_vec.into_inner())( + arg1, + arg2, + ) + } + } + + pub fn purge_pagestorage( + &self, + ) { + 
debug_assert!(self.fn_handle_purge_pagestorage.is_some()); + unsafe { + (self.fn_handle_purge_pagestorage.into_inner())( + self.inner, + ) + } + } + + pub fn seek_ps_key( + &self, + page_id: BaseBuffView, + ) -> CppStrWithView { + debug_assert!(self.fn_handle_seek_ps_key.is_some()); + unsafe { + (self.fn_handle_seek_ps_key.into_inner())( + self.inner, + page_id, + ) + } + } + + pub fn is_ps_empty( + &self, + ) -> u8 { + debug_assert!(self.fn_is_ps_empty.is_some()); + unsafe { + (self.fn_is_ps_empty.into_inner())( + self.inner, + ) + } + } + pub fn pre_handle_snapshot( &self, region: &metapb::Region, diff --git a/engine_store_ffi/src/ps_engine.rs b/engine_store_ffi/src/ps_engine.rs new file mode 100644 index 00000000000..73487d51b36 --- /dev/null +++ b/engine_store_ffi/src/ps_engine.rs @@ -0,0 +1,601 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. +// Disable warnings for unused engine_rocks's feature. +#![allow(dead_code)] +#![allow(unused_variables)] + +use std::{fmt, slice}; + +use std::{ + fmt::{Formatter, Debug}, + mem, +}; + +use engine_traits::{ + Error, + RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch, Result, RaftLogGcTask, + PerfContext, PerfContextExt, PerfContextKind, PerfLevel, +}; + +use tracker::TrackerToken; + +use protobuf::Message; +use raft::eraftpb::Entry; +use kvproto::{ + metapb::Region, + raft_serverpb::{ + RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent, StoreRecoverState, + }, +}; + +use byteorder::{BigEndian, ByteOrder}; +use tikv_util::{info, box_err, box_try}; + +use crate::{gen_engine_store_server_helper, RawCppPtr}; + +// 1. STORE_IDENT 0 +// 2. PREPARE_BOOTSTRAP 1 +// 3. RaftLocalState 2 +// 4. RegionLocalState 3 +// 5. RaftApplyState 4 +// 6. Snapshot RaftLocalState 5 +// 7. Reserved 6..9 +// 8. 
Log 10(+ offset 5) + +// pub const PS_KEY_PREFIX: &[u8] = &[b'r', b'_']; +// pub const PS_KEY_SEP: u8 = b'_'; +// +// const RAFT_LOCAL_STATE_ID : u64 = 2; +// const RAFT_LOG_ID_OFFSET : u64 = 5; +// +// pub fn ps_raft_state_key(region_id: u64) -> [u8; 19] { +// let mut key = [0; 19]; +// key[..2].copy_from_slice(PS_KEY_PREFIX); +// BigEndian::write_u64(&mut key[2..10], region_id); +// key[10] = PS_KEY_SEP; +// BigEndian::write_u64(&mut key[11..19], RAFT_LOCAL_STATE_ID); +// key +// } +// +// pub fn ps_raft_log_key(region_id: u64, log_index: u64) -> [u8; 19] { +// let mut key = [0; 19]; +// key[..2].copy_from_slice(PS_KEY_PREFIX); +// BigEndian::write_u64(&mut key[2..10], region_id); +// key[10] = PS_KEY_SEP; +// BigEndian::write_u64(&mut key[11..19], log_index + RAFT_LOG_ID_OFFSET); +// key +// } +// +// pub fn ps_raft_log_prefix(region_id: u64) -> [u8; 11] { +// let mut key = [0; 11]; +// key[..2].copy_from_slice(PS_KEY_PREFIX); +// BigEndian::write_u64(&mut key[2..10], region_id); +// key[10] = PS_KEY_SEP; +// key +// } +// +// pub fn ps_raft_log_index(key: &[u8]) -> u64 { +// let expect_key_len = PS_KEY_PREFIX.len() +// + mem::size_of::() +// + mem::size_of::() +// + mem::size_of::(); +// if key.len() != expect_key_len { +// panic!("wrong key format {:?}", key); +// } +// BigEndian::read_u64( +// &key[expect_key_len - mem::size_of::()..], +// ) +// } + +pub struct PSEngineWriteBatch { + pub engine_store_server_helper: isize, + pub raw_write_batch: RawCppPtr, +} + +impl PSEngineWriteBatch { + pub fn new(engine_store_server_helper: isize) -> PSEngineWriteBatch { + let helper = gen_engine_store_server_helper(engine_store_server_helper); + let raw_write_batch = helper.create_write_batch(); + PSEngineWriteBatch { engine_store_server_helper, raw_write_batch } + } + + fn put_page(&mut self, page_id: &[u8], value: &[u8]) -> Result<()> { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + helper.write_batch_put_page(self.raw_write_batch.ptr, 
page_id.into(), value.into()); + Ok(()) + } + + fn del_page(&mut self, page_id: &[u8]) -> Result<()> { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + helper.write_batch_del_page(self.raw_write_batch.ptr, page_id.into()); + Ok(()) + } + + fn append_impl( + &mut self, + raft_group_id: u64, + entries: &[Entry], + mut ser_buf: Vec, + ) -> Result<()> { + for entry in entries { + ser_buf.clear(); + entry.write_to_vec(&mut ser_buf).unwrap(); + let key = keys::raft_log_key(raft_group_id, entry.get_index()); + self.put_page(&key, &ser_buf)?; + } + Ok(()) + } + + fn put_msg(&mut self, page_id: &[u8], m: &M) -> Result<()> { + self.put_page(page_id, &m.write_to_bytes()?) + } + + fn data_size(&self) -> usize { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + return helper.write_batch_size(self.raw_write_batch.ptr) as usize; + } + + fn clear(&self) { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + helper.write_batch_clear(self.raw_write_batch.ptr); + } +} + +impl RaftLogBatch for PSEngineWriteBatch { + fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { + if let Some(max_size) = entries.iter().map(|e| e.compute_size()).max() { + let ser_buf = Vec::with_capacity(max_size as usize); + return self.append_impl(raft_group_id, &entries, ser_buf); + } + Ok(()) + } + + fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64) { + // This function is used to clean entries that will be overwritten later. + // TODO: make sure overlapped entries will be overwritten by newer log. 
+ // for index in from..to { + // let key = ps_raft_log_key(raft_group_id, index); + // self.del_page(&key).unwrap(); + // } + } + + fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { + self.put_msg(&keys::raft_state_key(raft_group_id), state) + } + + fn persist_size(&self) -> usize { + self.data_size() + } + + fn is_empty(&self) -> bool { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + helper.write_batch_is_empty(self.raw_write_batch.ptr) != 0 + } + + fn merge(&mut self, src: Self) -> Result<()> { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + helper.write_batch_merge(self.raw_write_batch.ptr, src.raw_write_batch.ptr); + Ok(()) + } + + fn put_store_ident(&mut self, ident: &StoreIdent) -> Result<()> { + self.put_msg(keys::STORE_IDENT_KEY, ident) + } + + fn put_prepare_bootstrap_region(&mut self, region: &Region) -> Result<()> { + self.put_msg(keys::PREPARE_BOOTSTRAP_KEY, region) + } + + fn remove_prepare_bootstrap_region(&mut self) -> Result<()> { + self.del_page(keys::PREPARE_BOOTSTRAP_KEY) + } + + fn put_region_state(&mut self, raft_group_id: u64, state: &RegionLocalState) -> Result<()> { + self.put_msg(&keys::region_state_key(raft_group_id), state) + } + + fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()> { + self.put_msg(&keys::apply_state_key(raft_group_id), state) + } +} + +#[derive(Clone)] +pub struct PSEngine { + pub engine_store_server_helper: isize, +} + +impl std::fmt::Debug for PSEngine { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("PSEngine") + .field( + "engine_store_server_helper", + &self.engine_store_server_helper, + ) + .finish() + } +} + +impl PSEngine { + pub fn new() -> Self { + PSEngine { engine_store_server_helper: 0 } + } + + pub fn init( + &mut self, + engine_store_server_helper: isize, + ) { + self.engine_store_server_helper = engine_store_server_helper; + } + + 
fn get_msg_cf( + &self, + page_id: &[u8], + ) -> Result> { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + let value = helper.read_page(page_id.into()); + if value.view.len == 0 { + return Ok(None); + } + + let mut m = M::default(); + m.merge_from_bytes(unsafe { slice::from_raw_parts(value.view.data as *const u8, value.view.len as usize) })?; + Ok(Some(m)) + } + + fn get_value( + &self, + page_id: &[u8], + ) -> Option> { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + let value = helper.read_page(page_id.into()); + return if value.view.len == 0 { + None + } else { + Some(value.view.to_slice().to_vec()) + } + } + + // Seek the first key >= given key, if not found, return None. + fn seek(&self, key: &[u8]) -> Option> { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + let target_key = helper.seek_ps_key(key.into()); + if target_key.view.len == 0 { + None + } else { + Some(target_key.view.to_slice().to_vec()) + } + } + + /// scan the key between start_key(inclusive) and end_key(exclusive), + /// the upper bound is omitted if end_key is empty + fn scan(&self, start_key: &[u8], end_key: &[u8], mut f: F) -> Result<()> + where + F: FnMut(&[u8], &[u8]) -> Result, + { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + let values = helper.scan_page(start_key.into(), end_key.into()); + for i in 0..values.len { + let value = unsafe { + &*values.inner.offset(i as isize) + }; + if value.view.len != 0 { + if !f(&[], &value.view.to_slice().to_vec())? 
{ + break; + } + } + } + Ok(()) + } + + fn gc_impl(&self, raft_group_id: u64, mut from: u64, to: u64) -> Result { + if from == 0 { + let start_key = keys::raft_log_key(raft_group_id, 0); + let prefix = keys::raft_log_prefix(raft_group_id); + // TODO: make sure the seek can skip other raft related key and to the first log key + match self.seek(&start_key) { + Some(target_key) if target_key.starts_with(&prefix) => from = box_try!(keys::raft_log_index(&target_key)), + // No need to gc. + _ => return Ok(0), + } + } + if from >= to { + return Ok(0); + } + // info!("gc_impl raft_group_id {} from {} to {}", raft_group_id, from ,to); + + let mut raft_wb = self.log_batch(0); + for idx in from..to { + raft_wb.del_page(&keys::raft_log_key(raft_group_id, idx)); + } + // TODO: keep the max size of raft_wb under some threshold + self.consume(&mut raft_wb, false); + Ok((to - from) as usize) + } + + fn is_empty(&self) -> bool { + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + helper.is_ps_empty() != 0 + } +} + +impl RaftEngineReadOnly for PSEngine { + fn get_raft_state(&self, raft_group_id: u64) -> Result> { + let key = keys::raft_state_key(raft_group_id); + self.get_msg_cf(&key) + } + + fn get_entry(&self, raft_group_id: u64, index: u64) -> Result> { + let key = keys::raft_log_key(raft_group_id, index); + self.get_msg_cf(&key) + } + + fn fetch_entries_to( + &self, + region_id: u64, + low: u64, + high: u64, + max_size: Option, + buf: &mut Vec, + ) -> Result { + let (max_size, mut total_size, mut count) = (max_size.unwrap_or(usize::MAX), 0, 0); + + let start_key = keys::raft_log_key(region_id, low); + let end_key = keys::raft_log_key(region_id, high); + + let mut count = 1; + + self.scan( + &start_key, + &end_key, + |_, page| { + let mut entry = Entry::default(); + entry.merge_from_bytes(page)?; + buf.push(entry); + total_size += page.len(); + count += 1; + Ok(total_size < max_size) + }, + )?; + + return Ok(count); + } + + fn 
get_all_entries_to(&self, region_id: u64, buf: &mut Vec) -> Result<()> { + let start_key = keys::raft_log_key(region_id, 0); + let end_key = keys::raft_log_key(region_id, u64::MAX); + self.scan( + &start_key, + &end_key, + |_, page| { + let mut entry = Entry::default(); + entry.merge_from_bytes(page)?; + buf.push(entry); + Ok(true) + }, + )?; + Ok(()) + } + + fn is_empty(&self) -> Result { + let mut is_empty = true; + Ok(self.is_empty()) + } + + fn get_store_ident(&self) -> Result> { + self.get_msg_cf(keys::STORE_IDENT_KEY) + } + + fn get_prepare_bootstrap_region(&self) -> Result> { + self.get_msg_cf(keys::PREPARE_BOOTSTRAP_KEY) + } + + fn get_region_state(&self, raft_group_id: u64) -> Result> { + let key = keys::region_state_key(raft_group_id); + self.get_msg_cf(&key) + } + + fn get_apply_state(&self, raft_group_id: u64) -> Result> { + let key = keys::apply_state_key(raft_group_id); + self.get_msg_cf(&key) + } + + fn get_recover_state(&self) -> Result> { + self.get_msg_cf(keys::RECOVER_STATE_KEY) + } +} + +impl RaftEngineDebug for PSEngine { + fn scan_entries(&self, raft_group_id: u64, mut f: F) -> Result<()> + where + F: FnMut(&Entry) -> Result, + { + let start_key = keys::raft_log_key(raft_group_id, 0); + let end_key = keys::raft_log_key(raft_group_id, u64::MAX); + self.scan( + &start_key, + &end_key, + |_, value| { + let mut entry = Entry::default(); + entry.merge_from_bytes(value)?; + f(&entry) + }, + ); + Ok(()) + } +} + +impl RaftEngine for PSEngine { + type LogBatch = PSEngineWriteBatch; + + fn log_batch(&self, capacity: usize) -> Self::LogBatch { + PSEngineWriteBatch::new(self.engine_store_server_helper) + } + + fn sync(&self) -> Result<()> { + Ok(()) + } + + fn consume(&self, batch: &mut Self::LogBatch, sync_log: bool) -> Result { + let bytes = batch.data_size(); + let helper = gen_engine_store_server_helper(self.engine_store_server_helper); + helper.consume_write_batch(batch.raw_write_batch.ptr); + batch.clear(); + Ok(bytes) + } + + fn 
consume_and_shrink( + &self, + batch: &mut Self::LogBatch, + sync_log: bool, + max_capacity: usize, + shrink_to: usize, + ) -> Result { + self.consume(batch, sync_log) + } + + fn clean( + &self, + raft_group_id: u64, + mut first_index: u64, + state: &RaftLocalState, + batch: &mut Self::LogBatch, + ) -> Result<()> { + // info!("try clean raft_group_id {} from {} to {}", raft_group_id, first_index, state.last_index); + batch.del_page(&keys::raft_state_key(raft_group_id))?; + batch.del_page(&keys::region_state_key(raft_group_id))?; + batch.del_page(&keys::apply_state_key(raft_group_id))?; + if first_index == 0 { + let start_key = keys::raft_log_key(raft_group_id, 0); + let prefix = keys::raft_log_prefix(raft_group_id); + // TODO: make sure the seek can skip other raft related key and to the first log key + match self.seek(&start_key) { + Some(target_key) if target_key.starts_with(&prefix) => first_index = box_try!(keys::raft_log_index(&target_key)), + // No need to gc. + _ => return Ok(()), + } + } + if first_index >= state.last_index { + return Ok(()); + } + info!("clean raft_group_id {} from {} to {}", raft_group_id, first_index, state.last_index); + // TODO: find the first raft log index of this raft group + if first_index <= state.last_index { + for index in first_index..=state.last_index { + batch.del_page( &keys::raft_log_key(raft_group_id, index)); + } + } + self.consume(batch, true); + Ok(()) + } + + fn append(&self, raft_group_id: u64, entries: Vec) -> Result { + let mut wb = self.log_batch(0); + if let Some(max_size) = entries.iter().map(|e| e.compute_size()).max() { + let buf = Vec::with_capacity(max_size as usize); + wb.append_impl(raft_group_id, &entries, buf)?; + return self.consume(&mut wb, false) + } + Ok(0) + } + + fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { + let mut wb = self.log_batch(0); + wb.put_msg(&keys::raft_state_key(raft_group_id), state); + self.consume(&mut wb, false); + Ok(()) + } + + fn gc(&self, 
raft_group_id: u64, from: u64, to: u64) -> Result { + self.gc_impl(raft_group_id, from, to) + } + + fn batch_gc(&self, groups: Vec) -> Result { + let mut total = 0; + for task in groups { + total += self.gc(task.raft_group_id, task.from, task.to)?; + } + Ok(total) + } + + fn flush_metrics(&self, instance: &str) { + } + + fn reset_statistics(&self) { + } + + fn dump_stats(&self) -> Result { + Ok(String::from("")) + } + + fn get_engine_size(&self) -> Result { + Ok(0) + } + + fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { + let mut wb = self.log_batch(0); + wb.put_msg(keys::STORE_IDENT_KEY, ident); + self.consume(&mut wb, false); + Ok(()) + } + + fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> + where + F: FnMut(u64) -> std::result::Result<(), E>, + E: From, + { + let start_key = keys::REGION_META_MIN_KEY; + let end_key = keys::REGION_META_MAX_KEY; + let mut err = None; + self.scan(start_key, end_key, |key, _| { + let (region_id, suffix) = box_try!(keys::decode_region_meta_key(key)); + if suffix != keys::REGION_STATE_SUFFIX { + return Ok(true); + } + + match f(region_id) { + Ok(()) => Ok(true), + Err(e) => { + err = Some(e); + Ok(false) + } + } + })?; + match err { + None => Ok(()), + Some(e) => Err(e), + } + } + + fn put_recover_state(&self, state: &StoreRecoverState) -> Result<()> { + let mut wb = self.log_batch(0); + wb.put_msg(keys::RECOVER_STATE_KEY, state); + self.consume(&mut wb, false); + Ok(()) + } +} + +impl PerfContextExt for PSEngine { + type PerfContext = PSPerfContext; + + fn get_perf_context(&self, level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { + PSPerfContext::new(level, kind) + } +} + +#[derive(Debug)] +pub struct PSPerfContext { +} + +impl PSPerfContext { + pub fn new(level: PerfLevel, kind: PerfContextKind) -> Self { + PSPerfContext { } + } +} + +impl PerfContext for PSPerfContext { + fn start_observe(&mut self) { + } + + fn report_metrics(&mut self, trackers: &[TrackerToken]) { + + } +} diff 
--git a/proxy_server/src/run.rs b/proxy_server/src/run.rs index 14a3620465c..4f2e14a8426 100644 --- a/proxy_server/src/run.rs +++ b/proxy_server/src/run.rs @@ -28,7 +28,7 @@ use engine_rocks::{ use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; use engine_store_ffi::{ self, EngineStoreServerHelper, EngineStoreServerStatus, RaftProxyStatus, RaftStoreProxy, - RaftStoreProxyFFI, RaftStoreProxyFFIHelper, ReadIndexClient, TiFlashEngine, + RaftStoreProxyFFI, RaftStoreProxyFFIHelper, ReadIndexClient, TiFlashEngine, ps_engine::PSEngine }; use engine_traits::{ CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, RaftEngine, TabletFactory, @@ -332,7 +332,8 @@ pub unsafe fn run_tikv_proxy( engine_store_server_helper, ) } else { - run_impl::(config, proxy_config, engine_store_server_helper) + run_impl::(config, proxy_config, engine_store_server_helper) + // run_impl::(config, proxy_config, engine_store_server_helper) } }) } @@ -391,12 +392,20 @@ impl TiKvServer { .unwrap(); // Create raft engine - let raft_engine = CER::build( + let mut raft_engine = CER::build( &self.config, &env, &self.encryption_key_manager, &block_cache, ); + match raft_engine.as_ps_engine() { + None => { + + } + Some(ps_engine) => { + ps_engine.init(engine_store_server_helper); + } + } // Create kv engine. 
let mut builder = KvEngineFactoryBuilder::new(env, &self.config, &self.store_path) @@ -1611,6 +1620,9 @@ pub trait ConfiguredRaftEngine: RaftEngine { fn as_rocks_engine(&self) -> Option<&RocksEngine> { None } + fn as_ps_engine(&mut self) -> Option<&mut PSEngine> { + None + } fn register_config(&self, _cfg_controller: &mut ConfigController, _share_cache: bool) {} } @@ -1705,6 +1717,21 @@ impl ConfiguredRaftEngine for RaftLogEngine { } } +impl ConfiguredRaftEngine for PSEngine { + fn build( + config: &TikvConfig, + env: &Arc, + key_manager: &Option>, + block_cache: &Option, + )-> Self { + PSEngine::new() + } + + fn as_ps_engine(&mut self) -> Option<&mut PSEngine> { + Some(self) + } +} + /// Various sanity-checks and logging before running a server. /// /// Warnings are logged. diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h index 49b82c3704c..b4ded43adf3 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h @@ -86,6 +86,16 @@ struct CppStrWithView { BaseBuffView view; }; +struct PageWithView { + RawCppPtr inner; + BaseBuffView view; +}; + +struct PageWithViewVec { + PageWithView * inner; + const uint64_t len; +}; + enum class HttpRequestStatus : uint8_t { Ok = 0, ErrorParam, @@ -190,6 +200,20 @@ struct EngineStoreServerHelper { uint8_t (*fn_need_flush_data)(EngineStoreServerWrap *, uint64_t); uint8_t (*fn_try_flush_data)(EngineStoreServerWrap *, uint64_t, uint8_t, uint64_t, uint64_t); + RawCppPtr (*fn_create_write_batch)(); + void (*fn_write_batch_put_page)(RawVoidPtr, BaseBuffView, BaseBuffView); + void (*fn_write_batch_del_page)(RawVoidPtr, BaseBuffView); + uint64_t (*fn_write_batch_size)(RawVoidPtr); + uint8_t (*fn_write_batch_is_empty)(RawVoidPtr); + void (*fn_write_batch_merge)(RawVoidPtr, RawVoidPtr); + void (*fn_write_batch_clear)(RawVoidPtr); + void (*fn_consume_write_batch)(const EngineStoreServerWrap *, RawVoidPtr); 
+ PageWithView (*fn_handle_read_page)(const EngineStoreServerWrap *, BaseBuffView); + PageWithViewVec (*fn_handle_scan_page)(const EngineStoreServerWrap *, BaseBuffView, BaseBuffView); + void (*fn_gc_page_with_view_vec)(PageWithView * inner, uint64_t len); + void (*fn_handle_purge_pagestorage)(const EngineStoreServerWrap *); + CppStrWithView (*fn_handle_seek_ps_key)(const EngineStoreServerWrap *, BaseBuffView); + uint8_t (*fn_ps_is_empty)(const EngineStoreServerWrap *); void (*fn_atomic_update_proxy)(EngineStoreServerWrap *, RaftStoreProxyFFIHelper *); void (*fn_handle_destroy)(EngineStoreServerWrap *, uint64_t); From 1e978b927a78cb9e1072c6d02f6365f7d235a740 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Fri, 2 Dec 2022 11:51:45 +0800 Subject: [PATCH 002/115] introduce cache mem info Signed-off-by: CalvinNeo --- components/raftstore/src/store/fsm/peer.rs | 24 ++++++++++++++++ engine_store_ffi/src/observer.rs | 33 ++++++++++++++++++++++ proxy_tests/proxy/region.rs | 16 +++++++++++ 3 files changed, 73 insertions(+) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 63bb878838c..d6fe7010bcb 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2439,12 +2439,23 @@ where "to_peer_id" => msg.get_to_peer().get_id(), ); + tikv_util::debug!("!!!!! on_raft_message after check 0"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id(), + "self.fsm.stopped" => self.fsm.stopped, + "msg" => ?msg, + "commit" => msg.get_message().get_commit(), + ); if self.fsm.peer.pending_remove || self.fsm.stopped { return Ok(()); } self.handle_reported_disk_usage(&msg); + tikv_util::debug!("!!!!! 
on_raft_message after check 0.1"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id() + ); let msg_type = msg.get_message().get_msg_type(); if matches!(self.ctx.self_disk_usage, DiskUsage::AlreadyFull) && MessageType::MsgTimeoutNow == msg_type @@ -2457,10 +2468,18 @@ where return Ok(()); } + tikv_util::debug!("!!!!! on_raft_message after check 0.2"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id() + ); if !self.validate_raft_msg(&msg) { return Ok(()); } + tikv_util::debug!("!!!!! on_raft_message after check 1"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id() + ); if msg.get_is_tombstone() { // we receive a message tells us to remove ourself. self.handle_gc_peer_msg(&msg); @@ -2479,6 +2498,11 @@ where return Ok(()); } + tikv_util::debug!("!!!!! on_raft_message after check 2"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id() + ); + if msg.has_extra_msg() { self.on_extra_message(msg); return Ok(()); diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 19fb6337210..07613e086a2 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -90,6 +90,10 @@ impl PrehandleTask { unsafe impl Send for PrehandleTask {} unsafe impl Sync for PrehandleTask {} +const CACHED_REGION_INFO_SLOT_COUNT: usize = 128; + +pub struct CachedRegionInfo {} + pub struct TiFlashObserver { pub peer_id: u64, pub engine_store_server_helper: &'static EngineStoreServerHelper, @@ -99,6 +103,7 @@ pub struct TiFlashObserver { pub snap_handle_pool_size: usize, pub apply_snap_pool: Option>>, pub pending_delete_ssts: Arc>>, + pub cached_mem_info: Arc>>>, } impl Clone for TiFlashObserver { @@ -112,6 +117,7 @@ impl Clone for TiFlashObserver { snap_handle_pool_size: self.snap_handle_pool_size, apply_snap_pool: self.apply_snap_pool.clone(), pending_delete_ssts: self.pending_delete_ssts.clone(), + cached_mem_info: self.cached_mem_info.clone(), } } } @@ -120,7 +126,29 @@ impl 
Clone for TiFlashObserver { // avoid being bypassed. const TIFLASH_OBSERVER_PRIORITY: u32 = 0; +// Credit: [splitmix64 algorithm](https://xorshift.di.unimi.it/splitmix64.c) +#[inline] +fn hash_u64(mut i: u64) -> u64 { + i = (i ^ (i >> 30)).wrapping_mul(0xbf58476d1ce4e5b9); + i = (i ^ (i >> 27)).wrapping_mul(0x94d049bb133111eb); + i ^ (i >> 31) +} + +#[allow(dead_code)] +#[inline] +fn unhash_u64(mut i: u64) -> u64 { + i = (i ^ (i >> 31) ^ (i >> 62)).wrapping_mul(0x319642b2d24d8ec3); + i = (i ^ (i >> 27) ^ (i >> 54)).wrapping_mul(0x96de1b173f119089); + i ^ (i >> 30) ^ (i >> 60) +} + impl TiFlashObserver { + #[inline] + fn slot_index(id: u64) -> usize { + debug_assert!(CACHED_REGION_INFO_SLOT_COUNT.is_power_of_two()); + hash_u64(id) as usize & (CACHED_REGION_INFO_SLOT_COUNT - 1) + } + pub fn new( peer_id: u64, engine: engine_tiflash::RocksEngine, @@ -133,6 +161,10 @@ impl TiFlashObserver { let snap_pool = Builder::new(tikv_util::thd_name!("region-task")) .max_thread_count(snap_handle_pool_size) .build_future_pool(); + let mut mem_infos = Vec::with_capacity(CACHED_REGION_INFO_SLOT_COUNT); + for _ in 0..CACHED_REGION_INFO_SLOT_COUNT { + mem_infos.push(RwLock::new(HashMap::default())); + } TiFlashObserver { peer_id, engine_store_server_helper, @@ -142,6 +174,7 @@ impl TiFlashObserver { snap_handle_pool_size, apply_snap_pool: Some(Arc::new(snap_pool)), pending_delete_ssts: Arc::new(RwLock::new(vec![])), + cached_mem_info: Arc::new(mem_infos), } } diff --git a/proxy_tests/proxy/region.rs b/proxy_tests/proxy/region.rs index e1193b199fb..a6d5cb888f4 100644 --- a/proxy_tests/proxy/region.rs +++ b/proxy_tests/proxy/region.rs @@ -693,3 +693,19 @@ fn test_add_delayed_started_learner_snapshot() { fail::remove("on_pre_persist_with_finish"); cluster.shutdown(); } + +#[test] +fn test_fast_add_peer2() { + let (mut cluster, pd_client) = new_mock_cluster(0, 2); + fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + disable_auto_gen_compact_log(&mut cluster); + let _ = 
cluster.run_conf_change(); + pd_client.add_peer(1, new_peer(2, 2)); + + std::thread::sleep(std::time::Duration::from_millis(1000)); + cluster.must_put(b"k1", b"v1"); + check_key(&cluster, b"k1", b"v1", Some(true), None, None); + + fail::remove("on_pre_persist_with_finish"); + cluster.shutdown(); +} From 758e22f83cc43d5823abad29cfcf4c1d8fe9d38b Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Mon, 5 Dec 2022 20:56:35 +0800 Subject: [PATCH 003/115] partially run Signed-off-by: CalvinNeo --- Cargo.lock | 2 - Cargo.toml | 7 +- .../raftstore/src/coprocessor/dispatcher.rs | 11 + components/raftstore/src/coprocessor/mod.rs | 6 +- components/raftstore/src/store/fsm/peer.rs | 56 ++++- components/raftstore/src/store/fsm/store.rs | 5 + components/raftstore/src/store/peer.rs | 5 + components/raftstore/src/store/snap.rs | 1 + components/test_pd_client/src/pd.rs | 8 +- engine_store_ffi/src/observer.rs | 196 +++++++++++++++--- new-mock-engine-store/src/node.rs | 2 + new-mock-engine-store/src/server.rs | 17 +- proxy_server/src/run.rs | 18 +- proxy_tests/proxy/region.rs | 9 +- src/server/raft_client.rs | 2 + 15 files changed, 283 insertions(+), 62 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fa64aa145a8..3ded3d8e68a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4484,7 +4484,6 @@ dependencies = [ [[package]] name = "raft" version = "0.7.0" -source = "git+https://github.com/tikv/raft-rs?branch=master#36d3293a8b1a32c4b4115855419108386abcdc4a" dependencies = [ "bytes", "fxhash", @@ -4532,7 +4531,6 @@ dependencies = [ [[package]] name = "raft-proto" version = "0.7.0" -source = "git+https://github.com/tikv/raft-rs?branch=master#36d3293a8b1a32c4b4115855419108386abcdc4a" dependencies = [ "bytes", "protobuf", diff --git a/Cargo.toml b/Cargo.toml index bff99190c58..38a721c49a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -193,8 +193,11 @@ zipf = "6.1.0" prometheus = { git = "https://github.com/solotzg/rust-prometheus.git", rev = "b4fe98a06a58d29f9b9987a0d7186f6ed5230193" } # TODO: 
remove this when new raft-rs is published. -raft = { git = "https://github.com/tikv/raft-rs", branch = "master" } -raft-proto = { git = "https://github.com/tikv/raft-rs", branch = "master" } +#raft = { git = "https://github.com/tikv/raft-rs", branch = "master" } +raft = { path = "/Users/calvin/tiflash/raft-rs" } +# raft-proto = { git = "https://github.com/tikv/raft-rs", branch = "master" } +raft-proto = { path = "/Users/calvin/tiflash/raft-rs/proto" } + protobuf = { git = "https://github.com/pingcap/rust-protobuf", branch = "v2.8" } protobuf-codegen = { git = "https://github.com/pingcap/rust-protobuf", branch = "v2.8" } diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 99228aef44c..97a141aa97b 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -8,6 +8,7 @@ use kvproto::{ metapb::Region, pdpb::CheckPolicy, raft_cmdpb::{ComputeHashRequest, RaftCmdRequest}, + raft_serverpb::RaftMessage, }; use protobuf::Message; use raft::eraftpb; @@ -669,6 +670,16 @@ impl CoprocessorHost { true } + pub fn should_skip_raft_message(&self, msg: &RaftMessage) -> bool { + for observer in &self.registry.region_change_observers { + let observer = observer.observer.inner(); + if observer.should_skip_raft_message(msg) { + return true; + } + } + false + } + pub fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 7ac783c0d6d..f2b9d0b9364 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -14,7 +14,7 @@ use kvproto::{ metapb::Region, pdpb::CheckPolicy, raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, RaftCmdResponse, Request}, - raft_serverpb::RaftApplyState, + raft_serverpb::{RaftApplyState, RaftMessage}, }; use raft::{eraftpb, StateRole}; @@ -328,6 +328,10 @@ pub 
trait RegionChangeObserver: Coprocessor { fn pre_write_apply_state(&self, _: &mut ObserverContext<'_>) -> bool { true } + + fn should_skip_raft_message(&self, _: &RaftMessage) -> bool { + false + } } #[derive(Clone, Debug, Default)] diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index d6fe7010bcb..011c804c09a 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -610,6 +610,10 @@ where for m in msgs.drain(..) { match m { PeerMsg::RaftMessage(msg) => { + if self.ctx.coprocessor_host.should_skip_raft_message(&msg.msg) { + debug!("!!!! peer skip message"); + continue; + } if let Err(e) = self.on_raft_message(msg) { error!(%e; "handle raft message err"; @@ -2443,7 +2447,6 @@ where "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id(), "self.fsm.stopped" => self.fsm.stopped, - "msg" => ?msg, "commit" => msg.get_message().get_commit(), ); if self.fsm.peer.pending_remove || self.fsm.stopped { @@ -2468,15 +2471,11 @@ where return Ok(()); } - tikv_util::debug!("!!!!! on_raft_message after check 0.2"; - "region_id" => self.region_id(), - "peer_id" => self.fsm.peer_id() - ); if !self.validate_raft_msg(&msg) { return Ok(()); } - tikv_util::debug!("!!!!! on_raft_message after check 1"; + tikv_util::debug!("!!!!! on_raft_message after check 0.2"; "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id() ); @@ -2494,6 +2493,10 @@ where return Ok(()); } + tikv_util::debug!("!!!!! on_raft_message after check 0.3"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id() + ); if self.check_msg(&msg) { return Ok(()); } @@ -2502,7 +2505,6 @@ where "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id() ); - if msg.has_extra_msg() { self.on_extra_message(msg); return Ok(()); @@ -2510,9 +2512,17 @@ where let is_snapshot = msg.get_message().has_snapshot(); + tikv_util::debug!("!!!!! 
on_raft_message after check 3"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id() + ); // TODO: spin off the I/O code (delete_snapshot) let regions_to_destroy = match self.check_snapshot(&msg)? { Either::Left(key) => { + tikv_util::debug!("!!!!! on_raft_message after check 3.1"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id() + ); if let Some(key) = key { // If the snapshot file is not used again, then it's OK to // delete them here. If the snapshot file will be reused when @@ -2523,9 +2533,19 @@ where } return Ok(()); } - Either::Right(v) => v, + Either::Right(v) => { + tikv_util::debug!("!!!!! on_raft_message after check 3.2"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id() + ); + v + } }; + tikv_util::debug!("!!!!! on_raft_message after check 4"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id() + ); if util::is_vote_msg(msg.get_message()) || msg_type == MessageType::MsgTimeoutNow { if self.fsm.hibernate_state.group_state() != GroupState::Chaos { self.fsm.reset_hibernate_state(GroupState::Chaos); @@ -2552,14 +2572,28 @@ where self.ctx.raft_metrics.message_dropped.stale_msg.inc(); return Ok(()); } - self.fsm.peer.step(self.ctx, msg.take_message()) + let res = self.fsm.peer.step(self.ctx, msg.take_message()); + tikv_util::debug!("!!!!! on_raft_message after check 4.1"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id() + ); + res }; stepped.set(result.is_ok()); + tikv_util::debug!("!!!!! on_raft_message after check 5"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id(), + "result" => ?result + ); if is_snapshot { if !self.fsm.peer.has_pending_snapshot() { // This snapshot is rejected by raft-rs. + tikv_util::debug!("!!!!! 
on_raft_message after check 5.1"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id() + ); let mut meta = self.ctx.store_meta.lock().unwrap(); meta.pending_snapshot_regions .retain(|r| self.fsm.region_id() != r.get_id()); @@ -2570,6 +2604,10 @@ where // region after applying that snapshot. // But if `regions_to_destroy` is not empty, the pending snapshot must be this // msg's snapshot because this kind of snapshot is exclusive. + tikv_util::debug!("!!!!! on_raft_message after check 5.2"; + "region_id" => self.region_id(), + "peer_id" => self.fsm.peer_id() + ); self.destroy_regions_for_snapshot(regions_to_destroy); } } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 28c0db02eee..b39656f9d3f 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -714,6 +714,10 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> match m { StoreMsg::Tick(tick) => self.on_tick(tick), StoreMsg::RaftMessage(msg) => { + if self.ctx.coprocessor_host.should_skip_raft_message(&msg.msg) { + debug!("!!!! store skip message"); + continue; + } if let Err(e) = self.on_raft_message(msg) { if matches!(&e, Error::RegionNotRegistered { .. }) { // This may happen in normal cases when add-peer runs slowly @@ -1980,6 +1984,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER error!( "missing epoch in raft message, ignore it"; "region_id" => region_id, + "!!!! 
msg" => ?msg, ); self.ctx .raft_metrics diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 9614161739a..374b371116f 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -1769,6 +1769,11 @@ where for msg in msgs { let msg_type = msg.get_message().get_msg_type(); if msg_type == MessageType::MsgSnapshot { + let mut snap_data = kvproto::raft_serverpb::RaftSnapshotData::default(); + snap_data + .merge_from_bytes(msg.get_message().get_snapshot().get_data()) + .unwrap(); + debug!("!!!! send snapshot {:?} XXXXXXX {:?}", msg, snap_data); let snap_index = msg.get_message().get_snapshot().get_metadata().get_index(); if snap_index > self.last_sent_snapshot_idx { self.last_sent_snapshot_idx = snap_index; diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 8ca5b26d02b..cf09671b4f7 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -186,6 +186,7 @@ where // A helper function to copy snapshot. // Only used in tests. pub fn copy_snapshot(mut from: Box, mut to: Box) -> io::Result<()> { + debug!("!!!!! 
copy_snapshot {}", to.exists()); if !to.exists() { io::copy(&mut from, &mut to)?; to.save()?; diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index 513d08643a7..b6a78abc799 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -739,8 +739,12 @@ impl PdCluster { let operator = operator?; debug!( - "[region {}] schedule {:?} to {:?}, region: {:?}", - region_id, operator, leader, region + "[region {}] schedule {:?} to {:?}, region: {:?} {:?}", + region_id, + operator, + leader, + region, + std::backtrace::Backtrace::capture(), ); let mut resp = operator.make_region_heartbeat_response(region.get_id(), self); diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 07613e086a2..a8ec8186a4e 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -1,6 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. - use std::{ + collections::hash_map::Entry as MapEntry, ops::DerefMut, path::PathBuf, str::FromStr, @@ -13,9 +13,10 @@ use engine_traits::SstMetaInfo; use kvproto::{ metapb::Region, raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, CmdType, RaftCmdRequest}, - raft_serverpb::RaftApplyState, + raft_serverpb::{RaftApplyState, RaftMessage}, }; -use raft::StateRole; +use protobuf::Message; +use raft::{eraftpb, eraftpb::MessageType, StateRole}; use raftstore::{ coprocessor::{ AdminObserver, ApplyCtxInfo, ApplySnapshotObserver, BoxAdminObserver, @@ -25,10 +26,10 @@ use raftstore::{ StoreSizeInfo, UpdateSafeTsObserver, }, store, - store::{check_sst_for_ingestion, snap::plain_file_used, SnapKey}, + store::{check_sst_for_ingestion, snap::plain_file_used, SnapKey, Transport}, }; use sst_importer::SstImporter; -use tikv_util::{debug, error, info, warn}; +use tikv_util::{box_err, debug, error, info, warn}; use yatp::{ pool::{Builder, ThreadPool}, task::future::TaskCell, @@ -92,10 +93,15 @@ unsafe impl Sync for 
PrehandleTask {} const CACHED_REGION_INFO_SLOT_COUNT: usize = 128; -pub struct CachedRegionInfo {} +#[derive(Debug, Default)] +pub struct CachedRegionInfo { + pub inited: bool, +} -pub struct TiFlashObserver { - pub peer_id: u64, +pub type CachedRegionInfoMap = HashMap>; + +pub struct TiFlashObserver { + pub store_id: u64, pub engine_store_server_helper: &'static EngineStoreServerHelper, pub engine: TiFlashEngine, pub sst_importer: Arc, @@ -103,13 +109,15 @@ pub struct TiFlashObserver { pub snap_handle_pool_size: usize, pub apply_snap_pool: Option>>, pub pending_delete_ssts: Arc>>, - pub cached_mem_info: Arc>>>, + pub cached_region_info: Arc>>, + // TODO should we use a Mutex here? + pub trans: Arc>, } -impl Clone for TiFlashObserver { +impl Clone for TiFlashObserver { fn clone(&self) -> Self { TiFlashObserver { - peer_id: self.peer_id, + store_id: self.store_id, engine_store_server_helper: self.engine_store_server_helper, engine: self.engine.clone(), sst_importer: self.sst_importer.clone(), @@ -117,7 +125,8 @@ impl Clone for TiFlashObserver { snap_handle_pool_size: self.snap_handle_pool_size, apply_snap_pool: self.apply_snap_pool.clone(), pending_delete_ssts: self.pending_delete_ssts.clone(), - cached_mem_info: self.cached_mem_info.clone(), + cached_region_info: self.cached_region_info.clone(), + trans: self.trans.clone(), } } } @@ -142,18 +151,137 @@ fn unhash_u64(mut i: u64) -> u64 { i ^ (i >> 30) ^ (i >> 60) } -impl TiFlashObserver { +impl TiFlashObserver { #[inline] fn slot_index(id: u64) -> usize { debug_assert!(CACHED_REGION_INFO_SLOT_COUNT.is_power_of_two()); hash_u64(id) as usize & (CACHED_REGION_INFO_SLOT_COUNT - 1) } + pub fn access_cached_region_info_mut>)>( + &self, + region_id: u64, + mut f: F, + ) -> Result<(), String> { + let slot_id = Self::slot_index(region_id); + let guard = self.cached_region_info.get(slot_id).unwrap().write(); + let mut guard = guard.unwrap(); + f(guard.entry(region_id)); + Ok(()) + } + + pub fn is_first_msg_append(&self, msg: 
&RaftMessage) -> bool { + // return false; + // TODO Need to recover all region information from restart. + let inner_msg = msg.get_message(); + if inner_msg.get_msg_type() != MessageType::MsgAppend { + return false; + } + let region_id = msg.get_region_id(); + let mut is_first = false; + // Can use immutable version. + self.access_cached_region_info_mut( + region_id, + |info: MapEntry>| match info { + MapEntry::Occupied(o) => { + is_first = !o.get().inited; + if is_first { + info!("fast path: ongoing {}:{}, skip MsgAppend", self.store_id, region_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, + ); + } + } + MapEntry::Vacant(v) => { + info!("fast path: first MsgAppend of {}:{}, skip", self.store_id, region_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, + ); + v.insert(Arc::new(CachedRegionInfo::default())); + is_first = true; + } + }, + ) + .unwrap(); + let mut response = RaftMessage::default(); + use kvproto::metapb::RegionEpoch; + let mut epoch = RegionEpoch::default(); + epoch.set_conf_ver(2); + epoch.set_version(1); + response.set_region_epoch(epoch.clone()); + response.set_region_id(1); + response.set_from_peer(msg.get_from_peer().clone()); + response.set_to_peer(msg.get_to_peer().clone()); + response + .mut_message() + .set_msg_type(MessageType::MsgSnapshot); + response.mut_message().set_term(inner_msg.get_term()); + let snapshot: &mut eraftpb::Snapshot = response.mut_message().mut_snapshot(); + let metadata: &mut eraftpb::SnapshotMetadata = snapshot.mut_metadata(); + + // TODO The rest is test, please remove it after we can fetch the real data. 
+ metadata + .mut_conf_state() + .mut_voters() + .push(msg.get_from_peer().get_id()); + metadata + .mut_conf_state() + .mut_learners() + .push(msg.get_to_peer().get_id()); + + metadata.set_index(inner_msg.get_index()); + metadata.set_term(inner_msg.get_term()); + + let mut snap_data = kvproto::raft_serverpb::RaftSnapshotData::default(); + let mut region = kvproto::metapb::Region::default(); + region.set_id(1); + region.set_region_epoch(epoch); + use kvproto::metapb::{Peer, PeerRole::Learner}; + + { + let mut peer = Peer::default(); + peer.set_id(1); + peer.set_store_id(1); + region.mut_peers().push(peer); + let mut peer = Peer::default(); + peer.set_id(2); + peer.set_store_id(2); + peer.set_role(Learner); + region.mut_peers().push(peer); + snap_data.set_region(region); + snap_data.set_file_size(0); + snap_data.set_version(2); + } + + // snap_data.mut_meta().set_for_witness(true); + + for cf in raftstore::store::snap::SNAPSHOT_CFS { + let mut cf_file = kvproto::raft_serverpb::SnapshotCfFile::default(); + let path = format!("/tmp/loop_{}.sst", cf); + let mut file = std::fs::File::create(path.as_str()).unwrap(); + cf_file.set_cf(cf.to_string()); + cf_file.set_size(0); + cf_file.set_checksum(0); + snap_data.mut_meta().mut_cf_files().push(cf_file); + } + + snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); + debug!("!!!!! send response {:?} data {:?}", response, snap_data); + self.trans.lock().unwrap().send(response).unwrap(); + debug!("!!!!! 
send response FINISH"); + is_first + } +} + +impl TiFlashObserver { pub fn new( - peer_id: u64, + store_id: u64, engine: engine_tiflash::RocksEngine, sst_importer: Arc, snap_handle_pool_size: usize, + trans: T, ) -> Self { let engine_store_server_helper = gen_engine_store_server_helper(engine.engine_store_server_helper); @@ -161,12 +289,12 @@ impl TiFlashObserver { let snap_pool = Builder::new(tikv_util::thd_name!("region-task")) .max_thread_count(snap_handle_pool_size) .build_future_pool(); - let mut mem_infos = Vec::with_capacity(CACHED_REGION_INFO_SLOT_COUNT); + let mut cached_region_info = Vec::with_capacity(CACHED_REGION_INFO_SLOT_COUNT); for _ in 0..CACHED_REGION_INFO_SLOT_COUNT { - mem_infos.push(RwLock::new(HashMap::default())); + cached_region_info.push(RwLock::new(HashMap::default())); } TiFlashObserver { - peer_id, + store_id, engine_store_server_helper, engine, sst_importer, @@ -174,7 +302,8 @@ impl TiFlashObserver { snap_handle_pool_size, apply_snap_pool: Some(Arc::new(snap_pool)), pending_delete_ssts: Arc::new(RwLock::new(vec![])), - cached_mem_info: Arc::new(mem_infos), + cached_region_info: Arc::new(cached_region_info), + trans: Arc::new(Mutex::new(trans)), } } @@ -278,14 +407,14 @@ impl TiFlashObserver { } } -impl Coprocessor for TiFlashObserver { +impl Coprocessor for TiFlashObserver { fn stop(&self) { - info!("shutdown tiflash observer"; "peer_id" => self.peer_id); + info!("shutdown tiflash observer"; "store_id" => self.store_id); self.apply_snap_pool.as_ref().unwrap().shutdown(); } } -impl AdminObserver for TiFlashObserver { +impl AdminObserver for TiFlashObserver { fn pre_exec_admin( &self, ob_ctx: &mut ObserverContext<'_>, @@ -448,7 +577,7 @@ impl AdminObserver for TiFlashObserver { } } -impl QueryObserver for TiFlashObserver { +impl QueryObserver for TiFlashObserver { fn on_empty_cmd(&self, ob_ctx: &mut ObserverContext<'_>, index: u64, term: u64) { fail::fail_point!("on_empty_cmd_normal", |_| {}); debug!("encounter empty cmd, maybe due to 
leadership change"; @@ -616,7 +745,7 @@ impl QueryObserver for TiFlashObserver { } } -impl UpdateSafeTsObserver for TiFlashObserver { +impl UpdateSafeTsObserver for TiFlashObserver { fn on_update_safe_ts(&self, region_id: u64, self_safe_ts: u64, leader_safe_ts: u64) { self.engine_store_server_helper.handle_safe_ts_update( region_id, @@ -626,7 +755,7 @@ impl UpdateSafeTsObserver for TiFlashObserver { } } -impl RegionChangeObserver for TiFlashObserver { +impl RegionChangeObserver for TiFlashObserver { fn on_region_changed( &self, ob_ctx: &mut ObserverContext<'_>, @@ -637,7 +766,7 @@ impl RegionChangeObserver for TiFlashObserver { info!( "observe destroy"; "region_id" => ob_ctx.region().get_id(), - "peer_id" => self.peer_id, + "store_id" => self.store_id, ); self.engine_store_server_helper .handle_destroy(ob_ctx.region().get_id()); @@ -669,13 +798,13 @@ impl RegionChangeObserver for TiFlashObserver { debug!( "observe pre_persist, persist"; "region_id" => ob_ctx.region().get_id(), - "peer_id" => self.peer_id, + "store_id" => self.store_id, ); } else { debug!( "observe pre_persist"; "region_id" => ob_ctx.region().get_id(), - "peer_id" => self.peer_id, + "store_id" => self.store_id, "is_finished" => is_finished, ); }; @@ -686,9 +815,18 @@ impl RegionChangeObserver for TiFlashObserver { fail::fail_point!("on_pre_persist_with_finish", |_| { true }); false } + + fn should_skip_raft_message(&self, msg: &RaftMessage) -> bool { + let inner_msg = msg.get_message(); + if inner_msg.get_commit() == 0 && inner_msg.get_msg_type() == MessageType::MsgHeartbeat { + } else if inner_msg.get_msg_type() == MessageType::MsgAppend { + return self.is_first_msg_append(&msg); + } + false + } } -impl PdTaskObserver for TiFlashObserver { +impl PdTaskObserver for TiFlashObserver { fn on_compute_engine_size(&self, store_size: &mut Option) { let stats = self.engine_store_server_helper.handle_compute_store_stats(); let _ = store_size.insert(StoreSizeInfo { @@ -759,7 +897,7 @@ fn 
pre_handle_snapshot_impl( PtrWrapper(ptr) } -impl ApplySnapshotObserver for TiFlashObserver { +impl ApplySnapshotObserver for TiFlashObserver { #[allow(clippy::single_match)] fn pre_apply_snapshot( &self, diff --git a/new-mock-engine-store/src/node.rs b/new-mock-engine-store/src/node.rs index 954050a7f2c..9f25b72a14c 100644 --- a/new-mock-engine-store/src/node.rs +++ b/new-mock-engine-store/src/node.rs @@ -135,6 +135,7 @@ impl Transport for ChannelTransport { match core.routers.get(&to_store) { Some(h) => { + debug!("!!!!! ChannelTransport send {} msg {:?}", to_store, msg); h.send_raft_msg(msg)?; if is_snapshot { // should report snapshot finish. @@ -319,6 +320,7 @@ impl Simulator for NodeCluster { engines.kv.clone(), importer.clone(), cfg.proxy_cfg.raft_store.snap_handle_pool_size, + simulate_trans.clone(), ); tiflash_ob.register_to(&mut coprocessor_host); diff --git a/new-mock-engine-store/src/server.rs b/new-mock-engine-store/src/server.rs index 466de08126f..5f29f87f233 100644 --- a/new-mock-engine-store/src/server.rs +++ b/new-mock-engine-store/src/server.rs @@ -409,14 +409,6 @@ impl ServerCluster { Arc::clone(&importer), ); - let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( - node_id, - engines.kv.clone(), - importer.clone(), - 2, - ); - tiflash_ob.register_to(&mut coprocessor_host); - let check_leader_runner = CheckLeaderRunner::new(store_meta.clone(), coprocessor_host.clone()); let check_leader_scheduler = bg_worker.start("check-leader", check_leader_runner); @@ -563,6 +555,15 @@ impl ServerCluster { let max_grpc_thread_count = cfg.server.grpc_concurrency; let server_cfg = Arc::new(VersionTrack::new(cfg.server.clone())); + let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( + node_id, + engines.kv.clone(), + importer.clone(), + cfg.proxy_cfg.raft_store.snap_handle_pool_size, + simulate_trans.clone(), + ); + tiflash_ob.register_to(&mut coprocessor_host); + // Register the role change observer of the lock manager. 
lock_mgr.register_detector_role_change_observer(&mut coprocessor_host); diff --git a/proxy_server/src/run.rs b/proxy_server/src/run.rs index 14a3620465c..5232bc677d3 100644 --- a/proxy_server/src/run.rs +++ b/proxy_server/src/run.rs @@ -1173,14 +1173,6 @@ impl TiKvServer { } let importer = Arc::new(importer); - let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( - node.id(), - self.engines.as_ref().unwrap().engines.kv.clone(), - importer.clone(), - self.proxy_config.raft_store.snap_handle_pool_size, - ); - tiflash_ob.register_to(self.coprocessor_host.as_mut().unwrap()); - let check_leader_runner = CheckLeaderRunner::new( engines.store_meta.clone(), self.coprocessor_host.clone().unwrap(), @@ -1215,6 +1207,16 @@ impl TiKvServer { health_service, ) .unwrap_or_else(|e| fatal!("failed to create server: {}", e)); + + let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( + node.id(), + self.engines.as_ref().unwrap().engines.kv.clone(), + importer.clone(), + self.proxy_config.raft_store.snap_handle_pool_size, + server.transport().clone(), + ); + tiflash_ob.register_to(self.coprocessor_host.as_mut().unwrap()); + cfg_controller.register( tikv::config::Module::Server, Box::new(ServerConfigManager::new( diff --git a/proxy_tests/proxy/region.rs b/proxy_tests/proxy/region.rs index a6d5cb888f4..281bba28b14 100644 --- a/proxy_tests/proxy/region.rs +++ b/proxy_tests/proxy/region.rs @@ -699,8 +699,15 @@ fn test_fast_add_peer2() { let (mut cluster, pd_client) = new_mock_cluster(0, 2); fail::cfg("on_pre_persist_with_finish", "return").unwrap(); disable_auto_gen_compact_log(&mut cluster); + // Disable auto generate peer. + pd_client.disable_default_operator(); let _ = cluster.run_conf_change(); - pd_client.add_peer(1, new_peer(2, 2)); + + // If we don't write here, we will have the first MsgAppend with (6,6), which + // will cause "fast-forwarded commit to snapshot". 
+ cluster.must_put(b"k0", b"v0"); + + pd_client.must_add_peer(1, new_learner_peer(2, 2)); std::thread::sleep(std::time::Duration::from_millis(1000)); cluster.must_put(b"k1", b"v1"); diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index 0230174fb42..6c13418bee9 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -1047,6 +1047,8 @@ where pub fn send(&mut self, msg: RaftMessage) -> result::Result<(), DiscardReason> { let store_id = msg.get_to_peer().store_id; let grpc_raft_conn_num = self.builder.cfg.value().grpc_raft_conn_num as u64; + + tikv_util::info!("!!!!! Client send {:?}", store_id); let conn_id = if grpc_raft_conn_num == 1 { 0 } else { From 2efbf19d96eb50fd864d3bb3e2946fb07bc178d4 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Tue, 6 Dec 2022 22:16:03 +0800 Subject: [PATCH 004/115] it can work Signed-off-by: CalvinNeo --- .../raftstore/src/coprocessor/dispatcher.rs | 7 + components/raftstore/src/coprocessor/mod.rs | 2 + components/raftstore/src/router.rs | 1 + components/raftstore/src/store/fsm/peer.rs | 2 +- components/raftstore/src/store/fsm/store.rs | 3 + .../raftstore/src/store/peer_storage.rs | 4 + components/raftstore/src/store/snap.rs | 29 +- components/test_pd_client/src/pd.rs | 8 +- components/test_raftstore/src/router.rs | 2 + engine_store_ffi/src/interfaces.rs | 9 +- engine_store_ffi/src/lib.rs | 11 + engine_store_ffi/src/observer.rs | 306 +++++++++++++----- new-mock-engine-store/src/lib.rs | 174 +++++++++- new-mock-engine-store/src/node.rs | 2 + new-mock-engine-store/src/server.rs | 2 + proxy_server/src/run.rs | 2 + proxy_tests/Cargo.toml | 2 +- proxy_tests/proxy/proxy.rs | 32 +- proxy_tests/proxy/region.rs | 82 +---- .../ffi/src/RaftStoreProxyFFI/@version | 2 +- .../ffi/src/RaftStoreProxyFFI/ProxyFFI.h | 2 + 21 files changed, 488 insertions(+), 196 deletions(-) diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 97a141aa97b..69ebfa7b385 
100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -680,6 +680,13 @@ impl CoprocessorHost { false } + pub fn on_peer_created(&self, region_id: u64) { + for observer in &self.registry.region_change_observers { + let observer = observer.observer.inner(); + observer.on_peer_created(region_id) + } + } + pub fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index f2b9d0b9364..70427df9922 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -332,6 +332,8 @@ pub trait RegionChangeObserver: Coprocessor { fn should_skip_raft_message(&self, _: &RaftMessage) -> bool { false } + + fn on_peer_created(&self, _: u64) {} } #[derive(Clone, Debug, Default)] diff --git a/components/raftstore/src/router.rs b/components/raftstore/src/router.rs index 1ded8be3886..4e1eb4fe2f1 100644 --- a/components/raftstore/src/router.rs +++ b/components/raftstore/src/router.rs @@ -267,6 +267,7 @@ pub fn handle_send_error(region_id: u64, e: TrySendError) -> RaftStoreErro impl RaftStoreRouter for RaftRouter { fn send_raft_msg(&self, msg: RaftMessage) -> RaftStoreResult<()> { + tikv_util::debug!("!!!!! RaftStoreRouter::send_raft_msg"); let region_id = msg.get_region_id(); self.send_raft_message(msg) .map_err(|e| handle_send_error(region_id, e)) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 011c804c09a..2cda4403d6f 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2443,7 +2443,7 @@ where "to_peer_id" => msg.get_to_peer().get_id(), ); - tikv_util::debug!("!!!!! on_raft_message after check 0"; + tikv_util::debug!("!!!!! 
on_raft_message after check 0 {:?}", msg.get_message().get_msg_type(); "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id(), "self.fsm.stopped" => self.fsm.stopped, diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index b39656f9d3f..4de2f05c9cf 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -1813,6 +1813,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER if local_state.get_state() != PeerState::Tombstone { // Maybe split, but not registered yet. if !util::is_first_message(msg.get_message()) { + debug!("!!!!! find RegionNotRegistered {:?}", msg); self.ctx .raft_metrics .message_dropped @@ -2232,6 +2233,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER target.clone(), )?; + self.ctx.coprocessor_host.on_peer_created(region_id); + // WARNING: The checking code must be above this line. // Now all checking passed diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 0d10b1f36cf..61bf22dcd97 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -206,6 +206,10 @@ fn init_apply_state( state.set_index(RAFT_INIT_LOG_INDEX); state.set_term(RAFT_INIT_LOG_TERM); } + debug!( + "!!!!! 
init_raft_state {}", + util::is_region_initialized(region) + ); apply_state } }, diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index cf09671b4f7..e0d0694e83c 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -206,7 +206,7 @@ fn retry_delete_snapshot(mgr: &SnapManagerCore, key: &SnapKey, snap: &Snapshot) false } -fn gen_snapshot_meta(cf_files: &[CfFile], for_balance: bool) -> RaftStoreResult { +pub fn gen_snapshot_meta(cf_files: &[CfFile], for_balance: bool) -> RaftStoreResult { let mut meta = Vec::with_capacity(cf_files.len()); for cf_file in cf_files { if !SNAPSHOT_CFS.iter().any(|cf| cf_file.cf == *cf) { @@ -418,8 +418,8 @@ impl CfFile { } } -#[derive(Default)] -struct MetaFile { +#[derive(Default, Debug)] +pub struct MetaFile { pub meta: Option, pub path: PathBuf, pub file: Option, @@ -432,10 +432,10 @@ pub struct Snapshot { key: SnapKey, display_path: String, dir_path: PathBuf, - cf_files: Vec, + pub cf_files: Vec, cf_index: usize, cf_file_index: usize, - meta_file: MetaFile, + pub meta_file: MetaFile, hold_tmp_files: bool, mgr: SnapManagerCore, @@ -644,6 +644,7 @@ impl Snapshot { // new file at the temporary meta file path, so that all other try will fail. fn init_for_building(&mut self) -> RaftStoreResult<()> { if self.exists() { + debug!("!!!!! init_for_building exists"); return Ok(()); } let file = OpenOptions::new() @@ -812,7 +813,7 @@ impl Snapshot { } // Only called in `do_build`. 
- fn save_meta_file(&mut self) -> RaftStoreResult<()> { + pub fn save_meta_file(&mut self) -> RaftStoreResult<()> { let v = box_try!(self.meta_file.meta.as_ref().unwrap().write_to_bytes()); if let Some(mut f) = self.meta_file.file.take() { // `meta_file` could be None for this case: in `init_for_building` the snapshot @@ -873,6 +874,10 @@ impl Snapshot { for (cf_enum, cf) in SNAPSHOT_CFS_ENUM_PAIR { self.switch_to_cf_file(cf)?; let cf_file = &mut self.cf_files[self.cf_index]; + info!( + "!!!!! buuild {:?} {} {} {:?}", + cf_file.path, cf_file.file_prefix, cf_file.file_suffix, cf_file.file_names + ); let cf_stat = if plain_file_used(cf_file.cf) { let key_mgr = self.mgr.encryption_key_manager.as_ref(); snap_io::build_plain_cf_file::(cf_file, key_mgr, kv_snap, &begin_key, &end_key)? @@ -1112,6 +1117,12 @@ impl Snapshot { pub fn exists(&self) -> bool { self.cf_files.iter().all(|cf_file| { + debug!( + "!!!!! copy_snapshot exists cf_file.size {:?} cf_file.file_paths() {:?} meta {:?}", + cf_file.size, + cf_file.file_paths(), + self.meta_file.path + ); cf_file.size.is_empty() || (cf_file .file_paths() @@ -1536,6 +1547,12 @@ impl SnapManager { Ok(Box::new(f)) } + pub fn get_empty_snapshot_for_building(&self, key: &SnapKey) -> RaftStoreResult> { + let base = &self.core.base; + let f = Snapshot::new_for_building(base, key, &self.core)?; + Ok(Box::new(f)) + } + pub fn get_snapshot_for_gc( &self, key: &SnapKey, diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index b6a78abc799..8ea8a52bfbd 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -739,12 +739,8 @@ impl PdCluster { let operator = operator?; debug!( - "[region {}] schedule {:?} to {:?}, region: {:?} {:?}", - region_id, - operator, - leader, - region, - std::backtrace::Backtrace::capture(), + "[region {}] schedule {:?} to {:?}, region: {:?}", + region_id, operator, leader, region, ); let mut resp = 
operator.make_region_heartbeat_response(region.get_id(), self); diff --git a/components/test_raftstore/src/router.rs b/components/test_raftstore/src/router.rs index 3b6b1e962c3..7fd978f65a1 100644 --- a/components/test_raftstore/src/router.rs +++ b/components/test_raftstore/src/router.rs @@ -58,6 +58,7 @@ impl ProposalRouter for MockRaftStoreRouter { impl CasualRouter for MockRaftStoreRouter { fn send(&self, region_id: u64, msg: CasualMessage) -> RaftStoreResult<()> { + debug!("!!!!! MockRaftStoreRouter"); let mut senders = self.senders.lock().unwrap(); if let Some(tx) = senders.get_mut(®ion_id) { tx.try_send(PeerMsg::CasualMessage(msg)) @@ -74,6 +75,7 @@ impl SignificantRouter for MockRaftStoreRouter { region_id: u64, msg: SignificantMsg, ) -> RaftStoreResult<()> { + debug!("!!!!! MockRaftStoreRouter"); let mut senders = self.senders.lock().unwrap(); if let Some(tx) = senders.get_mut(®ion_id) { tx.force_send(PeerMsg::SignificantMsg(msg)).unwrap(); diff --git a/engine_store_ffi/src/interfaces.rs b/engine_store_ffi/src/interfaces.rs index c7633f6010c..c43b7b3be83 100644 --- a/engine_store_ffi/src/interfaces.rs +++ b/engine_store_ffi/src/interfaces.rs @@ -451,8 +451,15 @@ pub mod root { leader_safe_ts: u64, ), >, + pub fn_debug_func: ::std::option::Option< + unsafe extern "C" fn( + arg1: *mut root::DB::EngineStoreServerWrap, + type_: u64, + arg2: root::DB::RawVoidPtr, + ) -> root::DB::RawVoidPtr, + >, } - pub const RAFT_STORE_PROXY_VERSION: u64 = 15776819379826780689; + pub const RAFT_STORE_PROXY_VERSION: u64 = 4624446451501389788; pub const RAFT_STORE_PROXY_MAGIC_NUMBER: u32 = 324508639; } } diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index c32b3ae89d7..cfd1e44ce15 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -1158,6 +1158,11 @@ impl EngineStoreServerHelper { ) } } + + pub fn debug_func(&self, debug_type: u64, ptr: RawVoidPtr) -> RawVoidPtr { + debug_assert!(self.fn_debug_func.is_some()); + unsafe { 
(self.fn_debug_func.into_inner())(self.inner, debug_type, ptr) } + } } #[allow(clippy::clone_on_copy)] @@ -1246,3 +1251,9 @@ pub unsafe extern "C" fn ffi_poll_timer_task(task_ptr: RawVoidPtr, waker: RawVoi 0 } } + +pub const USE_LEADER_FOR_REGION: u64 = 10; + +pub struct DebugStruct_UseLeaderForRegion { + pub region_id: u64, +} diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index a8ec8186a4e..edfbaa215b7 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -1,15 +1,19 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + cell::Cell, collections::hash_map::Entry as MapEntry, ops::DerefMut, path::PathBuf, str::FromStr, - sync::{atomic::Ordering, mpsc, Arc, Mutex, RwLock}, + sync::{ + atomic::{AtomicBool, Ordering}, + mpsc, Arc, Mutex, RwLock, + }, }; use collections::HashMap; use engine_tiflash::FsStatsExt; -use engine_traits::SstMetaInfo; +use engine_traits::{RaftEngine, SstMetaInfo}; use kvproto::{ metapb::Region, raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, CmdType, RaftCmdRequest}, @@ -25,8 +29,9 @@ use raftstore::{ PdTaskObserver, QueryObserver, RegionChangeEvent, RegionChangeObserver, RegionState, StoreSizeInfo, UpdateSafeTsObserver, }, - store, - store::{check_sst_for_ingestion, snap::plain_file_used, SnapKey, Transport}, + store::{ + self, check_sst_for_ingestion, snap::plain_file_used, SnapKey, SnapManager, Transport, + }, }; use sst_importer::SstImporter; use tikv_util::{box_err, debug, error, info, warn}; @@ -91,19 +96,21 @@ impl PrehandleTask { unsafe impl Send for PrehandleTask {} unsafe impl Sync for PrehandleTask {} -const CACHED_REGION_INFO_SLOT_COUNT: usize = 128; +const CACHED_REGION_INFO_SLOT_COUNT: usize = 256; #[derive(Debug, Default)] pub struct CachedRegionInfo { - pub inited: bool, + pub replicated_or_created: AtomicBool, + pub inited: AtomicBool, } pub type CachedRegionInfoMap = HashMap>; -pub struct TiFlashObserver { +pub struct 
TiFlashObserver { pub store_id: u64, pub engine_store_server_helper: &'static EngineStoreServerHelper, pub engine: TiFlashEngine, + pub raft_engine: ER, pub sst_importer: Arc, pub pre_handle_snapshot_ctx: Arc>, pub snap_handle_pool_size: usize, @@ -112,14 +119,16 @@ pub struct TiFlashObserver { pub cached_region_info: Arc>>, // TODO should we use a Mutex here? pub trans: Arc>, + pub snap_mgr: Arc, } -impl Clone for TiFlashObserver { +impl Clone for TiFlashObserver { fn clone(&self) -> Self { TiFlashObserver { store_id: self.store_id, engine_store_server_helper: self.engine_store_server_helper, engine: self.engine.clone(), + raft_engine: self.raft_engine.clone(), sst_importer: self.sst_importer.clone(), pre_handle_snapshot_ctx: self.pre_handle_snapshot_ctx.clone(), snap_handle_pool_size: self.snap_handle_pool_size, @@ -127,6 +136,7 @@ impl Clone for TiFlashObserver { pending_delete_ssts: self.pending_delete_ssts.clone(), cached_region_info: self.cached_region_info.clone(), trans: self.trans.clone(), + snap_mgr: self.snap_mgr.clone(), } } } @@ -151,7 +161,7 @@ fn unhash_u64(mut i: u64) -> u64 { i ^ (i >> 30) ^ (i >> 60) } -impl TiFlashObserver { +impl TiFlashObserver { #[inline] fn slot_index(id: u64) -> usize { debug_assert!(CACHED_REGION_INFO_SLOT_COUNT.is_power_of_two()); @@ -179,17 +189,20 @@ impl TiFlashObserver { } let region_id = msg.get_region_id(); let mut is_first = false; - // Can use immutable version. 
- self.access_cached_region_info_mut( - region_id, - |info: MapEntry>| match info { - MapEntry::Occupied(o) => { - is_first = !o.get().inited; + let mut is_replicated = false; + let f = |info: MapEntry>| { + match info { + MapEntry::Occupied(mut o) => { + is_first = !o.get().inited.load(Ordering::SeqCst); + // TODO include create + is_replicated = o.get().replicated_or_created.load(Ordering::SeqCst); if is_first { + // TODO Maybe too much printing info!("fast path: ongoing {}:{}, skip MsgAppend", self.store_id, region_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), "inner_msg" => ?inner_msg, + "is_replicated" => is_replicated, ); } } @@ -202,24 +215,131 @@ impl TiFlashObserver { v.insert(Arc::new(CachedRegionInfo::default())); is_first = true; } - }, - ) - .unwrap(); - let mut response = RaftMessage::default(); - use kvproto::metapb::RegionEpoch; - let mut epoch = RegionEpoch::default(); - epoch.set_conf_ver(2); - epoch.set_version(1); - response.set_region_epoch(epoch.clone()); - response.set_region_id(1); - response.set_from_peer(msg.get_from_peer().clone()); - response.set_to_peer(msg.get_to_peer().clone()); - response - .mut_message() - .set_msg_type(MessageType::MsgSnapshot); - response.mut_message().set_term(inner_msg.get_term()); - let snapshot: &mut eraftpb::Snapshot = response.mut_message().mut_snapshot(); + } + }; + // Can use immutable version. 
+ self.access_cached_region_info_mut(region_id, f).unwrap(); + + if is_first { + info!("fast path: normal MsgAppend of {}:{}", self.store_id, region_id; + ); + return false; + } + + use std::io::Write; + + use engine_traits::{Peekable, CF_RAFT}; + use into_other::into_other; + use kvproto::raft_serverpb::{RaftApplyState, RegionLocalState}; + use raftstore::store::snap::SnapEntry; + use tikv_util::defer; + { + if !is_replicated { + info!("fast path: ongoing {}:{}, wait replicating peer", self.store_id, region_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, + ); + return true; + } + } + + info!("fast path: ongoing {}:{}, start load", self.store_id, region_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + ); + // Feed data + // #[cfg(any(test, feature = "testexport"))] + { + let mut s = crate::DebugStruct_UseLeaderForRegion { region_id }; + self.engine_store_server_helper.debug_func( + crate::USE_LEADER_FOR_REGION, + &s as *const crate::DebugStruct_UseLeaderForRegion as crate::RawVoidPtr, + ); + } + + // Build snapshot by get_snapshot_for_building + let (mut snap, key, apply_state, region_state) = { + let apply_state: RaftApplyState = self + .engine + .get_msg_cf(CF_RAFT, &keys::apply_state_key(region_id)) + .unwrap() + .unwrap(); + let region_state: RegionLocalState = self + .engine + .get_msg_cf(CF_RAFT, &keys::region_state_key(region_id)) + .unwrap() + .unwrap(); + let key = SnapKey::new( + region_id, + apply_state.get_commit_term(), // TODO apply index term + apply_state.get_applied_index(), + ); + self.snap_mgr.register(key.clone(), SnapEntry::Generating); + defer!(self.snap_mgr.deregister(&key, &SnapEntry::Generating)); + let snapshot = self.snap_mgr.get_empty_snapshot_for_building(&key).unwrap(); + + // let base = &self.snap_mgr.core.base; + // let f = Snapshot::new_for_building(base, key, &self.snap_mgr.core).unwrap(); + 
(snapshot, key.clone(), apply_state, region_state) + }; + + debug!( + "!!!!! snap 1 {:?} {:?} {}", + snap, + snap.meta_file, + snap.cf_files.len() + ); + // Build snapshot by do_snapshot + let mut snapshot: eraftpb::Snapshot = Default::default(); let metadata: &mut eraftpb::SnapshotMetadata = snapshot.mut_metadata(); + let mut snap_data = kvproto::raft_serverpb::RaftSnapshotData::default(); + { + // Data + for (cf_enum, cf) in raftstore::store::snap::SNAPSHOT_CFS_ENUM_PAIR { + let cf_index = snap.cf_files.iter().position(|x| &x.cf == cf).unwrap(); + let cf_file = &mut snap.cf_files[cf_index]; + let mut path = cf_file.path.clone(); + path.push(cf_file.file_prefix.clone()); + path.set_extension("sst"); + debug!("!!!! snap g {:?}", path); + let mut file = std::fs::File::create(path.as_path()).unwrap(); + // let mut file = std::fs::create_dir(); + } + // Meta + snap.meta_file.meta = + Some(raftstore::store::snap::gen_snapshot_meta(&snap.cf_files[..], true).unwrap()); + { + let v = snap + .meta_file + .meta + .as_ref() + .unwrap() + .write_to_bytes() + .unwrap(); + let mut f = std::fs::File::create(snap.meta_file.path.as_path()).unwrap(); + f.write_all(&v[..]).unwrap(); + f.flush().unwrap(); + f.sync_all().unwrap(); + } + + debug!( + "!!!!! snap 2 {:?} {:?} {}", + snap.meta_file.meta, + snap.meta_file.file, + snap.cf_files.len() + ); + + // snap.save_meta_file().unwrap(); + // let mut file = std::fs::File::create(path.as_path()).unwrap(); + snap_data.set_region(region_state.get_region().clone()); + + snap_data.set_file_size(0); + let SNAPSHOT_VERSION = 2; + snap_data.set_version(SNAPSHOT_VERSION); + snap_data.set_meta(snap.meta_file.meta.as_ref().unwrap().clone()); + } + // Compose snapshot // TODO The rest is test, please remove it after we can fetch the real data. 
metadata @@ -234,54 +354,49 @@ impl TiFlashObserver { metadata.set_index(inner_msg.get_index()); metadata.set_term(inner_msg.get_term()); - let mut snap_data = kvproto::raft_serverpb::RaftSnapshotData::default(); - let mut region = kvproto::metapb::Region::default(); - region.set_id(1); - region.set_region_epoch(epoch); - use kvproto::metapb::{Peer, PeerRole::Learner}; - - { - let mut peer = Peer::default(); - peer.set_id(1); - peer.set_store_id(1); - region.mut_peers().push(peer); - let mut peer = Peer::default(); - peer.set_id(2); - peer.set_store_id(2); - peer.set_role(Learner); - region.mut_peers().push(peer); - snap_data.set_region(region); - snap_data.set_file_size(0); - snap_data.set_version(2); - } - // snap_data.mut_meta().set_for_witness(true); - for cf in raftstore::store::snap::SNAPSHOT_CFS { - let mut cf_file = kvproto::raft_serverpb::SnapshotCfFile::default(); - let path = format!("/tmp/loop_{}.sst", cf); - let mut file = std::fs::File::create(path.as_str()).unwrap(); - cf_file.set_cf(cf.to_string()); - cf_file.set_size(0); - cf_file.set_checksum(0); - snap_data.mut_meta().mut_cf_files().push(cf_file); - } + // for cf in raftstore::store::snap::SNAPSHOT_CFS { + // let mut cf_file = kvproto::raft_serverpb::SnapshotCfFile::default(); + // let path = format!("/tmp/loop_{}.sst", cf); + // let mut file = std::fs::File::create(path.as_str()).unwrap(); + // cf_file.set_cf(cf.to_string()); + // cf_file.set_size(0); + // cf_file.set_checksum(0); + // snap_data.mut_meta().mut_cf_files().push(cf_file); + // } snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); + + // Send reponse + let mut response = RaftMessage::default(); + use kvproto::metapb::RegionEpoch; + let mut epoch = region_state.get_region().get_region_epoch(); + response.set_region_epoch(epoch.clone()); + response.set_region_id(region_id); + response.set_from_peer(msg.get_from_peer().clone()); + response.set_to_peer(msg.get_to_peer().clone()); + response + .mut_message() + 
.set_msg_type(MessageType::MsgSnapshot); + response.mut_message().set_term(inner_msg.get_term()); + response.mut_message().set_snapshot(snapshot); debug!("!!!!! send response {:?} data {:?}", response, snap_data); - self.trans.lock().unwrap().send(response).unwrap(); - debug!("!!!!! send response FINISH"); + let res = self.trans.lock().unwrap().send(response); + debug!("!!!!! send response FINISH {:?}", res); is_first } } -impl TiFlashObserver { +impl TiFlashObserver { pub fn new( store_id: u64, engine: engine_tiflash::RocksEngine, + raft_engine: ER, sst_importer: Arc, snap_handle_pool_size: usize, trans: T, + snap_mgr: SnapManager, ) -> Self { let engine_store_server_helper = gen_engine_store_server_helper(engine.engine_store_server_helper); @@ -297,6 +412,7 @@ impl TiFlashObserver { store_id, engine_store_server_helper, engine, + raft_engine, sst_importer, pre_handle_snapshot_ctx: Arc::new(Mutex::new(PrehandleContext::default())), snap_handle_pool_size, @@ -304,6 +420,7 @@ impl TiFlashObserver { pending_delete_ssts: Arc::new(RwLock::new(vec![])), cached_region_info: Arc::new(cached_region_info), trans: Arc::new(Mutex::new(trans)), + snap_mgr: Arc::new(snap_mgr), } } @@ -407,14 +524,14 @@ impl TiFlashObserver { } } -impl Coprocessor for TiFlashObserver { +impl Coprocessor for TiFlashObserver { fn stop(&self) { info!("shutdown tiflash observer"; "store_id" => self.store_id); self.apply_snap_pool.as_ref().unwrap().shutdown(); } } -impl AdminObserver for TiFlashObserver { +impl AdminObserver for TiFlashObserver { fn pre_exec_admin( &self, ob_ctx: &mut ObserverContext<'_>, @@ -577,7 +694,7 @@ impl AdminObserver for TiFlashObserver { } } -impl QueryObserver for TiFlashObserver { +impl QueryObserver for TiFlashObserver { fn on_empty_cmd(&self, ob_ctx: &mut ObserverContext<'_>, index: u64, term: u64) { fail::fail_point!("on_empty_cmd_normal", |_| {}); debug!("encounter empty cmd, maybe due to leadership change"; @@ -745,7 +862,7 @@ impl QueryObserver for TiFlashObserver 
{ } } -impl UpdateSafeTsObserver for TiFlashObserver { +impl UpdateSafeTsObserver for TiFlashObserver { fn on_update_safe_ts(&self, region_id: u64, self_safe_ts: u64, leader_safe_ts: u64) { self.engine_store_server_helper.handle_safe_ts_update( region_id, @@ -755,7 +872,7 @@ impl UpdateSafeTsObserver for TiFlashObserver { } } -impl RegionChangeObserver for TiFlashObserver { +impl RegionChangeObserver for TiFlashObserver { fn on_region_changed( &self, ob_ctx: &mut ObserverContext<'_>, @@ -824,9 +941,28 @@ impl RegionChangeObserver for TiFlashObserver { } false } + + fn on_peer_created(&self, region_id: u64) { + let mut f = |info: MapEntry>| { + debug!("!!!! on_peer_created"); + match info { + MapEntry::Occupied(mut o) => { + o.get_mut() + .replicated_or_created + .store(true, Ordering::SeqCst); + } + MapEntry::Vacant(v) => { + let mut c = CachedRegionInfo::default(); + c.replicated_or_created.store(true, Ordering::SeqCst); + v.insert(Arc::new(c)); + } + } + }; + self.access_cached_region_info_mut(region_id, f).unwrap(); + } } -impl PdTaskObserver for TiFlashObserver { +impl PdTaskObserver for TiFlashObserver { fn on_compute_engine_size(&self, store_size: &mut Option) { let stats = self.engine_store_server_helper.handle_compute_store_stats(); let _ = store_size.insert(StoreSizeInfo { @@ -897,7 +1033,7 @@ fn pre_handle_snapshot_impl( PtrWrapper(ptr) } -impl ApplySnapshotObserver for TiFlashObserver { +impl ApplySnapshotObserver for TiFlashObserver { #[allow(clippy::single_match)] fn pre_apply_snapshot( &self, @@ -987,6 +1123,29 @@ impl ApplySnapshotObserver for TiFlashObserver { "snap_key" => ?snap_key, "region" => ?ob_ctx.region(), ); + let region_id = ob_ctx.region().get_id(); + let mut should_skip = false; + self.access_cached_region_info_mut( + region_id, + |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + if !o.get().inited.load(Ordering::SeqCst) { + info!("fast path: first snapshot applied {}:{}, recover MsgAppend", self.store_id, region_id; + 
"snap_key" => ?snap_key, + ); + } + should_skip = o.get().inited.load(Ordering::SeqCst); + o.get_mut().inited.store(true, Ordering::SeqCst); + } + MapEntry::Vacant(v) => { + panic!("unknown snapshot!"); + } + }, + ) + .unwrap(); + if should_skip { + return; + } let snap = match snap { None => return, Some(s) => s, @@ -1056,10 +1215,11 @@ impl ApplySnapshotObserver for TiFlashObserver { self.engine_store_server_helper .apply_pre_handled_snapshot(ptr.0); info!("apply snapshot finished"; - "peer_id" => ?snap_key, + "snap_key" => ?snap_key, "region" => ?ob_ctx.region(), "pending" => self.engine.pending_applies_count.load(Ordering::SeqCst), ); + let region_id = ob_ctx.region().get_id(); } } diff --git a/new-mock-engine-store/src/lib.rs b/new-mock-engine-store/src/lib.rs index 4dec9d67607..87d48ef22cd 100644 --- a/new-mock-engine-store/src/lib.rs +++ b/new-mock-engine-store/src/lib.rs @@ -30,7 +30,7 @@ pub use mock_cluster::{ use protobuf::Message; use tikv_util::{debug, error, info, warn}; -use crate::{config::MockConfig, server::ServerCluster}; +use crate::{config::MockConfig, node::NodeCluster, server::ServerCluster}; pub mod config; pub mod mock_cluster; @@ -593,6 +593,7 @@ impl EngineStoreServerWrap { "node_id"=>node_id, ); panic!("observe obsolete write index"); + // TODO this can happen since we recover from another peer // return ffi_interfaces::EngineStoreApplyRes::None; } for i in 0..cmds.len { @@ -689,6 +690,7 @@ pub fn gen_engine_store_server_helper( fn_set_store: None, fn_set_pb_msg_by_bytes: Some(ffi_set_pb_msg_by_bytes), fn_handle_safe_ts_update: Some(ffi_handle_safe_ts_update), + fn_debug_func: Some(ffi_debug_func), } } @@ -1221,3 +1223,173 @@ unsafe extern "C" fn ffi_handle_compute_store_stats( engine_keys_read: 0, } } + +use engine_store_ffi::{DebugStruct_UseLeaderForRegion, USE_LEADER_FOR_REGION}; + +unsafe extern "C" fn ffi_debug_func( + arg1: *mut ffi_interfaces::EngineStoreServerWrap, + debug_type: u64, + ptr: ffi_interfaces::RawVoidPtr, +) -> 
ffi_interfaces::RawVoidPtr { + let store = into_engine_store_server_wrap(arg1); + if debug_type == USE_LEADER_FOR_REGION { + let s = &*(ptr as *const DebugStruct_UseLeaderForRegion); + let region_id = s.region_id; + let cluster = &*(store.cluster_ptr as *const mock_cluster::Cluster); + let lock = cluster.ffi_helper_set.lock().unwrap(); + let source_server = &lock.get(&1).unwrap().engine_store_server; + let source_engines = &source_server.engines.clone().unwrap(); + let source_region = source_server.kvstore.get(®ion_id).unwrap(); + let new_region_meta = get_region_local_state(&source_engines.kv.rocks, region_id) + .get_region() + .clone(); + let new_region = make_new_region( + Some(new_region_meta.clone()), + Some((*store.engine_store_server).id), + ); + (*store.engine_store_server) + .kvstore + .insert(region_id, Box::new(new_region)); + let target_engines = (*store.engine_store_server).engines.clone().unwrap(); + let target_region = (*store.engine_store_server) + .kvstore + .get_mut(®ion_id) + .unwrap(); + debug!("recover from leader"; "region_id" => region_id, "region" => ?new_region_meta); + copy_data_from( + source_engines, + &target_engines, + &source_region, + target_region, + ) + .unwrap(); + copy_meta_from( + source_engines, + &target_engines, + &source_region, + target_region, + new_region_meta, + ) + .unwrap(); + } + std::ptr::null_mut() +} + +use engine_store_ffi::RawVoidPtr; +use engine_traits::{KvEngine, Mutable, RaftEngine, RaftEngineDebug, RaftLogBatch, WriteBatch}; +use kvproto::raft_serverpb::RaftLocalState; +use tikv_util::box_try; + +// TODO Need refactor if moved to raft-engine +pub fn get_region_local_state( + engine: &engine_rocks::RocksEngine, + region_id: u64, +) -> RegionLocalState { + let region_state_key = keys::region_state_key(region_id); + let region_state = match engine.get_msg_cf::(CF_RAFT, ®ion_state_key) { + Ok(Some(s)) => s, + _ => unreachable!(), + }; + region_state +} + +// TODO Need refactor if moved to raft-engine +pub fn 
get_apply_state(engine: &engine_rocks::RocksEngine, region_id: u64) -> RaftApplyState { + let apply_state_key = keys::apply_state_key(region_id); + let apply_state = match engine.get_msg_cf::(CF_RAFT, &apply_state_key) { + Ok(Some(s)) => s, + _ => unreachable!(), + }; + apply_state +} + +pub fn get_raft_local_state( + raft_engine: &ER, + region_id: u64, +) -> RaftLocalState { + raft_engine.get_raft_state(region_id).unwrap().unwrap() +} + +pub fn copy_meta_from( + source_engines: &Engines< + impl KvEngine, + impl RaftEngine + engine_traits::Peekable + RaftEngineDebug, + >, + target_engines: &Engines, + source: &Box, + target: &mut Box, + new_region_meta: kvproto::metapb::Region, +) -> raftstore::Result<()> { + let region_id = source.region.get_id(); + + let mut wb = target_engines.kv.write_batch(); + let mut raft_wb = target_engines.raft.log_batch(1024); + + // Can't copy this key, otherwise will cause a bootstrap. + // box_try!(wb.put_msg(keys::PREPARE_BOOTSTRAP_KEY, &source.region)); + + // region local state + let mut state = RegionLocalState::default(); + state.set_region(new_region_meta); + box_try!(wb.put_msg_cf(CF_RAFT, &keys::region_state_key(region_id), &state)); + + // apply state + { + let key = keys::apply_state_key(region_id); + let apply_state: RaftApplyState = source_engines + .kv + .get_msg_cf(CF_RAFT, &key) + .unwrap() + .unwrap(); + wb.put_msg_cf(CF_RAFT, &keys::apply_state_key(region_id), &apply_state)?; + target.apply_state = apply_state.clone(); + target.applied_term = source.applied_term; + } + + wb.write()?; + target_engines.sync_kv()?; + + // raft state + { + let key = keys::raft_state_key(region_id); + let raft_state = source_engines + .raft + .get_msg_cf(CF_DEFAULT, &key) + .unwrap() + .unwrap(); + raft_wb.put_raft_state(region_id, &raft_state)?; + }; + + // raft log + let mut entries: Vec = Default::default(); + source_engines + .raft + .scan_entries(region_id, |e| { + debug!("copy raft log"; "e" => ?e); + entries.push(e.clone()); + 
Ok(true) + }) + .unwrap(); + + raft_wb.append(region_id, entries)?; + box_try!(target_engines.raft.consume(&mut raft_wb, true)); + + Ok(()) +} + +pub fn copy_data_from( + source_engines: &Engines< + impl KvEngine, + impl RaftEngine + engine_traits::Peekable + RaftEngineDebug, + >, + target_engines: &Engines, + source: &Box, + target: &mut Box, +) -> raftstore::Result<()> { + for cf in 0..3 { + for (k, v) in &source.data[cf] { + write_kv_in_mem(target, cf, k.as_slice(), v.as_slice()); + } + } + Ok(()) +} diff --git a/new-mock-engine-store/src/node.rs b/new-mock-engine-store/src/node.rs index 9f25b72a14c..6b06ce9332e 100644 --- a/new-mock-engine-store/src/node.rs +++ b/new-mock-engine-store/src/node.rs @@ -318,9 +318,11 @@ impl Simulator for NodeCluster { let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( node_id, engines.kv.clone(), + engines.raft.clone(), importer.clone(), cfg.proxy_cfg.raft_store.snap_handle_pool_size, simulate_trans.clone(), + snap_mgr.clone(), ); tiflash_ob.register_to(&mut coprocessor_host); diff --git a/new-mock-engine-store/src/server.rs b/new-mock-engine-store/src/server.rs index 5f29f87f233..13f5889390c 100644 --- a/new-mock-engine-store/src/server.rs +++ b/new-mock-engine-store/src/server.rs @@ -558,9 +558,11 @@ impl ServerCluster { let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( node_id, engines.kv.clone(), + engines.raft.clone(), importer.clone(), cfg.proxy_cfg.raft_store.snap_handle_pool_size, simulate_trans.clone(), + snap_mgr.clone(), ); tiflash_ob.register_to(&mut coprocessor_host); diff --git a/proxy_server/src/run.rs b/proxy_server/src/run.rs index 5232bc677d3..f25c983727a 100644 --- a/proxy_server/src/run.rs +++ b/proxy_server/src/run.rs @@ -1211,9 +1211,11 @@ impl TiKvServer { let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( node.id(), self.engines.as_ref().unwrap().engines.kv.clone(), + self.engines.as_ref().unwrap().engines.raft.clone(), importer.clone(), 
self.proxy_config.raft_store.snap_handle_pool_size, server.transport().clone(), + snap_mgr.clone(), ); tiflash_ob.register_to(self.coprocessor_host.as_mut().unwrap()); diff --git a/proxy_tests/Cargo.toml b/proxy_tests/Cargo.toml index 2d878998fff..e9730c960c5 100644 --- a/proxy_tests/Cargo.toml +++ b/proxy_tests/Cargo.toml @@ -15,7 +15,7 @@ failpoints = ["fail/failpoints", "tikv/failpoints"] cloud-aws = ["external_storage_export/cloud-aws"] cloud-gcp = ["external_storage_export/cloud-gcp"] cloud-azure = ["external_storage_export/cloud-azure"] -testexport = ["raftstore/testexport", "tikv/testexport", "engine_tiflash/testexport"] +testexport = ["raftstore/testexport", "tikv/testexport", "engine_tiflash/testexport", "engine_store_ffi/testexport"] profiling = ["profiler/profiling"] test-engine-kv-rocksdb = [ diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index 4a11b8a269b..e80989ce2b7 100644 --- a/proxy_tests/proxy/proxy.rs +++ b/proxy_tests/proxy/proxy.rs @@ -27,7 +27,7 @@ pub use kvproto::{ }; pub use new_mock_engine_store::{ config::Config, - make_new_region, + get_apply_state, get_raft_local_state, get_region_local_state, make_new_region, mock_cluster::{new_put_cmd, new_request, FFIHelperSet}, must_get_equal, must_get_none, node::NodeCluster, @@ -47,36 +47,6 @@ pub use tikv_util::{ HandyRwLock, }; -// TODO Need refactor if moved to raft-engine -pub fn get_region_local_state( - engine: &engine_rocks::RocksEngine, - region_id: u64, -) -> RegionLocalState { - let region_state_key = keys::region_state_key(region_id); - let region_state = match engine.get_msg_cf::(CF_RAFT, ®ion_state_key) { - Ok(Some(s)) => s, - _ => unreachable!(), - }; - region_state -} - -// TODO Need refactor if moved to raft-engine -pub fn get_apply_state(engine: &engine_rocks::RocksEngine, region_id: u64) -> RaftApplyState { - let apply_state_key = keys::apply_state_key(region_id); - let apply_state = match engine.get_msg_cf::(CF_RAFT, &apply_state_key) { - Ok(Some(s)) => s, 
- _ => unreachable!(), - }; - apply_state -} - -pub fn get_raft_local_state( - raft_engine: &ER, - region_id: u64, -) -> RaftLocalState { - raft_engine.get_raft_state(region_id).unwrap().unwrap() -} - pub fn new_compute_hash_request() -> AdminRequest { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::ComputeHash); diff --git a/proxy_tests/proxy/region.rs b/proxy_tests/proxy/region.rs index 281bba28b14..6254efa428a 100644 --- a/proxy_tests/proxy/region.rs +++ b/proxy_tests/proxy/region.rs @@ -367,74 +367,10 @@ fn test_add_delayed_started_learner_by_joint() { cluster.shutdown(); } -pub fn copy_meta_from( - source_engines: &Engines< - impl KvEngine, - impl RaftEngine + engine_traits::Peekable + RaftEngineDebug, - >, - target_engines: &Engines, - source: &Box, - target: &mut Box, - new_region_meta: kvproto::metapb::Region, -) -> raftstore::Result<()> { - let region_id = source.region.get_id(); - - let mut wb = target_engines.kv.write_batch(); - let mut raft_wb = target_engines.raft.log_batch(1024); - - // box_try!(wb.put_msg(keys::PREPARE_BOOTSTRAP_KEY, &source.region)); - - // region local state - let mut state = RegionLocalState::default(); - state.set_region(new_region_meta); - box_try!(wb.put_msg_cf(CF_RAFT, &keys::region_state_key(region_id), &state)); - - // apply state - { - let key = keys::apply_state_key(region_id); - let apply_state: RaftApplyState = source_engines - .kv - .get_msg_cf(CF_RAFT, &key) - .unwrap() - .unwrap(); - wb.put_msg_cf(CF_RAFT, &keys::apply_state_key(region_id), &apply_state)?; - target.apply_state = apply_state.clone(); - target.applied_term = source.applied_term; - } - - wb.write()?; - target_engines.sync_kv()?; - - // raft state - { - let key = keys::raft_state_key(region_id); - let raft_state = source_engines - .raft - .get_msg_cf(CF_DEFAULT, &key) - .unwrap() - .unwrap(); - raft_wb.put_raft_state(region_id, &raft_state)?; - }; - - // raft log - let mut entries: Vec = Default::default(); - source_engines - .raft - 
.scan_entries(region_id, |e| { - debug!("copy raft log"; "e" => ?e); - entries.push(e.clone()); - Ok(true) - }) - .unwrap(); - - raft_wb.append(region_id, entries)?; - box_try!(target_engines.raft.consume(&mut raft_wb, true)); - - Ok(()) -} +use new_mock_engine_store::{copy_data_from, copy_meta_from}; fn recover_from_peer(cluster: &Cluster, from: u64, to: u64, region_id: u64) { - let source_region_1 = cluster + let source_region = cluster .ffi_helper_set .lock() .unwrap() @@ -446,7 +382,7 @@ fn recover_from_peer(cluster: &Cluster, from: u64, to: u64, region_ .unwrap() .clone(); - let mut new_region_meta = source_region_1.region.clone(); + let mut new_region_meta = source_region.region.clone(); new_region_meta.mut_peers().push(new_learner_peer(to, to)); // Copy all node `from`'s data to node `to` @@ -457,22 +393,18 @@ fn recover_from_peer(cluster: &Cluster, from: u64, to: u64, region_ let server = &mut ffi.engine_store_server; assert!(server.kvstore.get(®ion_id).is_none()); - let new_region = make_new_region(Some(source_region_1.region.clone()), Some(id)); + let new_region = make_new_region(Some(source_region.region.clone()), Some(id)); server .kvstore - .insert(source_region_1.region.get_id(), Box::new(new_region)); + .insert(source_region.region.get_id(), Box::new(new_region)); if let Some(region) = server.kvstore.get_mut(®ion_id) { - for cf in 0..3 { - for (k, v) in &source_region_1.data[cf] { - write_kv_in_mem(region, cf, k.as_slice(), v.as_slice()); - } - } let source_engines = cluster.get_engines(from); let target_engines = cluster.get_engines(to); + copy_data_from(source_engines, target_engines, &source_region, region).unwrap(); copy_meta_from( source_engines, target_engines, - &source_region_1, + &source_region, region, new_region_meta.clone(), ) diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version index 519af996bc4..58b337ebfc1 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version +++ 
b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version @@ -1,3 +1,3 @@ #pragma once #include -namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 15776819379826780689ull; } \ No newline at end of file +namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 4624446451501389788ull; } \ No newline at end of file diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h index 49b82c3704c..a5915fd5a54 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h @@ -215,5 +215,7 @@ struct EngineStoreServerHelper { void (*fn_handle_safe_ts_update)(EngineStoreServerWrap *, uint64_t region_id, uint64_t self_safe_ts, uint64_t leader_safe_ts); + RawVoidPtr (*fn_debug_func)(EngineStoreServerWrap *, uint64_t type, + RawVoidPtr); }; } // namespace DB From c3afc97b394dd94259a54513789fe9f329fcc864 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Thu, 8 Dec 2022 00:06:36 +0800 Subject: [PATCH 005/115] fix test bug Signed-off-by: CalvinNeo --- Makefile | 7 + components/raftstore/src/store/snap.rs | 5 + engine_store_ffi/src/interfaces.rs | 17 +- engine_store_ffi/src/lib.rs | 27 +- engine_store_ffi/src/observer.rs | 305 ++++++++++++------ new-mock-engine-store/src/lib.rs | 217 ++++++++----- new-mock-engine-store/src/mock_cluster.rs | 7 + new-mock-engine-store/src/node.rs | 1 + new-mock-engine-store/src/server.rs | 1 + proxy_server/src/config.rs | 5 + proxy_server/src/run.rs | 1 + proxy_tests/proxy/mod.rs | 1 + proxy_tests/proxy/proxy.rs | 38 ++- proxy_tests/proxy/region.rs | 41 --- .../ffi/src/RaftStoreProxyFFI/@version | 2 +- .../ffi/src/RaftStoreProxyFFI/ProxyFFI.h | 10 + 16 files changed, 449 insertions(+), 236 deletions(-) diff --git a/Makefile b/Makefile index 68a0606a3a3..d0dc1361605 100644 --- a/Makefile +++ b/Makefile @@ -213,6 +213,10 @@ pre-format: unset-override @rustup component add rustfmt @cargo install --force -q cargo-sort 
+pre-format-fast: unset-override + @rustup component add rustfmt + @cargo install -q cargo-sort + ci_fmt_check: M="fmt" ./proxy_scripts/ci_check.sh @@ -224,6 +228,9 @@ ci_test: gen_proxy_ffi: pre-format ./gen-proxy-ffi.sh +gen_proxy_ffi_fast: pre-format-fast + ./gen-proxy-ffi.sh + format: pre-format @cargo fmt @cargo sort -w ./Cargo.toml ./*/Cargo.toml components/*/Cargo.toml cmd/*/Cargo.toml >/dev/null diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index e0d0694e83c..68c7977da92 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1791,6 +1791,11 @@ impl SnapManagerCore { ); return false; } + debug!( + "!!!!! deletee snapshot {:?} {:?}", + key, + std::backtrace::Backtrace::capture() + ); snap.delete(); true } diff --git a/engine_store_ffi/src/interfaces.rs b/engine_store_ffi/src/interfaces.rs index c43b7b3be83..45d8f5d81a1 100644 --- a/engine_store_ffi/src/interfaces.rs +++ b/engine_store_ffi/src/interfaces.rs @@ -230,6 +230,15 @@ pub mod root { Error = 1, NotFound = 2, } + #[repr(u32)] + #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] + pub enum FastAddPeerRes { + Ok = 0, + OtherError = 1, + NoSuitable = 2, + BadData = 3, + FailedInject = 4, + } #[repr(C)] #[derive(Debug)] pub struct RaftStoreProxyFFIHelper { @@ -458,8 +467,14 @@ pub mod root { arg2: root::DB::RawVoidPtr, ) -> root::DB::RawVoidPtr, >, + pub fn_fast_add_peer: ::std::option::Option< + unsafe extern "C" fn( + arg1: *mut root::DB::EngineStoreServerWrap, + region_id: u64, + ) -> root::DB::FastAddPeerRes, + >, } - pub const RAFT_STORE_PROXY_VERSION: u64 = 4624446451501389788; + pub const RAFT_STORE_PROXY_VERSION: u64 = 13418513559228271669; pub const RAFT_STORE_PROXY_MAGIC_NUMBER: u32 = 324508639; } } diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index cfd1e44ce15..41c57fae8ef 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -32,9 +32,9 @@ pub use 
read_index_helper::ReadIndexClient; pub use self::interfaces::root::DB::{ BaseBuffView, ColumnFamilyType, CppStrVecView, EngineStoreApplyRes, EngineStoreServerHelper, - EngineStoreServerStatus, FileEncryptionRes, FsStats, HttpRequestRes, HttpRequestStatus, - KVGetStatus, RaftCmdHeader, RaftProxyStatus, RaftStoreProxyFFIHelper, RawCppPtr, - RawCppStringPtr, RawVoidPtr, SSTReaderPtr, StoreStats, WriteCmdType, WriteCmdsView, + EngineStoreServerStatus, FastAddPeerRes, FileEncryptionRes, FsStats, HttpRequestRes, + HttpRequestStatus, KVGetStatus, RaftCmdHeader, RaftProxyStatus, RaftStoreProxyFFIHelper, + RawCppPtr, RawCppStringPtr, RawVoidPtr, SSTReaderPtr, StoreStats, WriteCmdType, WriteCmdsView, }; use self::interfaces::root::DB::{ ConstRawVoidPtr, FileEncryptionInfoRaw, RaftStoreProxyPtr, RawCppPtrType, RawRustPtr, @@ -1163,6 +1163,11 @@ impl EngineStoreServerHelper { debug_assert!(self.fn_debug_func.is_some()); unsafe { (self.fn_debug_func.into_inner())(self.inner, debug_type, ptr) } } + + pub fn fast_add_peer(&self, region_id: u64) -> FastAddPeerRes { + debug_assert!(self.fn_fast_add_peer.is_some()); + unsafe { (self.fn_fast_add_peer.into_inner())(self.inner, region_id) } + } } #[allow(clippy::clone_on_copy)] @@ -1252,8 +1257,18 @@ pub unsafe extern "C" fn ffi_poll_timer_task(task_ptr: RawVoidPtr, waker: RawVoi } } -pub const USE_LEADER_FOR_REGION: u64 = 10; +use serde_derive::{Deserialize, Serialize}; +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(default)] +#[serde(rename_all = "kebab-case")] +pub struct EngineStoreConfig { + pub enable_fast_add_peer: bool, +} -pub struct DebugStruct_UseLeaderForRegion { - pub region_id: u64, +impl Default for EngineStoreConfig { + fn default() -> Self { + Self { + enable_fast_add_peer: false, + } + } } diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index edfbaa215b7..580ec9384b9 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -2,6 +2,7 
@@ use std::{ cell::Cell, collections::hash_map::Entry as MapEntry, + io::Write, ops::DerefMut, path::PathBuf, str::FromStr, @@ -13,11 +14,12 @@ use std::{ use collections::HashMap; use engine_tiflash::FsStatsExt; -use engine_traits::{RaftEngine, SstMetaInfo}; +use engine_traits::{Peekable, RaftEngine, SstMetaInfo, CF_RAFT}; +use into_other::into_other; use kvproto::{ metapb::Region, raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, CmdType, RaftCmdRequest}, - raft_serverpb::{RaftApplyState, RaftMessage}, + raft_serverpb::{RaftApplyState, RaftMessage, RegionLocalState}, }; use protobuf::Message; use raft::{eraftpb, eraftpb::MessageType, StateRole}; @@ -30,11 +32,14 @@ use raftstore::{ StoreSizeInfo, UpdateSafeTsObserver, }, store::{ - self, check_sst_for_ingestion, snap::plain_file_used, SnapKey, SnapManager, Transport, + self, check_sst_for_ingestion, + snap::{plain_file_used, SnapEntry}, + SnapKey, SnapManager, Transport, }, + Error as RaftStoreError, Result as RaftStoreResult, }; use sst_importer::SstImporter; -use tikv_util::{box_err, debug, error, info, warn}; +use tikv_util::{box_err, crit, debug, defer, error, info, store::find_peer, warn}; use yatp::{ pool::{Builder, ThreadPool}, task::future::TaskCell, @@ -47,6 +52,13 @@ use crate::{ WriteCmdType, WriteCmds, CF_LOCK, }; +macro_rules! fatal { + ($lvl:expr $(, $arg:expr)*) => ({ + crit!($lvl $(, $arg)*); + ::std::process::exit(1) + }) +} + #[allow(clippy::from_over_into)] impl Into for ffi_interfaces::StoreStats { fn into(self) -> FsStatsExt { @@ -101,7 +113,12 @@ const CACHED_REGION_INFO_SLOT_COUNT: usize = 256; #[derive(Debug, Default)] pub struct CachedRegionInfo { pub replicated_or_created: AtomicBool, - pub inited: AtomicBool, + // TiKV assumes a region's learner peer is added through snapshot. + // If this field is false, will try fast path when meet MsgAppend. + // If this field is true, it means this peer is inited or will be inited by a TiKV snapshot. 
+ // NOTE If we want a fallback, then we must set inited_or_fallback to true, + // Otherwise, a normal snapshot will be neglect in `post_apply_snapshot` and cause data loss. + pub inited_or_fallback: AtomicBool, } pub type CachedRegionInfoMap = HashMap>; @@ -120,6 +137,7 @@ pub struct TiFlashObserver { // TODO should we use a Mutex here? pub trans: Arc>, pub snap_mgr: Arc, + pub engine_store_cfg: crate::EngineStoreConfig, } impl Clone for TiFlashObserver { @@ -137,6 +155,7 @@ impl Clone for TiFlashObserver { cached_region_info: self.cached_region_info.clone(), trans: self.trans.clone(), snap_mgr: self.snap_mgr.clone(), + engine_store_cfg: self.engine_store_cfg.clone(), } } } @@ -172,19 +191,40 @@ impl TiFlashObserver { &self, region_id: u64, mut f: F, - ) -> Result<(), String> { + ) -> RaftStoreResult<()> { let slot_id = Self::slot_index(region_id); - let guard = self.cached_region_info.get(slot_id).unwrap().write(); - let mut guard = guard.unwrap(); + let mut guard = match self.cached_region_info.get(slot_id).unwrap().write() { + Ok(g) => g, + Err(e) => return Err(box_err!("access_cached_region_info_mut poisoned")), + }; f(guard.entry(region_id)); Ok(()) } - pub fn is_first_msg_append(&self, msg: &RaftMessage) -> bool { - // return false; + pub fn set_inited_or_fallback(&self, region_id: u64, v: bool) -> RaftStoreResult<()> { + self.access_cached_region_info_mut( + region_id, + |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + o.get_mut().inited_or_fallback.store(v, Ordering::SeqCst); + } + MapEntry::Vacant(v) => { + tikv_util::safe_panic!("not inited!"); + } + }, + ) + } + + // Returns whether we need to ignore this message and run fast path instead. + pub fn maybe_fast_path(&self, msg: &RaftMessage) -> bool { + if !self.engine_store_cfg.enable_fast_add_peer { + // fast path not enabled + return false; + } // TODO Need to recover all region infomation from restart. 
let inner_msg = msg.get_message(); if inner_msg.get_msg_type() != MessageType::MsgAppend { + // we only handles the first MsgAppend return false; } let region_id = msg.get_region_id(); @@ -193,7 +233,7 @@ impl TiFlashObserver { let f = |info: MapEntry>| { match info { MapEntry::Occupied(mut o) => { - is_first = !o.get().inited.load(Ordering::SeqCst); + is_first = !o.get().inited_or_fallback.load(Ordering::SeqCst); // TODO include create is_replicated = o.get().replicated_or_created.load(Ordering::SeqCst); if is_first { @@ -207,7 +247,7 @@ impl TiFlashObserver { } } MapEntry::Vacant(v) => { - info!("fast path: first MsgAppend of {}:{}, skip", self.store_id, region_id; + info!("fast path: ongoing {}:{}, first message", self.store_id, region_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), "inner_msg" => ?inner_msg, @@ -220,20 +260,17 @@ impl TiFlashObserver { // Can use immutable version. self.access_cached_region_info_mut(region_id, f).unwrap(); - if is_first { - info!("fast path: normal MsgAppend of {}:{}", self.store_id, region_id; + if !is_first { + info!( + "fast path: normal MsgAppend of {}:{}", + self.store_id, region_id ); return false; } - use std::io::Write; - - use engine_traits::{Peekable, CF_RAFT}; - use into_other::into_other; - use kvproto::raft_serverpb::{RaftApplyState, RegionLocalState}; - use raftstore::store::snap::SnapEntry; - use tikv_util::defer; { + // Peer is not created by Peer::replicate, will cause RegionNotRegistered error, + // see `check_msg`. 
if !is_replicated { info!("fast path: ongoing {}:{}, wait replicating peer", self.store_id, region_id; "to_peer_id" => msg.get_to_peer().get_id(), @@ -249,38 +286,104 @@ impl TiFlashObserver { "from_peer_id" => msg.get_from_peer().get_id(), ); // Feed data - // #[cfg(any(test, feature = "testexport"))] - { - let mut s = crate::DebugStruct_UseLeaderForRegion { region_id }; - self.engine_store_server_helper.debug_func( - crate::USE_LEADER_FOR_REGION, - &s as *const crate::DebugStruct_UseLeaderForRegion as crate::RawVoidPtr, + let res = self.engine_store_server_helper.fast_add_peer(region_id); + if res != crate::FastAddPeerRes::Ok { + error!( + "fast path: ongoing {}:{} failed. fetch and replace error {:?}, fallback to normal", + self.store_id, region_id, res ); + if let Err(e) = self.set_inited_or_fallback(region_id, true) { + tikv_util::safe_panic!("set_inited_or_fallback"); + } + // TODO clean local, and prepare to request snapshot from TiKV as a trivial + // procedure. + return false; } + info!("fast path: ongoing {}:{}, start buid and send", self.store_id, region_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + ); + match self.build_and_send_snapshot(region_id, msg.get_to_peer().get_id(), msg) { + Ok(s) => { + if s != crate::FastAddPeerRes::Ok { + error!("fast path: ongoing {}:{} failed. build and sent snapshot code {:?}", self.store_id, region_id, s; + "is_first" => is_first,); + if let Err(e) = self.set_inited_or_fallback(region_id, true) { + tikv_util::safe_panic!("set_inited_or_fallback"); + } + } + } + Err(e) => { + error!("fast path: ongoing {}:{} failed. 
build and sent snapshot error {:?}", self.store_id, region_id, e; + "is_first" => is_first,); + if let Err(e) = self.set_inited_or_fallback(region_id, true) { + tikv_util::safe_panic!("set_inited_or_fallback"); + } + } + }; + info!("fast path: ongoing {}:{}, finish build and send", self.store_id, region_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + ); + is_first + } + + fn build_and_send_snapshot( + &self, + region_id: u64, + new_peer_id: u64, + msg: &RaftMessage, + ) -> RaftStoreResult { + let inner_msg = msg.get_message(); // Build snapshot by get_snapshot_for_building let (mut snap, key, apply_state, region_state) = { - let apply_state: RaftApplyState = self + let apply_state: RaftApplyState = match self .engine - .get_msg_cf(CF_RAFT, &keys::apply_state_key(region_id)) - .unwrap() - .unwrap(); - let region_state: RegionLocalState = self + .get_msg_cf(CF_RAFT, &keys::apply_state_key(region_id))? + { + Some(e) => e, + None => return Ok(crate::FastAddPeerRes::BadData), + }; + let region_state: RegionLocalState = match self .engine - .get_msg_cf(CF_RAFT, &keys::region_state_key(region_id)) - .unwrap() - .unwrap(); + .get_msg_cf(CF_RAFT, &keys::region_state_key(region_id))? + { + Some(e) => e, + None => return Ok(crate::FastAddPeerRes::BadData), + }; + + // check if the source already knows the know peer + match find_peer(region_state.get_region(), self.store_id) { + Some(peer) => { + if peer.get_id() != new_peer_id { + return Ok(crate::FastAddPeerRes::BadData); + } + } + None => return Ok(crate::FastAddPeerRes::BadData), + } + + // Find term of entry at applied_index. + let applied_index = apply_state.get_applied_index(); + let applied_term = match self.raft_engine.get_entry(region_id, applied_index)? 
{ + Some(apply_entry) => apply_entry.get_term(), + None => { + return Err(box_err!( + "can't find entry for applied_index {} of region {}", + applied_index, + region_id + )); + } + }; let key = SnapKey::new( region_id, - apply_state.get_commit_term(), // TODO apply index term - apply_state.get_applied_index(), + applied_term, // TODO apply index term + applied_index, ); self.snap_mgr.register(key.clone(), SnapEntry::Generating); defer!(self.snap_mgr.deregister(&key, &SnapEntry::Generating)); - let snapshot = self.snap_mgr.get_empty_snapshot_for_building(&key).unwrap(); + let snapshot = self.snap_mgr.get_empty_snapshot_for_building(&key)?; - // let base = &self.snap_mgr.core.base; - // let f = Snapshot::new_for_building(base, key, &self.snap_mgr.core).unwrap(); (snapshot, key.clone(), apply_state, region_state) }; @@ -297,47 +400,48 @@ impl TiFlashObserver { { // Data for (cf_enum, cf) in raftstore::store::snap::SNAPSHOT_CFS_ENUM_PAIR { - let cf_index = snap.cf_files.iter().position(|x| &x.cf == cf).unwrap(); + let cf_index: RaftStoreResult = snap + .cf_files + .iter() + .position(|x| &x.cf == cf) + .ok_or(box_err!("can't find index for cf {}", cf)); + let cf_index = cf_index?; let cf_file = &mut snap.cf_files[cf_index]; let mut path = cf_file.path.clone(); path.push(cf_file.file_prefix.clone()); path.set_extension("sst"); - debug!("!!!! snap g {:?}", path); - let mut file = std::fs::File::create(path.as_path()).unwrap(); + debug!( + "!!!! 
snap g cf_file.path {:?} {:?} {:?}", + cf_file.path, cf_file.file_prefix, path + ); + let mut file = std::fs::File::create(path.as_path())?; // let mut file = std::fs::create_dir(); } - // Meta - snap.meta_file.meta = - Some(raftstore::store::snap::gen_snapshot_meta(&snap.cf_files[..], true).unwrap()); + snap_data.set_region(region_state.get_region().clone()); + snap_data.set_file_size(0); + let SNAPSHOT_VERSION = 2; + snap_data.set_version(SNAPSHOT_VERSION); + // MetaFile + // Which is snap.meta_file.meta + let meta_file_meta = + raftstore::store::snap::gen_snapshot_meta(&snap.cf_files[..], true)?; { - let v = snap - .meta_file - .meta - .as_ref() - .unwrap() - .write_to_bytes() - .unwrap(); - let mut f = std::fs::File::create(snap.meta_file.path.as_path()).unwrap(); - f.write_all(&v[..]).unwrap(); - f.flush().unwrap(); - f.sync_all().unwrap(); + let v = meta_file_meta.write_to_bytes()?; + let mut f = std::fs::File::create(snap.meta_file.path.as_path())?; + f.write_all(&v[..])?; + f.flush()?; + f.sync_all()?; } debug!( - "!!!!! snap 2 {:?} {:?} {}", + "!!!!! 
snap 2 {:?} {:?} XX {:?} {}", snap.meta_file.meta, snap.meta_file.file, + snap.meta_file.path, snap.cf_files.len() ); - // snap.save_meta_file().unwrap(); - // let mut file = std::fs::File::create(path.as_path()).unwrap(); - snap_data.set_region(region_state.get_region().clone()); - - snap_data.set_file_size(0); - let SNAPSHOT_VERSION = 2; - snap_data.set_version(SNAPSHOT_VERSION); - snap_data.set_meta(snap.meta_file.meta.as_ref().unwrap().clone()); + snap_data.set_meta(meta_file_meta.clone()); } // Compose snapshot @@ -350,22 +454,9 @@ impl TiFlashObserver { .mut_conf_state() .mut_learners() .push(msg.get_to_peer().get_id()); - metadata.set_index(inner_msg.get_index()); metadata.set_term(inner_msg.get_term()); - // snap_data.mut_meta().set_for_witness(true); - - // for cf in raftstore::store::snap::SNAPSHOT_CFS { - // let mut cf_file = kvproto::raft_serverpb::SnapshotCfFile::default(); - // let path = format!("/tmp/loop_{}.sst", cf); - // let mut file = std::fs::File::create(path.as_str()).unwrap(); - // cf_file.set_cf(cf.to_string()); - // cf_file.set_size(0); - // cf_file.set_checksum(0); - // snap_data.mut_meta().mut_cf_files().push(cf_file); - // } - snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); // Send reponse @@ -382,9 +473,14 @@ impl TiFlashObserver { response.mut_message().set_term(inner_msg.get_term()); response.mut_message().set_snapshot(snapshot); debug!("!!!!! send response {:?} data {:?}", response, snap_data); - let res = self.trans.lock().unwrap().send(response); - debug!("!!!!! 
send response FINISH {:?}", res); - is_first + match self.trans.lock() { + Ok(mut trans) => { + let res = trans.send(response); + } + Err(e) => return Err(box_err!("send snapshot meets error {:?}", e)), + } + + Ok(crate::FastAddPeerRes::Ok) } } @@ -397,6 +493,7 @@ impl TiFlashObserver { snap_handle_pool_size: usize, trans: T, snap_mgr: SnapManager, + engine_store_cfg: crate::EngineStoreConfig, ) -> Self { let engine_store_server_helper = gen_engine_store_server_helper(engine.engine_store_server_helper); @@ -421,6 +518,7 @@ impl TiFlashObserver { cached_region_info: Arc::new(cached_region_info), trans: Arc::new(Mutex::new(trans)), snap_mgr: Arc::new(snap_mgr), + engine_store_cfg, } } @@ -937,7 +1035,7 @@ impl RegionChangeObserver for TiFlashObs let inner_msg = msg.get_message(); if inner_msg.get_commit() == 0 && inner_msg.get_msg_type() == MessageType::MsgHeartbeat { } else if inner_msg.get_msg_type() == MessageType::MsgAppend { - return self.is_first_msg_append(&msg); + return self.maybe_fast_path(&msg); } false } @@ -958,6 +1056,7 @@ impl RegionChangeObserver for TiFlashObs } } }; + // TODO remove unwrap self.access_cached_region_info_mut(region_id, f).unwrap(); } } @@ -1067,7 +1166,10 @@ impl ApplySnapshotObserver for TiFlashOb let (sender, receiver) = mpsc::channel(); let task = Arc::new(PrehandleTask::new(receiver, peer_id)); { - let mut lock = self.pre_handle_snapshot_ctx.lock().unwrap(); + let mut lock = match self.pre_handle_snapshot_ctx.lock() { + Ok(l) => l, + Err(e) => fatal!("pre_apply_snapshot poisoned"), + }; let ctx = lock.deref_mut(); ctx.tracer.insert(snap_key.clone(), task.clone()); } @@ -1125,36 +1227,41 @@ impl ApplySnapshotObserver for TiFlashOb ); let region_id = ob_ctx.region().get_id(); let mut should_skip = false; - self.access_cached_region_info_mut( + match self.access_cached_region_info_mut( region_id, |info: MapEntry>| match info { MapEntry::Occupied(mut o) => { - if !o.get().inited.load(Ordering::SeqCst) { - info!("fast path: first 
snapshot applied {}:{}, recover MsgAppend", self.store_id, region_id; + if !o.get().inited_or_fallback.load(Ordering::SeqCst) { + info!("fast path: applied first snapshot {}:{}, recover MsgAppend", self.store_id, region_id; "snap_key" => ?snap_key, ); } - should_skip = o.get().inited.load(Ordering::SeqCst); - o.get_mut().inited.store(true, Ordering::SeqCst); + should_skip = o.get().inited_or_fallback.load(Ordering::SeqCst); + o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); } MapEntry::Vacant(v) => { panic!("unknown snapshot!"); } }, - ) - .unwrap(); - if should_skip { - return; - } + ) { + Err(e) => fatal!("post_apply_snapshot poisoned"), + _ => (), + }; let snap = match snap { None => return, Some(s) => s, }; let maybe_snapshot = { - let mut lock = self.pre_handle_snapshot_ctx.lock().unwrap(); + let mut lock = match self.pre_handle_snapshot_ctx.lock() { + Ok(l) => l, + Err(e) => fatal!("post_apply_snapshot poisoned"), + }; let ctx = lock.deref_mut(); ctx.tracer.remove(snap_key) }; + if should_skip { + return; + } let need_retry = match maybe_snapshot { Some(t) => { let neer_retry = match t.recv.recv() { @@ -1164,8 +1271,10 @@ impl ApplySnapshotObserver for TiFlashOb "region" => ?ob_ctx.region(), "pending" => self.engine.pending_applies_count.load(Ordering::SeqCst), ); - self.engine_store_server_helper - .apply_pre_handled_snapshot(snap_ptr.0); + if !should_skip { + self.engine_store_server_helper + .apply_pre_handled_snapshot(snap_ptr.0); + } false } Err(_) => { @@ -1199,7 +1308,7 @@ impl ApplySnapshotObserver for TiFlashOb true } }; - if need_retry { + if need_retry && !should_skip { let ssts = retrieve_sst_files(snap); let ptr = pre_handle_snapshot_impl( self.engine_store_server_helper, diff --git a/new-mock-engine-store/src/lib.rs b/new-mock-engine-store/src/lib.rs index 87d48ef22cd..25f4498e1a8 100644 --- a/new-mock-engine-store/src/lib.rs +++ b/new-mock-engine-store/src/lib.rs @@ -691,6 +691,7 @@ pub fn gen_engine_store_server_helper( 
fn_set_pb_msg_by_bytes: Some(ffi_set_pb_msg_by_bytes), fn_handle_safe_ts_update: Some(ffi_handle_safe_ts_update), fn_debug_func: Some(ffi_debug_func), + fn_fast_add_peer: Some(ffi_fast_add_peer), } } @@ -1224,98 +1225,154 @@ unsafe extern "C" fn ffi_handle_compute_store_stats( } } -use engine_store_ffi::{DebugStruct_UseLeaderForRegion, USE_LEADER_FOR_REGION}; - unsafe extern "C" fn ffi_debug_func( arg1: *mut ffi_interfaces::EngineStoreServerWrap, debug_type: u64, ptr: ffi_interfaces::RawVoidPtr, ) -> ffi_interfaces::RawVoidPtr { + std::ptr::null_mut() +} + +unsafe extern "C" fn ffi_fast_add_peer( + arg1: *mut ffi_interfaces::EngineStoreServerWrap, + region_id: u64, +) -> ffi_interfaces::FastAddPeerRes { let store = into_engine_store_server_wrap(arg1); - if debug_type == USE_LEADER_FOR_REGION { - let s = &*(ptr as *const DebugStruct_UseLeaderForRegion); - let region_id = s.region_id; - let cluster = &*(store.cluster_ptr as *const mock_cluster::Cluster); - let lock = cluster.ffi_helper_set.lock().unwrap(); - let source_server = &lock.get(&1).unwrap().engine_store_server; - let source_engines = &source_server.engines.clone().unwrap(); - let source_region = source_server.kvstore.get(®ion_id).unwrap(); - let new_region_meta = get_region_local_state(&source_engines.kv.rocks, region_id) - .get_region() - .clone(); - let new_region = make_new_region( - Some(new_region_meta.clone()), - Some((*store.engine_store_server).id), - ); - (*store.engine_store_server) - .kvstore - .insert(region_id, Box::new(new_region)); - let target_engines = (*store.engine_store_server).engines.clone().unwrap(); - let target_region = (*store.engine_store_server) - .kvstore - .get_mut(®ion_id) - .unwrap(); - debug!("recover from leader"; "region_id" => region_id, "region" => ?new_region_meta); - copy_data_from( - source_engines, - &target_engines, - &source_region, - target_region, - ) - .unwrap(); - copy_meta_from( - source_engines, - &target_engines, - &source_region, - target_region, - 
new_region_meta, - ) - .unwrap(); + let cluster = &*(store.cluster_ptr as *const mock_cluster::Cluster); + let lock = cluster.ffi_helper_set.lock(); + let guard = match lock { + Ok(e) => e, + Err(e) => { + error!("ffi_debug_func failed to lock"); + return ffi_interfaces::FastAddPeerRes::OtherError; + } + }; + let from_store = (|| { + fail::fail_point!("ffi_fast_add_peer_from_id", |t| { + let t = t.unwrap().parse::().unwrap(); + t + }); + 1 + })(); + debug!("ffi_fast_add_peer from {}", from_store); + let source_server = match guard.get(&from_store) { + Some(s) => &s.engine_store_server, + None => return ffi_interfaces::FastAddPeerRes::NoSuitable, + }; + let source_engines = match source_server.engines.clone() { + Some(s) => s, + None => return ffi_interfaces::FastAddPeerRes::BadData, + }; + let source_region = match source_server.kvstore.get(®ion_id) { + Some(s) => s, + None => return ffi_interfaces::FastAddPeerRes::BadData, + }; + let new_region_meta = match get_region_local_state(&source_engines.kv.rocks, region_id) { + Some(s) => s.get_region().clone(), + None => return ffi_interfaces::FastAddPeerRes::BadData, + }; + let new_region = make_new_region( + Some(new_region_meta.clone()), + Some((*store.engine_store_server).id), + ); + (*store.engine_store_server) + .kvstore + .insert(region_id, Box::new(new_region)); + let target_engines = match (*store.engine_store_server).engines.clone() { + Some(s) => s, + None => return ffi_interfaces::FastAddPeerRes::OtherError, + }; + let target_region = match (*store.engine_store_server).kvstore.get_mut(®ion_id) { + Some(s) => s, + None => return ffi_interfaces::FastAddPeerRes::BadData, + }; + debug!("recover from other peer"; "region_id" => region_id); + if let Err(_) = copy_data_from( + &source_engines, + &target_engines, + &source_region, + target_region, + ) { + return ffi_interfaces::FastAddPeerRes::FailedInject; } - std::ptr::null_mut() + debug!("recover meta from other peer"; "region_id" => region_id); + if let Err(_) = 
copy_meta_from( + &source_engines, + &target_engines, + &source_region, + target_region, + new_region_meta, + ) { + return ffi_interfaces::FastAddPeerRes::FailedInject; + } + debug!("recover from other peer ok"; "region_id" => region_id); + ffi_interfaces::FastAddPeerRes::Ok } use engine_store_ffi::RawVoidPtr; use engine_traits::{KvEngine, Mutable, RaftEngine, RaftEngineDebug, RaftLogBatch, WriteBatch}; use kvproto::raft_serverpb::RaftLocalState; -use tikv_util::box_try; +use tikv_util::{box_err, box_try}; // TODO Need refactor if moved to raft-engine +pub fn general_get_region_local_state( + engine: &EK, + region_id: u64, +) -> Option { + let region_state_key = keys::region_state_key(region_id); + engine + .get_msg_cf::(CF_RAFT, ®ion_state_key) + .unwrap_or(None) +} + +// TODO Need refactor if moved to raft-engine +pub fn general_get_apply_state( + engine: &EK, + region_id: u64, +) -> Option { + let apply_state_key = keys::apply_state_key(region_id); + engine + .get_msg_cf::(CF_RAFT, &apply_state_key) + .unwrap_or(None) +} + pub fn get_region_local_state( engine: &engine_rocks::RocksEngine, region_id: u64, -) -> RegionLocalState { +) -> Option { let region_state_key = keys::region_state_key(region_id); - let region_state = match engine.get_msg_cf::(CF_RAFT, ®ion_state_key) { - Ok(Some(s)) => s, - _ => unreachable!(), - }; - region_state + engine + .get_msg_cf::(CF_RAFT, ®ion_state_key) + .unwrap_or(None) } // TODO Need refactor if moved to raft-engine -pub fn get_apply_state(engine: &engine_rocks::RocksEngine, region_id: u64) -> RaftApplyState { +pub fn get_apply_state( + engine: &engine_rocks::RocksEngine, + region_id: u64, +) -> Option { let apply_state_key = keys::apply_state_key(region_id); - let apply_state = match engine.get_msg_cf::(CF_RAFT, &apply_state_key) { - Ok(Some(s)) => s, - _ => unreachable!(), - }; - apply_state + engine + .get_msg_cf::(CF_RAFT, &apply_state_key) + .unwrap_or(None) } pub fn get_raft_local_state( raft_engine: &ER, region_id: u64, 
-) -> RaftLocalState { - raft_engine.get_raft_state(region_id).unwrap().unwrap() +) -> Option { + match raft_engine.get_raft_state(region_id) { + Ok(Some(x)) => Some(x), + _ => None, + } } -pub fn copy_meta_from( - source_engines: &Engines< - impl KvEngine, - impl RaftEngine + engine_traits::Peekable + RaftEngineDebug, - >, - target_engines: &Engines, +pub fn copy_meta_from< + EK: engine_traits::KvEngine, + ER: RaftEngine + engine_traits::Peekable + RaftEngineDebug, +>( + source_engines: &Engines, + target_engines: &Engines, source: &Box, target: &mut Box, new_region_meta: kvproto::metapb::Region, @@ -1335,12 +1392,11 @@ pub fn copy_meta_from( // apply state { - let key = keys::apply_state_key(region_id); - let apply_state: RaftApplyState = source_engines - .kv - .get_msg_cf(CF_RAFT, &key) - .unwrap() - .unwrap(); + let apply_state: RaftApplyState = + match general_get_apply_state(&source_engines.kv, region_id) { + Some(x) => x, + None => return Err(box_err!("bad RaftApplyState")), + }; wb.put_msg_cf(CF_RAFT, &keys::apply_state_key(region_id), &apply_state)?; target.apply_state = apply_state.clone(); target.applied_term = source.applied_term; @@ -1351,25 +1407,20 @@ pub fn copy_meta_from( // raft state { - let key = keys::raft_state_key(region_id); - let raft_state = source_engines - .raft - .get_msg_cf(CF_DEFAULT, &key) - .unwrap() - .unwrap(); + let raft_state = match get_raft_local_state(&source_engines.raft, region_id) { + Some(x) => x, + None => return Err(box_err!("bad RaftLocalState")), + }; raft_wb.put_raft_state(region_id, &raft_state)?; }; // raft log let mut entries: Vec = Default::default(); - source_engines - .raft - .scan_entries(region_id, |e| { - debug!("copy raft log"; "e" => ?e); - entries.push(e.clone()); - Ok(true) - }) - .unwrap(); + source_engines.raft.scan_entries(region_id, |e| { + debug!("copy raft log"; "e" => ?e); + entries.push(e.clone()); + Ok(true) + })?; raft_wb.append(region_id, entries)?; box_try!(target_engines.raft.consume(&mut 
raft_wb, true)); diff --git a/new-mock-engine-store/src/mock_cluster.rs b/new-mock-engine-store/src/mock_cluster.rs index c75ee573fd6..3111e9f53ee 100644 --- a/new-mock-engine-store/src/mock_cluster.rs +++ b/new-mock-engine-store/src/mock_cluster.rs @@ -1069,6 +1069,13 @@ impl> Cluster { &self.get_tiflash_engine(node_id).rocks } + pub fn clear_send_filters(&mut self) { + let mut sim = self.sim.wl(); + for node_id in sim.get_node_ids() { + sim.clear_send_filters(node_id); + } + } + pub fn must_transfer_leader(&mut self, region_id: u64, leader: metapb::Peer) { let timer = Instant::now(); loop { diff --git a/new-mock-engine-store/src/node.rs b/new-mock-engine-store/src/node.rs index 6b06ce9332e..c25bee26f8a 100644 --- a/new-mock-engine-store/src/node.rs +++ b/new-mock-engine-store/src/node.rs @@ -323,6 +323,7 @@ impl Simulator for NodeCluster { cfg.proxy_cfg.raft_store.snap_handle_pool_size, simulate_trans.clone(), snap_mgr.clone(), + cfg.proxy_cfg.engine_store.clone(), ); tiflash_ob.register_to(&mut coprocessor_host); diff --git a/new-mock-engine-store/src/server.rs b/new-mock-engine-store/src/server.rs index 13f5889390c..bdf45e573e7 100644 --- a/new-mock-engine-store/src/server.rs +++ b/new-mock-engine-store/src/server.rs @@ -563,6 +563,7 @@ impl ServerCluster { cfg.proxy_cfg.raft_store.snap_handle_pool_size, simulate_trans.clone(), snap_mgr.clone(), + cfg.proxy_cfg.engine_store.clone(), ); tiflash_ob.register_to(&mut coprocessor_host); diff --git a/proxy_server/src/config.rs b/proxy_server/src/config.rs index 2908c08f9c2..bcbaa00b02e 100644 --- a/proxy_server/src/config.rs +++ b/proxy_server/src/config.rs @@ -2,6 +2,7 @@ use std::{collections::HashSet, iter::FromIterator, path::Path}; +use engine_store_ffi::EngineStoreConfig; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; use itertools::Itertools; use online_config::OnlineConfig; @@ -254,6 +255,9 @@ pub struct ProxyConfig { #[online_config(skip)] pub import: ImportConfig, + + #[online_config(skip)] + pub 
engine_store: EngineStoreConfig, } /// We use custom default, in case of later non-ordinary config items. @@ -269,6 +273,7 @@ impl Default for ProxyConfig { enable_io_snoop: false, readpool: ReadPoolConfig::default(), import: ImportConfig::default(), + engine_store: EngineStoreConfig::default(), } } } diff --git a/proxy_server/src/run.rs b/proxy_server/src/run.rs index f25c983727a..b5949e9ae65 100644 --- a/proxy_server/src/run.rs +++ b/proxy_server/src/run.rs @@ -1216,6 +1216,7 @@ impl TiKvServer { self.proxy_config.raft_store.snap_handle_pool_size, server.transport().clone(), snap_mgr.clone(), + self.proxy_config.engine_store.clone(), ); tiflash_ob.register_to(self.coprocessor_host.as_mut().unwrap()); diff --git a/proxy_tests/proxy/mod.rs b/proxy_tests/proxy/mod.rs index c2d2336999d..1d7edced540 100644 --- a/proxy_tests/proxy/mod.rs +++ b/proxy_tests/proxy/mod.rs @@ -8,6 +8,7 @@ extern crate slog_global; mod config; +mod fast_add_peer; mod flashback; mod normal; mod proxy; diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index e80989ce2b7..84c281e1396 100644 --- a/proxy_tests/proxy/proxy.rs +++ b/proxy_tests/proxy/proxy.rs @@ -112,9 +112,9 @@ pub fn maybe_collect_states( States { in_memory_apply_state: region.apply_state.clone(), in_memory_applied_term: region.applied_term, - in_disk_apply_state: get_apply_state(&engine, region_id), - in_disk_region_state: get_region_local_state(&engine, region_id), - in_disk_raft_state: get_raft_local_state(raft_engine, region_id), + in_disk_apply_state: get_apply_state(&engine, region_id).unwrap(), + in_disk_region_state: get_region_local_state(&engine, region_id).unwrap(), + in_disk_raft_state: get_raft_local_state(raft_engine, region_id).unwrap(), ident, }, ); @@ -173,7 +173,7 @@ pub fn must_get_mem( std::thread::sleep(std::time::Duration::from_millis(20)); } let s = std::str::from_utf8(key).unwrap_or(""); - panic!( + let e = format!( "can't get mem value {:?} for key {}({}) in store {} cf {:?}, actual 
{:?}", value.map(tikv_util::escape), log_wrappers::hex_encode_upper(key), @@ -181,7 +181,9 @@ pub fn must_get_mem( engine_store_server.id, cf, last_res, - ) + ); + error!("{}", e); + panic!("{}", e); } pub fn must_put_and_check_key_with_generator (String, String)>( @@ -263,7 +265,13 @@ pub fn check_key( }; match in_mem { Some(b) => { - let lock = cluster.ffi_helper_set.lock().unwrap(); + let lock = match cluster.ffi_helper_set.lock() { + Ok(l) => l, + Err(e) => { + error!("check_key poison"); + std::process::exit(1); + } + }; let server = &lock.get(&id).unwrap().engine_store_server; if b { must_get_mem(server, region_id, k, Some(v)); @@ -564,3 +572,21 @@ pub fn must_wait_until_cond_states( } } } + +pub fn force_compact_log( + cluster: &mut Cluster, + key: &[u8], + use_nodes: Option>, +) -> u64 { + let region = cluster.get_region(key); + let region_id = region.get_id(); + let prev_states = maybe_collect_states(&cluster, region_id, None); + + let (compact_index, compact_term) = get_valid_compact_index_by(&prev_states, use_nodes); + let compact_log = test_raftstore::new_compact_log_request(compact_index, compact_term); + let req = test_raftstore::new_admin_request(region_id, region.get_region_epoch(), compact_log); + let _ = cluster + .call_command_on_leader(req, Duration::from_secs(3)) + .unwrap(); + return compact_index; +} diff --git a/proxy_tests/proxy/region.rs b/proxy_tests/proxy/region.rs index 6254efa428a..c22f4e446eb 100644 --- a/proxy_tests/proxy/region.rs +++ b/proxy_tests/proxy/region.rs @@ -423,24 +423,6 @@ fn recover_from_peer(cluster: &Cluster, from: u64, to: u64, region_ } } -fn force_compact_log( - cluster: &mut Cluster, - key: &[u8], - use_nodes: Option>, -) -> u64 { - let region = cluster.get_region(key); - let region_id = region.get_id(); - let prev_states = maybe_collect_states(&cluster, region_id, None); - - let (compact_index, compact_term) = get_valid_compact_index_by(&prev_states, use_nodes); - let compact_log = 
test_raftstore::new_compact_log_request(compact_index, compact_term); - let req = test_raftstore::new_admin_request(region_id, region.get_region_epoch(), compact_log); - let _ = cluster - .call_command_on_leader(req, Duration::from_secs(3)) - .unwrap(); - return compact_index; -} - #[test] fn test_add_delayed_started_learner_no_snapshot() { // fail::cfg("before_tiflash_check_double_write", "return").unwrap(); @@ -625,26 +607,3 @@ fn test_add_delayed_started_learner_snapshot() { fail::remove("on_pre_persist_with_finish"); cluster.shutdown(); } - -#[test] -fn test_fast_add_peer2() { - let (mut cluster, pd_client) = new_mock_cluster(0, 2); - fail::cfg("on_pre_persist_with_finish", "return").unwrap(); - disable_auto_gen_compact_log(&mut cluster); - // Siable auto generate peer. - pd_client.disable_default_operator(); - let _ = cluster.run_conf_change(); - - // If we don't write here, we will have the first MsgAppend with (6,6), which - // will cause "fast-forwarded commit to snapshot". - cluster.must_put(b"k0", b"v0"); - - pd_client.must_add_peer(1, new_learner_peer(2, 2)); - - std::thread::sleep(std::time::Duration::from_millis(1000)); - cluster.must_put(b"k1", b"v1"); - check_key(&cluster, b"k1", b"v1", Some(true), None, None); - - fail::remove("on_pre_persist_with_finish"); - cluster.shutdown(); -} diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version index 58b337ebfc1..813937fde26 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version @@ -1,3 +1,3 @@ #pragma once #include -namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 4624446451501389788ull; } \ No newline at end of file +namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 13418513559228271669ull; } \ No newline at end of file diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h index 
a5915fd5a54..67a173469ed 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h @@ -143,6 +143,14 @@ enum class KVGetStatus : uint32_t { NotFound, }; +enum class FastAddPeerRes : uint32_t { + Ok = 0, + OtherError, + NoSuitable, + BadData, + FailedInject, +}; + struct RaftStoreProxyFFIHelper { RaftStoreProxyPtr proxy_ptr; RaftProxyStatus (*fn_handle_get_proxy_status)(RaftStoreProxyPtr); @@ -217,5 +225,7 @@ struct EngineStoreServerHelper { uint64_t leader_safe_ts); RawVoidPtr (*fn_debug_func)(EngineStoreServerWrap *, uint64_t type, RawVoidPtr); + FastAddPeerRes (*fn_fast_add_peer)(EngineStoreServerWrap *, + uint64_t region_id); }; } // namespace DB From e6228f587648aa2cf93d2e9e9228aebdb1786e1a Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Fri, 9 Dec 2022 12:06:31 +0800 Subject: [PATCH 006/115] use snapshot to compose raft apply state Signed-off-by: CalvinNeo --- Cargo.lock | 2 + Cargo.toml | 8 +- components/raftstore/src/store/fsm/peer.rs | 20 --- .../raftstore/src/store/peer_storage.rs | 6 + components/raftstore/src/store/snap.rs | 7 +- engine_store_ffi/src/interfaces.rs | 19 ++- engine_store_ffi/src/lib.rs | 12 +- engine_store_ffi/src/observer.rs | 144 +++++++++++------- new-mock-engine-store/src/lib.rs | 124 ++++++++++++--- proxy_tests/proxy/proxy.rs | 34 ++++- proxy_tests/proxy/region.rs | 2 + .../ffi/src/RaftStoreProxyFFI/@version | 2 +- .../ffi/src/RaftStoreProxyFFI/ProxyFFI.h | 8 +- 13 files changed, 278 insertions(+), 110 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3ded3d8e68a..fa64aa145a8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4484,6 +4484,7 @@ dependencies = [ [[package]] name = "raft" version = "0.7.0" +source = "git+https://github.com/tikv/raft-rs?branch=master#36d3293a8b1a32c4b4115855419108386abcdc4a" dependencies = [ "bytes", "fxhash", @@ -4531,6 +4532,7 @@ dependencies = [ [[package]] name = "raft-proto" version = "0.7.0" +source = 
"git+https://github.com/tikv/raft-rs?branch=master#36d3293a8b1a32c4b4115855419108386abcdc4a" dependencies = [ "bytes", "protobuf", diff --git a/Cargo.toml b/Cargo.toml index 38a721c49a5..d1a5ddad5b8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -193,10 +193,10 @@ zipf = "6.1.0" prometheus = { git = "https://github.com/solotzg/rust-prometheus.git", rev = "b4fe98a06a58d29f9b9987a0d7186f6ed5230193" } # TODO: remove this when new raft-rs is published. -#raft = { git = "https://github.com/tikv/raft-rs", branch = "master" } -raft = { path = "/Users/calvin/tiflash/raft-rs" } -# raft-proto = { git = "https://github.com/tikv/raft-rs", branch = "master" } -raft-proto = { path = "/Users/calvin/tiflash/raft-rs/proto" } +raft = { git = "https://github.com/tikv/raft-rs", branch = "master" } +#raft = { path = "/Users/calvin/tiflash/raft-rs" } +raft-proto = { git = "https://github.com/tikv/raft-rs", branch = "master" } +# raft-proto = { path = "/Users/calvin/tiflash/raft-rs/proto" } protobuf = { git = "https://github.com/pingcap/rust-protobuf", branch = "v2.8" } protobuf-codegen = { git = "https://github.com/pingcap/rust-protobuf", branch = "v2.8" } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 2cda4403d6f..23624c1dc5c 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2455,10 +2455,6 @@ where self.handle_reported_disk_usage(&msg); - tikv_util::debug!("!!!!! on_raft_message after check 0.1"; - "region_id" => self.region_id(), - "peer_id" => self.fsm.peer_id() - ); let msg_type = msg.get_message().get_msg_type(); if matches!(self.ctx.self_disk_usage, DiskUsage::AlreadyFull) && MessageType::MsgTimeoutNow == msg_type @@ -2475,10 +2471,6 @@ where return Ok(()); } - tikv_util::debug!("!!!!! 
on_raft_message after check 0.2"; - "region_id" => self.region_id(), - "peer_id" => self.fsm.peer_id() - ); if msg.get_is_tombstone() { // we receive a message tells us to remove ourself. self.handle_gc_peer_msg(&msg); @@ -2493,18 +2485,10 @@ where return Ok(()); } - tikv_util::debug!("!!!!! on_raft_message after check 0.3"; - "region_id" => self.region_id(), - "peer_id" => self.fsm.peer_id() - ); if self.check_msg(&msg) { return Ok(()); } - tikv_util::debug!("!!!!! on_raft_message after check 2"; - "region_id" => self.region_id(), - "peer_id" => self.fsm.peer_id() - ); if msg.has_extra_msg() { self.on_extra_message(msg); return Ok(()); @@ -2512,10 +2496,6 @@ where let is_snapshot = msg.get_message().has_snapshot(); - tikv_util::debug!("!!!!! on_raft_message after check 3"; - "region_id" => self.region_id(), - "peer_id" => self.fsm.peer_id() - ); // TODO: spin off the I/O code (delete_snapshot) let regions_to_destroy = match self.check_snapshot(&msg)? { Either::Left(key) => { diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 61bf22dcd97..7346be77fbe 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -396,6 +396,7 @@ where #[inline] pub fn save_apply_state_to(&self, kv_wb: &mut impl Mutable) -> Result<()> { + debug!("!!!! save_apply_state_to {:?}", self.apply_state()); kv_wb.put_msg_cf( CF_RAFT, &keys::apply_state_key(self.region.get_id()), @@ -638,6 +639,11 @@ where let snap_index = snap.get_metadata().get_index(); let snap_term = snap.get_metadata().get_term(); + debug!("!!!! 
apply snapshot {}", self.peer_id; + "snap_index" => snap_index, + "snap_term" => snap_term, + ); + self.raft_state_mut().set_last_index(snap_index); self.set_last_term(snap_term); self.apply_state_mut().set_applied_index(snap_index); diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 68c7977da92..e05e98a9bf7 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -967,6 +967,7 @@ impl Snapshot { debug!( "deleting snapshot file"; "snapshot" => %self.path(), + "!!!!! bt" => ?std::backtrace::Backtrace::capture(), ); for cf_file in &self.cf_files { // Delete cloned files. @@ -1118,10 +1119,11 @@ impl Snapshot { pub fn exists(&self) -> bool { self.cf_files.iter().all(|cf_file| { debug!( - "!!!!! copy_snapshot exists cf_file.size {:?} cf_file.file_paths() {:?} meta {:?}", + "!!!!! exists cf_file.size {:?} cf_file.file_paths() {:?} meta {:?} {}", cf_file.size, cf_file.file_paths(), - self.meta_file.path + self.meta_file.path, + file_exists(&self.meta_file.path) ); cf_file.size.is_empty() || (cf_file @@ -1670,6 +1672,7 @@ impl SnapManager { "register snapshot"; "key" => %key, "entry" => ?entry, + "!!!!! 
bt" => ?std::backtrace::Backtrace::capture(), ); match self.core.registry.wl().entry(key) { Entry::Occupied(mut e) => { diff --git a/engine_store_ffi/src/interfaces.rs b/engine_store_ffi/src/interfaces.rs index 45d8f5d81a1..c0edbc4a04e 100644 --- a/engine_store_ffi/src/interfaces.rs +++ b/engine_store_ffi/src/interfaces.rs @@ -232,12 +232,19 @@ pub mod root { } #[repr(u32)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] - pub enum FastAddPeerRes { + pub enum FastAddPeerStatus { Ok = 0, - OtherError = 1, - NoSuitable = 2, - BadData = 3, - FailedInject = 4, + WaitForData = 1, + OtherError = 2, + NoSuitable = 3, + BadData = 4, + FailedInject = 5, + } + #[repr(C)] + #[derive(Debug)] + pub struct FastAddPeerRes { + pub status: root::DB::FastAddPeerStatus, + pub apply_state: root::DB::CppStrWithView, } #[repr(C)] #[derive(Debug)] @@ -474,7 +481,7 @@ pub mod root { ) -> root::DB::FastAddPeerRes, >, } - pub const RAFT_STORE_PROXY_VERSION: u64 = 13418513559228271669; + pub const RAFT_STORE_PROXY_VERSION: u64 = 8449819960368956227; pub const RAFT_STORE_PROXY_MAGIC_NUMBER: u32 = 324508639; } } diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index 41c57fae8ef..6def49017dc 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -32,9 +32,10 @@ pub use read_index_helper::ReadIndexClient; pub use self::interfaces::root::DB::{ BaseBuffView, ColumnFamilyType, CppStrVecView, EngineStoreApplyRes, EngineStoreServerHelper, - EngineStoreServerStatus, FastAddPeerRes, FileEncryptionRes, FsStats, HttpRequestRes, - HttpRequestStatus, KVGetStatus, RaftCmdHeader, RaftProxyStatus, RaftStoreProxyFFIHelper, - RawCppPtr, RawCppStringPtr, RawVoidPtr, SSTReaderPtr, StoreStats, WriteCmdType, WriteCmdsView, + EngineStoreServerStatus, FastAddPeerRes, FastAddPeerStatus, FileEncryptionRes, FsStats, + HttpRequestRes, HttpRequestStatus, KVGetStatus, RaftCmdHeader, RaftProxyStatus, + RaftStoreProxyFFIHelper, RawCppPtr, RawCppStringPtr, RawVoidPtr, 
SSTReaderPtr, StoreStats, + WriteCmdType, WriteCmdsView, }; use self::interfaces::root::DB::{ ConstRawVoidPtr, FileEncryptionInfoRaw, RaftStoreProxyPtr, RawCppPtrType, RawRustPtr, @@ -1066,7 +1067,10 @@ impl EngineStoreServerHelper { } } - fn gen_cpp_string(&self, buff: &[u8]) -> RawCppStringPtr { + // Generate a cpp string, so the other side can read. + // The string is owned by the otherside, and will be deleted by + // `gc_raw_cpp_ptr`. + pub fn gen_cpp_string(&self, buff: &[u8]) -> RawCppStringPtr { debug_assert!(self.fn_gen_cpp_string.is_some()); unsafe { (self.fn_gen_cpp_string.into_inner())(buff.into()).into_raw() as RawCppStringPtr } } diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 580ec9384b9..fe348a0d66e 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -215,6 +215,15 @@ impl TiFlashObserver { ) } + fn fallback_to_slow_path(&self, region_id: u64) { + // TODO clean local, and prepare to request snapshot from TiKV as a trivial + // procedure. + fail::fail_point!("fallback_to_slow_path_not_allow", |_| {}); + if let Err(e) = self.set_inited_or_fallback(region_id, true) { + tikv_util::safe_panic!("set_inited_or_fallback"); + } + } + // Returns whether we need to ignore this message and run fast path instead. pub fn maybe_fast_path(&self, msg: &RaftMessage) -> bool { if !self.engine_store_cfg.enable_fast_add_peer { @@ -287,32 +296,56 @@ impl TiFlashObserver { ); // Feed data let res = self.engine_store_server_helper.fast_add_peer(region_id); - if res != crate::FastAddPeerRes::Ok { - error!( - "fast path: ongoing {}:{} failed. fetch and replace error {:?}, fallback to normal", - self.store_id, region_id, res - ); - if let Err(e) = self.set_inited_or_fallback(region_id, true) { - tikv_util::safe_panic!("set_inited_or_fallback"); + match res.status { + crate::FastAddPeerStatus::Ok => (), + crate::FastAddPeerStatus::WaitForData => { + error!( + "fast path: ongoing {}:{}. 
remote peer preparing data, wait", + self.store_id, region_id + ); + return true; } - // TODO clean local, and prepare to request snapshot from TiKV as a trivial - // procedure. - return false; - } + _ => { + error!( + "fast path: ongoing {}:{} failed. fetch and replace error {:?}, fallback to normal", + self.store_id, region_id, res + ); + self.fallback_to_slow_path(region_id); + return false; + } + }; info!("fast path: ongoing {}:{}, start buid and send", self.store_id, region_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), ); - match self.build_and_send_snapshot(region_id, msg.get_to_peer().get_id(), msg) { + let apply_state_str = res.apply_state.view.to_slice(); + let mut apply_state = RaftApplyState::default(); + apply_state.merge_from_bytes(apply_state_str); + match self.build_and_send_snapshot(region_id, msg.get_to_peer().get_id(), msg, apply_state) + { Ok(s) => { - if s != crate::FastAddPeerRes::Ok { - error!("fast path: ongoing {}:{} failed. build and sent snapshot code {:?}", self.store_id, region_id, s; - "is_first" => is_first,); - if let Err(e) = self.set_inited_or_fallback(region_id, true) { - tikv_util::safe_panic!("set_inited_or_fallback"); + match s { + crate::FastAddPeerStatus::Ok => { + info!("fast path: ongoing {}:{}, finish build and send", self.store_id, region_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + ); } - } + crate::FastAddPeerStatus::WaitForData => { + error!( + "fast path: ongoing {}:{}. remote peer preparing data, wait", + self.store_id, region_id + ); + return true; + } + _ => { + error!("fast path: ongoing {}:{} failed. build and sent snapshot code {:?}", self.store_id, region_id, s; + "is_first" => is_first,); + self.fallback_to_slow_path(region_id); + return false; + } + }; } Err(e) => { error!("fast path: ongoing {}:{} failed. 
build and sent snapshot error {:?}", self.store_id, region_id, e; @@ -320,12 +353,9 @@ impl TiFlashObserver { if let Err(e) = self.set_inited_or_fallback(region_id, true) { tikv_util::safe_panic!("set_inited_or_fallback"); } + return false; } }; - info!("fast path: ongoing {}:{}, finish build and send", self.store_id, region_id; - "to_peer_id" => msg.get_to_peer().get_id(), - "from_peer_id" => msg.get_from_peer().get_id(), - ); is_first } @@ -334,33 +364,36 @@ impl TiFlashObserver { region_id: u64, new_peer_id: u64, msg: &RaftMessage, - ) -> RaftStoreResult { + apply_state: RaftApplyState, + ) -> RaftStoreResult { let inner_msg = msg.get_message(); // Build snapshot by get_snapshot_for_building - let (mut snap, key, apply_state, region_state) = { - let apply_state: RaftApplyState = match self - .engine - .get_msg_cf(CF_RAFT, &keys::apply_state_key(region_id))? - { - Some(e) => e, - None => return Ok(crate::FastAddPeerRes::BadData), - }; + let (mut snap, key, region_state) = { let region_state: RegionLocalState = match self .engine .get_msg_cf(CF_RAFT, &keys::region_state_key(region_id))? { Some(e) => e, - None => return Ok(crate::FastAddPeerRes::BadData), + None => return Ok(crate::FastAddPeerStatus::BadData), }; // check if the source already knows the know peer - match find_peer(region_state.get_region(), self.store_id) { + let has_peer = match find_peer(region_state.get_region(), self.store_id) { Some(peer) => { if peer.get_id() != new_peer_id { - return Ok(crate::FastAddPeerRes::BadData); + false + } else { + true } } - None => return Ok(crate::FastAddPeerRes::BadData), + None => false, + }; + if !has_peer { + warn!( + "build_and_send_snapshot remote peer has not applied conf change {:?}", + region_state.get_region() + ); + return Ok(crate::FastAddPeerStatus::WaitForData); } // Find term of entry at applied_index. 
@@ -384,7 +417,7 @@ impl TiFlashObserver { defer!(self.snap_mgr.deregister(&key, &SnapEntry::Generating)); let snapshot = self.snap_mgr.get_empty_snapshot_for_building(&key)?; - (snapshot, key.clone(), apply_state, region_state) + (snapshot, key.clone(), region_state) }; debug!( @@ -394,11 +427,11 @@ impl TiFlashObserver { snap.cf_files.len() ); // Build snapshot by do_snapshot - let mut snapshot: eraftpb::Snapshot = Default::default(); - let metadata: &mut eraftpb::SnapshotMetadata = snapshot.mut_metadata(); + let mut pb_snapshot: eraftpb::Snapshot = Default::default(); + let pb_snapshot_metadata: &mut eraftpb::SnapshotMetadata = pb_snapshot.mut_metadata(); let mut snap_data = kvproto::raft_serverpb::RaftSnapshotData::default(); { - // Data + // eraftpb::SnapshotMetadata for (cf_enum, cf) in raftstore::store::snap::SNAPSHOT_CFS_ENUM_PAIR { let cf_index: RaftStoreResult = snap .cf_files @@ -421,18 +454,21 @@ impl TiFlashObserver { snap_data.set_file_size(0); let SNAPSHOT_VERSION = 2; snap_data.set_version(SNAPSHOT_VERSION); - // MetaFile + + // SnapshotMeta // Which is snap.meta_file.meta - let meta_file_meta = + let mut snapshot_meta = raftstore::store::snap::gen_snapshot_meta(&snap.cf_files[..], true)?; + + // Write MetaFile { - let v = meta_file_meta.write_to_bytes()?; + let v = snapshot_meta.write_to_bytes()?; let mut f = std::fs::File::create(snap.meta_file.path.as_path())?; f.write_all(&v[..])?; f.flush()?; f.sync_all()?; } - + snap_data.set_meta(snapshot_meta); debug!( "!!!!! snap 2 {:?} {:?} XX {:?} {}", snap.meta_file.meta, @@ -440,24 +476,21 @@ impl TiFlashObserver { snap.meta_file.path, snap.cf_files.len() ); - - snap_data.set_meta(meta_file_meta.clone()); } - // Compose snapshot // TODO The rest is test, please remove it after we can fetch the real data. 
- metadata + pb_snapshot_metadata .mut_conf_state() .mut_voters() .push(msg.get_from_peer().get_id()); - metadata + pb_snapshot_metadata .mut_conf_state() .mut_learners() .push(msg.get_to_peer().get_id()); - metadata.set_index(inner_msg.get_index()); - metadata.set_term(inner_msg.get_term()); + pb_snapshot_metadata.set_index(key.idx); + pb_snapshot_metadata.set_term(key.term); - snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); + pb_snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); // Send reponse let mut response = RaftMessage::default(); @@ -471,8 +504,11 @@ impl TiFlashObserver { .mut_message() .set_msg_type(MessageType::MsgSnapshot); response.mut_message().set_term(inner_msg.get_term()); - response.mut_message().set_snapshot(snapshot); - debug!("!!!!! send response {:?} data {:?}", response, snap_data); + response.mut_message().set_snapshot(pb_snapshot); + debug!( + "!!!!! send response key {} response {:?} data {:?}", + key, response, snap_data + ); match self.trans.lock() { Ok(mut trans) => { let res = trans.send(response); @@ -480,7 +516,7 @@ impl TiFlashObserver { Err(e) => return Err(box_err!("send snapshot meets error {:?}", e)), } - Ok(crate::FastAddPeerRes::Ok) + Ok(crate::FastAddPeerStatus::Ok) } } diff --git a/new-mock-engine-store/src/lib.rs b/new-mock-engine-store/src/lib.rs index 25f4498e1a8..64d154c0d52 100644 --- a/new-mock-engine-store/src/lib.rs +++ b/new-mock-engine-store/src/lib.rs @@ -1233,6 +1233,35 @@ unsafe extern "C" fn ffi_debug_func( std::ptr::null_mut() } +fn create_cpp_str(s: Option>) -> ffi_interfaces::CppStrWithView { + match s { + Some(s) => { + let len = s.len() as u64; + let ptr = Box::into_raw(Box::new(s)); // leak + ffi_interfaces::CppStrWithView { + inner: ffi_interfaces::RawCppPtr { + ptr: ptr as *mut _, + type_: RawCppPtrTypeImpl::String.into(), + }, + view: ffi_interfaces::BaseBuffView { + data: ptr as *mut _, + len, + }, + } + } + None => ffi_interfaces::CppStrWithView { + inner: 
ffi_interfaces::RawCppPtr { + ptr: std::ptr::null_mut(), + type_: RawCppPtrTypeImpl::None.into(), + }, + view: ffi_interfaces::BaseBuffView { + data: std::ptr::null(), + len: 0, + }, + }, + } +} + unsafe extern "C" fn ffi_fast_add_peer( arg1: *mut ffi_interfaces::EngineStoreServerWrap, region_id: u64, @@ -1244,7 +1273,10 @@ unsafe extern "C" fn ffi_fast_add_peer( Ok(e) => e, Err(e) => { error!("ffi_debug_func failed to lock"); - return ffi_interfaces::FastAddPeerRes::OtherError; + return ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::OtherError, + apply_state: create_cpp_str(None), + }; } }; let from_store = (|| { @@ -1257,19 +1289,40 @@ unsafe extern "C" fn ffi_fast_add_peer( debug!("ffi_fast_add_peer from {}", from_store); let source_server = match guard.get(&from_store) { Some(s) => &s.engine_store_server, - None => return ffi_interfaces::FastAddPeerRes::NoSuitable, + None => { + return ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::NoSuitable, + apply_state: create_cpp_str(None), + }; + } }; let source_engines = match source_server.engines.clone() { Some(s) => s, - None => return ffi_interfaces::FastAddPeerRes::BadData, + None => { + return ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::BadData, + apply_state: create_cpp_str(None), + }; + } }; let source_region = match source_server.kvstore.get(®ion_id) { Some(s) => s, - None => return ffi_interfaces::FastAddPeerRes::BadData, + None => { + return ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::BadData, + apply_state: create_cpp_str(None), + }; + } }; + // TODO We must ask the remote peer to persist before get a snapshot. 
let new_region_meta = match get_region_local_state(&source_engines.kv.rocks, region_id) { Some(s) => s.get_region().clone(), - None => return ffi_interfaces::FastAddPeerRes::BadData, + None => { + return ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::BadData, + apply_state: create_cpp_str(None), + }; + } }; let new_region = make_new_region( Some(new_region_meta.clone()), @@ -1280,33 +1333,66 @@ unsafe extern "C" fn ffi_fast_add_peer( .insert(region_id, Box::new(new_region)); let target_engines = match (*store.engine_store_server).engines.clone() { Some(s) => s, - None => return ffi_interfaces::FastAddPeerRes::OtherError, + None => { + return ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::OtherError, + apply_state: create_cpp_str(None), + }; + } }; let target_region = match (*store.engine_store_server).kvstore.get_mut(®ion_id) { Some(s) => s, - None => return ffi_interfaces::FastAddPeerRes::BadData, + None => { + return ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::BadData, + apply_state: create_cpp_str(None), + }; + } }; - debug!("recover from other peer"; "region_id" => region_id); + debug!("recover from remote peer: data"; "region_id" => region_id); if let Err(_) = copy_data_from( &source_engines, &target_engines, &source_region, target_region, ) { - return ffi_interfaces::FastAddPeerRes::FailedInject; + return ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::FailedInject, + apply_state: create_cpp_str(None), + }; } - debug!("recover meta from other peer"; "region_id" => region_id); + debug!("recover from remote peer: meta"; "region_id" => region_id); + // Do not copy apply state, otherwise may race with `apply_snapshot` if let Err(_) = copy_meta_from( &source_engines, &target_engines, &source_region, target_region, new_region_meta, + true, + false, ) { - return ffi_interfaces::FastAddPeerRes::FailedInject; + return 
ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::FailedInject, + apply_state: create_cpp_str(None), + }; + } + let apply_state: RaftApplyState = match general_get_apply_state(&source_engines.kv, region_id) { + Some(x) => x, + None => { + return ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::BadData, + apply_state: create_cpp_str(None), + }; + } + }; + let apply_state_bytes = apply_state.write_to_bytes().unwrap(); + + debug!("recover from remote peer: ok"; "region_id" => region_id); + ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::Ok, + apply_state: create_cpp_str(Some(apply_state_bytes)), } - debug!("recover from other peer ok"; "region_id" => region_id); - ffi_interfaces::FastAddPeerRes::Ok } use engine_store_ffi::RawVoidPtr; @@ -1376,6 +1462,8 @@ pub fn copy_meta_from< source: &Box, target: &mut Box, new_region_meta: kvproto::metapb::Region, + copy_region_state: bool, + copy_apply_state: bool, ) -> raftstore::Result<()> { let region_id = source.region.get_id(); @@ -1386,12 +1474,14 @@ pub fn copy_meta_from< // box_try!(wb.put_msg(keys::PREPARE_BOOTSTRAP_KEY, &source.region)); // region local state - let mut state = RegionLocalState::default(); - state.set_region(new_region_meta); - box_try!(wb.put_msg_cf(CF_RAFT, &keys::region_state_key(region_id), &state)); + if copy_region_state { + let mut state = RegionLocalState::default(); + state.set_region(new_region_meta); + box_try!(wb.put_msg_cf(CF_RAFT, &keys::region_state_key(region_id), &state)); + } // apply state - { + if copy_apply_state { let apply_state: RaftApplyState = match general_get_apply_state(&source_engines.kv, region_id) { Some(x) => x, diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index 84c281e1396..97e8718b6d2 100644 --- a/proxy_tests/proxy/proxy.rs +++ b/proxy_tests/proxy/proxy.rs @@ -42,7 +42,7 @@ pub use test_raftstore::{new_learner_peer, new_peer}; pub use tikv_util::{ box_err, 
box_try, config::{ReadableDuration, ReadableSize}, - store::find_peer, + store::{find_peer, find_peer_by_id}, time::Duration, HandyRwLock, }; @@ -573,6 +573,38 @@ pub fn must_wait_until_cond_states( } } +// Must wait until some node satisfy cond given by `pref`. +pub fn must_wait_until_cond_node( + cluster: &Cluster, + region_id: u64, + store_ids: Option>, + pred: &dyn Fn(&States) -> bool, +) -> HashMap { + let mut retry = 0; + loop { + let new_states = maybe_collect_states(&cluster, region_id, store_ids.clone()); + if let Some(ref e) = store_ids { + assert_eq!(e.len(), new_states.len()); + } + let mut ok = true; + for i in new_states.keys() { + let new = new_states.get(i).unwrap(); + if !pred(new) { + ok = false; + break; + } + } + if ok { + break new_states; + } + std::thread::sleep(std::time::Duration::from_millis(100)); + retry += 1; + if retry >= 30 { + panic!("states not as expect after timeout") + } + } +} + pub fn force_compact_log( cluster: &mut Cluster, key: &[u8], diff --git a/proxy_tests/proxy/region.rs b/proxy_tests/proxy/region.rs index c22f4e446eb..2924d08a619 100644 --- a/proxy_tests/proxy/region.rs +++ b/proxy_tests/proxy/region.rs @@ -407,6 +407,8 @@ fn recover_from_peer(cluster: &Cluster, from: u64, to: u64, region_ &source_region, region, new_region_meta.clone(), + true, + true, ) .unwrap(); } else { diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version index 813937fde26..919198786b8 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version @@ -1,3 +1,3 @@ #pragma once #include -namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 13418513559228271669ull; } \ No newline at end of file +namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 8449819960368956227ull; } \ No newline at end of file diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h 
b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h index 67a173469ed..bc55604fcc7 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h @@ -143,14 +143,20 @@ enum class KVGetStatus : uint32_t { NotFound, }; -enum class FastAddPeerRes : uint32_t { +enum class FastAddPeerStatus : uint32_t { Ok = 0, + WaitForData, OtherError, NoSuitable, BadData, FailedInject, }; +struct FastAddPeerRes { + FastAddPeerStatus status; + CppStrWithView apply_state; +}; + struct RaftStoreProxyFFIHelper { RaftStoreProxyPtr proxy_ptr; RaftProxyStatus (*fn_handle_get_proxy_status)(RaftStoreProxyPtr); From f529bce3917a8633cde790f3b102f29b5b9236fd Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Fri, 9 Dec 2022 20:08:24 +0800 Subject: [PATCH 007/115] do not copy all meta Signed-off-by: CalvinNeo --- components/raftstore/src/router.rs | 1 - components/raftstore/src/store/fsm/peer.rs | 42 +--- .../raftstore/src/store/peer_storage.rs | 9 +- components/raftstore/src/store/snap.rs | 7 +- components/test_raftstore/src/router.rs | 2 - engine_store_ffi/src/interfaces.rs | 4 +- engine_store_ffi/src/lib.rs | 4 +- engine_store_ffi/src/observer.rs | 98 ++++---- new-mock-engine-store/src/lib.rs | 222 +++++++++++------- new-mock-engine-store/src/node.rs | 1 - proxy_tests/proxy/proxy.rs | 44 ++-- proxy_tests/proxy/region.rs | 1 + .../ffi/src/RaftStoreProxyFFI/@version | 2 +- .../ffi/src/RaftStoreProxyFFI/ProxyFFI.h | 3 +- 14 files changed, 234 insertions(+), 206 deletions(-) diff --git a/components/raftstore/src/router.rs b/components/raftstore/src/router.rs index 4e1eb4fe2f1..1ded8be3886 100644 --- a/components/raftstore/src/router.rs +++ b/components/raftstore/src/router.rs @@ -267,7 +267,6 @@ pub fn handle_send_error(region_id: u64, e: TrySendError) -> RaftStoreErro impl RaftStoreRouter for RaftRouter { fn send_raft_msg(&self, msg: RaftMessage) -> RaftStoreResult<()> { - tikv_util::debug!("!!!!! 
RaftStoreRouter::send_raft_msg"); let region_id = msg.get_region_id(); self.send_raft_message(msg) .map_err(|e| handle_send_error(region_id, e)) diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 23624c1dc5c..d039a8cd6e5 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2443,12 +2443,6 @@ where "to_peer_id" => msg.get_to_peer().get_id(), ); - tikv_util::debug!("!!!!! on_raft_message after check 0 {:?}", msg.get_message().get_msg_type(); - "region_id" => self.region_id(), - "peer_id" => self.fsm.peer_id(), - "self.fsm.stopped" => self.fsm.stopped, - "commit" => msg.get_message().get_commit(), - ); if self.fsm.peer.pending_remove || self.fsm.stopped { return Ok(()); } @@ -2499,10 +2493,6 @@ where // TODO: spin off the I/O code (delete_snapshot) let regions_to_destroy = match self.check_snapshot(&msg)? { Either::Left(key) => { - tikv_util::debug!("!!!!! on_raft_message after check 3.1"; - "region_id" => self.region_id(), - "peer_id" => self.fsm.peer_id() - ); if let Some(key) = key { // If the snapshot file is not used again, then it's OK to // delete them here. If the snapshot file will be reused when @@ -2513,19 +2503,9 @@ where } return Ok(()); } - Either::Right(v) => { - tikv_util::debug!("!!!!! on_raft_message after check 3.2"; - "region_id" => self.region_id(), - "peer_id" => self.fsm.peer_id() - ); - v - } + Either::Right(v) => v, }; - tikv_util::debug!("!!!!! on_raft_message after check 4"; - "region_id" => self.region_id(), - "peer_id" => self.fsm.peer_id() - ); if util::is_vote_msg(msg.get_message()) || msg_type == MessageType::MsgTimeoutNow { if self.fsm.hibernate_state.group_state() != GroupState::Chaos { self.fsm.reset_hibernate_state(GroupState::Chaos); @@ -2552,28 +2532,14 @@ where self.ctx.raft_metrics.message_dropped.stale_msg.inc(); return Ok(()); } - let res = self.fsm.peer.step(self.ctx, msg.take_message()); - tikv_util::debug!("!!!!! 
on_raft_message after check 4.1"; - "region_id" => self.region_id(), - "peer_id" => self.fsm.peer_id() - ); - res + self.fsm.peer.step(self.ctx, msg.take_message()) }; stepped.set(result.is_ok()); - tikv_util::debug!("!!!!! on_raft_message after check 5"; - "region_id" => self.region_id(), - "peer_id" => self.fsm.peer_id(), - "result" => ?result - ); if is_snapshot { if !self.fsm.peer.has_pending_snapshot() { // This snapshot is rejected by raft-rs. - tikv_util::debug!("!!!!! on_raft_message after check 5.1"; - "region_id" => self.region_id(), - "peer_id" => self.fsm.peer_id() - ); let mut meta = self.ctx.store_meta.lock().unwrap(); meta.pending_snapshot_regions .retain(|r| self.fsm.region_id() != r.get_id()); @@ -2584,10 +2550,6 @@ where // region after applying that snapshot. // But if `regions_to_destroy` is not empty, the pending snapshot must be this // msg's snapshot because this kind of snapshot is exclusive. - tikv_util::debug!("!!!!! on_raft_message after check 5.2"; - "region_id" => self.region_id(), - "peer_id" => self.fsm.peer_id() - ); self.destroy_regions_for_snapshot(regions_to_destroy); } } diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 7346be77fbe..62ee8549bbd 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -206,10 +206,6 @@ fn init_apply_state( state.set_index(RAFT_INIT_LOG_INDEX); state.set_term(RAFT_INIT_LOG_TERM); } - debug!( - "!!!!! init_raft_state {}", - util::is_region_initialized(region) - ); apply_state } }, @@ -863,6 +859,10 @@ where } pub fn schedule_applying_snapshot(&mut self) { + debug!( + "!!!!!! 
schedule_applying_snapshot {:?}", + std::backtrace::Backtrace::capture() + ); let status = Arc::new(AtomicUsize::new(JOB_STATUS_PENDING)); self.set_snap_state(SnapState::Applying(Arc::clone(&status))); let task = RegionTask::Apply { @@ -925,6 +925,7 @@ where // and has not applied snapshot yet, so skip persistent hard state. if self.raft_state().get_last_index() > 0 { if let Some(hs) = ready.hs() { + debug!("!!!! apply_snapshot set hard state"); self.raft_state_mut().set_hard_state(hs.clone()); } } diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index e05e98a9bf7..e5e85347a24 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -432,7 +432,7 @@ pub struct Snapshot { key: SnapKey, display_path: String, dir_path: PathBuf, - pub cf_files: Vec, + cf_files: Vec, cf_index: usize, cf_file_index: usize, pub meta_file: MetaFile, @@ -644,7 +644,6 @@ impl Snapshot { // new file at the temporary meta file path, so that all other try will fail. fn init_for_building(&mut self) -> RaftStoreResult<()> { if self.exists() { - debug!("!!!!! init_for_building exists"); return Ok(()); } let file = OpenOptions::new() @@ -874,10 +873,6 @@ impl Snapshot { for (cf_enum, cf) in SNAPSHOT_CFS_ENUM_PAIR { self.switch_to_cf_file(cf)?; let cf_file = &mut self.cf_files[self.cf_index]; - info!( - "!!!!! buuild {:?} {} {} {:?}", - cf_file.path, cf_file.file_prefix, cf_file.file_suffix, cf_file.file_names - ); let cf_stat = if plain_file_used(cf_file.cf) { let key_mgr = self.mgr.encryption_key_manager.as_ref(); snap_io::build_plain_cf_file::(cf_file, key_mgr, kv_snap, &begin_key, &end_key)? 
diff --git a/components/test_raftstore/src/router.rs b/components/test_raftstore/src/router.rs index 7fd978f65a1..3b6b1e962c3 100644 --- a/components/test_raftstore/src/router.rs +++ b/components/test_raftstore/src/router.rs @@ -58,7 +58,6 @@ impl ProposalRouter for MockRaftStoreRouter { impl CasualRouter for MockRaftStoreRouter { fn send(&self, region_id: u64, msg: CasualMessage) -> RaftStoreResult<()> { - debug!("!!!!! MockRaftStoreRouter"); let mut senders = self.senders.lock().unwrap(); if let Some(tx) = senders.get_mut(®ion_id) { tx.try_send(PeerMsg::CasualMessage(msg)) @@ -75,7 +74,6 @@ impl SignificantRouter for MockRaftStoreRouter { region_id: u64, msg: SignificantMsg, ) -> RaftStoreResult<()> { - debug!("!!!!! MockRaftStoreRouter"); let mut senders = self.senders.lock().unwrap(); if let Some(tx) = senders.get_mut(®ion_id) { tx.force_send(PeerMsg::SignificantMsg(msg)).unwrap(); diff --git a/engine_store_ffi/src/interfaces.rs b/engine_store_ffi/src/interfaces.rs index c0edbc4a04e..78c2a31c280 100644 --- a/engine_store_ffi/src/interfaces.rs +++ b/engine_store_ffi/src/interfaces.rs @@ -245,6 +245,7 @@ pub mod root { pub struct FastAddPeerRes { pub status: root::DB::FastAddPeerStatus, pub apply_state: root::DB::CppStrWithView, + pub region: root::DB::CppStrWithView, } #[repr(C)] #[derive(Debug)] @@ -478,10 +479,11 @@ pub mod root { unsafe extern "C" fn( arg1: *mut root::DB::EngineStoreServerWrap, region_id: u64, + new_peer_id: u64, ) -> root::DB::FastAddPeerRes, >, } - pub const RAFT_STORE_PROXY_VERSION: u64 = 8449819960368956227; + pub const RAFT_STORE_PROXY_VERSION: u64 = 7429771182224851884; pub const RAFT_STORE_PROXY_MAGIC_NUMBER: u32 = 324508639; } } diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index 6def49017dc..ea3434dcd6a 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -1168,9 +1168,9 @@ impl EngineStoreServerHelper { unsafe { (self.fn_debug_func.into_inner())(self.inner, debug_type, ptr) } } - pub 
fn fast_add_peer(&self, region_id: u64) -> FastAddPeerRes { + pub fn fast_add_peer(&self, region_id: u64, new_peer_id: u64) -> FastAddPeerRes { debug_assert!(self.fn_fast_add_peer.is_some()); - unsafe { (self.fn_fast_add_peer.into_inner())(self.inner, region_id) } + unsafe { (self.fn_fast_add_peer.into_inner())(self.inner, region_id, new_peer_id) } } } diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index fe348a0d66e..c7331732bd1 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -180,6 +180,23 @@ fn unhash_u64(mut i: u64) -> u64 { i ^ (i >> 30) ^ (i >> 60) } +pub fn validate_remote_peer_region( + new_region: &kvproto::metapb::Region, + store_id: u64, + new_peer_id: u64, +) -> bool { + match find_peer(new_region, store_id) { + Some(peer) => { + if peer.get_id() != new_peer_id { + false + } else { + true + } + } + None => false, + } +} + impl TiFlashObserver { #[inline] fn slot_index(id: u64) -> usize { @@ -237,6 +254,7 @@ impl TiFlashObserver { return false; } let region_id = msg.get_region_id(); + let new_peer_id = msg.get_to_peer().get_id(); let mut is_first = false; let mut is_replicated = false; let f = |info: MapEntry>| { @@ -290,12 +308,14 @@ impl TiFlashObserver { } } - info!("fast path: ongoing {}:{}, start load", self.store_id, region_id; + info!("fast path: ongoing {}:{}, fetch data from remote peer", self.store_id, region_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), ); // Feed data - let res = self.engine_store_server_helper.fast_add_peer(region_id); + let res = self + .engine_store_server_helper + .fast_add_peer(region_id, new_peer_id); match res.status { crate::FastAddPeerStatus::Ok => (), crate::FastAddPeerStatus::WaitForData => { @@ -315,15 +335,23 @@ impl TiFlashObserver { } }; - info!("fast path: ongoing {}:{}, start buid and send", self.store_id, region_id; + info!("fast path: ongoing {}:{}, parse", self.store_id, region_id; 
"to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), ); let apply_state_str = res.apply_state.view.to_slice(); + let region_str = res.region.view.to_slice(); let mut apply_state = RaftApplyState::default(); - apply_state.merge_from_bytes(apply_state_str); - match self.build_and_send_snapshot(region_id, msg.get_to_peer().get_id(), msg, apply_state) - { + let mut new_region = kvproto::metapb::Region::default(); + apply_state.merge_from_bytes(apply_state_str).unwrap(); + new_region.merge_from_bytes(region_str).unwrap(); + info!("fast path: ongoing {}:{}, start build and send", self.store_id, region_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "new_region" => ?new_region, + "apply_state" => ?apply_state, + ); + match self.build_and_send_snapshot(region_id, new_peer_id, msg, apply_state, new_region) { Ok(s) => { match s { crate::FastAddPeerStatus::Ok => { @@ -365,35 +393,24 @@ impl TiFlashObserver { new_peer_id: u64, msg: &RaftMessage, apply_state: RaftApplyState, + new_region: kvproto::metapb::Region, ) -> RaftStoreResult { let inner_msg = msg.get_message(); // Build snapshot by get_snapshot_for_building - let (mut snap, key, region_state) = { - let region_state: RegionLocalState = match self - .engine - .get_msg_cf(CF_RAFT, &keys::region_state_key(region_id))? - { - Some(e) => e, - None => return Ok(crate::FastAddPeerStatus::BadData), - }; - + let (mut snap, key) = { // check if the source already knows the know peer - let has_peer = match find_peer(region_state.get_region(), self.store_id) { - Some(peer) => { - if peer.get_id() != new_peer_id { - false - } else { - true - } - } - None => false, - }; - if !has_peer { - warn!( - "build_and_send_snapshot remote peer has not applied conf change {:?}", - region_state.get_region() + if !validate_remote_peer_region(&new_region, self.store_id, new_peer_id) { + info!( + "fast path: ongoing {}:{}. 
remote peer has not applied conf change for {}", + self.store_id, region_id, new_peer_id; + "region" => ?new_region, ); return Ok(crate::FastAddPeerStatus::WaitForData); + } else { + info!( + "fast path: ongoing {}:{}. remote peer has applied conf change for {}", + self.store_id, region_id, new_peer_id + ); } // Find term of entry at applied_index. @@ -402,9 +419,10 @@ impl TiFlashObserver { Some(apply_entry) => apply_entry.get_term(), None => { return Err(box_err!( - "can't find entry for applied_index {} of region {}", + "can't find entry for applied_index {} of region {}, peer_id: {}", applied_index, - region_id + region_id, + new_peer_id )); } }; @@ -415,16 +433,16 @@ impl TiFlashObserver { ); self.snap_mgr.register(key.clone(), SnapEntry::Generating); defer!(self.snap_mgr.deregister(&key, &SnapEntry::Generating)); - let snapshot = self.snap_mgr.get_empty_snapshot_for_building(&key)?; + let snapshot = self.snap_mgr.get_snapshot_for_building(&key)?; - (snapshot, key.clone(), region_state) + (snapshot, key.clone()) }; debug!( "!!!!! 
snap 1 {:?} {:?} {}", snap, snap.meta_file, - snap.cf_files.len() + snap.cf_files().len() ); // Build snapshot by do_snapshot let mut pb_snapshot: eraftpb::Snapshot = Default::default(); @@ -434,12 +452,12 @@ impl TiFlashObserver { // eraftpb::SnapshotMetadata for (cf_enum, cf) in raftstore::store::snap::SNAPSHOT_CFS_ENUM_PAIR { let cf_index: RaftStoreResult = snap - .cf_files + .cf_files() .iter() .position(|x| &x.cf == cf) .ok_or(box_err!("can't find index for cf {}", cf)); let cf_index = cf_index?; - let cf_file = &mut snap.cf_files[cf_index]; + let cf_file = &snap.cf_files()[cf_index]; let mut path = cf_file.path.clone(); path.push(cf_file.file_prefix.clone()); path.set_extension("sst"); @@ -450,7 +468,7 @@ impl TiFlashObserver { let mut file = std::fs::File::create(path.as_path())?; // let mut file = std::fs::create_dir(); } - snap_data.set_region(region_state.get_region().clone()); + snap_data.set_region(new_region.clone()); snap_data.set_file_size(0); let SNAPSHOT_VERSION = 2; snap_data.set_version(SNAPSHOT_VERSION); @@ -458,7 +476,7 @@ impl TiFlashObserver { // SnapshotMeta // Which is snap.meta_file.meta let mut snapshot_meta = - raftstore::store::snap::gen_snapshot_meta(&snap.cf_files[..], true)?; + raftstore::store::snap::gen_snapshot_meta(&snap.cf_files()[..], true)?; // Write MetaFile { @@ -474,7 +492,7 @@ impl TiFlashObserver { snap.meta_file.meta, snap.meta_file.file, snap.meta_file.path, - snap.cf_files.len() + snap.cf_files().len() ); } @@ -495,7 +513,7 @@ impl TiFlashObserver { // Send reponse let mut response = RaftMessage::default(); use kvproto::metapb::RegionEpoch; - let mut epoch = region_state.get_region().get_region_epoch(); + let mut epoch = new_region.get_region_epoch(); response.set_region_epoch(epoch.clone()); response.set_region_id(region_id); response.set_from_peer(msg.get_from_peer().clone()); diff --git a/new-mock-engine-store/src/lib.rs b/new-mock-engine-store/src/lib.rs index 64d154c0d52..e6b923f2bbb 100644 --- 
a/new-mock-engine-store/src/lib.rs +++ b/new-mock-engine-store/src/lib.rs @@ -269,13 +269,22 @@ unsafe fn write_to_db_data( store: &mut EngineStoreServer, region: &mut Box, reason: String, +) { + let kv = &mut store.engines.as_mut().unwrap().kv; + write_to_db_data_by_engine(store.id, kv, region, reason) +} + +unsafe fn write_to_db_data_by_engine( + store_id: u64, + kv: &TiFlashEngine, + region: &mut Box, + reason: String, ) { info!("mock flush to engine"; "region" => ?region.region, - "store_id" => store.id, + "store_id" => store_id, "reason" => reason ); - let kv = &mut store.engines.as_mut().unwrap().kv; for cf in 0..3 { let pending_write = std::mem::take(region.pending_write.as_mut().get_mut(cf).unwrap()); let mut pending_remove = @@ -330,9 +339,9 @@ impl EngineStoreServerWrap { if region.apply_state.get_applied_index() >= header.index { // If it is a old entry. error!("obsolete admin index"; - "apply_state"=>?region.apply_state, - "header"=>?header, - "node_id"=>node_id, + "apply_state"=>?region.apply_state, + "header"=>?header, + "node_id"=>node_id, ); panic!("observe obsolete admin index"); // return ffi_interfaces::EngineStoreApplyRes::None; @@ -348,6 +357,7 @@ impl EngineStoreServerWrap { old_region.peer.get_store_id() }; + // Should we destroy ourself? 
let mut do_remove = true; if old_peer_id != 0 { for peer in new_region_meta.get_peers().iter() { @@ -1227,27 +1237,28 @@ unsafe extern "C" fn ffi_handle_compute_store_stats( unsafe extern "C" fn ffi_debug_func( arg1: *mut ffi_interfaces::EngineStoreServerWrap, - debug_type: u64, - ptr: ffi_interfaces::RawVoidPtr, + _debug_type: u64, + _ptr: ffi_interfaces::RawVoidPtr, ) -> ffi_interfaces::RawVoidPtr { std::ptr::null_mut() } -fn create_cpp_str(s: Option>) -> ffi_interfaces::CppStrWithView { +unsafe fn create_cpp_str(s: Option>) -> ffi_interfaces::CppStrWithView { match s { Some(s) => { let len = s.len() as u64; - let ptr = Box::into_raw(Box::new(s)); // leak - ffi_interfaces::CppStrWithView { + let ptr = Box::into_raw(Box::new(s.clone())); // leak + let s = ffi_interfaces::CppStrWithView { inner: ffi_interfaces::RawCppPtr { - ptr: ptr as *mut _, + ptr: ptr as RawVoidPtr, type_: RawCppPtrTypeImpl::String.into(), }, view: ffi_interfaces::BaseBuffView { - data: ptr as *mut _, + data: (*ptr).as_ptr() as *const _, len, }, - } + }; + s } None => ffi_interfaces::CppStrWithView { inner: ffi_interfaces::RawCppPtr { @@ -1265,20 +1276,12 @@ fn create_cpp_str(s: Option>) -> ffi_interfaces::CppStrWithView { unsafe extern "C" fn ffi_fast_add_peer( arg1: *mut ffi_interfaces::EngineStoreServerWrap, region_id: u64, + new_peer_id: u64, ) -> ffi_interfaces::FastAddPeerRes { let store = into_engine_store_server_wrap(arg1); let cluster = &*(store.cluster_ptr as *const mock_cluster::Cluster); - let lock = cluster.ffi_helper_set.lock(); - let guard = match lock { - Ok(e) => e, - Err(e) => { - error!("ffi_debug_func failed to lock"); - return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::OtherError, - apply_state: create_cpp_str(None), - }; - } - }; + let store_id = (*store.engine_store_server).id; + let from_store = (|| { fail::fail_point!("ffi_fast_add_peer_from_id", |t| { let t = t.unwrap().parse::().unwrap(); @@ -1286,44 +1289,95 @@ unsafe extern "C" fn 
ffi_fast_add_peer( }); 1 })(); - debug!("ffi_fast_add_peer from {}", from_store); - let source_server = match guard.get(&from_store) { - Some(s) => &s.engine_store_server, - None => { + debug!("recover from remote peer: enter from {} to {}", from_store, store_id; "region_id" => region_id); + + let lock = cluster.ffi_helper_set.lock(); + let mut guard = match lock { + Ok(e) => e, + Err(_) => { + error!("ffi_debug_func failed to lock"); return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::NoSuitable, + status: ffi_interfaces::FastAddPeerStatus::OtherError, apply_state: create_cpp_str(None), + region: create_cpp_str(None), }; } }; - let source_engines = match source_server.engines.clone() { - Some(s) => s, + debug!("recover from remote peer: preparing from {} to {}, persist and check source", from_store, store_id; "region_id" => region_id); + let source_server = match guard.get_mut(&from_store) { + Some(s) => &mut s.engine_store_server, None => { return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::BadData, + status: ffi_interfaces::FastAddPeerStatus::NoSuitable, apply_state: create_cpp_str(None), + region: create_cpp_str(None), }; } }; - let source_region = match source_server.kvstore.get(®ion_id) { + let source_engines = match source_server.engines.clone() { Some(s) => s, None => { + error!("recover from remote peer: failed get source engine"; "region_id" => region_id); return ffi_interfaces::FastAddPeerRes { status: ffi_interfaces::FastAddPeerStatus::BadData, apply_state: create_cpp_str(None), + region: create_cpp_str(None), }; } }; + // TODO We must ask the remote peer to persist before get a snapshot. 
- let new_region_meta = match get_region_local_state(&source_engines.kv.rocks, region_id) { - Some(s) => s.get_region().clone(), + // { + // if let Some(s) = source_server.kvstore.get_mut(®ion_id) { + // write_to_db_data_by_engine(0, &source_engines.kv, s, "fast add + // peer".to_string()); } else { + // error!("recover from remote peer: failed persist source region"; + // "region_id" => region_id); return ffi_interfaces::FastAddPeerRes + // { status: ffi_interfaces::FastAddPeerStatus::BadData, + // apply_state: create_cpp_str(None), + // region: create_cpp_str(None), + // }; + // } + // } + let source_region = match source_server.kvstore.get(®ion_id) { + Some(s) => s, None => { + error!("recover from remote peer: failed read source region info"; "region_id" => region_id); return ffi_interfaces::FastAddPeerRes { status: ffi_interfaces::FastAddPeerStatus::BadData, apply_state: create_cpp_str(None), + region: create_cpp_str(None), }; } }; + let region_local_state: RegionLocalState = + match general_get_region_local_state(&source_engines.kv, region_id) { + Some(x) => x, + None => { + // We don't return BadData here, since the data may not be persisted. 
+ return ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::WaitForData, + apply_state: create_cpp_str(None), + region: create_cpp_str(None), + }; + } + }; + let new_region_meta = region_local_state.get_region(); + + debug!("recover from remote peer: preparing from {} to {}, check if conf change {}", from_store, store_id, new_peer_id; "region_id" => region_id); + if !engine_store_ffi::observer::validate_remote_peer_region( + new_region_meta, + store_id, + new_peer_id, + ) { + return ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::WaitForData, + apply_state: create_cpp_str(None), + region: create_cpp_str(None), + }; + } + + debug!("recover from remote peer: preparing from {} to {}, check target", from_store, store_id; "region_id" => region_id); let new_region = make_new_region( Some(new_region_meta.clone()), Some((*store.engine_store_server).id), @@ -1337,6 +1391,7 @@ unsafe extern "C" fn ffi_fast_add_peer( return ffi_interfaces::FastAddPeerRes { status: ffi_interfaces::FastAddPeerStatus::OtherError, apply_state: create_cpp_str(None), + region: create_cpp_str(None), }; } }; @@ -1346,10 +1401,26 @@ unsafe extern "C" fn ffi_fast_add_peer( return ffi_interfaces::FastAddPeerRes { status: ffi_interfaces::FastAddPeerStatus::BadData, apply_state: create_cpp_str(None), + region: create_cpp_str(None), + }; + } + }; + debug!("recover from remote peer: meta from {} to {}", from_store, store_id; "region_id" => region_id); + // Must first dump meta then data, otherwise data may lag behind. + // We can see a raft log hole at applied_index otherwise. 
+ let apply_state: RaftApplyState = match general_get_apply_state(&source_engines.kv, region_id) { + Some(x) => x, + None => { + error!("recover from remote peer: failed read apply state"; "region_id" => region_id); + return ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::BadData, + apply_state: create_cpp_str(None), + region: create_cpp_str(None), }; } }; - debug!("recover from remote peer: data"; "region_id" => region_id); + + debug!("recover from remote peer: data from {} to {}", from_store, store_id; "region_id" => region_id); if let Err(_) = copy_data_from( &source_engines, &target_engines, @@ -1359,44 +1430,24 @@ unsafe extern "C" fn ffi_fast_add_peer( return ffi_interfaces::FastAddPeerRes { status: ffi_interfaces::FastAddPeerStatus::FailedInject, apply_state: create_cpp_str(None), + region: create_cpp_str(None), }; } - debug!("recover from remote peer: meta"; "region_id" => region_id); - // Do not copy apply state, otherwise may race with `apply_snapshot` - if let Err(_) = copy_meta_from( - &source_engines, - &target_engines, - &source_region, - target_region, - new_region_meta, - true, - false, - ) { - return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::FailedInject, - apply_state: create_cpp_str(None), - }; - } - let apply_state: RaftApplyState = match general_get_apply_state(&source_engines.kv, region_id) { - Some(x) => x, - None => { - return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::BadData, - apply_state: create_cpp_str(None), - }; - } - }; - let apply_state_bytes = apply_state.write_to_bytes().unwrap(); - debug!("recover from remote peer: ok"; "region_id" => region_id); + let apply_state_bytes = apply_state.write_to_bytes().unwrap(); + let region_bytes = region_local_state.get_region().write_to_bytes().unwrap(); + let apply_state_ptr = create_cpp_str(Some(apply_state_bytes)); + let region_ptr = create_cpp_str(Some(region_bytes)); + debug!("recover from 
remote peer: ok from {} to {}", from_store, store_id; "region_id" => region_id); ffi_interfaces::FastAddPeerRes { status: ffi_interfaces::FastAddPeerStatus::Ok, - apply_state: create_cpp_str(Some(apply_state_bytes)), + apply_state: apply_state_ptr, + region: region_ptr, } } use engine_store_ffi::RawVoidPtr; -use engine_traits::{KvEngine, Mutable, RaftEngine, RaftEngineDebug, RaftLogBatch, WriteBatch}; +use engine_traits::{KvEngine, Mutable, RaftEngine, RaftLogBatch, WriteBatch}; use kvproto::raft_serverpb::RaftLocalState; use tikv_util::{box_err, box_try}; @@ -1453,10 +1504,7 @@ pub fn get_raft_local_state( } } -pub fn copy_meta_from< - EK: engine_traits::KvEngine, - ER: RaftEngine + engine_traits::Peekable + RaftEngineDebug, ->( +pub fn copy_meta_from( source_engines: &Engines, target_engines: &Engines, source: &Box, @@ -1464,11 +1512,11 @@ pub fn copy_meta_from< new_region_meta: kvproto::metapb::Region, copy_region_state: bool, copy_apply_state: bool, + copy_raft_state: bool, ) -> raftstore::Result<()> { let region_id = source.region.get_id(); let mut wb = target_engines.kv.write_batch(); - let mut raft_wb = target_engines.raft.log_batch(1024); // Can't copy this key, otherwise will cause a bootstrap. 
// box_try!(wb.put_msg(keys::PREPARE_BOOTSTRAP_KEY, &source.region)); @@ -1495,8 +1543,9 @@ pub fn copy_meta_from< wb.write()?; target_engines.sync_kv()?; + let mut raft_wb = target_engines.raft.log_batch(1024); // raft state - { + if copy_raft_state { let raft_state = match get_raft_local_state(&source_engines.raft, region_id) { Some(x) => x, None => return Err(box_err!("bad RaftLocalState")), @@ -1504,33 +1553,34 @@ pub fn copy_meta_from< raft_wb.put_raft_state(region_id, &raft_state)?; }; - // raft log - let mut entries: Vec = Default::default(); - source_engines.raft.scan_entries(region_id, |e| { - debug!("copy raft log"; "e" => ?e); - entries.push(e.clone()); - Ok(true) - })?; - - raft_wb.append(region_id, entries)?; - box_try!(target_engines.raft.consume(&mut raft_wb, true)); - Ok(()) } pub fn copy_data_from( - source_engines: &Engines< - impl KvEngine, - impl RaftEngine + engine_traits::Peekable + RaftEngineDebug, - >, + source_engines: &Engines, target_engines: &Engines, source: &Box, target: &mut Box, ) -> raftstore::Result<()> { + let region_id = source.region.get_id(); + + // kv data in memory for cf in 0..3 { for (k, v) in &source.data[cf] { write_kv_in_mem(target, cf, k.as_slice(), v.as_slice()); } } + + // raft log + let mut raft_wb = target_engines.raft.log_batch(1024); + let mut entries: Vec = Default::default(); + source_engines + .raft + .get_all_entries_to(region_id, &mut entries) + .unwrap(); + debug!("copy raft log {:?}", entries); + + raft_wb.append(region_id, entries)?; + box_try!(target_engines.raft.consume(&mut raft_wb, true)); Ok(()) } diff --git a/new-mock-engine-store/src/node.rs b/new-mock-engine-store/src/node.rs index c25bee26f8a..4e6c51d7ffb 100644 --- a/new-mock-engine-store/src/node.rs +++ b/new-mock-engine-store/src/node.rs @@ -135,7 +135,6 @@ impl Transport for ChannelTransport { match core.routers.get(&to_store) { Some(h) => { - debug!("!!!!! 
ChannelTransport send {} msg {:?}", to_store, msg); h.send_raft_msg(msg)?; if is_snapshot { // should report snapshot finish. diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index 97e8718b6d2..6ec39a637d7 100644 --- a/proxy_tests/proxy/proxy.rs +++ b/proxy_tests/proxy/proxy.rs @@ -153,7 +153,8 @@ pub fn new_mock_cluster_snap(id: u64, count: usize) -> (Cluster, Ar } pub fn must_get_mem( - engine_store_server: &Box, + cluster: &Cluster, + node_id: u64, region_id: u64, key: &[u8], value: Option<&[u8]>, @@ -161,15 +162,24 @@ pub fn must_get_mem( let last_res: Option<&Vec> = None; let cf = new_mock_engine_store::ffi_interfaces::ColumnFamilyType::Default; for _ in 1..300 { - let res = engine_store_server.get_mem(region_id, cf, &key.to_vec()); + { + let lock = cluster.ffi_helper_set.lock(); + match lock { + Ok(l) => { + let server = &l.get(&node_id).unwrap().engine_store_server; + let res = server.get_mem(region_id, cf, &key.to_vec()); + if let (Some(value), Some(last_res)) = (value, res) { + assert_eq!(value, &last_res[..]); + return; + } + if value.is_none() && last_res.is_none() { + return; + } + } + Err(_) => std::process::exit(1), + } + }; - if let (Some(value), Some(last_res)) = (value, res) { - assert_eq!(value, &last_res[..]); - return; - } - if value.is_none() && last_res.is_none() { - return; - } std::thread::sleep(std::time::Duration::from_millis(20)); } let s = std::str::from_utf8(key).unwrap_or(""); @@ -178,7 +188,7 @@ pub fn must_get_mem( value.map(tikv_util::escape), log_wrappers::hex_encode_upper(key), s, - engine_store_server.id, + node_id, cf, last_res, ); @@ -251,7 +261,7 @@ pub fn check_key( } }; for id in engine_keys { - let engine = &cluster.get_engine(id); + let engine = cluster.get_engine(id); match in_disk { Some(b) => { @@ -265,18 +275,10 @@ pub fn check_key( }; match in_mem { Some(b) => { - let lock = match cluster.ffi_helper_set.lock() { - Ok(l) => l, - Err(e) => { - error!("check_key poison"); - std::process::exit(1); - } 
- }; - let server = &lock.get(&id).unwrap().engine_store_server; if b { - must_get_mem(server, region_id, k, Some(v)); + must_get_mem(cluster, id, region_id, k, Some(v)); } else { - must_get_mem(server, region_id, k, None); + must_get_mem(cluster, id, region_id, k, None); } } None => (), diff --git a/proxy_tests/proxy/region.rs b/proxy_tests/proxy/region.rs index 2924d08a619..03894e36007 100644 --- a/proxy_tests/proxy/region.rs +++ b/proxy_tests/proxy/region.rs @@ -409,6 +409,7 @@ fn recover_from_peer(cluster: &Cluster, from: u64, to: u64, region_ new_region_meta.clone(), true, true, + true, ) .unwrap(); } else { diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version index 919198786b8..a932e40b568 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version @@ -1,3 +1,3 @@ #pragma once #include -namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 8449819960368956227ull; } \ No newline at end of file +namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 7429771182224851884ull; } \ No newline at end of file diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h index bc55604fcc7..954b85c7ca4 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h @@ -155,6 +155,7 @@ enum class FastAddPeerStatus : uint32_t { struct FastAddPeerRes { FastAddPeerStatus status; CppStrWithView apply_state; + CppStrWithView region; }; struct RaftStoreProxyFFIHelper { @@ -232,6 +233,6 @@ struct EngineStoreServerHelper { RawVoidPtr (*fn_debug_func)(EngineStoreServerWrap *, uint64_t type, RawVoidPtr); FastAddPeerRes (*fn_fast_add_peer)(EngineStoreServerWrap *, - uint64_t region_id); + uint64_t region_id, uint64_t new_peer_id); }; } // namespace DB From 9da03d168eb529f24449dcf5cb757497622c2215 Mon Sep 17 00:00:00 2001 From: 
CalvinNeo Date: Fri, 9 Dec 2022 20:34:53 +0800 Subject: [PATCH 008/115] fmt Signed-off-by: CalvinNeo --- components/raftstore/src/store/peer.rs | 1 - engine_store_ffi/src/observer.rs | 58 +++++++++---------- new-mock-engine-store/src/lib.rs | 78 +++++++++----------------- 3 files changed, 52 insertions(+), 85 deletions(-) diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 374b371116f..cffb7e40a9a 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -1773,7 +1773,6 @@ where snap_data .merge_from_bytes(msg.get_message().get_snapshot().get_data()) .unwrap(); - debug!("!!!! send snapshot {:?} XXXXXXX {:?}", msg, snap_data); let snap_index = msg.get_message().get_snapshot().get_metadata().get_index(); if snap_index > self.last_sent_snapshot_idx { self.last_sent_snapshot_idx = snap_index; diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index c7331732bd1..052d8087811 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -225,7 +225,7 @@ impl TiFlashObserver { MapEntry::Occupied(mut o) => { o.get_mut().inited_or_fallback.store(v, Ordering::SeqCst); } - MapEntry::Vacant(v) => { + MapEntry::Vacant(_) => { tikv_util::safe_panic!("not inited!"); } }, @@ -378,7 +378,7 @@ impl TiFlashObserver { Err(e) => { error!("fast path: ongoing {}:{} failed. 
build and sent snapshot error {:?}", self.store_id, region_id, e; "is_first" => is_first,); - if let Err(e) = self.set_inited_or_fallback(region_id, true) { + if let Err(_) = self.set_inited_or_fallback(region_id, true) { tikv_util::safe_panic!("set_inited_or_fallback"); } return false; @@ -397,7 +397,7 @@ impl TiFlashObserver { ) -> RaftStoreResult { let inner_msg = msg.get_message(); // Build snapshot by get_snapshot_for_building - let (mut snap, key) = { + let (snap, key) = { // check if the source already knows the know peer if !validate_remote_peer_region(&new_region, self.store_id, new_peer_id) { info!( @@ -426,11 +426,7 @@ impl TiFlashObserver { )); } }; - let key = SnapKey::new( - region_id, - applied_term, // TODO apply index term - applied_index, - ); + let key = SnapKey::new(region_id, applied_term, applied_index); self.snap_mgr.register(key.clone(), SnapEntry::Generating); defer!(self.snap_mgr.deregister(&key, &SnapEntry::Generating)); let snapshot = self.snap_mgr.get_snapshot_for_building(&key)?; @@ -450,7 +446,7 @@ impl TiFlashObserver { let mut snap_data = kvproto::raft_serverpb::RaftSnapshotData::default(); { // eraftpb::SnapshotMetadata - for (cf_enum, cf) in raftstore::store::snap::SNAPSHOT_CFS_ENUM_PAIR { + for (_, cf) in raftstore::store::snap::SNAPSHOT_CFS_ENUM_PAIR { let cf_index: RaftStoreResult = snap .cf_files() .iter() @@ -465,8 +461,7 @@ impl TiFlashObserver { "!!!! 
snap g cf_file.path {:?} {:?} {:?}", cf_file.path, cf_file.file_prefix, path ); - let mut file = std::fs::File::create(path.as_path())?; - // let mut file = std::fs::create_dir(); + let mut _file = std::fs::File::create(path.as_path())?; } snap_data.set_region(new_region.clone()); snap_data.set_file_size(0); @@ -475,7 +470,7 @@ impl TiFlashObserver { // SnapshotMeta // Which is snap.meta_file.meta - let mut snapshot_meta = + let snapshot_meta = raftstore::store::snap::gen_snapshot_meta(&snap.cf_files()[..], true)?; // Write MetaFile @@ -513,7 +508,7 @@ impl TiFlashObserver { // Send reponse let mut response = RaftMessage::default(); use kvproto::metapb::RegionEpoch; - let mut epoch = new_region.get_region_epoch(); + let epoch = new_region.get_region_epoch(); response.set_region_epoch(epoch.clone()); response.set_region_id(region_id); response.set_from_peer(msg.get_from_peer().clone()); @@ -528,9 +523,10 @@ impl TiFlashObserver { key, response, snap_data ); match self.trans.lock() { - Ok(mut trans) => { - let res = trans.send(response); - } + Ok(mut trans) => match trans.send(response) { + Ok(_) | Err(RaftStoreError::RegionNotFound(_)) => (), + _ => return Ok(crate::FastAddPeerStatus::OtherError), + }, Err(e) => return Err(box_err!("send snapshot meets error {:?}", e)), } @@ -1095,19 +1091,16 @@ impl RegionChangeObserver for TiFlashObs } fn on_peer_created(&self, region_id: u64) { - let mut f = |info: MapEntry>| { - debug!("!!!! 
on_peer_created"); - match info { - MapEntry::Occupied(mut o) => { - o.get_mut() - .replicated_or_created - .store(true, Ordering::SeqCst); - } - MapEntry::Vacant(v) => { - let mut c = CachedRegionInfo::default(); - c.replicated_or_created.store(true, Ordering::SeqCst); - v.insert(Arc::new(c)); - } + let f = |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + o.get_mut() + .replicated_or_created + .store(true, Ordering::SeqCst); + } + MapEntry::Vacant(v) => { + let c = CachedRegionInfo::default(); + c.replicated_or_created.store(true, Ordering::SeqCst); + v.insert(Arc::new(c)); } }; // TODO remove unwrap @@ -1222,7 +1215,7 @@ impl ApplySnapshotObserver for TiFlashOb { let mut lock = match self.pre_handle_snapshot_ctx.lock() { Ok(l) => l, - Err(e) => fatal!("pre_apply_snapshot poisoned"), + Err(_) => fatal!("pre_apply_snapshot poisoned"), }; let ctx = lock.deref_mut(); ctx.tracer.insert(snap_key.clone(), task.clone()); @@ -1298,7 +1291,7 @@ impl ApplySnapshotObserver for TiFlashOb } }, ) { - Err(e) => fatal!("post_apply_snapshot poisoned"), + Err(_) => fatal!("post_apply_snapshot poisoned"), _ => (), }; let snap = match snap { @@ -1308,7 +1301,7 @@ impl ApplySnapshotObserver for TiFlashOb let maybe_snapshot = { let mut lock = match self.pre_handle_snapshot_ctx.lock() { Ok(l) => l, - Err(e) => fatal!("post_apply_snapshot poisoned"), + Err(_) => fatal!("post_apply_snapshot poisoned"), }; let ctx = lock.deref_mut(); ctx.tracer.remove(snap_key) @@ -1382,7 +1375,6 @@ impl ApplySnapshotObserver for TiFlashOb "region" => ?ob_ctx.region(), "pending" => self.engine.pending_applies_count.load(Ordering::SeqCst), ); - let region_id = ob_ctx.region().get_id(); } } diff --git a/new-mock-engine-store/src/lib.rs b/new-mock-engine-store/src/lib.rs index e6b923f2bbb..733d3a2b4e1 100644 --- a/new-mock-engine-store/src/lib.rs +++ b/new-mock-engine-store/src/lib.rs @@ -1273,6 +1273,15 @@ unsafe fn create_cpp_str(s: Option>) -> ffi_interfaces::CppStrWithView { } } 
+macro_rules! unwrap_or_return { + ($e:expr, $res:expr) => { + match $e { + Some(x) => x, + None => return $res, + } + }; +} + unsafe extern "C" fn ffi_fast_add_peer( arg1: *mut ffi_interfaces::EngineStoreServerWrap, region_id: u64, @@ -1282,6 +1291,12 @@ unsafe extern "C" fn ffi_fast_add_peer( let cluster = &*(store.cluster_ptr as *const mock_cluster::Cluster); let store_id = (*store.engine_store_server).id; + let failed_add_peer_res = + |status: ffi_interfaces::FastAddPeerStatus| ffi_interfaces::FastAddPeerRes { + status, + apply_state: create_cpp_str(None), + region: create_cpp_str(None), + }; let from_store = (|| { fail::fail_point!("ffi_fast_add_peer_from_id", |t| { let t = t.unwrap().parse::().unwrap(); @@ -1296,33 +1311,21 @@ unsafe extern "C" fn ffi_fast_add_peer( Ok(e) => e, Err(_) => { error!("ffi_debug_func failed to lock"); - return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::OtherError, - apply_state: create_cpp_str(None), - region: create_cpp_str(None), - }; + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::OtherError); } }; debug!("recover from remote peer: preparing from {} to {}, persist and check source", from_store, store_id; "region_id" => region_id); let source_server = match guard.get_mut(&from_store) { Some(s) => &mut s.engine_store_server, None => { - return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::NoSuitable, - apply_state: create_cpp_str(None), - region: create_cpp_str(None), - }; + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::NoSuitable); } }; let source_engines = match source_server.engines.clone() { Some(s) => s, None => { error!("recover from remote peer: failed get source engine"; "region_id" => region_id); - return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::BadData, - apply_state: create_cpp_str(None), - region: create_cpp_str(None), - }; + return 
failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); } }; @@ -1343,11 +1346,7 @@ unsafe extern "C" fn ffi_fast_add_peer( Some(s) => s, None => { error!("recover from remote peer: failed read source region info"; "region_id" => region_id); - return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::BadData, - apply_state: create_cpp_str(None), - region: create_cpp_str(None), - }; + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); } }; let region_local_state: RegionLocalState = @@ -1355,11 +1354,7 @@ unsafe extern "C" fn ffi_fast_add_peer( Some(x) => x, None => { // We don't return BadData here, since the data may not be persisted. - return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::WaitForData, - apply_state: create_cpp_str(None), - region: create_cpp_str(None), - }; + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData); } }; let new_region_meta = region_local_state.get_region(); @@ -1370,11 +1365,7 @@ unsafe extern "C" fn ffi_fast_add_peer( store_id, new_peer_id, ) { - return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::WaitForData, - apply_state: create_cpp_str(None), - region: create_cpp_str(None), - }; + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData); } debug!("recover from remote peer: preparing from {} to {}, check target", from_store, store_id; "region_id" => region_id); @@ -1388,21 +1379,13 @@ unsafe extern "C" fn ffi_fast_add_peer( let target_engines = match (*store.engine_store_server).engines.clone() { Some(s) => s, None => { - return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::OtherError, - apply_state: create_cpp_str(None), - region: create_cpp_str(None), - }; + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::OtherError); } }; let target_region = match (*store.engine_store_server).kvstore.get_mut(®ion_id) { Some(s) => s, None => 
{ - return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::BadData, - apply_state: create_cpp_str(None), - region: create_cpp_str(None), - }; + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); } }; debug!("recover from remote peer: meta from {} to {}", from_store, store_id; "region_id" => region_id); @@ -1412,26 +1395,19 @@ unsafe extern "C" fn ffi_fast_add_peer( Some(x) => x, None => { error!("recover from remote peer: failed read apply state"; "region_id" => region_id); - return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::BadData, - apply_state: create_cpp_str(None), - region: create_cpp_str(None), - }; + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); } }; debug!("recover from remote peer: data from {} to {}", from_store, store_id; "region_id" => region_id); - if let Err(_) = copy_data_from( + if let Err(e) = copy_data_from( &source_engines, &target_engines, &source_region, target_region, ) { - return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::FailedInject, - apply_state: create_cpp_str(None), - region: create_cpp_str(None), - }; + error!("recover from remote peer: inject error {:?}", e; "region_id" => region_id); + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::FailedInject); } let apply_state_bytes = apply_state.write_to_bytes().unwrap(); From a652cbd568a44ca2e4681942c8e9e01b066fa3c9 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Mon, 12 Dec 2022 10:58:31 +0800 Subject: [PATCH 009/115] fast add peer Signed-off-by: CalvinNeo --- components/raftstore/src/store/fsm/store.rs | 1 - components/raftstore/src/store/snap.rs | 1 - engine_store_ffi/src/observer.rs | 2 +- new-mock-engine-store/src/lib.rs | 241 +++++++++++--------- proxy_tests/proxy/fast_add_peer.rs | 164 +++++++++++++ proxy_tests/proxy/proxy.rs | 1 - proxy_tests/proxy/region.rs | 1 - src/server/raft_client.rs | 1 - 8 files changed, 296 
insertions(+), 116 deletions(-) create mode 100644 proxy_tests/proxy/fast_add_peer.rs diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 4de2f05c9cf..68aa7ddb241 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -715,7 +715,6 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> StoreMsg::Tick(tick) => self.on_tick(tick), StoreMsg::RaftMessage(msg) => { if self.ctx.coprocessor_host.should_skip_raft_message(&msg.msg) { - debug!("!!!! store skip message"); continue; } if let Err(e) = self.on_raft_message(msg) { diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index d7d78f71df6..038d1d2a880 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -188,7 +188,6 @@ where // A helper function to copy snapshot. // Only used in tests. pub fn copy_snapshot(mut from: Box, mut to: Box) -> io::Result<()> { - debug!("!!!!! 
copy_snapshot {}", to.exists()); if !to.exists() { io::copy(&mut from, &mut to)?; to.save()?; diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 3a951d8260f..9a428c5acd3 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -259,7 +259,7 @@ impl TiFlashObserver { let mut is_replicated = false; let f = |info: MapEntry>| { match info { - MapEntry::Occupied(mut o) => { + MapEntry::Occupied(o) => { is_first = !o.get().inited_or_fallback.load(Ordering::SeqCst); // TODO include create is_replicated = o.get().replicated_or_created.load(Ordering::SeqCst); diff --git a/new-mock-engine-store/src/lib.rs b/new-mock-engine-store/src/lib.rs index 733d3a2b4e1..c6ecf870ca3 100644 --- a/new-mock-engine-store/src/lib.rs +++ b/new-mock-engine-store/src/lib.rs @@ -1304,122 +1304,148 @@ unsafe extern "C" fn ffi_fast_add_peer( }); 1 })(); + let block_wait: bool = (|| { + fail::fail_point!("ffi_fast_add_peer_block_wait", |t| { + let t = t.unwrap().parse::().unwrap(); + t + }); + 0 + })() != 0; debug!("recover from remote peer: enter from {} to {}", from_store, store_id; "region_id" => region_id); - let lock = cluster.ffi_helper_set.lock(); - let mut guard = match lock { - Ok(e) => e, - Err(_) => { - error!("ffi_debug_func failed to lock"); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::OtherError); - } - }; - debug!("recover from remote peer: preparing from {} to {}, persist and check source", from_store, store_id; "region_id" => region_id); - let source_server = match guard.get_mut(&from_store) { - Some(s) => &mut s.engine_store_server, - None => { - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::NoSuitable); - } - }; - let source_engines = match source_server.engines.clone() { - Some(s) => s, - None => { - error!("recover from remote peer: failed get source engine"; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); + for retry in 
0..300 { + if retry > 0 { + std::thread::sleep(std::time::Duration::from_millis(30)); } - }; + let lock = cluster.ffi_helper_set.lock(); + let mut guard = match lock { + Ok(e) => e, + Err(_) => { + error!("ffi_debug_func failed to lock"); + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::OtherError); + } + }; + debug!("recover from remote peer: preparing from {} to {}, persist and check source", from_store, store_id; "region_id" => region_id); + let source_server = match guard.get_mut(&from_store) { + Some(s) => &mut s.engine_store_server, + None => { + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::NoSuitable); + } + }; + let source_engines = match source_server.engines.clone() { + Some(s) => s, + None => { + error!("recover from remote peer: failed get source engine"; "region_id" => region_id); + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); + } + }; - // TODO We must ask the remote peer to persist before get a snapshot. - // { - // if let Some(s) = source_server.kvstore.get_mut(®ion_id) { - // write_to_db_data_by_engine(0, &source_engines.kv, s, "fast add - // peer".to_string()); } else { - // error!("recover from remote peer: failed persist source region"; - // "region_id" => region_id); return ffi_interfaces::FastAddPeerRes - // { status: ffi_interfaces::FastAddPeerStatus::BadData, - // apply_state: create_cpp_str(None), - // region: create_cpp_str(None), - // }; - // } - // } - let source_region = match source_server.kvstore.get(®ion_id) { - Some(s) => s, - None => { - error!("recover from remote peer: failed read source region info"; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); - } - }; - let region_local_state: RegionLocalState = - match general_get_region_local_state(&source_engines.kv, region_id) { + // TODO We must ask the remote peer to persist before get a snapshot. 
+ // { + // if let Some(s) = source_server.kvstore.get_mut(®ion_id) { + // write_to_db_data_by_engine(0, &source_engines.kv, s, "fast add + // peer".to_string()); } else { + // error!("recover from remote peer: failed persist source region"; + // "region_id" => region_id); return ffi_interfaces::FastAddPeerRes + // { status: ffi_interfaces::FastAddPeerStatus::BadData, + // apply_state: create_cpp_str(None), + // region: create_cpp_str(None), + // }; + // } + // } + let source_region = match source_server.kvstore.get(®ion_id) { + Some(s) => s, + None => { + error!("recover from remote peer: failed read source region info"; "region_id" => region_id); + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); + } + }; + let region_local_state: RegionLocalState = match general_get_region_local_state( + &source_engines.kv, + region_id, + ) { Some(x) => x, None => { + debug!("recover from remote peer: preparing from {} to {}, not region state {}", from_store, store_id, new_peer_id; "region_id" => region_id); // We don't return BadData here, since the data may not be persisted. 
+ if block_wait { + continue; + } return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData); } }; - let new_region_meta = region_local_state.get_region(); - - debug!("recover from remote peer: preparing from {} to {}, check if conf change {}", from_store, store_id, new_peer_id; "region_id" => region_id); - if !engine_store_ffi::observer::validate_remote_peer_region( - new_region_meta, - store_id, - new_peer_id, - ) { - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData); - } - - debug!("recover from remote peer: preparing from {} to {}, check target", from_store, store_id; "region_id" => region_id); - let new_region = make_new_region( - Some(new_region_meta.clone()), - Some((*store.engine_store_server).id), - ); - (*store.engine_store_server) - .kvstore - .insert(region_id, Box::new(new_region)); - let target_engines = match (*store.engine_store_server).engines.clone() { - Some(s) => s, - None => { - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::OtherError); - } - }; - let target_region = match (*store.engine_store_server).kvstore.get_mut(®ion_id) { - Some(s) => s, - None => { - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); - } - }; - debug!("recover from remote peer: meta from {} to {}", from_store, store_id; "region_id" => region_id); - // Must first dump meta then data, otherwise data may lag behind. - // We can see a raft log hole at applied_index otherwise. 
- let apply_state: RaftApplyState = match general_get_apply_state(&source_engines.kv, region_id) { - Some(x) => x, - None => { - error!("recover from remote peer: failed read apply state"; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); + let new_region_meta = region_local_state.get_region(); + + if !engine_store_ffi::observer::validate_remote_peer_region( + new_region_meta, + store_id, + new_peer_id, + ) { + debug!("recover from remote peer: preparing from {} to {}, not applied conf change {}", from_store, store_id, new_peer_id; "region_id" => region_id); + if block_wait { + continue; + } + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData); } - }; - debug!("recover from remote peer: data from {} to {}", from_store, store_id; "region_id" => region_id); - if let Err(e) = copy_data_from( - &source_engines, - &target_engines, - &source_region, - target_region, - ) { - error!("recover from remote peer: inject error {:?}", e; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::FailedInject); - } + debug!("recover from remote peer: preparing from {} to {}, check target", from_store, store_id; "region_id" => region_id); + let new_region = make_new_region( + Some(new_region_meta.clone()), + Some((*store.engine_store_server).id), + ); + (*store.engine_store_server) + .kvstore + .insert(region_id, Box::new(new_region)); + let target_engines = match (*store.engine_store_server).engines.clone() { + Some(s) => s, + None => { + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::OtherError); + } + }; + let target_region = match (*store.engine_store_server).kvstore.get_mut(®ion_id) { + Some(s) => s, + None => { + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); + } + }; + debug!("recover from remote peer: meta from {} to {}", from_store, store_id; "region_id" => region_id); + // Must first dump meta then data, otherwise data may lag 
behind. + // We can see a raft log hole at applied_index otherwise. + let apply_state: RaftApplyState = match general_get_apply_state( + &source_engines.kv, + region_id, + ) { + Some(x) => x, + None => { + error!("recover from remote peer: failed read apply state"; "region_id" => region_id); + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); + } + }; + + debug!("recover from remote peer: data from {} to {}", from_store, store_id; "region_id" => region_id); + if let Err(e) = copy_data_from( + &source_engines, + &target_engines, + &source_region, + target_region, + ) { + error!("recover from remote peer: inject error {:?}", e; "region_id" => region_id); + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::FailedInject); + } - let apply_state_bytes = apply_state.write_to_bytes().unwrap(); - let region_bytes = region_local_state.get_region().write_to_bytes().unwrap(); - let apply_state_ptr = create_cpp_str(Some(apply_state_bytes)); - let region_ptr = create_cpp_str(Some(region_bytes)); - debug!("recover from remote peer: ok from {} to {}", from_store, store_id; "region_id" => region_id); - ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::Ok, - apply_state: apply_state_ptr, - region: region_ptr, + let apply_state_bytes = apply_state.write_to_bytes().unwrap(); + let region_bytes = region_local_state.get_region().write_to_bytes().unwrap(); + let apply_state_ptr = create_cpp_str(Some(apply_state_bytes)); + let region_ptr = create_cpp_str(Some(region_bytes)); + debug!("recover from remote peer: ok from {} to {}", from_store, store_id; "region_id" => region_id); + return ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::Ok, + apply_state: apply_state_ptr, + region: region_ptr, + }; } + error!("recover from remote peer: failed after retry"; "region_id" => region_id); + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); } use engine_store_ffi::RawVoidPtr; @@ -1453,10 
+1479,7 @@ pub fn get_region_local_state( engine: &engine_rocks::RocksEngine, region_id: u64, ) -> Option { - let region_state_key = keys::region_state_key(region_id); - engine - .get_msg_cf::(CF_RAFT, ®ion_state_key) - .unwrap_or(None) + general_get_region_local_state(engine, region_id) } // TODO Need refactor if moved to raft-engine @@ -1464,10 +1487,7 @@ pub fn get_apply_state( engine: &engine_rocks::RocksEngine, region_id: u64, ) -> Option { - let apply_state_key = keys::apply_state_key(region_id); - engine - .get_msg_cf::(CF_RAFT, &apply_state_key) - .unwrap_or(None) + general_get_apply_state(engine, region_id) } pub fn get_raft_local_state( @@ -1529,6 +1549,7 @@ pub fn copy_meta_from { + fail::cfg("ffi_fast_add_peer_from_id", "return(2)").unwrap(); + } + SourceType::InvalidSource => { + fail::cfg("ffi_fast_add_peer_from_id", "return(100)").unwrap(); + } + _ => (), + }; + + pd_client.must_add_peer(1, new_learner_peer(3, 3)); + // std::thread::sleep(std::time::Duration::from_millis(2000)); + // match source_type { + // SourceType::Learner => { + // // Wait until Learner has applied ConfChange + // must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: + // &States| -> bool { + // find_peer_by_id(states.in_disk_region_state.get_region(), 3).is_some() + // }); + // } + // _ => {}, + // } + cluster.must_put(b"k2", b"v2"); + + match source_type { + SourceType::DelayedLearner => { + // Make sure conf change is applied. 
+ check_key( + &cluster, + b"k2", + b"v2", + Some(true), + None, + Some(vec![1, 2, 3]), + ); + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(1, 2) + .msg_type(MessageType::MsgAppend) + .direction(Direction::Recv), + )); + cluster.must_put(b"k3", b"v3"); + } + _ => (), + }; + + match source_type { + SourceType::DelayedLearner => { + check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![1, 3])); + check_key(&cluster, b"k3", b"v3", Some(false), None, Some(vec![2])); + } + SourceType::Learner => { + check_key( + &cluster, + b"k2", + b"v2", + Some(true), + None, + Some(vec![1, 2, 3]), + ); + } + _ => { + check_key( + &cluster, + b"k2", + b"v2", + Some(true), + None, + Some(vec![1, 2, 3]), + ); + } + }; + + match source_type { + SourceType::DelayedLearner => { + cluster.clear_send_filters(); + } + _ => (), + }; + + fail::remove("ffi_fast_add_peer_from_id"); + fail::remove("on_pre_persist_with_finish"); + fail::remove("ffi_fast_add_peer_block_wait"); + cluster.shutdown(); +} + +#[test] +fn test_fast_add_peer_from_leader() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::Leader, false); + fail::remove("on_pre_persist_with_finish"); +} + +/// Fast path by learner snapshot. +#[test] +fn test_fast_add_peer_from_learner() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::Learner, false); + fail::remove("on_pre_persist_with_finish"); +} + +/// If a learner is delayed, but already applied ConfChange. +#[test] +fn test_fast_add_peer_from_delayed_learner() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::DelayedLearner, false); + fail::remove("on_pre_persist_with_finish"); +} + +/// If we select a wrong source, or we can't run fast path, we can fallback to +/// normal. 
+#[test] +fn test_fast_add_peer_from_invalid_source() { + simple_fast_add_peer(SourceType::InvalidSource, false); +} + +#[test] +fn test_fast_add_peer_from_learner_blocked() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::Learner, true); + fail::remove("on_pre_persist_with_finish"); +} + +#[test] +fn test_fast_add_peer_from_delayed_learner_blocked() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::DelayedLearner, true); + fail::remove("on_pre_persist_with_finish"); +} diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index 6ec39a637d7..a7674ce5724 100644 --- a/proxy_tests/proxy/proxy.rs +++ b/proxy_tests/proxy/proxy.rs @@ -1,6 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::ops::RangeBounds; pub use std::{ collections::HashMap, io::Write, diff --git a/proxy_tests/proxy/region.rs b/proxy_tests/proxy/region.rs index 03894e36007..82d9416221f 100644 --- a/proxy_tests/proxy/region.rs +++ b/proxy_tests/proxy/region.rs @@ -2,7 +2,6 @@ use std::iter::FromIterator; use collections::HashSet; -use raft::eraftpb::Entry; use crate::proxy::*; diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index c0f2c0d282e..67581b0c4fc 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -1010,7 +1010,6 @@ where let store_id = msg.get_to_peer().store_id; let grpc_raft_conn_num = self.builder.cfg.value().grpc_raft_conn_num as u64; - tikv_util::info!("!!!!! 
Client send {:?}", store_id); let conn_id = if grpc_raft_conn_num == 1 { 0 } else { From 22afec32f53ab16482e46d07de60ecd59ac9a83a Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Mon, 12 Dec 2022 11:52:49 +0800 Subject: [PATCH 010/115] fix error router Signed-off-by: CalvinNeo --- new-mock-engine-store/src/node.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/new-mock-engine-store/src/node.rs b/new-mock-engine-store/src/node.rs index 4e6c51d7ffb..7f88e47a613 100644 --- a/new-mock-engine-store/src/node.rs +++ b/new-mock-engine-store/src/node.rs @@ -138,11 +138,14 @@ impl Transport for ChannelTransport { h.send_raft_msg(msg)?; if is_snapshot { // should report snapshot finish. - let _ = core.routers[&from_store].report_snapshot_status( - region_id, - to_peer_id, - SnapshotStatus::Finish, - ); + match core.routers.get(&from_store) { + Some(router) => router.report_snapshot_status( + region_id, + to_peer_id, + SnapshotStatus::Finish, + ), + None => return Err(box_err!("Find no from_store {}", from_store)), + }; } Ok(()) } From 0a5c25b6648cb79d5f4bd7143c6c0e61eda3c5a7 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Mon, 12 Dec 2022 12:30:53 +0800 Subject: [PATCH 011/115] fix merge Signed-off-by: CalvinNeo --- engine_store_ffi/src/interfaces.rs | 9 +- engine_store_ffi/src/lib.rs | 5 - new-mock-engine-store/src/lib.rs | 347 ------------------ new-mock-engine-store/src/mock_cluster.rs | 2 +- new-mock-engine-store/src/mock_store.rs | 142 +------ proxy_tests/proxy/proxy.rs | 1 + .../ffi/src/RaftStoreProxyFFI/@version | 2 +- src/server/raft_client.rs | 1 - 8 files changed, 10 insertions(+), 499 deletions(-) diff --git a/engine_store_ffi/src/interfaces.rs b/engine_store_ffi/src/interfaces.rs index 78c2a31c280..74ad857d70b 100644 --- a/engine_store_ffi/src/interfaces.rs +++ b/engine_store_ffi/src/interfaces.rs @@ -468,13 +468,6 @@ pub mod root { leader_safe_ts: u64, ), >, - pub fn_debug_func: ::std::option::Option< - unsafe extern "C" fn( - arg1: 
*mut root::DB::EngineStoreServerWrap, - type_: u64, - arg2: root::DB::RawVoidPtr, - ) -> root::DB::RawVoidPtr, - >, pub fn_fast_add_peer: ::std::option::Option< unsafe extern "C" fn( arg1: *mut root::DB::EngineStoreServerWrap, @@ -483,7 +476,7 @@ pub mod root { ) -> root::DB::FastAddPeerRes, >, } - pub const RAFT_STORE_PROXY_VERSION: u64 = 7429771182224851884; + pub const RAFT_STORE_PROXY_VERSION: u64 = 5489124786978559153; pub const RAFT_STORE_PROXY_MAGIC_NUMBER: u32 = 324508639; } } diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index ea3434dcd6a..c88093ef571 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -1163,11 +1163,6 @@ impl EngineStoreServerHelper { } } - pub fn debug_func(&self, debug_type: u64, ptr: RawVoidPtr) -> RawVoidPtr { - debug_assert!(self.fn_debug_func.is_some()); - unsafe { (self.fn_debug_func.into_inner())(self.inner, debug_type, ptr) } - } - pub fn fast_add_peer(&self, region_id: u64, new_peer_id: u64) -> FastAddPeerRes { debug_assert!(self.fn_fast_add_peer.is_some()); unsafe { (self.fn_fast_add_peer.into_inner())(self.inner, region_id, new_peer_id) } diff --git a/new-mock-engine-store/src/lib.rs b/new-mock-engine-store/src/lib.rs index 761f8d4d11e..58db2bb0f2c 100644 --- a/new-mock-engine-store/src/lib.rs +++ b/new-mock-engine-store/src/lib.rs @@ -138,350 +138,3 @@ pub fn get_raft_local_state( _ => None, } } - -unsafe extern "C" fn ffi_debug_func( - arg1: *mut ffi_interfaces::EngineStoreServerWrap, - _debug_type: u64, - _ptr: ffi_interfaces::RawVoidPtr, -) -> ffi_interfaces::RawVoidPtr { - std::ptr::null_mut() -} - -unsafe fn create_cpp_str(s: Option>) -> ffi_interfaces::CppStrWithView { - match s { - Some(s) => { - let len = s.len() as u64; - let ptr = Box::into_raw(Box::new(s.clone())); // leak - let s = ffi_interfaces::CppStrWithView { - inner: ffi_interfaces::RawCppPtr { - ptr: ptr as RawVoidPtr, - type_: RawCppPtrTypeImpl::String.into(), - }, - view: ffi_interfaces::BaseBuffView { - 
data: (*ptr).as_ptr() as *const _, - len, - }, - }; - s - } - None => ffi_interfaces::CppStrWithView { - inner: ffi_interfaces::RawCppPtr { - ptr: std::ptr::null_mut(), - type_: RawCppPtrTypeImpl::None.into(), - }, - view: ffi_interfaces::BaseBuffView { - data: std::ptr::null(), - len: 0, - }, - }, - } -} - -macro_rules! unwrap_or_return { - ($e:expr, $res:expr) => { - match $e { - Some(x) => x, - None => return $res, - } - }; -} - -unsafe extern "C" fn ffi_fast_add_peer( - arg1: *mut ffi_interfaces::EngineStoreServerWrap, - region_id: u64, - new_peer_id: u64, -) -> ffi_interfaces::FastAddPeerRes { - let store = into_engine_store_server_wrap(arg1); - let cluster = &*(store.cluster_ptr as *const mock_cluster::Cluster); - let store_id = (*store.engine_store_server).id; - - let failed_add_peer_res = - |status: ffi_interfaces::FastAddPeerStatus| ffi_interfaces::FastAddPeerRes { - status, - apply_state: create_cpp_str(None), - region: create_cpp_str(None), - }; - let from_store = (|| { - fail::fail_point!("ffi_fast_add_peer_from_id", |t| { - let t = t.unwrap().parse::().unwrap(); - t - }); - 1 - })(); - let block_wait: bool = (|| { - fail::fail_point!("ffi_fast_add_peer_block_wait", |t| { - let t = t.unwrap().parse::().unwrap(); - t - }); - 0 - })() != 0; - debug!("recover from remote peer: enter from {} to {}", from_store, store_id; "region_id" => region_id); - - for retry in 0..300 { - if retry > 0 { - std::thread::sleep(std::time::Duration::from_millis(30)); - } - let lock = cluster.ffi_helper_set.lock(); - let mut guard = match lock { - Ok(e) => e, - Err(_) => { - error!("ffi_debug_func failed to lock"); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::OtherError); - } - }; - debug!("recover from remote peer: preparing from {} to {}, persist and check source", from_store, store_id; "region_id" => region_id); - let source_server = match guard.get_mut(&from_store) { - Some(s) => &mut s.engine_store_server, - None => { - return 
failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::NoSuitable); - } - }; - let source_engines = match source_server.engines.clone() { - Some(s) => s, - None => { - error!("recover from remote peer: failed get source engine"; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); - } - }; - - // TODO We must ask the remote peer to persist before get a snapshot. - // { - // if let Some(s) = source_server.kvstore.get_mut(®ion_id) { - // write_to_db_data_by_engine(0, &source_engines.kv, s, "fast add - // peer".to_string()); } else { - // error!("recover from remote peer: failed persist source region"; - // "region_id" => region_id); return ffi_interfaces::FastAddPeerRes - // { status: ffi_interfaces::FastAddPeerStatus::BadData, - // apply_state: create_cpp_str(None), - // region: create_cpp_str(None), - // }; - // } - // } - let source_region = match source_server.kvstore.get(®ion_id) { - Some(s) => s, - None => { - error!("recover from remote peer: failed read source region info"; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); - } - }; - let region_local_state: RegionLocalState = match general_get_region_local_state( - &source_engines.kv, - region_id, - ) { - Some(x) => x, - None => { - debug!("recover from remote peer: preparing from {} to {}, not region state {}", from_store, store_id, new_peer_id; "region_id" => region_id); - // We don't return BadData here, since the data may not be persisted. 
- if block_wait { - continue; - } - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData); - } - }; - let new_region_meta = region_local_state.get_region(); - - if !engine_store_ffi::observer::validate_remote_peer_region( - new_region_meta, - store_id, - new_peer_id, - ) { - debug!("recover from remote peer: preparing from {} to {}, not applied conf change {}", from_store, store_id, new_peer_id; "region_id" => region_id); - if block_wait { - continue; - } - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData); - } - - debug!("recover from remote peer: preparing from {} to {}, check target", from_store, store_id; "region_id" => region_id); - let new_region = make_new_region( - Some(new_region_meta.clone()), - Some((*store.engine_store_server).id), - ); - (*store.engine_store_server) - .kvstore - .insert(region_id, Box::new(new_region)); - let target_engines = match (*store.engine_store_server).engines.clone() { - Some(s) => s, - None => { - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::OtherError); - } - }; - let target_region = match (*store.engine_store_server).kvstore.get_mut(®ion_id) { - Some(s) => s, - None => { - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); - } - }; - debug!("recover from remote peer: meta from {} to {}", from_store, store_id; "region_id" => region_id); - // Must first dump meta then data, otherwise data may lag behind. - // We can see a raft log hole at applied_index otherwise. 
- let apply_state: RaftApplyState = match general_get_apply_state( - &source_engines.kv, - region_id, - ) { - Some(x) => x, - None => { - error!("recover from remote peer: failed read apply state"; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); - } - }; - - debug!("recover from remote peer: data from {} to {}", from_store, store_id; "region_id" => region_id); - if let Err(e) = copy_data_from( - &source_engines, - &target_engines, - &source_region, - target_region, - ) { - error!("recover from remote peer: inject error {:?}", e; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::FailedInject); - } - - let apply_state_bytes = apply_state.write_to_bytes().unwrap(); - let region_bytes = region_local_state.get_region().write_to_bytes().unwrap(); - let apply_state_ptr = create_cpp_str(Some(apply_state_bytes)); - let region_ptr = create_cpp_str(Some(region_bytes)); - debug!("recover from remote peer: ok from {} to {}", from_store, store_id; "region_id" => region_id); - return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::Ok, - apply_state: apply_state_ptr, - region: region_ptr, - }; - } - error!("recover from remote peer: failed after retry"; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); -} - -use engine_store_ffi::RawVoidPtr; -use engine_traits::{KvEngine, Mutable, RaftEngine, RaftLogBatch, WriteBatch}; -use kvproto::raft_serverpb::RaftLocalState; -use tikv_util::{box_err, box_try}; - -// TODO Need refactor if moved to raft-engine -pub fn general_get_region_local_state( - engine: &EK, - region_id: u64, -) -> Option { - let region_state_key = keys::region_state_key(region_id); - engine - .get_msg_cf::(CF_RAFT, ®ion_state_key) - .unwrap_or(None) -} - -// TODO Need refactor if moved to raft-engine -pub fn general_get_apply_state( - engine: &EK, - region_id: u64, -) -> Option { - let 
apply_state_key = keys::apply_state_key(region_id); - engine - .get_msg_cf::(CF_RAFT, &apply_state_key) - .unwrap_or(None) -} - -pub fn get_region_local_state( - engine: &engine_rocks::RocksEngine, - region_id: u64, -) -> Option { - general_get_region_local_state(engine, region_id) -} - -// TODO Need refactor if moved to raft-engine -pub fn get_apply_state( - engine: &engine_rocks::RocksEngine, - region_id: u64, -) -> Option { - general_get_apply_state(engine, region_id) -} - -pub fn get_raft_local_state( - raft_engine: &ER, - region_id: u64, -) -> Option { - match raft_engine.get_raft_state(region_id) { - Ok(Some(x)) => Some(x), - _ => None, - } -} - -pub fn copy_meta_from( - source_engines: &Engines, - target_engines: &Engines, - source: &Box, - target: &mut Box, - new_region_meta: kvproto::metapb::Region, - copy_region_state: bool, - copy_apply_state: bool, - copy_raft_state: bool, -) -> raftstore::Result<()> { - let region_id = source.region.get_id(); - - let mut wb = target_engines.kv.write_batch(); - - // Can't copy this key, otherwise will cause a bootstrap. 
- // box_try!(wb.put_msg(keys::PREPARE_BOOTSTRAP_KEY, &source.region)); - - // region local state - if copy_region_state { - let mut state = RegionLocalState::default(); - state.set_region(new_region_meta); - box_try!(wb.put_msg_cf(CF_RAFT, &keys::region_state_key(region_id), &state)); - } - - // apply state - if copy_apply_state { - let apply_state: RaftApplyState = - match general_get_apply_state(&source_engines.kv, region_id) { - Some(x) => x, - None => return Err(box_err!("bad RaftApplyState")), - }; - wb.put_msg_cf(CF_RAFT, &keys::apply_state_key(region_id), &apply_state)?; - target.apply_state = apply_state.clone(); - target.applied_term = source.applied_term; - } - - wb.write()?; - target_engines.sync_kv()?; - - let mut raft_wb = target_engines.raft.log_batch(1024); - // raft state - if copy_raft_state { - let raft_state = match get_raft_local_state(&source_engines.raft, region_id) { - Some(x) => x, - None => return Err(box_err!("bad RaftLocalState")), - }; - raft_wb.put_raft_state(region_id, &raft_state)?; - }; - - box_try!(target_engines.raft.consume(&mut raft_wb, true)); - Ok(()) -} - -pub fn copy_data_from( - source_engines: &Engines, - target_engines: &Engines, - source: &Box, - target: &mut Box, -) -> raftstore::Result<()> { - let region_id = source.region.get_id(); - - // kv data in memory - for cf in 0..3 { - for (k, v) in &source.data[cf] { - write_kv_in_mem(target, cf, k.as_slice(), v.as_slice()); - } - } - - // raft log - let mut raft_wb = target_engines.raft.log_batch(1024); - let mut entries: Vec = Default::default(); - source_engines - .raft - .get_all_entries_to(region_id, &mut entries) - .unwrap(); - debug!("copy raft log {:?}", entries); - - raft_wb.append(region_id, entries)?; - box_try!(target_engines.raft.consume(&mut raft_wb, true)); - Ok(()) -} diff --git a/new-mock-engine-store/src/mock_cluster.rs b/new-mock-engine-store/src/mock_cluster.rs index fe03b231008..d1a6e175287 100644 --- a/new-mock-engine-store/src/mock_cluster.rs +++ 
b/new-mock-engine-store/src/mock_cluster.rs @@ -93,7 +93,7 @@ pub struct TestData { pub struct Cluster> { // Helper to set ffi_helper_set. pub ffi_helper_lst: Vec, - ffi_helper_set: Arc>>, + pub ffi_helper_set: Arc>>, pub cfg: Config, leaders: HashMap, diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index fc398c92a96..de6b5d5d805 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -13,7 +13,7 @@ pub use std::{ pub use engine_store_ffi::{ interfaces::root::DB as ffi_interfaces, EngineStoreServerHelper, RaftStoreProxyFFIHelper, - RawCppPtr, UnwrapExternCFunc, + RawCppPtr, RawVoidPtr, UnwrapExternCFunc, }; pub use engine_traits::{ Engines, Iterable, KvEngine, Mutable, Peekable, RaftEngine, RaftLogBatch, SyncMutable, @@ -26,9 +26,11 @@ pub use kvproto::{ pub use protobuf::Message; pub use tikv_util::{box_err, box_try, debug, error, info, warn}; +use crate::node::NodeCluster; pub use crate::{ config::MockConfig, - mock_cluster, + copy_data_from, copy_meta_from, general_get_apply_state, general_get_region_local_state, + get_apply_state, get_raft_local_state, get_region_local_state, mock_cluster, mock_cluster::{ must_get_equal, must_get_none, Cluster, ProxyConfig, Simulator, TestPdClient, TiFlashEngine, }, @@ -695,6 +697,7 @@ pub fn gen_engine_store_server_helper( fn_set_store: None, fn_set_pb_msg_by_bytes: Some(ffi_set_pb_msg_by_bytes), fn_handle_safe_ts_update: Some(ffi_handle_safe_ts_update), + fn_fast_add_peer: Some(ffi_fast_add_peer), } } @@ -1293,6 +1296,7 @@ unsafe extern "C" fn ffi_fast_add_peer( if retry > 0 { std::thread::sleep(std::time::Duration::from_millis(30)); } + let lock = cluster.ffi_helper_set.lock(); let mut guard = match lock { Ok(e) => e, @@ -1423,137 +1427,3 @@ unsafe extern "C" fn ffi_fast_add_peer( error!("recover from remote peer: failed after retry"; "region_id" => region_id); return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); } 
- -use engine_store_ffi::RawVoidPtr; -use engine_traits::{KvEngine, Mutable, RaftEngine, RaftLogBatch, WriteBatch}; -use kvproto::raft_serverpb::RaftLocalState; -use tikv_util::{box_err, box_try}; - -// TODO Need refactor if moved to raft-engine -pub fn general_get_region_local_state( - engine: &EK, - region_id: u64, -) -> Option { - let region_state_key = keys::region_state_key(region_id); - engine - .get_msg_cf::(CF_RAFT, ®ion_state_key) - .unwrap_or(None) -} - -// TODO Need refactor if moved to raft-engine -pub fn general_get_apply_state( - engine: &EK, - region_id: u64, -) -> Option { - let apply_state_key = keys::apply_state_key(region_id); - engine - .get_msg_cf::(CF_RAFT, &apply_state_key) - .unwrap_or(None) -} - -pub fn get_region_local_state( - engine: &engine_rocks::RocksEngine, - region_id: u64, -) -> Option { - general_get_region_local_state(engine, region_id) -} - -// TODO Need refactor if moved to raft-engine -pub fn get_apply_state( - engine: &engine_rocks::RocksEngine, - region_id: u64, -) -> Option { - general_get_apply_state(engine, region_id) -} - -pub fn get_raft_local_state( - raft_engine: &ER, - region_id: u64, -) -> Option { - match raft_engine.get_raft_state(region_id) { - Ok(Some(x)) => Some(x), - _ => None, - } -} - -pub fn copy_meta_from( - source_engines: &Engines, - target_engines: &Engines, - source: &Box, - target: &mut Box, - new_region_meta: kvproto::metapb::Region, - copy_region_state: bool, - copy_apply_state: bool, - copy_raft_state: bool, -) -> raftstore::Result<()> { - let region_id = source.region.get_id(); - - let mut wb = target_engines.kv.write_batch(); - - // Can't copy this key, otherwise will cause a bootstrap. 
- // box_try!(wb.put_msg(keys::PREPARE_BOOTSTRAP_KEY, &source.region)); - - // region local state - if copy_region_state { - let mut state = RegionLocalState::default(); - state.set_region(new_region_meta); - box_try!(wb.put_msg_cf(CF_RAFT, &keys::region_state_key(region_id), &state)); - } - - // apply state - if copy_apply_state { - let apply_state: RaftApplyState = - match general_get_apply_state(&source_engines.kv, region_id) { - Some(x) => x, - None => return Err(box_err!("bad RaftApplyState")), - }; - wb.put_msg_cf(CF_RAFT, &keys::apply_state_key(region_id), &apply_state)?; - target.apply_state = apply_state.clone(); - target.applied_term = source.applied_term; - } - - wb.write()?; - target_engines.sync_kv()?; - - let mut raft_wb = target_engines.raft.log_batch(1024); - // raft state - if copy_raft_state { - let raft_state = match get_raft_local_state(&source_engines.raft, region_id) { - Some(x) => x, - None => return Err(box_err!("bad RaftLocalState")), - }; - raft_wb.put_raft_state(region_id, &raft_state)?; - }; - - box_try!(target_engines.raft.consume(&mut raft_wb, true)); - Ok(()) -} - -pub fn copy_data_from( - source_engines: &Engines, - target_engines: &Engines, - source: &Box, - target: &mut Box, -) -> raftstore::Result<()> { - let region_id = source.region.get_id(); - - // kv data in memory - for cf in 0..3 { - for (k, v) in &source.data[cf] { - write_kv_in_mem(target, cf, k.as_slice(), v.as_slice()); - } - } - - // raft log - let mut raft_wb = target_engines.raft.log_batch(1024); - let mut entries: Vec = Default::default(); - source_engines - .raft - .get_all_entries_to(region_id, &mut entries) - .unwrap(); - debug!("copy raft log {:?}", entries); - - raft_wb.append(region_id, entries)?; - box_try!(target_engines.raft.consume(&mut raft_wb, true)); - Ok(()) -} \ No newline at end of file diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index 0e88a8ff58d..724b9418807 100644 --- a/proxy_tests/proxy/proxy.rs +++ 
b/proxy_tests/proxy/proxy.rs @@ -168,6 +168,7 @@ pub fn must_get_mem( if value.is_none() && last_res.is_none() { ok = true; return; + } }, ); } diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version index a932e40b568..753fcf39795 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version @@ -1,3 +1,3 @@ #pragma once #include -namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 7429771182224851884ull; } \ No newline at end of file +namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 5489124786978559153ull; } \ No newline at end of file diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index 67581b0c4fc..fa12600bb98 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -1009,7 +1009,6 @@ where pub fn send(&mut self, msg: RaftMessage) -> result::Result<(), DiscardReason> { let store_id = msg.get_to_peer().store_id; let grpc_raft_conn_num = self.builder.cfg.value().grpc_raft_conn_num as u64; - let conn_id = if grpc_raft_conn_num == 1 { 0 } else { From 1c117474f819ca28ae2b1339ed0bb9685f99efe8 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Mon, 12 Dec 2022 13:07:19 +0800 Subject: [PATCH 012/115] fix logs Signed-off-by: CalvinNeo --- Cargo.toml | 2 -- components/raftstore/src/store/fsm/peer.rs | 1 - components/raftstore/src/store/fsm/store.rs | 2 -- .../raftstore/src/store/peer_storage.rs | 11 ------- components/raftstore/src/store/snap.rs | 30 +++++-------------- components/test_pd_client/src/pd.rs | 2 +- engine_store_ffi/src/observer.rs | 30 ++++--------------- 7 files changed, 13 insertions(+), 65 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 450a18a34cc..55d6b086d42 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -194,9 +194,7 @@ prometheus = { git = "https://github.com/solotzg/rust-prometheus.git", rev = "b4 # TODO: remove this when new raft-rs is published. 
raft = { git = "https://github.com/tikv/raft-rs", branch = "master" } -#raft = { path = "/Users/calvin/tiflash/raft-rs" } raft-proto = { git = "https://github.com/tikv/raft-rs", branch = "master" } -# raft-proto = { path = "/Users/calvin/tiflash/raft-rs/proto" } protobuf = { git = "https://github.com/pingcap/rust-protobuf", branch = "v2.8" } protobuf-codegen = { git = "https://github.com/pingcap/rust-protobuf", branch = "v2.8" } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index ffe360abe6f..75979a4afd5 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -611,7 +611,6 @@ where match m { PeerMsg::RaftMessage(msg) => { if self.ctx.coprocessor_host.should_skip_raft_message(&msg.msg) { - debug!("!!!! peer skip message"); continue; } if let Err(e) = self.on_raft_message(msg) { diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 68aa7ddb241..fafc839dce2 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -1812,7 +1812,6 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER if local_state.get_state() != PeerState::Tombstone { // Maybe split, but not registered yet. if !util::is_first_message(msg.get_message()) { - debug!("!!!!! find RegionNotRegistered {:?}", msg); self.ctx .raft_metrics .message_dropped @@ -1984,7 +1983,6 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER error!( "missing epoch in raft message, ignore it"; "region_id" => region_id, - "!!!! 
msg" => ?msg, ); self.ctx .raft_metrics diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 62ee8549bbd..0d10b1f36cf 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -392,7 +392,6 @@ where #[inline] pub fn save_apply_state_to(&self, kv_wb: &mut impl Mutable) -> Result<()> { - debug!("!!!! save_apply_state_to {:?}", self.apply_state()); kv_wb.put_msg_cf( CF_RAFT, &keys::apply_state_key(self.region.get_id()), @@ -635,11 +634,6 @@ where let snap_index = snap.get_metadata().get_index(); let snap_term = snap.get_metadata().get_term(); - debug!("!!!! apply snapshot {}", self.peer_id; - "snap_index" => snap_index, - "snap_term" => snap_term, - ); - self.raft_state_mut().set_last_index(snap_index); self.set_last_term(snap_term); self.apply_state_mut().set_applied_index(snap_index); @@ -859,10 +853,6 @@ where } pub fn schedule_applying_snapshot(&mut self) { - debug!( - "!!!!!! schedule_applying_snapshot {:?}", - std::backtrace::Backtrace::capture() - ); let status = Arc::new(AtomicUsize::new(JOB_STATUS_PENDING)); self.set_snap_state(SnapState::Applying(Arc::clone(&status))); let task = RegionTask::Apply { @@ -925,7 +915,6 @@ where // and has not applied snapshot yet, so skip persistent hard state. if self.raft_state().get_last_index() > 0 { if let Some(hs) = ready.hs() { - debug!("!!!! 
apply_snapshot set hard state"); self.raft_state_mut().set_hard_state(hs.clone()); } } diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 038d1d2a880..d564bcb17e0 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -419,8 +419,8 @@ impl CfFile { } } -#[derive(Default, Debug)] -pub struct MetaFile { +#[derive(Default)] +struct MetaFile { pub meta: Option, pub path: PathBuf, pub file: Option, @@ -436,7 +436,7 @@ pub struct Snapshot { cf_files: Vec, cf_index: usize, cf_file_index: usize, - pub meta_file: MetaFile, + meta_file: MetaFile, hold_tmp_files: bool, mgr: SnapManagerCore, @@ -963,7 +963,6 @@ impl Snapshot { debug!( "deleting snapshot file"; "snapshot" => %self.path(), - "!!!!! bt" => ?std::backtrace::Backtrace::capture(), ); for cf_file in &self.cf_files { // Delete cloned files. @@ -1114,13 +1113,6 @@ impl Snapshot { pub fn exists(&self) -> bool { self.cf_files.iter().all(|cf_file| { - debug!( - "!!!!! exists cf_file.size {:?} cf_file.file_paths() {:?} meta {:?} {}", - cf_file.size, - cf_file.file_paths(), - self.meta_file.path, - file_exists(&self.meta_file.path) - ); cf_file.size.is_empty() || (cf_file .file_paths() @@ -1133,6 +1125,10 @@ impl Snapshot { file_system::metadata(&self.meta_file.path) } + pub fn meta_path(&self) -> &PathBuf { + &self.meta_file.path + } + pub fn total_size(&self) -> u64 { self.cf_files .iter() @@ -1545,12 +1541,6 @@ impl SnapManager { Ok(Box::new(f)) } - pub fn get_empty_snapshot_for_building(&self, key: &SnapKey) -> RaftStoreResult> { - let base = &self.core.base; - let f = Snapshot::new_for_building(base, key, &self.core)?; - Ok(Box::new(f)) - } - pub fn get_snapshot_for_gc( &self, key: &SnapKey, @@ -1668,7 +1658,6 @@ impl SnapManager { "register snapshot"; "key" => %key, "entry" => ?entry, - "!!!!! 
bt" => ?std::backtrace::Backtrace::capture(), ); match self.core.registry.wl().entry(key) { Entry::Occupied(mut e) => { @@ -1790,11 +1779,6 @@ impl SnapManagerCore { ); return false; } - debug!( - "!!!!! deletee snapshot {:?} {:?}", - key, - std::backtrace::Backtrace::capture() - ); snap.delete(); true } diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index 8ea8a52bfbd..513d08643a7 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -740,7 +740,7 @@ impl PdCluster { let operator = operator?; debug!( "[region {}] schedule {:?} to {:?}, region: {:?}", - region_id, operator, leader, region, + region_id, operator, leader, region ); let mut resp = operator.make_region_heartbeat_response(region.get_id(), self); diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 9a428c5acd3..e8ef608c343 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -1,6 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - cell::Cell, collections::hash_map::Entry as MapEntry, io::Write, ops::DerefMut, @@ -14,8 +13,7 @@ use std::{ use collections::HashMap; use engine_tiflash::FsStatsExt; -use engine_traits::{Peekable, RaftEngine, SstMetaInfo, CF_RAFT}; -use into_other::into_other; +use engine_traits::{Peekable, RaftEngine, SstMetaInfo}; use kvproto::{ metapb::Region, raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, CmdType, RaftCmdRequest}, @@ -212,7 +210,7 @@ impl TiFlashObserver { let slot_id = Self::slot_index(region_id); let mut guard = match self.cached_region_info.get(slot_id).unwrap().write() { Ok(g) => g, - Err(e) => return Err(box_err!("access_cached_region_info_mut poisoned")), + Err(_) => return Err(box_err!("access_cached_region_info_mut poisoned")), }; f(guard.entry(region_id)); Ok(()) @@ -236,7 +234,7 @@ impl TiFlashObserver { // TODO clean local, and prepare to request snapshot from TiKV as a trivial // procedure. fail::fail_point!("fallback_to_slow_path_not_allow", |_| {}); - if let Err(e) = self.set_inited_or_fallback(region_id, true) { + if let Err(_) = self.set_inited_or_fallback(region_id, true) { tikv_util::safe_panic!("set_inited_or_fallback"); } } @@ -434,12 +432,6 @@ impl TiFlashObserver { (snapshot, key.clone()) }; - debug!( - "!!!!! snap 1 {:?} {:?} {}", - snap, - snap.meta_file, - snap.cf_files().len() - ); // Build snapshot by do_snapshot let mut pb_snapshot: eraftpb::Snapshot = Default::default(); let pb_snapshot_metadata: &mut eraftpb::SnapshotMetadata = pb_snapshot.mut_metadata(); @@ -457,10 +449,6 @@ impl TiFlashObserver { let mut path = cf_file.path.clone(); path.push(cf_file.file_prefix.clone()); path.set_extension("sst"); - debug!( - "!!!! 
snap g cf_file.path {:?} {:?} {:?}", - cf_file.path, cf_file.file_prefix, path - ); let mut _file = std::fs::File::create(path.as_path())?; } snap_data.set_region(new_region.clone()); @@ -476,19 +464,12 @@ impl TiFlashObserver { // Write MetaFile { let v = snapshot_meta.write_to_bytes()?; - let mut f = std::fs::File::create(snap.meta_file.path.as_path())?; + let mut f = std::fs::File::create(snap.meta_path())?; f.write_all(&v[..])?; f.flush()?; f.sync_all()?; } snap_data.set_meta(snapshot_meta); - debug!( - "!!!!! snap 2 {:?} {:?} XX {:?} {}", - snap.meta_file.meta, - snap.meta_file.file, - snap.meta_file.path, - snap.cf_files().len() - ); } // TODO The rest is test, please remove it after we can fetch the real data. @@ -507,7 +488,6 @@ impl TiFlashObserver { // Send reponse let mut response = RaftMessage::default(); - use kvproto::metapb::RegionEpoch; let epoch = new_region.get_region_epoch(); response.set_region_epoch(epoch.clone()); response.set_region_id(region_id); @@ -519,7 +499,7 @@ impl TiFlashObserver { response.mut_message().set_term(inner_msg.get_term()); response.mut_message().set_snapshot(pb_snapshot); debug!( - "!!!!! send response key {} response {:?} data {:?}", + "!!!! 
send snapshot key {} raft message {:?} snap data {:?}", key, response, snap_data ); match self.trans.lock() { From 2fb1939d94b9d8e916b94af0fec72f9f391759e3 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Mon, 12 Dec 2022 14:28:25 +0800 Subject: [PATCH 013/115] f Signed-off-by: CalvinNeo --- engine_store_ffi/src/observer.rs | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index e8ef608c343..9a9b28070cb 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -13,11 +13,11 @@ use std::{ use collections::HashMap; use engine_tiflash::FsStatsExt; -use engine_traits::{Peekable, RaftEngine, SstMetaInfo}; +use engine_traits::{RaftEngine, SstMetaInfo}; use kvproto::{ metapb::Region, raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, CmdType, RaftCmdRequest}, - raft_serverpb::{RaftApplyState, RaftMessage, RegionLocalState}, + raft_serverpb::{RaftApplyState, RaftMessage}, }; use protobuf::Message; use raft::{eraftpb, eraftpb::MessageType, StateRole}; @@ -185,11 +185,7 @@ pub fn validate_remote_peer_region( ) -> bool { match find_peer(new_region, store_id) { Some(peer) => { - if peer.get_id() != new_peer_id { - false - } else { - true - } + peer.get_id() == new_peer_id } None => false, } @@ -234,7 +230,7 @@ impl TiFlashObserver { // TODO clean local, and prepare to request snapshot from TiKV as a trivial // procedure. 
fail::fail_point!("fallback_to_slow_path_not_allow", |_| {}); - if let Err(_) = self.set_inited_or_fallback(region_id, true) { + if self.set_inited_or_fallback(region_id, true).is_err() { tikv_util::safe_panic!("set_inited_or_fallback"); } } @@ -310,6 +306,7 @@ impl TiFlashObserver { "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), ); + fail::fail_point!("go_fast_path_not_allow", |e| { return false }); // Feed data let res = self .engine_store_server_helper @@ -376,9 +373,7 @@ impl TiFlashObserver { Err(e) => { error!("fast path: ongoing {}:{} failed. build and sent snapshot error {:?}", self.store_id, region_id, e; "is_first" => is_first,); - if let Err(_) = self.set_inited_or_fallback(region_id, true) { - tikv_util::safe_panic!("set_inited_or_fallback"); - } + self.fallback_to_slow_path(region_id); return false; } }; @@ -453,13 +448,13 @@ impl TiFlashObserver { } snap_data.set_region(new_region.clone()); snap_data.set_file_size(0); - let SNAPSHOT_VERSION = 2; + const SNAPSHOT_VERSION: u64 = 2; snap_data.set_version(SNAPSHOT_VERSION); // SnapshotMeta // Which is snap.meta_file.meta let snapshot_meta = - raftstore::store::snap::gen_snapshot_meta(&snap.cf_files()[..], true)?; + raftstore::store::snap::gen_snapshot_meta(snap.cf_files(), true)?; // Write MetaFile { @@ -515,6 +510,7 @@ impl TiFlashObserver { } impl TiFlashObserver { + #[allow(clippy::too_many_arguments)] pub fn new( store_id: u64, engine: engine_tiflash::RocksEngine, @@ -1254,7 +1250,7 @@ impl ApplySnapshotObserver for TiFlashOb ); let region_id = ob_ctx.region().get_id(); let mut should_skip = false; - match self.access_cached_region_info_mut( + if self.access_cached_region_info_mut( region_id, |info: MapEntry>| match info { MapEntry::Occupied(mut o) => { @@ -1266,13 +1262,12 @@ impl ApplySnapshotObserver for TiFlashOb should_skip = o.get().inited_or_fallback.load(Ordering::SeqCst); o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); } - 
MapEntry::Vacant(v) => { + MapEntry::Vacant(_) => { panic!("unknown snapshot!"); } }, - ) { - Err(_) => fatal!("post_apply_snapshot poisoned"), - _ => (), + ).is_err() { + fatal!("post_apply_snapshot poisoned") }; let snap = match snap { None => return, From fb0917bfa44ec1fc55967265349c5315d84ca439 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Mon, 12 Dec 2022 16:49:08 +0800 Subject: [PATCH 014/115] f Signed-off-by: CalvinNeo --- engine_store_ffi/src/observer.rs | 99 ++++++++++++++++++------- new-mock-engine-store/src/mock_store.rs | 5 ++ new-mock-engine-store/src/node.rs | 2 +- proxy_tests/proxy/fast_add_peer.rs | 28 +++++++ proxy_tests/proxy/flashback.rs | 2 - proxy_tests/proxy/region.rs | 2 +- 6 files changed, 108 insertions(+), 30 deletions(-) diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 9a9b28070cb..01acf7f4716 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -13,11 +13,11 @@ use std::{ use collections::HashMap; use engine_tiflash::FsStatsExt; -use engine_traits::{RaftEngine, SstMetaInfo}; +use engine_traits::{RaftEngine, SstMetaInfo, CF_RAFT}; use kvproto::{ metapb::Region, raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, CmdType, RaftCmdRequest}, - raft_serverpb::{RaftApplyState, RaftMessage}, + raft_serverpb::{RaftApplyState, RaftMessage, RegionLocalState}, }; use protobuf::Message; use raft::{eraftpb, eraftpb::MessageType, StateRole}; @@ -184,13 +184,21 @@ pub fn validate_remote_peer_region( new_peer_id: u64, ) -> bool { match find_peer(new_region, store_id) { - Some(peer) => { - peer.get_id() == new_peer_id - } + Some(peer) => peer.get_id() == new_peer_id, None => false, } } +pub fn get_region_local_state( + engine: &EK, + region_id: u64, +) -> Option { + let region_state_key = keys::region_state_key(region_id); + engine + .get_msg_cf::(CF_RAFT, ®ion_state_key) + .unwrap_or(None) +} + impl TiFlashObserver { #[inline] fn slot_index(id: u64) -> usize { @@ -235,6 +243,13 
@@ impl TiFlashObserver { } } + pub fn is_initialized(&self, region_id: u64) -> bool { + match get_region_local_state(&self.engine, region_id) { + None => false, + Some(r) => raftstore::store::util::is_region_initialized(r.get_region()), + } + } + // Returns whether we need to ignore this message and run fast path instead. pub fn maybe_fast_path(&self, msg: &RaftMessage) -> bool { if !self.engine_store_cfg.enable_fast_add_peer { @@ -251,10 +266,24 @@ impl TiFlashObserver { let new_peer_id = msg.get_to_peer().get_id(); let mut is_first = false; let mut is_replicated = false; + let mut has_already_inited = None; let f = |info: MapEntry>| { match info { - MapEntry::Occupied(o) => { - is_first = !o.get().inited_or_fallback.load(Ordering::SeqCst); + MapEntry::Occupied(mut o) => { + (is_first, has_already_inited) = + if !o.get().inited_or_fallback.load(Ordering::SeqCst) { + // If `has_already_inited` is true, usually means we recover from a + // restart. So we have data in disk, but not + // in memory. TODO maybe only check once, or + // we can remove apply snapshot. 
+ let has_already_inited = self.is_initialized(region_id); + if has_already_inited { + o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); + } + (!has_already_inited, Some(has_already_inited)) + } else { + (false, None) + }; // TODO include create is_replicated = o.get().replicated_or_created.load(Ordering::SeqCst); if is_first { @@ -264,6 +293,8 @@ impl TiFlashObserver { "from_peer_id" => msg.get_from_peer().get_id(), "inner_msg" => ?inner_msg, "is_replicated" => is_replicated, + "has_already_inited" => has_already_inited, + "is_first" => is_first, ); } } @@ -282,9 +313,13 @@ impl TiFlashObserver { self.access_cached_region_info_mut(region_id, f).unwrap(); if !is_first { + // TODO avoid too much log info!( "fast path: normal MsgAppend of {}:{}", - self.store_id, region_id + self.store_id, region_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, ); return false; } @@ -380,6 +415,25 @@ impl TiFlashObserver { is_first } + fn check_entry_at_index( + &self, + region_id: u64, + index: u64, + peer_id: u64, + ) -> RaftStoreResult { + match self.raft_engine.get_entry(region_id, index)? { + Some(entry) => Ok(entry.get_term()), + None => { + return Err(box_err!( + "can't find entry for index {} of region {}, peer_id: {}", + index, + region_id, + peer_id + )); + } + } + } + fn build_and_send_snapshot( &self, region_id: u64, @@ -408,17 +462,10 @@ impl TiFlashObserver { // Find term of entry at applied_index. let applied_index = apply_state.get_applied_index(); - let applied_term = match self.raft_engine.get_entry(region_id, applied_index)? 
{ - Some(apply_entry) => apply_entry.get_term(), - None => { - return Err(box_err!( - "can't find entry for applied_index {} of region {}, peer_id: {}", - applied_index, - region_id, - new_peer_id - )); - } - }; + let applied_term = self.check_entry_at_index(region_id, applied_index, new_peer_id)?; + // Will otherwise cause "got message with lower index than committed" loop. + self.check_entry_at_index(region_id, apply_state.get_commit_index(), new_peer_id)?; + let key = SnapKey::new(region_id, applied_term, applied_index); self.snap_mgr.register(key.clone(), SnapEntry::Generating); defer!(self.snap_mgr.deregister(&key, &SnapEntry::Generating)); @@ -453,8 +500,7 @@ impl TiFlashObserver { // SnapshotMeta // Which is snap.meta_file.meta - let snapshot_meta = - raftstore::store::snap::gen_snapshot_meta(snap.cf_files(), true)?; + let snapshot_meta = raftstore::store::snap::gen_snapshot_meta(snap.cf_files(), true)?; // Write MetaFile { @@ -494,8 +540,8 @@ impl TiFlashObserver { response.mut_message().set_term(inner_msg.get_term()); response.mut_message().set_snapshot(pb_snapshot); debug!( - "!!!! send snapshot key {} raft message {:?} snap data {:?}", - key, response, snap_data + "!!!! 
send snapshot key {} raft message {:?} snap data {:?} apply_state {:?}", + key, response, snap_data, apply_state ); match self.trans.lock() { Ok(mut trans) => match trans.send(response) { @@ -1254,13 +1300,14 @@ impl ApplySnapshotObserver for TiFlashOb region_id, |info: MapEntry>| match info { MapEntry::Occupied(mut o) => { - if !o.get().inited_or_fallback.load(Ordering::SeqCst) { + let is_first_snapsot = !o.get().inited_or_fallback.load(Ordering::SeqCst); + if is_first_snapsot { info!("fast path: applied first snapshot {}:{}, recover MsgAppend", self.store_id, region_id; "snap_key" => ?snap_key, ); + should_skip = true; + o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); } - should_skip = o.get().inited_or_fallback.load(Ordering::SeqCst); - o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); } MapEntry::Vacant(_) => { panic!("unknown snapshot!"); diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index de6b5d5d805..9cb56d119fb 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -208,11 +208,13 @@ pub fn write_kv_in_mem(region: &mut Region, cf_index: usize, k: &[u8], v: &[u8]) let pending_delete = &mut region.pending_delete[cf_index]; let pending_write = &mut region.pending_write[cf_index]; pending_delete.remove(k); + debug!("!!!! write_kv_in_mem {:?}", k); data.insert(k.to_vec(), v.to_vec()); pending_write.insert(k.to_vec(), v.to_vec()); } fn delete_kv_in_mem(region: &mut Region, cf_index: usize, k: &[u8]) { + debug!("!!!! 
delete_kv_in_mem {:?}", k); let data = &mut region.data[cf_index]; let pending_delete = &mut region.pending_delete[cf_index]; pending_delete.insert(k.to_vec()); @@ -1417,6 +1419,9 @@ unsafe extern "C" fn ffi_fast_add_peer( let region_bytes = region_local_state.get_region().write_to_bytes().unwrap(); let apply_state_ptr = create_cpp_str(Some(apply_state_bytes)); let region_ptr = create_cpp_str(Some(region_bytes)); + + // Check if we have commit_index. + debug!("recover from remote peer: ok from {} to {}", from_store, store_id; "region_id" => region_id); return ffi_interfaces::FastAddPeerRes { status: ffi_interfaces::FastAddPeerStatus::Ok, diff --git a/new-mock-engine-store/src/node.rs b/new-mock-engine-store/src/node.rs index 7f88e47a613..e88b5a8acac 100644 --- a/new-mock-engine-store/src/node.rs +++ b/new-mock-engine-store/src/node.rs @@ -145,7 +145,7 @@ impl Transport for ChannelTransport { SnapshotStatus::Finish, ), None => return Err(box_err!("Find no from_store {}", from_store)), - }; + }?; } Ok(()) } diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 962abcbe0b9..3dea073c9fa 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -162,3 +162,31 @@ fn test_fast_add_peer_from_delayed_learner_blocked() { simple_fast_add_peer(SourceType::DelayedLearner, true); fail::remove("on_pre_persist_with_finish"); } + +#[test] +fn test_existing_peer() { + fail::cfg("before_tiflash_check_double_write", "return").unwrap(); + + tikv_util::set_panic_hook(true, "./"); + let (mut cluster, pd_client) = new_mock_cluster(0, 2); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + // fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + disable_auto_gen_compact_log(&mut cluster); + // Disable auto generate peer. 
+ pd_client.disable_default_operator(); + let _ = cluster.run_conf_change(); + must_put_and_check_key(&mut cluster, 1, 2, Some(true), None, Some(vec![1])); + + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + pd_client.must_add_peer(1, new_learner_peer(2, 2)); + must_put_and_check_key(&mut cluster, 3, 4, Some(true), None, None); + fail::remove("fallback_to_slow_path_not_allow"); + + stop_tiflash_node(&mut cluster, 2); + fail::cfg("go_fast_path_not_allow", "panic").unwrap(); + restart_tiflash_node(&mut cluster, 2); + must_put_and_check_key(&mut cluster, 5, 6, Some(true), None, None); + + cluster.shutdown(); + fail::remove("go_fast_path_not_allow"); +} diff --git a/proxy_tests/proxy/flashback.rs b/proxy_tests/proxy/flashback.rs index b6d115376b5..c6286f3ae18 100644 --- a/proxy_tests/proxy/flashback.rs +++ b/proxy_tests/proxy/flashback.rs @@ -1,7 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::ops::DerefMut; - use futures::executor::block_on; use tikv_util::time::Duration; use txn_types::WriteBatchFlags; diff --git a/proxy_tests/proxy/region.rs b/proxy_tests/proxy/region.rs index f93834c6423..51362ef3917 100644 --- a/proxy_tests/proxy/region.rs +++ b/proxy_tests/proxy/region.rs @@ -227,7 +227,7 @@ fn test_add_absent_learner_peer_by_joint() { } use engine_traits::{Engines, KvEngine, RaftEngine}; -use raftstore::store::{write_initial_apply_state, write_initial_raft_state, RAFT_INIT_LOG_INDEX}; +use raftstore::store::{write_initial_apply_state, write_initial_raft_state}; pub fn prepare_bootstrap_cluster_with( engines: &Engines, From 7da9b92afecf78ef944469f625e0dc6b706b798f Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Mon, 12 Dec 2022 16:59:21 +0800 Subject: [PATCH 015/115] f Signed-off-by: CalvinNeo --- proxy_scripts/ci_check.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/proxy_scripts/ci_check.sh b/proxy_scripts/ci_check.sh index 70dbfdfa1f6..d443d70771a 100755 --- a/proxy_scripts/ci_check.sh +++ 
b/proxy_scripts/ci_check.sh @@ -1,6 +1,7 @@ set -uxeo pipefail if [[ $M == "fmt" ]]; then make gen_proxy_ffi + git status -s GIT_STATUS=$(git status -s) && if [[ ${GIT_STATUS} ]]; then echo "Error: found illegal git status"; echo ${GIT_STATUS}; [[ -z ${GIT_STATUS} ]]; fi cargo fmt -- --check >/dev/null elif [[ $M == "testold" ]]; then From fa6099ce13e04d8de8887ad8cb1aec54caf101f9 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Mon, 12 Dec 2022 17:29:48 +0800 Subject: [PATCH 016/115] f Signed-off-by: CalvinNeo --- engine_store_ffi/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index c88093ef571..9ee547cc3cd 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -1264,6 +1264,7 @@ pub struct EngineStoreConfig { pub enable_fast_add_peer: bool, } +#[allow(clippy::derivable_impls)] impl Default for EngineStoreConfig { fn default() -> Self { Self { From 1b335a33564f1d9e95dda95aa8ff96076ebe1c43 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Mon, 12 Dec 2022 18:06:42 +0800 Subject: [PATCH 017/115] Merge fast add peer (#230) --- .github/workflows/pr-ci.yml | 10 +- CHANGELOG.md | 1348 +++++++++++++++ Cargo.lock | 676 ++++---- Cargo.toml | 54 +- Dockerfile | 113 ++ Makefile | 13 +- cmd/tikv-ctl/Cargo.toml | 2 +- cmd/tikv-ctl/src/cmd.rs | 21 +- cmd/tikv-ctl/src/executor.rs | 29 +- cmd/tikv-ctl/src/main.rs | 19 +- cmd/tikv-ctl/src/util.rs | 58 + components/api_version/Cargo.toml | 2 +- components/backup-stream/Cargo.toml | 12 +- .../backup-stream/src/checkpoint_manager.rs | 225 ++- components/backup-stream/src/endpoint.rs | 22 +- components/backup-stream/src/errors.rs | 24 +- .../backup-stream/src/metadata/client.rs | 2 +- .../src/metadata/store/lazy_etcd.rs | 28 +- components/backup-stream/src/router.rs | 18 +- components/backup-stream/src/service.rs | 12 + components/backup-stream/tests/mod.rs | 120 +- components/backup/Cargo.toml | 4 +- components/backup/src/endpoint.rs | 293 +++- 
components/backup/src/writer.rs | 4 +- components/batch-system/src/metrics.rs | 7 + components/batch-system/src/router.rs | 19 +- components/causal_ts/Cargo.toml | 2 +- components/causal_ts/benches/tso.rs | 6 +- components/causal_ts/src/lib.rs | 1 - components/causal_ts/src/tso.rs | 2 +- components/cdc/Cargo.toml | 3 +- components/cdc/src/delegate.rs | 80 +- components/cdc/src/endpoint.rs | 71 +- components/cdc/src/initializer.rs | 78 +- components/cdc/src/old_value.rs | 10 +- components/cdc/src/service.rs | 10 +- .../cdc/tests/failpoints/test_endpoint.rs | 8 +- components/cdc/tests/integrations/test_cdc.rs | 241 ++- components/cdc/tests/mod.rs | 116 +- components/cloud/Cargo.toml | 2 +- components/cloud/aws/Cargo.toml | 4 +- components/cloud/aws/src/s3.rs | 6 +- components/cloud/azure/Cargo.toml | 2 +- components/cloud/azure/src/azblob.rs | 6 +- components/cloud/gcp/Cargo.toml | 4 +- components/cloud/gcp/src/gcs.rs | 165 +- components/cloud/gcp/src/lib.rs | 19 + components/cloud/src/blob.rs | 10 +- components/codec/src/byte.rs | 2 +- components/concurrency_manager/Cargo.toml | 2 +- .../concurrency_manager/benches/lock_table.rs | 1 - .../concurrency_manager/src/lock_table.rs | 13 +- components/encryption/Cargo.toml | 2 +- components/encryption/export/Cargo.toml | 2 +- components/encryption/src/crypter.rs | 2 +- .../encryption/src/encrypted_file/mod.rs | 4 +- components/encryption/src/manager/mod.rs | 10 +- components/engine_panic/Cargo.toml | 2 +- components/engine_panic/src/checkpoint.rs | 29 + components/engine_panic/src/lib.rs | 2 +- components/engine_panic/src/raft_engine.rs | 4 + components/engine_rocks/Cargo.toml | 2 +- components/engine_rocks/src/checkpoint.rs | 55 + components/engine_rocks/src/lib.rs | 3 +- components/engine_rocks/src/raft_engine.rs | 4 + components/engine_rocks/src/util.rs | 2 +- components/engine_rocks_helper/Cargo.toml | 2 +- components/engine_test/src/lib.rs | 102 +- components/engine_tirocks/Cargo.toml | 2 +- 
components/engine_traits/Cargo.toml | 2 +- components/engine_traits/src/checkpoint.rs | 20 + components/engine_traits/src/engine.rs | 16 +- components/engine_traits/src/lib.rs | 3 +- components/engine_traits/src/raft_engine.rs | 4 + components/error_code/Cargo.toml | 2 +- components/error_code/bin.rs | 2 +- components/error_code/src/backup_stream.rs | 7 +- components/error_code/src/sst_importer.rs | 3 +- components/external_storage/Cargo.toml | 2 +- components/external_storage/export/Cargo.toml | 2 +- .../external_storage/export/src/dylib.rs | 2 +- .../external_storage/export/src/export.rs | 50 +- .../external_storage/src/dylib_client.rs | 2 +- .../external_storage/src/grpc_client.rs | 2 +- components/external_storage/src/hdfs.rs | 13 +- components/external_storage/src/lib.rs | 176 +- components/external_storage/src/local.rs | 6 +- components/external_storage/src/noop.rs | 9 +- components/file_system/src/io_stats/proc.rs | 2 +- components/file_system/src/lib.rs | 8 +- components/into_other/Cargo.toml | 2 +- components/keys/Cargo.toml | 2 +- components/pd_client/Cargo.toml | 7 +- components/pd_client/src/client.rs | 39 +- components/pd_client/src/client_v2.rs | 1408 ++++++++++++++++ components/pd_client/src/lib.rs | 7 +- components/pd_client/src/tso.rs | 59 +- components/pd_client/src/util.rs | 65 +- components/raft_log_engine/Cargo.toml | 2 +- components/raft_log_engine/src/engine.rs | 10 +- components/raft_log_engine/src/lib.rs | 1 - components/raftstore-v2/Cargo.toml | 8 +- components/raftstore-v2/src/batch/store.rs | 104 +- components/raftstore-v2/src/fsm/apply.rs | 21 +- components/raftstore-v2/src/fsm/peer.rs | 16 +- components/raftstore-v2/src/fsm/store.rs | 53 +- components/raftstore-v2/src/lib.rs | 5 +- .../operation/command/admin/conf_change.rs | 41 +- .../src/operation/command/admin/mod.rs | 45 +- .../src/operation/command/admin/split.rs | 834 ++++++++++ .../src/operation/command/control.rs | 428 +++++ .../raftstore-v2/src/operation/command/mod.rs | 84 +- 
.../src/operation/command/write/mod.rs | 61 +- .../operation/command/write/simple_write.rs | 31 +- components/raftstore-v2/src/operation/life.rs | 57 +- components/raftstore-v2/src/operation/mod.rs | 9 +- components/raftstore-v2/src/operation/pd.rs | 230 +++ .../raftstore-v2/src/operation/query/lease.rs | 8 +- .../raftstore-v2/src/operation/query/local.rs | 2 +- .../raftstore-v2/src/operation/query/mod.rs | 22 +- .../src/operation/ready/async_writer.rs | 11 +- .../raftstore-v2/src/operation/ready/mod.rs | 181 ++- .../src/operation/ready/snapshot.rs | 403 +++++ components/raftstore-v2/src/raft/apply.rs | 30 +- components/raftstore-v2/src/raft/peer.rs | 296 +++- components/raftstore-v2/src/raft/storage.rs | 341 +++- components/raftstore-v2/src/router/imp.rs | 13 +- .../src/router/internal_message.rs | 4 +- components/raftstore-v2/src/router/message.rs | 18 +- .../src/router/response_channel.rs | 7 + components/raftstore-v2/src/worker/mod.rs | 5 + components/raftstore-v2/src/worker/pd/mod.rs | 327 ++++ .../src/worker/pd/region_heartbeat.rs | 256 +++ .../raftstore-v2/src/worker/pd/split.rs | 99 ++ .../src/worker/pd/store_heartbeat.rs | 293 ++++ .../src/worker/pd/update_max_timestamp.rs | 114 ++ .../tests/integrations/cluster.rs | 107 +- .../raftstore-v2/tests/integrations/mod.rs | 4 + .../tests/integrations/test_basic_write.rs | 2 +- .../tests/integrations/test_conf_change.rs | 40 +- .../tests/integrations/test_life.rs | 4 +- .../tests/integrations/test_pd_heartbeat.rs | 60 + .../tests/integrations/test_read.rs | 10 +- .../tests/integrations/test_split.rs | 183 +++ components/raftstore/Cargo.toml | 4 +- .../raftstore/src/coprocessor/dispatcher.rs | 18 + components/raftstore/src/coprocessor/mod.rs | 8 +- components/raftstore/src/lib.rs | 3 +- .../raftstore/src/store/async_io/mod.rs | 1 + .../raftstore/src/store/async_io/read.rs | 241 +++ .../raftstore/src/store/async_io/write.rs | 21 +- components/raftstore/src/store/bootstrap.rs | 1 + 
components/raftstore/src/store/config.rs | 40 +- .../raftstore/src/store/entry_storage.rs | 38 +- components/raftstore/src/store/fsm/apply.rs | 563 +++++-- components/raftstore/src/store/fsm/peer.rs | 150 +- components/raftstore/src/store/fsm/store.rs | 84 +- .../raftstore/src/store/local_metrics.rs | 9 + components/raftstore/src/store/metrics.rs | 42 +- components/raftstore/src/store/mod.rs | 16 +- components/raftstore/src/store/msg.rs | 10 +- components/raftstore/src/store/peer.rs | 63 +- .../raftstore/src/store/peer_storage.rs | 242 ++- components/raftstore/src/store/region_meta.rs | 20 +- components/raftstore/src/store/snap.rs | 130 +- components/raftstore/src/store/snap/io.rs | 2 +- components/raftstore/src/store/transport.rs | 11 +- components/raftstore/src/store/txn_ext.rs | 8 +- components/raftstore/src/store/util.rs | 143 +- .../src/store/worker/check_leader.rs | 2 +- .../raftstore/src/store/worker/metrics.rs | 3 + components/raftstore/src/store/worker/mod.rs | 7 +- components/raftstore/src/store/worker/pd.rs | 121 +- .../src/store/worker/raftlog_fetch.rs | 124 -- components/raftstore/src/store/worker/read.rs | 40 +- .../src/store/worker/split_controller.rs | 2 +- components/resolved_ts/Cargo.toml | 2 +- components/resolved_ts/src/advance.rs | 224 +-- components/resolved_ts/src/cmd.rs | 1 + components/resolved_ts/src/endpoint.rs | 115 +- components/resolved_ts/src/lib.rs | 3 - components/resolved_ts/src/observer.rs | 24 +- components/resolved_ts/src/sinker.rs | 45 - components/resolved_ts/src/util.rs | 12 - .../resolved_ts/tests/integrations/mod.rs | 2 +- components/resolved_ts/tests/mod.rs | 6 +- components/resource_metering/Cargo.toml | 2 +- .../resource_metering/src/recorder/mod.rs | 4 +- components/security/Cargo.toml | 4 - components/security/src/lib.rs | 43 +- components/server/Cargo.toml | 6 +- components/server/src/server.rs | 207 ++- components/server/src/signal_handler.rs | 2 +- components/sst_importer/Cargo.toml | 3 +- 
.../sst_importer/src/caching/cache_map.rs | 211 +++ components/sst_importer/src/caching/mod.rs | 4 + .../sst_importer/src/caching/storage_cache.rs | 58 + components/sst_importer/src/config.rs | 10 + components/sst_importer/src/errors.rs | 20 +- components/sst_importer/src/import_file.rs | 10 +- components/sst_importer/src/import_mode.rs | 27 +- components/sst_importer/src/lib.rs | 1 + components/sst_importer/src/metrics.rs | 10 + components/sst_importer/src/sst_importer.rs | 945 +++++++++-- components/sst_importer/src/util.rs | 8 + components/test_backup/Cargo.toml | 2 +- components/test_backup/src/lib.rs | 2 +- components/test_coprocessor/Cargo.toml | 4 +- components/test_pd/Cargo.toml | 2 +- components/test_pd/src/mocker/service.rs | 27 +- components/test_pd/src/server.rs | 25 +- components/test_pd/src/util.rs | 21 +- components/test_pd_client/Cargo.toml | 4 +- components/test_pd_client/src/pd.rs | 4 +- components/test_raftstore/Cargo.toml | 4 +- components/test_raftstore/src/cluster.rs | 2 +- .../test_raftstore/src/common-test.toml | 3 +- components/test_raftstore/src/server.rs | 34 +- .../test_raftstore/src/transport_simulate.rs | 2 +- components/test_raftstore/src/util.rs | 148 +- components/test_sst_importer/Cargo.toml | 2 +- components/test_storage/Cargo.toml | 2 +- components/test_storage/src/sync_storage.rs | 8 +- components/test_util/Cargo.toml | 2 +- components/test_util/src/runner.rs | 4 +- components/tidb_query_aggr/Cargo.toml | 2 +- .../tidb_query_aggr/src/impl_max_min.rs | 6 +- .../tidb_query_codegen/src/rpn_function.rs | 2 +- components/tidb_query_common/Cargo.toml | 4 +- components/tidb_query_datatype/Cargo.toml | 6 +- .../src/codec/collation/charset.rs | 8 + .../src/codec/collation/mod.rs | 28 + .../tidb_query_datatype/src/codec/convert.rs | 14 +- .../src/codec/mysql/decimal.rs | 16 +- .../src/codec/mysql/duration.rs | 4 +- .../src/codec/mysql/json/binary.rs | 24 +- .../src/codec/mysql/json/json_extract.rs | 319 +++- 
.../src/codec/mysql/json/json_modify.rs | 2 +- .../src/codec/mysql/json/json_remove.rs | 7 +- .../src/codec/mysql/json/modifier.rs | 21 +- .../src/codec/mysql/json/path_expr.rs | 671 ++++---- .../src/codec/mysql/time/extension.rs | 2 +- .../tidb_query_datatype/src/codec/overflow.rs | 6 +- .../src/codec/row/v2/compat_v1.rs | 1 + .../tidb_query_datatype/src/codec/table.rs | 2 +- .../tidb_query_datatype/src/expr/ctx.rs | 4 +- components/tidb_query_executors/Cargo.toml | 6 +- .../src/simple_aggr_executor.rs | 2 +- .../src/top_n_executor.rs | 4 +- components/tidb_query_expr/Cargo.toml | 2 +- .../tidb_query_expr/src/impl_arithmetic.rs | 34 +- components/tidb_query_expr/src/impl_cast.rs | 16 +- .../tidb_query_expr/src/impl_compare.rs | 8 +- components/tidb_query_expr/src/impl_json.rs | 57 + components/tidb_query_expr/src/impl_like.rs | 185 ++- components/tidb_query_expr/src/impl_math.rs | 4 +- components/tidb_query_expr/src/impl_op.rs | 4 +- components/tidb_query_expr/src/impl_time.rs | 8 +- components/tidb_query_expr/src/lib.rs | 50 +- components/tikv_kv/Cargo.toml | 3 +- components/tikv_kv/src/btree_engine.rs | 32 +- components/tikv_kv/src/cursor.rs | 24 +- components/tikv_kv/src/lib.rs | 167 +- components/tikv_kv/src/mock_engine.rs | 33 +- components/tikv_kv/src/raft_extension.rs | 69 + components/tikv_kv/src/rocksdb_engine.rs | 136 +- components/tikv_util/Cargo.toml | 7 +- components/tikv_util/src/buffer_vec.rs | 86 +- components/tikv_util/src/codec/bytes.rs | 2 +- .../tikv_util/src/codec/stream_event.rs | 12 +- components/tikv_util/src/config.rs | 28 +- components/tikv_util/src/future.rs | 20 + components/tikv_util/src/lib.rs | 2 +- components/tikv_util/src/logger/file_log.rs | 2 +- components/tikv_util/src/mpsc/future.rs | 74 +- components/tikv_util/src/store/mod.rs | 79 +- components/tikv_util/src/store/peer.rs | 47 +- components/tikv_util/src/store/region.rs | 21 +- components/tikv_util/src/stream.rs | 86 +- components/tikv_util/src/sys/cgroup.rs | 30 +- 
components/tikv_util/src/sys/disk.rs | 9 + components/tikv_util/src/sys/inspector.rs | 2 +- components/tikv_util/src/sys/mod.rs | 61 + components/tikv_util/src/sys/thread.rs | 4 +- components/tipb_helper/Cargo.toml | 2 +- components/tracker/Cargo.toml | 2 +- components/txn_types/Cargo.toml | 2 +- components/txn_types/src/lib.rs | 4 +- components/txn_types/src/lock.rs | 137 +- components/txn_types/src/types.rs | 15 +- components/txn_types/src/write.rs | 104 ++ engine_store_ffi/src/interfaces.rs | 86 +- engine_store_ffi/src/lib.rs | 200 +-- engine_store_ffi/src/observer.rs | 560 ++++++- engine_store_ffi/src/ps_engine.rs | 177 +- engine_tiflash/src/engine.rs | 29 +- engine_tiflash/src/proxy_utils.rs | 2 + engine_tiflash/src/raft_engine.rs | 4 + etc/config-template.toml | 6 + fuzz/cli.rs | 18 +- metrics/grafana/tikv_details.json | 455 +++++- new-mock-engine-store/src/lib.rs | 1262 ++------------ new-mock-engine-store/src/mock_cluster.rs | 89 +- new-mock-engine-store/src/mock_store.rs | 1443 +++++++++++++++++ new-mock-engine-store/src/node.rs | 20 +- new-mock-engine-store/src/server.rs | 69 +- .../src/transport_simulate.rs | 2 +- proxy_scripts/ci_check.sh | 22 +- proxy_server/Cargo.toml | 2 +- proxy_server/src/config.rs | 14 +- proxy_server/src/run.rs | 69 +- proxy_tests/Cargo.toml | 3 +- proxy_tests/proxy/config.rs | 199 +++ proxy_tests/proxy/fast_add_peer.rs | 164 ++ proxy_tests/proxy/flashback.rs | 25 +- proxy_tests/proxy/mod.rs | 5 + proxy_tests/proxy/normal.rs | 1396 ++-------------- proxy_tests/proxy/proxy.rs | 542 +++++-- proxy_tests/proxy/region.rs | 615 +++++++ proxy_tests/proxy/server_cluster_test.rs | 52 +- proxy_tests/proxy/snapshot.rs | 454 ++++++ proxy_tests/proxy/write.rs | 520 ++++++ .../ffi/src/RaftStoreProxyFFI/@version | 2 +- .../ffi/src/RaftStoreProxyFFI/ProxyFFI.h | 56 +- rust-toolchain | 2 +- scripts/clippy | 6 +- src/config.rs | 49 +- src/coprocessor/endpoint.rs | 181 ++- src/coprocessor/statistics/analyze.rs | 4 +- 
src/coprocessor/statistics/histogram.rs | 2 +- src/coprocessor_v2/plugin_registry.rs | 34 +- src/import/sst_service.rs | 401 +++-- src/lib.rs | 4 +- src/read_pool.rs | 9 +- src/server/config.rs | 20 +- src/server/debug.rs | 44 +- src/server/engine_factory.rs | 2 +- src/server/engine_factory_v2.rs | 135 +- src/server/errors.rs | 5 +- .../gc_worker/applied_lock_collector.rs | 894 ---------- src/server/gc_worker/gc_manager.rs | 3 +- src/server/gc_worker/gc_worker.rs | 405 +---- src/server/gc_worker/mod.rs | 1 - src/server/lock_manager/mod.rs | 8 +- src/server/lock_manager/waiter_manager.rs | 47 +- src/server/metrics.rs | 2 - src/server/mod.rs | 1 + src/server/node.rs | 2 +- src/server/raft_client.rs | 282 ++-- src/server/{raftkv.rs => raftkv/mod.rs} | 536 +++--- src/server/raftkv/raft_extension.rs | 177 ++ src/server/reset_to_version.rs | 82 +- src/server/resolve.rs | 44 +- src/server/server.rs | 94 +- src/server/service/debug.rs | 117 +- src/server/service/diagnostics/log.rs | 8 +- src/server/service/diagnostics/sys.rs | 2 +- src/server/service/kv.rs | 637 ++------ src/server/snap.rs | 42 +- src/server/status_server/profile.rs | 4 +- src/server/tablet_snap.rs | 537 ++++++ src/server/transport.rs | 37 +- src/storage/config.rs | 6 +- src/storage/errors.rs | 49 +- src/storage/lock_manager/lock_wait_context.rs | 231 ++- .../lock_manager/lock_waiting_queue.rs | 175 +- src/storage/lock_manager/mod.rs | 25 +- src/storage/metrics.rs | 1 + src/storage/mod.rs | 1113 +++++++++++-- src/storage/mvcc/mod.rs | 10 + src/storage/mvcc/reader/point_getter.rs | 58 +- src/storage/mvcc/reader/reader.rs | 559 +++---- src/storage/mvcc/reader/scanner/forward.rs | 128 +- src/storage/mvcc/reader/scanner/mod.rs | 6 +- src/storage/mvcc/txn.rs | 5 + src/storage/raw/raw_mvcc.rs | 22 +- .../txn/actions/acquire_pessimistic_lock.rs | 458 +++++- src/storage/txn/actions/commit.rs | 52 +- .../txn/actions/flashback_to_version.rs | 550 +++++-- src/storage/txn/actions/prewrite.rs | 320 +++- 
src/storage/txn/actions/tests.rs | 37 + .../txn/commands/acquire_pessimistic_lock.rs | 201 +-- .../acquire_pessimistic_lock_resumed.rs | 441 +++++ src/storage/txn/commands/atomic_store.rs | 2 +- .../txn/commands/check_secondary_locks.rs | 2 +- src/storage/txn/commands/check_txn_status.rs | 25 +- src/storage/txn/commands/cleanup.rs | 2 +- src/storage/txn/commands/commit.rs | 2 +- src/storage/txn/commands/compare_and_swap.rs | 2 +- .../txn/commands/flashback_to_version.rs | 170 +- .../flashback_to_version_read_phase.rs | 260 ++- src/storage/txn/commands/mod.rs | 75 +- src/storage/txn/commands/pause.rs | 2 +- .../txn/commands/pessimistic_rollback.rs | 2 +- src/storage/txn/commands/prewrite.rs | 245 ++- src/storage/txn/commands/resolve_lock.rs | 2 +- src/storage/txn/commands/resolve_lock_lite.rs | 2 +- src/storage/txn/commands/rollback.rs | 2 +- src/storage/txn/commands/txn_heart_beat.rs | 2 +- src/storage/txn/latch.rs | 245 ++- src/storage/txn/mod.rs | 9 +- src/storage/txn/sched_pool.rs | 17 +- src/storage/txn/scheduler.rs | 594 +++++-- src/storage/txn/store.rs | 24 +- src/storage/types.rs | 246 ++- tests/Cargo.toml | 9 +- tests/benches/hierarchy/mvcc/mod.rs | 4 +- tests/benches/hierarchy/txn/mod.rs | 2 + .../misc/coprocessor/codec/mysql/json/mod.rs | 2 +- tests/benches/misc/raftkv/mod.rs | 25 +- tests/failpoints/cases/mod.rs | 2 + tests/failpoints/cases/test_coprocessor.rs | 4 +- tests/failpoints/cases/test_gc_metrics.rs | 3 - tests/failpoints/cases/test_gc_worker.rs | 284 ---- tests/failpoints/cases/test_kv_service.rs | 89 - tests/failpoints/cases/test_merge.rs | 8 + tests/failpoints/cases/test_pd_client.rs | 141 +- .../failpoints/cases/test_pd_client_legacy.rs | 230 +++ tests/failpoints/cases/test_snap.rs | 10 +- tests/failpoints/cases/test_split_region.rs | 4 + tests/failpoints/cases/test_storage.rs | 185 ++- tests/failpoints/cases/test_transaction.rs | 2 + .../failpoints/cases/test_transfer_leader.rs | 6 + tests/failpoints/cases/test_witness.rs | 71 + 
tests/integrations/backup/mod.rs | 2 +- .../integrations/config/dynamic/gc_worker.rs | 10 +- .../integrations/config/dynamic/raftstore.rs | 4 +- tests/integrations/config/dynamic/snap.rs | 3 +- tests/integrations/config/mod.rs | 7 +- tests/integrations/config/test-custom.toml | 3 + .../integrations/config/test_config_client.rs | 2 +- tests/integrations/coprocessor/test_select.rs | 263 ++- tests/integrations/pd/mod.rs | 1 + tests/integrations/pd/test_rpc_client.rs | 305 ++-- .../integrations/pd/test_rpc_client_legacy.rs | 691 ++++++++ tests/integrations/raftstore/mod.rs | 1 + .../integrations/raftstore/test_flashback.rs | 246 ++- tests/integrations/raftstore/test_merge.rs | 6 + tests/integrations/raftstore/test_multi.rs | 2 + .../raftstore/test_split_region.rs | 4 + .../raftstore/test_transfer_leader.rs | 4 + .../raftstore/test_unsafe_recovery.rs | 1 - tests/integrations/raftstore/test_witness.rs | 537 ++++++ .../resource_metering/test_read_keys.rs | 26 +- tests/integrations/server/gc_worker.rs | 258 +-- tests/integrations/server/kv_service.rs | 580 ++++--- tests/integrations/server/lock_manager.rs | 5 +- tests/integrations/server/raft_client.rs | 96 +- 448 files changed, 30972 insertions(+), 11306 deletions(-) create mode 100644 CHANGELOG.md create mode 100644 Dockerfile create mode 100644 components/engine_panic/src/checkpoint.rs create mode 100644 components/engine_rocks/src/checkpoint.rs create mode 100644 components/engine_traits/src/checkpoint.rs create mode 100644 components/pd_client/src/client_v2.rs create mode 100644 components/raftstore-v2/src/operation/command/admin/split.rs create mode 100644 components/raftstore-v2/src/operation/command/control.rs create mode 100644 components/raftstore-v2/src/operation/pd.rs create mode 100644 components/raftstore-v2/src/operation/ready/snapshot.rs create mode 100644 components/raftstore-v2/src/worker/mod.rs create mode 100644 components/raftstore-v2/src/worker/pd/mod.rs create mode 100644 
components/raftstore-v2/src/worker/pd/region_heartbeat.rs create mode 100644 components/raftstore-v2/src/worker/pd/split.rs create mode 100644 components/raftstore-v2/src/worker/pd/store_heartbeat.rs create mode 100644 components/raftstore-v2/src/worker/pd/update_max_timestamp.rs create mode 100644 components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs create mode 100644 components/raftstore-v2/tests/integrations/test_split.rs create mode 100644 components/raftstore/src/store/async_io/read.rs delete mode 100644 components/raftstore/src/store/worker/raftlog_fetch.rs delete mode 100644 components/resolved_ts/src/sinker.rs delete mode 100644 components/resolved_ts/src/util.rs create mode 100644 components/sst_importer/src/caching/cache_map.rs create mode 100644 components/sst_importer/src/caching/mod.rs create mode 100644 components/sst_importer/src/caching/storage_cache.rs create mode 100644 components/tikv_kv/src/raft_extension.rs create mode 100644 new-mock-engine-store/src/mock_store.rs create mode 100644 proxy_tests/proxy/config.rs create mode 100644 proxy_tests/proxy/fast_add_peer.rs create mode 100644 proxy_tests/proxy/region.rs create mode 100644 proxy_tests/proxy/snapshot.rs create mode 100644 proxy_tests/proxy/write.rs delete mode 100644 src/server/gc_worker/applied_lock_collector.rs rename src/server/{raftkv.rs => raftkv/mod.rs} (63%) create mode 100644 src/server/raftkv/raft_extension.rs create mode 100644 src/server/tablet_snap.rs create mode 100644 src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs create mode 100644 tests/failpoints/cases/test_pd_client_legacy.rs create mode 100644 tests/failpoints/cases/test_witness.rs create mode 100644 tests/integrations/pd/test_rpc_client_legacy.rs create mode 100644 tests/integrations/raftstore/test_witness.rs diff --git a/.github/workflows/pr-ci.yml b/.github/workflows/pr-ci.yml index 9afd74c413e..4480af5f7ba 100644 --- a/.github/workflows/pr-ci.yml +++ b/.github/workflows/pr-ci.yml @@ -6,7 +6,7 
@@ on: # - 'raftstore-proxy*' pull_request: branches: - - 'raftstore-proxy*' + - 'ldz/*' jobs: build-check-old: @@ -26,6 +26,10 @@ jobs: key: ${{ runner.os }}-cargo-${{ hashFiles('**/rust-toolchain') }} restore-keys: | ${{ runner.os }}-cargo- + - name: Install dependencies (protocol buffers compiler) + uses: arduino/setup-protoc@v1 + with: + version: '3.8.0' - name: install rust if: steps.cache-cargo.outputs.cache-hit != 'true' run: | @@ -69,6 +73,10 @@ jobs: key: ${{ runner.os }}-cargo-${{ hashFiles('**/rust-toolchain') }} restore-keys: | ${{ runner.os }}-cargo- + - name: Install dependencies (protocol buffers compiler) + uses: arduino/setup-protoc@v1 + with: + version: '3.8.0' - name: install rust if: steps.cache-cargo.outputs.cache-hit != 'true' run: | diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000000..26fd52f2bd5 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,1348 @@ +# TiKV Change Log +All notable changes to this project are documented in this file. +See also [TiDB Release Notes](https://github.com/pingcap/docs/blob/master/releases/release-notes.md) and [PD Changelog](https://github.com/pingcap/pd/blob/master/CHANGELOG.md). + +## [5.3.0] - 2021-11-29 + ++ Improvements + + Enhance disk space protection to improve storage stability + + Simplify the algorithm of L0 flow control [#10879](https://github.com/tikv/tikv/pull/10879) + + Improve the error log report in the raft client module [#10944](https://github.com/tikv/tikv/pull/10944) + + Improve logging threads to avoid them becoming a performance bottleneck [#10841](https://github.com/tikv/tikv/pull/10841) + + Add more statistics types of write queries [#10507](https://github.com/tikv/tikv/pull/10507) ++ Bug Fixes + + Fix the issue of unavailable TiKV caused by Raftstore deadlock when migrating Regions. The workaround is to disable the scheduling and restart the unavailable TiKV. 
[#10909](https://github.com/tikv/tikv/pull/10909) + + Fix the issue that CDC adds scan retries frequently due to the Congest error [#11082](https://github.com/tikv/tikv/pull/11082) + + Fix the issue that the Raft connection is broken when the channel is full [#11047](https://github.com/tikv/tikv/pull/11047) + + Fix the issue that batch messages are too large in Raft client implementation [#9714](https://github.com/tikv/tikv/pull/9714) + + Fix the issue that some coroutines leak in resolved_ts [#10965](https://github.com/tikv/tikv/pull/10965) + + Fix a panic issue that occurs to the coprocessor when the size of response exceeds 4 GiB [#9012](https://github.com/tikv/tikv/pull/9012) + + Fix the issue that snapshot Garbage Collection (GC) misses GC snapshot files when snapshot files cannot be garbage collected [#10813](https://github.com/tikv/tikv/pull/10813) + + Fix a panic issue caused by timeout when processing Coprocessor requests [#10852](https://github.com/tikv/tikv/pull/10852) + + Fix a memory leak caused by monitoring data of statistics threads [#11195](https://github.com/tikv/tikv/pull/11195) + + Fix a panic issue caused by getting the cgroup information from some platforms [#10980](https://github.com/tikv/tikv/pull/10980) + +## [5.2.3] - 2021-12-02 ++ Bug Fixes + + Fix the issue that the GcKeys task does not work when it is called by multiple keys. Caused by this issue, compaction filer GC might not drop the MVCC deletion information. 
[#11217](https://github.com/tikv/tikv/pull/11217) + +## [5.2.2] - 2021-10-29 + ++ Improvements + + Simplify the algorithm of L0 flow control [#10879](https://github.com/tikv/tikv/pull/10879) + + Improve the error log report in raft client module [#10983](https://github.com/tikv/tikv/pull/10983) + + Make the slow log of TiKV coprocessor only consider the time spent on processing requests [#10841](https://github.com/tikv/tikv/pull/10841) + + Drop log instead of blocking threads when the slogger thread is overloaded and the queue is filled up [#10841](https://github.com/tikv/tikv/pull/10841) + + Add more statistics types of write queries [#10507](https://github.com/tikv/tikv/pull/10507) ++ Bug Fixes + + Fix the issue that CDC adds scan retries frequently due to the Congest error [#11082](https://github.com/tikv/tikv/pull/11082) + + Fix the issue that the Raft connection is broken when the channel is full [#11047](https://github.com/tikv/tikv/pull/11047) + + Fix the issue that batch messages are too large in Raft client implementation [#9714](https://github.com/tikv/tikv/pull/9714) + + Fix the issue that some coroutines leak in resolved_ts [#10965](https://github.com/tikv/tikv/pull/10965) + + Fix a panic issue that occurs to coprocessor when response size exceeds 4 GiB [#9012](https://github.com/tikv/tikv/pull/9012) + + Fix the issue that snapshot Garbage Collection (GC) misses GC snapshot files when snapshot files cannot be garbage collected [#10813](https://github.com/tikv/tikv/pull/10813) + + Fix a panic issue that occurs when processing coprocessor requests times out [#10852](https://github.com/tikv/tikv/pull/10852) + +## [5.2.1] - 2021-09-09 + ++ Bug Fixes + + Fix the issue of unavailable TiKV caused by Raftstore deadlock when migrating Regions. The workaround is to disable the scheduling and restart the unavailable TiKV. 
[#10909](https://github.com/tikv/tikv/pull/10909) + +## [5.2.0] - 2021-08-27 + ++ Bug Fixes + + Fix the wrong tikv_raftstore_hibernated_peer_state metric [#10330](https://github.com/tikv/tikv/pull/10330) + + Fix the wrong arguments type of the json_unquote() function in the coprocessor [#10176](https://github.com/tikv/tikv/pull/10176) + + Skip clearing callback during graceful shutdown to avoid breaking ACID in some cases [#10353](https://github.com/tikv/tikv/pull/10353) [#10307](https://github.com/tikv/tikv/pull/10307) + + Fix a bug that the read index is shared for replica reads on a Leader [#10347](https://github.com/tikv/tikv/pull/10347) + + Fix the wrong function that casts DOUBLE to DOUBLE [#25200](https://github.com/tikv/tikv/pull/25200) + +## [5.1.3] - 2021-12-03 + ++ Bug Fixes + + Fix the issue that the GcKeys task does not work when it is called by multiple keys. Caused by this issue, compaction filer GC might not drop the MVCC deletion information. [#11217](https://github.com/tikv/tikv/pull/11217) + +## [5.1.2] - 2021-09-27 + ++ Improvements + + Support dynamically modifying TiCDC configurations [#10645](https://github.com/tikv/tikv/pull/10645) + + Reduce the size of Resolved TS message to save network bandwidth [#2448](https://github.com/tikv/tikv/pull/2448) + + Limit the counts of peer stats in the heartbeat message reported by a single store [#10621](https://github.com/tikv/tikv/pull/10621) ++ Bug Fixes + + Fix a bug that some files are missed to be imported during the process of importing snapshot files when upgrading TiKV from v3.x to v4.x or v5.x [#10902](https://github.com/tikv/tikv/pull/10902) + + Fix the issue that the GC (Garbage Collection) failure (such as file corrupted) of a single snapshot file stops the GC process of all other GC-able files [#10813](https://github.com/tikv/tikv/pull/10813) + + The slow log of TiKV coprocessor only considers the time spent on processing requests [#10841](https://github.com/tikv/tikv/pull/10841) + + Drop 
log instead of blocking threads when the slogger thread is overloaded and the queue is filled up [#10841](https://github.com/tikv/tikv/pull/10841) + + Fix a bug of the panic caused by timeout when processing Coprocessor requests [#10852](https://github.com/tikv/tikv/pull/10852) + + Fix the TiKV panic issue that occurs when upgrading from a pre-5.0 version with Titan enabled [#10842](https://github.com/tikv/tikv/pull/10842) + + Fix the issue that TiKV of a newer version cannot be rolled back to v5.0.x [#10842](https://github.com/tikv/tikv/pull/10842) + + Fix the issue that TiKV might delete files before it ingests to RocksDB [#10438](https://github.com/tikv/tikv/pull/10438) + + Fix the parsing failure caused by the left pessimistic locks [#26404](https://github.com/tikv/tikv/pull/26404) + +## [5.1.1] - 2021-07-30 + ++ Improvements + + Make the prewrite requests as idempotent as possible to reduce the chance of undetermined errors [#10586](https://github.com/tikv/tikv/pull/10586) + + Prevent the risk of stack overflow when handling many expired commands [#10502](https://github.com/tikv/tikv/pull/10502) + + Avoid excessive commit request retrying by not using the Stale Read request's start_ts to update max_ts [#10451](https://github.com/tikv/tikv/pull/10451) + + Handle read ready and write ready separately to reduce read latency [#10592](https://github.com/tikv/tikv/pull/10592) + + Reduce the impact on data import speed when the I/O rate limiting is enabled [#10390](https://github.com/tikv/tikv/pull/10390) + + Improve the load balance between Raft gRPC connections [#10495](https://github.com/tikv/tikv/pull/10495) ++ Bug Fixes + + Fix the issue that the duration calculation might panic on certain platforms [#10569](https://github.com/tikv/tikv/pull/10569) + + Fix the issue that Load Base Split mistakenly uses the unencoded keys of batch_get_command [#10542](https://github.com/tikv/tikv/pull/10542) + + Fix the issue that changing the resolved-ts.advance-ts-interval 
configuration online cannot take effect immediately [#10426](https://github.com/tikv/tikv/pull/10426) + + Fix the issue of follower metadata corruption in rare cases with more than 4 replicas [#10225](https://github.com/tikv/tikv/pull/10225) + + Fix the panic issue that occurs when building a snapshot twice if encryption is enabled [#9786](https://github.com/tikv/tikv/pull/9786) [#10407](https://github.com/tikv/tikv/pull/10407) + + Fix the wrong tikv_raftstore_hibernated_peer_state metric [#10330](https://github.com/tikv/tikv/pull/10330) + + Fix the wrong arguments type of the json_unquote() function in the coprocessor [#10176](https://github.com/tikv/tikv/pull/10176) + + Fix a bug that the index keys in a pessimistic transaction might be repeatedly committed [#10468](https://github.com/tikv/tikv/pull/10468) + + Fix the issue that the ReadIndex request returns stale result right after the leader is transferred [#9351](https://github.com/tikv/tikv/pull/9351) + +## [5.1.0] - 2021-06-24 + ++ Improvements + + Use zstd to compress Region snapshots, preventing large space differences between nodes in case of heavy scheduling or scaling [#10005](https://github.com/tikv/tikv/pull/10005) + + Solve OOM issues in multiple cases [#10183](https://github.com/tikv/tikv/pull/10183) + + Add memory usage tracking for each module + + Solve the OOM issue caused by oversized Raft entries cache + + Solve the OOM issue caused by stacked GC tasks + + Solve the OOM issue caused by fetching too many Raft entries from the Raft log to memory at one time + + Split Regions more evenly to mitigate the issue that the growth of Region size exceeds the splitting speed when there are hotspot writes [#9785](https://github.com/tikv/tikv/pull/9785) ++ Bug Fixes + + Fix the issue that the coprocessor fails to properly handle the signed or unsigned integer types in the IN expression [#9821](https://github.com/tikv/tikv/pull/9821) + + Fix the issue of many empty Regions after batch ingesting SST files 
[#964](https://github.com/tikv/tikv/pull/964) + + Fix a bug that TiKV cannot start up after the file dictionary file is damaged [#9886](https://github.com/tikv/tikv/pull/9886) + + Fix a TiCDC OOM issue caused by reading old values [#9996](https://github.com/tikv/tikv/pull/9996) [#9981](https://github.com/tikv/tikv/pull/9981) + + Fix the issue of empty value in the secondary index for the clustered primary key column when collation is latin1_bin [#24548](https://github.com/tikv/tikv/pull/24548) + + Add the abort-on-panic configuration, which allows TiKV to generate the core dump file when panic occurs. Users still need to correctly configure the environment to enable core dump [#10216](https://github.com/tikv/tikv/pull/10216) + + Fix the performance regression issue of point get queries that occurs when TiKV is not busy [#10046](https://github.com/tikv/tikv/pull/10046) + +## [5.0.6] - 2021-12-30 + ++ Improvements + + Increase the speed of inserting SST files by moving the verification process to the Import thread pool from the Apply thread pool [#11239](https://github.com/tikv/tikv/pull/11239) + + Add more metrics for the garbage collection module of Raft logs to locate performance problems in the module [#11374](https://github.com/tikv/tikv/pull/11374) + + Collapse some uncommon storage-related metrics in Grafana dashboard [#11681](https://github.com/tikv/tikv/pull/11681) ++ Bug Fixes + + Fix the issue that a down TiKV node causes the resolved timestamp to lag [#11351](https://github.com/tikv/tikv/pull/11351) + + Fix the issue that TiKV cannot detect the memory lock when TiKV perform a reverse table scan [#11440](https://github.com/tikv/tikv/pull/11440) + + Fix the issue that the accumulation of GC tasks might cause TiKV to be OOM (out of memory) [#11410](https://github.com/tikv/tikv/pull/11410) + + Fix the issue of TiKV panic that occurs when the files do not exist when TiDB Lightning imports data [#10438](https://github.com/tikv/tikv/pull/10438) + + Fix the issue 
that the node of a TiKV replica is down after the node gets snapshots because TiKV cannot modify the metadata accurately [#10225](https://github.com/tikv/tikv/pull/10225) + + Fix the leak issue of the backup thread pool [#10287](https://github.com/tikv/tikv/pull/10287) + + Fix the issue of casting illegal strings into floating-point numbers [#23322](https://github.com/tikv/tikv/pull/23322) + +## [5.0.5] - 2021-12-02 + ++ Bug Fixes + + Fix the issue that the GcKeys task does not work when it is called by multiple keys. Caused by this issue, compaction filer GC might not drop the MVCC deletion information. [#11217](https://github.com/tikv/tikv/pull/11217) + +## [5.0.4] - 2021-09-14 + ++ Improvements + + Limit the TiCDC sink's memory consumption [#10305](https://github.com/tikv/tikv/pull/10305) + + Add the memory-bounded upper limit for the TiCDC old value cache [#10313](https://github.com/tikv/tikv/pull/10313) ++ Bug Fixes + + Fix the wrong tikv_raftstore_hibernated_peer_state metric [#10330](https://github.com/tikv/tikv/pull/10330) + + Fix the wrong arguments type of the json_unquote() function in the coprocessor [#10176](https://github.com/tikv/tikv/pull/10176) + + Skip clearing callback during graceful shutdown to avoid breaking ACID in some cases [#10353](https://github.com/tikv/tikv/pull/10353) [#10307](https://github.com/tikv/tikv/pull/10307) + + Fix a bug that the read index is shared for replica reads on a Leader [#10347](https://github.com/tikv/tikv/pull/10347) + + Fix the wrong function that casts DOUBLE to DOUBLE [#25200](https://github.com/tikv/tikv/pull/25200) + +## [5.0.3] - 2021-07-02 + ++ Improvements + + Limit the TiCDC sink's memory consumption [#10305](https://github.com/tikv/tikv/pull/10305) + + Add the memory-bounded upper limit for the TiCDC old value cache [#10313](https://github.com/tikv/tikv/pull/10313) ++ Bug Fixes + + Fix the wrong tikv_raftstore_hibernated_peer_state metric [#10330](https://github.com/tikv/tikv/pull/10330) + + Fix the 
wrong arguments type of the json_unquote() function in the coprocessor [#10176](https://github.com/tikv/tikv/pull/10176) + + Skip clearing callback during graceful shutdown to avoid breaking ACID in some cases [#10353](https://github.com/tikv/tikv/pull/10353) [#10307](https://github.com/tikv/tikv/pull/10307) + + Fix a bug that the read index is shared for replica reads on a Leader [#10347](https://github.com/tikv/tikv/pull/10347) + + Fix the wrong function that casts DOUBLE to DOUBLE [#25200](https://github.com/tikv/tikv/pull/25200) + +## [5.0.2] - 2021-06-09 + ++ New Features + + Enable the Hibernate Region feature by default [#10266](https://github.com/tikv/tikv/pull/10266) ++ Improvements + + BR now supports the S3-compatible storage using the virtual-host addressing mode [#10243](https://github.com/tikv/tikv/pull/10243) + + Support the back pressure for TiCDC's scan speed [#10151](https://github.com/tikv/tikv/pull/10151) + + Reduce the memory usage of TiCDC's initial scan [#10133](https://github.com/tikv/tikv/pull/10133) + + Improve the cache hit ratio of the TiCDC's Old Value feature in the pessimistic transaction [#10089](https://github.com/tikv/tikv/pull/10089) + + Split Regions more evenly [#10086](https://github.com/tikv/tikv/pull/10086) ++ Bug Fixes + + Fix a TiCDC OOM issue caused by reading old values [#9996](https://github.com/tikv/tikv/pull/9996) [#9981](https://github.com/tikv/tikv/pull/9981) + + Fix the issue of empty value in the secondary index for the clustered primary key column when collation is latin1_bin [#24548](https://github.com/tikv/tikv/pull/24548) + + Add the abort-on-panic configuration, which allows TiKV to generate the core dump file when panic occurs. 
Users still need to correctly configure the environment to enable core dump [#10216](https://github.com/tikv/tikv/pull/10216) + + Fix the performance regression issue of point get queries that occurs when TiKV is not busy [#10046](https://github.com/tikv/tikv/pull/10046) + +## [5.0.1] - 2021-04-23 + ++ Improvements + + Use `zstd` to compress the Region snapshot [#10005](https://github.com/tikv/tikv/pull/10005) ++ Bug Fixes + + Fix the issue that the coprocessor fails to properly handle the signed or unsigned integer types in the `IN` expression [#10018](https://github.com/tikv/tikv/pull/10018) + + Fix the issue of many empty Regions after batch ingesting SST files [#10015](https://github.com/tikv/tikv/pull/10015) + + Fix the potential panic that occurs when the input of `cast_string_as_time` is invalid UTF-8 bytes [#9995](https://github.com/tikv/tikv/pull/9995) + + Fix a bug that TiKV cannot start up after the file dictionary file is damaged [#9992](https://github.com/tikv/tikv/pull/9992) + +## [5.0.0] - 2021-04-07 + ++ Compatibility Changes + + Replace the `rocksdb.auto-tuned` configuration item with [`rocksdb.rate-limiter-auto-tuned`](/tikv-configuration-file.md#rate-limiter-auto-tuned-new-in-v50) + + Delete the `raftstore.sync-log` configuration item. By default, written data is forcibly spilled to the disk. Before v5.0, you can explicitly disable `raftstore.sync-log`. Since v5.0, the configuration value is forcibly set to `true` + + Change the default value of the `gc.enable-compaction-filter` configuration item from `false` to `true` + + Change the default value of the [`rate-limiter-auto-tuned`](/tikv-configuration-file.md#rate-limiter-auto-tuned-new-in-v50) configuration item from `false` to `true` ++ New features + + Support log redaction to desensitize the output log information. The configuration item `security.redact-info-log`. Its default value is `false`, which means that desensitization is disabled. 
To enable desensitization for tikv-server logs, set the variable value to `true` + + Support transaction async commit + + Support Raft joint consensus ++ Improvements + + Enable the system to automatically adjust the data compaction speed by default to balance the contention for I/O resources between background tasks and foreground reads and writes + + Enable the GC Compaction Filter feature by default to reduce GC’s consumption of CPU and I/O resources + +## [4.0.16] - 2021-12-17 + ++ Compatibility Changes + + Before v4.0.16, when TiDB converts an illegal UTF-8 string to a Real type, an error is reported directly. Starting from v4.0.16, TiDB processes the conversion according to the legal UTF-8 prefix in the string [#11466](https://github.com/tikv/tikv/pull/11466) ++ Improvements + + Reduce disk space consumption by adopting the zstd algorithm to compress SST files when restoring data using Backup & Restore or importing data using Local-backend of TiDB Lightning [#11469](https://github.com/tikv/tikv/pull/11469) ++ Bug Fixes + + Fix a panic issue that occurs when Region merge, ConfChange, and Snapshot happen at the same time in extreme conditions [#11475](https://github.com/tikv/tikv/pull/11475) + + Fix the issue of negative sign when the decimal divide result is zero [#29586](https://github.com/tikv/tikv/pull/29586) + + Fix the issue that the average latency of the by-instance gRPC requests is inaccurate in TiKV metrics [#11299](https://github.com/tikv/tikv/pull/11299) + + Fix the issue of TiCDC panic that occurs when the downstream database is missing [#11123](https://github.com/tikv/tikv/pull/11123) + + Fix the issue that the Raft connection is broken when the channel is full [#11047](https://github.com/tikv/tikv/pull/11047) + + Fix the issue that TiDB cannot correctly identify whether the Int64 types in Max/Min functions are a signed integer or not, which causes the wrong calculation result of Max/Min [#10158](https://github.com/tikv/tikv/pull/10158) + + Fix 
the issue that CDC adds scan retries frequently due to the Congest error [#11082](https://github.com/tikv/tikv/pull/11082) + +## [4.0.15] - 2021-09-23 + ++ Feature Enhancement + + Support changing TiCDC configurations dynamically [#10645](https://github.com/tikv/tikv/pull/10645) ++ Improvements + + Handle read ready and write ready separately to reduce read latency [#10475](https://github.com/tikv/tikv/pull/10475) + + The slow log of TiKV coprocessor only considers the time spent on processing requests. [#10841](https://github.com/tikv/tikv/pull/10841) + + Drop log instead of blocking threads when the slogger thread is overloaded and the queue is filled up [#10841](https://github.com/tikv/tikv/pull/10841) + + Reduce the size of Resolved TS messages to save network bandwidth [#2448](https://github.com/tikv/tikv/pull/2448) ++ Bug Fixes + + Fix the issue that BR reports the "file already exists" error when TDE is enabled during data restore [#10917](https://github.com/tikv/tikv/pull/10917) + + Fix the issue that TiKV deletes stale Regions too frequently [#10781](https://github.com/tikv/tikv/pull/10781) + + Fix the issue that TiKV frequently reconnects the PD client [#9818](https://github.com/tikv/tikv/pull/9818) + + Check stale file information from the encryption file dictionary [#10598](https://github.com/tikv/tikv/pull/10598) + +## [4.0.14] - 2021-07-27 + ++ Compatibility Changes + + Change the default value of merge-check-tick-interval from 10 to 2 to speed up the Region merge process [#9676](https://github.com/tikv/tikv/pull/9676) ++ Feature Enhancements + + Add a metric pending to monitor the number of pending PD heartbeats, which helps locate the issue of slow PD threads [#10008](https://github.com/tikv/tikv/pull/10008) + + Support using the virtual-host addressing mode to make BR support the S3-compatible storage [#10242](https://github.com/tikv/tikv/pull/10242) ++ Improvements + + Shutdown the status server first to make sure that the client can correctly check
the shutdown status [#10504](https://github.com/tikv/tikv/pull/10504) + + Always respond to stale peers to make sure that these peers are cleared quicker [#10400](https://github.com/tikv/tikv/pull/10400) + + Limit the TiCDC sink's memory consumption [#10147](https://github.com/tikv/tikv/pull/10147) + + When a Region is too large, use the even split to speed up the split process [#10275](https://github.com/tikv/tikv/pull/10275) ++ Bug Fixes + + Fix the issue that the duration calculation might panic on certain platforms [#related-issue](https://github.com/rust-lang/rust/issues/86470#issuecomment-877557654) + + Fix the wrong function that casts DOUBLE to DOUBLE [#25200](https://github.com/tikv/tikv/pull/25200) + + Fix the issue that the panic log might be lost when using the async logger [#8998](https://github.com/tikv/tikv/pull/8998) + + Fix the panic issue that occurs when building a snapshot twice if encryption is enabled [#9786](https://github.com/tikv/tikv/pull/9786) [#10407](https://github.com/tikv/tikv/pull/10407) + + Fix the wrong arguments type of the json_unquote() function in the coprocessor [#10176](https://github.com/tikv/tikv/pull/10176) + + Fix the issues of suspicious warnings during shutdown and the non-deterministic response from Raftstore [#10353](https://github.com/tikv/tikv/pull/10353) [#10307](https://github.com/tikv/tikv/pull/10307) + + Fix the issue of backup threads leak [#10287](https://github.com/tikv/tikv/pull/10287) + + Fix the issue that Region split might panic and corrupt the metadata if the split process is too slow and Region merge is on-going [#8456](https://github.com/tikv/tikv/pull/8456) [#8783](https://github.com/tikv/tikv/pull/8783) + + Fix the issue that the Region heartbeats prevent TiKV from splitting large Regions in some situations [#10111](https://github.com/tikv/tikv/pull/10111) + + Fix the wrong statistics caused by the format inconsistency of CM Sketch between TiKV and TiDB 
[#25638](https://github.com/tikv/tikv/pull/25638) + + Fix the wrong statistics of the apply wait duration metric [#9893](https://github.com/tikv/tikv/pull/9893) + + Fix the "Missing Blob" error after using delete_files_in_range in Titan [#10232](https://github.com/tikv/tikv/pull/10232) + +## [4.0.13] - 2021-05-27 + ++ Improvements + + Make the calculation process of store used size more precise [#9904](https://github.com/tikv/tikv/pull/9904) + + Set more Regions in the EpochNotMatch message to reduce Region misses [#9731](https://github.com/tikv/tikv/pull/9731) + + Speed up freeing the memory accumulated in the long-running cluster [#10035](https://github.com/tikv/tikv/pull/10035) ++ Bug Fixes + + Fix a bug that TiKV cannot start if the file_dict file is not fully written into the disk that has been full [#9963](https://github.com/tikv/tikv/pull/9963) + + Limit TiCDC's scan speed at 128MB/s by default [#9983](https://github.com/tikv/tikv/pull/9983) + + Reduce the memory usage of TiCDC's initial scan [#10133](https://github.com/tikv/tikv/pull/10133) + + Support the back pressure for TiCDC's scan speed [#10142](https://github.com/tikv/tikv/pull/10142) + + Fix a potential OOM issue by avoiding unnecessary reads to get TiCDC old values [#10031](https://github.com/tikv/tikv/pull/10031) + + Fix a TiCDC OOM issue caused by reading old values [#10197](https://github.com/tikv/tikv/pull/10197) + + Add a timeout mechanism for S3 storages to avoid the client hanging without responses [#10132](https://github.com/tikv/tikv/pull/10132) + +## [4.0.12] - 2021-04-02 + ++ Improvements + + Prevent a large number of reconnections in a short period of time [#9879](https://github.com/tikv/tikv/pull/9879) + + Optimize the write operations in the scenarios of many tombstones [#9729](https://github.com/tikv/tikv/pull/9729) + + Change the default value of `leader-transfer-max-log-lag` to `128` to increase the success rate of leader transfer [#9605](https://github.com/tikv/tikv/pull/9605) ++ 
Bug Fixes + + Fix the issue that the `IN` expression does not properly handle unsigned/signed integers [#9850](https://github.com/tikv/tikv/pull/9850) + + Fix the issue that the ingest operation is not re-entrant [#9779](https://github.com/tikv/tikv/pull/9779) + + Fix the issue that the space is missed when converting JSON to string in TiKV coprocessor [#9666](https://github.com/tikv/tikv/pull/9666) + +## [4.0.11] - 2021-02-26 + ++ New Features + + Support the `utf8mb4_unicode_ci` collation [#9577](https://github.com/tikv/tikv/pull/9577) + + Support the `cast_year_as_time` collation [#9299](https://github.com/tikv/tikv/pull/9299) ++ Improvements + + Add metrics of server information for DBaaS [#9591](https://github.com/tikv/tikv/pull/9591) + + Support multiple clusters in Grafana dashboards [#9572](https://github.com/tikv/tikv/pull/9572) + + Report RocksDB metrics to TiDB [#9316](https://github.com/tikv/tikv/pull/9316) + + Record the suspension time for Coprocessor tasks [#9277](https://github.com/tikv/tikv/pull/9277) + + Add thresholds of key counts and key size for Load Base Split [#9354](https://github.com/tikv/tikv/pull/9354) + + Check whether the file exists before data import [#9544](https://github.com/tikv/tikv/pull/9544) + + Improve Fast Tune panels [#9180](https://github.com/tikv/tikv/pull/9180) ++ Bug Fixes + + Fix the issue that TiKV is failed to build with `PROST=1` [#9604](https://github.com/tikv/tikv/pull/9604) + + Fix the unmatched memory diagnostics [#9589](https://github.com/tikv/tikv/pull/9589) + + Fix the issue that the end key of a partial RawKV-restore range is inclusive [#9583](https://github.com/tikv/tikv/pull/9583) + + Fix the issue that TiKV might panic when loading the old value of a key of a rolled-back transaction during TiCDC's incremental scan [#9569](https://github.com/tikv/tikv/pull/9569) + + Fix the configuration glitch of old values when changefeeds with different settings connect to one Region 
[#9565](https://github.com/tikv/tikv/pull/9565) + + Fix a crash issue that occurs when running a TiKV cluster on a machine with a network interface that lacks the MAC address (introduced in v4.0.9) [#9516](https://github.com/tikv/tikv/pull/9516) + + Fix the issue of TiKV OOM when backing up a huge Region [#9448](https://github.com/tikv/tikv/pull/9448) + + Fix the issue that `region-split-check-diff` cannot be customized [#9530](https://github.com/tikv/tikv/pull/9530) + + Fix the issue of TiKV panic when the system time goes back [#9542](https://github.com/tikv/tikv/pull/9542) + +## [4.0.10] - 2021-01-15 + ++ Bug Fixes + + Fix the wrong mapping between ready and peer [#9409](https://github.com/tikv/tikv/pull/9409) + + Fix the issue that some logs are not redacted when `security.redact-info-log` is set to `true` [#9314](https://github.com/tikv/tikv/pull/9314) + +## [4.0.9] - 2020-12-18 + ++ Improvements + + Add the tag to trace the source of the `split` command [#8936](https://github.com/tikv/tikv/pull/8936) + + Support dynamically changing the `pessimistic-txn.pipelined` configuration [#9100](https://github.com/tikv/tikv/pull/9100) + + Reduce the impact on performance when running Backup & Restore and TiDB Lightning [#9098](https://github.com/tikv/tikv/pull/9098) + + Add monitoring metrics for the ingesting SST errors [#9096](https://github.com/tikv/tikv/pull/9096) + + Prevent hibernation when some peers are still catching up with logs [#9093](https://github.com/tikv/tikv/pull/9093) + + Increase the success rate of the pipelined pessimistic locking [#9086](https://github.com/tikv/tikv/pull/9086) + + Change the default value of `apply-max-batch-size` and `store-max-batch-size` to `1024` [#9020](https://github.com/tikv/tikv/pull/9020) + + Add the `max-background-flushes` configuration item [#8947](https://github.com/tikv/tikv/pull/8947) + + Enable the unified read pool for the storage module by default [#8887](https://github.com/tikv/tikv/pull/8887) + + Disable 
`force-consistency-checks` by default to improve performance [#9029](https://github.com/tikv/tikv/pull/9029) ++ Bug Fixes + + Fix the issue that Coprocessor might return wrong results when there are more than 255 columns [#9131](https://github.com/tikv/tikv/pull/9131) + + Fix the issue that Region Merge might cause data loss during network partition [#9108](https://github.com/tikv/tikv/pull/9108) + + Fix the issue that the `ANALYZE` statement might cause panic when using the `latin1` character set [#9082](https://github.com/tikv/tikv/pull/9082) + + Fix the wrong results returned when converting the numeric type to the time type [#9031](https://github.com/tikv/tikv/pull/9031) + + Fix a bug that TiDB Lightning fails to ingest SST files to TiKV with the Importer-backend or Local-backend when Transparent Data Encryption (TDE) is enabled [#8995](https://github.com/tikv/tikv/pull/8995) + + Fix the invalid `advertise-status-addr` value (`0.0.0.0`) [#9036](https://github.com/tikv/tikv/pull/9036) + + Fix the issue that an error is returned indicating that a key exists when this key is locked and deleted in a committed transaction [#8930](https://github.com/tikv/tikv/pull/8930) + + Fix the issue that the RocksDB cache mapping error causes data corruption [#9029](https://github.com/tikv/tikv/pull/9029) + + Fix a bug that Follower Read might return stale data after the leader is transferred [#9240](https://github.com/tikv/tikv/pull/9240) + +## [4.0.8] - 2020-10-30 + ++ Improvements + + Add the **Fast-Tune** panel page to assist performance diagnostics [#8804](https://github.com/tikv/tikv/pull/8804) + + Add the `security.redact-info-log` configuration item, which redacts user data from logs [#8746](https://github.com/tikv/tikv/pull/8746) + + Reformat the metafile of error codes [#8877](https://github.com/tikv/tikv/pull/8877) + + Enable dynamically changing the `pessimistic-txn.pipelined` configuration [#8853](https://github.com/tikv/tikv/pull/8853) + + Enable the memory 
profiling features by default [#8801](https://github.com/tikv/tikv/pull/8801) ++ Bug Fixes + + Fix the bug that the mutex conflict in encryption causes pd-worker to process heartbeats slowly [#8869](https://github.com/tikv/tikv/pull/8869) + + Fix the issue that the memory profile is mistakenly generated [#8790](https://github.com/tikv/tikv/pull/8790) + + Fix the failure to back up databases on GCS when the storage class is specified [#8763](https://github.com/tikv/tikv/pull/8763) + + Fix the bug that a learner cannot find a leader when the Region is restarted or newly split [#8864](https://github.com/tikv/tikv/pull/8864) + +## [4.0.7] - 2020-09-29 + ++ Improvements + + Support the JSON log format [#8382](https://github.com/tikv/tikv/pull/8382) ++ Bug Fixes + + Fix the issue of unavailable Status API when TLS handshake fails [#8649](https://github.com/tikv/tikv/pull/8649) + + Fix the potential undefined behaviors [#7782](https://github.com/tikv/tikv/pull/7782) + + Fix the possible panic caused by generating snapshots when executing `UnsafeDestroyRange` [#8681](https://github.com/tikv/tikv/pull/8681) + +## [4.0.6] - 2020-09-15 + ++ Improvements + + Reduce QPS drop when `DropTable` or `TruncateTable` is being executed [#8627](https://github.com/tikv/tikv/pull/8627) + + Support generating metafile of error codes [#8619](https://github.com/tikv/tikv/pull/8619) + + Add performance statistics for cf scan detail [#8618](https://github.com/tikv/tikv/pull/8618) + + Add the `rocksdb perf context` panel in the Grafana default template [#8467](https://github.com/tikv/tikv/pull/8467) ++ Bug Fixes + + Fix the estimation error for a non-index column when collation is enabled [#8620](https://github.com/tikv/tikv/pull/8620) + + Fix the issue that Green GC might miss locks during the process of Region transfer [#8460](https://github.com/tikv/tikv/pull/8460) + + Fix a panic issue that occurs when TiKV runs very slowly during Raft membership change 
[#8497](https://github.com/tikv/tikv/pull/8497) + + Fix the deadlock issue that occurs between the PD client thread and other threads when calling PD sync requests [#8612](https://github.com/tikv/tikv/pull/8612) + + Upgrade jemalloc to v5.2.1 to address the issue of memory allocation in huge page [#8463](https://github.com/tikv/tikv/pull/8463) + + Fix the issue that the unified thread pool hangs for long-running queries [#8427](https://github.com/tikv/tikv/pull/8427) + +## [4.0.5] - 2020-08-28 + ++ New Features + + Define error code for errors [#8387](https://github.com/tikv/tikv/pull/8387) ++ Bug Fixes + + Speed up leader election when Hibernate Region is enabled [#8292](https://github.com/tikv/tikv/pull/8292) + + Fix the memory leak issue during scheduling [#8357](https://github.com/tikv/tikv/pull/8357) + + Add the `hibernate-timeout` configuration item to prevent the leader from becoming hibernate too fast [#8208](https://github.com/tikv/tikv/pull/8208) + +## [4.0.3] - 2020-07-24 + ++ Improvements + + Introduce the new `backup.num-threads` configuration to control the size of the backup thread pool [#8199](https://github.com/tikv/tikv/pull/8199) + + Do not send store heartbeats when receiving snapshots [#8136](https://github.com/tikv/tikv/pull/8136) + + Support dynamically changing the shared block cache's capacity [#8232](https://github.com/tikv/tikv/pull/8232) ++ Bug Fixes + + Fix the issue that reads might get stale data during merging [#8113](https://github.com/tikv/tikv/pull/8113) + + Fix the issue that collation does not work on the `min`/`max` function when aggregation is pushed down to TiKV [#8108](https://github.com/tikv/tikv/pull/8108) + +## [4.0.2] - 2020-07-01 + ++ Bug Fixes + + Fix a memory safety issue for the status server [#8101](https://github.com/tikv/tikv/pull/8101) + + Fix the issue of lost precision in JSON numeric comparison [#8087](https://github.com/tikv/tikv/pull/8087) + + Fix the wrong query slow log 
[#8050](https://github.com/tikv/tikv/pull/8050) + + Fix the issue that a peer cannot be removed when its store is isolated during multiple merge processes [#8048](https://github.com/tikv/tikv/pull/8048) + + Fix the issue that `tikv-ctl recover-mvcc` does not remove invalid pessimistic locks [#8047](https://github.com/tikv/tikv/pull/8047) + + Fix the issue that some Titan histogram metrics are missing [#7997](https://github.com/tikv/tikv/pull/7997) + + Fix the issue that TiKV returns `duplicated error` to TiCDC [#7887](https://github.com/tikv/tikv/pull/7887) ++ New Features + + Support the `encryption-meta` command in TiKV Control [#8103](https://github.com/tikv/tikv/pull/8103) + + Add a perf context metric for `RocksDB::WriteImpl` [#7991](https://github.com/tikv/tikv/pull/7991) + +## [4.0.1] - 2020-06-12 + ++ Bug Fixes + + Fix the issue that the `use-unified-pool` configuration in the startup log is incorrectly printed [#7946](https://github.com/tikv/tikv/pull/7946) + + Fix the issue that the tikv-ctl does not support relative path [#7963](https://github.com/tikv/tikv/pull/7963) + + Fix the bug that the monitoring metric of Point Selects is inaccurate [#8033](https://github.com/tikv/tikv/pull/8033) + + Fix the issue that a peer might not be destroyed after the network isolation disappears [#8006](https://github.com/tikv/tikv/pull/8006) + + Fix the issue that a `read index` request may get out-of-date commit index [#8043](https://github.com/tikv/tikv/pull/8043) + + Improve the reliability of backup and restore with S3 and GCS storages [#7917](https://github.com/tikv/tikv/pull/7917) ++ New Features + + Add the `--advertise-status-addr` start flag to specify the status address to advertise [#8046](https://github.com/tikv/tikv/pull/8046) + +## [4.0.0] - 2020-05-27 + ++ Bug Fixes + + Fix the issue of backup fails with DefaultNotFound error. [#7937](https://github.com/tikv/tikv/pull/7937) + + Tolerate out-of-order read states from raft ready. 
[#7930](https://github.com/tikv/tikv/pull/7930) + + Handles the case when the callback is dropped by mistake when getting snapshot. [#7921](https://github.com/tikv/tikv/pull/7921) + + Fix the issue that `ascii_bin` and `latin1_bin` is not supported by TiKV for new collation framework. [#7919](https://github.com/tikv/tikv/pull/7919) + + Don't clean snapshot files when shutting down. [#7927](https://github.com/tikv/tikv/pull/7927) + + Encryption: fix master key not being able to rotate. [#7898](https://github.com/tikv/tikv/pull/7898) + + Encryption: fix snapshot apply failure caused lock CF snapshot file not being encrypted on received. [#7922](https://github.com/tikv/tikv/pull/7922) + +## [4.0.0-rc.2] - 2020-05-15 + ++ Compatibility Changes + + Move the encryption-related configuration to the security-related configuration, which means changing `[encryption]` in the TiKV configuration file to `[security.encryption]` [#7810](https://github.com/tikv/tikv/pull/7810) ++ New Features + + Support encryption debugging for tikv-ctl, so that tikv-ctl can be used to operate and manage the cluster when the encryption storage is enabled [#7698](https://github.com/tikv/tikv/pull/7698) + + Support encrypting the lock column family in snapshots [#7712](https://github.com/tikv/tikv/pull/7712) + + Use the heatmap in the Grafana dashboard for Raftstore latency summary to better diagnose the jitter issue [#7717](https://github.com/tikv/tikv/pull/7717) + + Support setting the upper limit for the size of the gRPC message [#7824](https://github.com/tikv/tikv/pull/7824) + + Add in Grafana dashboard the encryption-related monitoring metrics [#7827](https://github.com/tikv/tikv/pull/7827) + + Support Application-Layer Protocol Negotiation (ALPN) [#7825](https://github.com/tikv/tikv/pull/7825) + + Add more statistics about Titan [#7818](https://github.com/tikv/tikv/pull/7818) + + Support using the task ID provided by the client as the identifier in the unified read pool to avoid that the 
priority of a task is lowered by another task in the same transaction [#7814](https://github.com/tikv/tikv/pull/7814) + + Improve the performance of the `batch insert` request [#7718](https://github.com/tikv/tikv/pull/7718) ++ Bug Fixes + + Fix the issue that many empty Regions are generated after restoration [#7632](https://github.com/tikv/tikv/pull/7632) + + Fix the panic issue of Raftstore when receiving out-of-order read index responses [#7370](https://github.com/tikv/tikv/pull/7370) + + Fix the issue that an invalid storage or coprocessor read pool configuration might not be rejected when the unified thread pool is enabled [#7513](https://github.com/tikv/tikv/pull/7513) + + Fix the panic issue of the `join` operation when the TiKV server is shut down [#7713](https://github.com/tikv/tikv/pull/7713) + + Fix the issue that no result is returned when searching TiKV slow logs via diagnostics API [#7776](https://github.com/tikv/tikv/pull/7776) + + Fix the issue that notable memory fragmentation is generated when the TiKV node is running for a long time [#7556](https://github.com/tikv/tikv/pull/7556) + + Fix the issue that the SQL statement fails to execute when an invalid date is stored [#7268](https://github.com/tikv/tikv/pull/7268) + + Fix the issue that the backup data cannot be restored from GCS [#7739](https://github.com/tikv/tikv/pull/7739) + + Fix the issue that KMS key ID is not validated during encryption at rest [#7719](https://github.com/tikv/tikv/pull/7719) + + Fix the underlying correctness issue of the Coprocessor in compilers of different architecture [#7714](https://github.com/tikv/tikv/pull/7714) [#7730](https://github.com/tikv/tikv/pull/7730) + + Fix the `snapshot ingestion` error when encryption is enabled [#7815](https://github.com/tikv/tikv/pull/7815) + + Fix the `Invalid cross-device link` error when rewriting the configuration file [#7817](https://github.com/tikv/tikv/pull/7817) + + Fix the issue of wrong toml format when writing the 
configuration file to an empty file [#7817](https://github.com/tikv/tikv/pull/7817) + + Fix the issue that a destroyed peer in Raftstore can still process requests [#7836](https://github.com/tikv/tikv/pull/7836) + +## [4.0.0-rc.1] - 2020-04-28 + ++ Compatibility Changes + + Disable the Hibernate Region feature by default [#7618](https://github.com/tikv/tikv/pull/7618) ++ Important Bug Fixes + + Fix the deadlock issue caused by the probe request from TiDB [#7540](https://github.com/tikv/tikv/pull/7540) + + Fix the issue that the minimum commit timestamp of a transaction might overflow which affects data correctness [#7638](https://github.com/tikv/tikv/pull/7638) ++ New Features + + Support using the user-owned KMS key for the server-side encryption when backing up data to S3 [#7630](https://github.com/tikv/tikv/pull/7630) + + Enable the load-based `split region` operation [#7623](https://github.com/tikv/tikv/pull/7623) + + Support validating common names [#7468](https://github.com/tikv/tikv/pull/7468) + + Add the file lock check to avoid starting multiple TiKV instances that are bound to the same address [#7447](https://github.com/tikv/tikv/pull/7447) + + Support AWS KMS in encryption at rest [#7465](https://github.com/tikv/tikv/pull/7465) ++ Bug Fixes + + Address the OpenSSL security issue: CVE-2020-1967 [#7622](https://github.com/tikv/tikv/pull/7622) + + Avoid protecting rollback records written by `BatchRollback` to improve performance when many write conflicts exist in optimistic transactions [#7604](https://github.com/tikv/tikv/pull/7604) + + Fix the issue that the needless wake-up of transactions results in useless retry and performance reduction in heavy lock-race workloads [#7551](https://github.com/tikv/tikv/pull/7551) + + Fix the issue that the Region might be stuck in the multi-time merging [#7518](https://github.com/tikv/tikv/pull/7518) + + Fix the issue that the learner is not deleted when deleting the learner 
[#7518](https://github.com/tikv/tikv/pull/7518) + + Fix the issue that follower read might cause panic in raft-rs [#7408](https://github.com/tikv/tikv/pull/7408) + + Fix the bug that a SQL operation might fail because of the `group by constant` error [#7383](https://github.com/tikv/tikv/pull/7383) + + Fix the issue that an optimistic lock might block reads if the corresponding primary lock is a pessimistic lock [#7328](https://github.com/tikv/tikv/pull/7328) + +## [4.0.0-rc] - 2020-04-08 + ++ Compatibility Changes + + Support the `pipelined` feature in pessimistic transactions, which improves the TPC-C performance by 20%. The risk is that the transaction commit might fail because of lock failure during the execution [#6984](https://github.com/tikv/tikv/pull/6984) + + Enable the `unify-read-pool` configuration item in new clusters by default and use the previous setting of this item in old clusters [#7059](https://github.com/tikv/tikv/pull/7059) ++ New Features + + Support the `pipelined` feature in pessimistic transactions, which improves the TPC-C performance by 20%. 
The risk is that the transaction commit might fail because of lock failure during the execution [#6984](https://github.com/tikv/tikv/pull/6984) + + Support TLS in the HTTP port [#5393](https://github.com/tikv/tikv/pull/5393) + + Enable the `unify-read-pool` configuration item in new clusters by default and use the previous setting of this item in old clusters [#7059](https://github.com/tikv/tikv/pull/7059) ++ Bug Fixes + + Fix the possible panic caused by transferring the leader when the Follower Read feature is enabled [#7101](https://github.com/tikv/tikv/pull/7101) + +## [4.0.0-beta.2] - 2020-03-18 + ++ New Features + + Support the configuration of persistent dynamic update [#6684](https://github.com/tikv/tikv/pull/6684) ++ Bug Fixes + + Fix the panic issue caused by empty short values during backup [#6718](https://github.com/tikv/tikv/pull/6718) + + Fix the issue that Hibernate Regions might not be woken up correctly [#6772](https://github.com/tikv/tikv/pull/6672) [#6648](https://github.com/tikv/tikv/pull/6648) [#6736](https://github.com/tikv/tikv/pull/6736) + +## [4.0.0-beta.1] - 2020-02-28 + ++ Compatibility Changes + + Add the `readpool.unify-read-pool` configuration item (`True` by default) to control whether point queries use the same threads with Coprocessor [#6375](https://github.com/tikv/tikv/pull/6375) [#6401](https://github.com/tikv/tikv/pull/6401) [#6534](https://github.com/tikv/tikv/pull/6534) [#6582](https://github.com/tikv/tikv/pull/6582) [#6585](https://github.com/tikv/tikv/pull/6585) [#6593](https://github.com/tikv/tikv/pull/6593) [#6597](https://github.com/tikv/tikv/pull/6597) [#6677](https://github.com/tikv/tikv/pull/6677) ++ New Features + + Support fetching configuration items from the status port via HTTP API [#6480](https://github.com/tikv/tikv/pull/6480) + + Optimize the performance of `Chunk Encoder` in Coprocessor [#6341](https://github.com/tikv/tikv/pull/6341) ++ Bug Fixes + + Fix the inconsistent behaviors of the `CAST` function in 
TiDB and TiKV [#6463](https://github.com/tikv/tikv/pull/6463) [#6461](https://github.com/tikv/tikv/pull/6461) [#6459](https://github.com/tikv/tikv/pull/6459) [#6474](https://github.com/tikv/tikv/pull/6474) [#6492](https://github.com/tikv/tikv/pull/6492) [#6569](https://github.com/tikv/tikv/pull/6569) + +## [4.0.0-beta] - 2020-01-17 ++ Upgrade the RocksDB version to 6.4.6 ++ Fix the issue that the system cannot perform the compaction task normally when the disk space is used up by automatically creating a 2GB empty file when TiKV is started [#6321](https://github.com/tikv/tikv/pull/6321) ++ Support quick backup and restoration + + [#6462](https://github.com/tikv/tikv/pull/6462) [#6395](https://github.com/tikv/tikv/pull/6395) [#6378](https://github.com/tikv/tikv/pull/6378) [#6374](https://github.com/tikv/tikv/pull/6374) [#6349](https://github.com/tikv/tikv/pull/6349) + + [#6339](https://github.com/tikv/tikv/pull/6339) [#6308](https://github.com/tikv/tikv/pull/6308) [#6295](https://github.com/tikv/tikv/pull/6295) [#6286](https://github.com/tikv/tikv/pull/6286) [#6283](https://github.com/tikv/tikv/pull/6283) + + [#6261](https://github.com/tikv/tikv/pull/6261) [#6222](https://github.com/tikv/tikv/pull/6222) [#6209](https://github.com/tikv/tikv/pull/6209) [#6204](https://github.com/tikv/tikv/pull/6204) [#6202](https://github.com/tikv/tikv/pull/6202) + + [#6198](https://github.com/tikv/tikv/pull/6198) [#6186](https://github.com/tikv/tikv/pull/6186) [#6177](https://github.com/tikv/tikv/pull/6177) [#6146](https://github.com/tikv/tikv/pull/6146) [#6071](https://github.com/tikv/tikv/pull/6071) + + [#6042](https://github.com/tikv/tikv/pull/6042) [#5877](https://github.com/tikv/tikv/pull/5877) [#5806](https://github.com/tikv/tikv/pull/5806) [#5803](https://github.com/tikv/tikv/pull/5803) [#5800](https://github.com/tikv/tikv/pull/5800) + + [#5781](https://github.com/tikv/tikv/pull/5781) [#5772](https://github.com/tikv/tikv/pull/5772) 
[#5689](https://github.com/tikv/tikv/pull/5689) [#5683](https://github.com/tikv/tikv/pull/5683) ++ Support reading data from Follower replicas + + [#5051](https://github.com/tikv/tikv/pull/5051) [#5118](https://github.com/tikv/tikv/pull/5118) [#5213](https://github.com/tikv/tikv/pull/5213) [#5316](https://github.com/tikv/tikv/pull/5316) [#5401](https://github.com/tikv/tikv/pull/5401) + + [#5919](https://github.com/tikv/tikv/pull/5919) [#5887](https://github.com/tikv/tikv/pull/5887) [#6340](https://github.com/tikv/tikv/pull/6340) [#6348](https://github.com/tikv/tikv/pull/6348) [#6396](https://github.com/tikv/tikv/pull/6396) ++ Improve the performance of TiDB reading data through index [#5682](https://github.com/tikv/tikv/pull/5682) ++ Fix the issue that the `CAST` function behaves inconsistently in TiKV and in TiDB + + [#6459](https://github.com/tikv/tikv/pull/6459) [#6461](https://github.com/tikv/tikv/pull/6461) [#6458](https://github.com/tikv/tikv/pull/6458) [#6447](https://github.com/tikv/tikv/pull/6447) [#6440](https://github.com/tikv/tikv/pull/6440) + + [#6425](https://github.com/tikv/tikv/pull/6425) [#6424](https://github.com/tikv/tikv/pull/6424) [#6390](https://github.com/tikv/tikv/pull/6390) [#5842](https://github.com/tikv/tikv/pull/5842) [#5528](https://github.com/tikv/tikv/pull/5528) + + [#5334](https://github.com/tikv/tikv/pull/5334) [#5199](https://github.com/tikv/tikv/pull/5199) [#5167](https://github.com/tikv/tikv/pull/5167) [#5146](https://github.com/tikv/tikv/pull/5146) [#5141](https://github.com/tikv/tikv/pull/5141) + + [#4998](https://github.com/tikv/tikv/pull/4998) [#5029](https://github.com/tikv/tikv/pull/5029) [#5099](https://github.com/tikv/tikv/pull/5099) [#5006](https://github.com/tikv/tikv/pull/5006) [#5095](https://github.com/tikv/tikv/pull/5095) + + [#5093](https://github.com/tikv/tikv/pull/5093) [#5090](https://github.com/tikv/tikv/pull/5090) [#4987](https://github.com/tikv/tikv/pull/4987) [#5066](https://github.com/tikv/tikv/pull/5066) 
[#5038](https://github.com/tikv/tikv/pull/5038) + + [#4962](https://github.com/tikv/tikv/pull/4962) [#4890](https://github.com/tikv/tikv/pull/4890) [#4727](https://github.com/tikv/tikv/pull/4727) [#6060](https://github.com/tikv/tikv/pull/6060) [#5761](https://github.com/tikv/tikv/pull/5761) + + [#5793](https://github.com/tikv/tikv/pull/5793) [#5468](https://github.com/tikv/tikv/pull/5468) [#5540](https://github.com/tikv/tikv/pull/5540) [#5548](https://github.com/tikv/tikv/pull/5548) [#5455](https://github.com/tikv/tikv/pull/5455) + + [#5543](https://github.com/tikv/tikv/pull/5543) [#5433](https://github.com/tikv/tikv/pull/5433) [#5431](https://github.com/tikv/tikv/pull/5431) [#5423](https://github.com/tikv/tikv/pull/5423) [#5179](https://github.com/tikv/tikv/pull/5179) + + [#5134](https://github.com/tikv/tikv/pull/5134) [#4685](https://github.com/tikv/tikv/pull/4685) [#4650](https://github.com/tikv/tikv/pull/4650) [#6463](https://github.com/tikv/tikv/pull/6463) + +## [3.1.0-beta.1] - 2020-01-10 ++ backup + + Change the name of the backup file from `start_key` to the hash value of `start_key` to reduce the file name's length for easy reading [#6198](https://github.com/tikv/tikv/pull/6198) + + Disable RocksDB's `force_consistency_checks` check to avoid false positives in the consistency check [#6249](https://github.com/tikv/tikv/pull/6249) + + Add the incremental backup feature [#6286](https://github.com/tikv/tikv/pull/6286) + ++ sst_importer + + Fix the issue that the SST file does not have MVCC properties during restoring [#6378](https://github.com/tikv/tikv/pull/6378) + + Add the monitoring items such as `tikv_import_download_duration`, `tikv_import_download_bytes`, `tikv_import_ingest_duration`, `tikv_import_ingest_bytes`, and `tikv_import_error_counter` to observe the overheads of downloading and ingesting SST files [#6404](https://github.com/tikv/tikv/pull/6404) ++ raftstore + + Fix the issue of Follower Read that the follower reads stale data when the leader changes, 
thus breaking transaction isolation [#6343](https://github.com/tikv/tikv/pull/6343) + +## [3.1.0-beta] - 2019-12-18 + ++ Support the distributed backup and restore feature [#5532](https://github.com/tikv/tikv/pull/5532) ++ Support the Follower Read feature [#5562](https://github.com/tikv/tikv/pull/5562) + +## [3.0.20] - 2020-12-25 +### Bug Fixes +- Fix the issue that an error is returned indicating that a key exists when this key is locked and deleted in a committed transaction [#8931](https://github.com/tikv/tikv/pull/8931) + +### Improvements +- Add the `end_point_slow_log_threshold` configuration item [#9145](https://github.com/tikv/tikv/pull/9145) + +## [3.0.19] +### Bug Fixes +- Fix the bug that TiKV panics when parsing responses with missing reason phrases [#8540](https://github.com/tikv/tikv/pull/8540) + +### Improvements +- Set `sync-log` to `true` as an nonadjustable value [#8636](https://github.com/tikv/tikv/pull/8636) + +## [3.0.18] +### Misc +- Change gc failure log to warning [#8444](https://github.com/tikv/tikv/pull/8444) + +## [3.0.17] +### Bug Fixes +- Fix a bug that might read stale data during region merging [#8111](https://github.com/tikv/tikv/pull/8111) +- Fix memory leak during scheduling [#8355](https://github.com/tikv/tikv/pull/8355) + +### Improvements +- Add the `hibernate-timeout` configuration that delays region hibernation to improve rolling update performance [#8207](https://github.com/tikv/tikv/pull/8207) + +## [3.0.16] +### Bug Fixes +- Fix the potential wrong result read from ingested files [#8039](https://github.com/tikv/tikv/pull/8039) +- Fix the issue that a peer can not be removed when its store is isolated during multiple merge processes [#8005](https://github.com/tikv/tikv/pull/8005) + +### Improvements +- Avoid sending store heartbeats to PD after snapshots are received [#8145](https://github.com/tikv/tikv/pull/8145) +- Improve the PD client log [#8091](https://github.com/tikv/tikv/pull/8091) + +## [3.0.15] +### Bug Fixes +- 
Fix a panic issue that Titan GC may delete an already deleted blob file [#7970](https://github.com/tikv/tikv/pull/7970) +- Fix the issue that snapshot files which were in use are cleaned after restarting [#7925](https://github.com/tikv/tikv/pull/7925) +- Change schedule tick failure log to debug level to make logs less verbose [#7904](https://github.com/tikv/tikv/pull/7904) +- Make grpc message size limit configurable [#7822](https://github.com/tikv/tikv/pull/7822) +- Fix the issue that the memory defragmentation will not be very effective after running for a long time [#7790](https://github.com/tikv/tikv/pull/7790) + +## [3.0.14] +### Features +- Improve the performance when many conflicts and the `BatchRollback` condition exist in optimistic transactions [#7605](https://github.com/tikv/tikv/pull/7605) +- Fix the issue of decreased performance that occurs because the pessimistic lock `waiter` is frequently awakened when many conflicts exist in pessimistic transactions [#7584](https://github.com/tikv/tikv/pull/7584) +### Bug Fixes +- Fix the issue that the node cannot be deleted correctly after the isolation recovery in some cases [#7703](https://github.com/tikv/tikv/pull/7703) +- Fix the issue of data loss during network isolation caused by the Region Merge operation [#7679](https://github.com/tikv/tikv/pull/7679) +- Fix the issue that learner cannot be removed correctly in some cases [#7598](https://github.com/tikv/tikv/pull/7598) +- Fix the issue that the scanning result of raw key-value pairs might be out of order [#7597](https://github.com/tikv/tikv/pull/7597) +- Fix the issue of reconnection when the batch of Raft messages is too large [#7542](https://github.com/tikv/tikv/pull/7542) +- Fix the issue of gRPC thread deadlock caused by the empty request [#7538](https://github.com/tikv/tikv/pull/7538) +- Fix the issue that the processing logic of restarting the learner is incorrect during the merge process [#7457](https://github.com/tikv/tikv/pull/7457) +- Fix the 
issue that repeated requests on the cleanup of lock might destroy the atomicity of the transaction [#7388](https://github.com/tikv/tikv/pull/7388) + +## [3.0.12] +### Bug Fixes +- Fix the issue of conflict detection failure or data index inconsistency caused by inserting an existing key into a transaction and then deleting it immediately when disabling the consistency check parameter [#7054](https://github.com/tikv/tikv/pull/7054) +- Introduce a flow control mechanism in Raftstore to solve the problem that without flow control, it might lead to too slow tracking and cause the cluster to be stuck, and the transaction size might cause frequent reconnection of TiKV connections [#7072](https://github.com/tikv/tikv/pull/7072), [#7076](https://github.com/tikv/tikv/pull/7076) + +## [3.0.11] +### Bug Fixes +- Optimize the log output by removing unnecessary logs [#6657](https://github.com/tikv/tikv/pull/6657) +- Fix the panic that might occur when the peer is removed under high loads [#6704](https://github.com/tikv/tikv/pull/6704) +- Fix the issue that Hibernate Regions are not woken up in some cases [#6732](https://github.com/tikv/tikv/pull/6732) [#6738](https://github.com/tikv/tikv/pull/6738) + +## [3.0.10] +- Raftstore + - Fix the system panic issue #6460 or data loss issue #5981 caused by Region merge failure [#6614](https://github.com/tikv/tikv/pull/6614) + - Support `yield` to optimize scheduling fairness, and support pre-transferring the leader to improve leader scheduling stability [#6563](https://github.com/tikv/tikv/pull/6563) + +## [3.0.9] - 2020-01-14 +- Raftstore + - Speed up the configuration change to speed up the Region scattering [#6421](https://github.com/tikv/tikv/pull/6421) +- Transaction + - Add the `tikv_lock_manager_waiter_lifetime_duration`, `tikv_lock_manager_detect_duration`, and `tikv_lock_manager_wait_table_status` monitoring metrics to monitor `waiter`s’ lifetime, the time cost of detecting deadlocks, and the status of `Wait` table 
[#6392](https://github.com/tikv/tikv/pull/6392) + - Optimize the following configuration items to reduce transaction execution latency caused by changing Region leader or the leader of deadlock detector in extreme situations [#6429](https://github.com/tikv/tikv/pull/6429) + - Change the default value of `wait-for-lock-time` from `3s` to `1s` + - Change the default value of `wake-up-delay-duration` from `100ms` to `20ms` + - Fix the issue that the leader of the deadlock detector might be incorrect during the Region Merge process [#6431](https://github.com/tikv/tikv/pull/6431) + +## [3.0.8] - 2019-12-31 +- Coprocessor + - Modify the level of the output log from `error` to `warn` when an error occurs in Coprocessor [#6051](https://github.com/tikv/tikv/pull/6051) + - Modify the update behavior of statistics sampling data from directly updating the row to deleting before inserting, to keep consistency with the update behavior of tidb-server [#6069](https://github.com/tikv/tikv/pull/6096) +- Raftstore + - Fix the panic caused by repeatedly sending the `destroy` message to `peerfsm` and `peerfsm` being destroyed multiple times [#6297](https://github.com/tikv/tikv/pull/6297) + - Update the default value of `split-region-on-table` from `true` to `false` to disable splitting Regions by table by default [#6253](https://github.com/tikv/tikv/pull/6253) +- Engine + - Fix the issue that empty data might be returned because RocksDB iterator errors are not correctly processed in extreme conditions [#6326](https://github.com/tikv/tikv/pull/6326) +- Transaction + - Fix the issue that TiKV fails to write data into keys and GC is blocked because the pessimistic locks are incorrectly cleaned up [#6354](https://github.com/tikv/tikv/pull/6354) + - Optimize the pessimistic lock waiting mechanism to improve the performance in scenarios where the lock conflict is severe [#6296](https://github.com/tikv/tikv/pull/6296) +- Update the default value of `tikv_alloc` from `tikv_alloc/default` to 
`jemalloc` [#6206](https://github.com/tikv/tikv/pull/6206) + +## [3.0.7] - 2019-12-04 ++ Update grpc to fix a potential memory leak issue [#6128](https://github.com/tikv/tikv/pull/6128) ++ Deadlock: only observe valid region in order to make sure the manager is in the valid region [#6110](https://github.com/tikv/tikv/pull/6110) + +## [3.0.6] - 2019-11-28 ++ Pessimistic Transaction: keep lock's ttl when receive a smaller ttl [#6056](https://github.com/tikv/tikv/pull/6056) ++ rust-rocksdb: fix titan options for cf when create cf [#6009](https://github.com/tikv/tikv/pull/6009) ++ Fix TiKV panic when aggregation expr type is not valid [#6002](https://github.com/tikv/tikv/pull/6002) ++ Pessimistic Transaction: reduce clean up requests in lock_manager [#5965](https://github.com/tikv/tikv/pull/5965) ++ Fix a region merge bug which may cause panic: set is_merging flag after restart in raftstore [#5892](https://github.com/tikv/tikv/pull/5892) ++ Generate flamegraph at runtime [#5961](https://github.com/tikv/tikv/pull/5961) ++ Support to change the config gc io limit dynamically [#5957](https://github.com/tikv/tikv/pull/5957) ++ Limit the speed of write for GC [#5735](https://github.com/tikv/tikv/pull/5735) ++ Engine: update rocksdb and titan [#5968](https://github.com/tikv/tikv/pull/5968) + + rocksdb: Fix OnFlushCompleted fired before flush result write to MANIFEST [pingcap/rocksdb#130](https://github.com/pingcap/rocksdb/pull/130) + + titan: Fix status overrided by mistake [pingcap/titan#111](https://github.com/pingcap/titan/pull/111) ++ Makefile: add a new rule for CI test [#5938](https://github.com/tikv/tikv/pull/5938) ++ Add metrics for commit log duration [#5881](https://github.com/tikv/tikv/pull/5881) ++ Pessimistic Transaction: Add support for lock wait timeout [#5848](https://github.com/tikv/tikv/pull/5848) ++ LockManager: make has_waiter accurate [#5845](https://github.com/tikv/tikv/pull/5845) ++ Fix wrong txn_size when acquire pessimistic lock 
[#5740](https://github.com/tikv/tikv/pull/5740) + +## [3.0.5] - 2019-10-25 ++ Fix the problem that split check is always scanning caused by updating approximate in pd-worker [#5716](https://github.com/tikv/tikv/pull/5716) ++ Update rust-rocksdb to avoid intra_L0 compaction issue [#5710](https://github.com/tikv/tikv/pull/5710) ++ Fix the bug which may break atomicity: protect primary locks of pessimistic transactions from being collapsed [#5671](https://github.com/tikv/tikv/pull/5671) ++ Enable rocksdb force_consistency_checks and handle background error [#5662](https://github.com/tikv/tikv/pull/5662) ++ Fix the bug in raftstore that panics when getting value encounters an error [#5643](https://github.com/tikv/tikv/pull/5643) ++ Fix the bug that does not return the right tso when checking lock [#5634](https://github.com/tikv/tikv/pull/5634) ++ Reduce the overhead of region's heartbeat [#5620](https://github.com/tikv/tikv/pull/5620) ++ Reduce message flush in raftstore [#5617](https://github.com/tikv/tikv/pull/5617) ++ Check Lock's TTL when doing clean up [#5589](https://github.com/tikv/tikv/pull/5589) + +## [3.0.4] - 2019-10-08 + ++ Fix the issue that the approximate keys are not correct when region is empty [#5414](https://github.com/tikv/tikv/pull/5414) ++ Make the config support rocksdb doubly skiplist to optimize `reverse-scan` [#5368](https://github.com/tikv/tikv/pull/5368) ++ Optimize point-get in coprocessor [#5463](https://github.com/tikv/tikv/pull/5463) ++ Support batch-split command and empty batch command [#5470](https://github.com/tikv/tikv/pull/5470) ++ Fix `PointGetter` performance issue when there are concurrent writes [#5495](https://github.com/tikv/tikv/pull/5495) ++ Fix the output on short version flag [#5501](https://github.com/tikv/tikv/pull/5501) ++ Support the pessimistic transaction API: txn-heart-beat [#5507](https://github.com/tikv/tikv/pull/5507) ++ `titan` GC and monitoring improvement [#5517](https://github.com/tikv/tikv/pull/5517) ++ Update 
`grpcio` to v0.4.5 [#5523](https://github.com/tikv/tikv/pull/5523) ++ Support GRPC memory quota [#5524](https://github.com/tikv/tikv/pull/5524) ++ Fix commit index is not forwarded when merge entry is empty [#5526](https://github.com/tikv/tikv/pull/5526) ++ Fix a resource leak bug in batch grpc [#5567](https://github.com/tikv/tikv/pull/5567) + +## [3.0.3] - 2019-08-29 ++ Fix the issue that ReadIndex might fail to respond to requests because of duplicate context [#5256](https://github.com/tikv/tikv/pull/5256) ++ Fix potential scheduling jitters caused by premature `PutStore` [#5277](https://github.com/tikv/tikv/pull/5277) ++ Fix incorrect timestamps reported from Region heartbeats [#5296](https://github.com/tikv/tikv/pull/5296) ++ Fix potential TiKV panics during region merge [#5291](https://github.com/tikv/tikv/pull/5291) ++ Speed up leader change check for the dead lock detector [#5317](https://github.com/tikv/tikv/pull/5317) ++ Support using `grpc env` to create deadlock clients [#5346](https://github.com/tikv/tikv/pull/5346) ++ Add `config-check` to check whether the configuration is correct [#5349](https://github.com/tikv/tikv/pull/5349) ++ Fix the issue that ReadIndex does not return anything when there is no leader [#5351](https://github.com/tikv/tikv/pull/5351) ++ Exclude shared block cache from core dump [#5322](https://github.com/tikv/tikv/pull/5322) + +## [3.0.2] - 2019-08-06 +* Fix the bug that TiKV panics if the Raft Log is not written in time [#5160](https://github.com/tikv/tikv/pull/5160) +* Fix the bug that the panic information is not written into the log file after TiKV panics [#5198](https://github.com/tikv/tikv/pull/5198) +* Fix the bug that the insert operation might be incorrectly performed in the pessimistic transaction [#5203](https://github.com/tikv/tikv/pull/5203) +* Lower the output level of some logs that require no manual intervention to INFO [#5193](https://github.com/tikv/tikv/pull/5193) +* Improve the accuracy of monitoring the 
storage engine size [#5200](https://github.com/tikv/tikv/pull/5200) +* Improve the accuracy of the Region size in TiKV Control [#5195](https://github.com/tikv/tikv/pull/5195) +* Improve the performance of the deadlock detector for pessimistic locks [#5192](https://github.com/tikv/tikv/pull/5192) +* Improve the performance of GC in the Titan storage engine [#5197](https://github.com/tikv/tikv/pull/5197) + +## [3.0.1] - 2019-06-16 +- Add the statistics of the size of blob files in statistics information [#5060](https://github.com/tikv/tikv/pull/5060) +- Fix the core dump issue caused by the incorrectly cleaned memory resources when the process exits [#5053](https://github.com/tikv/tikv/pull/5053) +- Add all monitoring metrics related to the Titan engine [#4772](https://github.com/tikv/tikv/pull/4772), [#4836](https://github.com/tikv/tikv/pull/4836) +- Add the number of open file handles for Titan when counting the number of open file handles to avoid the issue that no file handle is available because of inaccurate statistics of file handles [#5026](https://github.com/tikv/tikv/pull/5026) +- Set `blob_run_mode` to decide whether to enable the Titan engine on a specific CF [#4991](https://github.com/tikv/tikv/pull/4991) +- Fix the issue that the read operations cannot get the commit information of pessimistic transactions [#5067](https://github.com/tikv/tikv/pull/5067) +- Add the `blob-run-mode` configuration parameter to control the running mode of the Titan engine, and its value can be `normal`, `read-only` or `fallback` [#4865](https://github.com/tikv/tikv/pull/4865) +- Improve the performance of detecting deadlocks [#5089](https://github.com/tikv/tikv/pull/5089) + +## [3.0.0] - 2019-06-28 ++ Engine + - Introduce Titan, a key-value plugin that improves write performance for + scenarios with value sizes greater than 1KiB, and relieves write + amplification in certain degrees + - Optimize memory management to reduce memory allocation and copying for `Iterator Key Bound 
Option` + - Support `block cache` sharing among different column families ++ Server + - Support reversed `raw_scan` and `raw_batch_scan` + - Support batch receiving and sending Raft messages, improving TPS by 7% for write intensive scenarios + - Support getting monitoring information via HTTP + - Support Local Reader in RawKV to improve performance + - Reduce context switch overhead from `batch commands` ++ Raftstore + - Support Multi-thread Raftstore and Multi-thread Apply to improve scalabilities, + concurrency capacity, and resource usage within a single node. + Performance improves by 70% under the same level of pressure + - Support checking RocksDB Level 0 files before applying snapshots to avoid write stall + - Support Hibernate Regions to optimize CPU consumption from RaftStore (Experimental) + - Remove the local reader thread ++ Transaction + - Support distributed GC and concurrent lock resolving for improved GC performance + - Support the pessimistic transaction model (Experimental) + - Modify the semantics of `Insert` to allow Prewrite to succeed only when there is no Key + - Remove `txn scheduler ` + - Add monitoring items related to `read index` and `GC worker` ++ Coprocessor + - Refactor the computation framework to implement vector operators, computation + using vector expressions, and vector aggregations to improve performance + - Support providing operator execution status for the `EXPLAIN ANALYZE` statement + in TiDB + - Switch to the `work-stealing` thread pool model to reduce context switch cost ++ Misc + - Develop a unified log format specification with restructured log system to + facilitate collection and analysis by tools + - Add performance metrics related to configuration information and key bound crossing. + +## [3.0.0-rc.3] - 2019-06-21 ++ Engine + - Check iterator status when scanning. [4936](https://github.com/tikv/tikv/pull/4936) + - Fix the issue that ingested files and directory are not synchronized. 
[4937](https://github.com/tikv/tikv/pull/4937) ++ Server + - Sanitize block size configuration. [4928](https://github.com/tikv/tikv/pull/4928) + - Support replicating the `delete_range` request without deleting the data when applying. [4490](https://github.com/tikv/tikv/pull/4490) + - Add read index related metrics. [4830](https://github.com/tikv/tikv/pull/4830) + - Add GC worker related metrics. [4922](https://github.com/tikv/tikv/pull/4922) ++ Raftstore + - Fix the issue that local reader cache is not cleared correctly. [4778](https://github.com/tikv/tikv/pull/4778) + - Fix request latency jitter when transferring leader and conf changes. [4734](https://github.com/tikv/tikv/pull/4734) + - Remove invalid empty callbacks. [4682](https://github.com/tikv/tikv/pull/4682) + - Clear stale reads after role change. [4810](https://github.com/tikv/tikv/pull/4810) + - Synchronize all CF files for the received snapshots. [4807](https://github.com/tikv/tikv/pull/4807) + - Fix missing fsync calls for snapshots. [4850](https://github.com/tikv/tikv/pull/4850) ++ Coprocessor + - Improve coprocessor batch executor. [4877](https://github.com/tikv/tikv/pull/4877) ++ Transaction + - Support `ResolveLockLite` to allow only resolving specified lock keys. [4882](https://github.com/tikv/tikv/pull/4882) + - Improve pessimistic lock transaction. [4889](https://github.com/tikv/tikv/pull/4889) ++ Tikv-ctl + - Improve `bad-regions` and `tombstone` subcommands. [4862](https://github.com/tikv/tikv/pull/4862) ++ Misc + - Add dist_release. 
[4841](https://github.com/tikv/tikv/pull/4841) + +## [3.0.0-rc.2] - 2019-05-28 ++ Engine + - Support multiple column families sharing a block cache [#4563](https://github.com/tikv/tikv/pull/4563) ++ Server + - Remove `TxnScheduler` [#4098](https://github.com/tikv/tikv/pull/4098) + - Support pessimistic lock transactions [#4698](https://github.com/tikv/tikv/pull/4698) ++ Raftstore + - Support hibernate Regions to reduce the consumption of the raftstore CPU [#4591](https://github.com/tikv/tikv/pull/4591) + - Fix the issue that the leader does not reply to the `ReadIndex` requests for the learner [#4653](https://github.com/tikv/tikv/pull/4653) + - Fix the issue of transferring leader failure in some cases [#4684](https://github.com/tikv/tikv/pull/4684) + - Fix the possible dirty read issue in some cases [#4688](https://github.com/tikv/tikv/pull/4688) + - Fix the issue that a snapshot lacks data in some cases [#4716](https://github.com/tikv/tikv/pull/4716) ++ Coprocessor + - Add more RPN functions + - `LogicalOr` [#4691](https://github.com/tikv/tikv/pull/4601) + - `LTReal` [#4602](https://github.com/tikv/tikv/pull/4602) + - `LEReal` [#4602](https://github.com/tikv/tikv/pull/4602) + - `GTReal` [#4602](https://github.com/tikv/tikv/pull/4602) + - `GEReal` [#4602](https://github.com/tikv/tikv/pull/4602) + - `NEReal` [#4602](https://github.com/tikv/tikv/pull/4602) + - `EQReal` [#4602](https://github.com/tikv/tikv/pull/4602) + - `IsNull` [#4720](https://github.com/tikv/tikv/pull/4720) + - `IsTrue` [#4720](https://github.com/tikv/tikv/pull/4720) + - `IsFalse` [#4720](https://github.com/tikv/tikv/pull/4720) + - Support comparison arithmetic for `Int` [#4625](https://github.com/tikv/tikv/pull/4625) + - Support comparison arithmetic for `Decimal` [#4625](https://github.com/tikv/tikv/pull/4625) + - Support comparison arithmetic for `String` [#4625](https://github.com/tikv/tikv/pull/4625) + - Support comparison arithmetic for `Time` [#4625](https://github.com/tikv/tikv/pull/4625) 
+ - Support comparison arithmetic for `Duration` [#4625](https://github.com/tikv/tikv/pull/4625) + - Support comparison arithmetic for `Json` [#4625](https://github.com/tikv/tikv/pull/4625) + - Support plus arithmetic for `Int` [#4733](https://github.com/tikv/tikv/pull/4733) + - Support plus arithmetic for `Real` [#4733](https://github.com/tikv/tikv/pull/4733) + - Support plus arithmetic for `Decimal` [#4733](https://github.com/tikv/tikv/pull/4733) + - Support MOD functions for `Int` [#4727](https://github.com/tikv/tikv/pull/4727) + - Support MOD functions for `Real` [#4727](https://github.com/tikv/tikv/pull/4727) + - Support MOD functions for `Decimal` [#4727](https://github.com/tikv/tikv/pull/4727) + - Support minus arithmetic for `Int` [#4746](https://github.com/tikv/tikv/pull/4746) + - Support minus arithmetic for `Real` [#4746](https://github.com/tikv/tikv/pull/4746) + - Support minus arithmetic for `Decimal` [#4746](https://github.com/tikv/tikv/pull/4746) + +## [3.0.0-rc.1] - 2019-05-10 ++ Engine + - Fix the issue that may cause incorrect statistics on read traffic [#4436](https://github.com/tikv/tikv/pull/4436) + - Fix the issue that may cause prefix extractor panic when deleting a range [#4503](https://github.com/tikv/tikv/pull/4503) + - Optimize memory management to reduce memory allocation and copying for `Iterator Key Bound Option` [#4537](https://github.com/tikv/tikv/pull/4537) + - Fix the issue that failing to consider learner log gap may in some cases cause panic [#4559](https://github.com/tikv/tikv/pull/4559) + - Support `block cache` sharing among different `column families`[#4612](https://github.com/tikv/tikv/pull/4612) ++ Server + - Reduce context switch overhead of `batch commands` [#4473](https://github.com/tikv/tikv/pull/4473) + - Check the validity of seek iterator status [#4470](https://github.com/tikv/tikv/pull/4470) ++ RaftStore + - Support configurable `properties index distance` [#4517](https://github.com/tikv/tikv/pull/4517) ++ 
Coprocessor + - Add batch index scan executor [#4419](https://github.com/tikv/tikv/pull/4419) + - Add vectorized evaluation framework [#4322](https://github.com/tikv/tikv/pull/4322) + - Add execution summary framework for batch executors [#4433](https://github.com/tikv/tikv/pull/4433) + - Check the maximum column when constructing the RPN expression to avoid invalid column offset that may cause evaluation panic [#4481](https://github.com/tikv/tikv/pull/4481) + - Add `BatchLimitExecutor` [#4469](https://github.com/tikv/tikv/pull/4469) + - Replace the original `futures-cpupool` with `tokio-threadpool` in ReadPool to reduce context switch [#4486](https://github.com/tikv/tikv/pull/4486) + - Add batch aggregation framework [#4533](https://github.com/tikv/tikv/pull/4533) + - Add `BatchSelectionExecutor` [#4562](https://github.com/tikv/tikv/pull/4562) + - Add batch aggression function `AVG` [#4570](https://github.com/tikv/tikv/pull/4570) + - Add RPN function `LogicalAnd`[#4575](https://github.com/tikv/tikv/pull/4575) ++ Misc + - Support `tcmalloc` as a memory allocator [#4370](https://github.com/tikv/tikv/pull/4370) + +## [3.0.0-beta.1] - 2019-03-26 +- Optimize the Coprocessor calculation execution framework and implement the TableScan section, with the Single TableScan performance improved by 5% ~ 30% + - Implement the definition of the `BatchRows` row and the `BatchColumn` column [#3660](https://github.com/tikv/tikv/pull/3660) + - Implement `VectorLike` to support accessing encoded and decoded data in the same way [#4242](https://github.com/tikv/tikv/pull/4242) + - Define the `BatchExecutor` to interface and implement the way of converting requests to `BatchExecutor` [#4243](https://github.com/tikv/tikv/pull/4243) + - Implement transforming the expression tree into the RPN format [#4329](https://github.com/tikv/tikv/pull/4329) + - Implement the `BatchTableScanExecutor` vectorization calculation operator [#4351](https://github.com/tikv/tikv/pull/4351) +- Unify the log 
format for easy collection and analysis by tools +- Support using the Local Reader to read in the Raw Read interface [#4222](https://github.com/tikv/tikv/pull/4222) +- Add metrics about configuration information [#4206](https://github.com/tikv/tikv/pull/4206) +- Add metrics about key exceeding bound [#4255](https://github.com/tikv/tikv/pull/4255) +- Add an option to control panic or return an error when encountering the key exceeding bound error [#4254](https://github.com/tikv/tikv/pull/4254) +- Add support for the `INSERT` operation, make prewrite succeed only when keys do not exist, and eliminate `Batch Get` [#4085](https://github.com/tikv/tikv/pull/4085) +- Use more fair batch strategy in the Batch System [#4200](https://github.com/tikv/tikv/pull/4200) +- Support Raw scan in tikv-ctl [#3825](https://github.com/tikv/tikv/pull/3825) +- Support hibernating regions [#4591](https://github.com/tikv/tikv/pull/4591) + +## [3.0.0-beta] - 2019-01-18 +- Support distributed GC [#3179](https://github.com/tikv/tikv/pull/3179) +- Check RocksDB Level 0 files before applying snapshots to avoid Write Stall [#3606](https://github.com/tikv/tikv/pull/3606) +- Support reverse `raw_scan` and `raw_batch_scan` [#3724](https://github.com/tikv/tikv/pull/3724) +- Support using HTTP to obtain monitoring information [#3855](https://github.com/tikv/tikv/pull/3855) +- Support DST better [#3786](https://github.com/tikv/tikv/pull/3786) +- Support receiving and sending Raft messages in batch [#3913](https://github.com/tikv/tikv/pull/3913) +- Introduce a new storage engine Titan [#3985](https://github.com/tikv/tikv/pull/3985) +- Upgrade gRPC to v1.17.2 [#4023](https://github.com/tikv/tikv/pull/4023) +- Support receiving the client requests and sending replies in batch [#4043](https://github.com/tikv/tikv/pull/4043) +- Support multi-thread Apply [#4044](https://github.com/tikv/tikv/pull/4044) +- Support multi-thread Raftstore [#4066](https://github.com/tikv/tikv/pull/4066) + +## [2.1.19] +- 
Raftstore:Fix the panic occurred when restarting TiKV and `is_merging` is given an incorrect value in the process of merging Regions and applying the Compact log [#5884](https://github.com/tikv/tikv/pull/5884) +- Importer:Remove the limit on the gRPC message length [#5809](https://github.com/tikv/tikv/pull/5809) + +## [2.1.18] + +## [2.1.17] - 2019-09-11 +- Fix the incorrect result of counting keys in a Region in some cases [#5415](https://github.com/tikv/tikv/pull/5415) +- Add the `config-check` option in TiKV to check whether the TiKV configuration item is valid [#5391](https://github.com/tikv/tikv/pull/5391) +- Optimize the starting process to reduce jitters caused by restarting nodes [#5277](https://github.com/tikv/tikv/pull/5277) +- Optimize the resolving locking process in some cases to speed up resolving locking for transactions [#5339](https://github.com/tikv/tikv/pull/5339) +- Optimize the `get_txn_commit_info` process to speed up committing transactions [#5062](https://github.com/tikv/tikv/pull/5062) +- Simplify Raft-related logs [#5425](https://github.com/tikv/tikv/pull/5425) +- Resolve the issue that TiKV exits abnormally in some cases [#5441](https://github.com/tikv/tikv/pull/5441) + +## [2.1.16] - 2019-08-15 +* Return region error when TiKV is closing [#4820](https://github.com/tikv/tikv/pull/4820) +* Support reverse `raw_scan` and `raw_batch_scan` [#5148](https://github.com/tikv/tikv/pull/5148) + +## [2.1.15] - 2019-07-18 +* Unify the log format [#5083](https://github.com/tikv/tikv/pull/5083) +* Improve the accuracy of Region's approximate size or keys in extreme cases to improve the accuracy of scheduling [#5085](https://github.com/tikv/tikv/pull/5085) + +## [2.1.14] - 2019-07-04 +- Optimize processing the empty callback when processing the Raftstore message to avoid sending unnecessary message [#4682](https://github.com/tikv/tikv/pull/4682) + +## [2.1.13] - 2019-06-21 +- Fix the issue that incomplete snapshots are generated in the system caused by 
the iterator not checking the status [#4940](https://github.com/tikv/tikv/pull/4940) +- Add a feature to check the validity for the `block-size` configuration [#4930](https://github.com/tikv/tikv/pull/4930) + +## [2.1.12] - 2019-06-13 +- Fix the issue that Regions are not available during the leader transfer process in extreme conditions [#4799](https://github.com/tikv/tikv/pull/4734) +- Fix the issue that TiKV loses data when the power of the machine fails abnormally, caused by delayed data flush to the disk when receiving snapshots [#4850](https://github.com/tikv/tikv/pull/4850) + +## [2.1.11] - 2019-06-03 +- Fix the issue that the learner reads an empty index when there is only one leader and one learner [#4751](https://github.com/tikv/tikv/pull/4751) +- Process `ScanLock` and `ResolveLock` in the thread pool with a high priority to reduce their impacts on commands with a normal priority [#4791](https://github.com/tikv/tikv/pull/4791) +- Synchronize all CF files for received snapshots [#4811](https://github.com/tikv/tikv/pull/4811) + +## [2.1.10] - 2019-05-21 +* Reject transfer leader when the region recently changed config [#4684](https://github.com/tikv/tikv/pull/4684) +* Add priority label to coprocessor metrics [#4643](https://github.com/tikv/tikv/pull/4643) +* Fix the issue that read index may read stale data during transferring leader [#4724](https://github.com/tikv/tikv/pull/4724) +* Fix the issue that `CommitMerge` may cause TiKV unable to restart [#4615](https://github.com/tikv/tikv/pull/4615) +* Fix unknown logs [#4730](https://github.com/tikv/tikv/pull/4730) + +## [2.1.9] - 2019-05-06 +* Fix potential quorum changes when transferring leader (https://github.com/pingcap/raft-rs/issues/221) +* Fix the Importer bug that some SST files fail to be imported but it still returns successful import result [#4566](https://github.com/tikv/tikv/pull/4566) +* Support setting a speed limit in Importer when uploading SST files to TiKV 
[#4607](https://github.com/tikv/tikv/pull/4607) +* Change Importer RocksDB SST default compression method to `lz4` to reduce CPU consumption [#4624](https://github.com/tikv/tikv/pull/4624) + +## [2.1.8] - 2019-04-11 +* Fix the issue of wrong statistics of the read traffic [#4441](https://github.com/tikv/tikv/pull/4441) +* Fix the raftstore performance issue when checking to decide whether to process pending snapshots when many Regions exist [#4484](https://github.com/tikv/tikv/pull/4484) +* Do not ingest files when the number of level 0 SST files exceeds `level_zero_slowdown_writes_trigger/2` [#4464](https://github.com/tikv/tikv/pull/4464) + +## [2.1.6] - 2019-03-15 +* Fix the `StoreNotMatch` issue caused by decoding protobuf error in some cases [#4303](https://github.com/tikv/tikv/pull/4303) +* Improve import speed by increasing default region-split-size to 512 MiB [#4347](https://github.com/tikv/tikv/pull/4347) +* Fix OOM issue by storing the intermediate SST files on disk instead of memory [#4348](https://github.com/tikv/tikv/pull/4348) +* Restrict memory usage by RocksDB [#4350](https://github.com/tikv/tikv/pull/4350) +* Fix the issue that scattering Region doesn't take effect [#4352](https://github.com/tikv/tikv/pull/4352) + +## [2.1.5] - 2019-02-28 +* Fix the panic issue caused by Region merge in some cases [#4235](https://github.com/tikv/tikv/pull/4235) +* Fix the issue that Importer fails to import data in some cases [#4223](https://github.com/tikv/tikv/pull/4223) +* Fix the `KeyNotInRegion` error in some cases [#4125](https://github.com/tikv/tikv/pull/4125) +* Add the detailed `StoreNotMatch` error message [#3885](https://github.com/tikv/tikv/pull/3885) + +## [2.1.4] - 2019-02-14 +* Fix the abnormal result issue of the event listener in some cases #4126 +* Fix the duplicate write issue when closing TiKV #4146 + +## [2.1.3] - 2019-01-27 +* Support obtaining the monitoring information using the HTTP method #3855 +* Fix the NULL issue of data_format #4075 +* 
Add verifying the range for scan requests #4124 + +## [2.1.2] - 2018-12-21 +- Support the configuration format in the unit of `DAY` (`d`) and fix the configuration compatibility issue [#3931](https://github.com/tikv/tikv/pull/3931) +- Fix the possible panic issue caused by `Approximate Size Split` [#3942](https://github.com/tikv/tikv/pull/3942) +- Fix two issues about Region merge [#3822](https://github.com/tikv/tikv/pull/3822), [#3873](https://github.com/tikv/tikv/pull/3873) + +## [2.1.1] - 2018-12-12 +- Avoid transferring the leader to a newly created peer, to optimize the possible delay [#3878](https://github.com/tikv/tikv/pull/3878) + +## [2.1.0] - 2018-11-30 +* Coprocessor + - Add more built-in functions + - [Add Coprocessor `ReadPool` to improve the concurrency in processing the requests](https://github.com/tikv/rfcs/blob/master/text/2017-12-22-read-pool.md) + - Fix the time function parsing issue and the time zone related issues + - Optimize the memory usage for pushdown aggregation computing +* Transaction + - Optimize the read logic and memory usage of MVCC to improve the performance of the scan operation and the performance of full table scan is 1 time better than that in TiDB 2.0 + - Fold the continuous Rollback records to ensure the read performance + - [Add the `UnsafeDestroyRange` API to support to collecting space for the dropping table/index](https://github.com/tikv/rfcs/blob/master/text/2018-08-29-unsafe-destroy-range.md) + - Separate the GC module to reduce the impact on write + - Add the`upper bound` support in the `kv_scan` command +* Raftstore + - Improve the snapshot writing process to avoid RocksDB stall + - [Add the `LocalReader` thread to process read requests and reduce the delay for read requests](https://github.com/tikv/rfcs/pull/17) + - [Support `BatchSplit` to avoid large Region brought by large amounts of write](https://github.com/tikv/rfcs/pull/6) + - Support `Region Split` according to statistics to reduce the I/O overhead + - 
Support `Region Split` according to the number of keys to improve the concurrency of index scan + - Improve the Raft message process to avoid unnecessary delay brought by `Region Split` + - Enable the `PreVote` feature by default to reduce the impact of network isolation on services +* Storage Engine + - Fix the `CompactFiles` bug in RocksDB and reduce the impact on importing data using Lightning + - Upgrade RocksDB to v5.15 to fix the possible issue of snapshot file corruption + - Improve `IngestExternalFile` to avoid the issue that flush could block write +* tikv-ctl + - [Add the `ldb` command to diagnose RocksDB related issues](https://github.com/tikv/tikv/blob/master/docs/tools/tikv-control.md#ldb-command) + - The `compact` command supports specifying whether to compact data in the bottommost level +* Tools + - Fast full import of large amounts of data: [TiDB-Lightning](https://pingcap.com/docs/tools/lightning/overview-architecture/) + - Support new [TiDB-Binlog](https://pingcap.com/docs/tools/tidb-binlog-cluster/) + +## [2.1.0-rc.5] - 2018-11-12 +- Improve the error message of `WriteConflict` [#3750](https://github.com/tikv/tikv/pull/3750) +- Add the panic mark file [#3746](https://github.com/tikv/tikv/pull/3746) +- Downgrade grpcio to avoid the segment fault issue caused by the new version of gRPC [#3650](https://github.com/tikv/tikv/pull/3650) +- Add the upper limit to the `kv_scan` interface [#3749](https://github.com/tikv/tikv/pull/3749) + +## [2.1.0-rc.4] - 2018-10-23 +- Optimize the RocksDB Write stall issue caused by applying snapshots [#3606](https://github.com/tikv/tikv/pull/3606) +- Add raftstore `tick` metrics [#3657](https://github.com/tikv/tikv/pull/3657) +- Upgrade RocksDB and fix the Write block issue and that the source file might be damaged by the Write operation when performing `IngestExternalFile` [#3661](https://github.com/tikv/tikv/pull/3661) +- Upgrade grpcio and fix the issue that “too many pings” is wrongly reported 
[#3650](https://github.com/tikv/tikv/pull/3650) + +## [2.1.0-rc.3] - 2018-09-29 +### Performance +- Optimize the concurrency for coprocessor requests [#3515](https://github.com/tikv/tikv/pull/3515) +### New features +- Add the support for Log functions [#3603](https://github.com/tikv/tikv/pull/3603) +- Add the support for the `sha1` function [#3612](https://github.com/tikv/tikv/pull/3612) +- Add the support for the `truncate_int` function [#3532](https://github.com/tikv/tikv/pull/3532) +- Add the support for the `year` function [#3622](https://github.com/tikv/tikv/pull/3622) +- Add the support for the `truncate_real` function [#3633](https://github.com/tikv/tikv/pull/3633) +### Bug Fixes +- Fix the reporting error behavior related to time functions [#3487](https://github.com/tikv/tikv/pull/3487), [#3615](https://github.com/tikv/tikv/pull/3615) +- Fix the issue that the time parsed from string is inconsistent with that in TiDB [#3589](https://github.com/tikv/tikv/pull/3589) + +## [2.1.0-rc.2] - 2018-09-17 +### Performance +* Support splitting Regions based on statistics estimation to reduce the I/O cost [#3511](https://github.com/tikv/tikv/pull/3511) +* Reduce clone in the transaction scheduler [#3530](https://github.com/tikv/tikv/pull/3530) +### Improvements +* Add the pushdown support for a large number of built-in functions +* Add the `leader-transfer-max-log-lag` configuration to fix the failure issue of leader scheduling in specific scenarios [#3507](https://github.com/tikv/tikv/pull/3507) +* Add the `max-open-engines` configuration to limit the number of engines opened by `tikv-importer` simultaneously [#3496](https://github.com/tikv/tikv/pull/3496) +* Limit the cleanup speed of garbage data to reduce the impact on `snapshot apply` [#3547](https://github.com/tikv/tikv/pull/3547) +* Broadcast the commit message for crucial Raft messages to avoid unnecessary delay [#3592](https://github.com/tikv/tikv/pull/3592) +### Bug Fixes +* Fix the leader election issue 
caused by discarding the `PreVote` message of the newly split Region [#3557](https://github.com/tikv/tikv/pull/3557) +* Fix follower related statistics after merging Regions [#3573](https://github.com/tikv/tikv/pull/3573) +* Fix the issue that the local reader uses obsolete Region information [#3565](https://github.com/tikv/tikv/pull/3565) +* Support UnsafeDestroyRange API to speedup garbage data cleaning after table/index has been truncated/dropped [#3560](https://github.com/tikv/tikv/pull/3560) + +## [2.1.0-rc.1] - 2018-08-24 +### Features +* Support `batch split` to avoid too large Regions caused by the Write operation on hot Regions +* Support splitting Regions based on the number of rows to improve the index scan efficiency +### Performance +* Use `LocalReader` to separate the Read operation from the raftstore thread to lower the Read latency +* Refactor the MVCC framework, optimize the memory usage and improve the scan Read performance +* Support splitting Regions based on statistics estimation to reduce the I/O usage +* Optimize the issue that the Read performance is affected by continuous Write operations on the rollback record +* Reduce the memory usage of pushdown aggregation computing +### Improvements +* Add the pushdown support for a large number of built-in functions and better charset support +* Optimize the GC workflow, improve the GC speed and decrease the impact of GC on the system +* Enable `prevote` to speed up service recovery when the network is abnormal +* Add the related configuration items of RocksDB log files +* Adjust the default configuration of `scheduler_latch` +* Support setting whether to compact the data in the bottom layer of RocksDB when using tikv-ctl to compact data manually +* Add the check for environment variables when starting TiKV +* Support dynamically configuring the `dynamic_level_bytes` parameter based on the existing data +* Support customizing the log format +* Integrate tikv-fail in tikv-ctl +* Add I/O metrics of 
threads +### Bug Fixes +* Fix decimal related issues +* Fix the issue that `gRPC max_send_message_len` is set mistakenly +* Fix the issue caused by misconfiguration of `region_size` + +## [2.1.0-beta] - 2018-06-30 +### Features +* Upgrade Rust to the `nightly-2018-06-14` version +* Provide a `Raft PreVote` configuration to avoid leader reelection generated when network recovers after network isolation +* Add a metric to display the number of files and `ingest` related information in each layer of RocksDB +* Print `key` with too many versions when GC works +### Performance +* Use `static metric` to optimize multi-label metric performance (YCSB `raw get` is improved by 3%) +* Remove `box` in multiple modules and use patterns to improve the operating performance (YCSB `raw get` is improved by 3%) +* Use `asynchronous log` to improve the performance of writing logs +* Add a metric to collect the thread status +* Decrease memory copy times by decreasing `box` used in the application to improve the performance + +## [2.0.11] - 2019-01-12 +* Fix two issues about Region merge #4003 and #4004 + +## [2.0.10] - 2018-12-16 +* Avoid transferring the leader to a newly created peer, to optimize the possible delay #3929 +* Fix redundant Region heartbeats #3930 + +## [2.0.9] - 2018-11-19 +* Add the end-key limit to the kv_scan interface #3749 +* Abandon the max-tasks-xxx configuration and add max-tasks-per-worker-xxx #3093 +* Fix the CompactFiles issue in RocksDB #3789 + +## [2.0.8] - 2018-10-15 +### Bug Fixes +* Fix the issue that the memory consumed by Raftstore EntryCache keeps increasing when a node goes down 3529 + +## [2.0.7] - 2018-09-17 +### Improvements +* Enable dynamic-level-bytes by default to reduce space amplification +### Bug Fixes +* Update Region's approximate size and approximate keys count after Region merge + +## [2.0.6] - 2018-08-03 +### Improvements +* Enlarge scheduler’s default slots to reduce false conflicts +* Reduce continuous records of rollback 
transactions, to improve the Read +* performance when conflicts are extremely severe +* Limit the size and number of RocksDB log files, to reduce unnecessary +* disk usage in long-running condition +### Bug Fixes +* Fix the crash issue when converting the data type from string to decimal + +## [2.0.5] - 2018-07-06 +* Fix the potential overflow issue in decimal operations +* Fix the dirty read issue that might occur in the process of merge + +## [2.0.4] - 2018-06-16 +### Features +* Add the RocksDB `PerfContext` interface for debugging +* Add the `region-properties` command for `tikv-ctl` +### Improvements +* Make GC record the log when GC encounters many versions of data +* Remove the `import-mode` parameter +### Bug Fixes +* Fix the issue that `reverse-seek` is slow when many RocksDB tombstones exist +* Fix the crash issue caused by `do_sub` + +## [2.0.3] - 2018-06-01 +### Bug Fixes +* Correct wrong peer meta for learners +* Report an error instead of getting a result if divisor/dividend is 0 in do_div_mod + +## [2.0.2] - 2018-05-21 +### Improvements +* Support configuring more gRPC related parameters +* Support configuring the timeout range of leader election +### Bug Fixes +* Fix the issue that the Raft log is not printed +* Fix the issue that obsolete learner is not deleted +* Fix the issue that the snapshot intermediate file is mistakenly deleted + +## [2.0.1] - 2018-05-16 +### Performance +* Reduced number of `thread_yield` calls +* Fix the issue that `SELECT FOR UPDATE` prevents others from reading +### Improvements +* More verbose logs for slow query +* Speed up delete range +### Bug Fixes +* Fix the bug that raftstore is accidentally blocked when generating the snapshot +* Fix the issue that Learner cannot be successfully elected in special conditions +* Fix the issue that split might cause dirty read in extreme conditions +* Correct the default value of the read thread pool configuration + +## [2.0.0] - 2018-04-27 +### Features +* Protect critical 
configuration from incorrect modification +* Support `Region Merge` [experimental] +* Add the `Raw DeleteRange` API +* Add the `GetMetric` API +* Add `Raw Batch Put`, `Raw Batch Get`, `Raw Batch Delete` and `Raw Batch Scan` +* Add Column Family options for the RawKV API and support executing operation on a specific Column Family +* Support Streaming and Streaming Aggregation in Coprocessor +* Support configuring the request timeout of Coprocessor +* Carry timestamps with Region heartbeats +* Support modifying some RocksDB parameters online, such as `block-cache-size` +* Support configuring the behavior of Coprocessor when it encounters some warnings or errors +* Support starting in the importing data mode to reduce write amplification during the data importing process +* Support manually splitting Region in halves +* Improve the data recovery tool `tikv-ctl` +* Return more statistics in Coprocessor to guide the behavior of TiDB +* Support the `ImportSST` API to import SST files [experimental] +* Add the TiKV Importer binary to integrate with TiDB Lightning to import data quickly [experimental] +### Performance +* Optimize read performance using `ReadPool` and increase the `raw_get/get/batch_get` by 30% +* Improve metrics performance +* Inform PD immediately once the Raft snapshot process is completed to speed up balancing +* Solve performance jitter caused by RocksDB flushing +* Optimize the space reclaiming mechanism after deleting data +* Speed up garbage cleaning while starting the server +* Reduce the I/O overhead during replica migration using `DeleteFilesInRanges` +### Stability +* Fix the issue that gRPC call does not return when the PD leader switches +* Fix the issue that it is slow to offline nodes caused by snapshots +* Limit the temporary space usage consumed by migrating replicas +* Report the Regions that cannot elect a leader for a long time +* Update the Region size information in time according to compaction events +* Limit the size of scan lock 
to avoid request timeout +* Limit the memory usage when receiving snapshots to avoid OOM +* Increase the speed of CI test +* Fix the OOM issue caused by too many snapshots +* Configure `keepalive` of gRPC +* Fix the OOM issue caused by an increase of the Region number + +## [2.0.0-rc6] - 2018-04-19 +### Improvements +* Reduce lock contention in Worker +* Add metrics to the FuturePool +### Bug Fixes +* Fix misused metrics in Coprocessor + +## [2.0.0-rc.5] - 2018-04-17 +### New Features +* Support compacting Regions in `tikv-ctl` +* Add raw batch put/get/delete/scan API for TiKV service +* Add ImportKV service +* Support eval error in Coprocessor +* Support dynamic adjustment of RocksDB cache size by `tikv-ctl` +* Collect number of rows scanned for each range in Coprocessor +* Support treating overflow as warning in Coprocessor +* Support learner in raftstore +### Improvements +* Increase snap GC timeout + +## [2.0.0-rc.4] - 2018-04-01 +### New Features +* Limit the memory usage during receiving snapshots, to avoid OOM in extreme conditions +* Support configuring the behavior of Coprocessor when it encounters warnings +* Support importing the data pattern in TiKV +* Support splitting Region in the middle +### Improvements +* Fix the issue that too many logs are output caused by leader missing when TiKV is isolated +* Use crossbeam channel in worker + +## [2.0.0-rc.3] - 2018-03-23 +### New Features +* Support Region Merge +* Add the Raw DeleteRange API +* Add the GetMetric API +* Support streaming in Coprocessor +* Support modifying RocksDB parameters online +### Improvements +* Inform PD immediately once the Raft snapshot process is completed, to speed up balancing +* Reduce the I/O fluctuation caused by RocksDB sync files +* Optimize the space reclaiming mechanism after deleting data +* Improve the data recovery tool `tikv-ctl` +* Fix the issue that it is slow to make nodes down caused by snapshot +* Increase the raw_get/get/batch_get by 30% with ReadPool +* Support 
configuring the request timeout of Coprocessor +* Carry time information in Region heartbeats +* Limit the space usage of snapshot files to avoid consuming too much disk space +* Record and report the Regions that cannot elect a leader for a long time +* Speed up garbage cleaning when starting the server +* Update the size information about the corresponding Region according to compaction events +* Limit the size of scan lock to avoid request timeout +* Use DeleteRange to speed up Region deletion + +## [2.0.0-rc.2] - 2018-03-15 +### New Features +* Implement IngestSST API +* `tikv-ctl` now can send consistency-check requests to TiKV +* Support dumping stats of RocksDB and malloc in `tikv-ctl` +### Improvements +* Reclaim disk space after data have been deleted + +## [2.0.0-rc.1] - 2018-03-09 +### New Features +* Protect important configuration which cannot be changed after initial configuration +* Check whether SSD is used when you start the cluster +### Improvements +* Fix the issue that gRPC call is not cancelled when PD leaders switch +* Optimize the read performance using ReadPool, and improve the performance by 30% for raw get +* Improve metrics and optimize the usage of metrics + +## [1.1.0-beta] - 2018-02-24 +### Improvements +* Traverse locks using offset + limit to avoid potential GC problems +* Support resolving locks in batches to improve GC speed +* Support GC concurrency to improve GC speed +* Update the Region size using the RocksDB compaction listener for more accurate PD scheduling +* Delete the outdated data in batches using DeleteFilesInRanges, to make TiKV start faster +* Configure the Raft snapshot max size to avoid the retained files taking up too much space +* Support more recovery operations in tikv-ctl +* Optimize the ordered flow aggregation operation + +## [1.1.0-alpha] - 2018-01-19 +### New Features +* Support Raft learner +* Support TLS +### Improvements +* Optimize Raft Snapshot and reduce the I/O overhead +* Optimize the RocksDB 
configuration to improve performance +* Optimize count (*) and query performance of unique index in Coprocessor +* Solve the reconnection issue between PD and TiKV +* Enhance the features of the data recovery tool `tikv-ctl` +* Support the Delete Range feature +* Support splitting according to table in Regions +* Support setting the I/O limit caused by snapshot +* Improve the flow control mechanism + +## [1.0.8] - 2018-02-11 +### Improvements +* Use DeleteFilesInRanges to clear stale data and improve the TiKV starting speed +* Sync the metadata of the received Snapshot compulsorily to ensure its safety +### Bug Fixes +* Use Decimal in Coprocessor sum + +## [1.0.7] - 2018-01-22 +### Improvements +* Support key-only option in Table Scan executor +* Support the remote mode in tikv-ctl +* Fix the loss of scheduling command from PD +### Bug Fixes +* Fix the format compatibility issue of tikv-ctl proto +* Add timeout in Push metric + +## [1.0.5] - 2017-12-26 +* Fix the issue that it is slow to get the CPU ID using the get_cpuid function. +* Support the dynamic-level-bytes parameter to improve the space collection situation. + +## [1.0.4] - 2017-12-11 +* Fix a possible performance issue when a snapshot is applied +* Fix the performance issue for reverse scan after removing a lot of data +* Fix the wrong encoded result for the Decimal type under special circumstances + +## [1.0.2] - 2017-11-13 +* Support splitting table to ensure one region does not contain data from multiple tables. +* Limit the length of a key to be no more than 4 KB. +* More accurate read traffic statistics. +* Implement deep protection on the coprocessor stack. +* Fix the LIKE behavior and the do_div_mod bug. + +## [1.0.1] - 2017-10-30 +* Support flow control with write bytes. +* Reduce Raft allocation. +* Increase coprocessor stack size to 10MB. +* Remove the useless log from coprocessor. 
+ +## [1.0.0] - 2017-10-15 +* Coprocessor now supports more pushdown functions +* Support pushing down the sampling operation +* Support manually triggering data compact to collect space quickly +* Improve the performance and stability +* Add a Debug API for debugging diff --git a/Cargo.lock b/Cargo.lock index 158f78fa52b..ec5469e991a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -94,15 +94,9 @@ dependencies = [ [[package]] name = "arbitrary" -version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db55d72333851e17d572bec876e390cd3b11eb1ef53ae821dd9f3b653d2b4569" - -[[package]] -name = "arbitrary" -version = "1.2.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29d47fbf90d5149a107494b15a7dc8d69b351be2db3bb9691740e88ec17fd880" +checksum = "16971f2f0ce65c5cf2a1546cc6a0af102ecb11e265ddaa9433fb3e5bfdf676a4" [[package]] name = "arc-swap" @@ -136,7 +130,7 @@ dependencies = [ "lexical-core", "multiversion", "num 0.4.0", - "rand 0.8.3", + "rand 0.8.5", "regex", "serde", "serde_derive", @@ -237,9 +231,9 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.22" +version = "0.1.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8df72488e87761e772f14ae0c2480396810e51b2c2ade912f97f0f7e5b95e3c" +checksum = "1e805d94e6b5001b651426cf4cd446b1ab5f319d27bab5c644f61de0a804360c" dependencies = [ "proc-macro2", "quote", @@ -305,6 +299,51 @@ dependencies = [ "uuid 0.8.2", ] +[[package]] +name = "axum" +version = "0.5.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acee9fd5073ab6b045a275b3e709c163dd36c90685219cb21804a147b58dba43" +dependencies = [ + "async-trait", + "axum-core", + "bitflags", + "bytes", + "futures-util", + "http", + "http-body", + "hyper", + "itoa 1.0.1", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "serde", + "sync_wrapper", + "tokio", + "tower", + "tower-http", + 
"tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37e5939e02c56fecd5c017c37df4238c0a839fa76b7f97acdd7efb804fd181cc" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "mime", + "tower-layer", + "tower-service", +] + [[package]] name = "azure" version = "0.0.1" @@ -342,7 +381,7 @@ dependencies = [ "http", "log", "oauth2", - "rand 0.8.3", + "rand 0.8.5", "reqwest", "rustc_version 0.4.0", "serde", @@ -444,7 +483,7 @@ dependencies = [ "prometheus", "raft", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "security", "serde", "serde_derive", @@ -487,6 +526,7 @@ dependencies = [ "futures-io", "grpcio", "hex 0.4.2", + "indexmap", "kvproto", "lazy_static", "log_wrappers", @@ -497,9 +537,10 @@ dependencies = [ "protobuf", "raft", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "regex", "resolved_ts", + "security", "slog", "slog-global", "tempdir", @@ -514,7 +555,7 @@ dependencies = [ "tikv_util", "tokio", "tokio-stream", - "tokio-util 0.7.2", + "tokio-util", "tonic", "txn_types", "url", @@ -951,7 +992,7 @@ dependencies = [ "libc 0.2.132", "panic_hook", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "static_assertions", "thiserror", "tikv_alloc", @@ -975,7 +1016,7 @@ dependencies = [ "futures 0.3.15", "kvproto", "parking_lot 0.12.0", - "rand 0.8.3", + "rand 0.8.5", "tikv_alloc", "tikv_util", "tokio", @@ -1128,9 +1169,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.1" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" dependencies = [ "cfg-if 1.0.0", "crossbeam-utils 0.8.8", @@ -1138,13 +1179,12 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +version = "0.8.2" +source = "git+https://github.com/crossbeam-rs/crossbeam?rev=41ed3d948720f26149b2ebeaf58fe8a193134056#41ed3d948720f26149b2ebeaf58fe8a193134056" dependencies = [ "cfg-if 1.0.0", - "crossbeam-epoch 0.9.8", - "crossbeam-utils 0.8.8", + "crossbeam-epoch 0.9.10", + "crossbeam-utils 0.8.11", ] [[package]] @@ -1173,6 +1213,19 @@ dependencies = [ "scopeguard", ] +[[package]] +name = "crossbeam-epoch" +version = "0.9.10" +source = "git+https://github.com/crossbeam-rs/crossbeam?rev=41ed3d948720f26149b2ebeaf58fe8a193134056#41ed3d948720f26149b2ebeaf58fe8a193134056" +dependencies = [ + "autocfg", + "cfg-if 1.0.0", + "crossbeam-utils 0.8.11", + "memoffset", + "once_cell", + "scopeguard", +] + [[package]] name = "crossbeam-queue" version = "0.3.5" @@ -1225,6 +1278,15 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.11" +source = "git+https://github.com/crossbeam-rs/crossbeam?rev=41ed3d948720f26149b2ebeaf58fe8a193134056#41ed3d948720f26149b2ebeaf58fe8a193134056" +dependencies = [ + "cfg-if 1.0.0", + "once_cell", +] + [[package]] name = "crypto-mac" version = "0.10.0" @@ -1294,9 +1356,9 @@ dependencies = [ [[package]] name = "dashmap" -version = "5.2.0" +version = "5.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8858831f7781322e539ea39e72449c46b059638250c14344fec8d0aa6e539c" +checksum = "c0834a35a3fce649144119e18da2a4d8ed12ef3862f47183fd46f625d072d96c" dependencies = [ "cfg-if 1.0.0", "num_cpus", @@ -1343,15 +1405,6 @@ dependencies = [ "generic-array", ] -[[package]] -name = "dirs" -version = "4.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059" -dependencies = [ - "dirs-sys", -] - [[package]] name = "dirs-next" version = "2.0.0" @@ -1362,17 +1415,6 @@ 
dependencies = [ "dirs-sys-next", ] -[[package]] -name = "dirs-sys" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6" -dependencies = [ - "libc 0.2.132", - "redox_users", - "winapi 0.3.9", -] - [[package]] name = "dirs-sys-next" version = "0.1.2" @@ -1443,7 +1485,7 @@ dependencies = [ "openssl", "prometheus", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "serde", "serde_derive", "slog", @@ -1513,7 +1555,7 @@ dependencies = [ "prometheus-static-metric", "protobuf", "raft", - "rand 0.8.3", + "rand 0.8.5", "regex", "rocksdb", "serde", @@ -1596,7 +1638,7 @@ dependencies = [ "raft", "raft-proto", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "serde", "serde_derive", "serde_with", @@ -1657,7 +1699,7 @@ dependencies = [ "prometheus-static-metric", "protobuf", "raft", - "rand 0.8.3", + "rand 0.8.5", "regex", "rocksdb", "serde", @@ -1782,15 +1824,19 @@ dependencies = [ [[package]] name = "etcd-client" -version = "0.7.2" -source = "git+https://github.com/pingcap/etcd-client?rev=e0321a1990ee561cf042973666c0db61c8d82364#e0321a1990ee561cf042973666c0db61c8d82364" +version = "0.10.2" +source = "git+https://github.com/pingcap/etcd-client?rev=14a6f8731f1890d5fd2f6e16a9f0d0a306b0599e#14a6f8731f1890d5fd2f6e16a9f0d0a306b0599e" dependencies = [ "http", - "prost 0.8.0", + "hyper", + "hyper-openssl", + "openssl", + "prost 0.11.2", "tokio", "tokio-stream", "tonic", "tonic-build", + "tower", "tower-service", "visible", ] @@ -1832,7 +1878,7 @@ dependencies = [ "openssl", "prometheus", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "rusoto_core", "rust-ini", "slog", @@ -1842,7 +1888,7 @@ dependencies = [ "tikv_alloc", "tikv_util", "tokio", - "tokio-util 0.7.2", + "tokio-util", "url", ] @@ -1883,7 +1929,7 @@ dependencies = [ "tempfile", "tikv_util", "tokio", - "tokio-util 0.7.2", + "tokio-util", "url", ] @@ -1895,7 +1941,7 @@ checksum = 
"ec3245a0ca564e7f3c797d20d833a6870f57a728ac967d5225b3ffdef4465011" dependencies = [ "lazy_static", "log", - "rand 0.8.3", + "rand 0.8.5", ] [[package]] @@ -1906,9 +1952,9 @@ checksum = "f35ce9c8fb9891c75ceadbc330752951a4e369b50af10775955aeb9af3eee34b" [[package]] name = "ffi-support" -version = "0.4.4" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27838c6815cfe9de2d3aeb145ffd19e565f577414b33f3bdbf42fe040e9e0ff6" +checksum = "f85d4d1be103c0b2d86968f0b0690dc09ac0ba205b90adb0389b552869e5000e" dependencies = [ "lazy_static", "log", @@ -1931,7 +1977,7 @@ dependencies = [ "parking_lot 0.12.0", "prometheus", "prometheus-static-metric", - "rand 0.8.3", + "rand 0.8.5", "serde", "slog", "slog-global", @@ -1945,9 +1991,9 @@ dependencies = [ [[package]] name = "filedescriptor" -version = "0.8.2" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7199d965852c3bac31f779ef99cbb4537f80e952e2d6aa0ffeb30cce00f4f46e" +checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f" dependencies = [ "libc 0.2.132", "thiserror", @@ -1980,9 +2026,9 @@ dependencies = [ [[package]] name = "fixedbitset" -version = "0.2.0" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flatbuffers" @@ -2278,6 +2324,7 @@ dependencies = [ "hyper-tls", "kvproto", "matches", + "pin-project", "slog", "slog-global", "tame-gcs", @@ -2367,9 +2414,9 @@ dependencies = [ [[package]] name = "grpcio" -version = "0.10.3" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9bcdd3694fa08158334501af37bdf5b4f00b1865b602d917e3cd74ecf80cd0a" +checksum = "1f2506de56197d01821c2d1d21082d2dcfd6c82d7a1d6e04d33f37aab6130632" dependencies = [ "futures-executor", 
"futures-util", @@ -2382,18 +2429,18 @@ dependencies = [ [[package]] name = "grpcio-compiler" -version = "0.9.0" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4caa0700833147dcfbe4f0758bd92545cc0f4506ee7fa154e499745a8b24e86c" +checksum = "ed97a17310fd00ff4109357584a00244e2a785d05b7ee0ef4d1e8fb1d84266df" dependencies = [ "protobuf", ] [[package]] name = "grpcio-health" -version = "0.10.0" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "641a95bace445aed36b31ae8731513c4c4d1d3dcdbc05aaeeefefe4fd673ada1" +checksum = "a37eae605cd21f144b7c7fd0e64e57af9f73d132756fef5b706db110c3ec7ea0" dependencies = [ "futures-executor", "futures-util", @@ -2420,9 +2467,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.3.3" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726" +checksum = "5f9f29bc9dda355256b2916cf526ab02ce0aeaaaf2bad60d65ef3f12f11dd0f4" dependencies = [ "bytes", "fnv", @@ -2433,7 +2480,7 @@ dependencies = [ "indexmap", "slab", "tokio", - "tokio-util 0.6.6", + "tokio-util", "tracing", ] @@ -2506,43 +2553,48 @@ dependencies = [ [[package]] name = "honggfuzz" -version = "0.5.55" +version = "0.5.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "848e9c511092e0daa0a35a63e8e6e475a3e8f870741448b9f6028d69b142f18e" +checksum = "c3de2c3273ef7735df1c5a72128ca85b1d20105b9aac643cdfd7a6e581311150" dependencies = [ - "arbitrary 1.2.0", + "arbitrary", "lazy_static", - "memmap2", - "rustc_version 0.4.0", + "memmap", ] [[package]] name = "http" -version = "0.2.4" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11" +checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399" dependencies = [ "bytes", "fnv", - "itoa 
0.4.4", + "itoa 1.0.1", ] [[package]] name = "http-body" -version = "0.4.2" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9" +checksum = "d5f38f16d184e36f2408a55281cd658ecbd3ca05cce6d6510a176eca393e26d1" dependencies = [ "bytes", "http", "pin-project-lite", ] +[[package]] +name = "http-range-header" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bfe8eed0a9285ef776bb792479ea3834e8b94e13d615c2f66d03dd50a435a29" + [[package]] name = "httparse" -version = "1.4.1" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3a87b616e37e93c22fb19bcd386f02f3af5ea98a25670ad0fce773de23c5e68" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" [[package]] name = "httpdate" @@ -2558,9 +2610,9 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "0.14.11" +version = "0.14.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b61cf2d1aebcf6e6352c97b81dc2244ca29194be1b276f5d8ad5c6330fffb11" +checksum = "034711faac9d2166cb1baf1a2fb0b60b1f277f8492fd72176c17f3515e1abd3c" dependencies = [ "bytes", "futures-channel", @@ -2571,7 +2623,7 @@ dependencies = [ "http-body", "httparse", "httpdate", - "itoa 0.4.4", + "itoa 1.0.1", "pin-project-lite", "socket2", "tokio", @@ -2582,9 +2634,9 @@ dependencies = [ [[package]] name = "hyper-openssl" -version = "0.9.2" +version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6ee5d7a8f718585d1c3c61dfde28ef5b0bb14734b4db13f5ada856cdc6c612b" +checksum = "a9d52322a69f0a93f177d76ca82073fcec8d5b4eb6e28525d5b3142fa718195c" dependencies = [ "http", "hyper", @@ -2592,7 +2644,7 @@ dependencies = [ "once_cell", "openssl", "openssl-sys", - "parking_lot 0.12.0", + "parking_lot 0.11.1", "tokio", 
"tokio-openssl", "tower-layer", @@ -2820,7 +2872,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#26e28e6a281abb927f91ef992eb8f93b39698ffa" +source = "git+https://github.com/pingcap/kvproto.git#e53d558bc6d7d8b7bb2d283cdf6dda52a2615632" dependencies = [ "futures 0.3.15", "grpcio", @@ -2919,11 +2971,11 @@ checksum = "8371e4e5341c3a96db127eb2465ac681ced4c433e01dd0e938adbef26ba93ba5" [[package]] name = "libfuzzer-sys" -version = "0.3.5" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcf184a4b6b274f82a5df6b357da6055d3e82272327bba281c28bbba6f1664ef" +checksum = "fb789afcc589a08928d1e466087445ab740a0f70a2ee23d9349a0e3723d65e1b" dependencies = [ - "arbitrary 0.4.7", + "arbitrary", "cc", ] @@ -2994,9 +3046,9 @@ dependencies = [ [[package]] name = "linked-hash-map" -version = "0.5.6" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" +checksum = "7fb9b38af92608140b86b693604b9ffcc5824240a484d1ecd4795bacb2fe88f3" [[package]] name = "linked_hash_set" @@ -3069,6 +3121,12 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" +[[package]] +name = "matchit" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73cbba799671b762df5a175adf59ce145165747bb891505c43d09aefbbf38beb" + [[package]] name = "md-5" version = "0.9.1" @@ -3095,6 +3153,16 @@ dependencies = [ "libc 0.2.132", ] +[[package]] +name = "memmap" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" +dependencies = [ + "libc 0.2.132", + "winapi 0.3.9", +] + [[package]] name = "memmap2" version = "0.5.3" @@ -3132,9 +3200,9 @@ 
dependencies = [ [[package]] name = "mime" -version = "0.3.14" +version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd1d63acd1b78403cc0c325605908475dd9b9a3acbf65ed8bcab97e27014afcf" +checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" [[package]] name = "minimal-lexical" @@ -3174,7 +3242,7 @@ dependencies = [ "kernel32-sys", "libc 0.2.132", "log", - "miow 0.2.2", + "miow", "net2", "slab", "winapi 0.2.8", @@ -3182,15 +3250,14 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.0" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba272f85fa0b41fc91872be579b3bbe0f56b792aa361a380eb669469f68dafb2" +checksum = "e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" dependencies = [ "libc 0.2.132", "log", - "miow 0.3.7", - "ntapi", - "winapi 0.3.9", + "wasi 0.11.0+wasi-snapshot-preview1", + "windows-sys 0.42.0", ] [[package]] @@ -3217,15 +3284,6 @@ dependencies = [ "ws2_32-sys", ] -[[package]] -name = "miow" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" -dependencies = [ - "winapi 0.3.9", -] - [[package]] name = "mmap" version = "0.1.1" @@ -3236,6 +3294,15 @@ dependencies = [ "tempdir", ] +[[package]] +name = "mnt" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1587ebb20a5b04738f16cffa7e2526f1b8496b84f92920facd518362ff1559eb" +dependencies = [ + "libc 0.2.132", +] + [[package]] name = "more-asserts" version = "0.2.1" @@ -3336,7 +3403,7 @@ dependencies = [ "proxy_server", "raft", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "resolved_ts", "resource_metering", "security", @@ -3368,9 +3435,9 @@ dependencies = [ [[package]] name = "nix" -version = "0.23.1" +version = "0.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9f866317acbd3a240710c63f065ffb1e4fd466259045ccb504130b7f668f35c6" +checksum = "8f3790c00a0150112de0f4cd161e3d7fc4b2d8a5542ffc35f099a2562aecb35c" dependencies = [ "bitflags", "cc", @@ -3625,7 +3692,7 @@ dependencies = [ "chrono", "getrandom 0.2.3", "http", - "rand 0.8.3", + "rand 0.8.5", "reqwest", "serde", "serde_json", @@ -3646,9 +3713,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.10.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9" +checksum = "86f0b0d4bf799edbc74508c1e8bf170ff5f41238e5f8225603ca7caaae2b7860" [[package]] name = "online_config" @@ -3804,7 +3871,7 @@ dependencies = [ "libc 0.2.132", "redox_syscall 0.2.11", "smallvec", - "windows-sys", + "windows-sys 0.32.0", ] [[package]] @@ -3895,9 +3962,9 @@ dependencies = [ [[package]] name = "petgraph" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "467d164a6de56270bd7c4d070df81d07beace25012d5103ced4e9ff08d6afdb7" +checksum = "4a13a2fa9d0b63e5f22328828741e523766fff0ee9e779316902290dff3f824f" dependencies = [ "fixedbitset", "indexmap", @@ -3929,7 +3996,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d43f3220d96e0080cc9ea234978ccd80d904eafb17be31bb0f76daaea6493082" dependencies = [ "phf_shared", - "rand 0.8.3", + "rand 0.8.5", ] [[package]] @@ -3963,9 +4030,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.6" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc0e1f259c92177c30a4c9d177246edd0a3568b25756a977d0632cf8fa37e905" +checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" [[package]] name = "pin-utils" @@ -4040,7 +4107,8 @@ dependencies = [ [[package]] name = "pprof" version = "0.11.0" -source = 
"git+https://github.com/CalvinNeo/pprof-rs?branch=master#383921febf2c12e65de682442c1d7cebec3ad195" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e20150f965e0e4c925982b9356da71c84bcd56cb66ef4e894825837cbcf6613e" dependencies = [ "backtrace", "cfg-if 1.0.0", @@ -4065,6 +4133,16 @@ version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" +[[package]] +name = "prettyplease" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c142c0e46b57171fe0c528bee8c5b7569e80f0c17e377cd0e30ea57dbc11bb51" +dependencies = [ + "proc-macro2", + "syn", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -4103,11 +4181,11 @@ checksum = "369a6ed065f249a159e06c45752c780bda2fb53c995718f9e484d08daa9eb42e" [[package]] name = "proc-macro2" -version = "1.0.36" +version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" +checksum = "5ea3d908b0e36316caf9e9e2c4625cdde190a7e6f440d794667ed17a1855e725" dependencies = [ - "unicode-xid", + "unicode-ident", ] [[package]] @@ -4185,28 +4263,32 @@ dependencies = [ [[package]] name = "prost" -version = "0.8.0" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de5e2533f59d08fcf364fd374ebda0692a70bd6d7e66ef97f306f45c6c5d8020" +checksum = "a0841812012b2d4a6145fae9a6af1534873c32aa67fff26bd09f8fa42c83f95a" dependencies = [ "bytes", - "prost-derive 0.8.0", + "prost-derive 0.11.2", ] [[package]] name = "prost-build" -version = "0.8.0" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "355f634b43cdd80724ee7848f95770e7e70eefa6dcf14fea676216573b8fd603" +checksum = "1d8b442418ea0822409d9e7d047cbf1e7e9e1760b172bf9982cf29d517c93511" dependencies = [ "bytes", - "heck 0.3.1", + "heck 
0.4.0", "itertools 0.10.0", + "lazy_static", "log", "multimap", "petgraph", - "prost 0.8.0", + "prettyplease", + "prost 0.11.2", "prost-types", + "regex", + "syn", "tempfile", "which 4.2.4", ] @@ -4226,9 +4308,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.8.0" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "600d2f334aa05acb02a755e217ef1ab6dea4d51b58b7846588b747edec04efba" +checksum = "164ae68b6587001ca506d3bf7f1000bfa248d0e1217b618108fba4ec1d0cc306" dependencies = [ "anyhow", "itertools 0.10.0", @@ -4239,12 +4321,12 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.8.0" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "603bbd6394701d13f3f25aada59c7de9d35a6a5887cfc156181234a44002771b" +checksum = "747761bc3dc48f9a34553bf65605cf6cb6288ba219f3450b4275dbd81539551a" dependencies = [ "bytes", - "prost 0.8.0", + "prost 0.11.2", ] [[package]] @@ -4326,7 +4408,7 @@ dependencies = [ "log", "log_wrappers", "mime", - "nix 0.23.1", + "nix 0.23.2", "online_config", "openssl", "pd_client", @@ -4337,7 +4419,7 @@ dependencies = [ "raft", "raft_log_engine", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "regex", "resolved_ts", "resource_metering", @@ -4387,6 +4469,7 @@ dependencies = [ "engine_rocks_helper", "engine_store_ffi", "engine_test", + "engine_tiflash", "engine_traits", "error_code", "external_storage_export", @@ -4414,7 +4497,7 @@ dependencies = [ "raft", "raft_log_engine", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "rand_xorshift", "resource_metering", "security", @@ -4475,7 +4558,7 @@ dependencies = [ "getset", "protobuf", "raft-proto", - "rand 0.8.3", + "rand 0.8.5", "slog", "thiserror", ] @@ -4483,7 +4566,7 @@ dependencies = [ [[package]] name = "raft-engine" version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#a0d29980f1448565a6d03f911ebb103c4266f1f4" +source = 
"git+https://github.com/tikv/raft-engine.git#82f6da7b8dff1856483e8e72a59dda903fb2499b" dependencies = [ "byteorder", "crc32fast", @@ -4593,7 +4676,7 @@ dependencies = [ "protobuf", "raft", "raft-proto", - "rand 0.8.3", + "rand 0.8.5", "resource_metering", "serde", "serde_derive", @@ -4628,22 +4711,27 @@ name = "raftstore-v2" version = "0.1.0" dependencies = [ "batch-system", + "causal_ts", "collections", + "concurrency_manager", "crossbeam", "engine_test", "engine_traits", "error_code", "fail", "file_system", + "fs2", "futures 0.3.15", "keys", "kvproto", "log_wrappers", "pd_client", + "prometheus", "protobuf", "raft", "raft-proto", "raftstore", + "resource_metering", "slog", "slog-global", "smallvec", @@ -4654,6 +4742,7 @@ dependencies = [ "time", "tracker", "txn_types", + "yatp", ] [[package]] @@ -4679,19 +4768,18 @@ dependencies = [ "libc 0.2.132", "rand_chacha 0.2.1", "rand_core 0.5.1", - "rand_hc 0.2.0", + "rand_hc", ] [[package]] name = "rand" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc 0.2.132", "rand_chacha 0.3.0", "rand_core 0.6.2", - "rand_hc 0.3.0", ] [[package]] @@ -4756,15 +4844,6 @@ dependencies = [ "rand_core 0.5.1", ] -[[package]] -name = "rand_hc" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73" -dependencies = [ - "rand_core 0.6.2", -] - [[package]] name = "rand_isaac" version = "0.3.0" @@ -4976,7 +5055,7 @@ dependencies = [ "pin-project", "procinfo", "prometheus", - "rand 0.8.3", + "rand 0.8.5", "serde", "serde_derive", "slog", @@ -5026,7 +5105,7 @@ dependencies = [ [[package]] name = "rusoto_core" version = "0.46.0" -source = 
"git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", "base64", @@ -5050,7 +5129,7 @@ dependencies = [ [[package]] name = "rusoto_credential" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", "chrono", @@ -5067,7 +5146,7 @@ dependencies = [ [[package]] name = "rusoto_kms" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", "bytes", @@ -5080,7 +5159,7 @@ dependencies = [ [[package]] name = "rusoto_mock" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", "chrono", @@ -5094,7 +5173,7 @@ dependencies = [ [[package]] name = "rusoto_s3" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", "bytes", @@ -5108,7 +5187,7 @@ dependencies = [ [[package]] name = "rusoto_signature" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = 
"git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "base64", "bytes", @@ -5133,7 +5212,7 @@ dependencies = [ [[package]] name = "rusoto_sts" version = "0.46.0" -source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#5fcf2d1c36b93d0146cc49f257dd850e01b6e4db" +source = "git+https://github.com/tikv/rusoto?branch=gh1482-s3-addr-styles#0d6df7b119c4e757daaa715f261c3150c7ae0a3b" dependencies = [ "async-trait", "bytes", @@ -5195,19 +5274,6 @@ dependencies = [ "semver 1.0.4", ] -[[package]] -name = "rustls" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7" -dependencies = [ - "base64", - "log", - "ring", - "sct", - "webpki", -] - [[package]] name = "rustversion" version = "1.0.4" @@ -5251,16 +5317,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" -[[package]] -name = "sct" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3042af939fca8c3453b7af0f1c66e533a15a86169e39de2657310ade8f98d3c" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "seahash" version = "4.1.0" @@ -5279,7 +5335,6 @@ dependencies = [ "serde_json", "tempfile", "tikv_util", - "tonic", ] [[package]] @@ -5507,7 +5562,7 @@ dependencies = [ "raft", "raft_log_engine", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "resolved_ts", "resource_metering", "security", @@ -5659,9 +5714,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.9.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" +checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" [[package]] name = "snap_recovery" @@ -5723,9 
+5778,9 @@ dependencies = [ [[package]] name = "socket2" -version = "0.4.4" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66d72b759436ae32898a2af0a14218dbf55efde3feeb170eb623637db85ee1e0" +checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" dependencies = [ "libc 0.2.132", "winapi 0.3.9", @@ -5759,6 +5814,7 @@ dependencies = [ "log_wrappers", "openssl", "prometheus", + "rand 0.8.5", "serde", "serde_derive", "slog", @@ -5908,13 +5964,13 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.86" +version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a65b3f4ffa0092e9887669db0eae07941f023991ab58ea44da8fe8e2d511c6b" +checksum = "a864042229133ada95abf3b54fdc62ef5ccabe9515b64717bcb9a1919e59445d" dependencies = [ "proc-macro2", "quote", - "unicode-xid", + "unicode-ident", ] [[package]] @@ -6016,7 +6072,7 @@ checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" dependencies = [ "cfg-if 1.0.0", "libc 0.2.132", - "rand 0.8.3", + "rand 0.8.5", "redox_syscall 0.2.11", "remove_dir_all", "winapi 0.3.9", @@ -6060,7 +6116,7 @@ dependencies = [ "grpcio", "kvproto", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "tempfile", "test_raftstore", "tidb_query_common", @@ -6155,7 +6211,7 @@ dependencies = [ "protobuf", "raft", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "resolved_ts", "resource_metering", "security", @@ -6211,7 +6267,7 @@ dependencies = [ "fail", "grpcio", "kvproto", - "rand 0.8.3", + "rand 0.8.5", "rand_isaac", "security", "slog", @@ -6268,7 +6324,7 @@ dependencies = [ "raft", "raft_log_engine", "raftstore", - "rand 0.8.3", + "rand 0.8.5", "rand_xorshift", "resource_metering", "security", @@ -6291,6 +6347,7 @@ dependencies = [ "tidb_query_executors", "tidb_query_expr", "tikv", + "tikv_kv", "tikv_util", "time", "tipb", @@ -6407,7 +6464,7 @@ dependencies = [ "lazy_static", "log_wrappers", "match-template", - "nom 5.1.0", + "nom 
7.1.0", "num 0.3.0", "num-derive", "num-traits", @@ -6475,7 +6532,7 @@ dependencies = [ "panic_hook", "profiler", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "regex", "safemem", "serde", @@ -6494,7 +6551,7 @@ dependencies = [ [[package]] name = "tikv" -version = "6.4.0-alpha" +version = "6.5.0-alpha" dependencies = [ "anyhow", "api_version", @@ -6518,7 +6575,6 @@ dependencies = [ "engine_panic", "engine_rocks", "engine_test", - "engine_tiflash", "engine_traits", "engine_traits_tests", "error_code", @@ -6691,6 +6747,7 @@ dependencies = [ "pd_client", "prometheus", "prometheus-static-metric", + "raft", "raftstore", "slog", "slog-global", @@ -6729,6 +6786,7 @@ dependencies = [ "libc 0.2.132", "log", "log_wrappers", + "mnt", "nix 0.24.1", "num-traits", "num_cpus", @@ -6742,7 +6800,7 @@ dependencies = [ "prometheus", "prometheus-static-metric", "protobuf", - "rand 0.8.3", + "rand 0.8.5", "regex", "rusoto_core", "serde", @@ -6810,16 +6868,16 @@ dependencies = [ [[package]] name = "tokio" -version = "1.17.0" +version = "1.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2af73ac49756f3f7c01172e34a23e5d0216f6c32333757c2c61feb2bbff5a5ee" +checksum = "a9e03c497dc955702ba729190dc4aac6f2a0ce97f913e5b1b5912fc5039d9099" dependencies = [ + "autocfg", "bytes", "libc 0.2.132", "memchr", - "mio 0.8.0", + "mio 0.8.5", "num_cpus", - "once_cell", "parking_lot 0.12.0", "pin-project-lite", "signal-hook-registry", @@ -6881,22 +6939,11 @@ dependencies = [ "tokio", ] -[[package]] -name = "tokio-rustls" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6" -dependencies = [ - "rustls", - "tokio", - "webpki", -] - [[package]] name = "tokio-stream" -version = "0.1.8" +version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50145484efff8818b5ccd256697f36863f587da82cf8b409c53adf1e840798e3" +checksum = 
"d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" dependencies = [ "futures-core", "pin-project-lite", @@ -6914,20 +6961,6 @@ dependencies = [ "tokio-executor", ] -[[package]] -name = "tokio-util" -version = "0.6.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "940a12c99365c31ea8dd9ba04ec1be183ffe4920102bb7122c2f515437601e8e" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "log", - "pin-project-lite", - "tokio", -] - [[package]] name = "tokio-util" version = "0.7.2" @@ -6940,6 +6973,7 @@ dependencies = [ "futures-sink", "pin-project-lite", "tokio", + "tracing", ] [[package]] @@ -6953,12 +6987,13 @@ dependencies = [ [[package]] name = "tonic" -version = "0.5.2" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "796c5e1cd49905e65dd8e700d4cb1dffcbfdb4fc9d017de08c1a537afd83627c" +checksum = "55b9af819e54b8f33d453655bef9b9acc171568fb49523078d0cc4e7484200ec" dependencies = [ "async-stream 0.3.3", "async-trait", + "axum", "base64", "bytes", "futures-core", @@ -6970,12 +7005,11 @@ dependencies = [ "hyper-timeout", "percent-encoding", "pin-project", - "prost 0.8.0", - "prost-derive 0.8.0", + "prost 0.11.2", + "prost-derive 0.11.2", "tokio", - "tokio-rustls", "tokio-stream", - "tokio-util 0.6.6", + "tokio-util", "tower", "tower-layer", "tower-service", @@ -6985,10 +7019,11 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.5.2" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12b52d07035516c2b74337d2ac7746075e7dcae7643816c1b12c5ff8a7484c08" +checksum = "48c6fd7c2581e36d63388a9e04c350c21beb7a8b059580b2e93993c526899ddc" dependencies = [ + "prettyplease", "proc-macro2", "prost-build", "quote", @@ -6997,24 +7032,43 @@ dependencies = [ [[package]] name = "tower" -version = "0.4.8" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f60422bc7fefa2f3ec70359b8ff1caff59d785877eb70595904605bcc412470f" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" dependencies = [ "futures-core", "futures-util", "indexmap", "pin-project", - "rand 0.8.3", + "pin-project-lite", + "rand 0.8.5", "slab", "tokio", - "tokio-stream", - "tokio-util 0.6.6", + "tokio-util", "tower-layer", "tower-service", "tracing", ] +[[package]] +name = "tower-http" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c530c8675c1dbf98facee631536fa116b5fb6382d7dd6dc1b118d970eafe3ba" +dependencies = [ + "bitflags", + "bytes", + "futures-core", + "futures-util", + "http", + "http-body", + "http-range-header", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", +] + [[package]] name = "tower-layer" version = "0.3.1" @@ -7023,9 +7077,9 @@ checksum = "343bc9466d3fe6b0f960ef45960509f84480bf4fd96f92901afe7ff3df9d3a62" [[package]] name = "tower-service" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" @@ -7119,7 +7173,7 @@ dependencies = [ "kvproto", "log_wrappers", "panic_hook", - "rand 0.8.3", + "rand 0.8.5", "slog", "thiserror", "tikv_alloc", @@ -7153,6 +7207,12 @@ dependencies = [ "matches", ] +[[package]] +name = "unicode-ident" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" + [[package]] name = "unicode-normalization" version = "0.1.12" @@ -7174,12 +7234,6 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7007dbd421b92cc6e28410fe7362e2e0a2503394908f417b68ec8d1c364c4e20" -[[package]] -name = "unicode-xid" -version = "0.2.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" - [[package]] name = "untrusted" version = "0.7.1" @@ -7305,6 +7359,12 @@ version = "0.10.2+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + [[package]] name = "wasm-bindgen" version = "0.2.79" @@ -7383,16 +7443,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki" -version = "0.21.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab146130f5f790d45f82aeeb09e55a256573373ec64409fc19a6fb82fb1032ae" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "which" version = "3.1.1" @@ -7462,43 +7512,100 @@ version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3df6e476185f92a12c072be4a189a0210dcdcf512a1891d6dff9edb874deadc6" dependencies = [ - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_msvc", + "windows_aarch64_msvc 0.32.0", + "windows_i686_gnu 0.32.0", + "windows_i686_msvc 0.32.0", + "windows_x86_64_gnu 0.32.0", + "windows_x86_64_msvc 0.32.0", +] + +[[package]] +name = "windows-sys" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc 0.42.0", + "windows_i686_gnu 0.42.0", + "windows_i686_msvc 0.42.0", + "windows_x86_64_gnu 0.42.0", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc 0.42.0", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d2aa71f6f0cbe00ae5167d90ef3cfe66527d6f613ca78ac8024c3ccab9a19e" + [[package]] name = "windows_aarch64_msvc" version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8e92753b1c443191654ec532f14c199742964a061be25d77d7a96f09db20bf5" +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd0f252f5a35cac83d6311b2e795981f5ee6e67eb1f9a7f64eb4500fbc4dcdb4" + [[package]] name = "windows_i686_gnu" version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a711c68811799e017b6038e0922cb27a5e2f43a2ddb609fe0b6f3eeda9de615" +[[package]] +name = "windows_i686_gnu" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbeae19f6716841636c28d695375df17562ca208b2b7d0dc47635a50ae6c5de7" + [[package]] name = "windows_i686_msvc" version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "146c11bb1a02615db74680b32a68e2d61f553cc24c4eb5b4ca10311740e44172" +[[package]] +name = "windows_i686_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84c12f65daa39dd2babe6e442988fc329d6243fdce47d7d2d155b8d874862246" + [[package]] name = "windows_x86_64_gnu" version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c912b12f7454c6620635bbff3450962753834be2a594819bd5e945af18ec64bc" +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf7b1b21b5362cbc318f686150e5bcea75ecedc74dd157d874d754a2ca44b0ed" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09d525d2ba30eeb3297665bd434a54297e4170c7f1a44cad4ef58095b4cd2028" 
+ [[package]] name = "windows_x86_64_msvc" version = "0.32.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "504a2476202769977a040c6364301a3f65d0cc9e3fb08600b2bda150a0488316" +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40009d85759725a34da6d89a94e63d7bdc50a862acf0dbc7c8e488f1edcb6f5" + [[package]] name = "winreg" version = "0.7.0" @@ -7535,12 +7642,9 @@ dependencies = [ [[package]] name = "xdg" -version = "2.4.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c4583db5cbd4c4c0303df2d15af80f0539db703fa1c68802d4cbbd2dd0f88f6" -dependencies = [ - "dirs", -] +checksum = "d089681aa106a86fade1b0128fb5daf07d5867a509ab036d99988dec80429a57" [[package]] name = "xml-rs" @@ -7551,7 +7655,7 @@ checksum = "541b12c998c5b56aa2b4e6f18f03664eef9a4fd0a246a55594efae6cc2d964b5" [[package]] name = "yatp" version = "0.0.1" -source = "git+https://github.com/tikv/yatp.git?branch=master#2f5f6e47ba6fce8d55e7a57b7ee39a93bc0e8194" +source = "git+https://github.com/tikv/yatp.git?branch=master#39cb495953d40a7e846363c06090755c2eac65fa" dependencies = [ "crossbeam-deque", "dashmap", @@ -7560,7 +7664,7 @@ dependencies = [ "num_cpus", "parking_lot_core 0.9.1", "prometheus", - "rand 0.8.3", + "rand 0.8.5", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 19c8775635f..55d6b086d42 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "6.4.0-alpha" +version = "6.5.0-alpha" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" @@ -38,7 +38,7 @@ cloud-azure = [ "encryption_export/cloud-azure", "sst_importer/cloud-azure", ] -testexport = ["raftstore/testexport", "api_version/testexport", "engine_tiflash/testexport", "causal_ts/testexport"] +testexport = ["raftstore/testexport", 
"api_version/testexport", "causal_ts/testexport"] test-engine-kv-rocksdb = [ "engine_test/test-engine-kv-rocksdb" ] @@ -85,7 +85,6 @@ encryption_export = { workspace = true } engine_panic = { workspace = true } engine_rocks = { workspace = true } engine_test = { workspace = true } -engine_tiflash = { workspace = true } engine_traits = { workspace = true } engine_traits_tests = { workspace = true } error_code = { workspace = true } @@ -98,8 +97,8 @@ futures-timer = "3.0" futures-util = { version = "0.3.1", default-features = false, features = ["io", "async-await"] } fxhash = "0.2.1" getset = "0.1" -grpcio = { version = "0.10.3", default-features = false, features = ["openssl-vendored", "protobuf-codec", "nightly"] } -grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } +grpcio = { workspace = true } +grpcio-health = { workspace = true } hex = "0.4" http = "0" hyper = { version = "0.14", features = ["full"] } @@ -108,7 +107,7 @@ into_other = { workspace = true } itertools = "0.10" keyed_priority_queue = "0.4" keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" libc = "0.2" libloading = "0.7" @@ -130,7 +129,7 @@ paste = "1.0" pd_client = { workspace = true } pin-project = "1.0" pnet_datalink = "0.23" -pprof = { git = "https://github.com/CalvinNeo/pprof-rs", branch = "master", default-features = false, features = ["flamegraph", "protobuf-codec"] } +pprof = { version = "0.11", default-features = false, features = ["flamegraph", "protobuf-codec"] } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" prost = "0.7" @@ -167,17 +166,17 @@ tikv_alloc = { workspace = true } tikv_kv = { workspace = true } tikv_util = { workspace = true } time = "0.1" -tipb = { git = "https://github.com/pingcap/tipb.git" } +tipb = { workspace = true } tokio = { version = "1.17", features = ["full"] } tokio-openssl = "0.6" 
-tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +tokio-timer = { workspace = true } toml = "0.5" tracker = { workspace = true } txn_types = { workspace = true } url = "2" uuid = { version = "0.8.1", features = ["serde", "v4"] } walkdir = "2" -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [dev-dependencies] api_version = { workspace = true, features = ["testexport"] } @@ -190,20 +189,13 @@ test_util = { workspace = true } tokio = { version = "1.17", features = ["macros", "rt-multi-thread", "time"] } zipf = "6.1.0" -[target.'cfg(target_os = "linux")'.dependencies] -procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229b2c1fcc44118bef7eff127128" } -# When you modify TiKV cooperatively with kvproto, this will be useful to submit the PR to TiKV and the PR to -# kvproto at the same time. -# After the PR to kvproto is merged, remember to comment this out and run `cargo update -p kvproto`. -[patch.'https://github.com/pingcap/kvproto'] -# kvproto = { git = "https://github.com/your_github_id/kvproto", branch="your_branch" } - [patch.crates-io] prometheus = { git = "https://github.com/solotzg/rust-prometheus.git", rev = "b4fe98a06a58d29f9b9987a0d7186f6ed5230193" } # TODO: remove this when new raft-rs is published. raft = { git = "https://github.com/tikv/raft-rs", branch = "master" } raft-proto = { git = "https://github.com/tikv/raft-rs", branch = "master" } + protobuf = { git = "https://github.com/pingcap/rust-protobuf", branch = "v2.8" } protobuf-codegen = { git = "https://github.com/pingcap/rust-protobuf", branch = "v2.8" } @@ -223,6 +215,18 @@ fs2 = { git = "https://github.com/tabokie/fs2-rs", branch = "tikv" } # Remove this when a new version is release. We need to solve rust-lang/cmake-rs#143. cmake = { git = "https://github.com/rust-lang/cmake-rs" } +# TODO: remove this after crossbeam-deque is updated to the next release version. 
+# This is a workaround for cargo can't resolving the this patch in yatp. +crossbeam-deque = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "41ed3d948720f26149b2ebeaf58fe8a193134056" } + +[target.'cfg(target_os = "linux")'.dependencies] +procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229b2c1fcc44118bef7eff127128" } +# When you modify TiKV cooperatively with kvproto, this will be useful to submit the PR to TiKV and the PR to +# kvproto at the same time. +# After the PR to kvproto is merged, remember to comment this out and run `cargo update -p kvproto`. +# [patch.'https://github.com/pingcap/kvproto'] +# kvproto = { git = "https://github.com/your_github_id/kvproto", branch = "your_branch" } + [workspace] members = [ "components/api_version", @@ -286,6 +290,7 @@ members = [ "components/tipb_helper", "components/tracker", "components/txn_types", + "engine_tiflash", "fuzz", "fuzz/fuzzer-afl", "fuzz/fuzzer-honggfuzz", @@ -365,7 +370,12 @@ tipb_helper = { path = "components/tipb_helper" } tracker = { path = "components/tracker" } txn_types = { path = "components/txn_types" } # External libs -grpcio = { version = "0.10", default-features = false, features = ["openssl-vendored", "protobuf-codec"] } +grpcio = { version = "0.10.4", default-features = false, features = ["openssl-vendored", "protobuf-codec", "nightly"] } +grpcio-health = { version = "0.10.4", default-features = false, features = ["protobuf-codec"] } +tipb = { git = "https://github.com/pingcap/tipb.git" } +kvproto = { git = "https://github.com/pingcap/kvproto.git" } +yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } # TiFlash libs engine_store_ffi = { path = "engine_store_ffi", default-features = false } @@ -385,6 +395,10 @@ opt-level = 1 debug = false opt-level = 1 +[profile.dev.package.tirocks-sys] +debug = false +opt-level = 1 + [profile.dev.package.tests] 
debug = 1 opt-level = 1 @@ -413,7 +427,7 @@ rpath = false [profile.test] opt-level = 0 -debug = 0 +debug = true codegen-units = 16 lto = false incremental = true diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000000..c4ad36dc6e7 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,113 @@ +# This Docker image contains a minimal build environment for TiKV +# +# It contains all the tools necessary to reproduce official production builds of TiKV + +# We need to use CentOS 7 because many of our users choose this as their deploy machine. +# Since the glibc it uses (2.17) is from 2012 (https://sourceware.org/glibc/wiki/Glibc%20Timeline) +# it is our lowest common denominator in terms of distro support. + +# Some commands in this script are structured in order to reduce the number of layers Docker +# generates. Unfortunately Docker is limited to only 125 layers: +# https://github.com/moby/moby/blob/a9507c6f76627fdc092edc542d5a7ef4a6df5eec/layer/layer.go#L50-L53 + +# We require epel packages, so enable the fedora EPEL repo then install dependencies. +# Install the system dependencies +# Attempt to clean and rebuild the cache to avoid 404s + +# To avoid rebuilds we first install all Cargo dependencies + + +# The prepare image avoid ruining the cache of the builder +FROM centos:7.6.1810 as prepare +WORKDIR /tikv + +# This step will always ruin the cache +# There isn't a way with docker to wildcard COPY and preserve the directory structure +COPY . . +RUN mkdir /output +RUN for component in $(find . 
-type f -name 'Cargo.toml' -exec dirname {} \; | sort -u); do \ + mkdir -p "/output/${component}/src" \ + && touch "/output/${component}/src/lib.rs" \ + && cp "${component}/Cargo.toml" "/output/${component}/Cargo.toml" \ + ; done + + +FROM centos:7.6.1810 as builder + +RUN yum install -y epel-release && \ + yum clean all && \ + yum makecache + +RUN yum install -y centos-release-scl && \ + yum install -y \ + devtoolset-8 \ + perl cmake3 && \ + yum clean all + +# CentOS gives cmake 3 a weird binary name, so we link it to something more normal +# This is required by many build scripts, including ours. +RUN ln -s /usr/bin/cmake3 /usr/bin/cmake +ENV LIBRARY_PATH /usr/local/lib:$LIBRARY_PATH +ENV LD_LIBRARY_PATH /usr/local/lib:$LD_LIBRARY_PATH + +# Install Rustup +RUN curl https://sh.rustup.rs -sSf | sh -s -- --no-modify-path --default-toolchain none -y +ENV PATH /root/.cargo/bin/:$PATH + +# Install the Rust toolchain +WORKDIR /tikv +COPY rust-toolchain ./ +RUN rustup self update \ + && rustup set profile minimal \ + && rustup default $(cat "rust-toolchain") + +# For cargo +COPY scripts ./scripts +COPY etc ./etc +COPY Cargo.lock ./Cargo.lock + +COPY --from=prepare /output/ ./ + +RUN mkdir -p ./cmd/tikv-ctl/src ./cmd/tikv-server/src && \ + echo 'fn main() {}' > ./cmd/tikv-ctl/src/main.rs && \ + echo 'fn main() {}' > ./cmd/tikv-server/src/main.rs && \ + for cargotoml in $(find . -type f -name "Cargo.toml"); do \ + sed -i '/fuzz/d' ${cargotoml} && \ + sed -i '/profiler/d' ${cargotoml} ; \ + done + +COPY Makefile ./ +RUN source /opt/rh/devtoolset-8/enable && make build_dist_release + +# Remove fingerprints for when we build the real binaries. +RUN rm -rf ./target/release/.fingerprint/tikv-* && \ + for i in $(find . 
-type f -name 'Cargo.toml' -exec dirname {} \; | sort -u); do \ + rm -rf ./target/release/.fingerprint/$(basename ${i})-*; \ + done + +# Add full source code +COPY cmd/ ./cmd/ +COPY components/ ./components/ +COPY src/ ./src/ + +# Build real binaries now +ARG GIT_FALLBACK="Unknown (no git or not git repo)" +ARG GIT_HASH=${GIT_FALLBACK} +ARG GIT_TAG=${GIT_FALLBACK} +ARG GIT_BRANCH=${GIT_FALLBACK} +ENV TIKV_BUILD_GIT_HASH=${GIT_HASH} +ENV TIKV_BUILD_GIT_TAG=${GIT_TAG} +ENV TIKV_BUILD_GIT_BRANCH=${GIT_BRANCH} +RUN source /opt/rh/devtoolset-8/enable && make build_dist_release + +# Export to a clean image +FROM pingcap/alpine-glibc +COPY --from=builder /tikv/target/release/tikv-server /tikv-server +COPY --from=builder /tikv/target/release/tikv-ctl /tikv-ctl + +RUN apk add --no-cache \ + curl + +EXPOSE 20160 20180 + +ENTRYPOINT ["/tikv-server"] diff --git a/Makefile b/Makefile index 68a0606a3a3..8c595643828 100644 --- a/Makefile +++ b/Makefile @@ -213,17 +213,26 @@ pre-format: unset-override @rustup component add rustfmt @cargo install --force -q cargo-sort +pre-format-fast: unset-override + @rustup component add rustfmt + @cargo install -q cargo-sort + ci_fmt_check: M="fmt" ./proxy_scripts/ci_check.sh ci_test: - M="testold" ./proxy_scripts/ci_check.sh - M="testnew" ./proxy_scripts/ci_check.sh + wget https://github.com/protocolbuffers/protobuf/releases/download/v3.8.0/protoc-3.8.0-linux-x86_64.zip + unzip protoc-3.8.0-linux-x86_64.zip + PROTOC="`pwd`/bin/protoc" M="testold" ./proxy_scripts/ci_check.sh + PROTOC="`pwd`/bin/protoc" M="testnew" ./proxy_scripts/ci_check.sh make debug gen_proxy_ffi: pre-format ./gen-proxy-ffi.sh +gen_proxy_ffi_fast: pre-format-fast + ./gen-proxy-ffi.sh + format: pre-format @cargo fmt @cargo sort -w ./Cargo.toml ./*/Cargo.toml components/*/Cargo.toml cmd/*/Cargo.toml >/dev/null diff --git a/cmd/tikv-ctl/Cargo.toml b/cmd/tikv-ctl/Cargo.toml index 3b2d1dd2f75..1e0699f64cf 100644 --- a/cmd/tikv-ctl/Cargo.toml +++ b/cmd/tikv-ctl/Cargo.toml @@ -62,7 
+62,7 @@ gag = "1.0" grpcio = { workspace = true } hex = "0.4" keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { workspace = true } diff --git a/cmd/tikv-ctl/src/cmd.rs b/cmd/tikv-ctl/src/cmd.rs index 2fec7ea9cef..657d296109c 100644 --- a/cmd/tikv-ctl/src/cmd.rs +++ b/cmd/tikv-ctl/src/cmd.rs @@ -558,7 +558,11 @@ pub enum Cmd { version: u64, }, /// Control for Raft Engine - RaftEngineCtl { args: Vec }, + /// Usage: tikv-ctl raft-engine-ctl -- --help + RaftEngineCtl { + #[structopt(last = true)] + args: Vec, + }, #[structopt(external_subcommand)] External(Vec), } @@ -588,7 +592,6 @@ pub enum RaftCmd { #[structopt( short = "r", aliases = &["region"], - required_unless = "all-regions", conflicts_with = "all-regions", use_delimiter = true, require_delimiter = true, @@ -600,10 +603,22 @@ pub enum RaftCmd { // `regions` must be None when `all_regions` is present, // so we left `all_regions` unused. 
#[allow(dead_code)] - #[structopt(long, required_unless = "regions", conflicts_with = "regions")] + #[structopt(long, conflicts_with = "regions")] /// Print info for all regions all_regions: bool, + #[structopt(long, default_value = "")] + /// hex start key + start: String, + + #[structopt(long, default_value = "")] + /// hex end key + end: String, + + #[structopt(long, default_value = "16")] + /// Limit the number of keys to scan + limit: usize, + #[structopt(long)] /// Skip tombstone regions skip_tombstone: bool, diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index aa2f604b547..b2d25a32d5b 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -23,7 +23,7 @@ use pd_client::{Config as PdConfig, PdClient, RpcClient}; use protobuf::Message; use raft::eraftpb::{ConfChange, ConfChangeV2, Entry, EntryType}; use raft_log_engine::RaftLogEngine; -use raftstore::store::INIT_EPOCH_CONF_VER; +use raftstore::store::{util::build_key_range, INIT_EPOCH_CONF_VER}; use security::SecurityManager; use serde_json::json; use tikv::{ @@ -151,17 +151,38 @@ pub trait DebugExecutor { println!("total region size: {}", convert_gbmb(total_size as u64)); } - fn dump_region_info(&self, region_ids: Option>, skip_tombstone: bool) { + fn dump_region_info( + &self, + region_ids: Option>, + start_key: &[u8], + end_key: &[u8], + limit: usize, + skip_tombstone: bool, + ) { let region_ids = region_ids.unwrap_or_else(|| self.get_all_regions_in_store()); let mut region_objects = serde_json::map::Map::new(); for region_id in region_ids { + if limit > 0 && region_objects.len() >= limit { + break; + } let r = self.get_region_info(region_id); if skip_tombstone { let region_state = r.region_local_state.as_ref(); if region_state.map_or(false, |s| s.get_state() == PeerState::Tombstone) { - return; + continue; } } + let region = r + .region_local_state + .as_ref() + .map(|s| s.get_region().clone()) + .unwrap(); + if !check_intersect_of_range( + 
&build_key_range(region.get_start_key(), region.get_end_key(), false), + &build_key_range(start_key, end_key, false), + ) { + continue; + } let region_object = json!({ "region_id": region_id, "region_local_state": r.region_local_state.map(|s| { @@ -866,7 +887,7 @@ impl DebugExecutor for Debugger { self.region_size(region, cfs) .unwrap_or_else(|e| perror_and_exit("Debugger::region_size", e)) .into_iter() - .map(|(cf, size)| (cf.to_owned(), size as usize)) + .map(|(cf, size)| (cf.to_owned(), size)) .collect() } diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index ce39c121300..72078d07f62 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -68,7 +68,7 @@ fn main() { cfg }, |path| { - let s = fs::read_to_string(&path).unwrap(); + let s = fs::read_to_string(path).unwrap(); toml::from_str(&s).unwrap() }, ); @@ -169,7 +169,7 @@ fn main() { .unwrap(); let iv = Iv::from_slice(&file_info.iv).unwrap(); - let f = File::open(&infile).unwrap(); + let f = File::open(infile).unwrap(); let mut reader = DecrypterReader::new(f, mthd, &file_info.key, iv).unwrap(); io::copy(&mut reader, &mut outf).unwrap(); @@ -272,9 +272,20 @@ fn main() { RaftCmd::Region { regions, skip_tombstone, + start, + end, + limit, .. 
} => { - debug_executor.dump_region_info(regions, skip_tombstone); + let start_key = from_hex(&start).unwrap(); + let end_key = from_hex(&end).unwrap(); + debug_executor.dump_region_info( + regions, + &start_key, + &end_key, + limit, + skip_tombstone, + ); } }, Cmd::Size { region, cf } => { @@ -333,7 +344,7 @@ fn main() { let to_data_dir = to_data_dir.as_deref(); let to_host = to_host.as_deref(); let to_config = to_config.map_or_else(TikvConfig::default, |path| { - let s = fs::read_to_string(&path).unwrap(); + let s = fs::read_to_string(path).unwrap(); toml::from_str(&s).unwrap() }); debug_executor.diff_region(region, to_host, to_data_dir, &to_config, mgr); diff --git a/cmd/tikv-ctl/src/util.rs b/cmd/tikv-ctl/src/util.rs index d7e83511d3e..0e67c905e8d 100644 --- a/cmd/tikv-ctl/src/util.rs +++ b/cmd/tikv-ctl/src/util.rs @@ -2,6 +2,7 @@ use std::{borrow::ToOwned, error::Error, str, str::FromStr, u64}; +use kvproto::kvrpcpb::KeyRange; use server::setup::initial_logger; use tikv::config::TikvConfig; @@ -62,8 +63,27 @@ pub fn perror_and_exit(prefix: &str, e: E) -> ! { tikv_util::logger::exit_process_gracefully(-1); } +// Check if region's `key_range` intersects with `key_range_limit`. 
+pub fn check_intersect_of_range(key_range: &KeyRange, key_range_limit: &KeyRange) -> bool { + if !key_range.get_end_key().is_empty() + && !key_range_limit.get_start_key().is_empty() + && key_range.get_end_key() <= key_range_limit.get_start_key() + { + return false; + } + if !key_range_limit.get_end_key().is_empty() + && !key_range.get_start_key().is_empty() + && key_range_limit.get_end_key() < key_range.get_start_key() + { + return false; + } + true +} + #[cfg(test)] mod tests { + use raftstore::store::util::build_key_range; + use super::*; #[test] @@ -73,4 +93,42 @@ mod tests { assert_eq!(from_hex("0x74").unwrap(), result); assert_eq!(from_hex("0X74").unwrap(), result); } + + #[test] + fn test_included_region_in_range() { + // To avoid unfolding the code when `make format` is called + fn range(start: &[u8], end: &[u8]) -> KeyRange { + build_key_range(start, end, false) + } + let mut region = range(&[0x02], &[0x05]); + // region absolutely in range + assert!(check_intersect_of_range(®ion, &range(&[0x02], &[0x05]))); + assert!(check_intersect_of_range(®ion, &range(&[0x01], &[]))); + assert!(check_intersect_of_range(®ion, &range(&[0x02], &[]))); + assert!(check_intersect_of_range(®ion, &range(&[], &[]))); + assert!(check_intersect_of_range(®ion, &range(&[0x02], &[0x06]))); + assert!(check_intersect_of_range(®ion, &range(&[0x01], &[0x05]))); + assert!(check_intersect_of_range(®ion, &range(&[], &[0x05]))); + // region intersects with range + assert!(check_intersect_of_range(®ion, &range(&[0x04], &[0x05]))); + assert!(check_intersect_of_range(®ion, &range(&[0x04], &[]))); + assert!(check_intersect_of_range(®ion, &range(&[0x01], &[0x03]))); + assert!(check_intersect_of_range(®ion, &range(&[], &[0x03]))); + assert!(check_intersect_of_range(®ion, &range(&[], &[0x02]))); // region is left-closed and right-open interval + // range absolutely in region also need to return true + assert!(check_intersect_of_range(®ion, &range(&[0x03], &[0x04]))); + // region not intersects 
with range + assert!(!check_intersect_of_range(®ion, &range(&[0x05], &[]))); // region is left-closed and right-open interval + assert!(!check_intersect_of_range(®ion, &range(&[0x06], &[]))); + assert!(!check_intersect_of_range(®ion, &range(&[], &[0x01]))); + // check last region + region = range(&[0x02], &[]); + assert!(check_intersect_of_range(®ion, &range(&[0x02], &[0x05]))); + assert!(check_intersect_of_range(®ion, &range(&[0x02], &[]))); + assert!(check_intersect_of_range(®ion, &range(&[0x01], &[0x05]))); + assert!(check_intersect_of_range(®ion, &range(&[], &[0x05]))); + assert!(check_intersect_of_range(®ion, &range(&[], &[0x02]))); + assert!(check_intersect_of_range(®ion, &range(&[], &[]))); + assert!(!check_intersect_of_range(®ion, &range(&[], &[0x01]))); + } } diff --git a/components/api_version/Cargo.toml b/components/api_version/Cargo.toml index 421c01a1514..7362ca25ccc 100644 --- a/components/api_version/Cargo.toml +++ b/components/api_version/Cargo.toml @@ -11,7 +11,7 @@ testexport = [] bitflags = "1.0.1" codec = { workspace = true } engine_traits = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } match-template = "0.0.1" thiserror = "1.0" tikv_alloc = { workspace = true } diff --git a/components/backup-stream/Cargo.toml b/components/backup-stream/Cargo.toml index 0f3b97461bb..b1a61580cb6 100644 --- a/components/backup-stream/Cargo.toml +++ b/components/backup-stream/Cargo.toml @@ -32,17 +32,18 @@ engine_traits = { workspace = true } error_code = { workspace = true } # We cannot update the etcd-client to latest version because of the cyclic requirement. # Also we need wait until https://github.com/etcdv3/etcd-client/pull/43/files to be merged. 
-etcd-client = { git = "https://github.com/pingcap/etcd-client", rev = "e0321a1990ee561cf042973666c0db61c8d82364", features = ["pub-response-field", "tls"] } +etcd-client = { git = "https://github.com/pingcap/etcd-client", rev = "14a6f8731f1890d5fd2f6e16a9f0d0a306b0599e", features = ["pub-response-field", "tls-openssl-vendored"] } external_storage = { workspace = true } external_storage_export = { workspace = true } fail = "0.5" file_system = { workspace = true } futures = "0.3" futures-io = "0.3" - grpcio = { workspace = true } hex = "0.4" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +# Fixing ahash cyclic dep: https://github.com/tkaitchuck/ahash/issues/95 +indexmap = "=1.6.2" +kvproto = { workspace = true } lazy_static = "1.4" log_wrappers = { workspace = true } online_config = { workspace = true } @@ -54,6 +55,7 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code raftstore = { workspace = true } regex = "1" resolved_ts = { workspace = true } +security = { path = "../security" } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } thiserror = "1" @@ -65,10 +67,10 @@ tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread", "macros", "time", "sync"] } tokio-stream = "0.1" tokio-util = { version = "0.7", features = ["compat"] } -tonic = "0.5" +tonic = "0.8" txn_types = { workspace = true } uuid = "0.8" -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [dev-dependencies] async-trait = "0.1" diff --git a/components/backup-stream/src/checkpoint_manager.rs b/components/backup-stream/src/checkpoint_manager.rs index 4b80eb44a2f..f34211ef7a5 100644 --- a/components/backup-stream/src/checkpoint_manager.rs +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -2,16 
+2,25 @@ use std::{collections::HashMap, sync::Arc, time::Duration}; +use futures::{ + channel::mpsc::{self as async_mpsc, Receiver, Sender}, + SinkExt, StreamExt, +}; +use grpcio::{RpcStatus, RpcStatusCode, ServerStreamingSink, WriteFlags}; use kvproto::{ errorpb::{Error as PbError, *}, + logbackuppb::{FlushEvent, SubscribeFlushEventResponse}, metapb::Region, }; use pd_client::PdClient; -use tikv_util::{info, worker::Scheduler}; +use tikv_util::{box_err, defer, info, warn, worker::Scheduler}; use txn_types::TimeStamp; +use uuid::Uuid; use crate::{ - errors::{Error, Result}, + annotate, + errors::{Error, ReportableResult, Result}, + future, metadata::{store::MetaStore, Checkpoint, CheckpointProvider, MetadataClient}, metrics, try_send, RegionCheckpointOperation, Task, }; @@ -20,11 +29,85 @@ use crate::{ /// This information is provided for the `advancer` in checkpoint V3, /// which involved a central node (typically TiDB) for collecting all regions' /// checkpoint then advancing the global checkpoint. -#[derive(Debug, Default)] +#[derive(Default)] pub struct CheckpointManager { items: HashMap, + manager_handle: Option>, +} + +impl std::fmt::Debug for CheckpointManager { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("CheckpointManager") + .field("items", &self.items) + .finish() + } +} + +enum SubscriptionOp { + Add(Subscription), + Emit(Box<[FlushEvent]>), } +struct SubscriptionManager { + subscribers: HashMap, + input: Receiver, +} + +impl SubscriptionManager { + pub async fn main_loop(mut self) { + info!("subscription manager started!"); + defer! 
{ info!("subscription manager exit.") } + while let Some(msg) = self.input.next().await { + match msg { + SubscriptionOp::Add(sub) => { + self.subscribers.insert(Uuid::new_v4(), sub); + } + SubscriptionOp::Emit(events) => { + let mut canceled = vec![]; + for (id, sub) in &mut self.subscribers { + let send_all = async { + for es in events.chunks(1024) { + let mut resp = SubscribeFlushEventResponse::new(); + resp.set_events(es.to_vec().into()); + sub.feed((resp, WriteFlags::default())).await?; + } + sub.flush().await + }; + + match send_all.await { + Err(grpcio::Error::RemoteStopped) => { + canceled.push(*id); + } + Err(err) => { + Error::from(err).report("sending subscription"); + } + _ => {} + } + } + + for c in canceled { + match self.subscribers.remove(&c) { + Some(mut sub) => { + info!("client is gone, removing subscription"; "id" => %c); + sub.close().await.report_if_err(format_args!( + "during removing subscription {}", + c + )) + } + None => { + warn!("BUG: the subscriber has been removed before we are going to remove it."; "id" => %c); + } + } + } + } + } + } + } +} + +// Note: can we make it more generic...? +pub type Subscription = ServerStreamingSink; + /// The result of getting a checkpoint. /// The possibility of failed to getting checkpoint is pretty high: /// because there is a gap between region leader change and flushing. @@ -76,8 +159,81 @@ impl CheckpointManager { self.items.clear(); } + pub fn spawn_subscription_mgr(&mut self) -> future![()] { + let (tx, rx) = async_mpsc::channel(1024); + let sub = SubscriptionManager { + subscribers: Default::default(), + input: rx, + }; + self.manager_handle = Some(tx); + sub.main_loop() + } + + pub fn update_region_checkpoints(&mut self, region_and_checkpoint: Vec<(Region, TimeStamp)>) { + for (region, checkpoint) in ®ion_and_checkpoint { + self.do_update(region, *checkpoint); + } + + self.notify(region_and_checkpoint.into_iter()); + } + /// update a region checkpoint in need. 
+ #[cfg(test)] pub fn update_region_checkpoint(&mut self, region: &Region, checkpoint: TimeStamp) { + self.do_update(region, checkpoint); + self.notify(std::iter::once((region.clone(), checkpoint))); + } + + pub fn add_subscriber(&mut self, sub: Subscription) -> future![Result<()>] { + let mgr = self.manager_handle.as_ref().cloned(); + + // NOTE: we cannot send the real error into the client directly because once + // we send the subscription into the sink, we cannot fetch it again :( + async move { + let mgr = mgr.ok_or(Error::Other(box_err!("subscription manager not get ready"))); + let mut mgr = match mgr { + Ok(mgr) => mgr, + Err(err) => { + sub.fail(RpcStatus::with_message( + RpcStatusCode::UNAVAILABLE, + "subscription manager not get ready.".to_owned(), + )) + .await + .map_err(|err| { + annotate!(err, "failed to send request to subscriber manager") + })?; + return Err(err); + } + }; + mgr.send(SubscriptionOp::Add(sub)) + .await + .map_err(|err| annotate!(err, "failed to send request to subscriber manager"))?; + Ok(()) + } + } + + fn notify(&mut self, items: impl Iterator) { + if let Some(mgr) = self.manager_handle.as_mut() { + let r = items + .map(|(r, ts)| { + let mut f = FlushEvent::new(); + f.set_checkpoint(ts.into_inner()); + f.set_start_key(r.start_key); + f.set_end_key(r.end_key); + f + }) + .collect::>(); + let event_size = r.len(); + let res = mgr.try_send(SubscriptionOp::Emit(r)); + // Note: perhaps don't batch in the channel but batch in the receiver side? + // If so, we can control the memory usage better. + if let Err(err) = res { + warn!("the channel is full, dropping some events."; "length" => %event_size, "err" => %err); + } + } + } + + fn do_update(&mut self, region: &Region, checkpoint: TimeStamp) { let e = self.items.entry(region.get_id()); e.and_modify(|old_cp| { if old_cp.checkpoint < checkpoint @@ -173,7 +329,7 @@ pub trait FlushObserver: Send + 'static { /// Note the new resolved ts cannot be greater than the old resolved ts. 
async fn rewrite_resolved_ts( &mut self, - #[allow(unused_variables)] task: &str, + #[allow(unused_variables)] _task: &str, ) -> Option { None } @@ -199,7 +355,7 @@ impl FlushObserver for BasicFlushObserver { .pd_cli .update_service_safe_point( format!("backup-stream-{}-{}", task, self.store_id), - TimeStamp::new(rts), + TimeStamp::new(rts.saturating_sub(1)), // Add a service safe point for 30 mins (6x the default flush interval). // It would probably be safe. Duration::from_secs(1800), @@ -299,12 +455,19 @@ where #[cfg(test)] mod tests { - use std::assert_matches; - + use std::{ + assert_matches, + collections::HashMap, + sync::{Arc, RwLock}, + time::Duration, + }; + + use futures::future::ok; use kvproto::metapb::*; + use pd_client::{PdClient, PdFuture}; use txn_types::TimeStamp; - use super::RegionIdWithVersion; + use super::{BasicFlushObserver, FlushObserver, RegionIdWithVersion}; use crate::GetCheckpointResult; fn region(id: u64, version: u64, conf_version: u64) -> Region { @@ -342,4 +505,50 @@ mod tests { let r = mgr.get_from_region(RegionIdWithVersion::new(1, 33)); assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 24); } + + struct MockPdClient { + safepoint: RwLock>, + } + + impl PdClient for MockPdClient { + fn update_service_safe_point( + &self, + name: String, + safepoint: TimeStamp, + _ttl: Duration, + ) -> PdFuture<()> { + // let _ = self.safepoint.insert(name, safepoint); + self.safepoint.write().unwrap().insert(name, safepoint); + + Box::pin(ok(())) + } + } + + impl MockPdClient { + fn new() -> Self { + Self { + safepoint: RwLock::new(HashMap::default()), + } + } + + fn get_service_safe_point(&self, name: String) -> Option { + self.safepoint.read().unwrap().get(&name).copied() + } + } + + #[tokio::test] + async fn test_after() { + let store_id = 1; + let pd_cli = Arc::new(MockPdClient::new()); + let mut flush_observer = BasicFlushObserver::new(pd_cli.clone(), store_id); + let task = 
String::from("test"); + let rts = 12345; + + let r = flush_observer.after(&task, rts).await; + assert_eq!(r.is_ok(), true); + + let serivce_id = format!("backup-stream-{}-{}", task, store_id); + let r = pd_cli.get_service_safe_point(serivce_id).unwrap(); + assert_eq!(r.into_inner(), rts - 1); + } } diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 22a415ca6bb..2ebeee2ea66 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -42,7 +42,7 @@ use crate::{ annotate, checkpoint_manager::{ BasicFlushObserver, CheckpointManager, CheckpointV3FlushObserver, FlushObserver, - GetCheckpointResult, RegionIdWithVersion, + GetCheckpointResult, RegionIdWithVersion, Subscription, }, errors::{Error, Result}, event_loader::{InitialDataLoader, PendingMemoryQuota}, @@ -165,6 +165,8 @@ where ((config.num_threads + 1) / 2).max(1), ); pool.spawn(op_loop); + let mut checkpoint_mgr = CheckpointManager::default(); + pool.spawn(checkpoint_mgr.spawn_subscription_mgr()); Endpoint { meta_client, range_router, @@ -183,7 +185,7 @@ where region_operator, failover_time: None, config, - checkpoint_mgr: Default::default(), + checkpoint_mgr, } } } @@ -887,11 +889,7 @@ where // Let's clear all stale checkpoints first. // Or they may slow down the global checkpoint. 
self.checkpoint_mgr.clear(); - for (region, checkpoint) in u { - debug!("setting region checkpoint"; "region" => %region.get_id(), "ts" => %checkpoint); - self.checkpoint_mgr - .update_region_checkpoint(®ion, checkpoint) - } + self.checkpoint_mgr.update_region_checkpoints(u); } RegionCheckpointOperation::Get(g, cb) => { let _guard = self.pool.handle().enter(); @@ -911,6 +909,14 @@ where .collect()), } } + RegionCheckpointOperation::Subscribe(sub) => { + let fut = self.checkpoint_mgr.add_subscriber(sub); + self.pool.spawn(async move { + if let Err(err) = fut.await { + err.report("adding subscription"); + } + }); + } } } @@ -957,6 +963,7 @@ pub enum RegionSet { pub enum RegionCheckpointOperation { Update(Vec<(Region, TimeStamp)>), Get(RegionSet, Box) + Send>), + Subscribe(Subscription), } impl fmt::Debug for RegionCheckpointOperation { @@ -964,6 +971,7 @@ impl fmt::Debug for RegionCheckpointOperation { match self { Self::Update(arg0) => f.debug_tuple("Update").field(arg0).finish(), Self::Get(arg0, _) => f.debug_tuple("Get").field(arg0).finish(), + Self::Subscribe(_) => f.debug_tuple("Subscription").finish(), } } } diff --git a/components/backup-stream/src/errors.rs b/components/backup-stream/src/errors.rs index 493cf28babc..a3f76e0255f 100644 --- a/components/backup-stream/src/errors.rs +++ b/components/backup-stream/src/errors.rs @@ -6,6 +6,7 @@ use std::{ use error_code::ErrorCodeExt; use etcd_client::Error as EtcdError; +use grpcio::Error as GrpcError; use kvproto::{errorpb::Error as StoreError, metapb::*}; use pd_client::Error as PdError; use protobuf::ProtobufError; @@ -18,6 +19,8 @@ use crate::{endpoint::Task, metrics}; #[derive(ThisError, Debug)] pub enum Error { + #[error("gRPC meet error {0}")] + Grpc(#[from] GrpcError), #[error("Etcd meet error {0}")] Etcd(#[from] EtcdError), #[error("Protobuf meet error {0}")] @@ -66,6 +69,7 @@ impl ErrorCodeExt for Error { Error::Other(_) => OTHER, Error::RaftStore(_) => RAFTSTORE, Error::ObserveCanceled(..) 
=> OBSERVE_CANCELED, + Error::Grpc(_) => GRPC, } } } @@ -115,6 +119,22 @@ where } } +pub trait ReportableResult { + fn report_if_err(self, context: impl ToString); +} + +impl ReportableResult for StdResult<(), E> +where + Error: From, +{ + #[inline(always)] + fn report_if_err(self, context: impl ToString) { + if let Err(err) = self { + Error::from(err).report(context.to_string()) + } + } +} + /// Like `errors.Annotate` in Go. /// Wrap an unknown error with [`Error::Other`]. #[macro_export(crate)] @@ -132,14 +152,14 @@ macro_rules! annotate { impl Error { pub fn report(&self, context: impl Display) { - warn!("backup stream meet error"; "context" => %context, "err" => %self); + warn!("backup stream meet error"; "context" => %context, "err" => %self, "verbose_err" => ?self); metrics::STREAM_ERROR .with_label_values(&[self.kind()]) .inc() } pub fn report_fatal(&self) { - error!(%self; "backup stream meet fatal error"); + error!(%self; "backup stream meet fatal error"; "verbose" => ?self, ); metrics::STREAM_FATAL_ERROR .with_label_values(&[self.kind()]) .inc() diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index 2ebf553e1cb..b7f1fcb2025 100644 --- a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -546,7 +546,7 @@ impl MetadataClient { )) .await?; - let mut result = Vec::with_capacity(all.len() as usize + 1); + let mut result = Vec::with_capacity(all.len() + 1); if !prev.kvs.is_empty() { let kv = &mut prev.kvs[0]; if kv.value() > start_key.as_slice() { diff --git a/components/backup-stream/src/metadata/store/lazy_etcd.rs b/components/backup-stream/src/metadata/store/lazy_etcd.rs index 8cd6b87ec71..6fc3a5332ea 100644 --- a/components/backup-stream/src/metadata/store/lazy_etcd.rs +++ b/components/backup-stream/src/metadata/store/lazy_etcd.rs @@ -2,9 +2,9 @@ use std::{sync::Arc, time::Duration}; -use etcd_client::{ConnectOptions, Error as EtcdError, 
TlsOptions}; +use etcd_client::{ConnectOptions, Error as EtcdError, OpenSslClientConfig}; use futures::Future; -use tikv_util::stream::RetryError; +use tikv_util::stream::{RetryError, RetryExt}; use tokio::sync::OnceCell; use super::{etcd::EtcdSnapshot, EtcdStore, MetaStore}; @@ -15,8 +15,9 @@ const RPC_TIMEOUT: Duration = Duration::from_secs(30); #[derive(Clone)] pub struct LazyEtcdClient(Arc); +#[derive(Debug)] pub struct ConnectionConfig { - pub tls: Option, + pub tls: Option, pub keep_alive_interval: Duration, pub keep_alive_timeout: Duration, } @@ -26,12 +27,16 @@ impl ConnectionConfig { fn to_connection_options(&self) -> ConnectOptions { let mut opts = ConnectOptions::new(); if let Some(tls) = &self.tls { - opts = opts.with_tls(tls.clone()) + opts = opts.with_openssl_tls( + OpenSslClientConfig::default() + .ca_cert_pem(&tls.ca) + .client_cert_pem_and_key(&tls.client_cert, &tls.client_key.0), + ) } opts = opts .with_keep_alive(self.keep_alive_interval, self.keep_alive_timeout) - .with_timeout(RPC_TIMEOUT) - .keep_alive_while_idle(false); + .with_keep_alive_while_idle(false) + .with_timeout(RPC_TIMEOUT); opts } @@ -68,7 +73,9 @@ fn etcd_error_is_retryable(etcd_err: &EtcdError) -> bool { EtcdError::InvalidArgs(_) | EtcdError::InvalidUri(_) | EtcdError::Utf8Error(_) - | EtcdError::InvalidHeaderValue(_) => false, + | EtcdError::InvalidHeaderValue(_) + | EtcdError::EndpointError(_) + | EtcdError::OpenSsl(_) => false, EtcdError::TransportError(_) | EtcdError::IoError(_) | EtcdError::WatchError(_) @@ -84,6 +91,7 @@ fn etcd_error_is_retryable(etcd_err: &EtcdError) -> bool { } } +#[derive(Debug)] struct RetryableEtcdError(EtcdError); impl RetryError for RetryableEtcdError { @@ -103,7 +111,11 @@ where F: Future>, { use futures::TryFutureExt; - let r = tikv_util::stream::retry(move || action().err_into::()).await; + let r = tikv_util::stream::retry_ext( + move || action().err_into::(), + RetryExt::default().with_fail_hook(|err| println!("meet error {:?}", err)), + ) + 
.await; r.map_err(|err| err.0.into()) } diff --git a/components/backup-stream/src/router.rs b/components/backup-stream/src/router.rs index 56bd00bba87..ead124c103a 100644 --- a/components/backup-stream/src/router.rs +++ b/components/backup-stream/src/router.rs @@ -1506,11 +1506,10 @@ struct TaskRange { #[cfg(test)] mod tests { - use std::{ffi::OsStr, marker::Unpin, time::Duration}; + use std::{ffi::OsStr, time::Duration}; - use external_storage::NoopStorage; + use external_storage::{ExternalData, NoopStorage}; use futures::AsyncReadExt; - use futures_io::AsyncRead; use kvproto::brpb::{Local, Noop, StorageBackend, StreamBackupTaskInfo}; use tikv_util::{ codec::number::NumberEncoder, @@ -1929,16 +1928,11 @@ mod tests { self.inner.write(name, reader, content_length).await } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> ExternalData<'_> { self.inner.read(name) } - fn read_part( - &self, - name: &str, - off: u64, - len: u64, - ) -> Box { + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { self.inner.read_part(name, off, len) } } @@ -2277,11 +2271,11 @@ mod tests { } } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> external_storage::ExternalData<'_> { self.s.read(name) } - fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + fn read_part(&self, name: &str, off: u64, len: u64) -> external_storage::ExternalData<'_> { self.s.read_part(name, off, len) } } diff --git a/components/backup-stream/src/service.rs b/components/backup-stream/src/service.rs index 47a149973b2..9d312a984d1 100644 --- a/components/backup-stream/src/service.rs +++ b/components/backup-stream/src/service.rs @@ -89,4 +89,16 @@ impl LogBackup for Service { )); try_send!(self.endpoint, t); } + + fn subscribe_flush_event( + &mut self, + _ctx: grpcio::RpcContext<'_>, + _req: kvproto::logbackuppb::SubscribeFlushEventRequest, + sink: grpcio::ServerStreamingSink, + ) { + try_send!( + self.endpoint, + 
Task::RegionCheckpointsOp(RegionCheckpointOperation::Subscribe(sink)) + ); + } } diff --git a/components/backup-stream/tests/mod.rs b/components/backup-stream/tests/mod.rs index 2cc6016aeb1..7256cd62c03 100644 --- a/components/backup-stream/tests/mod.rs +++ b/components/backup-stream/tests/mod.rs @@ -19,13 +19,15 @@ use backup_stream::{ }, observer::BackupStreamObserver, router::Router, - Endpoint, GetCheckpointResult, RegionCheckpointOperation, RegionSet, Task, + Endpoint, GetCheckpointResult, RegionCheckpointOperation, RegionSet, Service, Task, }; -use futures::{executor::block_on, AsyncWriteExt, Future}; -use grpcio::ChannelBuilder; +use futures::{executor::block_on, AsyncWriteExt, Future, Stream, StreamExt, TryStreamExt}; +use grpcio::{ChannelBuilder, Server, ServerBuilder}; use kvproto::{ brpb::{CompressionType, Local, Metadata, StorageBackend}, kvrpcpb::*, + logbackuppb::{SubscribeFlushEventRequest, SubscribeFlushEventResponse}, + logbackuppb_grpc::{create_log_backup, LogBackupClient}, tikvpb::*, }; use pd_client::PdClient; @@ -156,6 +158,8 @@ impl SuiteBuilder { }, obs: Default::default(), tikv_cli: Default::default(), + log_backup_cli: Default::default(), + servers: Default::default(), env: Arc::new(grpcio::Environment::new(1)), cluster, @@ -172,6 +176,8 @@ impl SuiteBuilder { cfg_f(&mut cfg); for id in 1..=(n as u64) { suite.start_endpoint(id, cfg.clone()); + let cli = suite.start_log_backup_client_on(id); + suite.log_backup_cli.insert(id, cli); } // We must wait until the endpoints get ready to watching the metastore, or some // modifies may be lost. Either make Endpoint::with_client wait until watch did @@ -222,8 +228,11 @@ pub struct Suite { meta_store: ErrorStore, cluster: Cluster, tikv_cli: HashMap, + log_backup_cli: HashMap, obs: HashMap, env: Arc, + // The place to make services live as long as suite. + servers: Vec, temp_files: TempDir, flushed_files: TempDir, @@ -263,6 +272,51 @@ impl Suite { worker } + /// create a subscription stream. 
this has simply asserted no error, because + /// in theory observing flushing should not emit error. change that if + /// needed. + fn flush_stream(&self) -> impl Stream { + let streams = self + .log_backup_cli + .iter() + .map(|(id, cli)| { + let stream = cli + .subscribe_flush_event(&{ + let mut r = SubscribeFlushEventRequest::default(); + r.set_client_id(format!("test-{}", id)); + r + }) + .unwrap_or_else(|err| panic!("failed to subscribe on {} because {}", id, err)); + let id = *id; + stream.map_ok(move |x| (id, x)).map(move |x| { + x.unwrap_or_else(move |err| panic!("failed to rec from {} because {}", id, err)) + }) + }) + .collect::>(); + + futures::stream::select_all(streams) + } + + fn start_log_backup_client_on(&mut self, id: u64) -> LogBackupClient { + let endpoint = self + .endpoints + .get(&id) + .expect("must register endpoint first"); + + let serv = Service::new(endpoint.scheduler()); + let builder = + ServerBuilder::new(self.env.clone()).register_service(create_log_backup(serv)); + let mut server = builder.bind("127.0.0.1", 0).build().unwrap(); + server.start(); + let (_, port) = server.bind_addrs().next().unwrap(); + let addr = format!("127.0.0.1:{}", port); + let channel = ChannelBuilder::new(self.env.clone()).connect(&addr); + println!("connecting channel to {} for store {}", addr, id); + let client = LogBackupClient::new(channel); + self.servers.push(server); + client + } + fn start_endpoint(&mut self, id: u64, mut cfg: BackupStreamConfig) { let cluster = &mut self.cluster; let worker = self.endpoints.get_mut(&id).unwrap(); @@ -476,7 +530,7 @@ impl Suite { decoder.close().await.unwrap(); let content = decoder.into_inner(); - let mut iter = EventIterator::new(content); + let mut iter = EventIterator::new(&content); loop { if !iter.valid() { break; @@ -747,8 +801,10 @@ mod test { errors::Error, router::TaskSelector, GetCheckpointResult, RegionCheckpointOperation, RegionSet, Task, }; + use futures::{Stream, StreamExt}; use pd_client::PdClient; use 
tikv_util::{box_err, defer, info, HandyRwLock}; + use tokio::time::timeout; use txn_types::{Key, TimeStamp}; use crate::{ @@ -1174,4 +1230,60 @@ mod test { checkpoint ); } + + async fn collect_current(mut s: impl Stream + Unpin, goal: usize) -> Vec { + let mut r = vec![]; + while let Ok(Some(x)) = timeout(Duration::from_secs(10), s.next()).await { + r.push(x); + if r.len() >= goal { + return r; + } + } + r + } + + #[test] + fn subscribe_flushing() { + let mut suite = super::SuiteBuilder::new_named("sub_flush").build(); + let stream = suite.flush_stream(); + for i in 1..10 { + let split_key = make_split_key_at_record(1, i * 20); + suite.must_split(&split_key); + suite.must_shuffle_leader(suite.cluster.get_region_id(&split_key)); + } + + let round1 = run_async_test(suite.write_records(0, 128, 1)); + suite.must_register_task(1, "sub_flush"); + let round2 = run_async_test(suite.write_records(256, 128, 1)); + suite.sync(); + suite.force_flush_files("sub_flush"); + + let mut items = run_async_test(async { + collect_current( + stream.flat_map(|(_, r)| futures::stream::iter(r.events.into_iter())), + 10, + ) + .await + }); + + items.sort_by(|x, y| x.start_key.cmp(&y.start_key)); + + println!("{:?}", items); + assert_eq!(items.len(), 10); + + assert_eq!(items.first().unwrap().start_key, Vec::::default()); + for w in items.windows(2) { + let a = &w[0]; + let b = &w[1]; + assert!(a.checkpoint > 512); + assert!(b.checkpoint > 512); + assert_eq!(a.end_key, b.start_key); + } + assert_eq!(items.last().unwrap().end_key, Vec::::default()); + + run_async_test(suite.check_for_write_records( + suite.flushed_files.path(), + round1.union(&round2).map(|x| x.as_slice()), + )); + } } diff --git a/components/backup/Cargo.toml b/components/backup/Cargo.toml index 17439a0f615..27f7d68e8e3 100644 --- a/components/backup/Cargo.toml +++ b/components/backup/Cargo.toml @@ -52,7 +52,7 @@ futures-util = { version = "0.3", default-features = false, features = ["io"] } grpcio = { workspace = true } hex 
= "0.4" keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } online_config = { workspace = true } @@ -74,7 +74,7 @@ tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } tokio-stream = "0.1" txn_types = { workspace = true } -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [dev-dependencies] rand = "0.8" diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index 92131381017..b880da7a3dc 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -19,7 +19,7 @@ use futures::{channel::mpsc::*, executor::block_on}; use kvproto::{ brpb::*, encryptionpb::EncryptionMethod, - kvrpcpb::{ApiVersion, Context, IsolationLevel}, + kvrpcpb::{ApiVersion, Context, IsolationLevel, KeyRange}, metapb::*, }; use online_config::OnlineConfig; @@ -59,6 +59,7 @@ const BACKUP_BATCH_LIMIT: usize = 1024; struct Request { start_key: Vec, end_key: Vec, + sub_ranges: Vec, start_ts: TimeStamp, end_ts: TimeStamp, limiter: Limiter, @@ -119,6 +120,7 @@ impl Task { request: Request { start_key: req.get_start_key().to_owned(), end_key: req.get_end_key().to_owned(), + sub_ranges: req.get_sub_ranges().to_owned(), start_ts: req.get_start_version().into(), end_ts: req.get_end_version().into(), backend: req.get_storage_backend().clone(), @@ -676,6 +678,8 @@ pub struct Endpoint { /// The progress of a backup task pub struct Progress { store_id: u64, + ranges: Vec<(Option, Option)>, + next_index: usize, next_start: Option, end_key: Option, region_info: R, @@ -685,7 +689,7 @@ pub struct Progress { } impl Progress { - fn new( + fn new_with_range( store_id: u64, next_start: Option, end_key: Option, @@ -693,14 +697,41 @@ impl Progress { codec: KeyValueCodec, cf: CfName, ) -> Self { - Progress { + let ranges = vec![(next_start, end_key)]; + 
Self::new_with_ranges(store_id, ranges, region_info, codec, cf) + } + + fn new_with_ranges( + store_id: u64, + ranges: Vec<(Option, Option)>, + region_info: R, + codec: KeyValueCodec, + cf: CfName, + ) -> Self { + let mut prs = Progress { store_id, - next_start, - end_key, + ranges, + next_index: 0, + next_start: None, + end_key: None, region_info, finished: false, codec, cf, + }; + prs.try_next(); + prs + } + + /// try the next range. If all the ranges are consumed, + /// set self.finish true. + fn try_next(&mut self) { + if self.ranges.len() > self.next_index { + (self.next_start, self.end_key) = self.ranges[self.next_index].clone(); + + self.next_index += 1; + } else { + self.finished = true; } } @@ -770,11 +801,12 @@ impl Progress { // region, we need to set the `finished` flag here in case // we run with `next_start` set to None if b.region.get_end_key().is_empty() || b.end_key == self.end_key { - self.finished = true; + self.try_next(); + } else { + self.next_start = b.end_key.clone(); } - self.next_start = b.end_key.clone(); } else { - self.finished = true; + self.try_next(); } branges } @@ -958,6 +990,39 @@ impl Endpoint { }); } + fn get_progress_by_req( + &self, + request: &Request, + codec: KeyValueCodec, + ) -> Arc>> { + if request.sub_ranges.is_empty() { + let start_key = codec.encode_backup_key(request.start_key.clone()); + let end_key = codec.encode_backup_key(request.end_key.clone()); + Arc::new(Mutex::new(Progress::new_with_range( + self.store_id, + start_key, + end_key, + self.region_info.clone(), + codec, + request.cf, + ))) + } else { + let mut ranges = Vec::with_capacity(request.sub_ranges.len()); + for k in &request.sub_ranges { + let start_key = codec.encode_backup_key(k.start_key.clone()); + let end_key = codec.encode_backup_key(k.end_key.clone()); + ranges.push((start_key, end_key)); + } + Arc::new(Mutex::new(Progress::new_with_ranges( + self.store_id, + ranges, + self.region_info.clone(), + codec, + request.cf, + ))) + } + } + pub fn 
handle_backup_task(&self, task: Task) { let Task { request, resp } = task; let codec = KeyValueCodec::new(request.is_raw_kv, self.api_version, request.dst_api_ver); @@ -996,17 +1061,9 @@ impl Endpoint { return; } } - let start_key = codec.encode_backup_key(request.start_key.clone()); - let end_key = codec.encode_backup_key(request.end_key.clone()); - let prs = Arc::new(Mutex::new(Progress::new( - self.store_id, - start_key, - end_key, - self.region_info.clone(), - codec, - request.cf, - ))); + let prs = self.get_progress_by_req(&request, codec); + let backend = match create_storage(&request.backend, self.get_config()) { Ok(backend) => backend, Err(err) => { @@ -1279,7 +1336,7 @@ pub mod tests { let temp = TempDir::new().unwrap(); let rocks = TestEngineBuilder::new() .path(temp.path()) - .cfs(&[ + .cfs([ engine_traits::CF_DEFAULT, engine_traits::CF_LOCK, engine_traits::CF_WRITE, @@ -1384,17 +1441,9 @@ pub mod tests { // Test seek backup range. let test_seek_backup_range = |start_key: &[u8], end_key: &[u8], expect: Vec<(&[u8], &[u8])>| { - let start_key = if start_key.is_empty() { - None - } else { - Some(Key::from_raw(start_key)) - }; - let end_key = if end_key.is_empty() { - None - } else { - Some(Key::from_raw(end_key)) - }; - let mut prs = Progress::new( + let start_key = (!start_key.is_empty()).then_some(Key::from_raw(start_key)); + let end_key = (!end_key.is_empty()).then_some(Key::from_raw(end_key)); + let mut prs = Progress::new_with_range( endpoint.store_id, start_key, end_key, @@ -1446,6 +1495,7 @@ pub mod tests { request: Request { start_key: start_key.to_vec(), end_key: end_key.to_vec(), + sub_ranges: Vec::new(), start_ts: 1.into(), end_ts: 1.into(), backend, @@ -1512,6 +1562,189 @@ pub mod tests { } } + #[test] + fn test_seek_ranges() { + let (_tmp, endpoint) = new_endpoint(); + + endpoint.region_info.set_regions(vec![ + (b"".to_vec(), b"1".to_vec(), 1), + (b"1".to_vec(), b"2".to_vec(), 2), + (b"3".to_vec(), b"4".to_vec(), 3), + (b"7".to_vec(), 
b"9".to_vec(), 4), + (b"9".to_vec(), b"".to_vec(), 5), + ]); + // Test seek backup range. + let test_seek_backup_ranges = + |sub_ranges: Vec<(&[u8], &[u8])>, expect: Vec<(&[u8], &[u8])>| { + let mut ranges = Vec::with_capacity(sub_ranges.len()); + for &(start_key, end_key) in &sub_ranges { + let start_key = (!start_key.is_empty()).then_some(Key::from_raw(start_key)); + let end_key = (!end_key.is_empty()).then_some(Key::from_raw(end_key)); + ranges.push((start_key, end_key)); + } + let mut prs = Progress::new_with_ranges( + endpoint.store_id, + ranges, + endpoint.region_info.clone(), + KeyValueCodec::new(false, ApiVersion::V1, ApiVersion::V1), + engine_traits::CF_DEFAULT, + ); + + let mut ranges = Vec::with_capacity(expect.len()); + while ranges.len() != expect.len() { + let n = (rand::random::() % 3) + 1; + let mut r = prs.forward(n); + // The returned backup ranges should <= n + assert!(r.len() <= n); + + if r.is_empty() { + // if return a empty vec then the progress is finished + assert_eq!( + ranges.len(), + expect.len(), + "got {:?}, expect {:?}", + ranges, + expect + ); + } + ranges.append(&mut r); + } + + for (a, b) in ranges.into_iter().zip(expect) { + assert_eq!( + a.start_key.map_or_else(Vec::new, |k| k.into_raw().unwrap()), + b.0 + ); + assert_eq!( + a.end_key.map_or_else(Vec::new, |k| k.into_raw().unwrap()), + b.1 + ); + } + }; + + // Test whether responses contain correct range. 
+ #[allow(clippy::blocks_in_if_conditions)] + let test_handle_backup_task_ranges = + |sub_ranges: Vec<(&[u8], &[u8])>, expect: Vec<(&[u8], &[u8])>| { + let tmp = TempDir::new().unwrap(); + let backend = make_local_backend(tmp.path()); + let (tx, rx) = unbounded(); + + let mut ranges = Vec::with_capacity(sub_ranges.len()); + for &(start_key, end_key) in &sub_ranges { + let key_range = KeyRange { + start_key: start_key.to_vec(), + end_key: end_key.to_vec(), + ..Default::default() + }; + ranges.push(key_range); + } + let task = Task { + request: Request { + start_key: b"1".to_vec(), + end_key: b"2".to_vec(), + sub_ranges: ranges, + start_ts: 1.into(), + end_ts: 1.into(), + backend, + limiter: Limiter::new(f64::INFINITY), + cancel: Arc::default(), + is_raw_kv: false, + dst_api_ver: ApiVersion::V1, + cf: engine_traits::CF_DEFAULT, + compression_type: CompressionType::Unknown, + compression_level: 0, + cipher: CipherInfo::default(), + }, + resp: tx, + }; + endpoint.handle_backup_task(task); + let resps: Vec<_> = block_on(rx.collect()); + for a in &resps { + assert!( + expect + .iter() + .any(|b| { a.get_start_key() == b.0 && a.get_end_key() == b.1 }), + "{:?} {:?}", + resps, + expect + ); + } + assert_eq!(resps.len(), expect.len()); + }; + + // Backup range from case.0 to case.1, + // the case.2 is the expected results. 
+ type Case<'a> = (Vec<(&'a [u8], &'a [u8])>, Vec<(&'a [u8], &'a [u8])>); + + let case: Vec> = vec![ + ( + vec![(b"", b"1"), (b"1", b"2")], + vec![(b"", b"1"), (b"1", b"2")], + ), + ( + vec![(b"", b"2"), (b"3", b"4")], + vec![(b"", b"1"), (b"1", b"2"), (b"3", b"4")], + ), + ( + vec![(b"7", b"8"), (b"8", b"9")], + vec![(b"7", b"8"), (b"8", b"9")], + ), + ( + vec![(b"8", b"9"), (b"6", b"8")], + vec![(b"8", b"9"), (b"7", b"8")], + ), + ( + vec![(b"8", b"85"), (b"88", b"89"), (b"7", b"8")], + vec![(b"8", b"85"), (b"88", b"89"), (b"7", b"8")], + ), + ( + vec![(b"8", b"85"), (b"", b"35"), (b"88", b"89"), (b"7", b"8")], + vec![ + (b"8", b"85"), + (b"", b"1"), + (b"1", b"2"), + (b"3", b"35"), + (b"88", b"89"), + (b"7", b"8"), + ], + ), + (vec![(b"", b"1")], vec![(b"", b"1")]), + (vec![(b"", b"2")], vec![(b"", b"1"), (b"1", b"2")]), + (vec![(b"1", b"2")], vec![(b"1", b"2")]), + (vec![(b"1", b"3")], vec![(b"1", b"2")]), + (vec![(b"1", b"4")], vec![(b"1", b"2"), (b"3", b"4")]), + (vec![(b"4", b"5")], vec![]), + (vec![(b"4", b"6")], vec![]), + (vec![(b"4", b"6"), (b"6", b"7")], vec![]), + (vec![(b"2", b"3"), (b"4", b"6"), (b"6", b"7")], vec![]), + (vec![(b"2", b"7")], vec![(b"3", b"4")]), + (vec![(b"7", b"8")], vec![(b"7", b"8")]), + ( + vec![(b"3", b"")], + vec![(b"3", b"4"), (b"7", b"9"), (b"9", b"")], + ), + (vec![(b"5", b"")], vec![(b"7", b"9"), (b"9", b"")]), + (vec![(b"7", b"")], vec![(b"7", b"9"), (b"9", b"")]), + (vec![(b"8", b"91")], vec![(b"8", b"9"), (b"9", b"91")]), + (vec![(b"8", b"")], vec![(b"8", b"9"), (b"9", b"")]), + ( + vec![(b"", b"")], + vec![ + (b"", b"1"), + (b"1", b"2"), + (b"3", b"4"), + (b"7", b"9"), + (b"9", b""), + ], + ), + ]; + for (ranges, expect_ranges) in case { + test_seek_backup_ranges(ranges.clone(), expect_ranges.clone()); + test_handle_backup_task_ranges(ranges, expect_ranges); + } + } + #[test] fn test_handle_backup_task() { let limiter = Arc::new(IoRateLimiter::new_for_test()); diff --git a/components/backup/src/writer.rs 
b/components/backup/src/writer.rs index 4e0750bd7d8..7a853fe485f 100644 --- a/components/backup/src/writer.rs +++ b/components/backup/src/writer.rs @@ -443,7 +443,7 @@ mod tests { let temp = TempDir::new().unwrap(); let rocks = TestEngineBuilder::new() .path(temp.path()) - .cfs(&[engine_traits::CF_DEFAULT, engine_traits::CF_WRITE]) + .cfs([engine_traits::CF_DEFAULT, engine_traits::CF_WRITE]) .build() .unwrap(); let db = rocks.get_rocksdb(); @@ -480,7 +480,7 @@ mod tests { let temp = TempDir::new().unwrap(); let rocks = TestEngineBuilder::new() .path(temp.path()) - .cfs(&[ + .cfs([ engine_traits::CF_DEFAULT, engine_traits::CF_LOCK, engine_traits::CF_WRITE, diff --git a/components/batch-system/src/metrics.rs b/components/batch-system/src/metrics.rs index 9edcd656bf4..a4728f32ad7 100644 --- a/components/batch-system/src/metrics.rs +++ b/components/batch-system/src/metrics.rs @@ -10,4 +10,11 @@ lazy_static! { &["type"] ) .unwrap(); + + pub static ref BROADCAST_NORMAL_DURATION: Histogram = + register_histogram!( + "tikv_broadcast_normal_duration_seconds", + "Duration of broadcasting normals.", + exponential_buckets(0.001, 1.59, 20).unwrap() // max 10s + ).unwrap(); } diff --git a/components/batch-system/src/router.rs b/components/batch-system/src/router.rs index 660ab014939..bfcb93c9d6b 100644 --- a/components/batch-system/src/router.rs +++ b/components/batch-system/src/router.rs @@ -12,12 +12,17 @@ use std::{ use collections::HashMap; use crossbeam::channel::{SendError, TrySendError}; -use tikv_util::{debug, info, lru::LruCache, Either}; +use tikv_util::{ + debug, info, + lru::LruCache, + time::{duration_to_sec, Instant}, + Either, +}; use crate::{ fsm::{Fsm, FsmScheduler, FsmState}, mailbox::{BasicMailbox, Mailbox}, - metrics::CHANNEL_FULL_COUNTER_VEC, + metrics::*, }; /// A struct that traces the approximate memory usage of router. @@ -289,7 +294,7 @@ where } } - /// Force sending message to control FSM. + /// Sending message to control FSM. 
#[inline] pub fn send_control(&self, msg: C::Message) -> Result<(), TrySendError> { match self.control_box.try_send(msg, &self.control_scheduler) { @@ -304,12 +309,20 @@ where } } + /// Force sending message to control FSM. + #[inline] + pub fn force_send_control(&self, msg: C::Message) -> Result<(), SendError> { + self.control_box.force_send(msg, &self.control_scheduler) + } + /// Try to notify all normal FSMs a message. pub fn broadcast_normal(&self, mut msg_gen: impl FnMut() -> N::Message) { + let timer = Instant::now_coarse(); let mailboxes = self.normals.lock().unwrap(); for mailbox in mailboxes.map.values() { let _ = mailbox.force_send(msg_gen(), &self.normal_scheduler); } + BROADCAST_NORMAL_DURATION.observe(duration_to_sec(timer.saturating_elapsed())); } /// Try to notify all FSMs that the cluster is being shutdown. diff --git a/components/causal_ts/Cargo.toml b/components/causal_ts/Cargo.toml index d05e9b66ddd..a5dd62cd5d2 100644 --- a/components/causal_ts/Cargo.toml +++ b/components/causal_ts/Cargo.toml @@ -16,7 +16,7 @@ enum_dispatch = "0.3.8" error_code = { workspace = true } fail = "0.5" futures = { version = "0.3" } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } parking_lot = "0.12" diff --git a/components/causal_ts/benches/tso.rs b/components/causal_ts/benches/tso.rs index 72d381a4be7..f7e1980d15f 100644 --- a/components/causal_ts/benches/tso.rs +++ b/components/causal_ts/benches/tso.rs @@ -19,11 +19,7 @@ fn bench_batch_tso_list_pop(c: &mut Criterion) { batch_list.flush(); for i in 0..CAPACITY { batch_list - .push( - batch_size as u32, - TimeStamp::compose(i as u64, batch_size), - false, - ) + .push(batch_size as u32, TimeStamp::compose(i, batch_size), false) .unwrap(); } }, diff --git a/components/causal_ts/src/lib.rs b/components/causal_ts/src/lib.rs index 3eb59f35c36..ab57fbf734f 100644 --- a/components/causal_ts/src/lib.rs +++ 
b/components/causal_ts/src/lib.rs @@ -1,6 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -#![feature(map_first_last)] // For `BTreeMap::pop_first`. #![feature(div_duration)] #[macro_use] diff --git a/components/causal_ts/src/tso.rs b/components/causal_ts/src/tso.rs index 5056cfe2ebd..e63c3c2c3ba 100644 --- a/components/causal_ts/src/tso.rs +++ b/components/causal_ts/src/tso.rs @@ -712,7 +712,7 @@ pub mod tests { for (i, (remain, usage, need_flush, expected)) in cases.into_iter().enumerate() { let batch_list = Arc::new(TsoBatchList { inner: Default::default(), - tso_remain: AtomicI32::new(remain as i32), + tso_remain: AtomicI32::new(remain), tso_usage: AtomicU32::new(usage), capacity: cache_multiplier, }); diff --git a/components/cdc/Cargo.toml b/components/cdc/Cargo.toml index 62ef4cc29f5..94d80bf1d9f 100644 --- a/components/cdc/Cargo.toml +++ b/components/cdc/Cargo.toml @@ -26,7 +26,6 @@ portable = ["tikv/portable"] sse = ["tikv/sse"] mem-profiling = ["tikv/mem-profiling"] failpoints = ["tikv/failpoints"] -pprof-fp = ["tikv/pprof-fp"] [dependencies] api_version = { workspace = true } @@ -43,7 +42,7 @@ futures-timer = "3.0" getset = "0.1" grpcio = { workspace = true } keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } online_config = { workspace = true } diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index de38a7b1fc8..120806588dc 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -129,6 +129,7 @@ pub struct Downstream { sink: Option, state: Arc>, kv_api: ChangeDataRequestKvApi, + filter_loop: bool, } impl Downstream { @@ -142,6 +143,7 @@ impl Downstream { req_id: u64, conn_id: ConnId, kv_api: ChangeDataRequestKvApi, + filter_loop: bool, ) -> Downstream { Downstream { id: DownstreamId::new(), @@ -152,6 +154,7 @@ impl Downstream { sink: None, state: 
Arc::new(AtomicCell::new(DownstreamState::default())), kv_api, + filter_loop, } } @@ -203,6 +206,10 @@ impl Downstream { self.id } + pub fn get_filter_loop(&self) -> bool { + self.filter_loop + } + pub fn get_state(&self) -> Arc> { self.state.clone() } @@ -471,6 +478,7 @@ impl Delegate { region_id: u64, request_id: u64, entries: Vec>, + filter_loop: bool, ) -> Result> { let entries_len = entries.len(); let mut rows = vec![Vec::with_capacity(entries_len)]; @@ -527,6 +535,10 @@ impl Delegate { row_size = 0; } } + // if the `txn_source` is not 0 and we should filter it out, skip this event. + if row.txn_source != 0 && filter_loop { + continue; + } if current_rows_size + row_size >= CDC_EVENT_MAX_BYTES { rows.push(Vec::with_capacity(entries_len)); current_rows_size = 0; @@ -620,6 +632,48 @@ impl Delegate { if entries.is_empty() { return Ok(()); } + + let downstreams = self.downstreams(); + assert!( + !downstreams.is_empty(), + "region {} miss downstream", + self.region_id + ); + + let mut need_filter = false; + for ds in downstreams { + if ds.filter_loop { + need_filter = true; + break; + } + } + + // collect the change event cause by user write, which is `txn_source` = 0. + // for changefeed which only need the user write, send the `filtered`, or else, + // send them all. 
+ let filtered = if need_filter { + let filtered = entries + .iter() + .filter(|x| x.txn_source == 0) + .cloned() + .collect::>(); + if filtered.is_empty() { + None + } else { + Some(Event { + region_id: self.region_id, + index, + event: Some(Event_oneof_event::Entries(EventEntries { + entries: filtered.into(), + ..Default::default() + })), + ..Default::default() + }) + } + } else { + None + }; + let event_entries = EventEntries { entries: entries.into(), ..Default::default() @@ -630,6 +684,7 @@ impl Delegate { event: Some(Event_oneof_event::Entries(event_entries)), ..Default::default() }; + let send = move |downstream: &Downstream| { // No ready downstream or a downstream that does not match the kv_api type, will // be ignored. There will be one region that contains both Txn & Raw entries. @@ -637,7 +692,15 @@ impl Delegate { if !downstream.state.load().ready_for_change_events() || downstream.kv_api != kv_api { return Ok(()); } - let event = change_data_event.clone(); + if downstream.filter_loop && filtered.is_none() { + return Ok(()); + } + + let event = if downstream.filter_loop { + filtered.clone().unwrap() + } else { + change_data_event.clone() + }; // Do not force send for real time change data events. let force_send = false; downstream.sink_event(event, force_send) @@ -918,6 +981,7 @@ fn decode_write( } }; let commit_ts = if write.write_type == WriteType::Rollback { + assert_eq!(write.txn_source, 0); 0 } else { key.decode_ts().unwrap().into_inner() @@ -926,6 +990,8 @@ fn decode_write( row.commit_ts = commit_ts; row.key = key.truncate_ts().unwrap().into_raw().unwrap(); row.op_type = op_type as _; + // used for filter out the event. see `txn_source` field for more detail. 
+ row.txn_source = write.txn_source; set_event_row_type(row, r_type); if let Some(value) = write.short_value { row.value = value; @@ -952,6 +1018,8 @@ fn decode_lock(key: Vec, lock: Lock, row: &mut EventRow, has_value: &mut boo row.start_ts = lock.ts.into_inner(); row.key = key.into_raw().unwrap(); row.op_type = op_type as _; + // used for filter out the event. see `txn_source` field for more detail. + row.txn_source = lock.txn_source; set_event_row_type(row, EventLogType::Prewrite); if let Some(value) = lock.short_value { row.value = value; @@ -1021,6 +1089,7 @@ mod tests { request_id, ConnId::new(), ChangeDataRequestKvApi::TiDb, + false, ); downstream.set_sink(sink); let mut delegate = Delegate::new(region_id, Default::default()); @@ -1138,7 +1207,14 @@ mod tests { let mut epoch = RegionEpoch::default(); epoch.set_conf_ver(region_version); epoch.set_version(region_version); - Downstream::new(peer, epoch, id, ConnId::new(), ChangeDataRequestKvApi::TiDb) + Downstream::new( + peer, + epoch, + id, + ConnId::new(), + ChangeDataRequestKvApi::TiDb, + false, + ) }; // Create a new delegate. diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 614e282a5d9..6d64754d042 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -40,7 +40,7 @@ use tikv_util::{ mpsc::bounded, slow_log, sys::thread::ThreadBuildWrapper, - time::{Limiter, SlowTimer}, + time::{Instant, Limiter, SlowTimer}, timer::SteadyTimer, warn, worker::{Runnable, RunnableWithTimer, ScheduleError, Scheduler}, @@ -154,6 +154,8 @@ pub enum Task { }, RegisterMinTsEvent { leader_resolver: LeadershipResolver, + // The time at which the event actually occurred. + event_time: Instant, }, // The result of ChangeCmd should be returned from CDC Endpoint to ensure // the downstream switches to Normal after the previous commands was sunk. 
@@ -222,7 +224,9 @@ impl fmt::Debug for Task { .field("observe_id", &observe_id) .field("region_id", ®ion.get_id()) .finish(), - Task::RegisterMinTsEvent { .. } => de.field("type", &"register_min_ts").finish(), + Task::RegisterMinTsEvent { ref event_time, .. } => { + de.field("event_time", &event_time).finish() + } Task::InitDownstream { ref region_id, ref downstream_id, @@ -447,13 +451,12 @@ impl, E: KvEngine> Endpoint { resolved_region_count: 0, unresolved_region_count: 0, sink_memory_quota, - // store_resolver, // Log the first resolved ts warning. warn_resolved_ts_repeat_count: WARN_RESOLVED_TS_COUNT_THRESHOLD, current_ts: TimeStamp::zero(), causal_ts_provider, }; - ep.register_min_ts_event(leader_resolver); + ep.register_min_ts_event(leader_resolver, Instant::now()); ep } @@ -617,6 +620,7 @@ impl, E: KvEngine> Endpoint { let api_version = self.api_version; let downstream_id = downstream.get_id(); let downstream_state = downstream.get_state(); + let filter_loop = downstream.get_filter_loop(); // Register must follow OpenConn, so the connection must be available. let conn = self.connections.get_mut(&conn_id).unwrap(); @@ -743,6 +747,7 @@ impl, E: KvEngine> Endpoint { build_resolver: is_new_delegate, ts_filter_ratio: self.config.incremental_scan_ts_filter_ratio, kv_api, + filter_loop, }; let raft_router = self.raft_router.clone(); @@ -996,16 +1001,20 @@ impl, E: KvEngine> Endpoint { let _ = downstream.sink_event(resolved_ts_event, force_send); } - fn register_min_ts_event(&self, mut leader_resolver: LeadershipResolver) { - let timeout = self.timer.delay(self.config.min_ts_interval.0); + fn register_min_ts_event(&self, mut leader_resolver: LeadershipResolver, event_time: Instant) { + // Try to keep advance resolved ts every `min_ts_interval`, thus + // the actual wait interval = `min_ts_interval` - the last register min_ts event + // time. 
+ let interval = self + .config + .min_ts_interval + .0 + .checked_sub(event_time.saturating_elapsed()); + let timeout = self.timer.delay(interval.unwrap_or_default()); let pd_client = self.pd_client.clone(); let scheduler = self.scheduler.clone(); let raft_router = self.raft_router.clone(); - let regions: Vec = self - .capture_regions - .iter() - .map(|(region_id, _)| *region_id) - .collect(); + let regions: Vec = self.capture_regions.keys().copied().collect(); let cm: ConcurrencyManager = self.concurrency_manager.clone(); let hibernate_regions_compatible = self.config.hibernate_regions_compatible; let causal_ts_provider = self.causal_ts_provider.clone(); @@ -1043,7 +1052,10 @@ impl, E: KvEngine> Endpoint { defer!({ slow_log!(T slow_timer, "cdc resolve region leadership"); if let Ok(leader_resolver) = leader_resolver_rx.try_recv() { - match scheduler.schedule(Task::RegisterMinTsEvent { leader_resolver }) { + match scheduler.schedule(Task::RegisterMinTsEvent { + leader_resolver, + event_time: Instant::now(), + }) { Ok(_) | Err(ScheduleError::Stopped(_)) => (), // Must schedule `RegisterMinTsEvent` event otherwise resolved ts can not // advance normally. @@ -1129,8 +1141,9 @@ impl, E: KvEngine> Runnable for Endpoint { } => self.on_multi_batch(multi, old_value_cb), Task::OpenConn { conn } => self.on_open_conn(conn), Task::RegisterMinTsEvent { - leader_resolver: store_resolver, - } => self.register_min_ts_event(store_resolver), + leader_resolver, + event_time, + } => self.register_min_ts_event(leader_resolver, event_time), Task::InitDownstream { region_id, downstream_id, @@ -1384,7 +1397,10 @@ mod tests { #[test] fn test_api_version_check() { - let cfg = CdcConfig::default(); + let mut cfg = CdcConfig::default(); + // To make the case more stable. 
+ cfg.min_ts_interval = ReadableDuration(Duration::from_secs(1)); + let mut suite = mock_endpoint(&cfg, None, ApiVersion::V1); suite.add_region(1, 100); let quota = crate::channel::MemoryQuota::new(usize::MAX); @@ -1409,6 +1425,7 @@ mod tests { 1, conn_id, ChangeDataRequestKvApi::RawKv, + false, ); req.set_kv_api(ChangeDataRequestKvApi::RawKv); suite.run(Task::Register { @@ -1444,6 +1461,7 @@ mod tests { 2, conn_id, ChangeDataRequestKvApi::TxnKv, + false, ); req.set_kv_api(ChangeDataRequestKvApi::TxnKv); suite.run(Task::Register { @@ -1480,6 +1498,7 @@ mod tests { 3, conn_id, ChangeDataRequestKvApi::TxnKv, + false, ); req.set_kv_api(ChangeDataRequestKvApi::TxnKv); suite.run(Task::Register { @@ -1524,7 +1543,7 @@ mod tests { } let diff = cfg.diff(&updated_cfg); ep.run(Task::ChangeConfig(diff)); - assert_eq!(ep.config.min_ts_interval, ReadableDuration::secs(1)); + assert_eq!(ep.config.min_ts_interval, ReadableDuration::millis(200)); assert_eq!(ep.config.hibernate_regions_compatible, true); { @@ -1658,6 +1677,7 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); suite.run(Task::Register { request: req, @@ -1704,6 +1724,7 @@ mod tests { 1, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); // Enable batch resolved ts in the test. 
let version = FeatureGate::batch_resolved_ts(); @@ -1726,6 +1747,7 @@ mod tests { 2, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); suite.run(Task::Register { request: req.clone(), @@ -1762,6 +1784,7 @@ mod tests { 3, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); suite.run(Task::Register { request: req, @@ -1806,6 +1829,7 @@ mod tests { 1, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); suite.add_local_reader(100); suite.run(Task::Register { @@ -1837,6 +1861,7 @@ mod tests { 1, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); suite.run(Task::Register { request: req, @@ -1871,7 +1896,10 @@ mod tests { let mut suite = mock_endpoint_with_ts_provider(&cfg, None, ApiVersion::V2, Some(ts_provider.clone())); let leader_resolver = suite.leader_resolver.take().unwrap(); - suite.run(Task::RegisterMinTsEvent { leader_resolver }); + suite.run(Task::RegisterMinTsEvent { + leader_resolver, + event_time: Instant::now(), + }); suite .task_rx .recv_timeout(Duration::from_millis(1500)) @@ -1909,6 +1937,7 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); downstream.get_state().store(DownstreamState::Normal); // Enable batch resolved ts in the test. 
@@ -1945,6 +1974,7 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); downstream.get_state().store(DownstreamState::Normal); suite.add_region(2, 100); @@ -1990,6 +2020,7 @@ mod tests { 3, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); downstream.get_state().store(DownstreamState::Normal); suite.add_region(3, 100); @@ -2060,6 +2091,7 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); let downstream_id = downstream.get_id(); suite.run(Task::Register { @@ -2102,6 +2134,7 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); let new_downstream_id = downstream.get_id(); suite.run(Task::Register { @@ -2153,6 +2186,7 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); suite.run(Task::Register { request: req, @@ -2207,6 +2241,7 @@ mod tests { 0, conn_id, ChangeDataRequestKvApi::TiDb, + false, ); downstream.get_state().store(DownstreamState::Normal); suite.run(Task::Register { @@ -2324,6 +2359,7 @@ mod tests { 0, conn_id_a, ChangeDataRequestKvApi::TiDb, + false, ); suite.run(Task::Register { request: req.clone(), @@ -2347,6 +2383,7 @@ mod tests { 0, conn_id_b, ChangeDataRequestKvApi::TiDb, + false, ); suite.run(Task::Register { request: req.clone(), diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 36c1636a7e8..38c8603900e 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -96,6 +96,8 @@ pub(crate) struct Initializer { pub(crate) ts_filter_ratio: f64, pub(crate) kv_api: ChangeDataRequestKvApi, + + pub(crate) filter_loop: bool, } impl Initializer { @@ -425,8 +427,12 @@ impl Initializer { async fn sink_scan_events(&mut self, entries: Vec>, done: bool) -> Result<()> { let mut barrier = None; - let mut events = - Delegate::convert_to_grpc_events(self.region_id, self.request_id, entries)?; + let mut events = Delegate::convert_to_grpc_events( + self.region_id, + self.request_id, + entries, + self.filter_loop, + )?; if done { let 
(cb, fut) = tikv_util::future::paired_future_callback(); events.push(CdcEvent::Barrier(Some(cb))); @@ -558,13 +564,17 @@ mod tests { use engine_rocks::RocksEngine; use engine_traits::{MiscExt, CF_WRITE}; use futures::{executor::block_on, StreamExt}; - use kvproto::{cdcpb::Event_oneof_event, errorpb::Error as ErrorHeader}; + use kvproto::{ + cdcpb::{EventLogType, Event_oneof_event}, + errorpb::Error as ErrorHeader, + }; use raftstore::{coprocessor::ObserveHandle, store::RegionSnapshot}; use test_raftstore::MockRaftStoreRouter; use tikv::storage::{ kv::Engine, txn::tests::{ must_acquire_pessimistic_lock, must_commit, must_prewrite_delete, must_prewrite_put, + must_prewrite_put_with_txn_soucre, }, TestEngineBuilder, }; @@ -601,6 +611,7 @@ mod tests { buffer: usize, engine: Option, kv_api: ChangeDataRequestKvApi, + filter_loop: bool, ) -> ( LazyWorker, Runtime, @@ -645,6 +656,7 @@ mod tests { build_resolver: true, ts_filter_ratio: 1.0, // always enable it. kv_api, + filter_loop, }; (receiver_worker, pool, initializer, rx, drain) @@ -686,6 +698,7 @@ mod tests { buffer, engine.kv_engine(), ChangeDataRequestKvApi::TiDb, + false, ); let check_result = || loop { let task = rx.recv().unwrap(); @@ -754,6 +767,53 @@ mod tests { worker.stop(); } + #[test] + fn test_initializer_filter_loop() { + let mut engine = TestEngineBuilder::new().build_without_cache().unwrap(); + + let mut total_bytes = 0; + + for i in 10..100 { + let (k, v) = (&[b'k', i], &[b'v', i]); + total_bytes += k.len(); + total_bytes += v.len(); + let ts = TimeStamp::new(i as _); + must_prewrite_put_with_txn_soucre(&mut engine, k, v, k, ts, 1); + } + + let snap = engine.snapshot(Default::default()).unwrap(); + // Buffer must be large enough to unblock async incremental scan. 
+ let buffer = 1000; + let (mut worker, pool, mut initializer, _rx, mut drain) = mock_initializer( + total_bytes, + buffer, + engine.kv_engine(), + ChangeDataRequestKvApi::TiDb, + true, + ); + let th = pool.spawn(async move { + initializer + .async_incremental_scan(snap, Region::default()) + .await + .unwrap(); + }); + let mut drain = drain.drain(); + while let Some((event, _)) = block_on(drain.next()) { + let event = match event { + CdcEvent::Event(x) if x.event.is_some() => x.event.unwrap(), + _ => continue, + }; + let entries = match event { + Event_oneof_event::Entries(mut x) => x.take_entries().into_vec(), + _ => continue, + }; + assert_eq!(entries.len(), 1); + assert_eq!(entries[0].get_type(), EventLogType::Initialized); + } + block_on(th).unwrap(); + worker.stop(); + } + // Test `hint_min_ts` works fine with `ExtraOp::ReadOldValue`. // Whether `DeltaScanner` emits correct old values or not is already tested by // another case `test_old_value_with_hint_min_ts`, so here we only care about @@ -782,6 +842,7 @@ mod tests { 1000, engine.kv_engine(), ChangeDataRequestKvApi::TiDb, + false, ); initializer.checkpoint_ts = checkpoint_ts.into(); let mut drain = drain.drain(); @@ -840,8 +901,13 @@ mod tests { fn test_initializer_deregister_downstream() { let total_bytes = 1; let buffer = 1; - let (mut worker, _pool, mut initializer, rx, _drain) = - mock_initializer(total_bytes, buffer, None, ChangeDataRequestKvApi::TiDb); + let (mut worker, _pool, mut initializer, rx, _drain) = mock_initializer( + total_bytes, + buffer, + None, + ChangeDataRequestKvApi::TiDb, + false, + ); // Errors reported by region should deregister region. 
initializer.build_resolver = false; @@ -891,7 +957,7 @@ mod tests { let total_bytes = 1; let buffer = 1; let (mut worker, pool, mut initializer, _rx, _drain) = - mock_initializer(total_bytes, buffer, None, kv_api); + mock_initializer(total_bytes, buffer, None, kv_api, false); let change_cmd = ChangeObserver::from_cdc(1, ObserveHandle::new()); let raft_router = MockRaftStoreRouter::new(); diff --git a/components/cdc/src/old_value.rs b/components/cdc/src/old_value.rs index 1149d8ce3e0..d91266c92c2 100644 --- a/components/cdc/src/old_value.rs +++ b/components/cdc/src/old_value.rs @@ -1,6 +1,6 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. -use std::ops::Deref; +use std::ops::{Bound, Deref}; use engine_traits::{ReadOptions, CF_DEFAULT, CF_WRITE}; use getset::CopyGetters; @@ -261,7 +261,7 @@ fn new_write_cursor_on_key(snapshot: &S, key: &Key) -> Cursor .range(Some(key.clone()), upper) // Use bloom filter to speed up seeking on a given prefix. .prefix_seek(true) - .hint_max_ts(Some(ts)) + .hint_max_ts(Some(Bound::Included(ts))) .build() .unwrap() } @@ -341,8 +341,8 @@ mod tests { old_value_cache.cache.insert(key, value.clone()); } - assert_eq!(old_value_cache.cache.size(), size * cases as usize); - assert_eq!(old_value_cache.cache.len(), cases as usize); + assert_eq!(old_value_cache.cache.size(), size * cases); + assert_eq!(old_value_cache.cache.len(), cases); assert_eq!(old_value_cache.capacity(), capacity as usize); // Reduces capacity. 
@@ -360,7 +360,7 @@ mod tests { assert_eq!(old_value_cache.cache.size(), size * remaining_count); assert_eq!(old_value_cache.cache.len(), remaining_count); - assert_eq!(old_value_cache.capacity(), new_capacity as usize); + assert_eq!(old_value_cache.capacity(), new_capacity); for i in dropped_count..cases { let key = Key::from_raw(&i.to_be_bytes()); assert_eq!(old_value_cache.cache.get(&key).is_some(), true); diff --git a/components/cdc/src/service.rs b/components/cdc/src/service.rs index e7bec568f67..f9665283c45 100644 --- a/components/cdc/src/service.rs +++ b/components/cdc/src/service.rs @@ -240,8 +240,14 @@ impl ChangeData for Service { semver::Version::new(0, 0, 0) } }; - let downstream = - Downstream::new(peer.clone(), region_epoch, req_id, conn_id, req_kvapi); + let downstream = Downstream::new( + peer.clone(), + region_epoch, + req_id, + conn_id, + req_kvapi, + request.filter_loop, + ); let ret = scheduler .schedule(Task::Register { request, diff --git a/components/cdc/tests/failpoints/test_endpoint.rs b/components/cdc/tests/failpoints/test_endpoint.rs index 6e208ccac90..3fdd6048971 100644 --- a/components/cdc/tests/failpoints/test_endpoint.rs +++ b/components/cdc/tests/failpoints/test_endpoint.rs @@ -8,7 +8,7 @@ use std::{ use api_version::{test_kv_format_impl, KvFormat}; use causal_ts::CausalTsProvider; -use cdc::{recv_timeout, OldValueCache, Task, Validate}; +use cdc::{recv_timeout, Delegate, OldValueCache, Task, Validate}; use futures::{executor::block_on, sink::SinkExt}; use grpcio::{ChannelBuilder, Environment, WriteFlags}; use kvproto::{cdcpb::*, kvrpcpb::*, tikvpb_grpc::TikvClient}; @@ -58,6 +58,12 @@ fn test_cdc_double_scan_deregister_impl() { new_event_feed(suite.get_region_cdc_client(1)); block_on(req_tx_1.send((req, WriteFlags::default()))).unwrap(); + // wait for the second connection register to the delegate. 
+ suite.must_wait_delegate_condition( + 1, + Arc::new(|d: Option<&Delegate>| d.unwrap().downstreams().len() == 2), + ); + // close connection block_on(req_tx.close()).unwrap(); event_feed_wrap.replace(None); diff --git a/components/cdc/tests/integrations/test_cdc.rs b/components/cdc/tests/integrations/test_cdc.rs index 3be68c5905c..73f46fe6427 100644 --- a/components/cdc/tests/integrations/test_cdc.rs +++ b/components/cdc/tests/integrations/test_cdc.rs @@ -12,7 +12,7 @@ use pd_client::PdClient; use raft::eraftpb::MessageType; use test_raftstore::*; use tikv::server::DEFAULT_CLUSTER_ID; -use tikv_util::HandyRwLock; +use tikv_util::{config::ReadableDuration, HandyRwLock}; use txn_types::{Key, Lock, LockType}; use crate::{new_event_feed, TestSuite, TestSuiteBuilder}; @@ -2359,3 +2359,242 @@ fn test_prewrite_without_value() { let event = receive_event(false); assert_eq!(event.get_events()[0].get_entries().entries[0].commit_ts, 14); } + +#[test] +fn test_filter_loop() { + test_kv_format_impl!(test_filter_loop_impl); +} + +fn test_filter_loop_impl() { + let mut suite = TestSuite::new(1, F::TAG); + let mut req = suite.new_changedata_request(1); + req.set_extra_op(ExtraOp::ReadOldValue); + req.set_filter_loop(true); + let (mut req_tx, event_feed_wrap, receive_event) = + new_event_feed(suite.get_region_cdc_client(1)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + let mut events = receive_event(false).events.to_vec(); + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let row = &es.take_entries().to_vec()[0]; + assert_eq!(row.get_type(), EventLogType::Initialized); + } + other => panic!("unknown event {:?}", other), + } + + // Insert value, simulate INSERT INTO. 
+ let mut m1 = Mutation::default(); + let k1 = b"xk1".to_vec(); + m1.set_op(Op::Insert); + m1.key = k1.clone(); + m1.value = b"v1".to_vec(); + suite.must_kv_prewrite_with_source(1, vec![m1], k1.clone(), 10.into(), 1); + let mut m2 = Mutation::default(); + let k2 = b"xk2".to_vec(); + m2.set_op(Op::Insert); + m2.key = k2.clone(); + m2.value = b"v2".to_vec(); + suite.must_kv_prewrite_with_source(1, vec![m2], k2.clone(), 12.into(), 0); + let mut events = receive_event(false).events.to_vec(); + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let events = es.take_entries().to_vec(); + assert_eq!(events.len(), 1); + let row = &events[0]; + assert_eq!(row.get_value(), b"v2"); + assert_eq!(row.get_old_value(), b""); + assert_eq!(row.get_type(), EventLogType::Prewrite); + assert_eq!(row.get_start_ts(), 12); + } + other => panic!("unknown event {:?}", other), + } + suite.must_kv_commit_with_source(1, vec![k1], 10.into(), 15.into(), 1); + suite.must_kv_commit_with_source(1, vec![k2], 12.into(), 17.into(), 0); + let mut events = receive_event(false).events.to_vec(); + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let events = es.take_entries().to_vec(); + assert_eq!(events.len(), 1); + let row = &events[0]; + assert_eq!(row.get_type(), EventLogType::Commit); + assert_eq!(row.get_commit_ts(), 17); + } + other => panic!("unknown event {:?}", other), + } + + // Rollback + let mut m3 = Mutation::default(); + let k3 = b"xk3".to_vec(); + m3.set_op(Op::Put); + m3.key = k3.clone(); + m3.value = b"v3".to_vec(); + suite.must_kv_prewrite_with_source(1, vec![m3], k3.clone(), 30.into(), 1); + suite.must_kv_rollback(1, vec![k3], 30.into()); + let mut events = receive_event(false).events.to_vec(); + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let events = es.take_entries().to_vec(); + assert_eq!(events.len(), 1); + let row = &events[0]; + assert_eq!(row.get_type(), 
EventLogType::Rollback); + assert_eq!(row.get_commit_ts(), 0); + } + other => panic!("unknown event {:?}", other), + } + + // Update value + let k1 = b"xk1".to_vec(); + let mut m4 = Mutation::default(); + m4.set_op(Op::Put); + m4.key = k1.clone(); + m4.value = vec![b'3'; 5120]; + suite.must_kv_prewrite_with_source(1, vec![m4], k1.clone(), 40.into(), 1); + suite.must_kv_commit_with_source(1, vec![k1], 40.into(), 42.into(), 1); + let k2 = b"xk2".to_vec(); + let mut m5 = Mutation::default(); + m5.set_op(Op::Put); + m5.key = k2.clone(); + m5.value = vec![b'4'; 5121]; + suite.must_kv_prewrite(1, vec![m5], k2.clone(), 44.into()); + suite.must_kv_commit(1, vec![k2.clone()], 44.into(), 46.into()); + let mut events = receive_event(false).events.to_vec(); + if events.len() == 1 { + events.extend(receive_event(false).events.into_iter()); + } + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let events = es.take_entries().to_vec(); + assert_eq!(events.len(), 1); + assert_eq!(events[0].get_type(), EventLogType::Prewrite); + assert_eq!(events[0].get_start_ts(), 44); + assert_eq!(events[0].get_key(), k2.as_slice()); + } + other => panic!("unknown event {:?}", other), + } + match events.remove(0).event.unwrap() { + Event_oneof_event::Entries(mut es) => { + let events = es.take_entries().to_vec(); + assert_eq!(events.len(), 1); + assert_eq!(events[0].get_type(), EventLogType::Commit); + assert_eq!(events[0].get_commit_ts(), 46); + assert_eq!(events[0].get_key(), k2.as_slice()); + } + other => panic!("unknown event {:?}", other), + } + + event_feed_wrap.replace(None); + suite.stop(); +} + +#[test] +fn test_flashback() { + let mut cluster = new_server_cluster(0, 1); + cluster.cfg.resolved_ts.advance_ts_interval = ReadableDuration::millis(50); + let mut suite = TestSuiteBuilder::new().cluster(cluster).build(); + + let key = Key::from_raw(b"a"); + let region = suite.cluster.get_region(key.as_encoded()); + let region_id = region.get_id(); + let req = 
suite.new_changedata_request(region_id); + let (mut req_tx, _, receive_event) = new_event_feed(suite.get_region_cdc_client(region_id)); + block_on(req_tx.send((req, WriteFlags::default()))).unwrap(); + let event = receive_event(false); + event.events.into_iter().for_each(|e| { + match e.event.unwrap() { + // Even if there is no write, + // it should always outputs an Initialized event. + Event_oneof_event::Entries(es) => { + assert!(es.entries.len() == 1, "{:?}", es); + let e = &es.entries[0]; + assert_eq!(e.get_type(), EventLogType::Initialized, "{:?}", es); + } + other => panic!("unknown event {:?}", other), + } + }); + // Sleep a while to make sure the stream is registered. + sleep_ms(1000); + let start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + for i in 0..2 { + let (k, v) = ( + format!("key{}", i).as_bytes().to_vec(), + format!("value{}", i).as_bytes().to_vec(), + ); + // Prewrite + let start_ts1 = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k.clone(); + mutation.value = v; + suite.must_kv_prewrite(1, vec![mutation], k.clone(), start_ts1); + // Commit + let commit_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + suite.must_kv_commit(1, vec![k.clone()], start_ts1, commit_ts); + } + let (start_key, end_key) = (b"key0".to_vec(), b"key2".to_vec()); + // Prepare flashback. + let flashback_start_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + suite.must_kv_prepare_flashback(region_id, &start_key, &end_key, flashback_start_ts); + // resolved ts should not be advanced anymore. + let mut counter = 0; + let mut last_resolved_ts = 0; + loop { + let event = receive_event(true); + if let Some(resolved_ts) = event.resolved_ts.as_ref() { + if resolved_ts.ts == last_resolved_ts { + counter += 1; + } + last_resolved_ts = resolved_ts.ts; + } + if counter > 20 { + break; + } + sleep_ms(50); + } + // Flashback. 
+ let flashback_commit_ts = block_on(suite.cluster.pd_client.get_tso()).unwrap(); + suite.must_kv_flashback( + region_id, + &start_key, + &end_key, + flashback_start_ts, + flashback_commit_ts, + start_ts, + ); + // Check the flashback event. + let mut resolved_ts = 0; + let mut event_counter = 0; + loop { + let mut cde = receive_event(true); + if cde.get_resolved_ts().get_ts() > resolved_ts { + resolved_ts = cde.get_resolved_ts().get_ts(); + } + let events = cde.mut_events(); + if !events.is_empty() { + assert_eq!(events.len(), 1); + match events.pop().unwrap().event.unwrap() { + Event_oneof_event::Entries(entries) => { + assert_eq!(entries.entries.len(), 1); + event_counter += 1; + let e = &entries.entries[0]; + assert!(e.commit_ts > resolved_ts); + assert_eq!(e.get_op_type(), EventRowOpType::Delete); + match e.get_type() { + EventLogType::Committed => { + // First entry should be a 1PC flashback. + assert_eq!(e.get_key(), b"key1"); + assert_eq!(event_counter, 1); + } + EventLogType::Commit => { + // Second entry should be a 2PC commit. + assert_eq!(e.get_key(), b"key0"); + assert_eq!(event_counter, 2); + break; + } + _ => panic!("unknown event type {:?}", e.get_type()), + } + } + other => panic!("unknown event {:?}", other), + } + } + } +} diff --git a/components/cdc/tests/mod.rs b/components/cdc/tests/mod.rs index c14a91de99a..77e50bb10b2 100644 --- a/components/cdc/tests/mod.rs +++ b/components/cdc/tests/mod.rs @@ -1,9 +1,12 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{sync::*, time::Duration}; +use std::{ + sync::*, + time::{Duration, Instant}, +}; use causal_ts::CausalTsProvider; -use cdc::{recv_timeout, CdcObserver, FeatureGate, MemoryQuota, Task}; +use cdc::{recv_timeout, CdcObserver, Delegate, FeatureGate, MemoryQuota, Task, Validate}; use collections::HashMap; use concurrency_manager::ConcurrencyManager; use engine_rocks::RocksEngine; @@ -266,9 +269,22 @@ impl TestSuite { muts: Vec, pk: Vec, ts: TimeStamp, + ) { + self.must_kv_prewrite_with_source(region_id, muts, pk, ts, 0); + } + + pub fn must_kv_prewrite_with_source( + &mut self, + region_id: u64, + muts: Vec, + pk: Vec, + ts: TimeStamp, + txn_source: u64, ) { let mut prewrite_req = PrewriteRequest::default(); - prewrite_req.set_context(self.get_context(region_id)); + let mut context = self.get_context(region_id); + context.set_txn_source(txn_source); + prewrite_req.set_context(context); prewrite_req.set_mutations(muts.into_iter().collect()); prewrite_req.primary_lock = pk; prewrite_req.start_version = ts.into_inner(); @@ -311,9 +327,22 @@ impl TestSuite { keys: Vec>, start_ts: TimeStamp, commit_ts: TimeStamp, + ) { + self.must_kv_commit_with_source(region_id, keys, start_ts, commit_ts, 0); + } + + pub fn must_kv_commit_with_source( + &mut self, + region_id: u64, + keys: Vec>, + start_ts: TimeStamp, + commit_ts: TimeStamp, + txn_source: u64, ) { let mut commit_req = CommitRequest::default(); - commit_req.set_context(self.get_context(region_id)); + let mut context = self.get_context(region_id); + context.set_txn_source(txn_source); + commit_req.set_context(context); commit_req.start_version = start_ts.into_inner(); commit_req.set_keys(keys.into_iter().collect()); commit_req.commit_version = commit_ts.into_inner(); @@ -523,4 +552,83 @@ impl TestSuite { ) .unwrap(); } + + pub fn must_wait_delegate_condition( + &self, + region_id: u64, + cond: Arc) -> bool + Sync + Send>, + ) { + let scheduler = self.endpoints[®ion_id].scheduler(); + let start = Instant::now(); + 
loop { + sleep_ms(100); + let (tx, rx) = mpsc::sync_channel(1); + let c = cond.clone(); + let checker = move |d: Option<&Delegate>| { + tx.send(c(d)).unwrap(); + }; + scheduler + .schedule(Task::Validate(Validate::Region( + region_id, + Box::new(checker), + ))) + .unwrap(); + if rx.recv().unwrap() { + return; + } + if start.elapsed() > Duration::from_secs(5) { + panic!("wait delegate timeout"); + } + } + } + + pub fn must_kv_prepare_flashback( + &mut self, + region_id: u64, + start_key: &[u8], + end_key: &[u8], + start_ts: TimeStamp, + ) { + let mut prepare_flashback_req = PrepareFlashbackToVersionRequest::default(); + prepare_flashback_req.set_context(self.get_context(region_id)); + prepare_flashback_req.set_start_key(start_key.to_vec()); + prepare_flashback_req.set_end_key(end_key.to_vec()); + prepare_flashback_req.set_start_ts(start_ts.into_inner()); + let prepare_flashback_resp = self + .get_tikv_client(region_id) + .kv_prepare_flashback_to_version(&prepare_flashback_req) + .unwrap(); + assert!( + !prepare_flashback_resp.has_region_error(), + "{:?}", + prepare_flashback_resp.get_region_error() + ); + } + + pub fn must_kv_flashback( + &mut self, + region_id: u64, + start_key: &[u8], + end_key: &[u8], + start_ts: TimeStamp, + commit_ts: TimeStamp, + version: TimeStamp, + ) { + let mut flashback_req = FlashbackToVersionRequest::default(); + flashback_req.set_context(self.get_context(region_id)); + flashback_req.set_start_key(start_key.to_vec()); + flashback_req.set_end_key(end_key.to_vec()); + flashback_req.set_start_ts(start_ts.into_inner()); + flashback_req.set_commit_ts(commit_ts.into_inner()); + flashback_req.set_version(version.into_inner()); + let flashback_resp = self + .get_tikv_client(region_id) + .kv_flashback_to_version(&flashback_req) + .unwrap(); + assert!( + !flashback_resp.has_region_error(), + "{:?}", + flashback_resp.get_region_error() + ); + } } diff --git a/components/cloud/Cargo.toml b/components/cloud/Cargo.toml index 45ae2b40b23..10f8b113b2b 
100644 --- a/components/cloud/Cargo.toml +++ b/components/cloud/Cargo.toml @@ -9,7 +9,7 @@ async-trait = "0.1" derive_more = "0.99.3" error_code = { workspace = true } futures-io = "0.3" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" openssl = "0.10" prometheus = { version = "0.13", default-features = false, features = ["nightly"] } diff --git a/components/cloud/aws/Cargo.toml b/components/cloud/aws/Cargo.toml index 964048121d6..5d28e09e8f4 100644 --- a/components/cloud/aws/Cargo.toml +++ b/components/cloud/aws/Cargo.toml @@ -22,7 +22,7 @@ grpcio = { workspace = true } http = "0.2.0" hyper = "0.14" hyper-tls = "0.5" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" md5 = "0.7.0" prometheus = { version = "0.13", default-features = false, features = ["nightly"] } @@ -38,7 +38,7 @@ tikv_util = { workspace = true } # better to not use slog-global, but pass in the logger tokio = { version = "1.5", features = ["time"] } url = "2.0" -uuid = "0.8" +uuid = { version = "0.8", features = ["v4"] } [dev-dependencies] futures = "0.3" diff --git a/components/cloud/aws/src/s3.rs b/components/cloud/aws/src/s3.rs index 469cac97d6c..a7ea47ec9d2 100644 --- a/components/cloud/aws/src/s3.rs +++ b/components/cloud/aws/src/s3.rs @@ -222,7 +222,7 @@ impl S3Storage { key.to_owned() } - fn get_range(&self, name: &str, range: Option) -> Box { + fn get_range(&self, name: &str, range: Option) -> cloud::blob::BlobStream<'_> { let key = self.maybe_prefix_key(name); let bucket = self.config.bucket.bucket.clone(); debug!("read file from s3 storage"; "key" => %key); @@ -595,11 +595,11 @@ impl BlobStorage for S3Storage { }) } - fn get(&self, name: &str) -> Box { + fn get(&self, name: &str) -> cloud::blob::BlobStream<'_> { self.get_range(name, None) } - fn get_part(&self, name: &str, off: u64, len: u64) -> Box { + fn get_part(&self, name: &str, off: u64, len: u64) -> 
cloud::blob::BlobStream<'_> { // inclusive, bytes=0-499 -> [0, 499] self.get_range(name, Some(format!("bytes={}-{}", off, off + len - 1))) } diff --git a/components/cloud/azure/Cargo.toml b/components/cloud/azure/Cargo.toml index 3d8b01e893b..c08dc76fdff 100644 --- a/components/cloud/azure/Cargo.toml +++ b/components/cloud/azure/Cargo.toml @@ -14,7 +14,7 @@ chrono = "0.4" cloud = { workspace = true } futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } oauth2 = { version = "4.0.0", default-features = false } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } diff --git a/components/cloud/azure/src/azblob.rs b/components/cloud/azure/src/azblob.rs index 5bf02696de7..12b6149fad5 100644 --- a/components/cloud/azure/src/azblob.rs +++ b/components/cloud/azure/src/azblob.rs @@ -558,7 +558,7 @@ impl AzureStorage { &self, name: &str, range: Option>, - ) -> Box { + ) -> cloud::blob::BlobStream<'_> { let name = self.maybe_prefix_key(name); debug!("read file from Azure storage"; "key" => %name); let t = async move { @@ -602,11 +602,11 @@ impl BlobStorage for AzureStorage { uploader.run(&mut reader, content_length).await } - fn get(&self, name: &str) -> Box { + fn get(&self, name: &str) -> cloud::blob::BlobStream<'_> { self.get_range(name, None) } - fn get_part(&self, name: &str, off: u64, len: u64) -> Box { + fn get_part(&self, name: &str, off: u64, len: u64) -> cloud::blob::BlobStream<'_> { self.get_range(name, Some(off..off + len)) } } diff --git a/components/cloud/gcp/Cargo.toml b/components/cloud/gcp/Cargo.toml index f184377c0af..5074a3c9da4 100644 --- a/components/cloud/gcp/Cargo.toml +++ b/components/cloud/gcp/Cargo.toml @@ -11,7 +11,7 @@ futures-util = { version = 
"0.3", default-features = false, features = ["io"] } http = "0.2.0" hyper = "0.14" hyper-tls = "0.5" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } # better to not use slog-global, but pass in the logger slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } @@ -23,3 +23,5 @@ url = "2.0" [dev-dependencies] matches = "0.1.8" +pin-project = "1" +tokio = { version = "1.5", features = ["rt"] } diff --git a/components/cloud/gcp/src/gcs.rs b/components/cloud/gcp/src/gcs.rs index e8e8ad20ee9..61e432c9431 100644 --- a/components/cloud/gcp/src/gcs.rs +++ b/components/cloud/gcp/src/gcs.rs @@ -2,12 +2,13 @@ use std::{convert::TryInto, fmt::Display, io, sync::Arc}; use async_trait::async_trait; -use cloud::blob::{ - none_to_empty, BlobConfig, BlobStorage, BucketConf, PutResource, StringNonEmpty, +use cloud::{ + blob::{none_to_empty, BlobConfig, BlobStorage, BucketConf, PutResource, StringNonEmpty}, + metrics, }; use futures_util::{ future::TryFutureExt, - io::{AsyncRead, AsyncReadExt, Cursor}, + io::{self as async_io, AsyncRead, Cursor}, stream::{StreamExt, TryStreamExt}, }; use http::HeaderValue; @@ -20,7 +21,12 @@ use tame_gcs::{ types::{BucketName, ObjectId}, }; use tame_oauth::gcp::{ServiceAccountAccess, ServiceAccountInfo, TokenOrRequest}; -use tikv_util::stream::{error_stream, retry, AsyncReadAsSyncStreamOfBytes, RetryError}; +use tikv_util::{ + stream::{error_stream, AsyncReadAsSyncStreamOfBytes, RetryError}, + time::Instant, +}; + +use crate::utils::retry; const GOOGLE_APIS: &str = "https://www.googleapis.com"; const HARDCODED_ENDPOINTS_SUFFIX: &[&str] = &["upload/storage/v1/", "storage/v1/"]; @@ -156,6 +162,7 @@ impl ResultExt for Result { } } +#[derive(Debug)] enum RequestError { Hyper(hyper::Error, String), OAuth(tame_oauth::Error, String), @@ -340,14 
+347,14 @@ impl GcsStorage { Ok(res) } - fn error_to_async_read(kind: io::ErrorKind, e: E) -> Box + fn error_to_async_read(kind: io::ErrorKind, e: E) -> cloud::blob::BlobStream<'static> where E: Into>, { Box::new(error_stream(io::Error::new(kind, e)).into_async_read()) } - fn get_range(&self, name: &str, range: Option) -> Box { + fn get_range(&self, name: &str, range: Option) -> cloud::blob::BlobStream<'_> { let bucket = self.config.bucket.bucket.to_string(); let name = self.maybe_prefix_key(name); debug!("read file from GCS storage"; "key" => %name); @@ -433,6 +440,14 @@ fn parse_predefined_acl(acl: &str) -> Result, &str> { })) } +/// Like AsyncReadExt::read_to_end, but only try to initialize the buffer once. +/// Check https://github.com/rust-lang/futures-rs/issues/2658 for the reason we cannot +/// directly use it. +async fn read_to_end(r: R, v: &mut Vec) -> std::io::Result { + let mut c = Cursor::new(v); + async_io::copy(r, &mut c).await +} + const STORAGE_NAME: &str = "gcs"; #[async_trait] @@ -441,12 +456,7 @@ impl BlobStorage for GcsStorage { Box::new(self.config.clone()) as Box } - async fn put( - &self, - name: &str, - mut reader: PutResource, - content_length: u64, - ) -> io::Result<()> { + async fn put(&self, name: &str, reader: PutResource, content_length: u64) -> io::Result<()> { if content_length == 0 { // It is probably better to just write the empty file // However, currently going forward results in a body write aborted error @@ -470,33 +480,44 @@ impl BlobStorage for GcsStorage { // FIXME: Switch to upload() API so we don't need to read the entire data into // memory in order to retry. 
+ let begin = Instant::now_coarse(); let mut data = Vec::with_capacity(content_length as usize); - reader.read_to_end(&mut data).await?; - retry(|| async { - let data = Cursor::new(data.clone()); - let req = Object::insert_multipart( - &bucket, - data, - content_length, - &metadata, - Some(InsertObjectOptional { - predefined_acl: self.config.predefined_acl, - ..Default::default() - }), - ) - .map_err(RequestError::Gcs)? - .map(|reader| Body::wrap_stream(AsyncReadAsSyncStreamOfBytes::new(reader))); - self.make_request(req, tame_gcs::Scopes::ReadWrite).await - }) + read_to_end(reader, &mut data).await?; + metrics::CLOUD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["gcp", "read_local"]) + .observe(begin.saturating_elapsed_secs()); + let begin = Instant::now_coarse(); + retry( + || async { + let data = Cursor::new(data.clone()); + let req = Object::insert_multipart( + &bucket, + data, + content_length, + &metadata, + Some(InsertObjectOptional { + predefined_acl: self.config.predefined_acl, + ..Default::default() + }), + ) + .map_err(RequestError::Gcs)? 
+ .map(|reader| Body::wrap_stream(AsyncReadAsSyncStreamOfBytes::new(reader))); + self.make_request(req, tame_gcs::Scopes::ReadWrite).await + }, + "insert_multipart", + ) .await?; + metrics::CLOUD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["gcp", "insert_multipart"]) + .observe(begin.saturating_elapsed_secs()); Ok::<_, io::Error>(()) } - fn get(&self, name: &str) -> Box { + fn get(&self, name: &str) -> cloud::blob::BlobStream<'_> { self.get_range(name, None) } - fn get_part(&self, name: &str, off: u64, len: u64) -> Box { + fn get_part(&self, name: &str, off: u64, len: u64) -> cloud::blob::BlobStream<'_> { // inclusive, bytes=0-499 -> [0, 499] self.get_range(name, Some(format!("bytes={}-{}", off, off + len - 1))) } @@ -504,6 +525,10 @@ impl BlobStorage for GcsStorage { #[cfg(test)] mod tests { + extern crate test; + use std::task::Poll; + + use futures_util::AsyncReadExt; use matches::assert_matches; use super::*; @@ -605,6 +630,84 @@ mod tests { assert_eq!(c1.bucket.prefix, c2.bucket.prefix); } + enum ThrottleReadState { + Spawning, + Emitting, + } + /// ThrottleRead throttles a `Read` -- make it emits 2 chars for each + /// `read` call. This is copy & paste from the implmentation from s3.rs. 
+ #[pin_project::pin_project] + struct ThrottleRead { + #[pin] + inner: R, + state: ThrottleReadState, + } + impl AsyncRead for ThrottleRead { + fn poll_read( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut [u8], + ) -> Poll> { + let this = self.project(); + match this.state { + ThrottleReadState::Spawning => { + *this.state = ThrottleReadState::Emitting; + cx.waker().wake_by_ref(); + Poll::Pending + } + ThrottleReadState::Emitting => { + *this.state = ThrottleReadState::Spawning; + this.inner.poll_read(cx, &mut buf[..2]) + } + } + } + } + impl ThrottleRead { + fn new(r: R) -> Self { + Self { + inner: r, + state: ThrottleReadState::Spawning, + } + } + } + + const BENCH_READ_SIZE: usize = 128 * 1024; + + // 255,120,895 ns/iter (+/- 73,332,249) (futures-util 0.3.15) + #[bench] + fn bench_read_to_end(b: &mut test::Bencher) { + let mut v = [0; BENCH_READ_SIZE]; + let mut dst = Vec::with_capacity(BENCH_READ_SIZE); + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + + b.iter(|| { + let mut r = ThrottleRead::new(Cursor::new(&mut v)); + dst.clear(); + + rt.block_on(r.read_to_end(&mut dst)).unwrap(); + assert_eq!(dst.len(), BENCH_READ_SIZE) + }) + } + + // 5,850,042 ns/iter (+/- 3,787,438) + #[bench] + fn bench_manual_read_to_end(b: &mut test::Bencher) { + let mut v = [0; BENCH_READ_SIZE]; + let mut dst = Vec::with_capacity(BENCH_READ_SIZE); + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + b.iter(|| { + let r = ThrottleRead::new(Cursor::new(&mut v)); + dst.clear(); + + rt.block_on(read_to_end(r, &mut dst)).unwrap(); + assert_eq!(dst.len(), BENCH_READ_SIZE) + }) + } + fn cloud_dynamic_from_input(mut gcs: InputConfig) -> CloudDynamic { let mut bucket = InputBucket::default(); if !gcs.endpoint.is_empty() { diff --git a/components/cloud/gcp/src/lib.rs b/components/cloud/gcp/src/lib.rs index 4652bbf5b74..9ad97793988 100644 --- a/components/cloud/gcp/src/lib.rs +++ 
b/components/cloud/gcp/src/lib.rs @@ -1,7 +1,26 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. +#![feature(test)] #[macro_use] extern crate slog_global; mod gcs; pub use gcs::{Config, GcsStorage}; + +pub mod utils { + use std::future::Future; + + use cloud::metrics; + use tikv_util::stream::{retry_ext, RetryError, RetryExt}; + pub async fn retry(action: G, name: &'static str) -> Result + where + G: FnMut() -> F, + F: Future>, + E: RetryError + std::fmt::Debug, + { + retry_ext(action, RetryExt::default().with_fail_hook(move |err: &E| { + warn!("gcp request meet error."; "err" => ?err, "retry?" => %err.is_retryable(), "context" => %name); + metrics::CLOUD_ERROR_VEC.with_label_values(&["gcp", name]).inc(); + })).await + } +} diff --git a/components/cloud/src/blob.rs b/components/cloud/src/blob.rs index d80d3a47a28..84ca77042d7 100644 --- a/components/cloud/src/blob.rs +++ b/components/cloud/src/blob.rs @@ -19,6 +19,8 @@ pub trait BlobConfig: 'static + Send + Sync { /// wrappers exists. pub struct PutResource(pub Box); +pub type BlobStream<'a> = Box; + impl AsyncRead for PutResource { fn poll_read( self: Pin<&mut Self>, @@ -45,10 +47,10 @@ pub trait BlobStorage: 'static + Send + Sync { async fn put(&self, name: &str, reader: PutResource, content_length: u64) -> io::Result<()>; /// Read all contents of the given path. - fn get(&self, name: &str) -> Box; + fn get(&self, name: &str) -> BlobStream<'_>; /// Read part of contents of the given path. 
- fn get_part(&self, name: &str, off: u64, len: u64) -> Box; + fn get_part(&self, name: &str, off: u64, len: u64) -> BlobStream<'_>; } impl BlobConfig for dyn BlobStorage { @@ -72,11 +74,11 @@ impl BlobStorage for Box { fut.await } - fn get(&self, name: &str) -> Box { + fn get(&self, name: &str) -> BlobStream<'_> { (**self).get(name) } - fn get_part(&self, name: &str, off: u64, len: u64) -> Box { + fn get_part(&self, name: &str, off: u64, len: u64) -> BlobStream<'_> { (**self).get_part(name, off, len) } } diff --git a/components/codec/src/byte.rs b/components/codec/src/byte.rs index aa7baba9e75..8b5fd928edf 100644 --- a/components/codec/src/byte.rs +++ b/components/codec/src/byte.rs @@ -759,7 +759,7 @@ mod tests { for (exp, encoded) in cases { let mut path = env::temp_dir(); path.push("read-compact-codec-file"); - fs::write(&path, &encoded).unwrap(); + fs::write(&path, encoded).unwrap(); let f = File::open(&path).unwrap(); let mut rdr = BufReader::new(f); let decoded = rdr.read_compact_bytes().unwrap(); diff --git a/components/concurrency_manager/Cargo.toml b/components/concurrency_manager/Cargo.toml index 2d008cf49f1..e225cbe0519 100644 --- a/components/concurrency_manager/Cargo.toml +++ b/components/concurrency_manager/Cargo.toml @@ -6,7 +6,7 @@ version = "0.0.1" [dependencies] fail = "0.5" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } parking_lot = "0.12" tikv_util = { workspace = true } tokio = { version = "1.5", features = ["macros", "sync", "time"] } diff --git a/components/concurrency_manager/benches/lock_table.rs b/components/concurrency_manager/benches/lock_table.rs index f2d4a9b92c9..52c9bea960a 100644 --- a/components/concurrency_manager/benches/lock_table.rs +++ b/components/concurrency_manager/benches/lock_table.rs @@ -1,7 +1,6 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
#![feature(test)] -#![feature(bench_black_box)] use std::{borrow::Cow, hint::black_box, mem::forget}; diff --git a/components/concurrency_manager/src/lock_table.rs b/components/concurrency_manager/src/lock_table.rs index bf7a224aa28..ad013a863a1 100644 --- a/components/concurrency_manager/src/lock_table.rs +++ b/components/concurrency_manager/src/lock_table.rs @@ -158,9 +158,9 @@ mod test { assert_eq!(counter.load(Ordering::SeqCst), 100); } - fn ts_check(lock: &Lock, ts: u64) -> Result<(), Lock> { + fn ts_check(lock: &Lock, ts: u64) -> Result<(), Box> { if lock.ts.into_inner() < ts { - Err(lock.clone()) + Err(Box::new(lock.clone())) } else { Ok(()) } @@ -193,7 +193,10 @@ mod test { lock_table.check_key(&key_k, |l| ts_check(l, 5)).unwrap(); // lock does not pass check_fn - assert_eq!(lock_table.check_key(&key_k, |l| ts_check(l, 20)), Err(lock)); + assert_eq!( + lock_table.check_key(&key_k, |l| ts_check(l, 20)), + Err(Box::new(lock)) + ); } #[tokio::test] @@ -247,13 +250,13 @@ mod test { // first lock does not pass check_fn assert_eq!( lock_table.check_range(Some(&Key::from_raw(b"a")), None, |_, l| ts_check(l, 25)), - Err(lock_k) + Err(Box::new(lock_k)) ); // first lock passes check_fn but the second does not assert_eq!( lock_table.check_range(None, None, |_, l| ts_check(l, 15)), - Err(lock_l) + Err(Box::new(lock_l)) ); } diff --git a/components/encryption/Cargo.toml b/components/encryption/Cargo.toml index b66ef2aa147..18b6cb7305c 100644 --- a/components/encryption/Cargo.toml +++ b/components/encryption/Cargo.toml @@ -21,7 +21,7 @@ file_system = { workspace = true } futures = "0.3" futures-util = { version = "0.3", default-features = false, features = ["std", "io"] } hex = "0.4.2" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" online_config = { workspace = true } openssl = "0.10" diff --git a/components/encryption/export/Cargo.toml b/components/encryption/export/Cargo.toml index 
f76c2b8f03c..fc4fe59d3fb 100644 --- a/components/encryption/export/Cargo.toml +++ b/components/encryption/export/Cargo.toml @@ -18,7 +18,7 @@ derive_more = "0.99.3" encryption = { workspace = true } error_code = { workspace = true } file_system = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } openssl = "0.10" protobuf = { version = "2.8", features = ["bytes"] } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } diff --git a/components/encryption/src/crypter.rs b/components/encryption/src/crypter.rs index 13286e416c9..7379b8a32a3 100644 --- a/components/encryption/src/crypter.rs +++ b/components/encryption/src/crypter.rs @@ -275,7 +275,7 @@ mod tests { let crypter = AesGcmCrypter::new(&key, iv); let (ciphertext, gcm_tag) = crypter.encrypt(&pt).unwrap(); assert_eq!(ciphertext, ct, "{}", hex::encode(&ciphertext)); - assert_eq!(gcm_tag.0.to_vec(), tag, "{}", hex::encode(&gcm_tag.0)); + assert_eq!(gcm_tag.0.to_vec(), tag, "{}", hex::encode(gcm_tag.0)); let plaintext = crypter.decrypt(&ct, gcm_tag).unwrap(); assert_eq!(plaintext, pt, "{}", hex::encode(&plaintext)); diff --git a/components/encryption/src/encrypted_file/mod.rs b/components/encryption/src/encrypted_file/mod.rs index 57b5527b7bf..9c76b857c70 100644 --- a/components/encryption/src/encrypted_file/mod.rs +++ b/components/encryption/src/encrypted_file/mod.rs @@ -64,7 +64,7 @@ impl<'a> EncryptedFile<'a> { let start = Instant::now(); // Write to a tmp file. // TODO what if a tmp file already exists? - let origin_path = self.base.join(&self.name); + let origin_path = self.base.join(self.name); let mut tmp_path = origin_path.clone(); tmp_path.set_extension(format!("{}.{}", thread_rng().next_u64(), TMP_FILE_SUFFIX)); let mut tmp_file = OpenOptions::new() @@ -92,7 +92,7 @@ impl<'a> EncryptedFile<'a> { // Replace old file with the tmp file aomticlly. 
rename(tmp_path, origin_path)?; - let base_dir = File::open(&self.base)?; + let base_dir = File::open(self.base)?; base_dir.sync_all()?; ENCRYPT_DECRPTION_FILE_HISTOGRAM diff --git a/components/encryption/src/manager/mod.rs b/components/encryption/src/manager/mod.rs index 0f78e794629..0f3233d7819 100644 --- a/components/encryption/src/manager/mod.rs +++ b/components/encryption/src/manager/mod.rs @@ -815,7 +815,7 @@ mod tests { } fn new_mock_backend() -> Box { - Box::new(MockBackend::default()) + Box::::default() } fn new_key_manager_def( @@ -829,7 +829,7 @@ mod tests { } match DataKeyManager::new_previous_loaded( master_backend, - Box::new(MockBackend::default()), + Box::::default(), args, ) { Ok(None) => panic!("expected encryption"), @@ -932,7 +932,7 @@ mod tests { let manager = new_key_manager( &tmp_dir, Some(EncryptionMethod::Aes256Ctr), - Box::new(PlaintextBackend::default()), + Box::::default(), new_mock_backend() as Box, ); manager.err().unwrap(); @@ -1301,13 +1301,13 @@ mod tests { encrypt_fail: false, ..MockBackend::default() }); - let previous = Box::new(PlaintextBackend::default()) as Box; + let previous = Box::::default() as Box; let result = new_key_manager(&tmp_dir, None, wrong_key, previous); // When the master key is invalid, the key manager left a empty file dict and // return errors. 
assert!(result.is_err()); - let previous = Box::new(PlaintextBackend::default()) as Box; + let previous = Box::::default() as Box; new_key_manager(&tmp_dir, None, right_key, previous).unwrap(); } diff --git a/components/engine_panic/Cargo.toml b/components/engine_panic/Cargo.toml index c5703994c73..55e42f2595f 100644 --- a/components/engine_panic/Cargo.toml +++ b/components/engine_panic/Cargo.toml @@ -7,7 +7,7 @@ publish = false [dependencies] engine_traits = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } tikv_alloc = { workspace = true } # FIXME: Remove this dep from the engine_traits interface diff --git a/components/engine_panic/src/checkpoint.rs b/components/engine_panic/src/checkpoint.rs new file mode 100644 index 00000000000..6743810eb90 --- /dev/null +++ b/components/engine_panic/src/checkpoint.rs @@ -0,0 +1,29 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use core::panic; +use std::path::Path; + +use engine_traits::{Checkpointable, Checkpointer, Result}; + +use crate::PanicEngine; + +pub struct PanicCheckpointer {} + +impl Checkpointable for PanicEngine { + type Checkpointer = PanicCheckpointer; + + fn new_checkpointer(&self) -> Result { + panic!() + } +} + +impl Checkpointer for PanicCheckpointer { + fn create_at( + &mut self, + db_out_dir: &Path, + titan_out_dir: Option<&Path>, + log_size_for_flush: u64, + ) -> Result<()> { + panic!() + } +} diff --git a/components/engine_panic/src/lib.rs b/components/engine_panic/src/lib.rs index 0573c936135..93555f5ba5f 100644 --- a/components/engine_panic/src/lib.rs +++ b/components/engine_panic/src/lib.rs @@ -9,7 +9,6 @@ //! with your engine's own name; then fill in the implementations; remove //! 
the allow(unused) attribute; -#![feature(generic_associated_types)] #![allow(unused)] mod cf_names; @@ -46,5 +45,6 @@ pub mod flow_control_factors; pub use crate::flow_control_factors::*; pub mod table_properties; pub use crate::table_properties::*; +pub mod checkpoint; mod raft_engine; diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index 75e0e68269d..ad05e66c6fa 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -144,6 +144,10 @@ impl RaftEngine for PanicEngine { panic!() } + fn get_engine_path(&self) -> &str { + panic!() + } + fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { panic!() } diff --git a/components/engine_rocks/Cargo.toml b/components/engine_rocks/Cargo.toml index 44dd708271d..a0e3e878c54 100644 --- a/components/engine_rocks/Cargo.toml +++ b/components/engine_rocks/Cargo.toml @@ -32,7 +32,7 @@ engine_traits = { workspace = true } fail = "0.5" file_system = { workspace = true } keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.4.0" log_wrappers = { workspace = true } num_cpus = "1" diff --git a/components/engine_rocks/src/checkpoint.rs b/components/engine_rocks/src/checkpoint.rs new file mode 100644 index 00000000000..8b82043a392 --- /dev/null +++ b/components/engine_rocks/src/checkpoint.rs @@ -0,0 +1,55 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::path::Path; + +use engine_traits::{Checkpointable, Checkpointer, Result}; + +use crate::{r2e, RocksEngine}; + +impl Checkpointable for RocksEngine { + type Checkpointer = RocksEngineCheckpointer; + + fn new_checkpointer(&self) -> Result { + match self.as_inner().new_checkpointer() { + Ok(pointer) => Ok(RocksEngineCheckpointer(pointer)), + Err(e) => Err(r2e(e)), + } + } +} + +pub struct RocksEngineCheckpointer(rocksdb::Checkpointer); + +impl Checkpointer for RocksEngineCheckpointer { + fn create_at( + &mut self, + db_out_dir: &Path, + titan_out_dir: Option<&Path>, + log_size_for_flush: u64, + ) -> Result<()> { + self.0 + .create_at(db_out_dir, titan_out_dir, log_size_for_flush) + .map_err(|e| r2e(e)) + } +} + +#[cfg(test)] +mod tests { + use engine_traits::{Checkpointable, Checkpointer, Peekable, SyncMutable, ALL_CFS}; + use tempfile::tempdir; + + use crate::util::new_engine; + + #[test] + fn test_checkpoint() { + let dir = tempdir().unwrap(); + let path = dir.path().join("origin"); + let engine = new_engine(path.as_path().to_str().unwrap(), ALL_CFS).unwrap(); + engine.put(b"key", b"value").unwrap(); + + let mut check_pointer = engine.new_checkpointer().unwrap(); + let path2 = dir.path().join("checkpoint"); + check_pointer.create_at(path2.as_path(), None, 0).unwrap(); + let engine2 = new_engine(path2.as_path().to_str().unwrap(), ALL_CFS).unwrap(); + assert_eq!(engine2.get_value(b"key").unwrap().unwrap(), b"value"); + } +} diff --git a/components/engine_rocks/src/lib.rs b/components/engine_rocks/src/lib.rs index 774fe9cb37b..b6f3e36146c 100644 --- a/components/engine_rocks/src/lib.rs +++ b/components/engine_rocks/src/lib.rs @@ -16,7 +16,6 @@ //! Please read the engine_trait crate docs before hacking. 
#![cfg_attr(test, feature(test))] -#![feature(generic_associated_types)] #[allow(unused_extern_crates)] extern crate tikv_alloc; @@ -28,6 +27,8 @@ mod cf_names; pub use crate::cf_names::*; mod cf_options; pub use crate::cf_options::*; +mod checkpoint; +pub use crate::checkpoint::*; mod compact; pub use crate::compact::*; mod db_options; diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index b66a56caadf..da15b1708b8 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -339,6 +339,10 @@ impl RaftEngine for RocksEngine { Ok(used_size) } + fn get_engine_path(&self) -> &str { + self.as_inner().path() + } + fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { self.put_msg(keys::STORE_IDENT_KEY, ident) } diff --git a/components/engine_rocks/src/util.rs b/components/engine_rocks/src/util.rs index f749f78851c..778e16c1a67 100644 --- a/components/engine_rocks/src/util.rs +++ b/components/engine_rocks/src/util.rs @@ -150,7 +150,7 @@ pub fn db_exist(path: &str) -> bool { // If path is not an empty directory but db has not been created, // `DB::list_column_families` fails and we can clean up the directory by // this indication. - fs::read_dir(&path).unwrap().next().is_some() + fs::read_dir(path).unwrap().next().is_some() } /// Returns a Vec of cf which is in `a' but not in `b'. 
diff --git a/components/engine_rocks_helper/Cargo.toml b/components/engine_rocks_helper/Cargo.toml index 16e79a3b007..ec66aa474a9 100644 --- a/components/engine_rocks_helper/Cargo.toml +++ b/components/engine_rocks_helper/Cargo.toml @@ -24,5 +24,5 @@ tikv_util = { workspace = true } [dev-dependencies] engine_test = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } tempfile = "3.0" diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index b2a574422fb..77bd2d3be7c 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -55,6 +55,8 @@ //! storage engines, and that it be extracted into its own crate for use in //! TiKV, once the full requirements are better understood. +#![feature(let_chains)] + /// Types and constructors for the "raft" engine pub mod raft { #[cfg(feature = "test-engine-raft-panic")] @@ -126,12 +128,17 @@ pub mod kv { db_opt: DbOptions, cf_opts: Vec<(&'static str, KvTestCfOptions)>, ) -> Self { - Self { + let factory = Self { root_path: root_path.to_path_buf(), db_opt, cf_opts, root_db: Arc::new(Mutex::default()), + }; + let tablet_path = factory.tablets_path(); + if !tablet_path.exists() { + std::fs::create_dir_all(tablet_path).unwrap(); } + factory } fn create_tablet(&self, tablet_path: &Path) -> Result { @@ -189,8 +196,8 @@ pub mod kv { } #[inline] - fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf { - Path::new(&self.root_path).join(format!("tablets/{}_{}", id, suffix)) + fn tablet_path_with_prefix(&self, _prefix: &str, _id: u64, _suffix: u64) -> PathBuf { + self.root_path.join("db") } #[inline] @@ -226,7 +233,8 @@ pub mod kv { #[derive(Clone)] pub struct TestTabletFactoryV2 { inner: TestTabletFactory, - registry: Arc>>, + // region_id -> (tablet, tablet_suffix) + registry: Arc>>, } impl TestTabletFactoryV2 { @@ -242,17 +250,6 @@ pub mod kv { } } - // Extract tablet id and tablet suffix from the path. 
- fn get_id_and_suffix_from_path(path: &Path) -> (u64, u64) { - let (mut tablet_id, mut tablet_suffix) = (0, 1); - if let Some(s) = path.file_name().map(|s| s.to_string_lossy()) { - let mut split = s.split('_'); - tablet_id = split.next().and_then(|s| s.parse().ok()).unwrap_or(0); - tablet_suffix = split.next().and_then(|s| s.parse().ok()).unwrap_or(1); - } - (tablet_id, tablet_suffix) - } - impl TabletFactory for TestTabletFactoryV2 { /// See the comment above the same name method in KvEngineFactoryV2 fn open_tablet( @@ -261,33 +258,34 @@ pub mod kv { suffix: Option, mut options: OpenOptions, ) -> Result { + if options.create_new() && suffix.is_none() { + return Err(box_err!( + "suffix should be provided when creating new tablet" + )); + } + if options.create_new() || options.create() { options = options.set_cache_only(false); } let mut reg = self.registry.lock().unwrap(); if let Some(suffix) = suffix { - if let Some(tablet) = reg.get(&(id, suffix)) { + if let Some((cached_tablet, cached_suffix)) = reg.get(&id) && *cached_suffix == suffix { // Target tablet exist in the cache - if options.create_new() { - return Err(box_err!("region {} {} already exists", id, tablet.path())); + return Err(box_err!("region {} {} already exists", id, cached_tablet.path())); } - return Ok(tablet.clone()); + return Ok(cached_tablet.clone()); } else if !options.cache_only() { let tablet_path = self.tablet_path(id, suffix); let tablet = self.open_tablet_raw(&tablet_path, id, suffix, options.clone())?; if !options.skip_cache() { - reg.insert((id, suffix), tablet.clone()); + reg.insert(id, (tablet.clone(), suffix)); } return Ok(tablet); } - } else if options.cache_only() { - // This branch reads an arbitrary tablet with region id `id` - - if let Some(k) = reg.keys().find(|k| k.0 == id) { - return Ok(reg.get(k).unwrap().clone()); - } + } else if let Some((tablet, _)) = reg.get(&id) { + return Ok(tablet.clone()); } Err(box_err!( @@ -343,17 +341,24 @@ pub mod kv { } #[inline] - fn 
tablet_path(&self, id: u64, suffix: u64) -> PathBuf { + fn tablet_path_with_prefix(&self, prefix: &str, id: u64, suffix: u64) -> PathBuf { self.inner .root_path - .join(format!("tablets/{}_{}", id, suffix)) + .join(format!("tablets/{}{}_{}", prefix, id, suffix)) } #[inline] fn mark_tombstone(&self, region_id: u64, suffix: u64) { let path = self.tablet_path(region_id, suffix).join(TOMBSTONE_MARK); - std::fs::File::create(&path).unwrap(); - self.registry.lock().unwrap().remove(&(region_id, suffix)); + // When the full directory path does not exsit, create will return error and in + // this case, we just ignore it. + let _ = std::fs::File::create(path); + { + let mut reg = self.registry.lock().unwrap(); + if let Some((cached_tablet, cached_suffix)) = reg.remove(®ion_id) && cached_suffix != suffix { + reg.insert(region_id, (cached_tablet, cached_suffix)); + } + } } #[inline] @@ -364,37 +369,40 @@ pub mod kv { } #[inline] - fn destroy_tablet(&self, id: u64, suffix: u64) -> engine_traits::Result<()> { - let path = self.tablet_path(id, suffix); - self.registry.lock().unwrap().remove(&(id, suffix)); + fn destroy_tablet(&self, region_id: u64, suffix: u64) -> engine_traits::Result<()> { + let path = self.tablet_path(region_id, suffix); + { + let mut reg = self.registry.lock().unwrap(); + if let Some((cached_tablet, cached_suffix)) = reg.remove(®ion_id) && cached_suffix != suffix { + reg.insert(region_id, (cached_tablet, cached_suffix)); + } + } let _ = std::fs::remove_dir_all(path); Ok(()) } #[inline] - fn load_tablet(&self, path: &Path, id: u64, suffix: u64) -> Result { + fn load_tablet(&self, path: &Path, region_id: u64, suffix: u64) -> Result { { let reg = self.registry.lock().unwrap(); - if let Some(db) = reg.get(&(id, suffix)) { - return Err(box_err!("region {} {} already exists", id, db.path())); + if let Some((db, db_suffix)) = reg.get(®ion_id) && *db_suffix == suffix { + return Err(box_err!("region {} {} already exists", region_id, db.path())); } } - let db_path = 
self.tablet_path(id, suffix); - std::fs::rename(path, &db_path)?; - let new_engine = - self.open_tablet(id, Some(suffix), OpenOptions::default().set_create(true)); - if new_engine.is_ok() { - let (old_id, old_suffix) = get_id_and_suffix_from_path(path); - self.registry.lock().unwrap().remove(&(old_id, old_suffix)); - } - new_engine + let db_path = self.tablet_path(region_id, suffix); + std::fs::rename(path, db_path)?; + self.open_tablet( + region_id, + Some(suffix), + OpenOptions::default().set_create(true), + ) } fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { let reg = self.registry.lock().unwrap(); // pick up any tablet and set the shared block cache capacity - if let Some(((_id, _suffix), tablet)) = (*reg).iter().next() { + if let Some((_id, (tablet, _suffix))) = (*reg).iter().next() { let opt = tablet.get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap opt.set_block_cache_capacity(capacity)?; } @@ -406,7 +414,7 @@ pub mod kv { #[inline] fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &KvTestEngine)) { let reg = self.registry.lock().unwrap(); - for ((id, suffix), tablet) in &*reg { + for (id, (tablet, suffix)) in &*reg { f(*id, *suffix, tablet) } } diff --git a/components/engine_tirocks/Cargo.toml b/components/engine_tirocks/Cargo.toml index 8ecce112579..07c2a7ec42c 100644 --- a/components/engine_tirocks/Cargo.toml +++ b/components/engine_tirocks/Cargo.toml @@ -24,6 +24,6 @@ tracker = { workspace = true } txn_types = { workspace = true } [dev-dependencies] -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } rand = "0.8" tempfile = "3.0" diff --git a/components/engine_traits/Cargo.toml b/components/engine_traits/Cargo.toml index c2e9d729868..d38962e71c9 100644 --- a/components/engine_traits/Cargo.toml +++ b/components/engine_traits/Cargo.toml @@ -12,7 +12,7 @@ case_macros = { workspace = true } error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } 
-kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } log_wrappers = { workspace = true } protobuf = "2" raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } diff --git a/components/engine_traits/src/checkpoint.rs b/components/engine_traits/src/checkpoint.rs new file mode 100644 index 00000000000..6ea3556938f --- /dev/null +++ b/components/engine_traits/src/checkpoint.rs @@ -0,0 +1,20 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::path::Path; + +use crate::Result; + +pub trait Checkpointable { + type Checkpointer: Checkpointer; + + fn new_checkpointer(&self) -> Result; +} + +pub trait Checkpointer { + fn create_at( + &mut self, + db_out_dir: &Path, + titan_out_dir: Option<&Path>, + log_size_for_flush: u64, + ) -> Result<()>; +} diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index 5ad9a13b86f..55ab5d63caa 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -40,6 +40,7 @@ pub trait KvEngine: + Clone + Debug + Unpin + + Checkpointable + 'static { /// A consistent read-only snapshot of the database @@ -224,6 +225,9 @@ impl OpenOptions { } } +pub const SPLIT_PREFIX: &str = "split_"; +pub const MERGE_PREFIX: &str = "merge_"; + /// A factory trait to create new engine. // It should be named as `EngineFactory` for consistency, but we are about to // rename engine to tablet, so always use tablet for new traits/types. @@ -261,7 +265,15 @@ pub trait TabletFactory: TabletAccessor + Send + Sync { fn exists_raw(&self, path: &Path) -> bool; /// Get the tablet path by id and suffix - fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf; + fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf { + self.tablet_path_with_prefix("", id, suffix) + } + + /// Get the tablet path by id and suffix + /// + /// Used in special situations + /// Ex: split/merge. 
+ fn tablet_path_with_prefix(&self, prefix: &str, id: u64, suffix: u64) -> PathBuf; /// Tablets root path fn tablets_path(&self) -> PathBuf; @@ -323,7 +335,7 @@ where true } - fn tablet_path(&self, _id: u64, _suffix: u64) -> PathBuf { + fn tablet_path_with_prefix(&self, _prefix: &str, _id: u64, _suffix: u64) -> PathBuf { PathBuf::from(&self.root_path) } diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index 47fe16b4768..b9cf8847751 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -251,7 +251,6 @@ #![cfg_attr(test, feature(test))] #![feature(min_specialization)] #![feature(assert_matches)] -#![feature(generic_associated_types)] #[macro_use(fail_point)] extern crate fail; @@ -303,6 +302,8 @@ mod flow_control_factors; pub use crate::flow_control_factors::*; mod table_properties; pub use crate::table_properties::*; +mod checkpoint; +pub use crate::checkpoint::*; // These modules contain more general traits, some of which may be implemented // by multiple types. diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index b7a3f50699c..7df681c96d5 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -68,6 +68,7 @@ pub struct RaftLogGcTask { pub to: u64, } +// TODO: Refactor common methods between Kv and Raft engine into a shared trait. pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send + 'static { type LogBatch: RaftLogBatch; @@ -140,6 +141,9 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send fn get_engine_size(&self) -> Result; + /// The path to the directory on the filesystem where the raft log is stored + fn get_engine_path(&self) -> &str; + /// Visit all available raft groups. /// /// If any error is returned, the iteration will stop. 
diff --git a/components/error_code/Cargo.toml b/components/error_code/Cargo.toml index 484f8d24ad3..b98fc8dfcb5 100644 --- a/components/error_code/Cargo.toml +++ b/components/error_code/Cargo.toml @@ -14,7 +14,7 @@ path = "bin.rs" [dependencies] grpcio = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } serde = { version = "1.0", features = ["derive"] } diff --git a/components/error_code/bin.rs b/components/error_code/bin.rs index ba6a21ac6fa..8f1ad087355 100644 --- a/components/error_code/bin.rs +++ b/components/error_code/bin.rs @@ -18,7 +18,7 @@ fn main() { storage::ALL_ERROR_CODES.iter(), ]; let path = Path::new("./etc/error_code.toml"); - let mut f = fs::File::create(&path).unwrap(); + let mut f = fs::File::create(path).unwrap(); err_codes .into_iter() .flatten() diff --git a/components/error_code/src/backup_stream.rs b/components/error_code/src/backup_stream.rs index 9448169cc05..a4b28b0e9ee 100644 --- a/components/error_code/src/backup_stream.rs +++ b/components/error_code/src/backup_stream.rs @@ -41,12 +41,17 @@ define_error_codes! { ), RAFTREQ => ("RaftReq", "Error happened when sending raft command.", - "This is an internal error, please ask the community for help." + "This is an internal error, most of them are happen while initial scanning and can be simply retried." ), RAFTSTORE => ("RaftStore", "Error happened reported from raft store.", "This is an internal error, please ask the community for help." ), + GRPC => ("gRPC", + "Error happened during executing gRPC", + "This error is often relative to the network, please check the network connection and network config, say, TLS config." + ), + OTHER => ("Unknown", "Some random error happens.", "This is an generic error, please check the error message for further information." 
diff --git a/components/error_code/src/sst_importer.rs b/components/error_code/src/sst_importer.rs index 2eb6177458b..001f4f146f6 100644 --- a/components/error_code/src/sst_importer.rs +++ b/components/error_code/src/sst_importer.rs @@ -21,5 +21,6 @@ define_error_codes!( TTL_NOT_ENABLED => ("TtlNotEnabled", "", ""), TTL_LEN_NOT_EQUALS_TO_PAIRS => ("TtlLenNotEqualsToPairs", "", ""), INCOMPATIBLE_API_VERSION => ("IncompatibleApiVersion", "", ""), - INVALID_KEY_MODE => ("InvalidKeyMode", "", "") + INVALID_KEY_MODE => ("InvalidKeyMode", "", ""), + RESOURCE_NOT_ENOUTH => ("ResourceNotEnough", "", "") ); diff --git a/components/external_storage/Cargo.toml b/components/external_storage/Cargo.toml index 8c92b79583e..839e34e3f22 100644 --- a/components/external_storage/Cargo.toml +++ b/components/external_storage/Cargo.toml @@ -29,7 +29,7 @@ futures-executor = "0.3" futures-io = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } grpcio = { workspace = true, optional = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" libloading = { optional = true, version = "0.7.0" } openssl = "0.10" diff --git a/components/external_storage/export/Cargo.toml b/components/external_storage/export/Cargo.toml index 076bdd9d0dd..61e9bfa58df 100644 --- a/components/external_storage/export/Cargo.toml +++ b/components/external_storage/export/Cargo.toml @@ -65,7 +65,7 @@ futures-io = { version = "0.3" } futures-util = { version = "0.3", default-features = false, features = ["io"] } gcp = { optional = true, workspace = true } grpcio = { workspace = true, optional = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = { optional = true, version = "1.3" } libloading = { optional = true, version = "0.7.0" } once_cell = { optional = true, version = "1.3.1" } diff --git a/components/external_storage/export/src/dylib.rs 
b/components/external_storage/export/src/dylib.rs index a02f5f2fade..308973de95e 100644 --- a/components/external_storage/export/src/dylib.rs +++ b/components/external_storage/export/src/dylib.rs @@ -188,7 +188,7 @@ pub mod staticlib { .map_err(anyhow_to_io_log_error) } - fn read(&self, _name: &str) -> Box { + fn read(&self, _name: &str) -> crate::ExternalData<'_> { unimplemented!("use restore instead of read") } diff --git a/components/external_storage/export/src/export.rs b/components/external_storage/export/src/export.rs index 3cba0eaad8b..ad31dc363ae 100644 --- a/components/external_storage/export/src/export.rs +++ b/components/external_storage/export/src/export.rs @@ -3,11 +3,7 @@ //! To use External storage with protobufs as an application, import this //! module. external_storage contains the actual library code //! Cloud provider backends are under components/cloud -use std::{ - io::{self, Write}, - path::Path, - sync::Arc, -}; +use std::{io, path::Path, sync::Arc}; use async_trait::async_trait; #[cfg(feature = "cloud-aws")] @@ -22,27 +18,21 @@ use encryption::DataKeyManager; use external_storage::dylib_client; #[cfg(feature = "cloud-storage-grpc")] use external_storage::grpc_client; -use external_storage::{ - compression_reader_dispatcher, encrypt_wrap_reader, record_storage_create, BackendConfig, - HdfsStorage, -}; pub use external_storage::{ - read_external_storage_into_file, ExternalStorage, LocalStorage, NoopStorage, RestoreConfig, - UnpinReader, + compression_reader_dispatcher, encrypt_wrap_reader, read_external_storage_info_buff, + read_external_storage_into_file, record_storage_create, BackendConfig, ExternalData, + ExternalStorage, HdfsStorage, LocalStorage, NoopStorage, RestoreConfig, UnpinReader, + MIN_READ_SPEED, }; -use futures_io::AsyncRead; #[cfg(feature = "cloud-gcp")] pub use gcp::{Config as GcsConfig, GcsStorage}; pub use kvproto::brpb::StorageBackend_oneof_backend as Backend; #[cfg(any(feature = "cloud-gcp", feature = "cloud-aws", feature 
= "cloud-azure"))] use kvproto::brpb::{AzureBlobStorage, Gcs, S3}; use kvproto::brpb::{CloudDynamic, Noop, StorageBackend}; +use tikv_util::time::{Instant, Limiter}; #[cfg(feature = "cloud-storage-dylib")] use tikv_util::warn; -use tikv_util::{ - stream::block_on_external_io, - time::{Instant, Limiter}, -}; #[cfg(feature = "cloud-storage-dylib")] use crate::dylib; @@ -186,7 +176,9 @@ fn create_backend_inner( Backend::Hdfs(hdfs) => { Box::new(HdfsStorage::new(&hdfs.remote, backend_config.hdfs_config)?) } - Backend::Noop(_) => Box::new(NoopStorage::default()) as Box, + Backend::Noop(_) => { + Box::::default() as Box + } #[cfg(feature = "cloud-aws")] Backend::S3(config) => { let mut s = S3Storage::from_input(config.clone())?; @@ -308,13 +300,13 @@ impl std::ops::Deref for BlobStore { } } -pub struct EncryptedExternalStorage { +pub struct EncryptedExternalStorage { pub key_manager: Arc, - pub storage: Box, + pub storage: S, } #[async_trait] -impl ExternalStorage for EncryptedExternalStorage { +impl ExternalStorage for EncryptedExternalStorage { fn name(&self) -> &'static str { self.storage.name() } @@ -324,13 +316,13 @@ impl ExternalStorage for EncryptedExternalStorage { async fn write(&self, name: &str, reader: UnpinReader, content_length: u64) -> io::Result<()> { self.storage.write(name, reader, content_length).await } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> ExternalData<'_> { self.storage.read(name) } - fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { self.storage.read_part(name, off, len) } - fn restore( + async fn restore( &self, storage_name: &str, restore_name: std::path::PathBuf, @@ -354,19 +346,19 @@ impl ExternalStorage for EncryptedExternalStorage { compression_reader_dispatcher(compression_type, inner)? 
}; - let file_writer: &mut dyn Write = - &mut self.key_manager.create_file_for_write(&restore_name)?; + let file_writer = self.key_manager.create_file_for_write(&restore_name)?; let min_read_speed: usize = 8192; let mut input = encrypt_wrap_reader(file_crypter, reader)?; - block_on_external_io(read_external_storage_into_file( + read_external_storage_into_file( &mut input, file_writer, speed_limiter, expected_length, expected_sha256, min_read_speed, - )) + ) + .await } } @@ -384,11 +376,11 @@ impl ExternalStorage for BlobStore { .await } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> ExternalData<'_> { (**self).get(name) } - fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { (**self).get_part(name, off, len) } } diff --git a/components/external_storage/src/dylib_client.rs b/components/external_storage/src/dylib_client.rs index 6d6dc35cf8a..9e2748c2011 100644 --- a/components/external_storage/src/dylib_client.rs +++ b/components/external_storage/src/dylib_client.rs @@ -92,7 +92,7 @@ impl ExternalStorage for ExternalStorageClient { .map_err(anyhow_to_io_log_error) } - fn read(&self, _name: &str) -> Box { + fn read(&self, _name: &str) -> crate::ExternalData<'_> { unimplemented!("use restore instead of read") } diff --git a/components/external_storage/src/grpc_client.rs b/components/external_storage/src/grpc_client.rs index 3d715dfcd47..e836d8fb58a 100644 --- a/components/external_storage/src/grpc_client.rs +++ b/components/external_storage/src/grpc_client.rs @@ -95,7 +95,7 @@ impl ExternalStorage for ExternalStorageClient { .map_err(anyhow_to_io_log_error) } - fn read(&self, _name: &str) -> Box { + fn read(&self, _name: &str) -> crate::ExternalData<'_> { unimplemented!("use restore instead of read") } diff --git a/components/external_storage/src/hdfs.rs b/components/external_storage/src/hdfs.rs index 53574633c73..17556490320 100644 --- 
a/components/external_storage/src/hdfs.rs +++ b/components/external_storage/src/hdfs.rs @@ -7,7 +7,7 @@ use tokio::{io as async_io, process::Command}; use tokio_util::compat::FuturesAsyncReadCompatExt; use url::Url; -use crate::{ExternalStorage, UnpinReader}; +use crate::{ExternalData, ExternalStorage, UnpinReader}; /// Convert `hdfs:///path` to `/path` fn try_convert_to_path(url: &Url) -> &str { @@ -101,7 +101,7 @@ impl ExternalStorage for HdfsStorage { } cmd_with_args.extend([&cmd_path, "dfs", "-put", "-", path]); info!("calling hdfs"; "cmd" => ?cmd_with_args); - let mut hdfs_cmd = Command::new(&cmd_with_args[0]) + let mut hdfs_cmd = Command::new(cmd_with_args[0]) .stdin(Stdio::piped()) .stdout(Stdio::piped()) .stderr(Stdio::piped()) @@ -131,16 +131,11 @@ impl ExternalStorage for HdfsStorage { } } - fn read(&self, _name: &str) -> Box { + fn read(&self, _name: &str) -> ExternalData<'_> { unimplemented!("currently only HDFS export is implemented") } - fn read_part( - &self, - _name: &str, - _off: u64, - _len: u64, - ) -> Box { + fn read_part(&self, _name: &str, _off: u64, _len: u64) -> ExternalData<'_> { unimplemented!("currently only HDFS export is implemented") } } diff --git a/components/external_storage/src/lib.rs b/components/external_storage/src/lib.rs index 6bcbcfc839f..c344f09968b 100644 --- a/components/external_storage/src/lib.rs +++ b/components/external_storage/src/lib.rs @@ -26,7 +26,7 @@ use futures_util::AsyncReadExt; use kvproto::brpb::CompressionType; use openssl::hash::{Hasher, MessageDigest}; use tikv_util::{ - stream::{block_on_external_io, READ_BUF_SIZE}, + stream::READ_BUF_SIZE, time::{Instant, Limiter}, }; use tokio::time::timeout; @@ -58,6 +58,8 @@ pub fn record_storage_create(start: Instant, storage: &dyn ExternalStorage) { /// signature of write.) 
see https://github.com/rust-lang/rust/issues/63033 pub struct UnpinReader(pub Box); +pub type ExternalData<'a> = Box; + #[derive(Debug, Default)] pub struct BackendConfig { pub s3_multi_part_size: usize, @@ -73,10 +75,10 @@ pub struct RestoreConfig { } /// a reader dispatcher for different compression type. -pub fn compression_reader_dispatcher<'a>( +pub fn compression_reader_dispatcher( compression_type: Option, - inner: Box, -) -> io::Result> { + inner: ExternalData<'_>, +) -> io::Result> { match compression_type { Some(c) => match c { // The log files generated from TiKV v6.2.0 use the default value (0). @@ -107,13 +109,13 @@ pub trait ExternalStorage: 'static + Send + Sync { async fn write(&self, name: &str, reader: UnpinReader, content_length: u64) -> io::Result<()>; /// Read all contents of the given path. - fn read(&self, name: &str) -> Box; + fn read(&self, name: &str) -> ExternalData<'_>; /// Read part of contents of the given path. - fn read_part(&self, name: &str, off: u64, len: u64) -> Box; + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_>; /// Read from external storage and restore to the given path - fn restore( + async fn restore( &self, storage_name: &str, restore_name: std::path::PathBuf, @@ -137,22 +139,23 @@ pub trait ExternalStorage: 'static + Send + Sync { compression_reader_dispatcher(compression_type, inner)? }; - let output: &mut dyn Write = &mut File::create(restore_name)?; + let output = File::create(restore_name)?; // the minimum speed of reading data, in bytes/second. // if reading speed is slower than this rate, we will stop with // a "TimedOut" error. // (at 8 KB/s for a 2 MB buffer, this means we timeout after 4m16s.) 
let min_read_speed: usize = 8192; - let mut input = encrypt_wrap_reader(file_crypter, reader)?; + let input = encrypt_wrap_reader(file_crypter, reader)?; - block_on_external_io(read_external_storage_into_file( - &mut input, + read_external_storage_into_file( + input, output, speed_limiter, expected_length, expected_sha256, min_read_speed, - )) + ) + .await } } @@ -170,13 +173,32 @@ impl ExternalStorage for Arc { (**self).write(name, reader, content_length).await } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> ExternalData<'_> { (**self).read(name) } - fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { (**self).read_part(name, off, len) } + + async fn restore( + &self, + storage_name: &str, + restore_name: std::path::PathBuf, + expected_length: u64, + speed_limiter: &Limiter, + restore_config: RestoreConfig, + ) -> io::Result<()> { + self.as_ref() + .restore( + storage_name, + restore_name, + expected_length, + speed_limiter, + restore_config, + ) + .await + } } #[async_trait] @@ -193,21 +215,40 @@ impl ExternalStorage for Box { self.as_ref().write(name, reader, content_length).await } - fn read(&self, name: &str) -> Box { + fn read(&self, name: &str) -> ExternalData<'_> { self.as_ref().read(name) } - fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + fn read_part(&self, name: &str, off: u64, len: u64) -> ExternalData<'_> { self.as_ref().read_part(name, off, len) } + + async fn restore( + &self, + storage_name: &str, + restore_name: std::path::PathBuf, + expected_length: u64, + speed_limiter: &Limiter, + restore_config: RestoreConfig, + ) -> io::Result<()> { + self.as_ref() + .restore( + storage_name, + restore_name, + expected_length, + speed_limiter, + restore_config, + ) + .await + } } /// Wrap the reader with file_crypter. /// Return the reader directly if file_crypter is None. 
-pub fn encrypt_wrap_reader<'a>( +pub fn encrypt_wrap_reader( file_crypter: Option, - reader: Box, -) -> io::Result> { + reader: ExternalData<'_>, +) -> io::Result> { let input = match file_crypter { Some(x) => Box::new(DecrypterReader::new( reader, @@ -221,14 +262,18 @@ pub fn encrypt_wrap_reader<'a>( Ok(input) } -pub async fn read_external_storage_into_file( - input: &mut (dyn AsyncRead + Unpin), - output: &mut dyn Write, +pub async fn read_external_storage_into_file( + mut input: In, + mut output: Out, speed_limiter: &Limiter, expected_length: u64, expected_sha256: Option>, min_read_speed: usize, -) -> io::Result<()> { +) -> io::Result<()> +where + In: AsyncRead + Unpin, + Out: Write, +{ let dur = Duration::from_secs((READ_BUF_SIZE / min_read_speed) as u64); // do the I/O copy from external_storage to the local file. @@ -296,3 +341,88 @@ pub async fn read_external_storage_into_file( Ok(()) } + +pub const MIN_READ_SPEED: usize = 8192; + +pub async fn read_external_storage_info_buff( + reader: &mut (dyn AsyncRead + Unpin), + speed_limiter: &Limiter, + expected_length: u64, + expected_sha256: Option>, + min_read_speed: usize, +) -> io::Result> { + // the minimum speed of reading data, in bytes/second. + // if reading speed is slower than this rate, we will stop with + // a "TimedOut" error. + // (at 8 KB/s for a 2 MB buffer, this means we timeout after 4m16s.) + let read_speed = if min_read_speed > 0 { + min_read_speed + } else { + MIN_READ_SPEED + }; + let dur = Duration::from_secs((READ_BUF_SIZE / read_speed) as u64); + let mut output = Vec::new(); + let mut buffer = vec![0u8; READ_BUF_SIZE]; + + loop { + // separate the speed limiting from actual reading so it won't + // affect the timeout calculation. 
+ let bytes_read = timeout(dur, reader.read(&mut buffer)) + .await + .map_err(|_| io::ErrorKind::TimedOut)??; + if bytes_read == 0 { + break; + } + + speed_limiter.consume(bytes_read).await; + output.append(&mut buffer[..bytes_read].to_vec()); + } + + // check length of file + if expected_length > 0 && output.len() != expected_length as usize { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "length not match, downloaded size {}, expected {}", + output.len(), + expected_length + ), + )); + } + // check sha256 of file + if let Some(sha256) = expected_sha256 { + let mut hasher = Hasher::new(MessageDigest::sha256()).map_err(|err| { + io::Error::new( + io::ErrorKind::Other, + format!("openssl hasher failed to init: {}", err), + ) + })?; + hasher.update(&output).map_err(|err| { + io::Error::new( + io::ErrorKind::Other, + format!("openssl hasher update failed: {}", err), + ) + })?; + + let cal_sha256 = hasher.finish().map_or_else( + |err| { + Err(io::Error::new( + io::ErrorKind::Other, + format!("openssl hasher finish failed: {}", err), + )) + }, + |bytes| Ok(bytes.to_vec()), + )?; + if !sha256.eq(&cal_sha256) { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "sha256 not match, expect: {:?}, calculate: {:?}", + sha256, cal_sha256, + ), + )); + } + } + + Ok(output) +} diff --git a/components/external_storage/src/local.rs b/components/external_storage/src/local.rs index 4b22de96a6a..0bf6be65107 100644 --- a/components/external_storage/src/local.rs +++ b/components/external_storage/src/local.rs @@ -3,14 +3,12 @@ use std::{ fs::File as StdFile, io::{self, BufReader, Read, Seek}, - marker::Unpin, path::{Path, PathBuf}, sync::Arc, }; use async_trait::async_trait; use futures::io::AllowStdIo; -use futures_io::AsyncRead; use futures_util::stream::TryStreamExt; use rand::Rng; use tikv_util::stream::error_stream; @@ -119,7 +117,7 @@ impl ExternalStorage for LocalStorage { self.base_dir.sync_all().await } - fn read(&self, name: &str) 
-> Box { + fn read(&self, name: &str) -> crate::ExternalData<'_> { debug!("read file from local storage"; "name" => %name, "base" => %self.base.display()); // We used std i/o here for removing the requirement of tokio reactor when @@ -131,7 +129,7 @@ impl ExternalStorage for LocalStorage { } } - fn read_part(&self, name: &str, off: u64, len: u64) -> Box { + fn read_part(&self, name: &str, off: u64, len: u64) -> crate::ExternalData<'_> { debug!("read part of file from local storage"; "name" => %name, "off" => %off, "len" => %len, "base" => %self.base.display()); diff --git a/components/external_storage/src/noop.rs b/components/external_storage/src/noop.rs index 42746742624..50e9c43c7bc 100644 --- a/components/external_storage/src/noop.rs +++ b/components/external_storage/src/noop.rs @@ -1,14 +1,11 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::marker::Unpin; - use async_trait::async_trait; -use futures_io::AsyncRead; use tokio::io; use tokio_util::compat::{FuturesAsyncReadCompatExt, TokioAsyncReadCompatExt}; use super::ExternalStorage; -use crate::UnpinReader; +use crate::{ExternalData, UnpinReader}; /// A storage saves files into void. /// It is mainly for test use. 
@@ -44,11 +41,11 @@ impl ExternalStorage for NoopStorage { Ok(()) } - fn read(&self, _name: &str) -> Box { + fn read(&self, _name: &str) -> ExternalData<'_> { Box::new(io::empty().compat()) } - fn read_part(&self, _name: &str, _off: u64, _len: u64) -> Box { + fn read_part(&self, _name: &str, _off: u64, _len: u64) -> ExternalData<'_> { Box::new(io::empty().compat()) } } diff --git a/components/file_system/src/io_stats/proc.rs b/components/file_system/src/io_stats/proc.rs index 60c8cac9c36..51c74ae56a8 100644 --- a/components/file_system/src/io_stats/proc.rs +++ b/components/file_system/src/io_stats/proc.rs @@ -225,7 +225,7 @@ mod tests { .write(true) .create(true) .custom_flags(O_DIRECT) - .open(&file_path) + .open(file_path) .unwrap(); let w = vec![A512::default(); 8]; let base_local_bytes = id.fetch_io_bytes().unwrap(); diff --git a/components/file_system/src/lib.rs b/components/file_system/src/lib.rs index 36acbc65a91..058b2a3a5f9 100644 --- a/components/file_system/src/lib.rs +++ b/components/file_system/src/lib.rs @@ -426,7 +426,7 @@ pub fn reserve_space_for_recover>(data_dir: P, file_size: u64) -> delete_file_if_exist(&path)?; } fn do_reserve(dir: &Path, path: &Path, file_size: u64) -> io::Result<()> { - let f = File::create(&path)?; + let f = File::create(path)?; f.allocate(file_size)?; f.sync_all()?; sync_dir(dir) @@ -483,7 +483,7 @@ mod tests { // Ensure it works for non-existent file. 
let non_existent_file = dir_path.join("non_existent_file"); - get_file_size(&non_existent_file).unwrap_err(); + get_file_size(non_existent_file).unwrap_err(); } #[test] @@ -504,7 +504,7 @@ mod tests { assert_eq!(file_exists(&existent_file), true); let non_existent_file = dir_path.join("non_existent_file"); - assert_eq!(file_exists(&non_existent_file), false); + assert_eq!(file_exists(non_existent_file), false); } #[test] @@ -525,7 +525,7 @@ mod tests { assert_eq!(file_exists(&existent_file), false); let non_existent_file = dir_path.join("non_existent_file"); - delete_file_if_exist(&non_existent_file).unwrap(); + delete_file_if_exist(non_existent_file).unwrap(); } fn gen_rand_file>(path: P, size: usize) -> u32 { diff --git a/components/into_other/Cargo.toml b/components/into_other/Cargo.toml index 39989a4bf75..d31f04f4e12 100644 --- a/components/into_other/Cargo.toml +++ b/components/into_other/Cargo.toml @@ -6,5 +6,5 @@ publish = false [dependencies] engine_traits = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } diff --git a/components/keys/Cargo.toml b/components/keys/Cargo.toml index f8318237b20..5f2bf5935ee 100644 --- a/components/keys/Cargo.toml +++ b/components/keys/Cargo.toml @@ -6,7 +6,7 @@ publish = false [dependencies] byteorder = "1.2" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } log_wrappers = { workspace = true } thiserror = "1.0" tikv_alloc = { workspace = true } diff --git a/components/pd_client/Cargo.toml b/components/pd_client/Cargo.toml index c2ee9982bcd..c25e37f23b5 100644 --- a/components/pd_client/Cargo.toml +++ b/components/pd_client/Cargo.toml @@ -6,6 +6,7 @@ publish = false [features] failpoints = ["fail/failpoints"] +testexport = [] [dependencies] collections = { workspace = true } @@ -13,7 +14,7 @@ error_code = { workspace = true } fail = 
"0.5" futures = "0.3" grpcio = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { workspace = true } @@ -28,6 +29,6 @@ thiserror = "1.0" tikv_alloc = { workspace = true } tikv_util = { workspace = true } tokio = { version = "1", features = ["sync"] } -tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +tokio-timer = { workspace = true } txn_types = { workspace = true } -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index e25e4a595bb..9f466a6a351 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -42,8 +42,8 @@ use super::{ UnixSecs, REQUEST_TIMEOUT, }; -const CQ_COUNT: usize = 1; -const CLIENT_PREFIX: &str = "pd"; +pub const CQ_COUNT: usize = 1; +pub const CLIENT_PREFIX: &str = "pd"; pub struct RpcClient { cluster_id: u64, @@ -86,7 +86,7 @@ impl RpcClient { ); let pd_connector = PdConnector::new(env.clone(), security_mgr.clone()); for i in 0..retries { - match pd_connector.validate_endpoints(cfg).await { + match pd_connector.validate_endpoints(cfg, true).await { Ok((client, target, members, tso)) => { let cluster_id = members.get_header().get_cluster_id(); let rpc_client = RpcClient { @@ -97,7 +97,7 @@ impl RpcClient { client, members, target, - tso, + tso.unwrap(), cfg.enable_forwarding, )), monitor: monitor.clone(), @@ -554,13 +554,16 @@ impl PdClient for RpcClient { .client_stub .get_region_by_id_async_opt(&req, call_option_inner(&inner)) .unwrap_or_else(|e| { - panic!("fail to request PD {} err {:?}", "get_region_by_id", e) + panic!( + "fail to request PD {} err {:?}", + "get_region_leader_by_id", e + ) }) }; Box::pin(async move { let mut resp = handler.await?; 
PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region_by_id"]) + .with_label_values(&["get_region_leader_by_id"]) .observe(duration_to_sec(timer.saturating_elapsed())); check_resp_header(resp.get_header())?; if resp.has_region() && resp.has_leader() { @@ -1088,27 +1091,3 @@ impl PdClient for RpcClient { .execute() } } - -pub struct DummyPdClient { - pub next_ts: TimeStamp, -} - -impl DummyPdClient { - pub fn new() -> DummyPdClient { - DummyPdClient { - next_ts: TimeStamp::zero(), - } - } -} - -impl Default for DummyPdClient { - fn default() -> Self { - Self::new() - } -} - -impl PdClient for DummyPdClient { - fn batch_get_tso(&self, _count: u32) -> PdFuture { - Box::pin(future::ok(self.next_ts)) - } -} diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs new file mode 100644 index 00000000000..55f0c31b3c5 --- /dev/null +++ b/components/pd_client/src/client_v2.rs @@ -0,0 +1,1408 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! PD Client V2 +//! +//! In V1, the connection to PD and related states are all shared under a +//! `RwLock`. The maintenance of these states are implemented in a +//! decentralized way: each request will try to rebuild the connection on its +//! own if it encounters a network error. +//! +//! In V2, the responsibility to maintain the connection is moved into one +//! single long-running coroutine, namely [`reconnect_loop`]. Users of the +//! connection subscribe changes instead of altering it themselves. 
+ +use std::{ + collections::HashMap, + fmt::Debug, + pin::Pin, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, + time::{Duration, Instant as StdInstant}, + u64, +}; + +use fail::fail_point; +use futures::{ + compat::{Compat, Future01CompatExt}, + executor::block_on, + future::FutureExt, + select, + sink::SinkExt, + stream::{Stream, StreamExt}, + task::{Context, Poll}, +}; +use grpcio::{ + CallOption, Channel, ClientDuplexReceiver, ConnectivityState, EnvBuilder, Environment, + Error as GrpcError, Result as GrpcResult, WriteFlags, +}; +use kvproto::{ + metapb, + pdpb::{ + self, GetMembersResponse, PdClient as PdClientStub, RegionHeartbeatRequest, + RegionHeartbeatResponse, ReportBucketsRequest, TsoRequest, TsoResponse, + }, + replication_modepb::{ReplicationStatus, StoreDrAutoSyncStatus}, +}; +use security::SecurityManager; +use tikv_util::{ + box_err, + config::ReadableDuration, + error, info, + mpsc::future as mpsc, + slow_log, thd_name, + time::{duration_to_sec, Instant}, + timer::GLOBAL_TIMER_HANDLE, + warn, +}; +use tokio::sync::{broadcast, mpsc as tokio_mpsc}; +use txn_types::TimeStamp; + +use super::{ + client::{CLIENT_PREFIX, CQ_COUNT}, + metrics::*, + util::{check_resp_header, PdConnector, TargetInfo}, + Config, Error, FeatureGate, RegionInfo, Result, UnixSecs, + REQUEST_TIMEOUT as REQUEST_TIMEOUT_SEC, +}; +use crate::PdFuture; + +fn request_timeout() -> Duration { + fail_point!("pd_client_v2_request_timeout", |s| { + use std::str::FromStr; + ReadableDuration::from_str(&s.unwrap()).unwrap().0 + }); + Duration::from_secs(REQUEST_TIMEOUT_SEC) +} + +/// Immutable context for making new connections. +struct ConnectContext { + cfg: Config, + connector: PdConnector, +} + +#[derive(Clone)] +struct RawClient { + stub: PdClientStub, + target_info: TargetInfo, + members: GetMembersResponse, +} + +impl RawClient { + async fn connect(ctx: &ConnectContext) -> Result { + // -1 means the max. 
+ let retries = match ctx.cfg.retry_max_count { + -1 => std::isize::MAX, + v => v.saturating_add(1), + }; + for i in 0..retries { + match ctx.connector.validate_endpoints(&ctx.cfg, false).await { + Ok((stub, target_info, members, _)) => { + return Ok(RawClient { + stub, + target_info, + members, + }); + } + Err(e) => { + if i as usize % ctx.cfg.retry_log_every == 0 { + warn!("validate PD endpoints failed"; "err" => ?e); + } + let _ = GLOBAL_TIMER_HANDLE + .delay(StdInstant::now() + ctx.cfg.retry_interval.0) + .compat() + .await; + } + } + } + Err(box_err!("PD endpoints are invalid")) + } + + /// Returns Ok(true) when a new connection is established. + async fn maybe_reconnect(&mut self, ctx: &ConnectContext, force: bool) -> Result { + PD_RECONNECT_COUNTER_VEC.with_label_values(&["try"]).inc(); + let start = Instant::now(); + + let members = self.members.clone(); + let direct_connected = self.target_info.direct_connected(); + slow_log!(start.saturating_elapsed(), "try reconnect pd"); + let (stub, target_info, members, _) = match ctx + .connector + .reconnect_pd( + members, + direct_connected, + force, + ctx.cfg.enable_forwarding, + false, + ) + .await + { + Err(e) => { + PD_RECONNECT_COUNTER_VEC + .with_label_values(&["failure"]) + .inc(); + return Err(e); + } + Ok(None) => { + PD_RECONNECT_COUNTER_VEC + .with_label_values(&["no-need"]) + .inc(); + return Ok(false); + } + Ok(Some(tuple)) => { + PD_RECONNECT_COUNTER_VEC + .with_label_values(&["success"]) + .inc(); + tuple + } + }; + + fail_point!("pd_client_v2_reconnect", |_| Ok(true)); + + self.stub = stub; + self.target_info = target_info; + self.members = members; + + info!("trying to update PD client done"; "spend" => ?start.saturating_elapsed()); + Ok(true) + } +} + +struct CachedRawClientCore { + context: ConnectContext, + + latest: Mutex, + version: AtomicU64, + on_reconnect_tx: broadcast::Sender<()>, +} + +/// A shared [`RawClient`] with a local copy of cache. 
+pub struct CachedRawClient { + core: Arc, + should_reconnect_tx: broadcast::Sender, + on_reconnect_rx: broadcast::Receiver<()>, + + cache: RawClient, + cache_version: u64, +} + +impl Clone for CachedRawClient { + fn clone(&self) -> Self { + Self { + core: self.core.clone(), + should_reconnect_tx: self.should_reconnect_tx.clone(), + on_reconnect_rx: self.core.on_reconnect_tx.subscribe(), + cache: self.cache.clone(), + cache_version: self.cache_version, + } + } +} + +impl CachedRawClient { + fn new( + cfg: Config, + env: Arc, + security_mgr: Arc, + should_reconnect_tx: broadcast::Sender, + ) -> Self { + let lame_stub = PdClientStub::new(Channel::lame(env.clone(), "0.0.0.0:0")); + let client = RawClient { + stub: lame_stub, + target_info: TargetInfo::new("0.0.0.0:0".to_string(), ""), + members: GetMembersResponse::new(), + }; + let context = ConnectContext { + cfg, + connector: PdConnector::new(env, security_mgr), + }; + let (tx, rx) = broadcast::channel(1); + let core = CachedRawClientCore { + context, + latest: Mutex::new(client.clone()), + version: AtomicU64::new(0), + on_reconnect_tx: tx, + }; + Self { + core: Arc::new(core), + should_reconnect_tx, + on_reconnect_rx: rx, + cache: client, + cache_version: 0, + } + } + + #[inline] + fn refresh_cache(&mut self) -> bool { + if self.cache_version < self.core.version.load(Ordering::Acquire) { + let latest = self.core.latest.lock().unwrap(); + self.cache = (*latest).clone(); + self.cache_version = self.core.version.load(Ordering::Relaxed); + true + } else { + false + } + } + + #[inline] + fn publish_cache(&mut self) { + let latest_version = { + let mut latest = self.core.latest.lock().unwrap(); + *latest = self.cache.clone(); + let _ = self.core.on_reconnect_tx.send(()); + self.core.version.fetch_add(1, Ordering::Relaxed) + 1 + }; + debug_assert!(self.cache_version < latest_version); + self.cache_version = latest_version; + } + + #[inline] + async fn wait_for_a_new_client( + rx: &mut broadcast::Receiver<()>, + 
current_version: u64, + latest_version: &AtomicU64, + ) -> bool { + let deadline = StdInstant::now() + request_timeout(); + loop { + if GLOBAL_TIMER_HANDLE + .timeout(Compat::new(Box::pin(rx.recv())), deadline) + .compat() + .await + .is_ok() + { + if current_version < latest_version.load(Ordering::Acquire) { + return true; + } + } else { + return false; + } + } + } + + /// Refreshes the local cache with latest client, then waits for the + /// connection to be ready. + /// The connection must be available if this function returns `Ok(())`. + async fn wait_for_ready(&mut self) -> Result<()> { + self.refresh_cache(); + if self.channel().check_connectivity_state(false) == ConnectivityState::GRPC_CHANNEL_READY { + return Ok(()); + } + select! { + r = self + .cache + .stub + .client + .channel() + .wait_for_connected(request_timeout()) + .fuse() => + { + if r { + return Ok(()); + } + } + r = Self::wait_for_a_new_client( + &mut self.on_reconnect_rx, + self.cache_version, + &self.core.version, + ).fuse() => { + if r { + assert!(self.refresh_cache()); + return Ok(()); + } + } + } + let _ = self.should_reconnect_tx.send(self.cache_version); + Err(box_err!( + "Connection unavailable {:?}", + self.channel().check_connectivity_state(false) + )) + } + + /// Makes the first connection. + async fn connect(&mut self) -> Result<()> { + self.cache = RawClient::connect(&self.core.context).await?; + self.publish_cache(); + Ok(()) + } + + /// Increases global version only when a new connection is established. + /// Might panic if `wait_for_ready` isn't called up-front. + async fn reconnect(&mut self) -> Result { + let force = (|| { + fail_point!("pd_client_force_reconnect", |_| true); + self.channel().check_connectivity_state(true) + == ConnectivityState::GRPC_CHANNEL_SHUTDOWN + })(); + if self + .cache + .maybe_reconnect(&self.core.context, force) + .await? 
+ { + self.publish_cache(); + return Ok(true); + } + Ok(false) + } + + #[inline] + fn check_resp(&mut self, resp: GrpcResult) -> GrpcResult { + if matches!( + resp, + Err(GrpcError::RpcFailure(_) | GrpcError::RemoteStopped | GrpcError::RpcFinished(_)) + ) { + let _ = self.should_reconnect_tx.send(self.cache_version); + } + resp + } + + /// Might panic if `wait_for_ready` isn't called up-front. + #[inline] + fn stub(&self) -> &PdClientStub { + &self.cache.stub + } + + /// Might panic if `wait_for_ready` isn't called up-front. + #[inline] + fn channel(&self) -> &Channel { + self.cache.stub.client.channel() + } + + /// Might panic if `wait_for_ready` isn't called up-front. + #[inline] + fn call_option(&self) -> CallOption { + self.cache.target_info.call_option() + } + + /// Might panic if `wait_for_ready` isn't called up-front. + #[inline] + fn cluster_id(&self) -> u64 { + self.cache.members.get_header().get_cluster_id() + } + + /// Might panic if `wait_for_ready` isn't called up-front. + #[inline] + fn header(&self) -> pdpb::RequestHeader { + let mut header = pdpb::RequestHeader::default(); + header.set_cluster_id(self.cluster_id()); + header + } + + /// Might panic if `wait_for_ready` isn't called up-front. 
+ #[cfg(feature = "testexport")] + #[inline] + fn leader(&self) -> pdpb::Member { + self.cache.members.get_leader().clone() + } + + #[inline] + fn initialized(&self) -> bool { + self.cache_version != 0 + } +} + +async fn reconnect_loop( + mut client: CachedRawClient, + cfg: Config, + mut should_reconnect: broadcast::Receiver, +) { + if let Err(e) = client.connect().await { + error!("failed to connect pd"; "err" => ?e); + return; + } + let backoff = (|| { + fail_point!("pd_client_v2_backoff", |s| { + use std::str::FromStr; + ReadableDuration::from_str(&s.unwrap()).unwrap().0 + }); + request_timeout() + })(); + let mut last_connect = StdInstant::now(); + loop { + if client.channel().wait_for_connected(request_timeout()).await { + let state = ConnectivityState::GRPC_CHANNEL_READY; + select! { + // Checks for leader change periodically. + _ = client + .channel() + .wait_for_state_change(state, cfg.update_interval.0) + .fuse() => {} + v = should_reconnect.recv().fuse() => { + match v { + Ok(v) if v < client.cache_version => continue, + Ok(_) => {} + Err(broadcast::error::RecvError::Lagged(_)) => continue, + Err(broadcast::error::RecvError::Closed) => break, + } + } + } + } + let target = last_connect + backoff; + if target > StdInstant::now() { + let _ = GLOBAL_TIMER_HANDLE.delay(target).compat().await; + } + last_connect = StdInstant::now(); + if let Err(e) = client.reconnect().await { + warn!("failed to reconnect pd"; "err" => ?e); + } + } +} + +#[derive(Clone)] +pub struct RpcClient { + pub raw_client: CachedRawClient, + feature_gate: FeatureGate, +} + +impl RpcClient { + pub fn new( + cfg: &Config, + shared_env: Option>, + security_mgr: Arc, + ) -> Result { + let env = shared_env.unwrap_or_else(|| { + Arc::new( + EnvBuilder::new() + .cq_count(CQ_COUNT) + .name_prefix(thd_name!(CLIENT_PREFIX)) + .build(), + ) + }); + + // Use broadcast channel for the lagging feature. 
+ let (tx, rx) = broadcast::channel(1); + let raw_client = CachedRawClient::new(cfg.clone(), env, security_mgr, tx); + raw_client + .stub() + .spawn(reconnect_loop(raw_client.clone(), cfg.clone(), rx)); + + Ok(Self { + raw_client, + feature_gate: Default::default(), + }) + } + + #[inline] + pub fn subscribe_reconnect(&self) -> broadcast::Receiver<()> { + self.raw_client.clone().on_reconnect_rx + } + + #[cfg(feature = "testexport")] + pub fn feature_gate(&self) -> &FeatureGate { + &self.feature_gate + } + + #[cfg(feature = "testexport")] + pub fn get_leader(&mut self) -> pdpb::Member { + block_on(self.raw_client.wait_for_ready()).unwrap(); + self.raw_client.leader() + } + + #[cfg(feature = "testexport")] + pub fn reconnect(&mut self) -> Result { + block_on(self.raw_client.wait_for_ready())?; + block_on(self.raw_client.reconnect()) + } + + #[cfg(feature = "testexport")] + pub fn reset_to_lame_client(&mut self) { + let env = self.raw_client.core.context.connector.env.clone(); + let lame = PdClientStub::new(Channel::lame(env, "0.0.0.0:0")); + self.raw_client.core.latest.lock().unwrap().stub = lame.clone(); + self.raw_client.cache.stub = lame; + } + + #[cfg(feature = "testexport")] + pub fn initialized(&self) -> bool { + self.raw_client.initialized() + } +} + +pub trait PdClient { + type ResponseChannel: Stream>; + + fn create_region_heartbeat_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result<( + mpsc::Sender, + Self::ResponseChannel, + )>; + + fn create_report_region_buckets_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result>; + + fn create_tso_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result<(mpsc::Sender, Self::ResponseChannel)>; + + fn fetch_cluster_id(&mut self) -> Result; + + fn load_global_config(&mut self, list: Vec) -> PdFuture>; + + fn watch_global_config( + &mut self, + ) -> Result>; + + fn bootstrap_cluster( + &mut self, + stores: metapb::Store, + region: metapb::Region, + ) -> Result>; + + fn 
is_cluster_bootstrapped(&mut self) -> Result; + + fn alloc_id(&mut self) -> Result; + + fn is_recovering_marked(&mut self) -> Result; + + fn put_store(&mut self, store: metapb::Store) -> Result>; + + fn get_store_and_stats(&mut self, store_id: u64) + -> PdFuture<(metapb::Store, pdpb::StoreStats)>; + + fn get_store(&mut self, store_id: u64) -> Result { + block_on(self.get_store_and_stats(store_id)).map(|r| r.0) + } + + fn get_all_stores(&mut self, exclude_tombstone: bool) -> Result>; + + fn get_cluster_config(&mut self) -> Result; + + fn get_region_and_leader( + &mut self, + key: &[u8], + ) -> PdFuture<(metapb::Region, Option)>; + + fn get_region(&mut self, key: &[u8]) -> Result { + block_on(self.get_region_and_leader(key)).map(|r| r.0) + } + + fn get_region_info(&mut self, key: &[u8]) -> Result { + block_on(self.get_region_and_leader(key)).map(|r| RegionInfo::new(r.0, r.1)) + } + + fn get_region_by_id(&mut self, region_id: u64) -> PdFuture>; + + fn get_region_leader_by_id( + &mut self, + region_id: u64, + ) -> PdFuture>; + + fn ask_split(&mut self, region: metapb::Region) -> PdFuture; + + fn ask_batch_split( + &mut self, + region: metapb::Region, + count: usize, + ) -> PdFuture; + + fn store_heartbeat( + &mut self, + stats: pdpb::StoreStats, + store_report: Option, + dr_autosync_status: Option, + ) -> PdFuture; + + fn report_batch_split(&mut self, regions: Vec) -> PdFuture<()>; + + fn scatter_region(&mut self, region: RegionInfo) -> Result<()>; + + fn get_gc_safe_point(&mut self) -> PdFuture; + + fn get_operator(&mut self, region_id: u64) -> Result; + + fn update_service_safe_point( + &mut self, + name: String, + safe_point: TimeStamp, + ttl: Duration, + ) -> PdFuture<()>; + + fn report_min_resolved_ts(&mut self, store_id: u64, min_resolved_ts: u64) -> PdFuture<()>; +} + +pub struct CachedDuplexResponse { + latest: tokio_mpsc::Receiver>, + cache: Option>, +} + +impl CachedDuplexResponse { + fn new() -> (tokio_mpsc::Sender>, Self) { + let (tx, rx) = 
tokio_mpsc::channel(1); + ( + tx, + Self { + latest: rx, + cache: None, + }, + ) + } +} + +impl Stream for CachedDuplexResponse { + type Item = Result; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + loop { + if let Some(ref mut receiver) = self.cache { + match Pin::new(receiver).poll_next(cx) { + Poll::Ready(Some(Ok(item))) => return Poll::Ready(Some(Ok(item))), + Poll::Pending => return Poll::Pending, + // If it's None or there's error, we need to update receiver. + _ => {} + } + } + + match Pin::new(&mut self.latest).poll_recv(cx) { + Poll::Ready(Some(receiver)) => self.cache = Some(receiver), + Poll::Ready(None) => return Poll::Ready(None), + Poll::Pending => return Poll::Pending, + } + } + } +} + +impl PdClient for RpcClient { + type ResponseChannel = CachedDuplexResponse; + + fn create_region_heartbeat_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result<( + mpsc::Sender, + Self::ResponseChannel, + )> { + // TODO: use bounded channel. + let (tx, rx) = mpsc::unbounded(wake_policy); + let (resp_tx, resp_rx) = CachedDuplexResponse::::new(); + let mut raw_client = self.raw_client.clone(); + let mut requests = Box::pin(rx).map(|r| { + fail::fail_point!("region_heartbeat_send_failed", |_| { + Err(grpcio::Error::RemoteStopped) + }); + Ok((r, WriteFlags::default())) + }); + self.raw_client.stub().spawn(async move { + loop { + if let Err(e) = raw_client.wait_for_ready().await { + warn!("failed to acquire client for RegionHeartbeat stream"; "err" => ?e); + continue; + } + let (mut hb_tx, hb_rx) = raw_client + .stub() + .region_heartbeat_opt(raw_client.call_option()) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "region_heartbeat", e) + }); + if resp_tx.send(hb_rx).await.is_err() { + break; + } + let res = hb_tx.send_all(&mut requests).await; + if res.is_ok() { + // requests are drained. 
+ break; + } else { + let res = raw_client.check_resp(res); + warn!("region heartbeat stream exited"; "res" => ?res); + } + let _ = hb_tx.close().await; + } + }); + Ok((tx, resp_rx)) + } + + fn create_report_region_buckets_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result> { + let (tx, rx) = mpsc::unbounded(wake_policy); + let mut raw_client = self.raw_client.clone(); + let mut requests = Box::pin(rx).map(|r| Ok((r, WriteFlags::default()))); + self.raw_client.stub().spawn(async move { + loop { + if let Err(e) = raw_client.wait_for_ready().await { + warn!("failed to acquire client for ReportRegionBuckets stream"; "err" => ?e); + continue; + } + let (mut bk_tx, bk_rx) = raw_client + .stub() + .report_buckets_opt(raw_client.call_option()) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "report_region_buckets", e) + }); + select! { + send_res = bk_tx.send_all(&mut requests).fuse() => { + if send_res.is_ok() { + // requests are drained. + break; + } else { + let res = raw_client.check_resp(send_res); + warn!("region buckets stream exited: {:?}", res); + } + } + recv_res = bk_rx.fuse() => { + let res = raw_client.check_resp(recv_res); + warn!("region buckets stream exited: {:?}", res); + } + } + let _ = bk_tx.close().await; + } + }); + Ok(tx) + } + + fn create_tso_stream( + &mut self, + wake_policy: mpsc::WakePolicy, + ) -> Result<(mpsc::Sender, Self::ResponseChannel)> { + let (tx, rx) = mpsc::unbounded(wake_policy); + let (resp_tx, resp_rx) = CachedDuplexResponse::::new(); + let mut raw_client = self.raw_client.clone(); + let mut requests = Box::pin(rx).map(|r| Ok((r, WriteFlags::default()))); + self.raw_client.stub().spawn(async move { + loop { + if let Err(e) = raw_client.wait_for_ready().await { + warn!("failed to acquire client for Tso stream"; "err" => ?e); + continue; + } + let (mut tso_tx, tso_rx) = raw_client + .stub() + .tso_opt(raw_client.call_option()) + .unwrap_or_else(|e| panic!("fail to request PD {} err {:?}", "tso", 
e)); + if resp_tx.send(tso_rx).await.is_err() { + break; + } + let res = tso_tx.send_all(&mut requests).await; + if res.is_ok() { + // requests are drained. + break; + } else { + let res = raw_client.check_resp(res); + warn!("tso exited"; "res" => ?res); + } + let _ = tso_tx.close().await; + } + }); + Ok((tx, resp_rx)) + } + + fn load_global_config(&mut self, list: Vec) -> PdFuture> { + use kvproto::pdpb::LoadGlobalConfigRequest; + let mut req = LoadGlobalConfigRequest::new(); + req.set_names(list.into()); + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + let fut = raw_client.stub().load_global_config_async(&req)?; + match fut.await { + Ok(grpc_response) => { + let mut res = HashMap::with_capacity(grpc_response.get_items().len()); + for c in grpc_response.get_items() { + if c.has_error() { + error!("failed to load global config with key {:?}", c.get_error()); + } else { + res.insert(c.get_name().to_owned(), c.get_value().to_owned()); + } + } + Ok(res) + } + Err(err) => Err(box_err!("{:?}", err)), + } + }) + } + + fn watch_global_config( + &mut self, + ) -> Result> { + let req = pdpb::WatchGlobalConfigRequest::default(); + block_on(self.raw_client.wait_for_ready())?; + Ok(self.raw_client.stub().watch_global_config(&req)?) 
+ } + + fn fetch_cluster_id(&mut self) -> Result { + if !self.raw_client.initialized() { + block_on(self.raw_client.wait_for_ready())?; + } + let id = self.raw_client.cluster_id(); + assert!(id > 0); + Ok(id) + } + + fn bootstrap_cluster( + &mut self, + stores: metapb::Store, + region: metapb::Region, + ) -> Result> { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["bootstrap_cluster"]) + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::BootstrapRequest::default(); + req.set_header(self.raw_client.header()); + req.set_store(stores); + req.set_region(region); + + let resp = self.raw_client.stub().bootstrap_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let mut resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(resp.replication_status.take()) + } + + fn is_cluster_bootstrapped(&mut self) -> Result { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["is_cluster_bootstrapped"]) + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::IsBootstrappedRequest::default(); + req.set_header(self.raw_client.header()); + + let resp = self.raw_client.stub().is_bootstrapped_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp.get_bootstrapped()) + } + + fn alloc_id(&mut self) -> Result { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["alloc_id"]) + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::AllocIdRequest::default(); + req.set_header(self.raw_client.header()); + + let resp = self.raw_client.stub().alloc_id_opt( + &req, + self.raw_client + .call_option() + .timeout(Duration::from_secs(10)), + ); + let resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + let id = 
resp.get_id(); + if id == 0 { + return Err(box_err!("pd alloc weird id 0")); + } + Ok(id) + } + + fn is_recovering_marked(&mut self) -> Result { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["is_recovering_marked"]) + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::IsSnapshotRecoveringRequest::default(); + req.set_header(self.raw_client.header()); + + let resp = self.raw_client.stub().is_snapshot_recovering_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp.get_marked()) + } + + fn put_store(&mut self, store: metapb::Store) -> Result> { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["put_store"]) + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::PutStoreRequest::default(); + req.set_header(self.raw_client.header()); + req.set_store(store); + + let resp = self.raw_client.stub().put_store_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let mut resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp.replication_status.take()) + } + + fn get_store_and_stats( + &mut self, + store_id: u64, + ) -> PdFuture<(metapb::Store, pdpb::StoreStats)> { + let timer = Instant::now_coarse(); + + let mut req = pdpb::GetStoreRequest::default(); + req.set_store_id(store_id); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .get_store_async_opt(&req, raw_client.call_option().timeout(request_timeout())) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_store_and_stats", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["get_store_and_stats"]) + 
.observe(duration_to_sec(timer.saturating_elapsed())); + let mut resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + let store = resp.take_store(); + if store.get_state() != metapb::StoreState::Tombstone { + Ok((store, resp.take_stats())) + } else { + Err(Error::StoreTombstone(format!("{:?}", store))) + } + }) + } + + fn get_all_stores(&mut self, exclude_tombstone: bool) -> Result> { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["get_all_stores"]) + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::GetAllStoresRequest::default(); + req.set_header(self.raw_client.header()); + req.set_exclude_tombstone_stores(exclude_tombstone); + + let resp = self.raw_client.stub().get_all_stores_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let mut resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp.take_stores().into()) + } + + fn get_cluster_config(&mut self) -> Result { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["get_cluster_config"]) + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::GetClusterConfigRequest::default(); + req.set_header(self.raw_client.header()); + + let resp = self.raw_client.stub().get_cluster_config_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let mut resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp.take_cluster()) + } + + fn get_region_and_leader( + &mut self, + key: &[u8], + ) -> PdFuture<(metapb::Region, Option)> { + let timer = Instant::now_coarse(); + + let mut req = pdpb::GetRegionRequest::default(); + req.set_region_key(key.to_vec()); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + 
.get_region_async_opt(&req, raw_client.call_option().timeout(request_timeout())) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_region_async_opt", e) + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["get_region"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let mut resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + let region = if resp.has_region() { + resp.take_region() + } else { + return Err(Error::RegionNotFound(req.region_key)); + }; + let leader = if resp.has_leader() { + Some(resp.take_leader()) + } else { + None + }; + Ok((region, leader)) + }) + } + + fn get_region_by_id(&mut self, region_id: u64) -> PdFuture> { + let timer = Instant::now_coarse(); + + let mut req = pdpb::GetRegionByIdRequest::default(); + req.set_region_id(region_id); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .get_region_by_id_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_region_by_id", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["get_region_by_id"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let mut resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + if resp.has_region() { + Ok(Some(resp.take_region())) + } else { + Ok(None) + } + }) + } + + fn get_region_leader_by_id( + &mut self, + region_id: u64, + ) -> PdFuture> { + let timer = Instant::now_coarse(); + + let mut req = pdpb::GetRegionByIdRequest::default(); + req.set_region_id(region_id); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .get_region_by_id_async_opt( + &req, + 
raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!( + "fail to request PD {} err {:?}", + "get_region_leader_by_id", e + ); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["get_region_leader_by_id"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let mut resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + if resp.has_region() && resp.has_leader() { + Ok(Some((resp.take_region(), resp.take_leader()))) + } else { + Ok(None) + } + }) + } + + fn ask_split(&mut self, region: metapb::Region) -> PdFuture { + let timer = Instant::now_coarse(); + + let mut req = pdpb::AskSplitRequest::default(); + req.set_region(region); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .ask_split_async_opt(&req, raw_client.call_option().timeout(request_timeout())) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "ask_split", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["ask_split"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(resp) + }) + } + + fn ask_batch_split( + &mut self, + region: metapb::Region, + count: usize, + ) -> PdFuture { + let timer = Instant::now_coarse(); + + let mut req = pdpb::AskBatchSplitRequest::default(); + req.set_region(region); + req.set_split_count(count as u32); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .ask_batch_split_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "ask_batch_split", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + 
.with_label_values(&["ask_batch_split"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(resp) + }) + } + + fn store_heartbeat( + &mut self, + mut stats: pdpb::StoreStats, + store_report: Option, + dr_autosync_status: Option, + ) -> PdFuture { + let timer = Instant::now_coarse(); + + let mut req = pdpb::StoreHeartbeatRequest::default(); + stats + .mut_interval() + .set_end_timestamp(UnixSecs::now().into_inner()); + req.set_stats(stats); + if let Some(report) = store_report { + req.set_store_report(report); + } + if let Some(status) = dr_autosync_status { + req.set_dr_autosync_status(status); + } + + let mut raw_client = self.raw_client.clone(); + let feature_gate = self.feature_gate.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .store_heartbeat_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "store_heartbeat", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["store_heartbeat"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + match feature_gate.set_version(resp.get_cluster_version()) { + Err(_) => warn!("invalid cluster version: {}", resp.get_cluster_version()), + Ok(true) => info!("set cluster version to {}", resp.get_cluster_version()), + _ => {} + }; + Ok(resp) + }) + } + + fn report_batch_split(&mut self, regions: Vec) -> PdFuture<()> { + let timer = Instant::now_coarse(); + + let mut req = pdpb::ReportBatchSplitRequest::default(); + req.set_regions(regions.into()); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + 
.report_batch_split_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "report_batch_split", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["report_batch_split"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(()) + }) + } + + fn scatter_region(&mut self, mut region: RegionInfo) -> Result<()> { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["scatter_region"]) + .start_coarse_timer(); + + let mut req = pdpb::ScatterRegionRequest::default(); + req.set_region_id(region.get_id()); + if let Some(leader) = region.leader.take() { + req.set_leader(leader); + } + req.set_region(region.region); + + block_on(self.raw_client.wait_for_ready())?; + req.set_header(self.raw_client.header()); + let resp = self.raw_client.stub().scatter_region_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header()) + } + + fn get_gc_safe_point(&mut self) -> PdFuture { + let timer = Instant::now_coarse(); + + let mut req = pdpb::GetGcSafePointRequest::default(); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .get_gc_safe_point_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "get_gc_saft_point", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["get_gc_saft_point"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(resp.get_safe_point()) + }) + } + + fn get_operator(&mut self, region_id: u64) -> Result { + let _timer = 
PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["get_operator"]) + .start_coarse_timer(); + + block_on(self.raw_client.wait_for_ready())?; + + let mut req = pdpb::GetOperatorRequest::default(); + req.set_header(self.raw_client.header()); + req.set_region_id(region_id); + + let resp = self.raw_client.stub().get_operator_opt( + &req, + self.raw_client.call_option().timeout(request_timeout()), + ); + let resp = self.raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + + Ok(resp) + } + + fn update_service_safe_point( + &mut self, + name: String, + safe_point: TimeStamp, + ttl: Duration, + ) -> PdFuture<()> { + let timer = Instant::now_coarse(); + let mut req = pdpb::UpdateServiceGcSafePointRequest::default(); + req.set_service_id(name.into()); + req.set_ttl(ttl.as_secs() as _); + req.set_safe_point(safe_point.into_inner()); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + .update_service_gc_safe_point_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!( + "fail to request PD {} err {:?}", + "update_service_safe_point", e + ); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["update_service_safe_point"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(()) + }) + } + + fn report_min_resolved_ts(&mut self, store_id: u64, min_resolved_ts: u64) -> PdFuture<()> { + let timer = Instant::now_coarse(); + + let mut req = pdpb::ReportMinResolvedTsRequest::default(); + req.set_store_id(store_id); + req.set_min_resolved_ts(min_resolved_ts); + + let mut raw_client = self.raw_client.clone(); + Box::pin(async move { + raw_client.wait_for_ready().await?; + req.set_header(raw_client.header()); + let resp = raw_client + .stub() + 
.report_min_resolved_ts_async_opt( + &req, + raw_client.call_option().timeout(request_timeout()), + ) + .unwrap_or_else(|e| { + panic!("fail to request PD {} err {:?}", "min_resolved_ts", e); + }) + .await; + PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["min_resolved_ts"]) + .observe(duration_to_sec(timer.saturating_elapsed())); + let resp = raw_client.check_resp(resp)?; + check_resp_header(resp.get_header())?; + Ok(()) + }) + } +} diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index e4350e3d396..8674130c799 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -1,8 +1,12 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. + +#![feature(let_chains)] + #[allow(unused_extern_crates)] extern crate tikv_alloc; mod client; +mod client_v2; mod feature_gate; pub mod metrics; mod tso; @@ -23,7 +27,8 @@ use tikv_util::time::{Instant, UnixSecs}; use txn_types::TimeStamp; pub use self::{ - client::{DummyPdClient, RpcClient}, + client::RpcClient, + client_v2::{PdClient as PdClientV2, RpcClient as RpcClientV2}, config::Config, errors::{Error, Result}, feature_gate::{Feature, FeatureGate}, diff --git a/components/pd_client/src/tso.rs b/components/pd_client/src/tso.rs index a19d7af8f06..feec5061a8c 100644 --- a/components/pd_client/src/tso.rs +++ b/components/pd_client/src/tso.rs @@ -180,40 +180,41 @@ impl<'a> Stream for TsoRequestStream<'a> { fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let pending_requests = self.pending_requests.clone(); let mut pending_requests = pending_requests.borrow_mut(); - let mut requests = Vec::new(); - while requests.len() < MAX_BATCH_SIZE && pending_requests.len() < MAX_PENDING_COUNT { - match self.request_rx.poll_recv(cx) { - Poll::Ready(Some(sender)) => { - requests.push(sender); + if pending_requests.len() < MAX_PENDING_COUNT { + let mut requests = Vec::new(); + while requests.len() < MAX_BATCH_SIZE { + match 
self.request_rx.poll_recv(cx) { + Poll::Ready(Some(sender)) => { + requests.push(sender); + } + Poll::Ready(None) if requests.is_empty() => { + return Poll::Ready(None); + } + _ => break, } - Poll::Ready(None) if requests.is_empty() => { - return Poll::Ready(None); - } - _ => break, + } + if !requests.is_empty() { + let mut req = TsoRequest::default(); + req.mut_header().cluster_id = self.cluster_id; + req.count = requests.iter().map(|r| r.count).sum(); + + let request_group = RequestGroup { + tso_request: req.clone(), + requests, + }; + pending_requests.push_back(request_group); + PD_PENDING_TSO_REQUEST_GAUGE.set(pending_requests.len() as i64); + + let write_flags = WriteFlags::default().buffer_hint(false); + return Poll::Ready(Some((req, write_flags))); } } - if !requests.is_empty() { - let mut req = TsoRequest::default(); - req.mut_header().cluster_id = self.cluster_id; - req.count = requests.iter().map(|r| r.count).sum(); - - let request_group = RequestGroup { - tso_request: req.clone(), - requests, - }; - pending_requests.push_back(request_group); - PD_PENDING_TSO_REQUEST_GAUGE.set(pending_requests.len() as i64); - - let write_flags = WriteFlags::default().buffer_hint(false); - Poll::Ready(Some((req, write_flags))) - } else { - // Set the waker to the context, then the stream can be waked up after the - // pending queue is no longer full. - self.self_waker.register(cx.waker()); - Poll::Pending - } + // Set the waker to the context, then the stream can be waked up after the + // pending queue is no longer full. 
+ self.self_waker.register(cx.waker()); + Poll::Pending } } diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index 2aa74176627..72c8cc16b04 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -51,13 +51,14 @@ const MAX_RETRY_DURATION: Duration = Duration::from_secs(10); const GLOBAL_RECONNECT_INTERVAL: Duration = Duration::from_millis(100); // 0.1s pub const REQUEST_RECONNECT_INTERVAL: Duration = Duration::from_secs(1); // 1s +#[derive(Clone)] pub struct TargetInfo { target_url: String, via: String, } impl TargetInfo { - fn new(target_url: String, via: &str) -> TargetInfo { + pub(crate) fn new(target_url: String, via: &str) -> TargetInfo { TargetInfo { target_url, via: trim_http_prefix(via).to_string(), @@ -340,7 +341,13 @@ impl Client { async move { let direct_connected = self.inner.rl().target_info().direct_connected(); connector - .reconnect_pd(members, direct_connected, force, self.enable_forwarding) + .reconnect_pd( + members, + direct_connected, + force, + self.enable_forwarding, + true, + ) .await } }; @@ -383,7 +390,7 @@ impl Client { fail_point!("pd_client_reconnect", |_| Ok(())); - self.update_client(client, target_info, members, tso); + self.update_client(client, target_info, members, tso.unwrap()); info!("trying to update PD client done"; "spend" => ?start.saturating_elapsed()); Ok(()) } @@ -521,11 +528,13 @@ pub type StubTuple = ( PdClientStub, TargetInfo, GetMembersResponse, - TimestampOracle, + // Only used by RpcClient, not by RpcClientV2. 
+ Option, ); +#[derive(Clone)] pub struct PdConnector { - env: Arc, + pub(crate) env: Arc, security_mgr: Arc, } @@ -534,7 +543,7 @@ impl PdConnector { PdConnector { env, security_mgr } } - pub async fn validate_endpoints(&self, cfg: &Config) -> Result { + pub async fn validate_endpoints(&self, cfg: &Config, build_tso: bool) -> Result { let len = cfg.endpoints.len(); let mut endpoints_set = HashSet::with_capacity_and_hasher(len, Default::default()); let mut members = None; @@ -575,7 +584,7 @@ impl PdConnector { match members { Some(members) => { let res = self - .reconnect_pd(members, true, true, cfg.enable_forwarding) + .reconnect_pd(members, true, true, cfg.enable_forwarding, build_tso) .await? .unwrap(); info!("all PD endpoints are consistent"; "endpoints" => ?cfg.endpoints); @@ -593,7 +602,9 @@ impl PdConnector { .max_send_message_len(-1) .max_receive_message_len(-1) .keepalive_time(Duration::from_secs(10)) - .keepalive_timeout(Duration::from_secs(3)); + .keepalive_timeout(Duration::from_secs(3)) + .max_reconnect_backoff(Duration::from_secs(5)) + .initial_reconnect_backoff(Duration::from_secs(1)); self.security_mgr.connect(cb, addr_trim) }; fail_point!("cluster_id_is_not_ready", |_| { @@ -602,7 +613,7 @@ impl PdConnector { GetMembersResponse::default(), )) }); - let client = PdClientStub::new(channel); + let client = PdClientStub::new(channel.clone()); let option = CallOption::default().timeout(Duration::from_secs(REQUEST_TIMEOUT)); let response = client .get_members_async_opt(&GetMembersRequest::default(), option) @@ -680,12 +691,13 @@ impl PdConnector { // not empty and it can connect the leader now which represents the network // partition problem to leader may be recovered 3. 
the member information of // PD has been changed - async fn reconnect_pd( + pub async fn reconnect_pd( &self, members_resp: GetMembersResponse, direct_connected: bool, force: bool, enable_forwarding: bool, + build_tso: bool, ) -> Result> { let resp = self.load_members(&members_resp).await?; let leader = resp.get_leader(); @@ -699,11 +711,15 @@ impl PdConnector { match res { Some((client, target_url)) => { let info = TargetInfo::new(target_url, ""); - let tso = TimestampOracle::new( - resp.get_header().get_cluster_id(), - &client, - info.call_option(), - )?; + let tso = if build_tso { + Some(TimestampOracle::new( + resp.get_header().get_cluster_id(), + &client, + info.call_option(), + )?) + } else { + None + }; return Ok(Some((client, info, resp, tso))); } None => { @@ -714,11 +730,15 @@ impl PdConnector { } if enable_forwarding && has_network_error { if let Ok(Some((client, info))) = self.try_forward(members, leader).await { - let tso = TimestampOracle::new( - resp.get_header().get_cluster_id(), - &client, - info.call_option(), - )?; + let tso = if build_tso { + Some(TimestampOracle::new( + resp.get_header().get_cluster_id(), + &client, + info.call_option(), + )?) 
+ } else { + None + }; return Ok(Some((client, info, resp, tso))); } } @@ -774,7 +794,9 @@ impl PdConnector { loop { let (res, has_network_err) = self.connect_member(leader).await?; match res { - Some((client, ep, _)) => return Ok((Some((client, ep)), has_network_err)), + Some((client, ep, _)) => { + return Ok((Some((client, ep)), has_network_err)); + } None => { if has_network_err && retry_times > 0 @@ -854,6 +876,7 @@ pub fn check_resp_header(header: &ResponseHeader) -> Result<()> { ErrorType::Ok => Ok(()), ErrorType::DuplicatedEntry | ErrorType::EntryNotFound => Err(box_err!(err.get_message())), ErrorType::Unknown => Err(box_err!(err.get_message())), + ErrorType::InvalidValue => Err(box_err!(err.get_message())), } } diff --git a/components/raft_log_engine/Cargo.toml b/components/raft_log_engine/Cargo.toml index 2b9d2de73ff..0ee185fd365 100644 --- a/components/raft_log_engine/Cargo.toml +++ b/components/raft_log_engine/Cargo.toml @@ -8,7 +8,7 @@ edition = "2018" encryption = { workspace = true } engine_traits = { workspace = true } file_system = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.4.0" num_cpus = "1" online_config = { workspace = true } diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 1da553cb22e..a376adc25b7 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -305,13 +305,17 @@ impl RaftLogEngine { ))) } + pub fn path(&self) -> &str { + self.0.path() + } + /// If path is not an empty directory, we say db exists. 
pub fn exists(path: &str) -> bool { let path = Path::new(path); if !path.exists() || !path.is_dir() { return false; } - fs::read_dir(&path).unwrap().next().is_some() + fs::read_dir(path).unwrap().next().is_some() } pub fn raft_groups(&self) -> Vec { @@ -615,6 +619,10 @@ impl RaftEngine for RaftLogEngine { Ok(self.0.get_used_size() as u64) } + fn get_engine_path(&self) -> &str { + self.path() + } + fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> where F: FnMut(u64) -> std::result::Result<(), E>, diff --git a/components/raft_log_engine/src/lib.rs b/components/raft_log_engine/src/lib.rs index 6156771afa8..8eda4e5ae24 100644 --- a/components/raft_log_engine/src/lib.rs +++ b/components/raft_log_engine/src/lib.rs @@ -16,7 +16,6 @@ //! Please read the engine_trait crate docs before hacking. #![cfg_attr(test, feature(test))] -#![feature(generic_associated_types)] #[macro_use] extern crate tikv_util; diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 8bb91b40bb9..46ed20f8d10 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -30,27 +30,33 @@ cloud-azure = ["raftstore/cloud-azure"] [dependencies] batch-system = { workspace = true } +causal_ts = { workspace = true } collections = { workspace = true } +concurrency_manager = { workspace = true } crossbeam = "0.8" engine_traits = { workspace = true } error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } +fs2 = "0.4" futures = { version = "0.3", features = ["compat"] } keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } log_wrappers = { workspace = true } pd_client = { workspace = true } +prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft-proto = { version = "0.7.0" } raftstore = { 
workspace = true } +resource_metering = { workspace = true } slog = "2.3" smallvec = "1.4" tikv_util = { workspace = true } time = "0.1" tracker = { workspace = true } txn_types = { workspace = true } +yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } [dev-dependencies] engine_test = { workspace = true } diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index b387300b40e..199e8cafbd8 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -2,14 +2,20 @@ use std::{ ops::{Deref, DerefMut}, - sync::{Arc, Mutex}, + path::Path, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, Mutex, + }, time::Duration, }; use batch_system::{ BasicMailbox, BatchRouter, BatchSystem, HandleResult, HandlerBuilder, PollHandler, }; +use causal_ts::CausalTsProviderImpl; use collections::HashMap; +use concurrency_manager::ConcurrencyManager; use crossbeam::channel::{Sender, TrySendError}; use engine_traits::{Engines, KvEngine, RaftEngine, TabletFactory}; use file_system::{set_io_type, IoType}; @@ -18,10 +24,11 @@ use kvproto::{ metapb::Store, raft_serverpb::{PeerState, RaftMessage}, }; +use pd_client::PdClient; use raft::INVALID_ID; use raftstore::store::{ - fsm::store::PeerTickBatch, local_metrics::RaftMetrics, Config, RaftlogFetchRunner, - RaftlogFetchTask, StoreWriters, Transport, WriteSenders, + fsm::store::PeerTickBatch, local_metrics::RaftMetrics, Config, ReadRunner, ReadTask, + StoreWriters, TabletSnapManager, Transport, WriteSenders, }; use slog::Logger; use tikv_util::{ @@ -42,6 +49,7 @@ use crate::{ fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, StoreMeta}, raft::Storage, router::{PeerMsg, PeerTick, StoreMsg}, + worker::{PdRunner, PdTask}, Error, Result, }; @@ -68,7 +76,9 @@ pub struct StoreContext { pub engine: ER, pub tablet_factory: Arc>, pub apply_pool: FuturePool, - pub log_fetch_scheduler: Scheduler, + pub read_scheduler: 
Scheduler>, + pub snap_mgr: TabletSnapManager, + pub pd_scheduler: Scheduler, } /// A [`PollHandler`] that handles updates of [`StoreFsm`]s and [`PeerFsm`]s. @@ -215,11 +225,13 @@ struct StorePollerBuilder { tablet_factory: Arc>, trans: T, router: StoreRouter, - log_fetch_scheduler: Scheduler, + read_scheduler: Scheduler>, + pd_scheduler: Scheduler, write_senders: WriteSenders, apply_pool: FuturePool, logger: Logger, store_meta: Arc>>, + snap_mgr: TabletSnapManager, } impl StorePollerBuilder { @@ -230,10 +242,12 @@ impl StorePollerBuilder { tablet_factory: Arc>, trans: T, router: StoreRouter, - log_fetch_scheduler: Scheduler, + read_scheduler: Scheduler>, + pd_scheduler: Scheduler, store_writers: &mut StoreWriters, logger: Logger, store_meta: Arc>>, + snap_mgr: TabletSnapManager, ) -> Self { let pool_size = cfg.value().apply_batch_system.pool_size; let max_pool_size = std::cmp::max( @@ -252,11 +266,13 @@ impl StorePollerBuilder { tablet_factory, trans, router, - log_fetch_scheduler, + read_scheduler, + pd_scheduler, apply_pool, logger, write_senders: store_writers.senders(), store_meta, + snap_mgr, } } @@ -264,6 +280,7 @@ impl StorePollerBuilder { fn init(&self) -> Result>> { let mut regions = HashMap::default(); let cfg = self.cfg.value(); + let mut meta = self.store_meta.lock().unwrap(); self.engine .for_each_raft_group::(&mut |region_id| { assert_ne!(region_id, INVALID_ID); @@ -271,14 +288,17 @@ impl StorePollerBuilder { region_id, self.store_id, self.engine.clone(), - self.log_fetch_scheduler.clone(), + self.read_scheduler.clone(), &self.logger, )? 
{ Some(p) => p, None => return Ok(()), }; - let pair = PeerFsm::new(&cfg, &*self.tablet_factory, storage)?; - let prev = regions.insert(region_id, pair); + let (sender, peer_fsm) = PeerFsm::new(&cfg, &*self.tablet_factory, storage)?; + meta.region_read_progress + .insert(region_id, peer_fsm.as_ref().peer().read_progress().clone()); + + let prev = regions.insert(region_id, (sender, peer_fsm)); if let Some((_, p)) = prev { return Err(box_err!( "duplicate region {:?} vs {:?}", @@ -324,7 +344,9 @@ where engine: self.engine.clone(), tablet_factory: self.tablet_factory.clone(), apply_pool: self.apply_pool.clone(), - log_fetch_scheduler: self.log_fetch_scheduler.clone(), + read_scheduler: self.read_scheduler.clone(), + snap_mgr: self.snap_mgr.clone(), + pd_scheduler: self.pd_scheduler.clone(), }; let cfg_tracker = self.cfg.clone().tracker("raftstore".to_string()); StorePoller::new(poll_ctx, cfg_tracker) @@ -335,14 +357,16 @@ where /// raftstore. struct Workers { /// Worker for fetching raft logs asynchronously - log_fetch_worker: Worker, + async_read_worker: Worker, + pd_worker: Worker, store_writers: StoreWriters, } impl Default for Workers { fn default() -> Self { Self { - log_fetch_worker: Worker::new("raftlog-fetch-worker"), + async_read_worker: Worker::new("async-read-worker"), + pd_worker: Worker::new("pd-worker"), store_writers: StoreWriters::default(), } } @@ -353,29 +377,61 @@ pub struct StoreSystem { system: BatchSystem, StoreFsm>, workers: Option>, logger: Logger, + shutdown: Arc, } impl StoreSystem { - pub fn start( + pub fn start( &mut self, store_id: u64, cfg: Arc>, raft_engine: ER, tablet_factory: Arc>, trans: T, + pd_client: Arc, router: &StoreRouter, store_meta: Arc>>, + snap_mgr: TabletSnapManager, + concurrency_manager: ConcurrencyManager, + causal_ts_provider: Option>, // used for rawkv apiv2 ) -> Result<()> where T: Transport + 'static, + C: PdClient + 'static, { + let sync_router = Mutex::new(router.clone()); + pd_client.handle_reconnect(move || { + 
sync_router + .lock() + .unwrap() + .broadcast_normal(|| PeerMsg::Tick(PeerTick::PdHeartbeat)); + }); + let mut workers = Workers::default(); workers .store_writers .spawn(store_id, raft_engine.clone(), None, router, &trans, &cfg)?; - let log_fetch_scheduler = workers.log_fetch_worker.start( - "raftlog-fetch-worker", - RaftlogFetchRunner::new(router.clone(), raft_engine.clone()), + + let mut read_runner = ReadRunner::new(router.clone(), raft_engine.clone()); + read_runner.set_snap_mgr(snap_mgr.clone()); + let read_scheduler = workers + .async_read_worker + .start("async-read-worker", read_runner); + + let pd_scheduler = workers.pd_worker.start( + "pd-worker", + PdRunner::new( + store_id, + pd_client, + raft_engine.clone(), + tablet_factory.clone(), + router.clone(), + workers.pd_worker.remote(), + concurrency_manager, + causal_ts_provider, + self.logger.clone(), + self.shutdown.clone(), + ), ); let mut builder = StorePollerBuilder::new( @@ -385,10 +441,12 @@ impl StoreSystem { tablet_factory, trans, router.clone(), - log_fetch_scheduler, + read_scheduler, + pd_scheduler, &mut workers.store_writers, self.logger.clone(), store_meta.clone(), + snap_mgr, ); self.workers = Some(workers); let peers = builder.init()?; @@ -425,6 +483,8 @@ impl StoreSystem { } pub fn shutdown(&mut self) { + self.shutdown.store(true, Ordering::Relaxed); + if self.workers.is_none() { return; } @@ -435,7 +495,8 @@ impl StoreSystem { self.system.shutdown(); workers.store_writers.shutdown(); - workers.log_fetch_worker.stop(); + workers.async_read_worker.stop(); + workers.pd_worker.stop(); } } @@ -457,7 +518,7 @@ impl StoreRouter { ) -> std::result::Result<(), TrySendError>> { let id = msg.get_region_id(); let peer_msg = PeerMsg::RaftMessage(msg); - let store_msg = match self.try_send(id, peer_msg) { + let store_msg = match self.router.try_send(id, peer_msg) { Either::Left(Ok(())) => return Ok(()), Either::Left(Err(TrySendError::Full(PeerMsg::RaftMessage(m)))) => { return 
Err(TrySendError::Full(m)); @@ -468,7 +529,7 @@ impl StoreRouter { Either::Right(PeerMsg::RaftMessage(m)) => StoreMsg::RaftMessage(m), _ => unreachable!(), }; - match self.send_control(store_msg) { + match self.router.send_control(store_msg) { Ok(()) => Ok(()), Err(TrySendError::Full(StoreMsg::RaftMessage(m))) => Err(TrySendError::Full(m)), Err(TrySendError::Disconnected(StoreMsg::RaftMessage(m))) => { @@ -512,6 +573,7 @@ where system, workers: None, logger: logger.clone(), + shutdown: Arc::new(AtomicBool::new(false)), }; (StoreRouter { router, logger }, system) } diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index 4a1e05b8f75..b8faf589760 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -11,11 +11,15 @@ use std::{ use batch_system::{Fsm, FsmScheduler, Mailbox}; use crossbeam::channel::TryRecvError; -use engine_traits::KvEngine; +use engine_traits::{KvEngine, TabletFactory}; use futures::{Future, StreamExt}; use kvproto::{metapb, raft_serverpb::RegionLocalState}; +use raftstore::store::ReadTask; use slog::Logger; -use tikv_util::mpsc::future::{self, Receiver, Sender, WakePolicy}; +use tikv_util::{ + mpsc::future::{self, Receiver, Sender, WakePolicy}, + worker::Scheduler, +}; use crate::{ raft::Apply, @@ -61,10 +65,20 @@ impl ApplyFsm { region_state: RegionLocalState, res_reporter: R, remote_tablet: CachedTablet, + tablet_factory: Arc>, + read_scheduler: Scheduler>, logger: Logger, ) -> (ApplyScheduler, Self) { let (tx, rx) = future::unbounded(WakePolicy::Immediately); - let apply = Apply::new(peer, region_state, res_reporter, remote_tablet, logger); + let apply = Apply::new( + peer, + region_state, + res_reporter, + remote_tablet, + tablet_factory, + read_scheduler, + logger, + ); ( ApplyScheduler { sender: tx }, Self { @@ -86,6 +100,7 @@ impl ApplyFsm { match task { // TODO: flush by buffer size. 
ApplyTask::CommittedEntries(ce) => self.apply.apply_committed_entries(ce).await, + ApplyTask::Snapshot(snap_task) => self.apply.schedule_gen_snapshot(snap_task), } // TODO: yield after some time. diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index a1beedef968..cd93463a524 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -40,7 +40,7 @@ impl PeerFsm { pub fn new( cfg: &Config, tablet_factory: &dyn TabletFactory, - storage: Storage, + storage: Storage, ) -> Result> { let peer = Peer::new(cfg, tablet_factory, storage)?; info!(peer.logger, "create peer"); @@ -187,15 +187,15 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.store_ctx .raft_metrics .propose_wait_time - .observe(duration_to_sec(send_time.saturating_elapsed()) as f64); + .observe(duration_to_sec(send_time.saturating_elapsed())); } fn on_tick(&mut self, tick: PeerTick) { match tick { PeerTick::Raft => self.on_raft_tick(), + PeerTick::PdHeartbeat => self.on_pd_heartbeat(), PeerTick::RaftLogGc => unimplemented!(), PeerTick::SplitRegionCheck => unimplemented!(), - PeerTick::PdHeartbeat => unimplemented!(), PeerTick::CheckMerge => unimplemented!(), PeerTick::CheckPeerStaleState => unimplemented!(), PeerTick::EntryCacheEvict => unimplemented!(), @@ -220,6 +220,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } PeerMsg::Tick(tick) => self.on_tick(tick), PeerMsg::ApplyRes(res) => self.fsm.peer.on_apply_res(self.store_ctx, res), + PeerMsg::SplitInit(msg) => self.fsm.peer.on_split_init(self.store_ctx, msg), PeerMsg::Start => self.on_start(), PeerMsg::Noop => unimplemented!(), PeerMsg::Persisted { @@ -229,8 +230,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, .fsm .peer_mut() .on_persisted(self.store_ctx, peer_id, ready_number), - PeerMsg::FetchedLogs(fetched_logs) => { - 
self.fsm.peer_mut().on_fetched_logs(fetched_logs) + PeerMsg::LogsFetched(fetched_logs) => { + self.fsm.peer_mut().on_logs_fetched(fetched_logs) + } + PeerMsg::SnapshotGenerated(snap_res) => { + self.fsm.peer_mut().on_snapshot_generated(snap_res) } PeerMsg::QueryDebugInfo(ch) => self.fsm.peer_mut().on_query_debug_info(ch), #[cfg(feature = "testexport")] @@ -238,6 +242,6 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } } // TODO: instead of propose pending commands immediately, we should use timeout. - self.fsm.peer.propose_pending_command(self.store_ctx); + self.fsm.peer.propose_pending_writes(self.store_ctx); } } diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index 3be571bdfbc..546ec95a604 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -1,16 +1,26 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::time::SystemTime; +use std::time::{Duration, SystemTime}; use batch_system::Fsm; use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; -use raftstore::store::{Config, ReadDelegate}; -use slog::{o, Logger}; -use tikv_util::mpsc::{self, LooseBoundedSender, Receiver}; +use futures::{compat::Future01CompatExt, FutureExt}; +use kvproto::{metapb::Region, raft_serverpb::RaftMessage}; +use raftstore::{ + coprocessor::RegionChangeReason, + store::{Config, ReadDelegate, RegionReadProgressRegistry}, +}; +use slog::{info, o, Logger}; +use tikv_util::{ + future::poll_future_notify, + is_zero_duration, + mpsc::{self, LooseBoundedSender, Receiver}, +}; use crate::{ batch::StoreContext, + raft::Peer, router::{StoreMsg, StoreTick}, tablet::CachedTablet, }; @@ -24,6 +34,8 @@ where pub readers: HashMap, /// region_id -> tablet cache pub tablet_caches: HashMap>, + /// region_id -> `RegionReadProgress` + pub region_read_progress: RegionReadProgressRegistry, } impl StoreMeta @@ -35,6 +47,7 @@ where store_id: 
None, readers: HashMap::default(), tablet_caches: HashMap::default(), + region_read_progress: RegionReadProgressRegistry::new(), } } } @@ -74,7 +87,7 @@ impl Store { } pub struct StoreFsm { - store: Store, + pub store: Store, receiver: Receiver, } @@ -118,8 +131,8 @@ impl Fsm for StoreFsm { } pub struct StoreFsmDelegate<'a, EK: KvEngine, ER: RaftEngine, T> { - fsm: &'a mut StoreFsm, - store_ctx: &'a mut StoreContext, + pub fsm: &'a mut StoreFsm, + pub store_ctx: &'a mut StoreContext, } impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { @@ -137,10 +150,33 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { .duration_since(SystemTime::UNIX_EPOCH) .map_or(0, |d| d.as_secs()), ); + + self.on_pd_store_heartbeat(); + } + + pub fn schedule_tick(&mut self, tick: StoreTick, timeout: Duration) { + if !is_zero_duration(&timeout) { + let mb = self.store_ctx.router.control_mailbox(); + let logger = self.fsm.store.logger().clone(); + let delay = self.store_ctx.timer.delay(timeout).compat().map(move |_| { + if let Err(e) = mb.force_send(StoreMsg::Tick(tick)) { + info!( + logger, + "failed to schedule store tick, are we shutting down?"; + "tick" => ?tick, + "err" => ?e + ); + } + }); + poll_future_notify(delay); + } } fn on_tick(&mut self, tick: StoreTick) { - unimplemented!() + match tick { + StoreTick::PdStoreHeartbeat => self.on_pd_store_heartbeat(), + _ => unimplemented!(), + } } pub fn handle_msgs(&mut self, store_msg_buf: &mut Vec) { @@ -149,6 +185,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { StoreMsg::Start => self.on_start(), StoreMsg::Tick(tick) => self.on_tick(tick), StoreMsg::RaftMessage(msg) => self.fsm.store.on_raft_message(self.store_ctx, msg), + StoreMsg::SplitInit(msg) => self.fsm.store.on_split_init(self.store_ctx, msg), } } } diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 0c1a460298d..7dea9d55901 100644 --- 
a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -22,7 +22,9 @@ // using a standalone modules. #![allow(unused)] -#![feature(let_else)] +#![feature(let_chains)] +#![feature(array_windows)] +#![feature(div_duration)] mod batch; mod bootstrap; @@ -31,6 +33,7 @@ mod operation; mod raft; pub mod router; mod tablet; +mod worker; pub(crate) use batch::StoreContext; pub use batch::{create_store_batch_system, StoreRouter, StoreSystem}; diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 03d0690fe25..69e318c3a2e 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -7,6 +7,8 @@ //! - Apply after conf change is committed //! - Update raft state using the result of conf change +use std::time::Instant; + use collections::HashSet; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ @@ -39,12 +41,12 @@ use crate::{ #[derive(Default, Debug)] pub struct ConfChangeResult { pub index: u64, - // The proposed ConfChangeV2 or (legacy) ConfChange - // ConfChange (if it is) will convert to ConfChangeV2 + // The proposed ConfChangeV2 or (legacy) ConfChange. + // ConfChange (if it is) will be converted to ConfChangeV2. pub conf_change: ConfChangeV2, // The change peer requests come along with ConfChangeV2 - // or (legacy) ConfChange, for ConfChange, it only contains - // one element + // or (legacy) ConfChange. For ConfChange, it only contains + // one element. 
pub changes: Vec, pub region_state: RegionLocalState, } @@ -95,6 +97,7 @@ impl Peer { util::check_conf_change( &ctx.cfg, self.raft_group(), + self.region(), self.peer(), changes.as_ref(), &cc, @@ -126,7 +129,11 @@ impl Peer { Ok(proposal_index) } - pub fn on_apply_res_conf_change(&mut self, conf_change: ConfChangeResult) { + pub fn on_apply_res_conf_change( + &mut self, + ctx: &mut StoreContext, + conf_change: ConfChangeResult, + ) { // TODO: cancel generating snapshot. // Snapshot is applied in memory without waiting for all entries being @@ -149,6 +156,7 @@ impl Peer { "notify pd with change peer region"; "region" => ?self.region(), ); + self.region_heartbeat_pd(ctx); let demote_self = tikv_util::store::is_learner(self.peer()); if remove_self || demote_self { warn!(self.logger, "removing or demoting leader"; "remove" => remove_self, "demote" => demote_self); @@ -156,12 +164,23 @@ impl Peer { self.raft_group_mut() .raft .become_follower(term, raft::INVALID_ID); - } else if conf_change.changes.iter().any(|c| { - matches!( - c.get_change_type(), - ConfChangeType::AddNode | ConfChangeType::AddLearnerNode - ) - }) { + } + let mut has_new_peer = None; + for c in conf_change.changes { + let peer_id = c.get_peer().get_id(); + match c.get_change_type() { + ConfChangeType::AddNode | ConfChangeType::AddLearnerNode => { + if has_new_peer.is_none() { + has_new_peer = Some(Instant::now()); + } + self.add_peer_heartbeat(peer_id, has_new_peer.unwrap()); + } + ConfChangeType::RemoveNode => { + self.remove_peer_heartbeat(peer_id); + } + } + } + if has_new_peer.is_some() { // Speed up snapshot instead of waiting another heartbeat. 
self.raft_group_mut().ping(); self.set_has_ready(); diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 396e3ede98f..eb6560d239e 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -1,12 +1,10 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. mod conf_change; +mod split; use engine_traits::{KvEngine, RaftEngine}; -use kvproto::{ - raft_cmdpb::{AdminRequest, RaftCmdRequest}, - raft_serverpb::PeerState, -}; +use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, RaftCmdRequest}; use protobuf::Message; use raft::prelude::ConfChangeV2; use raftstore::{ @@ -19,6 +17,7 @@ use raftstore::{ Result, }; use slog::info; +pub use split::{SplitInit, SplitResult}; use tikv_util::box_err; use self::conf_change::ConfChangeResult; @@ -30,6 +29,7 @@ use crate::{ #[derive(Debug)] pub enum AdminCmdResult { + SplitRegion(SplitResult), ConfChange(ConfChangeResult), } @@ -65,23 +65,38 @@ impl Peer { ch.report_error(resp); return; } - // To maintain propose order, we need to make pending proposal first. - self.propose_pending_command(ctx); let cmd_type = req.get_admin_request().get_cmd_type(); + if let Some(conflict) = self.proposal_control_mut().check_conflict(Some(cmd_type)) { + conflict.delay_channel(ch); + return; + } + // To maintain propose order, we need to make pending proposal first. + self.propose_pending_writes(ctx); let res = if apply::is_conf_change_cmd(&req) { self.propose_conf_change(ctx, req) } else { // propose other admin command. - unimplemented!() + match cmd_type { + AdminCmdType::Split => Err(box_err!( + "Split is deprecated. Please use BatchSplit instead." 
+ )), + AdminCmdType::BatchSplit => self.propose_split(ctx, req), + _ => unimplemented!(), + } }; - if let Err(e) = &res { - info!( - self.logger, - "failed to propose admin command"; - "cmd_type" => ?cmd_type, - "error" => ?e, - ); + match &res { + Ok(index) => self + .proposal_control_mut() + .record_proposed_admin(cmd_type, *index), + Err(e) => { + info!( + self.logger, + "failed to propose admin command"; + "cmd_type" => ?cmd_type, + "error" => ?e, + ); + } } - self.post_propose_write(ctx, res, vec![ch]); + self.post_propose_command(ctx, res, vec![ch], true); } } diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs new file mode 100644 index 00000000000..2782b436439 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -0,0 +1,834 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains batch split related processing logic. +//! +//! Process Overview +//! +//! Propose: +//! - Nothing special except for validating batch split requests (ex: split keys +//! are in ascending order). +//! +//! Apply: +//! - apply_batch_split: Create and initialize metapb::region for split regions +//! and derived regions. Then, create checkpoints of the current talbet for +//! split regions and derived region to make tablet physical isolated. Update +//! the parent region's region state without persistency. Send the new regions +//! (including derived region) back to raftstore. +//! +//! On Apply Result: +//! - on_ready_split_region: Update the relevant in memory meta info of the +//! parent peer, then send to the store the relevant info needed to create and +//! initialize the split regions. +//! +//! Split peer creation and initlization: +//! - on_split_init: In normal cases, the uninitialized split region will be +//! created by the store, and here init it using the data sent from the parent +//! peer. 
+ +use std::collections::VecDeque; + +use crossbeam::channel::{SendError, TrySendError}; +use engine_traits::{ + Checkpointer, DeleteStrategy, KvEngine, OpenOptions, RaftEngine, RaftLogBatch, Range, + CF_DEFAULT, SPLIT_PREFIX, +}; +use fail::fail_point; +use keys::enc_end_key; +use kvproto::{ + metapb::{self, Region, RegionEpoch}, + raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, SplitRequest}, + raft_serverpb::RegionLocalState, +}; +use protobuf::Message; +use raft::RawNode; +use raftstore::{ + coprocessor::RegionChangeReason, + store::{ + fsm::apply::validate_batch_split, + metrics::PEER_ADMIN_CMD_COUNTER, + util::{self, KeysInfoFormatter}, + PeerPessimisticLocks, PeerStat, ProposalContext, RAFT_INIT_LOG_INDEX, + }, + Result, +}; +use slog::{error, info, warn, Logger}; +use tikv_util::box_err; + +use crate::{ + batch::StoreContext, + fsm::{ApplyResReporter, PeerFsmDelegate}, + operation::AdminCmdResult, + raft::{write_initial_states, Apply, Peer, Storage}, + router::{ApplyRes, PeerMsg, StoreMsg}, +}; + +#[derive(Debug)] +pub struct SplitResult { + pub regions: Vec, + // The index of the derived region in `regions` + pub derived_index: usize, + pub tablet_index: u64, +} +pub struct SplitInit { + /// Split region + pub region: metapb::Region, + pub check_split: bool, + pub parent_is_leader: bool, + + /// In-memory pessimistic locks that should be inherited from parent region + pub locks: PeerPessimisticLocks, +} + +impl Peer { + pub fn propose_split( + &mut self, + store_ctx: &mut StoreContext, + req: RaftCmdRequest, + ) -> Result { + validate_batch_split(req.get_admin_request(), self.region())?; + // We rely on ConflictChecker to detect conflicts, so no need to set proposal + // context. 
+ let data = req.write_to_bytes().unwrap(); + self.propose(store_ctx, data) + } +} + +impl Apply { + pub fn apply_split( + &mut self, + req: &AdminRequest, + log_index: u64, + ) -> Result<(AdminResponse, AdminCmdResult)> { + info!( + self.logger, + "split is deprecated, redirect to use batch split"; + ); + let split = req.get_split().to_owned(); + let mut admin_req = AdminRequest::default(); + admin_req + .mut_splits() + .set_right_derive(split.get_right_derive()); + admin_req.mut_splits().mut_requests().push(split); + // This method is executed only when there are unapplied entries after being + // restarted. So there will be no callback, it's OK to return a response + // that does not matched with its request. + self.apply_batch_split(req, log_index) + } + + pub fn apply_batch_split( + &mut self, + req: &AdminRequest, + log_index: u64, + ) -> Result<(AdminResponse, AdminCmdResult)> { + PEER_ADMIN_CMD_COUNTER.batch_split.all.inc(); + + let region = self.region_state().get_region(); + let region_id = region.get_id(); + validate_batch_split(req, self.region_state().get_region())?; + + let mut boundaries: Vec<&[u8]> = Vec::default(); + boundaries.push(self.region_state().get_region().get_start_key()); + for req in req.get_splits().get_requests() { + boundaries.push(req.get_split_key()); + } + boundaries.push(self.region_state().get_region().get_end_key()); + + info!( + self.logger, + "split region"; + "region" => ?region, + "boundaries" => %KeysInfoFormatter(boundaries.iter()), + ); + + let split_reqs = req.get_splits(); + let new_region_cnt = split_reqs.get_requests().len(); + let new_version = region.get_region_epoch().get_version() + new_region_cnt as u64; + + let mut derived_req = SplitRequest::default(); + derived_req.new_region_id = region.id; + let derived_req = &[derived_req]; + + let right_derive = split_reqs.get_right_derive(); + let reqs = if right_derive { + split_reqs.get_requests().iter().chain(derived_req) + } else { + 
derived_req.iter().chain(split_reqs.get_requests()) + }; + + let regions: Vec<_> = boundaries + .array_windows::<2>() + .zip(reqs) + .map(|([start_key, end_key], req)| { + let mut new_region = Region::default(); + new_region.set_id(req.get_new_region_id()); + new_region.set_region_epoch(region.get_region_epoch().to_owned()); + new_region.mut_region_epoch().set_version(new_version); + new_region.set_start_key(start_key.to_vec()); + new_region.set_end_key(end_key.to_vec()); + new_region.set_peers(region.get_peers().to_vec().into()); + // If the `req` is the `derived_req`, the peers are already set correctly and + // the following loop will not be executed due to the empty `new_peer_ids` in + // the `derived_req` + for (peer, peer_id) in new_region + .mut_peers() + .iter_mut() + .zip(req.get_new_peer_ids()) + { + peer.set_id(*peer_id); + } + new_region + }) + .collect(); + + let derived_index = if right_derive { regions.len() - 1 } else { 0 }; + + // We will create checkpoint of the current tablet for both derived region and + // split regions. Before the creation, we should flush the writes and remove the + // write batch + self.flush(); + + // todo(SpadeA): Here: we use a temporary solution that we use checkpoint API to + // clone new tablets. It may cause large jitter as we need to flush the + // memtable. And more what is more important is that after removing WAL, the API + // will never flush. + // We will freeze the memtable rather than flush it in the following PR. 
+ let tablet = self.tablet().clone(); + let mut checkpointer = tablet.new_checkpointer().unwrap_or_else(|e| { + panic!( + "{:?} fails to create checkpoint object: {:?}", + self.logger.list(), + e + ) + }); + + for new_region in ®ions { + let new_region_id = new_region.id; + if new_region_id == region_id { + continue; + } + + let split_temp_path = self.tablet_factory().tablet_path_with_prefix( + SPLIT_PREFIX, + new_region_id, + RAFT_INIT_LOG_INDEX, + ); + checkpointer + .create_at(&split_temp_path, None, 0) + .unwrap_or_else(|e| { + panic!( + "{:?} fails to create checkpoint with path {:?}: {:?}", + self.logger.list(), + split_temp_path, + e + ) + }); + } + + let derived_path = self.tablet_factory().tablet_path(region_id, log_index); + checkpointer + .create_at(&derived_path, None, 0) + .unwrap_or_else(|e| { + panic!( + "{:?} fails to create checkpoint with path {:?}: {:?}", + self.logger.list(), + derived_path, + e + ) + }); + let tablet = self + .tablet_factory() + .open_tablet(region_id, Some(log_index), OpenOptions::default()) + .unwrap(); + // Remove the old write batch. 
+ self.write_batch_mut().take(); + self.publish_tablet(tablet); + + self.region_state_mut() + .set_region(regions[derived_index].clone()); + self.region_state_mut().set_tablet_index(log_index); + + let mut resp = AdminResponse::default(); + resp.mut_splits().set_regions(regions.clone().into()); + PEER_ADMIN_CMD_COUNTER.batch_split.success.inc(); + + Ok(( + resp, + AdminCmdResult::SplitRegion(SplitResult { + regions, + derived_index, + tablet_index: log_index, + }), + )) + } +} + +impl Peer { + pub fn on_ready_split_region( + &mut self, + store_ctx: &mut StoreContext, + derived_index: usize, + tablet_index: u64, + regions: Vec, + ) { + fail_point!("on_split", self.peer().get_store_id() == 3, |_| {}); + + let derived = ®ions[derived_index]; + let derived_epoch = derived.get_region_epoch().clone(); + let region_id = derived.get_id(); + + // Group in-memory pessimistic locks in the original region into new regions. + // The locks of new regions will be put into the corresponding new regions + // later. And the locks belonging to the old region will stay in the original + // map. + let region_locks = { + let mut pessimistic_locks = self.txn_ext().pessimistic_locks.write(); + info!(self.logger, "moving {} locks to new regions", pessimistic_locks.len();); + // Update the version so the concurrent reader will fail due to EpochNotMatch + // instead of PessimisticLockNotFound. + pessimistic_locks.version = derived_epoch.get_version(); + pessimistic_locks.group_by_regions(®ions, derived) + }; + fail_point!("on_split_invalidate_locks"); + + // Roughly estimate the size and keys for new regions. 
+ let new_region_count = regions.len() as u64; + { + let mut meta = store_ctx.store_meta.lock().unwrap(); + let reader = meta.readers.get_mut(&derived.get_id()).unwrap(); + self.set_region( + reader, + derived.clone(), + RegionChangeReason::Split, + tablet_index, + ); + } + + self.post_split(); + + if self.is_leader() { + self.region_heartbeat_pd(store_ctx); + // Notify pd immediately to let it update the region meta. + info!( + self.logger, + "notify pd with split"; + "region_id" => self.region_id(), + "peer_id" => self.peer_id(), + "split_count" => regions.len(), + ); + // Now pd only uses ReportBatchSplit for history operation show, + // so we send it independently here. + self.report_batch_split_pd(store_ctx, regions.to_vec()); + } + + let last_region_id = regions.last().unwrap().get_id(); + for (new_region, locks) in regions.into_iter().zip(region_locks) { + let new_region_id = new_region.get_id(); + if new_region_id == region_id { + continue; + } + + let split_init = PeerMsg::SplitInit(Box::new(SplitInit { + region: new_region, + parent_is_leader: self.is_leader(), + check_split: last_region_id == new_region_id, + locks, + })); + + // First, send init msg to peer directly. Returning error means the peer is not + // existed in which case we should redirect it to the store. 
+ match store_ctx.router.force_send(new_region_id, split_init) { + Ok(_) => {} + Err(SendError(PeerMsg::SplitInit(msg))) => { + store_ctx + .router + .force_send_control(StoreMsg::SplitInit(msg)) + .unwrap_or_else(|e| { + panic!( + "{:?} fails to send split peer intialization msg to store : {:?}", + self.logger.list(), + e + ) + }); + } + _ => unreachable!(), + } + } + } + + pub fn on_split_init( + &mut self, + store_ctx: &mut StoreContext, + split_init: Box, + ) { + let region_id = split_init.region.id; + let replace = split_init.region.get_region_epoch().get_version() + > self + .storage() + .region_state() + .get_region() + .get_region_epoch() + .get_version(); + + if !self.storage().is_initialized() || replace { + let split_temp_path = store_ctx.tablet_factory.tablet_path_with_prefix( + SPLIT_PREFIX, + region_id, + RAFT_INIT_LOG_INDEX, + ); + + let tablet = store_ctx + .tablet_factory + .load_tablet(&split_temp_path, region_id, RAFT_INIT_LOG_INDEX) + .unwrap_or_else(|e| { + panic!( + "{:?} fails to load tablet {:?} :{:?}", + self.logger.list(), + split_temp_path, + e + ) + }); + + self.tablet_mut().set(tablet); + + let storage = Storage::with_split( + self.peer().get_store_id(), + &split_init.region, + store_ctx.engine.clone(), + store_ctx.read_scheduler.clone(), + &store_ctx.logger, + ) + .unwrap_or_else(|e| panic!("fail to create storage: {:?}", e)) + .unwrap(); + + let applied_index = storage.apply_state().get_applied_index(); + let peer_id = storage.peer().get_id(); + let raft_cfg = store_ctx.cfg.new_raft_config(peer_id, applied_index); + + let mut raft_group = RawNode::new(&raft_cfg, storage, &self.logger).unwrap(); + // If this region has only one peer and I am the one, campaign directly. + if split_init.region.get_peers().len() == 1 { + raft_group.campaign().unwrap(); + self.set_has_ready(); + } + self.set_raft_group(raft_group); + } else { + // TODO: when reaching here (peer is initalized before and cannot be replaced), + // it is much complexer. 
+ return; + } + + { + let mut meta = store_ctx.store_meta.lock().unwrap(); + + info!( + self.logger, + "init split region"; + "region" => ?split_init.region, + ); + + // TODO: GlobalReplicationState + + for p in split_init.region.get_peers() { + self.insert_peer_cache(p.clone()); + } + + if split_init.parent_is_leader { + if self.maybe_campaign() { + self.set_has_ready(); + } + + *self.txn_ext().pessimistic_locks.write() = split_init.locks; + // The new peer is likely to become leader, send a heartbeat immediately to + // reduce client query miss. + self.region_heartbeat_pd(store_ctx); + } + + meta.tablet_caches.insert(region_id, self.tablet().clone()); + meta.readers + .insert(region_id, self.generate_read_delegate()); + meta.region_read_progress + .insert(region_id, self.read_progress().clone()); + } + + if split_init.check_split { + // TODO: check if the last region needs to split again + } + + self.schedule_apply_fsm(store_ctx); + } +} + +#[cfg(test)] +mod test { + use std::sync::{ + mpsc::{channel, Receiver, Sender}, + Arc, + }; + + use collections::HashMap; + use engine_test::{ + ctor::{CfOptions, DbOptions}, + kv::TestTabletFactoryV2, + raft, + }; + use engine_traits::{CfOptionsExt, Peekable, TabletFactory, WriteBatch, ALL_CFS}; + use futures::channel::mpsc::unbounded; + use kvproto::{ + metapb::RegionEpoch, + raft_cmdpb::{AdminCmdType, BatchSplitRequest, PutRequest, RaftCmdResponse, SplitRequest}, + raft_serverpb::{PeerState, RaftApplyState, RegionLocalState}, + }; + use raftstore::store::{cmd_resp::new_error, Config, ReadRunner}; + use slog::o; + use tempfile::TempDir; + use tikv_util::{ + codec::bytes::encode_bytes, + config::VersionTrack, + store::{new_learner_peer, new_peer}, + worker::{dummy_future_scheduler, dummy_scheduler, FutureScheduler, Scheduler, Worker}, + }; + + use super::*; + use crate::{ + fsm::{ApplyFsm, ApplyResReporter}, + raft::Apply, + tablet::CachedTablet, + }; + + struct MockReporter { + sender: Sender, + } + + impl MockReporter { + 
fn new() -> (Self, Receiver) { + let (tx, rx) = channel(); + (MockReporter { sender: tx }, rx) + } + } + + impl ApplyResReporter for MockReporter { + fn report(&self, apply_res: ApplyRes) { + let _ = self.sender.send(apply_res); + } + } + + fn new_split_req(key: &[u8], id: u64, children: Vec) -> SplitRequest { + let mut req = SplitRequest::default(); + req.set_split_key(key.to_vec()); + req.set_new_region_id(id); + req.set_new_peer_ids(children); + req + } + + fn assert_split( + apply: &mut Apply, + factory: &Arc, + parent_id: u64, + right_derived: bool, + new_region_ids: Vec, + split_keys: Vec>, + children_peers: Vec>, + log_index: u64, + region_boundries: Vec<(Vec, Vec)>, + expected_region_epoch: RegionEpoch, + expected_derived_index: usize, + ) { + let mut splits = BatchSplitRequest::default(); + splits.set_right_derive(right_derived); + + for ((new_region_id, children), split_key) in new_region_ids + .into_iter() + .zip(children_peers.clone()) + .zip(split_keys) + { + splits + .mut_requests() + .push(new_split_req(&split_key, new_region_id, children)); + } + + let mut req = AdminRequest::default(); + req.set_splits(splits); + + // Exec batch split + let (resp, apply_res) = apply.apply_batch_split(&req, log_index).unwrap(); + + let regions = resp.get_splits().get_regions(); + assert!(regions.len() == region_boundries.len()); + + let mut child_idx = 0; + for (i, region) in regions.iter().enumerate() { + assert_eq!(region.get_start_key().to_vec(), region_boundries[i].0); + assert_eq!(region.get_end_key().to_vec(), region_boundries[i].1); + assert_eq!(*region.get_region_epoch(), expected_region_epoch); + + if region.id == parent_id { + let state = apply.region_state(); + assert_eq!(state.tablet_index, log_index); + assert_eq!(state.get_region(), region); + let tablet_path = factory.tablet_path(region.id, log_index); + assert!(factory.exists_raw(&tablet_path)); + + match apply_res { + AdminCmdResult::SplitRegion(SplitResult { + derived_index, + tablet_index, + .. 
+ }) => { + assert_eq!(expected_derived_index, derived_index); + assert_eq!(tablet_index, log_index); + } + _ => panic!(), + } + } else { + assert_eq! { + region.get_peers().iter().map(|peer| peer.id).collect::>(), + children_peers[child_idx] + } + child_idx += 1; + + let tablet_path = + factory.tablet_path_with_prefix(SPLIT_PREFIX, region.id, RAFT_INIT_LOG_INDEX); + assert!(factory.exists_raw(&tablet_path)); + } + } + } + + #[test] + fn test_split() { + let store_id = 2; + + let mut region = Region::default(); + region.set_id(1); + region.set_end_key(b"k10".to_vec()); + region.mut_region_epoch().set_version(3); + let peers = vec![new_peer(2, 3), new_peer(4, 5), new_learner_peer(6, 7)]; + region.set_peers(peers.into()); + + let logger = slog_global::borrow_global().new(o!()); + let path = TempDir::new().unwrap(); + let cf_opts = ALL_CFS + .iter() + .copied() + .map(|cf| (cf, CfOptions::default())) + .collect(); + let factory = Arc::new(TestTabletFactoryV2::new( + path.path(), + DbOptions::default(), + cf_opts, + )); + + let tablet = factory + .open_tablet( + region.id, + Some(5), + OpenOptions::default().set_create_new(true), + ) + .unwrap(); + + let mut region_state = RegionLocalState::default(); + region_state.set_state(PeerState::Normal); + region_state.set_region(region.clone()); + region_state.set_tablet_index(5); + + let (read_scheduler, _rx) = dummy_scheduler(); + let (reporter, _) = MockReporter::new(); + let mut apply = Apply::new( + region + .get_peers() + .iter() + .find(|p| p.store_id == store_id) + .unwrap() + .clone(), + region_state, + reporter, + CachedTablet::new(Some(tablet)), + factory.clone(), + read_scheduler, + logger.clone(), + ); + + let mut splits = BatchSplitRequest::default(); + splits.set_right_derive(true); + splits.mut_requests().push(new_split_req(b"k1", 1, vec![])); + let mut req = AdminRequest::default(); + req.set_splits(splits.clone()); + let err = apply.apply_batch_split(&req, 0).unwrap_err(); + // 3 followers are required. 
+ assert!(err.to_string().contains("invalid new peer id count")); + + splits.mut_requests().clear(); + req.set_splits(splits.clone()); + let err = apply.apply_batch_split(&req, 0).unwrap_err(); + // Empty requests should be rejected. + assert!(err.to_string().contains("missing split requests")); + + splits + .mut_requests() + .push(new_split_req(b"k11", 1, vec![11, 12, 13])); + req.set_splits(splits.clone()); + let resp = new_error(apply.apply_batch_split(&req, 0).unwrap_err()); + // Out of range keys should be rejected. + assert!( + resp.get_header().get_error().has_key_not_in_region(), + "{:?}", + resp + ); + + splits.mut_requests().clear(); + splits + .mut_requests() + .push(new_split_req(b"", 1, vec![11, 12, 13])); + req.set_splits(splits.clone()); + let err = apply.apply_batch_split(&req, 0).unwrap_err(); + // Empty key will not in any region exclusively. + assert!(err.to_string().contains("missing split key"), "{:?}", err); + + splits.mut_requests().clear(); + splits + .mut_requests() + .push(new_split_req(b"k2", 1, vec![11, 12, 13])); + splits + .mut_requests() + .push(new_split_req(b"k1", 1, vec![11, 12, 13])); + req.set_splits(splits.clone()); + let err = apply.apply_batch_split(&req, 0).unwrap_err(); + // keys should be in ascend order. + assert!( + err.to_string().contains("invalid split request"), + "{:?}", + err + ); + + splits.mut_requests().clear(); + splits + .mut_requests() + .push(new_split_req(b"k1", 1, vec![11, 12, 13])); + splits + .mut_requests() + .push(new_split_req(b"k2", 1, vec![11, 12])); + req.set_splits(splits.clone()); + let err = apply.apply_batch_split(&req, 0).unwrap_err(); + // All requests should be checked. 
+ assert!(err.to_string().contains("id count"), "{:?}", err); + + let cases = vec![ + // region 1["", "k10"] + // After split: region 1 ["", "k09"], + // region 10 ["k09", "k10"] + ( + 1, + false, + vec![10], + vec![b"k09".to_vec()], + vec![vec![11, 12, 13]], + 10, + vec![ + (b"".to_vec(), b"k09".to_vec()), + (b"k09".to_vec(), b"k10".to_vec()), + ], + 4, + 0, + ), + // region 1 ["", "k09"] + // After split: region 20 ["", "k01"], + // region 1 ["k01", "k09"] + ( + 1, + true, + vec![20], + vec![b"k01".to_vec()], + vec![vec![21, 22, 23]], + 20, + vec![ + (b"".to_vec(), b"k01".to_vec()), + (b"k01".to_vec(), b"k09".to_vec()), + ], + 5, + 1, + ), + // region 1 ["k01", "k09"] + // After split: region 30 ["k01", "k02"], + // region 40 ["k02", "k03"], + // region 1 ["k03", "k09"] + ( + 1, + true, + vec![30, 40], + vec![b"k02".to_vec(), b"k03".to_vec()], + vec![vec![31, 32, 33], vec![41, 42, 43]], + 30, + vec![ + (b"k01".to_vec(), b"k02".to_vec()), + (b"k02".to_vec(), b"k03".to_vec()), + (b"k03".to_vec(), b"k09".to_vec()), + ], + 7, + 2, + ), + // region 1 ["k03", "k09"] + // After split: region 1 ["k03", "k07"], + // region 50 ["k07", "k08"], + // region 60 ["k08", "k09"] + ( + 1, + false, + vec![50, 60], + vec![b"k07".to_vec(), b"k08".to_vec()], + vec![vec![51, 52, 53], vec![61, 62, 63]], + 40, + vec![ + (b"k03".to_vec(), b"k07".to_vec()), + (b"k07".to_vec(), b"k08".to_vec()), + (b"k08".to_vec(), b"k09".to_vec()), + ], + 9, + 0, + ), + ]; + + for ( + parent_id, + right_derive, + new_region_ids, + split_keys, + children_peers, + log_index, + region_boundries, + version, + expected_derived_index, + ) in cases + { + let mut expected_epoch = RegionEpoch::new(); + expected_epoch.set_version(version); + + assert_split( + &mut apply, + &factory, + parent_id, + right_derive, + new_region_ids, + split_keys, + children_peers, + log_index, + region_boundries, + expected_epoch, + expected_derived_index, + ); + } + + // Split will create checkpoint tablet, so if there are some writes 
before + // split, they should be flushed immediately. + apply.apply_put(CF_DEFAULT, b"k04", b"v4").unwrap(); + assert!(!WriteBatch::is_empty( + apply.write_batch_mut().as_ref().unwrap() + )); + splits.mut_requests().clear(); + splits + .mut_requests() + .push(new_split_req(b"k05", 70, vec![71, 72, 73])); + req.set_splits(splits); + apply.apply_batch_split(&req, 50).unwrap(); + assert!(apply.write_batch_mut().is_none()); + assert_eq!(apply.tablet().get_value(b"k04").unwrap().unwrap(), b"v4"); + } +} diff --git a/components/raftstore-v2/src/operation/command/control.rs b/components/raftstore-v2/src/operation/command/control.rs new file mode 100644 index 00000000000..5fb25b4e20d --- /dev/null +++ b/components/raftstore-v2/src/operation/command/control.rs @@ -0,0 +1,428 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{collections::LinkedList, mem, num::NonZeroU64}; + +use kvproto::{ + metapb, + raft_cmdpb::{AdminCmdType, RaftCmdRequest}, +}; +use raftstore::{ + store::{ + cmd_resp, + fsm::apply, + msg::ErrorCallback, + util::{ + admin_cmd_epoch_lookup, AdminCmdEpochState, NORMAL_REQ_CHECK_CONF_VER, + NORMAL_REQ_CHECK_VER, + }, + }, + Error, +}; + +use crate::router::CmdResChannel; + +#[derive(Debug)] +pub struct ProposedAdminCmd { + cmd_type: AdminCmdType, + committed: bool, + epoch_state: AdminCmdEpochState, + index: u64, + /// Callbacks of commands that are conflict with on going admin command. + /// + /// Callbacks are delayed to avoid making client retry with arbitrary + /// backoff. + delayed_chs: Vec, +} + +impl ProposedAdminCmd { + fn new( + cmd_type: AdminCmdType, + epoch_state: AdminCmdEpochState, + index: u64, + ) -> ProposedAdminCmd { + ProposedAdminCmd { + cmd_type, + committed: false, + epoch_state, + index, + delayed_chs: Vec::new(), + } + } + + pub fn cmd_type(&self) -> AdminCmdType { + self.cmd_type + } + + /// Delay responding to channel until the command is applied so client won't + /// retry with arbitrary timeout. 
+ pub fn delay_channel(&mut self, ch: CmdResChannel) { + self.delayed_chs.push(ch); + } + + /// Same as `delay_channel`, but accepts a batch. + pub fn delay_channels(&mut self, chs: Vec) { + if self.delayed_chs.is_empty() { + self.delayed_chs = chs; + } else { + self.delayed_chs.extend(chs); + } + } +} + +/// `ProposalControl` is a rewrite of `CmdEpochChecker` from v1. +/// +/// Admin command may change the epoch of a region. If a proposal is proposed +/// after the admin command is proposed but before the command is applied, the +/// proposal is probably to fail because of epoch not match. `ProposalControl` +/// aims to detect the failure early. With `ProposalControl`, users can assume +/// once a command is proposed, it's likely to succeed in the end. +/// +/// Compared to `CmdEpochChecker`, `ProposalControl` also traces the whole +/// lifetime of prepare merge. +pub struct ProposalControl { + // Use `LinkedList` to reduce memory footprint. In most cases, the list + // should be empty or 1 element. And access speed is not a concern. + proposed_admin_cmd: LinkedList, + pending_merge_index: u64, + term: u64, +} + +impl ProposalControl { + pub fn new(term: u64) -> ProposalControl { + ProposalControl { + proposed_admin_cmd: LinkedList::new(), + pending_merge_index: 0, + term, + } + } + + /// Clears all queued conflict callbacks if term changed. + /// + /// If term is changed, leader is probably changed. Clear all callbacks to + /// notify clients to retry with new leader. 
+ #[inline] + pub fn maybe_update_term(&mut self, term: u64) { + match term.cmp(&self.term) { + std::cmp::Ordering::Equal => (), + std::cmp::Ordering::Greater => { + for cmd in mem::take(&mut self.proposed_admin_cmd) { + for cb in cmd.delayed_chs { + apply::notify_stale_req(term, cb); + } + } + self.term = term; + } + std::cmp::Ordering::Less => { + panic!("term should not decrease, old {}, new {}", self.term, term) + } + } + } + + /// Check if a proposal is conflict with proposed admin commands in current + /// term. If the proposal is an admin command, then its type should be + /// passed, otherwise just provide `None`. + /// + /// Returns None if passing the epoch check, otherwise returns the last + /// conflict conflict proposal meta. + pub fn check_conflict( + &mut self, + cmd_type: Option, + ) -> Option<&mut ProposedAdminCmd> { + let (check_ver, check_conf_ver) = match cmd_type { + None => (NORMAL_REQ_CHECK_VER, NORMAL_REQ_CHECK_CONF_VER), + Some(ty) => { + let epoch_state = admin_cmd_epoch_lookup(ty); + (epoch_state.check_ver, epoch_state.check_conf_ver) + } + }; + self.proposed_admin_cmd.iter_mut().rev().find(|cmd| { + (check_ver && cmd.epoch_state.change_ver) + || (check_conf_ver && cmd.epoch_state.change_conf_ver) + }) + } + + /// Record an admin proposal. + /// + /// Further requests that is conflict with the admin proposal will be + /// rejected in `check_proposal_conflict`. 
+ pub fn record_proposed_admin(&mut self, cmd_type: AdminCmdType, index: u64) { + let epoch_state = admin_cmd_epoch_lookup(cmd_type); + if !epoch_state.change_conf_ver && !epoch_state.change_ver { + return; + } + + let conflict_cmd = self.proposed_admin_cmd.iter_mut().rev().find(|cmd| { + (epoch_state.check_ver && cmd.epoch_state.change_ver) + || (epoch_state.check_conf_ver && cmd.epoch_state.change_conf_ver) + }); + assert!(conflict_cmd.is_none(), "{:?}", conflict_cmd); + + if let Some(cmd) = self.proposed_admin_cmd.back() { + assert!(cmd.index < index, "{:?} {}", cmd, index); + } + self.proposed_admin_cmd + .push_back(ProposedAdminCmd::new(cmd_type, epoch_state, index)); + } + + /// Commit the admin commands. + #[inline] + pub fn commit_to(&mut self, index: u64, mut on_commit: impl FnMut(&ProposedAdminCmd)) { + if self.proposed_admin_cmd.is_empty() { + return; + } + + for cmd in &mut self.proposed_admin_cmd { + if cmd.committed { + continue; + } + if cmd.index <= index { + cmd.committed = true; + on_commit(cmd); + continue; + } + return; + } + } + + pub fn advance_apply(&mut self, index: u64, term: u64, region: &metapb::Region) { + while !self.proposed_admin_cmd.is_empty() { + let cmd = self.proposed_admin_cmd.front_mut().unwrap(); + if cmd.index <= index { + for ch in cmd.delayed_chs.drain(..) 
{ + let mut resp = cmd_resp::new_error(Error::EpochNotMatch( + format!( + "current epoch of region {} is {:?}", + region.get_id(), + region.get_region_epoch(), + ), + vec![region.to_owned()], + )); + cmd_resp::bind_term(&mut resp, term); + ch.report_error(resp); + } + } else { + break; + } + self.proposed_admin_cmd.pop_front(); + } + } + + #[inline] + pub fn enter_prepare_merge(&mut self, prepare_merge_index: u64) { + self.pending_merge_index = prepare_merge_index; + } + + #[inline] + pub fn leave_prepare_merge(&mut self, prepare_merge_index: u64) { + if self.pending_merge_index != 0 { + assert_eq!(self.pending_merge_index, prepare_merge_index); + self.pending_merge_index = 0; + } + } + + /// Check if there is an on-going split command on current term. + /// + /// The answer is reliable only when the peer is leader. + #[inline] + pub fn is_splitting(&self) -> bool { + if self.proposed_admin_cmd.is_empty() { + return false; + } + // Split is deprecated in v2, only needs to check `BatchSplit`. + self.proposed_admin_cmd + .iter() + .any(|c| c.cmd_type == AdminCmdType::BatchSplit && c.committed) + } + + /// Check if there the current peer is waiting for being merged. + /// + /// The answer is reliable only when the peer is leader or `PrepareMerge` is + /// applied. 
+ #[inline] + pub fn is_merging(&self) -> bool { + if self.proposed_admin_cmd.is_empty() { + return self.pending_merge_index != 0; + } + self.proposed_admin_cmd + .iter() + .any(|c| c.cmd_type == AdminCmdType::PrepareMerge && c.committed) + } +} + +impl Drop for ProposalControl { + fn drop(&mut self) { + for state in mem::take(&mut self.proposed_admin_cmd) { + for ch in state.delayed_chs { + apply::notify_stale_req(self.term, ch); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn new_admin_request(cmd_type: AdminCmdType) -> RaftCmdRequest { + let mut request = RaftCmdRequest::default(); + request.mut_admin_request().set_cmd_type(cmd_type); + request + } + + #[test] + fn test_proposal_control() { + let region = metapb::Region::default(); + + let mut control = ProposalControl::new(10); + assert_eq!(control.term, 10); + assert!( + control + .check_conflict(Some(AdminCmdType::BatchSplit)) + .is_none() + ); + control.record_proposed_admin(AdminCmdType::BatchSplit, 5); + assert_eq!(control.proposed_admin_cmd.len(), 1); + + // Both conflict with the split admin cmd + let conflict = control.check_conflict(None).unwrap(); + assert_eq!(conflict.index, 5); + assert_eq!(conflict.cmd_type, AdminCmdType::BatchSplit); + let conflict = control + .check_conflict(Some(AdminCmdType::PrepareMerge)) + .unwrap(); + assert_eq!(conflict.index, 5); + + assert!( + control + .check_conflict(Some(AdminCmdType::ChangePeerV2)) + .is_none() + ); + control.record_proposed_admin(AdminCmdType::ChangePeerV2, 6); + assert_eq!(control.proposed_admin_cmd.len(), 2); + + assert!(!control.is_splitting()); + assert!(!control.is_merging()); + + // Conflict with the change peer admin cmd + let conflict = control + .check_conflict(Some(AdminCmdType::ChangePeerV2)) + .unwrap(); + assert_eq!(conflict.index, 6); + // Conflict with the split admin cmd + let conflict = control.check_conflict(None).unwrap(); + assert_eq!(conflict.index, 5); + // Conflict with the change peer admin cmd + let 
conflict = control + .check_conflict(Some(AdminCmdType::PrepareMerge)) + .unwrap(); + assert_eq!(conflict.index, 6); + + let mut commit_split = false; + control.commit_to(4, |c| commit_split = c.cmd_type == AdminCmdType::BatchSplit); + assert!(!commit_split); + assert!(!control.is_splitting()); + control.commit_to(5, |c| commit_split = c.cmd_type == AdminCmdType::BatchSplit); + assert!(commit_split); + assert!(control.is_splitting()); + + control.advance_apply(4, 10, ®ion); + // Have no effect on `proposed_admin_cmd` + assert_eq!(control.proposed_admin_cmd.len(), 2); + assert!(control.is_splitting()); + + control.advance_apply(5, 10, ®ion); + // Left one change peer admin cmd + assert_eq!(control.proposed_admin_cmd.len(), 1); + assert!(!control.is_splitting()); + + assert!(control.check_conflict(None).is_none()); + let conflict = control + .check_conflict(Some(AdminCmdType::BatchSplit)) + .unwrap(); + assert_eq!(conflict.index, 6); + + // Change term to 11 + control.maybe_update_term(11); + assert!( + control + .check_conflict(Some(AdminCmdType::BatchSplit)) + .is_none() + ); + assert_eq!(control.term, 11); + // Should be empty + assert_eq!(control.proposed_admin_cmd.len(), 0); + + // Test attaching multiple callbacks. + control.record_proposed_admin(AdminCmdType::BatchSplit, 7); + let mut subs = vec![]; + for _ in 0..3 { + let conflict = control.check_conflict(None).unwrap(); + let (ch, sub) = CmdResChannel::pair(); + conflict.delay_channel(ch); + subs.push(sub); + } + // Delayed channel should not be notified immediately. + for sub in &subs { + assert!(!sub.has_result()); + } + control.advance_apply(7, 12, ®ion); + for sub in subs { + assert!(sub.has_result()); + let res = futures::executor::block_on(sub.result()).unwrap(); + assert!( + res.get_header().get_error().has_epoch_not_match(), + "{:?}", + res + ); + } + + // Should invoke callbacks when term is increased. 
+ control.record_proposed_admin(AdminCmdType::BatchSplit, 8); + let (ch, sub) = CmdResChannel::pair(); + control.check_conflict(None).unwrap().delay_channel(ch); + control.maybe_update_term(13); + assert!(control.check_conflict(None).is_none()); + let res = futures::executor::block_on(sub.result()).unwrap(); + assert!( + res.get_header().get_error().has_stale_command(), + "{:?}", + res + ); + + // Should invoke callbacks when it's dropped. + control.record_proposed_admin(AdminCmdType::BatchSplit, 9); + let (ch, sub) = CmdResChannel::pair(); + control.check_conflict(None).unwrap().delay_channel(ch); + drop(control); + let res = futures::executor::block_on(sub.result()).unwrap(); + assert!( + res.get_header().get_error().has_stale_command(), + "{:?}", + res + ); + } + + #[test] + fn test_proposal_control_merge() { + let region = metapb::Region::default(); + + let mut control = ProposalControl::new(5); + assert!(!control.is_merging()); + control.record_proposed_admin(AdminCmdType::PrepareMerge, 5); + assert!(!control.is_merging()); + control.commit_to(5, |_| ()); + assert!(control.is_merging()); + control.advance_apply(5, 5, ®ion); + assert!(!control.is_merging()); + + control.record_proposed_admin(AdminCmdType::PrepareMerge, 6); + assert!(!control.is_merging()); + control.commit_to(6, |_| ()); + assert!(control.is_merging()); + control.enter_prepare_merge(6); + control.advance_apply(6, 5, ®ion); + assert!(control.is_merging()); + control.leave_prepare_merge(6); + assert!(!control.is_merging()); + } +} diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index fe863a74b8a..7e69a3f1c7c 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -39,7 +39,8 @@ use raftstore::{ local_metrics::RaftMetrics, metrics::*, msg::ErrorCallback, - util, WriteCallback, + util::{self, admin_cmd_epoch_lookup}, + WriteCallback, }, Error, Result, }; 
@@ -49,14 +50,17 @@ use tikv_util::{box_err, time::monotonic_raw_now}; use crate::{ batch::StoreContext, fsm::{ApplyFsm, ApplyResReporter, PeerFsmDelegate}, + operation::GenSnapTask, raft::{Apply, Peer}, router::{ApplyRes, ApplyTask, CmdResChannel, PeerMsg}, }; mod admin; +mod control; mod write; -pub use admin::AdminCmdResult; +pub use admin::{AdminCmdResult, SplitInit, SplitResult}; +pub use control::ProposalControl; pub use write::{SimpleWriteDecoder, SimpleWriteEncoder}; use self::write::SimpleWrite; @@ -120,8 +124,17 @@ impl Peer { let mailbox = store_ctx.router.mailbox(self.region_id()).unwrap(); let tablet = self.tablet().clone(); let logger = self.logger.clone(); - let (apply_scheduler, mut apply_fsm) = - ApplyFsm::new(self.peer().clone(), region_state, mailbox, tablet, logger); + let read_scheduler = self.storage().read_scheduler(); + let (apply_scheduler, mut apply_fsm) = ApplyFsm::new( + self.peer().clone(), + region_state, + mailbox, + tablet, + store_ctx.tablet_factory.clone(), + read_scheduler, + logger, + ); + store_ctx .apply_pool .spawn(async move { apply_fsm.handle_all_tasks().await }) @@ -173,17 +186,34 @@ impl Peer { } #[inline] - fn propose(&mut self, ctx: &mut StoreContext, data: Vec) -> Result { - ctx.raft_metrics.propose.normal.inc(); - PEER_PROPOSE_LOG_SIZE_HISTOGRAM.observe(data.len() as f64); - if data.len() as u64 > ctx.cfg.raft_entry_max_size.0 { + fn propose( + &mut self, + store_ctx: &mut StoreContext, + data: Vec, + ) -> Result { + self.propose_with_ctx(store_ctx, data, vec![]) + } + + #[inline] + fn propose_with_ctx( + &mut self, + store_ctx: &mut StoreContext, + data: Vec, + proposal_ctx: Vec, + ) -> Result { + store_ctx.raft_metrics.propose.normal.inc(); + store_ctx + .raft_metrics + .propose_log_size + .observe(data.len() as f64); + if data.len() as u64 > store_ctx.cfg.raft_entry_max_size.0 { return Err(Error::RaftEntryTooLarge { region_id: self.region_id(), entry_size: data.len() as u64, }); } let last_index = 
self.raft_group().raft.raft_log.last_index(); - self.raft_group_mut().propose(vec![], data)?; + self.raft_group_mut().propose(proposal_ctx, data)?; if self.raft_group().raft.raft_log.last_index() == last_index { // The message is dropped silently, this usually due to leader absence // or transferring leader. Both cases can be considered as NotLeader error. @@ -193,18 +223,28 @@ impl Peer { } #[inline] - fn enqueue_pending_proposal( + pub fn post_propose_command( &mut self, ctx: &mut StoreContext, - mut proposal: Proposal>, + res: Result, + ch: Vec, + call_proposed_on_success: bool, ) { - let applied_to_current_term = self.applied_to_current_term(); - if applied_to_current_term { + let idx = match res { + Ok(i) => i, + Err(e) => { + ch.report_error(cmd_resp::err_resp(e, self.term())); + return; + } + }; + let mut proposal = Proposal::new(idx, self.term(), ch); + if call_proposed_on_success { proposal.cb.notify_proposed(); } - proposal.must_pass_epoch_check = applied_to_current_term; + proposal.must_pass_epoch_check = self.applied_to_current_term(); proposal.propose_time = Some(*ctx.current_time.get_or_insert_with(monotonic_raw_now)); self.proposals_mut().push(proposal); + self.set_has_ready(); } #[inline] @@ -255,15 +295,23 @@ impl Peer { // region. 
return; } + for admin_res in apply_res.admin_result { match admin_res { AdminCmdResult::ConfChange(conf_change) => { - self.on_apply_res_conf_change(conf_change) + self.on_apply_res_conf_change(ctx, conf_change) } + AdminCmdResult::SplitRegion(SplitResult { + regions, + derived_index, + tablet_index, + }) => self.on_ready_split_region(ctx, derived_index, tablet_index, regions), } } + self.raft_group_mut() .advance_apply_to(apply_res.applied_index); + self.proposal_control_advance_apply(apply_res.applied_index); let is_leader = self.is_leader(); let progress_to_be_updated = self.entry_storage().applied_term() != apply_res.applied_term; let entry_storage = self.entry_storage_mut(); @@ -396,8 +444,8 @@ impl Apply { let admin_req = req.get_admin_request(); let (admin_resp, admin_result) = match req.get_admin_request().get_cmd_type() { AdminCmdType::CompactLog => unimplemented!(), - AdminCmdType::Split => unimplemented!(), - AdminCmdType::BatchSplit => unimplemented!(), + AdminCmdType::Split => self.apply_split(admin_req, entry.index)?, + AdminCmdType::BatchSplit => self.apply_batch_split(admin_req, entry.index)?, AdminCmdType::PrepareMerge => unimplemented!(), AdminCmdType::CommitMerge => unimplemented!(), AdminCmdType::RollbackMerge => unimplemented!(), @@ -412,10 +460,12 @@ impl Apply { AdminCmdType::VerifyHash => unimplemented!(), AdminCmdType::PrepareFlashback => unimplemented!(), AdminCmdType::FinishFlashback => unimplemented!(), + AdminCmdType::BatchSwitchWitness => unimplemented!(), AdminCmdType::InvalidAdmin => { return Err(box_err!("invalid admin command type")); } }; + self.push_admin_result(admin_result); let mut resp = new_response(req.get_header()); resp.set_admin_response(admin_resp); diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index a760a5acfb2..59c5679f95f 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ 
b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -7,9 +7,10 @@ use raftstore::{ cmd_resp, fsm::{apply, Proposal, MAX_PROPOSAL_SIZE_RATIO}, msg::ErrorCallback, - util, WriteCallback, + util::{self, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER}, + WriteCallback, }, - Result, + Error, Result, }; use crate::{ @@ -53,10 +54,17 @@ impl Peer { return; } // To maintain propose order, we need to make pending proposal first. - self.propose_pending_command(ctx); + self.propose_pending_writes(ctx); + if let Some(conflict) = self.proposal_control_mut().check_conflict(None) { + conflict.delay_channel(ch); + return; + } + // ProposalControl is reliable only when applied to current term. + let call_proposed_on_success = self.applied_to_current_term(); match SimpleWriteEncoder::new( req, (ctx.cfg.raft_entry_max_size.0 as f64 * MAX_PROPOSAL_SIZE_RATIO) as usize, + call_proposed_on_success, ) { Ok(mut encoder) => { encoder.add_response_channel(ch); @@ -65,35 +73,38 @@ impl Peer { } Err(req) => { let res = self.propose_command(ctx, req); - self.post_propose_write(ctx, res, vec![ch]); + self.post_propose_command(ctx, res, vec![ch], call_proposed_on_success); } } } - #[inline] - pub fn post_propose_write( - &mut self, - ctx: &mut StoreContext, - res: Result, - ch: Vec, - ) { - let idx = match res { - Ok(i) => i, - Err(e) => { - ch.report_error(cmd_resp::err_resp(e, self.term())); - return; - } - }; - let p = Proposal::new(idx, self.term(), ch); - self.enqueue_pending_proposal(ctx, p); - self.set_has_ready(); - } - - pub fn propose_pending_command(&mut self, ctx: &mut StoreContext) { + pub fn propose_pending_writes(&mut self, ctx: &mut StoreContext) { if let Some(encoder) = self.simple_write_encoder_mut().take() { + let call_proposed_on_success = if encoder.notify_proposed() { + // The request has pass conflict check and called all proposed callbacks. + false + } else { + // Epoch may have changed since last check. 
+ let from_epoch = encoder.header().get_region_epoch(); + let res = util::compare_region_epoch( + from_epoch, + self.region(), + NORMAL_REQ_CHECK_CONF_VER, + NORMAL_REQ_CHECK_VER, + true, + ); + if let Err(mut e) = res { + // TODO: query sibling regions. + ctx.raft_metrics.invalid_proposal.epoch_not_match.inc(); + encoder.encode().1.report_error(cmd_resp::new_error(e)); + return; + } + // Only when it applies to current term, the epoch check can be reliable. + self.applied_to_current_term() + }; let (data, chs) = encoder.encode(); let res = self.propose(ctx, data); - self.post_propose_write(ctx, res, chs); + self.post_propose_command(ctx, res, chs, call_proposed_on_success); } } } diff --git a/components/raftstore-v2/src/operation/command/write/simple_write.rs b/components/raftstore-v2/src/operation/command/write/simple_write.rs index 364e2741868..ca9e7d39366 100644 --- a/components/raftstore-v2/src/operation/command/write/simple_write.rs +++ b/components/raftstore-v2/src/operation/command/write/simple_write.rs @@ -3,6 +3,7 @@ use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, RaftRequestHeader, Request}; use protobuf::{CodedInputStream, Message, SingularPtrField}; +use raftstore::store::WriteCallback; use slog::Logger; use crate::{operation::command::parse_at, router::CmdResChannel}; @@ -21,12 +22,18 @@ pub struct SimpleWriteEncoder { buf: Vec, channels: Vec, size_limit: usize, + notify_proposed: bool, } impl SimpleWriteEncoder { + /// Create an encoder. + /// + /// If `notify_proposed` is true, channels will be called `notify_proposed` + /// when it's appended. 
pub fn new( mut req: RaftCmdRequest, size_limit: usize, + notify_proposed: bool, ) -> Result { if !Self::allow_request(&req) { return Err(req); @@ -46,6 +53,7 @@ impl SimpleWriteEncoder { buf, channels: vec![], size_limit, + notify_proposed, }) } @@ -96,9 +104,24 @@ impl SimpleWriteEncoder { } #[inline] - pub fn add_response_channel(&mut self, ch: CmdResChannel) { + pub fn add_response_channel(&mut self, mut ch: CmdResChannel) { + if self.notify_proposed { + ch.notify_proposed(); + } self.channels.push(ch); } + + #[inline] + pub fn notify_proposed(&self) -> bool { + self.notify_proposed + } + + #[inline] + pub fn header(&self) -> &RaftRequestHeader { + self.header + .as_ref() + .unwrap_or_else(|| RaftRequestHeader::default_instance()) + } } #[derive(Debug)] @@ -382,7 +405,7 @@ mod tests { delete_req.set_key(delete_key.clone()); cmd.mut_requests().push(req); - let mut encoder = SimpleWriteEncoder::new(cmd.clone(), usize::MAX).unwrap(); + let mut encoder = SimpleWriteEncoder::new(cmd.clone(), usize::MAX, false).unwrap(); cmd.clear_requests(); req = Request::default(); @@ -471,7 +494,7 @@ mod tests { let mut req = Request::default(); req.set_cmd_type(CmdType::Invalid); invalid_cmd.mut_requests().push(req); - let fallback = SimpleWriteEncoder::new(invalid_cmd.clone(), usize::MAX).unwrap_err(); + let fallback = SimpleWriteEncoder::new(invalid_cmd.clone(), usize::MAX, false).unwrap_err(); let bytes = fallback.write_to_bytes().unwrap(); let logger = slog_global::borrow_global().new(o!()); let decoded = SimpleWriteDecoder::new(&logger, &bytes, 0, 0).unwrap_err(); @@ -486,7 +509,7 @@ mod tests { put_req.set_key(b"key".to_vec()); put_req.set_value(b"".to_vec()); valid_cmd.mut_requests().push(req); - let mut encoder = SimpleWriteEncoder::new(valid_cmd.clone(), usize::MAX).unwrap(); + let mut encoder = SimpleWriteEncoder::new(valid_cmd.clone(), usize::MAX, false).unwrap(); // Only simple write command can be batched. 
encoder.amend(invalid_cmd.clone()).unwrap_err(); let mut valid_cmd2 = valid_cmd.clone(); diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 678cf6ece4b..60884f63b03 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -13,16 +13,17 @@ use std::cmp; use batch_system::BasicMailbox; -use crossbeam::channel::TrySendError; +use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ metapb::Region, raft_serverpb::{PeerState, RaftMessage}, }; use raftstore::store::{util, ExtraStates, WriteTask}; -use slog::{debug, error, info}; +use slog::{debug, error, info, warn}; use tikv_util::store::find_peer; +use super::command::SplitInit; use crate::{ batch::StoreContext, fsm::{PeerFsm, Store}, @@ -89,6 +90,44 @@ impl DestroyProgress { } impl Store { + /// The method is called during split. + /// The creation process is: + /// 1. create an uninitialized peer if not existed before + /// 2. initialize the peer by the information sent from parent peer + #[inline] + pub fn on_split_init( + &mut self, + ctx: &mut StoreContext, + msg: Box, + ) where + EK: KvEngine, + ER: RaftEngine, + { + let region_id = msg.region.id; + let mut raft_msg = Box::::default(); + raft_msg.set_region_id(region_id); + raft_msg.set_region_epoch(msg.region.get_region_epoch().clone()); + raft_msg.set_to_peer( + msg.region + .get_peers() + .iter() + .find(|p| p.get_store_id() == self.store_id()) + .unwrap() + .clone(), + ); + + // It will create the peer if it does not exist + self.on_raft_message(ctx, raft_msg); + + if let Err(SendError(m)) = ctx.router.force_send(region_id, PeerMsg::SplitInit(msg)) { + warn!( + self.logger(), + "Split peer is destroyed before sending the intialization msg"; + "split init msg" => ?m, + ) + } + } + /// When a message's recipient doesn't exist, it will be redirected to /// store. 
Store is responsible for checking if it's neccessary to create /// a peer to handle the message. @@ -174,15 +213,21 @@ impl Store { let mut region = Region::default(); region.set_id(region_id); region.set_region_epoch(from_epoch.clone()); + // Peer list doesn't have to be complete, as it's uninitialized. - region.mut_peers().push(from_peer.clone()); + // + // If the id of the from_peer is INVALID_ID, this msg must be sent from parent + // peer in the split execution in which case we do not add it into the region. + if from_peer.id != raft::INVALID_ID { + region.mut_peers().push(from_peer.clone()); + } region.mut_peers().push(to_peer.clone()); // We don't set the region range here as we allow range conflict. let (tx, fsm) = match Storage::uninit( self.store_id(), region, ctx.engine.clone(), - ctx.log_fetch_scheduler.clone(), + ctx.read_scheduler.clone(), &ctx.logger, ) .and_then(|s| PeerFsm::new(&ctx.cfg, &*ctx.tablet_factory, s)) @@ -235,7 +280,7 @@ impl Peer { /// are split. It's a waste to use snapshot to restore newly split /// tablet. #[inline] - pub fn postpond_destroy(&self) -> bool { + pub fn postponed_destroy(&self) -> bool { let entry_storage = self.storage().entry_storage(); // TODO: check actual split index instead of commit index. entry_storage.applied_index() != entry_storage.commit_index() @@ -248,7 +293,7 @@ impl Peer { /// memory states. 
pub fn start_destroy(&mut self, write_task: &mut WriteTask) { let entry_storage = self.storage().entry_storage(); - if self.postpond_destroy() { + if self.postponed_destroy() { return; } let first_index = entry_storage.first_index(); diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 1eaeb21ec18..7df897f2b26 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -2,11 +2,14 @@ mod command; mod life; +mod pd; mod query; mod ready; -pub use command::{AdminCmdResult, CommittedEntries, SimpleWriteDecoder, SimpleWriteEncoder}; +pub use command::{ + AdminCmdResult, CommittedEntries, ProposalControl, SimpleWriteDecoder, SimpleWriteEncoder, +}; pub use life::DestroyProgress; -pub use ready::AsyncWriter; +pub use ready::{AsyncWriter, GenSnapTask, SnapState}; -pub(crate) use self::query::LocalReader; +pub(crate) use self::{command::SplitInit, query::LocalReader}; diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs new file mode 100644 index 00000000000..659fab00754 --- /dev/null +++ b/components/raftstore-v2/src/operation/pd.rs @@ -0,0 +1,230 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module implements the interactions with pd. 
+ +use std::cmp; + +use engine_traits::{KvEngine, RaftEngine}; +use fail::fail_point; +use kvproto::{metapb, pdpb}; +use raftstore::store::Transport; +use slog::error; +use tikv_util::time::InstantExt; + +use crate::{ + batch::StoreContext, + fsm::{PeerFsmDelegate, Store, StoreFsmDelegate}, + raft::Peer, + router::{PeerTick, StoreTick}, + worker::{PdRegionHeartbeatTask, PdTask}, +}; + +impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { + #[inline] + pub fn on_pd_store_heartbeat(&mut self) { + self.fsm.store.store_heartbeat_pd(self.store_ctx); + self.schedule_tick( + StoreTick::PdStoreHeartbeat, + self.store_ctx.cfg.pd_store_heartbeat_tick_interval.0, + ); + } +} + +impl Store { + pub fn store_heartbeat_pd(&self, ctx: &StoreContext) + where + EK: KvEngine, + ER: RaftEngine, + { + let mut stats = pdpb::StoreStats::default(); + + stats.set_store_id(self.store_id()); + { + let meta = ctx.store_meta.lock().unwrap(); + stats.set_region_count(meta.tablet_caches.len() as u32); + } + + stats.set_sending_snap_count(0); + stats.set_receiving_snap_count(0); + + stats.set_start_time(self.start_time().unwrap() as u32); + + stats.set_bytes_written(0); + stats.set_keys_written(0); + stats.set_is_busy(false); + + // stats.set_query_stats(query_stats); + + let task = PdTask::StoreHeartbeat { stats }; + if let Err(e) = ctx.pd_scheduler.schedule(task) { + error!(self.logger(), "notify pd failed"; + "store_id" => self.store_id(), + "err" => ?e + ); + } + } +} + +impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { + #[inline] + pub fn on_pd_heartbeat(&mut self) { + self.fsm.peer_mut().update_peer_statistics(); + if self.fsm.peer().is_leader() { + self.fsm.peer_mut().region_heartbeat_pd(self.store_ctx); + } + // TODO: hibernate region + self.schedule_tick(PeerTick::PdHeartbeat); + } +} + +impl Peer { + #[inline] + pub fn region_heartbeat_pd(&self, ctx: &StoreContext) { + let task = PdTask::RegionHeartbeat(PdRegionHeartbeatTask { 
+ term: self.term(), + region: self.region().clone(), + down_peers: self.collect_down_peers(ctx.cfg.max_peer_down_duration.0), + peer: self.peer().clone(), + pending_peers: self.collect_pending_peers(ctx), + written_bytes: self.self_stat().written_bytes, + written_keys: self.self_stat().written_keys, + approximate_size: None, + approximate_keys: None, + wait_data_peers: Vec::new(), + }); + if let Err(e) = ctx.pd_scheduler.schedule(task) { + error!( + self.logger, + "failed to notify pd"; + "region_id" => self.region_id(), + "peer_id" => self.peer_id(), + "err" => ?e, + ); + return; + } + fail_point!("schedule_check_split"); + } + + /// Collects all pending peers and update `peers_start_pending_time`. + fn collect_pending_peers(&self, ctx: &StoreContext) -> Vec { + let mut pending_peers = Vec::with_capacity(self.region().get_peers().len()); + let status = self.raft_group().status(); + let truncated_idx = self + .storage() + .apply_state() + .get_truncated_state() + .get_index(); + + if status.progress.is_none() { + return pending_peers; + } + + // TODO: update `peers_start_pending_time`. + + let progresses = status.progress.unwrap().iter(); + for (&id, progress) in progresses { + if id == self.peer_id() { + continue; + } + // The `matched` is 0 only in these two cases: + // 1. Current leader hasn't communicated with this peer. + // 2. This peer does not exist yet(maybe it is created but not initialized) + // + // The correctness of region merge depends on the fact that all target peers + // must exist during merging. (PD rely on `pending_peers` to check whether all + // target peers exist) + // + // So if the `matched` is 0, it must be a pending peer. + // It can be ensured because `truncated_index` must be greater than + // `RAFT_INIT_LOG_INDEX`(5). 
+ if progress.matched < truncated_idx { + if let Some(p) = self.peer_from_cache(id) { + pending_peers.push(p); + } else { + if ctx.cfg.dev_assert { + panic!( + "{:?} failed to get peer {} from cache", + self.logger.list(), + id + ); + } + error!( + self.logger, + "failed to get peer from cache"; + "region_id" => self.region_id(), + "peer_id" => self.peer_id(), + "get_peer_id" => id, + ); + } + } + } + pending_peers + } + + #[inline] + pub fn destroy_peer_pd(&self, ctx: &StoreContext) { + let task = PdTask::DestroyPeer { + region_id: self.region_id(), + }; + if let Err(e) = ctx.pd_scheduler.schedule(task) { + error!( + self.logger, + "failed to notify pd with DestroyPeer"; + "region_id" => self.region_id(), + "peer_id" => self.peer_id(), + "err" => %e, + ); + } + } + + #[inline] + pub fn ask_batch_split_pd(&self, ctx: &StoreContext, split_keys: Vec>) { + let task = PdTask::AskBatchSplit { + region: self.region().clone(), + split_keys, + peer: self.peer().clone(), + right_derive: ctx.cfg.right_derive_when_split, + }; + if let Err(e) = ctx.pd_scheduler.schedule(task) { + error!( + self.logger, + "failed to notify pd with AskBatchSplit"; + "region_id" => self.region_id(), + "peer_id" => self.peer_id(), + "err" => %e, + ); + } + } + + #[inline] + pub fn report_batch_split_pd( + &self, + ctx: &StoreContext, + regions: Vec, + ) { + let task = PdTask::ReportBatchSplit { regions }; + if let Err(e) = ctx.pd_scheduler.schedule(task) { + error!( + self.logger, + "failed to notify pd with ReportBatchSplit"; + "err" => %e, + ); + } + } + + #[inline] + pub fn update_max_timestamp_pd(&self, ctx: &StoreContext, initial_status: u64) { + let task = PdTask::UpdateMaxTimestamp { + region_id: self.region_id(), + initial_status, + txn_ext: self.txn_ext().clone(), + }; + if let Err(e) = ctx.pd_scheduler.schedule(task) { + error!( + self.logger, + "failed to notify pd with UpdateMaxTimestamp"; + "err" => %e, + ); + } + } +} diff --git a/components/raftstore-v2/src/operation/query/lease.rs 
b/components/raftstore-v2/src/operation/query/lease.rs index 1ae4aecd1cc..114080bcdbb 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ b/components/raftstore-v2/src/operation/query/lease.rs @@ -151,14 +151,14 @@ impl Peer { pub(crate) fn maybe_renew_leader_lease( &mut self, ts: Timespec, - store_meta: &mut Arc>>, + store_meta: &Mutex>, progress: Option, ) { // A nonleader peer should never has leader lease. let read_progress = if !should_renew_lease( self.is_leader(), - self.is_splitting(), - self.is_merging(), + self.proposal_control().is_splitting(), + self.proposal_control().is_merging(), self.has_force_leader(), ) { None @@ -186,7 +186,7 @@ impl Peer { // TODO: remove this block of code when snapshot is done; add the logic into // on_persist_snapshot. - pub(crate) fn add_reader_if_necessary(&mut self, store_meta: &mut Arc>>) { + pub(crate) fn add_reader_if_necessary(&mut self, store_meta: &Mutex>) { let mut meta = store_meta.lock().unwrap(); // TODO: remove this block of code when snapshot is done; add the logic into // on_persist_snapshot. 
diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 12df1e7926f..0736bc13fd8 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -565,7 +565,7 @@ mod tests { region1.set_region_epoch(epoch13.clone()); let term6 = 6; let mut lease = Lease::new(Duration::seconds(10), Duration::milliseconds(2500)); - let read_progress = Arc::new(RegionReadProgress::new(®ion1, 1, 1, "".to_owned())); + let read_progress = Arc::new(RegionReadProgress::new(®ion1, 1, 1, 1)); let mut cmd = RaftCmdRequest::default(); let mut header = RaftRequestHeader::default(); diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 8b84b0788ce..77ca7b90074 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -216,7 +216,7 @@ impl Peer { self.pending_reads_mut().advance_leader_reads(states); if let Some(propose_time) = self.pending_reads().last_ready().map(|r| r.propose_time) { if !self.leader_lease_mut().is_suspect() { - self.maybe_renew_leader_lease(propose_time, &mut ctx.store_meta, None); + self.maybe_renew_leader_lease(propose_time, &ctx.store_meta, None); } } @@ -288,6 +288,24 @@ impl Peer { && !self.has_pending_merge_state() } + #[inline] + pub fn ready_to_handle_read(&self) -> bool { + // TODO: It may cause read index to wait a long time. + + // There may be some values that are not applied by this leader yet but the old + // leader, if applied_term isn't equal to current term. + self.applied_to_current_term() + // There may be stale read if the old leader splits really slow, + // the new region may already elected a new leader while + // the old leader still think it owns the split range. 
+ && !self.proposal_control().is_splitting() + // There may be stale read if a target leader is in another store and + // applied commit merge, written new values, but the sibling peer in + // this store does not apply commit merge, so the leader is not ready + // to read, until the merge is rollbacked. + && !self.proposal_control().is_merging() + } + fn send_read_command( &self, ctx: &mut StoreContext, @@ -409,7 +427,7 @@ impl Peer { // TODO: add coprocessor_host hook let progress = ReadProgress::applied_term(applied_term); // TODO: remove it - self.add_reader_if_necessary(&mut ctx.store_meta); + self.add_reader_if_necessary(&ctx.store_meta); let mut meta = ctx.store_meta.lock().unwrap(); let reader = meta.readers.get_mut(&self.region_id()).unwrap(); self.maybe_update_read_progress(reader, progress); diff --git a/components/raftstore-v2/src/operation/ready/async_writer.rs b/components/raftstore-v2/src/operation/ready/async_writer.rs index d5673d76a40..a7bce44fe05 100644 --- a/components/raftstore-v2/src/operation/ready/async_writer.rs +++ b/components/raftstore-v2/src/operation/ready/async_writer.rs @@ -22,6 +22,7 @@ struct UnpersistedReady { /// Max number of following ready whose data to be persisted is empty. max_empty_number: u64, raft_msgs: Vec>, + has_snapshot: bool, } /// A writer that handles asynchronous writes. 
@@ -70,6 +71,7 @@ impl AsyncWriter { fn send(&mut self, ctx: &mut impl WriteRouterContext, task: WriteTask) { let ready_number = task.ready_number(); + let has_snapshot = task.has_snapshot; self.write_router.send_write_msg( ctx, self.unpersisted_readies.back().map(|r| r.number), @@ -79,6 +81,7 @@ impl AsyncWriter { number: ready_number, max_empty_number: ready_number, raft_msgs: vec![], + has_snapshot, }); } @@ -108,9 +111,9 @@ impl AsyncWriter { ctx: &mut impl WriteRouterContext, ready_number: u64, logger: &Logger, - ) -> Vec> { + ) -> (Vec>, bool) { if self.persisted_number >= ready_number { - return vec![]; + return (vec![], false); } let last_unpersisted = self.unpersisted_readies.back(); @@ -124,11 +127,13 @@ impl AsyncWriter { } let mut raft_messages = vec![]; + let mut has_snapshot = false; // There must be a match in `self.unpersisted_readies`. loop { let Some(v) = self.unpersisted_readies.pop_front() else { panic!("{:?} ready number not found {}", logger.list(), ready_number); }; + has_snapshot |= v.has_snapshot; if v.number > ready_number { panic!( "{:?} ready number not matched {:?} vs {}", @@ -151,7 +156,7 @@ impl AsyncWriter { self.write_router .check_new_persisted(ctx, self.persisted_number); - raft_messages + (raft_messages, has_snapshot) } pub fn persisted_number(&self) -> u64 { diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index cfc3d086163..1c8c9d80338 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -18,25 +18,34 @@ //! There two steps can be processed concurrently. 
mod async_writer; +mod snapshot; -use std::cmp; +use std::{cmp, time::Instant}; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{KvEngine, MiscExt, OpenOptions, RaftEngine, TabletFactory}; use error_code::ErrorCodeExt; -use kvproto::raft_serverpb::RaftMessage; +use kvproto::{ + raft_cmdpb::AdminCmdType, + raft_serverpb::{PeerState, RaftMessage, RaftSnapshotData}, +}; use protobuf::Message as _; -use raft::{eraftpb, Ready}; -use raftstore::store::{util, ExtraStates, FetchedLogs, Transport, WriteTask}; +use raft::{eraftpb, Ready, StateRole, INVALID_ID}; +use raftstore::store::{util, ExtraStates, FetchedLogs, ReadProgress, Transport, WriteTask}; use slog::{debug, error, trace, warn}; use tikv_util::time::{duration_to_sec, monotonic_raw_now}; -pub use self::async_writer::AsyncWriter; +pub use self::{ + async_writer::AsyncWriter, + snapshot::{GenSnapTask, SnapState}, +}; use crate::{ batch::StoreContext, fsm::PeerFsmDelegate, raft::{Peer, Storage}, - router::PeerTick, + router::{ApplyTask, PeerTick}, + Result, }; + impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { /// Raft relies on periodic ticks to keep the state machine sync with other /// peers. @@ -107,7 +116,11 @@ impl Peer { } // TODO: drop all msg append when the peer is uninitialized and has conflict // ranges with other peers. - self.insert_peer_cache(msg.take_from_peer()); + let from_peer = msg.take_from_peer(); + if self.is_leader() && from_peer.get_id() != INVALID_ID { + self.add_peer_heartbeat(from_peer.get_id(), Instant::now()); + } + self.insert_peer_cache(from_peer); if let Err(e) = self.raft_group_mut().step(msg.take_message()) { error!(self.logger, "raft step error"; "err" => ?e); } @@ -115,7 +128,7 @@ impl Peer { } /// Callback for fetching logs asynchronously. 
- pub fn on_fetched_logs(&mut self, fetched_logs: FetchedLogs) { + pub fn on_logs_fetched(&mut self, fetched_logs: FetchedLogs) { let FetchedLogs { context, logs } = fetched_logs; let low = logs.low; if !self.is_leader() { @@ -243,7 +256,7 @@ impl Peer { ctx.raft_metrics.commit_log.observe(duration_to_sec( (ctx.current_time.unwrap() - propose_time).to_std().unwrap(), )); - self.maybe_renew_leader_lease(propose_time, &mut ctx.store_meta, None); + self.maybe_renew_leader_lease(propose_time, &ctx.store_meta, None); break; } } @@ -267,7 +280,7 @@ impl Peer { } ctx.has_ready = true; - if !self.raft_group().has_ready() && (self.serving() || self.postpond_destroy()) { + if !self.raft_group().has_ready() && (self.serving() || self.postponed_destroy()) { #[cfg(feature = "testexport")] self.async_writer.notify_flush(); return; @@ -284,6 +297,22 @@ impl Peer { |entry| entry.index == self.raft_group().raft.raft_log.last_index() )); + self.on_role_changed(ctx, &ready); + + if let Some(hs) = ready.hs() { + let prev_commit_index = self.entry_storage().commit_index(); + assert!( + hs.get_commit() >= prev_commit_index, + "{:?} {:?} {}", + self.logger.list(), + hs, + prev_commit_index + ); + if self.is_leader() && hs.get_commit() > prev_commit_index { + self.on_leader_commit_index_changed(hs.get_commit()); + } + } + if !ready.messages().is_empty() { debug_assert!(self.is_leader()); for msg in ready.take_messages() { @@ -298,10 +327,18 @@ impl Peer { self.handle_raft_committed_entries(ctx, ready.take_committed_entries()); } + // Check whether there is a pending generate snapshot task, the task + // needs to be sent to the apply system. + // Always sending snapshot task after apply task, so it gets latest + // snapshot. 
+ if let Some(gen_task) = self.storage_mut().take_gen_snap_task() { + self.apply_scheduler().send(ApplyTask::Snapshot(gen_task)); + } + let ready_number = ready.number(); let mut write_task = WriteTask::new(self.region_id(), self.peer_id(), ready_number); self.storage_mut() - .handle_raft_ready(&mut ready, &mut write_task); + .handle_raft_ready(ctx, &mut ready, &mut write_task); if !ready.persisted_messages().is_empty() { write_task.messages = ready .take_persisted_messages() @@ -355,17 +392,27 @@ impl Peer { error!(self.logger, "peer id not matched"; "persisted_peer_id" => peer_id, "persisted_number" => ready_number); return; } - let persisted_message = self - .async_writer - .on_persisted(ctx, ready_number, &self.logger); + let (persisted_message, has_snapshot) = + self.async_writer + .on_persisted(ctx, ready_number, &self.logger); for msgs in persisted_message { for msg in msgs { self.send_raft_message(ctx, msg); } } + let persisted_number = self.async_writer.persisted_number(); self.raft_group_mut().on_persist_ready(persisted_number); let persisted_index = self.raft_group().raft.raft_log.persisted; + /// The apply snapshot process order would be: + /// - Get the snapshot from the ready + /// - Wait for async writer to load this tablet + /// In this step, the snapshot has loaded finish, but some apply state + /// need to update. + if has_snapshot { + self.on_applied_snapshot(ctx); + } + self.storage_mut() .entry_storage_mut() .update_cache_persisted(persisted_index); @@ -383,20 +430,118 @@ impl Peer { pub fn on_wait_flush(&mut self, ch: crate::router::FlushChannel) { self.async_writer.subscirbe_flush(ch); } + + pub fn on_role_changed(&mut self, ctx: &mut StoreContext, ready: &Ready) { + // Update leader lease when the Raft state changes. + if let Some(ss) = ready.ss() { + let term = self.term(); + match ss.raft_state { + StateRole::Leader => { + // The local read can only be performed after a new leader has applied + // the first empty entry on its term. 
After that the lease expiring time + // should be updated to + // send_to_quorum_ts + max_lease + // as the comments in `Lease` explain. + // It is recommended to update the lease expiring time right after + // this peer becomes leader because it's more convenient to do it here and + // it has no impact on the correctness. + let progress_term = ReadProgress::term(term); + self.maybe_renew_leader_lease( + monotonic_raw_now(), + &ctx.store_meta, + Some(progress_term), + ); + debug!( + self.logger, + "becomes leader with lease"; + "lease" => ?self.leader_lease(), + ); + // If the predecessor reads index during transferring leader and receives + // quorum's heartbeat response after that, it may wait for applying to + // current term to apply the read. So broadcast eagerly to avoid unexpected + // latency. + self.raft_group_mut().skip_bcast_commit(false); + + // A more recent read may happen on the old leader. So max ts should + // be updated after a peer becomes leader. + self.require_updating_max_ts(ctx); + // Exit entry cache warmup state when the peer becomes leader. + self.entry_storage_mut().clear_entry_cache_warmup_state(); + + self.region_heartbeat_pd(ctx); + } + StateRole::Follower => { + self.leader_lease_mut().expire(); + self.storage_mut().cancel_generating_snap(None); + } + _ => {} + } + self.proposal_control_mut().maybe_update_term(term); + } + } + + /// If leader commits new admin commands, it may break lease assumption. So + /// we need to cancel lease whenever necessary. + /// + /// Note this method should be called before sending out any messages. + fn on_leader_commit_index_changed(&mut self, commit_index: u64) { + let mut committed_prepare_merge = false; + self.proposal_control_mut().commit_to(commit_index, |cmd| { + committed_prepare_merge |= cmd.cmd_type() == AdminCmdType::PrepareMerge + }); + // There are two types of operations that will change the ownership of a range: + // split and merge. 
+ // + // - For split, after the split command is committed, it's + // possible that the same range is govened by different region on different + // nodes due to different apply progress. But because only the peers on the + // same node as old leader will campaign despite election timeout, so there + // will be no modification to the overlapped range until either the original + // leader apply the split command or an election timeout is passed since split + // is committed. We already forbid renewing lease after committing split, and + // original leader will update the reader delegate with latest epoch after + // applying split before the split peer starts campaign, so here the only thing + // we need to do is marking split is committed (which is done by `commit_to` + // above). It's correct to allow local read during split. + // + // - For merge, after the prepare merge command is committed, the target peers + // may apply commit merge at any time, so we need to forbid any type of read + // to avoid missing the modifications from target peers. + if committed_prepare_merge { + // After prepare_merge is committed and the leader broadcasts commit + // index to followers, the leader can not know when the target region + // merges majority of this region, also it can not know when the target + // region writes new values. + // To prevent unsafe local read, we suspect its leader lease. + self.leader_lease_mut().suspect(monotonic_raw_now()); + // Stop updating `safe_ts` + self.read_progress_mut().discard(); + } + } } -impl Storage { +impl Storage { /// Apply the ready to the storage. If there is any states need to be /// persisted, it will be written to `write_task`. 
- fn handle_raft_ready( + fn handle_raft_ready( &mut self, + ctx: &mut StoreContext, ready: &mut Ready, write_task: &mut WriteTask, ) { let prev_raft_state = self.entry_storage().raft_state().clone(); let ever_persisted = self.ever_persisted(); - // TODO: handle snapshot + if !ready.snapshot().is_empty() { + if let Err(e) = self.apply_snapshot( + ready.snapshot(), + write_task, + ctx.snap_mgr.clone(), + ctx.tablet_factory.clone(), + ) { + error!(self.logger(),"failed to apply snapshot";"error" => ?e) + } + } let entry_storage = self.entry_storage_mut(); if !ready.entries().is_empty() { diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs new file mode 100644 index 00000000000..32e8a3f8ff8 --- /dev/null +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -0,0 +1,403 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains snapshot relative processing logic. +//! +//! # Snapshot State +//! +//! generator and apply snapshot works asynchronously. the snap_sate indicates +//! the curren snapshot state. +//! +//! # Process Overview +//! +//! generate snapshot: +//! - Raft call `snapshot` interface to acquire a snapshot, then storage setup +//! the gen_snap_task. +//! - handle ready will send the gen_snap_task to the apply work +//! - apply worker schedule a gen tablet snapshot task to async read worker with +//! region state and apply state. +//! - async read worker generates the tablet snapshot and sends the result to +//! peer fsm, then Raft will get the snapshot. 
+ +use std::{ + borrow::BorrowMut, + fmt::{self, Debug}, + mem, + sync::{ + atomic::{AtomicBool, AtomicU64, Ordering}, + mpsc, Arc, + }, +}; + +use engine_traits::{KvEngine, OpenOptions, RaftEngine, TabletFactory}; +use kvproto::raft_serverpb::{PeerState, RaftSnapshotData, RegionLocalState}; +use protobuf::Message; +use raft::eraftpb::Snapshot; +use raftstore::store::{ + metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, GenSnapRes, ReadTask, TabletSnapKey, + TabletSnapManager, Transport, WriteTask, +}; +use slog::{error, info, warn}; +use tikv_util::{box_err, box_try, worker::Scheduler}; + +use crate::{ + fsm::ApplyResReporter, + raft::{Apply, Peer, Storage}, + router::{ApplyTask, PeerTick}, + Result, StoreContext, +}; + +#[derive(Debug)] +pub enum SnapState { + Relax, + Generating { + canceled: Arc, + index: Arc, + }, + Generated(Box), +} + +impl PartialEq for SnapState { + fn eq(&self, other: &SnapState) -> bool { + match (self, other) { + (&SnapState::Relax, &SnapState::Relax) + | (&SnapState::Generating { .. }, &SnapState::Generating { .. }) => true, + (&SnapState::Generated(ref snap1), &SnapState::Generated(ref snap2)) => { + *snap1 == *snap2 + } + _ => false, + } + } +} + +pub struct GenSnapTask { + region_id: u64, + // The snapshot will be sent to the peer. + to_peer: u64, + // Fill it when you are going to generate the snapshot. + // index used to check if the gen task should be canceled. + index: Arc, + // Set it to true to cancel the task if necessary. 
+ canceled: Arc, + // indicates whether the snapshot is triggered due to load balance + for_balance: bool, +} + +impl GenSnapTask { + pub fn new( + region_id: u64, + to_peer: u64, + index: Arc, + canceled: Arc, + ) -> GenSnapTask { + GenSnapTask { + region_id, + to_peer, + index, + canceled, + for_balance: false, + } + } + + pub fn set_for_balance(&mut self) { + self.for_balance = true; + } +} + +impl Debug for GenSnapTask { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("GenSnapTask") + .field("region_id", &self.region_id) + .finish() + } +} + +impl Peer { + pub fn on_snapshot_generated(&mut self, snapshot: GenSnapRes) { + if self.storage_mut().on_snapshot_generated(snapshot) { + self.raft_group_mut().ping(); + self.set_has_ready(); + } + } + + pub fn on_applied_snapshot(&mut self, ctx: &mut StoreContext) { + let persisted_index = self.raft_group().raft.raft_log.persisted; + let first_index = self.storage().entry_storage().first_index(); + if first_index == persisted_index + 1 { + let region_id = self.region_id(); + let tablet = ctx + .tablet_factory + .open_tablet(region_id, Some(persisted_index), OpenOptions::default()) + .unwrap(); + self.tablet_mut().set(tablet); + self.schedule_apply_fsm(ctx); + self.storage_mut().on_applied_snapshot(); + self.raft_group_mut().advance_apply_to(persisted_index); + self.read_progress_mut() + .update_applied_core(persisted_index); + info!(self.logger, "apply tablet snapshot completely"); + } + } +} + +impl Apply { + /// Handle snapshot. + /// + /// Will schedule a task to read worker and then generate a snapshot + /// asynchronously. + pub fn schedule_gen_snapshot(&mut self, snap_task: GenSnapTask) { + // Do not generate, the peer is removed. + if self.tombstone() { + snap_task.canceled.store(true, Ordering::SeqCst); + error!( + self.logger, + "cancel generating snapshot because it's already destroyed"; + ); + return; + } + // Flush before do snapshot. 
+ if snap_task.canceled.load(Ordering::SeqCst) { + return; + } + self.flush(); + + // Send generate snapshot task to region worker. + let (last_applied_index, last_applied_term) = self.apply_progress(); + snap_task.index.store(last_applied_index, Ordering::SeqCst); + let gen_tablet_sanp_task = ReadTask::GenTabletSnapshot { + region_id: snap_task.region_id, + to_peer: snap_task.to_peer, + tablet: self.tablet().clone(), + region_state: self.region_state().clone(), + last_applied_term, + last_applied_index, + for_balance: snap_task.for_balance, + canceled: snap_task.canceled.clone(), + }; + if let Err(e) = self.read_scheduler().schedule(gen_tablet_sanp_task) { + error!( + self.logger, + "schedule snapshot failed"; + "error" => ?e, + ); + snap_task.canceled.store(true, Ordering::SeqCst); + } + } +} + +impl Storage { + /// Gets a snapshot. Returns `SnapshotTemporarilyUnavailable` if there is no + /// unavailable snapshot. + pub fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { + let mut snap_state = self.snap_state_mut(); + match *snap_state { + SnapState::Generating { ref canceled, .. 
} => { + if canceled.load(Ordering::SeqCst) { + self.cancel_generating_snap(None); + } else { + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); + } + } + SnapState::Generated(ref s) => { + let SnapState::Generated(snap) = mem::replace(&mut *snap_state, SnapState::Relax) else { unreachable!() }; + if self.validate_snap(&snap, request_index) { + return Ok(*snap); + } + } + _ => {} + } + + if SnapState::Relax != *snap_state { + panic!( + "{:?} unexpected state: {:?}", + self.logger().list(), + *snap_state + ); + } + + info!( + self.logger(), + "requesting snapshot"; + "request_index" => request_index, + "request_peer" => to, + ); + let canceled = Arc::new(AtomicBool::new(false)); + let index = Arc::new(AtomicU64::new(0)); + *snap_state = SnapState::Generating { + canceled: canceled.clone(), + index: index.clone(), + }; + + let task = GenSnapTask::new(self.region().get_id(), to, index, canceled); + let mut gen_snap_task = self.gen_snap_task_mut(); + assert!(gen_snap_task.is_none()); + *gen_snap_task = Box::new(Some(task)); + Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )) + } + + /// Validate the snapshot. Returns true if it's valid. + fn validate_snap(&self, snap: &Snapshot, request_index: u64) -> bool { + let idx = snap.get_metadata().get_index(); + // TODO(nolouch): check tuncated index + if idx < request_index { + // stale snapshot, should generate again. 
+ info!( + self.logger(), + "snapshot is stale, generate again"; + "snap_index" => idx, + "request_index" => request_index, + ); + STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.stale.inc(); + return false; + } + + let mut snap_data = RaftSnapshotData::default(); + if let Err(e) = snap_data.merge_from_bytes(snap.get_data()) { + error!( + self.logger(), + "failed to decode snapshot, it may be corrupted"; + "err" => ?e, + ); + STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.decode.inc(); + return false; + } + let snap_epoch = snap_data.get_region().get_region_epoch(); + let latest_epoch = self.region().get_region_epoch(); + if snap_epoch.get_conf_ver() < latest_epoch.get_conf_ver() { + info!( + self.logger(), + "snapshot epoch is stale"; + "snap_epoch" => ?snap_epoch, + "latest_epoch" => ?latest_epoch, + ); + STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.epoch.inc(); + return false; + } + + true + } + + /// Cancel generating snapshot. + pub fn cancel_generating_snap(&self, compact_to: Option) { + let mut snap_state = self.snap_state_mut(); + let SnapState::Generating { + ref canceled, + ref index, + } = *snap_state else { return }; + + if let Some(idx) = compact_to { + let snap_index = index.load(Ordering::SeqCst); + if snap_index == 0 || idx <= snap_index + 1 { + return; + } + } + canceled.store(true, Ordering::SeqCst); + *snap_state = SnapState::Relax; + self.gen_snap_task_mut().take(); + info!( + self.logger(), + "snapshot is canceled"; + "compact_to" => compact_to, + ); + STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.cancel.inc(); + } + + /// Try to switch snap state to generated. only `Generating` can switch to + /// `Generated`. + /// TODO: make the snap state more clearer, the snapshot must be consumed. 
+ pub fn on_snapshot_generated(&self, res: GenSnapRes) -> bool { + if res.is_none() { + self.cancel_generating_snap(None); + return false; + } + let snap = res.unwrap(); + let mut snap_state = self.snap_state_mut(); + let SnapState::Generating { + ref canceled, + ref index, + } = *snap_state else { return false }; + + if snap.get_metadata().get_index() < index.load(Ordering::SeqCst) { + warn!( + self.logger(), + "snapshot is staled, skip"; + "snap index" => snap.get_metadata().get_index(), + "required index" => index.load(Ordering::SeqCst), + ); + return false; + } + // Should changed `SnapState::Generated` to `SnapState::Relax` when the + // snap is consumed or canceled. Such as leader changed, the state of generated + // should be reset. + *snap_state = SnapState::Generated(snap); + true + } + + pub fn on_applied_snapshot(&mut self) { + let mut entry = self.entry_storage_mut(); + let term = entry.truncated_term(); + let index = entry.truncated_index(); + entry.set_applied_term(term); + entry.apply_state_mut().set_applied_index(index); + self.region_state_mut().set_tablet_index(index); + } + + pub fn apply_snapshot( + &mut self, + snap: &Snapshot, + task: &mut WriteTask, + snap_mgr: TabletSnapManager, + tablet_factory: Arc>, + ) -> Result<()> { + let region_id = self.region().get_id(); + let peer_id = self.peer().get_id(); + info!( + self.logger(), + "begin to apply snapshot"; + ); + + let mut snap_data = RaftSnapshotData::default(); + snap_data.merge_from_bytes(snap.get_data())?; + let region = snap_data.take_region(); + if region.get_id() != region_id { + return Err(box_err!( + "mismatch region id {}!={}", + region_id, + region.get_id() + )); + } + + let last_index = snap.get_metadata().get_index(); + let last_term = snap.get_metadata().get_term(); + self.region_state_mut().set_state(PeerState::Normal); + self.region_state_mut().set_region(region); + self.entry_storage_mut() + .raft_state_mut() + .set_last_index(last_index); + 
self.entry_storage_mut().set_truncated_index(last_index); + self.entry_storage_mut().set_truncated_term(last_term); + self.entry_storage_mut().set_last_term(last_term); + + let key = TabletSnapKey::new(region_id, peer_id, last_term, last_index); + let mut path = snap_mgr.final_recv_path(&key); + let logger = self.logger().clone(); + // The snapshot require no additional processing such as ingest them to DB, but + // it should load it into the factory after it persisted. + let hook = move || { + if let Err(e) = tablet_factory.load_tablet(path.as_path(), region_id, last_index) { + panic!( + "{:?} failed to load tablet, path: {}, {:?}", + logger.list(), + path.display(), + e + ); + } + }; + task.persisted_cb = (Some(Box::new(hook))); + task.has_snapshot = true; + Ok(()) + } +} diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 068e5124c0c..06101da8d83 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -1,11 +1,12 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::mem; +use std::{mem, sync::Arc}; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{KvEngine, TabletFactory}; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; -use raftstore::store::fsm::apply::DEFAULT_APPLY_WB_SIZE; +use raftstore::store::{fsm::apply::DEFAULT_APPLY_WB_SIZE, ReadTask}; use slog::Logger; +use tikv_util::worker::Scheduler; use super::Peer; use crate::{ @@ -18,10 +19,13 @@ use crate::{ /// Apply applies all the committed commands to kv db. 
pub struct Apply { peer: metapb::Peer, + /// publish the update of the tablet remote_tablet: CachedTablet, tablet: EK, write_batch: Option, + tablet_factory: Arc>, + callbacks: Vec<(Vec, RaftCmdResponse)>, /// A flag indicates whether the peer is destroyed by applying admin @@ -34,6 +38,7 @@ pub struct Apply { region_state: RegionLocalState, res_reporter: R, + read_scheduler: Scheduler>, pub(crate) logger: Logger, } @@ -44,6 +49,8 @@ impl Apply { region_state: RegionLocalState, res_reporter: R, mut remote_tablet: CachedTablet, + tablet_factory: Arc>, + read_scheduler: Scheduler>, logger: Logger, ) -> Self { Apply { @@ -57,11 +64,18 @@ impl Apply { applied_term: 0, admin_cmd_result: vec![], region_state, + tablet_factory, + read_scheduler, res_reporter, logger, } } + #[inline] + pub fn tablet_factory(&self) -> &Arc> { + &self.tablet_factory + } + #[inline] pub fn res_reporter(&self) -> &R { &self.res_reporter @@ -96,6 +110,11 @@ impl Apply { (self.applied_index, self.applied_term) } + #[inline] + pub fn read_scheduler(&self) -> &Scheduler> { + &self.read_scheduler + } + #[inline] pub fn region_state(&self) -> &RegionLocalState { &self.region_state @@ -116,6 +135,11 @@ impl Apply { self.tablet = tablet; } + #[inline] + pub fn tablet(&self) -> &EK { + &self.tablet + } + #[inline] pub fn peer(&self) -> &metapb::Peer { &self.peer diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 650c410cef9..a9730a036e7 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -1,26 +1,45 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{mem, sync::Arc}; +use std::{ + mem, + sync::{atomic::Ordering, Arc}, + time::{Duration, Instant}, +}; +use collections::HashMap; use crossbeam::atomic::AtomicCell; use engine_traits::{KvEngine, OpenOptions, RaftEngine, TabletFactory}; -use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb}; +use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, pdpb, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; use raft::{RawNode, StateRole}; -use raftstore::store::{ - util::{Lease, RegionReadProgress}, - Config, EntryStorage, ProposalQueue, ReadDelegate, ReadIndexQueue, TrackVer, TxnExt, +use raftstore::{ + coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason}, + store::{ + fsm::Proposal, + util::{Lease, RegionReadProgress}, + Config, EntryStorage, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, ReadProgress, + TxnExt, + }, + Error, +}; +use slog::{debug, error, info, o, warn, Logger}; +use tikv_util::{ + box_err, + config::ReadableSize, + time::{monotonic_raw_now, Instant as TiInstant}, + worker::Scheduler, + Either, }; -use slog::Logger; -use tikv_util::{box_err, config::ReadableSize}; use time::Timespec; use super::{storage::Storage, Apply}; use crate::{ + batch::StoreContext, fsm::{ApplyFsm, ApplyScheduler}, - operation::{AsyncWriter, DestroyProgress, SimpleWriteEncoder}, + operation::{AsyncWriter, DestroyProgress, ProposalControl, SimpleWriteEncoder}, router::{CmdResChannel, QueryResChannel}, tablet::CachedTablet, + worker::PdTask, Result, }; @@ -28,12 +47,18 @@ const REGION_READ_PROGRESS_CAP: usize = 128; /// A peer that delegates commands between state machine and raft. pub struct Peer { - raft_group: RawNode>, + raft_group: RawNode>, tablet: CachedTablet, + + /// Statistics for self. + self_stat: PeerStat, + /// We use a cache for looking up peers. Not all peers exist in region's /// peer list, for example, an isolated peer may need to send/receive /// messages with unknown peers after recovery. 
peer_cache: Vec, + /// Statistics for other peers, only maintained when self is the leader. + peer_heartbeats: HashMap, /// Encoder for batching proposals and encoding them in a more efficient way /// than protobuf. @@ -55,9 +80,14 @@ pub struct Peer { /// region buckets. region_buckets: Option, + last_region_buckets: Option, + /// Transaction extensions related to this peer. txn_ext: Arc, txn_extra_op: Arc>, + + /// Check whether this proposal can be proposed based on its epoch. + proposal_control: ProposalControl, } impl Peer { @@ -67,28 +97,13 @@ impl Peer { pub fn new( cfg: &Config, tablet_factory: &dyn TabletFactory, - storage: Storage, + storage: Storage, ) -> Result { let logger = storage.logger().clone(); let applied_index = storage.apply_state().get_applied_index(); let peer_id = storage.peer().get_id(); - - let raft_cfg = raft::Config { - id: peer_id, - election_tick: cfg.raft_election_timeout_ticks, - heartbeat_tick: cfg.raft_heartbeat_ticks, - min_election_tick: cfg.raft_min_election_timeout_ticks, - max_election_tick: cfg.raft_max_election_timeout_ticks, - max_size_per_msg: cfg.raft_max_size_per_msg.0, - max_inflight_msgs: cfg.raft_max_inflight_msgs, - applied: applied_index, - check_quorum: true, - skip_bcast_commit: true, - pre_vote: cfg.prevote, - max_committed_size_per_ready: ReadableSize::mb(16).0, - ..Default::default() - }; + let raft_cfg = cfg.new_raft_config(peer_id, applied_index); let region_id = storage.region().get_id(); let tablet_index = storage.region_state().get_tablet_index(); @@ -119,7 +134,9 @@ impl Peer { let tag = format!("[region {}] {}", region.get_id(), peer_id); let mut peer = Peer { tablet, + self_stat: PeerStat::default(), peer_cache: vec![], + peer_heartbeats: HashMap::default(), raw_write_encoder: None, proposals: ProposalQueue::new(region_id, raft_group.raft.id), async_writer: AsyncWriter::new(region_id, peer_id), @@ -128,20 +145,22 @@ impl Peer { destroy_progress: DestroyProgress::None, raft_group, logger, - 
pending_reads: ReadIndexQueue::new(tag.clone()), + pending_reads: ReadIndexQueue::new(tag), read_progress: Arc::new(RegionReadProgress::new( ®ion, applied_index, REGION_READ_PROGRESS_CAP, - tag, + peer_id, )), leader_lease: Lease::new( cfg.raft_store_max_leader_lease(), cfg.renew_leader_lease_advance_duration(), ), region_buckets: None, + last_region_buckets: None, txn_ext: Arc::default(), txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::Noop)), + proposal_control: ProposalControl::new(0), }; // If this region has only one peer and I am the one, campaign directly. @@ -153,6 +172,8 @@ impl Peer { peer.raft_group.campaign()?; peer.set_has_ready(); } + let term = peer.term(); + peer.proposal_control.maybe_update_term(term); Ok(peer) } @@ -167,6 +188,63 @@ impl Peer { self.region().get_id() } + /// Set the region of a peer. + /// + /// This will update the region of the peer, caller must ensure the region + /// has been preserved in a durable device. + pub fn set_region( + &mut self, + // host: &CoprocessorHost, + reader: &mut ReadDelegate, + region: metapb::Region, + reason: RegionChangeReason, + tablet_index: u64, + ) { + if self.region().get_region_epoch().get_version() < region.get_region_epoch().get_version() + { + // Epoch version changed, disable read on the local reader for this region. + self.leader_lease.expire_remote_lease(); + } + + let mut region_state = RegionLocalState::default(); + region_state.set_region(region.clone()); + region_state.set_tablet_index(tablet_index); + region_state.set_state(self.storage().region_state().get_state()); + self.storage_mut().set_region_state(region_state); + + let progress = ReadProgress::region(region); + // Always update read delegate's region to avoid stale region info after a + // follower becoming a leader. + self.maybe_update_read_progress(reader, progress); + + if self.is_leader() { + // Unlike v1, we should renew remote lease if it's leader. 
This is because v2 + // only provides read in local reader which requires passing the lease check. If + // lease check fails, it sends query to raftstore to make it renew the remote + // lease. However, raftstore will answer immediately if the `bound` in + // `leader_lease` is valid, so the remote lease will not be updated. + if let Some(progress) = self + .leader_lease + .maybe_new_remote_lease(self.term()) + .map(ReadProgress::leader_lease) + { + self.maybe_update_read_progress(reader, progress); + } + } + + // Update leader info + self.read_progress + .update_leader_info(self.leader_id(), self.term(), self.region()); + + { + let mut pessimistic_locks = self.txn_ext.pessimistic_locks.write(); + pessimistic_locks.term = self.term(); + pessimistic_locks.version = self.region().get_region_epoch().get_version(); + } + + // TODO: CoprocessorHost + } + #[inline] pub fn peer(&self) -> &metapb::Peer { self.raft_group.store().peer() @@ -178,7 +256,7 @@ impl Peer { } #[inline] - pub fn storage(&self) -> &Storage { + pub fn storage(&self) -> &Storage { self.raft_group.store() } @@ -203,7 +281,7 @@ impl Peer { } #[inline] - pub fn storage_mut(&mut self) -> &mut Storage { + pub fn storage_mut(&mut self) -> &mut Storage { self.raft_group.mut_store() } @@ -218,12 +296,12 @@ impl Peer { } #[inline] - pub fn entry_storage(&self) -> &EntryStorage { + pub fn entry_storage(&self) -> &EntryStorage { self.raft_group.store().entry_storage() } #[inline] - pub fn entry_storage_mut(&mut self) -> &mut EntryStorage { + pub fn entry_storage_mut(&mut self) -> &mut EntryStorage { self.raft_group.mut_store().entry_storage_mut() } @@ -238,15 +316,25 @@ impl Peer { } #[inline] - pub fn raft_group(&self) -> &RawNode> { + pub fn raft_group(&self) -> &RawNode> { &self.raft_group } #[inline] - pub fn raft_group_mut(&mut self) -> &mut RawNode> { + pub fn raft_group_mut(&mut self) -> &mut RawNode> { &mut self.raft_group } + #[inline] + pub fn set_raft_group(&mut self, raft_group: RawNode>) { + 
self.raft_group = raft_group; + } + + #[inline] + pub fn self_stat(&self) -> &PeerStat { + &self.self_stat + } + /// Mark the peer has a ready so it will be checked at the end of every /// processing round. #[inline] @@ -294,6 +382,57 @@ impl Peer { .cloned() } + #[inline] + pub fn update_peer_statistics(&mut self) { + if !self.is_leader() { + self.peer_heartbeats.clear(); + return; + } + + if self.peer_heartbeats.len() == self.region().get_peers().len() { + return; + } + + // Insert heartbeats in case that some peers never response heartbeats. + let region = self.raft_group.store().region(); + for peer in region.get_peers() { + self.peer_heartbeats + .entry(peer.get_id()) + .or_insert_with(Instant::now); + } + } + + #[inline] + pub fn add_peer_heartbeat(&mut self, peer_id: u64, now: Instant) { + self.peer_heartbeats.insert(peer_id, now); + } + + #[inline] + pub fn remove_peer_heartbeat(&mut self, peer_id: u64) { + self.peer_heartbeats.remove(&peer_id); + } + + pub fn collect_down_peers(&self, max_duration: Duration) -> Vec { + let mut down_peers = Vec::new(); + let now = Instant::now(); + for p in self.region().get_peers() { + if p.get_id() == self.peer_id() { + continue; + } + if let Some(instant) = self.peer_heartbeats.get(&p.get_id()) { + let elapsed = instant.saturating_duration_since(now); + if elapsed >= max_duration { + let mut stats = pdpb::PeerStats::default(); + stats.set_peer(p.clone()); + stats.set_down_seconds(elapsed.as_secs()); + down_peers.push(stats); + } + } + } + // TODO: `refill_disk_full_peers` + down_peers + } + #[inline] pub fn is_leader(&self) -> bool { self.raft_group.raft.state == StateRole::Leader @@ -323,18 +462,6 @@ impl Peer { self.raft_group.raft.term } - #[inline] - // TODO - pub fn is_splitting(&self) -> bool { - false - } - - #[inline] - // TODO - pub fn is_merging(&self) -> bool { - false - } - #[inline] // TODO pub fn has_force_leader(&self) -> bool { @@ -391,24 +518,6 @@ impl Peer { &self.proposals } - #[inline] - pub fn 
ready_to_handle_read(&self) -> bool { - // TODO: It may cause read index to wait a long time. - - // There may be some values that are not applied by this leader yet but the old - // leader, if applied_term isn't equal to current term. - self.applied_to_current_term() - // There may be stale read if the old leader splits really slow, - // the new region may already elected a new leader while - // the old leader still think it owns the split range. - && !self.is_splitting() - // There may be stale read if a target leader is in another store and - // applied commit merge, written new values, but the sibling peer in - // this store does not apply commit merge, so the leader is not ready - // to read, until the merge is rollbacked. - && !self.is_merging() - } - pub fn apply_scheduler(&self) -> &ApplyScheduler { self.apply_scheduler.as_ref().unwrap() } @@ -418,6 +527,34 @@ impl Peer { self.apply_scheduler = Some(apply_scheduler); } + #[inline] + pub fn post_split(&mut self) { + self.reset_region_buckets(); + } + + pub fn reset_region_buckets(&mut self) { + if self.region_buckets.is_some() { + self.last_region_buckets = self.region_buckets.take(); + } + } + + pub fn maybe_campaign(&mut self) -> bool { + if self.region().get_peers().len() <= 1 { + // The peer campaigned when it was created, no need to do it again. + return false; + } + + // If last peer is the leader of the region before split, it's intuitional for + // it to become the leader of new split region. 
+ let _ = self.raft_group.campaign(); + true + } + + #[inline] + pub fn txn_ext(&self) -> &Arc { + &self.txn_ext + } + pub fn generate_read_delegate(&self) -> ReadDelegate { let peer_id = self.peer().get_id(); @@ -432,4 +569,35 @@ impl Peer { self.region_buckets.as_ref().map(|b| b.meta.clone()), ) } + + #[inline] + pub fn proposal_control_mut(&mut self) -> &mut ProposalControl { + &mut self.proposal_control + } + + #[inline] + pub fn proposal_control(&self) -> &ProposalControl { + &self.proposal_control + } + + #[inline] + pub fn proposal_control_advance_apply(&mut self, apply_index: u64) { + let region = self.raft_group.store().region(); + let term = self.term(); + self.proposal_control + .advance_apply(apply_index, term, region); + } + + // TODO: find a better place to put all txn related stuff. + pub fn require_updating_max_ts(&self, ctx: &StoreContext) { + let epoch = self.region().get_region_epoch(); + let term_low_bits = self.term() & ((1 << 32) - 1); // 32 bits + let version_lot_bits = epoch.get_version() & ((1 << 31) - 1); // 31 bits + let initial_status = (term_low_bits << 32) | (version_lot_bits << 1); + self.txn_ext + .max_ts_sync_status + .store(initial_status, Ordering::SeqCst); + + self.update_max_timestamp_pd(ctx, initial_status); + } } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index b08624b1185..b3ad56af4fd 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -1,8 +1,12 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::fmt::{self, Debug, Formatter}; +use std::{ + cell::{RefCell, RefMut}, + fmt::{self, Debug, Formatter}, + sync::{mpsc::Receiver, Arc}, +}; -use engine_traits::{RaftEngine, RaftLogBatch}; +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::{ metapb::{self, Region}, raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, @@ -12,12 +16,15 @@ use raft::{ GetEntriesContext, RaftState, INVALID_ID, }; use raftstore::store::{ - util, EntryStorage, RaftlogFetchTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, + util, EntryStorage, ReadTask, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; -use slog::{o, Logger}; +use slog::{info, o, Logger}; use tikv_util::{box_err, store::find_peer, worker::Scheduler}; -use crate::Result; +use crate::{ + operation::{GenSnapTask, SnapState}, + Result, +}; pub fn write_initial_states(wb: &mut impl RaftLogBatch, region: Region) -> Result<()> { let region_id = region.get_id(); @@ -49,8 +56,8 @@ pub fn write_initial_states(wb: &mut impl RaftLogBatch, region: Region) -> Resul /// A storage for raft. /// /// It's similar to `PeerStorage` in v1. -pub struct Storage { - entry_storage: EntryStorage, +pub struct Storage { + entry_storage: EntryStorage, peer: metapb::Peer, region_state: RegionLocalState, /// Whether states has been persisted before. If a peer is just created by @@ -58,9 +65,13 @@ pub struct Storage { /// at least once dispite whether the state changes since create. ever_persisted: bool, logger: Logger, + + /// Snapshot part. 
+ snap_state: RefCell, + gen_snap_task: RefCell>>, } -impl Debug for Storage { +impl Debug for Storage { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!( f, @@ -71,14 +82,14 @@ impl Debug for Storage { } } -impl Storage { +impl Storage { #[inline] - pub fn entry_storage(&self) -> &EntryStorage { + pub fn entry_storage(&self) -> &EntryStorage { &self.entry_storage } #[inline] - pub fn entry_storage_mut(&mut self) -> &mut EntryStorage { + pub fn entry_storage_mut(&mut self) -> &mut EntryStorage { &mut self.entry_storage } @@ -101,9 +112,19 @@ impl Storage { pub fn logger(&self) -> &Logger { &self.logger } + + #[inline] + pub fn snap_state_mut(&self) -> RefMut<'_, SnapState> { + self.snap_state.borrow_mut() + } + + #[inline] + pub fn gen_snap_task_mut(&self) -> RefMut<'_, Box>> { + self.gen_snap_task.borrow_mut() + } } -impl Storage { +impl Storage { /// Creates a new storage with uninit states. /// /// This should only be used for creating new peer from raft message. @@ -111,7 +132,7 @@ impl Storage { store_id: u64, region: Region, engine: ER, - log_fetch_scheduler: Scheduler, + read_scheduler: Scheduler>, logger: &Logger, ) -> Result { let mut region_state = RegionLocalState::default(); @@ -122,7 +143,7 @@ impl Storage { RaftLocalState::default(), RaftApplyState::default(), engine, - log_fetch_scheduler, + read_scheduler, false, logger, ) @@ -136,9 +157,9 @@ impl Storage { region_id: u64, store_id: u64, engine: ER, - log_fetch_scheduler: Scheduler, + read_scheduler: Scheduler>, logger: &Logger, - ) -> Result>> { + ) -> Result>> { let region_state = match engine.get_region_state(region_id) { Ok(Some(s)) => s, res => { @@ -174,7 +195,50 @@ impl Storage { raft_state, apply_state, engine, - log_fetch_scheduler, + read_scheduler, + true, + logger, + ) + .map(Some) + } + + /// Creates a new storage for split peer. + /// + /// Except for region local state which uses the `region` provided with the + /// inital tablet index, all uses the inital states. 
+ pub fn with_split( + store_id: u64, + region: &metapb::Region, + engine: ER, + read_scheduler: Scheduler>, + logger: &Logger, + ) -> Result>> { + let mut region_state = RegionLocalState::default(); + region_state.set_region(region.clone()); + region_state.set_state(PeerState::Normal); + region_state.set_tablet_index(RAFT_INIT_LOG_INDEX); + + let mut apply_state = RaftApplyState::default(); + apply_state.set_applied_index(RAFT_INIT_LOG_INDEX); + apply_state + .mut_truncated_state() + .set_index(RAFT_INIT_LOG_INDEX); + apply_state + .mut_truncated_state() + .set_term(RAFT_INIT_LOG_TERM); + + let mut raft_state = RaftLocalState::default(); + raft_state.set_last_index(RAFT_INIT_LOG_INDEX); + raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); + raft_state.mut_hard_state().set_commit(RAFT_INIT_LOG_INDEX); + + Self::create( + store_id, + region_state, + raft_state, + apply_state, + engine, + read_scheduler, true, logger, ) @@ -187,7 +251,7 @@ impl Storage { raft_state: RaftLocalState, apply_state: RaftApplyState, engine: ER, - log_fetch_scheduler: Scheduler, + read_scheduler: Scheduler>, persisted: bool, logger: &Logger, ) -> Result { @@ -206,7 +270,7 @@ impl Storage { raft_state, apply_state, region, - log_fetch_scheduler, + read_scheduler, )?; Ok(Storage { @@ -215,14 +279,26 @@ impl Storage { region_state, ever_persisted: persisted, logger, + snap_state: RefCell::new(SnapState::Relax), + gen_snap_task: RefCell::new(Box::new(None)), }) } + #[inline] + pub fn region_state_mut(&mut self) -> &mut RegionLocalState { + &mut self.region_state + } + #[inline] pub fn raft_state(&self) -> &RaftLocalState { self.entry_storage.raft_state() } + #[inline] + pub fn read_scheduler(&self) -> Scheduler> { + self.entry_storage.read_scheduler() + } + #[inline] pub fn apply_state(&self) -> &RaftApplyState { self.entry_storage.apply_state() @@ -241,6 +317,19 @@ impl Storage { self.ever_persisted = true; } + #[inline] + pub fn take_gen_snap_task(&mut self) -> Option { + 
self.gen_snap_task.get_mut().take() + } + + #[inline] + pub fn tablet_index(&self) -> u64 { + match self.region_state.get_state() { + PeerState::Tombstone | PeerState::Applying => 0, + _ => self.region_state.get_tablet_index(), + } + } + #[inline] pub fn set_region_state(&mut self, state: RegionLocalState) { self.region_state = state; @@ -253,7 +342,7 @@ impl Storage { } } -impl raft::Storage for Storage { +impl raft::Storage for Storage { fn initial_state(&self) -> raft::Result { let hard_state = self.raft_state().get_hard_state().clone(); // We will persist hard state no matter if it's initialized or not in @@ -306,24 +395,68 @@ impl raft::Storage for Storage { } fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { - Err(raft::Error::Store( - raft::StorageError::SnapshotTemporarilyUnavailable, - )) + self.snapshot(request_index, to) } } #[cfg(test)] mod tests { - use engine_traits::{RaftEngine, RaftEngineReadOnly, RaftLogBatch}; + use std::{ + sync::mpsc::{sync_channel, SyncSender}, + time::Duration, + }; + + use engine_test::{ + ctor::{CfOptions, DbOptions}, + kv::{KvTestEngine, TestTabletFactoryV2}, + raft::RaftTestEngine, + }; + use engine_traits::{ + KvEngine, OpenOptions, RaftEngine, RaftEngineReadOnly, RaftLogBatch, TabletFactory, ALL_CFS, + }; use kvproto::{ metapb::{Peer, Region}, raft_serverpb::PeerState, }; - use raftstore::store::{RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM}; + use raft::{eraftpb::Snapshot as RaftSnapshot, Error as RaftError, StorageError}; + use raftstore::store::{ + util::new_empty_snapshot, AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, ReadTask, + TabletSnapKey, TabletSnapManager, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, + }; + use slog::o; use tempfile::TempDir; + use tikv_util::worker::{Runnable, Worker}; - #[test] - fn test_write_initial_states() { + use super::*; + use crate::{fsm::ApplyResReporter, raft::Apply, router::ApplyRes, tablet::CachedTablet}; + + #[derive(Clone)] + pub struct TestRouter { + ch: 
SyncSender, + } + + impl TestRouter { + pub fn new() -> (Self, Receiver) { + let (tx, rx) = sync_channel(1); + (Self { ch: tx }, rx) + } + } + + impl AsyncReadNotifier for TestRouter { + fn notify_logs_fetched(&self, _region_id: u64, _fetched_logs: FetchedLogs) { + unreachable!(); + } + + fn notify_snapshot_generated(&self, _region_id: u64, res: GenSnapRes) { + self.ch.send(res).unwrap(); + } + } + + impl ApplyResReporter for TestRouter { + fn report(&self, _res: ApplyRes) {} + } + + fn new_region() -> Region { let mut region = Region::default(); region.set_id(4); let mut p = Peer::default(); @@ -332,12 +465,17 @@ mod tests { region.mut_peers().push(p); region.mut_region_epoch().set_version(2); region.mut_region_epoch().set_conf_ver(4); + region + } + #[test] + fn test_write_initial_states() { + let region = new_region(); let path = TempDir::new().unwrap(); let engine = engine_test::new_temp_engine(&path); let raft_engine = &engine.raft; let mut wb = raft_engine.log_batch(10); - super::write_initial_states(&mut wb, region.clone()).unwrap(); + write_initial_states(&mut wb, region.clone()).unwrap(); assert!(!wb.is_empty()); raft_engine.consume(&mut wb, true).unwrap(); @@ -358,4 +496,151 @@ mod tests { assert_eq!(ts.get_index(), RAFT_INIT_LOG_INDEX); assert_eq!(ts.get_term(), RAFT_INIT_LOG_TERM); } + + #[test] + fn test_apply_snapshot() { + let region = new_region(); + let path = TempDir::new().unwrap(); + let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()); + mgr.init().unwrap(); + let raft_engine = + engine_test::raft::new_engine(&format!("{}", path.path().join("raft").display()), None) + .unwrap(); + let mut wb = raft_engine.log_batch(10); + write_initial_states(&mut wb, region.clone()).unwrap(); + assert!(!wb.is_empty()); + raft_engine.consume(&mut wb, true).unwrap(); + // building a tablet factory + let ops = DbOptions::default(); + let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let factory = 
Arc::new(TestTabletFactoryV2::new( + path.path().join("tablet").as_path(), + ops, + cf_opts, + )); + let mut worker = Worker::new("test-read-worker").lazy_build("test-read-worker"); + let sched = worker.scheduler(); + let logger = slog_global::borrow_global().new(o!()); + let mut s = Storage::new(4, 6, raft_engine.clone(), sched, &logger.clone()) + .unwrap() + .unwrap(); + + let snapshot = new_empty_snapshot(region.clone(), 10, 1, false); + let mut task = WriteTask::new(region.get_id(), 5, 0); + s.apply_snapshot(&snapshot, &mut task, mgr, factory) + .unwrap(); + + // It can be set before load tablet. + assert_eq!(PeerState::Normal, s.region_state().get_state()); + assert_eq!(10, s.entry_storage().truncated_index()); + assert_eq!(1, s.entry_storage().truncated_term()); + assert_eq!(1, s.entry_storage().last_term()); + assert_eq!(10, s.entry_storage().raft_state().last_index); + // This index can't be set before load tablet. + assert_ne!(10, s.entry_storage().applied_index()); + assert_ne!(1, s.entry_storage().applied_term()); + assert_ne!(10, s.region_state().get_tablet_index()); + assert!(task.persisted_cb.is_some()); + + s.on_applied_snapshot(); + assert_eq!(10, s.entry_storage().applied_index()); + assert_eq!(1, s.entry_storage().applied_term()); + assert_eq!(10, s.region_state().get_tablet_index()); + } + + #[test] + fn test_storage_create_snapshot() { + let region = new_region(); + let path = TempDir::new().unwrap(); + let raft_engine = + engine_test::raft::new_engine(&format!("{}", path.path().join("raft").display()), None) + .unwrap(); + let mut wb = raft_engine.log_batch(10); + write_initial_states(&mut wb, region.clone()).unwrap(); + assert!(!wb.is_empty()); + raft_engine.consume(&mut wb, true).unwrap(); + let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()); + mgr.init().unwrap(); + // building a tablet factory + let ops = DbOptions::default(); + let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let 
factory = Arc::new(TestTabletFactoryV2::new( + path.path().join("tablet").as_path(), + ops, + cf_opts, + )); + // create tablet with region_id 1 + let tablet = factory + .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) + .unwrap(); + // setup read runner worker and peer storage + let mut worker = Worker::new("test-read-worker").lazy_build("test-read-worker"); + let sched = worker.scheduler(); + let logger = slog_global::borrow_global().new(o!()); + let mut s = Storage::new(4, 6, raft_engine.clone(), sched.clone(), &logger.clone()) + .unwrap() + .unwrap(); + let (router, rx) = TestRouter::new(); + let mut read_runner = ReadRunner::new(router.clone(), raft_engine); + read_runner.set_snap_mgr(mgr.clone()); + worker.start(read_runner); + // setup peer applyer + let mut apply = Apply::new( + region.get_peers()[0].clone(), + RegionLocalState::default(), + router, + CachedTablet::new(Some(tablet)), + factory, + sched, + logger, + ); + + // Test get snapshot + let snap = s.snapshot(0, 7); + let unavailable = RaftError::Store(StorageError::SnapshotTemporarilyUnavailable); + assert_eq!(snap.unwrap_err(), unavailable); + let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); + apply.schedule_gen_snapshot(gen_task); + let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + s.on_snapshot_generated(res); + let snap = match *s.snap_state.borrow() { + SnapState::Generated(ref snap) => *snap.clone(), + ref s => panic!("unexpected state: {:?}", s), + }; + assert_eq!(snap.get_metadata().get_index(), 0); + assert_eq!(snap.get_metadata().get_term(), 0); + assert_eq!(snap.get_data().is_empty(), false); + let snap_key = TabletSnapKey::from_region_snap(4, 7, &snap); + let checkpointer_path = mgr.tablet_gen_path(&snap_key); + assert!(checkpointer_path.exists()); + + // Test cancel snapshot + let snap = s.snapshot(0, 0); + assert_eq!(snap.unwrap_err(), unavailable); + let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); + 
apply.schedule_gen_snapshot(gen_task); + let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + s.cancel_generating_snap(None); + assert_eq!(*s.snap_state.borrow(), SnapState::Relax); + + // Test get twice snapshot and cancel once. + // get snapshot a + let snap = s.snapshot(0, 0); + assert_eq!(snap.unwrap_err(), unavailable); + let gen_task_a = s.gen_snap_task.borrow_mut().take().unwrap(); + apply.set_apply_progress(1, 5); + apply.schedule_gen_snapshot(gen_task_a); + let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + s.cancel_generating_snap(None); + // cancel get snapshot a, try get snaphsot b + let snap = s.snapshot(0, 0); + assert_eq!(snap.unwrap_err(), unavailable); + let gen_task_b = s.gen_snap_task.borrow_mut().take().unwrap(); + apply.set_apply_progress(10, 5); + apply.schedule_gen_snapshot(gen_task_b); + // on snapshot a and b + assert_eq!(s.on_snapshot_generated(res), false); + let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + assert_eq!(s.on_snapshot_generated(res), true); + } } diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 78abef13247..8cb65e40a3c 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -8,15 +8,20 @@ use kvproto::{ raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, raft_serverpb::RaftMessage, }; -use raftstore::store::{FetchedLogs, LogFetchedNotifier, RegionSnapshot}; +use raft::eraftpb::Snapshot as RaftSnapshot; +use raftstore::store::{AsyncReadNotifier, FetchedLogs, GenSnapRes, RegionSnapshot}; use slog::Logger; use super::PeerMsg; use crate::{batch::StoreRouter, operation::LocalReader, StoreMeta}; -impl LogFetchedNotifier for StoreRouter { - fn notify(&self, region_id: u64, fetched: FetchedLogs) { - let _ = self.force_send(region_id, PeerMsg::FetchedLogs(fetched)); +impl AsyncReadNotifier for StoreRouter { + fn notify_logs_fetched(&self, region_id: u64, fetched_logs: FetchedLogs) { + let _ = 
self.force_send(region_id, PeerMsg::LogsFetched(fetched_logs)); + } + + fn notify_snapshot_generated(&self, region_id: u64, snapshot: GenSnapRes) { + let _ = self.force_send(region_id, PeerMsg::SnapshotGenerated(snapshot)); } } diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index e9893bad968..1507d404297 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -1,13 +1,13 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use kvproto::raft_serverpb::RegionLocalState; use raftstore::store::fsm::ChangePeer; -use crate::operation::{AdminCmdResult, CommittedEntries}; +use crate::operation::{AdminCmdResult, CommittedEntries, GenSnapTask}; #[derive(Debug)] pub enum ApplyTask { CommittedEntries(CommittedEntries), + Snapshot(GenSnapTask), } #[derive(Debug, Default)] diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index c607e389135..a4681d8a873 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -3,8 +3,10 @@ // #[PerformanceCriticalPath] use std::fmt; +use engine_traits::Snapshot; use kvproto::{raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; -use raftstore::store::{metrics::RaftEventDurationType, FetchedLogs}; +use raft::eraftpb::Snapshot as RaftSnapshot; +use raftstore::store::{metrics::RaftEventDurationType, FetchedLogs, GenSnapRes}; use tikv_util::time::Instant; use super::{ @@ -13,6 +15,7 @@ use super::{ }, ApplyRes, }; +use crate::operation::SplitInit; #[derive(Debug, Clone, Copy, PartialEq, Hash)] #[repr(u8)] @@ -123,9 +126,12 @@ pub enum PeerMsg { Tick(PeerTick), /// Result of applying committed entries. The message can't be lost. ApplyRes(ApplyRes), - FetchedLogs(FetchedLogs), + LogsFetched(FetchedLogs), + SnapshotGenerated(GenSnapRes), /// Start the FSM. 
Start, + /// Messages from peer to peer in the same store + SplitInit(Box), /// A message only used to notify a peer. Noop, /// A message that indicates an asynchronous write has finished. @@ -164,6 +170,9 @@ impl fmt::Debug for PeerMsg { }, PeerMsg::ApplyRes(res) => write!(fmt, "ApplyRes {:?}", res), PeerMsg::Start => write!(fmt, "Startup"), + PeerMsg::SplitInit(_) => { + write!(fmt, "Split initialization") + } PeerMsg::Noop => write!(fmt, "Noop"), PeerMsg::Persisted { peer_id, @@ -173,7 +182,8 @@ impl fmt::Debug for PeerMsg { "Persisted peer_id {}, ready_number {}", peer_id, ready_number ), - PeerMsg::FetchedLogs(fetched) => write!(fmt, "FetchedLogs {:?}", fetched), + PeerMsg::LogsFetched(fetched) => write!(fmt, "LogsFetched {:?}", fetched), + PeerMsg::SnapshotGenerated(_) => write!(fmt, "SnapshotGenerated"), PeerMsg::QueryDebugInfo(_) => write!(fmt, "QueryDebugInfo"), #[cfg(feature = "testexport")] PeerMsg::WaitFlush(_) => write!(fmt, "FlushMessages"), @@ -183,6 +193,7 @@ impl fmt::Debug for PeerMsg { pub enum StoreMsg { RaftMessage(Box), + SplitInit(Box), Tick(StoreTick), Start, } @@ -191,6 +202,7 @@ impl fmt::Debug for StoreMsg { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { StoreMsg::RaftMessage(_) => write!(fmt, "Raft Message"), + StoreMsg::SplitInit(_) => write!(fmt, "Split initialization"), StoreMsg::Tick(tick) => write!(fmt, "StoreTick {:?}", tick), StoreMsg::Start => write!(fmt, "Start store"), } diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index d68c414ca5f..b6da3c804f0 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -221,6 +221,13 @@ impl BaseSubscriber { pub async fn result(self) -> Option { WaitResult { core: &self.core }.await } + + /// Test if the result is ready without any polling. 
+ #[inline] + pub fn has_result(&self) -> bool { + let e = self.core.event.load(Ordering::Relaxed); + check_bit(e, fired_bit_of(PAYLOAD_EVENT)).is_some() + } } unsafe impl Send for BaseSubscriber {} diff --git a/components/raftstore-v2/src/worker/mod.rs b/components/raftstore-v2/src/worker/mod.rs new file mode 100644 index 00000000000..ad8249d22a4 --- /dev/null +++ b/components/raftstore-v2/src/worker/mod.rs @@ -0,0 +1,5 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +mod pd; + +pub use pd::{RegionHeartbeatTask as PdRegionHeartbeatTask, Runner as PdRunner, Task as PdTask}; diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs new file mode 100644 index 00000000000..132678e21f2 --- /dev/null +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -0,0 +1,327 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + fmt::{self, Display, Formatter}, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use causal_ts::CausalTsProviderImpl; +use collections::HashMap; +use concurrency_manager::ConcurrencyManager; +use engine_traits::{KvEngine, RaftEngine, TabletFactory}; +use kvproto::{metapb, pdpb}; +use pd_client::PdClient; +use raftstore::store::{util::KeysInfoFormatter, TxnExt}; +use slog::{error, info, Logger}; +use tikv_util::{time::UnixSecs, worker::Runnable}; +use yatp::{task::future::TaskCell, Remote}; + +use crate::{batch::StoreRouter, router::PeerMsg}; + +mod region_heartbeat; +mod split; +mod store_heartbeat; +mod update_max_timestamp; + +pub use region_heartbeat::RegionHeartbeatTask; + +pub enum Task { + RegionHeartbeat(RegionHeartbeatTask), + StoreHeartbeat { + stats: pdpb::StoreStats, + // TODO: StoreReport, StoreDrAutoSyncStatus + }, + DestroyPeer { + region_id: u64, + }, + AskBatchSplit { + region: metapb::Region, + split_keys: Vec>, + peer: metapb::Peer, + right_derive: bool, + }, + ReportBatchSplit { + regions: Vec, + }, + UpdateMaxTimestamp 
{ + region_id: u64, + initial_status: u64, + txn_ext: Arc, + }, +} + +impl Display for Task { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match *self { + Task::RegionHeartbeat(ref hb_task) => write!( + f, + "region heartbeat for region {:?}, leader {}", + hb_task.region, + hb_task.peer.get_id(), + ), + Task::StoreHeartbeat { ref stats, .. } => { + write!(f, "store heartbeat stats: {:?}", stats) + } + Task::DestroyPeer { ref region_id } => { + write!(f, "destroy peer of region {}", region_id) + } + Task::AskBatchSplit { + ref region, + ref split_keys, + .. + } => write!( + f, + "ask split region {} with {}", + region.get_id(), + KeysInfoFormatter(split_keys.iter()) + ), + Task::ReportBatchSplit { ref regions } => write!(f, "report split {:?}", regions), + Task::UpdateMaxTimestamp { region_id, .. } => write!( + f, + "update the max timestamp for region {} in the concurrency manager", + region_id + ), + } + } +} + +pub struct Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + store_id: u64, + pd_client: Arc, + raft_engine: ER, + tablet_factory: Arc>, + router: StoreRouter, + + remote: Remote, + + region_peers: HashMap, + + // For store_heartbeat. + start_ts: UnixSecs, + store_stat: store_heartbeat::StoreStat, + + // For region_heartbeat. + region_cpu_records: HashMap, + is_hb_receiver_scheduled: bool, + + // For update_max_timestamp. 
+ concurrency_manager: ConcurrencyManager, + causal_ts_provider: Option>, + + logger: Logger, + shutdown: Arc, +} + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + pub fn new( + store_id: u64, + pd_client: Arc, + raft_engine: ER, + tablet_factory: Arc>, + router: StoreRouter, + remote: Remote, + concurrency_manager: ConcurrencyManager, + causal_ts_provider: Option>, // used for rawkv apiv2 + logger: Logger, + shutdown: Arc, + ) -> Self { + Self { + store_id, + pd_client, + raft_engine, + tablet_factory, + router, + remote, + region_peers: HashMap::default(), + start_ts: UnixSecs::zero(), + store_stat: store_heartbeat::StoreStat::default(), + region_cpu_records: HashMap::default(), + is_hb_receiver_scheduled: false, + concurrency_manager, + causal_ts_provider, + logger, + shutdown, + } + } +} + +impl Runnable for Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + type Task = Task; + + fn run(&mut self, task: Task) { + self.maybe_schedule_heartbeat_receiver(); + match task { + Task::RegionHeartbeat(task) => self.handle_region_heartbeat(task), + Task::StoreHeartbeat { stats } => self.handle_store_heartbeat(stats), + Task::DestroyPeer { region_id } => self.handle_destroy_peer(region_id), + Task::AskBatchSplit { + region, + split_keys, + peer, + right_derive, + } => self.handle_ask_batch_split(region, split_keys, peer, right_derive), + Task::ReportBatchSplit { regions } => self.handle_report_batch_split(regions), + Task::UpdateMaxTimestamp { + region_id, + initial_status, + txn_ext, + } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), + } + } +} + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + fn handle_destroy_peer(&mut self, region_id: u64) { + match self.region_peers.remove(®ion_id) { + None => {} + Some(_) => { + info!(self.logger, "remove peer statistic record in pd"; "region_id" => region_id) + } + } + } +} + +pub mod requests { + use 
kvproto::raft_cmdpb::{ + AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, + SplitRequest, + }; + use raft::eraftpb::ConfChangeType; + + use super::*; + + pub fn send_admin_request( + logger: &Logger, + router: &StoreRouter, + region_id: u64, + epoch: metapb::RegionEpoch, + peer: metapb::Peer, + request: AdminRequest, + ) where + EK: KvEngine, + ER: RaftEngine, + { + let cmd_type = request.get_cmd_type(); + + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header().set_region_epoch(epoch); + req.mut_header().set_peer(peer); + req.set_admin_request(request); + + let (msg, _) = PeerMsg::raft_command(req); + if let Err(e) = router.send(region_id, msg) { + error!( + logger, + "send request failed"; + "region_id" => region_id, "cmd_type" => ?cmd_type, "err" => ?e, + ); + } + } + + pub fn new_change_peer_request( + change_type: ConfChangeType, + peer: metapb::Peer, + ) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::ChangePeer); + req.mut_change_peer().set_change_type(change_type); + req.mut_change_peer().set_peer(peer); + req + } + + pub fn new_change_peer_v2_request(changes: Vec) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::ChangePeerV2); + let change_peer_reqs = changes + .into_iter() + .map(|mut c| { + let mut cp = ChangePeerRequest::default(); + cp.set_change_type(c.get_change_type()); + cp.set_peer(c.take_peer()); + cp + }) + .collect(); + let mut cp = ChangePeerV2Request::default(); + cp.set_changes(change_peer_reqs); + req.set_change_peer_v2(cp); + req + } + + pub fn new_split_region_request( + split_key: Vec, + new_region_id: u64, + peer_ids: Vec, + right_derive: bool, + ) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::Split); + req.mut_split().set_split_key(split_key); + req.mut_split().set_new_region_id(new_region_id); + 
req.mut_split().set_new_peer_ids(peer_ids); + req.mut_split().set_right_derive(right_derive); + req + } + + pub fn new_batch_split_region_request( + split_keys: Vec>, + ids: Vec, + right_derive: bool, + ) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSplit); + req.mut_splits().set_right_derive(right_derive); + let mut requests = Vec::with_capacity(ids.len()); + for (mut id, key) in ids.into_iter().zip(split_keys) { + let mut split = SplitRequest::default(); + split.set_split_key(key); + split.set_new_region_id(id.get_new_region_id()); + split.set_new_peer_ids(id.take_new_peer_ids()); + requests.push(split); + } + req.mut_splits().set_requests(requests.into()); + req + } + + pub fn new_transfer_leader_request( + peer: metapb::Peer, + peers: Vec, + ) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::TransferLeader); + req.mut_transfer_leader().set_peer(peer); + req.mut_transfer_leader().set_peers(peers.into()); + req + } + + pub fn new_merge_request(merge: pdpb::Merge) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::PrepareMerge); + req.mut_prepare_merge() + .set_target(merge.get_target().to_owned()); + req + } +} diff --git a/components/raftstore-v2/src/worker/pd/region_heartbeat.rs b/components/raftstore-v2/src/worker/pd/region_heartbeat.rs new file mode 100644 index 00000000000..ad0293d0b6d --- /dev/null +++ b/components/raftstore-v2/src/worker/pd/region_heartbeat.rs @@ -0,0 +1,256 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::time::Duration; + +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::{ + metapb, pdpb, + raft_cmdpb::{ + AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, + SplitRequest, + }, + raft_serverpb::RaftMessage, + replication_modepb::{RegionReplicationStatus, StoreDrAutoSyncStatus}, +}; +use pd_client::{metrics::PD_HEARTBEAT_COUNTER_VEC, PdClient, RegionStat}; +use raft::eraftpb::ConfChangeType; +use slog::{debug, error, info}; +use tikv_util::{store::QueryStats, time::UnixSecs}; + +use super::{requests::*, Runner}; + +pub struct RegionHeartbeatTask { + pub term: u64, + pub region: metapb::Region, + pub peer: metapb::Peer, + pub down_peers: Vec, + pub pending_peers: Vec, + pub written_bytes: u64, + pub written_keys: u64, + pub approximate_size: Option, + pub approximate_keys: Option, + pub wait_data_peers: Vec, + // TODO: RegionReplicationStatus +} + +#[derive(Default)] +pub struct PeerStat { + pub read_bytes: u64, + pub read_keys: u64, + pub query_stats: QueryStats, + // last_region_report_attributes records the state of the last region heartbeat + pub last_region_report_read_bytes: u64, + pub last_region_report_read_keys: u64, + pub last_region_report_query_stats: QueryStats, + pub last_region_report_written_bytes: u64, + pub last_region_report_written_keys: u64, + pub last_region_report_ts: UnixSecs, + // last_store_report_attributes records the state of the last store heartbeat + pub last_store_report_read_bytes: u64, + pub last_store_report_read_keys: u64, + pub last_store_report_query_stats: QueryStats, + pub approximate_keys: u64, + pub approximate_size: u64, +} + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + pub fn handle_region_heartbeat(&mut self, task: RegionHeartbeatTask) { + // HACK! In order to keep the compatible of protos, we use 0 to identify + // the size uninitialized regions, and use 1 to identify the empty regions. + // + // See tikv/tikv#11114 for details. 
+ let approximate_size = match task.approximate_size { + Some(0) => 1, + Some(v) => v, + None => 0, // size uninitialized + }; + let approximate_keys = task.approximate_keys.unwrap_or_default(); + let region_id = task.region.get_id(); + + let peer_stat = self + .region_peers + .entry(region_id) + .or_insert_with(PeerStat::default); + peer_stat.approximate_size = approximate_size; + peer_stat.approximate_keys = approximate_keys; + + let read_bytes_delta = peer_stat.read_bytes - peer_stat.last_region_report_read_bytes; + let read_keys_delta = peer_stat.read_keys - peer_stat.last_region_report_read_keys; + let written_bytes_delta = task.written_bytes - peer_stat.last_region_report_written_bytes; + let written_keys_delta = task.written_keys - peer_stat.last_region_report_written_keys; + let query_stats = peer_stat + .query_stats + .sub_query_stats(&peer_stat.last_region_report_query_stats); + let mut last_report_ts = peer_stat.last_region_report_ts; + if last_report_ts.is_zero() { + last_report_ts = self.start_ts; + } + peer_stat.last_region_report_written_bytes = task.written_bytes; + peer_stat.last_region_report_written_keys = task.written_keys; + peer_stat.last_region_report_read_bytes = peer_stat.read_bytes; + peer_stat.last_region_report_read_keys = peer_stat.read_keys; + peer_stat.last_region_report_query_stats = peer_stat.query_stats.clone(); + let unix_secs_now = UnixSecs::now(); + peer_stat.last_region_report_ts = unix_secs_now; + + // Calculate the CPU usage since the last region heartbeat. + let cpu_usage = { + // Take out the region CPU record. + let cpu_time_duration = Duration::from_millis( + self.region_cpu_records.remove(®ion_id).unwrap_or(0) as u64, + ); + let interval_second = unix_secs_now.into_inner() - last_report_ts.into_inner(); + // Keep consistent with the calculation of cpu_usages in a store heartbeat. + // See components/tikv_util/src/metrics/threads_linux.rs for more details. 
+ if interval_second > 0 { + ((cpu_time_duration.as_secs_f64() * 100.0) / interval_second as f64) as u64 + } else { + 0 + } + }; + + let region_stat = RegionStat { + down_peers: task.down_peers, + pending_peers: task.pending_peers, + written_bytes: written_bytes_delta, + written_keys: written_keys_delta, + read_bytes: read_bytes_delta, + read_keys: read_keys_delta, + query_stats: query_stats.0, + approximate_size, + approximate_keys, + last_report_ts, + cpu_usage, + }; + self.store_stat + .region_bytes_written + .observe(region_stat.written_bytes as f64); + self.store_stat + .region_keys_written + .observe(region_stat.written_keys as f64); + self.store_stat + .region_bytes_read + .observe(region_stat.read_bytes as f64); + self.store_stat + .region_keys_read + .observe(region_stat.read_keys as f64); + + let resp = self.pd_client.region_heartbeat( + task.term, + task.region.clone(), + task.peer, + region_stat, + None, + ); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + debug!( + logger, + "failed to send heartbeat"; + "region_id" => task.region.get_id(), + "err" => ?e + ); + } + }; + self.remote.spawn(f); + } + + pub fn maybe_schedule_heartbeat_receiver(&mut self) { + if self.is_hb_receiver_scheduled { + return; + } + let router = self.router.clone(); + let store_id = self.store_id; + let logger = self.logger.clone(); + + let fut = + self.pd_client + .handle_region_heartbeat_response(self.store_id, move |mut resp| { + let region_id = resp.get_region_id(); + let epoch = resp.take_region_epoch(); + let peer = resp.take_target_peer(); + + if resp.has_change_peer() { + PD_HEARTBEAT_COUNTER_VEC + .with_label_values(&["change peer"]) + .inc(); + + let mut change_peer = resp.take_change_peer(); + info!( + logger, + "try to change peer"; + "region_id" => region_id, + "change_type" => ?change_peer.get_change_type(), + "peer" => ?change_peer.get_peer() + ); + let req = new_change_peer_request( + change_peer.get_change_type(), + 
change_peer.take_peer(), + ); + send_admin_request(&logger, &router, region_id, epoch, peer, req); + } else if resp.has_change_peer_v2() { + PD_HEARTBEAT_COUNTER_VEC + .with_label_values(&["change peer"]) + .inc(); + + let mut change_peer_v2 = resp.take_change_peer_v2(); + info!( + logger, + "try to change peer"; + "region_id" => region_id, + "changes" => ?change_peer_v2.get_changes(), + ); + let req = new_change_peer_v2_request(change_peer_v2.take_changes().into()); + send_admin_request(&logger, &router, region_id, epoch, peer, req); + } else if resp.has_transfer_leader() { + PD_HEARTBEAT_COUNTER_VEC + .with_label_values(&["transfer leader"]) + .inc(); + + let mut transfer_leader = resp.take_transfer_leader(); + info!( + logger, + "try to transfer leader"; + "region_id" => region_id, + "from_peer" => ?peer, + "to_peer" => ?transfer_leader.get_peer(), + "to_peers" => ?transfer_leader.get_peers(), + ); + let req = new_transfer_leader_request( + transfer_leader.take_peer(), + transfer_leader.take_peers().into(), + ); + send_admin_request(&logger, &router, region_id, epoch, peer, req); + } else if resp.has_split_region() { + // TODO + info!(logger, "pd asks for split but ignored"); + } else if resp.has_merge() { + // TODO + info!(logger, "pd asks for merge but ignored"); + } else { + PD_HEARTBEAT_COUNTER_VEC.with_label_values(&["noop"]).inc(); + } + }); + let logger = self.logger.clone(); + let f = async move { + match fut.await { + Ok(_) => { + info!( + logger, + "region heartbeat response handler exit"; + "store_id" => store_id, + ); + } + Err(e) => panic!("unexpected error: {:?}", e), + } + }; + self.remote.spawn(f); + self.is_hb_receiver_scheduled = true; + } +} diff --git a/components/raftstore-v2/src/worker/pd/split.rs b/components/raftstore-v2/src/worker/pd/split.rs new file mode 100644 index 00000000000..3cb85f6698c --- /dev/null +++ b/components/raftstore-v2/src/worker/pd/split.rs @@ -0,0 +1,99 @@ +// Copyright 2022 TiKV Project Authors. 
Licensed under Apache-2.0. + +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::{ + metapb, pdpb, + raft_cmdpb::{AdminCmdType, AdminRequest, SplitRequest}, +}; +use pd_client::PdClient; +use slog::{info, warn}; + +use super::{requests::*, Runner}; + +fn new_batch_split_region_request( + split_keys: Vec>, + ids: Vec, + right_derive: bool, +) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSplit); + req.mut_splits().set_right_derive(right_derive); + let mut requests = Vec::with_capacity(ids.len()); + for (mut id, key) in ids.into_iter().zip(split_keys) { + let mut split = SplitRequest::default(); + split.set_split_key(key); + split.set_new_region_id(id.get_new_region_id()); + split.set_new_peer_ids(id.take_new_peer_ids()); + requests.push(split); + } + req.mut_splits().set_requests(requests.into()); + req +} + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + pub fn handle_ask_batch_split( + &mut self, + mut region: metapb::Region, + split_keys: Vec>, + peer: metapb::Peer, + right_derive: bool, + ) { + if split_keys.is_empty() { + info!(self.logger, "empty split key, skip ask batch split"; + "region_id" => region.get_id()); + return; + } + let resp = self + .pd_client + .ask_batch_split(region.clone(), split_keys.len()); + let router = self.router.clone(); + let logger = self.logger.clone(); + let f = async move { + match resp.await { + Ok(mut resp) => { + info!( + logger, + "try to batch split region"; + "region_id" => region.get_id(), + "new_region_ids" => ?resp.get_ids(), + "region" => ?region, + ); + + let req = new_batch_split_region_request( + split_keys, + resp.take_ids().into(), + right_derive, + ); + let region_id = region.get_id(); + let epoch = region.take_region_epoch(); + send_admin_request(&logger, &router, region_id, epoch, peer, req); + } + Err(e) => { + warn!( + logger, + "ask batch split failed"; + "region_id" => region.get_id(), + "err" => ?e, + ); + } + } + 
}; + self.remote.spawn(f); + } + + pub fn handle_report_batch_split(&mut self, regions: Vec) { + let resp = self.pd_client.report_batch_split(regions); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + warn!(logger, "report split failed"; "err" => ?e); + } + }; + self.remote.spawn(f); + } +} diff --git a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs new file mode 100644 index 00000000000..1caa96a5225 --- /dev/null +++ b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs @@ -0,0 +1,293 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::cmp; + +use collections::HashMap; +use engine_traits::{KvEngine, RaftEngine}; +use fail::fail_point; +use kvproto::pdpb; +use pd_client::{ + metrics::{ + REGION_READ_BYTES_HISTOGRAM, REGION_READ_KEYS_HISTOGRAM, REGION_WRITTEN_BYTES_HISTOGRAM, + REGION_WRITTEN_KEYS_HISTOGRAM, STORE_SIZE_GAUGE_VEC, + }, + PdClient, +}; +use prometheus::local::LocalHistogram; +use slog::{error, warn}; +use tikv_util::{metrics::RecordPairVec, store::QueryStats, time::UnixSecs, topn::TopN}; + +use super::Runner; + +const HOTSPOT_REPORT_CAPACITY: usize = 1000; + +fn hotspot_key_report_threshold() -> u64 { + const HOTSPOT_KEY_RATE_THRESHOLD: u64 = 128; + fail_point!("mock_hotspot_threshold", |_| { 0 }); + HOTSPOT_KEY_RATE_THRESHOLD * 10 +} + +fn hotspot_byte_report_threshold() -> u64 { + const HOTSPOT_BYTE_RATE_THRESHOLD: u64 = 8 * 1024; + fail_point!("mock_hotspot_threshold", |_| { 0 }); + HOTSPOT_BYTE_RATE_THRESHOLD * 10 +} + +fn hotspot_query_num_report_threshold() -> u64 { + const HOTSPOT_QUERY_RATE_THRESHOLD: u64 = 128; + fail_point!("mock_hotspot_threshold", |_| { 0 }); + HOTSPOT_QUERY_RATE_THRESHOLD * 10 +} + +pub struct StoreStat { + pub engine_total_bytes_read: u64, + pub engine_total_keys_read: u64, + pub engine_total_query_num: QueryStats, + pub engine_last_total_bytes_read: u64, + pub 
engine_last_total_keys_read: u64, + pub engine_last_query_num: QueryStats, + pub last_report_ts: UnixSecs, + + pub region_bytes_read: LocalHistogram, + pub region_keys_read: LocalHistogram, + pub region_bytes_written: LocalHistogram, + pub region_keys_written: LocalHistogram, + + pub store_cpu_usages: RecordPairVec, + pub store_read_io_rates: RecordPairVec, + pub store_write_io_rates: RecordPairVec, +} + +impl Default for StoreStat { + fn default() -> StoreStat { + StoreStat { + region_bytes_read: REGION_READ_BYTES_HISTOGRAM.local(), + region_keys_read: REGION_READ_KEYS_HISTOGRAM.local(), + region_bytes_written: REGION_WRITTEN_BYTES_HISTOGRAM.local(), + region_keys_written: REGION_WRITTEN_KEYS_HISTOGRAM.local(), + + last_report_ts: UnixSecs::zero(), + engine_total_bytes_read: 0, + engine_total_keys_read: 0, + engine_last_total_bytes_read: 0, + engine_last_total_keys_read: 0, + engine_total_query_num: QueryStats::default(), + engine_last_query_num: QueryStats::default(), + + store_cpu_usages: RecordPairVec::default(), + store_read_io_rates: RecordPairVec::default(), + store_write_io_rates: RecordPairVec::default(), + } + } +} + +#[derive(Default, Clone)] +struct PeerCmpReadStat { + pub region_id: u64, + pub report_stat: u64, +} + +impl Ord for PeerCmpReadStat { + fn cmp(&self, other: &Self) -> cmp::Ordering { + self.report_stat.cmp(&other.report_stat) + } +} + +impl Eq for PeerCmpReadStat {} + +impl PartialEq for PeerCmpReadStat { + fn eq(&self, other: &Self) -> bool { + self.report_stat == other.report_stat + } +} + +impl PartialOrd for PeerCmpReadStat { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.report_stat.cmp(&other.report_stat)) + } +} + +fn collect_report_read_peer_stats( + capacity: usize, + mut report_read_stats: HashMap, + mut stats: pdpb::StoreStats, +) -> pdpb::StoreStats { + if report_read_stats.len() < capacity * 3 { + for (_, read_stat) in report_read_stats { + stats.peer_stats.push(read_stat); + } + return stats; + } + let mut 
keys_topn_report = TopN::new(capacity); + let mut bytes_topn_report = TopN::new(capacity); + let mut stats_topn_report = TopN::new(capacity); + for read_stat in report_read_stats.values() { + let mut cmp_stat = PeerCmpReadStat::default(); + cmp_stat.region_id = read_stat.region_id; + let mut key_cmp_stat = cmp_stat.clone(); + key_cmp_stat.report_stat = read_stat.read_keys; + keys_topn_report.push(key_cmp_stat); + let mut byte_cmp_stat = cmp_stat.clone(); + byte_cmp_stat.report_stat = read_stat.read_bytes; + bytes_topn_report.push(byte_cmp_stat); + let mut query_cmp_stat = cmp_stat.clone(); + query_cmp_stat.report_stat = get_read_query_num(read_stat.get_query_stats()); + stats_topn_report.push(query_cmp_stat); + } + + for x in keys_topn_report { + if let Some(report_stat) = report_read_stats.remove(&x.region_id) { + stats.peer_stats.push(report_stat); + } + } + + for x in bytes_topn_report { + if let Some(report_stat) = report_read_stats.remove(&x.region_id) { + stats.peer_stats.push(report_stat); + } + } + + for x in stats_topn_report { + if let Some(report_stat) = report_read_stats.remove(&x.region_id) { + stats.peer_stats.push(report_stat); + } + } + stats +} + +fn get_read_query_num(stat: &pdpb::QueryStats) -> u64 { + stat.get_get() + stat.get_coprocessor() + stat.get_scan() +} + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + pub fn handle_store_heartbeat(&mut self, mut stats: pdpb::StoreStats) { + let mut report_peers = HashMap::default(); + for (region_id, region_peer) in &mut self.region_peers { + let read_bytes = region_peer.read_bytes - region_peer.last_store_report_read_bytes; + let read_keys = region_peer.read_keys - region_peer.last_store_report_read_keys; + let query_stats = region_peer + .query_stats + .sub_query_stats(®ion_peer.last_store_report_query_stats); + region_peer.last_store_report_read_bytes = region_peer.read_bytes; + region_peer.last_store_report_read_keys = region_peer.read_keys; + region_peer + 
.last_store_report_query_stats + .fill_query_stats(®ion_peer.query_stats); + if read_bytes < hotspot_byte_report_threshold() + && read_keys < hotspot_key_report_threshold() + && query_stats.get_read_query_num() < hotspot_query_num_report_threshold() + { + continue; + } + let mut read_stat = pdpb::PeerStat::default(); + read_stat.set_region_id(*region_id); + read_stat.set_read_keys(read_keys); + read_stat.set_read_bytes(read_bytes); + read_stat.set_query_stats(query_stats.0); + report_peers.insert(*region_id, read_stat); + } + + stats = collect_report_read_peer_stats(HOTSPOT_REPORT_CAPACITY, report_peers, stats); + let (capacity, used_size, available) = self.collect_engine_size().unwrap_or_default(); + if available == 0 { + warn!(self.logger, "no available space"); + } + + stats.set_capacity(capacity); + stats.set_used_size(used_size); + stats.set_available(available); + stats.set_bytes_read( + self.store_stat.engine_total_bytes_read - self.store_stat.engine_last_total_bytes_read, + ); + stats.set_keys_read( + self.store_stat.engine_total_keys_read - self.store_stat.engine_last_total_keys_read, + ); + + self.store_stat + .engine_total_query_num + .add_query_stats(stats.get_query_stats()); // add write query stat + let res = self + .store_stat + .engine_total_query_num + .sub_query_stats(&self.store_stat.engine_last_query_num); + stats.set_query_stats(res.0); + + stats.set_cpu_usages(self.store_stat.store_cpu_usages.clone().into()); + stats.set_read_io_rates(self.store_stat.store_read_io_rates.clone().into()); + stats.set_write_io_rates(self.store_stat.store_write_io_rates.clone().into()); + + let mut interval = pdpb::TimeInterval::default(); + interval.set_start_timestamp(self.store_stat.last_report_ts.into_inner()); + stats.set_interval(interval); + self.store_stat.engine_last_total_bytes_read = self.store_stat.engine_total_bytes_read; + self.store_stat.engine_last_total_keys_read = self.store_stat.engine_total_keys_read; + self.store_stat + .engine_last_query_num 
+ .fill_query_stats(&self.store_stat.engine_total_query_num); + self.store_stat.last_report_ts = UnixSecs::now(); + self.store_stat.region_bytes_written.flush(); + self.store_stat.region_keys_written.flush(); + self.store_stat.region_bytes_read.flush(); + self.store_stat.region_keys_read.flush(); + + STORE_SIZE_GAUGE_VEC + .with_label_values(&["capacity"]) + .set(capacity as i64); + STORE_SIZE_GAUGE_VEC + .with_label_values(&["available"]) + .set(available as i64); + STORE_SIZE_GAUGE_VEC + .with_label_values(&["used"]) + .set(used_size as i64); + + // TODO: slow score + + let router = self.router.clone(); + let resp = self.pd_client.store_heartbeat(stats, None, None); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + error!(logger, "store heartbeat failed"; "err" => ?e); + } + }; + self.remote.spawn(f); + } + + /// Returns (capacity, used, available). + fn collect_engine_size(&self) -> Option<(u64, u64, u64)> { + let disk_stats = match fs2::statvfs(self.tablet_factory.tablets_path()) { + Err(e) => { + error!( + self.logger, + "get disk stat for rocksdb failed"; + "engine_path" => self.tablet_factory.tablets_path().display(), + "err" => ?e + ); + return None; + } + Ok(stats) => stats, + }; + let disk_cap = disk_stats.total_space(); + // TODO: custom capacity. + let capacity = disk_cap; + // TODO: accurate snapshot size and kv engines size. + let snap_size = 0; + let kv_size = 0; + let used_size = snap_size + + kv_size + + self + .raft_engine + .get_engine_size() + .expect("raft engine used size"); + let mut available = capacity.checked_sub(used_size).unwrap_or_default(); + // We only care about rocksdb SST file size, so we should check disk available + // here. 
+ available = cmp::min(available, disk_stats.available_space()); + Some((capacity, used_size, available)) + } +} diff --git a/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs b/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs new file mode 100644 index 00000000000..cbfecb8171d --- /dev/null +++ b/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs @@ -0,0 +1,114 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + sync::{atomic::Ordering, Arc}, + time::{Duration, Instant}, +}; + +use causal_ts::CausalTsProvider; +use engine_traits::{KvEngine, RaftEngine}; +use fail::fail_point; +use futures::{compat::Future01CompatExt, FutureExt}; +use pd_client::PdClient; +use raftstore::{store::TxnExt, Result}; +use slog::{info, warn}; +use tikv_util::{box_err, timer::GLOBAL_TIMER_HANDLE}; +use txn_types::TimeStamp; + +use super::Runner; + +impl Runner +where + EK: KvEngine, + ER: RaftEngine, + T: PdClient + 'static, +{ + pub fn handle_update_max_timestamp( + &mut self, + region_id: u64, + initial_status: u64, + txn_ext: Arc, + ) { + let pd_client = self.pd_client.clone(); + let concurrency_manager = self.concurrency_manager.clone(); + let causal_ts_provider = self.causal_ts_provider.clone(); + let logger = self.logger.clone(); + let shutdown = self.shutdown.clone(); + + let f = async move { + let mut success = false; + while txn_ext.max_ts_sync_status.load(Ordering::SeqCst) == initial_status + && !shutdown.load(Ordering::Relaxed) + { + // On leader transfer / region merge, RawKV API v2 need to + // invoke causal_ts_provider.flush() to renew + // cached TSO, to ensure that the next TSO + // returned by causal_ts_provider.get_ts() on current + // store must be larger than the store where the leader is on + // before. + // + // And it won't break correctness of transaction commands, as + // causal_ts_provider.flush() is implemented as + // pd_client.get_tso() + renew TSO cached. 
+ let res: Result = if let Some(causal_ts_provider) = &causal_ts_provider { + causal_ts_provider + .async_flush() + .await + .map_err(|e| box_err!(e)) + } else { + pd_client.get_tso().await.map_err(Into::into) + }; + + match res { + Ok(ts) => { + concurrency_manager.update_max_ts(ts); + success = txn_ext + .max_ts_sync_status + .compare_exchange( + initial_status, + initial_status | 1, + Ordering::SeqCst, + Ordering::SeqCst, + ) + .is_ok(); + break; + } + Err(e) => { + warn!( + logger, + "failed to update max timestamp for region {}: {:?}", region_id, e + ); + } + } + } + + if success { + info!(logger, "succeed to update max timestamp"; "region_id" => region_id); + } else { + info!( + logger, + "updating max timestamp is stale"; + "region_id" => region_id, + "initial_status" => initial_status, + ); + } + }; + + #[cfg(feature = "failpoints")] + let delay = (|| { + fail_point!("delay_update_max_ts", |_| true); + false + })(); + #[cfg(not(feature = "failpoints"))] + let delay = false; + + if delay { + info!(self.logger, "[failpoint] delay update max ts for 1s"; "region_id" => region_id); + let deadline = Instant::now() + Duration::from_secs(1); + self.remote + .spawn(GLOBAL_TIMER_HANDLE.delay(deadline).compat().then(|_| f)); + } else { + self.remote.spawn(f); + } + } +} diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 1d458d7a73e..24184233117 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -11,7 +11,9 @@ use std::{ time::{Duration, Instant}, }; +use causal_ts::CausalTsProviderImpl; use collections::HashSet; +use concurrency_manager::ConcurrencyManager; use crossbeam::channel::{self, Receiver, Sender, TrySendError}; use engine_test::{ ctor::{CfOptions, DbOptions}, @@ -21,12 +23,16 @@ use engine_test::{ use engine_traits::{OpenOptions, TabletFactory, ALL_CFS}; use futures::executor::block_on; use kvproto::{ - 
metapb::Store, + metapb::{self, RegionEpoch, Store}, raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, raft_serverpb::RaftMessage, }; use pd_client::RpcClient; -use raftstore::store::{region_meta::RegionMeta, Config, Transport, RAFT_INIT_LOG_INDEX}; +use raft::eraftpb::MessageType; +use raftstore::store::{ + region_meta::{RegionLocalState, RegionMeta}, + Config, TabletSnapKey, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX, +}; use raftstore_v2::{ create_store_batch_system, router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter}, @@ -145,6 +151,32 @@ impl TestRouter { req.mut_header().set_term(meta.raft_status.hard_state.term); req } + + pub fn region_detail(&self, region_id: u64) -> metapb::Region { + let RegionLocalState { + id, + start_key, + end_key, + epoch, + peers, + .. + } = self + .must_query_debug_info(region_id, Duration::from_secs(1)) + .unwrap() + .region_state; + let mut region = metapb::Region::default(); + region.set_id(id); + region.set_start_key(start_key); + region.set_end_key(end_key); + let mut region_epoch = RegionEpoch::default(); + region_epoch.set_conf_ver(epoch.conf_ver); + region_epoch.set_version(epoch.version); + region.set_region_epoch(region_epoch); + for peer in peers { + region.mut_peers().push(new_peer(peer.store_id, peer.id)); + } + region + } } pub struct RunningState { @@ -160,12 +192,14 @@ pub struct RunningState { impl RunningState { fn new( - pd_client: &RpcClient, + pd_client: &Arc, path: &Path, cfg: Arc>, transport: TestTransport, + concurrency_manager: ConcurrencyManager, + causal_ts_provider: Option>, logger: &Logger, - ) -> (TestRouter, Self) { + ) -> (TestRouter, TabletSnapManager, Self) { let cf_opts = ALL_CFS .iter() .copied() @@ -179,7 +213,7 @@ impl RunningState { let raft_engine = engine_test::raft::new_engine(&format!("{}", path.join("raft").display()), None) .unwrap(); - let mut bootstrap = Bootstrap::new(&raft_engine, 0, pd_client, logger.clone()); + let mut bootstrap = 
Bootstrap::new(&raft_engine, 0, pd_client.as_ref(), logger.clone()); let store_id = bootstrap.bootstrap_store().unwrap(); let mut store = Store::default(); store.set_id(store_id); @@ -206,7 +240,8 @@ impl RunningState { let router = RaftRouter::new(store_id, router); let store_meta = router.store_meta().clone(); - + let snap_mgr = TabletSnapManager::new(path.join("tablets_snap").to_str().unwrap()); + snap_mgr.init().unwrap(); system .start( store_id, @@ -214,8 +249,12 @@ impl RunningState { raft_engine.clone(), factory.clone(), transport.clone(), + pd_client.clone(), router.store_router(), store_meta.clone(), + snap_mgr.clone(), + concurrency_manager, + causal_ts_provider, ) .unwrap(); @@ -228,7 +267,7 @@ impl RunningState { transport, store_meta, }; - (TestRouter(router), state) + (TestRouter(router), snap_mgr, state) } } @@ -239,29 +278,38 @@ impl Drop for RunningState { } pub struct TestNode { - pd_client: RpcClient, + pd_client: Arc, path: TempDir, running_state: Option, logger: Logger, + snap_mgr: Option, } impl TestNode { fn with_pd(pd_server: &test_pd::Server, logger: Logger) -> TestNode { - let pd_client = test_pd::util::new_client(pd_server.bind_addrs(), None); + let pd_client = Arc::new(test_pd::util::new_client(pd_server.bind_addrs(), None)); let path = TempDir::new().unwrap(); - TestNode { pd_client, path, running_state: None, logger, + snap_mgr: None, } } fn start(&mut self, cfg: Arc>, trans: TestTransport) -> TestRouter { - let (router, state) = - RunningState::new(&self.pd_client, self.path.path(), cfg, trans, &self.logger); + let (router, snap_mgr, state) = RunningState::new( + &self.pd_client, + self.path.path(), + cfg, + trans, + ConcurrencyManager::new(1.into()), + None, + &self.logger, + ); self.running_state = Some(state); + self.snap_mgr = Some(snap_mgr); router } @@ -269,6 +317,10 @@ impl TestNode { &self.running_state().unwrap().factory } + pub fn pd_client(&self) -> &Arc { + &self.pd_client + } + fn stop(&mut self) { if let Some(state) = 
std::mem::take(&mut self.running_state) { let mut meta = state.store_meta.lock().unwrap(); @@ -288,6 +340,10 @@ impl TestNode { self.running_state.as_ref() } + pub fn snap_mgr(&self) -> Option<&TabletSnapManager> { + self.snap_mgr.as_ref() + } + pub fn id(&self) -> u64 { self.running_state().unwrap().store_id } @@ -437,6 +493,33 @@ impl Cluster { continue; } }; + // Simulate already received the snapshot. + if msg.get_message().get_msg_type() == MessageType::MsgSnapshot { + let from_offset = match self + .nodes + .iter() + .position(|n| n.id() == msg.get_from_peer().get_store_id()) + { + Some(offset) => offset, + None => { + debug!(self.logger, "failed to find snapshot source node"; "message" => ?msg); + continue; + } + }; + let key = TabletSnapKey::new( + region_id, + msg.get_to_peer().get_id(), + msg.get_message().get_snapshot().get_metadata().get_term(), + msg.get_message().get_snapshot().get_metadata().get_index(), + ); + let from_snap_mgr = self.node(from_offset).snap_mgr().unwrap(); + let to_snap_mgr = self.node(offset).snap_mgr().unwrap(); + let gen_path = from_snap_mgr.tablet_gen_path(&key); + let recv_path = to_snap_mgr.final_recv_path(&key); + assert!(gen_path.exists()); + std::fs::rename(gen_path, recv_path.clone()).unwrap(); + assert!(recv_path.exists()); + } regions.insert(msg.get_region_id()); if let Err(e) = self.routers[offset].send_raft_message(msg) { debug!(self.logger, "failed to send raft message"; "err" => ?e); diff --git a/components/raftstore-v2/tests/integrations/mod.rs b/components/raftstore-v2/tests/integrations/mod.rs index 50fb5c4e16a..52c8ba5e1f8 100644 --- a/components/raftstore-v2/tests/integrations/mod.rs +++ b/components/raftstore-v2/tests/integrations/mod.rs @@ -5,9 +5,13 @@ #![feature(custom_test_frameworks)] #![test_runner(test_util::run_tests)] +// TODO: test conflict control in integration tests after split is supported. 
+ mod cluster; mod test_basic_write; mod test_conf_change; mod test_life; +mod test_pd_heartbeat; mod test_read; +mod test_split; mod test_status; diff --git a/components/raftstore-v2/tests/integrations/test_basic_write.rs b/components/raftstore-v2/tests/integrations/test_basic_write.rs index 7c8bdb369a1..fc23e46e12f 100644 --- a/components/raftstore-v2/tests/integrations/test_basic_write.rs +++ b/components/raftstore-v2/tests/integrations/test_basic_write.rs @@ -96,7 +96,7 @@ fn test_basic_write() { ); // Make it step down and follower should reject write. - let mut msg = Box::new(RaftMessage::default()); + let mut msg = Box::::default(); msg.set_region_id(2); msg.set_to_peer(new_peer(1, 3)); msg.mut_region_epoch().set_conf_ver(INIT_EPOCH_CONF_VER); diff --git a/components/raftstore-v2/tests/integrations/test_conf_change.rs b/components/raftstore-v2/tests/integrations/test_conf_change.rs index f9479786a7b..558962f8ef6 100644 --- a/components/raftstore-v2/tests/integrations/test_conf_change.rs +++ b/components/raftstore-v2/tests/integrations/test_conf_change.rs @@ -1,9 +1,11 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::time::Duration; +use std::{self, time::Duration}; -use kvproto::raft_cmdpb::AdminCmdType; +use engine_traits::{OpenOptions, Peekable, TabletFactory}; +use kvproto::raft_cmdpb::{AdminCmdType, CmdType, Request}; use raft::prelude::ConfChangeType; +use raftstore_v2::router::{PeerMsg, PeerTick}; use tikv_util::store::new_learner_peer; use crate::cluster::Cluster; @@ -11,6 +13,7 @@ use crate::cluster::Cluster; #[test] fn test_simple_change() { let cluster = Cluster::with_node_count(2, None); + let region_id = 2; let router0 = cluster.router(0); let mut req = router0.new_request_for(2); let admin_req = req.mut_admin_request(); @@ -29,6 +32,7 @@ fn test_simple_change() { let meta = router0 .must_query_debug_info(2, Duration::from_secs(3)) .unwrap(); + let match_index = meta.raft_apply.applied_index; assert_eq!(meta.region_state.epoch.version, epoch.get_version()); assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); assert_eq!(meta.region_state.peers, vec![leader_peer, new_peer]); @@ -46,6 +50,38 @@ fn test_simple_change() { meta.raft_status.soft_state.leader_id, req.get_header().get_peer().get_id() ); + // Trigger the raft tick to replica the log to the learner and execute the + // snapshot task. 
+ router0 + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + cluster.dispatch(region_id, vec![]); + + // write one kv after snapshot + let (key, val) = (b"key", b"value"); + let mut write_req = router0.new_request_for(region_id); + let mut put_req = Request::default(); + put_req.set_cmd_type(CmdType::Put); + put_req.mut_put().set_key(key.to_vec()); + put_req.mut_put().set_value(val.to_vec()); + write_req.mut_requests().push(put_req); + let (msg, _) = PeerMsg::raft_command(write_req.clone()); + router0.send(region_id, msg).unwrap(); + std::thread::sleep(Duration::from_millis(1000)); + cluster.dispatch(region_id, vec![]); + + let meta = router1 + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + // the learner truncated index muse be equal the leader applied index and can + // read the new written kv. + assert_eq!(match_index, meta.raft_apply.truncated_state.index); + assert!(meta.raft_apply.applied_index >= match_index); + let tablet_factory = cluster.node(1).tablet_factory(); + let tablet = tablet_factory + .open_tablet(region_id, None, OpenOptions::default().set_cache_only(true)) + .unwrap(); + assert_eq!(tablet.get_value(key).unwrap().unwrap(), val); req.mut_header() .mut_region_epoch() diff --git a/components/raftstore-v2/tests/integrations/test_life.rs b/components/raftstore-v2/tests/integrations/test_life.rs index e905e7e4ac2..ed0ebcc9b8a 100644 --- a/components/raftstore-v2/tests/integrations/test_life.rs +++ b/components/raftstore-v2/tests/integrations/test_life.rs @@ -71,7 +71,7 @@ fn test_life_by_message() { assert_peer_not_exist(test_region_id, test_peer_id, &router); // Build a correct message. 
- let mut msg = Box::new(RaftMessage::default()); + let mut msg = Box::::default(); msg.set_region_id(test_region_id); msg.set_to_peer(new_peer(1, test_peer_id)); msg.mut_region_epoch().set_conf_ver(1); @@ -147,7 +147,7 @@ fn test_destroy_by_larger_id() { let test_region_id = 4; let test_peer_id = 6; let init_term = 5; - let mut msg = Box::new(RaftMessage::default()); + let mut msg = Box::::default(); msg.set_region_id(test_region_id); msg.set_to_peer(new_peer(1, test_peer_id)); msg.mut_region_epoch().set_conf_ver(1); diff --git a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs new file mode 100644 index 00000000000..c22ef4908bf --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs @@ -0,0 +1,60 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use futures::executor::block_on; +use kvproto::raft_cmdpb::{RaftCmdRequest, StatusCmdType}; +use pd_client::PdClient; +use tikv_util::store::new_peer; + +use crate::cluster::Cluster; + +#[test] +fn test_region_heartbeat() { + let region_id = 2; + let cluster = Cluster::with_node_count(1, None); + let router = cluster.router(0); + + // When there is only one peer, it should campaign immediately. 
+ let mut req = RaftCmdRequest::default(); + req.mut_header().set_peer(new_peer(1, 3)); + req.mut_status_request() + .set_cmd_type(StatusCmdType::RegionLeader); + let res = router.query(region_id, req.clone()).unwrap(); + let status_resp = res.response().unwrap().get_status_response(); + assert_eq!( + *status_resp.get_region_leader().get_leader(), + new_peer(1, 3) + ); + + for _ in 0..5 { + let resp = block_on( + cluster + .node(0) + .pd_client() + .get_region_leader_by_id(region_id), + ) + .unwrap(); + if let Some((region, peer)) = resp { + assert_eq!(region.get_id(), region_id); + assert_eq!(peer.get_id(), 3); + assert_eq!(peer.get_store_id(), 1); + return; + } + std::thread::sleep(std::time::Duration::from_millis(50)); + } + panic!("failed to get region leader"); +} + +#[test] +fn test_store_heartbeat() { + let cluster = Cluster::with_node_count(1, None); + let store_id = cluster.node(0).id(); + for _ in 0..5 { + let stats = block_on(cluster.node(0).pd_client().get_store_stats_async(store_id)).unwrap(); + if stats.get_start_time() > 0 { + assert_ne!(stats.get_capacity(), 0); + return; + } + std::thread::sleep(std::time::Duration::from_millis(50)); + } + panic!("failed to get store stats"); +} diff --git a/components/raftstore-v2/tests/integrations/test_read.rs b/components/raftstore-v2/tests/integrations/test_read.rs index 4f49757085f..2155a4775c6 100644 --- a/components/raftstore-v2/tests/integrations/test_read.rs +++ b/components/raftstore-v2/tests/integrations/test_read.rs @@ -67,14 +67,8 @@ fn test_snap_without_read_index() { req.mut_requests().push(request_inner); let res = router.query(region_id, req.clone()).unwrap(); let resp = res.read().unwrap(); - // single node commited index should be 6. - assert_eq!(resp.read_index, 6); - - // run again, this time we expect the lease is not expired and the read index - // should be 0. 
- let res = router.query(region_id, req.clone()).unwrap(); - let resp = res.read().unwrap(); - // the request can be processed locally, read index should be 0. + // When it becomes leader, it will get a lease automatically because of empty + // entry. assert_eq!(resp.read_index, 0); // run with header read_quorum diff --git a/components/raftstore-v2/tests/integrations/test_split.rs b/components/raftstore-v2/tests/integrations/test_split.rs new file mode 100644 index 00000000000..336a9c9d038 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_split.rs @@ -0,0 +1,183 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{thread, time::Duration}; + +use futures::executor::block_on; +use kvproto::{ + metapb, pdpb, + raft_cmdpb::{ + AdminCmdType, AdminRequest, CmdType, RaftCmdRequest, RaftCmdResponse, Request, SplitRequest, + }, +}; +use raftstore_v2::router::PeerMsg; +use tikv_util::store::new_peer; + +use crate::cluster::{Cluster, TestRouter}; + +fn new_batch_split_region_request( + split_keys: Vec>, + ids: Vec, + right_derive: bool, +) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSplit); + req.mut_splits().set_right_derive(right_derive); + let mut requests = Vec::with_capacity(ids.len()); + for (mut id, key) in ids.into_iter().zip(split_keys) { + let mut split = SplitRequest::default(); + split.set_split_key(key); + split.set_new_region_id(id.get_new_region_id()); + split.set_new_peer_ids(id.take_new_peer_ids()); + requests.push(split); + } + req.mut_splits().set_requests(requests.into()); + req +} + +fn must_split(region_id: u64, req: RaftCmdRequest, router: &mut TestRouter) { + let (msg, sub) = PeerMsg::raft_command(req); + router.send(region_id, msg).unwrap(); + block_on(sub.result()).unwrap(); + + // TODO: when persistent implementation is ready, we can use tablet index of + // the parent to check whether the split is done. Now, just sleep a second. 
+ thread::sleep(Duration::from_secs(1)); +} + +fn put(router: &mut TestRouter, region_id: u64, key: &[u8]) -> RaftCmdResponse { + let mut req = router.new_request_for(region_id); + + let mut put_req = Request::default(); + put_req.set_cmd_type(CmdType::Put); + put_req.mut_put().set_key(key.to_vec()); + put_req.mut_put().set_value(b"v1".to_vec()); + req.mut_requests().push(put_req); + + let (msg, mut sub) = PeerMsg::raft_command(req.clone()); + router.send(region_id, msg).unwrap(); + assert!(block_on(sub.wait_proposed())); + assert!(block_on(sub.wait_committed())); + block_on(sub.result()).unwrap() +} + +// Split the region according to the parameters +// return the updated original region +fn split_region( + router: &mut TestRouter, + region: metapb::Region, + peer: metapb::Peer, + split_region_id: u64, + split_peer: metapb::Peer, + left_key: &[u8], + right_key: &[u8], + split_key: &[u8], + right_derive: bool, +) -> (metapb::Region, metapb::Region) { + let region_id = region.id; + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header() + .set_region_epoch(region.get_region_epoch().clone()); + req.mut_header().set_peer(peer); + + let mut split_id = pdpb::SplitId::new(); + split_id.new_region_id = split_region_id; + split_id.new_peer_ids = vec![split_peer.id]; + let admin_req = + new_batch_split_region_request(vec![split_key.to_vec()], vec![split_id], right_derive); + req.mut_requests().clear(); + req.set_admin_request(admin_req); + + must_split(region_id, req, router); + + let (left, right) = if !right_derive { + ( + router.region_detail(region_id), + router.region_detail(split_region_id), + ) + } else { + ( + router.region_detail(split_region_id), + router.region_detail(region_id), + ) + }; + + // The end key of left region is `split_key` + // So writing `right_key` will fail + let resp = put(router, left.id, right_key); + assert!(resp.get_header().has_error(), "{:?}", resp); + // But `left_key` should succeed + let 
resp = put(router, left.id, left_key); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + // Mirror of above case + let resp = put(router, right.id, left_key); + assert!(resp.get_header().has_error(), "{:?}", resp); + let resp = put(router, right.id, right_key); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + assert_eq!(left.get_end_key(), split_key); + assert_eq!(right.get_start_key(), split_key); + assert_eq!(region.get_start_key(), left.get_start_key()); + assert_eq!(region.get_end_key(), right.get_end_key()); + + (left, right) +} + +#[test] +fn test_split() { + let cluster = Cluster::default(); + let store_id = cluster.node(0).id(); + let mut router = cluster.router(0); + // let factory = cluster.node(0).tablet_factory(); + + let region_id = 2; + let peer = new_peer(store_id, 3); + let region = router.region_detail(region_id); + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + + // Region 2 ["", ""] peer(1, 3) + // -> Region 2 ["", "k22"] peer(1, 3) + // Region 1000 ["k22", ""] peer(1, 10) + let (left, right) = split_region( + &mut router, + region, + peer.clone(), + 1000, + new_peer(store_id, 10), + b"k11", + b"k33", + b"k22", + false, + ); + + // Region 2 ["", "k22"] peer(1, 3) + // -> Region 2 ["", "k11"] peer(1, 3) + // Region 1001 ["k11", "k22"] peer(1, 11) + let _ = split_region( + &mut router, + left, + peer, + 1001, + new_peer(store_id, 11), + b"k00", + b"k11", + b"k11", + false, + ); + + // Region 1000 ["k22", ""] peer(1, 10) + // -> Region 1000 ["k22", "k33"] peer(1, 10) + // Region 1002 ["k33", ""] peer(1, 12) + let _ = split_region( + &mut router, + right, + new_peer(store_id, 10), + 1002, + new_peer(store_id, 12), + b"k22", + b"k33", + b"k33", + false, + ); +} diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 54eb07e8161..548693b71ac 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -56,7 +56,7 @@ grpcio-health = { version = "0.10", 
default-features = false, features = ["proto into_other = { workspace = true } itertools = "0.10" keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { workspace = true } @@ -90,7 +90,7 @@ tokio = { version = "1.5", features = ["sync", "rt-multi-thread"] } tracker = { workspace = true } txn_types = { workspace = true } uuid = { version = "0.8.1", features = ["serde", "v4"] } -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [dev-dependencies] encryption_export = { workspace = true } diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 99228aef44c..69ebfa7b385 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -8,6 +8,7 @@ use kvproto::{ metapb::Region, pdpb::CheckPolicy, raft_cmdpb::{ComputeHashRequest, RaftCmdRequest}, + raft_serverpb::RaftMessage, }; use protobuf::Message; use raft::eraftpb; @@ -669,6 +670,23 @@ impl CoprocessorHost { true } + pub fn should_skip_raft_message(&self, msg: &RaftMessage) -> bool { + for observer in &self.registry.region_change_observers { + let observer = observer.observer.inner(); + if observer.should_skip_raft_message(msg) { + return true; + } + } + false + } + + pub fn on_peer_created(&self, region_id: u64) { + for observer in &self.registry.region_change_observers { + let observer = observer.observer.inner(); + observer.on_peer_created(region_id) + } + } + pub fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 7ac783c0d6d..70427df9922 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ 
b/components/raftstore/src/coprocessor/mod.rs @@ -14,7 +14,7 @@ use kvproto::{ metapb::Region, pdpb::CheckPolicy, raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, RaftCmdResponse, Request}, - raft_serverpb::RaftApplyState, + raft_serverpb::{RaftApplyState, RaftMessage}, }; use raft::{eraftpb, StateRole}; @@ -328,6 +328,12 @@ pub trait RegionChangeObserver: Coprocessor { fn pre_write_apply_state(&self, _: &mut ObserverContext<'_>) -> bool { true } + + fn should_skip_raft_message(&self, _: &RaftMessage) -> bool { + false + } + + fn on_peer_created(&self, _: u64) {} } #[derive(Clone, Debug, Default)] diff --git a/components/raftstore/src/lib.rs b/components/raftstore/src/lib.rs index 2950329de46..e56678edec2 100644 --- a/components/raftstore/src/lib.rs +++ b/components/raftstore/src/lib.rs @@ -6,9 +6,8 @@ #![feature(min_specialization)] #![feature(box_patterns)] #![feature(hash_drain_filter)] -#![feature(let_else)] +#![feature(let_chains)] #![recursion_limit = "256"] -#![feature(drain_filter)] #[cfg(test)] extern crate test; diff --git a/components/raftstore/src/store/async_io/mod.rs b/components/raftstore/src/store/async_io/mod.rs index c9b2fad532f..56cc2d576e1 100644 --- a/components/raftstore/src/store/async_io/mod.rs +++ b/components/raftstore/src/store/async_io/mod.rs @@ -1,4 +1,5 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. +pub mod read; pub mod write; pub mod write_router; diff --git a/components/raftstore/src/store/async_io/read.rs b/components/raftstore/src/store/async_io/read.rs new file mode 100644 index 00000000000..5dc01b40ef3 --- /dev/null +++ b/components/raftstore/src/store/async_io/read.rs @@ -0,0 +1,241 @@ +// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + fmt, + marker::PhantomData, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, + }, +}; + +use engine_traits::{Checkpointer, KvEngine, RaftEngine}; +use fail::fail_point; +use file_system::{IoType, WithIoType}; +use kvproto::raft_serverpb::{PeerState, RaftSnapshotData, RegionLocalState}; +use protobuf::Message; +use raft::{eraftpb::Snapshot, GetEntriesContext}; +use tikv_util::{error, info, time::Instant, worker::Runnable}; + +use crate::store::{ + snap::TABLET_SNAPSHOT_VERSION, + util, + worker::metrics::{SNAP_COUNTER, SNAP_HISTOGRAM}, + RaftlogFetchResult, TabletSnapKey, TabletSnapManager, MAX_INIT_ENTRY_COUNT, +}; + +pub enum ReadTask { + FetchLogs { + region_id: u64, + context: GetEntriesContext, + low: u64, + high: u64, + max_size: usize, + tried_cnt: usize, + term: u64, + }, + + // GenTabletSnapshot is used to generate tablet snapshot. + GenTabletSnapshot { + region_id: u64, + to_peer: u64, + tablet: EK, + region_state: RegionLocalState, + last_applied_term: u64, + last_applied_index: u64, + canceled: Arc, + for_balance: bool, + }, +} + +impl fmt::Display for ReadTask { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + ReadTask::FetchLogs { + region_id, + context, + low, + high, + max_size, + tried_cnt, + term, + } => write!( + f, + "Fetch Raft Logs [region: {}, low: {}, high: {}, max_size: {}] for sending with context {:?}, tried: {}, term: {}", + region_id, low, high, max_size, context, tried_cnt, term, + ), + ReadTask::GenTabletSnapshot { + region_id, to_peer, .. + } => { + write!(f, "Snapshot gen for {}, to peer {}", region_id, to_peer) + } + } + } +} + +#[derive(Debug)] +pub struct FetchedLogs { + pub context: GetEntriesContext, + pub logs: Box, +} + +pub type GenSnapRes = Option>; + +/// A router for receiving fetched result. 
+pub trait AsyncReadNotifier: Send { + fn notify_logs_fetched(&self, region_id: u64, fetched: FetchedLogs); + fn notify_snapshot_generated(&self, region_id: u64, res: Option>); +} + +pub struct ReadRunner +where + EK: KvEngine, + ER: RaftEngine, + N: AsyncReadNotifier, +{ + notifier: N, + raft_engine: ER, + sanp_mgr: Option, + _phantom: PhantomData, +} + +impl ReadRunner { + pub fn new(notifier: N, raft_engine: ER) -> ReadRunner { + ReadRunner { + notifier, + raft_engine, + sanp_mgr: None, + _phantom: PhantomData, + } + } + + #[inline] + pub fn set_snap_mgr(&mut self, mgr: TabletSnapManager) { + self.sanp_mgr = Some(mgr); + } + + #[inline] + fn snap_mgr(&self) -> &TabletSnapManager { + self.sanp_mgr.as_ref().unwrap() + } + + fn generate_snap(&self, snap_key: &TabletSnapKey, tablet: EK) -> crate::Result<()> { + let checkpointer_path = self.snap_mgr().tablet_gen_path(snap_key); + if checkpointer_path.as_path().exists() { + // Remove the old checkpoint directly. + std::fs::remove_dir_all(checkpointer_path.as_path())?; + } + // Here not checkpoint to a temporary directory first, the temporary directory + // logic already implemented in rocksdb. 
+ let mut checkpointer = tablet.new_checkpointer()?; + + checkpointer.create_at(checkpointer_path.as_path(), None, 0)?; + Ok(()) + } +} + +impl Runnable for ReadRunner +where + EK: KvEngine, + ER: RaftEngine, + N: AsyncReadNotifier, +{ + type Task = ReadTask; + fn run(&mut self, task: ReadTask) { + match task { + ReadTask::FetchLogs { + region_id, + low, + high, + max_size, + context, + tried_cnt, + term, + } => { + let mut ents = + Vec::with_capacity(std::cmp::min((high - low) as usize, MAX_INIT_ENTRY_COUNT)); + let res = self.raft_engine.fetch_entries_to( + region_id, + low, + high, + Some(max_size), + &mut ents, + ); + + let hit_size_limit = res + .as_ref() + .map(|c| (*c as u64) != high - low) + .unwrap_or(false); + fail_point!("worker_async_fetch_raft_log"); + self.notifier.notify_logs_fetched( + region_id, + FetchedLogs { + context, + logs: Box::new(RaftlogFetchResult { + ents: res.map(|_| ents).map_err(|e| e.into()), + low, + max_size: max_size as u64, + hit_size_limit, + tried_cnt, + term, + }), + }, + ); + } + + ReadTask::GenTabletSnapshot { + region_id, + to_peer, + tablet, + region_state, + last_applied_term, + last_applied_index, + canceled, + for_balance, + } => { + SNAP_COUNTER.generate.start.inc(); + if canceled.load(Ordering::Relaxed) { + info!("generate snap is canceled"; "region_id" => region_id); + SNAP_COUNTER.generate.abort.inc(); + return; + } + let start = Instant::now(); + let _io_type_guard = WithIoType::new(if for_balance { + IoType::LoadBalance + } else { + IoType::Replication + }); + // the state should already checked in apply workers. + assert_ne!(region_state.get_state(), PeerState::Tombstone); + let mut snapshot = Snapshot::default(); + // Set snapshot metadata. + snapshot.mut_metadata().set_term(last_applied_term); + snapshot.mut_metadata().set_index(last_applied_index); + let conf_state = util::conf_state_from_region(region_state.get_region()); + snapshot.mut_metadata().set_conf_state(conf_state); + // Set snapshot data. 
+ let mut snap_data = RaftSnapshotData::default(); + snap_data.set_region(region_state.get_region().clone()); + snap_data.set_version(TABLET_SNAPSHOT_VERSION); + snap_data.mut_meta().set_for_balance(for_balance); + snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); + + // create checkpointer. + let snap_key = TabletSnapKey::from_region_snap(region_id, to_peer, &snapshot); + let mut res = None; + if let Err(e) = self.generate_snap(&snap_key, tablet) { + error!("failed to create checkpointer"; "region_id" => region_id, "error" => %e); + SNAP_COUNTER.generate.fail.inc(); + } else { + SNAP_COUNTER.generate.success.inc(); + SNAP_HISTOGRAM + .generate + .observe(start.saturating_elapsed_secs()); + res = Some(Box::new(snapshot)) + } + + self.notifier.notify_snapshot_generated(region_id, res); + } + } + } +} diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index e534a17fad1..354a796c99c 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -169,12 +169,15 @@ where ready_number: u64, pub send_time: Instant, pub raft_wb: Option, + // called after writing to kvdb and raftdb. 
+ pub persisted_cb: Option>, pub entries: Vec, pub cut_logs: Option<(u64, u64)>, pub raft_state: Option, pub extra_write: ExtraWrite, pub messages: Vec, pub trackers: Vec, + pub has_snapshot: bool, } impl WriteTask @@ -195,6 +198,8 @@ where extra_write: ExtraWrite::None, messages: vec![], trackers: vec![], + persisted_cb: None, + has_snapshot: false, } } @@ -361,6 +366,7 @@ where pub extra_batch_write: ExtraBatchWrite, pub state_size: usize, pub tasks: Vec>, + pub persisted_cbs: Vec>, // region_id -> (peer_id, ready_number) pub readies: HashMap, } @@ -377,6 +383,7 @@ where extra_batch_write: ExtraBatchWrite::None, state_size: 0, tasks: vec![], + persisted_cbs: vec![], readies: HashMap::default(), } } @@ -430,7 +437,9 @@ where ); } } - + if let Some(v) = task.persisted_cb.take() { + self.persisted_cbs.push(v); + }; self.tasks.push(task); } @@ -511,6 +520,12 @@ where } } + fn after_write_all(&mut self) { + for hook in mem::take(&mut self.persisted_cbs) { + hook(); + } + } + fn after_write_to_raft_db(&mut self, metrics: &StoreWriteMetrics) { if metrics.waterfall_metrics { let now = std::time::Instant::now(); @@ -706,10 +721,8 @@ where write_kv_time = duration_to_sec(now.saturating_elapsed()); STORE_WRITE_KVDB_DURATION_HISTOGRAM.observe(write_kv_time); } - self.batch.after_write_to_kv_db(&self.metrics); } - fail_point!("raft_between_save"); let mut write_raft_time = 0f64; @@ -746,6 +759,8 @@ where self.batch.after_write_to_raft_db(&self.metrics); + self.batch.after_write_all(); + fail_point!("raft_before_follower_send"); let mut now = Instant::now(); diff --git a/components/raftstore/src/store/bootstrap.rs b/components/raftstore/src/store/bootstrap.rs index 09706470a27..249ae4b704f 100644 --- a/components/raftstore/src/store/bootstrap.rs +++ b/components/raftstore/src/store/bootstrap.rs @@ -74,6 +74,7 @@ pub fn prepare_bootstrap_cluster( ) -> Result<()> { let mut state = RegionLocalState::default(); state.set_region(region.clone()); + let mut wb = 
engines.kv.write_batch(); box_try!(wb.put_msg(keys::PREPARE_BOOTSTRAP_KEY, region)); box_try!(wb.put_msg_cf(CF_RAFT, &keys::region_state_key(region.get_id()), &state)); diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 4d9cd73d207..454cf61a4c8 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -218,6 +218,14 @@ pub struct Config { pub dev_assert: bool, #[online_config(hidden)] pub apply_yield_duration: ReadableDuration, + /// yield the fsm when apply flushed data size exceeds this threshold. + /// the yield is check after commit, so the actual handled messages can be + /// bigger than the configed value. + // NOTE: the default value is much smaller than the default max raft batch msg size(0.2 + // * raft_entry_max_size), this is intentional because in the common case, a raft entry + // is unlikely to exceed this threshold, but in case when raftstore is the bottleneck, + // we still allow big raft batch for better throughput. 
+ pub apply_yield_write_size: ReadableSize, #[serde(with = "perf_level_serde")] #[online_config(skip)] @@ -386,6 +394,7 @@ impl Default for Config { hibernate_regions: true, dev_assert: false, apply_yield_duration: ReadableDuration::millis(500), + apply_yield_write_size: ReadableSize::kb(32), perf_level: PerfLevel::Uninitialized, evict_cache_on_memory_ratio: 0.0, cmd_batch: true, @@ -429,6 +438,24 @@ impl Config { Config::default() } + pub fn new_raft_config(&self, peer_id: u64, applied_index: u64) -> raft::Config { + raft::Config { + id: peer_id, + election_tick: self.raft_election_timeout_ticks, + heartbeat_tick: self.raft_heartbeat_ticks, + min_election_tick: self.raft_min_election_timeout_ticks, + max_election_tick: self.raft_max_election_timeout_ticks, + max_size_per_msg: self.raft_max_size_per_msg.0, + max_inflight_msgs: self.raft_max_inflight_msgs, + applied: applied_index, + check_quorum: true, + skip_bcast_commit: true, + pre_vote: self.prevote, + max_committed_size_per_ready: ReadableSize::mb(16).0, + ..Default::default() + } + } + pub fn raft_store_max_leader_lease(&self) -> TimeDuration { TimeDuration::from_std(self.raft_store_max_leader_lease.0).unwrap() } @@ -548,7 +575,7 @@ impl Config { let election_timeout = self.raft_base_tick_interval.as_millis() * self.raft_election_timeout_ticks as u64; - let lease = self.raft_store_max_leader_lease.as_millis() as u64; + let lease = self.raft_store_max_leader_lease.as_millis(); if election_timeout < lease { return Err(box_err!( "election timeout {} ms is less than lease {} ms", @@ -557,7 +584,7 @@ impl Config { )); } - let tick = self.raft_base_tick_interval.as_millis() as u64; + let tick = self.raft_base_tick_interval.as_millis(); if lease > election_timeout - tick { return Err(box_err!( "lease {} ms should not be greater than election timeout {} ms - 1 tick({} ms)", @@ -571,7 +598,7 @@ impl Config { return Err(box_err!("raftstore.merge-check-tick-interval can't be 0.")); } - let stale_state_check = 
self.peer_stale_state_check_interval.as_millis() as u64; + let stale_state_check = self.peer_stale_state_check_interval.as_millis(); if stale_state_check < election_timeout * 2 { return Err(box_err!( "peer stale state check interval {} ms is less than election timeout x 2 {} ms", @@ -586,7 +613,7 @@ impl Config { )); } - let abnormal_leader_missing = self.abnormal_leader_missing_duration.as_millis() as u64; + let abnormal_leader_missing = self.abnormal_leader_missing_duration.as_millis(); if abnormal_leader_missing < stale_state_check { return Err(box_err!( "abnormal leader missing {} ms is less than peer stale state check interval {} ms", @@ -595,7 +622,7 @@ impl Config { )); } - let max_leader_missing = self.max_leader_missing_duration.as_millis() as u64; + let max_leader_missing = self.max_leader_missing_duration.as_millis(); if max_leader_missing < abnormal_leader_missing { return Err(box_err!( "max leader missing {} ms is less than abnormal leader missing {} ms", @@ -898,6 +925,9 @@ impl Config { CONFIG_RAFTSTORE_GAUGE .with_label_values(&["local_read_batch_size"]) .set(self.local_read_batch_size as f64); + CONFIG_RAFTSTORE_GAUGE + .with_label_values(&["apply_yield_write_size"]) + .set(self.apply_yield_write_size.0 as f64); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["apply_max_batch_size"]) .set(self.apply_batch_system.max_batch_size() as f64); diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index a0828d12332..c6278c890f7 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -30,7 +30,7 @@ use super::{ metrics::*, peer_storage::storage_error, WriteTask, MEMTRACE_ENTRY_CACHE, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; -use crate::{bytes_capacity, store::worker::RaftlogFetchTask, Result}; +use crate::{bytes_capacity, store::ReadTask, Result}; const MAX_ASYNC_FETCH_TRY_CNT: usize = 3; const SHRINK_CACHE_CAPACITY: usize = 64; @@ -622,7 
+622,7 @@ impl Default for CacheWarmupState { } /// A subset of `PeerStorage` that focus on accessing log entries. -pub struct EntryStorage { +pub struct EntryStorage { region_id: u64, peer_id: u64, raft_engine: ER, @@ -631,20 +631,20 @@ pub struct EntryStorage { apply_state: RaftApplyState, last_term: u64, applied_term: u64, - raftlog_fetch_scheduler: Scheduler, + read_scheduler: Scheduler>, raftlog_fetch_stats: AsyncFetchStats, async_fetch_results: RefCell>, cache_warmup_state: Option, } -impl EntryStorage { +impl EntryStorage { pub fn new( peer_id: u64, raft_engine: ER, mut raft_state: RaftLocalState, apply_state: RaftApplyState, region: &metapb::Region, - raftlog_fetch_scheduler: Scheduler, + read_scheduler: Scheduler>, ) -> Result { if let Err(e) = validate_states(region.id, &raft_engine, &mut raft_state, &apply_state) { return Err(box_err!( @@ -665,7 +665,7 @@ impl EntryStorage { apply_state, last_term, applied_term, - raftlog_fetch_scheduler, + read_scheduler, raftlog_fetch_stats: AsyncFetchStats::default(), async_fetch_results: RefCell::new(HashMap::default()), cache_warmup_state: None, @@ -862,8 +862,8 @@ impl EntryStorage { self.async_fetch_results .borrow_mut() .insert(low, RaftlogFetchState::Fetching(Instant::now_coarse())); - self.raftlog_fetch_scheduler - .schedule(RaftlogFetchTask::PeerStorage { + self.read_scheduler + .schedule(ReadTask::FetchLogs { region_id, context, low, @@ -958,6 +958,16 @@ impl EntryStorage { } } + #[inline] + pub fn set_truncated_index(&mut self, index: u64) { + self.apply_state.mut_truncated_state().set_index(index) + } + + #[inline] + pub fn set_truncated_term(&mut self, term: u64) { + self.apply_state.mut_truncated_state().set_term(term) + } + #[inline] pub fn first_index(&self) -> u64 { first_index(&self.apply_state) @@ -1004,7 +1014,7 @@ impl EntryStorage { } #[inline] - pub fn set_applied_state(&mut self, apply_state: RaftApplyState) { + pub fn set_apply_state(&mut self, apply_state: RaftApplyState) { self.apply_state = 
apply_state; } @@ -1046,7 +1056,7 @@ impl EntryStorage { // Append the given entries to the raft log using previous last index or // self.last_index. - pub fn append(&mut self, entries: Vec, task: &mut WriteTask) { + pub fn append(&mut self, entries: Vec, task: &mut WriteTask) { if entries.is_empty() { return; } @@ -1242,13 +1252,17 @@ impl EntryStorage { pub fn clear(&mut self) { self.cache = EntryCache::default(); } + + pub fn read_scheduler(&self) -> Scheduler> { + self.read_scheduler.clone() + } } #[cfg(test)] pub mod tests { use std::sync::mpsc; - use engine_test::raft::RaftTestEngine; + use engine_test::{kv::KvTestEngine, raft::RaftTestEngine}; use engine_traits::RaftEngineReadOnly; use protobuf::Message; use raft::{GetEntriesContext, StorageError}; @@ -1273,7 +1287,7 @@ pub mod tests { } } - pub fn validate_cache(store: &EntryStorage, exp_ents: &[Entry]) { + pub fn validate_cache(store: &EntryStorage, exp_ents: &[Entry]) { assert_eq!(store.cache.cache, exp_ents); for e in exp_ents { let entry = store diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index d3eb7f86461..bd582d1c24a 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -9,6 +9,7 @@ use std::{ cmp::{Ord, Ordering as CmpOrdering}, collections::VecDeque, fmt::{self, Debug, Formatter}, + io::BufRead, mem, ops::{Deref, DerefMut, Range as StdRange}, sync::{ @@ -36,15 +37,16 @@ use fail::fail_point; use kvproto::{ import_sstpb::SstMeta, kvrpcpb::ExtraOp as TxnExtraOp, - metapb::{PeerRole, Region, RegionEpoch}, + metapb::{self, PeerRole, Region, RegionEpoch}, raft_cmdpb::{ AdminCmdType, AdminRequest, AdminResponse, ChangePeerRequest, CmdType, CommitMergeRequest, - RaftCmdRequest, RaftCmdResponse, Request, + RaftCmdRequest, RaftCmdResponse, Request, SplitRequest, }, raft_serverpb::{MergeState, PeerState, RaftApplyState, RaftTruncatedState, RegionLocalState}, }; use pd_client::{new_bucket_stats, 
BucketMeta, BucketStat}; use prometheus::local::LocalHistogram; +use protobuf::{wire_format::WireType, CodedInputStream}; use raft::eraftpb::{ ConfChange, ConfChangeType, ConfChangeV2, Entry, EntryType, Snapshot as RaftSnapshot, }; @@ -59,7 +61,7 @@ use tikv_util::{ memory::HeapSize, mpsc::{loose_bounded, LooseBoundedSender, Receiver}, safe_panic, slow_log, - store::{find_peer, find_peer_mut, is_learner, remove_peer}, + store::{find_peer, find_peer_by_id, find_peer_mut, is_learner, remove_peer}, time::{duration_to_sec, Instant}, warn, worker::Scheduler, @@ -378,6 +380,7 @@ where perf_context: EK::PerfContext, yield_duration: Duration, + yield_msg_size: u64, store_id: u64, /// region_id -> (peer_id, is_splitting) @@ -467,6 +470,7 @@ where use_delete_range: cfg.use_delete_range, perf_context: engine.get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply), yield_duration: cfg.apply_yield_duration.0, + yield_msg_size: cfg.apply_yield_write_size.0, delete_ssts: vec![], pending_delete_ssts: vec![], store_id, @@ -635,7 +639,7 @@ where apply_state: delegate.apply_state.clone(), write_seqno: mem::take(&mut delegate.unfinished_write_seqno), exec_res: results, - metrics: delegate.metrics.clone(), + metrics: mem::take(&mut delegate.metrics), applied_term: delegate.applied_term, bucket_stat: delegate.buckets.clone().map(Box::new), }); @@ -687,7 +691,7 @@ where } let elapsed = t.saturating_elapsed(); - STORE_APPLY_LOG_HISTOGRAM.observe(duration_to_sec(elapsed) as f64); + STORE_APPLY_LOG_HISTOGRAM.observe(duration_to_sec(elapsed)); for mut inspector in std::mem::take(&mut self.pending_latency_inspect) { inspector.record_apply_process(elapsed); inspector.finish(); @@ -814,6 +818,43 @@ fn should_sync_log(cmd: &RaftCmdRequest) -> bool { false } +fn can_witness_skip(entry: &Entry) -> bool { + // need to handle ConfChange entry type + if entry.get_entry_type() != EntryType::EntryNormal { + return false; + } + + // HACK: check admin request field in serialized data from 
`RaftCmdRequest` + // without deserializing all. It's done by checking the existence of the + // field number of `admin_request`. + // See the encoding in `write_to_with_cached_sizes()` of `RaftCmdRequest` in + // `raft_cmdpb.rs` for reference. + let mut is = CodedInputStream::from_bytes(entry.get_data()); + if is.eof().unwrap() { + return true; + } + let (mut field_number, wire_type) = is.read_tag_unpack().unwrap(); + // Header field is of number 1 + if field_number == 1 { + if wire_type != WireType::WireTypeLengthDelimited { + panic!("unexpected wire type"); + } + let len = is.read_raw_varint32().unwrap(); + // skip parsing the content of `Header` + is.consume(len as usize); + // read next field number + (field_number, _) = is.read_tag_unpack().unwrap(); + } + + // `Requests` field is of number 2 and `AdminRequest` field is of number 3. + // - If the next field is 2, there must be no admin request as in one + // `RaftCmdRequest`, either requests or admin_request is filled. + // - If the next field is 3, it's exactly an admin request. + // - If the next field is others, neither requests nor admin_request is filled, + // so there is no admin request. + field_number != 3 +} + /// A struct that stores the state related to Merge. /// /// When executing a `CommitMerge`, the source peer may have not applied @@ -893,12 +934,12 @@ pub struct ApplyDelegate where EK: KvEngine, { - /// The ID of the peer. - id: u64, /// The term of the Region. term: u64, /// The Region information of the peer. region: Region, + /// The Peer information. + peer: metapb::Peer, /// Peer_tag, "[region region_id] peer_id". 
tag: String, @@ -971,8 +1012,8 @@ where { fn from_registration(reg: Registration) -> ApplyDelegate { ApplyDelegate { - id: reg.id, tag: format!("[region {}] {}", reg.region.get_id(), reg.id), + peer: find_peer_by_id(®.region, reg.id).unwrap().clone(), region: reg.region, pending_remove: false, last_flush_applied_index: reg.apply_state.get_applied_index(), @@ -1004,7 +1045,7 @@ where } pub fn id(&self) -> u64 { - self.id + self.peer.get_id() } /// Handles all the committed_entries, namely, applies the committed @@ -1124,54 +1165,60 @@ where let data = entry.get_data(); if !data.is_empty() { - let cmd = util::parse_data_at(data, index, &self.tag); - - if apply_ctx.yield_high_latency_operation && has_high_latency_operation(&cmd) { - self.priority = Priority::Low; - } - let mut has_unflushed_data = - self.last_flush_applied_index != self.apply_state.get_applied_index(); - if (has_unflushed_data && should_write_to_engine(&cmd) - || apply_ctx.kv_wb().should_write_to_engine()) - && apply_ctx.host.pre_persist(&self.region, false, Some(&cmd)) - { - apply_ctx.commit(self); - if let Some(start) = self.handle_start.as_ref() { - if start.saturating_elapsed() >= apply_ctx.yield_duration { + if !self.peer.is_witness || !can_witness_skip(entry) { + let cmd = util::parse_data_at(data, index, &self.tag); + if apply_ctx.yield_high_latency_operation && has_high_latency_operation(&cmd) { + self.priority = Priority::Low; + } + let mut has_unflushed_data = + self.last_flush_applied_index != self.apply_state.get_applied_index(); + if (has_unflushed_data && should_write_to_engine(&cmd) + || apply_ctx.kv_wb().should_write_to_engine()) + && apply_ctx.host.pre_persist(&self.region, false, Some(&cmd)) + { + apply_ctx.commit(self); + if self.metrics.written_bytes >= apply_ctx.yield_msg_size + || self + .handle_start + .as_ref() + .map_or(Duration::ZERO, Instant::saturating_elapsed) + >= apply_ctx.yield_duration + { return ApplyResult::Yield; } + has_unflushed_data = false; + } + if 
self.priority != apply_ctx.priority { + if has_unflushed_data { + apply_ctx.commit(self); + } + return ApplyResult::Yield; } - has_unflushed_data = false; + + return self.process_raft_cmd(apply_ctx, index, term, cmd); } - if self.priority != apply_ctx.priority { - if has_unflushed_data { - apply_ctx.commit(self); + } else { + // we should observe empty cmd, aka leader change, + // read index during confchange, or other situations. + apply_ctx.host.on_empty_cmd(&self.region, index, term); + + // 1. When a peer become leader, it will send an empty entry. + // 2. When a leader tries to read index during transferring leader, + // it will also propose an empty entry. But that entry will not contain + // any associated callback. So no need to clear callback. + while let Some(mut cmd) = self.pending_cmds.pop_normal(u64::MAX, term - 1) { + if let Some(cb) = cmd.cb.take() { + apply_ctx + .applied_batch + .push_cb(cb, cmd_resp::err_resp(Error::StaleCommand, term)); } - return ApplyResult::Yield; } - - return self.process_raft_cmd(apply_ctx, index, term, cmd); } - // we should observe empty cmd, aka leader change, - // read index during confchange, or other situations. - apply_ctx.host.on_empty_cmd(&self.region, index, term); - self.apply_state.set_applied_index(index); self.applied_term = term; assert!(term > 0); - // 1. When a peer become leader, it will send an empty entry. - // 2. When a leader tries to read index during transferring leader, - // it will also propose an empty entry. But that entry will not contain - // any associated callback. So no need to clear callback. 
- while let Some(mut cmd) = self.pending_cmds.pop_normal(u64::MAX, term - 1) { - if let Some(cb) = cmd.cb.take() { - apply_ctx - .applied_batch - .push_cb(cb, cmd_resp::err_resp(Error::StaleCommand, term)); - } - } ApplyResult::None } @@ -1257,7 +1304,7 @@ where apply_ctx: &mut ApplyContext, index: u64, term: u64, - cmd: RaftCmdRequest, + req: RaftCmdRequest, ) -> ApplyResult { if index == 0 { panic!( @@ -1267,11 +1314,10 @@ where } // Set sync log hint if the cmd requires so. - apply_ctx.sync_log_hint |= should_sync_log(&cmd); + apply_ctx.sync_log_hint |= should_sync_log(&req); - apply_ctx.host.pre_apply(&self.region, &cmd); - let (mut resp, exec_result, should_write) = - self.apply_raft_cmd(apply_ctx, index, term, &cmd); + apply_ctx.host.pre_apply(&self.region, &req); + let (mut cmd, exec_result, should_write) = self.apply_raft_cmd(apply_ctx, index, term, req); if let ApplyResult::WaitMergeSource(_) = exec_result { return exec_result; } @@ -1285,9 +1331,8 @@ where // TODO: if we have exec_result, maybe we should return this callback too. Outer // store will call it after handing exec result. - cmd_resp::bind_term(&mut resp, self.term); - let cmd_cb = self.find_pending(index, term, is_conf_change_cmd(&cmd)); - let cmd = Cmd::new(index, term, cmd, resp); + cmd_resp::bind_term(&mut cmd.response, self.term); + let cmd_cb = self.find_pending(index, term, is_conf_change_cmd(&cmd.request)); apply_ctx .applied_batch .push(cmd_cb, cmd, &self.observe_info, self.region_id()); @@ -1315,8 +1360,8 @@ where ctx: &mut ApplyContext, index: u64, term: u64, - req: &RaftCmdRequest, - ) -> (RaftCmdResponse, ApplyResult, bool) { + req: RaftCmdRequest, + ) -> (Cmd, ApplyResult, bool) { // if pending remove, apply should be aborted already. assert!(!self.pending_remove); @@ -1324,7 +1369,7 @@ where // E.g. `RaftApplyState` must not be changed. 
let mut origin_epoch = None; - let (resp, exec_result) = if ctx.host.pre_exec(&self.region, req, index, term) { + let (resp, exec_result) = if ctx.host.pre_exec(&self.region, &req, index, term) { // One of the observers want to filter execution of the command. let mut resp = RaftCmdResponse::default(); if !req.get_header().get_uuid().is_empty() { @@ -1336,7 +1381,7 @@ where ctx.exec_log_index = index; ctx.exec_log_term = term; ctx.kv_wb_mut().set_save_point(); - let (resp, exec_result) = match self.exec_raft_cmd(ctx, req) { + let (resp, exec_result) = match self.exec_raft_cmd(ctx, &req) { Ok(a) => { ctx.kv_wb_mut().pop_save_point().unwrap(); if req.has_admin_request() { @@ -1377,14 +1422,15 @@ where }; (resp, exec_result) }; + + let cmd = Cmd::new(index, term, req, resp); if let ApplyResult::WaitMergeSource(_) = exec_result { - return (resp, exec_result, false); + return (cmd, exec_result, false); } self.apply_state.set_applied_index(index); self.applied_term = term; - let cmd = Cmd::new(index, term, req.clone(), resp.clone()); let (modified_region, mut pending_handle_ssts) = match exec_result { ApplyResult::Res(ref e) => match e { ExecResult::SplitRegion { ref derived, .. } => (Some(derived.clone()), None), @@ -1433,6 +1479,9 @@ where match *exec_result { ExecResult::ChangePeer(ref cp) => { self.region = cp.region.clone(); + if let Some(p) = find_peer_by_id(&self.region, self.id()) { + self.peer = p.clone(); + } } ExecResult::ComputeHash { .. } | ExecResult::VerifyHash { .. 
} @@ -1463,7 +1512,7 @@ where } } if let Some(epoch) = origin_epoch { - let cmd_type = req.get_admin_request().get_cmd_type(); + let cmd_type = cmd.request.get_admin_request().get_cmd_type(); let epoch_state = admin_cmd_epoch_lookup(cmd_type); // The change-epoch behavior **MUST BE** equal to the settings in // `admin_cmd_epoch_lookup` @@ -1475,7 +1524,7 @@ where panic!( "{} apply admin cmd {:?} but epoch change is not expected, epoch state {:?}, before {:?}, after {:?}", self.tag, - req, + cmd.request, epoch_state, epoch, self.region.get_region_epoch() @@ -1483,17 +1532,18 @@ where } } - (resp, exec_result, should_write) + (cmd, exec_result, should_write) } fn destroy(&mut self, apply_ctx: &mut ApplyContext) { self.stopped = true; apply_ctx.router.close(self.region_id()); + let id = self.id(); for cmd in self.pending_cmds.normals.drain(..) { - notify_region_removed(self.region.get_id(), self.id, cmd); + notify_region_removed(self.region.get_id(), id, cmd); } if let Some(cmd) = self.pending_cmds.conf_change.take() { - notify_region_removed(self.region.get_id(), self.id, cmd); + notify_region_removed(self.region.get_id(), id, cmd); } self.yield_state = None; @@ -1573,13 +1623,13 @@ where AdminCmdType::TransferLeader => self.exec_transfer_leader(request, ctx.exec_log_term), AdminCmdType::ComputeHash => self.exec_compute_hash(ctx, request), AdminCmdType::VerifyHash => self.exec_verify_hash(ctx, request), - // TODO: is it backward compatible to add new cmd_type? 
AdminCmdType::PrepareMerge => self.exec_prepare_merge(ctx, request), AdminCmdType::CommitMerge => self.exec_commit_merge(ctx, request), AdminCmdType::RollbackMerge => self.exec_rollback_merge(ctx, request), AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { self.exec_flashback(ctx, request) } + AdminCmdType::BatchSwitchWitness => Err(box_err!("unsupported admin command type")), AdminCmdType::InvalidAdmin => Err(box_err!("unsupported admin command type")), }?; response.set_cmd_type(cmd_type); @@ -1879,24 +1929,56 @@ where mod confchange_cmd_metric { use super::*; - fn write_metric(cct: ConfChangeType, kind: &str) { - let metric = match cct { - ConfChangeType::AddNode => "add_peer", - ConfChangeType::RemoveNode => "remove_peer", - ConfChangeType::AddLearnerNode => "add_learner", + pub fn inc_all(cct: ConfChangeType) { + let metrics = match cct { + ConfChangeType::AddNode => &PEER_ADMIN_CMD_COUNTER.add_peer, + ConfChangeType::RemoveNode => &PEER_ADMIN_CMD_COUNTER.remove_peer, + ConfChangeType::AddLearnerNode => &PEER_ADMIN_CMD_COUNTER.add_learner, }; - PEER_ADMIN_CMD_COUNTER_VEC - .with_label_values(&[metric, kind]) - .inc(); + metrics.all.inc(); } - pub fn inc_all(cct: ConfChangeType) { - write_metric(cct, "all") + pub fn inc_success(cct: ConfChangeType) { + let metrics = match cct { + ConfChangeType::AddNode => &PEER_ADMIN_CMD_COUNTER.add_peer, + ConfChangeType::RemoveNode => &PEER_ADMIN_CMD_COUNTER.remove_peer, + ConfChangeType::AddLearnerNode => &PEER_ADMIN_CMD_COUNTER.add_learner, + }; + metrics.success.inc(); } +} - pub fn inc_success(cct: ConfChangeType) { - write_metric(cct, "success") +pub fn validate_batch_split(req: &AdminRequest, region: &Region) -> Result<()> { + if req.get_splits().get_requests().is_empty() { + return Err(box_err!("missing split requests")); + } + + let split_reqs: &[SplitRequest] = req.get_splits().get_requests(); + let mut last_key = region.get_start_key(); + for req in split_reqs { + let split_key = 
req.get_split_key(); + if split_key.is_empty() { + return Err(box_err!("missing split key")); + } + + if split_key <= last_key { + return Err(box_err!("invalid split request: {:?}", split_reqs)); + } + + if req.get_new_peer_ids().len() != region.get_peers().len() { + return Err(box_err!( + "invalid new peer id count, need {:?}, but got {:?}", + region.get_peers(), + req.get_new_peer_ids() + )); + } + + last_key = req.get_split_key(); } + + util::check_key_in_region_exclusive(last_key, region)?; + + Ok(()) } // Admin commands related. @@ -1904,6 +1986,8 @@ impl ApplyDelegate where EK: KvEngine, { + // Legacy code for compatibility. All new conf changes are dispatched by + // ChangePeerV2 now. fn exec_change_peer( &mut self, ctx: &mut ApplyContext, @@ -1918,12 +2002,12 @@ where fail_point!( "apply_on_conf_change_1_3_1", - (self.id == 1 || self.id == 3) && self.region_id() == 1, + (self.id() == 1 || self.id() == 3) && self.region_id() == 1, |_| panic!("should not use return") ); fail_point!( "apply_on_conf_change_3_1", - self.id == 3 && self.region_id() == 1, + self.id() == 3 && self.region_id() == 1, |_| panic!("should not use return") ); fail_point!( @@ -1948,7 +2032,7 @@ where let add_ndoe_fp = || { fail_point!( "apply_on_add_node_1_2", - self.id == 2 && self.region_id() == 1, + self.id() == 2 && self.region_id() == 1, |_| {} ) }; @@ -2015,7 +2099,7 @@ where p )); } - if self.id == peer.get_id() { + if self.id() == peer.get_id() { // Remove ourself, we will destroy all region data later. // So we need not to apply following logs. 
self.stopped = true; @@ -2208,6 +2292,7 @@ where // The peer is already the requested role || (role, change_type) == (PeerRole::Voter, ConfChangeType::AddNode) || (role, change_type) == (PeerRole::Learner, ConfChangeType::AddLearnerNode) + || exist_peer.get_is_witness() != peer.get_is_witness() { error!( "can't add duplicated peer"; @@ -2215,7 +2300,7 @@ where "peer_id" => self.id(), "peer" => ?peer, "exist peer" => ?exist_peer, - "confchnage type" => ?change_type, + "confchange type" => ?change_type, "region" => ?&self.region ); return Err(box_err!( @@ -2269,7 +2354,7 @@ where "region_id" => self.region_id(), "peer_id" => self.id(), "expect_peer" => ?peer, - "get_peeer" => ?p + "get_peer" => ?p ); return Err(box_err!( "remove unmatched peer: expect: {:?}, get {:?}, ignore", @@ -2277,7 +2362,7 @@ where p )); } - if self.id == peer.get_id() { + if self.id() == peer.get_id() { // Remove ourself, we will destroy all region data later. // So we need not to apply following logs. self.stopped = true; @@ -2361,44 +2446,21 @@ where fail_point!("apply_before_split"); fail_point!( "apply_before_split_1_3", - self.id == 3 && self.region_id() == 1, + self.id() == 3 && self.region_id() == 1, |_| { unreachable!() } ); PEER_ADMIN_CMD_COUNTER.batch_split.all.inc(); - let split_reqs = req.get_splits(); - let right_derive = split_reqs.get_right_derive(); - if split_reqs.get_requests().is_empty() { - return Err(box_err!("missing split requests")); - } let mut derived = self.region.clone(); - let new_region_cnt = split_reqs.get_requests().len(); - let mut regions = Vec::with_capacity(new_region_cnt + 1); - let mut keys: VecDeque> = VecDeque::with_capacity(new_region_cnt + 1); - for req in split_reqs.get_requests() { - let split_key = req.get_split_key(); - if split_key.is_empty() { - return Err(box_err!("missing split key")); - } - if split_key - <= keys - .back() - .map_or_else(|| derived.get_start_key(), Vec::as_slice) - { - return Err(box_err!("invalid split request: {:?}", 
split_reqs)); - } - if req.get_new_peer_ids().len() != derived.get_peers().len() { - return Err(box_err!( - "invalid new peer id count, need {:?}, but got {:?}", - derived.get_peers(), - req.get_new_peer_ids() - )); - } - keys.push_back(split_key.to_vec()); - } + validate_batch_split(req, &derived)?; - util::check_key_in_region(keys.back().unwrap(), &self.region)?; + let split_reqs = req.get_splits(); + let mut keys: VecDeque<_> = split_reqs + .get_requests() + .iter() + .map(|req| req.get_split_key().to_vec()) + .collect(); info!( "split region"; @@ -2407,8 +2469,13 @@ where "region" => ?derived, "keys" => %KeysInfoFormatter(keys.iter()), ); + + let new_region_cnt = split_reqs.get_requests().len(); let new_version = derived.get_region_epoch().get_version() + new_region_cnt as u64; derived.mut_region_epoch().set_version(new_version); + + let right_derive = split_reqs.get_right_derive(); + let mut regions = Vec::with_capacity(new_region_cnt + 1); // Note that the split requests only contain ids for new regions, so we need // to handle new regions and old region separately. if right_derive { @@ -2423,6 +2490,7 @@ where regions.push(derived.clone()); } + // Init split regions' meta info let mut new_split_regions: HashMap = HashMap::default(); for req in split_reqs.get_requests() { let mut new_region = Region::default(); @@ -2453,6 +2521,11 @@ where regions.push(derived.clone()); } + // Generally, a peer is created in pending_create_peers when it is + // created by raft_message (or by split here) and removed from + // pending_create_peers when it has applied the snapshot. So, if the + // peer of the split region is already created by raft_message in + // pending_create_peers ,we decide to replace it. 
let mut replace_regions = HashSet::default(); { let mut pending_create_peers = ctx.pending_create_peers.lock().unwrap(); @@ -2498,6 +2571,9 @@ where self.tag, region_id, new_split_peer.peer_id, state ) } + // If the peer's state is already persisted, add some info in + // new_split_peer.result so that we will skip this region in later + // executions. already_exist_regions.push((*region_id, new_split_peer.peer_id)); new_split_peer.result = Some(format!("state {:?} exist in kv engine", state)); } @@ -2553,7 +2629,7 @@ where fail_point!( "apply_after_split_1_3", - self.id == 3 && self.region_id() == 1, + self.id() == 3 && self.region_id() == 1, |_| { unreachable!() } ); @@ -2657,7 +2733,7 @@ where let apply_before_commit_merge = || { fail_point!( "apply_before_commit_merge_except_1_4", - self.region_id() == 1 && self.id != 4, + self.region_id() == 1 && self.id() != 4, |_| {} ); }; @@ -2929,7 +3005,7 @@ where let peer = req.get_transfer_leader().get_peer(); // Only execute TransferLeader if the expected new leader is self. 
- if peer.get_id() == self.id { + if peer.get_id() == self.id() { Ok((resp, ApplyResult::Res(ExecResult::TransferLeader { term }))) } else { Ok((resp, ApplyResult::None)) @@ -3505,7 +3581,7 @@ where "peer_id" => self.delegate.id(), "term" => reg.term ); - assert_eq!(self.delegate.id, reg.id); + assert_eq!(self.delegate.id(), reg.id); self.delegate.term = reg.term; self.delegate.clear_all_commands_as_stale(); self.delegate = ApplyDelegate::from_registration(reg); @@ -3555,7 +3631,6 @@ where RAFT_ENTRIES_CACHES_GAUGE.sub(dangle_size as i64); } - self.delegate.metrics = ApplyMetrics::default(); self.delegate.term = apply.term; if let Some(meta) = apply.bucket_meta.clone() { let buckets = self @@ -3653,7 +3728,7 @@ where PeerMsg::ApplyRes { res: TaskRes::Destroy { region_id: self.delegate.region_id(), - peer_id: self.delegate.id, + peer_id: self.delegate.id(), merge_from_snapshot: d.merge_from_snapshot, }, }, @@ -3734,6 +3809,10 @@ where if self.delegate.pending_remove || self.delegate.stopped { return; } + if self.delegate.peer.is_witness { + // witness shouldn't generate snapshot. 
+ return; + } let applied_index = self.delegate.apply_state.get_applied_index(); let need_sync = apply_ctx .apply_res @@ -3751,7 +3830,7 @@ where self.delegate.maybe_write_apply_state(apply_ctx); fail_point!( "apply_on_handle_snapshot_1_1", - self.delegate.id == 1 && self.delegate.region_id() == 1, + self.delegate.id() == 1 && self.delegate.region_id() == 1, |_| unimplemented!() ); @@ -3777,7 +3856,7 @@ where .fetch_sub(1, Ordering::SeqCst); fail_point!( "apply_on_handle_snapshot_finish_1_1", - self.delegate.id == 1 && self.delegate.region_id() == 1, + self.delegate.id() == 1 && self.delegate.region_id() == 1, |_| unimplemented!() ); } @@ -4075,6 +4154,7 @@ where } _ => {} } + self.apply_ctx.yield_msg_size = incoming.apply_yield_write_size.0; update_cfg(&incoming.apply_batch_system); } } @@ -4501,6 +4581,7 @@ mod tests { time::*, }; + use bytes::Bytes; use engine_panic::PanicEngine; use engine_test::kv::{new_engine, KvTestEngine, KvTestSnapshot}; use engine_traits::{Peekable as PeekableTrait, SyncMutable, WriteBatchExt}; @@ -4510,11 +4591,12 @@ mod tests { raft_cmdpb::*, }; use protobuf::Message; + use raft::eraftpb::{ConfChange, ConfChangeV2}; use sst_importer::Config as ImportConfig; use tempfile::{Builder, TempDir}; use test_sst_importer::*; use tikv_util::{ - config::VersionTrack, + config::{ReadableSize, VersionTrack}, store::{new_learner_peer, new_peer}, worker::dummy_scheduler, }; @@ -4615,6 +4697,42 @@ mod tests { } } + #[test] + fn test_can_witness_skip() { + let mut entry = Entry::new(); + let mut req = RaftCmdRequest::default(); + entry.set_entry_type(EntryType::EntryNormal); + let data = req.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(can_witness_skip(&entry)); + + req.mut_admin_request() + .set_cmd_type(AdminCmdType::CompactLog); + let data = req.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(!can_witness_skip(&entry)); + + let mut req = RaftCmdRequest::default(); + let 
mut request = Request::default(); + request.set_cmd_type(CmdType::Put); + req.set_requests(vec![request].into()); + let data = req.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(can_witness_skip(&entry)); + + entry.set_entry_type(EntryType::EntryConfChange); + let conf_change = ConfChange::new(); + let data = conf_change.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(!can_witness_skip(&entry)); + + entry.set_entry_type(EntryType::EntryConfChangeV2); + let conf_change_v2 = ConfChangeV2::new(); + let data = conf_change_v2.write_to_bytes().unwrap(); + entry.set_data(Bytes::copy_from_slice(&data)); + assert!(!can_witness_skip(&entry)); + } + #[test] fn test_should_sync_log() { // Admin command @@ -4812,10 +4930,14 @@ mod tests { ..Default::default() }; reg.region.set_id(2); + let mut peer = metapb::Peer::default(); + peer.set_id(1); + reg.region.mut_peers().push(peer.clone()); reg.apply_state.set_applied_index(3); router.schedule_task(2, Msg::Registration(reg.dup())); validate(&router, 2, move |delegate| { - assert_eq!(delegate.id, 1); + assert_eq!(delegate.id(), 1); + assert_eq!(delegate.peer, peer); assert_eq!(delegate.tag, "[region 2] 1"); assert_eq!(delegate.region, reg.region); assert!(!delegate.pending_remove); @@ -5575,6 +5697,92 @@ mod tests { system.shutdown(); } + #[test] + fn test_apply_yield_with_msg_size() { + let (_path, engine) = create_tmp_engine("test-apply-yield"); + let (_import_dir, importer) = create_tmp_importer("test-apply-yield"); + let obs = ApplyObserver::default(); + let mut host = CoprocessorHost::::default(); + host.registry + .register_query_observer(1, BoxQueryObserver::new(obs)); + + let (tx, rx) = mpsc::channel(); + let (region_scheduler, _) = dummy_scheduler(); + let sender = Box::new(TestNotifier { tx }); + let cfg = Arc::new(VersionTrack::new(Config::default())); + let (router, mut system) = create_apply_batch_system(&cfg.value()); + let 
pending_create_peers = Arc::new(Mutex::new(HashMap::default())); + let builder = super::Builder:: { + tag: "test-store".to_owned(), + cfg: cfg.clone(), + sender, + region_scheduler, + coprocessor_host: host, + importer, + engine, + router: router.clone(), + store_id: 1, + pending_create_peers, + }; + system.spawn("test-handle-raft".to_owned(), builder); + + let peer_id = 3; + let mut reg = Registration { + id: peer_id, + ..Default::default() + }; + reg.region.set_id(1); + reg.region.mut_peers().push(new_peer(2, 3)); + reg.region.set_end_key(b"k5".to_vec()); + reg.region.mut_region_epoch().set_conf_ver(1); + reg.region.mut_region_epoch().set_version(3); + router.schedule_task(1, Msg::Registration(reg)); + + let schedule_apply = |idx: u64, count: usize, size: usize| { + let mut entries = Vec::with_capacity(count); + for i in 0..count { + let put_entry = EntryBuilder::new(idx + i as u64, 3) + .put(format!("k{:03}", i).as_ref(), &vec![0; size - 4]) + .epoch(1, 3) + .build(); + entries.push(put_entry); + } + router.schedule_task(1, Msg::apply(apply(peer_id, 1, 3, entries, vec![]))); + }; + + fn approximate_eq(a: u64, b: u64, delta: u64) { + assert!( + a >= b - delta && a <= b + delta, + "left: {}, right: {}, delta: {}", + a, + b, + delta + ); + } + + // schedule a batch with 512 keys and 64k total size will trigger 2 flush and + // yield. 
+ schedule_apply(1, 512, 128); + let apply_res = fetch_apply_res(&rx); + approximate_eq(apply_res.metrics.written_bytes, 32768, 2048); + approximate_eq(apply_res.metrics.written_keys, 256, 15); + // the second part, note that resume apply not clean up the metrics + let apply_res = fetch_apply_res(&rx); + approximate_eq(apply_res.metrics.written_bytes, 32768, 2048); + approximate_eq(apply_res.metrics.written_keys, 256, 15); + + // update apply yeild size to 64kb + _ = cfg.update(|c| { + c.apply_yield_write_size = ReadableSize::kb(64); + Ok::<(), ()>(()) + }); + // only trigger one time of + schedule_apply(513, 512, 128); + let apply_res = fetch_apply_res(&rx); + approximate_eq(apply_res.metrics.written_bytes, 65536, 4096); + approximate_eq(apply_res.metrics.written_keys, 512, 20); + } + #[test] fn test_handle_ingest_sst() { let (_path, engine) = create_tmp_engine("test-ingest"); @@ -5653,7 +5861,7 @@ mod tests { } } let sst_path = import_dir.path().join("test.sst"); - let (mut meta, data) = gen_sst_file_with_kvs(&sst_path, &kvs); + let (mut meta, data) = gen_sst_file_with_kvs(sst_path, &kvs); meta.set_region_id(1); meta.mut_region_epoch().set_conf_ver(1); meta.mut_region_epoch().set_version(3); @@ -5684,7 +5892,7 @@ mod tests { } } let sst_path = import_dir.path().join("test2.sst"); - let (mut meta, data) = gen_sst_file_with_kvs(&sst_path, &kvs); + let (mut meta, data) = gen_sst_file_with_kvs(sst_path, &kvs); meta.set_region_id(1); meta.mut_region_epoch().set_conf_ver(1); meta.mut_region_epoch().set_version(3); @@ -6448,12 +6656,13 @@ mod tests { resp ); + splits.mut_requests().clear(); splits .mut_requests() .push(new_split_req(b"", 8, vec![9, 10, 11])); let resp = exec_split(&router, splits.clone()); - // Empty key should be rejected. - assert!(error_msg(&resp).contains("missing"), "{:?}", resp); + // Empty key will not in any region exclusively. 
+ assert!(error_msg(&resp).contains("missing split key"), "{:?}", resp); splits.mut_requests().clear(); splits @@ -6672,4 +6881,100 @@ mod tests { rx.recv_timeout(Duration::from_millis(500)).unwrap(); system.shutdown(); } + + fn new_batch_split_request(keys: Vec>) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSplit); + for key in keys { + let mut split_req = SplitRequest::default(); + split_req.set_split_key(key); + split_req.set_new_peer_ids(vec![1]); + req.mut_splits().mut_requests().push(split_req); + } + req + } + + #[test] + fn test_validate_batch_split() { + let mut region = Region::default(); + region.set_start_key(b"k05".to_vec()); + region.set_end_key(b"k10".to_vec()); + region.set_peers(vec![new_peer(1, 2)].into()); + + let missing_error = "missing split requests"; + let invalid_error = "invalid split request"; + let not_in_region_error = "not in region"; + let empty_error = "missing split key"; + let peer_id_error = "invalid new peer id count"; + + // case: split is deprecated + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::Split); + let mut split_req = SplitRequest::default(); + split_req.set_split_key(b"k06".to_vec()); + req.set_split(split_req); + assert!( + validate_batch_split(&req, ®ion) + .unwrap_err() + .to_string() + .contains(missing_error) + ); + + // case: missing peer ids + let mut req = new_batch_split_request(vec![b"k07".to_vec()]); + req.mut_splits() + .mut_requests() + .get_mut(0) + .unwrap() + .new_peer_ids + .clear(); + assert!( + validate_batch_split(&req, ®ion) + .unwrap_err() + .to_string() + .contains(peer_id_error) + ); + + let fail_cases = vec![ + // case: default admin request should be rejected + (vec![], missing_error), + // case: empty split key + (vec![vec![]], empty_error), + // case: out of order split keys + ( + vec![b"k07".to_vec(), b"k08".to_vec(), b"k06".to_vec()], + invalid_error, + ), + // case: split keys are not in region range + ( + 
vec![b"k04".to_vec(), b"k07".to_vec(), b"k08".to_vec()], + invalid_error, + ), + // case: split keys are not in region range + ( + vec![b"k06".to_vec(), b"k07".to_vec(), b"k11".to_vec()], + not_in_region_error, + ), + // case: duplicated split keys + (vec![b"k06".to_vec(), b"k06".to_vec()], invalid_error), + ]; + + for (split_keys, fail_str) in fail_cases { + let req = if split_keys.is_empty() { + AdminRequest::default() + } else { + new_batch_split_request(split_keys) + }; + assert!( + validate_batch_split(&req, ®ion) + .unwrap_err() + .to_string() + .contains(fail_str) + ); + } + + // case: pass the validation + let req = new_batch_split_request(vec![b"k06".to_vec(), b"k07".to_vec(), b"k08".to_vec()]); + validate_batch_split(&req, ®ion).unwrap(); + } } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 63761321405..75979a4afd5 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -55,7 +55,7 @@ use tikv_util::{ mpsc::{self, LooseBoundedSender, Receiver}, store::{find_peer, is_learner, region_on_same_stores}, sys::{disk::DiskUsage, memory_usage_reaches_high_water}, - time::{monotonic_raw_now, Instant as TiInstant}, + time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant}, trace, warn, worker::{ScheduleError, Scheduler}, Either, @@ -94,11 +94,10 @@ use crate::{ util::{KeysInfoFormatter, LeaseState}, worker::{ new_change_peer_v2_request, Bucket, BucketRange, CleanupTask, ConsistencyCheckTask, - GcSnapshotTask, RaftlogFetchTask, RaftlogGcTask, ReadDelegate, ReadProgress, - RegionTask, SplitCheckTask, + GcSnapshotTask, RaftlogGcTask, ReadDelegate, ReadProgress, RegionTask, SplitCheckTask, }, CasualMessage, Config, LocksStatus, MergeResultKind, PdTask, PeerMsg, PeerTick, - ProposalContext, RaftCmdExtraOpts, RaftCommand, RaftlogFetchResult, ReadCallback, + ProposalContext, RaftCmdExtraOpts, RaftCommand, RaftlogFetchResult, ReadCallback, ReadTask, 
SignificantMsg, SnapKey, StoreMsg, WriteCallback, }, Error, Result, @@ -245,7 +244,7 @@ where store_id: u64, cfg: &Config, region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, engines: Engines, region: &metapb::Region, ) -> Result> { @@ -304,7 +303,7 @@ where store_id: u64, cfg: &Config, region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, engines: Engines, region_id: u64, peer: metapb::Peer, @@ -606,9 +605,14 @@ where } pub fn handle_msgs(&mut self, msgs: &mut Vec>) { + let timer = TiInstant::now_coarse(); + let count = msgs.len(); for m in msgs.drain(..) { match m { PeerMsg::RaftMessage(msg) => { + if self.ctx.coprocessor_host.should_skip_raft_message(&msg.msg) { + continue; + } if let Err(e) = self.on_raft_message(msg) { error!(%e; "handle raft message err"; @@ -688,6 +692,12 @@ where } } self.on_loop_finished(); + self.ctx.raft_metrics.peer_msg_len.observe(count as f64); + self.ctx + .raft_metrics + .event_time + .peer_msg + .observe(duration_to_sec(timer.saturating_elapsed())); } #[inline] @@ -1323,8 +1333,15 @@ where ) { fail_point!("raft_on_capture_change"); let region_id = self.region_id(); - let msg = + let mut msg = new_read_index_request(region_id, region_epoch.clone(), self.fsm.peer.peer.clone()); + // Allow to capture change even is in flashback state. + // TODO: add a test case for this kind of situation. + if self.fsm.peer.is_in_flashback { + let mut flags = WriteBatchFlags::from_bits_check(msg.get_header().get_flags()); + flags.insert(WriteBatchFlags::FLASHBACK); + msg.mut_header().set_flags(flags.bits()); + } let apply_router = self.ctx.apply_router.clone(); self.propose_raft_command_internal( msg, @@ -1383,7 +1400,7 @@ where SignificantMsg::CatchUpLogs(catch_up_logs) => { self.on_catch_up_logs_for_merge(catch_up_logs); } - SignificantMsg::StoreResolved { group_id, .. 
} => { + SignificantMsg::StoreResolved { group_id, store_id } => { let state = self.ctx.global_replication_state.lock().unwrap(); if state.status().get_mode() != ReplicationMode::DrAutoSync { return; @@ -1392,11 +1409,13 @@ where return; } drop(state); - self.fsm - .peer - .raft_group - .raft - .assign_commit_groups(&[(self.fsm.peer_id(), group_id)]); + if let Some(peer_id) = find_peer(self.region(), store_id).map(|p| p.get_id()) { + self.fsm + .peer + .raft_group + .raft + .assign_commit_groups(&[(peer_id, group_id)]); + } } SignificantMsg::CaptureChange { cmd, @@ -2480,12 +2499,14 @@ where // TODO: spin off the I/O code (delete_snapshot) let regions_to_destroy = match self.check_snapshot(&msg)? { Either::Left(key) => { - // If the snapshot file is not used again, then it's OK to - // delete them here. If the snapshot file will be reused when - // receiving, then it will fail to pass the check again, so - // missing snapshot files should not be noticed. - let s = self.ctx.snap_mgr.get_snapshot_for_applying(&key)?; - self.ctx.snap_mgr.delete_snapshot(&key, s.as_ref(), false); + if let Some(key) = key { + // If the snapshot file is not used again, then it's OK to + // delete them here. If the snapshot file will be reused when + // receiving, then it will fail to pass the check again, so + // missing snapshot files should not be noticed. + let s = self.ctx.snap_mgr.get_snapshot_for_applying(&key)?; + self.ctx.snap_mgr.delete_snapshot(&key, s.as_ref(), false); + } return Ok(()); } Either::Right(v) => v, @@ -2651,7 +2672,17 @@ where match msg.get_extra_msg().get_type() { ExtraMessageType::MsgRegionWakeUp | ExtraMessageType::MsgCheckStalePeer => { if self.fsm.hibernate_state.group_state() == GroupState::Idle { - self.reset_raft_tick(GroupState::Ordered); + if msg.get_extra_msg().forcely_awaken { + // Forcely awaken this region by manually setting this GroupState + // into Chaos to trigger a new voting in this RaftGroup. 
+ self.reset_raft_tick(if !self.fsm.peer.is_leader() { + GroupState::Chaos + } else { + GroupState::Ordered + }); + } else { + self.reset_raft_tick(GroupState::Ordered); + } } if msg.get_extra_msg().get_type() == ExtraMessageType::MsgRegionWakeUp && self.fsm.peer.is_leader() @@ -2937,16 +2968,55 @@ where // Returns `Vec<(u64, bool)>` indicated (source_region_id, merge_to_this_peer) // if the `msg` doesn't contain a snapshot or this snapshot doesn't conflict // with any other snapshots or regions. Otherwise a `SnapKey` is returned. - fn check_snapshot(&mut self, msg: &RaftMessage) -> Result>> { + fn check_snapshot( + &mut self, + msg: &RaftMessage, + ) -> Result, Vec<(u64, bool)>>> { if !msg.get_message().has_snapshot() { return Ok(Either::Right(vec![])); } let region_id = msg.get_region_id(); let snap = msg.get_message().get_snapshot(); - let key = SnapKey::from_region_snap(region_id, snap); let mut snap_data = RaftSnapshotData::default(); snap_data.merge_from_bytes(snap.get_data())?; + + let key = if !snap_data.get_meta().get_for_witness() { + // Check if snapshot file exists. + // No need to get snapshot for witness, as witness's empty snapshot bypass + // snapshot manager. + let key = SnapKey::from_region_snap(region_id, snap); + self.ctx.snap_mgr.get_snapshot_for_applying(&key)?; + Some(key) + } else { + None + }; + + // If the index of snapshot is not newer than peer's apply index, it + // is possibly because there is witness -> non-witness switch, and the peer + // requests snapshot from leader but leader doesn't applies the switch yet. + // In that case, the snapshot is a witness snapshot whereas non-witness snapshot + // is expected. 
+ if snap.get_metadata().get_index() < self.fsm.peer.get_store().applied_index() + && snap_data.get_meta().get_for_witness() != self.fsm.peer.is_witness() + { + info!( + "mismatch witness snapshot"; + "region_id" => region_id, + "peer_id" => self.fsm.peer_id(), + "for_witness" => snap_data.get_meta().get_for_witness(), + "is_witness" => self.fsm.peer.is_witness(), + "index" => snap.get_metadata().get_index(), + "applied_index" => self.fsm.peer.get_store().applied_index(), + ); + self.ctx + .raft_metrics + .message_dropped + .mismatch_witness_snapshot + .inc(); + return Ok(Either::Left(key)); + } + let snap_region = snap_data.take_region(); let peer_id = msg.get_to_peer().get_id(); let snap_enc_start_key = enc_start_key(&snap_region); @@ -3097,9 +3167,6 @@ where return Ok(Either::Left(key)); } - // Check if snapshot file exists. - self.ctx.snap_mgr.get_snapshot_for_applying(&key)?; - // WARNING: The checking code must be above this line. // Now all checking passed. @@ -3646,6 +3713,11 @@ where self.update_region(cp.region); fail_point!("change_peer_after_update_region"); + fail_point!( + "change_peer_after_update_region_store_3", + self.store_id() == 3, + |_| panic!("should not use return") + ); let now = Instant::now(); let (mut remove_self, mut need_ping) = (false, false); @@ -4936,6 +5008,16 @@ where let leader_id = self.fsm.peer.leader_id(); let request = msg.get_requests(); + // peer_id must be the same as peer's. + if let Err(e) = util::check_peer_id(msg, self.fsm.peer.peer_id()) { + self.ctx + .raft_metrics + .invalid_proposal + .mismatch_peer_id + .inc(); + return Err(e); + } + if self.fsm.peer.force_leader.is_some() { self.ctx.raft_metrics.invalid_proposal.force_leader.inc(); // in force leader state, forbid requests to make the recovery progress less @@ -4973,15 +5055,17 @@ where self.register_raft_base_tick(); return Err(Error::NotLeader(region_id, leader)); } - // peer_id must be the same as peer's. 
- if let Err(e) = util::check_peer_id(msg, self.fsm.peer.peer_id()) { - self.ctx - .raft_metrics - .invalid_proposal - .mismatch_peer_id - .inc(); - return Err(e); + + // Forbid requests when it's a witness unless it's transfer leader + if self.fsm.peer.is_witness() + && !(msg.has_admin_request() + && msg.get_admin_request().get_cmd_type() == AdminCmdType::TransferLeader) + { + self.ctx.raft_metrics.invalid_proposal.witness.inc(); + // TODO: use a dedicated error type + return Err(Error::RecoveryInProgress(self.region_id())); } + // check whether the peer is initialized. if !self.fsm.peer.is_initialized() { self.ctx diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index c83309011ac..fafc839dce2 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -36,7 +36,7 @@ use kvproto::{ metapb::{self, Region, RegionEpoch}, pdpb::{self, QueryStats, StoreStats}, raft_cmdpb::{AdminCmdType, AdminRequest}, - raft_serverpb::{ExtraMessageType, PeerState, RaftMessage, RegionLocalState}, + raft_serverpb::{ExtraMessage, ExtraMessageType, PeerState, RaftMessage, RegionLocalState}, replication_modepb::{ReplicationMode, ReplicationStatus}, }; use pd_client::{Feature, FeatureGate, PdClient}; @@ -53,7 +53,7 @@ use tikv_util::{ info, is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, slow_log, - store::find_peer, + store::{find_peer, region_on_stores}, sys as sys_util, sys::disk::{get_disk_status, DiskUsage}, time::{duration_to_sec, Instant as TiInstant}, @@ -72,6 +72,7 @@ use crate::{ }, store::{ async_io::{ + read::{ReadRunner, ReadTask}, write::{StoreWriters, Worker as WriteWorker, WriteMsg}, write_router::WriteSenders, }, @@ -95,9 +96,9 @@ use crate::{ worker::{ AutoSplitController, CleanupRunner, CleanupSstRunner, CleanupSstTask, CleanupTask, CompactRunner, CompactTask, ConsistencyCheckRunner, ConsistencyCheckTask, - GcSnapshotRunner, GcSnapshotTask, PdRunner, 
RaftlogFetchRunner, RaftlogFetchTask, - RaftlogGcRunner, RaftlogGcTask, ReadDelegate, RefreshConfigRunner, RefreshConfigTask, - RegionRunner, RegionTask, SplitCheckTask, + GcSnapshotRunner, GcSnapshotTask, PdRunner, RaftlogGcRunner, RaftlogGcTask, + ReadDelegate, RefreshConfigRunner, RefreshConfigTask, RegionRunner, RegionTask, + SplitCheckTask, }, Callback, CasualMessage, GlobalReplicationState, InspectedRaftMessage, MergeResultKind, PdTask, PeerMsg, PeerTick, RaftCommand, SignificantMsg, SnapManager, StoreMsg, StoreTick, @@ -474,7 +475,7 @@ where // handle Compact, CleanupSst task pub cleanup_scheduler: Scheduler, pub raftlog_gc_scheduler: Scheduler, - pub raftlog_fetch_scheduler: Scheduler, + pub raftlog_fetch_scheduler: Scheduler>, pub region_scheduler: Scheduler>, pub apply_router: ApplyRouter, pub router: RaftRouter, @@ -684,7 +685,7 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> StoreFsmDelegate<'a, EK, ER, T> { fn on_tick(&mut self, tick: StoreTick) { - let t = TiInstant::now_coarse(); + let timer = TiInstant::now_coarse(); match tick { StoreTick::PdStoreHeartbeat => self.on_pd_store_heartbeat_tick(), StoreTick::SnapGc => self.on_snap_mgr_gc(), @@ -693,10 +694,12 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> StoreTick::ConsistencyCheck => self.on_consistency_check_tick(), StoreTick::CleanupImportSst => self.on_cleanup_import_sst_tick(), } - let elapsed = t.saturating_elapsed(); - RAFT_EVENT_DURATION + let elapsed = timer.saturating_elapsed(); + self.ctx + .raft_metrics + .event_time .get(tick.tag()) - .observe(duration_to_sec(elapsed) as f64); + .observe(duration_to_sec(elapsed)); slow_log!( elapsed, "[store {}] handle timeout {:?}", @@ -706,10 +709,14 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> } fn handle_msgs(&mut self, msgs: &mut Vec>) { + let timer = TiInstant::now_coarse(); for m in msgs.drain(..) 
{ match m { StoreMsg::Tick(tick) => self.on_tick(tick), StoreMsg::RaftMessage(msg) => { + if self.ctx.coprocessor_host.should_skip_raft_message(&msg.msg) { + continue; + } if let Err(e) = self.on_raft_message(msg) { if matches!(&e, Error::RegionNotRegistered { .. }) { // This may happen in normal cases when add-peer runs slowly @@ -754,8 +761,16 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> drop(syncer); } StoreMsg::GcSnapshotFinish => self.register_snap_mgr_gc_tick(), + StoreMsg::AwakenRegions { abnormal_stores } => { + self.on_wake_up_regions(abnormal_stores); + } } } + self.ctx + .raft_metrics + .event_time + .store_msg + .observe(duration_to_sec(timer.saturating_elapsed())); } fn start(&mut self, store: metapb::Store) { @@ -1081,7 +1096,7 @@ pub struct RaftPollerBuilder { split_check_scheduler: Scheduler, cleanup_scheduler: Scheduler, raftlog_gc_scheduler: Scheduler, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, pub region_scheduler: Scheduler>, apply_router: ApplyRouter, pub router: RaftRouter, @@ -1531,7 +1546,7 @@ impl RaftBatchSystem { let raftlog_fetch_scheduler = workers.raftlog_fetch_worker.start( "raftlog-fetch-worker", - RaftlogFetchRunner::new(self.router.clone(), engines.raft.clone()), + ReadRunner::new(self.router.clone(), engines.raft.clone()), ); let compact_runner = CompactRunner::new(engines.kv.clone()); @@ -2215,6 +2230,8 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER target.clone(), )?; + self.ctx.coprocessor_host.on_peer_created(region_id); + // WARNING: The checking code must be above this line. 
// Now all checking passed @@ -2438,11 +2455,11 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER ); stats.set_query_stats(query_stats); - let store_info = StoreInfo { + let store_info = Some(StoreInfo { kv_engine: self.ctx.engines.kv.clone(), raft_engine: self.ctx.engines.raft.clone(), capacity: self.ctx.cfg.capacity.0, - }; + }); let task = PdTask::StoreHeartbeat { stats, @@ -2525,6 +2542,45 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.register_compact_lock_cf_tick(); } + fn on_wake_up_regions(&self, abnormal_stores: Vec) { + info!("try to wake up all hibernated regions in this store"; + "to_all" => abnormal_stores.is_empty()); + let meta = self.ctx.store_meta.lock().unwrap(); + for region_id in meta.regions.keys() { + let region = &meta.regions[region_id]; + // Check whether the current region is not found on abnormal stores. If so, + // this region is not the target to be awaken. + if !region_on_stores(region, &abnormal_stores) { + continue; + } + let peer = { + match find_peer(region, self.ctx.store_id()) { + None => continue, + Some(p) => p.clone(), + } + }; + { + // Send MsgRegionWakeUp to Peer for awakening hibernated regions. 
+ let mut message = RaftMessage::default(); + message.set_region_id(*region_id); + message.set_from_peer(peer.clone()); + message.set_to_peer(peer); + message.set_region_epoch(region.get_region_epoch().clone()); + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgRegionWakeUp); + msg.forcely_awaken = true; + message.set_extra_msg(msg); + if let Err(e) = self.ctx.router.send_raft_message(message) { + error!( + "send awaken region message failed"; + "region_id" => region_id, + "err" => ?e + ); + } + } + } + } + fn register_pd_store_heartbeat_tick(&self) { self.ctx.schedule_store_tick( StoreTick::PdStoreHeartbeat, diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index aa33ae49fea..5cfbb645612 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -82,8 +82,11 @@ pub struct RaftMetrics { pub store_time: LocalHistogram, pub propose_wait_time: LocalHistogram, pub process_ready: LocalHistogram, + pub event_time: RaftEventDurationVec, + pub peer_msg_len: LocalHistogram, pub commit_log: LocalHistogram, pub write_block_wait: LocalHistogram, + pub propose_log_size: LocalHistogram, // waterfall metrics pub waterfall_metrics: bool, @@ -117,8 +120,11 @@ impl RaftMetrics { process_ready: PEER_RAFT_PROCESS_DURATION .with_label_values(&["ready"]) .local(), + event_time: RaftEventDurationVec::from(&RAFT_EVENT_DURATION_VEC), + peer_msg_len: PEER_MSG_LEN.local(), commit_log: PEER_COMMIT_LOG_HISTOGRAM.local(), write_block_wait: STORE_WRITE_MSG_BLOCK_WAIT_DURATION_HISTOGRAM.local(), + propose_log_size: PEER_PROPOSE_LOG_SIZE_HISTOGRAM.local(), waterfall_metrics, wf_batch_wait: STORE_WF_BATCH_WAIT_DURATION_HISTOGRAM.local(), wf_send_to_queue: STORE_WF_SEND_TO_QUEUE_DURATION_HISTOGRAM.local(), @@ -149,8 +155,11 @@ impl RaftMetrics { self.store_time.flush(); self.propose_wait_time.flush(); self.process_ready.flush(); + self.event_time.flush(); 
+ self.peer_msg_len.flush(); self.commit_log.flush(); self.write_block_wait.flush(); + self.propose_log_size.flush(); if self.waterfall_metrics { self.wf_batch_wait.flush(); diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index 7ab47cc90c6..b0f44c30c0f 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -48,6 +48,7 @@ make_auto_flush_static_metric! { stale, decode, epoch, + cancel, } pub label_enum RegionHashType { @@ -86,16 +87,6 @@ make_auto_flush_static_metric! { finished, } - pub label_enum RaftEventDurationType { - compact_check, - pd_store_heartbeat, - snap_gc, - compact_lock_cf, - consistency_check, - cleanup_import_sst, - raft_engine_purge, - } - pub label_enum CompactionGuardAction { init, init_failure, @@ -103,10 +94,6 @@ make_auto_flush_static_metric! { skip_partition, } - pub struct RaftEventDuration : LocalHistogram { - "type" => RaftEventDurationType - } - pub struct RaftEntryFetches : LocalIntCounter { "type" => RaftEntryType } @@ -182,6 +169,7 @@ make_static_metric! { pub label_enum RaftDroppedMessage { mismatch_store_id, mismatch_region_epoch, + mismatch_witness_snapshot, stale_msg, region_overlap, region_no_peer, @@ -214,10 +202,23 @@ make_static_metric! { region_not_initialized, is_applying_snapshot, force_leader, + witness, flashback_in_progress, flashback_not_prepared } + pub label_enum RaftEventDurationType { + compact_check, + pd_store_heartbeat, + snap_gc, + compact_lock_cf, + consistency_check, + cleanup_import_sst, + raft_engine_purge, + peer_msg, + store_msg, + } + pub label_enum RaftLogGcSkippedReason { reserve_log, compact_idx_too_small, @@ -279,6 +280,10 @@ make_static_metric! { "type" => RaftInvalidProposal } + pub struct RaftEventDurationVec : LocalHistogram { + "type" => RaftEventDurationType + } + pub struct RaftLogGcSkippedCounterVec: LocalIntCounter { "reason" => RaftLogGcSkippedReason, } @@ -662,8 +667,13 @@ lazy_static! 
{ &["type"], exponential_buckets(0.001, 1.59, 20).unwrap() // max 10s ).unwrap(); - pub static ref RAFT_EVENT_DURATION: RaftEventDuration = - auto_flush_from!(RAFT_EVENT_DURATION_VEC, RaftEventDuration); + + pub static ref PEER_MSG_LEN: Histogram = + register_histogram!( + "tikv_raftstore_peer_msg_len", + "Length of peer msg.", + exponential_buckets(1.0, 2.0, 20).unwrap() // max 1000s + ).unwrap(); pub static ref RAFT_READ_INDEX_PENDING_DURATION: Histogram = register_histogram!( diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index a60eb087562..5d7455b2d1c 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -30,6 +30,7 @@ mod worker; pub use self::msg::PeerInternalStat; pub use self::{ async_io::{ + read::{AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, ReadTask}, write::{ ExtraStates, PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, WriteTask, @@ -69,18 +70,17 @@ pub use self::{ check_abort, copy_snapshot, snap_io::{apply_sst_cf_file, build_sst_cf_file_list}, ApplyOptions, CfFile, Error as SnapError, SnapEntry, SnapKey, SnapManager, - SnapManagerBuilder, Snapshot, SnapshotStatistics, + SnapManagerBuilder, Snapshot, SnapshotStatistics, TabletSnapKey, TabletSnapManager, }, transport::{CasualRouter, ProposalRouter, SignificantRouter, StoreRouter, Transport}, txn_ext::{LocksStatus, PeerPessimisticLocks, PessimisticLockPair, TxnExt}, util::{RegionReadProgress, RegionReadProgressRegistry}, worker::{ - AutoSplitController, Bucket, BucketRange, CachedReadDelegate, CheckLeaderRunner, - CheckLeaderTask, FetchedLogs, FlowStatistics, FlowStatsReporter, KeyEntry, - LocalReadContext, LocalReader, LocalReaderCore, LogFetchedNotifier, PdTask, - RaftlogFetchRunner, RaftlogFetchTask, ReadDelegate, ReadExecutor, ReadExecutorProvider, - ReadProgress, ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, - SplitConfig, SplitConfigManager, 
StoreMetaDelegate, TrackVer, WriteStats, - TLS_LOCAL_READ_METRICS, + metrics::TLS_LOCAL_READ_METRICS, AutoSplitController, Bucket, BucketRange, + CachedReadDelegate, CheckLeaderRunner, CheckLeaderTask, FlowStatistics, FlowStatsReporter, + KeyEntry, LocalReadContext, LocalReader, LocalReaderCore, PdTask, ReadDelegate, + ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, + SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, StoreMetaDelegate, + TrackVer, WriteStats, }, }; diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 6851ebd30d8..a4c6c435741 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -26,9 +26,7 @@ use smallvec::{smallvec, SmallVec}; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; use tracker::{get_tls_tracker_token, TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; -use super::{ - local_metrics::TimeTracker, region_meta::RegionMeta, worker::FetchedLogs, RegionSnapshot, -}; +use super::{local_metrics::TimeTracker, region_meta::RegionMeta, FetchedLogs, RegionSnapshot}; use crate::store::{ fsm::apply::{CatchUpLogs, ChangeObserver, TaskRes as ApplyTaskRes}, metrics::RaftEventDurationType, @@ -721,6 +719,7 @@ pub struct InspectedRaftMessage { } /// Message that can be sent to a peer. +#[allow(clippy::large_enum_variant)] pub enum PeerMsg { /// Raft message is the message sent between raft nodes in the same /// raft group. Messages need to be redirected to raftstore if target @@ -847,6 +846,10 @@ where }, GcSnapshotFinish, + + AwakenRegions { + abnormal_stores: Vec, + }, } impl fmt::Debug for StoreMsg @@ -880,6 +883,7 @@ where write!(fmt, "UnsafeRecoveryCreatePeer") } StoreMsg::GcSnapshotFinish => write!(fmt, "GcSnapshotFinish"), + StoreMsg::AwakenRegions { .. 
} => write!(fmt, "AwakenRegions"), } } } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index b06eb5c0c3f..cffb7e40a9a 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -84,8 +84,9 @@ use super::{ use crate::{ coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason, RoleChange}, errors::RAFTSTORE_IS_BUSY, + router::RaftStoreRouter, store::{ - async_io::{write::WriteMsg, write_router::WriteRouter}, + async_io::{read::ReadTask, write::WriteMsg, write_router::WriteRouter}, fsm::{ apply::{self, CatchUpLogs}, store::{PollContext, RaftRouter}, @@ -93,12 +94,13 @@ use crate::{ }, hibernate_state::GroupState, memory::{needs_evict_entry_cache, MEMTRACE_RAFT_ENTRIES}, - msg::{ErrorCallback, PeerMsg, RaftCommand, SignificantMsg, StoreMsg}, + msg::{CasualMessage, ErrorCallback, PeerMsg, RaftCommand, SignificantMsg, StoreMsg}, + peer_storage::HandleSnapshotResult, txn_ext::LocksStatus, util::{admin_cmd_epoch_lookup, RegionReadProgress}, worker::{ - HeartbeatTask, RaftlogFetchTask, RaftlogGcTask, ReadDelegate, ReadExecutor, - ReadProgress, RegionTask, SplitCheckTask, + HeartbeatTask, RaftlogGcTask, ReadDelegate, ReadExecutor, ReadProgress, RegionTask, + SplitCheckTask, }, Callback, Config, GlobalReplicationState, PdTask, ReadCallback, ReadIndexContext, ReadResponse, TxnExt, WriteCallback, RAFT_INIT_LOG_INDEX, @@ -466,6 +468,7 @@ pub struct PersistSnapshotResult { pub prev_region: metapb::Region, pub region: metapb::Region, pub destroy_regions: Vec, + pub for_witness: bool, } #[derive(Debug)] @@ -1041,7 +1044,7 @@ where store_id: u64, cfg: &Config, region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, engines: Engines, region: &metapb::Region, peer: metapb::Peer, @@ -1076,6 +1079,7 @@ where skip_bcast_commit: true, pre_vote: cfg.prevote, max_committed_size_per_ready: MAX_COMMITTED_SIZE_PER_READY, + // TODO: if 
peer.is_witness { 0 } else { 1 }, ..Default::default() }; @@ -1150,7 +1154,7 @@ where region, applied_index, REGION_READ_PROGRESS_CAP, - tag.clone(), + peer_id, )), memtrace_raft_entries: 0, write_router: WriteRouter::new(tag), @@ -1684,6 +1688,11 @@ where self.raft_group.raft.state == StateRole::Leader } + #[inline] + pub fn is_witness(&self) -> bool { + self.peer.is_witness + } + #[inline] pub fn get_role(&self) -> StateRole { self.raft_group.raft.state @@ -1760,6 +1769,10 @@ where for msg in msgs { let msg_type = msg.get_message().get_msg_type(); if msg_type == MessageType::MsgSnapshot { + let mut snap_data = kvproto::raft_serverpb::RaftSnapshotData::default(); + snap_data + .merge_from_bytes(msg.get_message().get_snapshot().get_data()) + .unwrap(); let snap_index = msg.get_message().get_snapshot().get_metadata().get_index(); if snap_index > self.last_sent_snapshot_idx { self.last_sent_snapshot_idx = snap_index; @@ -2013,7 +2026,6 @@ where if p.get_id() == self.peer.get_id() { continue; } - // TODO if let Some(instant) = self.peer_heartbeats.get(&p.get_id()) { let elapsed = instant.saturating_elapsed(); if elapsed >= max_duration { @@ -2856,13 +2868,20 @@ where } } - if let HandleReadyResult::Snapshot { + if let HandleReadyResult::Snapshot(box HandleSnapshotResult { msgs, snap_region, destroy_regions, last_first_index, - } = res + for_witness, + }) = res { + if for_witness { + // inform next round to check apply status + ctx.router + .send_casual_msg(snap_region.get_id(), CasualMessage::SnapshotApplied) + .unwrap(); + } // When applying snapshot, there is no log applied and not compacted yet. 
self.raft_log_size_hint = 0; @@ -2874,6 +2893,7 @@ where prev_region: self.region().clone(), region: snap_region, destroy_regions, + for_witness, }), }); if self.last_compacted_idx == 0 && last_first_index >= RAFT_INIT_LOG_INDEX { @@ -2977,6 +2997,7 @@ where } else { vec![] }; + // Note that the `commit_index` and `commit_term` here may be used to // forward the commit index. So it must be less than or equal to persist // index. @@ -2985,6 +3006,7 @@ where self.raft_group.raft.raft_log.persisted, ); let commit_term = self.get_store().term(commit_index).unwrap(); + let mut apply = Apply::new( self.peer_id(), self.region_id, @@ -3094,6 +3116,9 @@ where "after" => ?peer, ); self.peer = peer; + // TODO: set priority for witness + // self.raft_group + // .set_priority(if self.peer.is_witness { 0 } else { 1 }); }; self.activate(ctx); @@ -3443,7 +3468,7 @@ where } let progress_to_be_updated = self.mut_store().applied_term() != applied_term; - self.mut_store().set_applied_state(apply_state); + self.mut_store().set_apply_state(apply_state); self.mut_store().set_applied_term(applied_term); self.peer_stat.written_keys += apply_metrics.written_keys; @@ -4331,9 +4356,10 @@ where }; let data = req.write_to_bytes()?; - - // TODO: use local histogram metrics - PEER_PROPOSE_LOG_SIZE_HISTOGRAM.observe(data.len() as f64); + poll_ctx + .raft_metrics + .propose_log_size + .observe(data.len() as f64); if data.len() as u64 > poll_ctx.cfg.raft_entry_max_size.0 { error!( @@ -4394,6 +4420,11 @@ where msg: &eraftpb::Message, peer_disk_usage: DiskUsage, ) -> bool { + if self.is_witness() { + // shouldn't transfer leader to witness peer + return true; + } + let pending_snapshot = self.is_handling_snapshot() || self.has_pending_snapshot(); if pending_snapshot || msg.get_from() != self.leader_id() @@ -4646,6 +4677,7 @@ where util::check_conf_change( &ctx.cfg, &self.raft_group, + self.region(), &self.peer, changes.as_ref(), &cc, @@ -4653,8 +4685,7 @@ where )?; 
ctx.raft_metrics.propose.conf_change.inc(); - // TODO: use local histogram metrics - PEER_PROPOSE_LOG_SIZE_HISTOGRAM.observe(data_size as f64); + ctx.raft_metrics.propose_log_size.observe(data_size as f64); info!( "propose conf change peer"; "region_id" => self.region_id, @@ -5011,6 +5042,7 @@ where Some(ForceLeaderState::ForceLeader { .. }) ) } + pub fn unsafe_recovery_maybe_finish_wait_apply(&mut self, force: bool) { if let Some(UnsafeRecoveryState::WaitApply { target_index, .. }) = &self.unsafe_recovery_state @@ -5698,6 +5730,7 @@ mod tests { AdminCmdType::TransferLeader, AdminCmdType::ComputeHash, AdminCmdType::VerifyHash, + AdminCmdType::BatchSwitchWitness, ]; for tp in AdminCmdType::values() { let mut msg = RaftCmdRequest::default(); diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 7f4b6778860..0d10b1f36cf 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -30,14 +30,18 @@ use raft::{ Error as RaftError, GetEntriesContext, RaftState, Ready, Storage, StorageError, }; use tikv_util::{ - box_err, box_try, debug, defer, error, info, time::Instant, warn, worker::Scheduler, + box_err, box_try, debug, defer, error, info, store::find_peer_by_id, time::Instant, warn, + worker::Scheduler, }; use super::{metrics::*, worker::RegionTask, SnapEntry, SnapKey, SnapManager}; use crate::{ store::{ - async_io::write::WriteTask, entry_storage::EntryStorage, fsm::GenSnapTask, - peer::PersistSnapshotResult, util, worker::RaftlogFetchTask, + async_io::{read::ReadTask, write::WriteTask}, + entry_storage::EntryStorage, + fsm::GenSnapTask, + peer::PersistSnapshotResult, + util, }, Error, Result, }; @@ -110,17 +114,21 @@ impl From for RaftError { } } +#[derive(PartialEq, Debug)] +pub struct HandleSnapshotResult { + pub msgs: Vec, + pub snap_region: metapb::Region, + /// The regions whose range are overlapped with this region + pub destroy_regions: Vec, + 
/// The first index before applying the snapshot. + pub last_first_index: u64, + pub for_witness: bool, +} + #[derive(PartialEq, Debug)] pub enum HandleReadyResult { SendIoTask, - Snapshot { - msgs: Vec, - snap_region: metapb::Region, - /// The regions whose range are overlapped with this region - destroy_regions: Vec, - /// The first index before applying the snapshot. - last_first_index: u64, - }, + Snapshot(Box), // use boxing to reduce total size of the enum NoIoTask, } @@ -211,6 +219,7 @@ where pub engines: Engines, peer_id: u64, + peer: Option, // when uninitialized the peer info is unknown. region: metapb::Region, snap_state: RefCell, @@ -218,13 +227,13 @@ where region_scheduler: Scheduler>, snap_tried_cnt: RefCell, - entry_storage: EntryStorage, + entry_storage: EntryStorage, pub tag: String, } impl Deref for PeerStorage { - type Target = EntryStorage; + type Target = EntryStorage; #[inline] fn deref(&self) -> &Self::Target { @@ -286,7 +295,7 @@ where engines: Engines, region: &metapb::Region, region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, peer_id: u64, tag: String, ) -> Result> { @@ -311,6 +320,7 @@ where Ok(PeerStorage { engines, peer_id, + peer: find_peer_by_id(region, peer_id).cloned(), region: region.clone(), snap_state: RefCell::new(SnapState::Relax), gen_snap_task: RefCell::new(None), @@ -351,6 +361,7 @@ where #[inline] pub fn set_region(&mut self, region: metapb::Region) { + self.peer = find_peer_by_id(®ion, self.peer_id).cloned(); self.region = region; } @@ -434,18 +445,33 @@ where } /// Gets a snapshot. Returns `SnapshotTemporarilyUnavailable` if there is no - /// unavailable snapshot. + /// available snapshot. 
pub fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { + if self.peer.as_ref().unwrap().is_witness { + // witness could be the leader for a while, do not generate snapshot now + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); + } + + if find_peer_by_id(&self.region, to).map_or(false, |p| p.is_witness) { + // generate an empty snapshot for witness directly + return Ok(util::new_empty_snapshot( + self.region.clone(), + self.applied_index(), + self.applied_term(), + true, // for witness + )); + } + let mut snap_state = self.snap_state.borrow_mut(); let mut tried_cnt = self.snap_tried_cnt.borrow_mut(); let mut tried = false; let mut last_canceled = false; if let SnapState::Generating { - ref canceled, - ref receiver, - .. - } = *snap_state + canceled, receiver, .. + } = &*snap_state { tried = true; last_canceled = canceled.load(Ordering::SeqCst); @@ -548,7 +574,7 @@ where snap: &Snapshot, task: &mut WriteTask, destroy_regions: &[metapb::Region], - ) -> Result { + ) -> Result<(metapb::Region, bool)> { info!( "begin to apply snapshot"; "region_id" => self.region.get_id(), @@ -558,8 +584,9 @@ where let mut snap_data = RaftSnapshotData::default(); snap_data.merge_from_bytes(snap.get_data())?; - let region_id = self.get_region_id(); + let for_witness = snap_data.get_meta().get_for_witness(); + let region_id = self.get_region_id(); let region = snap_data.take_region(); if region.get_id() != region_id { return Err(box_err!( @@ -594,24 +621,32 @@ where for r in destroy_regions { write_peer_state(kv_wb, r, PeerState::Tombstone, None)?; } - write_peer_state(kv_wb, ®ion, PeerState::Applying, None)?; - let last_index = snap.get_metadata().get_index(); + // Witness snapshot is applied atomically as no async applying operation to + // region worker, so no need to set the peer state to `Applying` + let state = if for_witness { + PeerState::Normal + } else { + PeerState::Applying + }; + write_peer_state(kv_wb, ®ion, state, 
None)?; + + let snap_index = snap.get_metadata().get_index(); + let snap_term = snap.get_metadata().get_term(); - self.raft_state_mut().set_last_index(last_index); - self.set_last_term(snap.get_metadata().get_term()); - self.apply_state_mut().set_applied_index(last_index); - let last_term = self.last_term(); - self.set_applied_term(last_term); + self.raft_state_mut().set_last_index(snap_index); + self.set_last_term(snap_term); + self.apply_state_mut().set_applied_index(snap_index); + self.set_applied_term(snap_term); // The snapshot only contains log which index > applied index, so // here the truncate state's (index, term) is in snapshot metadata. self.apply_state_mut() .mut_truncated_state() - .set_index(last_index); + .set_index(snap_index); self.apply_state_mut() .mut_truncated_state() - .set_term(snap.get_metadata().get_term()); + .set_term(snap_term); // `region` will be updated after persisting. // Although there is an interval that other metadata are updated while `region` @@ -631,7 +666,7 @@ where "state" => ?self.apply_state(), ); - Ok(region) + Ok((region, for_witness)) } /// Delete all meta belong to the region. Results are stored in `wb`. 
@@ -853,20 +888,23 @@ where let mut write_task = WriteTask::new(region_id, self.peer_id, ready.number()); - let mut res = HandleReadyResult::SendIoTask; - if !ready.snapshot().is_empty() { + let mut res = if ready.snapshot().is_empty() { + HandleReadyResult::SendIoTask + } else { fail_point!("raft_before_apply_snap"); let last_first_index = self.first_index().unwrap(); - let snap_region = + let (snap_region, for_witness) = self.apply_snapshot(ready.snapshot(), &mut write_task, &destroy_regions)?; - res = HandleReadyResult::Snapshot { + let res = HandleReadyResult::Snapshot(Box::new(HandleSnapshotResult { msgs: ready.take_persisted_messages(), snap_region, destroy_regions, last_first_index, - }; + for_witness, + })); fail_point!("raft_after_apply_snap"); + res }; if !ready.entries().is_empty() { @@ -927,7 +965,7 @@ where // - After `PrepareMerge` log is committed, the source region leader's lease // will be suspected immediately which makes the local reader not serve read // request. - // - No read request can be responsed in peer fsm during merging. These + // - No read request can be responded in peer fsm during merging. These // conditions are used to prevent reading **stale** data in the past. At // present, they are also used to prevent reading **corrupt** data. for r in &res.destroy_regions { @@ -939,7 +977,14 @@ where } } - self.schedule_applying_snapshot(); + if !res.for_witness { + self.schedule_applying_snapshot(); + } else { + // Bypass apply snapshot process for witness as the snapshot is empty, so mark + // status as finished directly here + let status = Arc::new(AtomicUsize::new(JOB_STATUS_FINISHED)); + self.set_snap_state(SnapState::Applying(Arc::clone(&status))); + } // The `region` is updated after persisting in order to stay consistent with the // one in `StoreMeta::regions` (will be updated soon). 
@@ -1130,27 +1175,28 @@ pub mod tests { Error as RaftError, GetEntriesContext, StorageError, }; use tempfile::{Builder, TempDir}; - use tikv_util::worker::{dummy_scheduler, LazyWorker, Scheduler, Worker}; + use tikv_util::{ + store::{new_peer, new_witness_peer}, + worker::{dummy_scheduler, LazyWorker, Scheduler, Worker}, + }; use super::*; use crate::{ coprocessor::CoprocessorHost, store::{ - async_io::write::write_to_db_for_test, + async_io::{read::ReadRunner, write::write_to_db_for_test}, bootstrap_store, entry_storage::tests::validate_cache, fsm::apply::compact_raft_log, initial_region, prepare_bootstrap_cluster, - worker::{ - make_region_worker_raftstore_cfg, FetchedLogs, LogFetchedNotifier, - RaftlogFetchRunner, RegionRunner, RegionTask, - }, + worker::{make_region_worker_raftstore_cfg, RegionRunner, RegionTask}, + AsyncReadNotifier, FetchedLogs, GenSnapRes, }, }; fn new_storage( region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, path: &TempDir, ) -> PeerStorage { let kv_db = engine_test::kv::new_engine(path.path().to_str().unwrap(), ALL_CFS).unwrap(); @@ -1183,7 +1229,7 @@ pub mod tests { pub fn new_storage_from_ents( region_scheduler: Scheduler>, - raftlog_fetch_scheduler: Scheduler, + raftlog_fetch_scheduler: Scheduler>, path: &TempDir, ents: &[Entry], ) -> PeerStorage { @@ -1378,10 +1424,14 @@ pub mod tests { } } - impl LogFetchedNotifier for TestRouter { - fn notify(&self, _region_id: u64, fetched_logs: FetchedLogs) { + impl AsyncReadNotifier for TestRouter { + fn notify_logs_fetched(&self, _region_id: u64, fetched_logs: FetchedLogs) { self.ch.send(fetched_logs).unwrap(); } + + fn notify_snapshot_generated(&self, _region_id: u64, _res: GenSnapRes) { + unreachable!(); + } } #[test] @@ -1455,7 +1505,7 @@ pub mod tests { let raftlog_fetch_scheduler = raftlog_fetch_worker.scheduler(); let mut store = new_storage_from_ents(region_scheduler, raftlog_fetch_scheduler, &td, &ents); - 
raftlog_fetch_worker.start(RaftlogFetchRunner::new(router, store.engines.raft.clone())); + raftlog_fetch_worker.start(ReadRunner::new(router, store.engines.raft.clone())); store.compact_entry_cache(5); let mut e = store.entries(lo, hi, maxsize, GetEntriesContext::empty(true)); if e == Err(raft::Error::Store( @@ -1564,7 +1614,7 @@ pub mod tests { Option::>::None, ); worker.start_with_timer(runner); - let snap = s.snapshot(0, 0); + let snap = s.snapshot(0, 1); let unavailable = RaftError::Store(StorageError::SnapshotTemporarilyUnavailable); assert_eq!(snap.unwrap_err(), unavailable); assert_eq!(*s.snap_tried_cnt.borrow(), 1); @@ -1588,11 +1638,11 @@ pub mod tests { let (tx, rx) = channel(); s.set_snap_state(gen_snap_for_test(rx)); // Empty channel should cause snapshot call to wait. - assert_eq!(s.snapshot(0, 0).unwrap_err(), unavailable); + assert_eq!(s.snapshot(0, 1).unwrap_err(), unavailable); assert_eq!(*s.snap_tried_cnt.borrow(), 1); tx.send(snap.clone()).unwrap(); - assert_eq!(s.snapshot(0, 0), Ok(snap.clone())); + assert_eq!(s.snapshot(0, 1), Ok(snap.clone())); assert_eq!(*s.snap_tried_cnt.borrow(), 0); let (tx, rx) = channel(); @@ -1633,7 +1683,7 @@ pub mod tests { s.set_snap_state(gen_snap_for_test(rx)); *s.snap_tried_cnt.borrow_mut() = 1; // stale snapshot should be abandoned, snapshot index < truncated index. - assert_eq!(s.snapshot(0, 0).unwrap_err(), unavailable); + assert_eq!(s.snapshot(0, 1).unwrap_err(), unavailable); assert_eq!(*s.snap_tried_cnt.borrow(), 1); let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); @@ -1650,7 +1700,7 @@ pub mod tests { ref s => panic!("unexpected state {:?}", s), } // Disconnected channel should trigger another try. 
- assert_eq!(s.snapshot(0, 0).unwrap_err(), unavailable); + assert_eq!(s.snapshot(0, 1).unwrap_err(), unavailable); let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); generate_and_schedule_snapshot(gen_task, &s.engines, &sched).unwrap_err(); assert_eq!(*s.snap_tried_cnt.borrow(), 2); @@ -1665,13 +1715,13 @@ pub mod tests { } // Scheduled job failed should trigger . - assert_eq!(s.snapshot(0, 0).unwrap_err(), unavailable); + assert_eq!(s.snapshot(0, 1).unwrap_err(), unavailable); let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); generate_and_schedule_snapshot(gen_task, &s.engines, &sched).unwrap_err(); } // When retry too many times, it should report a different error. - match s.snapshot(0, 0) { + match s.snapshot(0, 1) { Err(RaftError::Store(StorageError::Other(_))) => {} res => panic!("unexpected res: {:?}", res), } @@ -1747,6 +1797,80 @@ pub mod tests { test_storage_create_snapshot_for_role("tikv", 5); } + #[test] + fn test_storage_create_snapshot_for_witness() { + let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; + let mut cs = ConfState::default(); + cs.set_voters(vec![1, 2, 3]); + + let td = Builder::new().prefix("tikv-store-test").tempdir().unwrap(); + let snap_dir = Builder::new().prefix("snap_dir").tempdir().unwrap(); + let mgr = SnapManager::new(snap_dir.path().to_str().unwrap()); + let mut worker = Worker::new("region-worker").lazy_build("region-worker"); + let sched = worker.scheduler(); + let (dummy_scheduler, _) = dummy_scheduler(); + let mut s = new_storage_from_ents(sched.clone(), dummy_scheduler, &td, &ents); + let cfg = make_region_worker_raftstore_cfg(true); + let (router, _) = mpsc::sync_channel(100); + let runner = RegionRunner::new( + s.engines.kv.clone(), + mgr, + cfg, + CoprocessorHost::::default(), + router, + Option::>::None, + ); + worker.start_with_timer(runner); + + let mut r = s.region().clone(); + r.mut_peers().push(new_peer(2, 2)); + r.mut_peers().push(new_witness_peer(3, 3)); + + let mut kv_wb = 
s.engines.kv.write_batch(); + write_peer_state(&mut kv_wb, &r, PeerState::Normal, None).unwrap(); + kv_wb.write().unwrap(); + s.set_region(r); + + let wait_snapshot = |snap: raft::Result| -> Snapshot { + if let Ok(s) = snap { + return s; + } + let unavailable = RaftError::Store(StorageError::SnapshotTemporarilyUnavailable); + assert_eq!(snap.unwrap_err(), unavailable); + assert_eq!(*s.snap_tried_cnt.borrow(), 1); + let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); + generate_and_schedule_snapshot(gen_task, &s.engines, &sched).unwrap(); + let snap = match *s.snap_state.borrow() { + SnapState::Generating { ref receiver, .. } => { + receiver.recv_timeout(Duration::from_secs(3)).unwrap() + } + ref s => panic!("unexpected state: {:?}", s), + }; + snap + }; + + // generate snapshot for peer + let snap = wait_snapshot(s.snapshot(0, 2)); + assert_eq!(snap.get_metadata().get_index(), 5); + assert_eq!(snap.get_metadata().get_term(), 5); + assert!(!snap.get_data().is_empty()); + + // generate snapshot for witness peer + let snap = wait_snapshot(s.snapshot(0, 3)); + assert_eq!(snap.get_metadata().get_index(), 5); + assert_eq!(snap.get_metadata().get_term(), 5); + assert!(!snap.get_data().is_empty()); + + let mut data = RaftSnapshotData::default(); + protobuf::Message::merge_from_bytes(&mut data, snap.get_data()).unwrap(); + assert_eq!(data.get_region().get_id(), 1); + assert_eq!(data.get_region().get_peers().len(), 3); + let files = data.get_meta().get_cf_files(); + for file in files { + assert_eq!(file.get_size(), 0); + } + } + #[test] fn test_storage_apply_snapshot() { let ents = vec![ @@ -1776,7 +1900,7 @@ pub mod tests { Option::>::None, ); worker.start(runner); - s1.snapshot(0, 0).unwrap_err(); + s1.snapshot(0, 1).unwrap_err(); let gen_task = s1.gen_snap_task.borrow_mut().take().unwrap(); generate_and_schedule_snapshot(gen_task, &s1.engines, &sched).unwrap(); @@ -1794,7 +1918,7 @@ pub mod tests { let mut s2 = new_storage(sched.clone(), dummy_scheduler.clone(), 
&td2); assert_eq!(s2.first_index(), Ok(s2.applied_index() + 1)); let mut write_task = WriteTask::new(s2.get_region_id(), s2.peer_id, 1); - let snap_region = s2.apply_snapshot(&snap1, &mut write_task, &[]).unwrap(); + let (snap_region, _) = s2.apply_snapshot(&snap1, &mut write_task, &[]).unwrap(); let mut snap_data = RaftSnapshotData::default(); snap_data.merge_from_bytes(snap1.get_data()).unwrap(); assert_eq!(snap_region, snap_data.take_region(),); @@ -1811,7 +1935,7 @@ pub mod tests { let mut s3 = new_storage_from_ents(sched, dummy_scheduler, &td3, ents); validate_cache(&s3, &ents[1..]); let mut write_task = WriteTask::new(s3.get_region_id(), s3.peer_id, 1); - let snap_region = s3.apply_snapshot(&snap1, &mut write_task, &[]).unwrap(); + let (snap_region, _) = s3.apply_snapshot(&snap1, &mut write_task, &[]).unwrap(); let mut snap_data = RaftSnapshotData::default(); snap_data.merge_from_bytes(snap1.get_data()).unwrap(); assert_eq!(snap_region, snap_data.take_region(),); diff --git a/components/raftstore/src/store/region_meta.rs b/components/raftstore/src/store/region_meta.rs index 0370c7604ec..7de687e9dbb 100644 --- a/components/raftstore/src/store/region_meta.rs +++ b/components/raftstore/src/store/region_meta.rs @@ -60,7 +60,7 @@ pub struct RaftHardState { pub commit: u64, } -#[derive(Debug, Copy, Clone, Serialize, Deserialize)] +#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq)] pub enum RaftStateRole { Follower, Candidate, @@ -178,12 +178,27 @@ pub struct RegionPeer { pub id: u64, pub store_id: u64, pub role: RaftPeerRole, + pub is_witness: bool, } impl PartialEq for RegionPeer { #[inline] fn eq(&self, other: &metapb::Peer) -> bool { - self.id == other.id && self.store_id == other.store_id && self.role == other.role + // May not be sufficent, but always correct. 
+ let s: metapb::Peer = (*self).into(); + s == *other + } +} + +impl From for metapb::Peer { + fn from(p: RegionPeer) -> Self { + metapb::Peer { + id: p.id, + store_id: p.store_id, + role: p.role.into(), + is_witness: p.is_witness, + ..Default::default() + } } } @@ -247,6 +262,7 @@ impl RegionMeta { id: peer.get_id(), store_id: peer.get_store_id(), role: peer.get_role().into(), + is_witness: peer.is_witness, }); } let merge_state = if local_state.has_merge_state() { diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index eda0ffaa9cb..d564bcb17e0 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -4,6 +4,7 @@ use std::{ cmp::{self, Ordering as CmpOrdering, Reverse}, error::Error as StdError, fmt::{self, Display, Formatter}, + fs, io::{self, ErrorKind, Read, Write}, path::{Path, PathBuf}, result, str, @@ -56,6 +57,7 @@ pub const SNAPSHOT_CFS_ENUM_PAIR: &[(CfNames, CfName)] = &[ (CfNames::write, CF_WRITE), ]; pub const SNAPSHOT_VERSION: u64 = 2; +pub const TABLET_SNAPSHOT_VERSION: u64 = 3; pub const IO_LIMITER_CHUNK_SIZE: usize = 4 * 1024; /// Name prefix for the self-generated snapshot file. 
@@ -205,7 +207,7 @@ fn retry_delete_snapshot(mgr: &SnapManagerCore, key: &SnapKey, snap: &Snapshot) false } -fn gen_snapshot_meta(cf_files: &[CfFile], for_balance: bool) -> RaftStoreResult { +pub fn gen_snapshot_meta(cf_files: &[CfFile], for_balance: bool) -> RaftStoreResult { let mut meta = Vec::with_capacity(cf_files.len()); for cf_file in cf_files { if !SNAPSHOT_CFS.iter().any(|cf| cf_file.cf == *cf) { @@ -557,7 +559,7 @@ impl Snapshot { for (i, file_path) in file_paths.iter().enumerate() { if cf_file.size[i] > 0 { let path = Path::new(file_path); - let file = File::open(&path)?; + let file = File::open(path)?; cf_file .file_for_sending .push(Box::new(file) as Box); @@ -600,7 +602,7 @@ impl Snapshot { let f = OpenOptions::new() .write(true) .create_new(true) - .open(&file_path)?; + .open(file_path)?; cf_file.file_for_recving.push(CfFileForRecving { file: f, encrypter: None, @@ -788,7 +790,7 @@ impl Snapshot { if !for_send && !plain_file_used(cf_file.cf) { sst_importer::prepare_sst_for_ingestion( file_path, - &Path::new(&clone_file_paths[i]), + Path::new(&clone_file_paths[i]), self.mgr.encryption_key_manager.as_deref(), )?; } @@ -811,7 +813,7 @@ impl Snapshot { } // Only called in `do_build`. 
- fn save_meta_file(&mut self) -> RaftStoreResult<()> { + pub fn save_meta_file(&mut self) -> RaftStoreResult<()> { let v = box_try!(self.meta_file.meta.as_ref().unwrap().write_to_bytes()); if let Some(mut f) = self.meta_file.file.take() { // `meta_file` could be None for this case: in `init_for_building` the snapshot @@ -972,7 +974,7 @@ impl Snapshot { } else { // delete snapshot files according to meta file for clone_file_path in clone_file_paths { - delete_file_if_exist(&clone_file_path).unwrap(); + delete_file_if_exist(clone_file_path).unwrap(); } } @@ -983,7 +985,7 @@ impl Snapshot { try_delete_snapshot_files!(cf_file, gen_tmp_file_name); } else { for tmp_file_path in tmp_file_paths { - delete_file_if_exist(&tmp_file_path).unwrap(); + delete_file_if_exist(tmp_file_path).unwrap(); } } } @@ -994,7 +996,7 @@ impl Snapshot { try_delete_snapshot_files!(cf_file); } else { for file_path in &file_paths { - delete_file_if_exist(&file_path).unwrap(); + delete_file_if_exist(file_path).unwrap(); } if let Some(ref mgr) = self.mgr.encryption_key_manager { for file_path in &file_paths { @@ -1047,7 +1049,7 @@ impl Snapshot { snap_data.set_version(SNAPSHOT_VERSION); snap_data.set_meta(self.meta_file.meta.as_ref().unwrap().clone()); - SNAPSHOT_BUILD_TIME_HISTOGRAM.observe(duration_to_sec(t.saturating_elapsed()) as f64); + SNAPSHOT_BUILD_TIME_HISTOGRAM.observe(duration_to_sec(t.saturating_elapsed())); SNAPSHOT_KV_COUNT_HISTOGRAM.observe(total_count as f64); SNAPSHOT_SIZE_HISTOGRAM.observe(total_size as f64); info!( @@ -1115,7 +1117,7 @@ impl Snapshot { || (cf_file .file_paths() .iter() - .all(|file_path| file_exists(&Path::new(file_path)))) + .all(|file_path| file_exists(Path::new(file_path)))) }) && file_exists(&self.meta_file.path) } @@ -1123,6 +1125,10 @@ impl Snapshot { file_system::metadata(&self.meta_file.path) } + pub fn meta_path(&self) -> &PathBuf { + &self.meta_file.path + } + pub fn total_size(&self) -> u64 { self.cf_files .iter() @@ -1184,7 +1190,7 @@ impl Snapshot { 
let tmp_paths = cf_file.tmp_file_paths(); let paths = cf_file.file_paths(); for (i, tmp_path) in tmp_paths.iter().enumerate() { - file_system::rename(&tmp_path, &paths[i])?; + file_system::rename(tmp_path, &paths[i])?; } } sync_dir(&self.dir_path)?; @@ -1488,7 +1494,7 @@ impl SnapManager { "{}_{}{}{}", DEL_RANGE_PREFIX, sst_id, SST_FILE_SUFFIX, TMP_FILE_SUFFIX ); - let path = PathBuf::from(&self.core.base).join(&filename); + let path = PathBuf::from(&self.core.base).join(filename); path.to_str().unwrap().to_string() } @@ -1802,7 +1808,7 @@ impl SnapManagerCore { } r?; } else { - file_system::rename(&tmp_file_path, &file_paths[i])?; + file_system::rename(tmp_file_path, &file_paths[i])?; } let file = Path::new(&file_paths[i]); let (checksum, size) = calc_checksum_and_size(file, mgr)?; @@ -1886,6 +1892,104 @@ impl SnapManagerBuilder { } } +#[derive(Clone, Hash, PartialEq, Eq, PartialOrd, Ord, Debug)] +pub struct TabletSnapKey { + pub region_id: u64, + pub to_peer: u64, + pub term: u64, + pub idx: u64, +} + +impl TabletSnapKey { + #[inline] + pub fn new(region_id: u64, to_peer: u64, term: u64, idx: u64) -> TabletSnapKey { + TabletSnapKey { + region_id, + to_peer, + term, + idx, + } + } + + pub fn from_region_snap(region_id: u64, to_peer: u64, snap: &RaftSnapshot) -> TabletSnapKey { + let index = snap.get_metadata().get_index(); + let term = snap.get_metadata().get_term(); + TabletSnapKey::new(region_id, to_peer, term, index) + } +} + +impl Display for TabletSnapKey { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!( + f, + "{}_{}_{}_{}", + self.region_id, self.to_peer, self.term, self.idx + ) + } +} + +/// `TabletSnapManager` manager tablet snapshot and shared between raftstore v2. +/// It's similar `SnapManager`, but simpler in tablet version. +/// +/// TODO: +/// - clean up expired tablet checkpointer +#[derive(Clone)] +pub struct TabletSnapManager { + // directory to store snapfile. 
+ base: String, +} + +impl TabletSnapManager { + pub fn new>(path: T) -> Self { + Self { base: path.into() } + } + + pub fn init(&self) -> io::Result<()> { + // Initialize the directory if it doesn't exist. + let path = Path::new(&self.base); + if !path.exists() { + file_system::create_dir_all(path)?; + return Ok(()); + } + if !path.is_dir() { + return Err(io::Error::new( + ErrorKind::Other, + format!("{} should be a directory", path.display()), + )); + } + Ok(()) + } + + pub fn tablet_gen_path(&self, key: &TabletSnapKey) -> PathBuf { + let prefix = format!("{}_{}", SNAP_GEN_PREFIX, key); + PathBuf::from(&self.base).join(prefix) + } + + pub fn final_recv_path(&self, key: &TabletSnapKey) -> PathBuf { + let prefix = format!("{}_{}", SNAP_REV_PREFIX, key); + PathBuf::from(&self.base).join(prefix) + } + + pub fn tmp_recv_path(&self, key: &TabletSnapKey) -> PathBuf { + let prefix = format!("{}_{}{}", SNAP_REV_PREFIX, key, TMP_FILE_SUFFIX); + PathBuf::from(&self.base).join(prefix) + } + + pub fn delete_snapshot(&self, key: &TabletSnapKey) -> bool { + let path = self.tablet_gen_path(key); + if path.exists() && let Err(e) = fs::remove_dir_all(path.as_path()) { + error!( + "delete snapshot failed"; + "path" => %path.display(), + "err" => ?e, + ); + false + } else { + true + } + } +} + #[cfg(test)] pub mod tests { use std::{ diff --git a/components/raftstore/src/store/snap/io.rs b/components/raftstore/src/store/snap/io.rs index 31bf3156c58..3cdee1e40f1 100644 --- a/components/raftstore/src/store/snap/io.rs +++ b/components/raftstore/src/store/snap/io.rs @@ -152,7 +152,7 @@ where Ok(new_sst_writer) => { let old_writer = sst_writer.replace(new_sst_writer); box_try!(old_writer.finish()); - box_try!(File::open(&prev_path).and_then(|f| f.sync_all())); + box_try!(File::open(prev_path).and_then(|f| f.sync_all())); } Err(e) => { let io_error = io::Error::new(io::ErrorKind::Other, e); diff --git a/components/raftstore/src/store/transport.rs 
b/components/raftstore/src/store/transport.rs index 19b825ac20c..7f10e7cd249 100644 --- a/components/raftstore/src/store/transport.rs +++ b/components/raftstore/src/store/transport.rs @@ -8,7 +8,7 @@ use engine_traits::{KvEngine, RaftEngine, Snapshot}; use kvproto::raft_serverpb::RaftMessage; use tikv_util::{error, warn}; -use super::worker::{FetchedLogs, LogFetchedNotifier}; +use super::{AsyncReadNotifier, FetchedLogs, GenSnapRes}; use crate::{ store::{CasualMessage, PeerMsg, RaftCommand, RaftRouter, SignificantMsg, StoreMsg}, DiscardReason, Error, Result, @@ -173,10 +173,15 @@ where } } -impl LogFetchedNotifier for RaftRouter { +impl AsyncReadNotifier for RaftRouter { #[inline] - fn notify(&self, region_id: u64, fetched: FetchedLogs) { + fn notify_logs_fetched(&self, region_id: u64, fetched: FetchedLogs) { // Ignore region not found as it may be removed. let _ = self.significant_send(region_id, SignificantMsg::RaftlogFetched(fetched)); } + + #[inline] + fn notify_snapshot_generated(&self, _region_id: u64, _snapshot: GenSnapRes) { + unreachable!() + } } diff --git a/components/raftstore/src/store/txn_ext.rs b/components/raftstore/src/store/txn_ext.rs index 1270ae104c9..ccc4027e9d1 100644 --- a/components/raftstore/src/store/txn_ext.rs +++ b/components/raftstore/src/store/txn_ext.rs @@ -322,8 +322,10 @@ mod tests { primary: primary.to_vec().into_boxed_slice(), start_ts: 100.into(), ttl: 3000, - for_update_ts: 100.into(), - min_commit_ts: Default::default(), + for_update_ts: 110.into(), + min_commit_ts: 110.into(), + last_change_ts: 105.into(), + versions_to_last_change: 2, } } @@ -424,6 +426,8 @@ mod tests { ttl: 1000, for_update_ts: 10.into(), min_commit_ts: 20.into(), + last_change_ts: 5.into(), + versions_to_last_change: 2, }, deleted, ), diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 9f49730e1d0..5f78065d32b 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ 
-20,20 +20,25 @@ use kvproto::{ kvrpcpb::{self, KeyRange, LeaderInfo}, metapb::{self, Peer, PeerRole, Region, RegionEpoch}, raft_cmdpb::{AdminCmdType, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest}, - raft_serverpb::RaftMessage, + raft_serverpb::{RaftMessage, RaftSnapshotData}, }; use protobuf::{self, Message}; use raft::{ - eraftpb::{self, ConfChangeType, ConfState, MessageType}, + eraftpb::{self, ConfChangeType, ConfState, MessageType, Snapshot}, Changer, RawNode, INVALID_INDEX, }; use raft_proto::ConfChangeI; -use tikv_util::{box_err, debug, info, store::region, time::monotonic_raw_now, Either}; +use tikv_util::{ + box_err, debug, info, + store::{find_peer_by_id, region}, + time::monotonic_raw_now, + Either, +}; use time::{Duration, Timespec}; use txn_types::{TimeStamp, WriteBatchFlags}; use super::{metrics::PEER_ADMIN_CMD_COUNTER_VEC, peer_storage, Config}; -use crate::{coprocessor::CoprocessorHost, Error, Result}; +use crate::{coprocessor::CoprocessorHost, store::snap::SNAPSHOT_VERSION, Error, Result}; const INVALID_TIMESTAMP: u64 = u64::MAX; @@ -125,6 +130,27 @@ pub fn is_initial_msg(msg: &eraftpb::Message) -> bool { || (msg_type == MessageType::MsgHeartbeat && msg.get_commit() == INVALID_INDEX) } +pub fn new_empty_snapshot( + region: Region, + applied_index: u64, + applied_term: u64, + for_witness: bool, +) -> Snapshot { + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().set_index(applied_index); + snapshot.mut_metadata().set_term(applied_term); + snapshot + .mut_metadata() + .set_conf_state(conf_state_from_region(®ion)); + let mut snap_data = RaftSnapshotData::default(); + snap_data.set_region(region); + snap_data.set_file_size(0); + snap_data.set_version(SNAPSHOT_VERSION); + snap_data.mut_meta().set_for_witness(for_witness); + snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); + snapshot +} + const STR_CONF_CHANGE_ADD_NODE: &str = "AddNode"; const STR_CONF_CHANGE_REMOVE_NODE: &str = "RemoveNode"; const 
STR_CONF_CHANGE_ADDLEARNER_NODE: &str = "AddLearner"; @@ -200,6 +226,7 @@ pub fn admin_cmd_epoch_lookup(admin_cmp_type: AdminCmdType) -> AdminCmdEpochStat AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { AdminCmdEpochState::new(true, true, false, false) } + AdminCmdType::BatchSwitchWitness => unimplemented!(), } } @@ -742,29 +769,35 @@ pub fn conf_state_from_region(region: &metapb::Region) -> ConfState { pub struct KeysInfoFormatter< 'a, - I: std::iter::DoubleEndedIterator> - + std::iter::ExactSizeIterator> + T: 'a + AsRef<[u8]>, + I: std::iter::DoubleEndedIterator + + std::iter::ExactSizeIterator + Clone, >(pub I); impl< 'a, - I: std::iter::DoubleEndedIterator> - + std::iter::ExactSizeIterator> + T: 'a + AsRef<[u8]>, + I: std::iter::DoubleEndedIterator + + std::iter::ExactSizeIterator + Clone, -> fmt::Display for KeysInfoFormatter<'a, I> +> fmt::Display for KeysInfoFormatter<'a, T, I> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut it = self.0.clone(); match it.len() { 0 => write!(f, "(no key)"), - 1 => write!(f, "key {}", log_wrappers::Value::key(it.next().unwrap())), + 1 => write!( + f, + "key {}", + log_wrappers::Value::key(it.next().unwrap().as_ref()) + ), _ => write!( f, "{} keys range from {} to {}", it.len(), - log_wrappers::Value::key(it.next().unwrap()), - log_wrappers::Value::key(it.next_back().unwrap()) + log_wrappers::Value::key(it.next().unwrap().as_ref()), + log_wrappers::Value::key(it.next_back().unwrap().as_ref()) ), } } @@ -863,6 +896,7 @@ impl<'a> ChangePeerI for &'a ChangePeerV2Request { pub fn check_conf_change( cfg: &Config, node: &RawNode, + region: &metapb::Region, leader: &metapb::Peer, change_peers: &[ChangePeerRequest], cc: &impl ConfChangeI, @@ -909,6 +943,18 @@ pub fn check_conf_change( } } + if region + .get_peers() + .iter() + .find(|p| p.get_id() == peer.get_id()) + .map_or(false, |p| p.get_is_witness() != peer.get_is_witness()) + { + return Err(box_err!( + "invalid conf change request: {:?}, 
can not switch witness in conf change", + cp + )); + } + if !check_dup.insert(peer.get_id()) { return Err(box_err!( "have multiple commands for the same peer {}", @@ -1116,9 +1162,19 @@ pub struct RegionReadProgress { } impl RegionReadProgress { - pub fn new(region: &Region, applied_index: u64, cap: usize, tag: String) -> RegionReadProgress { + pub fn new( + region: &Region, + applied_index: u64, + cap: usize, + peer_id: u64, + ) -> RegionReadProgress { RegionReadProgress { - core: Mutex::new(RegionReadProgressCore::new(region, applied_index, cap, tag)), + core: Mutex::new(RegionReadProgressCore::new( + region, + applied_index, + cap, + peer_id, + )), safe_ts: AtomicU64::from(0), } } @@ -1215,11 +1271,11 @@ impl RegionReadProgress { } // Dump the `LeaderInfo` and the peer list - pub fn dump_leader_info(&self) -> (Vec, LeaderInfo) { + pub fn dump_leader_info(&self) -> (LeaderInfo, Option) { let core = self.core.lock().unwrap(); ( - core.get_local_leader_info().peers.clone(), core.get_leader_info(), + core.get_local_leader_info().leader_store_id, ) } @@ -1231,6 +1287,8 @@ impl RegionReadProgress { core.leader_info.epoch = region.get_region_epoch().clone(); core.leader_info.peers = region.get_peers().to_vec(); } + core.leader_info.leader_store_id = + find_store_id(&core.leader_info.peers, core.leader_info.leader_id) } /// Reset `safe_ts` to 0 and stop updating it @@ -1275,7 +1333,7 @@ impl RegionReadProgress { #[derive(Debug)] pub struct RegionReadProgressCore { - tag: String, + peer_id: u64, region_id: u64, applied_index: u64, // A wrapper of `(apply_index, safe_ts)` item, where the `read_state.ts` is the peer's current @@ -1308,6 +1366,7 @@ pub struct ReadState { pub struct LocalLeaderInfo { leader_id: u64, leader_term: u64, + leader_store_id: Option, epoch: RegionEpoch, peers: Vec, } @@ -1317,6 +1376,7 @@ impl LocalLeaderInfo { LocalLeaderInfo { leader_id: raft::INVALID_ID, leader_term: 0, + leader_store_id: None, epoch: region.get_region_epoch().clone(), peers: 
region.get_peers().to_vec(), } @@ -1329,20 +1389,40 @@ impl LocalLeaderInfo { pub fn get_leader_id(&self) -> u64 { self.leader_id } + + pub fn get_leader_store_id(&self) -> Option { + self.leader_store_id + } +} + +fn find_store_id(peer_list: &[Peer], peer_id: u64) -> Option { + for peer in peer_list { + if peer.id == peer_id { + return Some(peer.store_id); + } + } + None } impl RegionReadProgressCore { - fn new(region: &Region, applied_index: u64, cap: usize, tag: String) -> RegionReadProgressCore { + fn new( + region: &Region, + applied_index: u64, + cap: usize, + peer_id: u64, + ) -> RegionReadProgressCore { + // forbids stale read for witness + let is_witness = find_peer_by_id(region, peer_id).map_or(false, |p| p.is_witness); RegionReadProgressCore { - tag, + peer_id, region_id: region.get_id(), applied_index, read_state: ReadState::default(), leader_info: LocalLeaderInfo::new(region), pending_items: VecDeque::with_capacity(cap), last_merge_index: 0, - pause: false, - discard: false, + pause: is_witness, + discard: is_witness, } } @@ -1357,10 +1437,11 @@ impl RegionReadProgressCore { self.read_state.ts = cmp::min(source_safe_ts, target_safe_ts); info!( "reset safe_ts due to merge"; - "tag" => &self.tag, "source_safe_ts" => source_safe_ts, "target_safe_ts" => target_safe_ts, "safe_ts" => self.read_state.ts, + "region_id" => self.region_id, + "peer_id" => self.peer_id, ); if self.read_state.ts != target_safe_ts { Some(self.read_state.ts) @@ -1444,7 +1525,6 @@ impl RegionReadProgressCore { } pub fn get_leader_info(&self) -> LeaderInfo { - let mut leader_info = LeaderInfo::default(); let read_state = { // Get the latest `read_state` let ReadState { idx, ts } = self.pending_items.back().unwrap_or(&self.read_state); @@ -1454,12 +1534,15 @@ impl RegionReadProgressCore { rs }; let li = &self.leader_info; - leader_info.set_peer_id(li.leader_id); - leader_info.set_term(li.leader_term); - leader_info.set_region_id(self.region_id); - 
leader_info.set_region_epoch(li.epoch.clone()); - leader_info.set_read_state(read_state); - leader_info + LeaderInfo { + peer_id: li.leader_id, + region_id: self.region_id, + term: li.leader_term, + region_epoch: protobuf::SingularPtrField::some(li.epoch.clone()), + read_state: protobuf::SingularPtrField::some(read_state), + unknown_fields: protobuf::UnknownFields::default(), + cached_size: protobuf::CachedSize::default(), + } } pub fn get_local_leader_info(&self) -> &LocalLeaderInfo { @@ -2071,7 +2154,7 @@ mod tests { } let cap = 10; - let rrp = RegionReadProgress::new(&Default::default(), 10, cap, "".to_owned()); + let rrp = RegionReadProgress::new(&Default::default(), 10, cap, 1); for i in 1..=20 { rrp.update_safe_ts(i, i); } diff --git a/components/raftstore/src/store/worker/check_leader.rs b/components/raftstore/src/store/worker/check_leader.rs index 696caab7d69..ab83752d8c3 100644 --- a/components/raftstore/src/store/worker/check_leader.rs +++ b/components/raftstore/src/store/worker/check_leader.rs @@ -157,7 +157,7 @@ mod tests { region.set_start_key(kr.get_start_key().to_vec()); region.set_end_key(kr.get_end_key().to_vec()); region.set_peers(vec![kvproto::metapb::Peer::default()].into()); - let rrp = RegionReadProgress::new(®ion, 1, 1, "".to_owned()); + let rrp = RegionReadProgress::new(®ion, 1, 1, 1); rrp.update_safe_ts(1, safe_ts); assert_eq!(rrp.safe_ts(), safe_ts); meta.region_ranges.insert(enc_end_key(®ion), id); diff --git a/components/raftstore/src/store/worker/metrics.rs b/components/raftstore/src/store/worker/metrics.rs index 7a680e4d7a6..5861e27a508 100644 --- a/components/raftstore/src/store/worker/metrics.rs +++ b/components/raftstore/src/store/worker/metrics.rs @@ -56,6 +56,9 @@ make_static_metric! 
{ channel_full, cache_miss, safe_ts, + witness, + flashback_not_prepared, + flashback_in_progress, } pub struct LocalReadRejectCounter : LocalIntCounter { diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index 4335369c3cb..e021651ba3d 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -6,9 +6,8 @@ mod cleanup_snapshot; mod cleanup_sst; mod compact; mod consistency_check; -mod metrics; +pub mod metrics; mod pd; -mod raftlog_fetch; mod raftlog_gc; mod read; mod refresh_config; @@ -26,14 +25,10 @@ pub use self::{ cleanup_sst::{Runner as CleanupSstRunner, Task as CleanupSstTask}, compact::{Runner as CompactRunner, Task as CompactTask}, consistency_check::{Runner as ConsistencyCheckRunner, Task as ConsistencyCheckTask}, - metrics::TLS_LOCAL_READ_METRICS, pd::{ new_change_peer_v2_request, FlowStatistics, FlowStatsReporter, HeartbeatTask, Runner as PdRunner, Task as PdTask, }, - raftlog_fetch::{ - FetchedLogs, LogFetchedNotifier, Runner as RaftlogFetchRunner, Task as RaftlogFetchTask, - }, raftlog_gc::{Runner as RaftlogGcRunner, Task as RaftlogGcTask}, read::{ CachedReadDelegate, LocalReadContext, LocalReader, LocalReaderCore, diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index ec06d756fe9..fdfa1b44c85 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -53,6 +53,7 @@ use yatp::Remote; use crate::{ coprocessor::CoprocessorHost, + router::RaftStoreRouter, store::{ cmd_resp::new_error, metrics::*, @@ -151,7 +152,7 @@ where Heartbeat(HeartbeatTask), StoreHeartbeat { stats: pdpb::StoreStats, - store_info: StoreInfo, + store_info: Option>, report: Option, dr_autosync_status: Option, }, @@ -204,6 +205,9 @@ pub struct StoreStat { pub engine_last_total_bytes_read: u64, pub engine_last_total_keys_read: u64, pub engine_last_query_num: QueryStats, + pub 
engine_last_capacity_size: u64, + pub engine_last_used_size: u64, + pub engine_last_available_size: u64, pub last_report_ts: UnixSecs, pub region_bytes_read: LocalHistogram, @@ -229,6 +233,9 @@ impl Default for StoreStat { engine_total_keys_read: 0, engine_last_total_bytes_read: 0, engine_last_total_keys_read: 0, + engine_last_capacity_size: 0, + engine_last_used_size: 0, + engine_last_available_size: 0, engine_total_query_num: QueryStats::default(), engine_last_query_num: QueryStats::default(), @@ -733,6 +740,9 @@ fn hotspot_query_num_report_threshold() -> u64 { HOTSPOT_QUERY_RATE_THRESHOLD * 10 } +/// Max limitation of delayed store_heartbeat. +const STORE_HEARTBEAT_DELAY_LIMIT: u64 = 5 * 60; + // Slow score is a value that represents the speed of a store and ranges in [1, // 100]. It is maintained in the AIMD way. // If there are some inspecting requests timeout during a round, by default the @@ -829,6 +839,10 @@ impl SlowScore { self.last_update_time = Instant::now(); self.value } + + fn should_force_report_slow_store(&self) -> bool { + self.value >= OrderedFloat(100.0) && (self.last_tick_id % self.round_ticks == 0) + } } // RegionCpuMeteringCollector is used to collect the region-related CPU info. @@ -883,6 +897,7 @@ where // calls Runner's run() on Task received. 
scheduler: Scheduler>, stats_monitor: StatsMonitor, + store_heartbeat_interval: Duration, collector_reg_handle: CollectorRegHandle, region_cpu_records_collector: Option, @@ -958,6 +973,7 @@ where store_stat: StoreStat::default(), start_ts: UnixSecs::now(), scheduler, + store_heartbeat_interval, stats_monitor, collector_reg_handle, region_cpu_records_collector, @@ -1176,7 +1192,7 @@ where fn handle_store_heartbeat( &mut self, mut stats: pdpb::StoreStats, - store_info: StoreInfo, + store_info: Option>, store_report: Option, dr_autosync_status: Option, ) { @@ -1207,13 +1223,27 @@ where } stats = collect_report_read_peer_stats(HOTSPOT_REPORT_CAPACITY, report_peers, stats); - let (capacity, used_size, available) = match collect_engine_size( - &self.coprocessor_host, - Some(&store_info), - self.snap_mgr.get_total_snap_size().unwrap(), - ) { - Some((capacity, used_size, available)) => (capacity, used_size, available), - None => return, + let (capacity, used_size, available) = if store_info.is_some() { + match collect_engine_size( + &self.coprocessor_host, + store_info.as_ref(), + self.snap_mgr.get_total_snap_size().unwrap(), + ) { + Some((capacity, used_size, available)) => { + // Update last reported infos on engine_size. 
+ self.store_stat.engine_last_capacity_size = capacity; + self.store_stat.engine_last_used_size = used_size; + self.store_stat.engine_last_available_size = available; + (capacity, used_size, available) + } + None => return, + } + } else { + ( + self.store_stat.engine_last_capacity_size, + self.store_stat.engine_last_used_size, + self.store_stat.engine_last_available_size, + ) }; stats.set_capacity(capacity); @@ -1251,7 +1281,14 @@ where self.store_stat .engine_last_query_num .fill_query_stats(&self.store_stat.engine_total_query_num); - self.store_stat.last_report_ts = UnixSecs::now(); + self.store_stat.last_report_ts = if store_info.is_some() { + UnixSecs::now() + } else { + // If `store_info` is None, the given Task::StoreHeartbeat should be a fake + // heartbeat to PD, we won't update the last_report_ts to avoid incorrectly + // marking current TiKV node in normal state. + self.store_stat.last_report_ts + }; self.store_stat.region_bytes_written.flush(); self.store_stat.region_keys_written.flush(); self.store_stat.region_bytes_read.flush(); @@ -1338,6 +1375,14 @@ where } } } + // Forcely awaken all hibernated regions if there existed slow stores in this + // cluster. + if let Some(awaken_regions) = resp.awaken_regions.take() { + info!("forcely awaken hibernated regions in this store"); + let _ = router.send_store_msg(StoreMsg::AwakenRegions { + abnormal_stores: awaken_regions.get_abnormal_stores().to_vec(), + }); + } } Err(e) => { error!("store heartbeat failed"; "err" => ?e); @@ -1786,6 +1831,55 @@ where health_service.set_serving_status("", status); } } + + /// Force to send a special heartbeat to pd when current store is hung on + /// some special circumstances, i.e. disk busy, handler busy and others. 
+ fn handle_fake_store_heartbeat(&mut self) { + let mut stats = pdpb::StoreStats::default(); + stats.set_store_id(self.store_id); + stats.set_region_count(self.region_peers.len() as u32); + + let snap_stats = self.snap_mgr.stats(); + stats.set_sending_snap_count(snap_stats.sending_count as u32); + stats.set_receiving_snap_count(snap_stats.receiving_count as u32); + STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC + .with_label_values(&["sending"]) + .set(snap_stats.sending_count as i64); + STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC + .with_label_values(&["receiving"]) + .set(snap_stats.receiving_count as i64); + + stats.set_start_time(self.start_ts.into_inner() as u32); + + // This calling means that the current node cannot report heartbeat in normaly + // scheduler. That is, the current node must in `busy` state. + stats.set_is_busy(true); + + // We do not need to report store_info, so we just set `None` here. + let task = Task::StoreHeartbeat { + stats, + store_info: None, + report: None, + dr_autosync_status: None, + }; + if let Err(e) = self.scheduler.schedule(task) { + error!("force report store heartbeat failed"; + "store_id" => self.store_id, + "err" => ?e + ); + } else { + warn!("scheduling store_heartbeat timeout, force report store slow score to pd."; + "store_id" => self.store_id, + ); + } + } + + fn is_store_heartbeat_delayed(&self) -> bool { + let now = UnixSecs::now(); + let interval_second = now.into_inner() - self.store_stat.last_report_ts.into_inner(); + (interval_second >= self.store_heartbeat_interval.as_secs()) + && (interval_second <= STORE_HEARTBEAT_DELAY_LIMIT) + } } fn calculate_region_cpu_records( @@ -2065,6 +2159,13 @@ where } if !self.slow_score.last_tick_finished { self.slow_score.record_timeout(); + // If the last slow_score already reached abnormal state and was delayed for + // reporting by `store-heartbeat` to PD, we should report it here manually as + // a FAKE `store-heartbeat`. 
+ if self.slow_score.should_force_report_slow_store() && self.is_store_heartbeat_delayed() + { + self.handle_fake_store_heartbeat(); + } } let scheduler = self.scheduler.clone(); let id = self.slow_score.last_tick_id + 1; diff --git a/components/raftstore/src/store/worker/raftlog_fetch.rs b/components/raftstore/src/store/worker/raftlog_fetch.rs deleted file mode 100644 index b3de87f7715..00000000000 --- a/components/raftstore/src/store/worker/raftlog_fetch.rs +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. - -use std::fmt; - -use engine_traits::RaftEngine; -use fail::fail_point; -use raft::GetEntriesContext; -use tikv_util::worker::Runnable; - -use crate::store::{RaftlogFetchResult, MAX_INIT_ENTRY_COUNT}; - -pub enum Task { - PeerStorage { - region_id: u64, - context: GetEntriesContext, - low: u64, - high: u64, - max_size: usize, - tried_cnt: usize, - term: u64, - }, - // More to support, suck as fetch entries ayschronously when apply and schedule merge -} - -impl fmt::Display for Task { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Task::PeerStorage { - region_id, - context, - low, - high, - max_size, - tried_cnt, - term, - } => write!( - f, - "Fetch Raft Logs [region: {}, low: {}, high: {}, max_size: {}] for sending with context {:?}, tried: {}, term: {}", - region_id, low, high, max_size, context, tried_cnt, term, - ), - } - } -} - -#[derive(Debug)] -pub struct FetchedLogs { - pub context: GetEntriesContext, - pub logs: Box, -} - -/// A router for receiving fetched result. 
-pub trait LogFetchedNotifier: Send { - fn notify(&self, region_id: u64, fetched: FetchedLogs); -} - -pub struct Runner -where - ER: RaftEngine, - N: LogFetchedNotifier, -{ - notifier: N, - raft_engine: ER, -} - -impl Runner { - pub fn new(notifier: N, raft_engine: ER) -> Runner { - Runner { - notifier, - raft_engine, - } - } -} - -impl Runnable for Runner -where - ER: RaftEngine, - N: LogFetchedNotifier, -{ - type Task = Task; - - fn run(&mut self, task: Task) { - match task { - Task::PeerStorage { - region_id, - low, - high, - max_size, - context, - tried_cnt, - term, - } => { - let mut ents = - Vec::with_capacity(std::cmp::min((high - low) as usize, MAX_INIT_ENTRY_COUNT)); - let res = self.raft_engine.fetch_entries_to( - region_id, - low, - high, - Some(max_size), - &mut ents, - ); - - let hit_size_limit = res - .as_ref() - .map(|c| (*c as u64) != high - low) - .unwrap_or(false); - fail_point!("worker_async_fetch_raft_log"); - self.notifier.notify( - region_id, - FetchedLogs { - context, - logs: Box::new(RaftlogFetchResult { - ents: res.map(|_| ents).map_err(|e| e.into()), - low, - max_size: max_size as u64, - hit_size_limit, - tried_cnt, - term, - }), - }, - ); - } - } - } -} diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index d62f2f6c1db..08e56aa7481 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -25,6 +25,7 @@ use tikv_util::{ codec::number::decode_u64, debug, error, lru::LruCache, + store::find_peer_by_id, time::{monotonic_raw_now, ThreadReadId}, }; use time::Timespec; @@ -563,9 +564,11 @@ impl ReadDelegate { } debug!( "reject stale read by safe ts"; - "tag" => &self.tag, - "safe ts" => safe_ts, - "read ts" => read_ts + "safe_ts" => safe_ts, + "read_ts" => read_ts, + + "region_id" => self.region.get_id(), + "peer_id" => self.peer_id, ); TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.safe_ts.inc()); let mut response = 
cmd_resp::new_error(Error::DataIsNotReady { @@ -581,7 +584,7 @@ impl ReadDelegate { pub fn mock(region_id: u64) -> Self { let mut region: metapb::Region = Default::default(); region.set_id(region_id); - let read_progress = Arc::new(RegionReadProgress::new(®ion, 0, 0, "mock".to_owned())); + let read_progress = Arc::new(RegionReadProgress::new(®ion, 0, 0, 1)); ReadDelegate { region: Arc::new(region), peer_id: 1, @@ -801,6 +804,29 @@ where return Ok(None); } + // Check witness + if find_peer_by_id(&delegate.region, delegate.peer_id).map_or(true, |p| p.is_witness) { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.witness.inc()); + return Err(Error::RecoveryInProgress(region_id)); + } + + // Check whether the region is in the flashback state and the local read could + // be performed. + let is_in_flashback = delegate.region.is_in_flashback; + if let Err(e) = util::check_flashback_state(is_in_flashback, req, region_id) { + TLS_LOCAL_READ_METRICS.with(|m| match e { + Error::FlashbackNotPrepared(_) => { + m.borrow_mut().reject_reason.flashback_not_prepared.inc() + } + Error::FlashbackInProgress(_) => { + m.borrow_mut().reject_reason.flashback_in_progress.inc() + } + _ => unreachable!(), + }); + debug!("rejected by flashback state"; "is_in_flashback" => is_in_flashback, "tag" => &delegate.tag); + return Ok(None); + } + Ok(Some(delegate)) } } @@ -1241,7 +1267,7 @@ mod tests { region1.set_region_epoch(epoch13.clone()); let term6 = 6; let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); // 1s is long enough. 
- let read_progress = Arc::new(RegionReadProgress::new(®ion1, 1, 1, "".to_owned())); + let read_progress = Arc::new(RegionReadProgress::new(®ion1, 1, 1, 1)); let mut cmd = RaftCmdRequest::default(); let mut header = RaftRequestHeader::default(); @@ -1573,7 +1599,7 @@ mod tests { txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::default())), txn_ext: Arc::new(TxnExt::default()), track_ver: TrackVer::new(), - read_progress: Arc::new(RegionReadProgress::new(®ion, 0, 0, "".to_owned())), + read_progress: Arc::new(RegionReadProgress::new(®ion, 0, 0, 1)), pending_remove: false, bucket_meta: None, }; @@ -1680,7 +1706,7 @@ mod tests { let leader = prs[0].clone(); region.set_region_epoch(region_epoch); let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); // 1s is long enough. - let read_progress = Arc::new(RegionReadProgress::new(®ion, 1, 1, "".to_owned())); + let read_progress = Arc::new(RegionReadProgress::new(®ion, 1, 1, 1)); // Register region lease.renew(monotonic_raw_now()); diff --git a/components/raftstore/src/store/worker/split_controller.rs b/components/raftstore/src/store/worker/split_controller.rs index a211a8f0a60..7e00daa2764 100644 --- a/components/raftstore/src/store/worker/split_controller.rs +++ b/components/raftstore/src/store/worker/split_controller.rs @@ -361,7 +361,7 @@ impl RegionInfo { if n == 0 || self.key_ranges.len() < self.sample_num { self.key_ranges.push(key_range); } else { - let j = rand::thread_rng().gen_range(0..n) as usize; + let j = rand::thread_rng().gen_range(0..n); if j < self.sample_num { self.key_ranges[j] = key_range; } diff --git a/components/resolved_ts/Cargo.toml b/components/resolved_ts/Cargo.toml index d4a7e3d1ca2..10a555678c3 100644 --- a/components/resolved_ts/Cargo.toml +++ b/components/resolved_ts/Cargo.toml @@ -31,7 +31,7 @@ fail = "0.5" futures = "0.3" grpcio = { workspace = true } hex = "0.4" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } 
lazy_static = "1.3" log_wrappers = { workspace = true } online_config = { workspace = true } diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index 35426f4861d..a78e903bc72 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -14,9 +14,9 @@ use concurrency_manager::ConcurrencyManager; use engine_traits::KvEngine; use fail::fail_point; use futures::{compat::Future01CompatExt, future::select_all, FutureExt, TryFutureExt}; -use grpcio::{ChannelBuilder, Environment}; +use grpcio::{ChannelBuilder, Environment, Error as GrpcError, RpcStatusCode}; use kvproto::{ - kvrpcpb::{CheckLeaderRequest, LeaderInfo}, + kvrpcpb::{CheckLeaderRequest, CheckLeaderResponse}, metapb::{Peer, PeerRole}, tikvpb::TikvClient, }; @@ -43,24 +43,24 @@ use tokio::{ }; use txn_types::TimeStamp; -use crate::{endpoint::Task, metrics::*, util}; +use crate::{endpoint::Task, metrics::*}; const DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS: u64 = 5_000; // 5s -pub struct AdvanceTsWorker { +pub struct AdvanceTsWorker { pd_client: Arc, timer: SteadyTimer, worker: Runtime, - scheduler: Scheduler>, + scheduler: Scheduler, /// The concurrency manager for transactions. It's needed for CDC to check /// locks when calculating resolved_ts. concurrency_manager: ConcurrencyManager, } -impl AdvanceTsWorker { +impl AdvanceTsWorker { pub fn new( pd_client: Arc, - scheduler: Scheduler>, + scheduler: Scheduler, concurrency_manager: ConcurrencyManager, ) -> Self { let worker = Builder::new_multi_thread() @@ -81,7 +81,7 @@ impl AdvanceTsWorker { } } -impl AdvanceTsWorker { +impl AdvanceTsWorker { // Advance ts asynchronously and register RegisterAdvanceEvent when its done. 
pub fn advance_ts_for_regions( &self, @@ -143,11 +143,11 @@ pub struct LeadershipResolver { region_read_progress: RegionReadProgressRegistry, store_id: u64, - // store_id -> leaders info, record the request to each stores - store_map: HashMap>, - // region_id -> region, cache the information of regions + // store_id -> check leader request, record the request to each stores. + store_req_map: HashMap, + // region_id -> region, cache the information of regions. region_map: HashMap>, - // region_id -> peers id, record the responses + // region_id -> peers id, record the responses. resp_map: HashMap>, valid_regions: HashSet, @@ -172,7 +172,7 @@ impl LeadershipResolver { security_mgr, region_read_progress, - store_map: HashMap::default(), + store_req_map: HashMap::default(), region_map: HashMap::default(), resp_map: HashMap::default(), valid_regions: HashSet::default(), @@ -184,7 +184,7 @@ impl LeadershipResolver { fn gc(&mut self) { let now = Instant::now_coarse(); if now - self.last_gc_time > self.gc_interval { - self.store_map = HashMap::default(); + self.store_req_map = HashMap::default(); self.region_map = HashMap::default(); self.resp_map = HashMap::default(); self.valid_regions = HashSet::default(); @@ -193,9 +193,16 @@ impl LeadershipResolver { } fn clear(&mut self) { - self.store_map.clear(); - self.region_map.clear(); - self.resp_map.clear(); + for v in self.store_req_map.values_mut() { + v.regions.clear(); + v.ts = 0; + } + for v in self.region_map.values_mut() { + v.clear(); + } + for v in self.resp_map.values_mut() { + v.clear(); + } self.valid_regions.clear(); } @@ -241,7 +248,7 @@ impl LeadershipResolver { // This function broadcasts a special message to all stores, gets the leader id // of them to confirm whether current peer has a quorum which accepts its // leadership. 
- pub async fn resolve(&mut self, regions: Vec, min_ts: TimeStamp) -> Vec { + pub async fn resolve(&mut self, _regions: Vec, min_ts: TimeStamp) -> Vec { // Clear previous result before resolving. self.clear(); // GC when necessary to prevent memory leak. @@ -249,21 +256,22 @@ impl LeadershipResolver { PENDING_RTS_COUNT.inc(); defer!(PENDING_RTS_COUNT.dec()); - fail_point!("before_sync_replica_read_state", |_| regions.clone()); + fail_point!("before_sync_replica_read_state", |_| _regions.clone()); let store_id = self.store_id; let valid_regions = &mut self.valid_regions; let region_map = &mut self.region_map; let resp_map = &mut self.resp_map; - let store_map = &mut self.store_map; + let store_req_map = &mut self.store_req_map; self.region_read_progress.with(|registry| { for (region_id, read_progress) in registry { let core = read_progress.get_core(); let local_leader_info = core.get_local_leader_info(); let leader_id = local_leader_info.get_leader_id(); + let leader_store_id = local_leader_info.get_leader_store_id(); let peer_list = local_leader_info.get_peers(); // Check if the leader in this store - if util::find_store_id(peer_list, leader_id) != Some(store_id) { + if leader_store_id != Some(store_id) { continue; } let leader_info = core.get_leader_info(); @@ -271,13 +279,21 @@ impl LeadershipResolver { let mut unvotes = 0; for peer in peer_list { if peer.store_id == store_id && peer.id == leader_id { - resp_map.entry(*region_id).or_default().push(store_id); + resp_map + .entry(*region_id) + .or_insert_with(|| Vec::with_capacity(peer_list.len())) + .push(store_id); } else { // It's still necessary to check leader on learners even if they don't vote // because performing stale read on learners require it. 
- store_map + store_req_map .entry(peer.store_id) - .or_default() + .or_insert_with(|| { + let mut req = CheckLeaderRequest::default(); + req.regions = Vec::with_capacity(registry.len()).into(); + req + }) + .regions .push(leader_info.clone()); if peer.get_role() != PeerRole::Learner { unvotes += 1; @@ -289,7 +305,10 @@ impl LeadershipResolver { if unvotes == 0 && region_has_quorum(peer_list, &resp_map[region_id]) { valid_regions.insert(*region_id); } else { - region_map.insert(*region_id, peer_list.to_vec()); + region_map + .entry(*region_id) + .or_insert_with(|| Vec::with_capacity(peer_list.len())) + .extend_from_slice(peer_list); } } }); @@ -299,62 +318,69 @@ impl LeadershipResolver { let security_mgr = &self.security_mgr; let tikv_clients = &self.tikv_clients; // Approximate `LeaderInfo` size - let leader_info_size = store_map + let leader_info_size = store_req_map .values() - .next() - .map_or(0, |regions| regions[0].compute_size()); - let store_count = store_map.len(); - let mut stores: Vec<_> = store_map - .drain() - .map(|(to_store, regions)| { - let env = env.clone(); - let region_num = regions.len() as u32; - CHECK_LEADER_REQ_SIZE_HISTOGRAM.observe((leader_info_size * region_num) as f64); - CHECK_LEADER_REQ_ITEM_COUNT_HISTOGRAM.observe(region_num as f64); - - // Check leadership for `regions` on `to_store`. 
- async move { - PENDING_CHECK_LEADER_REQ_COUNT.inc(); - defer!(PENDING_CHECK_LEADER_REQ_COUNT.dec()); - let client = - get_tikv_client(to_store, pd_client, security_mgr, env, tikv_clients) - .await - .map_err(|e| { - (to_store, e.retryable(), format!("[get tikv client] {}", e)) - })?; - - let mut req = CheckLeaderRequest::default(); - req.set_regions(regions.into()); - req.set_ts(min_ts.into_inner()); - let slow_timer = SlowTimer::default(); - defer!({ - slow_log!( - T - slow_timer, - "check leader rpc costs too long, to_store: {}", - to_store - ); - let elapsed = slow_timer.saturating_elapsed(); - RTS_CHECK_LEADER_DURATION_HISTOGRAM_VEC - .with_label_values(&["rpc"]) - .observe(elapsed.as_secs_f64()); - }); - - let rpc = client - .check_leader_async(&req) - .map_err(|e| (to_store, true, format!("[rpc create failed]{}", e)))?; - PENDING_CHECK_LEADER_REQ_SENT_COUNT.inc(); - defer!(PENDING_CHECK_LEADER_REQ_SENT_COUNT.dec()); - let timeout = Duration::from_millis(DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS); - let resp = tokio::time::timeout(timeout, rpc) - .map_err(|e| (to_store, true, format!("[timeout] {}", e))) - .await? - .map_err(|e| (to_store, true, format!("[rpc failed] {}", e)))?; - Ok((to_store, resp)) - } - .boxed() - }) - .collect(); + .find(|req| !req.regions.is_empty()) + .map_or(0, |req| req.regions[0].compute_size()); + let store_count = store_req_map.len(); + let mut check_leader_rpcs = Vec::with_capacity(store_req_map.len()); + for (store_id, req) in store_req_map { + if req.regions.is_empty() { + continue; + } + let env = env.clone(); + let to_store = *store_id; + let region_num = req.regions.len() as u32; + CHECK_LEADER_REQ_SIZE_HISTOGRAM.observe((leader_info_size * region_num) as f64); + CHECK_LEADER_REQ_ITEM_COUNT_HISTOGRAM.observe(region_num as f64); + + // Check leadership for `regions` on `to_store`. 
+ let rpc = async move { + PENDING_CHECK_LEADER_REQ_COUNT.inc(); + defer!(PENDING_CHECK_LEADER_REQ_COUNT.dec()); + let client = get_tikv_client(to_store, pd_client, security_mgr, env, tikv_clients) + .await + .map_err(|e| (to_store, e.retryable(), format!("[get tikv client] {}", e)))?; + + // Set min_ts in the request. + req.set_ts(min_ts.into_inner()); + let slow_timer = SlowTimer::default(); + defer!({ + slow_log!( + T + slow_timer, + "check leader rpc costs too long, to_store: {}", + to_store + ); + let elapsed = slow_timer.saturating_elapsed(); + RTS_CHECK_LEADER_DURATION_HISTOGRAM_VEC + .with_label_values(&["rpc"]) + .observe(elapsed.as_secs_f64()); + }); + + let rpc = match client.check_leader_async(req) { + Ok(rpc) => rpc, + Err(GrpcError::RpcFailure(status)) + if status.code() == RpcStatusCode::UNIMPLEMENTED => + { + // Some stores like TiFlash don't implement it. + return Ok((to_store, CheckLeaderResponse::default())); + } + Err(e) => return Err((to_store, true, format!("[rpc create failed]{}", e))), + }; + + PENDING_CHECK_LEADER_REQ_SENT_COUNT.inc(); + defer!(PENDING_CHECK_LEADER_REQ_SENT_COUNT.dec()); + let timeout = Duration::from_millis(DEFAULT_CHECK_LEADER_TIMEOUT_MILLISECONDS); + let resp = tokio::time::timeout(timeout, rpc) + .map_err(|e| (to_store, true, format!("[timeout] {}", e))) + .await? + .map_err(|e| (to_store, true, format!("[rpc failed] {}", e)))?; + Ok((to_store, resp)) + } + .boxed(); + check_leader_rpcs.push(rpc); + } let start = Instant::now_coarse(); defer!({ @@ -362,21 +388,19 @@ impl LeadershipResolver { .with_label_values(&["all"]) .observe(start.saturating_elapsed_secs()); }); - for _ in 0..store_count { + let rpc_count = check_leader_rpcs.len(); + for _ in 0..rpc_count { // Use `select_all` to avoid the process getting blocked when some // TiKVs were down. 
- let (res, _, remains) = select_all(stores).await; - stores = remains; + let (res, _, remains) = select_all(check_leader_rpcs).await; + check_leader_rpcs = remains; match res { Ok((to_store, resp)) => { for region_id in resp.regions { - if let Some(r) = region_map.get(®ion_id) { - let resps = resp_map.entry(region_id).or_default(); - resps.push(to_store); - if region_has_quorum(r, resps) { - valid_regions.insert(region_id); - } - } + resp_map + .entry(region_id) + .or_insert_with(|| Vec::with_capacity(store_count)) + .push(to_store); } } Err((to_store, reconnect, err)) => { @@ -386,11 +410,21 @@ impl LeadershipResolver { } } } - // Return early if all regions had already got quorum. - if valid_regions.len() == regions.len() { - // break here because all regions have quorum, - // so there is no need waiting for other stores to respond. - break; + } + for (region_id, prs) in region_map { + if prs.is_empty() { + // The peer had the leadership before, but now it's no longer + // the case. Skip checking the region. + continue; + } + if let Some(resp) = resp_map.get(region_id) { + if resp.is_empty() { + // No response, maybe the peer lost leadership. 
+ continue; + } + if region_has_quorum(prs, resp) { + valid_regions.insert(*region_id); + } } } self.valid_regions.drain().collect() diff --git a/components/resolved_ts/src/cmd.rs b/components/resolved_ts/src/cmd.rs index a1468e15bab..d3bda563a4f 100644 --- a/components/resolved_ts/src/cmd.rs +++ b/components/resolved_ts/src/cmd.rs @@ -419,6 +419,7 @@ mod tests { need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }, Mutation::make_put(k1.clone(), b"v4".to_vec()), &None, diff --git a/components/resolved_ts/src/endpoint.rs b/components/resolved_ts/src/endpoint.rs index 480c0ee6896..def3d512d3a 100644 --- a/components/resolved_ts/src/endpoint.rs +++ b/components/resolved_ts/src/endpoint.rs @@ -12,7 +12,7 @@ use std::{ }; use concurrency_manager::ConcurrencyManager; -use engine_traits::{KvEngine, Snapshot}; +use engine_traits::KvEngine; use grpcio::Environment; use kvproto::{metapb::Region, raft_cmdpb::AdminCmdType}; use online_config::{self, ConfigChange, ConfigManager, OnlineConfig}; @@ -23,7 +23,6 @@ use raftstore::{ store::{ fsm::StoreMeta, util::{self, RegionReadProgress, RegionReadProgressRegistry}, - RegionSnapshot, }, }; use security::SecurityManager; @@ -41,7 +40,6 @@ use crate::{ metrics::*, resolver::Resolver, scanner::{ScanEntry, ScanMode, ScanTask, ScannerPool}, - sinker::{CmdSinker, SinkCmd}, }; enum ResolverStatus { @@ -264,7 +262,7 @@ impl ObserveRegion { } } -pub struct Endpoint { +pub struct Endpoint { store_id: Option, cfg: ResolvedTsConfig, cfg_update_notify: Arc, @@ -272,28 +270,25 @@ pub struct Endpoint { region_read_progress: RegionReadProgressRegistry, regions: HashMap, scanner_pool: ScannerPool, - scheduler: Scheduler>, - sinker: C, - advance_worker: AdvanceTsWorker, + scheduler: Scheduler, + advance_worker: AdvanceTsWorker, _phantom: PhantomData<(T, E)>, } -impl Endpoint +impl Endpoint where T: 'static + RaftStoreRouter, E: KvEngine, - C: CmdSinker, { pub fn new( cfg: &ResolvedTsConfig, - 
scheduler: Scheduler>, + scheduler: Scheduler, raft_router: T, store_meta: Arc>, pd_client: Arc, concurrency_manager: ConcurrencyManager, env: Arc, security_mgr: Arc, - sinker: C, ) -> Self { let (region_read_progress, store_id) = { let meta = store_meta.lock().unwrap(); @@ -320,7 +315,6 @@ where region_read_progress, advance_worker, scanner_pool, - sinker, regions: HashMap::default(), _phantom: PhantomData::default(), }; @@ -502,64 +496,42 @@ where if regions.is_empty() { return; } - - let mut min_ts = TimeStamp::max(); for region_id in regions.iter() { if let Some(observe_region) = self.regions.get_mut(region_id) { if let ResolverStatus::Ready = observe_region.resolver_status { - let resolved_ts = observe_region.resolver.resolve(ts); - if resolved_ts < min_ts { - min_ts = resolved_ts; - } + let _ = observe_region.resolver.resolve(ts); } } } - self.sinker.sink_resolved_ts(regions, ts); } // Tracking or untracking locks with incoming commands that corresponding // observe id is valid. #[allow(clippy::drop_ref)] - fn handle_change_log( - &mut self, - cmd_batch: Vec, - snapshot: Option>, - ) { + fn handle_change_log(&mut self, cmd_batch: Vec) { let size = cmd_batch.iter().map(|b| b.size()).sum::(); RTS_CHANNEL_PENDING_CMD_BYTES.sub(size as i64); - let logs = cmd_batch - .into_iter() - .filter_map(|batch| { - if !batch.is_empty() { - if let Some(observe_region) = self.regions.get_mut(&batch.region_id) { - let observe_id = batch.rts_id; - let region_id = observe_region.meta.id; - if observe_region.handle.id == observe_id { - let logs = ChangeLog::encode_change_log(region_id, batch); - if let Err(e) = observe_region.track_change_log(&logs) { - drop(observe_region); - self.re_register_region(region_id, observe_id, e) - } - return Some(SinkCmd { - region_id, - observe_id, - logs, - }); - } else { - debug!("resolved ts CmdBatch discarded"; - "region_id" => batch.region_id, - "observe_id" => ?batch.rts_id, - "current" => ?observe_region.handle.id, - ); - } + for batch in 
cmd_batch { + if batch.is_empty() { + continue; + } + if let Some(observe_region) = self.regions.get_mut(&batch.region_id) { + let observe_id = batch.rts_id; + let region_id = observe_region.meta.id; + if observe_region.handle.id == observe_id { + let logs = ChangeLog::encode_change_log(region_id, batch); + if let Err(e) = observe_region.track_change_log(&logs) { + drop(observe_region); + self.re_register_region(region_id, observe_id, e); } + } else { + debug!("resolved ts CmdBatch discarded"; + "region_id" => batch.region_id, + "observe_id" => ?batch.rts_id, + "current" => ?observe_region.handle.id, + ); } - None - }) - .collect(); - match snapshot { - Some(snap) => self.sinker.sink_cmd_with_old_value(logs, snap), - None => self.sinker.sink_cmd(logs), + } } } @@ -615,7 +587,7 @@ where } } -pub enum Task { +pub enum Task { RegionUpdated(Region), RegionDestroyed(Region), RegisterRegion { @@ -638,7 +610,6 @@ pub enum Task { }, ChangeLog { cmd_batch: Vec, - snapshot: Option>, }, ScanLocks { region_id: u64, @@ -651,7 +622,7 @@ pub enum Task { }, } -impl fmt::Debug for Task { +impl fmt::Debug for Task { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut de = f.debug_struct("ResolvedTsTask"); match self { @@ -710,21 +681,20 @@ impl fmt::Debug for Task { } } -impl fmt::Display for Task { +impl fmt::Display for Task { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{:?}", self) } } -impl Runnable for Endpoint +impl Runnable for Endpoint where T: 'static + RaftStoreRouter, E: KvEngine, - C: CmdSinker, { - type Task = Task; + type Task = Task; - fn run(&mut self, task: Task) { + fn run(&mut self, task: Task) { debug!("run resolved-ts task"; "task" => ?task); match task { Task::RegionDestroyed(region) => self.region_destroyed(region), @@ -742,10 +712,7 @@ where Task::ResolvedTsAdvanced { regions, ts } => { self.handle_resolved_ts_advanced(regions, ts) } - Task::ChangeLog { - cmd_batch, - snapshot, - } => self.handle_change_log(cmd_batch, 
snapshot), + Task::ChangeLog { cmd_batch } => self.handle_change_log(cmd_batch), Task::ScanLocks { region_id, observe_id, @@ -757,15 +724,15 @@ where } } -pub struct ResolvedTsConfigManager(Scheduler>); +pub struct ResolvedTsConfigManager(Scheduler); -impl ResolvedTsConfigManager { - pub fn new(scheduler: Scheduler>) -> ResolvedTsConfigManager { +impl ResolvedTsConfigManager { + pub fn new(scheduler: Scheduler) -> ResolvedTsConfigManager { ResolvedTsConfigManager(scheduler) } } -impl ConfigManager for ResolvedTsConfigManager { +impl ConfigManager for ResolvedTsConfigManager { fn dispatch(&mut self, change: ConfigChange) -> online_config::Result<()> { if let Err(e) = self.0.schedule(Task::ChangeConfig { change }) { error!("failed to schedule ChangeConfig task"; "err" => ?e); @@ -776,11 +743,10 @@ impl ConfigManager for ResolvedTsConfigManager { const METRICS_FLUSH_INTERVAL: u64 = 10_000; // 10s -impl RunnableWithTimer for Endpoint +impl RunnableWithTimer for Endpoint where T: 'static + RaftStoreRouter, E: KvEngine, - C: CmdSinker, { fn on_timeout(&mut self) { let store_id = self.get_or_init_store_id(); @@ -788,8 +754,7 @@ where let (mut oldest_leader_ts, mut oldest_leader_region) = (u64::MAX, 0); self.region_read_progress.with(|registry| { for (region_id, read_progress) in registry { - let (peers, leader_info) = read_progress.dump_leader_info(); - let leader_store_id = crate::util::find_store_id(&peers, leader_info.peer_id); + let (leader_info, leader_store_id) = read_progress.dump_leader_info(); let ts = leader_info.get_read_state().get_safe_ts(); if ts == 0 { zero_ts_count += 1; diff --git a/components/resolved_ts/src/lib.rs b/components/resolved_ts/src/lib.rs index 5ad2941dde2..eef1211a580 100644 --- a/components/resolved_ts/src/lib.rs +++ b/components/resolved_ts/src/lib.rs @@ -27,8 +27,6 @@ mod observer; pub use observer::*; mod advance; pub use advance::*; -mod sinker; -pub use sinker::*; mod endpoint; pub use endpoint::*; mod errors; @@ -37,4 +35,3 @@ mod 
scanner; pub use scanner::*; mod metrics; pub use metrics::*; -mod util; diff --git a/components/resolved_ts/src/observer.rs b/components/resolved_ts/src/observer.rs index 9ff7b976ad4..7421beaad85 100644 --- a/components/resolved_ts/src/observer.rs +++ b/components/resolved_ts/src/observer.rs @@ -8,16 +8,16 @@ use tikv_util::worker::Scheduler; use crate::{cmd::lock_only_filter, endpoint::Task, metrics::RTS_CHANNEL_PENDING_CMD_BYTES}; -pub struct Observer { - scheduler: Scheduler>, +pub struct Observer { + scheduler: Scheduler, } -impl Observer { - pub fn new(scheduler: Scheduler>) -> Self { +impl Observer { + pub fn new(scheduler: Scheduler) -> Self { Observer { scheduler } } - pub fn register_to(&self, coprocessor_host: &mut CoprocessorHost) { + pub fn register_to(&self, coprocessor_host: &mut CoprocessorHost) { // The `resolved-ts` cmd observer will `mem::take` the `Vec`, use a // low priority to let it be the last observer and avoid affecting other // observers @@ -33,7 +33,7 @@ impl Observer { } } -impl Clone for Observer { +impl Clone for Observer { fn clone(&self) -> Self { Self { scheduler: self.scheduler.clone(), @@ -41,9 +41,9 @@ impl Clone for Observer { } } -impl Coprocessor for Observer {} +impl Coprocessor for Observer {} -impl CmdObserver for Observer { +impl CmdObserver for Observer { fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, @@ -64,7 +64,6 @@ impl CmdObserver for Observer { RTS_CHANNEL_PENDING_CMD_BYTES.add(size as i64); if let Err(e) = self.scheduler.schedule(Task::ChangeLog { cmd_batch: cmd_batches, - snapshot: None, }) { info!("failed to schedule change log event"; "err" => ?e); } @@ -82,7 +81,7 @@ impl CmdObserver for Observer { } } -impl RoleObserver for Observer { +impl RoleObserver for Observer { fn on_role_change(&self, ctx: &mut ObserverContext<'_>, role_change: &RoleChange) { // Stop to advance resolved ts after peer steps down to follower or candidate. 
// Do not need to check observe id because we expect all role change events are @@ -97,7 +96,7 @@ impl RoleObserver for Observer { } } -impl RegionChangeObserver for Observer { +impl RegionChangeObserver for Observer { fn on_region_changed( &self, ctx: &mut ObserverContext<'_>, @@ -139,7 +138,6 @@ impl RegionChangeObserver for Observer { mod test { use std::time::Duration; - use engine_rocks::RocksSnapshot; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; use kvproto::raft_cmdpb::*; use tikv::storage::kv::TestEngineBuilder; @@ -156,7 +154,7 @@ mod test { cmd } - fn expect_recv(rx: &mut ReceiverWrapper>, data: Vec) { + fn expect_recv(rx: &mut ReceiverWrapper, data: Vec) { if data.is_empty() { match rx.recv_timeout(Duration::from_millis(10)) { Err(std::sync::mpsc::RecvTimeoutError::Timeout) => return, diff --git a/components/resolved_ts/src/sinker.rs b/components/resolved_ts/src/sinker.rs deleted file mode 100644 index 383e5f7acc7..00000000000 --- a/components/resolved_ts/src/sinker.rs +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
- -use std::marker::PhantomData; - -use engine_traits::Snapshot; -use raftstore::{coprocessor::ObserveId, store::RegionSnapshot}; -use txn_types::TimeStamp; - -use crate::cmd::ChangeLog; - -pub struct SinkCmd { - pub region_id: u64, - pub observe_id: ObserveId, - pub logs: Vec, -} - -pub trait CmdSinker: Send { - fn sink_cmd(&mut self, sink_cmd: Vec); - - fn sink_cmd_with_old_value(&mut self, sink_cmd: Vec, snapshot: RegionSnapshot); - - fn sink_resolved_ts(&mut self, regions: Vec, ts: TimeStamp); -} - -pub struct DummySinker(PhantomData); - -impl DummySinker { - pub fn new() -> Self { - Self(PhantomData::default()) - } -} - -impl Default for DummySinker { - fn default() -> Self { - Self::new() - } -} - -impl CmdSinker for DummySinker { - fn sink_cmd(&mut self, _sink_cmd: Vec) {} - - fn sink_cmd_with_old_value(&mut self, _sink_cmd: Vec, _snapshot: RegionSnapshot) {} - - fn sink_resolved_ts(&mut self, _regions: Vec, _ts: TimeStamp) {} -} diff --git a/components/resolved_ts/src/util.rs b/components/resolved_ts/src/util.rs deleted file mode 100644 index 11bc1c547a0..00000000000 --- a/components/resolved_ts/src/util.rs +++ /dev/null @@ -1,12 +0,0 @@ -// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
- -use kvproto::metapb::Peer; - -pub fn find_store_id(peer_list: &[Peer], peer_id: u64) -> Option { - for peer in peer_list { - if peer.id == peer_id { - return Some(peer.store_id); - } - } - None -} diff --git a/components/resolved_ts/tests/integrations/mod.rs b/components/resolved_ts/tests/integrations/mod.rs index da28758a5d2..7802108b92b 100644 --- a/components/resolved_ts/tests/integrations/mod.rs +++ b/components/resolved_ts/tests/integrations/mod.rs @@ -63,7 +63,7 @@ fn test_resolved_ts_basic() { sst_epoch.set_conf_ver(1); sst_epoch.set_version(4); - let (mut meta, data) = gen_sst_file(&sst_path, sst_range); + let (mut meta, data) = gen_sst_file(sst_path, sst_range); meta.set_region_id(r1.id); meta.set_region_epoch(sst_epoch); diff --git a/components/resolved_ts/tests/mod.rs b/components/resolved_ts/tests/mod.rs index 376aa216224..e8d2a6429ba 100644 --- a/components/resolved_ts/tests/mod.rs +++ b/components/resolved_ts/tests/mod.rs @@ -4,7 +4,6 @@ use std::{sync::*, time::Duration}; use collections::HashMap; use concurrency_manager::ConcurrencyManager; -use engine_rocks::{RocksEngine, RocksSnapshot}; use futures::{executor::block_on, stream, SinkExt}; use grpcio::{ChannelBuilder, ClientUnaryReceiver, Environment, Result, WriteFlags}; use kvproto::{ @@ -28,8 +27,8 @@ pub fn init() { pub struct TestSuite { pub cluster: Cluster, - pub endpoints: HashMap>>, - pub obs: HashMap>, + pub endpoints: HashMap>, + pub obs: HashMap, tikv_cli: HashMap, import_cli: HashMap, concurrency_managers: HashMap, @@ -88,7 +87,6 @@ impl TestSuite { cm.clone(), env, sim.security_mgr.clone(), - resolved_ts::DummySinker::new(), ); concurrency_managers.insert(*id, cm); worker.start(rts_endpoint); diff --git a/components/resource_metering/Cargo.toml b/components/resource_metering/Cargo.toml index acb2dff89d3..20ed4ea2eda 100644 --- a/components/resource_metering/Cargo.toml +++ b/components/resource_metering/Cargo.toml @@ -8,7 +8,7 @@ collections = { workspace = true } crossbeam = "0.8" 
futures = "0.3" grpcio = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } diff --git a/components/resource_metering/src/recorder/mod.rs b/components/resource_metering/src/recorder/mod.rs index 9ed6acfb74f..f0b2e88ee4e 100644 --- a/components/resource_metering/src/recorder/mod.rs +++ b/components/resource_metering/src/recorder/mod.rs @@ -303,8 +303,8 @@ pub fn init_recorder( ) { let recorder = RecorderBuilder::default() .precision_ms(precision_ms) - .add_sub_recorder(Box::new(CpuRecorder::default())) - .add_sub_recorder(Box::new(SummaryRecorder::default())) + .add_sub_recorder(Box::::default()) + .add_sub_recorder(Box::::default()) .build(); let mut recorder_worker = WorkerBuilder::new("resource-metering-recorder") .pending_capacity(256) diff --git a/components/security/Cargo.toml b/components/security/Cargo.toml index 4599b1df43e..a9cdd620d12 100644 --- a/components/security/Cargo.toml +++ b/components/security/Cargo.toml @@ -4,9 +4,6 @@ version = "0.0.1" edition = "2018" publish = false -[features] -tonic = ["dep:tonic"] - [dependencies] collections = { workspace = true } encryption = { workspace = true } @@ -15,7 +12,6 @@ serde = "1.0" serde_derive = "1.0" serde_json = "1.0" tikv_util = { workspace = true } -tonic = { version = "0.5", features = ["tls"], optional = true } [dev-dependencies] tempfile = "3.0" diff --git a/components/security/src/lib.rs b/components/security/src/lib.rs index c0be3ba276b..52f438236fd 100644 --- a/components/security/src/lib.rs +++ b/components/security/src/lib.rs @@ -18,8 +18,6 @@ use grpcio::{ RpcContext, RpcStatus, RpcStatusCode, ServerBuilder, ServerChecker, ServerCredentialsBuilder, ServerCredentialsFetcher, }; -#[cfg(feature = "tonic")] -use tonic::transport::{channel::ClientTlsConfig, Certificate, Identity}; #[derive(Clone, Debug, Serialize, 
Deserialize, PartialEq, Default)] #[serde(default)] @@ -70,6 +68,23 @@ fn load_key(tag: &str, path: &str) -> Result, Box> { type CertResult = Result<(Vec, Vec, Vec), Box>; +type Pem = Box<[u8]>; + +pub struct Secret(pub Pem); + +impl std::fmt::Debug for Secret { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("Secret").finish() + } +} + +#[derive(Debug)] +pub struct ClientSuite { + pub ca: Pem, + pub client_cert: Pem, + pub client_key: Secret, +} + impl SecurityConfig { /// Validates ca, cert and private key. pub fn validate(&self) -> Result<(), Box> { @@ -124,21 +139,13 @@ impl SecurityManager { }) } - #[cfg(feature = "tonic")] - /// Make a tonic tls config via the config. - pub fn tonic_tls_config(&self) -> Option { - let (ca, cert, key) = self.cfg.load_certs().unwrap_or_default(); - if ca.is_empty() && cert.is_empty() && key.is_empty() { - return None; - } - let mut cfg = ClientTlsConfig::new(); - if !ca.is_empty() { - cfg = cfg.ca_certificate(Certificate::from_pem(ca)); - } - if !cert.is_empty() && !key.is_empty() { - cfg = cfg.identity(Identity::from_pem(cert, key)); - } - Some(cfg) + pub fn client_suite(&self) -> Result> { + let (ca, cert, key) = self.cfg.load_certs()?; + Ok(ClientSuite { + ca: ca.into_boxed_slice(), + client_cert: cert.into_boxed_slice(), + client_key: Secret(key.into_boxed_slice()), + }) } pub fn connect(&self, mut cb: ChannelBuilder, addr: &str) -> Channel { @@ -317,7 +324,7 @@ mod tests { .iter() .enumerate() { - fs::write(f, &[id as u8]).unwrap(); + fs::write(f, [id as u8]).unwrap(); } let mut c = cfg.clone(); diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index 1f4d98b2847..b27846ad5a3 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -56,7 +56,7 @@ grpcio = { workspace = true } grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } hex = "0.4" keys = { workspace = true } -kvproto = { git = 
"https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { workspace = true } @@ -69,7 +69,7 @@ raftstore = { workspace = true, features = ["engine_rocks"] } rand = "0.8" resolved_ts = { workspace = true } resource_metering = { workspace = true } -security = { workspace = true, features = ["tonic"] } +security = { workspace = true } serde_json = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } @@ -81,7 +81,7 @@ tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } toml = "0.5" txn_types = { workspace = true } -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [target.'cfg(unix)'.dependencies] signal-hook = "0.3" diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 2295839a806..b52abc960d8 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -76,8 +76,8 @@ use raftstore::{ RaftBatchSystem, RaftRouter, StoreMeta, MULTI_FILES_SNAPSHOT_FEATURE, PENDING_MSG_CAP, }, memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, - AutoSplitController, CheckLeaderRunner, GlobalReplicationState, LocalReader, SnapManager, - SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, + AutoSplitController, CheckLeaderRunner, LocalReader, SnapManager, SnapManagerBuilder, + SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, }, RaftRouterCompactedEventSender, }; @@ -116,7 +116,10 @@ use tikv_util::{ math::MovingAvgU32, metrics::INSTANCE_BACKEND_CPU_QUOTA, quota_limiter::{QuotaLimitConfigManager, QuotaLimiter}, - sys::{cpu_time::ProcessStat, disk, register_memory_usage_high_water, SysQuota}, + sys::{ + 
cpu_time::ProcessStat, disk, path_in_diff_mount_point, register_memory_usage_high_water, + SysQuota, + }, thread_group::GroupProperties, time::{Instant, Monitor}, worker::{Builder as WorkerBuilder, LazyWorker, Scheduler, Worker}, @@ -218,8 +221,7 @@ struct TikvServer { flow_info_sender: Option>, flow_info_receiver: Option>, system: Option>, - resolver: resolve::PdStoreAddrResolver, - state: Arc>, + resolver: Option, store_path: PathBuf, snap_mgr: Option, // Will be filled in `init_servers`. encryption_key_manager: Option>, @@ -232,6 +234,7 @@ struct TikvServer { concurrency_manager: ConcurrencyManager, env: Arc, background_worker: Worker, + check_leader_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, causal_ts_provider: Option>, // used for rawkv apiv2 @@ -256,8 +259,7 @@ struct Servers { backup_stream_scheduler: Option>, } -type LocalServer = - Server, resolve::PdStoreAddrResolver, LocalRaftKv>; +type LocalServer = Server>; type LocalRaftKv = RaftKv>; impl TikvServer @@ -319,8 +321,6 @@ where let background_worker = WorkerBuilder::new("background") .thread_count(thread_count) .create(); - let (resolver, state) = - resolve::new_resolver(Arc::clone(&pd_client), &background_worker, router.clone()); let mut coprocessor_host = Some(CoprocessorHost::new( router.clone(), @@ -360,6 +360,10 @@ where info!("Causal timestamp provider startup."); } + // Run check leader in a dedicate thread, because it is time sensitive + // and crucial to TiCDC replication lag. 
+ let check_leader_worker = WorkerBuilder::new("check_leader").thread_count(1).create(); + TikvServer { config, cfg_controller: Some(cfg_controller), @@ -367,8 +371,7 @@ where pd_client, router, system: Some(system), - resolver, - state, + resolver: None, store_path, snap_mgr: None, encryption_key_manager: None, @@ -381,6 +384,7 @@ where concurrency_manager, env, background_worker, + check_leader_worker, flow_info_sender: None, flow_info_receiver: None, sst_worker: None, @@ -473,7 +477,7 @@ where let cur_port = cur_addr.port(); let lock_dir = get_lock_dir(); - let search_base = env::temp_dir().join(&lock_dir); + let search_base = env::temp_dir().join(lock_dir); file_system::create_dir_all(&search_base) .unwrap_or_else(|_| panic!("create {} failed", search_base.display())); @@ -527,36 +531,66 @@ where // enough space to do compaction and region migration when TiKV recover. // This file is created in data_dir rather than db_path, because we must not // increase store size of db_path. + fn calculate_reserved_space(capacity: u64, reserved_size_from_config: u64) -> u64 { + let mut reserved_size = reserved_size_from_config; + if reserved_size_from_config != 0 { + reserved_size = + cmp::max((capacity as f64 * 0.05) as u64, reserved_size_from_config); + } + reserved_size + } + fn reserve_physical_space(data_dir: &String, available: u64, reserved_size: u64) { + let path = Path::new(data_dir).join(file_system::SPACE_PLACEHOLDER_FILE); + if let Err(e) = file_system::remove_file(path) { + warn!("failed to remove space holder on starting: {}", e); + } + + // place holder file size is 20% of total reserved space. 
+ if available > reserved_size { + file_system::reserve_space_for_recover(data_dir, reserved_size / 5) + .map_err(|e| panic!("Failed to reserve space for recovery: {}.", e)) + .unwrap(); + } else { + warn!("no enough disk space left to create the place holder file"); + } + } + let disk_stats = fs2::statvfs(&self.config.storage.data_dir).unwrap(); let mut capacity = disk_stats.total_space(); if self.config.raft_store.capacity.0 > 0 { capacity = cmp::min(capacity, self.config.raft_store.capacity.0); } - let mut reserve_space = self.config.storage.reserve_space.0; - if self.config.storage.reserve_space.0 != 0 { - reserve_space = cmp::max( - (capacity as f64 * 0.05) as u64, - self.config.storage.reserve_space.0, - ); - } - disk::set_disk_reserved_space(reserve_space); - let path = - Path::new(&self.config.storage.data_dir).join(file_system::SPACE_PLACEHOLDER_FILE); - if let Err(e) = file_system::remove_file(&path) { - warn!("failed to remove space holder on starting: {}", e); - } + // reserve space for kv engine + let kv_reserved_size = + calculate_reserved_space(capacity, self.config.storage.reserve_space.0); + disk::set_disk_reserved_space(kv_reserved_size); + reserve_physical_space( + &self.config.storage.data_dir, + disk_stats.available_space(), + kv_reserved_size, + ); - let available = disk_stats.available_space(); - // place holder file size is 20% of total reserved space. 
- if available > reserve_space { - file_system::reserve_space_for_recover( - &self.config.storage.data_dir, - reserve_space / 5, - ) - .map_err(|e| panic!("Failed to reserve space for recovery: {}.", e)) - .unwrap(); + let raft_data_dir = if self.config.raft_engine.enable { + self.config.raft_engine.config().dir } else { - warn!("no enough disk space left to create the place holder file"); + self.config.raft_store.raftdb_path.clone() + }; + + let separated_raft_mount_path = + path_in_diff_mount_point(&self.config.storage.data_dir, &raft_data_dir); + if separated_raft_mount_path { + let raft_disk_stats = fs2::statvfs(&raft_data_dir).unwrap(); + // reserve space for raft engine if raft engine is deployed separately + let raft_reserved_size = calculate_reserved_space( + raft_disk_stats.total_space(), + self.config.storage.reserve_raft_space.0, + ); + disk::set_raft_disk_reserved_space(raft_reserved_size); + reserve_physical_space( + &raft_data_dir, + raft_disk_stats.available_space(), + raft_reserved_size, + ); } } @@ -564,6 +598,9 @@ where yatp::metrics::set_namespace(Some("tikv")); prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL0_CHANCE.clone())).unwrap(); prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL_ELAPSED.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::TASK_EXEC_DURATION.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::TASK_POLL_DURATION.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::TASK_EXEC_TIMES.clone())).unwrap(); } fn init_encryption(&mut self) { @@ -613,14 +650,10 @@ where fn init_gc_worker( &mut self, - ) -> GcWorker< - RaftKv>, - RaftRouter, - > { + ) -> GcWorker>> { let engines = self.engines.as_ref().unwrap(); let gc_worker = GcWorker::new( engines.engine.clone(), - self.router.clone(), self.flow_info_sender.take().unwrap(), self.config.gc.clone(), self.pd_client.feature_gate().clone(), @@ -781,6 +814,13 @@ where )), ); + let (resolver, state) = 
resolve::new_resolver( + self.pd_client.clone(), + &self.background_worker, + storage.get_engine().raft_extension().clone(), + ); + self.resolver = Some(resolver); + ReplicaReadLockChecker::new(self.concurrency_manager.clone()) .register(self.coprocessor_host.as_mut().unwrap()); @@ -867,7 +907,7 @@ where self.coprocessor_host.clone().unwrap(), ); let check_leader_scheduler = self - .background_worker + .check_leader_worker .start("check-leader", check_leader_runner); let server_config = Arc::new(VersionTrack::new(self.config.server.clone())); @@ -888,7 +928,7 @@ where raft_store.clone(), self.config.storage.api_version(), self.pd_client.clone(), - self.state.clone(), + state, self.background_worker.clone(), Some(health_service.clone()), None, @@ -911,8 +951,7 @@ where Arc::clone(&self.quota_limiter), ), coprocessor_v2::Endpoint::new(&self.config.coprocessor_v2), - self.router.clone(), - self.resolver.clone(), + self.resolver.clone().unwrap(), snap_mgr.clone(), gc_worker.clone(), check_leader_scheduler, @@ -951,7 +990,13 @@ where ConnectionConfig { keep_alive_interval: self.config.server.grpc_keepalive_time.0, keep_alive_timeout: self.config.server.grpc_keepalive_timeout.0, - tls: self.security_mgr.tonic_tls_config(), + tls: self + .security_mgr + .client_suite() + .map_err(|err| { + warn!("Failed to load client TLS suite, ignoring TLS config."; "err" => %err); + }) + .ok(), }, ); let backup_stream_endpoint = backup_stream::Endpoint::new( @@ -1064,12 +1109,6 @@ where gc_worker .start(node.id()) .unwrap_or_else(|e| fatal!("failed to start gc worker: {}", e)); - gc_worker - .start_observe_lock_apply( - self.coprocessor_host.as_mut().unwrap(), - self.concurrency_manager.clone(), - ) - .unwrap_or_else(|e| fatal!("gc worker failed to observe lock apply: {}", e)); if let Err(e) = gc_worker.start_auto_gc(auto_gc_config, safe_point) { fatal!("failed to start auto_gc on storage, error: {}", e); } @@ -1116,8 +1155,6 @@ where self.concurrency_manager.clone(), server.env(), 
self.security_mgr.clone(), - // TODO: replace to the cdc sinker - resolved_ts::DummySinker::new(), ); rts_worker.start_with_timer(rts_endpoint); self.to_stop.push(rts_worker); @@ -1169,7 +1206,7 @@ where let debug_service = DebugService::new( engines.engines.clone(), servers.server.get_debug_thread_pool().clone(), - self.router.clone(), + engines.engine.raft_extension().clone(), self.cfg_controller.as_ref().unwrap().clone(), ); if servers @@ -1208,7 +1245,7 @@ where .start( servers.node.id(), self.pd_client.clone(), - self.resolver.clone(), + self.resolver.clone().unwrap(), self.security_mgr.clone(), &self.config.pessimistic_txn, ) @@ -1439,13 +1476,28 @@ where let store_path = self.store_path.clone(); let snap_mgr = self.snap_mgr.clone().unwrap(); let reserve_space = disk::get_disk_reserved_space(); - if reserve_space == 0 { + let reserve_raft_space = disk::get_raft_disk_reserved_space(); + if reserve_space == 0 && reserve_raft_space == 0 { info!("disk space checker not enabled"); return; } + let raft_path = engines.raft.get_engine_path().to_string(); + let separated_raft_mount_path = + path_in_diff_mount_point(raft_path.as_str(), engines.kv.path()); + let raft_almost_full_threshold = reserve_raft_space; + let raft_already_full_threshold = reserve_raft_space / 2; let almost_full_threshold = reserve_space; let already_full_threshold = reserve_space / 2; + fn calculate_disk_usage(a: disk::DiskUsage, b: disk::DiskUsage) -> disk::DiskUsage { + match (a, b) { + (disk::DiskUsage::AlreadyFull, _) => disk::DiskUsage::AlreadyFull, + (_, disk::DiskUsage::AlreadyFull) => disk::DiskUsage::AlreadyFull, + (disk::DiskUsage::AlmostFull, _) => disk::DiskUsage::AlmostFull, + (_, disk::DiskUsage::AlmostFull) => disk::DiskUsage::AlmostFull, + (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal, + } + } self.background_worker .spawn_interval_task(DEFAULT_STORAGE_STATS_INTERVAL, move || { let disk_stats = match fs2::statvfs(&store_path) { @@ -1472,14 +1524,45 
@@ where .get_engine_size() .expect("get raft engine size"); + let mut raft_disk_status = disk::DiskUsage::Normal; + if separated_raft_mount_path && reserve_raft_space != 0 { + let raft_disk_stats = match fs2::statvfs(&raft_path) { + Err(e) => { + error!( + "get disk stat for raft engine failed"; + "raft engine path" => raft_path.clone(), + "err" => ?e + ); + return; + } + Ok(stats) => stats, + }; + let raft_disk_cap = raft_disk_stats.total_space(); + let mut raft_disk_available = + raft_disk_cap.checked_sub(raft_size).unwrap_or_default(); + raft_disk_available = cmp::min(raft_disk_available, raft_disk_stats.available_space()); + raft_disk_status = if raft_disk_available <= raft_already_full_threshold + { + disk::DiskUsage::AlreadyFull + } else if raft_disk_available <= raft_almost_full_threshold + { + disk::DiskUsage::AlmostFull + } else { + disk::DiskUsage::Normal + }; + } let placeholer_file_path = PathBuf::from_str(&data_dir) .unwrap() .join(Path::new(file_system::SPACE_PLACEHOLDER_FILE)); let placeholder_size: u64 = - file_system::get_file_size(&placeholer_file_path).unwrap_or(0); + file_system::get_file_size(placeholer_file_path).unwrap_or(0); - let used_size = snap_size + kv_size + raft_size + placeholder_size; + let used_size = if !separated_raft_mount_path { + snap_size + kv_size + raft_size + placeholder_size + } else { + snap_size + kv_size + placeholder_size + }; let capacity = if config_disk_capacity == 0 || disk_cap < config_disk_capacity { disk_cap } else { @@ -1490,18 +1573,22 @@ where available = cmp::min(available, disk_stats.available_space()); let prev_disk_status = disk::get_disk_status(0); //0 no need care about failpoint. 
- let cur_disk_status = if available <= already_full_threshold { + let cur_kv_disk_status = if available <= already_full_threshold { disk::DiskUsage::AlreadyFull } else if available <= almost_full_threshold { disk::DiskUsage::AlmostFull } else { disk::DiskUsage::Normal }; + let cur_disk_status = calculate_disk_usage(raft_disk_status, cur_kv_disk_status); if prev_disk_status != cur_disk_status { warn!( - "disk usage {:?}->{:?}, available={},snap={},kv={},raft={},capacity={}", + "disk usage {:?}->{:?} (raft engine usage: {:?}, kv engine usage: {:?}), seperated raft mount={}, kv available={}, snap={}, kv={}, raft={}, capacity={}", prev_disk_status, cur_disk_status, + raft_disk_status, + cur_kv_disk_status, + separated_raft_mount_path, available, snap_size, kv_size, diff --git a/components/server/src/signal_handler.rs b/components/server/src/signal_handler.rs index 88c2ddac9f4..a92845b843d 100644 --- a/components/server/src/signal_handler.rs +++ b/components/server/src/signal_handler.rs @@ -13,7 +13,7 @@ mod imp { #[allow(dead_code)] pub fn wait_for_signal(engines: Option>) { - let mut signals = Signals::new(&[SIGTERM, SIGINT, SIGHUP, SIGUSR1, SIGUSR2]).unwrap(); + let mut signals = Signals::new([SIGTERM, SIGINT, SIGHUP, SIGUSR1, SIGUSR2]).unwrap(); for signal in &mut signals { match signal { SIGTERM | SIGINT | SIGHUP => { diff --git a/components/sst_importer/Cargo.toml b/components/sst_importer/Cargo.toml index 6b5fbd9127f..d0e2ff7eca8 100644 --- a/components/sst_importer/Cargo.toml +++ b/components/sst_importer/Cargo.toml @@ -26,11 +26,12 @@ futures = { version = "0.3", features = ["thread-pool"] } futures-util = { version = "0.3", default-features = false, features = ["io"] } grpcio = { workspace = true } keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } openssl = "0.10" prometheus = { version = "0.13", default-features = false } +rand = 
"0.8" serde = "1.0" serde_derive = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } diff --git a/components/sst_importer/src/caching/cache_map.rs b/components/sst_importer/src/caching/cache_map.rs new file mode 100644 index 00000000000..e88e5c3545d --- /dev/null +++ b/components/sst_importer/src/caching/cache_map.rs @@ -0,0 +1,211 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, + time::Duration, +}; + +use dashmap::{mapref::entry::Entry, DashMap}; +use futures::Future; + +use crate::metrics::EXT_STORAGE_CACHE_COUNT; + +#[derive(Clone, Default)] +pub struct CacheMap(Arc>); + +impl CacheMap { + #[cfg(test)] + pub fn with_inner(inner: CacheMapInner) -> Self { + Self(Arc::new(inner)) + } +} + +pub trait ShareOwned { + type Shared: 'static; + + fn share_owned(&self) -> Self::Shared; +} + +impl ShareOwned for T { + type Shared = T; + + fn share_owned(&self) -> Self::Shared { + *self + } +} + +pub trait MakeCache: 'static { + type Cached: std::fmt::Debug + ShareOwned + Send + Sync + 'static; + type Error; + + fn make_cache(&self) -> std::result::Result; +} + +#[derive(Debug)] +pub struct CacheMapInner { + cached: DashMap>, + now: AtomicUsize, + + gc_threshold: usize, +} + +impl Default for CacheMapInner { + fn default() -> Self { + Self { + cached: DashMap::default(), + now: Default::default(), + gc_threshold: 20, + } + } +} + +impl CacheMapInner { + #[cfg(test)] + pub fn with_gc_threshold(n: usize) -> Self { + Self { + gc_threshold: n, + ..Self::default() + } + } +} + +#[derive(Debug)] +struct Cached { + resource: R, + last_used: usize, +} + +impl Cached { + fn new(resource: R) -> Self { + Self { + resource, + last_used: 0, + } + } + + fn resource_owned(&mut self, now: usize) -> ::Shared { + self.last_used = now; + self.resource.share_owned() + } +} + +impl CacheMapInner { + fn now(&self) -> usize { + self.now.load(Ordering::SeqCst) + } 
+ + fn tick(&self) { + let now = self.now.fetch_add(1usize, Ordering::SeqCst); + self.cached.retain(|name, cache| { + let need_hold = now.saturating_sub(cache.last_used) < self.gc_threshold; + if !need_hold { + info!("Removing cache due to expired."; "name" => %name, "entry" => ?cache); + } + need_hold + }); + } +} + +impl CacheMap { + pub fn gc_loop(&self) -> impl Future + Send + 'static { + let this = Arc::downgrade(&self.0); + async move { + loop { + tokio::time::sleep(Duration::from_secs(30)).await; + match this.upgrade() { + Some(inner) => inner.tick(), + None => return, + } + } + } + } + + pub fn cached_or_create( + &self, + cache_key: &str, + backend: &M, + ) -> std::result::Result<::Shared, M::Error> { + let s = self.0.cached.get_mut(cache_key); + match s { + Some(mut s) => { + EXT_STORAGE_CACHE_COUNT.with_label_values(&["hit"]).inc(); + Ok(s.value_mut().resource_owned(self.0.now())) + } + None => { + drop(s); + let e = self.0.cached.entry(cache_key.to_owned()); + match e { + Entry::Occupied(mut v) => { + EXT_STORAGE_CACHE_COUNT.with_label_values(&["hit"]).inc(); + Ok(v.get_mut().resource_owned(self.0.now())) + } + Entry::Vacant(v) => { + EXT_STORAGE_CACHE_COUNT.with_label_values(&["miss"]).inc(); + let pool = backend.make_cache()?; + info!("Insert storage cache."; "name" => %cache_key, "cached" => ?pool); + let shared = pool.share_owned(); + v.insert(Cached::new(pool)); + Ok(shared) + } + } + } + } + } +} + +#[cfg(test)] +mod tests { + use std::{ + convert::Infallible, + sync::atomic::{AtomicBool, Ordering}, + }; + + use super::{CacheMap, CacheMapInner, MakeCache}; + + #[derive(Default)] + struct CacheChecker(AtomicBool); + + impl MakeCache for CacheChecker { + type Cached = (); + type Error = Infallible; + + fn make_cache(&self) -> std::result::Result { + self.0.store(true, Ordering::SeqCst); + Ok(()) + } + } + + impl CacheChecker { + fn made_cache(&self) -> bool { + self.0.load(Ordering::SeqCst) + } + } + + #[test] + fn test_basic() { + let cached = 
CacheMapInner::with_gc_threshold(1); + let cached = CacheMap::with_inner(cached); + + let check_cache = |key, should_make_cache: bool| { + let c = CacheChecker::default(); + cached.cached_or_create(key, &c).unwrap(); + assert_eq!(c.made_cache(), should_make_cache); + }; + + check_cache("hello", true); + check_cache("hello", false); + check_cache("world", true); + + cached.0.tick(); + check_cache("hello", false); + + cached.0.tick(); + check_cache("world", true); + + cached.0.tick(); + check_cache("hello", true); + } +} diff --git a/components/sst_importer/src/caching/mod.rs b/components/sst_importer/src/caching/mod.rs new file mode 100644 index 00000000000..9e55717c601 --- /dev/null +++ b/components/sst_importer/src/caching/mod.rs @@ -0,0 +1,4 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +pub mod cache_map; +pub mod storage_cache; diff --git a/components/sst_importer/src/caching/storage_cache.rs b/components/sst_importer/src/caching/storage_cache.rs new file mode 100644 index 00000000000..23732545b92 --- /dev/null +++ b/components/sst_importer/src/caching/storage_cache.rs @@ -0,0 +1,58 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::sync::Arc; + +use external_storage_export::ExternalStorage; +use kvproto::brpb::StorageBackend; + +use super::cache_map::{MakeCache, ShareOwned}; +use crate::{Error, Result}; + +impl ShareOwned for StoragePool { + type Shared = Arc; + + fn share_owned(&self) -> Self::Shared { + self.get() + } +} + +impl MakeCache for StorageBackend { + type Cached = StoragePool; + type Error = Error; + + fn make_cache(&self) -> Result { + StoragePool::create(self, 16) + } +} + +pub struct StoragePool(Box<[Arc]>); + +impl StoragePool { + fn create(backend: &StorageBackend, size: usize) -> Result { + let mut r = Vec::with_capacity(size); + for _ in 0..size { + let s = external_storage_export::create_storage(backend, Default::default())?; + r.push(Arc::from(s)); + } + Ok(Self(r.into_boxed_slice())) + } + + fn get(&self) -> Arc { + use rand::Rng; + let idx = rand::thread_rng().gen_range(0..self.0.len()); + Arc::clone(&self.0[idx]) + } +} + +impl std::fmt::Debug for StoragePool { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let url = self + .get() + .url() + .map(|u| u.to_string()) + .unwrap_or_else(|_| "".to_owned()); + f.debug_tuple("StoragePool") + .field(&format_args!("{}", url)) + .finish() + } +} diff --git a/components/sst_importer/src/config.rs b/components/sst_importer/src/config.rs index ef74a40fd01..ac789e2f4ae 100644 --- a/components/sst_importer/src/config.rs +++ b/components/sst_importer/src/config.rs @@ -14,6 +14,8 @@ pub struct Config { /// /// Default is 10m. pub import_mode_timeout: ReadableDuration, + /// the ratio of system memory used for import. 
+ pub memory_use_ratio: f64, } impl Default for Config { @@ -22,6 +24,7 @@ impl Default for Config { num_threads: 8, stream_channel_window: 128, import_mode_timeout: ReadableDuration::minutes(10), + memory_use_ratio: 0.3, } } } @@ -43,6 +46,13 @@ impl Config { ); self.stream_channel_window = default_cfg.stream_channel_window; } + if self.memory_use_ratio > 0.5 || self.memory_use_ratio < 0.0 { + warn!( + "import.mem_ratio should belong to [0.0, 0.5], change it to {}", + default_cfg.memory_use_ratio, + ); + self.memory_use_ratio = default_cfg.memory_use_ratio; + } Ok(()) } } diff --git a/components/sst_importer/src/errors.rs b/components/sst_importer/src/errors.rs index 51aabcbec01..7ff940fff12 100644 --- a/components/sst_importer/src/errors.rs +++ b/components/sst_importer/src/errors.rs @@ -8,7 +8,7 @@ use encryption::Error as EncryptionError; use error_code::{self, ErrorCode, ErrorCodeExt}; use futures::channel::oneshot::Canceled; use grpcio::Error as GrpcError; -use kvproto::{import_sstpb, kvrpcpb::ApiVersion}; +use kvproto::{errorpb, import_sstpb, kvrpcpb::ApiVersion}; use tikv_util::codec::Error as CodecError; use uuid::Error as UuidError; @@ -122,6 +122,9 @@ pub enum Error { storage_api_version: ApiVersion, key: String, }, + + #[error("resource is not enough {0}")] + ResourceNotEnough(String), } impl Error { @@ -149,7 +152,19 @@ pub type Result = result::Result; impl From for import_sstpb::Error { fn from(e: Error) -> import_sstpb::Error { let mut err = import_sstpb::Error::default(); - err.set_message(format!("{}", e)); + match e { + Error::ResourceNotEnough(ref msg) => { + let mut import_err = errorpb::Error::default(); + import_err.set_message(msg.clone()); + import_err.set_server_is_busy(errorpb::ServerIsBusy::default()); + err.set_store_error(import_err); + err.set_message(format!("{}", e)); + } + _ => { + err.set_message(format!("{}", e)); + } + } + err } } @@ -181,6 +196,7 @@ impl ErrorCodeExt for Error { Error::TtlLenNotEqualsToPairs => 
error_code::sst_importer::TTL_LEN_NOT_EQUALS_TO_PAIRS, Error::IncompatibleApiVersion => error_code::sst_importer::INCOMPATIBLE_API_VERSION, Error::InvalidKeyMode { .. } => error_code::sst_importer::INVALID_KEY_MODE, + Error::ResourceNotEnough(_) => error_code::sst_importer::RESOURCE_NOT_ENOUTH, } } } diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index c4a0498a9a6..f766729a066 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -247,9 +247,9 @@ impl ImportDir { /// Make an import path base on the basic path and the file name. pub fn get_import_path(&self, file_name: &str) -> Result { - let save_path = self.root_dir.join(&file_name); - let temp_path = self.temp_dir.join(&file_name); - let clone_path = self.clone_dir.join(&file_name); + let save_path = self.root_dir.join(file_name); + let temp_path = self.temp_dir.join(file_name); + let clone_path = self.clone_dir.join(file_name); Ok(ImportPath { save: save_path, temp: temp_path, @@ -276,7 +276,7 @@ impl ImportDir { pub fn delete_file(&self, path: &Path, key_manager: Option<&DataKeyManager>) -> Result<()> { if path.exists() { - file_system::remove_file(&path)?; + file_system::remove_file(path)?; if let Some(manager) = key_manager { manager.delete_file(path.to_str().unwrap())?; } @@ -515,7 +515,7 @@ mod test { meta.get_region_epoch().get_version(), SST_SUFFIX, )); - let new_meta = path_to_sst_meta(&path).unwrap(); + let new_meta = path_to_sst_meta(path).unwrap(); assert_eq!(meta, new_meta); } } diff --git a/components/sst_importer/src/import_mode.rs b/components/sst_importer/src/import_mode.rs index 0e793e2bc2b..5f5b5d1060e 100644 --- a/components/sst_importer/src/import_mode.rs +++ b/components/sst_importer/src/import_mode.rs @@ -9,10 +9,10 @@ use std::{ }; use engine_traits::{CfOptions, DbOptions, KvEngine}; -use futures::executor::ThreadPool; use futures_util::compat::Future01CompatExt; use 
kvproto::import_sstpb::*; use tikv_util::timer::GLOBAL_TIMER_HANDLE; +use tokio::runtime::Handle; use super::{Config, Result}; @@ -88,7 +88,7 @@ impl ImportModeSwitcher { ImportModeSwitcher { inner, is_import } } - pub fn start(&self, executor: &ThreadPool, db: E) { + pub fn start(&self, executor: &Handle, db: E) { // spawn a background future to put TiKV back into normal mode after timeout let inner = self.inner.clone(); let switcher = Arc::downgrade(&inner); @@ -117,7 +117,7 @@ impl ImportModeSwitcher { } } }; - executor.spawn_ok(timer_loop); + executor.spawn(timer_loop); } pub fn enter_normal_mode(&self, db: &E, mf: RocksDbMetricsFn) -> Result { @@ -243,7 +243,6 @@ mod tests { use std::thread; use engine_traits::{KvEngine, CF_DEFAULT}; - use futures::executor::ThreadPoolBuilder; use tempfile::Builder; use test_sst_importer::{new_test_engine, new_test_engine_with_options}; use tikv_util::config::ReadableDuration; @@ -306,14 +305,13 @@ mod tests { fn mf(_cf: &str, _name: &str, _v: f64) {} let cfg = Config::default(); - let threads = ThreadPoolBuilder::new() - .pool_size(cfg.num_threads) - .name_prefix("sst-importer") - .create() + let threads = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() .unwrap(); let switcher = ImportModeSwitcher::new(&cfg); - switcher.start(&threads, db.clone()); + switcher.start(threads.handle(), db.clone()); check_import_options(&db, &normal_db_options, &normal_cf_options); assert!(switcher.enter_import_mode(&db, mf).unwrap()); check_import_options(&db, &import_db_options, &import_cf_options); @@ -344,19 +342,20 @@ mod tests { import_mode_timeout: ReadableDuration::millis(300), ..Config::default() }; - let threads = ThreadPoolBuilder::new() - .pool_size(cfg.num_threads) - .name_prefix("sst-importer") - .create() + + let threads = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() .unwrap(); let switcher = ImportModeSwitcher::new(&cfg); - switcher.start(&threads, db.clone()); + 
switcher.start(threads.handle(), db.clone()); check_import_options(&db, &normal_db_options, &normal_cf_options); switcher.enter_import_mode(&db, mf).unwrap(); check_import_options(&db, &import_db_options, &import_cf_options); thread::sleep(Duration::from_secs(1)); + threads.block_on(tokio::task::yield_now()); check_import_options(&db, &normal_db_options, &normal_cf_options); } diff --git a/components/sst_importer/src/lib.rs b/components/sst_importer/src/lib.rs index ec0222d416a..4d25201253a 100644 --- a/components/sst_importer/src/lib.rs +++ b/components/sst_importer/src/lib.rs @@ -19,6 +19,7 @@ mod sst_writer; mod util; #[macro_use] pub mod import_mode; +mod caching; pub mod metrics; pub mod sst_importer; diff --git a/components/sst_importer/src/metrics.rs b/components/sst_importer/src/metrics.rs index 08f095078d5..e7eeefd3e82 100644 --- a/components/sst_importer/src/metrics.rs +++ b/components/sst_importer/src/metrics.rs @@ -56,6 +56,11 @@ lazy_static! { "tikv_import_download_bytes", "Bucketed histogram of importer download bytes", exponential_buckets(1024.0, 2.0, 20).unwrap() + ).unwrap(); + pub static ref IMPORTER_APPLY_BYTES: Histogram = register_histogram!( + "tikv_import_apply_bytes", + "Bucketed histogram of importer apply bytes", + exponential_buckets(1024.0, 2.0, 20).unwrap() ) .unwrap(); pub static ref IMPORTER_INGEST_DURATION: HistogramVec = register_histogram_vec!( @@ -96,4 +101,9 @@ lazy_static! 
{ "Bucketed histogram of importer apply count", &["type"] ).unwrap(); + pub static ref EXT_STORAGE_CACHE_COUNT: IntCounterVec = register_int_counter_vec!( + "tikv_import_storage_cache", + "The operations over storage cache", + &["operation"] + ).unwrap(); } diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 947d7e98e0c..3e06eb76899 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -4,10 +4,14 @@ use std::{ borrow::Cow, collections::HashMap, fs::File, - io::{self, prelude::*, BufReader}, + io::{self, BufReader, Read}, ops::Bound, path::{Path, PathBuf}, - sync::Arc, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::Duration, }; use dashmap::DashMap; @@ -18,27 +22,73 @@ use engine_traits::{ IterOptions, Iterator, KvEngine, RefIterable, SstCompressionType, SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, CF_WRITE, }; +use external_storage_export::{ + compression_reader_dispatcher, encrypt_wrap_reader, ExternalStorage, RestoreConfig, +}; use file_system::{get_io_rate_limiter, OpenOptions}; -use futures::executor::ThreadPool; use kvproto::{ brpb::{CipherInfo, StorageBackend}, import_sstpb::*, kvrpcpb::ApiVersion, }; use tikv_util::{ - codec::stream_event::{EventIterator, Iterator as EIterator}, + codec::stream_event::{EventEncoder, EventIterator, Iterator as EIterator}, + config::ReadableSize, + stream::block_on_external_io, + sys::SysQuota, time::{Instant, Limiter}, }; +use tokio::runtime::{Handle, Runtime}; use txn_types::{Key, TimeStamp, WriteRef}; use crate::{ + caching::cache_map::CacheMap, import_file::{ImportDir, ImportFile}, import_mode::{ImportModeSwitcher, RocksDbMetricsFn}, metrics::*, sst_writer::{RawSstWriter, TxnSstWriter}, - Config, Error, Result, + util, Config, Error, Result, }; +#[derive(Default, Debug, Clone)] +pub struct DownloadExt<'a> { + cache_key: Option<&'a str>, +} + +impl<'a> DownloadExt<'a> 
{ + pub fn cache_key(self, key: &'a str) -> Self { + Self { + cache_key: Some(key), + } + } +} + +#[derive(Clone, PartialEq, Debug)] +pub enum CacheKvFile { + Mem(Arc>), + Fs(Arc), +} + +impl CacheKvFile { + // get the ref count of item. + pub fn ref_count(&self) -> usize { + match self { + CacheKvFile::Mem(buff) => Arc::strong_count(buff), + CacheKvFile::Fs(path) => Arc::strong_count(path), + } + } + + // check the item is expired. + pub fn is_expired(&self, start: &Instant) -> bool { + match self { + // The expired duration for memeory is 60s. + CacheKvFile::Mem(_) => start.saturating_elapsed() >= Duration::from_secs(60), + // The expired duration for local file is 10min. + CacheKvFile::Fs(_) => start.saturating_elapsed() >= Duration::from_secs(600), + } + } +} + /// SstImporter manages SST files that are waiting for ingesting. pub struct SstImporter { dir: ImportDir, @@ -47,7 +97,12 @@ pub struct SstImporter { // TODO: lift api_version as a type parameter. api_version: ApiVersion, compression_types: HashMap, - file_locks: Arc>, + + cached_storage: CacheMap, + download_rt: Runtime, + file_locks: Arc>, + mem_use: AtomicU64, + mem_limit: ReadableSize, } impl SstImporter { @@ -58,6 +113,15 @@ impl SstImporter { api_version: ApiVersion, ) -> Result { let switcher = ImportModeSwitcher::new(cfg); + let cached_storage = CacheMap::default(); + let download_rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?; + download_rt.spawn(cached_storage.gc_loop()); + + let memory_limit = (SysQuota::memory_limit_in_bytes() as f64) * cfg.memory_use_ratio; + info!("sst importer memory limit when apply"; "size" => ?memory_limit); + Ok(SstImporter { dir: ImportDir::new(root)?, key_manager, @@ -65,6 +129,10 @@ impl SstImporter { api_version, compression_types: HashMap::with_capacity(2), file_locks: Arc::new(DashMap::default()), + cached_storage, + download_rt, + mem_use: AtomicU64::new(0), + mem_limit: ReadableSize(memory_limit as u64), }) } @@ -80,7 +148,7 @@ 
impl SstImporter { } } - pub fn start_switch_mode_check(&self, executor: &ThreadPool, db: E) { + pub fn start_switch_mode_check(&self, executor: &Handle, db: E) { self.switcher.start(executor, db); } @@ -174,7 +242,7 @@ impl SstImporter { // // This method returns the *inclusive* key range (`[start, end]`) of SST // file created, or returns None if the SST is empty. - pub fn download( + pub async fn download_ext( &self, meta: &SstMeta, backend: &StorageBackend, @@ -183,6 +251,7 @@ impl SstImporter { crypter: Option, speed_limiter: Limiter, engine: E, + ext: DownloadExt<'_>, ) -> Result> { debug!("download start"; "meta" => ?meta, @@ -191,7 +260,7 @@ impl SstImporter { "rewrite_rule" => ?rewrite_rule, "speed_limit" => speed_limiter.speed_limit(), ); - match self.do_download::( + let r = self.do_download_ext::( meta, backend, name, @@ -199,7 +268,9 @@ impl SstImporter { crypter, &speed_limiter, engine, - ) { + ext, + ); + match r.await { Ok(r) => { info!("download"; "meta" => ?meta, "name" => name, "range" => ?r); Ok(r) @@ -232,6 +303,49 @@ impl SstImporter { support_kms: bool, speed_limiter: &Limiter, restore_config: external_storage_export::RestoreConfig, + ) -> Result<()> { + self.download_rt + .block_on(self.async_download_file_from_external_storage( + file_length, + src_file_name, + dst_file, + backend, + support_kms, + speed_limiter, + "", + restore_config, + )) + } + + /// Create an external storage by the backend, and cache it with the key. + /// If the cache exists, return it directly. + pub fn external_storage_or_cache( + &self, + backend: &StorageBackend, + cache_id: &str, + ) -> Result> { + // prepare to download the file from the external_storage + // TODO: pass a config to support hdfs + let ext_storage = if cache_id.is_empty() { + EXT_STORAGE_CACHE_COUNT.with_label_values(&["skip"]).inc(); + let s = external_storage_export::create_storage(backend, Default::default())?; + Arc::from(s) + } else { + self.cached_storage.cached_or_create(cache_id, backend)? 
+ }; + Ok(ext_storage) + } + + async fn async_download_file_from_external_storage( + &self, + file_length: u64, + src_file_name: &str, + dst_file: std::path::PathBuf, + backend: &StorageBackend, + support_kms: bool, + speed_limiter: &Limiter, + cache_key: &str, + restore_config: external_storage_export::RestoreConfig, ) -> Result<()> { let start_read = Instant::now(); if let Some(p) = dst_file.parent() { @@ -243,34 +357,22 @@ impl SstImporter { } })?; } - // prepare to download the file from the external_storage - // TODO: pass a config to support hdfs - let ext_storage = external_storage_export::create_storage(backend, Default::default())?; - let url = ext_storage.url()?.to_string(); - let ext_storage: Box = if support_kms { - if let Some(key_manager) = &self.key_manager { - Box::new(external_storage_export::EncryptedExternalStorage { - key_manager: (*key_manager).clone(), - storage: ext_storage, - }) as _ - } else { - ext_storage as _ - } - } else { - ext_storage as _ - }; + let ext_storage = self.external_storage_or_cache(backend, cache_key)?; + let ext_storage = self.wrap_kms(ext_storage, support_kms); - let result = ext_storage.restore( - src_file_name, - dst_file.clone(), - file_length, - speed_limiter, - restore_config, - ); + let result = ext_storage + .restore( + src_file_name, + dst_file.clone(), + file_length, + speed_limiter, + restore_config, + ) + .await; IMPORTER_DOWNLOAD_BYTES.observe(file_length as _); result.map_err(|e| Error::CannotReadExternalStorage { - url: url.to_string(), + url: util::url_for(&ext_storage), name: src_file_name.to_owned(), local_path: dst_file.clone(), err: e, @@ -287,17 +389,252 @@ impl SstImporter { debug!("downloaded file succeed"; "name" => src_file_name, - "url" => %url, + "url" => %util::url_for(&ext_storage), ); Ok(()) } + pub fn shrink_by_tick(&self) -> usize { + let mut shrink_buff_size: usize = 0; + let mut retain_buff_size: usize = 0; + let mut shrink_files: Vec = Vec::default(); + let mut retain_file_count = 
0_usize; + + self.file_locks.retain(|_, (c, start)| { + let mut need_retain = true; + match c { + CacheKvFile::Mem(buff) => { + let buflen = buff.len(); + // The term of recycle memeory is 60s. + if c.ref_count() == 1 && c.is_expired(start) { + need_retain = false; + shrink_buff_size += buflen; + } else { + retain_buff_size += buflen; + } + } + CacheKvFile::Fs(path) => { + let p = path.to_path_buf(); + // The term of recycle file is 10min. + if c.ref_count() == 1 && c.is_expired(start) { + need_retain = false; + shrink_files.push(p); + } else { + retain_file_count += 1; + } + } + } + + need_retain + }); + + if self.import_support_download() { + let shrink_file_count = shrink_files.len(); + info!("shrink space by tick"; "shrink files count" => shrink_file_count, "retain files count" => retain_file_count); + + for f in shrink_files { + if let Err(e) = file_system::remove_file(&f) { + info!("failed to remove file"; "filename" => ?f, "error" => ?e); + } + } + shrink_file_count + } else { + info!("shrink cache by tick"; "shrink size" => shrink_buff_size, "retain size" => retain_buff_size); + self.dec_mem(shrink_buff_size as _); + shrink_buff_size + } + } + + // If mem_limit is 0, which represent download kv-file when import. + // Or read kv-file into buffer directly. + pub fn import_support_download(&self) -> bool { + self.mem_limit == ReadableSize(0) + } + + fn inc_mem_and_check(&self, meta: &KvMeta) -> bool { + let size = meta.get_length(); + let old = self.mem_use.fetch_add(size, Ordering::SeqCst); + + // If the memory is limited, roll backup the mem_use and return false. 
+ if old + size > self.mem_limit.0 { + self.mem_use.fetch_sub(size, Ordering::SeqCst); + false + } else { + true + } + } + + fn dec_mem(&self, size: u64) { + self.mem_use.fetch_sub(size, Ordering::SeqCst); + } + + pub fn do_read_kv_file( + &self, + meta: &KvMeta, + rewrite_rule: &RewriteRule, + ext_storage: Arc, + speed_limiter: &Limiter, + ) -> Result { + let start = Instant::now(); + let dst_name = format!("{}_{}", meta.get_name(), meta.get_range_offset()); + + let mut lock = self + .file_locks + .entry(dst_name) + .or_insert((CacheKvFile::Mem(Arc::default()), Instant::now())); + + if let CacheKvFile::Mem(buff) = &lock.0 { + if !buff.is_empty() { + lock.1 = Instant::now(); + return Ok(lock.0.clone()); + } + } + + if !self.inc_mem_and_check(meta) { + return Err(Error::ResourceNotEnough(String::from("memory is limited"))); + } + + let expected_sha256 = { + let sha256 = meta.get_sha256().to_vec(); + if !sha256.is_empty() { + Some(sha256) + } else { + None + } + }; + let file_length = meta.get_length(); + let range = { + let range_length = meta.get_range_length(); + if range_length == 0 { + None + } else { + Some((meta.get_range_offset(), range_length)) + } + }; + let restore_config = external_storage_export::RestoreConfig { + range, + compression_type: Some(meta.get_compression_type()), + expected_sha256, + file_crypter: None, + }; + + let buff = self.read_kv_files_from_external_storage( + file_length, + meta.get_name(), + ext_storage, + speed_limiter, + restore_config, + )?; + + IMPORTER_DOWNLOAD_BYTES.observe(file_length as _); + IMPORTER_APPLY_DURATION + .with_label_values(&["download"]) + .observe(start.saturating_elapsed().as_secs_f64()); + + let rewrite_buff = self.rewrite_kv_file(buff, rewrite_rule)?; + *lock = (CacheKvFile::Mem(Arc::new(rewrite_buff)), Instant::now()); + Ok(lock.0.clone()) + } + + pub fn wrap_kms( + &self, + ext_storage: Arc, + support_kms: bool, + ) -> Arc { + // kv-files needn't are decrypted with KMS when download currently because these 
+ // files are not encrypted when log-backup. It is different from + // sst-files because sst-files is encrypted when saved with rocksdb env + // with KMS. to do: support KMS when log-backup and restore point. + match (support_kms, self.key_manager.clone()) { + (true, Some(key_manager)) => { + Arc::new(external_storage_export::EncryptedExternalStorage { + key_manager, + storage: ext_storage, + }) + } + _ => ext_storage, + } + } + + fn read_kv_files_from_external_storage( + &self, + file_length: u64, + file_name: &str, + ext_storage: Arc, + speed_limiter: &Limiter, + restore_config: RestoreConfig, + ) -> Result> { + let RestoreConfig { + range, + compression_type, + expected_sha256, + file_crypter, + } = restore_config; + + let mut reader = { + let inner = if let Some((off, len)) = range { + ext_storage.read_part(file_name, off, len) + } else { + ext_storage.read(file_name) + }; + + let inner = compression_reader_dispatcher(compression_type, inner)?; + encrypt_wrap_reader(file_crypter, inner)? + }; + + let r = block_on_external_io(external_storage_export::read_external_storage_info_buff( + &mut reader, + speed_limiter, + file_length, + expected_sha256, + external_storage_export::MIN_READ_SPEED, + )); + let url = ext_storage.url()?.to_string(); + let buff = r.map_err(|e| Error::CannotReadExternalStorage { + url: url.to_string(), + name: file_name.to_string(), + err: e, + local_path: PathBuf::default(), + })?; + + Ok(buff) + } + + pub fn read_from_kv_file( + &self, + meta: &KvMeta, + rewrite_rule: &RewriteRule, + ext_storage: Arc, + backend: &StorageBackend, + speed_limiter: &Limiter, + ) -> Result>> { + let c = if self.import_support_download() { + self.do_download_kv_file(meta, backend, speed_limiter)? + } else { + self.do_read_kv_file(meta, rewrite_rule, ext_storage, speed_limiter)? + }; + match c { + // If cache memroy, it has been rewrite, return buffer directly. + CacheKvFile::Mem(buff) => Ok(buff), + // If cache file name, it need to read and rewrite. 
+ CacheKvFile::Fs(path) => { + let file = File::open(path.as_ref())?; + let mut reader = BufReader::new(file); + let mut buffer = Vec::new(); + reader.read_to_end(&mut buffer)?; + + let rewrite_buff = self.rewrite_kv_file(buffer, rewrite_rule)?; + Ok(Arc::new(rewrite_buff)) + } + } + } + pub fn do_download_kv_file( &self, meta: &KvMeta, backend: &StorageBackend, speed_limiter: &Limiter, - ) -> Result { + ) -> Result { let offset = meta.get_range_offset(); let src_name = meta.get_name(); let dst_name = format!("{}_{}", src_name, offset); @@ -309,14 +646,15 @@ impl SstImporter { } else { None }; - if path.save.exists() { - return Ok(path.save); - } - let lock = self.file_locks.entry(dst_name.to_string()).or_default(); + let mut lock = self + .file_locks + .entry(dst_name) + .or_insert((CacheKvFile::Fs(Arc::new(path.save.clone())), Instant::now())); if path.save.exists() { - return Ok(path.save); + lock.1 = Instant::now(); + return Ok(lock.0.clone()); } let range_length = meta.get_range_length(); @@ -336,16 +674,17 @@ impl SstImporter { src_name, path.temp.clone(), backend, - // kv-files needn't are decrypted with KMS when download currently because these files - // are not encrypted when log-backup. It is different from sst-files - // because sst-files is encrypted when saved with rocksdb env with KMS. - // to do: support KMS when log-backup and restore point. false, // don't support encrypt for now. speed_limiter, restore_config, )?; - info!("download file finished {}, offset {}", src_name, offset); + info!( + "download file finished {}, offset {}, length {}", + src_name, + offset, + meta.get_length() + ); if let Some(p) = path.save.parent() { // we have v1 prefix in file name. 
@@ -358,89 +697,100 @@ impl SstImporter { })?; } - file_system::rename(path.temp, path.save.clone())?; - - drop(lock); - self.file_locks.remove(&dst_name); - + file_system::rename(path.temp, path.save)?; IMPORTER_APPLY_DURATION .with_label_values(&["download"]) .observe(start.saturating_elapsed().as_secs_f64()); - Ok(path.save) + lock.1 = Instant::now(); + Ok(lock.0.clone()) } - pub fn do_apply_kv_file>( + pub fn rewrite_kv_file( &self, - start_key: &[u8], - end_key: &[u8], - restore_ts: u64, - file_path: P, + file_buff: Vec, rewrite_rule: &RewriteRule, - build_fn: &mut dyn FnMut(Vec, Vec), - ) -> Result> { - // iterator file and performs rewrites and apply. - let file = File::open(&file_path)?; - let mut reader = BufReader::new(file); - let mut buffer = Vec::new(); - reader.read_to_end(&mut buffer)?; - - let mut event_iter = EventIterator::new(buffer); - + ) -> Result> { let old_prefix = rewrite_rule.get_old_key_prefix(); let new_prefix = rewrite_rule.get_new_key_prefix(); - - let perform_rewrite = old_prefix != new_prefix; + // if old_prefix equals new_prefix, do not need rewrite. + if old_prefix == new_prefix { + return Ok(file_buff); + } // perform iteration and key rewrite. 
+ let mut new_buff = Vec::with_capacity(file_buff.len()); + let mut event_iter = EventIterator::new(file_buff.as_slice()); let mut key = new_prefix.to_vec(); let new_prefix_data_key_len = key.len(); + + let start = Instant::now(); + loop { + if !event_iter.valid() { + break; + } + event_iter.next()?; + + // perform rewrite + let old_key = event_iter.key(); + if !old_key.starts_with(old_prefix) { + return Err(Error::WrongKeyPrefix { + what: "Key in file", + key: old_key.to_vec(), + prefix: old_prefix.to_vec(), + }); + } + key.truncate(new_prefix_data_key_len); + key.extend_from_slice(&old_key[old_prefix.len()..]); + let value = event_iter.value(); + + let encoded = EventEncoder::encode_event(&key, value); + for slice in encoded { + new_buff.append(&mut slice.as_ref().to_owned()); + } + } + + IMPORTER_APPLY_DURATION + .with_label_values(&["rewrite"]) + .observe(start.saturating_elapsed().as_secs_f64()); + Ok(new_buff) + } + + pub fn do_apply_kv_file( + &self, + start_key: &[u8], + end_key: &[u8], + start_ts: u64, + restore_ts: u64, + file_buff: Arc>, + build_fn: &mut dyn FnMut(Vec, Vec), + ) -> Result> { + let mut event_iter = EventIterator::new(file_buff.as_slice()); let mut smallest_key = None; let mut largest_key = None; - let mut total_key = 0; let mut ts_not_expected = 0; let mut not_in_range = 0; - let start = Instant::now(); + loop { if !event_iter.valid() { break; } total_key += 1; event_iter.next()?; - INPORTER_APPLY_COUNT.with_label_values(&["key_meet"]).inc(); - let ts = Key::decode_ts_from(event_iter.key())?; - if ts > TimeStamp::new(restore_ts) { + + let key = event_iter.key().to_vec(); + let value = event_iter.value().to_vec(); + let ts = Key::decode_ts_from(&key)?; + if ts < TimeStamp::new(start_ts) || ts > TimeStamp::new(restore_ts) { // we assume the keys in file are sorted by ts. // so if we met the key not satisfy the ts. // we can easily filter the remain keys. 
ts_not_expected += 1; continue; } - if perform_rewrite { - let old_key = event_iter.key(); - - if !old_key.starts_with(old_prefix) { - return Err(Error::WrongKeyPrefix { - what: "Key in file", - key: old_key.to_vec(), - prefix: old_prefix.to_vec(), - }); - } - key.truncate(new_prefix_data_key_len); - key.extend_from_slice(&old_key[old_prefix.len()..]); - - debug!( - "perform rewrite new key: {:?}, new key prefix: {:?}, old key prefix: {:?}", - log_wrappers::Value::key(&key), - log_wrappers::Value::key(new_prefix), - log_wrappers::Value::key(old_prefix), - ); - } else { - key = event_iter.key().to_vec(); - } if check_key_in_range(&key, 0, start_key, end_key).is_err() { // key not in range, we can simply skip this key here. // the client make sure the correct region will download and apply the same @@ -451,28 +801,21 @@ impl SstImporter { not_in_range += 1; continue; } - let value = event_iter.value().to_vec(); - build_fn(key.clone(), value); - - let iter_key = key.clone(); - smallest_key = smallest_key.map_or_else( - || Some(iter_key.clone()), - |v: Vec| Some(v.min(iter_key.clone())), - ); - largest_key = largest_key.map_or_else( - || Some(iter_key.clone()), - |v: Vec| Some(v.max(iter_key.clone())), - ); + build_fn(key.clone(), value); + smallest_key = smallest_key + .map_or_else(|| Some(key.clone()), |v: Vec| Some(v.min(key.clone()))); + largest_key = largest_key + .map_or_else(|| Some(key.clone()), |v: Vec| Some(v.max(key.clone()))); } - info!("build download request file done"; "total keys" => %total_key, + if total_key != not_in_range { + info!("build download request file done"; "total keys" => %total_key, "ts filtered keys" => %ts_not_expected, - "range filtered keys" => %not_in_range, - "file" => %file_path.as_ref().display()); + "range filtered keys" => %not_in_range); + } - let label = if perform_rewrite { "rewrite" } else { "normal" }; IMPORTER_APPLY_DURATION - .with_label_values(&[label]) + .with_label_values(&["normal"]) 
.observe(start.saturating_elapsed().as_secs_f64()); match (smallest_key, largest_key) { @@ -486,7 +829,31 @@ impl SstImporter { } } - fn do_download( + // raw download, without ext, compatibility to old tests. + #[cfg(test)] + fn download( + &self, + meta: &SstMeta, + backend: &StorageBackend, + name: &str, + rewrite_rule: &RewriteRule, + crypter: Option, + speed_limiter: Limiter, + engine: E, + ) -> Result> { + self.download_rt.block_on(self.download_ext( + meta, + backend, + name, + rewrite_rule, + crypter, + speed_limiter, + engine, + DownloadExt::default(), + )) + } + + async fn do_download_ext( &self, meta: &SstMeta, backend: &StorageBackend, @@ -495,6 +862,7 @@ impl SstImporter { crypter: Option, speed_limiter: &Limiter, engine: E, + ext: DownloadExt<'_>, ) -> Result> { let path = self.dir.join(meta)?; @@ -509,15 +877,17 @@ impl SstImporter { ..Default::default() }; - self.download_file_from_external_storage( + self.async_download_file_from_external_storage( meta.length, name, path.temp.clone(), backend, true, speed_limiter, + ext.cache_key.unwrap_or(""), restore_config, - )?; + ) + .await?; // now validate the SST file. 
let env = get_env(self.key_manager.clone(), get_io_rate_limiter())?; @@ -809,12 +1179,17 @@ fn is_after_end_bound>(value: &[u8], bound: &Bound) -> bool { #[cfg(test)] mod tests { - use std::io::{self, BufWriter}; + use std::{ + io::{self, BufWriter, Write}, + ops::Sub, + usize, + }; use engine_traits::{ collect, EncryptionMethod, Error as TraitError, ExternalSstFileInfo, Iterable, Iterator, RefIterable, SstReader, SstWriter, CF_DEFAULT, DATA_CFS, }; + use external_storage_export::read_external_storage_info_buff; use file_system::File; use openssl::hash::{Hasher, MessageDigest}; use tempfile::Builder; @@ -877,7 +1252,7 @@ mod tests { for (i, &range) in cases.iter().enumerate() { let path = temp_dir.path().join(format!("{}.sst", i)); - let (meta, data) = gen_sst_file(&path, range); + let (meta, data) = gen_sst_file(path, range); let mut f = dir.create(&meta, key_manager.clone()).unwrap(); f.append(&data).unwrap(); @@ -1035,7 +1410,8 @@ mod tests { }) } - fn create_sample_external_kv_file() -> Result<(tempfile::TempDir, StorageBackend, KvMeta)> { + fn create_sample_external_kv_file() + -> Result<(tempfile::TempDir, StorageBackend, KvMeta, Vec)> { let ext_dir = tempfile::tempdir()?; let file_name = "v1/t000001/abc.log"; let file_path = ext_dir.path().join(file_name); @@ -1047,6 +1423,7 @@ mod tests { (b"t1_r01".to_vec(), b"tidb".to_vec()), (b"t1_r02".to_vec(), b"tikv".to_vec()), (b"t1_r03".to_vec(), b"pingcap".to_vec()), + (b"t1_r04".to_vec(), b"test for PITR".to_vec()), ]; let mut sha256 = Hasher::new(MessageDigest::sha256()).unwrap(); @@ -1067,7 +1444,7 @@ mod tests { kv_meta.set_sha256(sha256.finish().unwrap().to_vec()); let backend = external_storage_export::make_local_backend(ext_dir.path()); - Ok((ext_dir, backend, kv_meta)) + Ok((ext_dir, backend, kv_meta, buff.buffer().to_vec())) } fn create_sample_external_rawkv_sst_file( @@ -1245,6 +1622,257 @@ mod tests { assert_eq!(err.kind(), io::ErrorKind::TimedOut); } + #[test] + fn test_read_external_storage_info_buff() 
{ + let data = &b"input some data, used to test read buff"[..]; + let mut reader = data; + let len = reader.len() as _; + let sha_256 = { + let mut hasher = Hasher::new(MessageDigest::sha256()).unwrap(); + hasher.update(data).unwrap(); + hasher.finish().unwrap().to_vec() + }; + + // test successfully. + let output = block_on_external_io(read_external_storage_info_buff( + &mut reader, + &Limiter::new(f64::INFINITY), + len, + Some(sha_256.clone()), + 0, + )) + .unwrap(); + assert_eq!(&output, data); + + // test without expected_sha245. + reader = data; + let output = block_on_external_io(read_external_storage_info_buff( + &mut reader, + &Limiter::new(f64::INFINITY), + len, + None, + 0, + )) + .unwrap(); + assert_eq!(&output, data); + + // test with wrong expectd_len. + reader = data; + let err = block_on_external_io(read_external_storage_info_buff( + &mut reader, + &Limiter::new(f64::INFINITY), + len + 1, + Some(sha_256.clone()), + 0, + )) + .unwrap_err(); + assert!(err.to_string().contains("length not match")); + + // test with wrong expected_sha256. + reader = data; + let err = block_on_external_io(read_external_storage_info_buff( + &mut reader, + &Limiter::new(f64::INFINITY), + len, + Some(sha_256[..sha_256.len() - 1].to_vec()), + 0, + )) + .unwrap_err(); + assert!(err.to_string().contains("sha256 not match")); + } + + #[test] + fn test_read_external_storage_info_buff_timed_out() { + use futures_util::stream::{pending, TryStreamExt}; + + let mut input = pending::>().into_async_read(); + let err = block_on_external_io(read_external_storage_info_buff( + &mut input, + &Limiter::new(f64::INFINITY), + 0, + None, + usize::MAX, + )) + .unwrap_err(); + assert_eq!(err.kind(), io::ErrorKind::TimedOut); + } + + #[test] + fn test_do_read_kv_file() { + // create a sample kv file. + let (_temp_dir, backend, kv_meta, buff) = create_sample_external_kv_file().unwrap(); + + // create importer object. 
+ let import_dir = tempfile::tempdir().unwrap(); + let (_, key_manager) = new_key_manager_for_test(); + let importer = SstImporter::new( + &Config::default(), + import_dir, + Some(key_manager), + ApiVersion::V1, + ) + .unwrap(); + let ext_storage = { + let inner = importer.wrap_kms( + importer.external_storage_or_cache(&backend, "").unwrap(), + false, + ); + inner + }; + + // test do_read_kv_file() + let rewrite_rule = &new_rewrite_rule(b"", b"", 12345); + let output = importer + .do_read_kv_file( + &kv_meta, + rewrite_rule, + ext_storage, + &Limiter::new(f64::INFINITY), + ) + .unwrap(); + + assert_eq!(CacheKvFile::Mem(Arc::new(buff.clone())), output); + + // Do not shrint nothing. + let shrink_size = importer.shrink_by_tick(); + assert_eq!(shrink_size, 0); + assert_eq!(importer.file_locks.len(), 1); + + // drop the refcnt + drop(output); + let shrink_size = importer.shrink_by_tick(); + assert_eq!(shrink_size, 0); + assert_eq!(importer.file_locks.len(), 1); + + // set expired instance in Dashmap + for mut kv in importer.file_locks.iter_mut() { + kv.1 = Instant::now().sub(Duration::from_secs(61)); + } + let shrink_size = importer.shrink_by_tick(); + assert_eq!(shrink_size, buff.len()); + assert!(importer.file_locks.is_empty()); + } + + #[test] + fn test_read_kv_files_from_external_storage() { + // create a sample kv file. + let (_temp_dir, backend, kv_meta, buff) = create_sample_external_kv_file().unwrap(); + + // create importer object. + let import_dir = tempfile::tempdir().unwrap(); + let (_, key_manager) = new_key_manager_for_test(); + let importer = SstImporter::new( + &Config::default(), + import_dir, + Some(key_manager), + ApiVersion::V1, + ) + .unwrap(); + let ext_storage = { + let inner = importer.wrap_kms( + importer.external_storage_or_cache(&backend, "").unwrap(), + false, + ); + Arc::new(inner) + }; + + // test read all of the file. 
+ let restore_config = external_storage_export::RestoreConfig { + expected_sha256: Some(kv_meta.get_sha256().to_vec()), + ..Default::default() + }; + + let output = importer + .read_kv_files_from_external_storage( + kv_meta.get_length(), + kv_meta.get_name(), + ext_storage.clone(), + &Limiter::new(f64::INFINITY), + restore_config, + ) + .unwrap(); + assert_eq!( + buff, + output, + "we are testing addition with {} and {}", + buff.len(), + output.len() + ); + + // test read range of the file. + let (offset, len) = (5, 16); + let restore_config = external_storage_export::RestoreConfig { + range: Some((offset, len)), + ..Default::default() + }; + + let output = importer + .read_kv_files_from_external_storage( + len, + kv_meta.get_name(), + ext_storage, + &Limiter::new(f64::INFINITY), + restore_config, + ) + .unwrap(); + assert_eq!(&buff[offset as _..(offset + len) as _], &output[..]); + } + + #[test] + fn test_do_download_kv_file() { + // create a sample kv file. + let (_temp_dir, backend, kv_meta, buff) = create_sample_external_kv_file().unwrap(); + + // create importer object. + let import_dir = tempfile::tempdir().unwrap(); + let (_, key_manager) = new_key_manager_for_test(); + let cfg = Config { + memory_use_ratio: 0.0, + ..Default::default() + }; + let importer = + SstImporter::new(&cfg, import_dir, Some(key_manager), ApiVersion::V1).unwrap(); + let rewrite_rule = &new_rewrite_rule(b"", b"", 12345); + let ext_storage = { + importer.wrap_kms( + importer.external_storage_or_cache(&backend, "").unwrap(), + false, + ) + }; + let path = importer + .dir + .get_import_path( + format!("{}_{}", kv_meta.get_name(), kv_meta.get_range_offset()).as_str(), + ) + .unwrap(); + + // test do_download_kv_file(). 
+ assert!(importer.import_support_download()); + let output = importer + .read_from_kv_file( + &kv_meta, + rewrite_rule, + ext_storage, + &backend, + &Limiter::new(f64::INFINITY), + ) + .unwrap(); + assert_eq!(*output, buff); + check_file_exists(&path.save, None); + + // test shrink nothing. + let shrint_files_cnt = importer.shrink_by_tick(); + assert_eq!(shrint_files_cnt, 0); + + // set expired instance in Dashmap. + for mut kv in importer.file_locks.iter_mut() { + kv.1 = Instant::now().sub(Duration::from_secs(601)); + } + let shrint_files_cnt = importer.shrink_by_tick(); + assert_eq!(shrint_files_cnt, 1); + check_file_not_exists(&path.save, None); + } + #[test] fn test_download_file_from_external_storage_for_sst() { // creates a sample SST file. @@ -1285,7 +1913,7 @@ mod tests { #[test] fn test_download_file_from_external_storage_for_kv() { - let (_temp_dir, backend, kv_meta) = create_sample_external_kv_file().unwrap(); + let (_temp_dir, backend, kv_meta, _) = create_sample_external_kv_file().unwrap(); let (_, key_manager) = new_key_manager_for_test(); let import_dir = tempfile::tempdir().unwrap(); @@ -2081,4 +2709,79 @@ mod tests { assert_eq!(sst_reader.compression_name(), expected_compression_name); } } + + #[test] + fn test_import_support_download() { + let import_dir = tempfile::tempdir().unwrap(); + let importer = + SstImporter::new(&Config::default(), import_dir, None, ApiVersion::V1).unwrap(); + assert_eq!(importer.import_support_download(), false); + + let import_dir = tempfile::tempdir().unwrap(); + let importer = SstImporter::new( + &Config { + memory_use_ratio: 0.0, + ..Default::default() + }, + import_dir, + None, + ApiVersion::V1, + ) + .unwrap(); + assert_eq!(importer.import_support_download(), true); + } + + #[test] + fn test_inc_mem_and_check() { + // create importer object. 
+ let import_dir = tempfile::tempdir().unwrap(); + let importer = + SstImporter::new(&Config::default(), import_dir, None, ApiVersion::V1).unwrap(); + assert_eq!(importer.mem_use.load(Ordering::SeqCst), 0); + + // test inc_mem_and_check() and dec_mem() successfully. + let meta = KvMeta { + length: 100, + ..Default::default() + }; + let check = importer.inc_mem_and_check(&meta); + assert!(check); + assert_eq!(importer.mem_use.load(Ordering::SeqCst), meta.get_length()); + + importer.dec_mem(meta.get_length()); + assert_eq!(importer.mem_use.load(Ordering::SeqCst), 0); + + // test inc_mem_and_check() failed. + let meta = KvMeta { + length: u64::MAX, + ..Default::default() + }; + let check = importer.inc_mem_and_check(&meta); + assert!(!check); + } + + #[test] + fn test_dashmap_lock() { + let import_dir = tempfile::tempdir().unwrap(); + let importer = + SstImporter::new(&Config::default(), import_dir, None, ApiVersion::V1).unwrap(); + + let key = "file1"; + let value = (CacheKvFile::Mem(Arc::default()), Instant::now()); + let lock = importer.file_locks.entry(key.to_string()).or_insert(value); + + // test locked by try_entry() + let lock2 = importer.file_locks.try_entry(key.to_string()); + assert!(lock2.is_none()); + let lock2 = importer.file_locks.try_get(key); + assert!(lock2.is_locked()); + + // test unlocked by entry() + drop(lock); + let v = importer.file_locks.get(key).unwrap(); + assert_eq!(v.0.ref_count(), 1); + + let _buff = v.0.clone(); + assert_eq!(v.0.ref_count(), 2); + } } diff --git a/components/sst_importer/src/util.rs b/components/sst_importer/src/util.rs index dce63314073..501061e92c0 100644 --- a/components/sst_importer/src/util.rs +++ b/components/sst_importer/src/util.rs @@ -4,6 +4,7 @@ use std::path::Path; use encryption::DataKeyManager; use engine_traits::EncryptionKeyManager; +use external_storage_export::ExternalStorage; use file_system::File; use super::Result; @@ -64,6 +65,13 @@ pub fn prepare_sst_for_ingestion, Q: AsRef>( Ok(()) } +pub fn 
url_for(storage: &E) -> String { + storage + .url() + .map(|url| url.to_string()) + .unwrap_or_else(|err| format!("ErrUrl({})", err)) +} + #[cfg(test)] mod tests { use std::{path::Path, sync::Arc}; diff --git a/components/test_backup/Cargo.toml b/components/test_backup/Cargo.toml index 902e57d5eed..1798b50c82b 100644 --- a/components/test_backup/Cargo.toml +++ b/components/test_backup/Cargo.toml @@ -23,7 +23,7 @@ futures = "0.3" futures-executor = "0.3" futures-util = { version = "0.3", default-features = false, features = ["io"] } grpcio = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } protobuf = "2" rand = "0.8" tempfile = "3.0" diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index a45a3f52462..e990924c638 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -256,7 +256,7 @@ impl TestSuite { let mut batch = Vec::with_capacity(1024); let mut keys = Vec::with_capacity(1024); // Write 50 times to include more different ts. 
- let batch_size = cmp::min(cmp::max(key_count / 50, 1), 1024); + let batch_size = (key_count / 50).clamp(1, 1024); for _ in 0..versions { let mut j = 0; while j < key_count { diff --git a/components/test_coprocessor/Cargo.toml b/components/test_coprocessor/Cargo.toml index a3bb3f8e476..03047d75e87 100644 --- a/components/test_coprocessor/Cargo.toml +++ b/components/test_coprocessor/Cargo.toml @@ -25,7 +25,7 @@ collections = { workspace = true } concurrency_manager = { workspace = true } engine_rocks = { workspace = true } futures = "0.3" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } protobuf = "2" resource_metering = { workspace = true } test_storage = { workspace = true } @@ -33,5 +33,5 @@ tidb_query_common = { workspace = true } tidb_query_datatype = { workspace = true } tikv = { workspace = true } tikv_util = { workspace = true } -tipb = { git = "https://github.com/pingcap/tipb.git" } +tipb = { workspace = true } txn_types = { workspace = true } diff --git a/components/test_pd/Cargo.toml b/components/test_pd/Cargo.toml index d9163706895..a478e6ee325 100644 --- a/components/test_pd/Cargo.toml +++ b/components/test_pd/Cargo.toml @@ -9,7 +9,7 @@ collections = { workspace = true } fail = "0.5" futures = "0.3" grpcio = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } pd_client = { workspace = true } security = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } diff --git a/components/test_pd/src/mocker/service.rs b/components/test_pd/src/mocker/service.rs index 2ff5c178c67..45dd6e5661d 100644 --- a/components/test_pd/src/mocker/service.rs +++ b/components/test_pd/src/mocker/service.rs @@ -19,7 +19,7 @@ pub struct Service { id_allocator: AtomicUsize, members_resp: Mutex>, is_bootstrapped: AtomicBool, - stores: Mutex>, + stores: Mutex>, regions: Mutex>, leaders: Mutex>, feature_gate: Mutex, @@ -47,7 
+47,10 @@ impl Service { /// Add an arbitrary store. pub fn add_store(&self, store: Store) { let store_id = store.get_id(); - self.stores.lock().unwrap().insert(store_id, store); + self.stores + .lock() + .unwrap() + .insert(store_id, (store, StoreStats::new())); } pub fn set_cluster_version(&self, version: String) { @@ -107,7 +110,7 @@ impl PdMocker for Service { self.stores .lock() .unwrap() - .insert(store.get_id(), store.clone()); + .insert(store.get_id(), (store.clone(), StoreStats::new())); self.regions .lock() .unwrap() @@ -138,9 +141,10 @@ impl PdMocker for Service { let mut resp = GetStoreResponse::default(); let stores = self.stores.lock().unwrap(); match stores.get(&req.get_store_id()) { - Some(store) => { + Some((store, stats)) => { resp.set_header(Service::header()); resp.set_store(store.clone()); + resp.set_stats(stats.clone()); Some(Ok(resp)) } None => { @@ -160,7 +164,7 @@ impl PdMocker for Service { resp.set_header(Service::header()); let exclude_tombstone = req.get_exclude_tombstone_stores(); let stores = self.stores.lock().unwrap(); - for store in stores.values() { + for (store, _) in stores.values() { if exclude_tombstone && store.get_state() == StoreState::Tombstone { continue; } @@ -244,11 +248,22 @@ impl PdMocker for Service { Some(Ok(resp)) } - fn store_heartbeat(&self, _: &StoreHeartbeatRequest) -> Option> { + fn store_heartbeat( + &self, + req: &StoreHeartbeatRequest, + ) -> Option> { let mut resp = StoreHeartbeatResponse::default(); let header = Service::header(); resp.set_header(header); resp.set_cluster_version(self.feature_gate.lock().unwrap().to_owned()); + if let Some((_, stats)) = self + .stores + .lock() + .unwrap() + .get_mut(&req.get_stats().get_store_id()) + { + *stats = req.get_stats().clone(); + } Some(Ok(resp)) } diff --git a/components/test_pd/src/server.rs b/components/test_pd/src/server.rs index 79b095ef0d9..9e1a2b3bb0f 100644 --- a/components/test_pd/src/server.rs +++ b/components/test_pd/src/server.rs @@ -242,18 +242,19 
@@ impl Pd for PdMock { let header = Service::header(); let tso_logical = self.tso_logical.clone(); let fut = async move { - resp.send_all(&mut req.map_ok(move |r| { - let logical = - tso_logical.fetch_add(r.count as i64, Ordering::SeqCst) + r.count as i64; - let mut res = TsoResponse::default(); - res.set_header(header.clone()); - res.mut_timestamp().physical = 42; - res.mut_timestamp().logical = logical; - res.count = r.count; - (res, WriteFlags::default()) - })) - .await - .unwrap(); + // Tolerate errors like RpcFinished(None). + let _ = resp + .send_all(&mut req.map_ok(move |r| { + let logical = + tso_logical.fetch_add(r.count as i64, Ordering::SeqCst) + r.count as i64; + let mut res = TsoResponse::default(); + res.set_header(header.clone()); + res.mut_timestamp().physical = 42; + res.mut_timestamp().logical = logical; + res.count = r.count; + (res, WriteFlags::default()) + })) + .await; let _ = resp.close().await; }; ctx.spawn(fut); diff --git a/components/test_pd/src/util.rs b/components/test_pd/src/util.rs index 1b05196c346..b1a22b93c47 100644 --- a/components/test_pd/src/util.rs +++ b/components/test_pd/src/util.rs @@ -2,7 +2,7 @@ use std::sync::Arc; -use pd_client::{Config, RpcClient}; +use pd_client::{Config, RpcClient, RpcClientV2}; use security::{SecurityConfig, SecurityManager}; use tikv_util::config::ReadableDuration; @@ -23,6 +23,13 @@ pub fn new_client(eps: Vec<(String, u16)>, mgr: Option>) -> RpcClient::new(&cfg, None, mgr).unwrap() } +pub fn new_client_v2(eps: Vec<(String, u16)>, mgr: Option>) -> RpcClientV2 { + let cfg = new_config(eps); + let mgr = + mgr.unwrap_or_else(|| Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap())); + RpcClientV2::new(&cfg, None, mgr).unwrap() +} + pub fn new_client_with_update_interval( eps: Vec<(String, u16)>, mgr: Option>, @@ -34,3 +41,15 @@ pub fn new_client_with_update_interval( mgr.unwrap_or_else(|| Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap())); RpcClient::new(&cfg, None, 
mgr).unwrap() } + +pub fn new_client_v2_with_update_interval( + eps: Vec<(String, u16)>, + mgr: Option>, + interval: ReadableDuration, +) -> RpcClientV2 { + let mut cfg = new_config(eps); + cfg.update_interval = interval; + let mgr = + mgr.unwrap_or_else(|| Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap())); + RpcClientV2::new(&cfg, None, mgr).unwrap() +} diff --git a/components/test_pd_client/Cargo.toml b/components/test_pd_client/Cargo.toml index ad2b20de5a0..9f67752b4c5 100644 --- a/components/test_pd_client/Cargo.toml +++ b/components/test_pd_client/Cargo.toml @@ -10,7 +10,7 @@ fail = "0.5" futures = "0.3" grpcio = { workspace = true } keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } log_wrappers = { workspace = true } pd_client = { workspace = true } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } @@ -18,5 +18,5 @@ slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debu slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } -tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +tokio-timer = { workspace = true } txn_types = { workspace = true } diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index f23bc7e3b12..513d08643a7 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -215,13 +215,13 @@ impl Operator { } else { ConfChangeType::AddNode }; - new_pd_change_peer(conf_change_type, peer.clone()) + new_pd_change_peer_v2(vec![change_peer(conf_change_type, peer.clone())]) } else { pdpb::RegionHeartbeatResponse::default() } } Operator::RemovePeer { ref peer, .. 
} => { - new_pd_change_peer(ConfChangeType::RemoveNode, peer.clone()) + new_pd_change_peer_v2(vec![change_peer(ConfChangeType::RemoveNode, peer.clone())]) } Operator::TransferLeader { ref peer, diff --git a/components/test_raftstore/Cargo.toml b/components/test_raftstore/Cargo.toml index fb627dccb11..71c214ae21d 100644 --- a/components/test_raftstore/Cargo.toml +++ b/components/test_raftstore/Cargo.toml @@ -40,7 +40,7 @@ futures = "0.3" grpcio = { workspace = true } grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } pd_client = { workspace = true } @@ -61,5 +61,5 @@ test_util = { workspace = true } tikv = { workspace = true } tikv_util = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } -tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +tokio-timer = { workspace = true } txn_types = { workspace = true } diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index c4ac98180a6..f9088ff4e3b 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -1202,7 +1202,7 @@ impl Cluster { self.get_engine(store_id) .get_msg_cf::(engine_traits::CF_RAFT, &key) .unwrap() - .unwrap() + .unwrap_or_default() } pub fn get_raft_local_state(&self, region_id: u64, store_id: u64) -> Option { diff --git a/components/test_raftstore/src/common-test.toml b/components/test_raftstore/src/common-test.toml index 50e62f67d28..a121a6c1e0e 100644 --- a/components/test_raftstore/src/common-test.toml +++ b/components/test_raftstore/src/common-test.toml @@ -24,7 +24,8 @@ grpc-raft-conn-num = 1 # Disable stats concurrency. procinfo performs too bad without optimization, # disable it to save CPU for real tests. 
stats-concurrency = 0 -raft-client-backoff-step = "5ms" +raft-client-max-backoff = "100ms" +raft-client-initial-reconnect-backoff = "100ms" [server.labels] diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 5ae1b1a13a6..ea9868afdbd 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -33,7 +33,7 @@ use pd_client::PdClient; use raftstore::{ coprocessor::{CoprocessorHost, RegionInfoAccessor}, errors::Error as RaftError, - router::{LocalReadRouter, RaftStoreBlackHole, RaftStoreRouter, ServerRaftStoreRouter}, + router::{LocalReadRouter, RaftStoreRouter, ServerRaftStoreRouter}, store::{ fsm::{store::StoreMeta, ApplyRouter, RaftBatchSystem, RaftRouter}, msg::RaftCmdExtraOpts, @@ -64,7 +64,7 @@ use tikv::{ }, storage::{ self, - kv::SnapContext, + kv::{FakeExtension, SnapContext}, txn::flow_controller::{EngineFlowController, FlowController}, Engine, }, @@ -84,10 +84,11 @@ use super::*; use crate::Config; type SimulateStoreTransport = SimulateTransport>; -type SimulateServerTransport = - SimulateTransport>; pub type SimulateEngine = RaftKv; +type SimulateRaftExtension = ::RaftExtension; +type SimulateServerTransport = + SimulateTransport>; #[derive(Default, Clone)] pub struct AddressMap { @@ -125,13 +126,13 @@ impl StoreAddrResolver for AddressMap { struct ServerMeta { node: Node, - server: Server, + server: Server, sim_router: SimulateStoreTransport, sim_trans: SimulateServerTransport, raw_router: RaftRouter, raw_apply_router: ApplyRouter, - gc_worker: GcWorker, SimulateStoreTransport>, - rts_worker: Option>>, + gc_worker: GcWorker>, + rts_worker: Option>, rsmeter_cleanup: Box, } @@ -152,7 +153,7 @@ pub struct ServerCluster { snap_paths: HashMap, snap_mgrs: HashMap, pd_client: Arc, - raft_client: RaftClient, + raft_client: RaftClient, concurrency_managers: HashMap, env: Arc, pub causal_ts_providers: HashMap>, @@ -176,7 +177,7 @@ impl ServerCluster { Arc::default(), 
security_mgr.clone(), map.clone(), - RaftStoreBlackHole, + FakeExtension, worker.scheduler(), Arc::new(ThreadLoadPool::with_threshold(usize::MAX)), ); @@ -218,7 +219,7 @@ impl ServerCluster { pub fn get_gc_worker( &self, node_id: u64, - ) -> &GcWorker, SimulateStoreTransport> { + ) -> &GcWorker> { &self.metas.get(&node_id).unwrap().gc_worker } @@ -334,16 +335,12 @@ impl ServerCluster { let (tx, _rx) = std::sync::mpsc::channel(); let mut gc_worker = GcWorker::new( engine.clone(), - sim_router.clone(), tx, cfg.gc.clone(), Default::default(), Arc::new(region_info_accessor.clone()), ); gc_worker.start(node_id).unwrap(); - gc_worker - .start_observe_lock_apply(&mut coprocessor_host, concurrency_manager.clone()) - .unwrap(); let rts_worker = if cfg.resolved_ts.enable { // Resolved ts worker @@ -356,13 +353,12 @@ impl ServerCluster { let rts_endpoint = resolved_ts::Endpoint::new( &cfg.resolved_ts, rts_worker.scheduler(), - raft_router.clone(), + raft_router, store_meta.clone(), self.pd_client.clone(), concurrency_manager.clone(), self.env.clone(), self.security_mgr.clone(), - resolved_ts::DummySinker::new(), ); // Start the worker rts_worker.start(rts_endpoint); @@ -405,6 +401,7 @@ impl ServerCluster { cfg.quota.max_delay_duration, cfg.quota.enable_auto_tune, )); + let extension = engine.raft_extension().clone(); let store = create_raft_storage::<_, _, _, F, _>( engine, &cfg.storage, @@ -449,7 +446,7 @@ impl ServerCluster { // Create pd client, snapshot manager, server. 
let (resolver, state) = - resolve::new_resolver(Arc::clone(&self.pd_client), &bg_worker, router.clone()); + resolve::new_resolver(Arc::clone(&self.pd_client), &bg_worker, extension.clone()); let snap_mgr = SnapManagerBuilder::default() .max_write_bytes_per_sec(cfg.server.snap_max_write_bytes_per_sec.0 as i64) .max_total_size(cfg.server.snap_max_total_size.0) @@ -487,7 +484,7 @@ impl ServerCluster { let debug_service = DebugService::new( engines.clone(), debug_thread_handle, - raft_router, + extension, ConfigController::default(), ); @@ -524,7 +521,6 @@ impl ServerCluster { store.clone(), copr.clone(), copr_v2.clone(), - sim_router.clone(), resolver.clone(), snap_mgr.clone(), gc_worker.clone(), diff --git a/components/test_raftstore/src/transport_simulate.rs b/components/test_raftstore/src/transport_simulate.rs index 00c12073511..06ff550aa64 100644 --- a/components/test_raftstore/src/transport_simulate.rs +++ b/components/test_raftstore/src/transport_simulate.rs @@ -273,7 +273,7 @@ pub struct DefaultFilterFactory(PhantomData); impl FilterFactory for DefaultFilterFactory { fn generate(&self, _: u64) -> Vec> { - vec![Box::new(F::default())] + vec![Box::::default()] } } diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 3718dbce906..64bdca19025 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -301,7 +301,6 @@ pub fn new_transfer_leader_cmd(peer: metapb::Peer) -> AdminRequest { cmd } -#[allow(dead_code)] pub fn new_prepare_merge(target_region: metapb::Region) -> AdminRequest { let mut cmd = AdminRequest::default(); cmd.set_cmd_type(AdminCmdType::PrepareMerge); @@ -818,6 +817,41 @@ pub fn must_kv_read_equal(client: &TikvClient, ctx: Context, key: Vec, val: assert_eq!(get_resp.take_value(), val); } +pub fn write_and_read_key( + client: &TikvClient, + ctx: &Context, + ts: &mut u64, + k: Vec, + v: Vec, +) { + // Prewrite + let prewrite_start_version = *ts + 1; + let mut mutation = 
Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(k.clone()); + mutation.set_value(v.clone()); + must_kv_prewrite( + client, + ctx.clone(), + vec![mutation], + k.clone(), + prewrite_start_version, + ); + // Commit + let commit_version = *ts + 2; + must_kv_commit( + client, + ctx.clone(), + vec![k.clone()], + prewrite_start_version, + commit_version, + commit_version, + ); + // Get + *ts += 3; + must_kv_read_equal(client, ctx.clone(), k, v, *ts); +} + pub fn kv_read(client: &TikvClient, ctx: Context, key: Vec, ts: u64) -> GetResponse { let mut get_req = GetRequest::default(); get_req.set_context(ctx); @@ -988,6 +1022,39 @@ pub fn kv_pessimistic_lock( kv_pessimistic_lock_with_ttl(client, ctx, keys, ts, for_update_ts, return_values, 20) } +pub fn kv_pessimistic_lock_resumable( + client: &TikvClient, + ctx: Context, + keys: Vec>, + ts: u64, + for_update_ts: u64, + wait_timeout: Option, + return_values: bool, + check_existence: bool, +) -> PessimisticLockResponse { + let mut req = PessimisticLockRequest::default(); + req.set_context(ctx); + let primary = keys[0].clone(); + let mut mutations = vec![]; + for key in keys { + let mut mutation = Mutation::default(); + mutation.set_op(Op::PessimisticLock); + mutation.set_key(key); + mutations.push(mutation); + } + req.set_mutations(mutations.into()); + req.primary_lock = primary; + req.start_version = ts; + req.for_update_ts = for_update_ts; + req.lock_ttl = 20; + req.is_first_lock = false; + req.wait_timeout = wait_timeout.unwrap_or(-1); + req.set_wake_up_mode(PessimisticLockWakeUpMode::WakeUpModeForceLock); + req.return_values = return_values; + req.check_existence = check_existence; + client.kv_pessimistic_lock(&req).unwrap() +} + pub fn kv_pessimistic_lock_with_ttl( client: &TikvClient, ctx: Context, @@ -1023,12 +1090,18 @@ pub fn must_kv_pessimistic_lock(client: &TikvClient, ctx: Context, key: Vec, assert!(resp.errors.is_empty(), "{:?}", resp.get_errors()); } -pub fn must_kv_pessimistic_rollback(client: 
&TikvClient, ctx: Context, key: Vec, ts: u64) { +pub fn must_kv_pessimistic_rollback( + client: &TikvClient, + ctx: Context, + key: Vec, + ts: u64, + for_update_ts: u64, +) { let mut req = PessimisticRollbackRequest::default(); req.set_context(ctx); req.set_keys(vec![key].into_iter().collect()); req.start_version = ts; - req.for_update_ts = ts; + req.for_update_ts = for_update_ts; let resp = client.kv_pessimistic_rollback(&req).unwrap(); assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); assert!(resp.errors.is_empty(), "{:?}", resp.get_errors()); @@ -1055,57 +1128,6 @@ pub fn must_check_txn_status( resp } -pub fn must_physical_scan_lock( - client: &TikvClient, - ctx: Context, - max_ts: u64, - start_key: &[u8], - limit: usize, -) -> Vec { - let mut req = PhysicalScanLockRequest::default(); - req.set_context(ctx); - req.set_max_ts(max_ts); - req.set_start_key(start_key.to_owned()); - req.set_limit(limit as _); - let mut resp = client.physical_scan_lock(&req).unwrap(); - resp.take_locks().into() -} - -pub fn register_lock_observer(client: &TikvClient, max_ts: u64) -> RegisterLockObserverResponse { - let mut req = RegisterLockObserverRequest::default(); - req.set_max_ts(max_ts); - client.register_lock_observer(&req).unwrap() -} - -pub fn must_register_lock_observer(client: &TikvClient, max_ts: u64) { - let resp = register_lock_observer(client, max_ts); - assert!(resp.get_error().is_empty(), "{:?}", resp.get_error()); -} - -pub fn check_lock_observer(client: &TikvClient, max_ts: u64) -> CheckLockObserverResponse { - let mut req = CheckLockObserverRequest::default(); - req.set_max_ts(max_ts); - client.check_lock_observer(&req).unwrap() -} - -pub fn must_check_lock_observer(client: &TikvClient, max_ts: u64, clean: bool) -> Vec { - let mut resp = check_lock_observer(client, max_ts); - assert!(resp.get_error().is_empty(), "{:?}", resp.get_error()); - assert_eq!(resp.get_is_clean(), clean); - resp.take_locks().into() -} - -pub fn remove_lock_observer(client: 
&TikvClient, max_ts: u64) -> RemoveLockObserverResponse { - let mut req = RemoveLockObserverRequest::default(); - req.set_max_ts(max_ts); - client.remove_lock_observer(&req).unwrap() -} - -pub fn must_remove_lock_observer(client: &TikvClient, max_ts: u64) { - let resp = remove_lock_observer(client, max_ts); - assert!(resp.get_error().is_empty(), "{:?}", resp.get_error()); -} - pub fn get_tso(pd_client: &TestPdClient) -> u64 { block_on(pd_client.get_tso()).unwrap().into_inner() } @@ -1224,9 +1246,13 @@ pub fn must_flashback_to_version( version: u64, start_ts: u64, commit_ts: u64, -) -> FlashbackToVersionResponse { +) { let mut prepare_req = PrepareFlashbackToVersionRequest::default(); prepare_req.set_context(ctx.clone()); + prepare_req.set_start_ts(start_ts); + prepare_req.set_version(version); + prepare_req.set_start_key(b"a".to_vec()); + prepare_req.set_end_key(b"z".to_vec()); client .kv_prepare_flashback_to_version(&prepare_req) .unwrap(); @@ -1234,10 +1260,12 @@ pub fn must_flashback_to_version( req.set_context(ctx); req.set_start_ts(start_ts); req.set_commit_ts(commit_ts); - req.version = version; - req.start_key = b"a".to_vec(); - req.end_key = b"z".to_vec(); - client.kv_flashback_to_version(&req).unwrap() + req.set_version(version); + req.set_start_key(b"a".to_vec()); + req.set_end_key(b"z".to_vec()); + let resp = client.kv_flashback_to_version(&req).unwrap(); + assert!(!resp.has_region_error()); + assert!(resp.get_error().is_empty()); } // A helpful wrapper to make the test logic clear @@ -1321,7 +1349,7 @@ impl PeerClient { } pub fn must_kv_pessimistic_rollback(&self, key: Vec, ts: u64) { - must_kv_pessimistic_rollback(&self.cli, self.ctx.clone(), key, ts) + must_kv_pessimistic_rollback(&self.cli, self.ctx.clone(), key, ts, ts) } } diff --git a/components/test_sst_importer/Cargo.toml b/components/test_sst_importer/Cargo.toml index b0c3e96ef5a..f951a6755e6 100644 --- a/components/test_sst_importer/Cargo.toml +++ b/components/test_sst_importer/Cargo.toml @@ 
-13,5 +13,5 @@ crc32fast = "1.2" engine_rocks = { workspace = true } engine_traits = { workspace = true } keys = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } uuid = { version = "0.8.1", features = ["serde", "v4"] } diff --git a/components/test_storage/Cargo.toml b/components/test_storage/Cargo.toml index 04adc4e6de4..b1172b5d559 100644 --- a/components/test_storage/Cargo.toml +++ b/components/test_storage/Cargo.toml @@ -24,7 +24,7 @@ test-engines-panic = [ api_version = { workspace = true } collections = { workspace = true } futures = "0.3" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } pd_client = { workspace = true } raftstore = { workspace = true } test_raftstore = { workspace = true } diff --git a/components/test_storage/src/sync_storage.rs b/components/test_storage/src/sync_storage.rs index fa53688ea75..3d6e1e139e5 100644 --- a/components/test_storage/src/sync_storage.rs +++ b/components/test_storage/src/sync_storage.rs @@ -12,10 +12,7 @@ use kvproto::{ kvrpcpb::{ChecksumAlgorithm, Context, GetRequest, KeyRange, LockInfo, RawGetRequest}, metapb, }; -use raftstore::{ - coprocessor::{region_info_accessor::MockRegionInfoProvider, RegionInfoProvider}, - router::RaftStoreBlackHole, -}; +use raftstore::coprocessor::{region_info_accessor::MockRegionInfoProvider, RegionInfoProvider}; use tikv::{ server::gc_worker::{AutoGcConfig, GcConfig, GcSafePointProvider, GcWorker}, storage::{ @@ -106,7 +103,7 @@ impl SyncTestStorageBuilder { /// Only used for test purpose. 
#[derive(Clone)] pub struct SyncTestStorage { - gc_worker: GcWorker, + gc_worker: GcWorker, store: Storage, } @@ -123,7 +120,6 @@ impl SyncTestStorage { let (tx, _rx) = std::sync::mpsc::channel(); let mut gc_worker = GcWorker::new( storage.get_engine(), - RaftStoreBlackHole, tx, config, Default::default(), diff --git a/components/test_util/Cargo.toml b/components/test_util/Cargo.toml index 8aca28b092b..740132353f3 100644 --- a/components/test_util/Cargo.toml +++ b/components/test_util/Cargo.toml @@ -16,7 +16,7 @@ collections = { workspace = true } encryption_export = { workspace = true } fail = "0.5" grpcio = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } rand = "0.8" rand_isaac = "0.3" security = { workspace = true } diff --git a/components/test_util/src/runner.rs b/components/test_util/src/runner.rs index d05f7e98879..ee2b6548c23 100644 --- a/components/test_util/src/runner.rs +++ b/components/test_util/src/runner.rs @@ -61,11 +61,11 @@ pub fn run_test_with_hook(cases: &[&TestDescAndFn], hook: impl TestHook + Send + let f = match case.testfn { TestFn::StaticTestFn(f) => TestFn::DynTestFn(Box::new(move || { let _watcher = CaseLifeWatcher::new(name.clone(), hook.clone()); - f(); + f() })), TestFn::StaticBenchFn(f) => TestFn::DynBenchFn(Box::new(move |b| { let _watcher = CaseLifeWatcher::new(name.clone(), hook.clone()); - f(b); + f(b) })), ref f => panic!("unexpected testfn {:?}", f), }; diff --git a/components/tidb_query_aggr/Cargo.toml b/components/tidb_query_aggr/Cargo.toml index db8d9d64faf..facc9d32f36 100644 --- a/components/tidb_query_aggr/Cargo.toml +++ b/components/tidb_query_aggr/Cargo.toml @@ -12,7 +12,7 @@ tidb_query_common = { workspace = true } tidb_query_datatype = { workspace = true } tidb_query_expr = { workspace = true } tikv_util = { workspace = true } -tipb = { git = "https://github.com/pingcap/tipb.git" } +tipb = { workspace = true } [dev-dependencies] panic_hook = { workspace = 
true } diff --git a/components/tidb_query_aggr/src/impl_max_min.rs b/components/tidb_query_aggr/src/impl_max_min.rs index f4046c35440..c18710b3645 100644 --- a/components/tidb_query_aggr/src/impl_max_min.rs +++ b/components/tidb_query_aggr/src/impl_max_min.rs @@ -514,10 +514,10 @@ where self.extremum = value.copied() } } else { - let v1 = self.extremum.map(|x| x as i64); - let v2 = value.map(|x| *x as i64); + let v1: Option = self.extremum; + let v2: Option = value.copied(); if v1.cmp(&v2) == E::ORD { - self.extremum = value.copied() + self.extremum = v2; } } } diff --git a/components/tidb_query_codegen/src/rpn_function.rs b/components/tidb_query_codegen/src/rpn_function.rs index 864fce9afd8..dfdede3a3b3 100644 --- a/components/tidb_query_codegen/src/rpn_function.rs +++ b/components/tidb_query_codegen/src/rpn_function.rs @@ -385,7 +385,7 @@ impl parse::Parse for RpnFnAttr { )); } - if !is_varg && !is_raw_varg && (min_args != None || max_args != None) { + if !is_varg && !is_raw_varg && (min_args.is_some() || max_args.is_some()) { return Err(Error::new_spanned( config_items, "`min_args` or `max_args` is only available when `varg` or `raw_varg` presents", diff --git a/components/tidb_query_common/Cargo.toml b/components/tidb_query_common/Cargo.toml index 05133b130e7..3dd1693ba0d 100644 --- a/components/tidb_query_common/Cargo.toml +++ b/components/tidb_query_common/Cargo.toml @@ -11,7 +11,7 @@ async-trait = "0.1" derive_more = "0.99.3" error_code = { workspace = true } futures = "0.3" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } @@ -20,7 +20,7 @@ serde_json = "1.0" thiserror = "1.0" tikv_util = { workspace = true } time = "0.1" -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [dev-dependencies] byteorder = "1.2" diff --git 
a/components/tidb_query_datatype/Cargo.toml b/components/tidb_query_datatype/Cargo.toml index de8f0b41110..e9d96e16284 100644 --- a/components/tidb_query_datatype/Cargo.toml +++ b/components/tidb_query_datatype/Cargo.toml @@ -18,11 +18,11 @@ collections = { workspace = true } encoding_rs = { git = "https://github.com/xiongjiwei/encoding_rs.git", rev = "68e0bc5a72a37a78228d80cd98047326559cf43c" } error_code = { workspace = true } hex = "0.4" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" log_wrappers = { workspace = true } match-template = "0.0.1" -nom = { version = "5.1.0", default-features = false, features = ["std"] } +nom = { version = "7.1.0", default-features = false, features = ["std"] } num = { version = "0.3", default-features = false } num-derive = "0.3" num-traits = "0.2" @@ -38,4 +38,4 @@ thiserror = "1.0" tidb_query_common = { workspace = true } tikv_alloc = { workspace = true } tikv_util = { workspace = true } -tipb = { git = "https://github.com/pingcap/tipb.git" } +tipb = { workspace = true } diff --git a/components/tidb_query_datatype/src/codec/collation/charset.rs b/components/tidb_query_datatype/src/codec/collation/charset.rs index 482e19cb999..9ea76f16b92 100644 --- a/components/tidb_query_datatype/src/codec/collation/charset.rs +++ b/components/tidb_query_datatype/src/codec/collation/charset.rs @@ -22,6 +22,10 @@ impl Charset for CharsetBinary { Some((data[0], 1)) } } + + fn charset() -> crate::Charset { + crate::Charset::Binary + } } pub struct CharsetUtf8mb4; @@ -48,6 +52,10 @@ impl Charset for CharsetUtf8mb4 { }) } } + + fn charset() -> crate::Charset { + crate::Charset::Utf8Mb4 + } } // gbk character data actually stored with utf8mb4 character encoding. 
diff --git a/components/tidb_query_datatype/src/codec/collation/mod.rs b/components/tidb_query_datatype/src/codec/collation/mod.rs index cdc21cbe35a..9fbef4f1ee2 100644 --- a/components/tidb_query_datatype/src/codec/collation/mod.rs +++ b/components/tidb_query_datatype/src/codec/collation/mod.rs @@ -41,6 +41,32 @@ macro_rules! match_template_collator { }} } +#[macro_export] +macro_rules! match_template_multiple_collators { + ((), (), $($tail:tt)*) => { + $($tail)* + }; + (($first:tt), ($match_exprs:tt), $($tail:tt)*) => { + match_template_multiple_collators! { + ($first,), ($match_exprs,), $($tail)* + } + }; + (($first:tt, $($t:tt)*), ($first_match_expr:tt, $($match_exprs:tt)*), $($tail:tt)*) => {{ + #[allow(unused_imports)] + use $crate::codec::collation::collator::*; + + match_template_collator! { + $first, match $first_match_expr { + Collation::$first => { + match_template_multiple_collators! { + ($($t)*), ($($match_exprs)*), $($tail)* + } + } + } + } + }}; +} + #[macro_export] macro_rules! match_template_charset { ($t:tt, $($tail:tt)*) => {{ @@ -67,6 +93,8 @@ pub trait Charset { fn validate(bstr: &[u8]) -> Result<()>; fn decode_one(data: &[u8]) -> Option<(Self::Char, usize)>; + + fn charset() -> crate::Charset; } pub trait Collator: 'static + std::marker::Send + std::marker::Sync + std::fmt::Debug { diff --git a/components/tidb_query_datatype/src/codec/convert.rs b/components/tidb_query_datatype/src/codec/convert.rs index 26ae799c4ff..418841547ca 100644 --- a/components/tidb_query_datatype/src/codec/convert.rs +++ b/components/tidb_query_datatype/src/codec/convert.rs @@ -186,7 +186,7 @@ pub fn integer_signed_lower_bound(tp: FieldTypeTp) -> i64 { /// `truncate_binary` truncates a buffer to the specified length. 
#[inline] pub fn truncate_binary(s: &mut Vec, flen: isize) { - if flen != crate::UNSPECIFIED_LENGTH as isize && s.len() > flen as usize { + if flen != crate::UNSPECIFIED_LENGTH && s.len() > flen as usize { s.truncate(flen as usize); } } @@ -431,7 +431,7 @@ impl ToInt for Decimal { fn to_int(&self, ctx: &mut EvalContext, tp: FieldTypeTp) -> Result { let dec = round_decimal_with_ctx(ctx, *self)?; let val = dec.as_i64(); - let err = Error::truncated_wrong_val("DECIMAL", &dec); + let err = Error::truncated_wrong_val("DECIMAL", dec); let r = val.into_result_with_overflow_err(ctx, err)?; r.to_int(ctx, tp) } @@ -440,7 +440,7 @@ impl ToInt for Decimal { fn to_uint(&self, ctx: &mut EvalContext, tp: FieldTypeTp) -> Result { let dec = round_decimal_with_ctx(ctx, *self)?; let val = dec.as_u64(); - let err = Error::truncated_wrong_val("DECIMAL", &dec); + let err = Error::truncated_wrong_val("DECIMAL", dec); let r = val.into_result_with_overflow_err(ctx, err)?; r.to_uint(ctx, tp) } @@ -639,7 +639,7 @@ pub fn produce_dec_with_specified_tp( // select (cast 111 as decimal(1)) causes a warning in MySQL. 
ctx.handle_overflow_err(Error::overflow( "Decimal", - &format!("({}, {})", flen, decimal), + format!("({}, {})", flen, decimal), ))?; dec = max_or_min_dec(dec.is_negative(), flen as u8, decimal as u8) } else if frac != decimal { @@ -648,7 +648,7 @@ pub fn produce_dec_with_specified_tp( .round(decimal as i8, RoundMode::HalfEven) .into_result_with_overflow_err( ctx, - Error::overflow("Decimal", &format!("({}, {})", flen, decimal)), + Error::overflow("Decimal", format!("({}, {})", flen, decimal)), )?; if !rounded.is_zero() && frac > decimal && rounded != old { if ctx.cfg.flag.contains(Flag::IN_INSERT_STMT) @@ -811,7 +811,7 @@ impl ConvertTo for &[u8] { .map_err(|err| -> Error { box_err!("Parse '{}' to float err: {:?}", vs, err) })?; // The `parse` will return Ok(inf) if the float string literal out of range if val.is_infinite() { - ctx.handle_truncate_err(Error::truncated_wrong_val("DOUBLE", &vs))?; + ctx.handle_truncate_err(Error::truncated_wrong_val("DOUBLE", vs))?; if val.is_sign_negative() { return Ok(f64::MIN); } else { @@ -1036,7 +1036,7 @@ fn exp_float_str_to_int_str<'a>( // And the intCnt may contain the len of `+/-`, // so here we use 21 here as the early detection. 
ctx.warnings - .append_warning(Error::overflow("BIGINT", &valid_float)); + .append_warning(Error::overflow("BIGINT", valid_float)); return Cow::Borrowed(valid_float); } if int_cnt <= 0 { diff --git a/components/tidb_query_datatype/src/codec/mysql/decimal.rs b/components/tidb_query_datatype/src/codec/mysql/decimal.rs index 135a3cd2ce7..143ec6c7760 100644 --- a/components/tidb_query_datatype/src/codec/mysql/decimal.rs +++ b/components/tidb_query_datatype/src/codec/mysql/decimal.rs @@ -373,11 +373,11 @@ fn do_sub<'a>(mut lhs: &'a Decimal, mut rhs: &'a Decimal) -> Res { } let mut carry = 0; let mut res = res.map(|_| Decimal::new(int_cnt, frac_cnt, negative)); - let mut l_idx = l_start + l_int_word_cnt as usize + l_frac_word_cnt as usize; - let mut r_idx = r_start + r_int_word_cnt as usize + r_frac_word_cnt as usize; + let mut l_idx = l_start + l_int_word_cnt + l_frac_word_cnt as usize; + let mut r_idx = r_start + r_int_word_cnt + r_frac_word_cnt as usize; // adjust `l_idx` and `r_idx` to the same position of digits after the point. if l_frac_word_cnt > r_frac_word_cnt { - let l_stop = l_start + l_int_word_cnt as usize + r_frac_word_cnt as usize; + let l_stop = l_start + l_int_word_cnt + r_frac_word_cnt as usize; if l_frac_word_cnt < frac_word_to { // It happens only when suffix 0 exist(3.10000000000-2.00). idx_to -= (frac_word_to - l_frac_word_cnt) as usize; @@ -388,7 +388,7 @@ fn do_sub<'a>(mut lhs: &'a Decimal, mut rhs: &'a Decimal) -> Res { res.word_buf[idx_to] = lhs.word_buf[l_idx]; } } else { - let r_stop = r_start + r_int_word_cnt as usize + l_frac_word_cnt as usize; + let r_stop = r_start + r_int_word_cnt + l_frac_word_cnt as usize; if frac_word_to > r_frac_word_cnt { // It happens only when suffix 0 exist(3.00-2.00000000000). 
idx_to -= (frac_word_to - r_frac_word_cnt) as usize; @@ -802,7 +802,7 @@ fn do_mul(lhs: &Decimal, rhs: &Decimal) -> Res { word_cnt!(lhs.int_cnt + rhs.int_cnt) as usize, l_frac_word_cnt + r_frac_word_cnt, ); - let (mut old_int_word_to, mut old_frac_word_to) = (int_word_to as i32, frac_word_to as i32); + let (mut old_int_word_to, mut old_frac_word_to) = (int_word_to as i32, frac_word_to); let res = fix_word_cnt_err(int_word_to as u8, frac_word_to as u8, WORD_BUF_LEN); let (int_word_to, frac_word_to) = (res.0 as usize, res.1 as usize); let negative = lhs.negative != rhs.negative; @@ -1623,7 +1623,7 @@ impl Decimal { let mut inner_idx = 0; let mut word_idx = int_word_cnt as usize; let mut word = 0; - for c in bs[int_idx - int_cnt as usize..int_idx].iter().rev() { + for c in bs[int_idx - int_cnt..int_idx].iter().rev() { word += u32::from(c - b'0') * TEN_POW[inner_idx]; inner_idx += 1; if inner_idx == DIGITS_PER_WORD as usize { @@ -1642,7 +1642,7 @@ impl Decimal { word_idx = int_word_cnt as usize; word = 0; inner_idx = 0; - for &c in bs.iter().skip(int_idx + 1).take(frac_cnt as usize) { + for &c in bs.iter().skip(int_idx + 1).take(frac_cnt) { word = u32::from(c - b'0') + word * 10; inner_idx += 1; if inner_idx == DIGITS_PER_WORD as usize { @@ -2389,7 +2389,7 @@ impl Hash for Decimal { while idx < stop && self.word_buf[idx] == 0 { idx += 1; } - let start = idx as usize; + let start = idx; let int_word_cnt = stop - idx; int_word_cnt.hash(state); diff --git a/components/tidb_query_datatype/src/codec/mysql/duration.rs b/components/tidb_query_datatype/src/codec/mysql/duration.rs index 520c985f4b5..7279f788146 100644 --- a/components/tidb_query_datatype/src/codec/mysql/duration.rs +++ b/components/tidb_query_datatype/src/codec/mysql/duration.rs @@ -1070,7 +1070,7 @@ mod tests { #[test] fn test_checked_add_and_sub_duration() { /// `MAX_TIME_IN_SECS` is the maximum for mysql time type. 
- const MAX_TIME_IN_SECS: i64 = MAX_HOUR_PART as i64 * SECS_PER_HOUR as i64 + const MAX_TIME_IN_SECS: i64 = MAX_HOUR_PART as i64 * SECS_PER_HOUR + MAX_MINUTE_PART as i64 * SECS_PER_MINUTE + MAX_SECOND_PART as i64; @@ -1110,7 +1110,7 @@ mod tests { // UNSPECIFIED_FSP ( 8385959, - UNSPECIFIED_FSP as i8, + UNSPECIFIED_FSP, Ok(Duration::parse(&mut EvalContext::default(), "838:59:59", 0).unwrap()), false, ), diff --git a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs index daeae751fb5..c965247b8da 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/binary.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/binary.rs @@ -5,9 +5,29 @@ use std::convert::TryInto; use codec::number::NumberCodec; use super::{constants::*, JsonRef, JsonType, ERR_CONVERT_FAILED}; -use crate::codec::Result; +use crate::codec::{mysql::json::path_expr::ArrayIndex, Result}; impl<'a> JsonRef<'a> { + /// Gets the index from the ArrayIndex + /// + /// If the idx is greater than the count and is from right, it will return + /// `None` + /// + /// See `jsonPathArrayIndex.getIndexFromStart()` in TiDB + /// `types/json_path_expr.go` + pub fn array_get_index(&self, idx: ArrayIndex) -> Option { + match idx { + ArrayIndex::Left(idx) => Some(idx as usize), + ArrayIndex::Right(idx) => { + if self.get_elem_count() < 1 + (idx as usize) { + None + } else { + Some(self.get_elem_count() - 1 - (idx as usize)) + } + } + } + } + /// Gets the ith element in JsonRef /// /// See `arrayGetElem()` in TiDB `json/binary.go` @@ -62,7 +82,7 @@ impl<'a> JsonRef<'a> { pub fn val_entry_get(&self, val_entry_off: usize) -> Result> { let val_type: JsonType = self.value()[val_entry_off].try_into()?; let val_offset = - NumberCodec::decode_u32_le(&self.value()[val_entry_off + TYPE_LEN as usize..]) as usize; + NumberCodec::decode_u32_le(&self.value()[val_entry_off + TYPE_LEN..]) as usize; Ok(match val_type { JsonType::Literal => { let 
offset = val_entry_off + TYPE_LEN; diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs index d40451fc9b5..7e619e74c32 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_extract.rs @@ -4,9 +4,10 @@ use collections::HashSet; use super::{ super::Result, - path_expr::{PathExpression, PathLeg, PATH_EXPR_ARRAY_INDEX_ASTERISK, PATH_EXPR_ASTERISK}, + path_expr::{PathExpression, PathLeg}, Json, JsonRef, JsonType, }; +use crate::codec::mysql::json::path_expr::{ArrayIndex, ArraySelection, KeySelection}; impl<'a> JsonRef<'a> { /// `extract` receives several path expressions as arguments, matches them @@ -21,8 +22,11 @@ impl<'a> JsonRef<'a> { let mut elem_list = Vec::with_capacity(path_expr_list.len()); for path_expr in path_expr_list { could_return_multiple_matches |= path_expr.contains_any_asterisk(); + could_return_multiple_matches |= path_expr.contains_any_range(); + elem_list.append(&mut extract_json(*self, &path_expr.legs)?) 
} + if elem_list.is_empty() { Ok(None) } else if could_return_multiple_matches { @@ -79,43 +83,86 @@ pub fn extract_json<'a>(j: JsonRef<'a>, path_legs: &[PathLeg]) -> Result match j.get_type() { + match current_leg { + PathLeg::ArraySelection(selection) => match j.get_type() { JsonType::Array => { let elem_count = j.get_elem_count(); - if i == PATH_EXPR_ARRAY_INDEX_ASTERISK { - for k in 0..elem_count { - append_if_ref_unique( - &mut ret, - &extract_json(j.array_get_elem(k)?, sub_path_legs)?, - ) + match selection { + ArraySelection::Asterisk => { + for k in 0..elem_count { + append_if_ref_unique( + &mut ret, + &extract_json(j.array_get_elem(k)?, sub_path_legs)?, + ) + } + } + ArraySelection::Index(index) => { + if let Some(index) = j.array_get_index(*index) { + if index < elem_count { + append_if_ref_unique( + &mut ret, + &extract_json(j.array_get_elem(index)?, sub_path_legs)?, + ) + } + } + } + ArraySelection::Range(start, end) => { + if let (Some(start), Some(mut end)) = + (j.array_get_index(*start), j.array_get_index(*end)) + { + if end >= elem_count { + end = elem_count - 1 + } + if start <= end { + for i in start..=end { + append_if_ref_unique( + &mut ret, + &extract_json(j.array_get_elem(i)?, sub_path_legs)?, + ) + } + } + } } - } else if (i as usize) < elem_count { - append_if_ref_unique( - &mut ret, - &extract_json(j.array_get_elem(i as usize)?, sub_path_legs)?, - ) } } _ => { - if i as usize == 0 { - append_if_ref_unique(&mut ret, &extract_json(j, sub_path_legs)?) + // If the current object is not an array, still append them if the selection + // includes 0. But for asterisk, it still returns NULL. + // + // as the element is not array, don't use `array_get_index` + match selection { + ArraySelection::Index(ArrayIndex::Left(0)) => { + append_if_ref_unique(&mut ret, &extract_json(j, sub_path_legs)?) 
+ } + ArraySelection::Range( + ArrayIndex::Left(0), + ArrayIndex::Right(0) | ArrayIndex::Left(_), + ) => { + // for [0 to Non-negative Number] and [0 to last], it extracts itself + append_if_ref_unique(&mut ret, &extract_json(j, sub_path_legs)?) + } + _ => {} } } }, - PathLeg::Key(ref key) => { + PathLeg::Key(key) => { if j.get_type() == JsonType::Object { - if key == PATH_EXPR_ASTERISK { - let elem_count = j.get_elem_count(); - for i in 0..elem_count { - append_if_ref_unique( - &mut ret, - &extract_json(j.object_get_val(i)?, sub_path_legs)?, - ) + match key { + KeySelection::Asterisk => { + let elem_count = j.get_elem_count(); + for i in 0..elem_count { + append_if_ref_unique( + &mut ret, + &extract_json(j.object_get_val(i)?, sub_path_legs)?, + ) + } + } + KeySelection::Key(key) => { + if let Some(idx) = j.object_search_key(key.as_bytes()) { + let val = j.object_get_val(idx)?; + append_if_ref_unique(&mut ret, &extract_json(val, sub_path_legs)?) + } } - } else if let Some(idx) = j.object_search_key(key.as_bytes()) { - let val = j.object_get_val(idx)?; - append_if_ref_unique(&mut ret, &extract_json(val, sub_path_legs)?) 
} } } @@ -154,10 +201,15 @@ mod tests { use super::{ super::path_expr::{ PathExpressionFlag, PATH_EXPRESSION_CONTAINS_ASTERISK, - PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, PATH_EXPR_ARRAY_INDEX_ASTERISK, + PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }, *, }; + use crate::codec::mysql::json::path_expr::{ArrayIndex, PATH_EXPRESSION_CONTAINS_RANGE}; + + fn select_from_left(index: usize) -> PathLeg { + PathLeg::ArraySelection(ArraySelection::Index(ArrayIndex::Left(index as u32))) + } #[test] fn test_json_extract() { @@ -168,7 +220,7 @@ mod tests { ( "[true, 2017]", vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some("true"), @@ -176,7 +228,7 @@ mod tests { ( "[true, 2017]", vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: PATH_EXPRESSION_CONTAINS_ASTERISK, }], Some("[true, 2017]"), @@ -184,7 +236,7 @@ mod tests { ( "[true, 2107]", vec![PathExpression { - legs: vec![PathLeg::Index(2)], + legs: vec![select_from_left(2)], flags: PathExpressionFlag::default(), }], None, @@ -192,7 +244,7 @@ mod tests { ( "6.18", vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some("6.18"), @@ -200,7 +252,7 @@ mod tests { ( "6.18", vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: PathExpressionFlag::default(), }], None, @@ -208,7 +260,7 @@ mod tests { ( "true", vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some("true"), @@ -216,7 +268,7 @@ mod tests { ( "true", vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: 
PathExpressionFlag::default(), }], None, @@ -224,7 +276,7 @@ mod tests { ( "6", vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some("6"), @@ -232,7 +284,7 @@ mod tests { ( "6", vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: PathExpressionFlag::default(), }], None, @@ -240,7 +292,7 @@ mod tests { ( "-6", vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some("-6"), @@ -248,7 +300,7 @@ mod tests { ( "-6", vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: PathExpressionFlag::default(), }], None, @@ -256,7 +308,7 @@ mod tests { ( r#"{"a": [1, 2, {"aa": "xx"}]}"#, vec![PathExpression { - legs: vec![PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Asterisk)], flags: PathExpressionFlag::default(), }], None, @@ -264,7 +316,7 @@ mod tests { ( r#"{"a": [1, 2, {"aa": "xx"}]}"#, vec![PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![select_from_left(0)], flags: PathExpressionFlag::default(), }], Some(r#"{"a": [1, 2, {"aa": "xx"}]}"#), @@ -273,7 +325,7 @@ mod tests { ( r#"{"a": "a1", "b": 20.08, "c": false}"#, vec![PathExpression { - legs: vec![PathLeg::Key(String::from("c"))], + legs: vec![PathLeg::Key(KeySelection::Key(String::from("c")))], flags: PathExpressionFlag::default(), }], Some("false"), @@ -281,7 +333,7 @@ mod tests { ( r#"{"a": "a1", "b": 20.08, "c": false}"#, vec![PathExpression { - legs: vec![PathLeg::Key(String::from(PATH_EXPR_ASTERISK))], + legs: vec![PathLeg::Key(KeySelection::Asterisk)], flags: PATH_EXPRESSION_CONTAINS_ASTERISK, }], Some(r#"["a1", 20.08, false]"#), @@ -289,7 +341,7 @@ mod tests { ( r#"{"a": "a1", "b": 
20.08, "c": false}"#, vec![PathExpression { - legs: vec![PathLeg::Key(String::from("d"))], + legs: vec![PathLeg::Key(KeySelection::Key(String::from("d")))], flags: PathExpressionFlag::default(), }], None, @@ -298,7 +350,10 @@ mod tests { ( "21", vec![PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("c"))], + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(KeySelection::Key(String::from("c"))), + ], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], None, @@ -306,7 +361,10 @@ mod tests { ( r#"{"g": {"a": "a1", "b": 20.08, "c": false}}"#, vec![PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("c"))], + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(KeySelection::Key(String::from("c"))), + ], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], Some("[false]"), @@ -314,7 +372,10 @@ mod tests { ( r#"[{"a": "a1", "b": 20.08, "c": false}, true]"#, vec![PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("c"))], + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(KeySelection::Key(String::from("c"))), + ], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], Some("[false]"), @@ -322,7 +383,7 @@ mod tests { ( r#"[[0, 1], [2, 3], [4, [5, 6]]]"#, vec![PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Index(0)], + legs: vec![PathLeg::DoubleAsterisk, select_from_left(0)], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], Some("[[0, 1], 0, 1, 2, 3, 4, 5, 6]"), @@ -331,11 +392,11 @@ mod tests { r#"[[0, 1], [2, 3], [4, [5, 6]]]"#, vec![ PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Index(0)], + legs: vec![PathLeg::DoubleAsterisk, select_from_left(0)], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }, PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Index(0)], + legs: vec![PathLeg::DoubleAsterisk, select_from_left(0)], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }, ], @@ -344,7 +405,7 @@ mod tests { ( "[1]", 
vec![PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Index(0)], + legs: vec![PathLeg::DoubleAsterisk, select_from_left(0)], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], Some("[1]"), @@ -352,7 +413,10 @@ mod tests { ( r#"{"a": 1}"#, vec![PathExpression { - legs: vec![PathLeg::Key(String::from("a")), PathLeg::Index(0)], + legs: vec![ + PathLeg::Key(KeySelection::Key(String::from("a"))), + select_from_left(0), + ], flags: PathExpressionFlag::default(), }], Some("1"), @@ -360,7 +424,7 @@ mod tests { ( r#"{"a": 1}"#, vec![PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Index(0)], + legs: vec![PathLeg::DoubleAsterisk, select_from_left(0)], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }], Some(r#"[{"a": 1}, 1]"#), @@ -369,10 +433,10 @@ mod tests { r#"{"a": 1}"#, vec![PathExpression { legs: vec![ - PathLeg::Index(0), - PathLeg::Index(0), - PathLeg::Index(0), - PathLeg::Key(String::from("a")), + select_from_left(0), + select_from_left(0), + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), ], flags: PathExpressionFlag::default(), }], @@ -383,8 +447,8 @@ mod tests { vec![PathExpression { legs: vec![ PathLeg::DoubleAsterisk, - PathLeg::Key(String::from("a")), - PathLeg::Key(String::from("*")), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::Key(KeySelection::Asterisk), ], flags: PATH_EXPRESSION_CONTAINS_ASTERISK | PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, @@ -395,11 +459,17 @@ mod tests { r#"[{"a": [3,4]}, {"b": 2 }]"#, vec![ PathExpression { - legs: vec![PathLeg::Index(0), PathLeg::Key(String::from("a"))], + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + ], flags: PathExpressionFlag::default(), }, PathExpression { - legs: vec![PathLeg::Index(1), PathLeg::Key(String::from("a"))], + legs: vec![ + select_from_left(1), + PathLeg::Key(KeySelection::Key(String::from("a"))), + ], flags: PathExpressionFlag::default(), }, ], @@ -408,11 +478,136 @@ mod 
tests { ( r#"[{"a": [1,1,1,1]}]"#, vec![PathExpression { - legs: vec![PathLeg::Index(0), PathLeg::Key(String::from("a"))], + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + ], flags: PathExpressionFlag::default(), }], Some("[1, 1, 1, 1]"), ), + ( + r#"[1,2,3,4]"#, + vec![PathExpression { + legs: vec![PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Left(2), + ))], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + Some("[2,3]"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Index(ArrayIndex::Right(0))), + ], + flags: PathExpressionFlag::default(), + }], + Some("4"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Index(ArrayIndex::Right(1))), + ], + flags: PathExpressionFlag::default(), + }], + Some("3"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Index(ArrayIndex::Right(100))), + ], + flags: PathExpressionFlag::default(), + }], + None, + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Right(0), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + Some("[2,3,4]"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Right(100), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + 
None, + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Left(100), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + Some("[2,3,4]"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(0), + ArrayIndex::Right(0), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + Some("[1,2,3,4]"), + ), + ( + r#"[{"a": [1,2,3,4]}]"#, + vec![PathExpression { + legs: vec![ + select_from_left(0), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(0), + ArrayIndex::Left(2), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }], + Some("[1,2,3]"), + ), ]; for (i, (js, exprs, expected)) in test_cases.drain(..).enumerate() { let j = js.parse(); diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs index e8c709e9571..b359158d06b 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/json_modify.rs @@ -33,7 +33,7 @@ impl<'a> JsonRef<'a> { )); } for expr in path_expr_list { - if expr.contains_any_asterisk() { + if expr.contains_any_asterisk() || expr.contains_any_range() { return Err(box_err!( "Invalid path expression: expected no asterisk, found {:?}", expr diff --git a/components/tidb_query_datatype/src/codec/mysql/json/json_remove.rs b/components/tidb_query_datatype/src/codec/mysql/json/json_remove.rs index a350df91b06..bcb6fd01716 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/json_remove.rs +++ 
b/components/tidb_query_datatype/src/codec/mysql/json/json_remove.rs @@ -7,10 +7,9 @@ impl<'a> JsonRef<'a> { /// All path expressions cannot contain * or ** wildcard. /// If any error occurs, the input won't be changed. pub fn remove(&self, path_expr_list: &[PathExpression]) -> Result { - if path_expr_list - .iter() - .any(|expr| expr.legs.is_empty() || expr.contains_any_asterisk()) - { + if path_expr_list.iter().any(|expr| { + expr.legs.is_empty() || expr.contains_any_asterisk() || expr.contains_any_range() + }) { return Err(box_err!("Invalid path expression")); } diff --git a/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs b/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs index 0836eae9d5b..58fe8fbbbcb 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/modifier.rs @@ -11,6 +11,7 @@ use super::{ path_expr::{PathExpression, PathLeg}, Json, JsonRef, JsonType, }; +use crate::codec::mysql::json::path_expr::{ArraySelection, KeySelection}; /// A helper struct that derives a new JSON by combining and manipulating /// the encoded bytes directly. 
Only used by `json_replace`, `json_set`, @@ -88,7 +89,7 @@ impl<'a> BinaryModifier<'a> { } let parent_node = &result[0]; match last_leg { - PathLeg::Index(_) => { + PathLeg::ArraySelection(ArraySelection::Index(_)) => { // Record the parent node value offset, as it's actually relative to `old` self.to_be_modified_ptr = parent_node.as_ptr(); match parent_node.get_type() { @@ -109,7 +110,7 @@ impl<'a> BinaryModifier<'a> { } } } - PathLeg::Key(insert_key) => { + PathLeg::Key(KeySelection::Key(insert_key)) => { // Ignore constant if parent_node.get_type() != JsonType::Object { return Ok(()); @@ -168,21 +169,23 @@ impl<'a> BinaryModifier<'a> { } let parent_node = &result[0]; match last_leg { - PathLeg::Index(remove_idx) => { + PathLeg::ArraySelection(ArraySelection::Index(remove_idx)) => { if parent_node.get_type() == JsonType::Array { self.to_be_modified_ptr = parent_node.as_ptr(); let elems_count = parent_node.get_elem_count(); let mut elems = Vec::with_capacity(elems_count - 1); - let remove_idx = *remove_idx as usize; - for i in 0..elems_count { - if i != remove_idx { - elems.push(parent_node.array_get_elem(i)?); + if let Some(remove_idx) = parent_node.array_get_index(*remove_idx) { + for i in 0..elems_count { + if i != remove_idx { + elems.push(parent_node.array_get_elem(i)?); + } } + + self.new_value = Some(Json::from_ref_array(elems)?); } - self.new_value = Some(Json::from_ref_array(elems)?); } } - PathLeg::Key(remove_key) => { + PathLeg::Key(KeySelection::Key(remove_key)) => { // Ignore constant if parent_node.get_type() == JsonType::Object { self.to_be_modified_ptr = parent_node.as_ptr(); diff --git a/components/tidb_query_datatype/src/codec/mysql/json/path_expr.rs b/components/tidb_query_datatype/src/codec/mysql/json/path_expr.rs index a760f748348..fb707887885 100644 --- a/components/tidb_query_datatype/src/codec/mysql/json/path_expr.rs +++ b/components/tidb_query_datatype/src/codec/mysql/json/path_expr.rs @@ -25,375 +25,286 @@ // select json_extract('{"a": 
"b", "c": [1, "2"]}', '$.*') -> ["b", [1, "2"]] // ``` -use std::{iter::Peekable, str::CharIndices}; +use nom::{ + branch::alt, + bytes::complete::tag, + character::{ + complete, + complete::{char, none_of, satisfy, space0, space1}, + }, + combinator::{map, map_opt}, + multi::{many0, many1}, + sequence::{delimited, pair, tuple}, + IResult, +}; use super::json_unquote::unquote_string; -use crate::codec::{Error, Result}; +use crate::codec::Result; -pub const PATH_EXPR_ASTERISK: &str = "*"; - -#[derive(Clone, Debug, PartialEq)] -pub enum PathLeg { - /// `Key` indicates the path leg with '.key'. - Key(String), - /// `Index` indicates the path leg with form 'number'. - Index(i32), - /// `DoubleAsterisk` indicates the path leg with form '**'. - DoubleAsterisk, +fn lift_error_to_failure(err: nom::Err) -> nom::Err { + if let nom::Err::Error(err) = err { + nom::Err::Failure(err) + } else { + err + } } -// ArrayIndexAsterisk is for parsing '*' into a number. -// we need this number represent "all". 
-pub const PATH_EXPR_ARRAY_INDEX_ASTERISK: i32 = -1; - -pub type PathExpressionFlag = u8; - -pub const PATH_EXPRESSION_CONTAINS_ASTERISK: PathExpressionFlag = 0x01; -pub const PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK: PathExpressionFlag = 0x02; - -#[derive(Clone, Default, Debug, PartialEq)] -pub struct PathExpression { - pub legs: Vec, - pub flags: PathExpressionFlag, +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum ArrayIndex { + Left(u32), // `Left` represents an array index start from left + Right(u32), // `Right` represents an array index start from right } -impl PathExpression { - pub fn contains_any_asterisk(&self) -> bool { - (self.flags - & (PATH_EXPRESSION_CONTAINS_ASTERISK | PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK)) - != 0 - } +fn array_index_left(input: &str) -> IResult<&str, ArrayIndex> { + let (input, index) = complete::u32(input)?; + Ok((input, ArrayIndex::Left(index))) } -/// `box_json_path_err` creates an error from the slice position -/// The position is added with 1, to count from 1 as start -macro_rules! box_json_path_err { - ($e:expr) => {{ - box_err!( - "Invalid JSON path expression. 
The error is around character position {}.", - ($e) + 1 - ) - }}; +fn array_index_last(input: &str) -> IResult<&str, ArrayIndex> { + let (input, _) = tag("last")(input)?; + + Ok((input, ArrayIndex::Right(0))) } -struct PathExpressionTokenizer<'a> { - input: &'a str, +fn array_index_right(input: &str) -> IResult<&str, ArrayIndex> { + let (input, _) = tag("last")(input)?; + let (input, _) = space0(input)?; + let (input, _) = char('-')(input)?; + let (input, _) = space0(input)?; - char_iterator: Peekable>, + let (input, index) = complete::u32(input)?; + Ok((input, ArrayIndex::Right(index))) } -struct Position { - start: usize, - end: usize, +fn array_index(input: &str) -> IResult<&str, ArraySelection> { + map( + alt((array_index_left, array_index_right, array_index_last)), + |index| ArraySelection::Index(index), + )(input) } -/// PathExpressionToken represents a section in path expression and its position -enum PathExpressionToken { - Leg((PathLeg, Position)), - /// Represents the beginning "$" in the expression - Start(Position), +fn array_asterisk(input: &str) -> IResult<&str, ArraySelection> { + map(char('*'), |_| ArraySelection::Asterisk)(input) } -impl<'a> Iterator for PathExpressionTokenizer<'a> { - type Item = Result; - - /// Next will try to parse the next path leg and return - /// If it returns None, it means the input is over. - /// If it returns Some(Err(..)), it means the format is error. - /// If it returns Some(Ok(..)), it represents the next token. 
- fn next(&mut self) -> Option> { - self.trim_white_spaces(); - // Trim all spaces at first - if self.reached_end() { - return None; - }; - - let (start, ch) = *self.char_iterator.peek().unwrap(); - match ch { - '$' => { - self.char_iterator.next(); - Some(Ok(PathExpressionToken::Start(Position { - start, - end: self.current_index(), - }))) +fn array_range(input: &str) -> IResult<&str, ArraySelection> { + let (input, start) = array_index(input)?; + let (input, _) = space1(input)?; + let (input, _) = tag("to")(input)?; + let (before_last_index, _) = space1(input)?; + let (input, end) = array_index(before_last_index)?; + + match (start, end) { + (ArraySelection::Index(start), ArraySelection::Index(end)) => { + // specially check the position + let allowed = match (start, end) { + (ArrayIndex::Left(start), ArrayIndex::Left(end)) => start <= end, + (ArrayIndex::Right(start), ArrayIndex::Right(end)) => start >= end, + (..) => true, + }; + if !allowed { + // TODO: use a customized error kind, as the ErrorKind::Verify is designed + // to be used in `verify` combinator + return Err(nom::Err::Failure(nom::error::make_error( + before_last_index, + nom::error::ErrorKind::Verify, + ))); } - '.' 
=> Some(self.next_key()), - '[' => Some(self.next_index()), - '*' => Some(self.next_double_asterisk()), - _ => Some(Err(box_json_path_err!(self.current_index()))), + Ok((input, ArraySelection::Range(start, end))) } + _ => unreachable!(), } } -impl<'a> PathExpressionTokenizer<'a> { - fn new(input: &'a str) -> PathExpressionTokenizer<'a> { - PathExpressionTokenizer { - input, - char_iterator: input.char_indices().peekable(), - } - } - - /// Returns the current index on the slice - fn current_index(&mut self) -> usize { - match self.char_iterator.peek() { - Some((start, _)) => *start, - None => self.input.len(), - } - } - - /// `trim_while_spaces` removes following spaces - fn trim_white_spaces(&mut self) { - while self - .char_iterator - .next_if(|(_, ch)| ch.is_whitespace()) - .is_some() - {} - } - - /// Returns whether the input has reached the end - fn reached_end(&mut self) -> bool { - return self.char_iterator.peek().is_none(); - } - - fn next_key(&mut self) -> Result { - let (start, _) = self.char_iterator.next().unwrap(); +#[derive(Clone, Debug, PartialEq)] +pub enum ArraySelection { + Asterisk, // `Asterisk` select all element from array. + Index(ArrayIndex), // `Index` select one element from array. + Range(ArrayIndex, ArrayIndex), // `Range` selects a closed-interval from array. 
+} - self.trim_white_spaces(); - if self.reached_end() { - return Err(box_json_path_err!(self.current_index())); - } +fn path_leg_array_selection(input: &str) -> IResult<&str, PathLeg> { + let (input, _) = char('[')(input)?; + let (input, _) = space0(input)?; + let (input, leg) = map( + alt((array_asterisk, array_range, array_index)), + |array_selection| PathLeg::ArraySelection(array_selection), + )(input) + .map_err(lift_error_to_failure)?; + let (input, _) = space0(input)?; + let (input, _) = char(']')(input).map_err(lift_error_to_failure)?; + + Ok((input, leg)) +} - match *self.char_iterator.peek().unwrap() { - (_, '*') => { - self.char_iterator.next().unwrap(); - - Ok(PathExpressionToken::Leg(( - PathLeg::Key(PATH_EXPR_ASTERISK.to_string()), - Position { - start, - end: self.current_index(), - }, - ))) - } - (mut key_start, '"') => { - // Skip this '"' character - key_start += 1; - self.char_iterator.next().unwrap(); +#[derive(Clone, Debug, PartialEq)] +pub enum KeySelection { + Asterisk, + Key(String), +} - // Next until the next '"' character - while self.char_iterator.next_if(|(_, ch)| *ch != '"').is_some() {} +fn key_selection_asterisk(input: &str) -> IResult<&str, KeySelection> { + map(char('*'), |_| KeySelection::Asterisk)(input) +} - // Now, it's a '"' or the end - if self.char_iterator.peek().is_none() { - return Err(box_json_path_err!(self.current_index())); +fn key_selection_key(input: &str) -> IResult<&str, KeySelection> { + let key_with_quote = map_opt( + delimited(char('"'), many1(none_of("\"")), char('"')), + |key: Vec<_>| { + let key: String = key.into_iter().collect(); + let key = unquote_string(&key).ok()?; + for ch in key.chars() { + if ch.is_control() { + return None; } + } + Some(KeySelection::Key(key)) + }, + ); + + let take_key_until_end = many1(satisfy(|ch| { + !(ch.is_whitespace() || ch == '.' 
|| ch == '[' || ch == '*') + })); + let key_without_quote = map_opt(take_key_until_end, |key: Vec<_>| { + for (i, c) in key.iter().enumerate() { + if i == 0 && c.is_ascii_digit() { + return None; + } + if !c.is_ascii_alphanumeric() && *c != '_' && *c != '$' && c.is_ascii() { + return None; + } + } - // `key_end` is the index of '"' - let key_end = self.current_index(); - self.char_iterator.next().unwrap(); - - let key = unquote_string(unsafe { self.input.get_unchecked(key_start..key_end) })?; - for ch in key.chars() { - // According to JSON standard, a string cannot - // contain any ASCII control characters - if ch.is_control() { - // TODO: add the concrete error location - // after unquote, we lost the map between - // the character and input position. - return Err(box_json_path_err!(key_start)); - } - } + Some(KeySelection::Key(key.into_iter().collect())) + }); - Ok(PathExpressionToken::Leg(( - PathLeg::Key(key), - Position { - start, - end: self.current_index(), - }, - ))) - } - (key_start, _) => { - // We have to also check the current value - while self - .char_iterator - .next_if(|(_, ch)| { - !(ch.is_whitespace() || *ch == '.' 
|| *ch == '[' || *ch == '*') - }) - .is_some() - {} - - // Now it reaches the end or a whitespace/./[/* - let key_end = self.current_index(); - - // The start character is not available - if key_end == key_start { - return Err(box_json_path_err!(key_start)); - } + alt((key_with_quote, key_without_quote))(input) +} - let key = unsafe { self.input.get_unchecked(key_start..key_end) }.to_string(); - - // It's not quoted, we'll have to validate whether it's an available ECMEScript - // identifier - for (i, c) in key.char_indices() { - if i == 0 && c.is_ascii_digit() { - return Err(box_json_path_err!(key_start + i)); - } - if !c.is_ascii_alphanumeric() && c != '_' && c != '$' && c.is_ascii() { - return Err(box_json_path_err!(key_start + i)); - } - } +fn path_leg_key(input: &str) -> IResult<&str, PathLeg> { + let (input, _) = char('.')(input)?; + let (input, _) = space0(input)?; - Ok(PathExpressionToken::Leg(( - PathLeg::Key(key), - Position { - start, - end: key_end, - }, - ))) - } - } - } + map( + alt((key_selection_key, key_selection_asterisk)), + |key_selection| PathLeg::Key(key_selection), + )(input) + .map_err(lift_error_to_failure) +} - fn next_index(&mut self) -> Result { - let (start, _) = self.char_iterator.next().unwrap(); +fn path_leg_double_asterisk(input: &str) -> IResult<&str, PathLeg> { + map(pair(char('*'), char('*')), |_| PathLeg::DoubleAsterisk)(input) +} - self.trim_white_spaces(); - if self.reached_end() { - return Err(box_json_path_err!(self.current_index())); - } +#[derive(Clone, Debug, PartialEq)] +pub enum PathLeg { + /// `Key` indicates the path leg with '.key'. + Key(KeySelection), + /// `ArraySelection` indicates the path leg with form '[...]'. + ArraySelection(ArraySelection), + /// `DoubleAsterisk` indicates the path leg with form '**'. 
+ DoubleAsterisk, +} - return match self.char_iterator.next().unwrap() { - (_, '*') => { - // Then it's a glob array index - self.trim_white_spaces(); - if self.reached_end() { - return Err(box_json_path_err!(self.current_index())); - } +pub type PathExpressionFlag = u8; - if self.char_iterator.next_if(|(_, ch)| *ch == ']').is_none() { - return Err(box_json_path_err!(self.current_index())); - } +pub const PATH_EXPRESSION_CONTAINS_ASTERISK: PathExpressionFlag = 0x01; +pub const PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK: PathExpressionFlag = 0x02; +pub const PATH_EXPRESSION_CONTAINS_RANGE: PathExpressionFlag = 0x04; - Ok(PathExpressionToken::Leg(( - PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK), - Position { - start, - end: self.current_index(), - }, - ))) +fn path_expression(input: &str) -> IResult<&str, PathExpression> { + let mut flags = PathExpressionFlag::default(); + let (input, (_, _, legs)) = tuple(( + space0, + char('$'), + many0(delimited( + space0, + alt(( + path_leg_key, + path_leg_array_selection, + path_leg_double_asterisk, + )), + space0, + )), + ))(input)?; + + for leg in legs.iter() { + match leg { + PathLeg::DoubleAsterisk => flags |= PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + PathLeg::Key(KeySelection::Asterisk) => flags |= PATH_EXPRESSION_CONTAINS_ASTERISK, + PathLeg::ArraySelection(ArraySelection::Asterisk) => { + flags |= PATH_EXPRESSION_CONTAINS_ASTERISK } - (number_start, '0'..='9') => { - // Then it's a number array index - while self - .char_iterator - .next_if(|(_, ch)| ch.is_ascii_digit()) - .is_some() - {} - let number_end = self.current_index(); - - self.trim_white_spaces(); - // now, it reaches the end of input, or reaches a non-digit character - match self.char_iterator.peek() { - Some((_, ']')) => {} - Some((pos, _)) => { - return Err(box_json_path_err!(pos)); - } - None => { - return Err(box_json_path_err!(self.current_index())); - } - } - self.char_iterator.next().unwrap(); - - let index = self.input[number_start..number_end] - 
.parse::() - .map_err(|_| -> Error { box_json_path_err!(number_end) })?; - Ok(PathExpressionToken::Leg(( - PathLeg::Index(index), - Position { - start, - end: self.current_index(), - }, - ))) + PathLeg::ArraySelection(ArraySelection::Range(..)) => { + flags |= PATH_EXPRESSION_CONTAINS_RANGE } - (pos, _) => Err(box_json_path_err!(pos)), - }; + _ => {} + } } - fn next_double_asterisk(&mut self) -> Result { - let (start, _) = self.char_iterator.next().unwrap(); + Ok((input, PathExpression { legs, flags })) +} - match self.char_iterator.next() { - Some((end, '*')) => { - // Three or more asterisks are not allowed - if let Some((pos, '*')) = self.char_iterator.peek() { - return Err(box_json_path_err!(pos)); - } +#[derive(Clone, Default, Debug, PartialEq)] +pub struct PathExpression { + pub legs: Vec, + pub flags: PathExpressionFlag, +} - Ok(PathExpressionToken::Leg(( - PathLeg::DoubleAsterisk, - Position { start, end }, - ))) - } - Some((pos, _)) => Err(box_json_path_err!(pos)), - None => Err(box_json_path_err!(self.current_index())), - } +impl PathExpression { + pub fn contains_any_asterisk(&self) -> bool { + (self.flags + & (PATH_EXPRESSION_CONTAINS_ASTERISK | PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK)) + != 0 + } + + pub fn contains_any_range(&self) -> bool { + (self.flags & PATH_EXPRESSION_CONTAINS_RANGE) != 0 } } +/// `box_json_path_err` creates an error from the slice position +/// The position is added with 1, to count from 1 as start +macro_rules! box_json_path_err { + ($e:expr) => {{ + box_err!( + "Invalid JSON path expression. The error is around character position {}.", + ($e) + 1 + ) + }}; +} + /// Parses a JSON path expression. Returns a `PathExpression` /// object which can be used in `JSON_EXTRACT`, `JSON_SET` and so on. 
-pub fn parse_json_path_expr(path_expr: &str) -> Result { - let mut legs = Vec::new(); - let tokenizer = PathExpressionTokenizer::new(path_expr); - let mut flags = PathExpressionFlag::default(); - - let mut started = false; - let mut last_position = Position { start: 0, end: 0 }; - for (index, token) in tokenizer.enumerate() { - let token = token?; - - match token { - PathExpressionToken::Leg((leg, position)) => { - if !started { - return Err(box_json_path_err!(position.start)); +/// +/// See `parseJSONPathExpr` in TiDB `types/json_path_expr.go`. +pub fn parse_json_path_expr(path_expr_input: &str) -> Result { + let (left_input, path_expr) = match path_expression(path_expr_input) { + Ok(ret) => ret, + Err(err) => { + let input = match err { + nom::Err::Error(err) => err.input, + nom::Err::Failure(err) => err.input, + _ => { + unreachable!() } + }; - match &leg { - PathLeg::Key(key) => { - if key == PATH_EXPR_ASTERISK { - flags |= PATH_EXPRESSION_CONTAINS_ASTERISK - } - } - PathLeg::Index(PATH_EXPR_ARRAY_INDEX_ASTERISK) => { - flags |= PATH_EXPRESSION_CONTAINS_ASTERISK - } - PathLeg::DoubleAsterisk => flags |= PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, - _ => {} - } - - legs.push(leg.clone()); - last_position = position; - } - PathExpressionToken::Start(position) => { - started = true; - - if index != 0 { - return Err(box_json_path_err!(position.start)); - } - } + return Err(box_json_path_err!(path_expr_input.len() - input.len())); } - } + }; - // There is no available token - if !started { - return Err(box_json_path_err!(path_expr.len())); + // Some extra input is left + if !left_input.is_empty() { + return Err(box_json_path_err!(path_expr_input.len() - left_input.len())); } + // The last one cannot be the double asterisk - if !legs.is_empty() && legs.last().unwrap() == &PathLeg::DoubleAsterisk { - return Err(box_json_path_err!(last_position.end)); + if !path_expr.legs.is_empty() && path_expr.legs.last().unwrap() == &PathLeg::DoubleAsterisk { + return 
Err(box_json_path_err!(path_expr_input.len() - 1)); } - Ok(PathExpression { legs, flags }) + Ok(path_expr) } #[cfg(test)] @@ -429,7 +340,7 @@ mod tests { "$.a", None, Some(PathExpression { - legs: vec![PathLeg::Key(String::from("a"))], + legs: vec![PathLeg::Key(KeySelection::Key(String::from("a")))], flags: PathExpressionFlag::default(), }), ), @@ -438,8 +349,8 @@ mod tests { None, Some(PathExpression { legs: vec![ - PathLeg::Key(String::from("a")), - PathLeg::Key(String::from("$")), + PathLeg::Key(KeySelection::Key(String::from("a"))), + PathLeg::Key(KeySelection::Key(String::from("$"))), ], flags: PathExpressionFlag::default(), }), @@ -448,7 +359,7 @@ mod tests { "$.\"hello world\"", None, Some(PathExpression { - legs: vec![PathLeg::Key(String::from("hello world"))], + legs: vec![PathLeg::Key(KeySelection::Key(String::from("hello world")))], flags: PathExpressionFlag::default(), }), ), @@ -456,7 +367,7 @@ mod tests { "$. \"你好 世界\" ", None, Some(PathExpression { - legs: vec![PathLeg::Key(String::from("你好 世界"))], + legs: vec![PathLeg::Key(KeySelection::Key(String::from("你好 世界")))], flags: PathExpressionFlag::default(), }), ), @@ -464,7 +375,7 @@ mod tests { "$. ❤️ ", None, Some(PathExpression { - legs: vec![PathLeg::Key(String::from("❤️"))], + legs: vec![PathLeg::Key(KeySelection::Key(String::from("❤️")))], flags: PathExpressionFlag::default(), }), ), @@ -472,7 +383,7 @@ mod tests { "$. 
你好 ", None, Some(PathExpression { - legs: vec![PathLeg::Key(String::from("你好"))], + legs: vec![PathLeg::Key(KeySelection::Key(String::from("你好")))], flags: PathExpressionFlag::default(), }), ), @@ -480,7 +391,9 @@ mod tests { "$[ 0 ]", None, Some(PathExpression { - legs: vec![PathLeg::Index(0)], + legs: vec![PathLeg::ArraySelection(ArraySelection::Index( + ArrayIndex::Left(0), + ))], flags: PathExpressionFlag::default(), }), ), @@ -488,7 +401,10 @@ mod tests { "$**.a", None, Some(PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("a"))], + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(KeySelection::Key(String::from("a"))), + ], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }), ), @@ -496,7 +412,10 @@ mod tests { " $ ** . a", None, Some(PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("a"))], + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(KeySelection::Key(String::from("a"))), + ], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }), ), @@ -504,7 +423,69 @@ mod tests { " $ ** . 
$", None, Some(PathExpression { - legs: vec![PathLeg::DoubleAsterisk, PathLeg::Key(String::from("$"))], + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::Key(KeySelection::Key(String::from("$"))), + ], + flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, + }), + ), + ( + " $ [ 1 to 10 ]", + None, + Some(PathExpression { + legs: vec![PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Left(10), + ))], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }), + ), + ( + " $ [ 1 to last - 10 ]", + None, + Some(PathExpression { + legs: vec![PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Right(10), + ))], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }), + ), + ( + " $ [ 1 to last-10 ]", + None, + Some(PathExpression { + legs: vec![PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Right(10), + ))], + flags: PATH_EXPRESSION_CONTAINS_RANGE, + }), + ), + ( + " $ ** [ 1 to last ]", + None, + Some(PathExpression { + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::ArraySelection(ArraySelection::Range( + ArrayIndex::Left(1), + ArrayIndex::Right(0), + )), + ], + flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK + | PATH_EXPRESSION_CONTAINS_RANGE, + }), + ), + ( + " $ ** [ last ]", + None, + Some(PathExpression { + legs: vec![ + PathLeg::DoubleAsterisk, + PathLeg::ArraySelection(ArraySelection::Index(ArrayIndex::Right(0))), + ], flags: PATH_EXPRESSION_CONTAINS_DOUBLE_ASTERISK, }), ), @@ -536,8 +517,7 @@ mod tests { ), ( "$.\"\\u33\"", - // TODO: pass the position in the unquote unicode error - Some("Invalid unicode, byte len too short"), + Some("Invalid JSON path expression. The error is around character position 3."), None, ), ( @@ -547,7 +527,7 @@ mod tests { ), ( "$.\"a\\t\"", - Some("Invalid JSON path expression. The error is around character position 4."), + Some("Invalid JSON path expression. 
The error is around character position 3."), None, ), ( @@ -556,8 +536,23 @@ mod tests { None, ), ( - "$ [ 2147483648 ]", - Some("Invalid JSON path expression. The error is around character position 15."), + "$ [ 4294967296 ]", + Some("Invalid JSON path expression. The error is around character position 5."), + None, + ), + ( + "$ [ 1to2 ]", + Some("Invalid JSON path expression. The error is around character position 6."), + None, + ), + ( + "$ [ 2 to 1 ]", + Some("Invalid JSON path expression. The error is around character position 10."), + None, + ), + ( + "$ [ last - 10 to last - 20 ]", + Some("Invalid JSON path expression. The error is around character position 18."), None, ), ]; @@ -607,4 +602,22 @@ mod tests { assert_eq!(b, expected, "#{} expect {:?} but got {:?}", i, expected, b); } } + + #[test] + fn test_parse_json_path_expr_contains_any_range() { + let mut test_cases = vec![ + ("$.a[0]", false), + ("$.a[*]", false), + ("$**.a[0]", false), + ("$.a[1 to 2]", true), + ("$.a[1 to last - 2]", true), + ]; + for (i, (path_expr, expected)) in test_cases.drain(..).enumerate() { + let r = parse_json_path_expr(path_expr); + assert!(r.is_ok(), "#{} expect parse ok but got err {:?}", i, r); + let e = r.unwrap(); + let b = e.contains_any_range(); + assert_eq!(b, expected, "#{} expect {:?} but got {:?}", i, expected, b); + } + } } diff --git a/components/tidb_query_datatype/src/codec/mysql/time/extension.rs b/components/tidb_query_datatype/src/codec/mysql/time/extension.rs index 7cc233e92d1..9289625ad84 100644 --- a/components/tidb_query_datatype/src/codec/mysql/time/extension.rs +++ b/components/tidb_query_datatype/src/codec/mysql/time/extension.rs @@ -95,7 +95,7 @@ impl DateTimeExtension for Time { } if week_year && days >= 52 * 7 { - weekday = (weekday + calc_days_in_year(year as i32)) % 7; + weekday = (weekday + calc_days_in_year(year)) % 7; if (!first_weekday && weekday < 4) || (first_weekday && weekday == 0) { year += 1; return (year, 1); diff --git 
a/components/tidb_query_datatype/src/codec/overflow.rs b/components/tidb_query_datatype/src/codec/overflow.rs index b1329e989c7..4a81b23a995 100644 --- a/components/tidb_query_datatype/src/codec/overflow.rs +++ b/components/tidb_query_datatype/src/codec/overflow.rs @@ -13,7 +13,7 @@ pub fn div_i64(a: i64, b: i64) -> Result { match a.overflowing_div(b) { (_res, true) => Err(Error::overflow( "UNSIGNED BIGINT", - &format!("({} / {})", a, b), + format!("({} / {})", a, b), )), (res, false) => Ok(res), } @@ -31,7 +31,7 @@ pub fn div_u64_with_i64(a: u64, b: i64) -> Result { if a != 0 && (b.overflowing_neg().0 as u64) <= a { Err(Error::overflow( "UNSIGNED BIGINT", - &format!("({} / {})", a, b), + format!("({} / {})", a, b), )) } else { Ok(0) @@ -53,7 +53,7 @@ pub fn div_i64_with_u64(a: i64, b: u64) -> Result { if a.overflowing_neg().0 as u64 >= b { Err(Error::overflow( "UNSIGNED BIGINT", - &format!("({} / {})", a, b), + format!("({} / {})", a, b), )) } else { Ok(0) diff --git a/components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs b/components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs index 79c08ec5404..8d0e34dfdf7 100644 --- a/components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs +++ b/components/tidb_query_datatype/src/codec/row/v2/compat_v1.rs @@ -73,6 +73,7 @@ pub trait V1CompatibleEncoder: DatumFlagAndPayloadEncoder { FieldTypeTp::VarChar | FieldTypeTp::VarString | FieldTypeTp::String + | FieldTypeTp::Geometry | FieldTypeTp::TinyBlob | FieldTypeTp::MediumBlob | FieldTypeTp::LongBlob diff --git a/components/tidb_query_datatype/src/codec/table.rs b/components/tidb_query_datatype/src/codec/table.rs index 0c995487b3d..00f6c22347b 100644 --- a/components/tidb_query_datatype/src/codec/table.rs +++ b/components/tidb_query_datatype/src/codec/table.rs @@ -647,7 +647,7 @@ mod tests { let mut ctx = EvalContext::default(); let col_ids: Vec<_> = row.iter().map(|(&id, _)| id).collect(); - let col_values: Vec<_> = row.iter().map(|(_, v)| v.clone()).collect(); + 
let col_values: Vec<_> = row.values().cloned().collect(); let mut col_encoded: HashMap<_, _> = row .iter() .map(|(k, v)| { diff --git a/components/tidb_query_datatype/src/expr/ctx.rs b/components/tidb_query_datatype/src/expr/ctx.rs index 758f7b13736..c17cb7af922 100644 --- a/components/tidb_query_datatype/src/expr/ctx.rs +++ b/components/tidb_query_datatype/src/expr/ctx.rs @@ -143,7 +143,7 @@ impl EvalConfig { self.tz = tz; Ok(self) } - None => Err(Error::invalid_timezone(&format!("offset {}s", offset_sec))), + None => Err(Error::invalid_timezone(format!("offset {}s", offset_sec))), } } @@ -300,7 +300,7 @@ impl EvalContext { } let orig_str = String::from_utf8_lossy(bytes); self.warnings - .append_warning(Error::truncated_wrong_val("INTEGER", &orig_str)); + .append_warning(Error::truncated_wrong_val("INTEGER", orig_str)); if negative { Ok(i64::MIN) } else { diff --git a/components/tidb_query_executors/Cargo.toml b/components/tidb_query_executors/Cargo.toml index e448340eddf..123c306c125 100644 --- a/components/tidb_query_executors/Cargo.toml +++ b/components/tidb_query_executors/Cargo.toml @@ -12,7 +12,7 @@ collections = { workspace = true } fail = "0.5" futures = { version = "0.3", features = ["compat"] } itertools = "0.10" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } log_wrappers = { workspace = true } match-template = "0.0.1" protobuf = { version = "2.8", features = ["bytes"] } @@ -24,8 +24,8 @@ tidb_query_common = { workspace = true } tidb_query_datatype = { workspace = true } tidb_query_expr = { workspace = true } tikv_util = { workspace = true } -tipb = { git = "https://github.com/pingcap/tipb.git" } -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +tipb = { workspace = true } +yatp = { workspace = true } [dev-dependencies] anyhow = "1.0" diff --git a/components/tidb_query_executors/src/simple_aggr_executor.rs b/components/tidb_query_executors/src/simple_aggr_executor.rs index 
75790428187..b6717a40fb5 100644 --- a/components/tidb_query_executors/src/simple_aggr_executor.rs +++ b/components/tidb_query_executors/src/simple_aggr_executor.rs @@ -207,7 +207,7 @@ impl AggregationExecutorImpl for SimpleAggregationImpl #[inline] fn groups_len(&self) -> usize { - if self.has_input_rows { 1 } else { 0 } + self.has_input_rows as usize } #[inline] diff --git a/components/tidb_query_executors/src/top_n_executor.rs b/components/tidb_query_executors/src/top_n_executor.rs index 06dc1ce956b..6ef8c6b2224 100644 --- a/components/tidb_query_executors/src/top_n_executor.rs +++ b/components/tidb_query_executors/src/top_n_executor.rs @@ -98,7 +98,7 @@ impl BatchTopNExecutor { Self { heap: BinaryHeap::new(), - eval_columns_buffer_unsafe: Box::new(Vec::new()), + eval_columns_buffer_unsafe: Box::>::default(), order_exprs: order_exprs.into_boxed_slice(), order_exprs_field_type: order_exprs_field_type.into_boxed_slice(), order_is_desc: order_is_desc.into_boxed_slice(), @@ -127,7 +127,7 @@ impl BatchTopNExecutor { Self { heap: BinaryHeap::new(), - eval_columns_buffer_unsafe: Box::new(Vec::new()), + eval_columns_buffer_unsafe: Box::>::default(), order_exprs: order_exprs.into_boxed_slice(), order_exprs_field_type: order_exprs_field_type.into_boxed_slice(), order_is_desc: order_is_desc.into_boxed_slice(), diff --git a/components/tidb_query_expr/Cargo.toml b/components/tidb_query_expr/Cargo.toml index 1ca4a46b6dd..95f37308e59 100644 --- a/components/tidb_query_expr/Cargo.toml +++ b/components/tidb_query_expr/Cargo.toml @@ -30,7 +30,7 @@ tidb_query_common = { workspace = true } tidb_query_datatype = { workspace = true } tikv_util = { workspace = true } time = "0.1" -tipb = { git = "https://github.com/pingcap/tipb.git" } +tipb = { workspace = true } twoway = "0.2.0" uuid = { version = "0.8.1", features = ["v4"] } diff --git a/components/tidb_query_expr/src/impl_arithmetic.rs b/components/tidb_query_expr/src/impl_arithmetic.rs index 01776c1ad7a..2f48fec4693 100644 --- 
a/components/tidb_query_expr/src/impl_arithmetic.rs +++ b/components/tidb_query_expr/src/impl_arithmetic.rs @@ -44,7 +44,7 @@ impl ArithmeticOp for IntIntPlus { fn calc(lhs: &Int, rhs: &Int) -> Result> { lhs.checked_add(*rhs) - .ok_or_else(|| Error::overflow("BIGINT", &format!("({} + {})", lhs, rhs)).into()) + .ok_or_else(|| Error::overflow("BIGINT", format!("({} + {})", lhs, rhs)).into()) .map(Some) } } @@ -61,10 +61,8 @@ impl ArithmeticOp for IntUintPlus { } else { (*rhs as u64).checked_sub(lhs.overflowing_neg().0 as u64) }; - res.ok_or_else(|| { - Error::overflow("BIGINT UNSIGNED", &format!("({} + {})", lhs, rhs)).into() - }) - .map(|v| Some(v as i64)) + res.ok_or_else(|| Error::overflow("BIGINT UNSIGNED", format!("({} + {})", lhs, rhs)).into()) + .map(|v| Some(v as i64)) } } @@ -89,7 +87,7 @@ impl ArithmeticOp for UintUintPlus { (*lhs as u64) .checked_add(*rhs as u64) .ok_or_else(|| { - Error::overflow("BIGINT UNSIGNED", &format!("({} + {})", lhs, rhs)).into() + Error::overflow("BIGINT UNSIGNED", format!("({} + {})", lhs, rhs)).into() }) .map(|v| Some(v as i64)) } @@ -104,7 +102,7 @@ impl ArithmeticOp for RealPlus { fn calc(lhs: &Real, rhs: &Real) -> Result> { let res = *lhs + *rhs; if !res.is_finite() { - return Err(Error::overflow("DOUBLE", &format!("({} + {})", lhs, rhs)).into()); + return Err(Error::overflow("DOUBLE", format!("({} + {})", lhs, rhs)).into()); } Ok(Some(res)) } @@ -130,7 +128,7 @@ impl ArithmeticOp for IntIntMinus { fn calc(lhs: &Int, rhs: &Int) -> Result> { lhs.checked_sub(*rhs) - .ok_or_else(|| Error::overflow("BIGINT", &format!("({} - {})", lhs, rhs)).into()) + .ok_or_else(|| Error::overflow("BIGINT", format!("({} - {})", lhs, rhs)).into()) .map(Some) } } @@ -145,10 +143,10 @@ impl ArithmeticOp for IntUintMinus { if *lhs >= 0 { (*lhs as u64) .checked_sub(*rhs as u64) - .ok_or_else(|| Error::overflow("BIGINT", &format!("({} - {})", lhs, rhs)).into()) + .ok_or_else(|| Error::overflow("BIGINT", format!("({} - {})", lhs, rhs)).into()) .map(|v| 
Some(v as i64)) } else { - Err(Error::overflow("BIGINT", &format!("({} - {})", lhs, rhs)).into()) + Err(Error::overflow("BIGINT", format!("({} - {})", lhs, rhs)).into()) } } } @@ -165,7 +163,7 @@ impl ArithmeticOp for UintIntMinus { } else { (*lhs as u64).checked_add(rhs.overflowing_neg().0 as u64) }; - res.ok_or_else(|| Error::overflow("BIGINT", &format!("({} - {})", lhs, rhs)).into()) + res.ok_or_else(|| Error::overflow("BIGINT", format!("({} - {})", lhs, rhs)).into()) .map(|v| Some(v as i64)) } } @@ -180,7 +178,7 @@ impl ArithmeticOp for UintUintMinus { (*lhs as u64) .checked_sub(*rhs as u64) .ok_or_else(|| { - Error::overflow("BIGINT UNSIGNED", &format!("({} - {})", lhs, rhs)).into() + Error::overflow("BIGINT UNSIGNED", format!("({} - {})", lhs, rhs)).into() }) .map(|v| Some(v as i64)) } @@ -195,7 +193,7 @@ impl ArithmeticOp for RealMinus { fn calc(lhs: &Real, rhs: &Real) -> Result> { let res = *lhs - *rhs; if !res.is_finite() { - return Err(Error::overflow("DOUBLE", &format!("({} - {})", lhs, rhs)).into()); + return Err(Error::overflow("DOUBLE", format!("({} - {})", lhs, rhs)).into()); } Ok(Some(res)) } @@ -332,7 +330,7 @@ impl ArithmeticOp for RealMultiply { fn calc(lhs: &Real, rhs: &Real) -> Result> { let res = *lhs * *rhs; if res.is_infinite() { - Err(Error::overflow("REAL", &format!("({} * {})", lhs, rhs)).into()) + Err(Error::overflow("REAL", format!("({} * {})", lhs, rhs)).into()) } else { Ok(Some(res)) } @@ -346,7 +344,7 @@ impl ArithmeticOp for IntIntMultiply { type T = Int; fn calc(lhs: &Int, rhs: &Int) -> Result> { lhs.checked_mul(*rhs) - .ok_or_else(|| Error::overflow("BIGINT", &format!("({} * {})", lhs, rhs)).into()) + .ok_or_else(|| Error::overflow("BIGINT", format!("({} * {})", lhs, rhs)).into()) .map(Some) } } @@ -362,7 +360,7 @@ impl ArithmeticOp for IntUintMultiply { } else { None } - .ok_or_else(|| Error::overflow("BIGINT UNSIGNED", &format!("({} * {})", lhs, rhs)).into()) + .ok_or_else(|| Error::overflow("BIGINT UNSIGNED", format!("({} * 
{})", lhs, rhs)).into()) .map(Some) } } @@ -386,7 +384,7 @@ impl ArithmeticOp for UintUintMultiply { (*lhs as u64) .checked_mul(*rhs as u64) .ok_or_else(|| { - Error::overflow("BIGINT UNSIGNED", &format!("({} * {})", lhs, rhs)).into() + Error::overflow("BIGINT UNSIGNED", format!("({} * {})", lhs, rhs)).into() }) .map(|v| Some(v as i64)) } @@ -500,7 +498,7 @@ impl ArithmeticOpWithCtx for RealDivide { } else { let result = *lhs / *rhs; if result.is_infinite() { - ctx.handle_overflow_err(Error::overflow("DOUBLE", &format!("{} / {}", lhs, rhs))) + ctx.handle_overflow_err(Error::overflow("DOUBLE", format!("{} / {}", lhs, rhs))) .map(|_| None)? } else { Some(result) diff --git a/components/tidb_query_expr/src/impl_cast.rs b/components/tidb_query_expr/src/impl_cast.rs index 81a08b95e94..76e90f79c5b 100644 --- a/components/tidb_query_expr/src/impl_cast.rs +++ b/components/tidb_query_expr/src/impl_cast.rs @@ -242,7 +242,7 @@ pub fn get_cast_fn_rpn_node( func_meta, args_len: 1, field_type: to_field_type, - metadata: Box::new(tipb::InUnionMetadata::default()), + metadata: Box::::default(), }) } @@ -373,7 +373,7 @@ fn cast_string_as_int( ctx.warnings .append_warning(Error::cast_neg_int_as_unsigned()); } - Ok(Some(x as i64)) + Ok(Some(x)) } Err(err) => match *err.kind() { IntErrorKind::PosOverflow | IntErrorKind::NegOverflow => { @@ -3118,7 +3118,7 @@ mod tests { (Json::from_bool(false).unwrap(), 0, false, false), (Json::none().unwrap(), 0, false, false), ( - Json::from_f64(((1u64 << 63) + (1u64 << 62)) as u64 as f64).unwrap(), + Json::from_f64(((1u64 << 63) + (1u64 << 62)) as f64).unwrap(), i64::MAX, true, false, @@ -4341,7 +4341,7 @@ mod tests { test_as_string_helper( ref_cs, |ctx, extra, val| { - let val = val.map(|x| *x as i64); + let val = val.copied(); cast_year_as_string(ctx, extra, &val.unwrap()) }, "cast_year_as_string", @@ -5026,10 +5026,8 @@ mod tests { let expect = match res_type { ResType::Zero => Decimal::zero(), ResType::Same => base_res, - ResType::TruncateToMax 
=> max_decimal(res_flen as u8, res_decimal as u8), - ResType::TruncateToMin => { - max_or_min_dec(true, res_flen as u8, res_decimal as u8) - } + ResType::TruncateToMax => max_decimal(res_flen, res_decimal), + ResType::TruncateToMin => max_or_min_dec(true, res_flen, res_decimal), ResType::Round => { let r = base_res .round(res_decimal as i8, RoundMode::HalfEven) @@ -6697,7 +6695,7 @@ mod tests { Json::from_f64(i64::MAX as u64 as f64).unwrap(), Json::from_f64(i64::MIN as u64 as f64).unwrap(), Json::from_f64(i64::MIN as f64).unwrap(), - Json::from_f64(((1u64 << 63) + (1u64 << 62)) as u64 as f64).unwrap(), + Json::from_f64(((1u64 << 63) + (1u64 << 62)) as f64).unwrap(), Json::from_f64(-((1u64 << 63) as f64 + (1u64 << 62) as f64)).unwrap(), Json::from_f64(f64::from(f32::MIN)).unwrap(), Json::from_f64(f64::from(f32::MAX)).unwrap(), diff --git a/components/tidb_query_expr/src/impl_compare.rs b/components/tidb_query_expr/src/impl_compare.rs index a8dbf96d1cb..3eae996f249 100644 --- a/components/tidb_query_expr/src/impl_compare.rs +++ b/components/tidb_query_expr/src/impl_compare.rs @@ -361,7 +361,7 @@ pub fn greatest_cmp_string_as_time( Ok(t) => greatest = max(greatest, Some(t)), Err(_) => { return ctx - .handle_invalid_time_error(Error::invalid_time_format(&s)) + .handle_invalid_time_error(Error::invalid_time_format(s)) .map(|_| Ok(None))?; } } @@ -398,7 +398,7 @@ pub fn least_cmp_string_as_time( Ok(t) => least = min(least, Some(t)), Err(_) => { return ctx - .handle_invalid_time_error(Error::invalid_time_format(&s)) + .handle_invalid_time_error(Error::invalid_time_format(s)) .map(|_| Ok(None))?; } } @@ -434,7 +434,7 @@ pub fn greatest_cmp_string_as_date( Ok(t) => greatest = max(greatest, Some(t)), Err(_) => { return ctx - .handle_invalid_time_error(Error::invalid_time_format(&s)) + .handle_invalid_time_error(Error::invalid_time_format(s)) .map(|_| Ok(None))?; } } @@ -471,7 +471,7 @@ pub fn least_cmp_string_as_date( Ok(t) => least = min(least, Some(t)), Err(_) => { return 
ctx - .handle_invalid_time_error(Error::invalid_time_format(&s)) + .handle_invalid_time_error(Error::invalid_time_format(s)) .map(|_| Ok(None))?; } } diff --git a/components/tidb_query_expr/src/impl_json.rs b/components/tidb_query_expr/src/impl_json.rs index 1926cc648e0..0c905b7458c 100644 --- a/components/tidb_query_expr/src/impl_json.rs +++ b/components/tidb_query_expr/src/impl_json.rs @@ -204,6 +204,31 @@ fn quote(bytes: BytesRef) -> Result> { Ok(Some(result)) } +#[rpn_fn(nullable, raw_varg, min_args = 1, max_args = 1)] +#[inline] +fn json_valid(args: &[ScalarValueRef]) -> Result> { + assert_eq!(args.len(), 1); + let received_et = args[0].eval_type(); + let r = match args[0].to_owned().is_none() { + true => None, + _ => match received_et { + EvalType::Json => args[0].as_json().and(Some(1)), + EvalType::Bytes => match args[0].as_bytes() { + Some(p) => { + let tmp_str = + std::str::from_utf8(p).map_err(tidb_query_datatype::codec::Error::from)?; + let json: serde_json::error::Result = serde_json::from_str(tmp_str); + Some(json.is_ok() as Int) + } + _ => Some(0), + }, + _ => Some(0), + }, + }; + + Ok(r) +} + #[rpn_fn] #[inline] fn json_unquote(arg: BytesRef) -> Result> { @@ -826,6 +851,38 @@ mod tests { } } + #[test] + fn test_json_valid() { + let cases: Vec<(Vec, Option)> = vec![ + ( + vec![Some(Json::from_str(r#"{"a":1}"#).unwrap()).into()], + Some(1), + ), + (vec![Some(b"hello".to_vec()).into()], Some(0)), + (vec![Some(b"\"hello\"".to_vec()).into()], Some(1)), + (vec![Some(b"null".to_vec()).into()], Some(1)), + (vec![Some(Json::from_str(r#"{}"#).unwrap()).into()], Some(1)), + (vec![Some(Json::from_str(r#"[]"#).unwrap()).into()], Some(1)), + (vec![Some(b"2".to_vec()).into()], Some(1)), + (vec![Some(b"2.5".to_vec()).into()], Some(1)), + (vec![Some(b"2019-8-19".to_vec()).into()], Some(0)), + (vec![Some(b"\"2019-8-19\"".to_vec()).into()], Some(1)), + (vec![Some(2).into()], Some(0)), + (vec![Some(2.5).into()], Some(0)), + (vec![None::.into()], None), + 
(vec![None::.into()], None), + (vec![None::.into()], None), + ]; + + for (vargs, expected) in cases { + let output = RpnFnScalarEvaluator::new() + .push_params(vargs.clone()) + .evaluate(ScalarFuncSig::JsonValidJsonSig) + .unwrap(); + assert_eq!(output, expected, "{:?}", vargs); + } + } + #[test] fn test_json_contains() { let cases: Vec<(Vec, Option)> = vec![ diff --git a/components/tidb_query_expr/src/impl_like.rs b/components/tidb_query_expr/src/impl_like.rs index 39dce827650..2fe99017fe0 100644 --- a/components/tidb_query_expr/src/impl_like.rs +++ b/components/tidb_query_expr/src/impl_like.rs @@ -6,17 +6,21 @@ use tidb_query_datatype::codec::{collation::*, data_type::*}; #[rpn_fn] #[inline] -pub fn like(target: BytesRef, pattern: BytesRef, escape: &i64) -> Result> { +pub fn like( + target: BytesRef, + pattern: BytesRef, + escape: &i64, +) -> Result> { let escape = *escape as u32; // current search positions in pattern and target. let (mut px, mut tx) = (0, 0); // positions for backtrace. 
let (mut next_px, mut next_tx) = (0, 0); while px < pattern.len() || tx < target.len() { - if let Some((c, mut poff)) = C::Charset::decode_one(&pattern[px..]) { + if let Some((c, mut poff)) = CS::decode_one(&pattern[px..]) { let code: u32 = c.into(); if code == '_' as u32 { - if let Some((_, toff)) = C::Charset::decode_one(&target[tx..]) { + if let Some((_, toff)) = CS::decode_one(&target[tx..]) { px += poff; tx += toff; continue; @@ -26,7 +30,7 @@ pub fn like(target: BytesRef, pattern: BytesRef, escape: &i64) -> R next_px = px; px += poff; next_tx = tx; - next_tx += if let Some((_, toff)) = C::Charset::decode_one(&target[tx..]) { + next_tx += if let Some((_, toff)) = CS::decode_one(&target[tx..]) { toff } else { 1 @@ -35,13 +39,13 @@ pub fn like(target: BytesRef, pattern: BytesRef, escape: &i64) -> R } else { if code == escape && px + poff < pattern.len() { px += poff; - poff = if let Some((_, off)) = C::Charset::decode_one(&pattern[px..]) { + poff = if let Some((_, off)) = CS::decode_one(&pattern[px..]) { off } else { break; } } - if let Some((_, toff)) = C::Charset::decode_one(&target[tx..]) { + if let Some((_, toff)) = CS::decode_one(&target[tx..]) { if let Ok(std::cmp::Ordering::Equal) = C::sort_compare(&target[tx..tx + toff], &pattern[px..px + poff]) { @@ -154,20 +158,6 @@ mod tests { Collation::Binary, Some(0), ), - ( - r#"夏威夷吉他"#, - r#"_____"#, - '\\', - Collation::Binary, - Some(0), - ), - ( - r#"🐶🍐🍳➕🥜🎗🐜"#, - r#"_______"#, - '\\', - Collation::Utf8Mb4Bin, - Some(1), - ), ( r#"IpHONE"#, r#"iPhone"#, @@ -182,14 +172,6 @@ mod tests { Collation::Utf8Mb4GeneralCi, Some(1), ), - (r#"🕺_"#, r#"🕺🕺🕺_"#, '🕺', Collation::Binary, Some(0)), - ( - r#"🕺_"#, - r#"🕺🕺🕺_"#, - '🕺', - Collation::Utf8Mb4GeneralCi, - Some(1), - ), (r#"baab"#, r#"b_%b"#, '\\', Collation::Utf8Mb4Bin, Some(1)), (r#"baab"#, r#"b%_b"#, '\\', Collation::Utf8Mb4Bin, Some(1)), (r#"bab"#, r#"b_%b"#, '\\', Collation::Utf8Mb4Bin, Some(1)), @@ -238,4 +220,151 @@ mod tests { ); } } + + #[test] + fn 
test_like_wide_character() { + let cases = vec![ + ( + r#"夏威夷吉他"#, + r#"_____"#, + '\\', + Collation::Binary, + Collation::Binary, + Collation::Binary, + Some(0), + ), + ( + r#"🐶🍐🍳➕🥜🎗🐜"#, + r#"_______"#, + '\\', + Collation::Utf8Mb4Bin, + Collation::Utf8Mb4Bin, + Collation::Utf8Mb4Bin, + Some(1), + ), + ( + r#"🕺_"#, + r#"🕺🕺🕺_"#, + '🕺', + Collation::Binary, + Collation::Binary, + Collation::Binary, + Some(0), + ), + ( + r#"🕺_"#, + r#"🕺🕺🕺_"#, + '🕺', + Collation::Utf8Mb4GeneralCi, + Collation::Utf8Mb4GeneralCi, + Collation::Utf8Mb4GeneralCi, + Some(1), + ), + // When the new collation framework is not enabled, the collation + // will always be binary Some related tests are added here + ( + r#"夏威夷吉他"#, + r#"_____"#, + '\\', + Collation::Binary, + Collation::Utf8Mb4Bin, + Collation::Utf8Mb4Bin, + Some(1), + ), + ( + r#"🐶🍐🍳➕🥜🎗🐜"#, + r#"_______"#, + '\\', + Collation::Binary, + Collation::Utf8Mb4Bin, + Collation::Utf8Mb4Bin, + Some(1), + ), + ( + r#"🕺_"#, + r#"🕺🕺🕺_"#, + '🕺', + Collation::Binary, + Collation::Binary, + Collation::Binary, + Some(0), + ), + ( + r#"🕺_"#, + r#"🕺🕺🕺_"#, + '🕺', + Collation::Binary, + Collation::Utf8Mb4Bin, + Collation::Utf8Mb4Bin, + Some(1), + ), + // Will not match, because '_' matches only one byte. + ( + r#"测试"#, + r#"测_"#, + '\\', + Collation::Binary, + Collation::Utf8Mb4Bin, + Collation::Binary, + Some(0), + ), + // Both of them should be decoded with binary charset, so that we'll + // compare byte with byte, but not comparing a long character with a + // byte. + ( + r#"测试"#, + r#"测%"#, + '\\', + Collation::Binary, + Collation::Utf8Mb4Bin, + Collation::Binary, + Some(1), + ), + // This can happen when the new collation is not enabled, and TiDB + // doesn't push down the collation information. Using binary + // comparing order is fine, but we'll need to decode strings with + // their own charset (so '_' could match single character, rather + // than single byte). 
+ ( + r#"测试"#, + r#"测_"#, + '\\', + Collation::Binary, + Collation::Utf8Mb4Bin, + Collation::Utf8Mb4Bin, + Some(1), + ), + ]; + for (target, pattern, escape, collation, target_collation, pattern_collation, expected) in + cases + { + let output = RpnFnScalarEvaluator::new() + .return_field_type( + FieldTypeBuilder::new() + .tp(FieldTypeTp::LongLong) + .collation(collation) + .build(), + ) + .push_param_with_field_type( + target.to_owned().into_bytes(), + FieldTypeBuilder::new() + .tp(FieldTypeTp::String) + .collation(target_collation), + ) + .push_param_with_field_type( + pattern.to_owned().into_bytes(), + FieldTypeBuilder::new() + .tp(FieldTypeTp::String) + .collation(pattern_collation), + ) + .push_param(escape as i64) + .evaluate(ScalarFuncSig::LikeSig) + .unwrap(); + assert_eq!( + output, expected, + "target={}, pattern={}, escape={}", + target, pattern, escape + ); + } + } } diff --git a/components/tidb_query_expr/src/impl_math.rs b/components/tidb_query_expr/src/impl_math.rs index abd190d077a..beeeef288b4 100644 --- a/components/tidb_query_expr/src/impl_math.rs +++ b/components/tidb_query_expr/src/impl_math.rs @@ -226,7 +226,7 @@ impl Floor for FloorIntToInt { #[inline] fn abs_int(arg: &Int) -> Result> { match arg.checked_abs() { - None => Err(Error::overflow("BIGINT", &format!("abs({})", *arg)).into()), + None => Err(Error::overflow("BIGINT", format!("abs({})", *arg)).into()), Some(arg_abs) => Ok(Some(arg_abs)), } } @@ -288,7 +288,7 @@ fn radians(arg: &Real) -> Result> { pub fn exp(arg: &Real) -> Result> { let ret = arg.exp(); if ret.is_infinite() { - Err(Error::overflow("DOUBLE", &format!("exp({})", arg)).into()) + Err(Error::overflow("DOUBLE", format!("exp({})", arg)).into()) } else { Ok(Real::new(ret).ok()) } diff --git a/components/tidb_query_expr/src/impl_op.rs b/components/tidb_query_expr/src/impl_op.rs index 9081f623b8e..5289f427e93 100644 --- a/components/tidb_query_expr/src/impl_op.rs +++ b/components/tidb_query_expr/src/impl_op.rs @@ -64,7 +64,7 @@ 
pub fn unary_minus_uint(arg: Option<&Int>) -> Result> { Some(val) => { let uval = *val as u64; match uval.cmp(&(i64::MAX as u64 + 1)) { - Greater => Err(Error::overflow("BIGINT", &format!("-{}", uval)).into()), + Greater => Err(Error::overflow("BIGINT", format!("-{}", uval)).into()), Equal => Ok(Some(i64::MIN)), Less => Ok(Some(-*val)), } @@ -79,7 +79,7 @@ pub fn unary_minus_int(arg: Option<&Int>) -> Result> { match arg { Some(val) => { if *val == i64::MIN { - Err(Error::overflow("BIGINT", &format!("-{}", *val)).into()) + Err(Error::overflow("BIGINT", format!("-{}", *val)).into()) } else { Ok(Some(-*val)) } diff --git a/components/tidb_query_expr/src/impl_time.rs b/components/tidb_query_expr/src/impl_time.rs index 0f55e21bab5..aca40b658d6 100644 --- a/components/tidb_query_expr/src/impl_time.rs +++ b/components/tidb_query_expr/src/impl_time.rs @@ -256,7 +256,7 @@ pub fn add_string_and_duration( return match arg0.checked_add(*arg1) { Some(result) => Ok(writer.write(Some(duration_to_string(result).into_bytes()))), None => ctx - .handle_overflow_err(Error::overflow("DURATION", &format!("{} + {}", arg0, arg1))) + .handle_overflow_err(Error::overflow("DURATION", format!("{} + {}", arg0, arg1))) .map(|_| Ok(writer.write(None)))?, }; }; @@ -264,7 +264,7 @@ pub fn add_string_and_duration( return match arg0.checked_add(ctx, *arg1) { Some(result) => Ok(writer.write(Some(datetime_to_string(result).into_bytes()))), None => ctx - .handle_overflow_err(Error::overflow("DATETIME", &format!("{} + {}", arg0, arg1))) + .handle_overflow_err(Error::overflow("DATETIME", format!("{} + {}", arg0, arg1))) .map(|_| Ok(writer.write(None)))?, }; }; @@ -286,7 +286,7 @@ pub fn sub_string_and_duration( return match arg0.checked_sub(*arg1) { Some(result) => Ok(writer.write(Some(duration_to_string(result).into_bytes()))), None => ctx - .handle_overflow_err(Error::overflow("DURATION", &format!("{} - {}", arg0, arg1))) + .handle_overflow_err(Error::overflow("DURATION", format!("{} - {}", arg0, 
arg1))) .map(|_| Ok(writer.write(None)))?, }; }; @@ -294,7 +294,7 @@ pub fn sub_string_and_duration( return match arg0.checked_sub(ctx, *arg1) { Some(result) => Ok(writer.write(Some(datetime_to_string(result).into_bytes()))), None => ctx - .handle_overflow_err(Error::overflow("DATETIME", &format!("{} - {}", arg0, arg1))) + .handle_overflow_err(Error::overflow("DATETIME", format!("{} - {}", arg0, arg1))) .map(|_| Ok(writer.write(None)))?, }; }; diff --git a/components/tidb_query_expr/src/lib.rs b/components/tidb_query_expr/src/lib.rs index 8bb1cc05480..43b0602ebbb 100644 --- a/components/tidb_query_expr/src/lib.rs +++ b/components/tidb_query_expr/src/lib.rs @@ -44,8 +44,12 @@ pub mod impl_time; use tidb_query_common::Result; use tidb_query_datatype::{ - codec::data_type::*, match_template_charset, match_template_collator, Charset, Collation, - FieldTypeAccessor, FieldTypeFlag, + codec::{ + collation::{Charset as _, Collator}, + data_type::*, + }, + match_template_charset, match_template_collator, match_template_multiple_collators, Charset, + Collation, FieldTypeAccessor, FieldTypeFlag, }; use tipb::{Expr, FieldType, ScalarFuncSig}; @@ -91,10 +95,39 @@ fn map_compare_in_string_sig(ret_field_type: &FieldType) -> Result { }) } -fn map_like_sig(ret_field_type: &FieldType) -> Result { - Ok(match_template_collator! { - TT, match ret_field_type.as_accessor().collation().map_err(tidb_query_datatype::codec::Error::from)? 
{ - Collation::TT => like_fn_meta::() +fn map_like_sig(ret_field_type: &FieldType, children: &[Expr]) -> Result { + let ret_collation = ret_field_type + .as_accessor() + .collation() + .map_err(tidb_query_datatype::codec::Error::from)?; + let target_collation = children[0] + .get_field_type() + .as_accessor() + .collation() + .map_err(tidb_query_datatype::codec::Error::from)?; + let pattern_collation = children[1] + .get_field_type() + .as_accessor() + .collation() + .map_err(tidb_query_datatype::codec::Error::from)?; + + // If the target charset is the same with pattern charset, and is Utf8mb4, + // use their charset to decode bytes. If not, use the charset pushed down in + // the ret_field type to decode the bytes. + // + // This behavior is for the compatibility and correctness: The TiDB doesn't + // push down the collation information when the new collation framework is + // not enabled, and always use the binary collation. However, the `_` + // pattern considers not only the order of strings, but also the number of + // characters. Some characters more than 1 bytes cannot be matched by `_` if + // the new collation framework is not enabled. + Ok(match_template_multiple_collators! 
{ + (TT, TC, PC), (ret_collation, target_collation, pattern_collation), { + if ::Charset::charset() == ::Charset::charset() { + like_fn_meta::::Charset>() + } else { + like_fn_meta::::Charset>() + } } }) } @@ -595,8 +628,11 @@ fn map_expr_node_to_rpn_func(expr: &Expr) -> Result { ScalarFuncSig::JsonKeysSig => json_keys_fn_meta(), ScalarFuncSig::JsonKeys2ArgsSig => json_keys_fn_meta(), ScalarFuncSig::JsonQuoteSig => json_quote_fn_meta(), + ScalarFuncSig::JsonValidJsonSig => json_valid_fn_meta(), + ScalarFuncSig::JsonValidStringSig => json_valid_fn_meta(), + ScalarFuncSig::JsonValidOthersSig => json_valid_fn_meta(), // impl_like - ScalarFuncSig::LikeSig => map_like_sig(ft)?, + ScalarFuncSig::LikeSig => map_like_sig(ft, children)?, // impl_regexp ScalarFuncSig::RegexpSig => map_regexp_like_sig(ft)?, ScalarFuncSig::RegexpUtf8Sig => map_regexp_like_sig(ft)?, diff --git a/components/tikv_kv/Cargo.toml b/components/tikv_kv/Cargo.toml index 6ee74371674..2911c7738c6 100644 --- a/components/tikv_kv/Cargo.toml +++ b/components/tikv_kv/Cargo.toml @@ -36,11 +36,12 @@ fail = "0.5" file_system = { workspace = true } futures = { version = "0.3", features = ["thread-pool", "compat"] } into_other = { workspace = true } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } log_wrappers = { workspace = true } pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" +raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raftstore = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } diff --git a/components/tikv_kv/src/btree_engine.rs b/components/tikv_kv/src/btree_engine.rs index 473b993bf39..35f666896f3 100644 --- a/components/tikv_kv/src/btree_engine.rs +++ 
b/components/tikv_kv/src/btree_engine.rs @@ -14,13 +14,14 @@ use std::{ use collections::HashMap; use engine_panic::PanicEngine; use engine_traits::{CfName, IterOptions, ReadOptions, CF_DEFAULT, CF_LOCK, CF_WRITE}; +use futures::{future, stream, Future, Stream}; use kvproto::kvrpcpb::Context; use txn_types::{Key, Value}; use super::SnapContext; use crate::{ - Callback as EngineCallback, DummySnapshotExt, Engine, Error as EngineError, - ErrorInner as EngineErrorInner, Iterator, Modify, Result as EngineResult, Snapshot, WriteData, + DummySnapshotExt, Engine, Error as EngineError, ErrorInner as EngineErrorInner, Iterator, + Modify, OnAppliedCb, Result as EngineResult, Snapshot, WriteData, WriteEvent, }; type RwLockTree = RwLock>; @@ -86,29 +87,28 @@ impl Engine for BTreeEngine { unimplemented!(); } + type WriteRes = impl Stream + Send; fn async_write( &self, _ctx: &Context, batch: WriteData, - cb: EngineCallback<()>, - ) -> EngineResult<()> { - if batch.modifies.is_empty() { - return Err(EngineError::from(EngineErrorInner::EmptyRequest)); - } - cb(write_modifies(self, batch.modifies)); + _subscribed: u8, + _on_applied: Option, + ) -> Self::WriteRes { + let res = if batch.modifies.is_empty() { + Err(EngineError::from(EngineErrorInner::EmptyRequest)) + } else { + write_modifies(self, batch.modifies) + }; - Ok(()) + stream::once(future::ready(WriteEvent::Finished(res))) } + type SnapshotRes = impl Future> + Send; /// warning: It returns a fake snapshot whose content will be affected by /// the later modifies! 
- fn async_snapshot( - &mut self, - _ctx: SnapContext<'_>, - cb: EngineCallback, - ) -> EngineResult<()> { - cb(Ok(BTreeEngineSnapshot::new(self))); - Ok(()) + fn async_snapshot(&mut self, _ctx: SnapContext<'_>) -> Self::SnapshotRes { + futures::future::ready(Ok(BTreeEngineSnapshot::new(self))) } } diff --git a/components/tikv_kv/src/cursor.rs b/components/tikv_kv/src/cursor.rs index 2c9a071fbbb..576aa5cfa76 100644 --- a/components/tikv_kv/src/cursor.rs +++ b/components/tikv_kv/src/cursor.rs @@ -432,10 +432,10 @@ pub struct CursorBuilder<'a, S: Snapshot> { prefix_seek: bool, upper_bound: Option, lower_bound: Option, - // hint for we will only scan data with commit ts >= hint_min_ts - hint_min_ts: Option, - // hint for we will only scan data with commit ts <= hint_max_ts - hint_max_ts: Option, + // hint for we will only scan data with commit_ts >/>= hint_min_ts + hint_min_ts: Option>, + // hint for we will only scan data with commit_ts >, key_only: bool, max_skippable_internal_keys: u64, } @@ -506,8 +506,8 @@ impl<'a, S: 'a + Snapshot> CursorBuilder<'a, S> { /// Default is empty. #[inline] #[must_use] - pub fn hint_min_ts(mut self, min_ts: Option) -> Self { - self.hint_min_ts = min_ts; + pub fn hint_min_ts(mut self, ts_bound: Option>) -> Self { + self.hint_min_ts = ts_bound; self } @@ -516,8 +516,8 @@ impl<'a, S: 'a + Snapshot> CursorBuilder<'a, S> { /// Default is empty. 
#[inline] #[must_use] - pub fn hint_max_ts(mut self, max_ts: Option) -> Self { - self.hint_max_ts = max_ts; + pub fn hint_max_ts(mut self, ts_bound: Option>) -> Self { + self.hint_max_ts = ts_bound; self } @@ -550,11 +550,11 @@ impl<'a, S: 'a + Snapshot> CursorBuilder<'a, S> { None }; let mut iter_opt = IterOptions::new(l_bound, u_bound, self.fill_cache); - if let Some(ts) = self.hint_min_ts { - iter_opt.set_hint_min_ts(Bound::Included(ts.into_inner())); + if let Some(ts_bound) = self.hint_min_ts { + iter_opt.set_hint_min_ts(ts_bound.map(TimeStamp::into_inner)); } - if let Some(ts) = self.hint_max_ts { - iter_opt.set_hint_max_ts(Bound::Included(ts.into_inner())); + if let Some(ts_bound) = self.hint_max_ts { + iter_opt.set_hint_max_ts(ts_bound.map(TimeStamp::into_inner)); } iter_opt.set_key_only(self.key_only); iter_opt.set_max_skippable_internal_keys(self.max_skippable_internal_keys); diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index 77f9a00efcb..f78b2243331 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -5,8 +5,10 @@ //! [`Server`](crate::server::Server). The [`BTreeEngine`](kv::BTreeEngine) and //! [`RocksEngine`](RocksEngine) are used for testing only. 
+#![feature(bound_map)] #![feature(min_specialization)] -#![feature(generic_associated_types)] +#![feature(type_alias_impl_trait)] +#![feature(associated_type_defaults)] #[macro_use(fail_point)] extern crate fail; @@ -17,6 +19,7 @@ mod btree_engine; mod cursor; pub mod metrics; mod mock_engine; +mod raft_extension; mod raftstore_impls; mod rocksdb_engine; mod stats; @@ -36,7 +39,7 @@ use engine_traits::{ CF_DEFAULT, CF_LOCK, }; use error_code::{self, ErrorCode, ErrorCodeExt}; -use futures::prelude::*; +use futures::{compat::Future01CompatExt, future::BoxFuture, prelude::*}; use into_other::IntoOther; use kvproto::{ errorpb::Error as ErrorHeader, @@ -46,7 +49,7 @@ use kvproto::{ use pd_client::BucketMeta; use raftstore::store::{PessimisticLockPair, TxnExt}; use thiserror::Error; -use tikv_util::{deadline::Deadline, escape, time::ThreadReadId}; +use tikv_util::{deadline::Deadline, escape, time::ThreadReadId, timer::GLOBAL_TIMER_HANDLE}; use tracker::with_tls_tracker; use txn_types::{Key, PessimisticLock, TimeStamp, TxnExtra, Value}; @@ -54,6 +57,7 @@ pub use self::{ btree_engine::{BTreeEngine, BTreeEngineIterator, BTreeEngineSnapshot}, cursor::{Cursor, CursorBuilder}, mock_engine::{ExpectedWrite, MockEngineBuilder}, + raft_extension::{FakeExtension, RaftExtension}, rocksdb_engine::{RocksEngine, RocksSnapshot}, stats::{ CfStatistics, FlowStatistics, FlowStatsReporter, StageLatencyStats, Statistics, @@ -62,9 +66,10 @@ pub use self::{ }; pub const SEEK_BOUND: u64 = 8; -const DEFAULT_TIMEOUT_SECS: u64 = 5; +const DEFAULT_TIMEOUT: Duration = Duration::from_secs(5); pub type Callback = Box) + Send>; +pub type OnAppliedCb = Box) + Send>; pub type ExtCallback = Box; pub type Result = result::Result; @@ -153,7 +158,7 @@ impl From for raft_cmdpb::Request { // For test purpose only. // It's used to simulate observer actions in `rocksdb_engine`. See -// `RocksEngine::async_write_ext()`. +// `RocksEngine::async_write()`. 
impl From for Modify { fn from(mut req: raft_cmdpb::Request) -> Modify { let name_to_cf = |name: &str| -> Option { @@ -248,6 +253,37 @@ impl WriteData { } } +/// Events that can subscribed from the `WriteSubscriber`. +pub enum WriteEvent { + Proposed, + Committed, + /// The write is either aborted or applied. + Finished(Result<()>), +} + +impl WriteEvent { + pub const EVENT_PROPOSED: u8 = 1; + pub const EVENT_COMMITTED: u8 = 1 << 1; + pub const ALL_EVENTS: u8 = Self::EVENT_PROPOSED | Self::EVENT_COMMITTED; + pub const BASIC_EVENT: u8 = 0; + + #[inline] + pub fn event_capacity(subscribed: u8) -> usize { + 1 + Self::subscribed_proposed(subscribed) as usize + + Self::subscribed_committed(subscribed) as usize + } + + #[inline] + pub fn subscribed_proposed(ev: u8) -> bool { + ev & Self::EVENT_PROPOSED != 0 + } + + #[inline] + pub fn subscribed_committed(ev: u8) -> bool { + ev & Self::EVENT_COMMITTED != 0 + } +} + #[derive(Debug, Clone, Default)] pub struct SnapContext<'a> { pub pb_ctx: &'a Context, @@ -273,47 +309,78 @@ pub trait Engine: Send + Clone + 'static { /// Currently, only multi-rocksdb version will return `None`. fn kv_engine(&self) -> Option; + type RaftExtension: raft_extension::RaftExtension = FakeExtension; + /// Get the underlying raft extension. + fn raft_extension(&self) -> &Self::RaftExtension { + unimplemented!() + } + /// Write modifications into internal local engine directly. /// /// region_modifies records each region's modifications. fn modify_on_kv_engine(&self, region_modifies: HashMap>) -> Result<()>; - fn async_snapshot(&mut self, ctx: SnapContext<'_>, cb: Callback) -> Result<()>; + type SnapshotRes: Future> + Send + 'static; + /// Get a snapshot asynchronously. + /// + /// Note the snapshot is queried immediately no matter whether the returned + /// future is polled or not. + fn async_snapshot(&mut self, ctx: SnapContext<'_>) -> Self::SnapshotRes; /// Precheck request which has write with it's context. 
fn precheck_write_with_ctx(&self, _ctx: &Context) -> Result<()> { Ok(()) } - fn async_write(&self, ctx: &Context, batch: WriteData, write_cb: Callback<()>) -> Result<()>; - - /// Writes data to the engine asynchronously with some extensions. + type WriteRes: Stream + Unpin + Send + 'static; + /// Writes data to the engine asynchronously. /// - /// When the write request is proposed successfully, the `proposed_cb` is - /// invoked. When the write request is finished, the `write_cb` is invoked. - fn async_write_ext( + /// You can subscribe special events like `EVENT_PROPOSED` and + /// `EVENT_COMMITTED`. + /// + /// `on_applied` is called right in the processing thread before being + /// fed to the stream. + /// + /// Note the write is started no matter whether the returned stream is + /// polled or not. + fn async_write( &self, ctx: &Context, batch: WriteData, - write_cb: Callback<()>, - _proposed_cb: Option, - _committed_cb: Option, - ) -> Result<()> { - self.async_write(ctx, batch, write_cb) - } + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes; fn write(&self, ctx: &Context, batch: WriteData) -> Result<()> { - let timeout = Duration::from_secs(DEFAULT_TIMEOUT_SECS); - wait_op!(|cb| self.async_write(ctx, batch, cb), timeout) - .unwrap_or_else(|| Err(Error::from(ErrorInner::Timeout(timeout)))) + let f = write(self, ctx, batch, None); + let timeout = GLOBAL_TIMER_HANDLE + .delay(Instant::now() + DEFAULT_TIMEOUT) + .compat(); + + futures::executor::block_on(async move { + futures::select! 
{ + res = f.fuse() => { + if let Some(res) = res { + return res; + } + }, + _ = timeout.fuse() => (), + }; + Err(Error::from(ErrorInner::Timeout(DEFAULT_TIMEOUT))) + }) } fn release_snapshot(&mut self) {} fn snapshot(&mut self, ctx: SnapContext<'_>) -> Result { - let timeout = Duration::from_secs(DEFAULT_TIMEOUT_SECS); - wait_op!(|cb| self.async_snapshot(ctx, cb), timeout) - .unwrap_or_else(|| Err(Error::from(ErrorInner::Timeout(timeout)))) + let deadline = Instant::now() + DEFAULT_TIMEOUT; + let timeout = GLOBAL_TIMER_HANDLE.delay(deadline).compat(); + futures::executor::block_on(async move { + futures::select! { + res = self.async_snapshot(ctx).fuse() => res, + _ = timeout.fuse() => Err(Error::from(ErrorInner::Timeout(DEFAULT_TIMEOUT))), + } + }) } fn put(&self, ctx: &Context, key: Key, value: Value) -> Result<()> { @@ -348,6 +415,23 @@ pub trait Engine: Send + Clone + 'static { // Some engines have a `TxnExtraScheduler`. This method is to send the extra // to the scheduler. fn schedule_txn_extra(&self, _txn_extra: TxnExtra) {} + + /// Mark the start of flashback. + // It's an infrequent API, use trait object for simplicity. + fn start_flashback(&self, _ctx: &Context) -> BoxFuture<'static, Result<()>> { + Box::pin(futures::future::ready(Ok(()))) + } + + /// Mark the end of flashback. + // It's an infrequent API, use trait object for simplicity. + fn end_flashback(&self, _ctx: &Context) -> BoxFuture<'static, Result<()>> { + Box::pin(futures::future::ready(Ok(()))) + } + + /// Application may operate on local engine directly, the method is to hint + /// the engine there is probably a notable difference in range, so + /// engine may update its statistics. 
+ fn hint_change_in_range(&self, _start_key: Vec, _end_key: Vec) {} } /// A Snapshot is a consistent view of the underlying engine at a given point in @@ -587,15 +671,10 @@ pub fn snapshot( ctx: SnapContext<'_>, ) -> impl std::future::Future> { let begin = Instant::now(); - let (callback, future) = - tikv_util::future::paired_must_called_future_callback(drop_snapshot_callback::); - let val = engine.async_snapshot(ctx, callback); + let val = engine.async_snapshot(ctx); // make engine not cross yield point async move { - val?; // propagate error - let result = future - .map_err(|cancel| Error::from(ErrorInner::Other(box_err!(cancel)))) - .await?; + let result = val.await; with_tls_tracker(|tracker| { tracker.metrics.get_snapshot_nanos += begin.elapsed().as_nanos() as u64; }); @@ -604,12 +683,22 @@ pub fn snapshot( } } -pub fn drop_snapshot_callback() -> Result { - let bt = backtrace::Backtrace::new(); - warn!("async snapshot callback is dropped"; "backtrace" => ?bt); - let mut err = ErrorHeader::default(); - err.set_message("async snapshot callback is dropped".to_string()); - Err(Error::from(ErrorInner::Request(err))) +pub fn write( + engine: &E, + ctx: &Context, + batch: WriteData, + on_applied: Option, +) -> impl std::future::Future>> { + let mut res = engine.async_write(ctx, batch, WriteEvent::BASIC_EVENT, on_applied); + async move { + loop { + match res.next().await { + Some(WriteEvent::Finished(res)) => return Some(res), + Some(_) => (), + None => return None, + } + } + } } /// Write modifications into a `BaseRocksEngine` instance. 
@@ -1183,6 +1272,8 @@ mod unit_tests { ttl: 200, for_update_ts: 101.into(), min_commit_ts: 102.into(), + last_change_ts: 80.into(), + versions_to_last_change: 2, }, ), Modify::DeleteRange( @@ -1225,6 +1316,8 @@ mod unit_tests { ttl: 200, for_update_ts: 101.into(), min_commit_ts: 102.into(), + last_change_ts: 80.into(), + versions_to_last_change: 2, } .into_lock() .to_bytes(), diff --git a/components/tikv_kv/src/mock_engine.rs b/components/tikv_kv/src/mock_engine.rs index 84605a04084..dc812e84d93 100644 --- a/components/tikv_kv/src/mock_engine.rs +++ b/components/tikv_kv/src/mock_engine.rs @@ -9,7 +9,7 @@ use collections::HashMap; use kvproto::kvrpcpb::Context; use super::Result; -use crate::{Callback, Engine, ExtCallback, Modify, RocksEngine, SnapContext, WriteData}; +use crate::{Engine, Modify, OnAppliedCb, RocksEngine, SnapContext, WriteData, WriteEvent}; /// A mock engine is a simple wrapper around RocksEngine /// but with the ability to assert the modifies, @@ -153,39 +153,40 @@ impl Engine for MockEngine { self.base.kv_engine() } - fn modify_on_kv_engine(&self, region_modifies: HashMap>) -> Result<()> { - self.base.modify_on_kv_engine(region_modifies) + type RaftExtension = ::RaftExtension; + fn raft_extension(&self) -> &Self::RaftExtension { + self.base.raft_extension() } - fn async_snapshot(&mut self, ctx: SnapContext<'_>, cb: Callback) -> Result<()> { - self.base.async_snapshot(ctx, cb) + fn modify_on_kv_engine(&self, region_modifies: HashMap>) -> Result<()> { + self.base.modify_on_kv_engine(region_modifies) } - fn async_write(&self, ctx: &Context, batch: WriteData, write_cb: Callback<()>) -> Result<()> { - self.async_write_ext(ctx, batch, write_cb, None, None) + type SnapshotRes = ::SnapshotRes; + fn async_snapshot(&mut self, ctx: SnapContext<'_>) -> Self::SnapshotRes { + self.base.async_snapshot(ctx) } - fn async_write_ext( + type WriteRes = ::WriteRes; + fn async_write( &self, ctx: &Context, batch: WriteData, - write_cb: Callback<()>, - proposed_cb: 
Option, - committed_cb: Option, - ) -> Result<()> { + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes { if let Some(expected_modifies) = self.expected_modifies.as_ref() { let mut expected_writes = expected_modifies.0.lock().unwrap(); check_expected_write( &mut expected_writes, &batch.modifies, - proposed_cb.is_some(), - committed_cb.is_some(), + WriteEvent::subscribed_proposed(subscribed), + WriteEvent::subscribed_committed(subscribed), ); } let mut last_modifies = self.last_modifies.lock().unwrap(); last_modifies.push(batch.modifies.clone()); - self.base - .async_write_ext(ctx, batch, write_cb, proposed_cb, committed_cb) + self.base.async_write(ctx, batch, subscribed, on_applied) } } diff --git a/components/tikv_kv/src/raft_extension.rs b/components/tikv_kv/src/raft_extension.rs new file mode 100644 index 00000000000..26c9e687ef6 --- /dev/null +++ b/components/tikv_kv/src/raft_extension.rs @@ -0,0 +1,69 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! TiKV uses raft under the hook to provide consistency between replicas. +//! Though technically, `Engine` trait should hide the details of raft, but in +//! some cases it's unavoidable to access raft interface somehow. This module +//! supports the access pattern via extension. + +use futures::future::BoxFuture; +use kvproto::{ + metapb::{Region, RegionEpoch}, + raft_serverpb::RaftMessage, +}; +use raft::SnapshotStatus; +use raftstore::store::region_meta::RegionMeta; + +use crate::Result; + +/// An interface to provide direct access to raftstore layer. +pub trait RaftExtension: Clone + Send { + /// Feed the message to the raft group. + /// + /// If it's a `key_message` is true, it will log a warning if the message + /// failed to send. + fn feed(&self, _msg: RaftMessage, _key_message: bool) {} + + /// Retport the message is rejected by the remote peer. + fn report_reject_message(&self, _region_id: u64, _from_peer_id: u64) {} + + /// Report the target peer is unreachable. 
+ fn report_peer_unreachable(&self, _region_id: u64, _to_peer_id: u64) {} + + /// Report the target store is unreachable. + fn report_store_unreachable(&self, _store_id: u64) {} + + /// Report the status of snapshot. + fn report_snapshot_status(&self, _region_id: u64, _to_peer_id: u64, _status: SnapshotStatus) {} + + /// Report the address of a store is resolved. + fn report_resolved(&self, _store_id: u64, _group_id: u64) {} + + /// Split the region with the given keys. + /// + /// Use `BoxFuture` for simplicity as it's not performance critical path. + fn split( + &self, + _region_id: u64, + _region_epoch: RegionEpoch, + _split_keys: Vec>, + _source: String, + ) -> BoxFuture<'static, Result>> { + Box::pin(async move { Err(box_err!("raft split is not supported")) }) + } + + /// Get the region meta of the given region. + fn query_region(&self, _region_id: u64) -> BoxFuture<'static, Result> { + Box::pin(async move { Err(box_err!("query region is not supported")) }) + } + + /// Ask the raft group to do a consistency check. + fn check_consistency(&self, _region_id: u64) -> BoxFuture<'static, Result<()>> { + Box::pin(async move { Err(box_err!("consistency check is not supported")) }) + } +} + +/// An extension that does nothing or panic on all operations. 
+#[derive(Clone)] +pub struct FakeExtension; + +impl RaftExtension for FakeExtension {} diff --git a/components/tikv_kv/src/rocksdb_engine.rs b/components/tikv_kv/src/rocksdb_engine.rs index 0ef9b5b274c..26e2c735254 100644 --- a/components/tikv_kv/src/rocksdb_engine.rs +++ b/components/tikv_kv/src/rocksdb_engine.rs @@ -2,10 +2,12 @@ use std::{ fmt::{self, Debug, Display, Formatter}, + pin::Pin, sync::{ atomic::{AtomicBool, Ordering}, Arc, Mutex, }, + task::Poll, time::Duration, }; @@ -18,6 +20,10 @@ use engine_traits::{ CfName, Engines, IterOptions, Iterable, Iterator, KvEngine, Peekable, ReadOptions, }; use file_system::IoRateLimiter; +use futures::{ + channel::{mpsc, oneshot}, + stream, Future, Stream, +}; use kvproto::{kvrpcpb::Context, metapb, raft_cmdpb}; use raftstore::coprocessor::CoprocessorHost; use tempfile::{Builder, TempDir}; @@ -25,16 +31,17 @@ use tikv_util::worker::{Runnable, Scheduler, Worker}; use txn_types::{Key, Value}; use super::{ - write_modifies, Callback, DummySnapshotExt, Engine, Error, ErrorInner, ExtCallback, + write_modifies, Callback, DummySnapshotExt, Engine, Error, ErrorInner, Iterator as EngineIterator, Modify, Result, SnapContext, Snapshot, WriteData, }; +use crate::{FakeExtension, OnAppliedCb, RaftExtension, WriteEvent}; // Duplicated in test_engine_builder const TEMP_DIR: &str = ""; enum Task { Write(Vec, Callback<()>), - Snapshot(Callback>), + Snapshot(oneshot::Sender>), Pause(Duration), } @@ -56,7 +63,9 @@ impl Runnable for Runner { fn run(&mut self, t: Task) { match t { Task::Write(modifies, cb) => cb(write_modifies(&self.0.kv, modifies)), - Task::Snapshot(cb) => cb(Ok(Arc::new(self.0.kv.snapshot()))), + Task::Snapshot(sender) => { + let _ = sender.send(Arc::new(self.0.kv.snapshot())); + } Task::Pause(dur) => std::thread::sleep(dur), } } @@ -78,12 +87,26 @@ impl Drop for RocksEngineCore { /// /// This is intended for **testing use only**. 
#[derive(Clone)] -pub struct RocksEngine { +pub struct RocksEngine { core: Arc>, sched: Scheduler, engines: Engines, not_leader: Arc, coprocessor: CoprocessorHost, + ext: RE, +} + +impl RocksEngine { + pub fn with_raft_extension(self, ext: NRE) -> RocksEngine { + RocksEngine { + core: self.core, + sched: self.sched, + engines: self.engines, + not_leader: self.not_leader, + coprocessor: self.coprocessor, + ext, + } + } } impl RocksEngine { @@ -123,9 +146,12 @@ impl RocksEngine { not_leader: Arc::new(AtomicBool::new(false)), engines, coprocessor: CoprocessorHost::default(), + ext: FakeExtension, }) } +} +impl RocksEngine { pub fn trigger_not_leader(&self) { self.not_leader.store(true, Ordering::SeqCst); } @@ -187,13 +213,13 @@ impl RocksEngine { } } -impl Display for RocksEngine { +impl Display for RocksEngine { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "RocksDB") } } -impl Debug for RocksEngine { +impl Debug for RocksEngine { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!( f, @@ -203,7 +229,7 @@ impl Debug for RocksEngine { } } -impl Engine for RocksEngine { +impl Engine for RocksEngine { type Snap = Arc; type Local = BaseRocksEngine; @@ -211,6 +237,11 @@ impl Engine for RocksEngine { Some(self.engines.kv.clone()) } + type RaftExtension = RE; + fn raft_extension(&self) -> &Self::RaftExtension { + &self.ext + } + fn modify_on_kv_engine(&self, region_modifies: HashMap>) -> Result<()> { let modifies = region_modifies.into_values().flatten().collect(); write_modifies(&self.engines.kv, modifies) @@ -223,48 +254,67 @@ impl Engine for RocksEngine { Ok(()) } - fn async_write(&self, ctx: &Context, batch: WriteData, cb: Callback<()>) -> Result<()> { - self.async_write_ext(ctx, batch, cb, None, None) - } - - fn async_write_ext( + type WriteRes = impl Stream + Send + 'static; + fn async_write( &self, - _: &Context, + _ctx: &Context, batch: WriteData, - cb: Callback<()>, - proposed_cb: Option, - committed_cb: Option, - ) -> Result<()> { - 
fail_point!("rockskv_async_write", |_| Err(box_err!("write failed"))); - - if batch.modifies.is_empty() { - return Err(Error::from(ErrorInner::EmptyRequest)); - } + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes { + let (mut tx, mut rx) = mpsc::channel::(WriteEvent::event_capacity(subscribed)); + let res = (move || { + fail_point!("rockskv_async_write", |_| Err(box_err!("write failed"))); + + if batch.modifies.is_empty() { + return Err(Error::from(ErrorInner::EmptyRequest)); + } - let batch = self.pre_propose(batch)?; + let batch = self.pre_propose(batch)?; - if let Some(cb) = proposed_cb { - cb(); - } - if let Some(cb) = committed_cb { - cb(); - } - box_try!(self.sched.schedule(Task::Write(batch.modifies, cb))); - Ok(()) + if WriteEvent::subscribed_proposed(subscribed) { + let _ = tx.try_send(WriteEvent::Proposed); + } + if WriteEvent::subscribed_committed(subscribed) { + let _ = tx.try_send(WriteEvent::Committed); + } + let cb = Box::new(move |mut res| { + if let Some(cb) = on_applied { + cb(&mut res); + } + let _ = tx.try_send(WriteEvent::Finished(res)); + }); + box_try!(self.sched.schedule(Task::Write(batch.modifies, cb))); + Ok(()) + })(); + let mut res = Some(res); + stream::poll_fn(move |cx| { + if res.as_ref().map_or(false, |r| r.is_err()) { + return Poll::Ready(res.take().map(WriteEvent::Finished)); + } + // If it's none, it means an error is returned, it should not be polled again. 
+ assert!(res.is_some()); + Pin::new(&mut rx).poll_next(cx) + }) } - fn async_snapshot(&mut self, _: SnapContext<'_>, cb: Callback) -> Result<()> { - fail_point!("rockskv_async_snapshot", |_| Err(box_err!( - "snapshot failed" - ))); - fail_point!("rockskv_async_snapshot_not_leader", |_| { - Err(self.not_leader_error()) - }); - if self.not_leader.load(Ordering::SeqCst) { - return Err(self.not_leader_error()); - } - box_try!(self.sched.schedule(Task::Snapshot(cb))); - Ok(()) + type SnapshotRes = impl Future> + Send; + fn async_snapshot(&mut self, _: SnapContext<'_>) -> Self::SnapshotRes { + let res = (|| { + fail_point!("rockskv_async_snapshot", |_| Err(box_err!( + "snapshot failed" + ))); + if self.not_leader.load(Ordering::SeqCst) { + return Err(self.not_leader_error()); + } + let (tx, rx) = oneshot::channel(); + if self.sched.schedule(Task::Snapshot(tx)).is_err() { + return Err(box_err!("failed to schedule snapshot")); + } + Ok(rx) + })(); + + async move { Ok(res?.await.unwrap()) } } } diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 5ff65b33df3..12c3983ef2d 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -26,11 +26,12 @@ futures = { version = "0.3", features = ["compat", "thread-pool"] } futures-util = { version = "0.3", default-features = false, features = ["io"] } grpcio = { workspace = true } http = "0.2.0" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1.3" libc = "0.2" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { workspace = true } +mnt = "0.3.1" nix = "0.24" num-traits = "0.2" num_cpus = "1" @@ -55,10 +56,10 @@ tikv_alloc = { workspace = true } time = "0.1" tokio = { version = "1.5", features = ["rt-multi-thread"] } tokio-executor = "0.1" -tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } +tokio-timer = { workspace = true } tracker 
= { workspace = true } url = "2" -yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } +yatp = { workspace = true } [target.'cfg(target_os = "linux")'.dependencies] procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229b2c1fcc44118bef7eff127128" } diff --git a/components/tikv_util/src/buffer_vec.rs b/components/tikv_util/src/buffer_vec.rs index d2247c011ec..78196577366 100644 --- a/components/tikv_util/src/buffer_vec.rs +++ b/components/tikv_util/src/buffer_vec.rs @@ -429,7 +429,7 @@ mod tests { assert_eq!(format!("{:?}", v), "[]"); assert!(v.is_empty()); - v.push(&[0xAA, 0x0, 0xB]); + v.push([0xAA, 0x0, 0xB]); assert_eq!(v.len(), 1); assert_eq!(v.total_len(), 3); assert!(!v.is_empty()); @@ -475,7 +475,7 @@ mod tests { assert!(v.is_empty()); assert_eq!(format!("{:?}", v), "[]"); - v.push(&[0xCA, 0xB]); + v.push([0xCA, 0xB]); assert_eq!(v.len(), 1); assert_eq!(v.total_len(), 2); assert!(!v.is_empty()); @@ -488,8 +488,8 @@ mod tests { assert!(v.is_empty()); assert_eq!(format!("{:?}", v), "[]"); - v.push(&[0xCA, 0xB]); - v.push(&[]); + v.push([0xCA, 0xB]); + v.push([]); assert_eq!(v.len(), 2); assert_eq!(v.total_len(), 2); assert!(!v.is_empty()); @@ -503,8 +503,8 @@ mod tests { assert_eq!(v[0], [0xCA, 0xB]); assert_eq!(format!("{:?}", v), "[CA0B]"); - v.push(&[]); - v.push(&[]); + v.push([]); + v.push([]); assert_eq!(v.len(), 3); assert_eq!(v.total_len(), 2); assert!(!v.is_empty()); @@ -513,7 +513,7 @@ mod tests { assert!(v[2].is_empty()); assert_eq!(format!("{:?}", v), "[CA0B, null, null]"); - v.push(&[0xC]); + v.push([0xC]); assert_eq!(v.len(), 4); assert_eq!(v.total_len(), 3); assert!(!v.is_empty()); @@ -540,7 +540,7 @@ mod tests { assert!(v[1].is_empty()); assert_eq!(format!("{:?}", v), "[null, null]"); - v.push(&[0xAC, 0xBB, 0x00]); + v.push([0xAC, 0xBB, 0x00]); assert_eq!(v.len(), 3); assert_eq!(v.total_len(), 3); assert!(!v.is_empty()); @@ -561,7 +561,7 @@ mod tests { assert_eq!(v[1], [0xAC, 0xBB, 0x00]); 
assert_eq!(format!("{:?}", v), "[null, ACBB00]"); - v.push(&[]); + v.push([]); assert_eq!(v.len(), 3); assert_eq!(v.total_len(), 3); assert!(!v.is_empty()); @@ -590,12 +590,12 @@ mod tests { assert!(v.is_empty()); assert_eq!(format!("{:?}", v), "[]"); - v.push(&[0xA]); - v.push(&[0xB]); - v.push(&[0xC]); - v.push(&[0xD, 0xE]); - v.push(&[]); - v.push(&[]); + v.push([0xA]); + v.push([0xB]); + v.push([0xC]); + v.push([0xD, 0xE]); + v.push([]); + v.push([]); assert_eq!(v.len(), 6); assert_eq!(v.total_len(), 5); assert!(!v.is_empty()); @@ -630,14 +630,14 @@ mod tests { #[test] fn test_copy_from() { let mut v1 = BufferVec::new(); - v1.push(&[]); - v1.push(&[0xAA, 0xBB, 0x0C]); - v1.push(&[]); - v1.push(&[0x00]); + v1.push([]); + v1.push([0xAA, 0xBB, 0x0C]); + v1.push([]); + v1.push([0x00]); let mut v2 = BufferVec::new(); - v2.push(&[]); - v2.push(&[]); + v2.push([]); + v2.push([]); let mut v3 = v1.clone(); v3.copy_from(&v2); @@ -650,8 +650,8 @@ mod tests { assert_eq!(v3.total_len(), 3); assert_eq!(format!("{:?}", v3), "[null, AABB0C, null]"); - v3.push(&[]); - v3.push(&[0x00]); + v3.push([]); + v3.push([0x00]); assert_eq!(v3.len(), 5); assert_eq!(v3.total_len(), 4); assert_eq!(format!("{:?}", v3), "[null, AABB0C, null, null, 00]"); @@ -681,12 +681,12 @@ mod tests { assert_eq!(format!("{:?}", v3), "[]"); let mut v1 = BufferVec::new(); - v1.push(&[]); - v1.push(&[0xAA, 0xBB, 0x0C]); + v1.push([]); + v1.push([0xAA, 0xBB, 0x0C]); let mut v2 = BufferVec::new(); - v2.push(&[0x0C, 0x00]); - v2.push(&[]); + v2.push([0x0C, 0x00]); + v2.push([]); let mut v3 = v2.clone(); v3.copy_n_from(&v1, 0); @@ -694,7 +694,7 @@ mod tests { assert_eq!(v3.total_len(), 2); assert_eq!(format!("{:?}", v3), "[0C00, null]"); - v3.push(&[0xAA]); + v3.push([0xAA]); assert_eq!(v3.len(), 3); assert_eq!(v3.total_len(), 3); assert_eq!(format!("{:?}", v3), "[0C00, null, AA]"); @@ -705,16 +705,18 @@ mod tests { assert_eq!(v3.total_len(), 2); assert_eq!(format!("{:?}", v3), "[0C00, null, null]"); - 
v3.push(&[0xAA]); + v3.push([0xAA]); assert_eq!(v3.len(), 4); assert_eq!(v3.total_len(), 3); assert_eq!(format!("{:?}", v3), "[0C00, null, null, AA]"); - v3.extend(&[0xAA, 0xAB, 0xCC]); + v3.extend([0xAA, 0xAB, 0xCC]); assert_eq!(v3.len(), 5); assert_eq!(v3.total_len(), 6); assert_eq!(format!("{:?}", v3), "[0C00, null, null, AA, AAABCC]"); + // False positive: https://github.com/rust-lang/rust-clippy/issues/9111 + #[allow(clippy::needless_borrow)] v3.extend(&[]); assert_eq!(v3.len(), 6); assert_eq!(v3.total_len(), 6); @@ -761,7 +763,7 @@ mod tests { v.retain_by_array(&[]); assert_eq!(format!("{:?}", v), "[]"); - v.push(&[]); + v.push([]); assert_eq!(format!("{:?}", v), "[null]"); v.retain_by_array(&[true]); @@ -770,8 +772,8 @@ mod tests { v.retain_by_array(&[false]); assert_eq!(format!("{:?}", v), "[]"); - v.push(&[0xAA, 0x00]); - v.push(&[]); + v.push([0xAA, 0x00]); + v.push([]); assert_eq!(format!("{:?}", v), "[AA00, null]"); let mut v2 = v.clone(); @@ -790,8 +792,8 @@ mod tests { v2.retain_by_array(&[false, false]); assert_eq!(format!("{:?}", v2), "[]"); - v.push(&[]); - v.push(&[0xBB, 0x00, 0xA0]); + v.push([]); + v.push([0xBB, 0x00, 0xA0]); assert_eq!(format!("{:?}", v), "[AA00, null, null, BB00A0]"); let mut v2 = v.clone(); @@ -812,7 +814,7 @@ mod tests { v2.retain_by_array(&[false, false, true, true]); assert_eq!(format!("{:?}", v2), "[null, BB00A0]"); - v2.push(&[]); + v2.push([]); assert_eq!(format!("{:?}", v2), "[null, BB00A0, null]"); let mut v2 = v.clone(); @@ -841,12 +843,12 @@ mod tests { #[test] fn test_iter() { let mut v = BufferVec::new(); - v.push(&[]); - v.push(&[0xAA, 0xBB, 0x0C]); - v.push(&[]); - v.push(&[]); - v.push(&[0x00]); - v.push(&[]); + v.push([]); + v.push([0xAA, 0xBB, 0x0C]); + v.push([]); + v.push([]); + v.push([0x00]); + v.push([]); let mut it = v.iter(); assert_eq!(it.count(), 6); diff --git a/components/tikv_util/src/codec/bytes.rs b/components/tikv_util/src/codec/bytes.rs index df23090c9c7..b382f64739c 100644 --- 
a/components/tikv_util/src/codec/bytes.rs +++ b/components/tikv_util/src/codec/bytes.rs @@ -513,7 +513,7 @@ mod tests { desc ); let mut longer_encoded = encoded.clone(); - longer_encoded.extend(&[0, 0, 0, 0, 0, 0, 0, 0, 0xFF]); + longer_encoded.extend([0, 0, 0, 0, 0, 0, 0, 0, 0xFF]); assert!( !is_encoded_from(&longer_encoded, &raw, desc), "Encoded: {:?}, Raw: {:?}, desc: {}", diff --git a/components/tikv_util/src/codec/stream_event.rs b/components/tikv_util/src/codec/stream_event.rs index b44d239197b..5b00cad6372 100644 --- a/components/tikv_util/src/codec/stream_event.rs +++ b/components/tikv_util/src/codec/stream_event.rs @@ -16,8 +16,8 @@ pub trait Iterator { fn value(&self) -> &[u8]; } -pub struct EventIterator { - buf: Vec, +pub struct EventIterator<'a> { + buf: &'a [u8], offset: usize, key_offset: usize, value_offset: usize, @@ -25,8 +25,8 @@ pub struct EventIterator { value_len: usize, } -impl EventIterator { - pub fn new(buf: Vec) -> EventIterator { +impl EventIterator<'_> { + pub fn new(buf: &[u8]) -> EventIterator<'_> { EventIterator { buf, offset: 0, @@ -44,7 +44,7 @@ impl EventIterator { } } -impl Iterator for EventIterator { +impl Iterator for EventIterator<'_> { fn next(&mut self) -> Result<()> { if self.valid() { self.key_len = self.get_size() as usize; @@ -141,7 +141,7 @@ mod tests { vals.push(val); } - let mut iter = EventIterator::new(event); + let mut iter = EventIterator::new(&event); let mut index = 0_usize; loop { diff --git a/components/tikv_util/src/config.rs b/components/tikv_util/src/config.rs index 828bf1cb3ba..6243512db84 100644 --- a/components/tikv_util/src/config.rs +++ b/components/tikv_util/src/config.rs @@ -384,8 +384,8 @@ impl FromStr for ReadableDuration { if dur.is_sign_negative() { return Err("duration should be positive.".to_owned()); } - let secs = dur as u64 / SECOND as u64; - let micros = (dur as u64 % SECOND as u64) as u32 * 1_000; + let secs = dur as u64 / SECOND; + let micros = (dur as u64 % SECOND) as u32 * 1_000; 
Ok(ReadableDuration(Duration::new(secs, micros))) } } @@ -814,7 +814,7 @@ mod check_data_dir { } let ent = &*ent; let cur_dir = CStr::from_ptr(ent.mnt_dir).to_str().unwrap(); - if path.starts_with(&cur_dir) && cur_dir.len() >= fs.mnt_dir.len() { + if path.starts_with(cur_dir) && cur_dir.len() >= fs.mnt_dir.len() { fs.tp = CStr::from_ptr(ent.mnt_type).to_str().unwrap().to_owned(); fs.opts = CStr::from_ptr(ent.mnt_opts).to_str().unwrap().to_owned(); fs.fsname = CStr::from_ptr(ent.mnt_fsname).to_str().unwrap().to_owned(); @@ -844,7 +844,7 @@ mod check_data_dir { let block_dir = "/sys/block"; let mut device_dir = format!("{}/{}", block_dir, dev); if !Path::new(&device_dir).exists() { - let dir = fs::read_dir(&block_dir).map_err(|e| { + let dir = fs::read_dir(block_dir).map_err(|e| { ConfigError::FileSystem(format!( "{}: read block dir {:?} failed: {:?}", op, block_dir, e @@ -1546,7 +1546,7 @@ impl RaftDataStateMachine { fs::remove_dir_all(&trash).unwrap(); } else { info!("Removing file"; "path" => %path.display()); - fs::remove_file(&path).unwrap(); + fs::remove_file(path).unwrap(); Self::sync_dir(path.parent().unwrap()); } } @@ -1563,11 +1563,11 @@ impl RaftDataStateMachine { if !path.exists() || !path.is_dir() { return false; } - fs::read_dir(&path).unwrap().next().is_some() + fs::read_dir(path).unwrap().next().is_some() } fn sync_dir(dir: &Path) { - fs::File::open(&dir).and_then(|d| d.sync_all()).unwrap(); + fs::File::open(dir).and_then(|d| d.sync_all()).unwrap(); } } @@ -1781,8 +1781,8 @@ mod tests { ensure_dir_exist(&format!("{}", tmp_dir.to_path_buf().join("dir").display())).unwrap(); let nodes: &[&str] = if cfg!(target_os = "linux") { std::os::unix::fs::symlink( - &tmp_dir.to_path_buf().join("dir"), - &tmp_dir.to_path_buf().join("symlink"), + tmp_dir.to_path_buf().join("dir"), + tmp_dir.to_path_buf().join("symlink"), ) .unwrap(); &["non_existing", "dir", "symlink"] @@ -2108,10 +2108,10 @@ yyy = 100 let source_file = source.join("file"); let target_file = 
target.join("file"); if !target.exists() { - fs::create_dir_all(&target).unwrap(); + fs::create_dir_all(target).unwrap(); check(); } - fs::copy(&source_file, &target_file).unwrap(); + fs::copy(source_file, target_file).unwrap(); check(); state.after_dump_data_with_check(&check); } @@ -2122,14 +2122,14 @@ yyy = 100 if dst.exists() { fs::remove_dir_all(dst)?; } - fs::create_dir_all(&dst)?; + fs::create_dir_all(dst)?; for entry in fs::read_dir(src)? { let entry = entry?; let ty = entry.file_type()?; if ty.is_dir() { copy_dir(&entry.path(), &dst.join(entry.file_name()))?; } else { - fs::copy(entry.path(), &dst.join(entry.file_name()))?; + fs::copy(entry.path(), dst.join(entry.file_name()))?; } } Ok(()) @@ -2143,7 +2143,7 @@ yyy = 100 fs::create_dir_all(&target).unwrap(); // Write some data into source. let source_file = source.join("file"); - File::create(&source_file).unwrap(); + File::create(source_file).unwrap(); let backup = dir.path().join("backup"); diff --git a/components/tikv_util/src/future.rs b/components/tikv_util/src/future.rs index 5f4c5b43817..7b22bebb482 100644 --- a/components/tikv_util/src/future.rs +++ b/components/tikv_util/src/future.rs @@ -197,6 +197,18 @@ impl ArcWake for PollAtWake { } } +/// Poll the future immediately. If the future is ready, returns the result. +/// Otherwise just ignore the future. +#[inline] +pub fn try_poll(f: impl Future) -> Option { + futures::executor::block_on(async move { + futures::select_biased! { + res = f.fuse() => Some(res), + _ = futures::future::ready(()).fuse() => None, + } + }) +} + #[cfg(test)] mod tests { use std::sync::atomic::AtomicUsize; @@ -232,4 +244,12 @@ mod tests { // 3. 
future gets ready, ignore NOTIFIED assert_eq!(poll_times.load(Ordering::SeqCst), 2); } + + #[test] + fn test_try_poll() { + let f = futures::future::ready(1); + assert_eq!(try_poll(f), Some(1)); + let f = futures::future::pending::<()>(); + assert_eq!(try_poll(f), None); + } } diff --git a/components/tikv_util/src/lib.rs b/components/tikv_util/src/lib.rs index 98c73e80c6a..9421c0e174b 100644 --- a/components/tikv_util/src/lib.rs +++ b/components/tikv_util/src/lib.rs @@ -91,7 +91,7 @@ pub fn panic_mark_file_path>(data_dir: P) -> PathBuf { pub fn create_panic_mark_file>(data_dir: P) { let file = panic_mark_file_path(data_dir); - File::create(&file).unwrap(); + File::create(file).unwrap(); } // Copied from file_system to avoid cyclic dependency diff --git a/components/tikv_util/src/logger/file_log.rs b/components/tikv_util/src/logger/file_log.rs index 5b575638c19..fa7b7c67fca 100644 --- a/components/tikv_util/src/logger/file_log.rs +++ b/components/tikv_util/src/logger/file_log.rs @@ -134,7 +134,7 @@ impl Write for RotatingFileLogger { self.file.flush()?; let new_path = (self.rename)(&self.path)?; - fs::rename(&self.path, &new_path)?; + fs::rename(&self.path, new_path)?; self.file = open_log_file(&self.path)?; // Updates all rotators' states. 
diff --git a/components/tikv_util/src/mpsc/future.rs b/components/tikv_util/src/mpsc/future.rs index 1e9f94c2f2d..00598f5295d 100644 --- a/components/tikv_util/src/mpsc/future.rs +++ b/components/tikv_util/src/mpsc/future.rs @@ -10,10 +10,49 @@ use std::{ use crossbeam::{ channel::{SendError, TryRecvError}, - queue::SegQueue, + queue::{ArrayQueue, SegQueue}, }; use futures::{task::AtomicWaker, Stream, StreamExt}; +enum QueueType { + Unbounded(SegQueue), + Bounded(ArrayQueue), +} + +impl QueueType { + fn len(&self) -> usize { + match self { + QueueType::Unbounded(q) => q.len(), + QueueType::Bounded(q) => q.len(), + } + } + + fn bounded(cap: usize) -> QueueType { + QueueType::Bounded(ArrayQueue::new(cap)) + } + + fn unbounded() -> QueueType { + QueueType::Unbounded(SegQueue::new()) + } + + fn push_back(&self, t: T) -> Result<(), SendError> { + match self { + QueueType::Unbounded(q) => { + q.push(t); + Ok(()) + } + QueueType::Bounded(q) => q.push(t).map_err(SendError), + } + } + + fn pop_front(&self) -> Option { + match self { + QueueType::Unbounded(q) => q.pop(), + QueueType::Bounded(q) => q.pop(), + } + } +} + #[derive(Clone, Copy)] pub enum WakePolicy { Immediately, @@ -21,7 +60,7 @@ pub enum WakePolicy { } struct Queue { - queue: SegQueue, + queue: QueueType, waker: AtomicWaker, liveness: AtomicUsize, policy: WakePolicy, @@ -62,9 +101,9 @@ impl Sender { pub fn send_with(&self, t: T, policy: WakePolicy) -> Result<(), SendError> { let queue = unsafe { &*self.queue }; if queue.liveness.load(Ordering::Acquire) & RECEIVER_COUNT_BASE != 0 { - queue.queue.push(t); + let res = queue.queue.push_back(t); queue.wake(policy); - return Ok(()); + return res; } Err(SendError(t)) } @@ -110,12 +149,12 @@ impl Stream for Receiver { #[inline] fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let queue = unsafe { &*self.queue }; - if let Some(t) = queue.queue.pop() { + if let Some(t) = queue.queue.pop_front() { return Poll::Ready(Some(t)); } 
queue.waker.register(cx.waker()); // In case the message is pushed right before registering waker. - if let Some(t) = queue.queue.pop() { + if let Some(t) = queue.queue.pop_front() { return Poll::Ready(Some(t)); } if queue.liveness.load(Ordering::Acquire) & !RECEIVER_COUNT_BASE != 0 { @@ -129,7 +168,7 @@ impl Receiver { #[inline] pub fn try_recv(&mut self) -> Result { let queue = unsafe { &*self.queue }; - if let Some(t) = queue.queue.pop() { + if let Some(t) = queue.queue.pop_front() { return Ok(t); } if queue.liveness.load(Ordering::Acquire) & !RECEIVER_COUNT_BASE != 0 { @@ -156,9 +195,19 @@ impl Drop for Receiver { unsafe impl Send for Receiver {} +#[inline] pub fn unbounded(policy: WakePolicy) -> (Sender, Receiver) { + with_queue(QueueType::unbounded(), policy) +} + +#[inline] +pub fn bounded(cap: usize, policy: WakePolicy) -> (Sender, Receiver) { + with_queue(QueueType::bounded(cap), policy) +} + +fn with_queue(queue: QueueType, policy: WakePolicy) -> (Sender, Receiver) { let queue = Box::into_raw(Box::new(Queue { - queue: SegQueue::new(), + queue, waker: AtomicWaker::new(), liveness: AtomicUsize::new(SENDER_COUNT_BASE | RECEIVER_COUNT_BASE), policy, @@ -430,4 +479,13 @@ mod tests { drop(tx1); assert!(dropped.load(Ordering::SeqCst)); } + + #[test] + fn test_bounded() { + let (tx, mut rx) = super::bounded(1, WakePolicy::Immediately); + tx.send(1).unwrap(); + tx.send(2).unwrap_err(); + assert_eq!(rx.try_recv().unwrap(), 1); + rx.try_recv().unwrap_err(); + } } diff --git a/components/tikv_util/src/store/mod.rs b/components/tikv_util/src/store/mod.rs index 81afff2975a..9a36961c202 100644 --- a/components/tikv_util/src/store/mod.rs +++ b/components/tikv_util/src/store/mod.rs @@ -5,10 +5,85 @@ pub mod query_stats; pub mod region; pub use self::{ - peer::{find_peer, find_peer_mut, is_learner, new_learner_peer, new_peer, remove_peer}, + peer::{ + find_peer, find_peer_by_id, find_peer_mut, is_learner, new_learner_peer, new_peer, + new_witness_peer, remove_peer, + }, 
query_stats::{is_read_query, QueryStats}, region::{ check_key_in_region, check_key_in_region_exclusive, check_key_in_region_inclusive, - region_on_same_stores, + region_on_same_stores, region_on_stores, }, }; + +#[cfg(test)] +mod tests { + use kvproto::metapb::Region; + + use super::*; + + #[test] + fn test_on_same_store() { + let cases = vec![ + (vec![2, 3, 4], vec![], vec![1, 2, 3], vec![], false), + (vec![2, 3, 1], vec![], vec![1, 2, 3], vec![], true), + (vec![2, 3, 4], vec![], vec![1, 2], vec![], false), + (vec![1, 2, 3], vec![], vec![1, 2, 3], vec![], true), + (vec![1, 3], vec![2, 4], vec![1, 2], vec![3, 4], false), + (vec![1, 3], vec![2, 4], vec![1, 3], vec![], false), + (vec![1, 3], vec![2, 4], vec![], vec![2, 4], false), + (vec![1, 3], vec![2, 4], vec![3, 1], vec![4, 2], true), + ]; + + for (s1, s2, s3, s4, exp) in cases { + let mut r1 = Region::default(); + for (store_id, peer_id) in s1.into_iter().zip(0..) { + r1.mut_peers().push(new_peer(store_id, peer_id)); + } + for (store_id, peer_id) in s2.into_iter().zip(0..) { + r1.mut_peers().push(new_learner_peer(store_id, peer_id)); + } + + let mut r2 = Region::default(); + for (store_id, peer_id) in s3.into_iter().zip(10..) { + r2.mut_peers().push(new_peer(store_id, peer_id)); + } + for (store_id, peer_id) in s4.into_iter().zip(10..) 
{ + r2.mut_peers().push(new_learner_peer(store_id, peer_id)); + } + let res = region_on_same_stores(&r1, &r2); + assert_eq!(res, exp, "{:?} vs {:?}", r1, r2); + } + } + + #[test] + fn test_check_region_on_store() { + let cases = vec![ + (vec![1, 2, 3], vec![], vec![], true), + (vec![2, 3, 1], vec![], vec![1], true), + (vec![1, 3, 2], vec![], vec![2, 3], true), + (vec![3, 2, 1], vec![], vec![4], false), + (vec![1, 2, 3], vec![], vec![2, 4], true), + (vec![1, 3], vec![2, 4], vec![2], true), + (vec![1, 3], vec![2, 4], vec![2, 3], true), + (vec![1, 3], vec![2], vec![4], false), + ]; + + for (s1, s2, target_stores, exp) in cases { + let mut region = Region::default(); + for (store_id, peer_id) in s1.into_iter().zip(0..) { + region.mut_peers().push(new_peer(store_id, peer_id)); + } + for (store_id, peer_id) in s2.into_iter().zip(0..) { + region.mut_peers().push(new_learner_peer(store_id, peer_id)); + } + + let res = region_on_stores(®ion, &target_stores); + assert_eq!( + res, exp, + "region {:?} exists on {:?}", + region, target_stores + ); + } + } +} diff --git a/components/tikv_util/src/store/peer.rs b/components/tikv_util/src/store/peer.rs index 1a9184134f0..bbc96bb786f 100644 --- a/components/tikv_util/src/store/peer.rs +++ b/components/tikv_util/src/store/peer.rs @@ -16,6 +16,10 @@ pub fn find_peer_mut(region: &mut Region, store_id: u64) -> Option<&mut Peer> { .find(|p| p.get_store_id() == store_id) } +pub fn find_peer_by_id(region: &Region, peer_id: u64) -> Option<&Peer> { + region.get_peers().iter().find(|&p| p.get_id() == peer_id) +} + pub fn remove_peer(region: &mut Region, store_id: u64) -> Option { region .get_peers() @@ -45,6 +49,15 @@ pub fn is_learner(peer: &Peer) -> bool { peer.get_role() == PeerRole::Learner } +pub fn new_witness_peer(store_id: u64, peer_id: u64) -> Peer { + let mut peer = Peer::default(); + peer.set_store_id(store_id); + peer.set_id(peer_id); + peer.set_role(PeerRole::Voter); + peer.set_is_witness(true); + peer +} + #[cfg(test)] mod 
tests { use super::*; @@ -63,38 +76,4 @@ mod tests { assert!(remove_peer(&mut region, 1).is_none()); assert!(find_peer(®ion, 1).is_none()); } - - #[test] - fn test_on_same_store() { - let cases = vec![ - (vec![2, 3, 4], vec![], vec![1, 2, 3], vec![], false), - (vec![2, 3, 1], vec![], vec![1, 2, 3], vec![], true), - (vec![2, 3, 4], vec![], vec![1, 2], vec![], false), - (vec![1, 2, 3], vec![], vec![1, 2, 3], vec![], true), - (vec![1, 3], vec![2, 4], vec![1, 2], vec![3, 4], false), - (vec![1, 3], vec![2, 4], vec![1, 3], vec![], false), - (vec![1, 3], vec![2, 4], vec![], vec![2, 4], false), - (vec![1, 3], vec![2, 4], vec![3, 1], vec![4, 2], true), - ]; - - for (s1, s2, s3, s4, exp) in cases { - let mut r1 = Region::default(); - for (store_id, peer_id) in s1.into_iter().zip(0..) { - r1.mut_peers().push(new_peer(store_id, peer_id)); - } - for (store_id, peer_id) in s2.into_iter().zip(0..) { - r1.mut_peers().push(new_learner_peer(store_id, peer_id)); - } - - let mut r2 = Region::default(); - for (store_id, peer_id) in s3.into_iter().zip(10..) { - r2.mut_peers().push(new_peer(store_id, peer_id)); - } - for (store_id, peer_id) in s4.into_iter().zip(10..) { - r2.mut_peers().push(new_learner_peer(store_id, peer_id)); - } - let res = super::super::region_on_same_stores(&r1, &r2); - assert_eq!(res, exp, "{:?} vs {:?}", r1, r2); - } - } } diff --git a/components/tikv_util/src/store/region.rs b/components/tikv_util/src/store/region.rs index 17c3209e7d4..58af4e9fdfa 100644 --- a/components/tikv_util/src/store/region.rs +++ b/components/tikv_util/src/store/region.rs @@ -32,9 +32,26 @@ pub fn region_on_same_stores(lhs: &Region, rhs: &Region) -> bool { // Because every store can only have one replica for the same region, // so just one round check is enough. 
lhs.get_peers().iter().all(|lp| { - rhs.get_peers() + rhs.get_peers().iter().any(|rp| { + rp.get_store_id() == lp.get_store_id() + && rp.get_role() == lp.get_role() + && rp.get_is_witness() == lp.get_is_witness() + }) + }) +} + +/// Check if the given region exists on stores, by checking whether any one of +/// the peers belonging to this region exist on the given stores. +pub fn region_on_stores(region: &Region, store_ids: &Vec) -> bool { + if store_ids.is_empty() { + return true; + } + // If one of peers in this region exists on any on in `store_ids`, it shows that + // the region exists on the given stores. + region.get_peers().iter().any(|p| { + store_ids .iter() - .any(|rp| rp.get_store_id() == lp.get_store_id() && rp.get_role() == lp.get_role()) + .any(|store_id| *store_id == p.get_store_id()) }) } diff --git a/components/tikv_util/src/stream.rs b/components/tikv_util/src/stream.rs index 8f892659f68..fb29d1c91f0 100644 --- a/components/tikv_util/src/stream.rs +++ b/components/tikv_util/src/stream.rs @@ -152,24 +152,28 @@ where })(); let mut retry_wait_dur = Duration::from_secs(1); - - let mut final_result = action().await; - for _ in 1..max_retry_times { - if let Err(e) = &final_result { - if let Some(ref mut f) = ext.on_failure { - f(e); - } - if e.is_retryable() { - let backoff = thread_rng().gen_range(0..1000); - sleep(retry_wait_dur + Duration::from_millis(backoff)).await; - retry_wait_dur = MAX_RETRY_DELAY.min(retry_wait_dur * 2); - final_result = action().await; - continue; + let mut retry_time = 0; + loop { + match action().await { + Ok(r) => return Ok(r), + Err(e) => { + if let Some(ref mut f) = ext.on_failure { + f(&e); + } + if !e.is_retryable() { + return Err(e); + } + retry_time += 1; + if retry_time > max_retry_times { + return Err(e); + } } } - break; + + let backoff = thread_rng().gen_range(0..1000); + sleep(retry_wait_dur + Duration::from_millis(backoff)).await; + retry_wait_dur = MAX_RETRY_DELAY.min(retry_wait_dur * 2); } - final_result } // 
Return an error if the future does not finish by the timeout @@ -206,3 +210,55 @@ impl RetryError for HttpDispatchError { true } } + +#[cfg(test)] +mod tests { + use std::{cell::RefCell, pin::Pin}; + + use futures::{Future, FutureExt}; + use rusoto_core::HttpDispatchError; + + use super::RetryError; + use crate::stream::retry; + + #[derive(Debug)] + struct TriviallyRetry; + + impl RetryError for TriviallyRetry { + fn is_retryable(&self) -> bool { + true + } + } + + fn assert_send(_t: T) {} + + #[test] + fn test_retry_is_send_even_return_type_not_sync() { + struct BangSync(Option>); + let fut = retry(|| futures::future::ok::<_, HttpDispatchError>(BangSync(None))); + assert_send(fut) + } + + fn gen_action_fail_for( + n_times: usize, + ) -> impl FnMut() -> Pin>>> { + let mut n = 0; + move || { + if n < n_times { + n += 1; + futures::future::err(TriviallyRetry).boxed() + } else { + futures::future::ok(()).boxed() + } + } + } + + #[tokio::test] + async fn test_failure() { + fail::cfg("retry_count", "return(2)").unwrap(); + let r = retry(gen_action_fail_for(3)).await; + assert!(r.is_err(), "{:?}", r); + let r = retry(gen_action_fail_for(1)).await; + assert!(r.is_ok(), "{:?}", r); + } +} diff --git a/components/tikv_util/src/sys/cgroup.rs b/components/tikv_util/src/sys/cgroup.rs index df15a2dac76..2cd420e5d51 100644 --- a/components/tikv_util/src/sys/cgroup.rs +++ b/components/tikv_util/src/sys/cgroup.rs @@ -94,7 +94,7 @@ impl CGroupSys { } else { format!("{}/memory.limit_in_bytes", path.to_str().unwrap()) }; - return read_to_string(&path) + return read_to_string(path) .map(|x| parse_memory_max(x.trim())) .ok() .flatten(); @@ -112,7 +112,7 @@ impl CGroupSys { if let Some((root, mount_point)) = self.mount_points.get(component) { if let Some(path) = build_path(group, root, mount_point) { let path = format!("{}/cpuset.cpus", path.to_str().unwrap()); - if let Ok(s) = read_to_string(&path) { + if let Ok(s) = read_to_string(path) { return parse_cpu_cores(s.trim()); } } @@ 
-131,14 +131,14 @@ impl CGroupSys { if let Some(path) = build_path(group, root, mount_point) { if self.is_v2 { let path = format!("{}/cpu.max", path.to_str().unwrap()); - if let Ok(buffer) = read_to_string(&path) { + if let Ok(buffer) = read_to_string(path) { return parse_cpu_quota_v2(buffer.trim()); } } else { let path1 = format!("{}/cpu.cfs_quota_us", path.to_str().unwrap()); let path2 = format!("{}/cpu.cfs_period_us", path.to_str().unwrap()); if let (Ok(buffer1), Ok(buffer2)) = - (read_to_string(&path1), read_to_string(&path2)) + (read_to_string(path1), read_to_string(path2)) { return parse_cpu_quota_v1(buffer1.trim(), buffer2.trim()); } @@ -356,7 +356,7 @@ fn parse_cpu_quota_v1(line1: &str, line2: &str) -> Option { if max > 0.0 { if let Ok(period) = line2.parse::() { if period > 0.0 { - return Some(max as f64 / period as f64); + return Some(max / period); } } } else { @@ -385,11 +385,11 @@ mod tests { fn test_parse_mountinfos_without_cgroup() { let temp = tempfile::TempDir::new().unwrap(); let dir = temp.path().to_str().unwrap(); - std::fs::copy("/proc/self/stat", &format!("{}/stat", dir)).unwrap(); + std::fs::copy("/proc/self/stat", format!("{}/stat", dir)).unwrap(); let mut f = OpenOptions::new() .create(true) .write(true) - .open(&format!("{}/mountinfo", dir)) + .open(format!("{}/mountinfo", dir)) .unwrap(); f.write_all(b"").unwrap(); @@ -402,12 +402,12 @@ mod tests { fn test_cpuset_cpu_cpuacct() { let temp = tempfile::TempDir::new().unwrap(); let dir = temp.path().to_str().unwrap(); - std::fs::copy("/proc/self/stat", &format!("{}/stat", dir)).unwrap(); + std::fs::copy("/proc/self/stat", format!("{}/stat", dir)).unwrap(); let mut f = OpenOptions::new() .create(true) .write(true) - .open(&format!("{}/mountinfo", dir)) + .open(format!("{}/mountinfo", dir)) .unwrap(); f.write_all(b"30 26 0:27 / /sys/fs/cgroup/cpuset,cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuset,cpu,cpuacct\n").unwrap(); @@ -432,12 +432,12 @@ mod tests { fn 
test_mountinfo_with_relative_path() { let temp = tempfile::TempDir::new().unwrap(); let dir = temp.path().to_str().unwrap(); - std::fs::copy("/proc/self/stat", &format!("{}/stat", dir)).unwrap(); + std::fs::copy("/proc/self/stat", format!("{}/stat", dir)).unwrap(); let mut f = OpenOptions::new() .create(true) .write(true) - .open(&format!("{}/mountinfo", dir)) + .open(format!("{}/mountinfo", dir)) .unwrap(); f.write_all(b"1663 1661 0:27 /../../../../../.. /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - cgroup2 cgroup2 rw\n").unwrap(); @@ -461,12 +461,12 @@ mod tests { fn test_conflicting_mountinfo() { let temp = tempfile::TempDir::new().unwrap(); let dir = temp.path().to_str().unwrap(); - std::fs::copy("/proc/self/stat", &format!("{}/stat", dir)).unwrap(); + std::fs::copy("/proc/self/stat", format!("{}/stat", dir)).unwrap(); let mut f = OpenOptions::new() .create(true) .write(true) - .open(&format!("{}/mountinfo", dir)) + .open(format!("{}/mountinfo", dir)) .unwrap(); f.write_all(b"1663 1661 0:27 /../../../../../.. /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - cgroup2 cgroup2 rw 1663 1661 0:27 /../../../../../.. /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - cgroup2 cgroup2 rw").unwrap(); @@ -491,12 +491,12 @@ mod tests { fn test_cgroup_without_mountinfo() { let temp = tempfile::TempDir::new().unwrap(); let dir = temp.path().to_str().unwrap(); - std::fs::copy("/proc/self/stat", &format!("{}/stat", dir)).unwrap(); + std::fs::copy("/proc/self/stat", format!("{}/stat", dir)).unwrap(); let mut f = OpenOptions::new() .create(true) .write(true) - .open(&format!("{}/mountinfo", dir)) + .open(format!("{}/mountinfo", dir)) .unwrap(); f.write_all(b"1663 1661 0:27 /../../../../../.. 
/sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw\n").unwrap(); diff --git a/components/tikv_util/src/sys/disk.rs b/components/tikv_util/src/sys/disk.rs index 3f2a60855ff..c8fe87a56b0 100644 --- a/components/tikv_util/src/sys/disk.rs +++ b/components/tikv_util/src/sys/disk.rs @@ -10,6 +10,7 @@ pub use kvproto::disk_usage::DiskUsage; // Percent is not configurable, But if you want to change, please make sure // the percent in both the init fs and store monitor are keep the same. static DISK_RESERVED_SPACE: AtomicU64 = AtomicU64::new(0); +static RAFT_DISK_RESERVED_SPACE: AtomicU64 = AtomicU64::new(0); static DISK_STATUS: AtomicI32 = AtomicI32::new(0); pub fn set_disk_reserved_space(v: u64) { @@ -20,6 +21,14 @@ pub fn get_disk_reserved_space() -> u64 { DISK_RESERVED_SPACE.load(Ordering::Acquire) } +pub fn set_raft_disk_reserved_space(v: u64) { + RAFT_DISK_RESERVED_SPACE.store(v, Ordering::Release) +} + +pub fn get_raft_disk_reserved_space() -> u64 { + RAFT_DISK_RESERVED_SPACE.load(Ordering::Acquire) +} + pub fn set_disk_status(status: DiskUsage) { let v = match status { DiskUsage::Normal => 0, diff --git a/components/tikv_util/src/sys/inspector.rs b/components/tikv_util/src/sys/inspector.rs index 7b49b647706..d2ff80c6416 100644 --- a/components/tikv_util/src/sys/inspector.rs +++ b/components/tikv_util/src/sys/inspector.rs @@ -90,7 +90,7 @@ mod linux { fn disk_stat(dev: &Self::DiskID) -> Result, String> { let path = "/proc/diskstats"; - let lines = read_to_string(&path).map_err(|e| format!("open({}): {}", path, e))?; + let lines = read_to_string(path).map_err(|e| format!("open({}): {}", path, e))?; for line in lines.split('\n').map(|x| x.trim()) { let stat = procfs::DiskStat::from_line(line) .map_err(|e| format!("parse disk stat: {}", e))?; diff --git a/components/tikv_util/src/sys/mod.rs b/components/tikv_util/src/sys/mod.rs index d17c821e995..8b5e846592f 100644 --- a/components/tikv_util/src/sys/mod.rs +++ b/components/tikv_util/src/sys/mod.rs @@ 
-9,11 +9,15 @@ pub mod ioload; pub mod thread; // re-export some traits for ease of use +#[cfg(target_os = "linux")] +use std::path::PathBuf; use std::sync::atomic::{AtomicU64, Ordering}; use fail::fail_point; #[cfg(target_os = "linux")] use lazy_static::lazy_static; +#[cfg(target_os = "linux")] +use mnt::get_mount; use sysinfo::RefreshKind; pub use sysinfo::{DiskExt, NetworkExt, ProcessExt, ProcessorExt, SystemExt}; @@ -156,3 +160,60 @@ pub fn cache_size(level: usize) -> Option { pub fn cache_line_size(level: usize) -> Option { read_size_in_cache(level, "coherency_line_size") } + +#[cfg(target_os = "linux")] +pub fn path_in_diff_mount_point(path1: &str, path2: &str) -> bool { + if path1.is_empty() || path2.is_empty() { + return false; + } + let path1 = PathBuf::from(path1); + let path2 = PathBuf::from(path2); + match (get_mount(&path1), get_mount(&path2)) { + (Err(e1), _) => { + warn!("Get mount point error for path {}, {}", path1.display(), e1); + false + } + (_, Err(e2)) => { + warn!("Get mount point error for path {}, {}", path2.display(), e2); + false + } + (Ok(None), _) => { + warn!("No mount point for {}", path1.display()); + false + } + (_, Ok(None)) => { + warn!("No mount point for {}", path2.display()); + false + } + (Ok(Some(mount1)), Ok(Some(mount2))) => mount1 != mount2, + } +} + +#[cfg(not(target_os = "linux"))] +pub fn path_in_diff_mount_point(_path1: &str, _path2: &str) -> bool { + false +} + +#[cfg(all(test, target_os = "linux"))] +mod tests { + use super::*; + + #[test] + fn test_path_in_diff_mount_point() { + let (empty_path1, path2) = ("", "/"); + let result = path_in_diff_mount_point(empty_path1, path2); + assert_eq!(result, false); + + let (no_mount_point_path, path2) = ("no_mount_point_path_w943nn", "/"); + let result = path_in_diff_mount_point(no_mount_point_path, path2); + assert_eq!(result, false); + + let (not_existed_path, path2) = ("/non_existed_path_eu2yndh", "/"); + let result = path_in_diff_mount_point(not_existed_path, path2); + 
assert_eq!(result, false); + + let (normal_path1, normal_path2) = ("/", "/"); + let result = path_in_diff_mount_point(normal_path1, normal_path2); + assert_eq!(result, false); + } +} diff --git a/components/tikv_util/src/sys/thread.rs b/components/tikv_util/src/sys/thread.rs index 00a6e47b409..60c420661d0 100644 --- a/components/tikv_util/src/sys/thread.rs +++ b/components/tikv_util/src/sys/thread.rs @@ -121,7 +121,7 @@ mod imp { // Unsafe due to FFI. unsafe { let tid = libc::syscall(libc::SYS_gettid); - if libc::setpriority(libc::PRIO_PROCESS as u32, tid as u32, pri) != 0 { + if libc::setpriority(libc::PRIO_PROCESS, tid as u32, pri) != 0 { let e = Error::last_os_error(); return Err(e); } @@ -134,7 +134,7 @@ mod imp { unsafe { let tid = libc::syscall(libc::SYS_gettid); clear_errno(); - let ret = libc::getpriority(libc::PRIO_PROCESS as u32, tid as u32); + let ret = libc::getpriority(libc::PRIO_PROCESS, tid as u32); if ret == -1 { let e = Error::last_os_error(); if let Some(errno) = e.raw_os_error() { diff --git a/components/tipb_helper/Cargo.toml b/components/tipb_helper/Cargo.toml index 31d2c290fdc..bfbadabaea3 100644 --- a/components/tipb_helper/Cargo.toml +++ b/components/tipb_helper/Cargo.toml @@ -7,4 +7,4 @@ publish = false [dependencies] codec = { workspace = true } tidb_query_datatype = { workspace = true } -tipb = { git = "https://github.com/pingcap/tipb.git" } +tipb = { workspace = true } diff --git a/components/tracker/Cargo.toml b/components/tracker/Cargo.toml index b369fab9628..84a3f5da0ab 100644 --- a/components/tracker/Cargo.toml +++ b/components/tracker/Cargo.toml @@ -7,7 +7,7 @@ publish = false [dependencies] collections = { workspace = true } crossbeam-utils = "0.8" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } lazy_static = "1" parking_lot = "0.12" pin-project = "1" diff --git a/components/txn_types/Cargo.toml b/components/txn_types/Cargo.toml index 9ccfe0bb323..0c357ef1dd6 100644 --- 
a/components/txn_types/Cargo.toml +++ b/components/txn_types/Cargo.toml @@ -11,7 +11,7 @@ codec = { workspace = true } collections = { workspace = true } error_code = { workspace = true } farmhash = "1.1.5" -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } log_wrappers = { workspace = true } slog = "2.3" thiserror = "1.0" diff --git a/components/txn_types/src/lib.rs b/components/txn_types/src/lib.rs index edd89256d2b..a1a759b21b9 100644 --- a/components/txn_types/src/lib.rs +++ b/components/txn_types/src/lib.rs @@ -19,8 +19,8 @@ pub use lock::{Lock, LockType, PessimisticLock}; use thiserror::Error; pub use timestamp::{TimeStamp, TsSet, TSO_PHYSICAL_SHIFT_BITS}; pub use types::{ - is_short_value, Key, KvPair, Mutation, MutationType, OldValue, OldValues, TxnExtra, - TxnExtraScheduler, Value, WriteBatchFlags, SHORT_VALUE_MAX_LEN, + insert_old_value_if_resolved, is_short_value, Key, KvPair, Mutation, MutationType, OldValue, + OldValues, TxnExtra, TxnExtraScheduler, Value, WriteBatchFlags, SHORT_VALUE_MAX_LEN, }; pub use write::{Write, WriteRef, WriteType}; diff --git a/components/txn_types/src/lock.rs b/components/txn_types/src/lock.rs index 96c96828bcb..c8e37823bc4 100644 --- a/components/txn_types/src/lock.rs +++ b/components/txn_types/src/lock.rs @@ -33,6 +33,8 @@ const TXN_SIZE_PREFIX: u8 = b't'; const MIN_COMMIT_TS_PREFIX: u8 = b'c'; const ASYNC_COMMIT_PREFIX: u8 = b'a'; const ROLLBACK_TS_PREFIX: u8 = b'r'; +const LAST_CHANGE_PREFIX: u8 = b'l'; +const TXN_SOURCE_PREFIX: u8 = b's'; impl LockType { pub fn from_mutation(mutation: &Mutation) -> Option { @@ -85,6 +87,22 @@ pub struct Lock { // while committing is relatively expensive. So the solution is putting the ts of the rollback // to the lock. pub rollback_ts: Vec, + + /// The commit TS of the latest PUT/DELETE record + pub last_change_ts: TimeStamp, + /// The number of versions that need skipping from the latest version to + /// find the latest PUT/DELETE record. 
+ /// If versions_to_last_change > 0 but last_change_ts == 0, the key does not + /// have a PUT/DELETE record. + pub versions_to_last_change: u64, + /// The source of this txn. It is used by ticdc, if the value is 0 ticdc + /// will sync the kv change event to downstream, if it is not 0, ticdc + /// may ignore this change event. + /// + /// We use `u64` to reserve more space for future use. For now, the upper + /// application is limited to setting this value under `0x80`, + /// so there will no more cost to change it to `u64`. + pub txn_source: u64, } impl std::fmt::Debug for Lock { @@ -108,6 +126,9 @@ impl std::fmt::Debug for Lock { .field("use_async_commit", &self.use_async_commit) .field("secondaries", &secondary_keys) .field("rollback_ts", &self.rollback_ts) + .field("last_change_ts", &self.last_change_ts) + .field("versions_to_last_change", &self.versions_to_last_change) + .field("txn_source", &self.txn_source) .finish() } } @@ -135,6 +156,9 @@ impl Lock { use_async_commit: false, secondaries: Vec::default(), rollback_ts: Vec::default(), + last_change_ts: TimeStamp::zero(), + versions_to_last_change: 0, + txn_source: 0, } } @@ -151,6 +175,24 @@ impl Lock { self } + #[must_use] + pub fn set_last_change( + mut self, + last_change_ts: TimeStamp, + versions_to_last_change: u64, + ) -> Self { + self.last_change_ts = last_change_ts; + self.versions_to_last_change = versions_to_last_change; + self + } + + #[inline] + #[must_use] + pub fn set_txn_source(mut self, source: u64) -> Self { + self.txn_source = source; + self + } + pub fn to_bytes(&self) -> Vec { let mut b = Vec::with_capacity(self.pre_allocate_size()); b.push(self.lock_type.to_u8()); @@ -188,6 +230,15 @@ impl Lock { b.encode_u64(ts.into_inner()).unwrap(); } } + if !self.last_change_ts.is_zero() || self.versions_to_last_change != 0 { + b.push(LAST_CHANGE_PREFIX); + b.encode_u64(self.last_change_ts.into_inner()).unwrap(); + b.encode_var_u64(self.versions_to_last_change).unwrap(); + } + if self.txn_source != 
0 { + b.push(TXN_SOURCE_PREFIX); + b.encode_var_u64(self.txn_source).unwrap(); + } b } @@ -217,6 +268,12 @@ impl Lock { if !self.rollback_ts.is_empty() { size += 1 + MAX_VAR_U64_LEN + size_of::() * self.rollback_ts.len(); } + if !self.last_change_ts.is_zero() || self.versions_to_last_change != 0 { + size += 1 + size_of::() + MAX_VAR_U64_LEN; + } + if self.txn_source != 0 { + size += 1 + MAX_VAR_U64_LEN; + } size } @@ -253,6 +310,9 @@ impl Lock { let mut use_async_commit = false; let mut secondaries = Vec::new(); let mut rollback_ts = Vec::new(); + let mut last_change_ts = TimeStamp::zero(); + let mut versions_to_last_change = 0; + let mut txn_source = 0; while !b.is_empty() { match b.read_u8()? { SHORT_VALUE_PREFIX => { @@ -286,6 +346,13 @@ impl Lock { rollback_ts.push(number::decode_u64(&mut b)?.into()); } } + LAST_CHANGE_PREFIX => { + last_change_ts = number::decode_u64(&mut b)?.into(); + versions_to_last_change = number::decode_var_u64(&mut b)?; + } + TXN_SOURCE_PREFIX => { + txn_source = number::decode_var_u64(&mut b)?; + } _ => { // To support forward compatibility, all fields should be serialized in order // and stop parsing if meets an unknown byte. @@ -302,7 +369,9 @@ impl Lock { for_update_ts, txn_size, min_commit_ts, - ); + ) + .set_last_change(last_change_ts, versions_to_last_change) + .set_txn_source(txn_source); if use_async_commit { lock = lock.use_async_commit(secondaries); } @@ -328,6 +397,8 @@ impl Lock { info.set_use_async_commit(self.use_async_commit); info.set_min_commit_ts(self.min_commit_ts.into_inner()); info.set_secondaries(self.secondaries.into()); + // The client does not care about last_change_ts, versions_to_last_version and + // txn_source. 
info } @@ -434,6 +505,9 @@ pub struct PessimisticLock { pub ttl: u64, pub for_update_ts: TimeStamp, pub min_commit_ts: TimeStamp, + + pub last_change_ts: TimeStamp, + pub versions_to_last_change: u64, } impl PessimisticLock { @@ -448,6 +522,7 @@ impl PessimisticLock { 0, self.min_commit_ts, ) + .set_last_change(self.last_change_ts, self.versions_to_last_change) } // Same with `to_lock` but does not copy the primary key. @@ -462,6 +537,7 @@ impl PessimisticLock { 0, self.min_commit_ts, ) + .set_last_change(self.last_change_ts, self.versions_to_last_change) } pub fn memory_size(&self) -> usize { @@ -477,6 +553,8 @@ impl std::fmt::Debug for PessimisticLock { .field("ttl", &self.ttl) .field("for_update_ts", &self.for_update_ts) .field("min_commit_ts", &self.min_commit_ts) + .field("last_change_ts", &self.last_change_ts) + .field("versions_to_last_change", &self.versions_to_last_change) .finish() } } @@ -687,6 +765,29 @@ mod tests { 555.into(), ) .with_rollback_ts(vec![12.into(), 24.into(), 13.into()]), + Lock::new( + LockType::Lock, + b"pk".to_vec(), + 1.into(), + 10, + None, + 6.into(), + 16, + 8.into(), + ) + .set_last_change(0.into(), 2), + Lock::new( + LockType::Lock, + b"pk".to_vec(), + 1.into(), + 10, + None, + 6.into(), + 16, + 8.into(), + ) + .set_last_change(4.into(), 2) + .set_txn_source(1), ]; for (i, lock) in locks.drain(..).enumerate() { let v = lock.to_bytes(); @@ -931,7 +1032,8 @@ mod tests { b"secondary_kkkkk2".to_vec(), b"secondary_k3k3k3k3k3k3".to_vec(), b"secondary_k4".to_vec(), - ]); + ]) + .set_last_change(80.into(), 4); assert_eq!( format!("{:?}", lock), @@ -939,7 +1041,8 @@ mod tests { short_value: 73686F72745F76616C7565, for_update_ts: TimeStamp(101), txn_size: 10, \ min_commit_ts: TimeStamp(127), use_async_commit: true, \ secondaries: [7365636F6E646172795F6B31, 7365636F6E646172795F6B6B6B6B6B32, \ - 7365636F6E646172795F6B336B336B336B336B336B33, 7365636F6E646172795F6B34], rollback_ts: [] }" + 7365636F6E646172795F6B336B336B336B336B336B33, 
7365636F6E646172795F6B34], rollback_ts: [], \ + last_change_ts: TimeStamp(80), versions_to_last_change: 4, txn_source: 0 }" ); log_wrappers::set_redact_info_log(true); let redact_result = format!("{:?}", lock); @@ -948,7 +1051,8 @@ mod tests { redact_result, "Lock { lock_type: Put, primary_key: ?, start_ts: TimeStamp(100), ttl: 3, \ short_value: ?, for_update_ts: TimeStamp(101), txn_size: 10, min_commit_ts: TimeStamp(127), \ - use_async_commit: true, secondaries: [?, ?, ?, ?], rollback_ts: [] }" + use_async_commit: true, secondaries: [?, ?, ?, ?], rollback_ts: [], \ + last_change_ts: TimeStamp(80), versions_to_last_change: 4, txn_source: 0 }" ); lock.short_value = None; @@ -957,7 +1061,8 @@ mod tests { format!("{:?}", lock), "Lock { lock_type: Put, primary_key: 706B, start_ts: TimeStamp(100), ttl: 3, short_value: , \ for_update_ts: TimeStamp(101), txn_size: 10, min_commit_ts: TimeStamp(127), \ - use_async_commit: true, secondaries: [], rollback_ts: [] }" + use_async_commit: true, secondaries: [], rollback_ts: [], last_change_ts: TimeStamp(80), \ + versions_to_last_change: 4, txn_source: 0 }" ); log_wrappers::set_redact_info_log(true); let redact_result = format!("{:?}", lock); @@ -966,7 +1071,8 @@ mod tests { redact_result, "Lock { lock_type: Put, primary_key: ?, start_ts: TimeStamp(100), ttl: 3, short_value: ?, \ for_update_ts: TimeStamp(101), txn_size: 10, min_commit_ts: TimeStamp(127), \ - use_async_commit: true, secondaries: [], rollback_ts: [] }" + use_async_commit: true, secondaries: [], rollback_ts: [], last_change_ts: TimeStamp(80), \ + versions_to_last_change: 4, txn_source: 0 }" ); } @@ -978,6 +1084,8 @@ mod tests { ttl: 1000, for_update_ts: 10.into(), min_commit_ts: 20.into(), + last_change_ts: 8.into(), + versions_to_last_change: 2, }; let expected_lock = Lock { lock_type: LockType::Pessimistic, @@ -991,6 +1099,9 @@ mod tests { use_async_commit: false, secondaries: vec![], rollback_ts: vec![], + last_change_ts: 8.into(), + versions_to_last_change: 2, + 
txn_source: 0, }; assert_eq!(pessimistic_lock.to_lock(), expected_lock); assert_eq!(pessimistic_lock.into_lock(), expected_lock); @@ -1004,11 +1115,14 @@ mod tests { ttl: 1000, for_update_ts: 10.into(), min_commit_ts: 20.into(), + last_change_ts: 8.into(), + versions_to_last_change: 2, }; assert_eq!( format!("{:?}", pessimistic_lock), "PessimisticLock { primary_key: 7072696D617279, start_ts: TimeStamp(5), ttl: 1000, \ - for_update_ts: TimeStamp(10), min_commit_ts: TimeStamp(20) }" + for_update_ts: TimeStamp(10), min_commit_ts: TimeStamp(20), last_change_ts: TimeStamp(8), \ + versions_to_last_change: 2 }" ); log_wrappers::set_redact_info_log(true); let redact_result = format!("{:?}", pessimistic_lock); @@ -1016,7 +1130,8 @@ mod tests { assert_eq!( redact_result, "PessimisticLock { primary_key: ?, start_ts: TimeStamp(5), ttl: 1000, \ - for_update_ts: TimeStamp(10), min_commit_ts: TimeStamp(20) }" + for_update_ts: TimeStamp(10), min_commit_ts: TimeStamp(20), last_change_ts: TimeStamp(8), \ + versions_to_last_change: 2 }" ); } @@ -1028,8 +1143,10 @@ mod tests { ttl: 1000, for_update_ts: 10.into(), min_commit_ts: 20.into(), + last_change_ts: 8.into(), + versions_to_last_change: 2, }; - // 7 bytes for primary key, 16 bytes for Box<[u8]>, and 4 8-byte integers. - assert_eq!(lock.memory_size(), 7 + 16 + 4 * 8); + // 7 bytes for primary key, 16 bytes for Box<[u8]>, and 6 8-byte integers. + assert_eq!(lock.memory_size(), 7 + 16 + 6 * 8); } } diff --git a/components/txn_types/src/types.rs b/components/txn_types/src/types.rs index 5c9abf0d305..6a2c953afc1 100644 --- a/components/txn_types/src/types.rs +++ b/components/txn_types/src/types.rs @@ -512,6 +512,19 @@ impl OldValue { // MutationType is the type of mutation of the current write. 
pub type OldValues = HashMap)>; +pub fn insert_old_value_if_resolved( + old_values: &mut OldValues, + key: Key, + start_ts: TimeStamp, + old_value: OldValue, + mutation_type: Option, +) { + if old_value.resolved() { + let key = key.append_ts(start_ts); + old_values.insert(key, (old_value, mutation_type)); + } +} + // Extra data fields filled by kvrpcpb::ExtraOp. #[derive(Default, Debug, Clone)] pub struct TxnExtra { @@ -681,7 +694,7 @@ mod tests { let shorter_encoded = Key::from_encoded_slice(&encoded.0[..encoded_len - 9]); assert!(!shorter_encoded.is_encoded_from(&raw)); let mut longer_encoded = encoded.as_encoded().clone(); - longer_encoded.extend(&[0, 0, 0, 0, 0, 0, 0, 0, 0xFF]); + longer_encoded.extend([0, 0, 0, 0, 0, 0, 0, 0, 0xFF]); let longer_encoded = Key::from_encoded(longer_encoded); assert!(!longer_encoded.is_encoded_from(&raw)); diff --git a/components/txn_types/src/write.rs b/components/txn_types/src/write.rs index 755207ed3f3..1a20518e423 100644 --- a/components/txn_types/src/write.rs +++ b/components/txn_types/src/write.rs @@ -28,6 +28,9 @@ const FLAG_ROLLBACK: u8 = b'R'; const FLAG_OVERLAPPED_ROLLBACK: u8 = b'R'; const GC_FENCE_PREFIX: u8 = b'F'; +const LAST_CHANGE_PREFIX: u8 = b'l'; + +const TXN_SOURCE_PREFIX: u8 = b'S'; /// The short value for rollback records which are protected from being /// collapsed. @@ -150,6 +153,14 @@ pub struct Write { /// * `Some(ts)`: A commit record that has been rewritten due to overlapping /// rollback, and it's next version's `commit_ts` is `ts` pub gc_fence: Option, + + /// The commit TS of the latest PUT/DELETE record + pub last_change_ts: TimeStamp, + /// The number of versions that need skipping from this record + /// to find the latest PUT/DELETE record + pub versions_to_last_change: u64, + /// The source of this txn. 
+ pub txn_source: u64, } impl std::fmt::Debug for Write { @@ -169,6 +180,9 @@ impl std::fmt::Debug for Write { ) .field("has_overlapped_rollback", &self.has_overlapped_rollback) .field("gc_fence", &self.gc_fence) + .field("last_change_ts", &self.last_change_ts) + .field("versions_to_last_change", &self.versions_to_last_change) + .field("txn_source", &self.txn_source) .finish() } } @@ -183,6 +197,9 @@ impl Write { short_value, has_overlapped_rollback: false, gc_fence: None, + last_change_ts: TimeStamp::zero(), + versions_to_last_change: 0, + txn_source: 0, } } @@ -200,6 +217,9 @@ impl Write { short_value, has_overlapped_rollback: false, gc_fence: None, + last_change_ts: TimeStamp::zero(), + versions_to_last_change: 0, + txn_source: 0, } } @@ -215,6 +235,24 @@ impl Write { self } + #[must_use] + pub fn set_last_change( + mut self, + last_change_ts: TimeStamp, + versions_to_last_change: u64, + ) -> Self { + self.last_change_ts = last_change_ts; + self.versions_to_last_change = versions_to_last_change; + self + } + + #[inline] + #[must_use] + pub fn set_txn_source(mut self, source: u64) -> Self { + self.txn_source = source; + self + } + #[inline] pub fn parse_type(mut b: &[u8]) -> Result { let write_type_bytes = b @@ -231,6 +269,28 @@ impl Write { short_value: self.short_value.as_deref(), has_overlapped_rollback: self.has_overlapped_rollback, gc_fence: self.gc_fence, + last_change_ts: self.last_change_ts, + versions_to_last_change: self.versions_to_last_change, + txn_source: self.txn_source, + } + } + + /// Returns the new `last_change_ts` and `versions_to_last_change` according + /// to this write record. + pub fn next_last_change_info(&self, commit_ts: TimeStamp) -> (TimeStamp, u64) { + match self.write_type { + WriteType::Put | WriteType::Delete => (commit_ts, 1), + WriteType::Lock | WriteType::Rollback => { + // If neither `last_change_ts` nor `versions_to_last_change` exists, do not + // set `last_change_ts` to indicate we don't know where is the last change. 
+ // This should not happen if data is written in new version TiKV. If we hope to + // support data from old TiKV, consider iterating to the last change to find it. + if !self.last_change_ts.is_zero() || self.versions_to_last_change != 0 { + (self.last_change_ts, self.versions_to_last_change + 1) + } else { + (TimeStamp::zero(), 0) + } + } } } } @@ -255,6 +315,17 @@ pub struct WriteRef<'a> { /// /// See [`Write::gc_fence`] for more detail. pub gc_fence: Option, + + /// The commit TS of the last PUT/DELETE record before this write record. + /// It only exists if this is a LOCK/ROLLBACK record. + pub last_change_ts: TimeStamp, + /// The number of versions that need skipping from this record + /// to find the latest PUT/DELETE record. + /// If versions_to_last_change > 0 but last_change_ts == 0, the key does not + /// have a PUT/DELETE record before this write record. + pub versions_to_last_change: u64, + /// The source of this txn. + pub txn_source: u64, } impl WriteRef<'_> { @@ -272,6 +343,9 @@ impl WriteRef<'_> { let mut short_value = None; let mut has_overlapped_rollback = false; let mut gc_fence = None; + let mut last_change_ts = TimeStamp::zero(); + let mut versions_to_last_change = 0; + let mut txn_source = 0; while !b.is_empty() { match b @@ -296,6 +370,13 @@ impl WriteRef<'_> { has_overlapped_rollback = true; } GC_FENCE_PREFIX => gc_fence = Some(number::decode_u64(&mut b)?.into()), + LAST_CHANGE_PREFIX => { + last_change_ts = number::decode_u64(&mut b)?.into(); + versions_to_last_change = number::decode_var_u64(&mut b)?; + } + TXN_SOURCE_PREFIX => { + txn_source = number::decode_var_u64(&mut b)?; + } _ => { // To support forward compatibility, all fields should be serialized in order // and stop parsing if meets an unknown byte. 
@@ -310,6 +391,9 @@ impl WriteRef<'_> { short_value, has_overlapped_rollback, gc_fence, + last_change_ts, + versions_to_last_change, + txn_source, }) } @@ -329,6 +413,15 @@ impl WriteRef<'_> { b.push(GC_FENCE_PREFIX); b.encode_u64(ts.into_inner()).unwrap(); } + if !self.last_change_ts.is_zero() || self.versions_to_last_change != 0 { + b.push(LAST_CHANGE_PREFIX); + b.encode_u64(self.last_change_ts.into_inner()).unwrap(); + b.encode_var_u64(self.versions_to_last_change).unwrap(); + } + if self.txn_source != 0 { + b.push(TXN_SOURCE_PREFIX); + b.encode_var_u64(self.txn_source).unwrap(); + } b } @@ -341,6 +434,12 @@ impl WriteRef<'_> { if self.gc_fence.is_some() { size += 1 + size_of::(); } + if !self.last_change_ts.is_zero() || self.versions_to_last_change != 0 { + size += 1 + size_of::() + MAX_VAR_U64_LEN; + } + if self.txn_source != 0 { + size += 1 + MAX_VAR_U64_LEN; + } size } @@ -389,6 +488,8 @@ impl WriteRef<'_> { self.short_value.map(|v| v.to_owned()), ) .set_overlapped_rollback(self.has_overlapped_rollback, self.gc_fence) + .set_last_change(self.last_change_ts, self.versions_to_last_change) + .set_txn_source(self.txn_source) } } @@ -447,6 +548,9 @@ mod tests { .set_overlapped_rollback(true, Some(2345678.into())), Write::new(WriteType::Put, 456.into(), Some(b"short_value".to_vec())) .set_overlapped_rollback(true, Some(421397468076048385.into())), + Write::new(WriteType::Lock, 456.into(), None).set_last_change(345.into(), 11), + Write::new(WriteType::Lock, 456.into(), None).set_last_change(0.into(), 11), + Write::new(WriteType::Lock, 456.into(), None).set_txn_source(1), ]; for (i, write) in writes.drain(..).enumerate() { let v = write.as_ref().to_bytes(); diff --git a/engine_store_ffi/src/interfaces.rs b/engine_store_ffi/src/interfaces.rs index 5a39fb1f155..46dc3152ea2 100644 --- a/engine_store_ffi/src/interfaces.rs +++ b/engine_store_ffi/src/interfaces.rs @@ -145,13 +145,7 @@ pub mod root { #[repr(C)] #[derive(Debug)] pub struct PageWithViewVec { - pub inner: * 
mut PageWithView, - pub len: u64, - } - #[repr(C)] - #[derive(Debug)] - pub struct CppStrWithViewVec { - pub inner: * const CppStrWithView, + pub inner: *mut root::DB::PageWithView, pub len: u64, } #[repr(u8)] @@ -248,6 +242,23 @@ pub mod root { Error = 1, NotFound = 2, } + #[repr(u32)] + #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] + pub enum FastAddPeerStatus { + Ok = 0, + WaitForData = 1, + OtherError = 2, + NoSuitable = 3, + BadData = 4, + FailedInject = 5, + } + #[repr(C)] + #[derive(Debug)] + pub struct FastAddPeerRes { + pub status: root::DB::FastAddPeerStatus, + pub apply_state: root::DB::CppStrWithView, + pub region: root::DB::CppStrWithView, + } #[repr(C)] #[derive(Debug)] pub struct RaftStoreProxyFFIHelper { @@ -386,10 +397,8 @@ pub mod root { arg5: u64, ) -> u8, >, - pub fn_create_write_batch: ::std::option::Option< - unsafe extern "C" fn( - ) -> root::DB::RawCppPtr, - >, + pub fn_create_write_batch: + ::std::option::Option root::DB::RawCppPtr>, pub fn_write_batch_put_page: ::std::option::Option< unsafe extern "C" fn( arg1: root::DB::RawVoidPtr, @@ -398,32 +407,17 @@ pub mod root { ), >, pub fn_write_batch_del_page: ::std::option::Option< - unsafe extern "C" fn( - arg1: root::DB::RawVoidPtr, - arg2: root::DB::BaseBuffView, - ), - >, - pub fn_write_batch_size: ::std::option::Option< - unsafe extern "C" fn( - arg1: root::DB::RawVoidPtr, - ) -> u64, - >, - pub fn_write_batch_is_empty: ::std::option::Option< - unsafe extern "C" fn( - arg1: root::DB::RawVoidPtr, - ) -> u8, + unsafe extern "C" fn(arg1: root::DB::RawVoidPtr, arg2: root::DB::BaseBuffView), >, + pub fn_write_batch_size: + ::std::option::Option u64>, + pub fn_write_batch_is_empty: + ::std::option::Option u8>, pub fn_write_batch_merge: ::std::option::Option< - unsafe extern "C" fn( - arg1: root::DB::RawVoidPtr, - arg2: root::DB::RawVoidPtr, - ), - >, - pub fn_write_batch_clear: ::std::option::Option< - unsafe extern "C" fn( - arg1: root::DB::RawVoidPtr, - ), + unsafe extern "C" fn(arg1: 
root::DB::RawVoidPtr, arg2: root::DB::RawVoidPtr), >, + pub fn_write_batch_clear: + ::std::option::Option, pub fn_consume_write_batch: ::std::option::Option< unsafe extern "C" fn( arg1: *const root::DB::EngineStoreServerWrap, @@ -444,15 +438,10 @@ pub mod root { ) -> root::DB::PageWithViewVec, >, pub fn_gc_page_with_view_vec: ::std::option::Option< - unsafe extern "C" fn( - arg1: * mut PageWithView, - arg2: u64, - ), + unsafe extern "C" fn(inner: *mut root::DB::PageWithView, len: u64), >, pub fn_handle_purge_pagestorage: ::std::option::Option< - unsafe extern "C" fn( - arg1: *const root::DB::EngineStoreServerWrap, - ), + unsafe extern "C" fn(arg1: *const root::DB::EngineStoreServerWrap), >, pub fn_handle_seek_ps_key: ::std::option::Option< unsafe extern "C" fn( @@ -460,10 +449,8 @@ pub mod root { arg2: root::DB::BaseBuffView, ) -> root::DB::CppStrWithView, >, - pub fn_is_ps_empty: ::std::option::Option< - unsafe extern "C" fn( - arg1: *const root::DB::EngineStoreServerWrap, - ) -> u8, + pub fn_ps_is_empty: ::std::option::Option< + unsafe extern "C" fn(arg1: *const root::DB::EngineStoreServerWrap) -> u8, >, pub fn_atomic_update_proxy: ::std::option::Option< unsafe extern "C" fn( @@ -548,8 +535,15 @@ pub mod root { leader_safe_ts: u64, ), >, + pub fn_fast_add_peer: ::std::option::Option< + unsafe extern "C" fn( + arg1: *mut root::DB::EngineStoreServerWrap, + region_id: u64, + new_peer_id: u64, + ) -> root::DB::FastAddPeerRes, + >, } - pub const RAFT_STORE_PROXY_VERSION: u64 = 15776819379826780689; + pub const RAFT_STORE_PROXY_VERSION: u64 = 4954147441045435430; pub const RAFT_STORE_PROXY_MAGIC_NUMBER: u32 = 324508639; } } diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index cad6017ffed..eca8c3ea4a6 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -6,9 +6,9 @@ pub mod interfaces; mod lock_cf_reader; pub mod observer; +pub mod ps_engine; mod read_index_helper; mod utils; -pub mod ps_engine; use std::{ cell::RefCell, 
@@ -32,11 +32,11 @@ use protobuf::Message; pub use read_index_helper::ReadIndexClient; pub use self::interfaces::root::DB::{ - BaseBuffView, ColumnFamilyType, CppStrVecView, EngineStoreApplyRes, EngineStoreServerHelper, - EngineStoreServerStatus, FileEncryptionRes, FsStats, HttpRequestRes, HttpRequestStatus, - KVGetStatus, RaftCmdHeader, RaftProxyStatus, RaftStoreProxyFFIHelper, RawCppPtr, + BaseBuffView, ColumnFamilyType, CppStrVecView, CppStrWithView, EngineStoreApplyRes, + EngineStoreServerHelper, EngineStoreServerStatus, FastAddPeerRes, FastAddPeerStatus, + FileEncryptionRes, FsStats, HttpRequestRes, HttpRequestStatus, KVGetStatus, PageWithView, + PageWithViewVec, RaftCmdHeader, RaftProxyStatus, RaftStoreProxyFFIHelper, RawCppPtr, RawCppStringPtr, RawVoidPtr, SSTReaderPtr, StoreStats, WriteCmdType, WriteCmdsView, - CppStrWithView, CppStrWithViewVec, PageWithView, PageWithViewVec, }; use self::interfaces::root::DB::{ ConstRawVoidPtr, FileEncryptionInfoRaw, RaftStoreProxyPtr, RawCppPtrType, RawRustPtr, @@ -996,6 +996,7 @@ impl EngineStoreServerHelper { // of (index,term) before post_exec. DO NOT use it other than CompactLog. // Use (0,0) instead. 
#[allow(clippy::collapsible_else_if)] + #[allow(clippy::bool_to_int_with_if)] pub fn try_flush_data( &self, region_id: u64, @@ -1021,119 +1022,49 @@ impl EngineStoreServerHelper { } } - pub fn create_write_batch( - &self, - ) -> RawCppPtr { + pub fn create_write_batch(&self) -> RawCppPtr { debug_assert!(self.fn_create_write_batch.is_some()); - unsafe { - (self.fn_create_write_batch.into_inner())() - } + unsafe { (self.fn_create_write_batch.into_inner())() } } - pub fn write_batch_put_page( - &self, - wb: RawVoidPtr, - page_id: BaseBuffView, - page: BaseBuffView, - ) { + pub fn write_batch_put_page(&self, wb: RawVoidPtr, page_id: BaseBuffView, page: BaseBuffView) { debug_assert!(self.fn_write_batch_put_page.is_some()); - unsafe { - (self.fn_write_batch_put_page.into_inner())( - wb, - page_id, - page, - ) - } + unsafe { (self.fn_write_batch_put_page.into_inner())(wb, page_id, page) } } - pub fn write_batch_del_page( - &self, - wb: RawVoidPtr, - page_id: BaseBuffView, - ) { + pub fn write_batch_del_page(&self, wb: RawVoidPtr, page_id: BaseBuffView) { debug_assert!(self.fn_write_batch_del_page.is_some()); - unsafe { - (self.fn_write_batch_del_page.into_inner())( - wb, - page_id, - ) - } + unsafe { (self.fn_write_batch_del_page.into_inner())(wb, page_id) } } - pub fn write_batch_size( - &self, - wb: RawVoidPtr, - ) -> u64 { + pub fn write_batch_size(&self, wb: RawVoidPtr) -> u64 { debug_assert!(self.fn_write_batch_size.is_some()); - unsafe { - (self.fn_write_batch_size.into_inner())( - wb, - ) - } + unsafe { (self.fn_write_batch_size.into_inner())(wb) } } - pub fn write_batch_is_empty( - &self, - wb: RawVoidPtr, - ) -> u8 { + pub fn write_batch_is_empty(&self, wb: RawVoidPtr) -> u8 { debug_assert!(self.fn_write_batch_is_empty.is_some()); - unsafe { - (self.fn_write_batch_is_empty.into_inner())( - wb, - ) - } + unsafe { (self.fn_write_batch_is_empty.into_inner())(wb) } } - pub fn write_batch_merge( - &self, - lwb: RawVoidPtr, - rwb: RawVoidPtr, - ) { + pub fn 
write_batch_merge(&self, lwb: RawVoidPtr, rwb: RawVoidPtr) { debug_assert!(self.fn_write_batch_merge.is_some()); - unsafe { - (self.fn_write_batch_merge.into_inner())( - lwb, - rwb, - ) - } + unsafe { (self.fn_write_batch_merge.into_inner())(lwb, rwb) } } - pub fn write_batch_clear( - &self, - wb: RawVoidPtr, - ) { + pub fn write_batch_clear(&self, wb: RawVoidPtr) { debug_assert!(self.fn_write_batch_clear.is_some()); - unsafe { - (self.fn_write_batch_clear.into_inner())( - wb, - ) - } + unsafe { (self.fn_write_batch_clear.into_inner())(wb) } } - pub fn consume_write_batch( - &self, - wb: RawVoidPtr, - ) { + pub fn consume_write_batch(&self, wb: RawVoidPtr) { debug_assert!(self.fn_consume_write_batch.is_some()); - unsafe { - (self.fn_consume_write_batch.into_inner())( - self.inner, - wb, - ) - } + unsafe { (self.fn_consume_write_batch.into_inner())(self.inner, wb) } } - pub fn read_page( - &self, - page_id: BaseBuffView, - ) -> PageWithView { + pub fn read_page(&self, page_id: BaseBuffView) -> PageWithView { debug_assert!(self.fn_handle_read_page.is_some()); - unsafe { - (self.fn_handle_read_page.into_inner())( - self.inner, - page_id, - ) - } + unsafe { (self.fn_handle_read_page.into_inner())(self.inner, page_id) } } pub fn scan_page( @@ -1142,62 +1073,27 @@ impl EngineStoreServerHelper { end_page_id: BaseBuffView, ) -> PageWithViewVec { debug_assert!(self.fn_handle_scan_page.is_some()); - unsafe { - (self.fn_handle_scan_page.into_inner())( - self.inner, - start_page_id, - end_page_id, - ) - } + unsafe { (self.fn_handle_scan_page.into_inner())(self.inner, start_page_id, end_page_id) } } - pub fn gc_page_with_view_vec( - &self, - arg1: * mut PageWithView, - arg2: u64, - ) { + pub fn gc_page_with_view_vec(&self, arg1: *mut PageWithView, arg2: u64) { debug_assert!(self.fn_gc_page_with_view_vec.is_some()); - unsafe { - (self.fn_gc_page_with_view_vec.into_inner())( - arg1, - arg2, - ) - } + unsafe { (self.fn_gc_page_with_view_vec.into_inner())(arg1, arg2) } } - pub fn 
purge_pagestorage( - &self, - ) { + pub fn purge_pagestorage(&self) { debug_assert!(self.fn_handle_purge_pagestorage.is_some()); - unsafe { - (self.fn_handle_purge_pagestorage.into_inner())( - self.inner, - ) - } + unsafe { (self.fn_handle_purge_pagestorage.into_inner())(self.inner) } } - pub fn seek_ps_key( - &self, - page_id: BaseBuffView, - ) -> CppStrWithView { + pub fn seek_ps_key(&self, page_id: BaseBuffView) -> CppStrWithView { debug_assert!(self.fn_handle_seek_ps_key.is_some()); - unsafe { - (self.fn_handle_seek_ps_key.into_inner())( - self.inner, - page_id, - ) - } + unsafe { (self.fn_handle_seek_ps_key.into_inner())(self.inner, page_id) } } - pub fn is_ps_empty( - &self, - ) -> u8 { - debug_assert!(self.fn_is_ps_empty.is_some()); - unsafe { - (self.fn_is_ps_empty.into_inner())( - self.inner, - ) - } + pub fn is_ps_empty(&self) -> u8 { + debug_assert!(self.fn_ps_is_empty.is_some()); + unsafe { (self.fn_ps_is_empty.into_inner())(self.inner) } } pub fn pre_handle_snapshot( @@ -1257,7 +1153,10 @@ impl EngineStoreServerHelper { } } - fn gen_cpp_string(&self, buff: &[u8]) -> RawCppStringPtr { + // Generate a cpp string, so the other side can read. + // The string is owned by the otherside, and will be deleted by + // `gc_raw_cpp_ptr`. 
+ pub fn gen_cpp_string(&self, buff: &[u8]) -> RawCppStringPtr { debug_assert!(self.fn_gen_cpp_string.is_some()); unsafe { (self.fn_gen_cpp_string.into_inner())(buff.into()).into_raw() as RawCppStringPtr } } @@ -1349,6 +1248,11 @@ impl EngineStoreServerHelper { ) } } + + pub fn fast_add_peer(&self, region_id: u64, new_peer_id: u64) -> FastAddPeerRes { + debug_assert!(self.fn_fast_add_peer.is_some()); + unsafe { (self.fn_fast_add_peer.into_inner())(self.inner, region_id, new_peer_id) } + } } #[allow(clippy::clone_on_copy)] @@ -1423,6 +1327,7 @@ pub extern "C" fn ffi_make_timer_task(millis: u64) -> RawRustPtr { } } +#[allow(clippy::bool_to_int_with_if)] pub unsafe extern "C" fn ffi_poll_timer_task(task_ptr: RawVoidPtr, waker: RawVoidPtr) -> u8 { let task = &mut *(task_ptr as *mut utils::TimerTask); let waker = if waker.is_null() { @@ -1436,3 +1341,20 @@ pub unsafe extern "C" fn ffi_poll_timer_task(task_ptr: RawVoidPtr, waker: RawVoi 0 } } + +use serde_derive::{Deserialize, Serialize}; +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(default)] +#[serde(rename_all = "kebab-case")] +pub struct EngineStoreConfig { + pub enable_fast_add_peer: bool, +} + +#[allow(clippy::derivable_impls)] +impl Default for EngineStoreConfig { + fn default() -> Self { + Self { + enable_fast_add_peer: false, + } + } +} diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 9244fc5b06c..e41707e8717 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -1,21 +1,26 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
- use std::{ + collections::hash_map::Entry as MapEntry, + io::Write, ops::DerefMut, path::PathBuf, str::FromStr, - sync::{atomic::Ordering, mpsc, Arc, Mutex, RwLock}, + sync::{ + atomic::{AtomicBool, Ordering}, + mpsc, Arc, Mutex, RwLock, + }, }; use collections::HashMap; use engine_tiflash::FsStatsExt; -use engine_traits::SstMetaInfo; +use engine_traits::{RaftEngine, SstMetaInfo}; use kvproto::{ metapb::Region, raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, CmdType, RaftCmdRequest}, - raft_serverpb::RaftApplyState, + raft_serverpb::{RaftApplyState, RaftMessage}, }; -use raft::StateRole; +use protobuf::Message; +use raft::{eraftpb, eraftpb::MessageType, StateRole}; use raftstore::{ coprocessor::{ AdminObserver, ApplyCtxInfo, ApplySnapshotObserver, BoxAdminObserver, @@ -24,11 +29,15 @@ use raftstore::{ PdTaskObserver, QueryObserver, RegionChangeEvent, RegionChangeObserver, RegionState, StoreSizeInfo, UpdateSafeTsObserver, }, - store, - store::{check_sst_for_ingestion, snap::plain_file_used, SnapKey}, + store::{ + self, check_sst_for_ingestion, + snap::{plain_file_used, SnapEntry}, + SnapKey, SnapManager, Transport, + }, + Error as RaftStoreError, Result as RaftStoreResult, }; use sst_importer::SstImporter; -use tikv_util::{debug, error, info, warn}; +use tikv_util::{box_err, crit, debug, defer, error, info, store::find_peer, warn}; use yatp::{ pool::{Builder, ThreadPool}, task::future::TaskCell, @@ -41,6 +50,13 @@ use crate::{ WriteCmdType, WriteCmds, CF_LOCK, }; +macro_rules! 
fatal { + ($lvl:expr $(, $arg:expr)*) => ({ + crit!($lvl $(, $arg)*); + ::std::process::exit(1) + }) +} + #[allow(clippy::from_over_into)] impl Into for ffi_interfaces::StoreStats { fn into(self) -> FsStatsExt { @@ -90,28 +106,54 @@ impl PrehandleTask { unsafe impl Send for PrehandleTask {} unsafe impl Sync for PrehandleTask {} -pub struct TiFlashObserver { - pub peer_id: u64, +const CACHED_REGION_INFO_SLOT_COUNT: usize = 256; + +#[derive(Debug, Default)] +pub struct CachedRegionInfo { + pub replicated_or_created: AtomicBool, + // TiKV assumes a region's learner peer is added through snapshot. + // If this field is false, will try fast path when meet MsgAppend. + // If this field is true, it means this peer is inited or will be inited by a TiKV snapshot. + // NOTE If we want a fallback, then we must set inited_or_fallback to true, + // Otherwise, a normal snapshot will be neglect in `post_apply_snapshot` and cause data loss. + pub inited_or_fallback: AtomicBool, +} + +pub type CachedRegionInfoMap = HashMap>; + +pub struct TiFlashObserver { + pub store_id: u64, pub engine_store_server_helper: &'static EngineStoreServerHelper, pub engine: TiFlashEngine, + pub raft_engine: ER, pub sst_importer: Arc, pub pre_handle_snapshot_ctx: Arc>, pub snap_handle_pool_size: usize, pub apply_snap_pool: Option>>, pub pending_delete_ssts: Arc>>, + pub cached_region_info: Arc>>, + // TODO should we use a Mutex here? 
+ pub trans: Arc>, + pub snap_mgr: Arc, + pub engine_store_cfg: crate::EngineStoreConfig, } -impl Clone for TiFlashObserver { +impl Clone for TiFlashObserver { fn clone(&self) -> Self { TiFlashObserver { - peer_id: self.peer_id, + store_id: self.store_id, engine_store_server_helper: self.engine_store_server_helper, engine: self.engine.clone(), + raft_engine: self.raft_engine.clone(), sst_importer: self.sst_importer.clone(), pre_handle_snapshot_ctx: self.pre_handle_snapshot_ctx.clone(), snap_handle_pool_size: self.snap_handle_pool_size, apply_snap_pool: self.apply_snap_pool.clone(), pending_delete_ssts: self.pending_delete_ssts.clone(), + cached_region_info: self.cached_region_info.clone(), + trans: self.trans.clone(), + snap_mgr: self.snap_mgr.clone(), + engine_store_cfg: self.engine_store_cfg.clone(), } } } @@ -120,12 +162,361 @@ impl Clone for TiFlashObserver { // avoid being bypassed. const TIFLASH_OBSERVER_PRIORITY: u32 = 0; -impl TiFlashObserver { +// Credit: [splitmix64 algorithm](https://xorshift.di.unimi.it/splitmix64.c) +#[inline] +fn hash_u64(mut i: u64) -> u64 { + i = (i ^ (i >> 30)).wrapping_mul(0xbf58476d1ce4e5b9); + i = (i ^ (i >> 27)).wrapping_mul(0x94d049bb133111eb); + i ^ (i >> 31) +} + +#[allow(dead_code)] +#[inline] +fn unhash_u64(mut i: u64) -> u64 { + i = (i ^ (i >> 31) ^ (i >> 62)).wrapping_mul(0x319642b2d24d8ec3); + i = (i ^ (i >> 27) ^ (i >> 54)).wrapping_mul(0x96de1b173f119089); + i ^ (i >> 30) ^ (i >> 60) +} + +pub fn validate_remote_peer_region( + new_region: &kvproto::metapb::Region, + store_id: u64, + new_peer_id: u64, +) -> bool { + match find_peer(new_region, store_id) { + Some(peer) => peer.get_id() == new_peer_id, + None => false, + } +} + +impl TiFlashObserver { + #[inline] + fn slot_index(id: u64) -> usize { + debug_assert!(CACHED_REGION_INFO_SLOT_COUNT.is_power_of_two()); + hash_u64(id) as usize & (CACHED_REGION_INFO_SLOT_COUNT - 1) + } + + pub fn access_cached_region_info_mut>)>( + &self, + region_id: u64, + mut f: F, + ) -> 
RaftStoreResult<()> { + let slot_id = Self::slot_index(region_id); + let mut guard = match self.cached_region_info.get(slot_id).unwrap().write() { + Ok(g) => g, + Err(_) => return Err(box_err!("access_cached_region_info_mut poisoned")), + }; + f(guard.entry(region_id)); + Ok(()) + } + + pub fn set_inited_or_fallback(&self, region_id: u64, v: bool) -> RaftStoreResult<()> { + self.access_cached_region_info_mut( + region_id, + |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + o.get_mut().inited_or_fallback.store(v, Ordering::SeqCst); + } + MapEntry::Vacant(_) => { + tikv_util::safe_panic!("not inited!"); + } + }, + ) + } + + fn fallback_to_slow_path(&self, region_id: u64) { + // TODO clean local, and prepare to request snapshot from TiKV as a trivial + // procedure. + fail::fail_point!("fallback_to_slow_path_not_allow", |_| {}); + if self.set_inited_or_fallback(region_id, true).is_err() { + tikv_util::safe_panic!("set_inited_or_fallback"); + } + } + + // Returns whether we need to ignore this message and run fast path instead. + pub fn maybe_fast_path(&self, msg: &RaftMessage) -> bool { + if !self.engine_store_cfg.enable_fast_add_peer { + // fast path not enabled + return false; + } + // TODO Need to recover all region infomation from restart. 
+ let inner_msg = msg.get_message(); + if inner_msg.get_msg_type() != MessageType::MsgAppend { + // we only handles the first MsgAppend + return false; + } + let region_id = msg.get_region_id(); + let new_peer_id = msg.get_to_peer().get_id(); + let mut is_first = false; + let mut is_replicated = false; + let f = |info: MapEntry>| { + match info { + MapEntry::Occupied(o) => { + is_first = !o.get().inited_or_fallback.load(Ordering::SeqCst); + // TODO include create + is_replicated = o.get().replicated_or_created.load(Ordering::SeqCst); + if is_first { + // TODO Maybe too much printing + info!("fast path: ongoing {}:{}, skip MsgAppend", self.store_id, region_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, + "is_replicated" => is_replicated, + ); + } + } + MapEntry::Vacant(v) => { + info!("fast path: ongoing {}:{}, first message", self.store_id, region_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, + ); + v.insert(Arc::new(CachedRegionInfo::default())); + is_first = true; + } + } + }; + // Can use immutable version. + self.access_cached_region_info_mut(region_id, f).unwrap(); + + if !is_first { + info!( + "fast path: normal MsgAppend of {}:{}", + self.store_id, region_id + ); + return false; + } + + { + // Peer is not created by Peer::replicate, will cause RegionNotRegistered error, + // see `check_msg`. 
+ if !is_replicated { + info!("fast path: ongoing {}:{}, wait replicating peer", self.store_id, region_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, + ); + return true; + } + } + + info!("fast path: ongoing {}:{}, fetch data from remote peer", self.store_id, region_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + ); + fail::fail_point!("go_fast_path_not_allow", |e| { return false }); + // Feed data + let res = self + .engine_store_server_helper + .fast_add_peer(region_id, new_peer_id); + match res.status { + crate::FastAddPeerStatus::Ok => (), + crate::FastAddPeerStatus::WaitForData => { + error!( + "fast path: ongoing {}:{}. remote peer preparing data, wait", + self.store_id, region_id + ); + return true; + } + _ => { + error!( + "fast path: ongoing {}:{} failed. fetch and replace error {:?}, fallback to normal", + self.store_id, region_id, res + ); + self.fallback_to_slow_path(region_id); + return false; + } + }; + + info!("fast path: ongoing {}:{}, parse", self.store_id, region_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + ); + let apply_state_str = res.apply_state.view.to_slice(); + let region_str = res.region.view.to_slice(); + let mut apply_state = RaftApplyState::default(); + let mut new_region = kvproto::metapb::Region::default(); + apply_state.merge_from_bytes(apply_state_str).unwrap(); + new_region.merge_from_bytes(region_str).unwrap(); + info!("fast path: ongoing {}:{}, start build and send", self.store_id, region_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "new_region" => ?new_region, + "apply_state" => ?apply_state, + ); + match self.build_and_send_snapshot(region_id, new_peer_id, msg, apply_state, new_region) { + Ok(s) => { + match s { + crate::FastAddPeerStatus::Ok => { + info!("fast path: ongoing {}:{}, 
finish build and send", self.store_id, region_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + ); + } + crate::FastAddPeerStatus::WaitForData => { + error!( + "fast path: ongoing {}:{}. remote peer preparing data, wait", + self.store_id, region_id + ); + return true; + } + _ => { + error!("fast path: ongoing {}:{} failed. build and sent snapshot code {:?}", self.store_id, region_id, s; + "is_first" => is_first,); + self.fallback_to_slow_path(region_id); + return false; + } + }; + } + Err(e) => { + error!("fast path: ongoing {}:{} failed. build and sent snapshot error {:?}", self.store_id, region_id, e; + "is_first" => is_first,); + self.fallback_to_slow_path(region_id); + return false; + } + }; + is_first + } + + fn build_and_send_snapshot( + &self, + region_id: u64, + new_peer_id: u64, + msg: &RaftMessage, + apply_state: RaftApplyState, + new_region: kvproto::metapb::Region, + ) -> RaftStoreResult { + let inner_msg = msg.get_message(); + // Build snapshot by get_snapshot_for_building + let (snap, key) = { + // check if the source already knows the know peer + if !validate_remote_peer_region(&new_region, self.store_id, new_peer_id) { + info!( + "fast path: ongoing {}:{}. remote peer has not applied conf change for {}", + self.store_id, region_id, new_peer_id; + "region" => ?new_region, + ); + return Ok(crate::FastAddPeerStatus::WaitForData); + } else { + info!( + "fast path: ongoing {}:{}. remote peer has applied conf change for {}", + self.store_id, region_id, new_peer_id + ); + } + + // Find term of entry at applied_index. + let applied_index = apply_state.get_applied_index(); + let applied_term = match self.raft_engine.get_entry(region_id, applied_index)? 
{ + Some(apply_entry) => apply_entry.get_term(), + None => { + return Err(box_err!( + "can't find entry for applied_index {} of region {}, peer_id: {}", + applied_index, + region_id, + new_peer_id + )); + } + }; + let key = SnapKey::new(region_id, applied_term, applied_index); + self.snap_mgr.register(key.clone(), SnapEntry::Generating); + defer!(self.snap_mgr.deregister(&key, &SnapEntry::Generating)); + let snapshot = self.snap_mgr.get_snapshot_for_building(&key)?; + + (snapshot, key.clone()) + }; + + // Build snapshot by do_snapshot + let mut pb_snapshot: eraftpb::Snapshot = Default::default(); + let pb_snapshot_metadata: &mut eraftpb::SnapshotMetadata = pb_snapshot.mut_metadata(); + let mut snap_data = kvproto::raft_serverpb::RaftSnapshotData::default(); + { + // eraftpb::SnapshotMetadata + for (_, cf) in raftstore::store::snap::SNAPSHOT_CFS_ENUM_PAIR { + let cf_index: RaftStoreResult = snap + .cf_files() + .iter() + .position(|x| &x.cf == cf) + .ok_or(box_err!("can't find index for cf {}", cf)); + let cf_index = cf_index?; + let cf_file = &snap.cf_files()[cf_index]; + let mut path = cf_file.path.clone(); + path.push(cf_file.file_prefix.clone()); + path.set_extension("sst"); + let mut _file = std::fs::File::create(path.as_path())?; + } + snap_data.set_region(new_region.clone()); + snap_data.set_file_size(0); + const SNAPSHOT_VERSION: u64 = 2; + snap_data.set_version(SNAPSHOT_VERSION); + + // SnapshotMeta + // Which is snap.meta_file.meta + let snapshot_meta = raftstore::store::snap::gen_snapshot_meta(snap.cf_files(), true)?; + + // Write MetaFile + { + let v = snapshot_meta.write_to_bytes()?; + let mut f = std::fs::File::create(snap.meta_path())?; + f.write_all(&v[..])?; + f.flush()?; + f.sync_all()?; + } + snap_data.set_meta(snapshot_meta); + } + + // TODO The rest is test, please remove it after we can fetch the real data. 
+ pb_snapshot_metadata + .mut_conf_state() + .mut_voters() + .push(msg.get_from_peer().get_id()); + pb_snapshot_metadata + .mut_conf_state() + .mut_learners() + .push(msg.get_to_peer().get_id()); + pb_snapshot_metadata.set_index(key.idx); + pb_snapshot_metadata.set_term(key.term); + + pb_snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); + + // Send reponse + let mut response = RaftMessage::default(); + let epoch = new_region.get_region_epoch(); + response.set_region_epoch(epoch.clone()); + response.set_region_id(region_id); + response.set_from_peer(msg.get_from_peer().clone()); + response.set_to_peer(msg.get_to_peer().clone()); + response + .mut_message() + .set_msg_type(MessageType::MsgSnapshot); + response.mut_message().set_term(inner_msg.get_term()); + response.mut_message().set_snapshot(pb_snapshot); + debug!( + "!!!! send snapshot key {} raft message {:?} snap data {:?}", + key, response, snap_data + ); + match self.trans.lock() { + Ok(mut trans) => match trans.send(response) { + Ok(_) | Err(RaftStoreError::RegionNotFound(_)) => (), + _ => return Ok(crate::FastAddPeerStatus::OtherError), + }, + Err(e) => return Err(box_err!("send snapshot meets error {:?}", e)), + } + + Ok(crate::FastAddPeerStatus::Ok) + } +} + +impl TiFlashObserver { + #[allow(clippy::too_many_arguments)] pub fn new( - peer_id: u64, + store_id: u64, engine: engine_tiflash::RocksEngine, + raft_engine: ER, sst_importer: Arc, snap_handle_pool_size: usize, + trans: T, + snap_mgr: SnapManager, + engine_store_cfg: crate::EngineStoreConfig, ) -> Self { let engine_store_server_helper = gen_engine_store_server_helper(engine.engine_store_server_helper); @@ -133,15 +524,24 @@ impl TiFlashObserver { let snap_pool = Builder::new(tikv_util::thd_name!("region-task")) .max_thread_count(snap_handle_pool_size) .build_future_pool(); + let mut cached_region_info = Vec::with_capacity(CACHED_REGION_INFO_SLOT_COUNT); + for _ in 0..CACHED_REGION_INFO_SLOT_COUNT { + 
cached_region_info.push(RwLock::new(HashMap::default())); + } TiFlashObserver { - peer_id, + store_id, engine_store_server_helper, engine, + raft_engine, sst_importer, pre_handle_snapshot_ctx: Arc::new(Mutex::new(PrehandleContext::default())), snap_handle_pool_size, apply_snap_pool: Some(Arc::new(snap_pool)), pending_delete_ssts: Arc::new(RwLock::new(vec![])), + cached_region_info: Arc::new(cached_region_info), + trans: Arc::new(Mutex::new(trans)), + snap_mgr: Arc::new(snap_mgr), + engine_store_cfg, } } @@ -245,14 +645,14 @@ impl TiFlashObserver { } } -impl Coprocessor for TiFlashObserver { +impl Coprocessor for TiFlashObserver { fn stop(&self) { - info!("shutdown tiflash observer"; "peer_id" => self.peer_id); + info!("shutdown tiflash observer"; "store_id" => self.store_id); self.apply_snap_pool.as_ref().unwrap().shutdown(); } } -impl AdminObserver for TiFlashObserver { +impl AdminObserver for TiFlashObserver { fn pre_exec_admin( &self, ob_ctx: &mut ObserverContext<'_>, @@ -269,9 +669,13 @@ impl AdminObserver for TiFlashObserver { index, term, ) { - info!("can't flush data, should filter CompactLog"; - "region" => ?ob_ctx.region(), - "req" => ?req, + info!("can't flush data, filter CompactLog"; + "region_id" => ?ob_ctx.region().get_id(), + "region_epoch" => ?ob_ctx.region().get_region_epoch(), + "index" => index, + "term" => term, + "compact_index" => req.get_compact_log().get_compact_index(), + "compact_term" => req.get_compact_log().get_compact_term(), ); return true; } @@ -411,7 +815,7 @@ impl AdminObserver for TiFlashObserver { } } -impl QueryObserver for TiFlashObserver { +impl QueryObserver for TiFlashObserver { fn on_empty_cmd(&self, ob_ctx: &mut ObserverContext<'_>, index: u64, term: u64) { fail::fail_point!("on_empty_cmd_normal", |_| {}); debug!("encounter empty cmd, maybe due to leadership change"; @@ -579,7 +983,7 @@ impl QueryObserver for TiFlashObserver { } } -impl UpdateSafeTsObserver for TiFlashObserver { +impl UpdateSafeTsObserver for 
TiFlashObserver { fn on_update_safe_ts(&self, region_id: u64, self_safe_ts: u64, leader_safe_ts: u64) { self.engine_store_server_helper.handle_safe_ts_update( region_id, @@ -589,7 +993,7 @@ impl UpdateSafeTsObserver for TiFlashObserver { } } -impl RegionChangeObserver for TiFlashObserver { +impl RegionChangeObserver for TiFlashObserver { fn on_region_changed( &self, ob_ctx: &mut ObserverContext<'_>, @@ -600,7 +1004,7 @@ impl RegionChangeObserver for TiFlashObserver { info!( "observe destroy"; "region_id" => ob_ctx.region().get_id(), - "peer_id" => self.peer_id, + "store_id" => self.store_id, ); self.engine_store_server_helper .handle_destroy(ob_ctx.region().get_id()); @@ -632,13 +1036,13 @@ impl RegionChangeObserver for TiFlashObserver { debug!( "observe pre_persist, persist"; "region_id" => ob_ctx.region().get_id(), - "peer_id" => self.peer_id, + "store_id" => self.store_id, ); } else { debug!( "observe pre_persist"; "region_id" => ob_ctx.region().get_id(), - "peer_id" => self.peer_id, + "store_id" => self.store_id, "is_finished" => is_finished, ); }; @@ -649,9 +1053,35 @@ impl RegionChangeObserver for TiFlashObserver { fail::fail_point!("on_pre_persist_with_finish", |_| { true }); false } + + fn should_skip_raft_message(&self, msg: &RaftMessage) -> bool { + let inner_msg = msg.get_message(); + if inner_msg.get_commit() == 0 && inner_msg.get_msg_type() == MessageType::MsgHeartbeat { + } else if inner_msg.get_msg_type() == MessageType::MsgAppend { + return self.maybe_fast_path(&msg); + } + false + } + + fn on_peer_created(&self, region_id: u64) { + let f = |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + o.get_mut() + .replicated_or_created + .store(true, Ordering::SeqCst); + } + MapEntry::Vacant(v) => { + let c = CachedRegionInfo::default(); + c.replicated_or_created.store(true, Ordering::SeqCst); + v.insert(Arc::new(c)); + } + }; + // TODO remove unwrap + self.access_cached_region_info_mut(region_id, f).unwrap(); + } } -impl PdTaskObserver for 
TiFlashObserver { +impl PdTaskObserver for TiFlashObserver { fn on_compute_engine_size(&self, store_size: &mut Option) { let stats = self.engine_store_server_helper.handle_compute_store_stats(); let _ = store_size.insert(StoreSizeInfo { @@ -722,7 +1152,7 @@ fn pre_handle_snapshot_impl( PtrWrapper(ptr) } -impl ApplySnapshotObserver for TiFlashObserver { +impl ApplySnapshotObserver for TiFlashObserver { #[allow(clippy::single_match)] fn pre_apply_snapshot( &self, @@ -744,10 +1174,22 @@ impl ApplySnapshotObserver for TiFlashObserver { Some(s) => s, }; + fail::fail_point!("on_ob_pre_handle_snapshot_delete", |_| { + let ssts = retrieve_sst_files(snap); + for (pathbuf, _) in ssts.iter() { + debug!("delete snapshot file"; "path" => ?pathbuf); + std::fs::remove_file(pathbuf.as_path()).unwrap(); + } + return; + }); + let (sender, receiver) = mpsc::channel(); let task = Arc::new(PrehandleTask::new(receiver, peer_id)); { - let mut lock = self.pre_handle_snapshot_ctx.lock().unwrap(); + let mut lock = match self.pre_handle_snapshot_ctx.lock() { + Ok(l) => l, + Err(_) => fatal!("pre_apply_snapshot poisoned"), + }; let ctx = lock.deref_mut(); ctx.tracer.insert(snap_key.clone(), task.clone()); } @@ -803,32 +1245,64 @@ impl ApplySnapshotObserver for TiFlashObserver { "snap_key" => ?snap_key, "region" => ?ob_ctx.region(), ); + let region_id = ob_ctx.region().get_id(); + let mut should_skip = false; + if self.access_cached_region_info_mut( + region_id, + |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + if !o.get().inited_or_fallback.load(Ordering::SeqCst) { + info!("fast path: applied first snapshot {}:{}, recover MsgAppend", self.store_id, region_id; + "snap_key" => ?snap_key, + ); + } + should_skip = o.get().inited_or_fallback.load(Ordering::SeqCst); + o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); + } + MapEntry::Vacant(_) => { + // Compat no fast add peer logic + // panic!("unknown snapshot!"); + } + }, + ).is_err() { + fatal!("post_apply_snapshot 
poisoned") + }; let snap = match snap { None => return, Some(s) => s, }; let maybe_snapshot = { - let mut lock = self.pre_handle_snapshot_ctx.lock().unwrap(); + let mut lock = match self.pre_handle_snapshot_ctx.lock() { + Ok(l) => l, + Err(_) => fatal!("post_apply_snapshot poisoned"), + }; let ctx = lock.deref_mut(); ctx.tracer.remove(snap_key) }; + if should_skip { + return; + } let need_retry = match maybe_snapshot { Some(t) => { let neer_retry = match t.recv.recv() { Ok(snap_ptr) => { info!("get prehandled snapshot success"; - "peer_id" => ?snap_key, - "region" => ?ob_ctx.region(), + "peer_id" => peer_id, + "snap_key" => ?snap_key, + "region_id" => ob_ctx.region().get_id(), "pending" => self.engine.pending_applies_count.load(Ordering::SeqCst), ); - self.engine_store_server_helper - .apply_pre_handled_snapshot(snap_ptr.0); + if !should_skip { + self.engine_store_server_helper + .apply_pre_handled_snapshot(snap_ptr.0); + } false } Err(_) => { info!("background pre-handle snapshot get error"; + "peer_id" => peer_id, "snap_key" => ?snap_key, - "region" => ?ob_ctx.region(), + "region_id" => ob_ctx.region().get_id(), "pending" => self.engine.pending_applies_count.load(Ordering::SeqCst), ); true @@ -838,7 +1312,8 @@ impl ApplySnapshotObserver for TiFlashObserver { .pending_applies_count .fetch_sub(1, Ordering::SeqCst); info!("apply snapshot finished"; - "peer_id" => ?snap_key, + "peer_id" => peer_id, + "snap_key" => ?snap_key, "region" => ?ob_ctx.region(), "pending" => self.engine.pending_applies_count.load(Ordering::SeqCst), ); @@ -849,14 +1324,15 @@ impl ApplySnapshotObserver for TiFlashObserver { // 1. we can't get snapshot from snap manager at that time. // 2. we disabled background pre handling. 
info!("pre-handled snapshot not found"; + "peer_id" => peer_id, "snap_key" => ?snap_key, - "region" => ?ob_ctx.region(), + "region_id" => ob_ctx.region().get_id(), "pending" => self.engine.pending_applies_count.load(Ordering::SeqCst), ); true } }; - if need_retry { + if need_retry && !should_skip { let ssts = retrieve_sst_files(snap); let ptr = pre_handle_snapshot_impl( self.engine_store_server_helper, @@ -866,13 +1342,15 @@ impl ApplySnapshotObserver for TiFlashObserver { snap_key, ); info!("re-gen pre-handled snapshot success"; + "peer_id" => peer_id, "snap_key" => ?snap_key, - "region" => ?ob_ctx.region(), + "region_id" => ob_ctx.region().get_id(), ); self.engine_store_server_helper .apply_pre_handled_snapshot(ptr.0); info!("apply snapshot finished"; - "peer_id" => ?snap_key, + "peer_id" => peer_id, + "snap_key" => ?snap_key, "region" => ?ob_ctx.region(), "pending" => self.engine.pending_applies_count.load(Ordering::SeqCst), ); diff --git a/engine_store_ffi/src/ps_engine.rs b/engine_store_ffi/src/ps_engine.rs index 73487d51b36..89d50ba9eb7 100644 --- a/engine_store_ffi/src/ps_engine.rs +++ b/engine_store_ffi/src/ps_engine.rs @@ -3,32 +3,27 @@ #![allow(dead_code)] #![allow(unused_variables)] -use std::{fmt, slice}; - use std::{ - fmt::{Formatter, Debug}, - mem, + fmt, + fmt::{Debug, Formatter}, + mem, slice, }; +use byteorder::{BigEndian, ByteOrder}; use engine_traits::{ - Error, - RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch, Result, RaftLogGcTask, - PerfContext, PerfContextExt, PerfContextKind, PerfLevel, + Error, PerfContext, PerfContextExt, PerfContextKind, PerfLevel, RaftEngine, RaftEngineDebug, + RaftEngineReadOnly, RaftLogBatch, RaftLogGcTask, Result, }; - -use tracker::TrackerToken; - -use protobuf::Message; -use raft::eraftpb::Entry; use kvproto::{ metapb::Region, raft_serverpb::{ RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent, StoreRecoverState, }, }; - -use byteorder::{BigEndian, ByteOrder}; -use tikv_util::{info, 
box_err, box_try}; +use protobuf::Message; +use raft::eraftpb::Entry; +use tikv_util::{box_err, box_try, info}; +use tracker::TrackerToken; use crate::{gen_engine_store_server_helper, RawCppPtr}; @@ -95,7 +90,10 @@ impl PSEngineWriteBatch { pub fn new(engine_store_server_helper: isize) -> PSEngineWriteBatch { let helper = gen_engine_store_server_helper(engine_store_server_helper); let raw_write_batch = helper.create_write_batch(); - PSEngineWriteBatch { engine_store_server_helper, raw_write_batch } + PSEngineWriteBatch { + engine_store_server_helper, + raw_write_batch, + } } fn put_page(&mut self, page_id: &[u8], value: &[u8]) -> Result<()> { @@ -150,9 +148,9 @@ impl RaftLogBatch for PSEngineWriteBatch { } fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64) { - // This function is used to clean entries that will be overwritten later. - // TODO: make sure overlapped entries will be overwritten by newer log. - // for index in from..to { + // This function is used to clean entries that will be overwritten + // later. TODO: make sure overlapped entries will be overwritten + // by newer log. 
for index in from..to { // let key = ps_raft_log_key(raft_group_id, index); // self.del_page(&key).unwrap(); // } @@ -216,20 +214,16 @@ impl std::fmt::Debug for PSEngine { impl PSEngine { pub fn new() -> Self { - PSEngine { engine_store_server_helper: 0 } + PSEngine { + engine_store_server_helper: 0, + } } - pub fn init( - &mut self, - engine_store_server_helper: isize, - ) { + pub fn init(&mut self, engine_store_server_helper: isize) { self.engine_store_server_helper = engine_store_server_helper; } - fn get_msg_cf( - &self, - page_id: &[u8], - ) -> Result> { + fn get_msg_cf(&self, page_id: &[u8]) -> Result> { let helper = gen_engine_store_server_helper(self.engine_store_server_helper); let value = helper.read_page(page_id.into()); if value.view.len == 0 { @@ -237,21 +231,20 @@ impl PSEngine { } let mut m = M::default(); - m.merge_from_bytes(unsafe { slice::from_raw_parts(value.view.data as *const u8, value.view.len as usize) })?; + m.merge_from_bytes(unsafe { + slice::from_raw_parts(value.view.data as *const u8, value.view.len as usize) + })?; Ok(Some(m)) } - fn get_value( - &self, - page_id: &[u8], - ) -> Option> { + fn get_value(&self, page_id: &[u8]) -> Option> { let helper = gen_engine_store_server_helper(self.engine_store_server_helper); let value = helper.read_page(page_id.into()); return if value.view.len == 0 { None } else { Some(value.view.to_slice().to_vec()) - } + }; } // Seek the first key >= given key, if not found, return None. 
@@ -268,15 +261,13 @@ impl PSEngine { /// scan the key between start_key(inclusive) and end_key(exclusive), /// the upper bound is omitted if end_key is empty fn scan(&self, start_key: &[u8], end_key: &[u8], mut f: F) -> Result<()> - where - F: FnMut(&[u8], &[u8]) -> Result, + where + F: FnMut(&[u8], &[u8]) -> Result, { let helper = gen_engine_store_server_helper(self.engine_store_server_helper); let values = helper.scan_page(start_key.into(), end_key.into()); for i in 0..values.len { - let value = unsafe { - &*values.inner.offset(i as isize) - }; + let value = unsafe { &*values.inner.offset(i as isize) }; if value.view.len != 0 { if !f(&[], &value.view.to_slice().to_vec())? { break; @@ -290,9 +281,12 @@ impl PSEngine { if from == 0 { let start_key = keys::raft_log_key(raft_group_id, 0); let prefix = keys::raft_log_prefix(raft_group_id); - // TODO: make sure the seek can skip other raft related key and to the first log key + // TODO: make sure the seek can skip other raft related key and to the first log + // key match self.seek(&start_key) { - Some(target_key) if target_key.starts_with(&prefix) => from = box_try!(keys::raft_log_index(&target_key)), + Some(target_key) if target_key.starts_with(&prefix) => { + from = box_try!(keys::raft_log_index(&target_key)) + } // No need to gc. 
_ => return Ok(0), } @@ -343,18 +337,14 @@ impl RaftEngineReadOnly for PSEngine { let mut count = 1; - self.scan( - &start_key, - &end_key, - |_, page| { - let mut entry = Entry::default(); - entry.merge_from_bytes(page)?; - buf.push(entry); - total_size += page.len(); - count += 1; - Ok(total_size < max_size) - }, - )?; + self.scan(&start_key, &end_key, |_, page| { + let mut entry = Entry::default(); + entry.merge_from_bytes(page)?; + buf.push(entry); + total_size += page.len(); + count += 1; + Ok(total_size < max_size) + })?; return Ok(count); } @@ -362,16 +352,12 @@ impl RaftEngineReadOnly for PSEngine { fn get_all_entries_to(&self, region_id: u64, buf: &mut Vec) -> Result<()> { let start_key = keys::raft_log_key(region_id, 0); let end_key = keys::raft_log_key(region_id, u64::MAX); - self.scan( - &start_key, - &end_key, - |_, page| { - let mut entry = Entry::default(); - entry.merge_from_bytes(page)?; - buf.push(entry); - Ok(true) - }, - )?; + self.scan(&start_key, &end_key, |_, page| { + let mut entry = Entry::default(); + entry.merge_from_bytes(page)?; + buf.push(entry); + Ok(true) + })?; Ok(()) } @@ -405,20 +391,16 @@ impl RaftEngineReadOnly for PSEngine { impl RaftEngineDebug for PSEngine { fn scan_entries(&self, raft_group_id: u64, mut f: F) -> Result<()> - where - F: FnMut(&Entry) -> Result, + where + F: FnMut(&Entry) -> Result, { let start_key = keys::raft_log_key(raft_group_id, 0); let end_key = keys::raft_log_key(raft_group_id, u64::MAX); - self.scan( - &start_key, - &end_key, - |_, value| { - let mut entry = Entry::default(); - entry.merge_from_bytes(value)?; - f(&entry) - }, - ); + self.scan(&start_key, &end_key, |_, value| { + let mut entry = Entry::default(); + entry.merge_from_bytes(value)?; + f(&entry) + }); Ok(()) } } @@ -459,16 +441,20 @@ impl RaftEngine for PSEngine { state: &RaftLocalState, batch: &mut Self::LogBatch, ) -> Result<()> { - // info!("try clean raft_group_id {} from {} to {}", raft_group_id, first_index, state.last_index); + // 
info!("try clean raft_group_id {} from {} to {}", raft_group_id, first_index, + // state.last_index); batch.del_page(&keys::raft_state_key(raft_group_id))?; batch.del_page(&keys::region_state_key(raft_group_id))?; batch.del_page(&keys::apply_state_key(raft_group_id))?; if first_index == 0 { let start_key = keys::raft_log_key(raft_group_id, 0); let prefix = keys::raft_log_prefix(raft_group_id); - // TODO: make sure the seek can skip other raft related key and to the first log key + // TODO: make sure the seek can skip other raft related key and to the first log + // key match self.seek(&start_key) { - Some(target_key) if target_key.starts_with(&prefix) => first_index = box_try!(keys::raft_log_index(&target_key)), + Some(target_key) if target_key.starts_with(&prefix) => { + first_index = box_try!(keys::raft_log_index(&target_key)) + } // No need to gc. _ => return Ok(()), } @@ -476,11 +462,14 @@ impl RaftEngine for PSEngine { if first_index >= state.last_index { return Ok(()); } - info!("clean raft_group_id {} from {} to {}", raft_group_id, first_index, state.last_index); + info!( + "clean raft_group_id {} from {} to {}", + raft_group_id, first_index, state.last_index + ); // TODO: find the first raft log index of this raft group if first_index <= state.last_index { for index in first_index..=state.last_index { - batch.del_page( &keys::raft_log_key(raft_group_id, index)); + batch.del_page(&keys::raft_log_key(raft_group_id, index)); } } self.consume(batch, true); @@ -492,7 +481,7 @@ impl RaftEngine for PSEngine { if let Some(max_size) = entries.iter().map(|e| e.compute_size()).max() { let buf = Vec::with_capacity(max_size as usize); wb.append_impl(raft_group_id, &entries, buf)?; - return self.consume(&mut wb, false) + return self.consume(&mut wb, false); } Ok(0) } @@ -516,16 +505,18 @@ impl RaftEngine for PSEngine { Ok(total) } - fn flush_metrics(&self, instance: &str) { - } + fn flush_metrics(&self, instance: &str) {} - fn reset_statistics(&self) { - } + fn 
reset_statistics(&self) {} fn dump_stats(&self) -> Result { Ok(String::from("")) } + fn get_engine_path(&self) -> &str { + "" + } + fn get_engine_size(&self) -> Result { Ok(0) } @@ -538,9 +529,9 @@ impl RaftEngine for PSEngine { } fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> - where - F: FnMut(u64) -> std::result::Result<(), E>, - E: From, + where + F: FnMut(u64) -> std::result::Result<(), E>, + E: From, { let start_key = keys::REGION_META_MIN_KEY; let end_key = keys::REGION_META_MAX_KEY; @@ -582,20 +573,16 @@ impl PerfContextExt for PSEngine { } #[derive(Debug)] -pub struct PSPerfContext { -} +pub struct PSPerfContext {} impl PSPerfContext { pub fn new(level: PerfLevel, kind: PerfContextKind) -> Self { - PSPerfContext { } + PSPerfContext {} } } impl PerfContext for PSPerfContext { - fn start_observe(&mut self) { - } + fn start_observe(&mut self) {} - fn report_metrics(&mut self, trackers: &[TrackerToken]) { - - } + fn report_metrics(&mut self, trackers: &[TrackerToken]) {} } diff --git a/engine_tiflash/src/engine.rs b/engine_tiflash/src/engine.rs index 6a29882b3a5..29118a22023 100644 --- a/engine_tiflash/src/engine.rs +++ b/engine_tiflash/src/engine.rs @@ -13,8 +13,12 @@ use std::{ }; use engine_rocks::{RocksDbVector, RocksEngineIterator, RocksSnapshot}; -use engine_traits::{IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SyncMutable}; +use engine_traits::{ + Checkpointable, Checkpointer, Error, IterOptions, Iterable, KvEngine, Peekable, ReadOptions, + Result, SyncMutable, +}; use rocksdb::{Writable, DB}; +use tikv_util::box_err; use crate::{r2e, util::get_cf_handle}; @@ -247,3 +251,26 @@ impl SyncMutable for RocksEngine { Ok(()) } } + +pub struct TiFlashCheckpointer {} + +impl Checkpointable for RocksEngine { + type Checkpointer = TiFlashCheckpointer; + + fn new_checkpointer(&self) -> Result { + Err(Error::Other("TiFlash don't support Checkpointable".into())) + } +} + +impl Checkpointer for TiFlashCheckpointer { + fn 
create_at( + &mut self, + db_out_dir: &Path, + titan_out_dir: Option<&Path>, + log_size_for_flush: u64, + ) -> Result<()> { + Err(Error::Other( + "TiFlash don't support Checkpointer::create_at".into(), + )) + } +} diff --git a/engine_tiflash/src/proxy_utils.rs b/engine_tiflash/src/proxy_utils.rs index 194a2292e46..c44e355ae59 100644 --- a/engine_tiflash/src/proxy_utils.rs +++ b/engine_tiflash/src/proxy_utils.rs @@ -1,6 +1,7 @@ use crate::util::get_cf_handle; pub fn do_write(cf: &str, key: &[u8]) -> bool { + fail::fail_point!("before_tiflash_do_write", |_| true); match cf { engine_traits::CF_RAFT => true, engine_traits::CF_DEFAULT => { @@ -32,6 +33,7 @@ fn cf_to_name(batch: &crate::RocksWriteBatchVec, cf: u32) -> &'static str { fn check_double_write(batch: &crate::RocksWriteBatchVec) { // It will fire if we write by both observer(compat_old_proxy is not enabled) // and TiKV's WriteBatch. + fail::fail_point!("before_tiflash_check_double_write", |_| {}); tikv_util::debug!("check if double write happens"); for wb in batch.wbs.iter() { for (_, cf, k, _) in wb.iter() { diff --git a/engine_tiflash/src/raft_engine.rs b/engine_tiflash/src/raft_engine.rs index b66a56caadf..da15b1708b8 100644 --- a/engine_tiflash/src/raft_engine.rs +++ b/engine_tiflash/src/raft_engine.rs @@ -339,6 +339,10 @@ impl RaftEngine for RocksEngine { Ok(used_size) } + fn get_engine_path(&self) -> &str { + self.as_inner().path() + } + fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { self.put_msg(keys::STORE_IDENT_KEY, ident) } diff --git a/etc/config-template.toml b/etc/config-template.toml index 92b6454ba29..a2b3ab13b00 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -253,6 +253,12 @@ ## Set it to 0 will cause no space is reserved at all. It's generally used for tests. # reserve-space = "5GB" +## Reserve some space for raft disk if raft disk is separated deployed with kv disk. +## `max(reserve-raft-space, raft disk capacity * 5%)` will be reserved exactly. 
+## +## Set it to 0 will cause no space is reserved at all. It's generally used for tests. +# reserve-raft-space = "1GB" + ## The maximum recovery time after rocksdb detects restorable background errors. When the data belonging ## to the data range is damaged, it will be reported to PD through heartbeat, and PD will add `remove-peer` ## operator to remove this damaged peer. When the damaged peer still exists in the current store, the diff --git a/fuzz/cli.rs b/fuzz/cli.rs index 96972d94565..201e659d8ba 100644 --- a/fuzz/cli.rs +++ b/fuzz/cli.rs @@ -31,7 +31,7 @@ lazy_static! { static ref FUZZ_ROOT: PathBuf = WORKSPACE_ROOT.join("fuzz"); static ref FUZZ_TARGETS: Vec = { let source = FUZZ_ROOT.join("targets/mod.rs"); - let targets_rs = fs::read_to_string(&source).unwrap(); + let targets_rs = fs::read_to_string(source).unwrap(); let match_fuzz_fs = regex::Regex::new(r"pub fn fuzz_(\w+)\(").unwrap(); let target_names = match_fuzz_fs .captures_iter(&targets_rs) @@ -110,7 +110,7 @@ fn write_fuzz_target_source_file(fuzzer: Fuzzer, target: &str) -> Result<()> { template_file_path.display() ))?; - let target_file_path = fuzzer.directory().join(&format!("src/bin/{}.rs", target)); + let target_file_path = fuzzer.directory().join(format!("src/bin/{}.rs", target)); let mut file = fs::OpenOptions::new() .write(true) .create(true) @@ -159,7 +159,7 @@ fn get_seed_dir(target: &str) -> PathBuf { /// Create corpus dir for fuzz target fn create_corpus_dir(base: impl AsRef, target: &str) -> Result { let base = base.as_ref(); - let corpus_dir = base.join(&format!("corpus-{}", target)); + let corpus_dir = base.join(format!("corpus-{}", target)); fs::create_dir_all(&corpus_dir).context(format!( "unable to create corpus dir for {}{}", base.display(), @@ -192,13 +192,13 @@ fn run_afl(target: &str) -> Result<()> { let corpus_dir = create_corpus_dir(fuzzer.directory(), target)?; pre_check( - Command::new("cargo").args(&["afl", "--version"]), + Command::new("cargo").args(["afl", "--version"]), 
"cargo install afl", )?; // 1. cargo afl build (in fuzzer-afl directory) let fuzzer_build = Command::new("cargo") - .args(&["afl", "build", "--bin", target]) + .args(["afl", "build", "--bin", target]) .current_dir(fuzzer.directory()) .spawn() .context(format!("Failed to build {}", fuzzer))? @@ -218,7 +218,7 @@ fn run_afl(target: &str) -> Result<()> { // ``` let instrumented_bin = WORKSPACE_ROOT.join("target/debug").join(target); let fuzzer_bin = Command::new("cargo") - .args(&["afl", "fuzz"]) + .args(["afl", "fuzz"]) .arg("-i") .arg(&seed_dir) .arg("-o") @@ -244,7 +244,7 @@ fn run_afl(target: &str) -> Result<()> { /// Run one target fuzz test using Honggfuzz fn run_honggfuzz(target: &str) -> Result<()> { pre_check( - Command::new("cargo").args(&["hfuzz", "version"]), + Command::new("cargo").args(["hfuzz", "version"]), "cargo install honggfuzz --version 0.5.45", )?; @@ -262,7 +262,7 @@ fn run_honggfuzz(target: &str) -> Result<()> { ); let fuzzer_bin = Command::new("cargo") - .args(&["hfuzz", "run", target]) + .args(["hfuzz", "run", target]) .env("RUSTFLAGS", &rust_flags) .env("HFUZZ_RUN_ARGS", &hfuzz_args) .current_dir(fuzzer.directory()) @@ -321,7 +321,7 @@ fn run_libfuzzer(target: &str) -> Result<()> { asan_options.push_str(" detect_odr_violation=0"); let fuzzer_bin = Command::new("cargo") - .args(&["run", "--target", target_platform, "--bin", target, "--"]) + .args(["run", "--target", target_platform, "--bin", target, "--"]) .arg(&corpus_dir) .arg(&seed_dir) .env("RUSTFLAGS", &rust_flags) diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index 45a657cc4bb..ccac776b508 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -14426,7 +14426,7 @@ "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, - "description": "The time consumed by raftstore events (P99).99", + "description": "The max time consumed by raftstore events", "editable": true, "error": false, "fieldConfig": { @@ 
-14466,7 +14466,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "7.5.7", + "pluginVersion": "7.5.11", "pointradius": 5, "points": false, "renderer": "flot", @@ -14476,12 +14476,25 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_event_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type))", + "exemplar": true, + "expr": "histogram_quantile(1.0, sum(rate(tikv_raftstore_event_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", + "interval": "", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "C", "step": 4 + }, + { + "exemplar": true, + "expr": "histogram_quantile(1.0, sum(rate(tikv_broadcast_normal_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "broadcast_normal", + "refId": "A", + "step": 4 } ], "thresholds": [ @@ -14496,7 +14509,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "0.99 Duration of raft store events", + "title": "Max duration of raft store events", "tooltip": { "msResolution": false, "shared": true, @@ -14557,7 +14570,7 @@ "h": 8, "w": 12, "x": 0, - "y": 20 + "y": 21 }, "heatmap": {}, "hideZeroBuckets": true, @@ -14603,6 +14616,78 @@ "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "The length of peer msgs for each round handling", + "fieldConfig": { + "defaults": {}, + "overrides": [] 
+ }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 21 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 23763572958, + "legend": { + "show": false + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(delta(tikv_raftstore_peer_msg_len_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\"}[1m])) by (le)", + "format": "heatmap", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "refId": "C", + "step": 4 + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Peer msg length distribution", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "none", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null } ], "repeat": null, @@ -17938,6 +18023,366 @@ "yBucketBound": "auto", "yBucketNumber": null, "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Unified read pool task execution time during one schedule.", + "fill": 1, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 4199, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.50, 
sum(rate(tikv_yatp_task_poll_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "50%", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(tikv_yatp_task_poll_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tikv_yatp_task_poll_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "refId": "C" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.999, sum(rate(tikv_yatp_task_poll_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "999%", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Duration of One Time Slice", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 2, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 
null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Unified read pool task total execution duration.", + "fill": 1, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 4202, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.50, sum(rate(tikv_yatp_task_exec_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "50%", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.95, sum(rate(tikv_yatp_task_exec_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tikv_yatp_task_exec_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "refId": "C" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.999, sum(rate(tikv_yatp_task_exec_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", 
instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "999%", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Task Execute Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 2, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Task schedule number of times.", + "fill": 1, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 4204, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.50, sum(rate(tikv_yatp_task_execute_times_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "50%", + "refId": "A" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.95, 
sum(rate(tikv_yatp_task_execute_times_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "95%", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tikv_yatp_task_execute_times_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "99%", + "refId": "C" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.999, sum(rate(tikv_yatp_task_execute_times_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "999%", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Task Schedule Times", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 2, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "title": "Unified Read Pool", diff --git a/new-mock-engine-store/src/lib.rs b/new-mock-engine-store/src/lib.rs index fd5de90a353..58db2bb0f2c 100644 --- a/new-mock-engine-store/src/lib.rs +++ b/new-mock-engine-store/src/lib.rs @@ -1,1192 +1,140 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
#![feature(slice_take)] - -use std::{ - collections::{BTreeMap, HashMap, HashSet}, - pin::Pin, - sync::{atomic::Ordering, Mutex}, - time::Duration, -}; - -pub use engine_store_ffi::{ - interfaces::root::DB as ffi_interfaces, EngineStoreServerHelper, RaftStoreProxyFFIHelper, - RawCppPtr, UnwrapExternCFunc, -}; -use engine_traits::{ - Engines, Iterable, Peekable, SyncMutable, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, -}; -use kvproto::{ - raft_cmdpb::AdminCmdType, - raft_serverpb::{RaftApplyState, RegionLocalState}, -}; -pub use mock_cluster::{ - must_get_equal, must_get_none, Cluster, ProxyConfig, Simulator, TestPdClient, TiFlashEngine, -}; -use protobuf::Message; -use tikv_util::{debug, error, info, warn}; - -use crate::{config::MockConfig, server::ServerCluster}; - pub mod config; pub mod mock_cluster; +pub mod mock_store; pub mod node; pub mod server; pub mod transport_simulate; -type RegionId = u64; -#[derive(Default, Clone)] -pub struct Region { - pub region: kvproto::metapb::Region, - // Which peer is me? - pub peer: kvproto::metapb::Peer, - // in-memory data - pub data: [BTreeMap, Vec>; 3], - // If we a key is deleted, it will immediately be removed from data, - // We will record the key in pending_delete, so we can delete it from disk when flushing. 
- pub pending_delete: [HashSet>; 3], - pub pending_write: [BTreeMap, Vec>; 3], - pub apply_state: kvproto::raft_serverpb::RaftApplyState, - pub applied_term: u64, -} - -impl Region { - fn set_applied(&mut self, index: u64, term: u64) { - self.apply_state.set_applied_index(index); - self.applied_term = term; - } - - fn new(meta: kvproto::metapb::Region) -> Self { - Region { - region: meta, - peer: Default::default(), - data: Default::default(), - pending_delete: Default::default(), - pending_write: Default::default(), - apply_state: Default::default(), - applied_term: 0, - } - } -} - -pub struct EngineStoreServer { - pub id: u64, - pub engines: Option>, - pub kvstore: HashMap>, - pub proxy_compat: bool, - pub mock_cfg: MockConfig, -} - -impl EngineStoreServer { - pub fn new( - id: u64, - engines: Option>, - ) -> Self { - EngineStoreServer { - id, - engines, - kvstore: Default::default(), - proxy_compat: false, - mock_cfg: MockConfig::default(), - } - } - - pub fn get_mem( - &self, - region_id: u64, - cf: ffi_interfaces::ColumnFamilyType, - key: &Vec, - ) -> Option<&Vec> { - match self.kvstore.get(®ion_id) { - Some(region) => { - let bmap = ®ion.data[cf as usize]; - bmap.get(key) - } - None => None, - } - } - - pub fn stop(&mut self) { - for (_, region) in self.kvstore.iter_mut() { - for cf in region.pending_write.iter_mut() { - cf.clear(); - } - for cf in region.pending_delete.iter_mut() { - cf.clear(); - } - for cf in region.data.iter_mut() { - cf.clear(); - } - region.apply_state = Default::default(); - // We don't clear applied_term. - } - } - - // False alarm - #[allow(clippy::needless_collect)] - pub fn restore(&mut self) { - // TODO We should actually read from engine store's persistence. - // However, since mock engine store don't persist itself, - // we read from proxy instead. 
- unsafe { - let region_ids: Vec = self.kvstore.keys().cloned().collect(); - for region_id in region_ids.into_iter() { - load_from_db(self, region_id); - } - } - } -} - -pub struct EngineStoreServerWrap { - pub engine_store_server: *mut EngineStoreServer, - pub maybe_proxy_helper: std::option::Option<*mut RaftStoreProxyFFIHelper>, - // Call `gen_cluster(cluster_ptr)`, and get which cluster this Server belong to. - pub cluster_ptr: isize, -} - -fn set_new_region_peer(new_region: &mut Region, store_id: u64) { - if let Some(peer) = new_region - .region - .get_peers() - .iter() - .find(|&peer| peer.get_store_id() == store_id) - { - new_region.peer = peer.clone(); - } else { - // This happens when region is not found. - } -} - -pub fn make_new_region( - maybe_from_region: Option, - maybe_store_id: Option, -) -> Region { - let mut region = Region { - region: maybe_from_region.unwrap_or_default(), - ..Default::default() - }; - if let Some(store_id) = maybe_store_id { - set_new_region_peer(&mut region, store_id); - } - region - .apply_state - .mut_truncated_state() - .set_index(raftstore::store::RAFT_INIT_LOG_INDEX); - region - .apply_state - .mut_truncated_state() - .set_term(raftstore::store::RAFT_INIT_LOG_TERM); - region.set_applied( - raftstore::store::RAFT_INIT_LOG_INDEX, - raftstore::store::RAFT_INIT_LOG_TERM, - ); - region -} - -fn write_kv_in_mem(region: &mut Box, cf_index: usize, k: &[u8], v: &[u8]) { - let data = &mut region.data[cf_index]; - let pending_delete = &mut region.pending_delete[cf_index]; - let pending_write = &mut region.pending_write[cf_index]; - pending_delete.remove(k); - data.insert(k.to_vec(), v.to_vec()); - pending_write.insert(k.to_vec(), v.to_vec()); -} - -fn delete_kv_in_mem(region: &mut Box, cf_index: usize, k: &[u8]) { - let data = &mut region.data[cf_index]; - let pending_delete = &mut region.pending_delete[cf_index]; - pending_delete.insert(k.to_vec()); - data.remove(k); -} - -unsafe fn load_from_db(store: &mut EngineStoreServer, 
region_id: u64) { - let store_id = store.id; - let engine = &mut store.engines.as_mut().unwrap().kv; - let apply_state: RaftApplyState = engine - .get_msg_cf(CF_RAFT, &keys::apply_state_key(region_id)) - .unwrap() - .unwrap(); - let region_state: RegionLocalState = engine - .get_msg_cf(CF_RAFT, &keys::region_state_key(region_id)) - .unwrap() - .unwrap(); - - let region = store.kvstore.get_mut(®ion_id).unwrap(); - region.apply_state = apply_state; - region.region = region_state.get_region().clone(); - set_new_region_peer(region, store.id); - - for cf in 0..3 { - let cf_name = cf_to_name(cf.into()); - region.data[cf].clear(); - region.pending_delete[cf].clear(); - region.pending_write[cf].clear(); - let start = region.region.get_start_key().to_owned(); - let end = region.region.get_end_key().to_owned(); - engine - .scan(cf_name, &start, &end, false, |k, v| { - let origin_key = if keys::validate_data_key(k) { - keys::origin_key(k).to_vec() - } else { - k.to_vec() - }; - debug!("restored data"; - "store" => store_id, - "region_id" => region_id, - "cf" => cf, - "k" => ?k, - "origin_key" => ?origin_key, - ); - region.data[cf].insert(origin_key, v.to_vec()); - Ok(true) - }) - .unwrap(); - } -} - -unsafe fn write_to_db_data( - store: &mut EngineStoreServer, - region: &mut Box, - reason: String, -) { - info!("mock flush to engine"; - "region" => ?region.region, - "store_id" => store.id, - "reason" => reason - ); - let kv = &mut store.engines.as_mut().unwrap().kv; - for cf in 0..3 { - let pending_write = std::mem::take(region.pending_write.as_mut().get_mut(cf).unwrap()); - let mut pending_remove = - std::mem::take(region.pending_delete.as_mut().get_mut(cf).unwrap()); - for (k, v) in pending_write.into_iter() { - let tikv_key = keys::data_key(k.as_slice()); - let cf_name = cf_to_name(cf.into()); - if !pending_remove.contains(&k) { - kv.rocks.put_cf(cf_name, tikv_key.as_slice(), &v).unwrap(); - } else { - pending_remove.remove(&k); - } - } - let cf_name = 
cf_to_name(cf.into()); - for k in pending_remove.into_iter() { - let tikv_key = keys::data_key(k.as_slice()); - kv.rocks.delete_cf(cf_name, &tikv_key).unwrap(); - } - } -} - -impl EngineStoreServerWrap { - pub fn new( - engine_store_server: *mut EngineStoreServer, - maybe_proxy_helper: std::option::Option<*mut RaftStoreProxyFFIHelper>, - cluster_ptr: isize, - ) -> Self { - Self { - engine_store_server, - maybe_proxy_helper, - cluster_ptr, - } - } - - unsafe fn handle_admin_raft_cmd( - &mut self, - req: &kvproto::raft_cmdpb::AdminRequest, - resp: &kvproto::raft_cmdpb::AdminResponse, - header: ffi_interfaces::RaftCmdHeader, - ) -> ffi_interfaces::EngineStoreApplyRes { - let region_id = header.region_id; - let node_id = (*self.engine_store_server).id; - info!("handle_admin_raft_cmd"; - "node_id"=>node_id, - "request"=>?req, - "response"=>?resp, - "header"=>?header, - "region_id"=>header.region_id, - ); - let do_handle_admin_raft_cmd = - move |region: &mut Box, engine_store_server: &mut EngineStoreServer| { - if region.apply_state.get_applied_index() >= header.index { - // If it is a old entry. 
- error!("obsolete admin index"; - "apply_state"=>?region.apply_state, - "header"=>?header, - "node_id"=>node_id, - ); - panic!("observe obsolete admin index"); - // return ffi_interfaces::EngineStoreApplyRes::None; - } - match req.get_cmd_type() { - AdminCmdType::ChangePeer | AdminCmdType::ChangePeerV2 => { - let new_region_meta = resp.get_change_peer().get_region(); - let old_peer_id = { - let old_region = - engine_store_server.kvstore.get_mut(®ion_id).unwrap(); - old_region.region = new_region_meta.clone(); - region.set_applied(header.index, header.term); - old_region.peer.get_store_id() - }; - - let mut do_remove = true; - if old_peer_id != 0 { - for peer in new_region_meta.get_peers().iter() { - if peer.get_store_id() == old_peer_id { - // Should not remove region - do_remove = false; - } - } - } else { - // If old_peer_id is 0, seems old_region.peer is not set, just neglect - // for convenience. - do_remove = false; - } - if do_remove { - let removed = engine_store_server.kvstore.remove(®ion_id); - // We need to also remove apply state, thus we need to know peer_id - debug!( - "Remove region {:?} peer_id {} at node {}, for new meta {:?}", - removed.unwrap().region, - old_peer_id, - node_id, - new_region_meta - ); - } - } - AdminCmdType::BatchSplit => { - let regions = resp.get_splits().regions.as_ref(); - - for i in 0..regions.len() { - let region_meta = regions.get(i).unwrap(); - if region_meta.id == region_id { - // This is the derived region - debug!( - "region {} is derived by split at peer {} with meta {:?}", - region_meta.id, node_id, region_meta - ); - assert!(engine_store_server.kvstore.contains_key(®ion_meta.id)); - engine_store_server - .kvstore - .get_mut(®ion_meta.id) - .unwrap() - .region = region_meta.clone(); - } else { - // Should split data into new region - debug!( - "new region {} generated by split at peer {} with meta {:?}", - region_meta.id, node_id, region_meta - ); - let new_region = - make_new_region(Some(region_meta.clone()), 
Some(node_id)); - - // No need to split data because all KV are stored in the same - // RocksDB. TODO But we still need - // to clean all in-memory data. - // We can't assert `region_meta.id` is brand new here - engine_store_server - .kvstore - .insert(region_meta.id, Box::new(new_region)); - } - } - } - AdminCmdType::PrepareMerge => { - let tikv_region = resp.get_split().get_left(); - - let _target = req.prepare_merge.as_ref().unwrap().target.as_ref(); - let region_meta = &mut (engine_store_server - .kvstore - .get_mut(®ion_id) - .unwrap() - .region); - - // Increase self region conf version and version - let region_epoch = region_meta.region_epoch.as_mut().unwrap(); - let new_version = region_epoch.version + 1; - region_epoch.set_version(new_version); - // TODO this check may fail - // assert_eq!(tikv_region.get_region_epoch().get_version(), new_version); - let conf_version = region_epoch.conf_ver + 1; - region_epoch.set_conf_ver(conf_version); - assert_eq!(tikv_region.get_region_epoch().get_conf_ver(), conf_version); - - { - let region = engine_store_server.kvstore.get_mut(®ion_id).unwrap(); - region.set_applied(header.index, header.term); - } - // We don't handle MergeState and PeerState here - } - AdminCmdType::CommitMerge => { - { - let tikv_target_region_meta = resp.get_split().get_left(); - - let target_region = - &mut (engine_store_server.kvstore.get_mut(®ion_id).unwrap()); - let target_region_meta = &mut target_region.region; - let target_version = - target_region_meta.get_region_epoch().get_version(); - let source_region = req.get_commit_merge().get_source(); - let source_version = source_region.get_region_epoch().get_version(); - - let new_version = std::cmp::max(source_version, target_version) + 1; - target_region_meta - .mut_region_epoch() - .set_version(new_version); - assert_eq!( - target_region_meta.get_region_epoch().get_version(), - new_version - ); - - // No need to merge data - let source_at_left = if source_region.get_start_key().is_empty() { 
- true - } else if target_region_meta.get_start_key().is_empty() { - false - } else { - source_region - .get_end_key() - .cmp(target_region_meta.get_start_key()) - == std::cmp::Ordering::Equal - }; - - if source_at_left { - target_region_meta - .set_start_key(source_region.get_start_key().to_vec()); - assert_eq!( - tikv_target_region_meta.get_start_key(), - target_region_meta.get_start_key() - ); - } else { - target_region_meta - .set_end_key(source_region.get_end_key().to_vec()); - assert_eq!( - tikv_target_region_meta.get_end_key(), - target_region_meta.get_end_key() - ); - } - target_region.set_applied(header.index, header.term); - } - let to_remove = req.get_commit_merge().get_source().get_id(); - engine_store_server.kvstore.remove(&to_remove); - } - AdminCmdType::RollbackMerge => { - let region = engine_store_server.kvstore.get_mut(®ion_id).unwrap(); - let region_meta = &mut region.region; - let new_version = region_meta.get_region_epoch().get_version() + 1; - let region_epoch = region_meta.region_epoch.as_mut().unwrap(); - region_epoch.set_version(new_version); - - region.set_applied(header.index, header.term); - } - AdminCmdType::CompactLog => { - // We can always do compact, since a executed CompactLog must follow a - // successful persist. - let region = engine_store_server.kvstore.get_mut(®ion_id).unwrap(); - let state = &mut region.apply_state; - let compact_index = req.get_compact_log().get_compact_index(); - let compact_term = req.get_compact_log().get_compact_term(); - state.mut_truncated_state().set_index(compact_index); - state.mut_truncated_state().set_term(compact_term); - region.set_applied(header.index, header.term); - } - _ => { - region.set_applied(header.index, header.term); - } - } - // Do persist or not - match req.get_cmd_type() { - AdminCmdType::CompactLog => { - fail::fail_point!("no_persist_compact_log", |_| { - // Persist data, but don't persist meta. 
- ffi_interfaces::EngineStoreApplyRes::None - }); - ffi_interfaces::EngineStoreApplyRes::Persist - } - AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { - fail::fail_point!("no_persist_flashback", |_| { - // Persist data, but don't persist meta. - ffi_interfaces::EngineStoreApplyRes::None - }); - ffi_interfaces::EngineStoreApplyRes::Persist - } - _ => ffi_interfaces::EngineStoreApplyRes::Persist, - } +pub use mock_store::*; + +pub fn copy_meta_from( + source_engines: &Engines, + target_engines: &Engines, + source: &Region, + target: &mut Region, + new_region_meta: kvproto::metapb::Region, + copy_region_state: bool, + copy_apply_state: bool, + copy_raft_state: bool, +) -> raftstore::Result<()> { + let region_id = source.region.get_id(); + + let mut wb = target_engines.kv.write_batch(); + + // Can't copy this key, otherwise will cause a bootstrap. + // box_try!(wb.put_msg(keys::PREPARE_BOOTSTRAP_KEY, &source.region)); + + // region local state + if copy_region_state { + let mut state = RegionLocalState::default(); + state.set_region(new_region_meta); + box_try!(wb.put_msg_cf(CF_RAFT, &keys::region_state_key(region_id), &state)); + } + + // apply state + if copy_apply_state { + let apply_state: RaftApplyState = + match general_get_apply_state(&source_engines.kv, region_id) { + Some(x) => x, + None => return Err(box_err!("bad RaftApplyState")), }; - - let res = match (*self.engine_store_server).kvstore.entry(region_id) { - std::collections::hash_map::Entry::Occupied(mut o) => { - do_handle_admin_raft_cmd(o.get_mut(), &mut (*self.engine_store_server)) - } - std::collections::hash_map::Entry::Vacant(v) => { - // Currently in tests, we don't handle commands like BatchSplit, - // and sometimes we don't bootstrap region 1, - // so it is normal if we find no region. 
- warn!("region {} not found, create for {}", region_id, node_id); - let new_region = v.insert(Default::default()); - assert!((*self.engine_store_server).kvstore.contains_key(®ion_id)); - do_handle_admin_raft_cmd(new_region, &mut (*self.engine_store_server)) - } - }; - - let region = match (*self.engine_store_server).kvstore.get_mut(®ion_id) { - Some(r) => Some(r), - None => { - warn!( - "still can't find region {} for {}, may be remove due to confchange", - region_id, node_id - ); - None - } - }; - if res == ffi_interfaces::EngineStoreApplyRes::Persist { - // Persist tells ApplyDelegate to do a commit. - // So we also need a persist of actual data on engine-store' side. - if let Some(region) = region { - if req.get_cmd_type() == AdminCmdType::CompactLog { - // We already persist when fn_try_flush_data. - } else { - write_to_db_data( - &mut (*self.engine_store_server), - region, - format!("admin {:?}", req), - ); - } - } - } - res + wb.put_msg_cf(CF_RAFT, &keys::apply_state_key(region_id), &apply_state)?; + target.apply_state = apply_state.clone(); + target.applied_term = source.applied_term; } - unsafe fn handle_write_raft_cmd( - &mut self, - cmds: ffi_interfaces::WriteCmdsView, - header: ffi_interfaces::RaftCmdHeader, - ) -> ffi_interfaces::EngineStoreApplyRes { - let region_id = header.region_id; - let server = &mut (*self.engine_store_server); - let node_id = (*self.engine_store_server).id; - let _kv = &mut (*self.engine_store_server).engines.as_mut().unwrap().kv; - let proxy_compat = server.proxy_compat; - let mut do_handle_write_raft_cmd = move |region: &mut Box| { - if region.apply_state.get_applied_index() >= header.index { - debug!("obsolete write index"; - "apply_state"=>?region.apply_state, - "header"=>?header, - "node_id"=>node_id, - ); - panic!("observe obsolete write index"); - // return ffi_interfaces::EngineStoreApplyRes::None; - } - for i in 0..cmds.len { - let key = &*cmds.keys.add(i as _); - let val = &*cmds.vals.add(i as _); - let k = 
&key.to_slice(); - let v = &val.to_slice(); - let tp = &*cmds.cmd_types.add(i as _); - let cf = &*cmds.cmd_cf.add(i as _); - let cf_index = (*cf) as u8; - debug!( - "handle_write_raft_cmd with kv"; - "k" => ?&k[..std::cmp::min(4usize, k.len())], - "v" => ?&v[..std::cmp::min(4usize, v.len())], - "region_id" => region_id, - "node_id" => server.id, - "header" => ?header, - ); - let _data = &mut region.data[cf_index as usize]; - match tp { - engine_store_ffi::WriteCmdType::Put => { - write_kv_in_mem(region, cf_index as usize, k, v); - } - engine_store_ffi::WriteCmdType::Del => { - delete_kv_in_mem(region, cf_index as usize, k); - } - } - } - // Advance apply index, but do not persist - region.set_applied(header.index, header.term); - if !proxy_compat { - // If we don't support new proxy - write_to_db_data(server, region, "write".to_string()); - } - ffi_interfaces::EngineStoreApplyRes::None - }; + wb.write()?; + target_engines.sync_kv()?; - match (*self.engine_store_server).kvstore.entry(region_id) { - std::collections::hash_map::Entry::Occupied(mut o) => { - do_handle_write_raft_cmd(o.get_mut()) - } - std::collections::hash_map::Entry::Vacant(v) => { - warn!("region {} not found", region_id); - do_handle_write_raft_cmd(v.insert(Default::default())) - } - } - } -} - -unsafe extern "C" fn ffi_set_pb_msg_by_bytes( - type_: ffi_interfaces::MsgPBType, - ptr: ffi_interfaces::RawVoidPtr, - buff: ffi_interfaces::BaseBuffView, -) { - match type_ { - ffi_interfaces::MsgPBType::ReadIndexResponse => { - let v = &mut *(ptr as *mut kvproto::kvrpcpb::ReadIndexResponse); - v.merge_from_bytes(buff.to_slice()).unwrap(); - } - ffi_interfaces::MsgPBType::ServerInfoResponse => { - let v = &mut *(ptr as *mut kvproto::diagnosticspb::ServerInfoResponse); - v.merge_from_bytes(buff.to_slice()).unwrap(); - } - ffi_interfaces::MsgPBType::RegionLocalState => { - let v = &mut *(ptr as *mut kvproto::raft_serverpb::RegionLocalState); - v.merge_from_bytes(buff.to_slice()).unwrap(); - } - } -} - -pub 
fn gen_engine_store_server_helper( - wrap: Pin<&EngineStoreServerWrap>, -) -> EngineStoreServerHelper { - EngineStoreServerHelper { - magic_number: ffi_interfaces::RAFT_STORE_PROXY_MAGIC_NUMBER, - version: ffi_interfaces::RAFT_STORE_PROXY_VERSION, - inner: &(*wrap) as *const EngineStoreServerWrap as *mut _, - fn_gen_cpp_string: Some(ffi_gen_cpp_string), - fn_handle_write_raft_cmd: Some(ffi_handle_write_raft_cmd), - fn_handle_admin_raft_cmd: Some(ffi_handle_admin_raft_cmd), - fn_need_flush_data: Some(ffi_need_flush_data), - fn_try_flush_data: Some(ffi_try_flush_data), - fn_atomic_update_proxy: Some(ffi_atomic_update_proxy), - fn_handle_destroy: Some(ffi_handle_destroy), - fn_handle_ingest_sst: Some(ffi_handle_ingest_sst), - fn_handle_compute_store_stats: Some(ffi_handle_compute_store_stats), - fn_handle_get_engine_store_server_status: None, - fn_pre_handle_snapshot: Some(ffi_pre_handle_snapshot), - fn_apply_pre_handled_snapshot: Some(ffi_apply_pre_handled_snapshot), - fn_handle_http_request: None, - fn_check_http_uri_available: None, - fn_gc_raw_cpp_ptr: Some(ffi_gc_raw_cpp_ptr), - fn_get_config: None, - fn_set_store: None, - fn_set_pb_msg_by_bytes: Some(ffi_set_pb_msg_by_bytes), - fn_handle_safe_ts_update: Some(ffi_handle_safe_ts_update), - } -} - -unsafe fn into_engine_store_server_wrap( - arg1: *const ffi_interfaces::EngineStoreServerWrap, -) -> &'static mut EngineStoreServerWrap { - &mut *(arg1 as *mut EngineStoreServerWrap) -} - -unsafe extern "C" fn ffi_handle_admin_raft_cmd( - arg1: *const ffi_interfaces::EngineStoreServerWrap, - arg2: ffi_interfaces::BaseBuffView, - arg3: ffi_interfaces::BaseBuffView, - arg4: ffi_interfaces::RaftCmdHeader, -) -> ffi_interfaces::EngineStoreApplyRes { - let store = into_engine_store_server_wrap(arg1); - let mut req = kvproto::raft_cmdpb::AdminRequest::default(); - let mut resp = kvproto::raft_cmdpb::AdminResponse::default(); - req.merge_from_bytes(arg2.to_slice()).unwrap(); - resp.merge_from_bytes(arg3.to_slice()).unwrap(); - 
store.handle_admin_raft_cmd(&req, &resp, arg4) -} - -unsafe extern "C" fn ffi_handle_write_raft_cmd( - arg1: *const ffi_interfaces::EngineStoreServerWrap, - arg2: ffi_interfaces::WriteCmdsView, - arg3: ffi_interfaces::RaftCmdHeader, -) -> ffi_interfaces::EngineStoreApplyRes { - let store = into_engine_store_server_wrap(arg1); - store.handle_write_raft_cmd(arg2, arg3) -} - -enum RawCppPtrTypeImpl { - None = 0, - String, - PreHandledSnapshotWithBlock, - WakerNotifier, -} - -// TODO -#[allow(clippy::from_over_into)] -impl From for RawCppPtrTypeImpl { - fn from(o: ffi_interfaces::RawCppPtrType) -> Self { - match o { - 0 => RawCppPtrTypeImpl::None, - 1 => RawCppPtrTypeImpl::String, - 2 => RawCppPtrTypeImpl::PreHandledSnapshotWithBlock, - 3 => RawCppPtrTypeImpl::WakerNotifier, - _ => unreachable!(), - } - } -} - -// TODO remove this warn. -#[allow(clippy::from_over_into)] -impl Into for RawCppPtrTypeImpl { - fn into(self) -> ffi_interfaces::RawCppPtrType { - match self { - RawCppPtrTypeImpl::None => 0, - RawCppPtrTypeImpl::String => 1, - RawCppPtrTypeImpl::PreHandledSnapshotWithBlock => 2, - RawCppPtrTypeImpl::WakerNotifier => 3, - } - } -} - -extern "C" fn ffi_need_flush_data( - _arg1: *mut ffi_interfaces::EngineStoreServerWrap, - _region_id: u64, -) -> u8 { - fail::fail_point!("need_flush_data", |e| e.unwrap().parse::().unwrap()); - true as u8 -} - -unsafe extern "C" fn ffi_try_flush_data( - arg1: *mut ffi_interfaces::EngineStoreServerWrap, - region_id: u64, - _try_until_succeed: u8, - index: u64, - term: u64, -) -> u8 { - let store = into_engine_store_server_wrap(arg1); - let kvstore = &mut (*store.engine_store_server).kvstore; - // If we can't find region here, we return true so proxy can trigger a - // CompactLog. The triggered CompactLog will be handled by - // `handleUselessAdminRaftCmd`, and result in a - // `EngineStoreApplyRes::NotFound`. 
Proxy will print this message and - // continue: `region not found in engine-store, maybe have exec `RemoveNode` - // first`. - let region = match kvstore.get_mut(®ion_id) { - Some(r) => r, - None => { - if (*store.engine_store_server) - .mock_cfg - .panic_when_flush_no_found - .load(Ordering::SeqCst) - { - panic!( - "ffi_try_flush_data no found region {} [index {} term {}], store {}", - region_id, - index, - term, - (*store.engine_store_server).id - ); - } else { - return 1; - } - } - }; - fail::fail_point!("try_flush_data", |e| { - let b = e.unwrap().parse::().unwrap(); - if b == 1 { - write_to_db_data( - &mut (*store.engine_store_server), - region, - "fn_try_flush_data".to_string(), - ); - } - b - }); - write_to_db_data( - &mut (*store.engine_store_server), - region, - "fn_try_flush_data".to_string(), - ); - true as u8 -} - -extern "C" fn ffi_gen_cpp_string(s: ffi_interfaces::BaseBuffView) -> ffi_interfaces::RawCppPtr { - let str = Box::new(Vec::from(s.to_slice())); - let ptr = Box::into_raw(str); - ffi_interfaces::RawCppPtr { - ptr: ptr as *mut _, - type_: RawCppPtrTypeImpl::String.into(), - } -} - -pub struct RawCppStringPtrGuard(ffi_interfaces::RawCppStringPtr); - -impl Default for RawCppStringPtrGuard { - fn default() -> Self { - Self(std::ptr::null_mut()) - } -} - -impl std::convert::AsRef for RawCppStringPtrGuard { - fn as_ref(&self) -> &ffi_interfaces::RawCppStringPtr { - &self.0 - } -} -impl std::convert::AsMut for RawCppStringPtrGuard { - fn as_mut(&mut self) -> &mut ffi_interfaces::RawCppStringPtr { - &mut self.0 - } -} - -impl Drop for RawCppStringPtrGuard { - fn drop(&mut self) { - ffi_interfaces::RawCppPtr { - ptr: self.0 as *mut _, - type_: RawCppPtrTypeImpl::String.into(), + let mut raft_wb = target_engines.raft.log_batch(1024); + // raft state + if copy_raft_state { + let raft_state = match get_raft_local_state(&source_engines.raft, region_id) { + Some(x) => x, + None => return Err(box_err!("bad RaftLocalState")), }; - } -} - -impl 
RawCppStringPtrGuard { - pub fn as_str(&self) -> &[u8] { - let s = self.0 as *mut Vec; - unsafe { &*s } - } -} - -pub struct ProxyNotifier { - cv: std::sync::Condvar, - mutex: Mutex<()>, - // multi notifiers single receiver model. use another flag to avoid waiting until timeout. - flag: std::sync::atomic::AtomicBool, -} - -impl ProxyNotifier { - pub fn blocked_wait_for(&self, timeout: Duration) { - // if flag from false to false, wait for notification. - // if flag from true to false, do nothing. - if !self.flag.swap(false, std::sync::atomic::Ordering::AcqRel) { - { - let lock = self.mutex.lock().unwrap(); - if !self.flag.load(std::sync::atomic::Ordering::Acquire) { - let _cv = self.cv.wait_timeout(lock, timeout); - } - } - self.flag.store(false, std::sync::atomic::Ordering::Release); - } - } - - pub fn wake(&self) { - // if flag from false -> true, then wake up. - // if flag from true -> true, do nothing. - if !self.flag.swap(true, std::sync::atomic::Ordering::AcqRel) { - let _lock = self.mutex.lock().unwrap(); - self.cv.notify_one(); - } - } - - pub fn new_raw() -> RawCppPtr { - let notifier = Box::new(Self { - cv: Default::default(), - mutex: Mutex::new(()), - flag: std::sync::atomic::AtomicBool::new(false), - }); - - RawCppPtr { - ptr: Box::into_raw(notifier) as _, - type_: RawCppPtrTypeImpl::WakerNotifier.into(), - } - } -} - -extern "C" fn ffi_gc_raw_cpp_ptr( - ptr: ffi_interfaces::RawVoidPtr, - tp: ffi_interfaces::RawCppPtrType, -) { - match RawCppPtrTypeImpl::from(tp) { - RawCppPtrTypeImpl::None => {} - RawCppPtrTypeImpl::String => unsafe { - drop(Box::>::from_raw(ptr as *mut _)); - }, - RawCppPtrTypeImpl::PreHandledSnapshotWithBlock => unsafe { - drop(Box::::from_raw(ptr as *mut _)); - }, - RawCppPtrTypeImpl::WakerNotifier => unsafe { - drop(Box::from_raw(ptr as *mut ProxyNotifier)); - }, - } -} - -unsafe extern "C" fn ffi_atomic_update_proxy( - arg1: *mut ffi_interfaces::EngineStoreServerWrap, - arg2: *mut ffi_interfaces::RaftStoreProxyFFIHelper, -) { - 
let store = into_engine_store_server_wrap(arg1); - store.maybe_proxy_helper = Some(&mut *(arg2 as *mut RaftStoreProxyFFIHelper)); -} - -unsafe extern "C" fn ffi_handle_destroy( - arg1: *mut ffi_interfaces::EngineStoreServerWrap, - arg2: u64, -) { - let store = into_engine_store_server_wrap(arg1); - debug!("ffi_handle_destroy {}", arg2); - (*store.engine_store_server).kvstore.remove(&arg2); -} - -type MockRaftProxyHelper = RaftStoreProxyFFIHelper; + raft_wb.put_raft_state(region_id, &raft_state)?; + }; -#[derive(Debug)] -pub struct SSTReader<'a> { - proxy_helper: &'a MockRaftProxyHelper, - inner: ffi_interfaces::SSTReaderPtr, - type_: ffi_interfaces::ColumnFamilyType, + box_try!(target_engines.raft.consume(&mut raft_wb, true)); + Ok(()) } -impl<'a> Drop for SSTReader<'a> { - fn drop(&mut self) { - unsafe { - (self.proxy_helper.sst_reader_interfaces.fn_gc.into_inner())( - self.inner.clone(), - self.type_, - ); - } - } -} +pub fn copy_data_from( + source_engines: &Engines, + target_engines: &Engines, + source: &Region, + target: &mut Region, +) -> raftstore::Result<()> { + let region_id = source.region.get_id(); -impl<'a> SSTReader<'a> { - pub unsafe fn new( - proxy_helper: &'a MockRaftProxyHelper, - view: &'a ffi_interfaces::SSTView, - ) -> Self { - SSTReader { - proxy_helper, - inner: (proxy_helper - .sst_reader_interfaces - .fn_get_sst_reader - .into_inner())(view.clone(), proxy_helper.proxy_ptr.clone()), - type_: view.type_, + // kv data in memory + for cf in 0..3 { + for (k, v) in &source.data[cf] { + write_kv_in_mem(target, cf, k.as_slice(), v.as_slice()); } } - pub unsafe fn remained(&mut self) -> bool { - (self - .proxy_helper - .sst_reader_interfaces - .fn_remained - .into_inner())(self.inner.clone(), self.type_) - != 0 - } - - pub unsafe fn key(&mut self) -> ffi_interfaces::BaseBuffView { - (self.proxy_helper.sst_reader_interfaces.fn_key.into_inner())( - self.inner.clone(), - self.type_, - ) - } - - pub unsafe fn value(&mut self) -> 
ffi_interfaces::BaseBuffView { - (self - .proxy_helper - .sst_reader_interfaces - .fn_value - .into_inner())(self.inner.clone(), self.type_) - } - - pub unsafe fn next(&mut self) { - (self.proxy_helper.sst_reader_interfaces.fn_next.into_inner())( - self.inner.clone(), - self.type_, - ) - } -} - -struct PrehandledSnapshot { - pub region: std::option::Option, -} - -unsafe extern "C" fn ffi_pre_handle_snapshot( - arg1: *mut ffi_interfaces::EngineStoreServerWrap, - region_buff: ffi_interfaces::BaseBuffView, - peer_id: u64, - snaps: ffi_interfaces::SSTViewVec, - index: u64, - term: u64, -) -> ffi_interfaces::RawCppPtr { - let store = into_engine_store_server_wrap(arg1); - let proxy_helper = &mut *(store.maybe_proxy_helper.unwrap()); - let _kvstore = &mut (*store.engine_store_server).kvstore; - let node_id = (*store.engine_store_server).id; - - let mut region_meta = kvproto::metapb::Region::default(); - assert_ne!(region_buff.data, std::ptr::null()); - assert_ne!(region_buff.len, 0); - region_meta - .merge_from_bytes(region_buff.to_slice()) + // raft log + let mut raft_wb = target_engines.raft.log_batch(1024); + let mut entries: Vec = Default::default(); + source_engines + .raft + .get_all_entries_to(region_id, &mut entries) .unwrap(); + debug!("copy raft log {:?}", entries); - let mut region = Box::new(Region::new(region_meta)); - debug!( - "pre handle snaps"; - "peer_id" => peer_id, - "store_id" => node_id, - "region" => ?region.region, - "snap len" => snaps.len, - ); - for i in 0..snaps.len { - let snapshot = snaps.views.add(i as usize); - let view = &*(snapshot as *mut ffi_interfaces::SSTView); - let mut sst_reader = SSTReader::new(proxy_helper, view); - - while sst_reader.remained() { - let key = sst_reader.key(); - let value = sst_reader.value(); - - let cf_index = (*snapshot).type_ as u8; - write_kv_in_mem( - &mut region, - cf_index as usize, - key.to_slice(), - value.to_slice(), - ); - - sst_reader.next(); - } - } - { - region.set_applied(index, term); - 
region.apply_state.mut_truncated_state().set_index(index); - region.apply_state.mut_truncated_state().set_term(term); - } - ffi_interfaces::RawCppPtr { - ptr: Box::into_raw(Box::new(PrehandledSnapshot { - region: Some(*region), - })) as *const Region as ffi_interfaces::RawVoidPtr, - type_: RawCppPtrTypeImpl::PreHandledSnapshotWithBlock.into(), - } + raft_wb.append(region_id, entries)?; + box_try!(target_engines.raft.consume(&mut raft_wb, true)); + Ok(()) } -// In case of newly added cfs. -#[allow(unreachable_patterns)] -pub fn cf_to_name(cf: ffi_interfaces::ColumnFamilyType) -> &'static str { - match cf { - ffi_interfaces::ColumnFamilyType::Lock => CF_LOCK, - ffi_interfaces::ColumnFamilyType::Write => CF_WRITE, - ffi_interfaces::ColumnFamilyType::Default => CF_DEFAULT, - _ => unreachable!(), - } +// TODO Need refactor if moved to raft-engine +pub fn general_get_region_local_state( + engine: &EK, + region_id: u64, +) -> Option { + let region_state_key = keys::region_state_key(region_id); + engine + .get_msg_cf::(CF_RAFT, ®ion_state_key) + .unwrap_or(None) } -unsafe extern "C" fn ffi_handle_safe_ts_update( - arg1: *mut ffi_interfaces::EngineStoreServerWrap, - _region_id: u64, - self_safe_ts: u64, - leader_safe_ts: u64, -) { - let store = into_engine_store_server_wrap(arg1); - let cluster = store.cluster_ptr as *const mock_cluster::Cluster; - assert_eq!(self_safe_ts, (*cluster).test_data.expected_self_safe_ts); - assert_eq!(leader_safe_ts, (*cluster).test_data.expected_leader_safe_ts); +// TODO Need refactor if moved to raft-engine +pub fn general_get_apply_state( + engine: &EK, + region_id: u64, +) -> Option { + let apply_state_key = keys::apply_state_key(region_id); + engine + .get_msg_cf::(CF_RAFT, &apply_state_key) + .unwrap_or(None) } -unsafe extern "C" fn ffi_apply_pre_handled_snapshot( - arg1: *mut ffi_interfaces::EngineStoreServerWrap, - arg2: ffi_interfaces::RawVoidPtr, - _arg3: ffi_interfaces::RawCppPtrType, -) { - let store = 
into_engine_store_server_wrap(arg1); - let region_meta = &mut *(arg2 as *mut PrehandledSnapshot); - let node_id = (*store.engine_store_server).id; - - let region_id = region_meta.region.as_ref().unwrap().region.id; - - let _ = &(*store.engine_store_server) - .kvstore - .insert(region_id, Box::new(region_meta.region.take().unwrap())); - - let region = (*store.engine_store_server) - .kvstore - .get_mut(®ion_id) - .unwrap(); - - debug!( - "apply prehandled snap"; - "store_id" => node_id, - "region" => ?region.region, - ); - write_to_db_data( - &mut (*store.engine_store_server), - region, - String::from("prehandle-snap"), - ); +pub fn get_region_local_state( + engine: &engine_rocks::RocksEngine, + region_id: u64, +) -> Option { + general_get_region_local_state(engine, region_id) } -unsafe extern "C" fn ffi_handle_ingest_sst( - arg1: *mut ffi_interfaces::EngineStoreServerWrap, - snaps: ffi_interfaces::SSTViewVec, - header: ffi_interfaces::RaftCmdHeader, -) -> ffi_interfaces::EngineStoreApplyRes { - let store = into_engine_store_server_wrap(arg1); - let node_id = (*store.engine_store_server).id; - let proxy_helper = &mut *(store.maybe_proxy_helper.unwrap()); - - let region_id = header.region_id; - let kvstore = &mut (*store.engine_store_server).kvstore; - let _kv = &mut (*store.engine_store_server).engines.as_mut().unwrap().kv; - - match kvstore.entry(region_id) { - std::collections::hash_map::Entry::Occupied(_o) => {} - std::collections::hash_map::Entry::Vacant(v) => { - // When we remove hacked code in handle_raft_entry_normal during migration, - // some tests in handle_raft_entry_normal may fail, since it can observe a empty - // cmd, thus creating region. 
- warn!( - "region {} not found when ingest, create for {}", - region_id, node_id - ); - let _ = v.insert(Default::default()); - } - } - let region = kvstore.get_mut(®ion_id).unwrap(); - - let index = header.index; - let term = header.term; - debug!("handle ingest sst"; - "header" => ?header, - "region_id" => region_id, - "snap len" => snaps.len, - ); - - for i in 0..snaps.len { - let snapshot = snaps.views.add(i as usize); - // let _path = std::str::from_utf8_unchecked((*snapshot).path.to_slice()); - let mut sst_reader = - SSTReader::new(proxy_helper, &*(snapshot as *mut ffi_interfaces::SSTView)); - while sst_reader.remained() { - let key = sst_reader.key(); - let value = sst_reader.value(); - let cf_index = (*snapshot).type_ as usize; - write_kv_in_mem(region, cf_index, key.to_slice(), value.to_slice()); - sst_reader.next(); - } - } - - { - region.set_applied(header.index, header.term); - region.apply_state.mut_truncated_state().set_index(index); - region.apply_state.mut_truncated_state().set_term(term); - } - - fail::fail_point!("on_handle_ingest_sst_return", |_e| { - ffi_interfaces::EngineStoreApplyRes::None - }); - write_to_db_data( - &mut (*store.engine_store_server), - region, - String::from("ingest-sst"), - ); - ffi_interfaces::EngineStoreApplyRes::Persist +// TODO Need refactor if moved to raft-engine +pub fn get_apply_state( + engine: &engine_rocks::RocksEngine, + region_id: u64, +) -> Option { + general_get_apply_state(engine, region_id) } -unsafe extern "C" fn ffi_handle_compute_store_stats( - _arg1: *mut ffi_interfaces::EngineStoreServerWrap, -) -> ffi_interfaces::StoreStats { - ffi_interfaces::StoreStats { - fs_stats: ffi_interfaces::FsStats { - capacity_size: 444444, - used_size: 111111, - avail_size: 333333, - ok: 1, - }, - engine_bytes_written: 0, - engine_keys_written: 0, - engine_bytes_read: 0, - engine_keys_read: 0, +pub fn get_raft_local_state( + raft_engine: &ER, + region_id: u64, +) -> Option { + match raft_engine.get_raft_state(region_id) { 
+ Ok(Some(x)) => Some(x), + _ => None, } } diff --git a/new-mock-engine-store/src/mock_cluster.rs b/new-mock-engine-store/src/mock_cluster.rs index 5be1f40759c..d1a6e175287 100644 --- a/new-mock-engine-store/src/mock_cluster.rs +++ b/new-mock-engine-store/src/mock_cluster.rs @@ -92,7 +92,7 @@ pub struct TestData { pub struct Cluster> { // Helper to set ffi_helper_set. - ffi_helper_lst: Vec, + pub ffi_helper_lst: Vec, pub ffi_helper_set: Arc>>, pub cfg: Config, @@ -112,6 +112,8 @@ pub struct Cluster> { pub test_data: TestData, } +impl> std::panic::UnwindSafe for Cluster {} + impl> Cluster { pub fn new( id: u64, @@ -230,6 +232,28 @@ impl> Cluster { ) } + pub fn iter_ffi_helpers( + &self, + store_ids: Option>, + f: &mut dyn FnMut(u64, &engine_rocks::RocksEngine, &mut FFIHelperSet), + ) { + let ids = match store_ids { + Some(ids) => ids, + None => self.engines.keys().copied().collect::>(), + }; + for id in ids { + let engine = self.get_engine(id); + let lock = self.ffi_helper_set.lock(); + match lock { + Ok(mut l) => { + let ffiset = l.get_mut(&id).unwrap(); + f(id, &engine, ffiset); + } + Err(_) => std::process::exit(1), + } + } + } + pub fn create_engines(&mut self) { self.io_rate_limiter = Some(Arc::new( self.cfg @@ -245,17 +269,28 @@ impl> Cluster { pub fn run(&mut self) { self.create_engines(); self.bootstrap_region().unwrap(); + self.bootstrap_ffi_helper_set(); self.start().unwrap(); } pub fn run_conf_change(&mut self) -> u64 { self.create_engines(); let region_id = self.bootstrap_conf_change(); + self.bootstrap_ffi_helper_set(); // Will not start new nodes in `start` self.start().unwrap(); region_id } + pub fn run_conf_change_no_start(&mut self) -> u64 { + self.create_engines(); + let region_id = self.bootstrap_conf_change(); + self.bootstrap_ffi_helper_set(); + region_id + } + + /// We need to create FFIHelperSet while we create engine. + /// And later set its `node_id` when we are allocated one when start. 
pub fn create_ffi_helper_set( &mut self, engines: Engines, @@ -294,6 +329,9 @@ impl> Cluster { self.ffi_helper_lst.push(ffi_helper_set); } + // If index is None, use the last in the list, which is added by + // create_ffi_helper_set. In most cases, index is `Some(0)`, which means we + // will use the first. pub fn associate_ffi_helper_set(&mut self, index: Option, node_id: u64) { let mut ffi_helper_set = if let Some(i) = index { self.ffi_helper_lst.remove(i) @@ -307,6 +345,17 @@ impl> Cluster { .insert(node_id, ffi_helper_set); } + pub fn bootstrap_ffi_helper_set(&mut self) { + let mut node_ids: Vec = self.engines.iter().map(|(&id, _)| id).collect(); + // We force iterate engines in sorted order. + node_ids.sort(); + for (_, node_id) in node_ids.iter().enumerate() { + let node_id = *node_id; + // Always at the front of the vector. + self.associate_ffi_helper_set(Some(0), node_id); + } + } + pub fn create_engine( &mut self, router: Option>, @@ -325,16 +374,24 @@ impl> Cluster { } pub fn start(&mut self) -> ServerResult<()> { + self.start_with(Default::default()) + } + + pub fn start_with(&mut self, skip_set: HashSet) -> ServerResult<()> { init_global_ffi_helper_set(); // Try recover from last shutdown. - let node_ids: Vec = self.engines.iter().map(|(&id, _)| id).collect(); - for node_id in node_ids { + // `self.engines` is inited in bootstrap_region or bootstrap_conf_change. + let mut node_ids: Vec = self.engines.iter().map(|(&id, _)| id).collect(); + // We force iterate engines in sorted order. + node_ids.sort(); + for (cnt, node_id) in node_ids.iter().enumerate() { + let node_id = *node_id; + if skip_set.contains(&cnt) { + tikv_util::info!("skip start at {} is {}", cnt, node_id); + continue; + } debug!("recover node"; "node_id" => node_id); - let _engines = self.engines.get_mut(&node_id).unwrap().clone(); - let _key_mgr = self.key_managers_map[&node_id].clone(); - // Always at the front of the vector. 
- self.associate_ffi_helper_set(Some(0), node_id); // Like TiKVServer::init self.run_node(node_id)?; // Since we use None to create_ffi_helper_set, we must init again. @@ -349,7 +406,12 @@ impl> Cluster { } // Try start new nodes. + // Normally, this branch will not be called, since self.engines are already + // added in bootstrap_region or bootstrap_conf_change. for _ in 0..self.count - self.engines.len() { + if !skip_set.is_empty() { + panic!("Error when start with skip set"); + } let (router, system) = create_raft_batch_system(&self.cfg.raft_store); self.create_engine(Some(router.clone())); @@ -380,6 +442,8 @@ impl> Cluster { self.key_managers_map.insert(node_id, key_manager.clone()); self.associate_ffi_helper_set(None, node_id); } + assert_eq!(self.count, self.engines.len()); + assert_eq!(self.count, self.dbs.len()); Ok(()) } @@ -1015,6 +1079,10 @@ impl> Cluster { &self.engines[&node_id].kv } + pub fn get_engines(&self, node_id: u64) -> &Engines { + &self.engines[&node_id] + } + pub fn get_raw_engine(&self, node_id: u64) -> Arc { Arc::clone(self.engines[&node_id].kv.bad_downcast()) } @@ -1023,6 +1091,13 @@ impl> Cluster { &self.get_tiflash_engine(node_id).rocks } + pub fn clear_send_filters(&mut self) { + let mut sim = self.sim.wl(); + for node_id in sim.get_node_ids() { + sim.clear_send_filters(node_id); + } + } + pub fn must_transfer_leader(&mut self, region_id: u64, leader: metapb::Peer) { let timer = Instant::now(); loop { diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs new file mode 100644 index 00000000000..1b90d1749ea --- /dev/null +++ b/new-mock-engine-store/src/mock_store.rs @@ -0,0 +1,1443 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +pub use std::{ + cell::RefCell, + collections::{BTreeMap, HashMap, HashSet}, + pin::Pin, + sync::{ + atomic::{AtomicU64, Ordering}, + Mutex, + }, + time::Duration, +}; + +pub use engine_store_ffi::{ + interfaces::root::DB as ffi_interfaces, EngineStoreServerHelper, RaftStoreProxyFFIHelper, + RawCppPtr, RawVoidPtr, UnwrapExternCFunc, +}; +pub use engine_traits::{ + Engines, Iterable, KvEngine, Mutable, Peekable, RaftEngine, RaftLogBatch, SyncMutable, + WriteBatch, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, +}; +pub use kvproto::{ + raft_cmdpb::AdminCmdType, + raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState}, +}; +pub use protobuf::Message; +pub use tikv_util::{box_err, box_try, debug, error, info, warn}; + +use crate::node::NodeCluster; +pub use crate::{ + config::MockConfig, + copy_data_from, copy_meta_from, general_get_apply_state, general_get_region_local_state, + get_apply_state, get_raft_local_state, get_region_local_state, mock_cluster, + mock_cluster::{ + must_get_equal, must_get_none, Cluster, ProxyConfig, Simulator, TestPdClient, TiFlashEngine, + }, + server::ServerCluster, +}; + +type RegionId = u64; +#[derive(Default, Clone)] +pub struct Region { + pub region: kvproto::metapb::Region, + // Which peer is me? + pub peer: kvproto::metapb::Peer, + // in-memory data + pub data: [BTreeMap, Vec>; 3], + // If we a key is deleted, it will immediately be removed from data, + // We will record the key in pending_delete, so we can delete it from disk when flushing. 
+ pub pending_delete: [HashSet>; 3], + pub pending_write: [BTreeMap, Vec>; 3], + pub apply_state: kvproto::raft_serverpb::RaftApplyState, + pub applied_term: u64, +} + +impl Region { + fn set_applied(&mut self, index: u64, term: u64) { + self.apply_state.set_applied_index(index); + self.applied_term = term; + } + + fn new(meta: kvproto::metapb::Region) -> Self { + Region { + region: meta, + peer: Default::default(), + data: Default::default(), + pending_delete: Default::default(), + pending_write: Default::default(), + apply_state: Default::default(), + applied_term: 0, + } + } +} + +#[derive(Default)] +pub struct RegionStats { + pub pre_handle_count: AtomicU64, +} + +pub struct EngineStoreServer { + pub id: u64, + pub engines: Option>, + pub kvstore: HashMap>, + pub proxy_compat: bool, + pub mock_cfg: MockConfig, + pub region_states: RefCell>, +} + +impl EngineStoreServer { + pub fn new( + id: u64, + engines: Option>, + ) -> Self { + EngineStoreServer { + id, + engines, + kvstore: Default::default(), + proxy_compat: false, + mock_cfg: MockConfig::default(), + region_states: RefCell::new(Default::default()), + } + } + + pub fn mutate_region_states(&self, region_id: RegionId, f: F) { + let has = self.region_states.borrow().contains_key(®ion_id); + if !has { + self.region_states + .borrow_mut() + .insert(region_id, Default::default()); + } + f(self.region_states.borrow_mut().get_mut(®ion_id).unwrap()) + } + + pub fn get_mem( + &self, + region_id: u64, + cf: ffi_interfaces::ColumnFamilyType, + key: &Vec, + ) -> Option<&Vec> { + match self.kvstore.get(®ion_id) { + Some(region) => { + let bmap = ®ion.data[cf as usize]; + bmap.get(key) + } + None => None, + } + } + + pub fn stop(&mut self) { + for (_, region) in self.kvstore.iter_mut() { + for cf in region.pending_write.iter_mut() { + cf.clear(); + } + for cf in region.pending_delete.iter_mut() { + cf.clear(); + } + for cf in region.data.iter_mut() { + cf.clear(); + } + region.apply_state = Default::default(); + // We 
don't clear applied_term. + } + } + + // False alarm + #[allow(clippy::needless_collect)] + pub fn restore(&mut self) { + // TODO We should actually read from engine store's persistence. + // However, since mock engine store don't persist itself, + // we read from proxy instead. + unsafe { + let region_ids: Vec = self.kvstore.keys().cloned().collect(); + for region_id in region_ids.into_iter() { + load_from_db(self, region_id); + } + } + } +} + +pub struct EngineStoreServerWrap { + pub engine_store_server: *mut EngineStoreServer, + pub maybe_proxy_helper: std::option::Option<*mut RaftStoreProxyFFIHelper>, + // Call `gen_cluster(cluster_ptr)`, and get which cluster this Server belong to. + pub cluster_ptr: isize, +} + +fn set_new_region_peer(new_region: &mut Region, store_id: u64) { + if let Some(peer) = new_region + .region + .get_peers() + .iter() + .find(|&peer| peer.get_store_id() == store_id) + { + new_region.peer = peer.clone(); + } else { + // This happens when region is not found. 
+ } +} + +pub fn make_new_region( + maybe_from_region: Option, + maybe_store_id: Option, +) -> Region { + let mut region = Region { + region: maybe_from_region.unwrap_or_default(), + ..Default::default() + }; + if let Some(store_id) = maybe_store_id { + set_new_region_peer(&mut region, store_id); + } + region + .apply_state + .mut_truncated_state() + .set_index(raftstore::store::RAFT_INIT_LOG_INDEX); + region + .apply_state + .mut_truncated_state() + .set_term(raftstore::store::RAFT_INIT_LOG_TERM); + region.set_applied( + raftstore::store::RAFT_INIT_LOG_INDEX, + raftstore::store::RAFT_INIT_LOG_TERM, + ); + region +} + +pub fn write_kv_in_mem(region: &mut Region, cf_index: usize, k: &[u8], v: &[u8]) { + let data = &mut region.data[cf_index]; + let pending_delete = &mut region.pending_delete[cf_index]; + let pending_write = &mut region.pending_write[cf_index]; + pending_delete.remove(k); + data.insert(k.to_vec(), v.to_vec()); + pending_write.insert(k.to_vec(), v.to_vec()); +} + +fn delete_kv_in_mem(region: &mut Region, cf_index: usize, k: &[u8]) { + let data = &mut region.data[cf_index]; + let pending_delete = &mut region.pending_delete[cf_index]; + pending_delete.insert(k.to_vec()); + data.remove(k); +} + +unsafe fn load_from_db(store: &mut EngineStoreServer, region_id: u64) { + let store_id = store.id; + let engine = &mut store.engines.as_mut().unwrap().kv; + let apply_state: RaftApplyState = engine + .get_msg_cf(CF_RAFT, &keys::apply_state_key(region_id)) + .unwrap() + .unwrap(); + let region_state: RegionLocalState = engine + .get_msg_cf(CF_RAFT, &keys::region_state_key(region_id)) + .unwrap() + .unwrap(); + + let region = store.kvstore.get_mut(®ion_id).unwrap(); + region.apply_state = apply_state; + region.region = region_state.get_region().clone(); + set_new_region_peer(region, store.id); + + for cf in 0..3 { + let cf_name = cf_to_name(cf.into()); + region.data[cf].clear(); + region.pending_delete[cf].clear(); + region.pending_write[cf].clear(); + let start = 
region.region.get_start_key().to_owned(); + let end = region.region.get_end_key().to_owned(); + engine + .scan(cf_name, &start, &end, false, |k, v| { + let origin_key = if keys::validate_data_key(k) { + keys::origin_key(k).to_vec() + } else { + k.to_vec() + }; + debug!("restored data"; + "store" => store_id, + "region_id" => region_id, + "cf" => cf, + "k" => ?k, + "origin_key" => ?origin_key, + ); + region.data[cf].insert(origin_key, v.to_vec()); + Ok(true) + }) + .unwrap(); + } +} + +unsafe fn write_to_db_data( + store: &mut EngineStoreServer, + region: &mut Box, + reason: String, +) { + let kv = &mut store.engines.as_mut().unwrap().kv; + write_to_db_data_by_engine(store.id, kv, region, reason) +} + +unsafe fn write_to_db_data_by_engine( + store_id: u64, + kv: &TiFlashEngine, + region: &mut Box, + reason: String, +) { + info!("mock flush to engine"; + "region" => ?region.region, + "store_id" => store_id, + "reason" => reason + ); + for cf in 0..3 { + let pending_write = std::mem::take(region.pending_write.as_mut().get_mut(cf).unwrap()); + let mut pending_remove = + std::mem::take(region.pending_delete.as_mut().get_mut(cf).unwrap()); + for (k, v) in pending_write.into_iter() { + let tikv_key = keys::data_key(k.as_slice()); + let cf_name = cf_to_name(cf.into()); + if !pending_remove.contains(&k) { + kv.rocks.put_cf(cf_name, tikv_key.as_slice(), &v).unwrap(); + } else { + pending_remove.remove(&k); + } + } + let cf_name = cf_to_name(cf.into()); + for k in pending_remove.into_iter() { + let tikv_key = keys::data_key(k.as_slice()); + kv.rocks.delete_cf(cf_name, &tikv_key).unwrap(); + } + } +} + +impl EngineStoreServerWrap { + pub fn new( + engine_store_server: *mut EngineStoreServer, + maybe_proxy_helper: std::option::Option<*mut RaftStoreProxyFFIHelper>, + cluster_ptr: isize, + ) -> Self { + Self { + engine_store_server, + maybe_proxy_helper, + cluster_ptr, + } + } + + unsafe fn handle_admin_raft_cmd( + &mut self, + req: &kvproto::raft_cmdpb::AdminRequest, + resp: 
&kvproto::raft_cmdpb::AdminResponse, + header: ffi_interfaces::RaftCmdHeader, + ) -> ffi_interfaces::EngineStoreApplyRes { + let region_id = header.region_id; + let node_id = (*self.engine_store_server).id; + info!("handle_admin_raft_cmd"; + "node_id"=>node_id, + "request"=>?req, + "response"=>?resp, + "header"=>?header, + "region_id"=>header.region_id, + ); + let do_handle_admin_raft_cmd = + move |region: &mut Box, engine_store_server: &mut EngineStoreServer| { + if region.apply_state.get_applied_index() >= header.index { + // If it is a old entry. + error!("obsolete admin index"; + "apply_state"=>?region.apply_state, + "header"=>?header, + "node_id"=>node_id, + ); + panic!("observe obsolete admin index"); + // return ffi_interfaces::EngineStoreApplyRes::None; + } + match req.get_cmd_type() { + AdminCmdType::ChangePeer | AdminCmdType::ChangePeerV2 => { + let new_region_meta = resp.get_change_peer().get_region(); + let old_peer_id = { + let old_region = + engine_store_server.kvstore.get_mut(®ion_id).unwrap(); + old_region.region = new_region_meta.clone(); + region.set_applied(header.index, header.term); + old_region.peer.get_store_id() + }; + + let mut do_remove = true; + if old_peer_id != 0 { + for peer in new_region_meta.get_peers().iter() { + if peer.get_store_id() == old_peer_id { + // Should not remove region + do_remove = false; + } + } + } else { + // If old_peer_id is 0, seems old_region.peer is not set, just neglect + // for convenience. 
+ do_remove = false; + } + if do_remove { + let removed = engine_store_server.kvstore.remove(®ion_id); + // We need to also remove apply state, thus we need to know peer_id + debug!( + "Remove region {:?} peer_id {} at node {}, for new meta {:?}", + removed.unwrap().region, + old_peer_id, + node_id, + new_region_meta + ); + } + } + AdminCmdType::BatchSplit => { + let regions = resp.get_splits().regions.as_ref(); + + for i in 0..regions.len() { + let region_meta = regions.get(i).unwrap(); + if region_meta.id == region_id { + // This is the derived region + debug!( + "region {} is derived by split at peer {} with meta {:?}", + region_meta.id, node_id, region_meta + ); + assert!(engine_store_server.kvstore.contains_key(®ion_meta.id)); + engine_store_server + .kvstore + .get_mut(®ion_meta.id) + .unwrap() + .region = region_meta.clone(); + } else { + // Should split data into new region + debug!( + "new region {} generated by split at peer {} with meta {:?}", + region_meta.id, node_id, region_meta + ); + let new_region = + make_new_region(Some(region_meta.clone()), Some(node_id)); + + // No need to split data because all KV are stored in the same + // RocksDB. TODO But we still need + // to clean all in-memory data. 
+ // We can't assert `region_meta.id` is brand new here + engine_store_server + .kvstore + .insert(region_meta.id, Box::new(new_region)); + } + } + } + AdminCmdType::PrepareMerge => { + let tikv_region = resp.get_split().get_left(); + + let _target = req.prepare_merge.as_ref().unwrap().target.as_ref(); + let region_meta = &mut (engine_store_server + .kvstore + .get_mut(®ion_id) + .unwrap() + .region); + + // Increase self region conf version and version + let region_epoch = region_meta.region_epoch.as_mut().unwrap(); + let new_version = region_epoch.version + 1; + region_epoch.set_version(new_version); + // TODO this check may fail + // assert_eq!(tikv_region.get_region_epoch().get_version(), new_version); + let conf_version = region_epoch.conf_ver + 1; + region_epoch.set_conf_ver(conf_version); + assert_eq!(tikv_region.get_region_epoch().get_conf_ver(), conf_version); + + { + let region = engine_store_server.kvstore.get_mut(®ion_id).unwrap(); + region.set_applied(header.index, header.term); + } + // We don't handle MergeState and PeerState here + } + AdminCmdType::CommitMerge => { + { + let tikv_target_region_meta = resp.get_split().get_left(); + + let target_region = + &mut (engine_store_server.kvstore.get_mut(®ion_id).unwrap()); + let target_region_meta = &mut target_region.region; + let target_version = + target_region_meta.get_region_epoch().get_version(); + let source_region = req.get_commit_merge().get_source(); + let source_version = source_region.get_region_epoch().get_version(); + + let new_version = std::cmp::max(source_version, target_version) + 1; + target_region_meta + .mut_region_epoch() + .set_version(new_version); + assert_eq!( + target_region_meta.get_region_epoch().get_version(), + new_version + ); + + // No need to merge data + let source_at_left = if source_region.get_start_key().is_empty() { + true + } else if target_region_meta.get_start_key().is_empty() { + false + } else { + source_region + .get_end_key() + 
.cmp(target_region_meta.get_start_key()) + == std::cmp::Ordering::Equal + }; + + if source_at_left { + target_region_meta + .set_start_key(source_region.get_start_key().to_vec()); + assert_eq!( + tikv_target_region_meta.get_start_key(), + target_region_meta.get_start_key() + ); + } else { + target_region_meta + .set_end_key(source_region.get_end_key().to_vec()); + assert_eq!( + tikv_target_region_meta.get_end_key(), + target_region_meta.get_end_key() + ); + } + target_region.set_applied(header.index, header.term); + } + let to_remove = req.get_commit_merge().get_source().get_id(); + engine_store_server.kvstore.remove(&to_remove); + } + AdminCmdType::RollbackMerge => { + let region = engine_store_server.kvstore.get_mut(®ion_id).unwrap(); + let region_meta = &mut region.region; + let new_version = region_meta.get_region_epoch().get_version() + 1; + let region_epoch = region_meta.region_epoch.as_mut().unwrap(); + region_epoch.set_version(new_version); + + region.set_applied(header.index, header.term); + } + AdminCmdType::CompactLog => { + // We can always do compact, since a executed CompactLog must follow a + // successful persist. + let region = engine_store_server.kvstore.get_mut(®ion_id).unwrap(); + let state = &mut region.apply_state; + let compact_index = req.get_compact_log().get_compact_index(); + let compact_term = req.get_compact_log().get_compact_term(); + state.mut_truncated_state().set_index(compact_index); + state.mut_truncated_state().set_term(compact_term); + region.set_applied(header.index, header.term); + } + _ => { + region.set_applied(header.index, header.term); + } + } + // Do persist or not + match req.get_cmd_type() { + AdminCmdType::CompactLog => { + fail::fail_point!("no_persist_compact_log", |_| { + // Persist data, but don't persist meta. 
+ ffi_interfaces::EngineStoreApplyRes::None + }); + ffi_interfaces::EngineStoreApplyRes::Persist + } + AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { + fail::fail_point!("no_persist_flashback", |_| { + // Persist data, but don't persist meta. + ffi_interfaces::EngineStoreApplyRes::None + }); + ffi_interfaces::EngineStoreApplyRes::Persist + } + _ => ffi_interfaces::EngineStoreApplyRes::Persist, + } + }; + + let res = match (*self.engine_store_server).kvstore.entry(region_id) { + std::collections::hash_map::Entry::Occupied(mut o) => { + do_handle_admin_raft_cmd(o.get_mut(), &mut (*self.engine_store_server)) + } + std::collections::hash_map::Entry::Vacant(v) => { + // Currently in tests, we don't handle commands like BatchSplit, + // and sometimes we don't bootstrap region 1, + // so it is normal if we find no region. + warn!("region {} not found, create for {}", region_id, node_id); + let new_region = v.insert(Default::default()); + assert!((*self.engine_store_server).kvstore.contains_key(®ion_id)); + do_handle_admin_raft_cmd(new_region, &mut (*self.engine_store_server)) + } + }; + + let region = match (*self.engine_store_server).kvstore.get_mut(®ion_id) { + Some(r) => Some(r), + None => { + warn!( + "still can't find region {} for {}, may be remove due to confchange", + region_id, node_id + ); + None + } + }; + if res == ffi_interfaces::EngineStoreApplyRes::Persist { + // Persist tells ApplyDelegate to do a commit. + // So we also need a persist of actual data on engine-store' side. + if let Some(region) = region { + if req.get_cmd_type() == AdminCmdType::CompactLog { + // We already persist when fn_try_flush_data. 
+ } else { + write_to_db_data( + &mut (*self.engine_store_server), + region, + format!("admin {:?}", req), + ); + } + } + } + res + } + + unsafe fn handle_write_raft_cmd( + &mut self, + cmds: ffi_interfaces::WriteCmdsView, + header: ffi_interfaces::RaftCmdHeader, + ) -> ffi_interfaces::EngineStoreApplyRes { + let region_id = header.region_id; + let server = &mut (*self.engine_store_server); + let node_id = (*self.engine_store_server).id; + let _kv = &mut (*self.engine_store_server).engines.as_mut().unwrap().kv; + let proxy_compat = server.proxy_compat; + let mut do_handle_write_raft_cmd = move |region: &mut Box| { + if region.apply_state.get_applied_index() >= header.index { + debug!("obsolete write index"; + "apply_state"=>?region.apply_state, + "header"=>?header, + "node_id"=>node_id, + ); + panic!("observe obsolete write index"); + // return ffi_interfaces::EngineStoreApplyRes::None; + } + for i in 0..cmds.len { + let key = &*cmds.keys.add(i as _); + let val = &*cmds.vals.add(i as _); + let k = &key.to_slice(); + let v = &val.to_slice(); + let tp = &*cmds.cmd_types.add(i as _); + let cf = &*cmds.cmd_cf.add(i as _); + let cf_index = (*cf) as u8; + debug!( + "handle_write_raft_cmd with kv"; + "k" => ?&k[..std::cmp::min(4usize, k.len())], + "v" => ?&v[..std::cmp::min(4usize, v.len())], + "region_id" => region_id, + "node_id" => server.id, + "header" => ?header, + ); + let _data = &mut region.data[cf_index as usize]; + match tp { + engine_store_ffi::WriteCmdType::Put => { + write_kv_in_mem(region.as_mut(), cf_index as usize, k, v); + } + engine_store_ffi::WriteCmdType::Del => { + delete_kv_in_mem(region.as_mut(), cf_index as usize, k); + } + } + } + // Advance apply index, but do not persist + region.set_applied(header.index, header.term); + if !proxy_compat { + // If we don't support new proxy, we persist everytime. 
+ write_to_db_data(server, region, "write".to_string()); + } + ffi_interfaces::EngineStoreApplyRes::None + }; + + match (*self.engine_store_server).kvstore.entry(region_id) { + std::collections::hash_map::Entry::Occupied(mut o) => { + do_handle_write_raft_cmd(o.get_mut()) + } + std::collections::hash_map::Entry::Vacant(v) => { + warn!("region {} not found", region_id); + do_handle_write_raft_cmd(v.insert(Default::default())) + } + } + } +} + +unsafe extern "C" fn ffi_set_pb_msg_by_bytes( + type_: ffi_interfaces::MsgPBType, + ptr: ffi_interfaces::RawVoidPtr, + buff: ffi_interfaces::BaseBuffView, +) { + match type_ { + ffi_interfaces::MsgPBType::ReadIndexResponse => { + let v = &mut *(ptr as *mut kvproto::kvrpcpb::ReadIndexResponse); + v.merge_from_bytes(buff.to_slice()).unwrap(); + } + ffi_interfaces::MsgPBType::ServerInfoResponse => { + let v = &mut *(ptr as *mut kvproto::diagnosticspb::ServerInfoResponse); + v.merge_from_bytes(buff.to_slice()).unwrap(); + } + ffi_interfaces::MsgPBType::RegionLocalState => { + let v = &mut *(ptr as *mut kvproto::raft_serverpb::RegionLocalState); + v.merge_from_bytes(buff.to_slice()).unwrap(); + } + } +} + +pub fn gen_engine_store_server_helper( + wrap: Pin<&EngineStoreServerWrap>, +) -> EngineStoreServerHelper { + EngineStoreServerHelper { + magic_number: ffi_interfaces::RAFT_STORE_PROXY_MAGIC_NUMBER, + version: ffi_interfaces::RAFT_STORE_PROXY_VERSION, + inner: &(*wrap) as *const EngineStoreServerWrap as *mut _, + fn_gen_cpp_string: Some(ffi_gen_cpp_string), + fn_handle_write_raft_cmd: Some(ffi_handle_write_raft_cmd), + fn_handle_admin_raft_cmd: Some(ffi_handle_admin_raft_cmd), + fn_need_flush_data: Some(ffi_need_flush_data), + fn_try_flush_data: Some(ffi_try_flush_data), + fn_atomic_update_proxy: Some(ffi_atomic_update_proxy), + fn_handle_destroy: Some(ffi_handle_destroy), + fn_handle_ingest_sst: Some(ffi_handle_ingest_sst), + fn_handle_compute_store_stats: Some(ffi_handle_compute_store_stats), + 
fn_handle_get_engine_store_server_status: None, + fn_pre_handle_snapshot: Some(ffi_pre_handle_snapshot), + fn_apply_pre_handled_snapshot: Some(ffi_apply_pre_handled_snapshot), + fn_handle_http_request: None, + fn_check_http_uri_available: None, + fn_gc_raw_cpp_ptr: Some(ffi_gc_raw_cpp_ptr), + fn_get_config: None, + fn_set_store: None, + fn_set_pb_msg_by_bytes: Some(ffi_set_pb_msg_by_bytes), + fn_handle_safe_ts_update: Some(ffi_handle_safe_ts_update), + fn_fast_add_peer: Some(ffi_fast_add_peer), + fn_create_write_batch: None, + fn_write_batch_put_page: None, + fn_write_batch_del_page: None, + fn_write_batch_size: None, + fn_write_batch_is_empty: None, + fn_write_batch_merge: None, + fn_write_batch_clear: None, + fn_consume_write_batch: None, + fn_handle_read_page: None, + fn_gc_page_with_view_vec: None, + fn_handle_purge_pagestorage: None, + fn_handle_scan_page: None, + fn_handle_seek_ps_key: None, + fn_ps_is_empty: None, + } +} + +unsafe fn into_engine_store_server_wrap( + arg1: *const ffi_interfaces::EngineStoreServerWrap, +) -> &'static mut EngineStoreServerWrap { + &mut *(arg1 as *mut EngineStoreServerWrap) +} + +unsafe extern "C" fn ffi_handle_admin_raft_cmd( + arg1: *const ffi_interfaces::EngineStoreServerWrap, + arg2: ffi_interfaces::BaseBuffView, + arg3: ffi_interfaces::BaseBuffView, + arg4: ffi_interfaces::RaftCmdHeader, +) -> ffi_interfaces::EngineStoreApplyRes { + let store = into_engine_store_server_wrap(arg1); + let mut req = kvproto::raft_cmdpb::AdminRequest::default(); + let mut resp = kvproto::raft_cmdpb::AdminResponse::default(); + req.merge_from_bytes(arg2.to_slice()).unwrap(); + resp.merge_from_bytes(arg3.to_slice()).unwrap(); + store.handle_admin_raft_cmd(&req, &resp, arg4) +} + +unsafe extern "C" fn ffi_handle_write_raft_cmd( + arg1: *const ffi_interfaces::EngineStoreServerWrap, + arg2: ffi_interfaces::WriteCmdsView, + arg3: ffi_interfaces::RaftCmdHeader, +) -> ffi_interfaces::EngineStoreApplyRes { + let store = 
into_engine_store_server_wrap(arg1); + store.handle_write_raft_cmd(arg2, arg3) +} + +enum RawCppPtrTypeImpl { + None = 0, + String, + PreHandledSnapshotWithBlock, + WakerNotifier, +} + +// TODO +#[allow(clippy::from_over_into)] +impl From for RawCppPtrTypeImpl { + fn from(o: ffi_interfaces::RawCppPtrType) -> Self { + match o { + 0 => RawCppPtrTypeImpl::None, + 1 => RawCppPtrTypeImpl::String, + 2 => RawCppPtrTypeImpl::PreHandledSnapshotWithBlock, + 3 => RawCppPtrTypeImpl::WakerNotifier, + _ => unreachable!(), + } + } +} + +// TODO remove this warn. +#[allow(clippy::from_over_into)] +impl Into for RawCppPtrTypeImpl { + fn into(self) -> ffi_interfaces::RawCppPtrType { + match self { + RawCppPtrTypeImpl::None => 0, + RawCppPtrTypeImpl::String => 1, + RawCppPtrTypeImpl::PreHandledSnapshotWithBlock => 2, + RawCppPtrTypeImpl::WakerNotifier => 3, + } + } +} + +extern "C" fn ffi_need_flush_data( + _arg1: *mut ffi_interfaces::EngineStoreServerWrap, + _region_id: u64, +) -> u8 { + fail::fail_point!("need_flush_data", |e| e.unwrap().parse::().unwrap()); + true as u8 +} + +unsafe extern "C" fn ffi_try_flush_data( + arg1: *mut ffi_interfaces::EngineStoreServerWrap, + region_id: u64, + _try_until_succeed: u8, + index: u64, + term: u64, +) -> u8 { + let store = into_engine_store_server_wrap(arg1); + let kvstore = &mut (*store.engine_store_server).kvstore; + // If we can't find region here, we return true so proxy can trigger a + // CompactLog. The triggered CompactLog will be handled by + // `handleUselessAdminRaftCmd`, and result in a + // `EngineStoreApplyRes::NotFound`. Proxy will print this message and + // continue: `region not found in engine-store, maybe have exec `RemoveNode` + // first`. 
+ let region = match kvstore.get_mut(®ion_id) { + Some(r) => r, + None => { + if (*store.engine_store_server) + .mock_cfg + .panic_when_flush_no_found + .load(Ordering::SeqCst) + { + panic!( + "ffi_try_flush_data no found region {} [index {} term {}], store {}", + region_id, + index, + term, + (*store.engine_store_server).id + ); + } else { + return 1; + } + } + }; + fail::fail_point!("try_flush_data", |e| { + let b = e.unwrap().parse::().unwrap(); + if b == 1 { + write_to_db_data( + &mut (*store.engine_store_server), + region, + "fn_try_flush_data".to_string(), + ); + } + b + }); + write_to_db_data( + &mut (*store.engine_store_server), + region, + "fn_try_flush_data".to_string(), + ); + true as u8 +} + +extern "C" fn ffi_gen_cpp_string(s: ffi_interfaces::BaseBuffView) -> ffi_interfaces::RawCppPtr { + let str = Box::new(Vec::from(s.to_slice())); + let ptr = Box::into_raw(str); + ffi_interfaces::RawCppPtr { + ptr: ptr as *mut _, + type_: RawCppPtrTypeImpl::String.into(), + } +} + +pub struct RawCppStringPtrGuard(ffi_interfaces::RawCppStringPtr); + +impl Default for RawCppStringPtrGuard { + fn default() -> Self { + Self(std::ptr::null_mut()) + } +} + +impl std::convert::AsRef for RawCppStringPtrGuard { + fn as_ref(&self) -> &ffi_interfaces::RawCppStringPtr { + &self.0 + } +} +impl std::convert::AsMut for RawCppStringPtrGuard { + fn as_mut(&mut self) -> &mut ffi_interfaces::RawCppStringPtr { + &mut self.0 + } +} + +impl Drop for RawCppStringPtrGuard { + fn drop(&mut self) { + ffi_interfaces::RawCppPtr { + ptr: self.0 as *mut _, + type_: RawCppPtrTypeImpl::String.into(), + }; + } +} + +impl RawCppStringPtrGuard { + pub fn as_str(&self) -> &[u8] { + let s = self.0 as *mut Vec; + unsafe { &*s } + } +} + +pub struct ProxyNotifier { + cv: std::sync::Condvar, + mutex: Mutex<()>, + // multi notifiers single receiver model. use another flag to avoid waiting until timeout. 
+ flag: std::sync::atomic::AtomicBool, +} + +impl ProxyNotifier { + pub fn blocked_wait_for(&self, timeout: Duration) { + // if flag from false to false, wait for notification. + // if flag from true to false, do nothing. + if !self.flag.swap(false, std::sync::atomic::Ordering::AcqRel) { + { + let lock = self.mutex.lock().unwrap(); + if !self.flag.load(std::sync::atomic::Ordering::Acquire) { + let _cv = self.cv.wait_timeout(lock, timeout); + } + } + self.flag.store(false, std::sync::atomic::Ordering::Release); + } + } + + pub fn wake(&self) { + // if flag from false -> true, then wake up. + // if flag from true -> true, do nothing. + if !self.flag.swap(true, std::sync::atomic::Ordering::AcqRel) { + let _lock = self.mutex.lock().unwrap(); + self.cv.notify_one(); + } + } + + pub fn new_raw() -> RawCppPtr { + let notifier = Box::new(Self { + cv: Default::default(), + mutex: Mutex::new(()), + flag: std::sync::atomic::AtomicBool::new(false), + }); + + RawCppPtr { + ptr: Box::into_raw(notifier) as _, + type_: RawCppPtrTypeImpl::WakerNotifier.into(), + } + } +} + +extern "C" fn ffi_gc_raw_cpp_ptr( + ptr: ffi_interfaces::RawVoidPtr, + tp: ffi_interfaces::RawCppPtrType, +) { + match RawCppPtrTypeImpl::from(tp) { + RawCppPtrTypeImpl::None => {} + RawCppPtrTypeImpl::String => unsafe { + drop(Box::>::from_raw(ptr as *mut _)); + }, + RawCppPtrTypeImpl::PreHandledSnapshotWithBlock => unsafe { + drop(Box::::from_raw(ptr as *mut _)); + }, + RawCppPtrTypeImpl::WakerNotifier => unsafe { + drop(Box::from_raw(ptr as *mut ProxyNotifier)); + }, + } +} + +unsafe extern "C" fn ffi_atomic_update_proxy( + arg1: *mut ffi_interfaces::EngineStoreServerWrap, + arg2: *mut ffi_interfaces::RaftStoreProxyFFIHelper, +) { + let store = into_engine_store_server_wrap(arg1); + store.maybe_proxy_helper = Some(&mut *(arg2 as *mut RaftStoreProxyFFIHelper)); +} + +unsafe extern "C" fn ffi_handle_destroy( + arg1: *mut ffi_interfaces::EngineStoreServerWrap, + arg2: u64, +) { + let store = 
into_engine_store_server_wrap(arg1); + debug!("ffi_handle_destroy {}", arg2); + (*store.engine_store_server).kvstore.remove(&arg2); +} + +type MockRaftProxyHelper = RaftStoreProxyFFIHelper; + +#[derive(Debug)] +pub struct SSTReader<'a> { + proxy_helper: &'a MockRaftProxyHelper, + inner: ffi_interfaces::SSTReaderPtr, + type_: ffi_interfaces::ColumnFamilyType, +} + +impl<'a> Drop for SSTReader<'a> { + fn drop(&mut self) { + unsafe { + (self.proxy_helper.sst_reader_interfaces.fn_gc.into_inner())( + self.inner.clone(), + self.type_, + ); + } + } +} + +impl<'a> SSTReader<'a> { + pub unsafe fn new( + proxy_helper: &'a MockRaftProxyHelper, + view: &'a ffi_interfaces::SSTView, + ) -> Self { + SSTReader { + proxy_helper, + inner: (proxy_helper + .sst_reader_interfaces + .fn_get_sst_reader + .into_inner())(view.clone(), proxy_helper.proxy_ptr.clone()), + type_: view.type_, + } + } + + pub unsafe fn remained(&mut self) -> bool { + (self + .proxy_helper + .sst_reader_interfaces + .fn_remained + .into_inner())(self.inner.clone(), self.type_) + != 0 + } + + pub unsafe fn key(&mut self) -> ffi_interfaces::BaseBuffView { + (self.proxy_helper.sst_reader_interfaces.fn_key.into_inner())( + self.inner.clone(), + self.type_, + ) + } + + pub unsafe fn value(&mut self) -> ffi_interfaces::BaseBuffView { + (self + .proxy_helper + .sst_reader_interfaces + .fn_value + .into_inner())(self.inner.clone(), self.type_) + } + + pub unsafe fn next(&mut self) { + (self.proxy_helper.sst_reader_interfaces.fn_next.into_inner())( + self.inner.clone(), + self.type_, + ) + } +} + +struct PrehandledSnapshot { + pub region: std::option::Option, +} + +unsafe extern "C" fn ffi_pre_handle_snapshot( + arg1: *mut ffi_interfaces::EngineStoreServerWrap, + region_buff: ffi_interfaces::BaseBuffView, + peer_id: u64, + snaps: ffi_interfaces::SSTViewVec, + index: u64, + term: u64, +) -> ffi_interfaces::RawCppPtr { + let store = into_engine_store_server_wrap(arg1); + let proxy_helper = &mut 
*(store.maybe_proxy_helper.unwrap()); + let _kvstore = &mut (*store.engine_store_server).kvstore; + let node_id = (*store.engine_store_server).id; + + let mut region_meta = kvproto::metapb::Region::default(); + assert_ne!(region_buff.data, std::ptr::null()); + assert_ne!(region_buff.len, 0); + region_meta + .merge_from_bytes(region_buff.to_slice()) + .unwrap(); + + let mut region = Box::new(Region::new(region_meta)); + debug!( + "pre handle snaps"; + "peer_id" => peer_id, + "store_id" => node_id, + "index" => index, + "term" => term, + "region" => ?region.region, + "snap len" => snaps.len, + ); + + (*store.engine_store_server).mutate_region_states( + region.region.get_id(), + |e: &mut RegionStats| { + e.pre_handle_count.fetch_add(1, Ordering::SeqCst); + }, + ); + + for i in 0..snaps.len { + let snapshot = snaps.views.add(i as usize); + let view = &*(snapshot as *mut ffi_interfaces::SSTView); + let mut sst_reader = SSTReader::new(proxy_helper, view); + + while sst_reader.remained() { + let key = sst_reader.key(); + let value = sst_reader.value(); + + let cf_index = (*snapshot).type_ as u8; + write_kv_in_mem( + region.as_mut(), + cf_index as usize, + key.to_slice(), + value.to_slice(), + ); + + sst_reader.next(); + } + } + { + region.set_applied(index, term); + region.apply_state.mut_truncated_state().set_index(index); + region.apply_state.mut_truncated_state().set_term(term); + } + ffi_interfaces::RawCppPtr { + ptr: Box::into_raw(Box::new(PrehandledSnapshot { + region: Some(*region), + })) as *const Region as ffi_interfaces::RawVoidPtr, + type_: RawCppPtrTypeImpl::PreHandledSnapshotWithBlock.into(), + } +} + +// In case of newly added cfs. 
+#[allow(unreachable_patterns)] +pub fn cf_to_name(cf: ffi_interfaces::ColumnFamilyType) -> &'static str { + match cf { + ffi_interfaces::ColumnFamilyType::Lock => CF_LOCK, + ffi_interfaces::ColumnFamilyType::Write => CF_WRITE, + ffi_interfaces::ColumnFamilyType::Default => CF_DEFAULT, + _ => unreachable!(), + } +} + +unsafe extern "C" fn ffi_handle_safe_ts_update( + arg1: *mut ffi_interfaces::EngineStoreServerWrap, + _region_id: u64, + self_safe_ts: u64, + leader_safe_ts: u64, +) { + let store = into_engine_store_server_wrap(arg1); + let cluster = store.cluster_ptr as *const mock_cluster::Cluster; + assert_eq!(self_safe_ts, (*cluster).test_data.expected_self_safe_ts); + assert_eq!(leader_safe_ts, (*cluster).test_data.expected_leader_safe_ts); +} + +unsafe extern "C" fn ffi_apply_pre_handled_snapshot( + arg1: *mut ffi_interfaces::EngineStoreServerWrap, + arg2: ffi_interfaces::RawVoidPtr, + _arg3: ffi_interfaces::RawCppPtrType, +) { + let store = into_engine_store_server_wrap(arg1); + let region_meta = &mut *(arg2 as *mut PrehandledSnapshot); + let node_id = (*store.engine_store_server).id; + + let region_id = region_meta.region.as_ref().unwrap().region.id; + + let _ = &(*store.engine_store_server) + .kvstore + .insert(region_id, Box::new(region_meta.region.take().unwrap())); + + let region = (*store.engine_store_server) + .kvstore + .get_mut(®ion_id) + .unwrap(); + + debug!( + "apply prehandled snap"; + "store_id" => node_id, + "region" => ?region.region, + ); + write_to_db_data( + &mut (*store.engine_store_server), + region, + String::from("prehandle-snap"), + ); +} + +unsafe extern "C" fn ffi_handle_ingest_sst( + arg1: *mut ffi_interfaces::EngineStoreServerWrap, + snaps: ffi_interfaces::SSTViewVec, + header: ffi_interfaces::RaftCmdHeader, +) -> ffi_interfaces::EngineStoreApplyRes { + let store = into_engine_store_server_wrap(arg1); + let node_id = (*store.engine_store_server).id; + let proxy_helper = &mut *(store.maybe_proxy_helper.unwrap()); + + let region_id = 
header.region_id; + let kvstore = &mut (*store.engine_store_server).kvstore; + let _kv = &mut (*store.engine_store_server).engines.as_mut().unwrap().kv; + + match kvstore.entry(region_id) { + std::collections::hash_map::Entry::Occupied(_o) => {} + std::collections::hash_map::Entry::Vacant(v) => { + // When we remove hacked code in handle_raft_entry_normal during migration, + // some tests in handle_raft_entry_normal may fail, since it can observe a empty + // cmd, thus creating region. + warn!( + "region {} not found when ingest, create for {}", + region_id, node_id + ); + let _ = v.insert(Default::default()); + } + } + let region = kvstore.get_mut(®ion_id).unwrap(); + + let index = header.index; + let term = header.term; + debug!("handle ingest sst"; + "header" => ?header, + "region_id" => region_id, + "snap len" => snaps.len, + ); + + for i in 0..snaps.len { + let snapshot = snaps.views.add(i as usize); + // let _path = std::str::from_utf8_unchecked((*snapshot).path.to_slice()); + let mut sst_reader = + SSTReader::new(proxy_helper, &*(snapshot as *mut ffi_interfaces::SSTView)); + while sst_reader.remained() { + let key = sst_reader.key(); + let value = sst_reader.value(); + let cf_index = (*snapshot).type_ as usize; + write_kv_in_mem(region.as_mut(), cf_index, key.to_slice(), value.to_slice()); + sst_reader.next(); + } + } + + { + region.set_applied(header.index, header.term); + region.apply_state.mut_truncated_state().set_index(index); + region.apply_state.mut_truncated_state().set_term(term); + } + + fail::fail_point!("on_handle_ingest_sst_return", |_e| { + ffi_interfaces::EngineStoreApplyRes::None + }); + write_to_db_data( + &mut (*store.engine_store_server), + region, + String::from("ingest-sst"), + ); + ffi_interfaces::EngineStoreApplyRes::Persist +} + +unsafe extern "C" fn ffi_handle_compute_store_stats( + _arg1: *mut ffi_interfaces::EngineStoreServerWrap, +) -> ffi_interfaces::StoreStats { + ffi_interfaces::StoreStats { + fs_stats: ffi_interfaces::FsStats 
{ + capacity_size: 444444, + used_size: 111111, + avail_size: 333333, + ok: 1, + }, + engine_bytes_written: 0, + engine_keys_written: 0, + engine_bytes_read: 0, + engine_keys_read: 0, + } +} + +unsafe fn create_cpp_str(s: Option>) -> ffi_interfaces::CppStrWithView { + match s { + Some(s) => { + let len = s.len() as u64; + let ptr = Box::into_raw(Box::new(s.clone())); // leak + let s = ffi_interfaces::CppStrWithView { + inner: ffi_interfaces::RawCppPtr { + ptr: ptr as RawVoidPtr, + type_: RawCppPtrTypeImpl::String.into(), + }, + view: ffi_interfaces::BaseBuffView { + data: (*ptr).as_ptr() as *const _, + len, + }, + }; + s + } + None => ffi_interfaces::CppStrWithView { + inner: ffi_interfaces::RawCppPtr { + ptr: std::ptr::null_mut(), + type_: RawCppPtrTypeImpl::None.into(), + }, + view: ffi_interfaces::BaseBuffView { + data: std::ptr::null(), + len: 0, + }, + }, + } +} + +unsafe extern "C" fn ffi_fast_add_peer( + arg1: *mut ffi_interfaces::EngineStoreServerWrap, + region_id: u64, + new_peer_id: u64, +) -> ffi_interfaces::FastAddPeerRes { + let store = into_engine_store_server_wrap(arg1); + let cluster = &*(store.cluster_ptr as *const mock_cluster::Cluster); + let store_id = (*store.engine_store_server).id; + + let failed_add_peer_res = + |status: ffi_interfaces::FastAddPeerStatus| ffi_interfaces::FastAddPeerRes { + status, + apply_state: create_cpp_str(None), + region: create_cpp_str(None), + }; + let from_store = (|| { + fail::fail_point!("ffi_fast_add_peer_from_id", |t| { + let t = t.unwrap().parse::().unwrap(); + t + }); + 1 + })(); + let block_wait: bool = (|| { + fail::fail_point!("ffi_fast_add_peer_block_wait", |t| { + let t = t.unwrap().parse::().unwrap(); + t + }); + 0 + })() != 0; + debug!("recover from remote peer: enter from {} to {}", from_store, store_id; "region_id" => region_id); + + for retry in 0..300 { + if retry > 0 { + std::thread::sleep(std::time::Duration::from_millis(30)); + } + + let lock = cluster.ffi_helper_set.lock(); + let mut guard = 
match lock { + Ok(e) => e, + Err(_) => { + error!("ffi_debug_func failed to lock"); + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::OtherError); + } + }; + debug!("recover from remote peer: preparing from {} to {}, persist and check source", from_store, store_id; "region_id" => region_id); + let source_server = match guard.get_mut(&from_store) { + Some(s) => &mut s.engine_store_server, + None => { + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::NoSuitable); + } + }; + let source_engines = match source_server.engines.clone() { + Some(s) => s, + None => { + error!("recover from remote peer: failed get source engine"; "region_id" => region_id); + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); + } + }; + + // TODO We must ask the remote peer to persist before get a snapshot. + // { + // if let Some(s) = source_server.kvstore.get_mut(®ion_id) { + // write_to_db_data_by_engine(0, &source_engines.kv, s, "fast add + // peer".to_string()); } else { + // error!("recover from remote peer: failed persist source region"; + // "region_id" => region_id); return ffi_interfaces::FastAddPeerRes + // { status: ffi_interfaces::FastAddPeerStatus::BadData, + // apply_state: create_cpp_str(None), + // region: create_cpp_str(None), + // }; + // } + // } + let source_region = match source_server.kvstore.get(®ion_id) { + Some(s) => s, + None => { + error!("recover from remote peer: failed read source region info"; "region_id" => region_id); + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); + } + }; + let region_local_state: RegionLocalState = match general_get_region_local_state( + &source_engines.kv, + region_id, + ) { + Some(x) => x, + None => { + debug!("recover from remote peer: preparing from {} to {}, not region state {}", from_store, store_id, new_peer_id; "region_id" => region_id); + // We don't return BadData here, since the data may not be persisted. 
+ if block_wait { + continue; + } + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData); + } + }; + let new_region_meta = region_local_state.get_region(); + + if !engine_store_ffi::observer::validate_remote_peer_region( + new_region_meta, + store_id, + new_peer_id, + ) { + debug!("recover from remote peer: preparing from {} to {}, not applied conf change {}", from_store, store_id, new_peer_id; "region_id" => region_id); + if block_wait { + continue; + } + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData); + } + + debug!("recover from remote peer: preparing from {} to {}, check target", from_store, store_id; "region_id" => region_id); + let new_region = make_new_region( + Some(new_region_meta.clone()), + Some((*store.engine_store_server).id), + ); + (*store.engine_store_server) + .kvstore + .insert(region_id, Box::new(new_region)); + let target_engines = match (*store.engine_store_server).engines.clone() { + Some(s) => s, + None => { + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::OtherError); + } + }; + let target_region = match (*store.engine_store_server).kvstore.get_mut(®ion_id) { + Some(s) => s, + None => { + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); + } + }; + debug!("recover from remote peer: meta from {} to {}", from_store, store_id; "region_id" => region_id); + // Must first dump meta then data, otherwise data may lag behind. + // We can see a raft log hole at applied_index otherwise. 
+ let apply_state: RaftApplyState = match general_get_apply_state( + &source_engines.kv, + region_id, + ) { + Some(x) => x, + None => { + error!("recover from remote peer: failed read apply state"; "region_id" => region_id); + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); + } + }; + + debug!("recover from remote peer: data from {} to {}", from_store, store_id; "region_id" => region_id); + if let Err(e) = copy_data_from( + &source_engines, + &target_engines, + &source_region, + target_region, + ) { + error!("recover from remote peer: inject error {:?}", e; "region_id" => region_id); + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::FailedInject); + } + + let apply_state_bytes = apply_state.write_to_bytes().unwrap(); + let region_bytes = region_local_state.get_region().write_to_bytes().unwrap(); + let apply_state_ptr = create_cpp_str(Some(apply_state_bytes)); + let region_ptr = create_cpp_str(Some(region_bytes)); + debug!("recover from remote peer: ok from {} to {}", from_store, store_id; "region_id" => region_id); + return ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::Ok, + apply_state: apply_state_ptr, + region: region_ptr, + }; + } + error!("recover from remote peer: failed after retry"; "region_id" => region_id); + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); +} diff --git a/new-mock-engine-store/src/node.rs b/new-mock-engine-store/src/node.rs index a383046f51d..7f88e47a613 100644 --- a/new-mock-engine-store/src/node.rs +++ b/new-mock-engine-store/src/node.rs @@ -138,11 +138,14 @@ impl Transport for ChannelTransport { h.send_raft_msg(msg)?; if is_snapshot { // should report snapshot finish. 
- let _ = core.routers[&from_store].report_snapshot_status( - region_id, - to_peer_id, - SnapshotStatus::Finish, - ); + match core.routers.get(&from_store) { + Some(router) => router.report_snapshot_status( + region_id, + to_peer_id, + SnapshotStatus::Finish, + ), + None => return Err(box_err!("Find no from_store {}", from_store)), + }; } Ok(()) } @@ -176,6 +179,8 @@ pub struct NodeCluster { pub importer: Option>, } +impl std::panic::UnwindSafe for NodeCluster {} + impl NodeCluster { pub fn new(pd_client: Arc) -> NodeCluster { NodeCluster { @@ -315,8 +320,12 @@ impl Simulator for NodeCluster { let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( node_id, engines.kv.clone(), + engines.raft.clone(), importer.clone(), cfg.proxy_cfg.raft_store.snap_handle_pool_size, + simulate_trans.clone(), + snap_mgr.clone(), + cfg.proxy_cfg.engine_store.clone(), ); tiflash_ob.register_to(&mut coprocessor_host); @@ -340,6 +349,7 @@ impl Simulator for NodeCluster { ); node.try_bootstrap_store(engines.clone())?; + node.start( engines.clone(), simulate_trans.clone(), diff --git a/new-mock-engine-store/src/server.rs b/new-mock-engine-store/src/server.rs index 466de08126f..69479fa4cf3 100644 --- a/new-mock-engine-store/src/server.rs +++ b/new-mock-engine-store/src/server.rs @@ -9,7 +9,7 @@ use std::{ }; use api_version::{dispatch_api_version, KvFormat}; -use causal_ts::CausalTsProvider; +use causal_ts::CausalTsProviderImpl; use collections::{HashMap, HashSet}; use concurrency_manager::ConcurrencyManager; use encryption_export::DataKeyManager; @@ -32,7 +32,7 @@ use pd_client::PdClient; use raftstore::{ coprocessor::{CoprocessorHost, RegionInfoAccessor}, errors::Error as RaftError, - router::{LocalReadRouter, RaftStoreBlackHole, RaftStoreRouter, ServerRaftStoreRouter}, + router::{LocalReadRouter, RaftStoreRouter, ServerRaftStoreRouter}, store::{ fsm::{store::StoreMeta, ApplyRouter, RaftBatchSystem, RaftRouter}, msg::RaftCmdExtraOpts, @@ -60,7 +60,7 @@ use tikv::{ }, storage::{ 
self, - kv::SnapContext, + kv::{FakeExtension, SnapContext}, txn::flow_controller::{EngineFlowController, FlowController}, Engine, }, @@ -84,8 +84,9 @@ use crate::config::Config; type SimulateStoreTransport = SimulateTransport>; +type SimulateRaftExtension = ::RaftExtension; type SimulateServerTransport = - SimulateTransport>; + SimulateTransport>; pub type SimulateEngine = RaftKv; @@ -125,13 +126,13 @@ impl StoreAddrResolver for AddressMap { struct ServerMeta { node: Node, - server: Server, + server: Server, sim_router: SimulateStoreTransport, sim_trans: SimulateServerTransport, raw_router: RaftRouter, raw_apply_router: ApplyRouter, - gc_worker: GcWorker, SimulateStoreTransport>, - rts_worker: Option>>, + gc_worker: GcWorker>, + rts_worker: Option>, rsmeter_cleanup: Box, } @@ -152,10 +153,10 @@ pub struct ServerCluster { snap_paths: HashMap, snap_mgrs: HashMap, pd_client: Arc, - raft_client: RaftClient, + raft_client: RaftClient, concurrency_managers: HashMap, env: Arc, - pub causal_ts_providers: HashMap>, + pub causal_ts_providers: HashMap>, } impl ServerCluster { @@ -176,7 +177,7 @@ impl ServerCluster { Arc::default(), security_mgr.clone(), map.clone(), - RaftStoreBlackHole, + FakeExtension, worker.scheduler(), Arc::new(ThreadLoadPool::with_threshold(usize::MAX)), ); @@ -218,7 +219,7 @@ impl ServerCluster { pub fn get_gc_worker( &self, node_id: u64, - ) -> &GcWorker, SimulateStoreTransport> { + ) -> &GcWorker> { &self.metas.get(&node_id).unwrap().gc_worker } @@ -226,7 +227,7 @@ impl ServerCluster { self.concurrency_managers.get(&node_id).unwrap().clone() } - pub fn get_causal_ts_provider(&self, node_id: u64) -> Option> { + pub fn get_causal_ts_provider(&self, node_id: u64) -> Option> { self.causal_ts_providers.get(&node_id).cloned() } @@ -334,16 +335,12 @@ impl ServerCluster { let (tx, _rx) = std::sync::mpsc::channel(); let mut gc_worker = GcWorker::new( engine.clone(), - sim_router.clone(), tx, cfg.gc.clone(), Default::default(), 
Arc::new(region_info_accessor.clone()), ); gc_worker.start(node_id).unwrap(); - gc_worker - .start_observe_lock_apply(&mut coprocessor_host, concurrency_manager.clone()) - .unwrap(); let rts_worker = if cfg.resolved_ts.enable { // Resolved ts worker @@ -360,7 +357,6 @@ impl ServerCluster { concurrency_manager.clone(), self.env.clone(), self.security_mgr.clone(), - resolved_ts::DummySinker::new(), ); // Start the worker rts_worker.start(rts_endpoint); @@ -369,21 +365,6 @@ impl ServerCluster { None }; - if ApiVersion::V2 == F::TAG { - let causal_ts_provider = Arc::new( - block_on(causal_ts::BatchTsoProvider::new_opt( - self.pd_client.clone(), - cfg.causal_ts.renew_interval.0, - cfg.causal_ts.alloc_ahead_buffer.0, - cfg.causal_ts.renew_batch_min_size, - cfg.causal_ts.renew_batch_max_size, - )) - .unwrap(), - ); - self.causal_ts_providers - .insert(node_id, causal_ts_provider.clone()); - } - // Start resource metering. let (res_tag_factory, collector_reg_handle, rsmeter_cleanup) = self.init_resource_metering(&cfg.resource_metering); @@ -409,14 +390,6 @@ impl ServerCluster { Arc::clone(&importer), ); - let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( - node_id, - engines.kv.clone(), - importer.clone(), - 2, - ); - tiflash_ob.register_to(&mut coprocessor_host); - let check_leader_runner = CheckLeaderRunner::new(store_meta.clone(), coprocessor_host.clone()); let check_leader_scheduler = bg_worker.start("check-leader", check_leader_runner); @@ -432,6 +405,7 @@ impl ServerCluster { cfg.quota.max_delay_duration, cfg.quota.enable_auto_tune, )); + let extension = engine.raft_extension().clone(); let store = create_raft_storage::<_, _, _, F, _>( engine, &cfg.storage, @@ -455,7 +429,7 @@ impl ServerCluster { // Create pd client, snapshot manager, server. 
let (resolver, state) = - resolve::new_resolver(Arc::clone(&self.pd_client), &bg_worker, router.clone()); + resolve::new_resolver(Arc::clone(&self.pd_client), &bg_worker, extension.clone()); let snap_mgr = SnapManagerBuilder::default() .max_write_bytes_per_sec(cfg.server.snap_max_write_bytes_per_sec.0 as i64) .max_total_size(cfg.server.snap_max_total_size.0) @@ -523,7 +497,6 @@ impl ServerCluster { store.clone(), copr.clone(), copr_v2.clone(), - sim_router.clone(), resolver.clone(), snap_mgr.clone(), gc_worker.clone(), @@ -563,6 +536,18 @@ impl ServerCluster { let max_grpc_thread_count = cfg.server.grpc_concurrency; let server_cfg = Arc::new(VersionTrack::new(cfg.server.clone())); + let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( + node_id, + engines.kv.clone(), + engines.raft.clone(), + importer.clone(), + cfg.proxy_cfg.raft_store.snap_handle_pool_size, + simulate_trans.clone(), + snap_mgr.clone(), + cfg.proxy_cfg.engine_store.clone(), + ); + tiflash_ob.register_to(&mut coprocessor_host); + // Register the role change observer of the lock manager. 
lock_mgr.register_detector_role_change_observer(&mut coprocessor_host); diff --git a/new-mock-engine-store/src/transport_simulate.rs b/new-mock-engine-store/src/transport_simulate.rs index a683c51aef5..61286d137d2 100644 --- a/new-mock-engine-store/src/transport_simulate.rs +++ b/new-mock-engine-store/src/transport_simulate.rs @@ -270,7 +270,7 @@ pub struct DefaultFilterFactory(PhantomData); impl FilterFactory for DefaultFilterFactory { fn generate(&self, _: u64) -> Vec> { - vec![Box::new(F::default())] + vec![Box::::default()] } } diff --git a/proxy_scripts/ci_check.sh b/proxy_scripts/ci_check.sh index aad68382eb0..4cda63cf674 100755 --- a/proxy_scripts/ci_check.sh +++ b/proxy_scripts/ci_check.sh @@ -1,23 +1,25 @@ set -uxeo pipefail if [[ $M == "fmt" ]]; then make gen_proxy_ffi + git status -s GIT_STATUS=$(git status -s) && if [[ ${GIT_STATUS} ]]; then echo "Error: found illegal git status"; echo ${GIT_STATUS}; [[ -z ${GIT_STATUS} ]]; fi - cargo fmt -- --check >/dev/null + cargo fmt -- --check elif [[ $M == "testold" ]]; then export ENGINE_LABEL_VALUE=tiflash export RUST_BACKTRACE=full export ENABLE_FEATURES="test-engine-kv-rocksdb test-engine-raft-raft-engine" rustup component add clippy - cargo clippy --features "$ENABLE_FEATURES" --package engine_store_ffi --no-deps -- -Dwarnings -A clippy::clone_on_copy -A clippy::upper_case_acronyms -A clippy::missing_safety_doc - cargo clippy --features "$ENABLE_FEATURES" --package proxy_tests --no-deps -- -Dwarnings -A clippy::clone_on_copy -A clippy::upper_case_acronyms -A clippy::missing_safety_doc - cargo clippy --features "$ENABLE_FEATURES" --package proxy_server --no-deps -- -Dwarnings -A clippy::clone_on_copy -A clippy::upper_case_acronyms -A clippy::missing_safety_doc -A clippy::derive_partial_eq_without_eq - cargo clippy --features "$ENABLE_FEATURES" --package new-mock-engine-store --no-deps -- -Dwarnings -A clippy::clone_on_copy -A clippy::upper_case_acronyms -A clippy::missing_safety_doc -A 
clippy::derive_partial_eq_without_eq -A clippy::redundant_clone -A clippy::too_many_arguments + cargo clippy --features "$ENABLE_FEATURES" --package engine_store_ffi --no-deps -- -Dwarnings -A clippy::result_large_err -A clippy::needless_borrow -A clippy::clone_on_copy -A clippy::upper_case_acronyms -A clippy::missing_safety_doc + cargo clippy --features "$ENABLE_FEATURES" --package proxy_tests --no-deps -- -Dwarnings -A clippy::result_large_err -A clippy::needless_borrow -A clippy::clone_on_copy -A clippy::upper_case_acronyms -A clippy::missing_safety_doc + cargo clippy --features "$ENABLE_FEATURES" --package proxy_server --no-deps -- -Dwarnings -A clippy::result_large_err -A clippy::needless_borrow -A clippy::clone_on_copy -A clippy::upper_case_acronyms -A clippy::missing_safety_doc -A clippy::derive_partial_eq_without_eq + cargo clippy --features "$ENABLE_FEATURES" --package new-mock-engine-store --no-deps -- -Dwarnings -A clippy::result_large_err -A clippy::needless_borrow -A clippy::clone_on_copy -A clippy::upper_case_acronyms -A clippy::missing_safety_doc -A clippy::derive_partial_eq_without_eq -A clippy::redundant_clone -A clippy::too_many_arguments + # exit # If we depend TiKV as a Cargo component, the following is not necessary, and can fail. 
cargo test --features "$ENABLE_FEATURES" --package tests --test failpoints cases::test_normal cargo test --features "$ENABLE_FEATURES" --package tests --test failpoints cases::test_bootstrap cargo test --features "$ENABLE_FEATURES" --package tests --test failpoints cases::test_compact_log cargo test --features "$ENABLE_FEATURES" --package tests --test failpoints cases::test_early_apply cargo test --features "$ENABLE_FEATURES" --package tests --test failpoints cases::test_encryption - cargo test --features "$ENABLE_FEATURES" --package tests --test failpoints cases::test_pd_client + # cargo test --features "$ENABLE_FEATURES" --package tests --test failpoints cases::test_pd_client cargo test --features "$ENABLE_FEATURES" --package tests --test failpoints cases::test_pending_peers cargo test --features "$ENABLE_FEATURES" --package tests --test failpoints cases::test_transaction cargo test --features "$ENABLE_FEATURES" --package tests --test failpoints cases::test_cmd_epoch_checker @@ -31,19 +33,17 @@ elif [[ $M == "testnew" ]]; then export ENABLE_FEATURES="test-engine-kv-rocksdb test-engine-raft-raft-engine" cargo check --package proxy_server --features="$ENABLE_FEATURES" # tests based on new-mock-engine-store, with compat for new proxy + cargo test --package proxy_tests --test proxy write + cargo test --package proxy_tests --test proxy snapshot cargo test --package proxy_tests --test proxy normal::store - cargo test --package proxy_tests --test proxy normal::region cargo test --package proxy_tests --test proxy normal::config - cargo test --package proxy_tests --test proxy normal::write cargo test --package proxy_tests --test proxy normal::ingest - cargo test --package proxy_tests --test proxy normal::snapshot cargo test --package proxy_tests --test proxy normal::restart cargo test --package proxy_tests --test proxy normal::persist cargo test --package proxy_tests --test proxy config + cargo test --package proxy_tests --test proxy region cargo test --package 
proxy_tests --test proxy flashback cargo test --package proxy_tests --test proxy server_cluster_test - # tests based on new-mock-engine-store, for some tests not available for new proxy - cargo test --package proxy_tests --test proxy proxy elif [[ $M == "debug" ]]; then # export RUSTC_WRAPPER=~/.cargo/bin/sccache export ENGINE_LABEL_VALUE=tiflash diff --git a/proxy_server/Cargo.toml b/proxy_server/Cargo.toml index b1b617c932c..b4c42af2cd1 100644 --- a/proxy_server/Cargo.toml +++ b/proxy_server/Cargo.toml @@ -75,7 +75,7 @@ online_config = { workspace = true } openssl = "0.10" pd_client = { workspace = true, default-features = false } pin-project = "1.0" -pprof = { git = "https://github.com/CalvinNeo/pprof-rs", branch = "master", default-features = false, features = ["flamegraph", "protobuf-codec", "cpp"] } +pprof = { version = "0.11", default-features = false, features = ["flamegraph", "protobuf-codec", "cpp"] } prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } diff --git a/proxy_server/src/config.rs b/proxy_server/src/config.rs index b3de592817d..f00854df00a 100644 --- a/proxy_server/src/config.rs +++ b/proxy_server/src/config.rs @@ -2,6 +2,7 @@ use std::{collections::HashSet, iter::FromIterator, path::Path}; +use engine_store_ffi::EngineStoreConfig; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; use itertools::Itertools; use online_config::OnlineConfig; @@ -61,6 +62,7 @@ pub struct ServerConfig { pub advertise_addr: String, #[online_config(skip)] pub background_thread_count: usize, + pub status_thread_pool_size: usize, } impl Default for ServerConfig { @@ -76,6 +78,7 @@ impl Default for ServerConfig { advertise_status_addr: TIFLASH_DEFAULT_ADVERTISE_LISTENING_ADDR.to_string(), advertise_addr: TIFLASH_DEFAULT_ADVERTISE_LISTENING_ADDR.to_string(), background_thread_count, + status_thread_pool_size: 2, } } } @@ 
-115,11 +118,7 @@ pub fn memory_limit_for_cf(is_raft_db: bool, cf: &str, total_mem: u64) -> Readab _ => unreachable!(), }; let mut size = (total_mem as f64 * ratio) as usize; - if size < min { - size = min; - } else if size > max { - size = max; - } + size = size.clamp(min, max); ReadableSize::mb(size as u64 / MIB) } @@ -258,6 +257,9 @@ pub struct ProxyConfig { #[online_config(skip)] pub import: ImportConfig, + + #[online_config(skip)] + pub engine_store: EngineStoreConfig, } /// We use custom default, in case of later non-ordinary config items. @@ -273,6 +275,7 @@ impl Default for ProxyConfig { enable_io_snoop: false, readpool: ReadPoolConfig::default(), import: ImportConfig::default(), + engine_store: EngineStoreConfig::default(), } } } @@ -395,6 +398,7 @@ pub fn address_proxy_config(config: &mut TikvConfig, proxy_config: &ProxyConfig) config.server.background_thread_count = proxy_config.server.background_thread_count; config.import.num_threads = proxy_config.import.num_threads; + config.server.status_thread_pool_size = proxy_config.server.status_thread_pool_size; } pub fn validate_and_persist_config(config: &mut TikvConfig, persist: bool) { diff --git a/proxy_server/src/run.rs b/proxy_server/src/run.rs index 4f2e14a8426..9bfdbc83a61 100644 --- a/proxy_server/src/run.rs +++ b/proxy_server/src/run.rs @@ -27,8 +27,8 @@ use engine_rocks::{ }; use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; use engine_store_ffi::{ - self, EngineStoreServerHelper, EngineStoreServerStatus, RaftProxyStatus, RaftStoreProxy, - RaftStoreProxyFFI, RaftStoreProxyFFIHelper, ReadIndexClient, TiFlashEngine, ps_engine::PSEngine + self, ps_engine::PSEngine, EngineStoreServerHelper, EngineStoreServerStatus, RaftProxyStatus, + RaftStoreProxy, RaftStoreProxyFFI, RaftStoreProxyFFIHelper, ReadIndexClient, TiFlashEngine, }; use engine_traits::{ CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, RaftEngine, TabletFactory, @@ -57,8 +57,8 @@ use raftstore::{ 
RaftBatchSystem, RaftRouter, StoreMeta, MULTI_FILES_SNAPSHOT_FEATURE, PENDING_MSG_CAP, }, memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, - AutoSplitController, CheckLeaderRunner, GlobalReplicationState, LocalReader, SnapManager, - SnapManagerBuilder, SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, + AutoSplitController, CheckLeaderRunner, LocalReader, SnapManager, SnapManagerBuilder, + SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, }, }; use security::SecurityManager; @@ -333,7 +333,8 @@ pub unsafe fn run_tikv_proxy( ) } else { run_impl::(config, proxy_config, engine_store_server_helper) - // run_impl::(config, proxy_config, engine_store_server_helper) + // run_impl::(config, proxy_config, + // engine_store_server_helper) } }) } @@ -399,9 +400,7 @@ impl TiKvServer { &block_cache, ); match raft_engine.as_ps_engine() { - None => { - - } + None => {} Some(ps_engine) => { ps_engine.init(engine_store_server_helper); } @@ -477,8 +476,7 @@ struct TiKvServer { flow_info_sender: Option>, flow_info_receiver: Option>, system: Option>, - resolver: resolve::PdStoreAddrResolver, - state: Arc>, + resolver: Option, store_path: PathBuf, snap_mgr: Option, // Will be filled in `init_servers`. 
encryption_key_manager: Option>, @@ -509,8 +507,7 @@ struct Servers { importer: Arc, } -type LocalServer = - Server, resolve::PdStoreAddrResolver, LocalRaftKv>; +type LocalServer = Server>; type LocalRaftKv = RaftKv>; impl TiKvServer { @@ -550,8 +547,6 @@ impl TiKvServer { let background_worker = WorkerBuilder::new("background") .thread_count(thread_count) .create(); - let (resolver, state) = - resolve::new_resolver(Arc::clone(&pd_client), &background_worker, router.clone()); let mut coprocessor_host = Some(CoprocessorHost::new( router.clone(), @@ -584,8 +579,7 @@ impl TiKvServer { pd_client, router, system: Some(system), - resolver, - state, + resolver: None, store_path, snap_mgr: None, encryption_key_manager: None, @@ -831,14 +825,10 @@ impl TiKvServer { fn init_gc_worker( &mut self, - ) -> GcWorker< - RaftKv>, - RaftRouter, - > { + ) -> GcWorker>> { let engines = self.engines.as_ref().unwrap(); let gc_worker = GcWorker::new( engines.engine.clone(), - self.router.clone(), self.flow_info_sender.take().unwrap(), self.config.gc.clone(), self.pd_client.feature_gate().clone(), @@ -1009,6 +999,13 @@ impl TiKvServer { )), ); + let (resolver, state) = resolve::new_resolver( + self.pd_client.clone(), + &self.background_worker, + storage.get_engine().raft_extension().clone(), + ); + self.resolver = Some(resolver); + ReplicaReadLockChecker::new(self.concurrency_manager.clone()) .register(self.coprocessor_host.as_mut().unwrap()); @@ -1146,7 +1143,7 @@ impl TiKvServer { raft_store.clone(), self.config.storage.api_version(), self.pd_client.clone(), - self.state.clone(), + state, self.background_worker.clone(), Some(health_service.clone()), Some(default_store), @@ -1182,14 +1179,6 @@ impl TiKvServer { } let importer = Arc::new(importer); - let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( - node.id(), - self.engines.as_ref().unwrap().engines.kv.clone(), - importer.clone(), - self.proxy_config.raft_store.snap_handle_pool_size, - ); - 
tiflash_ob.register_to(self.coprocessor_host.as_mut().unwrap()); - let check_leader_runner = CheckLeaderRunner::new( engines.store_meta.clone(), self.coprocessor_host.clone().unwrap(), @@ -1213,8 +1202,7 @@ impl TiKvServer { Arc::clone(&self.quota_limiter), ), coprocessor_v2::Endpoint::new(&self.config.coprocessor_v2), - self.router.clone(), - self.resolver.clone(), + self.resolver.clone().unwrap(), snap_mgr.clone(), gc_worker.clone(), check_leader_scheduler, @@ -1224,6 +1212,19 @@ impl TiKvServer { health_service, ) .unwrap_or_else(|e| fatal!("failed to create server: {}", e)); + + let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( + node.id(), + self.engines.as_ref().unwrap().engines.kv.clone(), + self.engines.as_ref().unwrap().engines.raft.clone(), + importer.clone(), + self.proxy_config.raft_store.snap_handle_pool_size, + server.transport().clone(), + snap_mgr.clone(), + self.proxy_config.engine_store.clone(), + ); + tiflash_ob.register_to(self.coprocessor_host.as_mut().unwrap()); + cfg_controller.register( tikv::config::Module::Server, Box::new(ServerConfigManager::new( @@ -1370,7 +1371,7 @@ impl TiKvServer { raft: engines.engines.raft.clone(), }, servers.server.get_debug_thread_pool().clone(), - self.router.clone(), + engines.engine.raft_extension().clone(), self.cfg_controller.as_ref().unwrap().clone(), ); if servers @@ -1723,7 +1724,7 @@ impl ConfiguredRaftEngine for PSEngine { env: &Arc, key_manager: &Option>, block_cache: &Option, - )-> Self { + ) -> Self { PSEngine::new() } diff --git a/proxy_tests/Cargo.toml b/proxy_tests/Cargo.toml index 9132ca13d00..e9730c960c5 100644 --- a/proxy_tests/Cargo.toml +++ b/proxy_tests/Cargo.toml @@ -15,7 +15,7 @@ failpoints = ["fail/failpoints", "tikv/failpoints"] cloud-aws = ["external_storage_export/cloud-aws"] cloud-gcp = ["external_storage_export/cloud-gcp"] cloud-azure = ["external_storage_export/cloud-azure"] -testexport = ["raftstore/testexport", "tikv/testexport"] +testexport = 
["raftstore/testexport", "tikv/testexport", "engine_tiflash/testexport", "engine_store_ffi/testexport"] profiling = ["profiler/profiling"] test-engine-kv-rocksdb = [ @@ -49,6 +49,7 @@ crossbeam = "0.8" encryption = { workspace = true } engine_rocks_helper = { workspace = true } engine_store_ffi = { workspace = true, default-features = false } +engine_tiflash = { workspace = true, default-features = false } error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } diff --git a/proxy_tests/proxy/config.rs b/proxy_tests/proxy/config.rs new file mode 100644 index 00000000000..8e791735405 --- /dev/null +++ b/proxy_tests/proxy/config.rs @@ -0,0 +1,199 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +use clap::{App, Arg}; +use proxy_server::{ + config::{ + address_proxy_config, memory_limit_for_cf, TIFLASH_DEFAULT_ADVERTISE_LISTENING_ADDR, + TIFLASH_DEFAULT_LISTENING_ADDR, TIFLASH_DEFAULT_STATUS_ADDR, + }, + proxy::{gen_proxy_config, gen_tikv_config}, + setup::overwrite_config_with_cmd_args, +}; +use tikv_util::sys::SysQuota; + +use crate::proxy::*; + +/// We test here if we can use proxy's default value without given file. +/// Normally, we only need to add config tests in +/// `test_config_proxy_default_no_config_item`. 
+#[test] +fn test_config_proxy_default_no_config_file() { + let args: Vec<&str> = vec![]; + let matches = App::new("RaftStore Proxy") + .arg( + Arg::with_name("config") + .short("C") + .long("config") + .value_name("FILE") + .help("Set the configuration file") + .takes_value(true), + ) + .get_matches_from(args); + let mut v: Vec = vec![]; + let mut config = gen_tikv_config(&None, false, &mut v); + let mut proxy_config = gen_proxy_config(&None, false, &mut v); + overwrite_config_with_cmd_args(&mut config, &mut proxy_config, &matches); + address_proxy_config(&mut config, &proxy_config); + + assert_eq!(config.server.addr, TIFLASH_DEFAULT_LISTENING_ADDR); + assert_eq!(config.server.status_addr, TIFLASH_DEFAULT_STATUS_ADDR); + assert_eq!( + config.server.advertise_status_addr, + TIFLASH_DEFAULT_ADVERTISE_LISTENING_ADDR + ); + assert_eq!( + config.raft_store.region_worker_tick_interval.as_millis(), + 500 + ); +} + +/// We test here if we can use proxy's default value with given file, +/// but without given field. +/// Add assertion in this function, if we add some new items in +/// `ProxyConfig`. 
+#[test] +fn test_config_proxy_default_no_config_item() { + let mut file = tempfile::NamedTempFile::new().unwrap(); + let text = "z=4\n[rocksdb]\nmax-open-files=56\n"; + write!(file, "{}", text).unwrap(); + let path = file.path(); + let cpath = Some(path.as_os_str()); + let args = vec![format!("-C{}", path.to_str().unwrap())]; + let matches = App::new("RaftStore Proxy") + .arg( + Arg::with_name("config") + .short("C") + .long("config") + .value_name("FILE") + .help("Set the configuration file") + .takes_value(true), + ) + .get_matches_from(args); + let mut v: Vec = vec![]; + let mut config = gen_tikv_config(&cpath, false, &mut v); + let mut proxy_config = gen_proxy_config(&cpath, false, &mut v); + overwrite_config_with_cmd_args(&mut config, &mut proxy_config, &matches); + address_proxy_config(&mut config, &proxy_config); + + let total_mem = SysQuota::memory_limit_in_bytes(); + let cpu_num = SysQuota::cpu_cores_quota(); + assert_eq!(config.rocksdb.max_open_files, 56); + assert_eq!(config.server.addr, TIFLASH_DEFAULT_LISTENING_ADDR); + assert_eq!(config.server.status_addr, TIFLASH_DEFAULT_STATUS_ADDR); + assert_eq!( + config.server.advertise_status_addr, + TIFLASH_DEFAULT_ADVERTISE_LISTENING_ADDR + ); + assert_eq!( + config.raft_store.region_worker_tick_interval.as_millis(), + 500 + ); + assert_eq!( + ProxyConfig::default() + .raft_store + .apply_low_priority_pool_size, + config.raft_store.apply_batch_system.low_priority_pool_size + ); + assert_eq!( + config.raftdb.defaultcf.block_cache_size, + memory_limit_for_cf(true, CF_DEFAULT, total_mem) + ); + assert_eq!( + config.rocksdb.defaultcf.block_cache_size, + memory_limit_for_cf(false, CF_DEFAULT, total_mem) + ); + assert_eq!( + config.rocksdb.writecf.block_cache_size, + memory_limit_for_cf(false, CF_WRITE, total_mem) + ); + assert_eq!( + config.rocksdb.lockcf.block_cache_size, + memory_limit_for_cf(false, CF_LOCK, total_mem) + ); + assert_eq!(config.storage.reserve_space, ReadableSize::gb(1)); + + let 
background_thread_count = std::cmp::min(4, cpu_num as usize); +    assert_eq!( +        config.server.background_thread_count, +        background_thread_count +    ); + +    assert_eq!(config.import.num_threads, 4); +    assert_eq!(config.server.status_thread_pool_size, 2); +} + +/// We test if the engine-label is set properly. +#[test] +fn test_config_proxy_engine_label() { +    // case-1: If engine-label not specified in arguments, use default value. +    let args: Vec<&str> = vec![]; +    let matches = App::new("RaftStore Proxy").get_matches_from(args); +    let mut v: Vec = vec![]; +    let mut config = gen_tikv_config(&None, false, &mut v); +    let mut proxy_config = gen_proxy_config(&None, false, &mut v); +    overwrite_config_with_cmd_args(&mut config, &mut proxy_config, &matches); +    address_proxy_config(&mut config, &proxy_config); +    const DEFAULT_ENGINE_LABEL_KEY: &str = "engine"; + +    assert_eq!( +        config +            .server +            .labels +            .get(DEFAULT_ENGINE_LABEL_KEY) +            .unwrap() +            .as_str(), +        option_env!("ENGINE_LABEL_VALUE").unwrap() +    ); + +    // case-2: If engine-label specified in arguments, use it as engine-label. +    const EXPECTED_ENGINE_LABEL: &str = "tiflash_compute"; +    let args = vec![ +        "test_config_proxy_default1", +        "--engine-label", +        EXPECTED_ENGINE_LABEL, +    ]; +    let matches = App::new("RaftStore Proxy") +        .arg( +            Arg::with_name("engine-label") +                .long("engine-label") +                .help("Set engine label") +                .required(true) +                .takes_value(true), +        ) +        .get_matches_from(args); +    overwrite_config_with_cmd_args(&mut config, &mut proxy_config, &matches); +    address_proxy_config(&mut config, &proxy_config); +    assert_eq!( +        config.server.labels.get(DEFAULT_ENGINE_LABEL_KEY).unwrap(), +        EXPECTED_ENGINE_LABEL +    ); +} + +// We test whether Proxy will overwrite TiKV's value, +// when a config item is defined by both ProxyConfig and TikvConfig. +// We only need to add tests to this function when the logic is different. 
+#[test] +fn test_config_proxy_overwrite() { + let mut file = tempfile::NamedTempFile::new().unwrap(); + write!( + file, + " +[raftstore] +apply-low-priority-pool-size = 41 + " + ) + .unwrap(); + let path = file.path(); + + let mut v: Vec = vec![]; + let cpath = Some(path.as_os_str()); + let mut config = gen_tikv_config(&cpath, false, &mut v); + let proxy_config = gen_proxy_config(&cpath, false, &mut v); + address_proxy_config(&mut config, &proxy_config); + + // When raftstore.apply-low-priority-pool-size is specified, its value + // should be used. + assert_eq!( + 41, + config.raft_store.apply_batch_system.low_priority_pool_size + ); +} diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs new file mode 100644 index 00000000000..962abcbe0b9 --- /dev/null +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -0,0 +1,164 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +use crate::proxy::*; + +#[derive(PartialEq, Eq)] +enum SourceType { + Leader, + Learner, + DelayedLearner, + InvalidSource, +} + +fn simple_fast_add_peer(source_type: SourceType, block_wait: bool) { + tikv_util::set_panic_hook(true, "./"); + let (mut cluster, pd_client) = new_mock_cluster(0, 3); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + // fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + fail::cfg("before_tiflash_check_double_write", "return").unwrap(); + if block_wait { + fail::cfg("ffi_fast_add_peer_block_wait", "return(1)").unwrap(); + } + disable_auto_gen_compact_log(&mut cluster); + // Disable auto generate peer. + pd_client.disable_default_operator(); + let _ = cluster.run_conf_change(); + + // If we don't write here, we will have the first MsgAppend with (6,6), which + // will cause "fast-forwarded commit to snapshot". 
+ cluster.must_put(b"k0", b"v0"); + + // Add learner 2 from leader 1 + pd_client.must_add_peer(1, new_learner_peer(2, 2)); + // std::thread::sleep(std::time::Duration::from_millis(2000)); + cluster.must_put(b"k1", b"v1"); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1, 2])); + + // Add learner 3 according to source_type + match source_type { + SourceType::Learner | SourceType::DelayedLearner => { + fail::cfg("ffi_fast_add_peer_from_id", "return(2)").unwrap(); + } + SourceType::InvalidSource => { + fail::cfg("ffi_fast_add_peer_from_id", "return(100)").unwrap(); + } + _ => (), + }; + + pd_client.must_add_peer(1, new_learner_peer(3, 3)); + // std::thread::sleep(std::time::Duration::from_millis(2000)); + // match source_type { + // SourceType::Learner => { + // // Wait until Learner has applied ConfChange + // must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: + // &States| -> bool { + // find_peer_by_id(states.in_disk_region_state.get_region(), 3).is_some() + // }); + // } + // _ => {}, + // } + cluster.must_put(b"k2", b"v2"); + + match source_type { + SourceType::DelayedLearner => { + // Make sure conf change is applied. 
+ check_key( + &cluster, + b"k2", + b"v2", + Some(true), + None, + Some(vec![1, 2, 3]), + ); + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(1, 2) + .msg_type(MessageType::MsgAppend) + .direction(Direction::Recv), + )); + cluster.must_put(b"k3", b"v3"); + } + _ => (), + }; + + match source_type { + SourceType::DelayedLearner => { + check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![1, 3])); + check_key(&cluster, b"k3", b"v3", Some(false), None, Some(vec![2])); + } + SourceType::Learner => { + check_key( + &cluster, + b"k2", + b"v2", + Some(true), + None, + Some(vec![1, 2, 3]), + ); + } + _ => { + check_key( + &cluster, + b"k2", + b"v2", + Some(true), + None, + Some(vec![1, 2, 3]), + ); + } + }; + + match source_type { + SourceType::DelayedLearner => { + cluster.clear_send_filters(); + } + _ => (), + }; + + fail::remove("ffi_fast_add_peer_from_id"); + fail::remove("on_pre_persist_with_finish"); + fail::remove("ffi_fast_add_peer_block_wait"); + cluster.shutdown(); +} + +#[test] +fn test_fast_add_peer_from_leader() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::Leader, false); + fail::remove("on_pre_persist_with_finish"); +} + +/// Fast path by learner snapshot. +#[test] +fn test_fast_add_peer_from_learner() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::Learner, false); + fail::remove("on_pre_persist_with_finish"); +} + +/// If a learner is delayed, but already applied ConfChange. +#[test] +fn test_fast_add_peer_from_delayed_learner() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::DelayedLearner, false); + fail::remove("on_pre_persist_with_finish"); +} + +/// If we select a wrong source, or we can't run fast path, we can fallback to +/// normal. 
+#[test] +fn test_fast_add_peer_from_invalid_source() { + simple_fast_add_peer(SourceType::InvalidSource, false); +} + +#[test] +fn test_fast_add_peer_from_learner_blocked() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::Learner, true); + fail::remove("on_pre_persist_with_finish"); +} + +#[test] +fn test_fast_add_peer_from_delayed_learner_blocked() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::DelayedLearner, true); + fail::remove("on_pre_persist_with_finish"); +} diff --git a/proxy_tests/proxy/flashback.rs b/proxy_tests/proxy/flashback.rs index be6ec2bbcdd..b6d115376b5 100644 --- a/proxy_tests/proxy/flashback.rs +++ b/proxy_tests/proxy/flashback.rs @@ -89,29 +89,8 @@ mod persist { must_cmd_add_flashback_flag(&mut cluster, &mut region.clone(), new_put_cmd(b"k3", b"v3")); let victim = 1; - info!("stop node {}", victim); - cluster.stop_node(victim); - { - let lock = cluster.ffi_helper_set.lock(); - lock.unwrap() - .deref_mut() - .get_mut(&victim) - .unwrap() - .engine_store_server - .stop(); - } - - info!("restored node {}", victim); - { - let lock = cluster.ffi_helper_set.lock(); - lock.unwrap() - .deref_mut() - .get_mut(&victim) - .unwrap() - .engine_store_server - .restore(); - } - cluster.run_node(victim).unwrap(); + stop_tiflash_node(&mut cluster, victim); + restart_tiflash_node(&mut cluster, victim); let new_states = collect_all_states(&cluster, region_id); diff --git a/proxy_tests/proxy/mod.rs b/proxy_tests/proxy/mod.rs index 6051f0c83ef..1d7edced540 100644 --- a/proxy_tests/proxy/mod.rs +++ b/proxy_tests/proxy/mod.rs @@ -7,8 +7,13 @@ #[macro_use] extern crate slog_global; +mod config; +mod fast_add_peer; mod flashback; mod normal; mod proxy; +mod region; mod server_cluster_test; +mod snapshot; mod util; +mod write; diff --git a/proxy_tests/proxy/normal.rs b/proxy_tests/proxy/normal.rs index 274f09b1641..28040a63e94 100644 --- 
a/proxy_tests/proxy/normal.rs +++ b/proxy_tests/proxy/normal.rs @@ -5,14 +5,7 @@ use proxy_server::config::{ address_proxy_config, ensure_no_common_unrecognized_keys, get_last_config, setup_default_tikv_config, validate_and_persist_config, TIFLASH_DEFAULT_LISTENING_ADDR, }; -use raft::eraftpb::MessageType; use tikv::config::{TikvConfig, LAST_CONFIG_FILE}; -use tikv_util::{ - config::{ReadableDuration, ReadableSize}, - store::find_peer, - time::Duration, - HandyRwLock, -}; use crate::proxy::*; @@ -48,149 +41,6 @@ mod store { } } -mod region { - use super::*; - - #[test] - fn test_handle_destroy() { - let (mut cluster, pd_client) = new_mock_cluster(0, 3); - - disable_auto_gen_compact_log(&mut cluster); - - // Disable default max peer count check. - pd_client.disable_default_operator(); - - cluster.run(); - cluster.must_put(b"k1", b"v1"); - let eng_ids = cluster - .engines - .iter() - .map(|e| e.0.to_owned()) - .collect::>(); - - let region = cluster.get_region(b"k1"); - let region_id = region.get_id(); - let peer_1 = find_peer(®ion, eng_ids[0]).cloned().unwrap(); - let peer_2 = find_peer(®ion, eng_ids[1]).cloned().unwrap(); - cluster.must_transfer_leader(region_id, peer_1); - - iter_ffi_helpers( - &cluster, - Some(vec![eng_ids[1]]), - &mut |_, _, ffi: &mut FFIHelperSet| { - let server = &ffi.engine_store_server; - assert!(server.kvstore.contains_key(®ion_id)); - }, - ); - - pd_client.must_remove_peer(region_id, peer_2); - - check_key( - &cluster, - b"k1", - b"v2", - Some(false), - None, - Some(vec![eng_ids[1]]), - ); - - std::thread::sleep(std::time::Duration::from_millis(100)); - // Region removed in server. 
- iter_ffi_helpers( - &cluster, - Some(vec![eng_ids[1]]), - &mut |_, _, ffi: &mut FFIHelperSet| { - let server = &ffi.engine_store_server; - assert!(!server.kvstore.contains_key(®ion_id)); - }, - ); - - cluster.shutdown(); - } - - #[test] - fn test_get_region_local_state() { - let (mut cluster, _pd_client) = new_mock_cluster(0, 3); - - cluster.run(); - - let k = b"k1"; - let v = b"v1"; - cluster.must_put(k, v); - check_key(&cluster, k, v, Some(true), None, None); - let region_id = cluster.get_region(k).get_id(); - - // Get RegionLocalState through ffi - unsafe { - iter_ffi_helpers( - &cluster, - None, - &mut |_id: u64, _, ffi_set: &mut FFIHelperSet| { - let f = ffi_set.proxy_helper.fn_get_region_local_state.unwrap(); - let mut state = kvproto::raft_serverpb::RegionLocalState::default(); - let mut error_msg = new_mock_engine_store::RawCppStringPtrGuard::default(); - - assert_eq!( - f( - ffi_set.proxy_helper.proxy_ptr, - region_id, - &mut state as *mut _ as _, - error_msg.as_mut(), - ), - KVGetStatus::Ok - ); - assert!(state.has_region()); - assert_eq!(state.get_state(), kvproto::raft_serverpb::PeerState::Normal); - assert!(error_msg.as_ref().is_null()); - - let mut state = kvproto::raft_serverpb::RegionLocalState::default(); - assert_eq!( - f( - ffi_set.proxy_helper.proxy_ptr, - 0, // not exist - &mut state as *mut _ as _, - error_msg.as_mut(), - ), - KVGetStatus::NotFound - ); - assert!(!state.has_region()); - assert!(error_msg.as_ref().is_null()); - - ffi_set - .proxy - .get_value_cf("none_cf", "123".as_bytes(), |value| { - let msg = value.unwrap_err(); - assert_eq!(msg, "Storage Engine Status { code: IoError, sub_code: None, sev: NoError, state: \"cf none_cf not found\" }"); - }); - ffi_set - .proxy - .get_value_cf("raft", "123".as_bytes(), |value| { - let res = value.unwrap(); - assert!(res.is_none()); - }); - - // If we have no kv engine. 
- ffi_set.proxy.set_kv_engine(None); - let res = ffi_set.proxy_helper.fn_get_region_local_state.unwrap()( - ffi_set.proxy_helper.proxy_ptr, - region_id, - &mut state as *mut _ as _, - error_msg.as_mut(), - ); - assert_eq!(res, KVGetStatus::Error); - assert!(!error_msg.as_ref().is_null()); - assert_eq!( - error_msg.as_str(), - "KV engine is not initialized".as_bytes() - ); - }, - ); - } - - cluster.shutdown(); - } -} - mod config { use super::*; @@ -297,591 +147,6 @@ mod config { } } -mod write { - use super::*; - #[test] - fn test_interaction() { - // TODO Maybe we should pick this test to TiKV. - // This test is to check if empty entries can affect pre_exec and post_exec. - let (mut cluster, _pd_client) = new_mock_cluster(0, 3); - - fail::cfg("try_flush_data", "return(0)").unwrap(); - let _ = cluster.run(); - - cluster.must_put(b"k1", b"v1"); - let region = cluster.get_region(b"k1"); - let region_id = region.get_id(); - - // Wait until all nodes have (k1, v1). - check_key(&cluster, b"k1", b"v1", Some(true), None, None); - - let prev_states = collect_all_states(&cluster, region_id); - let compact_log = test_raftstore::new_compact_log_request(100, 10); - let req = - test_raftstore::new_admin_request(region_id, region.get_region_epoch(), compact_log); - let _ = cluster - .call_command_on_leader(req.clone(), Duration::from_secs(3)) - .unwrap(); - - // Empty result can also be handled by post_exec - let mut retry = 0; - let new_states = loop { - let new_states = collect_all_states(&cluster, region_id); - let mut ok = true; - for i in prev_states.keys() { - let old = prev_states.get(i).unwrap(); - let new = new_states.get(i).unwrap(); - if old.in_memory_apply_state == new.in_memory_apply_state - && old.in_memory_applied_term == new.in_memory_applied_term - { - ok = false; - break; - } - } - if ok { - break new_states; - } - std::thread::sleep(std::time::Duration::from_millis(100)); - retry += 1; - if retry >= 30 { - panic!("states is not changed") - } - }; - - for i in 
prev_states.keys() { - let old = prev_states.get(i).unwrap(); - let new = new_states.get(i).unwrap(); - assert_ne!(old.in_memory_apply_state, new.in_memory_apply_state); - assert_eq!(old.in_memory_applied_term, new.in_memory_applied_term); - // An empty cmd will not cause persistence. - assert_eq!(old.in_disk_apply_state, new.in_disk_apply_state); - } - - cluster.must_put(b"k2", b"v2"); - // Wait until all nodes have (k2, v2). - check_key(&cluster, b"k2", b"v2", Some(true), None, None); - - fail::cfg("on_empty_cmd_normal", "return").unwrap(); - let prev_states = collect_all_states(&cluster, region_id); - let _ = cluster - .call_command_on_leader(req, Duration::from_secs(3)) - .unwrap(); - - std::thread::sleep(std::time::Duration::from_millis(400)); - let new_states = collect_all_states(&cluster, region_id); - for i in prev_states.keys() { - let old = prev_states.get(i).unwrap(); - let new = new_states.get(i).unwrap(); - assert_ne!(old.in_memory_apply_state, new.in_memory_apply_state); - assert_eq!(old.in_memory_applied_term, new.in_memory_applied_term); - } - - fail::remove("try_flush_data"); - fail::remove("on_empty_cmd_normal"); - cluster.shutdown(); - } - - #[test] - fn test_leadership_change_filter() { - test_leadership_change_impl(true); - } - - #[test] - fn test_leadership_change_no_persist() { - test_leadership_change_impl(false); - } - - fn test_leadership_change_impl(filter: bool) { - // Test if a empty command can be observed when leadership changes. - let (mut cluster, _pd_client) = new_mock_cluster(0, 3); - - disable_auto_gen_compact_log(&mut cluster); - - if filter { - // We don't handle CompactLog at all. - fail::cfg("try_flush_data", "return(0)").unwrap(); - } else { - // We don't return Persist after handling CompactLog. - fail::cfg("no_persist_compact_log", "return").unwrap(); - } - // Do not handle empty cmd. 
- fail::cfg("on_empty_cmd_normal", "return").unwrap(); - let _ = cluster.run(); - - cluster.must_put(b"k1", b"v1"); - let region = cluster.get_region(b"k1"); - let region_id = region.get_id(); - - let eng_ids = cluster - .engines - .iter() - .map(|e| e.0.to_owned()) - .collect::>(); - let peer_1 = find_peer(®ion, eng_ids[0]).cloned().unwrap(); - let peer_2 = find_peer(®ion, eng_ids[1]).cloned().unwrap(); - cluster.must_transfer_leader(region.get_id(), peer_1.clone()); - - cluster.must_put(b"k2", b"v2"); - fail::cfg("on_empty_cmd_normal", "return").unwrap(); - - // Wait until all nodes have (k2, v2), then transfer leader. - check_key(&cluster, b"k2", b"v2", Some(true), None, None); - if filter { - // We should also filter normal kv, since a empty result can also be invoke - // pose_exec. - fail::cfg("on_post_exec_normal", "return(false)").unwrap(); - } - let prev_states = collect_all_states(&cluster, region_id); - cluster.must_transfer_leader(region.get_id(), peer_2.clone()); - - // The states remain the same, since we don't observe empty cmd. - let new_states = collect_all_states(&cluster, region_id); - for i in prev_states.keys() { - let old = prev_states.get(i).unwrap(); - let new = new_states.get(i).unwrap(); - if filter { - // CompactLog can still change in-memory state, when exec in memory. - assert_eq!(old.in_memory_apply_state, new.in_memory_apply_state); - assert_eq!(old.in_memory_applied_term, new.in_memory_applied_term); - } - assert_eq!(old.in_disk_apply_state, new.in_disk_apply_state); - } - - fail::remove("on_empty_cmd_normal"); - // We need forward empty cmd generated by leadership changing to TiFlash. 
- cluster.must_transfer_leader(region.get_id(), peer_1.clone()); - std::thread::sleep(std::time::Duration::from_secs(1)); - - let new_states = collect_all_states(&cluster, region_id); - for i in prev_states.keys() { - let old = prev_states.get(i).unwrap(); - let new = new_states.get(i).unwrap(); - assert_ne!(old.in_memory_apply_state, new.in_memory_apply_state); - assert_ne!(old.in_memory_applied_term, new.in_memory_applied_term); - } - - if filter { - fail::remove("try_flush_data"); - fail::remove("on_post_exec_normal"); - } else { - fail::remove("no_persist_compact_log"); - } - cluster.shutdown(); - } - - #[test] - fn test_kv_write_always_persist() { - let (mut cluster, _pd_client) = new_mock_cluster(0, 3); - - let _ = cluster.run(); - - cluster.must_put(b"k0", b"v0"); - let region_id = cluster.get_region(b"k0").get_id(); - - let mut prev_states = collect_all_states(&cluster, region_id); - // Always persist on every command - fail::cfg("on_post_exec_normal_end", "return(true)").unwrap(); - for i in 1..20 { - let k = format!("k{}", i); - let v = format!("v{}", i); - cluster.must_put(k.as_bytes(), v.as_bytes()); - - // We can't always get kv from disk, even we commit everytime, - // since they are filtered by engint_tiflash - check_key(&cluster, k.as_bytes(), v.as_bytes(), Some(true), None, None); - - // This may happen after memory write data and before commit. - // We must check if we already have in memory. - check_apply_state(&cluster, region_id, &prev_states, Some(false), None); - std::thread::sleep(std::time::Duration::from_millis(20)); - // However, advanced apply index will always persisted. 
- let new_states = collect_all_states(&cluster, region_id); - for id in cluster.engines.keys() { - let p = &prev_states.get(id).unwrap().in_disk_apply_state; - let n = &new_states.get(id).unwrap().in_disk_apply_state; - assert_ne!(p, n); - } - prev_states = new_states; - } - fail::remove("on_post_exec_normal_end"); - cluster.shutdown(); - } - - #[test] - fn test_kv_write() { - let (mut cluster, _pd_client) = new_mock_cluster(0, 3); - - fail::cfg("on_post_exec_normal", "return(false)").unwrap(); - fail::cfg("on_post_exec_admin", "return(false)").unwrap(); - // Abandon CompactLog and previous flush. - fail::cfg("try_flush_data", "return(0)").unwrap(); - - let _ = cluster.run(); - - for i in 0..10 { - let k = format!("k{}", i); - let v = format!("v{}", i); - cluster.must_put(k.as_bytes(), v.as_bytes()); - } - - // Since we disable all observers, we can get nothing in either memory and disk. - for i in 0..10 { - let k = format!("k{}", i); - let v = format!("v{}", i); - check_key( - &cluster, - k.as_bytes(), - v.as_bytes(), - Some(false), - Some(false), - None, - ); - } - - // We can read initial raft state, since we don't persist meta either. - let r1 = cluster.get_region(b"k1").get_id(); - let prev_states = collect_all_states(&cluster, r1); - - fail::remove("on_post_exec_normal"); - fail::remove("on_post_exec_admin"); - for i in 10..20 { - let k = format!("k{}", i); - let v = format!("v{}", i); - cluster.must_put(k.as_bytes(), v.as_bytes()); - } - - // Since we enable all observers, we can get in memory. - // However, we get nothing in disk since we don't persist. 
- for i in 10..20 { - let k = format!("k{}", i); - let v = format!("v{}", i); - check_key( - &cluster, - k.as_bytes(), - v.as_bytes(), - Some(true), - Some(false), - None, - ); - } - - let new_states = collect_all_states(&cluster, r1); - for id in cluster.engines.keys() { - assert_ne!( - &prev_states.get(id).unwrap().in_memory_apply_state, - &new_states.get(id).unwrap().in_memory_apply_state - ); - assert_eq!( - &prev_states.get(id).unwrap().in_disk_apply_state, - &new_states.get(id).unwrap().in_disk_apply_state - ); - } - - std::thread::sleep(std::time::Duration::from_millis(20)); - fail::remove("try_flush_data"); - - let prev_states = collect_all_states(&cluster, r1); - // Write more after we force persist when CompactLog. - for i in 20..30 { - let k = format!("k{}", i); - let v = format!("v{}", i); - cluster.must_put(k.as_bytes(), v.as_bytes()); - } - - // We can read from mock-store's memory, we are not sure if we can read from - // disk, since there may be or may not be a CompactLog. - for i in 11..30 { - let k = format!("k{}", i); - let v = format!("v{}", i); - check_key(&cluster, k.as_bytes(), v.as_bytes(), Some(true), None, None); - } - - // Force a compact log to persist. - let region_r = cluster.get_region("k1".as_bytes()); - let region_id = region_r.get_id(); - let compact_log = test_raftstore::new_compact_log_request(1000, 100); - let req = - test_raftstore::new_admin_request(region_id, region_r.get_region_epoch(), compact_log); - let res = cluster - .call_command_on_leader(req, Duration::from_secs(3)) - .unwrap(); - assert!(res.get_header().has_error(), "{:?}", res); - // This CompactLog is executed with an error. It will not trigger a compaction. - // However, it can trigger a persistence. 
- for i in 11..30 { - let k = format!("k{}", i); - let v = format!("v{}", i); - check_key( - &cluster, - k.as_bytes(), - v.as_bytes(), - Some(true), - Some(true), - None, - ); - } - - let new_states = collect_all_states(&cluster, r1); - - // apply_state is changed in memory, and persisted. - for id in cluster.engines.keys() { - assert_ne!( - &prev_states.get(id).unwrap().in_memory_apply_state, - &new_states.get(id).unwrap().in_memory_apply_state - ); - assert_ne!( - &prev_states.get(id).unwrap().in_disk_apply_state, - &new_states.get(id).unwrap().in_disk_apply_state - ); - } - - fail::remove("no_persist_compact_log"); - cluster.shutdown(); - } - - #[test] - fn test_consistency_check() { - // ComputeHash and VerifyHash shall be filtered. - let (mut cluster, _pd_client) = new_mock_cluster(0, 2); - - cluster.run(); - - cluster.must_put(b"k", b"v"); - let region = cluster.get_region("k".as_bytes()); - let region_id = region.get_id(); - - let r = new_compute_hash_request(); - let req = test_raftstore::new_admin_request(region_id, region.get_region_epoch(), r); - let _ = cluster - .call_command_on_leader(req, Duration::from_secs(3)) - .unwrap(); - - let r = new_verify_hash_request(vec![7, 8, 9, 0], 1000); - let req = test_raftstore::new_admin_request(region_id, region.get_region_epoch(), r); - let _ = cluster - .call_command_on_leader(req, Duration::from_secs(3)) - .unwrap(); - - cluster.must_put(b"k2", b"v2"); - cluster.shutdown(); - } - - #[test] - fn test_old_compact_log() { - // If we just return None for CompactLog, the region state in ApplyFsm will - // change. Because there is no rollback in new implementation. - // This is a ERROR state. - let (mut cluster, _pd_client) = new_mock_cluster(0, 3); - cluster.run(); - - // We don't return Persist after handling CompactLog. 
- fail::cfg("no_persist_compact_log", "return").unwrap(); - for i in 0..10 { - let k = format!("k{}", i); - let v = format!("v{}", i); - cluster.must_put(k.as_bytes(), v.as_bytes()); - } - - for i in 0..10 { - let k = format!("k{}", i); - let v = format!("v{}", i); - check_key(&cluster, k.as_bytes(), v.as_bytes(), Some(true), None, None); - } - - let region = cluster.get_region(b"k1"); - let region_id = region.get_id(); - let prev_state = collect_all_states(&cluster, region_id); - let (compact_index, compact_term) = get_valid_compact_index(&prev_state); - let compact_log = test_raftstore::new_compact_log_request(compact_index, compact_term); - let req = - test_raftstore::new_admin_request(region_id, region.get_region_epoch(), compact_log); - let _ = cluster - .call_command_on_leader(req, Duration::from_secs(3)) - .unwrap(); - - // Wait for state applys. - std::thread::sleep(std::time::Duration::from_secs(2)); - - let new_state = collect_all_states(&cluster, region_id); - for i in prev_state.keys() { - let old = prev_state.get(i).unwrap(); - let new = new_state.get(i).unwrap(); - assert_ne!( - old.in_memory_apply_state.get_truncated_state(), - new.in_memory_apply_state.get_truncated_state() - ); - assert_eq!( - old.in_disk_apply_state.get_truncated_state(), - new.in_disk_apply_state.get_truncated_state() - ); - } - - fail::remove("no_persist_compact_log"); - cluster.shutdown(); - } - - #[test] - fn test_compact_log() { - let (mut cluster, _pd_client) = new_mock_cluster(0, 3); - - disable_auto_gen_compact_log(&mut cluster); - - cluster.run(); - - cluster.must_put(b"k", b"v"); - let region = cluster.get_region("k".as_bytes()); - let region_id = region.get_id(); - - fail::cfg("on_empty_cmd_normal", "return").unwrap(); - fail::cfg("try_flush_data", "return(0)").unwrap(); - for i in 0..10 { - let k = format!("k{}", i); - let v = format!("v{}", i); - cluster.must_put(k.as_bytes(), v.as_bytes()); - } - for i in 0..10 { - let k = format!("k{}", i); - let v = format!("v{}", 
i); - check_key(&cluster, k.as_bytes(), v.as_bytes(), Some(true), None, None); - } - - std::thread::sleep(std::time::Duration::from_millis(500)); - let prev_state = collect_all_states(&cluster, region_id); - - let (compact_index, compact_term) = get_valid_compact_index(&prev_state); - let compact_log = test_raftstore::new_compact_log_request(compact_index, compact_term); - let req = - test_raftstore::new_admin_request(region_id, region.get_region_epoch(), compact_log); - let res = cluster - .call_command_on_leader(req, Duration::from_secs(3)) - .unwrap(); - // compact index should less than applied index - assert!(!res.get_header().has_error(), "{:?}", res); - - // TODO(tiflash) Make sure compact log is filtered successfully. - // Can be abstract to a retry function. - std::thread::sleep(std::time::Duration::from_millis(500)); - - // CompactLog is filtered, because we can't flush data. - // However, we can still observe apply index advanced - let new_state = collect_all_states(&cluster, region_id); - for i in prev_state.keys() { - let old = prev_state.get(i).unwrap(); - let new = new_state.get(i).unwrap(); - assert_eq!( - old.in_memory_apply_state.get_truncated_state(), - new.in_memory_apply_state.get_truncated_state() - ); - assert_eq!( - old.in_disk_apply_state.get_truncated_state(), - new.in_disk_apply_state.get_truncated_state() - ); - assert_eq!( - old.in_memory_apply_state.get_applied_index() + 1, - new.in_memory_apply_state.get_applied_index() - ); - // Persist is before. 
- assert_eq!( - old.in_disk_apply_state.get_applied_index(), - new.in_disk_apply_state.get_applied_index() - ); - } - - fail::remove("on_empty_cmd_normal"); - fail::remove("try_flush_data"); - - let (compact_index, compact_term) = get_valid_compact_index(&new_state); - let prev_state = new_state; - let compact_log = test_raftstore::new_compact_log_request(compact_index, compact_term); - let req = - test_raftstore::new_admin_request(region_id, region.get_region_epoch(), compact_log); - let res = cluster - .call_command_on_leader(req, Duration::from_secs(3)) - .unwrap(); - assert!(!res.get_header().has_error(), "{:?}", res); - - cluster.must_put(b"kz", b"vz"); - check_key(&cluster, b"kz", b"vz", Some(true), None, None); - - // CompactLog is not filtered - let new_state = collect_all_states(&cluster, region_id); - for i in prev_state.keys() { - let old = prev_state.get(i).unwrap(); - let new = new_state.get(i).unwrap(); - assert_ne!( - old.in_memory_apply_state.get_truncated_state(), - new.in_memory_apply_state.get_truncated_state() - ); - assert_eq!( - old.in_memory_apply_state.get_applied_index() + 2, // compact log + (kz,vz) - new.in_memory_apply_state.get_applied_index() - ); - } - - cluster.shutdown(); - } - - #[test] - fn test_empty_cmd() { - // Test if a empty command can be observed when leadership changes. 
- let (mut cluster, _pd_client) = new_mock_cluster(0, 3); - // Disable compact log - cluster.cfg.raft_store.raft_log_gc_count_limit = Some(1000); - cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(10000); - cluster.cfg.raft_store.snap_apply_batch_size = ReadableSize(50000); - cluster.cfg.raft_store.raft_log_gc_threshold = 1000; - - let _ = cluster.run(); - - cluster.must_put(b"k1", b"v1"); - let region = cluster.get_region(b"k1"); - let region_id = region.get_id(); - let eng_ids = cluster - .engines - .iter() - .map(|e| e.0.to_owned()) - .collect::>(); - let peer_1 = find_peer(®ion, eng_ids[0]).cloned().unwrap(); - let peer_2 = find_peer(®ion, eng_ids[1]).cloned().unwrap(); - cluster.must_transfer_leader(region.get_id(), peer_1.clone()); - std::thread::sleep(std::time::Duration::from_secs(2)); - - check_key(&cluster, b"k1", b"v1", Some(true), None, None); - let prev_states = collect_all_states(&cluster, region_id); - - // We need forward empty cmd generated by leadership changing to TiFlash. 
- cluster.must_transfer_leader(region.get_id(), peer_2.clone()); - std::thread::sleep(std::time::Duration::from_secs(2)); - - let new_states = collect_all_states(&cluster, region_id); - for i in prev_states.keys() { - let old = prev_states.get(i).unwrap(); - let new = new_states.get(i).unwrap(); - assert_ne!(old.in_memory_apply_state, new.in_memory_apply_state); - assert_ne!(old.in_memory_applied_term, new.in_memory_applied_term); - } - - std::thread::sleep(std::time::Duration::from_secs(2)); - fail::cfg("on_empty_cmd_normal", "return").unwrap(); - - let prev_states = new_states; - cluster.must_transfer_leader(region.get_id(), peer_1.clone()); - std::thread::sleep(std::time::Duration::from_secs(2)); - - let new_states = collect_all_states(&cluster, region_id); - for i in prev_states.keys() { - let old = prev_states.get(i).unwrap(); - let new = new_states.get(i).unwrap(); - assert_eq!(old.in_memory_apply_state, new.in_memory_apply_state); - assert_eq!(old.in_memory_applied_term, new.in_memory_applied_term); - } - - fail::remove("on_empty_cmd_normal"); - - cluster.shutdown(); - } -} - mod ingest { use sst_importer::SstImporter; use test_sst_importer::gen_sst_file_with_kvs; @@ -1050,75 +315,137 @@ mod ingest { let req = new_ingest_sst_cmd(meta5); let _ = cluster.request(b"k5", vec![req], false, Duration::from_secs(5), true); - check_key(&cluster, b"k1_66", b"2", Some(true), Some(false), None); - check_key(&cluster, b"k5_66", b"2", Some(true), Some(false), None); + check_key(&cluster, b"k1_66", b"2", Some(true), Some(false), None); + check_key(&cluster, b"k5_66", b"2", Some(true), Some(false), None); + + let new_states1 = collect_all_states(&cluster, region1.get_id()); + let new_states5 = collect_all_states(&cluster, region5.get_id()); + must_altered_memory_apply_state(&prev_states1, &new_states1); + must_unaltered_memory_apply_term(&prev_states1, &new_states1); + must_unaltered_disk_apply_state(&prev_states1, &new_states1); + + 
must_altered_memory_apply_state(&prev_states5, &new_states5); + must_unaltered_memory_apply_term(&prev_states5, &new_states5); + must_unaltered_disk_apply_state(&prev_states5, &new_states5); + let prev_states1 = new_states1; + let prev_states5 = new_states5; + // Not deleted + assert!(file1.as_path().is_file()); + assert!(file5.as_path().is_file()); + fail::remove("on_handle_ingest_sst_return"); + + let (file11, meta11, sst_path11) = make_sst( + &cluster, + region1.get_id(), + region1.get_region_epoch().clone(), + (200..300).map(|i| format!("k1_{}", i)).collect::>(), + ); + assert!(sst_path11.as_path().is_file()); + + let req = new_ingest_sst_cmd(meta11); + let _ = cluster.request(b"k1", vec![req], false, Duration::from_secs(5), true); + + check_key(&cluster, b"k1_222", b"2", Some(true), None, None); + check_key(&cluster, b"k5_66", b"2", Some(false), None, None); + + let new_states1 = collect_all_states(&cluster, region1.get_id()); + let new_states5 = collect_all_states(&cluster, region5.get_id()); + // Region 1 is persisted. + must_altered_memory_apply_state(&prev_states1, &new_states1); + must_unaltered_memory_apply_term(&prev_states1, &new_states1); + must_altered_disk_apply_state(&prev_states1, &new_states1); + // Region 5 not persisted yet. + must_unaltered_disk_apply_state(&prev_states5, &new_states5); + // file1 and file11 for region 1 is deleted. + assert!(!file1.as_path().is_file()); + assert!(!file11.as_path().is_file()); + assert!(file5.as_path().is_file()); + + // ssp_path1/11/5 share one path. + std::fs::remove_file(sst_path1.as_path()).unwrap(); + cluster.shutdown(); + } +} + +mod restart { + use super::*; + + #[test] + fn test_snap_append_restart() { + let (mut cluster, pd_client) = new_mock_cluster(0, 3); + + disable_auto_gen_compact_log(&mut cluster); + cluster.cfg.raft_store.max_snapshot_file_raw_size = ReadableSize(u64::MAX); + + // Disable default max peer count check. 
+ pd_client.disable_default_operator(); + let r1 = cluster.run_conf_change(); + + let first_value = vec![0; 10240]; + for i in 0..10 { + let key = format!("{:03}", i); + cluster.must_put(key.as_bytes(), &first_value); + } + let first_key: &[u8] = b"000"; + + let eng_ids = cluster + .engines + .iter() + .map(|e| e.0.to_owned()) + .collect::>(); + + let engine_2 = cluster.get_engine(eng_ids[1]); + pd_client.must_add_peer(r1, new_peer(eng_ids[1], eng_ids[1])); + must_get_equal(&engine_2, first_key, first_value.as_slice()); + + fail::cfg("apply_pending_snapshot", "return").unwrap(); + tikv_util::info!("engine_3 is {}", eng_ids[2]); + let engine_3 = cluster.get_engine(eng_ids[2]); + must_get_none(&engine_3, first_key); + pd_client.must_add_peer(r1, new_peer(eng_ids[2], eng_ids[2])); + + std::thread::sleep(std::time::Duration::from_millis(1000)); + { + let (key, value) = (b"k2", b"v2"); + cluster.must_put(key, value); + check_key( + &cluster, + key, + value, + Some(true), + None, + Some(vec![eng_ids[0], eng_ids[1]]), + ); + let new_states = maybe_collect_states(&cluster, r1, None); + // engine_3 has not applied snapshot. + assert!(new_states.get(&eng_ids[2]).is_none()); + // engine_2 has applied snapshot. 
+ assert_eq!( + new_states + .get(&eng_ids[1]) + .unwrap() + .in_disk_region_state + .get_state(), + PeerState::Normal + ); + } + + stop_tiflash_node(&mut cluster, eng_ids[2]); + restart_tiflash_node(&mut cluster, eng_ids[2]); - let new_states1 = collect_all_states(&cluster, region1.get_id()); - let new_states5 = collect_all_states(&cluster, region5.get_id()); - for i in prev_states1.keys() { - let old = prev_states1.get(i).unwrap(); - let new = new_states1.get(i).unwrap(); - assert_ne!(old.in_memory_apply_state, new.in_memory_apply_state); - assert_eq!(old.in_memory_applied_term, new.in_memory_applied_term); - assert_eq!(old.in_disk_apply_state, new.in_disk_apply_state); - } - for i in prev_states5.keys() { - let old = prev_states5.get(i).unwrap(); - let new = new_states5.get(i).unwrap(); - assert_ne!(old.in_memory_apply_state, new.in_memory_apply_state); - assert_eq!(old.in_memory_applied_term, new.in_memory_applied_term); - assert_eq!(old.in_disk_apply_state, new.in_disk_apply_state); - } - let prev_states1 = new_states1; - let prev_states5 = new_states5; - // Not deleted - assert!(file1.as_path().is_file()); - assert!(file5.as_path().is_file()); - fail::remove("on_handle_ingest_sst_return"); + fail::remove("apply_pending_snapshot"); - let (file11, meta11, sst_path11) = make_sst( + check_key( &cluster, - region1.get_id(), - region1.get_region_epoch().clone(), - (200..300).map(|i| format!("k1_{}", i)).collect::>(), + first_key, + &first_value, + Some(true), + None, + Some(vec![eng_ids[2]]), ); - assert!(sst_path11.as_path().is_file()); - - let req = new_ingest_sst_cmd(meta11); - let _ = cluster.request(b"k1", vec![req], false, Duration::from_secs(5), true); - - check_key(&cluster, b"k1_222", b"2", Some(true), None, None); - check_key(&cluster, b"k5_66", b"2", Some(false), None, None); - - let new_states1 = collect_all_states(&cluster, region1.get_id()); - let new_states5 = collect_all_states(&cluster, region5.get_id()); - // Region 1 is persisted. 
- for i in prev_states1.keys() { - let old = prev_states1.get(i).unwrap(); - let new = new_states1.get(i).unwrap(); - assert_ne!(old.in_memory_apply_state, new.in_memory_apply_state); - assert_eq!(old.in_memory_applied_term, new.in_memory_applied_term); - assert_ne!(old.in_disk_apply_state, new.in_disk_apply_state); - } - // Region 5 not persisted yet. - for i in prev_states5.keys() { - let old = prev_states5.get(i).unwrap(); - let new = new_states5.get(i).unwrap(); - assert_eq!(old.in_disk_apply_state, new.in_disk_apply_state); - } - // file1 and file11 for region 1 is deleted. - assert!(!file1.as_path().is_file()); - assert!(!file11.as_path().is_file()); - assert!(file5.as_path().is_file()); - // ssp_path1/11/5 share one path. - std::fs::remove_file(sst_path1.as_path()).unwrap(); cluster.shutdown(); } -} - -mod restart { - use super::*; /// This test is currently not valid, since we can't abort in apply_snap by /// failpoint now. @@ -1163,32 +490,9 @@ mod restart { // So we have to disable this test. 
// std::thread::sleep(std::time::Duration::from_millis(2500)); - info!("stop node {}", eng_ids[1]); - cluster.stop_node(eng_ids[1]); - { - let lock = cluster.ffi_helper_set.lock(); - lock.unwrap() - .deref_mut() - .get_mut(&eng_ids[1]) - .unwrap() - .engine_store_server - .stop(); - } - + stop_tiflash_node(&mut cluster, eng_ids[1]); fail::remove("on_ob_pre_handle_snapshot"); - fail::remove("on_ob_post_apply_snapshot"); - info!("resume node {}", eng_ids[1]); - { - let lock = cluster.ffi_helper_set.lock(); - lock.unwrap() - .deref_mut() - .get_mut(&eng_ids[1]) - .unwrap() - .engine_store_server - .restore(); - } - info!("restored node {}", eng_ids[1]); - cluster.run_node(eng_ids[1]).unwrap(); + restart_tiflash_node(&mut cluster, eng_ids[1]); let (key, value) = (b"k2", b"v2"); cluster.must_put(key, value); @@ -1285,30 +589,8 @@ mod restart { ); } - info!("stop node {}", eng_ids[0]); - cluster.stop_node(eng_ids[0]); - { - let lock = cluster.ffi_helper_set.lock(); - lock.unwrap() - .deref_mut() - .get_mut(&eng_ids[0]) - .unwrap() - .engine_store_server - .stop(); - } - - info!("resume node {}", eng_ids[0]); - { - let lock = cluster.ffi_helper_set.lock(); - lock.unwrap() - .deref_mut() - .get_mut(&eng_ids[0]) - .unwrap() - .engine_store_server - .restore(); - } - info!("restored node {}", eng_ids[0]); - cluster.run_node(eng_ids[0]).unwrap(); + stop_tiflash_node(&mut cluster, eng_ids[0]); + restart_tiflash_node(&mut cluster, eng_ids[0]); std::thread::sleep(std::time::Duration::from_millis(2000)); @@ -1331,394 +613,6 @@ mod restart { } } -mod snapshot { - use super::*; - - #[test] - fn test_huge_multi_snapshot() { - test_huge_snapshot(true) - } - - #[test] - fn test_huge_normal_snapshot() { - test_huge_snapshot(false) - } - - fn test_huge_snapshot(is_multi: bool) { - let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); - assert_eq!(cluster.cfg.proxy_cfg.raft_store.snap_handle_pool_size, 2); - - fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); - 
disable_auto_gen_compact_log(&mut cluster); - cluster.cfg.raft_store.max_snapshot_file_raw_size = if is_multi { - ReadableSize(1024 * 1024) - } else { - ReadableSize(u64::MAX) - }; - - // Disable default max peer count check. - pd_client.disable_default_operator(); - let r1 = cluster.run_conf_change(); - - let first_value = vec![0; 10240]; - // at least 4m data - for i in 0..400 { - let key = format!("{:03}", i); - cluster.must_put(key.as_bytes(), &first_value); - } - let first_key: &[u8] = b"000"; - - let eng_ids = cluster - .engines - .iter() - .map(|e| e.0.to_owned()) - .collect::>(); - tikv_util::info!("engine_2 is {}", eng_ids[1]); - let engine_2 = cluster.get_engine(eng_ids[1]); - must_get_none(&engine_2, first_key); - // add peer (engine_2,engine_2) to region 1. - pd_client.must_add_peer(r1, new_peer(eng_ids[1], eng_ids[1])); - - { - let (key, value) = (b"k2", b"v2"); - cluster.must_put(key, value); - // we can get in memory, since snapshot is pre handled, though it is not - // persisted - check_key( - &cluster, - key, - value, - Some(true), - None, - Some(vec![eng_ids[1]]), - ); - let engine_2 = cluster.get_engine(eng_ids[1]); - // now snapshot must be applied on peer engine_2 - must_get_equal(&engine_2, first_key, first_value.as_slice()); - - // engine 3 will not exec post apply snapshot. - fail::cfg("on_ob_post_apply_snapshot", "pause").unwrap(); - - tikv_util::info!("engine_3 is {}", eng_ids[2]); - let engine_3 = cluster.get_engine(eng_ids[2]); - must_get_none(&engine_3, first_key); - pd_client.must_add_peer(r1, new_peer(eng_ids[2], eng_ids[2])); - - std::thread::sleep(std::time::Duration::from_millis(500)); - // We have not apply pre handled snapshot, - // we can't be sure if it exists in only get from memory too, since pre handle - // snapshot is async. 
- must_get_none(&engine_3, first_key); - fail::remove("on_ob_post_apply_snapshot"); - - std::thread::sleep(std::time::Duration::from_millis(500)); - tikv_util::info!("put to engine_3"); - let (key, value) = (b"k3", b"v3"); - cluster.must_put(key, value); - tikv_util::info!("check engine_3"); - check_key(&cluster, key, value, Some(true), None, None); - } - - fail::remove("on_can_apply_snapshot"); - - cluster.shutdown(); - } - - #[test] - fn test_concurrent_snapshot() { - let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); - assert_eq!(cluster.cfg.proxy_cfg.raft_store.snap_handle_pool_size, 2); - disable_auto_gen_compact_log(&mut cluster); - - // Disable default max peer count check. - pd_client.disable_default_operator(); - - let r1 = cluster.run_conf_change(); - cluster.must_put(b"k1", b"v1"); - pd_client.must_add_peer(r1, new_peer(2, 2)); - // Force peer 2 to be followers all the way. - cluster.add_send_filter(CloneFilterFactory( - RegionPacketFilter::new(r1, 2) - .msg_type(MessageType::MsgRequestVote) - .direction(Direction::Send), - )); - cluster.must_transfer_leader(r1, new_peer(1, 1)); - cluster.must_put(b"k3", b"v3"); - // Pile up snapshots of overlapped region ranges and deliver them all at once. - let (tx, rx) = mpsc::channel(); - cluster - .sim - .wl() - .add_recv_filter(3, Box::new(CollectSnapshotFilter::new(tx))); - pd_client.must_add_peer(r1, new_peer(3, 3)); - // Ensure the snapshot of range ("", "") is sent and piled in filter. - if let Err(e) = rx.recv_timeout(Duration::from_secs(1)) { - panic!("the snapshot is not sent before split, e: {:?}", e); - } - - // Occasionally fails. - // let region1 = cluster.get_region(b"k1"); - // // Split the region range and then there should be another snapshot for the - // split ranges. cluster.must_split(®ion, b"k2"); - // check_key(&cluster, b"k3", b"v3", None, Some(true), Some(vec![3])); - // - // // Ensure the regions work after split. 
- // cluster.must_put(b"k11", b"v11"); - // check_key(&cluster, b"k11", b"v11", Some(true), None, Some(vec![3])); - // cluster.must_put(b"k4", b"v4"); - // check_key(&cluster, b"k4", b"v4", Some(true), None, Some(vec![3])); - - cluster.shutdown(); - } - - fn new_split_region_cluster(count: u64) -> (Cluster, Arc) { - let (mut cluster, pd_client) = new_mock_cluster(0, 3); - // Disable raft log gc in this test case. - cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(60); - // Disable default max peer count check. - pd_client.disable_default_operator(); - - let _ = cluster.run_conf_change(); - for i in 0..count { - let k = format!("k{:0>4}", 2 * i + 1); - let v = format!("v{}", 2 * i + 1); - cluster.must_put(k.as_bytes(), v.as_bytes()); - } - - // k1 in [ , ] splited by k2 -> (, k2] [k2, ) - // k3 in [k2, ) splited by k4 -> [k2, k4) [k4, ) - for i in 0..count { - let k = format!("k{:0>4}", 2 * i + 1); - let region = cluster.get_region(k.as_bytes()); - let sp = format!("k{:0>4}", 2 * i + 2); - cluster.must_split(®ion, sp.as_bytes()); - } - - (cluster, pd_client) - } - - #[test] - fn test_prehandle_fail() { - let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); - assert_eq!(cluster.cfg.proxy_cfg.raft_store.snap_handle_pool_size, 2); - - // Disable raft log gc in this test case. - cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(60); - - // Disable default max peer count check. - pd_client.disable_default_operator(); - let r1 = cluster.run_conf_change(); - cluster.must_put(b"k1", b"v1"); - - let eng_ids = cluster - .engines - .iter() - .map(|e| e.0.to_owned()) - .collect::>(); - // If we fail to call pre-handle snapshot, we can still handle it when apply - // snapshot. 
- fail::cfg("before_actually_pre_handle", "return").unwrap(); - pd_client.must_add_peer(r1, new_peer(eng_ids[1], eng_ids[1])); - check_key( - &cluster, - b"k1", - b"v1", - Some(true), - Some(true), - Some(vec![eng_ids[1]]), - ); - fail::remove("before_actually_pre_handle"); - - // If we failed in apply snapshot(not panic), even if per_handle_snapshot is not - // called. - fail::cfg("on_ob_pre_handle_snapshot", "return").unwrap(); - check_key( - &cluster, - b"k1", - b"v1", - Some(false), - Some(false), - Some(vec![eng_ids[2]]), - ); - pd_client.must_add_peer(r1, new_peer(eng_ids[2], eng_ids[2])); - check_key( - &cluster, - b"k1", - b"v1", - Some(true), - Some(true), - Some(vec![eng_ids[2]]), - ); - fail::remove("on_ob_pre_handle_snapshot"); - - cluster.shutdown(); - } - - #[test] - fn test_split_merge() { - let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); - assert_eq!(cluster.cfg.proxy_cfg.raft_store.snap_handle_pool_size, 2); - - // Can always apply snapshot immediately - fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); - cluster.cfg.raft_store.right_derive_when_split = true; - - // May fail if cluster.start, since node 2 is not in region1.peers(), - // and node 2 has not bootstrap region1, - // because region1 is not bootstrap if we only call cluster.start() - cluster.run(); - - cluster.must_put(b"k1", b"v1"); - cluster.must_put(b"k3", b"v3"); - - check_key(&cluster, b"k1", b"v1", Some(true), None, None); - check_key(&cluster, b"k3", b"v3", Some(true), None, None); - - let r1 = cluster.get_region(b"k1"); - let r3 = cluster.get_region(b"k3"); - assert_eq!(r1.get_id(), r3.get_id()); - - cluster.must_split(&r1, b"k2"); - let r1_new = cluster.get_region(b"k1"); - let r3_new = cluster.get_region(b"k3"); - - assert_eq!(r1.get_id(), r3_new.get_id()); - - iter_ffi_helpers(&cluster, None, &mut |id: u64, _, ffi: &mut FFIHelperSet| { - let server = &ffi.engine_store_server; - if !server.kvstore.contains_key(&r1_new.get_id()) { - panic!("node {} has no 
region {}", id, r1_new.get_id()) - } - if !server.kvstore.contains_key(&r3_new.get_id()) { - panic!("node {} has no region {}", id, r3_new.get_id()) - } - // Region meta must equal - assert_eq!(server.kvstore.get(&r1_new.get_id()).unwrap().region, r1_new); - assert_eq!(server.kvstore.get(&r3_new.get_id()).unwrap().region, r3_new); - - // Can get from disk - check_key(&cluster, b"k1", b"v1", None, Some(true), None); - check_key(&cluster, b"k3", b"v3", None, Some(true), None); - // TODO Region in memory data must not contradict, but now we do not - // delete data - }); - - pd_client.must_merge(r1_new.get_id(), r3_new.get_id()); - let _r1_new2 = cluster.get_region(b"k1"); - let r3_new2 = cluster.get_region(b"k3"); - - iter_ffi_helpers(&cluster, None, &mut |id: u64, _, ffi: &mut FFIHelperSet| { - let server = &ffi.engine_store_server; - - // The left region is removed - if server.kvstore.contains_key(&r1_new.get_id()) { - panic!("node {} should has no region {}", id, r1_new.get_id()) - } - if !server.kvstore.contains_key(&r3_new.get_id()) { - panic!("node {} has no region {}", id, r3_new.get_id()) - } - // Region meta must equal - assert_eq!( - server.kvstore.get(&r3_new2.get_id()).unwrap().region, - r3_new2 - ); - - // Can get from disk - check_key(&cluster, b"k1", b"v1", None, Some(true), None); - check_key(&cluster, b"k3", b"v3", None, Some(true), None); - // TODO Region in memory data must not contradict, but now we do not delete data - - let origin_epoch = r3_new.get_region_epoch(); - let new_epoch = r3_new2.get_region_epoch(); - // PrepareMerge + CommitMerge, so it should be 2. 
- assert_eq!(new_epoch.get_version(), origin_epoch.get_version() + 2); - assert_eq!(new_epoch.get_conf_ver(), origin_epoch.get_conf_ver()); - }); - - fail::remove("on_can_apply_snapshot"); - cluster.shutdown(); - } - - #[test] - fn test_basic_concurrent_snapshot() { - let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); - assert_eq!(cluster.cfg.proxy_cfg.raft_store.snap_handle_pool_size, 2); - - disable_auto_gen_compact_log(&mut cluster); - - // Disable default max peer count check. - pd_client.disable_default_operator(); - - let _ = cluster.run_conf_change(); - cluster.must_put(b"k1", b"v1"); - cluster.must_put(b"k3", b"v3"); - - let region1 = cluster.get_region(b"k1"); - cluster.must_split(®ion1, b"k2"); - let r1 = cluster.get_region(b"k1").get_id(); - let r3 = cluster.get_region(b"k3").get_id(); - - fail::cfg("before_actually_pre_handle", "sleep(1000)").unwrap(); - tikv_util::info!("region k1 {} k3 {}", r1, r3); - let pending_count = cluster - .engines - .get(&2) - .unwrap() - .kv - .pending_applies_count - .clone(); - pd_client.add_peer(r1, new_peer(2, 2)); - pd_client.add_peer(r3, new_peer(2, 2)); - // handle_pending_applies will do nothing. - fail::cfg("apply_pending_snapshot", "return").unwrap(); - // wait snapshot is generated. - std::thread::sleep(std::time::Duration::from_millis(500)); - // Now, region k1 and k3 are not handled, since pre-handle process is not - // finished. This is because `pending_applies_count` is not greater than - // `snap_handle_pool_size`, So there are no `handle_pending_applies` - // until `on_timeout`. - - fail::remove("apply_pending_snapshot"); - assert_eq!(pending_count.load(Ordering::SeqCst), 2); - std::thread::sleep(std::time::Duration::from_millis(600)); - check_key(&cluster, b"k1", b"v1", None, Some(true), Some(vec![1, 2])); - check_key(&cluster, b"k3", b"v3", None, Some(true), Some(vec![1, 2])); - // Now, k1 and k3 are handled. 
- assert_eq!(pending_count.load(Ordering::SeqCst), 0); - - fail::remove("before_actually_pre_handle"); - - cluster.shutdown(); - } - - #[test] - fn test_many_concurrent_snapshot() { - let c = 4; - let (mut cluster, pd_client) = new_split_region_cluster(c); - - for i in 0..c { - let k = format!("k{:0>4}", 2 * i + 1); - let region_id = cluster.get_region(k.as_bytes()).get_id(); - pd_client.must_add_peer(region_id, new_peer(2, 2)); - } - - for i in 0..c { - let k = format!("k{:0>4}", 2 * i + 1); - let v = format!("v{}", 2 * i + 1); - check_key( - &cluster, - k.as_bytes(), - v.as_bytes(), - Some(true), - Some(true), - Some(vec![2]), - ); - } - - cluster.shutdown(); - } -} - mod persist { use super::*; @@ -1736,18 +630,8 @@ mod persist { cluster.must_put(b"k1", b"v1"); check_key(&cluster, b"k1", b"v1", Some(true), Some(false), None); let new_states = collect_all_states(&cluster, region_id); - for i in prev_states.keys() { - let old = prev_states.get(i).unwrap(); - let new = new_states.get(i).unwrap(); - assert_eq!( - old.in_memory_apply_state.get_applied_index() + 1, - new.in_memory_apply_state.get_applied_index() - ); - assert_eq!( - old.in_disk_apply_state.get_applied_index(), - new.in_disk_apply_state.get_applied_index() - ); - } + must_altered_memory_apply_index(&prev_states, &new_states, 1); + must_altered_disk_apply_index(&prev_states, &new_states, 0); fail::cfg("on_pre_persist_with_finish", "return").unwrap(); cluster.must_put(b"k2", b"v2"); @@ -1764,15 +648,7 @@ mod persist { // TODO(tiflash) wait `write_apply_state` in raftstore. 
std::thread::sleep(std::time::Duration::from_millis(1000)); let new_states = collect_all_states(&cluster, region_id); - for i in prev_states.keys() { - let old = prev_states.get(i).unwrap(); - let new = new_states.get(i).unwrap(); - let gap = new.in_memory_apply_state.get_applied_index() - - old.in_memory_apply_state.get_applied_index(); - let gap2 = new.in_disk_apply_state.get_applied_index() - - old.in_disk_apply_state.get_applied_index(); - assert_eq!(gap, gap2); - } + must_apply_index_advanced_diff(&prev_states, &new_states, 0); fail::remove("on_pre_persist_with_finish"); } @@ -1809,14 +685,6 @@ mod persist { let new_states = collect_all_states(&cluster, r3_new.get_id()); // index 6 empty command // index 7 CommitMerge - for i in prev_states.keys() { - let old = prev_states.get(i).unwrap(); - let new = new_states.get(i).unwrap(); - let _gap = new.in_memory_apply_state.get_applied_index() - - old.in_memory_apply_state.get_applied_index(); - let gap2 = new.in_disk_apply_state.get_applied_index() - - old.in_disk_apply_state.get_applied_index(); - assert_eq!(gap2, 2); - } + must_altered_disk_apply_index(&prev_states, &new_states, 2); } } diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index d3e15a0750c..2fd0de1ef72 100644 --- a/proxy_tests/proxy/proxy.rs +++ b/proxy_tests/proxy/proxy.rs @@ -3,14 +3,18 @@ pub use std::{ collections::HashMap, io::Write, + iter::FromIterator, ops::DerefMut, path::{Path, PathBuf}, str::FromStr, sync::{atomic::Ordering, mpsc, Arc, RwLock}, }; +pub use collections::HashSet; pub use engine_store_ffi::{KVGetStatus, RaftStoreProxyFFI}; -pub use engine_traits::{MiscExt, CF_DEFAULT, CF_LOCK, CF_WRITE}; +pub use engine_traits::{ + MiscExt, Mutable, RaftEngineDebug, RaftLogBatch, WriteBatch, CF_DEFAULT, CF_LOCK, CF_WRITE, +}; // use engine_store_ffi::config::{ensure_no_common_unrecognized_keys, ProxyConfig}; pub use engine_traits::{Peekable, CF_RAFT}; pub use kvproto::{ @@ -18,48 +22,30 @@ pub use kvproto::{ metapb, 
metapb::RegionEpoch, raft_cmdpb::{AdminCmdType, AdminRequest, CmdType, Request}, - raft_serverpb::{RaftApplyState, RegionLocalState, StoreIdent}, + raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState, StoreIdent}, }; pub use new_mock_engine_store::{ config::Config, + get_apply_state, get_raft_local_state, get_region_local_state, make_new_region, mock_cluster::{new_put_cmd, new_request, FFIHelperSet}, must_get_equal, must_get_none, node::NodeCluster, transport_simulate::{ CloneFilterFactory, CollectSnapshotFilter, Direction, RegionPacketFilter, }, - Cluster, ProxyConfig, Simulator, TestPdClient, + write_kv_in_mem, Cluster, ProxyConfig, RegionStats, Simulator, TestPdClient, }; +pub use raft::eraftpb::{ConfChangeType, MessageType}; pub use raftstore::coprocessor::ConsistencyCheckMethod; -pub use test_raftstore::new_peer; +pub use test_raftstore::{new_learner_peer, new_peer}; pub use tikv_util::{ + box_err, box_try, config::{ReadableDuration, ReadableSize}, + store::{find_peer, find_peer_by_id}, time::Duration, + HandyRwLock, }; -// TODO Need refactor if moved to raft-engine -pub fn get_region_local_state( - engine: &engine_rocks::RocksEngine, - region_id: u64, -) -> RegionLocalState { - let region_state_key = keys::region_state_key(region_id); - let region_state = match engine.get_msg_cf::(CF_RAFT, ®ion_state_key) { - Ok(Some(s)) => s, - _ => unreachable!(), - }; - region_state -} - -// TODO Need refactor if moved to raft-engine -pub fn get_apply_state(engine: &engine_rocks::RocksEngine, region_id: u64) -> RaftApplyState { - let apply_state_key = keys::apply_state_key(region_id); - let apply_state = match engine.get_msg_cf::(CF_RAFT, &apply_state_key) { - Ok(Some(s)) => s, - _ => unreachable!(), - }; - apply_state -} - pub fn new_compute_hash_request() -> AdminRequest { let mut req = AdminRequest::default(); req.set_cmd_type(AdminCmdType::ComputeHash); @@ -82,54 +68,60 @@ pub struct States { pub in_memory_applied_term: u64, pub 
in_disk_apply_state: RaftApplyState, pub in_disk_region_state: RegionLocalState, + pub in_disk_raft_state: RaftLocalState, pub ident: StoreIdent, } -pub fn iter_ffi_helpers( - cluster: &Cluster, +pub fn iter_ffi_helpers>( + cluster: &Cluster, store_ids: Option>, f: &mut dyn FnMut(u64, &engine_rocks::RocksEngine, &mut FFIHelperSet) -> (), ) { - let ids = match store_ids { - Some(ids) => ids, - None => cluster.engines.keys().map(|e| *e).collect::>(), - }; - for id in ids { - let engine = cluster.get_engine(id); - let mut lock = cluster.ffi_helper_set.lock().unwrap(); - let ffiset = lock.get_mut(&id).unwrap(); - f(id, &engine, ffiset); - } + cluster.iter_ffi_helpers(store_ids, f); } -pub fn collect_all_states(cluster: &Cluster, region_id: u64) -> HashMap { +pub fn maybe_collect_states( + cluster: &Cluster, + region_id: u64, + store_ids: Option>, +) -> HashMap { let mut prev_state: HashMap = HashMap::default(); iter_ffi_helpers( cluster, - None, + store_ids, &mut |id: u64, engine: &engine_rocks::RocksEngine, ffi: &mut FFIHelperSet| { let server = &ffi.engine_store_server; - let region = server.kvstore.get(®ion_id).unwrap(); - let ident = match engine.get_msg::(keys::STORE_IDENT_KEY) { - Ok(Some(i)) => (i), - _ => unreachable!(), - }; - prev_state.insert( - id, - States { - in_memory_apply_state: region.apply_state.clone(), - in_memory_applied_term: region.applied_term, - in_disk_apply_state: get_apply_state(&engine, region_id), - in_disk_region_state: get_region_local_state(&engine, region_id), - ident, - }, - ); + let raft_engine = &cluster.get_engines(id).raft; + if let Some(region) = server.kvstore.get(®ion_id) { + let ident = match engine.get_msg::(keys::STORE_IDENT_KEY) { + Ok(Some(i)) => i, + _ => unreachable!(), + }; + prev_state.insert( + id, + States { + in_memory_apply_state: region.apply_state.clone(), + in_memory_applied_term: region.applied_term, + in_disk_apply_state: get_apply_state(&engine, region_id).unwrap(), + in_disk_region_state: 
get_region_local_state(&engine, region_id).unwrap(), + in_disk_raft_state: get_raft_local_state(raft_engine, region_id).unwrap(), + ident, + }, + ); + } }, ); prev_state } +pub fn collect_all_states(cluster: &Cluster, region_id: u64) -> HashMap { + let prev_state = maybe_collect_states(cluster, region_id, None); + assert_eq!(prev_state.len(), cluster.engines.keys().len()); + prev_state +} + pub fn new_mock_cluster(id: u64, count: usize) -> (Cluster, Arc) { + tikv_util::set_panic_hook(true, "./"); let pd_client = Arc::new(TestPdClient::new(0, false)); let sim = Arc::new(RwLock::new(NodeCluster::new(pd_client.clone()))); let mut cluster = Cluster::new(id, count, sim, pd_client.clone(), ProxyConfig::default()); @@ -152,7 +144,8 @@ pub fn new_mock_cluster_snap(id: u64, count: usize) -> (Cluster, Ar } pub fn must_get_mem( - engine_store_server: &Box, + cluster: &Cluster, + node_id: u64, region_id: u64, key: &[u8], value: Option<&[u8]>, @@ -160,27 +153,93 @@ pub fn must_get_mem( let last_res: Option<&Vec> = None; let cf = new_mock_engine_store::ffi_interfaces::ColumnFamilyType::Default; for _ in 1..300 { - let res = engine_store_server.get_mem(region_id, cf, &key.to_vec()); - - if let (Some(value), Some(last_res)) = (value, res) { - assert_eq!(value, &last_res[..]); - return; + let mut ok = false; + { + iter_ffi_helpers( + &cluster, + Some(vec![node_id]), + &mut |_, _, ffi: &mut FFIHelperSet| { + let server = &ffi.engine_store_server; + let res = server.get_mem(region_id, cf, &key.to_vec()); + if let (Some(value), Some(last_res)) = (value, res) { + assert_eq!(value, &last_res[..]); + ok = true; + return; + } + if value.is_none() && last_res.is_none() { + ok = true; + return; + } + }, + ); } - if value.is_none() && last_res.is_none() { + if ok { return; } + std::thread::sleep(std::time::Duration::from_millis(20)); } let s = std::str::from_utf8(key).unwrap_or(""); - panic!( + let e = format!( "can't get mem value {:?} for key {}({}) in store {} cf {:?}, actual {:?}", 
value.map(tikv_util::escape), log_wrappers::hex_encode_upper(key), s, - engine_store_server.id, + node_id, cf, last_res, - ) + ); + error!("{}", e); + panic!("{}", e); +} + +pub fn must_put_and_check_key_with_generator (String, String)>( + cluster: &mut Cluster, + gen: F, + from: u64, + to: u64, + in_mem: Option, + in_disk: Option, + engines: Option>, +) { + for i in from..to { + let (k, v) = gen(i); + cluster.must_put(k.as_bytes(), v.as_bytes()); + } + for i in from..to { + let (k, v) = gen(i); + check_key( + &cluster, + k.as_bytes(), + v.as_bytes(), + in_mem, + in_disk, + engines.clone(), + ); + } +} + +pub fn must_put_and_check_key( + cluster: &mut Cluster, + from: u64, + to: u64, + in_mem: Option, + in_disk: Option, + engines: Option>, +) { + must_put_and_check_key_with_generator( + cluster, + |i: u64| { + let k = format!("k{}", i); + let v = format!("v{}", i); + (k, v) + }, + from, + to, + in_mem, + in_disk, + engines.clone(), + ); } pub fn check_key( @@ -199,7 +258,7 @@ pub fn check_key( } }; for id in engine_keys { - let engine = &cluster.get_engine(id); + let engine = cluster.get_engine(id); match in_disk { Some(b) => { @@ -213,12 +272,10 @@ pub fn check_key( }; match in_mem { Some(b) => { - let lock = cluster.ffi_helper_set.lock().unwrap(); - let server = &lock.get(&id).unwrap().engine_store_server; if b { - must_get_mem(server, region_id, k, Some(v)); + must_get_mem(cluster, id, region_id, k, Some(v)); } else { - must_get_mem(server, region_id, k, None); + must_get_mem(cluster, id, region_id, k, None); } } None => (), @@ -281,8 +338,24 @@ pub fn check_apply_state( } pub fn get_valid_compact_index(states: &HashMap) -> (u64, u64) { + get_valid_compact_index_by(states, None) +} + +pub fn get_valid_compact_index_by( + states: &HashMap, + use_nodes: Option>, +) -> (u64, u64) { + let set = use_nodes.map_or(None, |nodes| { + Some(HashSet::from_iter(nodes.clone().into_iter())) + }); states .iter() + .filter(|(k, _)| { + if let Some(ref s) = set { + return 
s.contains(k); + } + true + }) .map(|(_, s)| { ( s.in_memory_apply_state.get_applied_index(), @@ -302,104 +375,281 @@ pub fn disable_auto_gen_compact_log(cluster: &mut Cluster) { cluster.cfg.raft_store.raft_log_gc_threshold = 10000; } -#[test] -fn test_kv_write() { - let (mut cluster, _pd_client) = new_mock_cluster(0, 3); +pub fn compare_states( + prev_states: &HashMap, + new_states: &HashMap, + f: F, +) { + for i in prev_states.keys() { + let old = prev_states.get(i).unwrap(); + let new = new_states.get(i).unwrap(); + f(old, new); + } +} - cluster.cfg.proxy_compat = false; - // No persist will be triggered by CompactLog - fail::cfg("no_persist_compact_log", "return").unwrap(); - let _ = cluster.run(); +pub fn must_unaltered_memory_apply_term( + prev_states: &HashMap, + new_states: &HashMap, +) { + let f = |old: &States, new: &States| { + assert_eq!(old.in_memory_applied_term, new.in_memory_applied_term); + }; + compare_states(prev_states, new_states, f); +} - cluster.must_put(b"k0", b"v0"); - // check_key(&cluster, b"k0", b"v0", Some(false), Some(false), None); +pub fn must_altered_memory_apply_term( + prev_states: &HashMap, + new_states: &HashMap, +) { + let f = |old: &States, new: &States| { + assert_ne!(old.in_memory_applied_term, new.in_memory_applied_term); + }; + compare_states(prev_states, new_states, f); +} - // We can read initial raft state, since we don't persist meta either. 
- let r1 = cluster.get_region(b"k0").get_id(); - let prev_states = collect_all_states(&mut cluster, r1); +pub fn must_unaltered_memory_apply_state( + prev_states: &HashMap, + new_states: &HashMap, +) { + let f = |old: &States, new: &States| { + assert_eq!(old.in_memory_apply_state, new.in_memory_apply_state); + }; + compare_states(prev_states, new_states, f); +} - for i in 1..10 { - let k = format!("k{}", i); - let v = format!("v{}", i); - cluster.must_put(k.as_bytes(), v.as_bytes()); - } +pub fn must_altered_memory_apply_state( + prev_states: &HashMap, + new_states: &HashMap, +) { + let f = |old: &States, new: &States| { + assert_ne!(old.in_memory_apply_state, new.in_memory_apply_state); + }; + compare_states(prev_states, new_states, f); +} - // Since we disable all observers, we can get nothing in either memory and disk. - for i in 0..10 { - let k = format!("k{}", i); - let v = format!("v{}", i); - check_key(&cluster, k.as_bytes(), v.as_bytes(), Some(true), None, None); - } +pub fn must_altered_memory_apply_index( + prev_states: &HashMap, + new_states: &HashMap, + apply_index_advanced: u64, +) { + let f = |old: &States, new: &States| { + assert_eq!( + old.in_memory_apply_state.get_applied_index() + apply_index_advanced, + new.in_memory_apply_state.get_applied_index() + ); + }; + compare_states(prev_states, new_states, f); +} + +pub fn must_altered_disk_apply_index( + prev_states: &HashMap, + new_states: &HashMap, + apply_index_advanced: u64, +) { + let f = |old: &States, new: &States| { + assert_eq!( + old.in_disk_apply_state.get_applied_index() + apply_index_advanced, + new.in_disk_apply_state.get_applied_index() + ); + }; + compare_states(prev_states, new_states, f); +} + +pub fn must_apply_index_advanced_diff( + prev_states: &HashMap, + new_states: &HashMap, + memory_more_advanced: u64, +) { + let f = |old: &States, new: &States| { + let gap = new.in_memory_apply_state.get_applied_index() + - old.in_memory_apply_state.get_applied_index(); + let gap2 = 
new.in_disk_apply_state.get_applied_index() + - old.in_disk_apply_state.get_applied_index(); + assert_eq!(gap, gap2 + memory_more_advanced); + }; + compare_states(prev_states, new_states, f); +} + +pub fn must_unaltered_disk_apply_state( + prev_states: &HashMap, + new_states: &HashMap, +) { + let f = |old: &States, new: &States| { + assert_eq!(old.in_disk_apply_state, new.in_disk_apply_state); + }; + compare_states(prev_states, new_states, f); +} + +pub fn must_altered_disk_apply_state( + prev_states: &HashMap, + new_states: &HashMap, +) { + let f = |old: &States, new: &States| { + assert_ne!(old.in_disk_apply_state, new.in_disk_apply_state); + }; + compare_states(prev_states, new_states, f); +} - let new_states = collect_all_states(&mut cluster, r1); - for id in cluster.engines.keys() { +pub fn must_altered_memory_truncated_state( + prev_states: &HashMap, + new_states: &HashMap, +) { + let f = |old: &States, new: &States| { assert_ne!( - &prev_states.get(id).unwrap().in_memory_apply_state, - &new_states.get(id).unwrap().in_memory_apply_state + old.in_memory_apply_state.get_truncated_state(), + new.in_memory_apply_state.get_truncated_state() ); + }; + compare_states(prev_states, new_states, f); +} + +pub fn must_unaltered_memory_truncated_state( + prev_states: &HashMap, + new_states: &HashMap, +) { + let f = |old: &States, new: &States| { assert_eq!( - &prev_states.get(id).unwrap().in_disk_apply_state, - &new_states.get(id).unwrap().in_disk_apply_state + old.in_memory_apply_state.get_truncated_state(), + new.in_memory_apply_state.get_truncated_state() ); - } + }; + compare_states(prev_states, new_states, f); +} - debug!("now CompactLog can persist"); - fail::remove("no_persist_compact_log"); +pub fn must_altered_disk_truncated_state( + prev_states: &HashMap, + new_states: &HashMap, +) { + let f = |old: &States, new: &States| { + assert_ne!( + old.in_disk_apply_state.get_truncated_state(), + new.in_disk_apply_state.get_truncated_state() + ); + }; + 
compare_states(prev_states, new_states, f); +} - let prev_states = collect_all_states(&mut cluster, r1); - // Write more after we force persist when CompactLog. - for i in 20..30 { - let k = format!("k{}", i); - let v = format!("v{}", i); - cluster.must_put(k.as_bytes(), v.as_bytes()); +pub fn must_unaltered_disk_truncated_state( + prev_states: &HashMap, + new_states: &HashMap, +) { + let f = |old: &States, new: &States| { + assert_eq!( + old.in_disk_apply_state.get_truncated_state(), + new.in_disk_apply_state.get_truncated_state() + ); + }; + compare_states(prev_states, new_states, f); +} + +// Must wait until all nodes satisfy cond given by `pref`. +pub fn must_wait_until_cond_states( + cluster: &Cluster, + region_id: u64, + prev_states: &HashMap, + pred: &dyn Fn(&States, &States) -> bool, +) -> HashMap { + let mut retry = 0; + loop { + let new_states = collect_all_states(&cluster, region_id); + let mut ok = true; + for i in prev_states.keys() { + let old = prev_states.get(i).unwrap(); + let new = new_states.get(i).unwrap(); + if !pred(old, new) { + ok = false; + break; + } + } + if ok { + break new_states; + } + std::thread::sleep(std::time::Duration::from_millis(100)); + retry += 1; + if retry >= 30 { + panic!("states not as expect after timeout") + } } +} - // We can read from mock-store's memory, we are not sure if we can read from - // disk, since there may be or may not be a CompactLog. - for i in 20..30 { - let k = format!("k{}", i); - let v = format!("v{}", i); - check_key(&cluster, k.as_bytes(), v.as_bytes(), Some(true), None, None); +// Must wait until some node satisfy cond given by `pref`. 
+pub fn must_wait_until_cond_node( + cluster: &Cluster, + region_id: u64, + store_ids: Option>, + pred: &dyn Fn(&States) -> bool, +) -> HashMap { + let mut retry = 0; + loop { + let new_states = maybe_collect_states(&cluster, region_id, store_ids.clone()); + if let Some(ref e) = store_ids { + assert_eq!(e.len(), new_states.len()); + } + let mut ok = true; + for i in new_states.keys() { + let new = new_states.get(i).unwrap(); + if !pred(new) { + ok = false; + break; + } + } + if ok { + break new_states; + } + std::thread::sleep(std::time::Duration::from_millis(100)); + retry += 1; + if retry >= 30 { + panic!("states not as expect after timeout") + } } +} - // Force a compact log to persist. - let region_r = cluster.get_region("k1".as_bytes()); - let region_id = region_r.get_id(); - let compact_log = test_raftstore::new_compact_log_request(100, 10); - let req = - test_raftstore::new_admin_request(region_id, region_r.get_region_epoch(), compact_log); - let res = cluster +pub fn force_compact_log( + cluster: &mut Cluster, + key: &[u8], + use_nodes: Option>, +) -> u64 { + let region = cluster.get_region(key); + let region_id = region.get_id(); + let prev_states = maybe_collect_states(&cluster, region_id, None); + + let (compact_index, compact_term) = get_valid_compact_index_by(&prev_states, use_nodes); + let compact_log = test_raftstore::new_compact_log_request(compact_index, compact_term); + let req = test_raftstore::new_admin_request(region_id, region.get_region_epoch(), compact_log); + let _ = cluster .call_command_on_leader(req, Duration::from_secs(3)) .unwrap(); - assert!(res.get_header().has_error(), "{:?}", res); + return compact_index; +} - for i in 20..30 { - let k = format!("k{}", i); - let v = format!("v{}", i); - check_key( +pub fn stop_tiflash_node(cluster: &mut Cluster, node_id: u64) { + info!("stop node {}", node_id); + { + cluster.stop_node(node_id); + } + { + iter_ffi_helpers( &cluster, - k.as_bytes(), - v.as_bytes(), - Some(true), - Some(true), - None, 
+ Some(vec![node_id]), + &mut |_, _, ffi: &mut FFIHelperSet| { + let server = &mut ffi.engine_store_server; + server.stop(); + }, ); } +} - let new_states = collect_all_states(&mut cluster, r1); - - // apply_state is changed in memory, and persisted. - for id in cluster.engines.keys() { - assert_ne!( - &prev_states.get(id).unwrap().in_memory_apply_state, - &new_states.get(id).unwrap().in_memory_apply_state - ); - assert_ne!( - &prev_states.get(id).unwrap().in_disk_apply_state, - &new_states.get(id).unwrap().in_disk_apply_state +pub fn restart_tiflash_node(cluster: &mut Cluster, node_id: u64) { + info!("restored node {}", node_id); + { + iter_ffi_helpers( + &cluster, + Some(vec![node_id]), + &mut |_, _, ffi: &mut FFIHelperSet| { + let server = &mut ffi.engine_store_server; + server.restore(); + }, ); } - - fail::remove("no_persist_compact_log"); - cluster.shutdown(); + cluster.run_node(node_id).unwrap(); } diff --git a/proxy_tests/proxy/region.rs b/proxy_tests/proxy/region.rs new file mode 100644 index 00000000000..f93834c6423 --- /dev/null +++ b/proxy_tests/proxy/region.rs @@ -0,0 +1,615 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +use std::iter::FromIterator; + +use collections::HashSet; + +use crate::proxy::*; + +#[test] +fn test_handle_destroy() { + let (mut cluster, pd_client) = new_mock_cluster(0, 3); + + disable_auto_gen_compact_log(&mut cluster); + + // Disable default max peer count check. 
+ pd_client.disable_default_operator(); + + cluster.run(); + cluster.must_put(b"k1", b"v1"); + let eng_ids = cluster + .engines + .iter() + .map(|e| e.0.to_owned()) + .collect::>(); + + let region = cluster.get_region(b"k1"); + let region_id = region.get_id(); + let peer_1 = find_peer(®ion, eng_ids[0]).cloned().unwrap(); + let peer_2 = find_peer(®ion, eng_ids[1]).cloned().unwrap(); + cluster.must_transfer_leader(region_id, peer_1); + + iter_ffi_helpers( + &cluster, + Some(vec![eng_ids[1]]), + &mut |_, _, ffi: &mut FFIHelperSet| { + let server = &ffi.engine_store_server; + assert!(server.kvstore.contains_key(®ion_id)); + }, + ); + + pd_client.must_remove_peer(region_id, peer_2); + + check_key( + &cluster, + b"k1", + b"v2", + Some(false), + None, + Some(vec![eng_ids[1]]), + ); + + std::thread::sleep(std::time::Duration::from_millis(100)); + // Region removed in server. + iter_ffi_helpers( + &cluster, + Some(vec![eng_ids[1]]), + &mut |_, _, ffi: &mut FFIHelperSet| { + let server = &ffi.engine_store_server; + assert!(!server.kvstore.contains_key(®ion_id)); + }, + ); + + cluster.shutdown(); +} + +#[test] +fn test_get_region_local_state() { + let (mut cluster, _pd_client) = new_mock_cluster(0, 3); + + cluster.run(); + + let k = b"k1"; + let v = b"v1"; + cluster.must_put(k, v); + check_key(&cluster, k, v, Some(true), None, None); + let region_id = cluster.get_region(k).get_id(); + + // Get RegionLocalState through ffi + unsafe { + iter_ffi_helpers( + &cluster, + None, + &mut |_id: u64, _, ffi_set: &mut FFIHelperSet| { + let f = ffi_set.proxy_helper.fn_get_region_local_state.unwrap(); + let mut state = kvproto::raft_serverpb::RegionLocalState::default(); + let mut error_msg = new_mock_engine_store::RawCppStringPtrGuard::default(); + + assert_eq!( + f( + ffi_set.proxy_helper.proxy_ptr, + region_id, + &mut state as *mut _ as _, + error_msg.as_mut(), + ), + KVGetStatus::Ok + ); + assert!(state.has_region()); + assert_eq!(state.get_state(), 
kvproto::raft_serverpb::PeerState::Normal); + assert!(error_msg.as_ref().is_null()); + + let mut state = kvproto::raft_serverpb::RegionLocalState::default(); + assert_eq!( + f( + ffi_set.proxy_helper.proxy_ptr, + 0, // not exist + &mut state as *mut _ as _, + error_msg.as_mut(), + ), + KVGetStatus::NotFound + ); + assert!(!state.has_region()); + assert!(error_msg.as_ref().is_null()); + + ffi_set + .proxy + .get_value_cf("none_cf", "123".as_bytes(), |value| { + let msg = value.unwrap_err(); + assert_eq!(msg, "Storage Engine Status { code: IoError, sub_code: None, sev: NoError, state: \"cf none_cf not found\" }"); + }); + ffi_set + .proxy + .get_value_cf("raft", "123".as_bytes(), |value| { + let res = value.unwrap(); + assert!(res.is_none()); + }); + + // If we have no kv engine. + ffi_set.proxy.set_kv_engine(None); + let res = ffi_set.proxy_helper.fn_get_region_local_state.unwrap()( + ffi_set.proxy_helper.proxy_ptr, + region_id, + &mut state as *mut _ as _, + error_msg.as_mut(), + ); + assert_eq!(res, KVGetStatus::Error); + assert!(!error_msg.as_ref().is_null()); + assert_eq!( + error_msg.as_str(), + "KV engine is not initialized".as_bytes() + ); + }, + ); + } + + cluster.shutdown(); +} + +/// This test is very important. +/// If make sure we can add learner peer for a store which is not started +/// actually. +#[test] +fn test_add_absent_learner_peer_by_simple() { + let (mut cluster, pd_client) = new_mock_cluster(0, 3); + disable_auto_gen_compact_log(&mut cluster); + // Disable default max peer count check. 
+ pd_client.disable_default_operator(); + + let _ = cluster.run(); + cluster.must_put(b"k1", b"v1"); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1])); + + pd_client.must_add_peer(1, new_learner_peer(4, 4)); + + cluster.must_put(b"k3", b"v3"); + check_key(&cluster, b"k3", b"v3", Some(true), None, None); + let new_states = collect_all_states(&cluster, 1); + assert_eq!(new_states.len(), 3); + for i in new_states.keys() { + assert_eq!( + new_states + .get(i) + .unwrap() + .in_disk_region_state + .get_region() + .get_peers() + .len(), + 3 + 1 // Learner + ); + } + + cluster.shutdown(); +} + +/// This test is very important. +/// If make sure we can add learner peer for a store which is not started +/// actually. +#[test] +fn test_add_absent_learner_peer_by_joint() { + let (mut cluster, pd_client) = new_mock_cluster(0, 3); + disable_auto_gen_compact_log(&mut cluster); + // Disable default max peer count check. + pd_client.disable_default_operator(); + + let _ = cluster.run_conf_change(); + cluster.must_put(b"k1", b"v1"); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1])); + + pd_client.must_joint_confchange( + 1, + vec![ + (ConfChangeType::AddNode, new_peer(2, 2)), + (ConfChangeType::AddNode, new_peer(3, 3)), + (ConfChangeType::AddLearnerNode, new_learner_peer(4, 4)), + (ConfChangeType::AddLearnerNode, new_learner_peer(5, 5)), + ], + ); + assert!(pd_client.is_in_joint(1)); + pd_client.must_leave_joint(1); + + cluster.must_put(b"k3", b"v3"); + check_key(&cluster, b"k3", b"v3", Some(true), None, None); + let new_states = collect_all_states(&cluster, 1); + assert_eq!(new_states.len(), 3); + for i in new_states.keys() { + assert_eq!( + new_states + .get(i) + .unwrap() + .in_disk_region_state + .get_region() + .get_peers() + .len(), + 1 + 2 /* AddPeer */ + 2 // Learner + ); + } + + cluster.shutdown(); +} + +use engine_traits::{Engines, KvEngine, RaftEngine}; +use raftstore::store::{write_initial_apply_state, write_initial_raft_state, 
RAFT_INIT_LOG_INDEX}; + +pub fn prepare_bootstrap_cluster_with( + engines: &Engines, + region: &metapb::Region, +) -> raftstore::Result<()> { + let mut state = RegionLocalState::default(); + state.set_region(region.clone()); + + let mut wb = engines.kv.write_batch(); + // box_try!(wb.put_msg(keys::PREPARE_BOOTSTRAP_KEY, region)); + box_try!(wb.put_msg_cf(CF_RAFT, &keys::region_state_key(region.get_id()), &state)); + write_initial_apply_state(&mut wb, region.get_id())?; + wb.write()?; + engines.sync_kv()?; + + let mut raft_wb = engines.raft.log_batch(1024); + write_initial_raft_state(&mut raft_wb, region.get_id())?; + box_try!(engines.raft.consume(&mut raft_wb, true)); + Ok(()) +} + +fn new_later_add_learner_cluster)>( + initer: F, + learner: Vec, +) -> (Cluster, Arc) { + let (mut cluster, pd_client) = new_mock_cluster(0, 5); + // Make sure we persist before generate snapshot. + fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + + cluster.cfg.proxy_compat = false; + disable_auto_gen_compact_log(&mut cluster); + // Disable default max peer count check. + pd_client.disable_default_operator(); + + let _ = cluster.run_conf_change_no_start(); + let _ = cluster.start_with(HashSet::from_iter( + vec![3, 4].into_iter().map(|x| x as usize), + )); + initer(&mut cluster); + + let mut peers = vec![ + (ConfChangeType::AddNode, new_peer(2, 2)), + (ConfChangeType::AddNode, new_peer(3, 3)), + ]; + let mut learner_peers: Vec<(ConfChangeType, kvproto::metapb::Peer)> = learner + .iter() + .map(|i| (ConfChangeType::AddLearnerNode, new_learner_peer(*i, *i))) + .collect(); + peers.append(&mut learner_peers); + pd_client.must_joint_confchange(1, peers); + assert!(pd_client.is_in_joint(1)); + pd_client.must_leave_joint(1); + + (cluster, pd_client) +} + +fn later_bootstrap_learner_peer( + cluster: &mut Cluster, + peers: Vec, + already_learner_count: usize, +) { + // Check if the voters has correct learner peer. 
+ let new_states = maybe_collect_states(&cluster, 1, Some(vec![1, 2, 3])); + assert_eq!(new_states.len(), 3); + for i in new_states.keys() { + assert_eq!( + new_states + .get(i) + .unwrap() + .in_disk_region_state + .get_region() + .get_peers() + .len(), + 1 + 2 /* AddPeer */ + already_learner_count // Learner + ); + } + + let region = new_states + .get(&1) + .unwrap() + .in_disk_region_state + .get_region(); + // Explicitly bootstrap region. + for id in peers { + let engines = cluster.get_engines(id); + assert!(prepare_bootstrap_cluster_with(engines, region).is_ok()); + } +} + +#[test] +fn test_add_delayed_started_learner_by_joint() { + let (mut cluster, pd_client) = new_later_add_learner_cluster( + |c: &mut Cluster| { + c.must_put(b"k1", b"v1"); + check_key(c, b"k1", b"v1", Some(true), None, Some(vec![1])); + }, + vec![4, 5], + ); + + cluster.must_put(b"k2", b"v2"); + check_key( + &cluster, + b"k2", + b"v2", + Some(true), + None, + Some(vec![1, 2, 3]), + ); + + later_bootstrap_learner_peer(&mut cluster, vec![4, 5], 2); + cluster + .start_with(HashSet::from_iter( + vec![0, 1, 2].into_iter().map(|x| x as usize), + )) + .unwrap(); + + cluster.must_put(b"k4", b"v4"); + check_key(&cluster, b"k4", b"v4", Some(true), None, None); + + let new_states = maybe_collect_states(&cluster, 1, None); + assert_eq!(new_states.len(), 5); + for i in new_states.keys() { + assert_eq!( + new_states + .get(i) + .unwrap() + .in_disk_region_state + .get_region() + .get_peers() + .len(), + 1 + 2 /* AddPeer */ + 2 // Learner + ); + } + + fail::remove("on_pre_persist_with_finish"); + cluster.shutdown(); +} + +use new_mock_engine_store::{copy_data_from, copy_meta_from}; + +fn recover_from_peer(cluster: &Cluster, from: u64, to: u64, region_id: u64) { + let mut maybe_source_region = None; + iter_ffi_helpers( + cluster, + Some(vec![from]), + &mut |id: u64, engine: &engine_rocks::RocksEngine, ffi: &mut FFIHelperSet| { + let server = &mut ffi.engine_store_server; + maybe_source_region = 
server.kvstore.get(®ion_id).cloned(); + }, + ); + let source_region = maybe_source_region.unwrap(); + let mut new_region_meta = source_region.region.clone(); + new_region_meta.mut_peers().push(new_learner_peer(to, to)); + + // Copy all node `from`'s data to node `to` + iter_ffi_helpers( + cluster, + Some(vec![to]), + &mut |id: u64, engine: &engine_rocks::RocksEngine, ffi: &mut FFIHelperSet| { + let server = &mut ffi.engine_store_server; + assert!(server.kvstore.get(®ion_id).is_none()); + + let new_region = make_new_region(Some(source_region.region.clone()), Some(id)); + server + .kvstore + .insert(source_region.region.get_id(), Box::new(new_region)); + if let Some(region) = server.kvstore.get_mut(®ion_id) { + let source_engines = cluster.get_engines(from); + let target_engines = cluster.get_engines(to); + copy_data_from( + source_engines, + target_engines, + source_region.as_ref(), + region.as_mut(), + ) + .unwrap(); + copy_meta_from( + source_engines, + target_engines, + source_region.as_ref(), + region.as_mut(), + new_region_meta.clone(), + true, + true, + true, + ) + .unwrap(); + } else { + panic!("error"); + } + }, + ); + { + let prev_states = maybe_collect_states(cluster, region_id, None); + assert_eq!( + prev_states.get(&from).unwrap().in_disk_apply_state, + prev_states.get(&to).unwrap().in_disk_apply_state + ); + } +} + +#[test] +fn test_add_delayed_started_learner_no_snapshot() { + // fail::cfg("before_tiflash_check_double_write", "return").unwrap(); + // fail::cfg("before_tiflash_do_write", "return").unwrap(); + let (mut cluster, pd_client) = new_later_add_learner_cluster( + |c: &mut Cluster| { + c.must_put(b"k1", b"v1"); + check_key(c, b"k1", b"v1", Some(true), None, Some(vec![1])); + }, + vec![4], + ); + + // Start Leader store 4. 
+ cluster + .start_with(HashSet::from_iter( + vec![0, 1, 2, 4].into_iter().map(|x| x as usize), + )) + .unwrap(); + + must_put_and_check_key_with_generator( + &mut cluster, + |i: u64| (format!("k{}", i), (0..1024).map(|_| "X").collect()), + 10, + 20, + Some(true), + None, + Some(vec![1, 2, 3, 4]), + ); + cluster.must_transfer_leader(1, new_peer(1, 1)); + + // Force a compact log, so the leader have to send snapshot if peer 5 not catch + // up. + { + assert!(force_compact_log(&mut cluster, b"k1", None) > 15); + } + + // Simulate 4 is lost, recover its data to node 5. + cluster.stop_node(4); + + later_bootstrap_learner_peer(&mut cluster, vec![5], 1); + // After that, we manually compose data, to avoid snapshot sending. + recover_from_peer(&cluster, 4, 5, 1); + // Add node 5 to cluster. + pd_client.must_add_peer(1, new_learner_peer(5, 5)); + + fail::cfg("apply_on_handle_snapshot_finish_1_1", "panic").unwrap(); + // Start store 5. + cluster + .start_with(HashSet::from_iter( + vec![0, 1, 2, 3].into_iter().map(|x| x as usize), + )) + .unwrap(); + + cluster.must_put(b"z1", b"v1"); + check_key( + &cluster, + b"z1", + b"v1", + Some(true), + None, + Some(vec![1, 2, 3, 5]), + ); + + // Check if every node has the correct configuation. 
+ let new_states = maybe_collect_states(&cluster, 1, Some(vec![1, 2, 3, 5])); + assert_eq!(new_states.len(), 4); + for i in new_states.keys() { + assert_eq!( + new_states + .get(i) + .unwrap() + .in_disk_region_state + .get_region() + .get_peers() + .len(), + 1 + 2 /* AddPeer */ + 2 // Learner + ); + } + + fail::remove("apply_on_handle_snapshot_finish_1_1"); + fail::remove("on_pre_persist_with_finish"); + cluster.shutdown(); + // fail::remove("before_tiflash_check_double_write"); + // fail::remove("before_tiflash_do_write"); +} + +#[test] +fn test_add_delayed_started_learner_snapshot() { + let (mut cluster, pd_client) = new_later_add_learner_cluster( + |c: &mut Cluster| { + c.must_put(b"k1", b"v1"); + check_key(c, b"k1", b"v1", Some(true), None, Some(vec![1])); + }, + vec![4], + ); + + // Start Leader store 4. + cluster + .start_with(HashSet::from_iter( + vec![0, 1, 2, 4].into_iter().map(|x| x as usize), + )) + .unwrap(); + + must_put_and_check_key_with_generator( + &mut cluster, + |i: u64| (format!("k{}", i), (0..1024).map(|_| "X").collect()), + 10, + 20, + Some(true), + None, + Some(vec![1, 2, 3, 4]), + ); + cluster.must_transfer_leader(1, new_peer(1, 1)); + + // Simulate 4 is lost, recover its data to node 5. + cluster.stop_node(4); + + // Force a compact log, so the leader have to send snapshot if peer 5 not catch + // up. + { + must_put_and_check_key(&mut cluster, 21, 25, Some(true), None, Some(vec![1, 2, 3])); + let prev_states = maybe_collect_states(&cluster, 1, Some(vec![4])); + assert!( + force_compact_log(&mut cluster, b"k1", Some(vec![1, 2, 3])) + > prev_states + .get(&4) + .unwrap() + .in_disk_apply_state + .get_applied_index() + ); + } + + later_bootstrap_learner_peer(&mut cluster, vec![5], 1); + // After that, we manually compose data, to avoid snapshot sending. + recover_from_peer(&cluster, 4, 5, 1); + // Add node 5 to cluster. + pd_client.must_add_peer(1, new_learner_peer(5, 5)); + + // Start store 5. 
+ cluster + .start_with(HashSet::from_iter( + vec![0, 1, 2, 3].into_iter().map(|x| x as usize), + )) + .unwrap(); + + cluster.must_put(b"z1", b"v1"); + check_key( + &cluster, + b"z1", + b"v1", + Some(true), + None, + Some(vec![1, 2, 3, 5]), + ); + + // Check if every node has the correct configuation. + let new_states = maybe_collect_states(&cluster, 1, Some(vec![1, 2, 3, 5])); + assert_eq!(new_states.len(), 4); + for i in new_states.keys() { + assert_eq!( + new_states + .get(i) + .unwrap() + .in_disk_region_state + .get_region() + .get_peers() + .len(), + 1 + 2 /* AddPeer */ + 2 // Learner + ); + } + + iter_ffi_helpers( + &cluster, + Some(vec![5]), + &mut |id: u64, engine: &engine_rocks::RocksEngine, ffi: &mut FFIHelperSet| { + (*ffi.engine_store_server).mutate_region_states(1, |e: &mut RegionStats| { + assert_eq!(e.pre_handle_count.load(Ordering::SeqCst), 1); + }); + }, + ); + + fail::remove("on_pre_persist_with_finish"); + cluster.shutdown(); +} diff --git a/proxy_tests/proxy/server_cluster_test.rs b/proxy_tests/proxy/server_cluster_test.rs index 1cbc6a3a04e..e1c4e162a33 100644 --- a/proxy_tests/proxy/server_cluster_test.rs +++ b/proxy_tests/proxy/server_cluster_test.rs @@ -14,6 +14,7 @@ use new_mock_engine_store::{ use tikv_util::HandyRwLock; use txn_types::TimeStamp; static INIT: Once = Once::new(); +use crate::proxy::{iter_ffi_helpers, FFIHelperSet}; pub fn init() { INIT.call_once(test_util::setup_for_ci); @@ -96,10 +97,9 @@ fn test_safe_ts_basic() { suite.stop(); } -use std::{error::Error, fs::File, io::Write, net::SocketAddr, sync::Arc}; +use std::{error::Error, net::SocketAddr, sync::Arc}; use hyper::{body, Client, StatusCode, Uri}; -use kvproto::recoverdatapb::RegionMeta; use proxy_server::status_server::StatusServer; use security::SecurityConfig; use tikv::config::ConfigController; @@ -140,27 +140,33 @@ fn test_pprof() { let peer = region.get_peers().get(0); assert!(peer.is_some()); let store_id = peer.unwrap().get_store_id(); - let router = 
cluster.sim.rl().get_router(store_id); - assert!(router.is_some()); let id = 1; let engine = cluster.get_engine(id); - let mut lock = cluster.ffi_helper_set.lock().unwrap(); - let ffiset = lock.get_mut(&id).unwrap(); - let mut status_server = StatusServer::new( - engine_store_ffi::gen_engine_store_server_helper(ffiset.engine_store_server_helper_ptr), - 1, - ConfigController::default(), - Arc::new(SecurityConfig::default()), - router.unwrap(), - std::env::temp_dir(), - ) - .unwrap(); - let addr = format!("127.0.0.1:{}", test_util::alloc_port()); - status_server.start(addr).unwrap(); - let check_task = check(status_server.listening_addr(), region_id); - let rt = tokio::runtime::Runtime::new().unwrap(); - if let Err(err) = rt.block_on(check_task) { - panic!("{}", err); - } - status_server.stop(); + iter_ffi_helpers( + &cluster, + Some(vec![id]), + &mut |_, _, ffiset: &mut FFIHelperSet| { + let router = cluster.sim.rl().get_router(store_id); + assert!(router.is_some()); + let mut status_server = StatusServer::new( + engine_store_ffi::gen_engine_store_server_helper( + ffiset.engine_store_server_helper_ptr, + ), + 1, + ConfigController::default(), + Arc::new(SecurityConfig::default()), + router.unwrap(), + std::env::temp_dir(), + ) + .unwrap(); + let addr = format!("127.0.0.1:{}", test_util::alloc_port()); + status_server.start(addr).unwrap(); + let check_task = check(status_server.listening_addr(), region_id); + let rt = tokio::runtime::Runtime::new().unwrap(); + if let Err(err) = rt.block_on(check_task) { + panic!("{}", err); + } + status_server.stop(); + }, + ); } diff --git a/proxy_tests/proxy/snapshot.rs b/proxy_tests/proxy/snapshot.rs new file mode 100644 index 00000000000..69211e2bdfd --- /dev/null +++ b/proxy_tests/proxy/snapshot.rs @@ -0,0 +1,454 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +use crate::proxy::*; + +// This is a panic while panic test, which we can not handle. +// This double panic is due to: +// 1. 
check_applying_snap after apply_snap. +// 2. Drop in PeerFsm which leads to check_applying_snap. +// #[test] +#[should_panic] +fn test_delete_snapshot_after_apply() { + let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); + assert_eq!(cluster.cfg.proxy_cfg.raft_store.snap_handle_pool_size, 2); + + fail::cfg("apply_pending_snapshot", "return").unwrap(); + disable_auto_gen_compact_log(&mut cluster); + + // Disable default max peer count check. + pd_client.disable_default_operator(); + let r1 = cluster.run_conf_change(); + + let first_value = vec![0; 10240]; + // at least 4m data + for i in 0..400 { + let key = format!("{:03}", i); + cluster.must_put(key.as_bytes(), &first_value); + } + let first_key: &[u8] = b"000"; + + let eng_ids = cluster + .engines + .iter() + .map(|e| e.0.to_owned()) + .collect::>(); + tikv_util::info!("engine_2 is {}", eng_ids[1]); + let engine_2 = cluster.get_engine(eng_ids[1]); + must_get_none(&engine_2, first_key); + // add peer (engine_2,engine_2) to region 1. + + fail::cfg("on_ob_pre_handle_snapshot_delete", "return").unwrap(); + pd_client.must_add_peer(r1, new_peer(eng_ids[1], eng_ids[1])); + + std::thread::sleep(std::time::Duration::from_millis(1000)); + // Note there is no region 1 on engine_2. + let new_states = maybe_collect_states(&cluster, r1, None); + assert!(new_states.get(&eng_ids[1]).is_none()); + + // assert_eq!(new_states.get(&eng_ids[1]).unwrap().in_disk_region_state. 
+ // get_state(), kvproto::raft_serverpb::PeerState::Applying); + + fail::remove("apply_pending_snapshot"); + { + let (key, value) = (b"k2", b"v2"); + cluster.must_put(key, value); + check_key( + &cluster, + key, + value, + Some(true), + None, + Some(vec![eng_ids[1]]), + ); + let engine_2 = cluster.get_engine(eng_ids[1]); + // now snapshot must be applied on peer engine_2 + must_get_equal(&engine_2, first_key, first_value.as_slice()); + } + + fail::remove("apply_pending_snapshot"); + fail::remove("on_ob_pre_handle_snapshot_delete"); + cluster.shutdown(); +} + +#[test] +fn test_huge_multi_snapshot() { + test_huge_snapshot(true) +} + +#[test] +fn test_huge_normal_snapshot() { + test_huge_snapshot(false) +} + +fn test_huge_snapshot(is_multi: bool) { + let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); + assert_eq!(cluster.cfg.proxy_cfg.raft_store.snap_handle_pool_size, 2); + + fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); + disable_auto_gen_compact_log(&mut cluster); + cluster.cfg.raft_store.max_snapshot_file_raw_size = if is_multi { + ReadableSize(1024 * 1024) + } else { + ReadableSize(u64::MAX) + }; + + // Disable default max peer count check. + pd_client.disable_default_operator(); + let r1 = cluster.run_conf_change(); + + let first_value = vec![0; 10240]; + // at least 4m data + for i in 0..400 { + let key = format!("{:03}", i); + cluster.must_put(key.as_bytes(), &first_value); + } + let first_key: &[u8] = b"000"; + + let eng_ids = cluster + .engines + .iter() + .map(|e| e.0.to_owned()) + .collect::>(); + tikv_util::info!("engine_2 is {}", eng_ids[1]); + let engine_2 = cluster.get_engine(eng_ids[1]); + must_get_none(&engine_2, first_key); + // add peer (engine_2,engine_2) to region 1. 
+ pd_client.must_add_peer(r1, new_peer(eng_ids[1], eng_ids[1])); + + { + let (key, value) = (b"k2", b"v2"); + cluster.must_put(key, value); + // we can get in memory, since snapshot is pre handled, though it is not + // persisted + check_key( + &cluster, + key, + value, + Some(true), + None, + Some(vec![eng_ids[1]]), + ); + let engine_2 = cluster.get_engine(eng_ids[1]); + // now snapshot must be applied on peer engine_2 + must_get_equal(&engine_2, first_key, first_value.as_slice()); + + // engine 3 will not exec post apply snapshot. + fail::cfg("on_ob_post_apply_snapshot", "pause").unwrap(); + + tikv_util::info!("engine_3 is {}", eng_ids[2]); + let engine_3 = cluster.get_engine(eng_ids[2]); + must_get_none(&engine_3, first_key); + pd_client.must_add_peer(r1, new_peer(eng_ids[2], eng_ids[2])); + + std::thread::sleep(std::time::Duration::from_millis(500)); + // We have not apply pre handled snapshot, + // we can't be sure if it exists in only get from memory too, since pre handle + // snapshot is async. + must_get_none(&engine_3, first_key); + fail::remove("on_ob_post_apply_snapshot"); + + std::thread::sleep(std::time::Duration::from_millis(500)); + tikv_util::info!("put to engine_3"); + let (key, value) = (b"k3", b"v3"); + cluster.must_put(key, value); + tikv_util::info!("check engine_3"); + check_key(&cluster, key, value, Some(true), None, None); + } + + fail::remove("on_can_apply_snapshot"); + + cluster.shutdown(); +} + +#[test] +fn test_concurrent_snapshot() { + let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); + assert_eq!(cluster.cfg.proxy_cfg.raft_store.snap_handle_pool_size, 2); + disable_auto_gen_compact_log(&mut cluster); + + // Disable default max peer count check. + pd_client.disable_default_operator(); + + let r1 = cluster.run_conf_change(); + cluster.must_put(b"k1", b"v1"); + pd_client.must_add_peer(r1, new_peer(2, 2)); + // Force peer 2 to be followers all the way. 
+ cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(r1, 2) + .msg_type(MessageType::MsgRequestVote) + .direction(Direction::Send), + )); + cluster.must_transfer_leader(r1, new_peer(1, 1)); + cluster.must_put(b"k3", b"v3"); + // Pile up snapshots of overlapped region ranges and deliver them all at once. + let (tx, rx) = mpsc::channel(); + cluster + .sim + .wl() + .add_recv_filter(3, Box::new(CollectSnapshotFilter::new(tx))); + pd_client.must_add_peer(r1, new_peer(3, 3)); + // Ensure the snapshot of range ("", "") is sent and piled in filter. + if let Err(e) = rx.recv_timeout(Duration::from_secs(1)) { + panic!("the snapshot is not sent before split, e: {:?}", e); + } + + // Occasionally fails. + // let region1 = cluster.get_region(b"k1"); + // // Split the region range and then there should be another snapshot for the + // split ranges. cluster.must_split(®ion, b"k2"); + // check_key(&cluster, b"k3", b"v3", None, Some(true), Some(vec![3])); + // + // // Ensure the regions work after split. + // cluster.must_put(b"k11", b"v11"); + // check_key(&cluster, b"k11", b"v11", Some(true), None, Some(vec![3])); + // cluster.must_put(b"k4", b"v4"); + // check_key(&cluster, b"k4", b"v4", Some(true), None, Some(vec![3])); + + cluster.shutdown(); +} + +fn new_split_region_cluster(count: u64) -> (Cluster, Arc) { + let (mut cluster, pd_client) = new_mock_cluster(0, 3); + // Disable raft log gc in this test case. + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(60); + // Disable default max peer count check. 
+    pd_client.disable_default_operator();
+
+    let _ = cluster.run_conf_change();
+    for i in 0..count {
+        let k = format!("k{:0>4}", 2 * i + 1);
+        let v = format!("v{}", 2 * i + 1);
+        cluster.must_put(k.as_bytes(), v.as_bytes());
+    }
+
+    // k1 in [ , ] split by k2 -> (, k2] [k2, )
+    // k3 in [k2, ) split by k4 -> [k2, k4) [k4, )
+    for i in 0..count {
+        let k = format!("k{:0>4}", 2 * i + 1);
+        let region = cluster.get_region(k.as_bytes());
+        let sp = format!("k{:0>4}", 2 * i + 2);
+        cluster.must_split(&region, sp.as_bytes());
+    }
+
+    (cluster, pd_client)
+}
+
+#[test]
+fn test_prehandle_fail() {
+    let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3);
+    assert_eq!(cluster.cfg.proxy_cfg.raft_store.snap_handle_pool_size, 2);
+
+    // Disable raft log gc in this test case.
+    cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::secs(60);
+
+    // Disable default max peer count check.
+    pd_client.disable_default_operator();
+    let r1 = cluster.run_conf_change();
+    cluster.must_put(b"k1", b"v1");
+
+    let eng_ids = cluster
+        .engines
+        .iter()
+        .map(|e| e.0.to_owned())
+        .collect::<Vec<u64>>();
+    // If we fail to call pre-handle snapshot, we can still handle it when apply
+    // snapshot.
+    fail::cfg("before_actually_pre_handle", "return").unwrap();
+    pd_client.must_add_peer(r1, new_peer(eng_ids[1], eng_ids[1]));
+    check_key(
+        &cluster,
+        b"k1",
+        b"v1",
+        Some(true),
+        Some(true),
+        Some(vec![eng_ids[1]]),
+    );
+    fail::remove("before_actually_pre_handle");
+
+    // If we failed in apply snapshot (not panic), even if pre_handle_snapshot is
+    // not called.
+ fail::cfg("on_ob_pre_handle_snapshot", "return").unwrap(); + check_key( + &cluster, + b"k1", + b"v1", + Some(false), + Some(false), + Some(vec![eng_ids[2]]), + ); + pd_client.must_add_peer(r1, new_peer(eng_ids[2], eng_ids[2])); + check_key( + &cluster, + b"k1", + b"v1", + Some(true), + Some(true), + Some(vec![eng_ids[2]]), + ); + fail::remove("on_ob_pre_handle_snapshot"); + + cluster.shutdown(); +} + +#[test] +fn test_split_merge() { + let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); + assert_eq!(cluster.cfg.proxy_cfg.raft_store.snap_handle_pool_size, 2); + + // Can always apply snapshot immediately + fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); + cluster.cfg.raft_store.right_derive_when_split = true; + + // May fail if cluster.start, since node 2 is not in region1.peers(), + // and node 2 has not bootstrap region1, + // because region1 is not bootstrap if we only call cluster.start() + cluster.run(); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + check_key(&cluster, b"k1", b"v1", Some(true), None, None); + check_key(&cluster, b"k3", b"v3", Some(true), None, None); + + let r1 = cluster.get_region(b"k1"); + let r3 = cluster.get_region(b"k3"); + assert_eq!(r1.get_id(), r3.get_id()); + + cluster.must_split(&r1, b"k2"); + let r1_new = cluster.get_region(b"k1"); + let r3_new = cluster.get_region(b"k3"); + + assert_eq!(r1.get_id(), r3_new.get_id()); + + iter_ffi_helpers(&cluster, None, &mut |id: u64, _, ffi: &mut FFIHelperSet| { + let server = &ffi.engine_store_server; + if !server.kvstore.contains_key(&r1_new.get_id()) { + panic!("node {} has no region {}", id, r1_new.get_id()) + } + if !server.kvstore.contains_key(&r3_new.get_id()) { + panic!("node {} has no region {}", id, r3_new.get_id()) + } + // Region meta must equal + assert_eq!(server.kvstore.get(&r1_new.get_id()).unwrap().region, r1_new); + assert_eq!(server.kvstore.get(&r3_new.get_id()).unwrap().region, r3_new); + + // Can get from disk + 
check_key(&cluster, b"k1", b"v1", None, Some(true), None); + check_key(&cluster, b"k3", b"v3", None, Some(true), None); + // TODO Region in memory data must not contradict, but now we do not + // delete data + }); + + pd_client.must_merge(r1_new.get_id(), r3_new.get_id()); + let _r1_new2 = cluster.get_region(b"k1"); + let r3_new2 = cluster.get_region(b"k3"); + + iter_ffi_helpers(&cluster, None, &mut |id: u64, _, ffi: &mut FFIHelperSet| { + let server = &ffi.engine_store_server; + + // The left region is removed + if server.kvstore.contains_key(&r1_new.get_id()) { + panic!("node {} should has no region {}", id, r1_new.get_id()) + } + if !server.kvstore.contains_key(&r3_new.get_id()) { + panic!("node {} has no region {}", id, r3_new.get_id()) + } + // Region meta must equal + assert_eq!( + server.kvstore.get(&r3_new2.get_id()).unwrap().region, + r3_new2 + ); + + // Can get from disk + check_key(&cluster, b"k1", b"v1", None, Some(true), None); + check_key(&cluster, b"k3", b"v3", None, Some(true), None); + // TODO Region in memory data must not contradict, but now we do not delete data + + let origin_epoch = r3_new.get_region_epoch(); + let new_epoch = r3_new2.get_region_epoch(); + // PrepareMerge + CommitMerge, so it should be 2. + assert_eq!(new_epoch.get_version(), origin_epoch.get_version() + 2); + assert_eq!(new_epoch.get_conf_ver(), origin_epoch.get_conf_ver()); + }); + + fail::remove("on_can_apply_snapshot"); + cluster.shutdown(); +} + +#[test] +fn test_basic_concurrent_snapshot() { + let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); + assert_eq!(cluster.cfg.proxy_cfg.raft_store.snap_handle_pool_size, 2); + + disable_auto_gen_compact_log(&mut cluster); + + // Disable default max peer count check. 
+ pd_client.disable_default_operator(); + + let _ = cluster.run_conf_change(); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + let region1 = cluster.get_region(b"k1"); + cluster.must_split(®ion1, b"k2"); + let r1 = cluster.get_region(b"k1").get_id(); + let r3 = cluster.get_region(b"k3").get_id(); + + fail::cfg("before_actually_pre_handle", "sleep(1000)").unwrap(); + tikv_util::info!("region k1 {} k3 {}", r1, r3); + let pending_count = cluster + .engines + .get(&2) + .unwrap() + .kv + .pending_applies_count + .clone(); + pd_client.add_peer(r1, new_peer(2, 2)); + pd_client.add_peer(r3, new_peer(2, 2)); + // handle_pending_applies will do nothing. + fail::cfg("apply_pending_snapshot", "return").unwrap(); + // wait snapshot is generated. + std::thread::sleep(std::time::Duration::from_millis(500)); + // Now, region k1 and k3 are not handled, since pre-handle process is not + // finished. This is because `pending_applies_count` is not greater than + // `snap_handle_pool_size`, So there are no `handle_pending_applies` + // until `on_timeout`. + + fail::remove("apply_pending_snapshot"); + assert_eq!(pending_count.load(Ordering::SeqCst), 2); + std::thread::sleep(std::time::Duration::from_millis(600)); + check_key(&cluster, b"k1", b"v1", None, Some(true), Some(vec![1, 2])); + check_key(&cluster, b"k3", b"v3", None, Some(true), Some(vec![1, 2])); + // Now, k1 and k3 are handled. 
+ assert_eq!(pending_count.load(Ordering::SeqCst), 0); + + fail::remove("before_actually_pre_handle"); + + cluster.shutdown(); +} + +#[test] +fn test_many_concurrent_snapshot() { + let c = 4; + let (mut cluster, pd_client) = new_split_region_cluster(c); + + for i in 0..c { + let k = format!("k{:0>4}", 2 * i + 1); + let region_id = cluster.get_region(k.as_bytes()).get_id(); + pd_client.must_add_peer(region_id, new_peer(2, 2)); + } + + for i in 0..c { + let k = format!("k{:0>4}", 2 * i + 1); + let v = format!("v{}", 2 * i + 1); + check_key( + &cluster, + k.as_bytes(), + v.as_bytes(), + Some(true), + Some(true), + Some(vec![2]), + ); + } + + cluster.shutdown(); +} diff --git a/proxy_tests/proxy/write.rs b/proxy_tests/proxy/write.rs new file mode 100644 index 00000000000..edf439427f2 --- /dev/null +++ b/proxy_tests/proxy/write.rs @@ -0,0 +1,520 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +use crate::proxy::*; + +#[test] +fn test_interaction() { + // TODO Maybe we should pick this test to TiKV. + // This test is to check if empty entries can affect pre_exec and post_exec. + let (mut cluster, _pd_client) = new_mock_cluster(0, 3); + + fail::cfg("try_flush_data", "return(0)").unwrap(); + let _ = cluster.run(); + + cluster.must_put(b"k1", b"v1"); + let region = cluster.get_region(b"k1"); + let region_id = region.get_id(); + + // Wait until all nodes have (k1, v1). 
+ check_key(&cluster, b"k1", b"v1", Some(true), None, None); + + let prev_states = collect_all_states(&cluster, region_id); + let compact_log = test_raftstore::new_compact_log_request(100, 10); + let req = test_raftstore::new_admin_request(region_id, region.get_region_epoch(), compact_log); + let _ = cluster + .call_command_on_leader(req.clone(), Duration::from_secs(3)) + .unwrap(); + + // Empty result can also be handled by post_exec + let new_states = must_wait_until_cond_states( + &cluster, + region_id, + &prev_states, + &|old: &States, new: &States| { + // Must wait advance of apply_index. + old.in_memory_apply_state != new.in_memory_apply_state + || old.in_memory_applied_term != new.in_memory_applied_term + }, + ); + + must_altered_memory_apply_state(&prev_states, &new_states); + must_unaltered_memory_apply_term(&prev_states, &new_states); + must_unaltered_disk_apply_state(&prev_states, &new_states); + + cluster.must_put(b"k2", b"v2"); + // Wait until all nodes have (k2, v2). + check_key(&cluster, b"k2", b"v2", Some(true), None, None); + + fail::cfg("on_empty_cmd_normal", "return").unwrap(); + let prev_states = collect_all_states(&cluster, region_id); + let _ = cluster + .call_command_on_leader(req, Duration::from_secs(3)) + .unwrap(); + + std::thread::sleep(std::time::Duration::from_millis(400)); + let new_states = collect_all_states(&cluster, region_id); + must_altered_memory_apply_state(&prev_states, &new_states); + must_unaltered_memory_apply_term(&prev_states, &new_states); + + fail::remove("try_flush_data"); + fail::remove("on_empty_cmd_normal"); + cluster.shutdown(); +} + +#[test] +fn test_leadership_change_filter() { + test_leadership_change_impl(true); +} + +#[test] +fn test_leadership_change_no_persist() { + test_leadership_change_impl(false); +} + +fn test_leadership_change_impl(filter: bool) { + // Test if a empty command can be observed when leadership changes. 
+    let (mut cluster, _pd_client) = new_mock_cluster(0, 3);
+
+    disable_auto_gen_compact_log(&mut cluster);
+
+    if filter {
+        // We don't handle CompactLog at all.
+        fail::cfg("try_flush_data", "return(0)").unwrap();
+    } else {
+        // We don't return Persist after handling CompactLog.
+        fail::cfg("no_persist_compact_log", "return").unwrap();
+    }
+    // Do not handle empty cmd.
+    fail::cfg("on_empty_cmd_normal", "return").unwrap();
+    let _ = cluster.run();
+
+    cluster.must_put(b"k1", b"v1");
+    let region = cluster.get_region(b"k1");
+    let region_id = region.get_id();
+
+    let eng_ids = cluster
+        .engines
+        .iter()
+        .map(|e| e.0.to_owned())
+        .collect::<Vec<u64>>();
+    let peer_1 = find_peer(&region, eng_ids[0]).cloned().unwrap();
+    let peer_2 = find_peer(&region, eng_ids[1]).cloned().unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_1.clone());
+
+    cluster.must_put(b"k2", b"v2");
+    fail::cfg("on_empty_cmd_normal", "return").unwrap();
+
+    // Wait until all nodes have (k2, v2), then transfer leader.
+    check_key(&cluster, b"k2", b"v2", Some(true), None, None);
+    if filter {
+        // We should also filter normal kv, since an empty result can also invoke
+        // post_exec.
+        fail::cfg("on_post_exec_normal", "return(false)").unwrap();
+    }
+    let prev_states = collect_all_states(&cluster, region_id);
+    cluster.must_transfer_leader(region.get_id(), peer_2.clone());
+
+    // The states remain the same, since we don't observe empty cmd.
+    let new_states = collect_all_states(&cluster, region_id);
+    if filter {
+        must_unaltered_memory_apply_state(&prev_states, &new_states);
+        must_unaltered_memory_apply_term(&prev_states, &new_states);
+    }
+    must_unaltered_disk_apply_state(&prev_states, &new_states);
+
+    fail::remove("on_empty_cmd_normal");
+    // We need to forward the empty cmd generated by leadership change to TiFlash.
+ cluster.must_transfer_leader(region.get_id(), peer_1.clone()); + std::thread::sleep(std::time::Duration::from_secs(1)); + + let new_states = collect_all_states(&cluster, region_id); + must_altered_memory_apply_state(&prev_states, &new_states); + must_altered_memory_apply_term(&prev_states, &new_states); + + if filter { + fail::remove("try_flush_data"); + fail::remove("on_post_exec_normal"); + } else { + fail::remove("no_persist_compact_log"); + } + cluster.shutdown(); +} + +#[test] +fn test_kv_write_always_persist() { + let (mut cluster, _pd_client) = new_mock_cluster(0, 3); + + let _ = cluster.run(); + + cluster.must_put(b"k0", b"v0"); + let region_id = cluster.get_region(b"k0").get_id(); + + let mut prev_states = collect_all_states(&cluster, region_id); + // Always persist on every command + fail::cfg("on_post_exec_normal_end", "return(true)").unwrap(); + for i in 1..20 { + let k = format!("k{}", i); + let v = format!("v{}", i); + cluster.must_put(k.as_bytes(), v.as_bytes()); + + // We can't always get kv from disk, even we commit everytime, + // since they are filtered by engint_tiflash + check_key(&cluster, k.as_bytes(), v.as_bytes(), Some(true), None, None); + + // This may happen after memory write data and before commit. + // We must check if we already have in memory. + check_apply_state(&cluster, region_id, &prev_states, Some(false), None); + std::thread::sleep(std::time::Duration::from_millis(20)); + // However, advanced apply index will always persisted. + let new_states = collect_all_states(&cluster, region_id); + must_altered_disk_apply_state(&prev_states, &new_states); + prev_states = new_states; + } + fail::remove("on_post_exec_normal_end"); + cluster.shutdown(); +} + +#[test] +fn test_kv_write() { + let (mut cluster, _pd_client) = new_mock_cluster(0, 3); + + fail::cfg("on_post_exec_normal", "return(false)").unwrap(); + fail::cfg("on_post_exec_admin", "return(false)").unwrap(); + // Abandon CompactLog and previous flush. 
+ fail::cfg("try_flush_data", "return(0)").unwrap(); + + let _ = cluster.run(); + + for i in 0..10 { + let k = format!("k{}", i); + let v = format!("v{}", i); + cluster.must_put(k.as_bytes(), v.as_bytes()); + } + + // Since we disable all observers, we can get nothing in either memory and disk. + for i in 0..10 { + let k = format!("k{}", i); + let v = format!("v{}", i); + check_key( + &cluster, + k.as_bytes(), + v.as_bytes(), + Some(false), + Some(false), + None, + ); + } + + // We can read initial raft state, since we don't persist meta either. + let r1 = cluster.get_region(b"k1").get_id(); + let prev_states = collect_all_states(&cluster, r1); + + fail::remove("on_post_exec_normal"); + fail::remove("on_post_exec_admin"); + for i in 10..20 { + let k = format!("k{}", i); + let v = format!("v{}", i); + cluster.must_put(k.as_bytes(), v.as_bytes()); + } + + // Since we enable all observers, we can get in memory. + // However, we get nothing in disk since we don't persist. + for i in 10..20 { + let k = format!("k{}", i); + let v = format!("v{}", i); + check_key( + &cluster, + k.as_bytes(), + v.as_bytes(), + Some(true), + Some(false), + None, + ); + } + + let new_states = collect_all_states(&cluster, r1); + must_altered_memory_apply_state(&prev_states, &new_states); + must_unaltered_disk_apply_state(&prev_states, &new_states); + + std::thread::sleep(std::time::Duration::from_millis(20)); + fail::remove("try_flush_data"); + + let prev_states = collect_all_states(&cluster, r1); + // Write more after we force persist when CompactLog. + for i in 20..30 { + let k = format!("k{}", i); + let v = format!("v{}", i); + cluster.must_put(k.as_bytes(), v.as_bytes()); + } + + // We can read from mock-store's memory, we are not sure if we can read from + // disk, since there may be or may not be a CompactLog. 
+ for i in 11..30 { + let k = format!("k{}", i); + let v = format!("v{}", i); + check_key(&cluster, k.as_bytes(), v.as_bytes(), Some(true), None, None); + } + + // Force a compact log to persist. + let region_r = cluster.get_region("k1".as_bytes()); + let region_id = region_r.get_id(); + let compact_log = test_raftstore::new_compact_log_request(1000, 100); + let req = + test_raftstore::new_admin_request(region_id, region_r.get_region_epoch(), compact_log); + let res = cluster + .call_command_on_leader(req, Duration::from_secs(3)) + .unwrap(); + assert!(res.get_header().has_error(), "{:?}", res); + // This CompactLog is executed with an error. It will not trigger a compaction. + // However, it can trigger a persistence. + for i in 11..30 { + let k = format!("k{}", i); + let v = format!("v{}", i); + check_key( + &cluster, + k.as_bytes(), + v.as_bytes(), + Some(true), + Some(true), + None, + ); + } + + let new_states = collect_all_states(&cluster, r1); + must_altered_memory_apply_state(&prev_states, &new_states); + must_altered_disk_apply_state(&prev_states, &new_states); + + fail::remove("no_persist_compact_log"); + cluster.shutdown(); +} + +#[test] +fn test_unsupport_admin_cmd() { + // ComputeHash and VerifyHash shall be filtered. 
+ let (mut cluster, _pd_client) = new_mock_cluster(0, 2); + + cluster.run(); + + cluster.must_put(b"k", b"v"); + let region = cluster.get_region("k".as_bytes()); + let region_id = region.get_id(); + + let r = new_compute_hash_request(); + let req = test_raftstore::new_admin_request(region_id, region.get_region_epoch(), r); + let _ = cluster + .call_command_on_leader(req, Duration::from_secs(3)) + .unwrap(); + + let r = new_verify_hash_request(vec![7, 8, 9, 0], 1000); + let req = test_raftstore::new_admin_request(region_id, region.get_region_epoch(), r); + let _ = cluster + .call_command_on_leader(req, Duration::from_secs(3)) + .unwrap(); + + cluster.must_put(b"k2", b"v2"); + cluster.shutdown(); +} + +#[test] +fn test_old_compact_log() { + // If we just return None for CompactLog, the region state in ApplyFsm will + // change. Because there is no rollback in new implementation. + // This is a ERROR state. + let (mut cluster, _pd_client) = new_mock_cluster(0, 3); + cluster.run(); + + // We don't return Persist after handling CompactLog. + fail::cfg("no_persist_compact_log", "return").unwrap(); + for i in 0..10 { + let k = format!("k{}", i); + let v = format!("v{}", i); + cluster.must_put(k.as_bytes(), v.as_bytes()); + } + + for i in 0..10 { + let k = format!("k{}", i); + let v = format!("v{}", i); + check_key(&cluster, k.as_bytes(), v.as_bytes(), Some(true), None, None); + } + + let region = cluster.get_region(b"k1"); + let region_id = region.get_id(); + let prev_state = collect_all_states(&cluster, region_id); + let (compact_index, compact_term) = get_valid_compact_index(&prev_state); + let compact_log = test_raftstore::new_compact_log_request(compact_index, compact_term); + let req = test_raftstore::new_admin_request(region_id, region.get_region_epoch(), compact_log); + let _ = cluster + .call_command_on_leader(req, Duration::from_secs(3)) + .unwrap(); + + // Wait for state applys. 
+ std::thread::sleep(std::time::Duration::from_secs(2)); + + let new_state = collect_all_states(&cluster, region_id); + must_altered_memory_apply_state(&prev_state, &new_state); + must_unaltered_disk_apply_state(&prev_state, &new_state); + + fail::remove("no_persist_compact_log"); + cluster.shutdown(); +} + +#[test] +fn test_compact_log() { + let (mut cluster, _pd_client) = new_mock_cluster(0, 3); + + disable_auto_gen_compact_log(&mut cluster); + + cluster.run(); + + cluster.must_put(b"k", b"v"); + let region = cluster.get_region("k".as_bytes()); + let region_id = region.get_id(); + + fail::cfg("on_empty_cmd_normal", "return").unwrap(); + fail::cfg("try_flush_data", "return(0)").unwrap(); + for i in 0..10 { + let k = format!("k{}", i); + let v = format!("v{}", i); + cluster.must_put(k.as_bytes(), v.as_bytes()); + } + for i in 0..10 { + let k = format!("k{}", i); + let v = format!("v{}", i); + check_key(&cluster, k.as_bytes(), v.as_bytes(), Some(true), None, None); + } + + std::thread::sleep(std::time::Duration::from_millis(500)); + let prev_state = collect_all_states(&cluster, region_id); + + let (compact_index, compact_term) = get_valid_compact_index(&prev_state); + let compact_log = test_raftstore::new_compact_log_request(compact_index, compact_term); + let req = test_raftstore::new_admin_request(region_id, region.get_region_epoch(), compact_log); + let res = cluster + .call_command_on_leader(req, Duration::from_secs(3)) + .unwrap(); + // compact index should less than applied index + assert!(!res.get_header().has_error(), "{:?}", res); + + // TODO(tiflash) Make sure compact log is filtered successfully. + // Can be abstract to a retry function. + std::thread::sleep(std::time::Duration::from_millis(500)); + + // CompactLog is filtered, because we can't flush data. 
+ // However, we can still observe apply index advanced + let new_state = collect_all_states(&cluster, region_id); + must_altered_disk_apply_index(&prev_state, &new_state, 0); + must_altered_memory_apply_index(&prev_state, &new_state, 1); + must_unaltered_memory_truncated_state(&prev_state, &new_state); + must_unaltered_disk_truncated_state(&prev_state, &new_state); + + fail::remove("on_empty_cmd_normal"); + fail::remove("try_flush_data"); + + let (compact_index, compact_term) = get_valid_compact_index(&new_state); + let prev_state = new_state; + let compact_log = test_raftstore::new_compact_log_request(compact_index, compact_term); + let req = test_raftstore::new_admin_request(region_id, region.get_region_epoch(), compact_log); + let res = cluster + .call_command_on_leader(req, Duration::from_secs(3)) + .unwrap(); + assert!(!res.get_header().has_error(), "{:?}", res); + + cluster.must_put(b"kz", b"vz"); + check_key(&cluster, b"kz", b"vz", Some(true), None, None); + + // CompactLog is not filtered + let new_state = collect_all_states(&cluster, region_id); + // compact log + (kz,vz) + must_altered_memory_apply_index(&prev_state, &new_state, 2); + must_altered_memory_truncated_state(&prev_state, &new_state); + + cluster.shutdown(); +} + +#[test] +fn test_empty_cmd() { + // Test if a empty command can be observed when leadership changes. 
+    let (mut cluster, _pd_client) = new_mock_cluster(0, 3);
+    // Disable compact log
+    cluster.cfg.raft_store.raft_log_gc_count_limit = Some(1000);
+    cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(10000);
+    cluster.cfg.raft_store.snap_apply_batch_size = ReadableSize(50000);
+    cluster.cfg.raft_store.raft_log_gc_threshold = 1000;
+
+    let _ = cluster.run();
+
+    cluster.must_put(b"k1", b"v1");
+    let region = cluster.get_region(b"k1");
+    let region_id = region.get_id();
+    let eng_ids = cluster
+        .engines
+        .iter()
+        .map(|e| e.0.to_owned())
+        .collect::<Vec<u64>>();
+    let peer_1 = find_peer(&region, eng_ids[0]).cloned().unwrap();
+    let peer_2 = find_peer(&region, eng_ids[1]).cloned().unwrap();
+    cluster.must_transfer_leader(region.get_id(), peer_1.clone());
+    std::thread::sleep(std::time::Duration::from_secs(2));
+
+    check_key(&cluster, b"k1", b"v1", Some(true), None, None);
+    let prev_states = collect_all_states(&cluster, region_id);
+
+    // We need to forward the empty cmd generated by leadership change to TiFlash.
+ cluster.must_transfer_leader(region.get_id(), peer_2.clone()); + std::thread::sleep(std::time::Duration::from_secs(2)); + + let new_states = collect_all_states(&cluster, region_id); + must_altered_memory_apply_state(&prev_states, &new_states); + must_altered_memory_apply_term(&prev_states, &new_states); + + std::thread::sleep(std::time::Duration::from_secs(2)); + fail::cfg("on_empty_cmd_normal", "return").unwrap(); + + let prev_states = new_states; + cluster.must_transfer_leader(region.get_id(), peer_1.clone()); + std::thread::sleep(std::time::Duration::from_secs(2)); + + let new_states = collect_all_states(&cluster, region_id); + must_unaltered_memory_apply_state(&prev_states, &new_states); + must_unaltered_memory_apply_term(&prev_states, &new_states); + + fail::remove("on_empty_cmd_normal"); + + cluster.shutdown(); +} + +#[test] +fn test_old_kv_write() { + let (mut cluster, _pd_client) = new_mock_cluster(0, 3); + + cluster.cfg.proxy_compat = false; + // No persist will be triggered by CompactLog + fail::cfg("no_persist_compact_log", "return").unwrap(); + let _ = cluster.run(); + + cluster.must_put(b"k0", b"v0"); + // check_key(&cluster, b"k0", b"v0", Some(false), Some(false), None); + + // We can read initial raft state, since we don't persist meta either. + let r1 = cluster.get_region(b"k0").get_id(); + let prev_states = collect_all_states(&mut cluster, r1); + + for i in 1..10 { + let k = format!("k{}", i); + let v = format!("v{}", i); + cluster.must_put(k.as_bytes(), v.as_bytes()); + } + + // We can get from memory. 
+ for i in 0..10 { + let k = format!("k{}", i); + let v = format!("v{}", i); + check_key(&cluster, k.as_bytes(), v.as_bytes(), Some(true), None, None); + } + + let new_states = collect_all_states(&mut cluster, r1); + must_altered_memory_apply_state(&prev_states, &new_states); + must_unaltered_disk_apply_state(&prev_states, &new_states); + + fail::remove("no_persist_compact_log"); + cluster.shutdown(); +} diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version index 519af996bc4..c795931123c 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version @@ -1,3 +1,3 @@ #pragma once #include -namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 15776819379826780689ull; } \ No newline at end of file +namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 4954147441045435430ull; } \ No newline at end of file diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h index b4ded43adf3..190de54c6c5 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h @@ -87,13 +87,13 @@ struct CppStrWithView { }; struct PageWithView { - RawCppPtr inner; - BaseBuffView view; + RawCppPtr inner; + BaseBuffView view; }; struct PageWithViewVec { - PageWithView * inner; - const uint64_t len; + PageWithView *inner; + const uint64_t len; }; enum class HttpRequestStatus : uint8_t { @@ -153,6 +153,21 @@ enum class KVGetStatus : uint32_t { NotFound, }; +enum class FastAddPeerStatus : uint32_t { + Ok = 0, + WaitForData, + OtherError, + NoSuitable, + BadData, + FailedInject, +}; + +struct FastAddPeerRes { + FastAddPeerStatus status; + CppStrWithView apply_state; + CppStrWithView region; +}; + struct RaftStoreProxyFFIHelper { RaftStoreProxyPtr proxy_ptr; RaftProxyStatus (*fn_handle_get_proxy_status)(RaftStoreProxyPtr); @@ -200,20 +215,23 @@ 
struct EngineStoreServerHelper { uint8_t (*fn_need_flush_data)(EngineStoreServerWrap *, uint64_t); uint8_t (*fn_try_flush_data)(EngineStoreServerWrap *, uint64_t, uint8_t, uint64_t, uint64_t); - RawCppPtr (*fn_create_write_batch)(); - void (*fn_write_batch_put_page)(RawVoidPtr, BaseBuffView, BaseBuffView); - void (*fn_write_batch_del_page)(RawVoidPtr, BaseBuffView); - uint64_t (*fn_write_batch_size)(RawVoidPtr); - uint8_t (*fn_write_batch_is_empty)(RawVoidPtr); - void (*fn_write_batch_merge)(RawVoidPtr, RawVoidPtr); - void (*fn_write_batch_clear)(RawVoidPtr); - void (*fn_consume_write_batch)(const EngineStoreServerWrap *, RawVoidPtr); - PageWithView (*fn_handle_read_page)(const EngineStoreServerWrap *, BaseBuffView); - PageWithViewVec (*fn_handle_scan_page)(const EngineStoreServerWrap *, BaseBuffView, BaseBuffView); - void (*fn_gc_page_with_view_vec)(PageWithView * inner, uint64_t len); - void (*fn_handle_purge_pagestorage)(const EngineStoreServerWrap *); - CppStrWithView (*fn_handle_seek_ps_key)(const EngineStoreServerWrap *, BaseBuffView); - uint8_t (*fn_ps_is_empty)(const EngineStoreServerWrap *); + RawCppPtr (*fn_create_write_batch)(); + void (*fn_write_batch_put_page)(RawVoidPtr, BaseBuffView, BaseBuffView); + void (*fn_write_batch_del_page)(RawVoidPtr, BaseBuffView); + uint64_t (*fn_write_batch_size)(RawVoidPtr); + uint8_t (*fn_write_batch_is_empty)(RawVoidPtr); + void (*fn_write_batch_merge)(RawVoidPtr, RawVoidPtr); + void (*fn_write_batch_clear)(RawVoidPtr); + void (*fn_consume_write_batch)(const EngineStoreServerWrap *, RawVoidPtr); + PageWithView (*fn_handle_read_page)(const EngineStoreServerWrap *, + BaseBuffView); + PageWithViewVec (*fn_handle_scan_page)(const EngineStoreServerWrap *, + BaseBuffView, BaseBuffView); + void (*fn_gc_page_with_view_vec)(PageWithView *inner, uint64_t len); + void (*fn_handle_purge_pagestorage)(const EngineStoreServerWrap *); + CppStrWithView (*fn_handle_seek_ps_key)(const EngineStoreServerWrap *, + BaseBuffView); + uint8_t 
(*fn_ps_is_empty)(const EngineStoreServerWrap *); void (*fn_atomic_update_proxy)(EngineStoreServerWrap *, RaftStoreProxyFFIHelper *); void (*fn_handle_destroy)(EngineStoreServerWrap *, uint64_t); @@ -239,5 +257,7 @@ struct EngineStoreServerHelper { void (*fn_handle_safe_ts_update)(EngineStoreServerWrap *, uint64_t region_id, uint64_t self_safe_ts, uint64_t leader_safe_ts); + FastAddPeerRes (*fn_fast_add_peer)(EngineStoreServerWrap *, + uint64_t region_id, uint64_t new_peer_id); }; } // namespace DB diff --git a/rust-toolchain b/rust-toolchain index 2181086f8d2..4e5f9a4d82b 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1 +1 @@ -nightly-2022-07-31 +nightly-2022-11-15 diff --git a/scripts/clippy b/scripts/clippy index c5999ad670c..7685cddfeeb 100755 --- a/scripts/clippy +++ b/scripts/clippy @@ -19,11 +19,15 @@ fi # - Enables `significant_drop_in_scrutinee` after # https://github.com/rust-lang/rust-clippy/issues/8963 is fixed. # - `derive_partial_eq_without_eq` has compilation overhead. +# - Blocking issue for enabling `result_large_err` is the protobuf messages. +# - Blocking issue for clippy::large_enum_variant is the raftstore peer message. 
CLIPPY_LINTS=( -A clippy::module_inception \ + -A clippy::result_large_err \ + -A clippy::large_enum_variant \ -A clippy::should_implement_trait \ -A clippy::too_many_arguments \ - -A clippy::blacklisted_name \ + -A clippy::disallowed_names \ -A clippy::redundant_closure \ -A clippy::field_reassign_with_default \ -A clippy::wrong_self_convention \ diff --git a/src/config.rs b/src/config.rs index c978b1bf90a..e9eca154d6e 100644 --- a/src/config.rs +++ b/src/config.rs @@ -113,12 +113,7 @@ fn memory_limit_for_cf(is_raft_db: bool, cf: &str, total_mem: u64) -> ReadableSi (false, CF_WRITE) => (0.15, 0, usize::MAX), _ => unreachable!(), }; - let mut size = (total_mem as f64 * ratio) as usize; - if size < min { - size = min; - } else if size > max { - size = max; - } + let size = ((total_mem as f64 * ratio) as usize).clamp(min, max); ReadableSize::mb(size as u64 / MIB) } @@ -182,13 +177,13 @@ impl Default for TitanCfConfig { impl TitanCfConfig { fn build_opts(&self) -> RocksTitanDbOptions { let mut opts = RocksTitanDbOptions::new(); - opts.set_min_blob_size(self.min_blob_size.0 as u64); + opts.set_min_blob_size(self.min_blob_size.0); opts.set_blob_file_compression(self.blob_file_compression.into()); opts.set_blob_cache(self.blob_cache_size.0 as usize, -1, false, 0.0); - opts.set_min_gc_batch_size(self.min_gc_batch_size.0 as u64); - opts.set_max_gc_batch_size(self.max_gc_batch_size.0 as u64); + opts.set_min_gc_batch_size(self.min_gc_batch_size.0); + opts.set_max_gc_batch_size(self.max_gc_batch_size.0); opts.set_discardable_ratio(self.discardable_ratio); - opts.set_merge_small_file_threshold(self.merge_small_file_threshold.0 as u64); + opts.set_merge_small_file_threshold(self.merge_small_file_threshold.0); opts.set_blob_run_mode(self.blob_run_mode.into()); opts.set_level_merge(self.level_merge); opts.set_range_merge(self.range_merge); @@ -254,10 +249,7 @@ fn get_background_job_limits_impl( ); // Cap max_sub_compactions to allow at least two compactions. 
let max_compactions = max_background_jobs - max_background_flushes; - let max_sub_compactions: u32 = cmp::max( - 1, - cmp::min(defaults.max_sub_compactions, (max_compactions - 1) as u32), - ); + let max_sub_compactions: u32 = (max_compactions - 1).clamp(1, defaults.max_sub_compactions); // Maximum background GC threads for Titan let max_titan_background_gc = cmp::min(defaults.max_titan_background_gc, cpu_num); @@ -1123,7 +1115,7 @@ impl Default for DbConfig { rate_limiter_auto_tuned: true, bytes_per_sync: ReadableSize::mb(1), wal_bytes_per_sync: ReadableSize::kb(512), - max_sub_compactions: bg_job_limits.max_sub_compactions as u32, + max_sub_compactions: bg_job_limits.max_sub_compactions, writable_file_max_buffer_size: ReadableSize::mb(1), use_direct_io_for_flush_and_compaction: false, enable_pipelined_write: false, @@ -1179,8 +1171,8 @@ impl DbConfig { } } - opts.set_bytes_per_sync(self.bytes_per_sync.0 as u64); - opts.set_wal_bytes_per_sync(self.wal_bytes_per_sync.0 as u64); + opts.set_bytes_per_sync(self.bytes_per_sync.0); + opts.set_wal_bytes_per_sync(self.wal_bytes_per_sync.0); opts.set_max_subcompactions(self.max_sub_compactions); opts.set_writable_file_max_buffer_size(self.writable_file_max_buffer_size.0 as i32); opts.set_use_direct_io_for_flush_and_compaction( @@ -1434,7 +1426,7 @@ impl Default for RaftDbConfig { info_log_keep_log_file_num: 10, info_log_dir: "".to_owned(), info_log_level: RocksLogLevel::Info, - max_sub_compactions: bg_job_limits.max_sub_compactions as u32, + max_sub_compactions: bg_job_limits.max_sub_compactions, writable_file_max_buffer_size: ReadableSize::mb(1), use_direct_io_for_flush_and_compaction: false, enable_pipelined_write: true, @@ -1481,8 +1473,8 @@ impl RaftDbConfig { opts.enable_unordered_write(self.enable_unordered_write); opts.allow_concurrent_memtable_write(self.allow_concurrent_memtable_write); opts.add_event_listener(RocksEventListener::new("raft", None)); - opts.set_bytes_per_sync(self.bytes_per_sync.0 as u64); - 
opts.set_wal_bytes_per_sync(self.wal_bytes_per_sync.0 as u64); + opts.set_bytes_per_sync(self.bytes_per_sync.0); + opts.set_wal_bytes_per_sync(self.wal_bytes_per_sync.0); // TODO maybe create a new env for raft engine if self.titan.enabled { opts.set_titandb_options(&self.titan.build_opts()); @@ -2566,9 +2558,6 @@ pub struct CdcConfig { // Deprecated! preserved for compatibility check. #[online_config(skip)] #[doc(hidden)] - pub raw_min_ts_outlier_threshold: ReadableDuration, - #[online_config(skip)] - #[doc(hidden)] #[serde(skip_serializing)] pub old_value_cache_size: usize, } @@ -2576,7 +2565,7 @@ pub struct CdcConfig { impl Default for CdcConfig { fn default() -> Self { Self { - min_ts_interval: ReadableDuration::secs(1), + min_ts_interval: ReadableDuration::millis(200), hibernate_regions_compatible: true, // 4 threads for incremental scan. incremental_scan_threads: 4, @@ -2591,8 +2580,6 @@ impl Default for CdcConfig { sink_memory_quota: ReadableSize::mb(512), // 512MB memory for old value cache. old_value_cache_memory_quota: ReadableSize::mb(512), - // Trigger raw region outlier judgement if resolved_ts's lag is over 60s. - raw_min_ts_outlier_threshold: ReadableDuration::secs(60), // Deprecated! preserved for compatibility check. old_value_cache_size: 0, } @@ -2634,14 +2621,6 @@ impl CdcConfig { ); self.incremental_scan_ts_filter_ratio = default_cfg.incremental_scan_ts_filter_ratio; } - if self.raw_min_ts_outlier_threshold.is_zero() { - warn!( - "cdc.raw_min_ts_outlier_threshold should be larger than 0, - change it to {}", - default_cfg.raw_min_ts_outlier_threshold - ); - self.raw_min_ts_outlier_threshold = default_cfg.raw_min_ts_outlier_threshold; - } Ok(()) } } @@ -3646,7 +3625,7 @@ pub fn persist_config(config: &TikvConfig) -> Result<(), String> { } // Create parent directory if missing. 
- if let Err(e) = fs::create_dir_all(&store_path) { + if let Err(e) = fs::create_dir_all(store_path) { return Err(format!( "create parent directory '{}' failed: {}", store_path.to_str().unwrap(), diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 1b7d42a8575..3274700d812 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -1,6 +1,8 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. -use std::{borrow::Cow, future::Future, marker::PhantomData, sync::Arc, time::Duration}; +use std::{ + borrow::Cow, future::Future, iter::FromIterator, marker::PhantomData, sync::Arc, time::Duration, +}; use ::tracker::{ set_tls_tracker_token, with_tls_tracker, RequestInfo, RequestType, GLOBAL_TRACKERS, @@ -485,7 +487,7 @@ impl Endpoint { #[inline] pub fn parse_and_handle_unary_request( &self, - req: coppb::Request, + mut req: coppb::Request, peer: Option, ) -> impl Future> { let tracker = GLOBAL_TRACKERS.insert(::tracker::Tracker::new(RequestInfo::new( @@ -493,23 +495,27 @@ impl Endpoint { RequestType::Unknown, req.start_ts, ))); + let result_of_batch = self.process_batch_tasks(&mut req, &peer); set_tls_tracker_token(tracker); let result_of_future = self .parse_request_and_check_memory_locks(req, peer, false) .map(|(handler_builder, req_ctx)| self.handle_unary_request(req_ctx, handler_builder)); - async move { let res = match result_of_future { - Err(e) => make_error_response(e).into(), + Err(e) => { + let mut res = make_error_response(e); + let batch_res = result_of_batch.await; + res.set_batch_responses(batch_res.into()); + res.into() + } Ok(handle_fut) => { - let mut response = handle_fut - .await - .unwrap_or_else(|e| make_error_response(e).into()); - let scan_detail_v2 = response.mut_exec_details_v2().mut_scan_detail_v2(); + let (handle_res, batch_res) = futures::join!(handle_fut, result_of_batch); + let mut res = handle_res.unwrap_or_else(|e| make_error_response(e).into()); + 
res.set_batch_responses(batch_res.into()); GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { - tracker.write_scan_detail(scan_detail_v2); + tracker.write_scan_detail(res.mut_exec_details_v2().mut_scan_detail_v2()); }); - response + res } }; GLOBAL_TRACKERS.remove(tracker); @@ -517,6 +523,82 @@ impl Endpoint { } } + // process_batch_tasks process the input batched coprocessor tasks if any, + // prepare all the requests and schedule them into the read pool, then + // collect all the responses and convert them into the `StoreBatchResponse` + // type. + pub fn process_batch_tasks( + &self, + req: &mut coppb::Request, + peer: &Option, + ) -> impl Future> { + let mut batch_futs = Vec::with_capacity(req.tasks.len()); + let batch_reqs: Vec<(coppb::Request, u64)> = req + .take_tasks() + .iter_mut() + .map(|task| { + let mut new_req = req.clone(); + // Disable the coprocessor cache path for the batched tasks, the + // coprocessor cache related fields are not passed in the "task" by now. + new_req.is_cache_enabled = false; + new_req.ranges = task.take_ranges(); + let new_context = new_req.mut_context(); + new_context.set_region_id(task.get_region_id()); + new_context.set_region_epoch(task.take_region_epoch()); + new_context.set_peer(task.take_peer()); + (new_req, task.get_task_id()) + }) + .collect(); + for (cur_req, task_id) in batch_reqs.into_iter() { + let request_info = RequestInfo::new( + cur_req.get_context(), + RequestType::Unknown, + cur_req.start_ts, + ); + let mut response = coppb::StoreBatchTaskResponse::new(); + response.set_task_id(task_id); + match self.parse_request_and_check_memory_locks(cur_req, peer.clone(), false) { + Ok((handler_builder, req_ctx)) => { + let cur_tracker = GLOBAL_TRACKERS.insert(::tracker::Tracker::new(request_info)); + set_tls_tracker_token(cur_tracker); + let fut = self.handle_unary_request(req_ctx, handler_builder); + let fut = async move { + let res = fut.await; + match res { + Ok(mut resp) => { + response.set_data(resp.take_data()); + 
if let Some(err) = resp.region_error.take() { + response.set_region_error(err); + } + if let Some(lock_info) = resp.locked.take() { + response.set_locked(lock_info); + } + response.set_other_error(resp.take_other_error()); + GLOBAL_TRACKERS.with_tracker(cur_tracker, |tracker| { + tracker.write_scan_detail( + response.mut_exec_details_v2().mut_scan_detail_v2(), + ); + }); + } + Err(e) => { + make_error_batch_response(&mut response, e); + } + } + GLOBAL_TRACKERS.remove(cur_tracker); + response + }; + + batch_futs.push(future::Either::Left(fut)); + } + Err(e) => batch_futs.push(future::Either::Right(async move { + make_error_batch_response(&mut response, e); + response + })), + } + } + stream::FuturesOrdered::from_iter(batch_futs).collect() + } + /// The real implementation of handling a stream request. /// /// It first retrieves a snapshot, then builds the `RequestHandler` over the @@ -654,6 +736,42 @@ impl Endpoint { } } +fn make_error_batch_response(batch_resp: &mut coppb::StoreBatchTaskResponse, e: Error) { + warn!( + "batch cop task error-response"; + "err" => %e + ); + let tag; + match e { + Error::Region(e) => { + tag = storage::get_tag_from_header(&e); + batch_resp.set_region_error(e); + } + Error::Locked(info) => { + tag = "meet_lock"; + batch_resp.set_locked(info); + } + Error::DeadlineExceeded => { + tag = "deadline_exceeded"; + batch_resp.set_other_error(e.to_string()); + } + Error::MaxPendingTasksExceeded => { + tag = "max_pending_tasks_exceeded"; + let mut server_is_busy_err = errorpb::ServerIsBusy::default(); + server_is_busy_err.set_reason(e.to_string()); + let mut errorpb = errorpb::Error::default(); + errorpb.set_message(e.to_string()); + errorpb.set_server_is_busy(server_is_busy_err); + batch_resp.set_region_error(errorpb); + } + Error::Other(_) => { + tag = "other"; + batch_resp.set_other_error(e.to_string()); + } + }; + COPR_REQ_ERROR.with_label_values(&[tag]).inc(); +} + fn make_error_response(e: Error) -> coppb::Response { warn!( 
"error-response"; @@ -1332,7 +1450,7 @@ mod tests { let config = Config { end_point_request_max_handle_duration: ReadableDuration::millis( - (PAYLOAD_SMALL + PAYLOAD_LARGE) as u64 * 2, + (PAYLOAD_SMALL + PAYLOAD_LARGE) * 2, ), ..Default::default() }; @@ -1357,23 +1475,22 @@ mod tests { // Request 1: Unary, success response. let handler_builder = Box::new(|_, _: &_| { - Ok(UnaryFixture::new_with_duration( - Ok(coppb::Response::default()), - PAYLOAD_SMALL as u64, + Ok( + UnaryFixture::new_with_duration(Ok(coppb::Response::default()), PAYLOAD_SMALL) + .into_boxed(), ) - .into_boxed()) }); let resp_future_1 = copr.handle_unary_request(req_with_exec_detail.clone(), handler_builder); let sender = tx.clone(); thread::spawn(move || sender.send(vec![block_on(resp_future_1).unwrap()]).unwrap()); // Sleep a while to make sure that thread is spawn and snapshot is taken. - thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS as u64)); + thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS)); // Request 2: Unary, error response. 
let handler_builder = Box::new(|_, _: &_| { Ok( - UnaryFixture::new_with_duration(Err(box_err!("foo")), PAYLOAD_LARGE as u64) + UnaryFixture::new_with_duration(Err(box_err!("foo")), PAYLOAD_LARGE) .into_boxed(), ) }); @@ -1381,7 +1498,7 @@ mod tests { copr.handle_unary_request(req_with_exec_detail.clone(), handler_builder); let sender = tx.clone(); thread::spawn(move || sender.send(vec![block_on(resp_future_2).unwrap()]).unwrap()); - thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS as u64)); + thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS)); // Response 1 let resp = &rx.recv().unwrap()[0]; @@ -1447,7 +1564,7 @@ mod tests { let handler_builder = Box::new(|_, _: &_| { Ok(UnaryFixture::new_with_duration_yieldable( Ok(coppb::Response::default()), - PAYLOAD_SMALL as u64, + PAYLOAD_SMALL, ) .into_boxed()) }); @@ -1456,21 +1573,20 @@ mod tests { let sender = tx.clone(); thread::spawn(move || sender.send(vec![block_on(resp_future_1).unwrap()]).unwrap()); // Sleep a while to make sure that thread is spawn and snapshot is taken. - thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS as u64)); + thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS)); // Request 2: Unary, error response. let handler_builder = Box::new(|_, _: &_| { - Ok(UnaryFixture::new_with_duration_yieldable( - Err(box_err!("foo")), - PAYLOAD_LARGE as u64, + Ok( + UnaryFixture::new_with_duration_yieldable(Err(box_err!("foo")), PAYLOAD_LARGE) + .into_boxed(), ) - .into_boxed()) }); let resp_future_2 = copr.handle_unary_request(req_with_exec_detail.clone(), handler_builder); let sender = tx.clone(); thread::spawn(move || sender.send(vec![block_on(resp_future_2).unwrap()]).unwrap()); - thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS as u64)); + thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS)); // Response 1 // @@ -1524,18 +1640,17 @@ mod tests { // Request 1: Unary, success response. 
let handler_builder = Box::new(|_, _: &_| { - Ok(UnaryFixture::new_with_duration( - Ok(coppb::Response::default()), - PAYLOAD_LARGE as u64, + Ok( + UnaryFixture::new_with_duration(Ok(coppb::Response::default()), PAYLOAD_LARGE) + .into_boxed(), ) - .into_boxed()) }); let resp_future_1 = copr.handle_unary_request(req_with_exec_detail.clone(), handler_builder); let sender = tx.clone(); thread::spawn(move || sender.send(vec![block_on(resp_future_1).unwrap()]).unwrap()); // Sleep a while to make sure that thread is spawn and snapshot is taken. - thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS as u64)); + thread::sleep(Duration::from_millis(SNAPSHOT_DURATION_MS)); // Request 2: Stream. let handler_builder = Box::new(|_, _: &_| { @@ -1545,11 +1660,7 @@ mod tests { Err(box_err!("foo")), Ok(coppb::Response::default()), ], - vec![ - PAYLOAD_SMALL as u64, - PAYLOAD_LARGE as u64, - PAYLOAD_SMALL as u64, - ], + vec![PAYLOAD_SMALL, PAYLOAD_LARGE, PAYLOAD_SMALL], ) .into_boxed()) }); diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index ade8a007383..383f6161a1b 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -843,7 +843,7 @@ impl SampleBuilder { .map_or_else(|| 0_usize, |req| req.get_top_n_size() as usize), common_handle_col_ids: common_handle_ids, columns_info, - analyze_common_handle: common_handle_req != None, + analyze_common_handle: common_handle_req.is_some(), }) } @@ -1116,7 +1116,7 @@ impl AnalyzeSamplingResult { impl Default for AnalyzeSamplingResult { fn default() -> Self { - AnalyzeSamplingResult::new(Box::new(ReservoirRowSampleCollector::default())) + AnalyzeSamplingResult::new(Box::::default()) } } diff --git a/src/coprocessor/statistics/histogram.rs b/src/coprocessor/statistics/histogram.rs index 8797c38a721..b7a70600e39 100644 --- a/src/coprocessor/statistics/histogram.rs +++ b/src/coprocessor/statistics/histogram.rs @@ -29,7 +29,7 @@ impl Bucket { upper_bound, 
lower_bound, repeats, - ndv: if with_ndv { 1 } else { 0 }, + ndv: with_ndv as u64, } } diff --git a/src/coprocessor_v2/plugin_registry.rs b/src/coprocessor_v2/plugin_registry.rs index c02a652fc88..6262fe6bae9 100644 --- a/src/coprocessor_v2/plugin_registry.rs +++ b/src/coprocessor_v2/plugin_registry.rs @@ -130,7 +130,7 @@ impl PluginRegistry { // Simple helper functions for loading/unloading plugins. let maybe_load = |file: &PathBuf| { let mut hot_reload_registry = hot_reload_registry.write().unwrap(); - if is_library_file(&file) { + if is_library_file(file) { // Ignore errors. hot_reload_registry.load_plugin(file).ok(); } @@ -243,7 +243,7 @@ impl PluginRegistry { let dir_name = dir_name.into(); let mut loaded_plugins = Vec::new(); - for entry in std::fs::read_dir(&dir_name)? { + for entry in std::fs::read_dir(dir_name)? { if let Ok(file) = entry.map(|f| f.path()) { if is_library_file(&file) { // Ignore errors. @@ -481,7 +481,7 @@ mod tests { fn initialize_library() -> PathBuf { let mut path = std::env::current_exe().unwrap(); - path.set_file_name(pkgname_to_libname("example-plugin")); + path.set_file_name(pkgname_to_libname("example-coprocessor-plugin")); path } @@ -489,9 +489,9 @@ mod tests { fn load_plugin() { let library_path = initialize_library(); - let loaded_plugin = unsafe { LoadedPlugin::new(&library_path).unwrap() }; + let loaded_plugin = unsafe { LoadedPlugin::new(library_path).unwrap() }; - assert_eq!(loaded_plugin.name(), "example_plugin"); + assert_eq!(loaded_plugin.name(), "example_coprocessor_plugin"); assert_eq!(loaded_plugin.version(), &Version::parse("0.1.0").unwrap()); } @@ -504,10 +504,15 @@ mod tests { let plugin = registry.get_plugin(&plugin_name).unwrap(); - assert_eq!(plugin.name(), "example_plugin"); - assert_eq!(registry.loaded_plugin_names(), vec!["example_plugin"]); + assert_eq!(plugin.name(), "example_coprocessor_plugin"); assert_eq!( - registry.get_path_for_plugin("example_plugin").unwrap(), + registry.loaded_plugin_names(), + 
vec!["example_coprocessor_plugin"] + ); + assert_eq!( + registry + .get_path_for_plugin("example_coprocessor_plugin") + .unwrap(), library_path.as_os_str() ); } @@ -519,7 +524,7 @@ mod tests { let library_path_2 = library_path .parent() .unwrap() - .join(pkgname_to_libname("example-plugin-2")); + .join(pkgname_to_libname("example-coprocessor-plugin-2")); let registry = PluginRegistry::new(); let plugin_name = registry.load_plugin(&library_path).unwrap(); @@ -543,7 +548,7 @@ mod tests { let registry = PluginRegistry::new(); - let plugin_name = registry.load_plugin(&library_path).unwrap(); + let plugin_name = registry.load_plugin(library_path).unwrap(); assert!(registry.get_plugin(&plugin_name).is_some()); @@ -558,9 +563,10 @@ mod tests { let original_library_path = initialize_library(); let coprocessor_dir = std::env::temp_dir().join("coprocessors"); - let library_path = coprocessor_dir.join(pkgname_to_libname("example-plugin")); - let library_path_2 = coprocessor_dir.join(pkgname_to_libname("example-plugin-2")); - let plugin_name = "example_plugin"; + let library_path = coprocessor_dir.join(pkgname_to_libname("example-coprocessor-plugin")); + let library_path_2 = + coprocessor_dir.join(pkgname_to_libname("example-coprocessor-plugin-2")); + let plugin_name = "example_coprocessor_plugin"; // Make the coprocessor directory is empty. std::fs::create_dir_all(&coprocessor_dir).unwrap(); @@ -570,7 +576,7 @@ mod tests { registry.start_hot_reloading(&coprocessor_dir).unwrap(); // trigger loading - std::fs::copy(&original_library_path, &library_path).unwrap(); + std::fs::copy(original_library_path, &library_path).unwrap(); // fs watcher detects changes in every 3 seconds, therefore, wait 4 seconds so // as to make sure the watcher is triggered. 
std::thread::sleep(Duration::from_secs(4)); diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index fff9c79cec2..9d45052fea9 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -5,18 +5,13 @@ use std::{ future::Future, path::PathBuf, sync::{Arc, Mutex}, + time::Duration, }; use collections::HashSet; use engine_traits::{KvEngine, CF_DEFAULT, CF_WRITE}; use file_system::{set_io_type, IoType}; -use futures::{ - executor::{ThreadPool, ThreadPoolBuilder}, - future::join_all, - sink::SinkExt, - stream::TryStreamExt, - TryFutureExt, -}; +use futures::{future::join_all, sink::SinkExt, stream::TryStreamExt, TryFutureExt}; use grpcio::{ ClientStreamingSink, RequestStream, RpcContext, ServerStreamingSink, UnarySink, WriteFlags, }; @@ -32,13 +27,17 @@ use raftstore::{ router::RaftStoreRouter, store::{Callback, RaftCmdExtraOpts, RegionSnapshot}, }; -use sst_importer::{error_inc, metrics::*, sst_meta_to_path, Config, Error, Result, SstImporter}; +use sst_importer::{ + error_inc, metrics::*, sst_importer::DownloadExt, sst_meta_to_path, Config, Error, Result, + SstImporter, +}; use tikv_util::{ config::ReadableSize, future::{create_stream_with_buffer, paired_future_callback}, sys::thread::ThreadBuildWrapper, time::{Instant, Limiter}, }; +use tokio::{runtime::Runtime, time::sleep}; use txn_types::{Key, WriteRef, WriteType}; use super::make_rpc_error; @@ -56,7 +55,7 @@ where cfg: Config, engine: E, router: Router, - threads: ThreadPool, + threads: Arc, importer: Arc, limiter: Limiter, task_slots: Arc>>, @@ -81,22 +80,25 @@ where importer: Arc, ) -> ImportSstService { let props = tikv_util::thread_group::current_properties(); - let threads = ThreadPoolBuilder::new() - .pool_size(cfg.num_threads) - .name_prefix("sst-importer") + let threads = tokio::runtime::Builder::new_multi_thread() + .worker_threads(cfg.num_threads) + .enable_all() + .thread_name("sst-importer") .after_start_wrapper(move || { 
tikv_util::thread_group::set_properties(props.clone()); tikv_alloc::add_thread_memory_accessor(); set_io_type(IoType::Import); }) .before_stop_wrapper(move || tikv_alloc::remove_thread_memory_accessor()) - .create() + .build() .unwrap(); - importer.start_switch_mode_check(&threads, engine.clone()); + importer.start_switch_mode_check(threads.handle(), engine.clone()); + threads.spawn(Self::tick(importer.clone())); + ImportSstService { cfg, engine, - threads, + threads: Arc::new(threads), router, importer, limiter: Limiter::new(f64::INFINITY), @@ -105,6 +107,13 @@ where } } + async fn tick(importer: Arc) { + loop { + sleep(Duration::from_secs(10)).await; + importer.shrink_by_tick(); + } + } + fn acquire_lock(task_slots: &Arc>>, meta: &SstMeta) -> Result { let mut slots = task_slots.lock().unwrap(); let p = sst_meta_to_path(meta)?; @@ -299,8 +308,8 @@ macro_rules! impl_write { $crate::send_rpc_response!(res, sink, label, timer); }; - self.threads.spawn_ok(buf_driver); - self.threads.spawn_ok(handle_task); + self.threads.spawn(buf_driver); + self.threads.spawn(handle_task); } }; } @@ -383,8 +392,8 @@ where crate::send_rpc_response!(res, sink, label, timer); }; - self.threads.spawn_ok(buf_driver); - self.threads.spawn_ok(handle_task); + self.threads.spawn(buf_driver); + self.threads.spawn(handle_task); } // clear_files the KV files after apply finished. 
@@ -419,7 +428,7 @@ where let resp = Ok(resp); crate::send_rpc_response!(resp, sink, label, timer); }; - self.threads.spawn_ok(handle_task); + self.threads.spawn(handle_task); } // Downloads KV file and performs key-rewrite then apply kv into this tikv @@ -443,38 +452,97 @@ where sst_importer::metrics::IMPORTER_APPLY_DURATION .with_label_values(&["queue"]) .observe(start.saturating_elapsed().as_secs_f64()); - + let mut start_apply = Instant::now(); let mut futs = vec![]; let mut apply_resp = ApplyResponse::default(); let context = req.take_context(); - let meta = req.get_meta(); + let mut rules = req.take_rewrite_rules(); + let mut metas = req.take_metas(); + // For compatibility with old requests. + if req.has_meta() { + metas.push(req.take_meta()); + rules.push(req.take_rewrite_rule()); + } let result = (|| -> Result<()> { - let temp_file = - importer.do_download_kv_file(meta, req.get_storage_backend(), &limiter)?; - let mut reqs = RequestCollector::from_cf(meta.get_cf()); let mut cmd_reqs = vec![]; - let mut build_req_fn = build_apply_request( - raft_size.0, - &mut reqs, - cmd_reqs.as_mut(), - meta.get_is_delete(), - meta.get_cf(), - context.clone(), - ); - let range = importer.do_apply_kv_file( - meta.get_start_key(), - meta.get_end_key(), - meta.get_restore_ts(), - temp_file, - req.get_rewrite_rule(), - &mut build_req_fn, - )?; - drop(build_req_fn); - if !reqs.is_empty() { - let cmd = make_request(&mut reqs, context); + let mut reqs_default = RequestCollector::from_cf(CF_DEFAULT); + let mut reqs_write = RequestCollector::from_cf(CF_WRITE); + let mut req_default_size = 0_u64; + let mut req_write_size = 0_u64; + let mut range: Option = None; + let ext_storage = { + let inner = importer.wrap_kms( + importer.external_storage_or_cache( + req.get_storage_backend(), + req.get_storage_cache_id(), + )?, + false, + ); + inner + }; + + for (i, meta) in metas.iter().enumerate() { + let (reqs, req_size) = if meta.get_cf() == CF_DEFAULT { + (&mut reqs_default, &mut 
req_default_size) + } else { + (&mut reqs_write, &mut req_write_size) + }; + + let mut build_req_fn = build_apply_request( + req_size, + raft_size.0, + reqs, + cmd_reqs.as_mut(), + meta.get_is_delete(), + meta.get_cf(), + context.clone(), + ); + + let buff = importer.read_from_kv_file( + meta, + &rules[i], + Arc::clone(&ext_storage), + req.get_storage_backend(), + &limiter, + )?; + let r: Option = importer.do_apply_kv_file( + meta.get_start_key(), + meta.get_end_key(), + meta.get_start_ts(), + meta.get_restore_ts(), + buff, + &mut build_req_fn, + )?; + + if let Some(mut r) = r { + range = match range { + Some(mut v) => { + let s = v.take_start().min(r.take_start()); + let e = v.take_end().max(r.take_end()); + Some(Range { + start: s, + end: e, + ..Default::default() + }) + } + None => Some(r), + }; + } + } + + if !reqs_default.is_empty() { + let cmd = make_request(&mut reqs_default, context.clone()); + cmd_reqs.push(cmd); + IMPORTER_APPLY_BYTES.observe(req_default_size as _); + } + if !reqs_write.is_empty() { + let cmd = make_request(&mut reqs_write, context); cmd_reqs.push(cmd); + IMPORTER_APPLY_BYTES.observe(req_write_size as _); } + + start_apply = Instant::now(); for cmd in cmd_reqs { let (cb, future) = paired_future_callback(); match router.send_command(cmd, Callback::write(cb), RaftCmdExtraOpts::default()) @@ -507,26 +575,28 @@ where if r.response.get_header().has_error() { let mut import_err = kvproto::import_sstpb::Error::default(); let err = r.response.get_header().get_error(); - import_err - .set_message("failed to complete raft command".to_string()); + import_err.set_message("failed to complete raft command".to_string()); // FIXME: if there are many errors, we may lose some of them here. 
- import_err - .set_store_error(err.clone()); - warn!("failed to apply the file to the store"; "error" => ?err, "file" => %meta.get_name()); + import_err.set_store_error(err.clone()); + warn!("failed to apply the file to the store"; "error" => ?err); resp.set_error(import_err); } } } resp })); + // Records how long the apply task waits to be scheduled. + sst_importer::metrics::IMPORTER_APPLY_DURATION + .with_label_values(&["apply"]) + .observe(start_apply.saturating_elapsed().as_secs_f64()); sst_importer::metrics::IMPORTER_APPLY_DURATION .with_label_values(&["finish"]) .observe(start.saturating_elapsed().as_secs_f64()); debug!("finished apply kv file with {:?}", resp); crate::send_rpc_response!(resp, sink, label, timer); }; - self.threads.spawn_ok(handle_task); + self.threads.spawn(handle_task); } /// Downloads the file and performs key-rewrite for later ingesting. @@ -559,7 +629,7 @@ where .into_option() .filter(|c| c.cipher_type != EncryptionMethod::Plaintext); - let res = importer.download::( + let res = importer.download_ext::( req.get_sst(), req.get_storage_backend(), req.get_name(), @@ -567,9 +637,10 @@ where cipher, limiter, engine, + DownloadExt::default().cache_key(req.get_storage_cache_id()), ); let mut resp = DownloadResponse::default(); - match res { + match res.await { Ok(range) => match range { Some(r) => resp.set_range(r), None => resp.set_is_empty(true), @@ -580,7 +651,7 @@ where crate::send_rpc_response!(resp, sink, label, timer); }; - self.threads.spawn_ok(handle_task); + self.threads.spawn(handle_task); } /// Ingest the file by sending a raft command to raftstore. @@ -626,7 +697,7 @@ where Self::release_lock(&task_slots, &meta).unwrap(); crate::send_rpc_response!(res, sink, label, timer); }; - self.threads.spawn_ok(handle_task); + self.threads.spawn(handle_task); } /// Ingest multiple files by sending a raft command to raftstore. 
@@ -677,7 +748,7 @@ where } crate::send_rpc_response!(res, sink, label, timer); }; - self.threads.spawn_ok(handle_task); + self.threads.spawn(handle_task); } fn compact( @@ -726,7 +797,7 @@ where crate::send_rpc_response!(res, sink, label, timer); }; - self.threads.spawn_ok(handle_task); + self.threads.spawn(handle_task); } fn set_download_speed_limit( @@ -817,7 +888,7 @@ where } let _ = sink.close().await; }; - self.threads.spawn_ok(handle_task); + self.threads.spawn(handle_task); } impl_write!(write, WriteRequest, WriteResponse, Chunk, new_txn_writer); @@ -861,9 +932,9 @@ enum RequestCollector { /// This is used for write CF because resolved ts observer hates duplicated /// key in the same request. RetainLastTs(HashMap, (Request, u64)>), - /// Collector favor that simple collect all items. - /// This is used for default CF. - KeepAll(Vec), + /// Collector favor that simple collect all items, and it do not contains + /// duplicated key-value. This is used for default CF. + KeepAll(HashMap, Request>), } impl RequestCollector { @@ -879,9 +950,9 @@ impl RequestCollector { } fn accept(&mut self, req: Request) { + let k = key_from_request(&req); match self { RequestCollector::RetainLastTs(ref mut reqs) => { - let k = key_from_request(&req); let (encoded_key, ts) = match Key::split_on_ts_for(k) { Ok(k) => k, Err(err) => { @@ -897,7 +968,9 @@ impl RequestCollector { reqs.insert(encoded_key.to_owned(), (req, ts.into_inner())); } } - RequestCollector::KeepAll(ref mut a) => a.push(req), + RequestCollector::KeepAll(ref mut reqs) => { + reqs.insert(k.to_owned(), req); + } } } @@ -906,7 +979,7 @@ impl RequestCollector { RequestCollector::RetainLastTs(ref mut reqs) => { reqs.drain().map(|(_, (req, _))| req).collect() } - RequestCollector::KeepAll(ref mut reqs) => std::mem::take(reqs), + RequestCollector::KeepAll(ref mut reqs) => reqs.drain().map(|(_, req)| req).collect(), } } @@ -956,6 +1029,7 @@ fn make_request(reqs: &mut RequestCollector, context: Context) -> RaftCmdRequest 
// in https://github.com/tikv/tikv/blob/a401f78bc86f7e6ea6a55ad9f453ae31be835b55/components/resolved_ts/src/cmd.rs#L204 // will panic if found duplicated entry during Vec. fn build_apply_request<'a, 'b>( + req_size: &'a mut u64, raft_size: u64, reqs: &'a mut RequestCollector, cmd_reqs: &'a mut Vec, @@ -966,51 +1040,46 @@ fn build_apply_request<'a, 'b>( where 'a: 'b, { - let mut req_size = 0_u64; - // use callback to collect kv data. - if is_delete { - Box::new(move |k: Vec, _v: Vec| { - let mut req = Request::default(); - let mut del = DeleteRequest::default(); + Box::new(move |k: Vec, v: Vec| { + // Need to skip the empty key/value that could break the transaction or cause + // data corruption. see details at https://github.com/pingcap/tiflow/issues/5468. + if k.is_empty() || v.is_empty() { + return; + } + let mut req = Request::default(); + if is_delete { + let mut del = DeleteRequest::default(); del.set_key(k); del.set_cf(cf.to_string()); req.set_cmd_type(CmdType::Delete); req.set_delete(del); - req_size += req.compute_size() as u64; - reqs.accept(req); - // When the request size get grow to half of the max request size, - // build the request and add it to a batch. - if req_size > raft_size / 2 { - req_size = 0; - let cmd = make_request(reqs, context.clone()); - cmd_reqs.push(cmd); - } - }) - } else { - Box::new(move |k: Vec, v: Vec| { + } else { if cf == CF_WRITE && !write_needs_restore(&v) { return; } - let mut req = Request::default(); let mut put = PutRequest::default(); - put.set_key(k); put.set_value(v); put.set_cf(cf.to_string()); req.set_cmd_type(CmdType::Put); req.set_put(put); - req_size += req.compute_size() as u64; - reqs.accept(req); - if req_size > raft_size / 2 { - req_size = 0; - let cmd = make_request(reqs, context.clone()); - cmd_reqs.push(cmd); - } - }) - } + } + + // When the request size get grow to max request size, + // build the request and add it to a batch. 
+ if *req_size + req.compute_size() as u64 > raft_size * 7 / 8 { + IMPORTER_APPLY_BYTES.observe(*req_size as _); + *req_size = 0; + let cmd = make_request(reqs, context.clone()); + cmd_reqs.push(cmd); + } + + *req_size += req.compute_size() as u64; + reqs.accept(req); + }) } fn write_needs_restore(write: &[u8]) -> bool { @@ -1063,23 +1132,42 @@ mod test { fn default_req(key: &[u8], val: &[u8], start_ts: u64) -> Request { let (k, v) = default(key, val, start_ts); - req(k, v, CF_DEFAULT) + req(k, v, CF_DEFAULT, CmdType::Put) } fn write_req(key: &[u8], ty: WriteType, commit_ts: u64, start_ts: u64) -> Request { let (k, v) = write(key, ty, commit_ts, start_ts); - req(k, v, CF_WRITE) + let cmd_type = if ty == WriteType::Delete { + CmdType::Delete + } else { + CmdType::Put + }; + + req(k, v, CF_WRITE, cmd_type) } - fn req(k: Vec, v: Vec, cf: &str) -> Request { + fn req(k: Vec, v: Vec, cf: &str, cmd_type: CmdType) -> Request { let mut req = Request::default(); - let mut put = PutRequest::default(); + req.set_cmd_type(cmd_type); + + match cmd_type { + CmdType::Put => { + let mut put = PutRequest::default(); + put.set_key(k); + put.set_value(v); + put.set_cf(cf.to_string()); + + req.set_put(put) + } + CmdType::Delete => { + let mut del = DeleteRequest::default(); + del.set_cf(cf.to_string()); + del.set_key(k); - put.set_key(k); - put.set_value(v); - put.set_cf(cf.to_string()); - req.set_cmd_type(CmdType::Put); - req.set_put(put); + req.set_delete(del); + } + _ => panic!("invalid input cmd_type"), + } req } @@ -1088,26 +1176,36 @@ mod test { #[derive(Debug)] struct Case { cf: &'static str, + is_delete: bool, mutations: Vec<(Vec, Vec)>, expected_reqs: Vec, } fn run_case(c: &Case) { - let mut v = vec![]; - let mut coll = RequestCollector::from_cf(c.cf); - let mut builder = - build_apply_request(1024, &mut coll, &mut v, false, c.cf, Context::new()); + let mut cmds = vec![]; + let mut reqs = RequestCollector::from_cf(c.cf); + let mut req_size = 0_u64; + + let mut builder = 
build_apply_request( + &mut req_size, + 1024, + &mut reqs, + &mut cmds, + c.is_delete, + c.cf, + Context::new(), + ); for (k, v) in c.mutations.clone() { builder(k, v); } drop(builder); - if !coll.is_empty() { - let cmd = make_request(&mut coll, Context::new()); - v.push(cmd); + if !reqs.is_empty() { + let cmd = make_request(&mut reqs, Context::new()); + cmds.push(cmd); } - let mut req1: HashMap<_, _> = v + let mut req1: HashMap<_, _> = cmds .into_iter() .flat_map(|mut x| x.take_requests().into_iter()) .map(|req| { @@ -1126,12 +1224,14 @@ mod test { let cases = vec![ Case { cf: CF_WRITE, + is_delete: false, mutations: vec![ write(b"foo", Lock, 42, 41), write(b"foo", Put, 40, 39), write(b"bar", Put, 38, 37), write(b"baz", Put, 34, 31), - write(b"bar", Delete, 28, 17), + write(b"bar", Put, 28, 17), + (Vec::default(), Vec::default()), ], expected_reqs: vec![ write_req(b"foo", Put, 40, 39), @@ -1139,8 +1239,24 @@ mod test { write_req(b"baz", Put, 34, 31), ], }, + Case { + cf: CF_WRITE, + is_delete: true, + mutations: vec![ + write(b"foo", Delete, 40, 39), + write(b"bar", Delete, 38, 37), + write(b"baz", Delete, 34, 31), + write(b"bar", Delete, 28, 17), + ], + expected_reqs: vec![ + write_req(b"foo", Delete, 40, 39), + write_req(b"bar", Delete, 38, 37), + write_req(b"baz", Delete, 34, 31), + ], + }, Case { cf: CF_DEFAULT, + is_delete: false, mutations: vec![ default(b"aria", b"The planet where flowers bloom.", 123), default( @@ -1149,6 +1265,8 @@ mod test { 178, ), default(b"beyond", b"Calling your name.", 278), + default(b"beyond", b"Calling your name.", 278), + default(b"PingCap", b"", 300), ], expected_reqs: vec![ default_req(b"aria", b"The planet where flowers bloom.", 123), @@ -1166,4 +1284,67 @@ mod test { run_case(&case); } } + + #[test] + fn test_request_collector_with_write_cf() { + let mut request_collector = RequestCollector::from_cf(CF_WRITE); + assert_eq!(request_collector.is_empty(), true); + let reqs = vec![ + write_req(b"foo", WriteType::Put, 40, 39), + 
write_req(b"aar", WriteType::Put, 38, 37), + write_req(b"foo", WriteType::Put, 34, 31), + write_req(b"zzz", WriteType::Put, 41, 40), + ]; + let reqs_result = vec![ + write_req(b"aar", WriteType::Put, 38, 37), + write_req(b"foo", WriteType::Put, 40, 39), + write_req(b"zzz", WriteType::Put, 41, 40), + ]; + + for req in reqs { + request_collector.accept(req); + } + assert_eq!(request_collector.is_empty(), false); + let mut reqs = request_collector.drain(); + reqs.sort_by(|r1, r2| { + let k1 = key_from_request(r1); + let k2 = key_from_request(r2); + k1.cmp(k2) + }); + assert_eq!(reqs, reqs_result); + assert_eq!(request_collector.is_empty(), true); + } + + #[test] + fn test_request_collector_with_default_cf() { + let mut request_collector = RequestCollector::from_cf(CF_DEFAULT); + assert_eq!(request_collector.is_empty(), true); + let reqs = vec![ + default_req(b"foo", b"", 39), + default_req(b"zzz", b"", 40), + default_req(b"foo", b"", 37), + default_req(b"foo", b"", 39), + ]; + let reqs_result = vec![ + default_req(b"foo", b"", 37), + default_req(b"foo", b"", 39), + default_req(b"zzz", b"", 40), + ]; + + for req in reqs { + request_collector.accept(req); + } + assert_eq!(request_collector.is_empty(), false); + let mut reqs = request_collector.drain(); + reqs.sort_by(|r1, r2| { + let k1 = key_from_request(r1); + let (k1, ts1) = Key::split_on_ts_for(k1).unwrap(); + let k2 = key_from_request(r2); + let (k2, ts2) = Key::split_on_ts_for(k2).unwrap(); + + k1.cmp(k2).then(ts1.cmp(&ts2)) + }); + assert_eq!(reqs, reqs_result); + assert_eq!(request_collector.is_empty(), true); + } } diff --git a/src/lib.rs b/src/lib.rs index a961abc7d38..43d5db81458 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,8 +25,8 @@ #![feature(box_patterns)] #![feature(drain_filter)] #![feature(deadline_api)] -#![feature(generic_associated_types)] -#![feature(let_else)] +#![feature(let_chains)] +#![feature(type_alias_impl_trait)] #[macro_use(fail_point)] extern crate fail; diff --git a/src/read_pool.rs 
b/src/read_pool.rs index deb7336975c..5212c4ae594 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -32,10 +32,10 @@ use crate::{ // the duration to check auto-scale unified-thread-pool's thread const READ_POOL_THREAD_CHECK_DURATION: Duration = Duration::from_secs(10); // consider scale out read pool size if the average thread cpu usage is higher -// than this threahold. +// than this threshold. const READ_POOL_THREAD_HIGH_THRESHOLD: f64 = 0.8; // consider scale in read pool size if the average thread cpu usage is lower -// than this threahold. +// than this threshold. const READ_POOL_THREAD_LOW_THRESHOLD: f64 = 0.7; // avg running tasks per-thread that indicates read-pool is busy const RUNNING_TASKS_PER_THREAD_THRESHOLD: i64 = 3; @@ -192,10 +192,7 @@ impl ReadPoolHandle { match self { ReadPoolHandle::FuturePools { read_pool_normal, .. - } => { - read_pool_normal.get_running_task_count() as usize - / read_pool_normal.get_pool_size() - } + } => read_pool_normal.get_running_task_count() / read_pool_normal.get_pool_size(), ReadPoolHandle::Yatp { running_tasks, pool_size, diff --git a/src/server/config.rs b/src/server/config.rs index 1959b77df00..ae5c70abe1d 100644 --- a/src/server/config.rs +++ b/src/server/config.rs @@ -90,9 +90,18 @@ pub struct Config { // When merge raft messages into a batch message, leave a buffer. #[online_config(skip)] pub raft_client_grpc_send_msg_buffer: usize, - #[online_config(skip)] pub raft_client_queue_size: usize, + // Test only + #[doc(hidden)] + #[serde(skip_serializing)] + #[online_config(skip)] + pub raft_client_max_backoff: ReadableDuration, + // Test only + #[doc(hidden)] + #[serde(skip_serializing)] + #[online_config(skip)] + pub raft_client_initial_reconnect_backoff: ReadableDuration, pub raft_msg_max_batch_size: usize, @@ -156,12 +165,6 @@ pub struct Config { #[online_config(skip)] pub forward_max_connections_per_address: usize, - // Test only. 
- #[doc(hidden)] - #[serde(skip_serializing)] - #[online_config(skip)] - pub raft_client_backoff_step: ReadableDuration, - #[doc(hidden)] #[online_config(skip)] /// When TiKV memory usage reaches `memory_usage_high_water` it will try to @@ -218,6 +221,8 @@ impl Default for Config { max_grpc_send_msg_len: DEFAULT_MAX_GRPC_SEND_MSG_LEN, raft_client_grpc_send_msg_buffer: 512 * 1024, raft_client_queue_size: 8192, + raft_client_max_backoff: ReadableDuration::secs(5), + raft_client_initial_reconnect_backoff: ReadableDuration::secs(1), raft_msg_max_batch_size: 128, grpc_compression_type: GrpcCompressionType::None, grpc_gzip_compression_level: DEFAULT_GRPC_GZIP_COMPRESSION_LEVEL, @@ -254,7 +259,6 @@ impl Default for Config { heavy_load_threshold: 75, heavy_load_wait_duration: None, enable_request_batch: true, - raft_client_backoff_step: ReadableDuration::secs(1), reject_messages_on_memory_ratio: 0.2, background_thread_count, end_point_slow_log_threshold: ReadableDuration::secs(1), diff --git a/src/server/debug.rs b/src/server/debug.rs index 7f85aabcf50..48435f72163 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -120,6 +120,10 @@ impl From for debugpb::BottommostLevelCompaction { } } +trait InnerRocksEngineExtractor { + fn get_db_from_type(&self, db: DbType) -> Result<&RocksEngine>; +} + #[derive(Clone)] pub struct Debugger { engines: Engines, @@ -127,6 +131,26 @@ pub struct Debugger { cfg_controller: ConfigController, } +impl InnerRocksEngineExtractor for Debugger { + default fn get_db_from_type(&self, db: DbType) -> Result<&RocksEngine> { + match db { + DbType::Kv => Ok(&self.engines.kv), + DbType::Raft => Err(box_err!("Get raft db is not allowed")), + _ => Err(box_err!("invalid DB type")), + } + } +} + +impl InnerRocksEngineExtractor for Debugger { + fn get_db_from_type(&self, db: DbType) -> Result<&RocksEngine> { + match db { + DbType::Kv => Ok(&self.engines.kv), + DbType::Raft => Ok(&self.engines.raft), + _ => Err(box_err!("invalid DB type")), + } + } +} 
+ impl Debugger { pub fn new( engines: Engines, @@ -163,14 +187,6 @@ impl Debugger { Ok(regions) } - fn get_db_from_type(&self, db: DbType) -> Result<&RocksEngine> { - match db { - DbType::Kv => Ok(&self.engines.kv), - DbType::Raft => Err(box_err!("Get raft db is not allowed")), - _ => Err(box_err!("invalid DB type")), - } - } - pub fn get(&self, db: DbType, cf: &str, key: &[u8]) -> Result> { validate_db_and_cf(db, cf)?; let db = self.get_db_from_type(db)?; @@ -868,7 +884,7 @@ impl Debugger { res.push(("region.end_key".to_owned(), hex::encode(®ion.end_key))); res.push(( "region.middle_key_by_approximate_size".to_owned(), - hex::encode(&middle_key), + hex::encode(middle_key), )); Ok(res) @@ -2272,4 +2288,14 @@ mod tests { .get_api_version() ) } + + #[test] + fn test_compact() { + let debugger = new_debugger(); + let compact = |db, cf| debugger.compact(db, cf, &[0], &[0xFF], 1, Some("skip").into()); + compact(DbType::Kv, CF_DEFAULT).unwrap(); + compact(DbType::Kv, CF_LOCK).unwrap(); + compact(DbType::Kv, CF_WRITE).unwrap(); + compact(DbType::Raft, CF_DEFAULT).unwrap(); + } } diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index d8492dae5ce..7e8a1457500 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -272,7 +272,7 @@ impl TabletFactory for KvEngineFactory { false } - fn tablet_path(&self, _id: u64, _suffix: u64) -> PathBuf { + fn tablet_path_with_prefix(&self, _prefix: &str, _id: u64, _suffix: u64) -> PathBuf { self.kv_engine_path() } diff --git a/src/server/engine_factory_v2.rs b/src/server/engine_factory_v2.rs index b4a7688ef68..f370a08e280 100644 --- a/src/server/engine_factory_v2.rs +++ b/src/server/engine_factory_v2.rs @@ -19,7 +19,8 @@ const TOMBSTONE_MARK: &str = "TOMBSTONE_TABLET"; #[derive(Clone)] pub struct KvEngineFactoryV2 { inner: KvEngineFactory, - registry: Arc>>, + // region_id -> (tablet, tablet_suffix) + registry: Arc>>, } impl KvEngineFactoryV2 { @@ -31,23 +32,11 @@ impl KvEngineFactoryV2 { } 
} -// Extract tablet id and tablet suffix from the path. -fn get_id_and_suffix_from_path(path: &Path) -> (u64, u64) { - let (mut tablet_id, mut tablet_suffix) = (0, 1); - if let Some(s) = path.file_name().map(|s| s.to_string_lossy()) { - let mut split = s.split('_'); - tablet_id = split.next().and_then(|s| s.parse().ok()).unwrap_or(0); - tablet_suffix = split.next().and_then(|s| s.parse().ok()).unwrap_or(1); - } - (tablet_id, tablet_suffix) -} - impl TabletFactory for KvEngineFactoryV2 { /// open a tablet according to the OpenOptions. /// /// If options.cache_only is true, only open the relevant tablet from - /// `registry`, and if suffix is None, return an arbitrary tablet with the - /// target region id if there are any. + /// `registry`. /// /// If options.create_new is true, create a tablet by id and suffix. If the /// tablet exists, it will fail. @@ -55,6 +44,8 @@ impl TabletFactory for KvEngineFactoryV2 { /// If options.create is true, open the tablet with id and suffix if it /// exists or create it otherwise. /// + /// If options.skip_cache is true, cache will not be updated. 
+ /// /// Note: options.cache_only and options.create and/or options.create_new /// cannot be true simultaneously fn open_tablet( @@ -63,39 +54,39 @@ impl TabletFactory for KvEngineFactoryV2 { suffix: Option, mut options: OpenOptions, ) -> Result { + if options.create_new() && suffix.is_none() { + return Err(box_err!( + "suffix should be provided when creating new tablet" + )); + } + if options.create() || options.create_new() { options = options.set_cache_only(false); } let mut reg = self.registry.lock().unwrap(); if let Some(suffix) = suffix { - if let Some(tablet) = reg.get(&(id, suffix)) { + if let Some((cached_tablet, cached_suffix)) = reg.get(&id) && *cached_suffix == suffix { // Target tablet exist in the cache - if options.create_new() { return Err(box_err!( "region {} {} already exists", id, - tablet.as_inner().path() + cached_tablet.as_inner().path() )); } - return Ok(tablet.clone()); + return Ok(cached_tablet.clone()); } else if !options.cache_only() { let tablet_path = self.tablet_path(id, suffix); let tablet = self.open_tablet_raw(&tablet_path, id, suffix, options.clone())?; if !options.skip_cache() { debug!("Insert a tablet"; "key" => ?(id, suffix)); - reg.insert((id, suffix), tablet.clone()); + reg.insert(id, (tablet.clone(), suffix)); } return Ok(tablet); } - } else if options.cache_only() { - // This branch reads an arbitrary tablet with region id `id` - - if let Some(k) = reg.keys().find(|k| k.0 == id) { - debug!("choose a random tablet"; "key" => ?k); - return Ok(reg.get(k).unwrap().clone()); - } + } else if let Some((tablet, _)) = reg.get(&id) { + return Ok(tablet.clone()); } Err(box_err!( @@ -154,18 +145,25 @@ impl TabletFactory for KvEngineFactoryV2 { } #[inline] - fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf { + fn tablet_path_with_prefix(&self, prefix: &str, id: u64, suffix: u64) -> PathBuf { self.inner .store_path() - .join(format!("tablets/{}_{}", id, suffix)) + .join(format!("tablets/{}{}_{}", prefix, id, suffix)) } #[inline] fn 
mark_tombstone(&self, region_id: u64, suffix: u64) { let path = self.tablet_path(region_id, suffix).join(TOMBSTONE_MARK); - std::fs::File::create(&path).unwrap(); + // When the full directory path does not exsit, create will return error and in + // this case, we just ignore it. + let _ = std::fs::File::create(path); debug!("tombstone tablet"; "region_id" => region_id, "suffix" => suffix); - self.registry.lock().unwrap().remove(&(region_id, suffix)); + { + let mut reg = self.registry.lock().unwrap(); + if let Some((cached_tablet, cached_suffix)) = reg.remove(®ion_id) && cached_suffix != suffix { + reg.insert(region_id, (cached_tablet, cached_suffix)); + } + } } #[inline] @@ -176,42 +174,45 @@ impl TabletFactory for KvEngineFactoryV2 { } #[inline] - fn destroy_tablet(&self, id: u64, suffix: u64) -> engine_traits::Result<()> { - let path = self.tablet_path(id, suffix); - self.registry.lock().unwrap().remove(&(id, suffix)); + fn destroy_tablet(&self, region_id: u64, suffix: u64) -> engine_traits::Result<()> { + let path = self.tablet_path(region_id, suffix); + { + let mut reg = self.registry.lock().unwrap(); + if let Some((cached_tablet, cached_suffix)) = reg.remove(®ion_id) && cached_suffix != suffix { + reg.insert(region_id, (cached_tablet, cached_suffix)); + } + } self.inner.destroy_tablet(&path)?; - self.inner.on_tablet_destroy(id, suffix); + self.inner.on_tablet_destroy(region_id, suffix); Ok(()) } #[inline] - fn load_tablet(&self, path: &Path, id: u64, suffix: u64) -> Result { + fn load_tablet(&self, path: &Path, region_id: u64, suffix: u64) -> Result { { let reg = self.registry.lock().unwrap(); - if let Some(db) = reg.get(&(id, suffix)) { + if let Some((db, db_suffix)) = reg.get(®ion_id) && *db_suffix == suffix { return Err(box_err!( "region {} {} already exists", - id, + region_id, db.as_inner().path() )); } } - let db_path = self.tablet_path(id, suffix); - std::fs::rename(path, &db_path)?; - let new_engine = - self.open_tablet(id, Some(suffix), 
OpenOptions::default().set_create(true)); - if new_engine.is_ok() { - let (old_id, old_suffix) = get_id_and_suffix_from_path(path); - self.registry.lock().unwrap().remove(&(old_id, old_suffix)); - } - new_engine + let db_path = self.tablet_path(region_id, suffix); + std::fs::rename(path, db_path)?; + self.open_tablet( + region_id, + Some(suffix), + OpenOptions::default().set_create(true), + ) } fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { let reg = self.registry.lock().unwrap(); // pick up any tablet and set the shared block cache capacity - if let Some(((_id, _suffix), tablet)) = (*reg).iter().next() { + if let Some((_id, (tablet, _suffix))) = (*reg).iter().next() { let opt = tablet.get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap opt.set_block_cache_capacity(capacity)?; } @@ -223,7 +224,7 @@ impl TabletAccessor for KvEngineFactoryV2 { #[inline] fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &RocksEngine)) { let reg = self.registry.lock().unwrap(); - for ((id, suffix), tablet) in &*reg { + for (id, (tablet, suffix)) in &*reg { f(*id, *suffix, tablet) } } @@ -236,7 +237,7 @@ impl TabletAccessor for KvEngineFactoryV2 { #[cfg(test)] mod tests { - use engine_traits::{OpenOptions, TabletFactory, CF_WRITE}; + use engine_traits::{OpenOptions, TabletFactory, CF_WRITE, SPLIT_PREFIX}; use super::*; use crate::{config::TikvConfig, server::KvEngineFactoryBuilder}; @@ -373,6 +374,11 @@ mod tests { .unwrap(); assert_eq!(tablet.as_inner().path(), tablet2.as_inner().path()); + // Only both region id and suffix match can get the tablet from the cache. 
+ factory + .open_tablet(1, Some(20), OpenOptions::default().set_cache_only(true)) + .unwrap_err(); + let tablet_path = factory.tablet_path(1, 10); let result = factory.open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)); result.unwrap_err(); @@ -400,14 +406,39 @@ mod tests { .open_tablet(1, Some(20), OpenOptions::default().set_cache_only(true)) .unwrap(); + factory + .open_tablet(1, Some(30), OpenOptions::default().set_create_new(true)) + .unwrap(); + // After open a tablet with the same id but higher suffix, we cannot get the old + // one from cache. + factory + .open_tablet(1, Some(20), OpenOptions::default().set_cache_only(true)) + .unwrap_err(); + // Destroy/mark tombstone the old tablet will not unregister the new tablet in + // the cache factory.mark_tombstone(1, 20); - assert!(factory.is_tombstoned(1, 20)); + factory + .open_tablet(1, Some(30), OpenOptions::default().set_cache_only(true)) + .unwrap(); factory.destroy_tablet(1, 20).unwrap(); + factory + .open_tablet(1, Some(30), OpenOptions::default().set_cache_only(true)) + .unwrap(); - let result = factory.open_tablet(1, Some(20), OpenOptions::default()); + factory.mark_tombstone(1, 30); + assert!(factory.is_tombstoned(1, 30)); + factory.destroy_tablet(1, 30).unwrap(); + + let result = factory.open_tablet(1, Some(30), OpenOptions::default()); result.unwrap_err(); assert!(!factory.is_single_engine()); + + assert!( + factory + .tablet_path_with_prefix(SPLIT_PREFIX, 1, 10) + .ends_with("split_1_10") + ); } #[test] @@ -428,7 +459,7 @@ mod tests { .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) .unwrap(); drop(tablet); - let tablet = factory.registry.lock().unwrap().remove(&(1, 10)).unwrap(); + let (tablet, _) = factory.registry.lock().unwrap().remove(&1).unwrap(); drop(tablet); factory .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) diff --git a/src/server/errors.rs b/src/server/errors.rs index c7a41947f79..5936f365120 100644 --- 
a/src/server/errors.rs +++ b/src/server/errors.rs @@ -3,7 +3,7 @@ use std::{error::Error as StdError, io::Error as IoError, net::AddrParseError, result}; use engine_traits::Error as EngineTraitError; -use futures::channel::oneshot::Canceled; +use futures::channel::{mpsc::SendError, oneshot::Canceled}; use grpcio::Error as GrpcError; use hyper::Error as HttpError; use openssl::error::ErrorStack as OpenSslError; @@ -66,6 +66,9 @@ pub enum Error { #[error("{0:?}")] OpenSsl(#[from] OpenSslError), + + #[error("{0:?}")] + StreamDisconnect(#[from] SendError), } pub type Result = result::Result; diff --git a/src/server/gc_worker/applied_lock_collector.rs b/src/server/gc_worker/applied_lock_collector.rs deleted file mode 100644 index 9d0e16f4286..00000000000 --- a/src/server/gc_worker/applied_lock_collector.rs +++ /dev/null @@ -1,894 +0,0 @@ -// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. - -use std::{ - cmp::Ordering::*, - fmt::{self, Debug, Display}, - sync::{ - atomic::{AtomicBool, AtomicU64, Ordering}, - Arc, Mutex, - }, -}; - -use concurrency_manager::ConcurrencyManager; -use engine_traits::{CfName, KvEngine, CF_LOCK}; -use keys::origin_key; -use kvproto::{kvrpcpb::LockInfo, raft_cmdpb::CmdType}; -use raftstore::coprocessor::{ - ApplySnapshotObserver, BoxApplySnapshotObserver, BoxQueryObserver, Cmd, Coprocessor, - CoprocessorHost, ObserverContext, QueryObserver, -}; -use tikv_util::worker::{Builder as WorkerBuilder, Runnable, ScheduleError, Scheduler, Worker}; -use txn_types::Key; - -// TODO: Use new error type for GcWorker instead of storage::Error. -use super::{Error, ErrorInner, Result}; -use crate::storage::{ - mvcc::{ErrorInner as MvccErrorInner, Lock, TimeStamp}, - txn::Error as TxnError, -}; - -const MAX_COLLECT_SIZE: usize = 1024; - -/// The state of the observer. Shared between all clones. 
-#[derive(Default)] -struct LockObserverState { - max_ts: AtomicU64, - - /// `is_clean` is true, only it's sure that all applying of stale locks - /// (locks with start_ts <= specified max_ts) are monitored and collected. - /// If there are too many stale locks or any error happens, `is_clean` - /// must be set to `false`. - is_clean: AtomicBool, -} - -impl LockObserverState { - fn load_max_ts(&self) -> TimeStamp { - self.max_ts.load(Ordering::Acquire).into() - } - - fn store_max_ts(&self, max_ts: TimeStamp) { - self.max_ts.store(max_ts.into_inner(), Ordering::Release) - } - - fn is_clean(&self) -> bool { - self.is_clean.load(Ordering::Acquire) - } - - fn mark_clean(&self) { - self.is_clean.store(true, Ordering::Release); - } - - fn mark_dirty(&self) { - self.is_clean.store(false, Ordering::Release); - } -} - -pub type Callback = Box) + Send>; - -enum LockCollectorTask { - // Messages from observer - ObservedLocks(Vec<(Key, Lock)>), - - // Messages from client - StartCollecting { - max_ts: TimeStamp, - callback: Callback<()>, - }, - GetCollectedLocks { - max_ts: TimeStamp, - callback: Callback<(Vec, bool)>, - }, - StopCollecting { - max_ts: TimeStamp, - callback: Callback<()>, - }, -} - -impl Debug for LockCollectorTask { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - LockCollectorTask::ObservedLocks(locks) => f - .debug_struct("ObservedLocks") - .field("locks", locks) - .finish(), - LockCollectorTask::StartCollecting { max_ts, .. } => f - .debug_struct("StartCollecting") - .field("max_ts", max_ts) - .finish(), - LockCollectorTask::GetCollectedLocks { max_ts, .. } => f - .debug_struct("GetCollectedLocks") - .field("max_ts", max_ts) - .finish(), - LockCollectorTask::StopCollecting { max_ts, .. 
} => f - .debug_struct("StopCollecting") - .field("max_ts", max_ts) - .finish(), - } - } -} - -impl Display for LockCollectorTask { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - Debug::fmt(&self, f) - } -} - -/// `LockObserver` observes apply events and apply snapshot events. If it -/// happens in CF_LOCK, it checks the `start_ts`s of the locks being written. If -/// a lock's `start_ts` <= specified `max_ts` in the `state`, it will send the -/// lock to through the `sender`, so the receiver can collect it. -#[derive(Clone)] -struct LockObserver { - state: Arc, - sender: Scheduler, -} - -impl LockObserver { - pub fn new(state: Arc, sender: Scheduler) -> Self { - Self { state, sender } - } - - pub fn register(self, coprocessor_host: &mut CoprocessorHost) { - coprocessor_host - .registry - .register_apply_snapshot_observer(1, BoxApplySnapshotObserver::new(self.clone())); - coprocessor_host - .registry - .register_query_observer(1, BoxQueryObserver::new(self)); - } - - fn send(&self, locks: Vec<(Key, Lock)>) { - #[cfg(feature = "failpoints")] - let injected_full = (|| { - fail_point!("lock_observer_send_full", |_| { - info!("[failpoint] injected lock observer channel full"; "locks" => ?locks); - true - }); - false - })(); - #[cfg(not(feature = "failpoints"))] - let injected_full = false; - - let res = if injected_full { - Err(ScheduleError::Full(LockCollectorTask::ObservedLocks(locks))) - } else { - self.sender - .schedule(LockCollectorTask::ObservedLocks(locks)) - }; - - match res { - Ok(()) => (), - Err(ScheduleError::Stopped(_)) => { - error!("lock observer failed to send locks because collector is stopped"); - } - Err(ScheduleError::Full(_)) => { - fail_point!("lock_observer_before_mark_dirty_on_full"); - self.state.mark_dirty(); - warn!("cannot collect all applied lock because channel is full"); - } - } - } -} - -impl Coprocessor for LockObserver {} - -impl QueryObserver for LockObserver { - fn post_apply_query(&self, _: &mut ObserverContext<'_>, 
cmd: &Cmd) { - fail_point!("notify_lock_observer_query"); - let max_ts = self.state.load_max_ts(); - if max_ts.is_zero() { - return; - } - - if !self.state.is_clean() { - return; - } - - let mut locks = vec![]; - // For each put in CF_LOCK, collect it if its ts <= max_ts. - for req in cmd.request.get_requests() { - if req.get_cmd_type() != CmdType::Put { - continue; - } - let put_request = req.get_put(); - if put_request.get_cf() != CF_LOCK { - continue; - } - - let lock = match Lock::parse(put_request.get_value()) { - Ok(l) => l, - Err(e) => { - error!(?e; - "cannot parse lock"; - "value" => log_wrappers::Value::value(put_request.get_value()), - ); - self.state.mark_dirty(); - return; - } - }; - - if lock.ts <= max_ts { - let key = Key::from_encoded_slice(put_request.get_key()); - locks.push((key, lock)); - } - } - if !locks.is_empty() { - self.send(locks); - } - } -} - -impl ApplySnapshotObserver for LockObserver { - fn apply_plain_kvs( - &self, - _: &mut ObserverContext<'_>, - cf: CfName, - kv_pairs: &[(Vec, Vec)], - ) { - fail_point!("notify_lock_observer_snapshot"); - if cf != CF_LOCK { - return; - } - - let max_ts = self.state.load_max_ts(); - if max_ts.is_zero() { - return; - } - - if !self.state.is_clean() { - return; - } - - let locks: Result> = kv_pairs - .iter() - .map(|(key, value)| { - Lock::parse(value) - .map(|lock| (key, lock)) - .map_err(|e| ErrorInner::Txn(TxnError::from_mvcc(e)).into()) - }) - .filter(|result| result.is_err() || result.as_ref().unwrap().1.ts <= max_ts) - .map(|result| { - // `apply_plain_keys` will be invoked with the data_key in RocksDB layer. So we - // need to remove the `z` prefix. 
- result.map(|(key, lock)| (Key::from_encoded_slice(origin_key(key)), lock)) - }) - .collect(); - - match locks { - Err(e) => { - error!(?e; "cannot parse lock"); - self.state.mark_dirty() - } - Ok(l) => self.send(l), - } - } - - fn apply_sst(&self, _: &mut ObserverContext<'_>, cf: CfName, _path: &str) { - if cf == CF_LOCK { - error!("cannot collect all applied lock: snapshot of lock cf applied from sst file"); - self.state.mark_dirty(); - } - } -} - -struct LockCollectorRunner { - observer_state: Arc, - - collected_locks: Vec<(Key, Lock)>, -} - -impl LockCollectorRunner { - pub fn new(observer_state: Arc) -> Self { - Self { - observer_state, - collected_locks: vec![], - } - } - - fn handle_observed_locks(&mut self, mut locks: Vec<(Key, Lock)>) { - if self.collected_locks.len() >= MAX_COLLECT_SIZE { - return; - } - - if locks.len() + self.collected_locks.len() >= MAX_COLLECT_SIZE { - self.observer_state.mark_dirty(); - info!("lock collector marked dirty because received too many locks"); - locks.truncate(MAX_COLLECT_SIZE - self.collected_locks.len()); - } - self.collected_locks.extend(locks); - } - - fn start_collecting(&mut self, max_ts: TimeStamp) -> Result<()> { - let curr_max_ts = self.observer_state.load_max_ts(); - match max_ts.cmp(&curr_max_ts) { - Less => Err(box_err!( - "collecting locks with a greater max_ts: {}", - curr_max_ts - )), - Equal => { - // Stale request. Ignore it. - Ok(()) - } - Greater => { - info!("start collecting locks"; "max_ts" => max_ts); - self.collected_locks.clear(); - // TODO: `is_clean` may be unexpectedly set to false here, if any error happens - // on a previous observing. It need to be solved, although it's very unlikely to - // happen and doesn't affect correctness of data. 
- self.observer_state.mark_clean(); - self.observer_state.store_max_ts(max_ts); - Ok(()) - } - } - } - - fn get_collected_locks(&mut self, max_ts: TimeStamp) -> Result<(Vec, bool)> { - let curr_max_ts = self.observer_state.load_max_ts(); - if curr_max_ts != max_ts { - warn!( - "trying to fetch collected locks but now collecting with another max_ts"; - "req_max_ts" => max_ts, - "current_max_ts" => curr_max_ts, - ); - return Err(box_err!( - "trying to fetch collected locks but now collecting with another max_ts" - )); - } - - let locks: Result<_> = self - .collected_locks - .iter() - .map(|(k, l)| { - k.to_raw() - .map(|raw_key| l.clone().into_lock_info(raw_key)) - .map_err(|e| Error::from(TxnError::from_mvcc(e))) - }) - .collect(); - - Ok((locks?, self.observer_state.is_clean())) - } - - fn stop_collecting(&mut self, max_ts: TimeStamp) -> Result<()> { - let res = self.observer_state.max_ts.compare_exchange( - max_ts.into_inner(), - 0, - Ordering::SeqCst, - Ordering::SeqCst, - ); - if res.is_ok() { - self.collected_locks.clear(); - info!("stop collecting locks"; "max_ts" => max_ts); - Ok(()) - } else { - warn!( - "trying to stop collecting locks, but now collecting with a different max_ts"; - "stopping_max_ts" => max_ts, - "current_max_ts" => TimeStamp::new(res.unwrap_err()), - ); - Err(box_err!("collecting locks with another max_ts")) - } - } -} - -impl Runnable for LockCollectorRunner { - type Task = LockCollectorTask; - - fn run(&mut self, task: LockCollectorTask) { - match task { - LockCollectorTask::ObservedLocks(locks) => self.handle_observed_locks(locks), - LockCollectorTask::StartCollecting { max_ts, callback } => { - callback(self.start_collecting(max_ts)) - } - LockCollectorTask::GetCollectedLocks { max_ts, callback } => { - callback(self.get_collected_locks(max_ts)) - } - LockCollectorTask::StopCollecting { max_ts, callback } => { - callback(self.stop_collecting(max_ts)) - } - } - } -} - -pub struct AppliedLockCollector { - worker: Mutex, - scheduler: 
Scheduler, - concurrency_manager: ConcurrencyManager, -} - -impl AppliedLockCollector { - pub fn new( - coprocessor_host: &mut CoprocessorHost, - concurrency_manager: ConcurrencyManager, - ) -> Result { - let worker = Mutex::new(WorkerBuilder::new("lock-collector").create()); - - let state = Arc::new(LockObserverState::default()); - let runner = LockCollectorRunner::new(Arc::clone(&state)); - let scheduler = worker.lock().unwrap().start("lock-collector", runner); - let observer = LockObserver::new(state, scheduler.clone()); - - observer.register(coprocessor_host); - - // Start the worker - - Ok(Self { - worker, - scheduler, - concurrency_manager, - }) - } - - pub fn stop(&self) { - self.worker.lock().unwrap().stop(); - } - - /// Starts collecting applied locks whose `start_ts` <= `max_ts`. Only one - /// `max_ts` is valid at one time. - pub fn start_collecting(&self, max_ts: TimeStamp, callback: Callback<()>) -> Result<()> { - // Before starting collecting, check the concurrency manager to avoid later - // prewrite requests uses a min_commit_ts less than the safepoint. - // `max_ts` here is the safepoint of the current round of GC. - // Ths is similar to that we update max_ts and check memory lock when handling - // other transactional read requests. However this is done at start_collecting - // instead of physical_scan_locks. The reason is that, to fully scan a TiKV - // store, it might needs more than one physical_scan_lock requests. However - // memory lock needs to be checked before scanning the locks, and we can't know - // the `end_key` of the scan range at that time. As a result, each - // physical_scan_lock request will cause scanning memory lock from the start_key - // to the very-end of the TiKV node, which is a waste. But since we always start - // collecting applied locks before physical scan lock, so a better idea is to - // check the memory lock before physical_scan_lock. 
- self.concurrency_manager.update_max_ts(max_ts); - self.concurrency_manager - .read_range_check(None, None, |key, lock| { - // `Lock::check_ts_conflict` can't be used here, because LockType::Lock - // can't be ignored in this case. - if lock.ts <= max_ts { - Err(TxnError::from_mvcc(MvccErrorInner::KeyIsLocked( - lock.clone().into_lock_info(key.to_raw()?), - ))) - } else { - Ok(()) - } - })?; - self.scheduler - .schedule(LockCollectorTask::StartCollecting { max_ts, callback }) - .map_err(|e| box_err!("failed to schedule task: {:?}", e)) - } - - /// Get the collected locks after `start_collecting`. Only valid when - /// `max_ts` matches the `max_ts` provided to `start_collecting`. - /// Collects at most `MAX_COLLECT_SIZE` locks. If there are (even - /// potentially) more locks than `MAX_COLLECT_SIZE` or any error happens, - /// the flag `is_clean` will be unset, which represents - /// `AppliedLockCollector` cannot collect all locks. - pub fn get_collected_locks( - &self, - max_ts: TimeStamp, - callback: Callback<(Vec, bool)>, - ) -> Result<()> { - self.scheduler - .schedule(LockCollectorTask::GetCollectedLocks { max_ts, callback }) - .map_err(|e| box_err!("failed to schedule task: {:?}", e)) - } - - /// Stop collecting locks. Only valid when `max_ts` matches the `max_ts` - /// provided to `start_collecting`. 
- pub fn stop_collecting(&self, max_ts: TimeStamp, callback: Callback<()>) -> Result<()> { - self.scheduler - .schedule(LockCollectorTask::StopCollecting { max_ts, callback }) - .map_err(|e| box_err!("failed to schedule task: {:?}", e)) - } -} - -impl Drop for AppliedLockCollector { - fn drop(&mut self) { - self.stop(); - } -} - -#[cfg(test)] -mod tests { - use std::sync::mpsc::channel; - - use engine_test::kv::KvTestEngine; - use engine_traits::CF_DEFAULT; - use futures::executor::block_on; - use kvproto::{ - kvrpcpb::Op, - metapb::Region, - raft_cmdpb::{PutRequest, RaftCmdRequest, RaftCmdResponse, Request as RaftRequest}, - }; - use txn_types::LockType; - - use super::*; - - fn lock_info_to_kv(mut lock_info: LockInfo) -> (Vec, Vec) { - let key = Key::from_raw(lock_info.get_key()).into_encoded(); - let lock = Lock::new( - match lock_info.get_lock_type() { - Op::Put => LockType::Put, - Op::Del => LockType::Delete, - Op::Lock => LockType::Lock, - Op::PessimisticLock => LockType::Pessimistic, - _ => unreachable!(), - }, - lock_info.take_primary_lock(), - lock_info.get_lock_version().into(), - lock_info.get_lock_ttl(), - None, - 0.into(), - lock_info.get_txn_size(), - 0.into(), - ); - let value = lock.to_bytes(); - (key, value) - } - - fn make_apply_request( - key: Vec, - value: Vec, - cf: &str, - cmd_type: CmdType, - ) -> RaftRequest { - let mut put_req = PutRequest::default(); - put_req.set_cf(cf.to_owned()); - put_req.set_key(key); - put_req.set_value(value); - - let mut req = RaftRequest::default(); - req.set_cmd_type(cmd_type); - req.set_put(put_req); - req - } - - fn make_raft_cmd(requests: Vec) -> Cmd { - let mut req = RaftCmdRequest::default(); - req.set_requests(requests.into()); - Cmd::new(0, 0, req, RaftCmdResponse::default()) - } - - fn new_test_collector() -> (AppliedLockCollector, CoprocessorHost) { - let mut coprocessor_host = CoprocessorHost::default(); - let collector = - AppliedLockCollector::new(&mut coprocessor_host, 
ConcurrencyManager::new(1.into())) - .unwrap(); - (collector, coprocessor_host) - } - - fn start_collecting(c: &AppliedLockCollector, max_ts: u64) -> Result<()> { - let (tx, rx) = channel(); - c.start_collecting(max_ts.into(), Box::new(move |r| tx.send(r).unwrap())) - .and_then(move |()| rx.recv().unwrap()) - } - - fn get_collected_locks(c: &AppliedLockCollector, max_ts: u64) -> Result<(Vec, bool)> { - let (tx, rx) = channel(); - c.get_collected_locks(max_ts.into(), Box::new(move |r| tx.send(r).unwrap())) - .unwrap(); - rx.recv().unwrap() - } - - fn stop_collecting(c: &AppliedLockCollector, max_ts: u64) -> Result<()> { - let (tx, rx) = channel(); - c.stop_collecting(max_ts.into(), Box::new(move |r| tx.send(r).unwrap())) - .unwrap(); - rx.recv().unwrap() - } - - #[test] - fn test_start_stop() { - let (c, _) = new_test_collector(); - // Not started. - get_collected_locks(&c, 1).unwrap_err(); - stop_collecting(&c, 1).unwrap_err(); - - // Started. - start_collecting(&c, 2).unwrap(); - assert_eq!(c.concurrency_manager.max_ts(), 2.into()); - get_collected_locks(&c, 2).unwrap(); - stop_collecting(&c, 2).unwrap(); - // Stopped. - get_collected_locks(&c, 2).unwrap_err(); - stop_collecting(&c, 2).unwrap_err(); - - // When start_collecting is invoked with a larger ts, the later one will - // ovewrite the previous one. - start_collecting(&c, 3).unwrap(); - assert_eq!(c.concurrency_manager.max_ts(), 3.into()); - get_collected_locks(&c, 3).unwrap(); - get_collected_locks(&c, 4).unwrap_err(); - start_collecting(&c, 4).unwrap(); - assert_eq!(c.concurrency_manager.max_ts(), 4.into()); - get_collected_locks(&c, 3).unwrap_err(); - get_collected_locks(&c, 4).unwrap(); - // Do not allow aborting previous observing with a smaller max_ts. - start_collecting(&c, 3).unwrap_err(); - get_collected_locks(&c, 3).unwrap_err(); - get_collected_locks(&c, 4).unwrap(); - // Do not allow stoping observing with a different max_ts. 
- stop_collecting(&c, 3).unwrap_err(); - stop_collecting(&c, 5).unwrap_err(); - stop_collecting(&c, 4).unwrap(); - } - - #[test] - fn test_check_memlock_on_start() { - let (c, _) = new_test_collector(); - let cm = c.concurrency_manager.clone(); - - let mem_lock = |k: &[u8], ts: u64, lock_type| { - let key = Key::from_raw(k); - let guard = block_on(cm.lock_key(&key)); - guard.with_lock(|lock| { - *lock = Some(txn_types::Lock::new( - lock_type, - k.to_vec(), - ts.into(), - 100, - None, - 0.into(), - 1, - 20.into(), - )); - }); - guard - }; - - let guard = mem_lock(b"a", 100, LockType::Put); - start_collecting(&c, 90).unwrap(); - stop_collecting(&c, 90).unwrap(); - start_collecting(&c, 100).unwrap_err(); - // Use get_collected_locks to check it's not collecting. - get_collected_locks(&c, 100).unwrap_err(); - start_collecting(&c, 110).unwrap_err(); - get_collected_locks(&c, 110).unwrap_err(); - drop(guard); - - let guard = mem_lock(b"b", 100, LockType::Lock); - start_collecting(&c, 90).unwrap(); - stop_collecting(&c, 90).unwrap(); - start_collecting(&c, 100).unwrap_err(); - get_collected_locks(&c, 100).unwrap_err(); - start_collecting(&c, 110).unwrap_err(); - get_collected_locks(&c, 110).unwrap_err(); - drop(guard); - - start_collecting(&c, 200).unwrap(); - stop_collecting(&c, 200).unwrap(); - } - - #[test] - fn test_apply() { - let locks: Vec<_> = vec![ - (b"k0", 10), - (b"k1", 110), - (b"k5", 100), - (b"k2", 101), - (b"k3", 90), - (b"k2", 99), - ] - .into_iter() - .map(|(k, ts)| { - let mut lock_info = LockInfo::default(); - lock_info.set_key(k.to_vec()); - lock_info.set_primary_lock(k.to_vec()); - lock_info.set_lock_type(Op::Put); - lock_info.set_lock_version(ts); - lock_info - }) - .collect(); - let lock_kvs: Vec<_> = locks - .iter() - .map(|lock| lock_info_to_kv(lock.clone())) - .collect(); - - let (c, coprocessor_host) = new_test_collector(); - let mut expected_result = vec![]; - - start_collecting(&c, 100).unwrap(); - assert_eq!(get_collected_locks(&c, 
100).unwrap(), (vec![], true)); - - // Only puts in lock cf will be monitered. - let req = vec![ - make_apply_request( - lock_kvs[0].0.clone(), - lock_kvs[0].1.clone(), - CF_LOCK, - CmdType::Put, - ), - make_apply_request(b"1".to_vec(), b"1".to_vec(), CF_DEFAULT, CmdType::Put), - make_apply_request(b"2".to_vec(), b"2".to_vec(), CF_LOCK, CmdType::Delete), - ]; - coprocessor_host.post_apply(&Region::default(), &make_raft_cmd(req)); - expected_result.push(locks[0].clone()); - assert_eq!( - get_collected_locks(&c, 100).unwrap(), - (expected_result.clone(), true) - ); - - // When start collecting with the same max_ts again, shouldn't clean up the - // observer state. - start_collecting(&c, 100).unwrap(); - assert_eq!( - get_collected_locks(&c, 100).unwrap(), - (expected_result.clone(), true) - ); - - // Only locks with ts <= 100 will be collected. - let req: Vec<_> = lock_kvs - .iter() - .map(|(k, v)| make_apply_request(k.clone(), v.clone(), CF_LOCK, CmdType::Put)) - .collect(); - expected_result.extend( - locks - .iter() - .filter(|l| l.get_lock_version() <= 100) - .cloned(), - ); - coprocessor_host.post_apply(&Region::default(), &make_raft_cmd(req.clone())); - assert_eq!( - get_collected_locks(&c, 100).unwrap(), - (expected_result, true) - ); - - // When start_collecting is double-invoked again with larger ts, the previous - // results are dropped. 
- start_collecting(&c, 110).unwrap(); - assert_eq!(get_collected_locks(&c, 110).unwrap(), (vec![], true)); - coprocessor_host.post_apply(&Region::default(), &make_raft_cmd(req)); - assert_eq!(get_collected_locks(&c, 110).unwrap(), (locks, true)); - } - - #[test] - fn test_apply_snapshot() { - let locks: Vec<_> = vec![ - (b"k0", 10), - (b"k1", 110), - (b"k5", 100), - (b"k2", 101), - (b"k3", 90), - (b"k2", 99), - ] - .into_iter() - .map(|(k, ts)| { - let mut lock_info = LockInfo::default(); - lock_info.set_key(k.to_vec()); - lock_info.set_primary_lock(k.to_vec()); - lock_info.set_lock_type(Op::Put); - lock_info.set_lock_version(ts); - lock_info - }) - .collect(); - let lock_kvs: Vec<_> = locks - .iter() - .map(|lock| lock_info_to_kv(lock.clone())) - .map(|(k, v)| (keys::data_key(&k), v)) - .collect(); - - let (c, coprocessor_host) = new_test_collector(); - start_collecting(&c, 100).unwrap(); - - // Apply plain file to other CFs. Nothing happens. - coprocessor_host.post_apply_plain_kvs_from_snapshot( - &Region::default(), - CF_DEFAULT, - &lock_kvs, - ); - assert_eq!(get_collected_locks(&c, 100).unwrap(), (vec![], true)); - - // Apply plain file to lock cf. Locks with ts before 100 will be collected. - let expected_locks: Vec<_> = locks - .iter() - .filter(|l| l.get_lock_version() <= 100) - .cloned() - .collect(); - coprocessor_host.post_apply_plain_kvs_from_snapshot(&Region::default(), CF_LOCK, &lock_kvs); - assert_eq!( - get_collected_locks(&c, 100).unwrap(), - (expected_locks.clone(), true) - ); - // Fetch result twice gets the same result. - assert_eq!( - get_collected_locks(&c, 100).unwrap(), - (expected_locks.clone(), true) - ); - - // When stale start_collecting request arrives, the previous collected results - // shouldn't be dropped. 
- start_collecting(&c, 100).unwrap(); - assert_eq!( - get_collected_locks(&c, 100).unwrap(), - (expected_locks.clone(), true) - ); - start_collecting(&c, 90).unwrap_err(); - assert_eq!( - get_collected_locks(&c, 100).unwrap(), - (expected_locks, true) - ); - - // When start_collecting is double-invoked again with larger ts, the previous - // results are dropped. - start_collecting(&c, 110).unwrap(); - assert_eq!(get_collected_locks(&c, 110).unwrap(), (vec![], true)); - coprocessor_host.post_apply_plain_kvs_from_snapshot(&Region::default(), CF_LOCK, &lock_kvs); - assert_eq!(get_collected_locks(&c, 110).unwrap(), (locks.clone(), true)); - - // Apply SST file to other cfs. Nothing happens. - coprocessor_host.post_apply_sst_from_snapshot(&Region::default(), CF_DEFAULT, ""); - assert_eq!(get_collected_locks(&c, 110).unwrap(), (locks.clone(), true)); - - // Apply SST file to lock cf is not supported. This will cause error and - // therefore `is_clean` will be set to false. - coprocessor_host.post_apply_sst_from_snapshot(&Region::default(), CF_LOCK, ""); - assert_eq!(get_collected_locks(&c, 110).unwrap(), (locks, false)); - } - - #[test] - fn test_not_clean() { - let (c, coprocessor_host) = new_test_collector(); - start_collecting(&c, 1).unwrap(); - // When error happens, `is_clean` should be set to false. - // The value is not a valid lock. - let (k, v) = (Key::from_raw(b"k1").into_encoded(), b"v1".to_vec()); - let req = make_apply_request(k.clone(), v.clone(), CF_LOCK, CmdType::Put); - coprocessor_host.post_apply(&Region::default(), &make_raft_cmd(vec![req])); - assert_eq!(get_collected_locks(&c, 1).unwrap(), (vec![], false)); - - // `is_clean` should be reset after invoking `start_collecting`. 
- start_collecting(&c, 2).unwrap(); - assert_eq!(get_collected_locks(&c, 2).unwrap(), (vec![], true)); - coprocessor_host.post_apply_plain_kvs_from_snapshot( - &Region::default(), - CF_LOCK, - &[(keys::data_key(&k), v)], - ); - assert_eq!(get_collected_locks(&c, 2).unwrap(), (vec![], false)); - - start_collecting(&c, 3).unwrap(); - assert_eq!(get_collected_locks(&c, 3).unwrap(), (vec![], true)); - - // If there are too many locks, `is_clean` should be set to false. - let mut lock = LockInfo::default(); - lock.set_key(b"k2".to_vec()); - lock.set_primary_lock(b"k2".to_vec()); - lock.set_lock_type(Op::Put); - lock.set_lock_version(1); - - let batch_generate_locks = |count| { - let (k, v) = lock_info_to_kv(lock.clone()); - let req = make_apply_request(k, v, CF_LOCK, CmdType::Put); - let raft_cmd = make_raft_cmd(vec![req; count]); - coprocessor_host.post_apply(&Region::default(), &raft_cmd); - }; - - batch_generate_locks(MAX_COLLECT_SIZE - 1); - let (locks, is_clean) = get_collected_locks(&c, 3).unwrap(); - assert_eq!(locks.len(), MAX_COLLECT_SIZE - 1); - assert!(is_clean); - - batch_generate_locks(1); - let (locks, is_clean) = get_collected_locks(&c, 3).unwrap(); - assert_eq!(locks.len(), MAX_COLLECT_SIZE); - assert!(!is_clean); - - batch_generate_locks(1); - // If there are more locks, they will be dropped. 
- let (locks, is_clean) = get_collected_locks(&c, 3).unwrap(); - assert_eq!(locks.len(), MAX_COLLECT_SIZE); - assert!(!is_clean); - - start_collecting(&c, 4).unwrap(); - assert_eq!(get_collected_locks(&c, 4).unwrap(), (vec![], true)); - - batch_generate_locks(MAX_COLLECT_SIZE - 5); - let (locks, is_clean) = get_collected_locks(&c, 4).unwrap(); - assert_eq!(locks.len(), MAX_COLLECT_SIZE - 5); - assert!(is_clean); - - batch_generate_locks(10); - let (locks, is_clean) = get_collected_locks(&c, 4).unwrap(); - assert_eq!(locks.len(), MAX_COLLECT_SIZE); - assert!(!is_clean); - } -} diff --git a/src/server/gc_worker/gc_manager.rs b/src/server/gc_worker/gc_manager.rs index b80c17e5ff9..4f528d8c356 100644 --- a/src/server/gc_worker/gc_manager.rs +++ b/src/server/gc_worker/gc_manager.rs @@ -197,7 +197,7 @@ fn set_status_metrics(state: GcManagerState) { ] { AUTO_GC_STATUS_GAUGE_VEC .with_label_values(&[s.tag()]) - .set(if state == *s { 1 } else { 0 }); + .set((state == *s) as i64); } } @@ -653,7 +653,6 @@ mod tests { } => callback, GcTask::GcKeys { .. } => unreachable!(), GcTask::RawGcKeys { .. } => unreachable!(), - GcTask::PhysicalScanLock { .. } => unreachable!(), GcTask::OrphanVersions { .. 
} => unreachable!(), GcTask::Validate(_) => unreachable!(), }; diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index 8e345f0909b..1ccac8860c6 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -22,16 +22,9 @@ use engine_traits::{ }; use file_system::{IoType, WithIoType}; use futures::executor::block_on; -use kvproto::{ - kvrpcpb::{Context, LockInfo}, - metapb::Region, -}; +use kvproto::{kvrpcpb::Context, metapb::Region}; use pd_client::{FeatureGate, PdClient}; -use raftstore::{ - coprocessor::{CoprocessorHost, RegionInfoProvider}, - router::RaftStoreRouter, - store::msg::StoreMsg, -}; +use raftstore::coprocessor::RegionInfoProvider; use tikv_kv::{CfStatistics, CursorBuilder, Modify, SnapContext}; use tikv_util::{ config::{Tracker, VersionTrack}, @@ -43,7 +36,6 @@ use tikv_util::{ use txn_types::{Key, TimeStamp}; use super::{ - applied_lock_collector::{AppliedLockCollector, Callback as LockCollectorCallback}, check_need_gc, compaction_filter::{ CompactionFilterInitializer, GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED, @@ -115,14 +107,6 @@ where callback: Callback<()>, region_info_provider: Arc, }, - PhysicalScanLock { - ctx: Context, - max_ts: TimeStamp, - start_key: Key, - limit: usize, - callback: Callback>, - region_info_provider: Arc, - }, /// If GC in compaction filter is enabled, versions on default CF will be /// handled with `DB::delete` in write CF's compaction filter. However if /// the compaction filter finds the DB is stalled, it will send the task @@ -149,7 +133,6 @@ where GcTask::GcKeys { .. } => GcCommandKind::gc_keys, GcTask::RawGcKeys { .. } => GcCommandKind::raw_gc_keys, GcTask::UnsafeDestroyRange { .. } => GcCommandKind::unsafe_destroy_range, - GcTask::PhysicalScanLock { .. } => GcCommandKind::physical_scan_lock, GcTask::OrphanVersions { .. 
} => GcCommandKind::orphan_versions, #[cfg(any(test, feature = "testexport"))] GcTask::Validate(_) => GcCommandKind::validate_config, @@ -179,10 +162,6 @@ where .field("start_key", &format!("{}", start_key)) .field("end_key", &format!("{}", end_key)) .finish(), - GcTask::PhysicalScanLock { max_ts, .. } => f - .debug_struct("PhysicalScanLock") - .field("max_ts", max_ts) - .finish(), GcTask::OrphanVersions { id, wb } => f .debug_struct("OrphanVersions") .field("id", id) @@ -195,15 +174,10 @@ where } /// Used to perform GC operations on the engine. -pub struct GcRunner -where - E: Engine, - RR: RaftStoreRouter, -{ +pub struct GcRunner { store_id: u64, engine: E, - raft_store_router: RR, flow_info_sender: Sender, /// Used to limit the write flow of GC. @@ -304,15 +278,10 @@ fn init_snap_ctx(store_id: u64, region: &Region) -> Context { ctx } -impl GcRunner -where - E: Engine, - RR: RaftStoreRouter, -{ +impl GcRunner { pub fn new( store_id: u64, engine: E, - raft_store_router: RR, flow_info_sender: Sender, cfg_tracker: Tracker, cfg: GcConfig, @@ -325,7 +294,6 @@ where Self { store_id, engine, - raft_store_router, flow_info_sender, limiter, cfg, @@ -492,7 +460,7 @@ where "versions" => gc_info.found_versions, ); } - if gc_info.deleted_versions as usize >= GC_LOG_DELETED_VERSION_THRESHOLD { + if gc_info.deleted_versions >= GC_LOG_DELETED_VERSION_THRESHOLD { debug!( "GC deleted plenty versions for a key"; "key" => %key, @@ -818,15 +786,10 @@ where .send(FlowInfo::AfterUnsafeDestroyRange(ctx.region_id)) .unwrap(); - self.raft_store_router - .send_store_msg(StoreMsg::ClearRegionSizeInRange { - start_key: start_key.as_encoded().to_vec(), - end_key: end_key.as_encoded().to_vec(), - }) - .unwrap_or_else(|e| { - // Warn and ignore it. 
- warn!("unsafe destroy range: failed sending ClearRegionSizeInRange"; "err" => ?e); - }); + self.engine.hint_change_in_range( + start_key.as_encoded().to_vec(), + end_key.as_encoded().to_vec(), + ); } else { let cfs = &[CF_LOCK, CF_DEFAULT, CF_WRITE]; let keys = vec![start_key.clone(), end_key.clone()]; @@ -870,47 +833,6 @@ where Ok(()) } - fn handle_physical_scan_lock( - &mut self, - _: &Context, - max_ts: TimeStamp, - start_key: &Key, - limit: usize, - regions_provider: Arc, - ) -> Result> { - let store_id = self.store_id; - let regions = box_try!(regions_provider.get_regions_in_range(start_key.as_encoded(), &[])) - .into_iter() - .filter(move |r| find_peer(r, store_id).is_some()); - - let mut first_round = true; - let mut locks = Vec::new(); - for region in regions { - let start_key = { - if first_round { - first_round = false; - start_key.clone() - } else { - Key::from_raw(region.get_start_key()) - } - }; - let snap = self.get_snapshot(store_id, ®ion)?; - let mut reader = MvccReader::new(snap, Some(ScanMode::Forward), false); - let (locks_this_region, _) = reader - .scan_locks(Some(&start_key), None, |l| l.ts <= max_ts, limit) - .map_err(TxnError::from_mvcc)?; - - locks.extend(locks_this_region); - } - - let mut lock_infos = Vec::with_capacity(locks.len()); - for (key, lock) in locks { - let raw_key = key.into_raw().map_err(TxnError::from_mvcc)?; - lock_infos.push(lock.into_lock_info(raw_key)); - } - Ok(lock_infos) - } - fn update_statistics_metrics(&mut self, key_mode: GcKeyMode) { if let Some(mut_stats) = self.stats_map.get_mut(&key_mode) { let stats = mem::take(mut_stats); @@ -951,11 +873,7 @@ where } } -impl Runnable for GcRunner -where - E: Engine, - RR: RaftStoreRouter, -{ +impl Runnable for GcRunner { type Task = GcTask; #[inline] @@ -1064,31 +982,6 @@ where end_key ); } - GcTask::PhysicalScanLock { - ctx, - max_ts, - start_key, - limit, - callback, - region_info_provider, - } => { - let res = self.handle_physical_scan_lock( - &ctx, - max_ts, - 
&start_key, - limit, - region_info_provider, - ); - update_metrics(res.is_err()); - callback(res); - slow_log!( - T timer, - "PhysicalScanLock start_key {:?}, max_ts {}, limit {}", - start_key, - max_ts, - limit, - ); - } GcTask::OrphanVersions { mut wb, id } => { info!("handling GcTask::OrphanVersions"; "id" => id); let mut wopts = WriteOptions::default(); @@ -1121,9 +1014,6 @@ fn handle_gc_task_schedule_error(e: ScheduleError>) -> Res GcTask::Gc { callback, .. } | GcTask::UnsafeDestroyRange { callback, .. } => { callback(Err(Error::from(ErrorInner::GcWorkerTooBusy))) } - GcTask::PhysicalScanLock { callback, .. } => { - callback(Err(Error::from(ErrorInner::GcWorkerTooBusy))) - } // Attention: If you are adding a new GcTask, do not forget to call the callback if it has a // callback. GcTask::GcKeys { .. } | GcTask::RawGcKeys { .. } | GcTask::OrphanVersions { .. } => {} @@ -1162,16 +1052,12 @@ pub fn sync_gc( } /// Used to schedule GC operations. -pub struct GcWorker +pub struct GcWorker where E: Engine, - RR: RaftStoreRouter + 'static, { engine: E, - /// `raft_store_router` is useful to signal raftstore clean region size - /// informations. - raft_store_router: RR, /// Used to signal unsafe destroy range is executed. 
flow_info_sender: Option>, region_info_provider: Arc, @@ -1184,30 +1070,22 @@ where worker: Arc>>>, worker_scheduler: Scheduler>, - applied_lock_collector: Option>, - gc_manager_handle: Arc>>, feature_gate: FeatureGate, } -impl Clone for GcWorker -where - E: Engine, - RR: RaftStoreRouter, -{ +impl Clone for GcWorker { #[inline] fn clone(&self) -> Self { self.refs.fetch_add(1, Ordering::SeqCst); Self { engine: self.engine.clone(), - raft_store_router: self.raft_store_router.clone(), flow_info_sender: self.flow_info_sender.clone(), config_manager: self.config_manager.clone(), refs: self.refs.clone(), worker: self.worker.clone(), worker_scheduler: self.worker_scheduler.clone(), - applied_lock_collector: self.applied_lock_collector.clone(), gc_manager_handle: self.gc_manager_handle.clone(), feature_gate: self.feature_gate.clone(), region_info_provider: self.region_info_provider.clone(), @@ -1215,11 +1093,7 @@ where } } -impl Drop for GcWorker -where - E: Engine, - RR: RaftStoreRouter + 'static, -{ +impl Drop for GcWorker { #[inline] fn drop(&mut self) { let refs = self.refs.fetch_sub(1, Ordering::SeqCst); @@ -1235,31 +1109,24 @@ where } } -impl GcWorker -where - E: Engine, - RR: RaftStoreRouter, -{ +impl GcWorker { pub fn new( engine: E, - raft_store_router: RR, flow_info_sender: Sender, cfg: GcConfig, feature_gate: FeatureGate, region_info_provider: Arc, - ) -> GcWorker { + ) -> Self { let worker_builder = WorkerBuilder::new("gc-worker").pending_capacity(GC_MAX_PENDING_TASKS); let worker = worker_builder.create().lazy_build("gc-worker"); let worker_scheduler = worker.scheduler(); GcWorker { engine, - raft_store_router, flow_info_sender: Some(flow_info_sender), config_manager: GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg))), refs: Arc::new(AtomicUsize::new(1)), worker: Arc::new(Mutex::new(worker)), worker_scheduler, - applied_lock_collector: None, gc_manager_handle: Arc::new(Mutex::new(None)), feature_gate, region_info_provider, @@ -1305,7 +1172,6 @@ where let 
runner = GcRunner::new( store_id, self.engine.clone(), - self.raft_store_router.clone(), self.flow_info_sender.take().unwrap(), self.config_manager.0.clone().tracker("gc-woker".to_owned()), self.config_manager.value().clone(), @@ -1314,20 +1180,6 @@ where Ok(()) } - pub fn start_observe_lock_apply( - &mut self, - coprocessor_host: &mut CoprocessorHost, - concurrency_manager: ConcurrencyManager, - ) -> Result<()> { - assert!(self.applied_lock_collector.is_none()); - let collector = Arc::new(AppliedLockCollector::new( - coprocessor_host, - concurrency_manager, - )?); - self.applied_lock_collector = Some(collector); - Ok(()) - } - pub fn stop(&self) -> Result<()> { // Stop GcManager. if let Some(h) = self.gc_manager_handle.lock().unwrap().take() { @@ -1387,61 +1239,6 @@ where pub fn get_config_manager(&self) -> GcWorkerConfigManager { self.config_manager.clone() } - - pub fn physical_scan_lock( - &self, - ctx: Context, - max_ts: TimeStamp, - start_key: Key, - limit: usize, - callback: Callback>, - ) -> Result<()> { - GC_COMMAND_COUNTER_VEC_STATIC.physical_scan_lock.inc(); - - self.worker_scheduler - .schedule(GcTask::PhysicalScanLock { - ctx, - max_ts, - start_key, - limit, - callback, - region_info_provider: self.region_info_provider.clone(), - }) - .or_else(handle_gc_task_schedule_error) - } - - pub fn start_collecting( - &self, - max_ts: TimeStamp, - callback: LockCollectorCallback<()>, - ) -> Result<()> { - self.applied_lock_collector - .as_ref() - .ok_or_else(|| box_err!("applied_lock_collector not supported")) - .and_then(move |c| c.start_collecting(max_ts, callback)) - } - - pub fn get_collected_locks( - &self, - max_ts: TimeStamp, - callback: LockCollectorCallback<(Vec, bool)>, - ) -> Result<()> { - self.applied_lock_collector - .as_ref() - .ok_or_else(|| box_err!("applied_lock_collector not supported")) - .and_then(move |c| c.get_collected_locks(max_ts, callback)) - } - - pub fn stop_collecting( - &self, - max_ts: TimeStamp, - callback: 
LockCollectorCallback<()>, - ) -> Result<()> { - self.applied_lock_collector - .as_ref() - .ok_or_else(|| box_err!("applied_lock_collector not supported")) - .and_then(move |c| c.stop_collecting(max_ts, callback)) - } } #[cfg(any(test, feature = "testexport"))] @@ -1450,21 +1247,19 @@ pub mod test_gc_worker { use collections::HashMap; use engine_rocks::{RocksEngine, RocksSnapshot}; + use futures::Future; use kvproto::{ kvrpcpb::Context, metapb::{Peer, Region}, }; use raftstore::store::RegionSnapshot; - use tikv_kv::write_modifies; + use tikv_kv::{write_modifies, OnAppliedCb}; use txn_types::{Key, TimeStamp}; use crate::{ server::gc_worker::{GcSafePointProvider, Result as GcWorkerResult}, storage::{ - kv::{ - self, Callback as EngineCallback, Modify, Result as EngineResult, SnapContext, - WriteData, - }, + kv::{self, Modify, Result as EngineResult, SnapContext, WriteData}, Engine, }, }; @@ -1517,12 +1312,14 @@ pub mod test_gc_worker { write_modifies(&self.kv_engine().unwrap(), modifies) } + type WriteRes = ::WriteRes; fn async_write( &self, ctx: &Context, mut batch: WriteData, - callback: EngineCallback<()>, - ) -> EngineResult<()> { + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes { batch.modifies.iter_mut().for_each(|modify| match modify { Modify::Delete(_, ref mut key) => { *key = Key::from_encoded(keys::data_key(key.as_encoded())); @@ -1538,25 +1335,19 @@ pub mod test_gc_worker { *end_key = Key::from_encoded(keys::data_end_key(end_key.as_encoded())); } }); - self.0.async_write(ctx, batch, callback) + self.0.async_write(ctx, batch, subscribed, on_applied) } - fn async_snapshot( - &mut self, - ctx: SnapContext<'_>, - callback: EngineCallback, - ) -> EngineResult<()> { - self.0.async_snapshot( - ctx, - Box::new(move |r| { - callback(r.map(|snap| { - let mut region = Region::default(); - // Add a peer to pass initialized check. 
- region.mut_peers().push(Peer::default()); - RegionSnapshot::from_snapshot(snap, Arc::new(region)) - })) - }), - ) + type SnapshotRes = impl Future> + Send; + fn async_snapshot(&mut self, ctx: SnapContext<'_>) -> Self::SnapshotRes { + let f = self.0.async_snapshot(ctx); + async move { + let snap = f.await?; + let mut region = Region::default(); + // Add a peer to pass initialized check. + region.mut_peers().push(Peer::default()); + Ok(RegionSnapshot::from_snapshot(snap, Arc::new(region))) + } } } @@ -1595,27 +1386,27 @@ pub mod test_gc_worker { Ok(()) } + type WriteRes = ::WriteRes; fn async_write( &self, ctx: &Context, batch: WriteData, - callback: EngineCallback<()>, - ) -> EngineResult<()> { - self.engines.lock().unwrap()[&ctx.region_id].async_write(ctx, batch, callback) + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes { + self.engines.lock().unwrap()[&ctx.region_id] + .async_write(ctx, batch, subscribed, on_applied) } - fn async_snapshot( - &mut self, - ctx: SnapContext<'_>, - callback: EngineCallback, - ) -> EngineResult<()> { + type SnapshotRes = impl Future> + Send; + fn async_snapshot(&mut self, ctx: SnapContext<'_>) -> Self::SnapshotRes { let region_id = ctx.pb_ctx.region_id; self.engines .lock() .unwrap() .get_mut(®ion_id) .unwrap() - .async_snapshot(ctx, callback) + .async_snapshot(ctx) } } } @@ -1626,7 +1417,7 @@ mod tests { use std::{ collections::{BTreeMap, BTreeSet}, path::Path, - sync::mpsc::{self, channel}, + sync::mpsc, thread, time::Duration, }; @@ -1635,23 +1426,15 @@ mod tests { use engine_rocks::{util::get_cf_handle, RocksEngine}; use engine_traits::Peekable as _; use futures::executor::block_on; - use kvproto::{ - kvrpcpb::{ApiVersion, Op}, - metapb::Peer, - }; + use kvproto::{kvrpcpb::ApiVersion, metapb::Peer}; use raft::StateRole; - use raftstore::{ - coprocessor::{ - region_info_accessor::{MockRegionInfoProvider, RegionInfoAccessor}, - RegionChangeEvent, - }, - router::RaftStoreBlackHole, + use raftstore::coprocessor::{ + 
region_info_accessor::{MockRegionInfoProvider, RegionInfoAccessor}, + CoprocessorHost, RegionChangeEvent, }; use tempfile::Builder; use tikv_kv::Snapshot; - use tikv_util::{ - codec::number::NumberEncoder, future::paired_future_callback, store::new_peer, - }; + use tikv_util::store::new_peer; use txn_types::Mutation; use super::{test_gc_worker::MultiRocksEngine, *}; @@ -1794,7 +1577,6 @@ mod tests { let mut gc_worker = GcWorker::new( engine, - RaftStoreBlackHole, tx, GcConfig::default(), gate, @@ -1955,88 +1737,6 @@ mod tests { .unwrap(); } - #[test] - fn test_physical_scan_lock() { - let store_id = 1; - let engine = TestEngineBuilder::new().build().unwrap(); - let prefixed_engine = PrefixedEngine(engine); - let storage = TestStorageBuilderApiV1::from_engine_and_lock_mgr( - prefixed_engine.clone(), - MockLockManager::new(), - ) - .build() - .unwrap(); - let (tx, _rx) = mpsc::channel(); - let mut region = Region::default(); - region.mut_peers().push(new_peer(store_id, 0)); - let mut gc_worker = GcWorker::new( - prefixed_engine, - RaftStoreBlackHole, - tx, - GcConfig::default(), - FeatureGate::default(), - Arc::new(MockRegionInfoProvider::new(vec![region])), - ); - gc_worker.start(store_id).unwrap(); - - let physical_scan_lock = |max_ts: u64, start_key, limit| { - let (cb, f) = paired_future_callback(); - gc_worker - .physical_scan_lock(Context::default(), max_ts.into(), start_key, limit, cb) - .unwrap(); - block_on(f).unwrap() - }; - - let mut expected_lock_info = Vec::new(); - - // Put locks into the storage. - for i in 0..50 { - let mut k = vec![]; - k.encode_u64(i).unwrap(); - let v = k.clone(); - - let mutation = Mutation::make_put(Key::from_raw(&k), v); - - let lock_ts = 10 + i % 3; - - // Collect all locks with ts <= 11 to check the result of physical_scan_lock. 
- if lock_ts <= 11 { - let mut info = LockInfo::default(); - info.set_primary_lock(k.clone()); - info.set_lock_version(lock_ts); - info.set_key(k.clone()); - info.set_lock_type(Op::Put); - expected_lock_info.push(info) - } - - let (tx, rx) = channel(); - storage - .sched_txn_command( - commands::Prewrite::with_defaults(vec![mutation], k, lock_ts.into()), - Box::new(move |res| tx.send(res).unwrap()), - ) - .unwrap(); - rx.recv() - .unwrap() - .unwrap() - .locks - .into_iter() - .for_each(|r| r.unwrap()); - } - - let res = physical_scan_lock(11, Key::from_raw(b""), 50).unwrap(); - assert_eq!(res, expected_lock_info); - - let res = physical_scan_lock(11, Key::from_raw(b""), 5).unwrap(); - assert_eq!(res[..], expected_lock_info[..5]); - - let mut start_key = vec![]; - start_key.encode_u64(4).unwrap(); - let res = physical_scan_lock(11, Key::from_raw(&start_key), 6).unwrap(); - // expected_locks[3] is the key 4. - assert_eq!(res[..], expected_lock_info[3..9]); - } - #[test] fn test_gc_keys_with_region_info_provider() { let store_id = 1; @@ -2053,7 +1753,6 @@ mod tests { let mut gc_worker = GcWorker::new( prefixed_engine.clone(), - RaftStoreBlackHole, tx, GcConfig::default(), feature_gate, @@ -2145,7 +1844,6 @@ mod tests { let mut runner = GcRunner::new( store_id, prefixed_engine.clone(), - RaftStoreBlackHole, tx, GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) .0 @@ -2208,7 +1906,6 @@ mod tests { let mut runner = GcRunner::new( store_id, prefixed_engine.clone(), - RaftStoreBlackHole, tx, GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) .0 @@ -2310,7 +2007,6 @@ mod tests { let mut runner = GcRunner::new( 1, prefixed_engine.clone(), - RaftStoreBlackHole, tx, GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) .0 @@ -2439,7 +2135,6 @@ mod tests { let mut gc_worker = GcWorker::new( engine.clone(), - RaftStoreBlackHole, tx, GcConfig::default(), gate, @@ -2569,7 +2264,7 @@ mod tests { ) -> ( MultiRocksEngine, Arc, - GcRunner, + 
GcRunner, Vec, mpsc::Receiver, ) { @@ -2625,7 +2320,6 @@ mod tests { let gc_runner = GcRunner::new( store_id, engine.clone(), - RaftStoreBlackHole, tx, GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) .0 @@ -2804,7 +2498,6 @@ mod tests { let mut gc_runner = GcRunner::new( store_id, engine.clone(), - RaftStoreBlackHole, tx, GcWorkerConfigManager(Arc::new(VersionTrack::new(cfg.clone()))) .0 diff --git a/src/server/gc_worker/mod.rs b/src/server/gc_worker/mod.rs index 5b43b9b4be3..a5b8837cd2e 100644 --- a/src/server/gc_worker/mod.rs +++ b/src/server/gc_worker/mod.rs @@ -1,6 +1,5 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. -mod applied_lock_collector; pub mod compaction_filter; mod config; mod gc_manager; diff --git a/src/server/lock_manager/mod.rs b/src/server/lock_manager/mod.rs index ae60467124b..243d533a0e5 100644 --- a/src/server/lock_manager/mod.rs +++ b/src/server/lock_manager/mod.rs @@ -37,10 +37,10 @@ use crate::{ }, storage::{ lock_manager::{ - DiagnosticContext, KeyLockWaitInfo, LockManager as LockManagerTrait, LockWaitToken, - UpdateWaitForEvent, WaitTimeout, + CancellationCallback, DiagnosticContext, KeyLockWaitInfo, + LockManager as LockManagerTrait, LockWaitToken, UpdateWaitForEvent, WaitTimeout, }, - DynamicConfigs as StorageDynamicConfigs, Error as StorageError, + DynamicConfigs as StorageDynamicConfigs, }, }; @@ -248,7 +248,7 @@ impl LockManagerTrait for LockManager { wait_info: KeyLockWaitInfo, is_first_lock: bool, timeout: Option, - cancel_callback: Box, + cancel_callback: CancellationCallback, diag_ctx: DiagnosticContext, ) { let timeout = match timeout { diff --git a/src/server/lock_manager/waiter_manager.rs b/src/server/lock_manager/waiter_manager.rs index 33164833fba..467580645d3 100644 --- a/src/server/lock_manager/waiter_manager.rs +++ b/src/server/lock_manager/waiter_manager.rs @@ -27,13 +27,12 @@ use tikv_util::{ }; use tokio::task::spawn_local; use tracker::GLOBAL_TRACKERS; -use txn_types::Key; use 
super::{config::Config, deadlock::Scheduler as DetectorScheduler, metrics::*}; use crate::storage::{ lock_manager::{ - DiagnosticContext, KeyLockWaitInfo, LockDigest, LockWaitToken, UpdateWaitForEvent, - WaitTimeout, + CancellationCallback, DiagnosticContext, KeyLockWaitInfo, LockDigest, LockWaitToken, + UpdateWaitForEvent, WaitTimeout, }, mvcc::{Error as MvccError, ErrorInner as MvccErrorInner, TimeStamp}, txn::Error as TxnError, @@ -107,9 +106,6 @@ pub type Callback = Box) + Send>; #[allow(clippy::large_enum_variant)] pub enum Task { - SetKeyWakeUpDelayCallback { - cb: Box, - }, WaitFor { token: LockWaitToken, region_id: u64, @@ -119,7 +115,7 @@ pub enum Task { start_ts: TimeStamp, wait_info: KeyLockWaitInfo, timeout: WaitTimeout, - cancel_callback: Box, + cancel_callback: CancellationCallback, diag_ctx: DiagnosticContext, start_waiting_time: Instant, }, @@ -158,9 +154,6 @@ impl Debug for Task { impl Display for Task { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { - Task::SetKeyWakeUpDelayCallback { .. 
} => { - write!(f, "setting key wake up delay callback") - } Task::WaitFor { token, start_ts, @@ -206,7 +199,7 @@ pub(crate) struct Waiter { // term: u64, pub(crate) start_ts: TimeStamp, pub(crate) wait_info: KeyLockWaitInfo, - pub(crate) cancel_callback: Box, + pub(crate) cancel_callback: CancellationCallback, pub diag_ctx: DiagnosticContext, delay: Delay, start_waiting_time: Instant, @@ -219,7 +212,7 @@ impl Waiter { _term: u64, start_ts: TimeStamp, wait_info: KeyLockWaitInfo, - cancel_callback: Box, + cancel_callback: CancellationCallback, deadline: Instant, diag_ctx: DiagnosticContext, start_waiting_time: Instant, @@ -280,7 +273,7 @@ impl Waiter { pub(super) fn cancel_no_timeout( wait_info: KeyLockWaitInfo, - cancel_callback: Box, + cancel_callback: CancellationCallback, ) { let lock_info = wait_info.lock_info; let error = MvccError::from(MvccErrorInner::KeyIsLocked(lock_info)); @@ -311,8 +304,6 @@ struct WaitTable { wait_table: HashMap<(u64, TimeStamp), LockWaitToken>, waiter_pool: HashMap, waiter_count: Arc, - - wake_up_key_delay_callback: Option>, } impl WaitTable { @@ -321,17 +312,9 @@ impl WaitTable { wait_table: HashMap::default(), waiter_pool: HashMap::default(), waiter_count, - wake_up_key_delay_callback: None, } } - fn set_wake_up_key_delay_callback( - &mut self, - cb: Option>, - ) { - self.wake_up_key_delay_callback = cb; - } - #[cfg(test)] fn count(&self) -> usize { self.waiter_pool.len() @@ -384,8 +367,8 @@ impl WaitTable { fn to_wait_for_entries(&self) -> Vec { self.waiter_pool - .iter() - .map(|(_, waiter)| { + .values() + .map(|waiter| { let mut wait_for_entry = WaitForEntry::default(); wait_for_entry.set_txn(waiter.start_ts.into_inner()); wait_for_entry.set_wait_for_txn(waiter.wait_info.lock_digest.ts.into_inner()); @@ -430,7 +413,7 @@ impl Scheduler { start_ts: TimeStamp, wait_info: KeyLockWaitInfo, timeout: WaitTimeout, - cancel_callback: Box, + cancel_callback: CancellationCallback, diag_ctx: DiagnosticContext, ) { 
self.notify_scheduler(Task::WaitFor { @@ -447,13 +430,6 @@ impl Scheduler { }); } - pub fn set_key_wake_up_delay_callback( - &self, - cb: Box, - ) { - self.notify_scheduler(Task::SetKeyWakeUpDelayCallback { cb }); - } - pub fn remove_lock_wait(&self, token: LockWaitToken) { self.notify_scheduler(Task::RemoveLockWait { token }); } @@ -610,11 +586,6 @@ impl WaiterManager { impl FutureRunnable for WaiterManager { fn run(&mut self, task: Task) { match task { - Task::SetKeyWakeUpDelayCallback { cb } => { - self.wait_table - .borrow_mut() - .set_wake_up_key_delay_callback(Some(cb)); - } Task::WaitFor { token, region_id, diff --git a/src/server/metrics.rs b/src/server/metrics.rs index 3e07a75899f..23f8256835b 100644 --- a/src/server/metrics.rs +++ b/src/server/metrics.rs @@ -51,7 +51,6 @@ make_auto_flush_static_metric! { raw_compare_and_swap, raw_checksum, unsafe_destroy_range, - physical_scan_lock, register_lock_observer, check_lock_observer, remove_lock_observer, @@ -71,7 +70,6 @@ make_auto_flush_static_metric! 
{ gc_keys, raw_gc_keys, unsafe_destroy_range, - physical_scan_lock, validate_config, orphan_versions, } diff --git a/src/server/mod.rs b/src/server/mod.rs index af1aa289de7..d926ca40b2a 100644 --- a/src/server/mod.rs +++ b/src/server/mod.rs @@ -20,6 +20,7 @@ pub mod server; pub mod service; pub mod snap; pub mod status_server; +pub mod tablet_snap; pub mod transport; pub mod ttl; diff --git a/src/server/node.rs b/src/server/node.rs index 65dd592b490..0b654921f59 100644 --- a/src/server/node.rs +++ b/src/server/node.rs @@ -357,7 +357,7 @@ where because found data key that is not written by TiDB: {:?}", ident.api_version, self.api_version, - log_wrappers::hex_encode_upper(&unexpected_data_key) + log_wrappers::hex_encode_upper(unexpected_data_key) )); } } diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index bc0e8a59303..fa12600bb98 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -1,10 +1,9 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - cmp, collections::VecDeque, ffi::CString, - marker::{PhantomData, Unpin}, + marker::Unpin, mem, pin::Pin, result, @@ -17,7 +16,6 @@ use std::{ use collections::{HashMap, HashSet}; use crossbeam::queue::ArrayQueue; -use engine_traits::KvEngine; use futures::{ channel::oneshot, compat::Future01CompatExt, @@ -27,16 +25,18 @@ use futures::{ }; use futures_timer::Delay; use grpcio::{ - ChannelBuilder, ClientCStreamReceiver, ClientCStreamSender, Environment, RpcStatusCode, - WriteFlags, + Channel, ChannelBuilder, ClientCStreamReceiver, ClientCStreamSender, Environment, + RpcStatusCode, WriteFlags, }; use kvproto::{ - raft_serverpb::{Done, RaftMessage}, + raft_serverpb::{Done, RaftMessage, RaftSnapshotData}, tikvpb::{BatchRaftMessage, TikvClient}, }; +use protobuf::Message; use raft::SnapshotStatus; -use raftstore::{errors::DiscardReason, router::RaftStoreRouter}; +use raftstore::errors::DiscardReason; use security::SecurityManager; +use tikv_kv::RaftExtension; use tikv_util::{ config::{Tracker, VersionTrack}, lru::LruCache, @@ -346,18 +346,16 @@ impl Buffer for MessageBuffer { } /// Reporter reports whether a snapshot is sent successfully. 
-struct SnapshotReporter { - raft_router: T, - engine: PhantomData, +struct SnapshotReporter { + raft_router: R, region_id: u64, to_peer_id: u64, to_store_id: u64, } -impl SnapshotReporter +impl SnapshotReporter where - T: RaftStoreRouter + 'static, - E: KvEngine, + R: RaftExtension + 'static, { pub fn report(&self, status: SnapshotStatus) { debug!( @@ -374,43 +372,21 @@ where .inc(); } - if let Err(e) = - self.raft_router - .report_snapshot_status(self.region_id, self.to_peer_id, status) - { - error!(?e; - "report snapshot to peer failes"; - "to_peer_id" => self.to_peer_id, - "to_store_id" => self.to_store_id, - "region_id" => self.region_id, - ); - } + self.raft_router + .report_snapshot_status(self.region_id, self.to_peer_id, status); } } -fn report_unreachable(router: &R, msg: &RaftMessage) -where - R: RaftStoreRouter, - E: KvEngine, -{ +fn report_unreachable(router: &impl RaftExtension, msg: &RaftMessage) { let to_peer = msg.get_to_peer(); if msg.get_message().has_snapshot() { let store = to_peer.store_id.to_string(); REPORT_FAILURE_MSG_COUNTER .with_label_values(&["snapshot", &*store]) .inc(); - let res = router.report_snapshot_status(msg.region_id, to_peer.id, SnapshotStatus::Failure); - if let Err(e) = res { - error!( - ?e; - "reporting snapshot to peer fails"; - "to_peer_id" => to_peer.id, - "to_store_id" => to_peer.store_id, - "region_id" => msg.region_id, - ); - } + router.report_snapshot_status(msg.region_id, to_peer.id, SnapshotStatus::Failure); } - let _ = router.report_unreachable(msg.region_id, to_peer.id); + router.report_peer_unreachable(msg.region_id, to_peer.id); } fn grpc_error_is_unimplemented(e: &grpcio::Error) -> bool { @@ -422,7 +398,7 @@ fn grpc_error_is_unimplemented(e: &grpcio::Error) -> bool { } /// Struct tracks the lifetime of a `raft` or `batch_raft` RPC. 
-struct AsyncRaftSender { +struct AsyncRaftSender { sender: ClientCStreamSender, queue: Arc, buffer: B, @@ -430,23 +406,20 @@ struct AsyncRaftSender { snap_scheduler: Scheduler, addr: String, flush_timeout: Option, - _engine: PhantomData, } -impl AsyncRaftSender +impl AsyncRaftSender where - R: RaftStoreRouter + 'static, + R: RaftExtension + 'static, B: Buffer, - E: KvEngine, { - fn new_snapshot_reporter(&self, msg: &RaftMessage) -> SnapshotReporter { + fn new_snapshot_reporter(&self, msg: &RaftMessage) -> SnapshotReporter { let region_id = msg.get_region_id(); let to_peer_id = msg.get_to_peer().get_id(); let to_store_id = msg.get_to_peer().get_store_id(); SnapshotReporter { raft_router: self.router.clone(), - engine: PhantomData, region_id, to_peer_id, to_store_id, @@ -484,20 +457,25 @@ where None => return, }; if msg.get_message().has_snapshot() { - self.send_snapshot_sock(msg); - continue; - } else { - self.buffer.push(msg); + let mut snapshot = RaftSnapshotData::default(); + snapshot + .merge_from_bytes(msg.get_message().get_snapshot().get_data()) + .unwrap(); + // Witness's snapshot must be empty, no need to send snapshot files + if !snapshot.get_meta().get_for_witness() { + self.send_snapshot_sock(msg); + continue; + } } + self.buffer.push(msg); } } } -impl Future for AsyncRaftSender +impl Future for AsyncRaftSender where - R: RaftStoreRouter + Unpin + 'static, + R: RaftExtension + Unpin + 'static, B: Buffer + Unpin, - E: KvEngine, { type Output = grpcio::Result<()>; @@ -550,42 +528,51 @@ where } } -struct RaftCall { - sender: AsyncRaftSender, +#[derive(PartialEq)] +enum RaftCallRes { + // the call is not supported, probably due to visiting to older version TiKV + Fallback, + // the connection is aborted or closed + Disconnected, +} + +struct RaftCall { + sender: AsyncRaftSender, receiver: ClientCStreamReceiver, - lifetime: Option>, + lifetime: Option>, store_id: u64, } -impl RaftCall +impl RaftCall where - R: RaftStoreRouter + Unpin + 'static, + R: 
RaftExtension + Unpin + 'static, B: Buffer + Unpin, - E: KvEngine, { - fn clean_up(&mut self, sink_err: Option, recv_err: Option) { - error!("connection aborted"; "store_id" => self.store_id, "sink_error" => ?sink_err, "receiver_err" => ?recv_err, "addr" => %self.sender.addr); + async fn poll(&mut self) { + let res = futures::join!(&mut self.sender, &mut self.receiver); + if let (Ok(()), Ok(Done { .. })) = res { + info!("connection close"; "store_id" => self.store_id, "addr" => %self.sender.addr); + if let Some(tx) = self.lifetime.take() { + let _ = tx.send(RaftCallRes::Disconnected); + } + return; + } + let (sink_err, recv_err) = (res.0.err(), res.1.err()); + error!("connection aborted"; "store_id" => self.store_id, "sink_error" => ?sink_err, "receiver_err" => ?recv_err, "addr" => %self.sender.addr); if let Some(tx) = self.lifetime.take() { let should_fallback = [sink_err, recv_err] .iter() .any(|e| e.as_ref().map_or(false, grpc_error_is_unimplemented)); - if should_fallback { - // Asks backend to fallback. - let _ = tx.send(()); - return; - } - } - self.sender.router.broadcast_unreachable(self.store_id); - } - async fn poll(&mut self) { - let res = futures::join!(&mut self.sender, &mut self.receiver); - if let (Ok(()), Ok(Done { .. })) = res { - info!("connection close"; "store_id" => self.store_id, "addr" => %self.sender.addr); - return; + let res = if should_fallback { + // Asks backend to fallback. + RaftCallRes::Fallback + } else { + RaftCallRes::Disconnected + }; + let _ = tx.send(res); } - self.clean_up(res.0.err(), res.1.err()); } } @@ -624,18 +611,16 @@ impl ConnectionBuilder { /// StreamBackEnd watches lifetime of a connection and handles reconnecting, /// spawn new RPC. 
-struct StreamBackEnd { +struct StreamBackEnd { store_id: u64, queue: Arc, builder: ConnectionBuilder, - engine: PhantomData, } -impl StreamBackEnd +impl StreamBackEnd where S: StoreAddrResolver, - R: RaftStoreRouter + Unpin + 'static, - E: KvEngine, + R: RaftExtension + Unpin + 'static, { fn resolve(&self) -> impl Future> { let (tx, rx) = oneshot::channel(); @@ -686,7 +671,7 @@ where .inc_by(len as u64); } - fn connect(&self, addr: &str) -> TikvClient { + fn connect(&self, addr: &str) -> Channel { info!("server: new connection with tikv endpoint"; "addr" => addr, "store_id" => self.store_id); let cfg = self.builder.cfg.value(); @@ -697,16 +682,17 @@ where .default_compression_algorithm(cfg.grpc_compression_algorithm()) .default_gzip_compression_level(cfg.grpc_gzip_compression_level) .default_grpc_min_message_size_to_compress(cfg.grpc_min_message_size_to_compress) + .max_reconnect_backoff(cfg.raft_client_max_backoff.0) + .initial_reconnect_backoff(cfg.raft_client_initial_reconnect_backoff.0) // hack: so it's different args, grpc will always create a new connection. 
.raw_cfg_int( CString::new("random id").unwrap(), CONN_ID.fetch_add(1, Ordering::SeqCst), ); - let channel = self.builder.security_mgr.connect(cb, addr); - TikvClient::new(channel) + self.builder.security_mgr.connect(cb, addr) } - fn batch_call(&self, client: &TikvClient, addr: String) -> oneshot::Receiver<()> { + fn batch_call(&self, client: &TikvClient, addr: String) -> oneshot::Receiver { let (batch_sink, batch_stream) = client.batch_raft().unwrap(); let (tx, rx) = oneshot::channel(); let mut call = RaftCall { @@ -718,7 +704,6 @@ where snap_scheduler: self.builder.snap_scheduler.clone(), addr, flush_timeout: None, - _engine: PhantomData::, }, receiver: batch_stream, lifetime: Some(tx), @@ -731,7 +716,7 @@ where rx } - fn call(&self, client: &TikvClient, addr: String) -> oneshot::Receiver<()> { + fn call(&self, client: &TikvClient, addr: String) -> oneshot::Receiver { let (sink, stream) = client.raft().unwrap(); let (tx, rx) = oneshot::channel(); let mut call = RaftCall { @@ -743,7 +728,6 @@ where snap_scheduler: self.builder.snap_scheduler.clone(), addr, flush_timeout: None, - _engine: PhantomData::, }, receiver: stream, lifetime: Some(tx), @@ -756,22 +740,23 @@ where } } -async fn maybe_backoff(backoff: Duration, last_wake_time: &mut Instant, retry_times: &mut u32) { - if *retry_times == 0 { - return; - } - let timeout = backoff * cmp::min(*retry_times, 5); +async fn maybe_backoff(backoff: Duration, last_wake_time: &mut Option) { let now = Instant::now(); - if *last_wake_time + timeout < now { - // We have spent long enough time in last retry, no need to backoff again. - *last_wake_time = now; - *retry_times = 0; + if let Some(last) = *last_wake_time { + if last + backoff < now { + // We have spent long enough time in last retry, no need to backoff again. 
+ *last_wake_time = Some(now); + return; + } + } else { + *last_wake_time = Some(now); return; } - if let Err(e) = GLOBAL_TIMER_HANDLE.delay(now + timeout).compat().await { + + if let Err(e) = GLOBAL_TIMER_HANDLE.delay(now + backoff).compat().await { error_unknown!(?e; "failed to backoff"); } - *last_wake_time = Instant::now(); + *last_wake_time = Some(Instant::now()); } /// A future that drives the life cycle of a connection. @@ -784,21 +769,20 @@ async fn maybe_backoff(backoff: Duration, last_wake_time: &mut Instant, retry_ti /// 4. fallback to legacy API if incompatible /// /// Every failure during the process should trigger retry automatically. -async fn start( - back_end: StreamBackEnd, +async fn start( + back_end: StreamBackEnd, conn_id: usize, pool: Arc>, ) where S: StoreAddrResolver + Send, - R: RaftStoreRouter + Unpin + Send + 'static, - E: KvEngine, + R: RaftExtension + Unpin + Send + 'static, { - let mut last_wake_time = Instant::now(); - let mut retry_times = 0; - let backoff_duration = back_end.builder.cfg.value().raft_client_backoff_step.0; + let mut last_wake_time = None; + let mut first_time = true; + let backoff_duration = back_end.builder.cfg.value().raft_client_max_backoff.0; + let mut addr_channel = None; loop { - maybe_backoff(backoff_duration, &mut last_wake_time, &mut retry_times).await; - retry_times += 1; + maybe_backoff(backoff_duration, &mut last_wake_time).await; let f = back_end.resolve(); let addr = match f.await { Ok(addr) => { @@ -822,36 +806,65 @@ async fn start( continue; } }; - let client = back_end.connect(&addr); + + // reuse channel if the address is the same. 
+ if addr_channel + .as_ref() + .map_or(true, |(_, prev_addr)| prev_addr != &addr) + { + addr_channel = Some((back_end.connect(&addr), addr.clone())); + } + let channel = addr_channel.as_ref().unwrap().0.clone(); + + debug!("connecting to store"; "store_id" => back_end.store_id, "addr" => %addr); + if !channel.wait_for_connected(backoff_duration).await { + error!("wait connect timeout"; "store_id" => back_end.store_id, "addr" => addr); + + // Clears pending messages to avoid consuming high memory when one node is + // shutdown. + back_end.clear_pending_message("unreachable"); + + // broadcast is time consuming operation which would blocks raftstore, so report + // unreachable only once until being connected again. + if first_time { + first_time = false; + back_end + .builder + .router + .report_store_unreachable(back_end.store_id); + } + continue; + } else { + debug!("connection established"; "store_id" => back_end.store_id, "addr" => %addr); + } + + let client = TikvClient::new(channel); let f = back_end.batch_call(&client, addr.clone()); - let mut res = f.await; - if res == Ok(()) { - // If the call is setup successfully, it will never finish. Returning `Ok(())` - // means the batch_call is not supported, we are probably connect to - // an old version of TiKV. So we need to fallback to use legacy API. + let mut res = f.await; // block here until the stream call is closed or aborted. + if res == Ok(RaftCallRes::Fallback) { + // If the call is setup successfully, it will never finish. Returning + // `UnImplemented` means the batch_call is not supported, we are probably + // connect to an old version of TiKV. So we need to fallback to use + // legacy API. 
let f = back_end.call(&client, addr.clone()); res = f.await; } match res { - Ok(()) => { + Ok(RaftCallRes::Fallback) => { error!("connection fail"; "store_id" => back_end.store_id, "addr" => addr, "err" => "require fallback even with legacy API"); } - Err(_) => { + // Err(_) should be tx is dropped + Ok(RaftCallRes::Disconnected) | Err(_) => { error!("connection abort"; "store_id" => back_end.store_id, "addr" => addr); - if retry_times > 1 { - // Clears pending messages to avoid consuming high memory when one node is - // shutdown. - back_end.clear_pending_message("unreachable"); - } else { - // At least report failure in metrics. - REPORT_FAILURE_MSG_COUNTER - .with_label_values(&["unreachable", &back_end.store_id.to_string()]) - .inc_by(1); - } + REPORT_FAILURE_MSG_COUNTER + .with_label_values(&["unreachable", &back_end.store_id.to_string()]) + .inc_by(1); back_end .builder .router - .broadcast_unreachable(back_end.store_id); + .report_store_unreachable(back_end.store_id); + addr_channel = None; + first_time = false; } } } @@ -908,24 +921,22 @@ struct CachedQueue { /// } /// raft_client.flush(); /// ``` -pub struct RaftClient { +pub struct RaftClient { pool: Arc>, cache: LruCache<(u64, usize), CachedQueue>, need_flush: Vec<(u64, usize)>, full_stores: Vec<(u64, usize)>, future_pool: Arc>, builder: ConnectionBuilder, - engine: PhantomData, last_hash: (u64, u64), } -impl RaftClient +impl RaftClient where S: StoreAddrResolver + Send + 'static, - R: RaftStoreRouter + Unpin + Send + 'static, - E: KvEngine, + R: RaftExtension + Unpin + Send + 'static, { - pub fn new(builder: ConnectionBuilder) -> RaftClient { + pub fn new(builder: ConnectionBuilder) -> Self { let future_pool = Arc::new( yatp::Builder::new(thd_name!("raft-stream")) .max_thread_count(1) @@ -938,7 +949,6 @@ where full_stores: vec![], future_pool, builder, - engine: PhantomData::, last_hash: (0, 0), } } @@ -971,7 +981,6 @@ where store_id, queue: queue.clone(), builder: self.builder.clone(), - engine: 
PhantomData::, }; self.future_pool .spawn(start(back_end, conn_id, self.pool.clone())); @@ -1123,7 +1132,7 @@ where } } -impl Clone for RaftClient +impl Clone for RaftClient where S: Clone, R: Clone, @@ -1136,7 +1145,6 @@ where full_stores: vec![], future_pool: self.future_pool.clone(), builder: self.builder.clone(), - engine: PhantomData::, last_hash: (0, 0), } } diff --git a/src/server/raftkv.rs b/src/server/raftkv/mod.rs similarity index 63% rename from src/server/raftkv.rs rename to src/server/raftkv/mod.rs index eaa13995650..6c7169d043c 100644 --- a/src/server/raftkv.rs +++ b/src/server/raftkv/mod.rs @@ -1,29 +1,42 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. +mod raft_extension; + // #[PerformanceCriticalPath] use std::{ borrow::Cow, + cell::UnsafeCell, fmt::{self, Debug, Display, Formatter}, io::Error as IoError, mem, num::NonZeroU64, + pin::Pin, result, - sync::{Arc, RwLock}, + sync::{ + atomic::{AtomicU8, Ordering}, + Arc, RwLock, + }, + task::Poll, time::Duration, }; use collections::{HashMap, HashSet}; use concurrency_manager::ConcurrencyManager; use engine_traits::{CfName, KvEngine, MvccProperties, Snapshot}; +use futures::{future::BoxFuture, task::AtomicWaker, Future, Stream, StreamExt}; use kvproto::{ errorpb, kvrpcpb::{Context, IsolationLevel}, - raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader, Request, Response}, + raft_cmdpb::{ + AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader, Request, + Response, + }, }; use raft::{ eraftpb::{self, MessageType}, StateRole, }; +pub use raft_extension::RaftRouterWrap; use raftstore::{ coprocessor::{ dispatcher::BoxReadIndexObserver, Coprocessor, CoprocessorHost, ReadIndexObserver, @@ -31,22 +44,23 @@ use raftstore::{ errors::Error as RaftServerError, router::{LocalReadRouter, RaftStoreRouter}, store::{ - Callback as StoreCallback, RaftCmdExtraOpts, ReadIndexContext, ReadResponse, - RegionSnapshot, WriteResponse, + self, Callback as 
StoreCallback, RaftCmdExtraOpts, ReadIndexContext, ReadResponse, + RegionSnapshot, StoreMsg, WriteResponse, }, }; use thiserror::Error; -use tikv_kv::write_modifies; -use tikv_util::{codec::number::NumberEncoder, time::Instant}; +use tikv_kv::{write_modifies, OnAppliedCb, WriteEvent}; +use tikv_util::{ + codec::number::NumberEncoder, + future::{paired_future_callback, paired_must_called_future_callback}, + time::Instant, +}; use txn_types::{Key, TimeStamp, TxnExtra, TxnExtraScheduler, WriteBatchFlags}; use super::metrics::*; use crate::storage::{ self, kv, - kv::{ - Callback, Engine, Error as KvError, ErrorInner as KvErrorInner, ExtCallback, Modify, - SnapContext, WriteData, - }, + kv::{Engine, Error as KvError, ErrorInner as KvErrorInner, Modify, SnapContext, WriteData}, }; #[derive(Debug, Error)] @@ -70,19 +84,6 @@ pub enum Error { Timeout(Duration), } -fn get_status_kind_from_error(e: &Error) -> RequestStatusKind { - match *e { - Error::RequestFailed(ref header) => { - RequestStatusKind::from(storage::get_error_kind_from_header(header)) - } - Error::Io(_) => RequestStatusKind::err_io, - Error::Server(_) => RequestStatusKind::err_server, - Error::InvalidResponse(_) => RequestStatusKind::err_invalid_resp, - Error::InvalidRequest(_) => RequestStatusKind::err_invalid_req, - Error::Timeout(_) => RequestStatusKind::err_timeout, - } -} - fn get_status_kind_from_engine_error(e: &kv::Error) -> RequestStatusKind { match *e { KvError(box KvErrorInner::Request(ref header)) => { @@ -149,6 +150,146 @@ where } } +#[inline] +pub fn new_request_header(ctx: &Context) -> RaftRequestHeader { + let mut header = RaftRequestHeader::default(); + header.set_region_id(ctx.get_region_id()); + header.set_peer(ctx.get_peer().clone()); + header.set_region_epoch(ctx.get_region_epoch().clone()); + if ctx.get_term() != 0 { + header.set_term(ctx.get_term()); + } + header.set_sync_log(ctx.get_sync_log()); + header.set_replica_read(ctx.get_replica_read()); + header +} + +#[inline] +pub fn 
new_flashback_req(ctx: &Context, ty: AdminCmdType) -> RaftCmdRequest { + let header = new_request_header(ctx); + let mut req = RaftCmdRequest::default(); + req.set_header(header); + req.mut_header() + .set_flags(WriteBatchFlags::FLASHBACK.bits()); + req.mut_admin_request().set_cmd_type(ty); + req +} + +fn exec_admin>( + router: &S, + req: RaftCmdRequest, +) -> BoxFuture<'static, kv::Result<()>> { + let (cb, f) = paired_future_callback(); + let res = router.send_command( + req, + raftstore::store::Callback::write(cb), + RaftCmdExtraOpts::default(), + ); + Box::pin(async move { + res?; + let mut resp = box_try!(f.await); + check_raft_cmd_response(&mut resp.response)?; + Ok(()) + }) +} + +pub fn drop_snapshot_callback() -> kv::Result { + let bt = backtrace::Backtrace::new(); + warn!("async snapshot callback is dropped"; "backtrace" => ?bt); + let mut err = errorpb::Error::default(); + err.set_message("async snapshot callback is dropped".to_string()); + Err(kv::Error::from(kv::ErrorInner::Request(err))) +} + +struct WriteResCore { + ev: AtomicU8, + result: UnsafeCell>>, + wake: AtomicWaker, +} + +struct WriteResSub { + notified_ev: u8, + core: Arc, +} + +unsafe impl Send for WriteResSub {} + +impl Stream for WriteResSub { + type Item = WriteEvent; + + #[inline] + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + let mut s = self.as_mut(); + let mut cur_ev = s.core.ev.load(Ordering::Acquire); + if cur_ev == s.notified_ev { + s.core.wake.register(cx.waker()); + cur_ev = s.core.ev.load(Ordering::Acquire); + if cur_ev == s.notified_ev { + return Poll::Pending; + } + } + s.notified_ev = cur_ev; + match cur_ev { + WriteEvent::EVENT_PROPOSED => Poll::Ready(Some(WriteEvent::Proposed)), + WriteEvent::EVENT_COMMITTED => Poll::Ready(Some(WriteEvent::Committed)), + u8::MAX => { + let result = unsafe { (*s.core.result.get()).take().unwrap() }; + Poll::Ready(Some(WriteEvent::Finished(result))) + } + e => panic!("unexpected event {}", e), + 
} + } +} + +#[derive(Clone)] +struct WriteResFeed { + core: Arc, +} + +unsafe impl Send for WriteResFeed {} + +impl WriteResFeed { + fn pair() -> (Self, WriteResSub) { + let core = Arc::new(WriteResCore { + ev: AtomicU8::new(0), + result: UnsafeCell::new(None), + wake: AtomicWaker::new(), + }); + ( + Self { core: core.clone() }, + WriteResSub { + notified_ev: 0, + core, + }, + ) + } + + fn notify_proposed(&self) { + self.core + .ev + .store(WriteEvent::EVENT_PROPOSED, Ordering::Release); + self.core.wake.wake(); + } + + fn notify_committed(&self) { + self.core + .ev + .store(WriteEvent::EVENT_COMMITTED, Ordering::Release); + self.core.wake.wake(); + } + + fn notify(&self, result: kv::Result<()>) { + unsafe { + (*self.core.result.get()) = Some(result); + } + self.core.ev.store(u8::MAX, Ordering::Release); + self.core.wake.wake(); + } +} + /// `RaftKv` is a storage engine base on `RaftStore`. #[derive(Clone)] pub struct RaftKv @@ -156,7 +297,7 @@ where E: KvEngine, S: RaftStoreRouter + LocalReadRouter + 'static, { - router: S, + router: RaftRouterWrap, engine: E, txn_extra_scheduler: Option>, region_leaders: Arc>>, @@ -170,7 +311,7 @@ where /// Create a RaftKv using specified configuration. 
pub fn new(router: S, engine: E, region_leaders: Arc>>) -> RaftKv { RaftKv { - router, + router: RaftRouterWrap::new(router), engine, txn_extra_scheduler: None, region_leaders, @@ -180,114 +321,6 @@ where pub fn set_txn_extra_scheduler(&mut self, txn_extra_scheduler: Arc) { self.txn_extra_scheduler = Some(txn_extra_scheduler); } - - fn new_request_header(&self, ctx: &Context) -> RaftRequestHeader { - let mut header = RaftRequestHeader::default(); - header.set_region_id(ctx.get_region_id()); - header.set_peer(ctx.get_peer().clone()); - header.set_region_epoch(ctx.get_region_epoch().clone()); - if ctx.get_term() != 0 { - header.set_term(ctx.get_term()); - } - header.set_sync_log(ctx.get_sync_log()); - header.set_replica_read(ctx.get_replica_read()); - header - } - - fn exec_snapshot( - &mut self, - ctx: SnapContext<'_>, - req: Request, - cb: Callback>, - ) -> Result<()> { - let mut header = self.new_request_header(ctx.pb_ctx); - let mut flags = 0; - if ctx.pb_ctx.get_stale_read() && ctx.start_ts.map_or(true, |ts| !ts.is_zero()) { - let mut data = [0u8; 8]; - (&mut data[..]) - .encode_u64(ctx.start_ts.unwrap_or_default().into_inner()) - .unwrap(); - flags |= WriteBatchFlags::STALE_READ.bits(); - header.set_flag_data(data.into()); - } - if ctx.for_flashback { - flags |= WriteBatchFlags::FLASHBACK.bits(); - } - header.set_flags(flags); - - let mut cmd = RaftCmdRequest::default(); - cmd.set_header(header); - cmd.set_requests(vec![req].into()); - self.router - .read( - ctx.read_id, - cmd, - StoreCallback::read(Box::new(move |resp| { - cb(on_read_result(resp).map_err(Error::into)); - })), - ) - .map_err(From::from) - } - - fn exec_write_requests( - &self, - ctx: &Context, - batch: WriteData, - write_cb: Callback>, - proposed_cb: Option, - committed_cb: Option, - ) -> Result<()> { - #[cfg(feature = "failpoints")] - { - // If rid is some, only the specified region reports error. - // If rid is None, all regions report error. 
- let raftkv_early_error_report_fp = || -> Result<()> { - fail_point!("raftkv_early_error_report", |rid| { - let region_id = ctx.get_region_id(); - rid.and_then(|rid| { - let rid: u64 = rid.parse().unwrap(); - if rid == region_id { None } else { Some(()) } - }) - .ok_or_else(|| RaftServerError::RegionNotFound(region_id).into()) - }); - Ok(()) - }; - raftkv_early_error_report_fp()?; - } - - let reqs: Vec = batch.modifies.into_iter().map(Into::into).collect(); - let txn_extra = batch.extra; - let mut header = self.new_request_header(ctx); - let mut flags = 0; - if txn_extra.one_pc { - flags |= WriteBatchFlags::ONE_PC.bits(); - } - if txn_extra.for_flashback { - flags |= WriteBatchFlags::FLASHBACK.bits(); - } - header.set_flags(flags); - - let mut cmd = RaftCmdRequest::default(); - cmd.set_header(header); - cmd.set_requests(reqs.into()); - - self.schedule_txn_extra(txn_extra); - - let cb = StoreCallback::write_ext( - Box::new(move |resp| { - write_cb(on_write_result(resp).map_err(Error::into)); - }), - proposed_cb, - committed_cb, - ); - let extra_opts = RaftCmdExtraOpts { - deadline: batch.deadline, - disk_full_opt: batch.disk_full_opt, - }; - self.router.send_command(cmd, cb, extra_opts)?; - - Ok(()) - } } fn invalid_resp_type(exp: CmdType, act: CmdType) -> Error { @@ -329,6 +362,12 @@ where Some(self.engine.clone()) } + type RaftExtension = RaftRouterWrap; + #[inline] + fn raft_extension(&self) -> &Self::RaftExtension { + &self.router + } + fn modify_on_kv_engine( &self, mut region_modifies: HashMap>, @@ -372,70 +411,127 @@ where } } + type WriteRes = impl Stream + Send + Unpin; fn async_write( &self, ctx: &Context, batch: WriteData, - write_cb: Callback<()>, - ) -> kv::Result<()> { - self.async_write_ext(ctx, batch, write_cb, None, None) - } - - fn async_write_ext( - &self, - ctx: &Context, - batch: WriteData, - write_cb: Callback<()>, - proposed_cb: Option, - committed_cb: Option, - ) -> kv::Result<()> { - fail_point!("raftkv_async_write"); - if 
batch.modifies.is_empty() { - return Err(KvError::from(KvErrorInner::EmptyRequest)); - } + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes { + let mut res = (|| { + fail_point!("raftkv_async_write"); + if batch.modifies.is_empty() { + return Err(KvError::from(KvErrorInner::EmptyRequest)); + } + Ok(()) + })(); ASYNC_REQUESTS_COUNTER_VEC.write.all.inc(); let begin_instant = Instant::now_coarse(); - self.exec_write_requests( - ctx, - batch, - Box::new(move |res| match res { + if res.is_ok() { + // If rid is some, only the specified region reports error. + // If rid is None, all regions report error. + res = (|| { + fail_point!("raftkv_early_error_report", |rid| { + let region_id = ctx.get_region_id(); + rid.and_then(|rid| { + let rid: u64 = rid.parse().unwrap(); + if rid == region_id { None } else { Some(()) } + }) + .ok_or_else(|| RaftServerError::RegionNotFound(region_id).into()) + }); + Ok(()) + })(); + } + + let reqs: Vec = batch.modifies.into_iter().map(Into::into).collect(); + let txn_extra = batch.extra; + let mut header = new_request_header(ctx); + let mut flags = 0; + if txn_extra.one_pc { + flags |= WriteBatchFlags::ONE_PC.bits(); + } + if txn_extra.for_flashback { + flags |= WriteBatchFlags::FLASHBACK.bits(); + } + header.set_flags(flags); + + let mut cmd = RaftCmdRequest::default(); + cmd.set_header(header); + cmd.set_requests(reqs.into()); + + self.schedule_txn_extra(txn_extra); + + let (tx, rx) = WriteResFeed::pair(); + let proposed_cb = if !WriteEvent::subscribed_proposed(subscribed) { + None + } else { + let tx = tx.clone(); + Some(Box::new(move || tx.notify_proposed()) as store::ExtCallback) + }; + let committed_cb = if !WriteEvent::subscribed_committed(subscribed) { + None + } else { + let tx = tx.clone(); + Some(Box::new(move || tx.notify_committed()) as store::ExtCallback) + }; + let applied_tx = tx.clone(); + let applied_cb = Box::new(move |resp: WriteResponse| { + let mut res = match on_write_result::(resp) { Ok(CmdRes::Resp(_)) => { 
+ fail_point!("raftkv_async_write_finish"); + Ok(()) + } + Ok(CmdRes::Snap(_)) => Err(box_err!("unexpect snapshot, should mutate instead.")), + Err(e) => Err(kv::Error::from(e)), + }; + if let Some(cb) = on_applied { + cb(&mut res); + } + applied_tx.notify(res); + }); + + let cb = StoreCallback::write_ext(applied_cb, proposed_cb, committed_cb); + let extra_opts = RaftCmdExtraOpts { + deadline: batch.deadline, + disk_full_opt: batch.disk_full_opt, + }; + if res.is_ok() { + res = self + .router + .send_command(cmd, cb, extra_opts) + .map_err(kv::Error::from); + } + if res.is_err() { + tx.notify(res); + } + rx.inspect(move |ev| { + let WriteEvent::Finished(res) = ev else { return }; + match res { + Ok(()) => { ASYNC_REQUESTS_COUNTER_VEC.write.success.inc(); ASYNC_REQUESTS_DURATIONS_VEC .write .observe(begin_instant.saturating_elapsed_secs()); - fail_point!("raftkv_async_write_finish"); - write_cb(Ok(())) - } - Ok(CmdRes::Snap(_)) => { - write_cb(Err(box_err!("unexpect snapshot, should mutate instead."))) } Err(e) => { - let status_kind = get_status_kind_from_engine_error(&e); + let status_kind = get_status_kind_from_engine_error(e); ASYNC_REQUESTS_COUNTER_VEC.write.get(status_kind).inc(); - write_cb(Err(e)) } - }), - proposed_cb, - committed_cb, - ) - .map_err(|e| { - let status_kind = get_status_kind_from_error(&e); - ASYNC_REQUESTS_COUNTER_VEC.write.get(status_kind).inc(); - e.into() + } }) } - fn async_snapshot( - &mut self, - mut ctx: SnapContext<'_>, - cb: Callback, - ) -> kv::Result<()> { - fail_point!("raftkv_async_snapshot_err", |_| Err(box_err!( - "injected error for async_snapshot" - ))); + type SnapshotRes = impl Future> + Send; + fn async_snapshot(&mut self, mut ctx: SnapContext<'_>) -> Self::SnapshotRes { + let mut res: kv::Result<()> = (|| { + fail_point!("raftkv_async_snapshot_err", |_| { + Err(box_err!("injected error for async_snapshot")) + }); + Ok(()) + })(); let mut req = Request::default(); req.set_cmd_type(CmdType::Snap); @@ -447,10 +543,46 @@ 
where } ASYNC_REQUESTS_COUNTER_VEC.snapshot.all.inc(); let begin_instant = Instant::now_coarse(); - self.exec_snapshot( - ctx, - req, - Box::new(move |res| match res { + let (cb, f) = paired_must_called_future_callback(drop_snapshot_callback); + + let mut header = new_request_header(ctx.pb_ctx); + let mut flags = 0; + if ctx.pb_ctx.get_stale_read() && ctx.start_ts.map_or(true, |ts| !ts.is_zero()) { + let mut data = [0u8; 8]; + (&mut data[..]) + .encode_u64(ctx.start_ts.unwrap_or_default().into_inner()) + .unwrap(); + flags |= WriteBatchFlags::STALE_READ.bits(); + header.set_flag_data(data.into()); + } + if ctx.for_flashback { + flags |= WriteBatchFlags::FLASHBACK.bits(); + } + header.set_flags(flags); + + let mut cmd = RaftCmdRequest::default(); + cmd.set_header(header); + cmd.set_requests(vec![req].into()); + if res.is_ok() { + res = self + .router + .read( + ctx.read_id, + cmd, + StoreCallback::read(Box::new(move |resp| { + cb(on_read_result(resp).map_err(Error::into)); + })), + ) + .map_err(kv::Error::from); + } + async move { + // It's impossible to return cancel because the callback will be invoked if it's + // destroyed. 
+ let res = match res { + Ok(()) => f.await.unwrap(), + Err(e) => Err(e), + }; + match res { Ok(CmdRes::Resp(mut r)) => { let e = if r .get(0) @@ -462,27 +594,22 @@ where } else { invalid_resp_type(CmdType::Snap, r[0].get_cmd_type()).into() }; - cb(Err(e)) + Err(e) } Ok(CmdRes::Snap(s)) => { ASYNC_REQUESTS_DURATIONS_VEC .snapshot .observe(begin_instant.saturating_elapsed_secs()); ASYNC_REQUESTS_COUNTER_VEC.snapshot.success.inc(); - cb(Ok(s)) + Ok(s) } Err(e) => { let status_kind = get_status_kind_from_engine_error(&e); ASYNC_REQUESTS_COUNTER_VEC.snapshot.get(status_kind).inc(); - cb(Err(e)) + Err(e) } - }), - ) - .map_err(|e| { - let status_kind = get_status_kind_from_error(&e); - ASYNC_REQUESTS_COUNTER_VEC.snapshot.get(status_kind).inc(); - e.into() - }) + } + } } fn release_snapshot(&mut self) { @@ -509,6 +636,33 @@ where } } } + + fn start_flashback(&self, ctx: &Context) -> BoxFuture<'static, kv::Result<()>> { + // Send an `AdminCmdType::PrepareFlashback` to prepare the raftstore for the + // later flashback. Once invoked, we will update the persistent region meta and + // the memory state of the flashback in Peer FSM to reject all read, write + // and scheduling operations for this region when propose/apply before we + // start the actual data flashback transaction command in the next phase. 
+ let req = new_flashback_req(ctx, AdminCmdType::PrepareFlashback); + exec_admin(&*self.router, req) + } + + fn end_flashback(&self, ctx: &Context) -> BoxFuture<'static, kv::Result<()>> { + // Send an `AdminCmdType::FinishFlashback` to unset the persistence state + // in `RegionLocalState` and region's meta, and when that admin cmd is applied, + // will update the memory state of the flashback + let req = new_flashback_req(ctx, AdminCmdType::FinishFlashback); + exec_admin(&*self.router, req) + } + + fn hint_change_in_range(&self, start_key: Vec, end_key: Vec) { + self.router + .send_store_msg(StoreMsg::ClearRegionSizeInRange { start_key, end_key }) + .unwrap_or_else(|e| { + // Warn and ignore it. + warn!("unsafe destroy range: failed sending ClearRegionSizeInRange"; "err" => ?e); + }); + } } #[derive(Clone)] diff --git a/src/server/raftkv/raft_extension.rs b/src/server/raftkv/raft_extension.rs new file mode 100644 index 00000000000..d3178842489 --- /dev/null +++ b/src/server/raftkv/raft_extension.rs @@ -0,0 +1,177 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + marker::PhantomData, + ops::{Deref, DerefMut}, +}; + +use futures::future::BoxFuture; +use kvproto::{ + metapb::{Region, RegionEpoch}, + raft_cmdpb::{AdminCmdType, RaftCmdRequest}, + raft_serverpb::RaftMessage, +}; +use raft::SnapshotStatus; +use raftstore::{ + router::RaftStoreRouter, + store::{ + region_meta::{RaftStateRole, RegionMeta}, + CasualMessage, + }, +}; +use tikv_util::future::paired_future_callback; + +use crate::storage::kv; + +#[derive(Clone)] +pub struct RaftRouterWrap { + router: S, + _phantom: PhantomData, +} + +impl RaftRouterWrap { + pub fn new(router: S) -> Self { + Self { + router, + _phantom: PhantomData, + } + } +} + +impl Deref for RaftRouterWrap { + type Target = S; + + #[inline] + fn deref(&self) -> &Self::Target { + &self.router + } +} + +impl DerefMut for RaftRouterWrap { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.router + } +} + +impl tikv_kv::RaftExtension for RaftRouterWrap +where + S: RaftStoreRouter + 'static, + E: engine_traits::KvEngine, +{ + #[inline] + fn feed(&self, msg: RaftMessage, key_message: bool) { + let region_id = msg.get_region_id(); + let msg_ty = msg.get_message().get_msg_type(); + // Channel full and region not found are ignored unless it's a key message. 
+ if let Err(e) = self.router.send_raft_msg(msg) && key_message { + error!("failed to send raft message"; "region_id" => region_id, "msg_ty" => ?msg_ty, "err" => ?e); + } + } + + #[inline] + fn report_reject_message(&self, region_id: u64, from_peer_id: u64) { + let m = CasualMessage::RejectRaftAppend { + peer_id: from_peer_id, + }; + let _ = self.router.send_casual_msg(region_id, m); + } + + #[inline] + fn report_peer_unreachable(&self, region_id: u64, to_peer_id: u64) { + let _ = self.router.report_unreachable(region_id, to_peer_id); + } + + #[inline] + fn report_store_unreachable(&self, store_id: u64) { + self.router.broadcast_unreachable(store_id); + } + + #[inline] + fn report_snapshot_status(&self, region_id: u64, to_peer_id: u64, status: SnapshotStatus) { + if let Err(e) = self + .router + .report_snapshot_status(region_id, to_peer_id, status) + { + error!(?e; + "report snapshot to peer failes"; + "to_peer_id" => to_peer_id, + "status" => ?status, + "region_id" => region_id, + ); + } + } + + #[inline] + fn report_resolved(&self, store_id: u64, group_id: u64) { + self.router.report_resolved(store_id, group_id); + } + + #[inline] + fn split( + &self, + region_id: u64, + region_epoch: RegionEpoch, + split_keys: Vec>, + source: String, + ) -> BoxFuture<'static, kv::Result>> { + let (cb, rx) = paired_future_callback(); + let req = CasualMessage::SplitRegion { + region_epoch, + split_keys, + callback: raftstore::store::Callback::write(cb), + source: source.into(), + }; + let res = self.router.send_casual_msg(region_id, req); + Box::pin(async move { + res?; + let mut admin_resp = box_try!(rx.await); + super::check_raft_cmd_response(&mut admin_resp.response)?; + let regions = admin_resp + .response + .mut_admin_response() + .mut_splits() + .take_regions(); + Ok(regions.into()) + }) + } + + /// Get the region meta of the given region. 
+ #[inline] + fn query_region(&self, region_id: u64) -> BoxFuture<'static, kv::Result> { + let (cb, rx) = paired_future_callback(); + let res = self + .router + .send_casual_msg(region_id, CasualMessage::AccessPeer(cb)); + Box::pin(async move { + res?; + Ok(box_try!(rx.await)) + }) + } + + /// Ask the raft group to do a consistency check. + fn check_consistency(&self, region_id: u64) -> BoxFuture<'static, kv::Result<()>> { + let region = self.query_region(region_id); + let router = self.router.clone(); + Box::pin(async move { + let meta: RegionMeta = region.await?; + let leader_id = meta.raft_status.soft_state.leader_id; + let mut leader = None; + for peer in meta.region_state.peers { + if peer.id == leader_id { + leader = Some(peer.into()); + } + } + if meta.raft_status.soft_state.raft_state != RaftStateRole::Leader { + return Err(raftstore::Error::NotLeader(region_id, leader).into()); + } + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header().set_peer(leader.unwrap()); + req.mut_admin_request() + .set_cmd_type(AdminCmdType::ComputeHash); + let f = super::exec_admin(&router, req); + f.await + }) + } +} diff --git a/src/server/reset_to_version.rs b/src/server/reset_to_version.rs index e1faccd9b3f..1ea98acc1c8 100644 --- a/src/server/reset_to_version.rs +++ b/src/server/reset_to_version.rs @@ -85,10 +85,7 @@ impl ResetToVersionWorker { fn next_write(&mut self) -> Result, Write)>> { if self.write_iter.valid().unwrap() { - let mut state = self - .state - .lock() - .expect("failed to lock ResetToVersionWorker::state"); + let mut state = self.state.lock().unwrap(); debug_assert!(matches!( *state, ResetToVersionState::RemovingWrite { scanned: _ } @@ -149,16 +146,14 @@ impl ResetToVersionWorker { let mut has_more = true; for _ in 0..batch_size { if self.lock_iter.valid().unwrap() { - let mut state = self - .state - .lock() - .expect("failed to lock ResetToVersionWorker::state"); - debug_assert!(matches!( - *state, - 
ResetToVersionState::RemovingLock { scanned: _ } - )); - *state.scanned() += 1; - drop(state); + { + let mut state = self.state.lock().unwrap(); + debug_assert!(matches!( + *state, + ResetToVersionState::RemovingLock { scanned: _ } + )); + *state.scanned() += 1; + } box_try!(wb.delete_cf(CF_LOCK, self.lock_iter.key())); self.lock_iter.next().unwrap(); @@ -197,6 +192,12 @@ impl Clone for ResetToVersionManager { } } +impl Drop for ResetToVersionManager { + fn drop(&mut self) { + self.wait(); + } +} + #[allow(dead_code)] impl ResetToVersionManager { pub fn new(engine: RocksEngine) -> Self { @@ -221,30 +222,32 @@ impl ResetToVersionManager { let mut worker = ResetToVersionWorker::new(write_iter, lock_iter, ts, self.state.clone()); let mut wb = self.engine.write_batch(); let props = tikv_util::thread_group::current_properties(); - if self.worker_handle.borrow().is_some() { - warn!("A reset-to-version process is already in progress! Wait until it finish first."); - self.wait(); - } - *self.worker_handle.borrow_mut() = Some(std::thread::Builder::new() - .name("reset_to_version".to_string()) - .spawn_wrapper(move || { - tikv_util::thread_group::set_properties(props); - tikv_alloc::add_thread_memory_accessor(); + self.wait(); - while worker.process_next_batch(BATCH_SIZE, &mut wb).expect("reset_to_version failed when removing invalid writes") { - } - *worker.state.lock() - .expect("failed to lock `ResetToVersionWorker::state` in `ResetToVersionWorker::process_next_batch`") - = ResetToVersionState::RemovingLock { scanned: 0 }; - while worker.process_next_batch_lock(BATCH_SIZE, &mut wb).expect("reset_to_version failed when removing invalid locks") { - } - *worker.state.lock() - .expect("failed to lock `ResetToVersionWorker::state` in `ResetToVersionWorker::process_next_batch_lock`") - = ResetToVersionState::Done; - info!("Reset to version done!"); - tikv_alloc::remove_thread_memory_accessor(); - }) - .expect("failed to spawn reset_to_version thread")); + 
*self.worker_handle.borrow_mut() = Some( + std::thread::Builder::new() + .name("reset_to_version".to_string()) + .spawn_wrapper(move || { + tikv_util::thread_group::set_properties(props); + tikv_alloc::add_thread_memory_accessor(); + + while worker + .process_next_batch(BATCH_SIZE, &mut wb) + .expect("process_next_batch") + {} + *worker.state.lock().unwrap() = + ResetToVersionState::RemovingLock { scanned: 0 }; + while worker + .process_next_batch_lock(BATCH_SIZE, &mut wb) + .expect("process_next_batch_lock") + {} + *worker.state.lock().unwrap() = ResetToVersionState::Done; + info!("Reset to version done!"); + + tikv_alloc::remove_thread_memory_accessor(); + }) + .expect("failed to spawn reset_to_version thread"), + ); } /// Current process state. @@ -257,7 +260,10 @@ impl ResetToVersionManager { /// Wait until the process finished. pub fn wait(&self) { - self.worker_handle.take().unwrap().join().unwrap(); + if let Some(handle) = self.worker_handle.take() { + info!("Wait for the reset-to-version task to complete."); + handle.join().unwrap(); + } } } diff --git a/src/server/resolve.rs b/src/server/resolve.rs index acf60ae783f..c831ff28d17 100644 --- a/src/server/resolve.rs +++ b/src/server/resolve.rs @@ -2,15 +2,14 @@ use std::{ fmt::{self, Display, Formatter}, - marker::PhantomData, sync::{Arc, Mutex}, }; use collections::HashMap; -use engine_traits::KvEngine; use kvproto::replication_modepb::ReplicationMode; use pd_client::{take_peer_address, PdClient}; -use raftstore::{router::RaftStoreRouter, store::GlobalReplicationState}; +use raftstore::store::GlobalReplicationState; +use tikv_kv::RaftExtension; use tikv_util::{ time::Instant, worker::{Runnable, Scheduler, Worker}, @@ -52,24 +51,21 @@ struct StoreAddr { } /// A runner for resolving store addresses. 
-struct Runner +struct Runner where T: PdClient, - RR: RaftStoreRouter, - E: KvEngine, + R: RaftExtension, { pd_client: Arc, store_addrs: HashMap, state: Arc>, - router: RR, - engine: PhantomData, + router: R, } -impl Runner +impl Runner where T: PdClient, - RR: RaftStoreRouter, - E: KvEngine, + R: RaftExtension, { fn resolve(&mut self, store_id: u64) -> Result { if let Some(s) = self.store_addrs.get(&store_id) { @@ -128,11 +124,10 @@ where } } -impl Runnable for Runner +impl Runnable for Runner where T: PdClient, - RR: RaftStoreRouter, - E: KvEngine, + R: RaftExtension, { type Task = Task; fn run(&mut self, task: Task) { @@ -157,15 +152,14 @@ impl PdStoreAddrResolver { } /// Creates a new `PdStoreAddrResolver`. -pub fn new_resolver( +pub fn new_resolver( pd_client: Arc, worker: &Worker, - router: RR, + router: R, ) -> (PdStoreAddrResolver, Arc>) where T: PdClient + 'static, - RR: RaftStoreRouter, - E: KvEngine, + R: RaftExtension + 'static, { let state = Arc::new(Mutex::new(GlobalReplicationState::default())); let runner = Runner { @@ -173,7 +167,6 @@ where store_addrs: HashMap::default(), state: state.clone(), router, - engine: PhantomData, }; let scheduler = worker.start("addr-resolver", runner); let resolver = PdStoreAddrResolver::new(scheduler); @@ -190,16 +183,12 @@ impl StoreAddrResolver for PdStoreAddrResolver { #[cfg(test)] mod tests { - use std::{ - marker::PhantomData, net::SocketAddr, ops::Sub, str::FromStr, sync::Arc, thread, - time::Duration, - }; + use std::{net::SocketAddr, ops::Sub, str::FromStr, sync::Arc, thread, time::Duration}; use collections::HashMap; - use engine_test::kv::KvTestEngine; use kvproto::metapb; use pd_client::{PdClient, Result}; - use raftstore::router::RaftStoreBlackHole; + use tikv_kv::FakeExtension; use super::*; @@ -236,7 +225,7 @@ mod tests { store } - fn new_runner(store: metapb::Store) -> Runner { + fn new_runner(store: metapb::Store) -> Runner { let client = MockPdClient { start: Instant::now(), store, @@ -245,8 +234,7 
@@ mod tests { pd_client: Arc::new(client), store_addrs: HashMap::default(), state: Default::default(), - router: RaftStoreBlackHole, - engine: PhantomData, + router: FakeExtension, } } diff --git a/src/server/server.rs b/src/server/server.rs index 992b5cf6fa0..1921483e37b 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -13,10 +13,7 @@ use futures::{compat::Stream01CompatExt, stream::StreamExt}; use grpcio::{ChannelBuilder, Environment, ResourceQuota, Server as GrpcServer, ServerBuilder}; use grpcio_health::{create_health, HealthService, ServingStatus}; use kvproto::tikvpb::*; -use raftstore::{ - router::RaftStoreRouter, - store::{CheckLeaderTask, SnapManager}, -}; +use raftstore::store::{CheckLeaderTask, SnapManager}; use security::SecurityManager; use tikv_util::{ config::VersionTrack, @@ -58,8 +55,7 @@ pub const STATS_THREAD_PREFIX: &str = "transport-stats"; /// /// It hosts various internal components, including gRPC, the raftstore router /// and a snapshot worker. -pub struct Server + 'static, S: StoreAddrResolver + 'static, E: Engine> -{ +pub struct Server { env: Arc, /// A GrpcServer builder or a GrpcServer. /// @@ -68,8 +64,8 @@ pub struct Server + 'static, S: StoreAddrResolver + grpc_mem_quota: ResourceQuota, local_addr: SocketAddr, // Transport. - trans: ServerTransport, - raft_router: T, + trans: ServerTransport, + raft_router: E::RaftExtension, // For sending/receiving snapshots. 
snap_mgr: SnapManager, snap_worker: LazyWorker, @@ -83,8 +79,11 @@ pub struct Server + 'static, S: StoreAddrResolver + timer: Handle, } -impl + Unpin, S: StoreAddrResolver + 'static, E: Engine> - Server +impl Server +where + S: StoreAddrResolver + 'static, + E: Engine, + E::RaftExtension: Unpin, { #[allow(clippy::too_many_arguments)] pub fn new( @@ -94,10 +93,9 @@ impl + Unpin, S: StoreAddrResolver + 'static, E: En storage: Storage, copr: Endpoint, copr_v2: coprocessor_v2::Endpoint, - raft_router: T, resolver: S, snap_mgr: SnapManager, - gc_worker: GcWorker, + gc_worker: GcWorker, check_leader_scheduler: Scheduler, env: Arc, yatp_read_pool: Option, @@ -124,6 +122,7 @@ impl + Unpin, S: StoreAddrResolver + 'static, E: En let snap_worker = Worker::new("snap-handler"); let lazy_worker = snap_worker.lazy_build("snap-handler"); + let raft_ext = storage.get_engine().raft_extension().clone(); let proxy = Proxy::new(security_mgr.clone(), &env, Arc::new(cfg.value().clone())); let kv_service = KvService::new( @@ -132,7 +131,6 @@ impl + Unpin, S: StoreAddrResolver + 'static, E: En gc_worker, copr, copr_v2, - raft_router.clone(), lazy_worker.scheduler(), check_leader_scheduler, Arc::clone(&grpc_thread_load), @@ -170,7 +168,7 @@ impl + Unpin, S: StoreAddrResolver + 'static, E: En Arc::clone(cfg), security_mgr.clone(), resolver, - raft_router.clone(), + raft_ext.clone(), lazy_worker.scheduler(), grpc_thread_load.clone(), ); @@ -185,7 +183,7 @@ impl + Unpin, S: StoreAddrResolver + 'static, E: En grpc_mem_quota: mem_quota, local_addr: addr, trans, - raft_router, + raft_router: raft_ext, snap_mgr, snap_worker: lazy_worker, stats_pool, @@ -207,7 +205,7 @@ impl + Unpin, S: StoreAddrResolver + 'static, E: En self.snap_worker.scheduler() } - pub fn transport(&self) -> ServerTransport { + pub fn transport(&self) -> ServerTransport { self.trans.clone() } @@ -340,21 +338,20 @@ pub mod test_router { use std::sync::mpsc::*; use engine_rocks::{RocksEngine, RocksSnapshot}; - use 
engine_traits::{KvEngine, Snapshot}; use kvproto::raft_serverpb::RaftMessage; - use raftstore::{store::*, Result as RaftStoreResult}; + use raftstore::{router::RaftStoreRouter, store::*, Result as RaftStoreResult}; use super::*; #[derive(Clone)] pub struct TestRaftStoreRouter { - tx: Sender, + tx: Sender, StoreMsg>>, significant_msg_sender: Sender>, } impl TestRaftStoreRouter { pub fn new( - tx: Sender, + tx: Sender, StoreMsg>>, significant_msg_sender: Sender>, ) -> TestRaftStoreRouter { TestRaftStoreRouter { @@ -365,25 +362,26 @@ pub mod test_router { } impl StoreRouter for TestRaftStoreRouter { - fn send(&self, _: StoreMsg) -> RaftStoreResult<()> { - let _ = self.tx.send(1); + fn send(&self, msg: StoreMsg) -> RaftStoreResult<()> { + let _ = self.tx.send(Either::Right(msg)); Ok(()) } } - impl ProposalRouter for TestRaftStoreRouter { + impl ProposalRouter for TestRaftStoreRouter { fn send( &self, - _: RaftCommand, - ) -> std::result::Result<(), crossbeam::channel::TrySendError>> { - let _ = self.tx.send(1); + cmd: RaftCommand, + ) -> std::result::Result<(), crossbeam::channel::TrySendError>> + { + let _ = self.tx.send(Either::Left(PeerMsg::RaftCommand(cmd))); Ok(()) } } - impl CasualRouter for TestRaftStoreRouter { - fn send(&self, _: u64, _: CasualMessage) -> RaftStoreResult<()> { - let _ = self.tx.send(1); + impl CasualRouter for TestRaftStoreRouter { + fn send(&self, _: u64, msg: CasualMessage) -> RaftStoreResult<()> { + let _ = self.tx.send(Either::Left(PeerMsg::CasualMessage(msg))); Ok(()) } } @@ -400,13 +398,18 @@ pub mod test_router { } impl RaftStoreRouter for TestRaftStoreRouter { - fn send_raft_msg(&self, _: RaftMessage) -> RaftStoreResult<()> { - let _ = self.tx.send(1); + fn send_raft_msg(&self, msg: RaftMessage) -> RaftStoreResult<()> { + let _ = self + .tx + .send(Either::Left(PeerMsg::RaftMessage(InspectedRaftMessage { + heap_size: 0, + msg, + }))); Ok(()) } - fn broadcast_normal(&self, _: impl FnMut() -> PeerMsg) { - let _ = self.tx.send(1); + fn 
broadcast_normal(&self, mut f: impl FnMut() -> PeerMsg) { + let _ = self.tx.send(Either::Left(f())); } } } @@ -423,11 +426,12 @@ mod tests { use kvproto::raft_serverpb::RaftMessage; use raftstore::{ coprocessor::region_info_accessor::MockRegionInfoProvider, + router::RaftStoreRouter, store::{transport::Transport, *}, }; use resource_metering::ResourceTagFactory; use security::SecurityConfig; - use tikv_util::quota_limiter::QuotaLimiter; + use tikv_util::{config::ReadableDuration, quota_limiter::QuotaLimiter}; use tokio::runtime::Builder as TokioBuilder; use super::{ @@ -440,8 +444,8 @@ mod tests { use crate::{ config::CoprReadPoolConfig, coprocessor::{self, readpool_impl}, - server::TestRaftStoreRouter, - storage::{lock_manager::MockLockManager, TestStorageBuilderApiV1}, + server::{raftkv::RaftRouterWrap, TestRaftStoreRouter}, + storage::{lock_manager::MockLockManager, TestEngineBuilder, TestStorageBuilderApiV1}, }; #[derive(Clone)] @@ -487,16 +491,24 @@ mod tests { let mock_store_id = 5; let cfg = Config { addr: "127.0.0.1:0".to_owned(), + raft_client_max_backoff: ReadableDuration::millis(100), + raft_client_initial_reconnect_backoff: ReadableDuration::millis(100), ..Default::default() }; - let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) - .build() - .unwrap(); - let (tx, rx) = mpsc::channel(); let (significant_msg_sender, significant_msg_receiver) = mpsc::channel(); let router = TestRaftStoreRouter::new(tx, significant_msg_sender); + let engine = TestEngineBuilder::new() + .build() + .unwrap() + .with_raft_extension(RaftRouterWrap::new(router.clone())); + + let storage = + TestStorageBuilderApiV1::from_engine_and_lock_mgr(engine, MockLockManager::new()) + .build() + .unwrap(); + let env = Arc::new( EnvBuilder::new() .cq_count(1) @@ -507,7 +519,6 @@ mod tests { let (tx, _rx) = mpsc::channel(); let mut gc_worker = GcWorker::new( storage.get_engine(), - router.clone(), tx, Default::default(), Default::default(), @@ -549,7 +560,6 @@ mod tests { 
storage, copr, copr_v2, - router.clone(), MockResolver { quick_fail: Arc::clone(&quick_fail), addr: Arc::clone(&addr), diff --git a/src/server/service/debug.rs b/src/server/service/debug.rs index 30cc8342959..ae0d53bacda 100644 --- a/src/server/service/debug.rs +++ b/src/server/service/debug.rs @@ -1,9 +1,8 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. use engine_rocks::RocksEngine; -use engine_traits::{Engines, KvEngine, MiscExt, RaftEngine}; +use engine_traits::{Engines, MiscExt, RaftEngine}; use futures::{ - channel::oneshot, future::{Future, FutureExt, TryFutureExt}, sink::SinkExt, stream::{self, TryStreamExt}, @@ -12,17 +11,8 @@ use grpcio::{ Error as GrpcError, RpcContext, RpcStatus, RpcStatusCode, ServerStreamingSink, UnarySink, WriteFlags, }; -use kvproto::{ - debugpb::{self, *}, - raft_cmdpb::{ - AdminCmdType, AdminRequest, RaftCmdRequest, RaftRequestHeader, RegionDetailResponse, - StatusCmdType, StatusRequest, - }, -}; -use raftstore::{ - router::RaftStoreRouter, - store::msg::{Callback, RaftCmdExtraOpts}, -}; +use kvproto::debugpb::{self, *}; +use tikv_kv::RaftExtension; use tikv_util::metrics; use tokio::runtime::Handle; @@ -53,28 +43,26 @@ fn error_to_grpc_error(tag: &'static str, e: Error) -> GrpcError { /// Service handles the RPC messages for the `Debug` service. #[derive(Clone)] -pub struct Service> { +pub struct Service { pool: Handle, debugger: Debugger, raft_router: T, - _phantom: std::marker::PhantomData, } -impl> Service { - /// Constructs a new `Service` with `Engines`, a `RaftStoreRouter` and a +impl Service { + /// Constructs a new `Service` with `Engines`, a `RaftExtension` and a /// `GcWorker`. 
pub fn new( engines: Engines, pool: Handle, raft_router: T, cfg_controller: ConfigController, - ) -> Service { + ) -> Self { let debugger = Debugger::new(engines, cfg_controller); Service { pool, debugger, raft_router, - _phantom: Default::default(), } } @@ -99,9 +87,7 @@ impl> Service { } } -impl + 'static> debugpb::Debug - for Service -{ +impl debugpb::Debug for Service { fn get(&mut self, ctx: RpcContext<'_>, mut req: GetRequest, sink: UnarySink) { const TAG: &str = "debug_get"; @@ -386,18 +372,14 @@ impl + 'static> debugpb::De sink: UnarySink, ) { let region_id = req.get_region_id(); - let debugger = self.debugger.clone(); - let router1 = self.raft_router.clone(); - let router2 = self.raft_router.clone(); - - let consistency_check_task = async move { - let store_id = debugger.get_store_ident()?.store_id; - let detail = region_detail(router2, region_id, store_id).await?; - consistency_check(router1, detail).await + let f = self.raft_router.check_consistency(region_id); + let task = async move { + box_try!(f.await); + Ok(()) }; let f = self .pool - .spawn(consistency_check_task) + .spawn(task) .map(|res| res.unwrap()) .map_ok(|_| RegionConsistencyCheckResponse::default()); self.handle_response(ctx, sink, f, "check_region_consistency"); @@ -537,79 +519,6 @@ impl + 'static> debugpb::De } } -fn region_detail>( - raft_router: T, - region_id: u64, - store_id: u64, -) -> impl Future> { - let mut header = RaftRequestHeader::default(); - header.set_region_id(region_id); - header.mut_peer().set_store_id(store_id); - let mut status_request = StatusRequest::default(); - status_request.set_cmd_type(StatusCmdType::RegionDetail); - let mut raft_cmd = RaftCmdRequest::default(); - raft_cmd.set_header(header); - raft_cmd.set_status_request(status_request); - - let (tx, rx) = oneshot::channel(); - let cb = Callback::read(Box::new(|resp| tx.send(resp).unwrap())); - - async move { - raft_router - .send_command(raft_cmd, cb, RaftCmdExtraOpts::default()) - .map_err(|e| 
Error::Other(Box::new(e)))?; - - let mut r = rx.map_err(|e| Error::Other(Box::new(e))).await?; - - if r.response.get_header().has_error() { - let e = r.response.get_header().get_error(); - warn!("region_detail got error"; "err" => ?e); - return Err(Error::Other(e.message.clone().into())); - } - - let detail = r.response.take_status_response().take_region_detail(); - debug!("region_detail got region detail"; "detail" => ?detail); - let leader_store_id = detail.get_leader().get_store_id(); - if leader_store_id != store_id { - let msg = format!("Leader is on store {}", leader_store_id); - return Err(Error::Other(msg.into())); - } - Ok(detail) - } -} - -fn consistency_check>( - raft_router: T, - mut detail: RegionDetailResponse, -) -> impl Future> { - let mut header = RaftRequestHeader::default(); - header.set_region_id(detail.get_region().get_id()); - header.set_peer(detail.take_leader()); - let mut admin_request = AdminRequest::default(); - admin_request.set_cmd_type(AdminCmdType::ComputeHash); - let mut raft_cmd = RaftCmdRequest::default(); - raft_cmd.set_header(header); - raft_cmd.set_admin_request(admin_request); - - let (tx, rx) = oneshot::channel(); - let cb = Callback::read(Box::new(|resp| tx.send(resp).unwrap())); - - async move { - raft_router - .send_command(raft_cmd, cb, RaftCmdExtraOpts::default()) - .map_err(|e| Error::Other(Box::new(e)))?; - - let r = rx.map_err(|e| Error::Other(Box::new(e))).await?; - - if r.response.get_header().has_error() { - let e = r.response.get_header().get_error(); - warn!("consistency-check got error"; "err" => ?e); - return Err(Error::Other(e.message.clone().into())); - } - Ok(()) - } -} - mod region_size_response { pub type Entry = kvproto::debugpb::RegionSizeResponseEntry; } diff --git a/src/server/service/diagnostics/log.rs b/src/server/service/diagnostics/log.rs index 6f06bf17b30..8e77d65233e 100644 --- a/src/server/service/diagnostics/log.rs +++ b/src/server/service/diagnostics/log.rs @@ -559,7 +559,7 @@ Some invalid logs 
2: Welcome to TiKV .unwrap(); let log_file2 = dir.path().join("tikv.2019-08-23T18-10-00.387.log"); - let mut file = File::create(&log_file2).unwrap(); + let mut file = File::create(log_file2).unwrap(); write!( file, r#"[2019/08/23 18:10:01.387 +08:00] [INFO] [foo.rs:100] [some message] [key=val] @@ -736,7 +736,7 @@ Some invalid logs 4: Welcome to TiKV - test-filter"# // this file is ignored because its filename is not expected let log_file2 = dir.path().join("tikv.log.2"); - let mut file = File::create(&log_file2).unwrap(); + let mut file = File::create(log_file2).unwrap(); write!( file, r#"[2019/08/23 18:10:01.387 +08:00] [INFO] [foo.rs:100] [some message] [key=val] @@ -749,7 +749,7 @@ Some invalid logs 4: Welcome to TiKV - test-filter"# .unwrap(); let log_file3 = dir.path().join("tikv.2019-08-23T18-11-02.123.log"); - let mut file = File::create(&log_file3).unwrap(); + let mut file = File::create(log_file3).unwrap(); write!( file, r#"[2019/08/23 18:11:53.387 +08:00] [INFO] [foo.rs:100] [some message] [key=val] @@ -766,7 +766,7 @@ Some invalid logs 2: Welcome to TiKV - test-filter"# // this file is ignored because its filename is not expected let log_file4 = dir.path().join("tikv.T.log"); - let mut file = File::create(&log_file4).unwrap(); + let mut file = File::create(log_file4).unwrap(); write!( file, r#"[2019/08/23 18:10:01.387 +08:00] [INFO] [foo.rs:100] [some message] [key=val] diff --git a/src/server/service/diagnostics/sys.rs b/src/server/service/diagnostics/sys.rs index e62028e66e6..17ed9a78b3f 100644 --- a/src/server/service/diagnostics/sys.rs +++ b/src/server/service/diagnostics/sys.rs @@ -201,7 +201,7 @@ fn nic_load_info(prev_nic: HashMap, collector: &mut Vec, collector: &mut Vec) { let current = ioload::IoLoad::snapshot(); - let rate = |cur, prev| (cur - prev) as f64; + let rate = |cur, prev| (cur - prev); for (name, cur) in current.into_iter() { let prev = match prev_io.get(&name) { Some(p) => p, diff --git a/src/server/service/kv.rs 
b/src/server/service/kv.rs index 84015ddab57..db50dfe459e 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -6,7 +6,6 @@ use std::{mem, sync::Arc}; use api_version::KvFormat; use fail::fail_point; use futures::{ - channel::oneshot, compat::Future01CompatExt, future::{self, Future, FutureExt, TryFutureExt}, sink::SinkExt, @@ -16,30 +15,19 @@ use grpcio::{ ClientStreamingSink, DuplexSink, Error as GrpcError, RequestStream, Result as GrpcResult, RpcContext, RpcStatus, RpcStatusCode, ServerStreamingSink, UnarySink, WriteFlags, }; -use kvproto::{ - coprocessor::*, - errorpb::{Error as RegionError, *}, - kvrpcpb::*, - mpp::*, - raft_cmdpb::{ - AdminCmdType, AdminRequest, CmdType, RaftCmdRequest, RaftRequestHeader, - Request as RaftRequest, - }, - raft_serverpb::*, - tikvpb::*, -}; +use kvproto::{coprocessor::*, kvrpcpb::*, mpp::*, raft_serverpb::*, tikvpb::*}; use protobuf::RepeatedField; use raft::eraftpb::MessageType; use raftstore::{ - router::RaftStoreRouter, store::{ memory::{MEMTRACE_APPLYS, MEMTRACE_RAFT_ENTRIES, MEMTRACE_RAFT_MESSAGES}, metrics::RAFT_ENTRIES_CACHES_GAUGE, - Callback, CasualMessage, CheckLeaderTask, RaftCmdExtraOpts, + CheckLeaderTask, }, - DiscardReason, Error as RaftStoreError, Result as RaftStoreResult, + Error as RaftStoreError, Result as RaftStoreResult, }; use tikv_alloc::trace::MemoryTraceGuard; +use tikv_kv::RaftExtension; use tikv_util::{ future::{paired_future_callback, poll_future_notify}, mpsc::future::{unbounded, BatchReceiver, Sender, WakePolicy}, @@ -47,9 +35,8 @@ use tikv_util::{ time::{duration_to_ms, duration_to_sec, Instant}, worker::Scheduler, }; -use tokio::sync::Mutex; use tracker::{set_tls_tracker_token, RequestInfo, RequestType, Tracker, GLOBAL_TRACKERS}; -use txn_types::{self, Key, WriteBatchFlags}; +use txn_types::{self, Key}; use super::batch::{BatcherBuilder, ReqBatcher}; use crate::{ @@ -60,9 +47,10 @@ use crate::{ Error, Proxy, Result as ServerResult, }, storage::{ + self, errors::{ 
extract_committed, extract_key_error, extract_key_errors, extract_kv_pairs, - extract_region_error, map_kv_pairs, + extract_region_error, extract_region_error_from_error, map_kv_pairs, }, kv::Engine, lock_manager::LockManager, @@ -74,18 +62,16 @@ const GRPC_MSG_MAX_BATCH_SIZE: usize = 128; const GRPC_MSG_NOTIFY_SIZE: usize = 8; /// Service handles the RPC messages for the `Tikv` service. -pub struct Service + 'static, E: Engine, L: LockManager, F: KvFormat> { +pub struct Service { store_id: u64, /// Used to handle requests related to GC. - gc_worker: GcWorker, + gc_worker: GcWorker, // For handling KV requests. storage: Storage, // For handling coprocessor requests. copr: Endpoint, // For handling corprocessor v2 requests. copr_v2: coprocessor_v2::Endpoint, - // For handling raft messages. - ch: T, // For handling snapshot. snap_scheduler: Scheduler, // For handling `CheckLeader` request. @@ -101,13 +87,7 @@ pub struct Service + 'static, E: Engine, L: LockMan reject_messages_on_memory_ratio: f64, } -impl< - T: RaftStoreRouter + Clone + 'static, - E: Engine + Clone, - L: LockManager + Clone, - F: KvFormat, -> Clone for Service -{ +impl Clone for Service { fn clone(&self) -> Self { Service { store_id: self.store_id, @@ -115,7 +95,6 @@ impl< storage: self.storage.clone(), copr: self.copr.clone(), copr_v2: self.copr_v2.clone(), - ch: self.ch.clone(), snap_scheduler: self.snap_scheduler.clone(), check_leader_scheduler: self.check_leader_scheduler.clone(), enable_req_batch: self.enable_req_batch, @@ -126,17 +105,14 @@ impl< } } -impl + 'static, E: Engine, L: LockManager, F: KvFormat> - Service -{ +impl Service { /// Constructs a new `Service` which provides the `Tikv` service. 
pub fn new( store_id: u64, storage: Storage, - gc_worker: GcWorker, + gc_worker: GcWorker, copr: Endpoint, copr_v2: coprocessor_v2::Endpoint, - ch: T, snap_scheduler: Scheduler, check_leader_scheduler: Scheduler, grpc_thread_load: Arc, @@ -150,7 +126,6 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor storage, copr, copr_v2, - ch, snap_scheduler, check_leader_scheduler, enable_req_batch, @@ -162,7 +137,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor fn handle_raft_message( store_id: u64, - ch: &T, + ch: &E::RaftExtension, msg: RaftMessage, reject: bool, ) -> RaftStoreResult<()> { @@ -177,13 +152,11 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor RAFT_APPEND_REJECTS.inc(); let id = msg.get_region_id(); let peer_id = msg.get_message().get_from(); - let m = CasualMessage::RejectRaftAppend { peer_id }; - let _ = ch.send_casual_msg(id, m); + ch.report_reject_message(id, peer_id); return Ok(()); } - // `send_raft_msg` may return `RaftStoreError::RegionNotFound` or - // `RaftStoreError::Transport(DiscardReason::Full)` - ch.send_raft_msg(msg) + ch.feed(msg, false); + Ok(()) } } @@ -233,9 +206,7 @@ macro_rules! 
set_total_time { }; } -impl + 'static, E: Engine, L: LockManager, F: KvFormat> Tikv - for Service -{ +impl Tikv for Service { handle_request!(kv_get, future_get, GetRequest, GetResponse, has_time_detail); handle_request!(kv_scan, future_scan, ScanRequest, ScanResponse); handle_request!( @@ -414,7 +385,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor let begin_instant = Instant::now(); let source = req.mut_context().take_request_source(); - let resp = future_prepare_flashback_to_version(&self.storage, &self.ch, req); + let resp = future_prepare_flashback_to_version(&self.storage, req); let task = async move { let resp = resp.await?; let elapsed = begin_instant.saturating_elapsed(); @@ -445,7 +416,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor let begin_instant = Instant::now(); let source = req.mut_context().take_request_source(); - let resp = future_flashback_to_version(&self.storage, &self.ch, req); + let resp = future_flashback_to_version(&self.storage, req); let task = async move { let resp = resp.await?; let elapsed = begin_instant.saturating_elapsed(); @@ -523,169 +494,6 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor ctx.spawn(task); } - fn register_lock_observer( - &mut self, - ctx: RpcContext<'_>, - req: RegisterLockObserverRequest, - sink: UnarySink, - ) { - let begin_instant = Instant::now(); - - let (cb, f) = paired_future_callback(); - let res = self.gc_worker.start_collecting(req.get_max_ts().into(), cb); - - let task = async move { - // Here except for the receiving error of `futures::channel::oneshot`, - // other errors will be returned as the successful response of rpc. 
- let res = match res { - Err(e) => Err(e), - Ok(_) => f.await?, - }; - let mut resp = RegisterLockObserverResponse::default(); - if let Err(e) = res { - resp.set_error(format!("{}", e)); - } - sink.success(resp).await?; - GRPC_MSG_HISTOGRAM_STATIC - .register_lock_observer - .observe(duration_to_sec(begin_instant.saturating_elapsed())); - ServerResult::Ok(()) - } - .map_err(|e| { - log_net_error!(e, "kv rpc failed"; - "request" => "register_lock_observer" - ); - GRPC_MSG_FAIL_COUNTER.register_lock_observer.inc(); - }) - .map(|_| ()); - - ctx.spawn(task); - } - - fn check_lock_observer( - &mut self, - ctx: RpcContext<'_>, - req: CheckLockObserverRequest, - sink: UnarySink, - ) { - let begin_instant = Instant::now(); - - let (cb, f) = paired_future_callback(); - let res = self - .gc_worker - .get_collected_locks(req.get_max_ts().into(), cb); - - let task = async move { - let res = match res { - Err(e) => Err(e), - Ok(_) => f.await?, - }; - let mut resp = CheckLockObserverResponse::default(); - match res { - Ok((locks, is_clean)) => { - resp.set_is_clean(is_clean); - resp.set_locks(locks.into()); - } - Err(e) => resp.set_error(format!("{}", e)), - } - sink.success(resp).await?; - GRPC_MSG_HISTOGRAM_STATIC - .check_lock_observer - .observe(duration_to_sec(begin_instant.saturating_elapsed())); - ServerResult::Ok(()) - } - .map_err(|e| { - log_net_error!(e, "kv rpc failed"; - "request" => "check_lock_observer" - ); - GRPC_MSG_FAIL_COUNTER.check_lock_observer.inc(); - }) - .map(|_| ()); - - ctx.spawn(task); - } - - fn remove_lock_observer( - &mut self, - ctx: RpcContext<'_>, - req: RemoveLockObserverRequest, - sink: UnarySink, - ) { - let begin_instant = Instant::now(); - - let (cb, f) = paired_future_callback(); - let res = self.gc_worker.stop_collecting(req.get_max_ts().into(), cb); - - let task = async move { - let res = match res { - Err(e) => Err(e), - Ok(_) => f.await?, - }; - let mut resp = RemoveLockObserverResponse::default(); - if let Err(e) = res { - 
resp.set_error(format!("{}", e)); - } - sink.success(resp).await?; - GRPC_MSG_HISTOGRAM_STATIC - .remove_lock_observer - .observe(duration_to_sec(begin_instant.saturating_elapsed())); - ServerResult::Ok(()) - } - .map_err(|e| { - log_net_error!(e, "kv rpc failed"; - "request" => "remove_lock_observer" - ); - GRPC_MSG_FAIL_COUNTER.remove_lock_observer.inc(); - }) - .map(|_| ()); - - ctx.spawn(task); - } - - fn physical_scan_lock( - &mut self, - ctx: RpcContext<'_>, - mut req: PhysicalScanLockRequest, - sink: UnarySink, - ) { - let begin_instant = Instant::now(); - - let (cb, f) = paired_future_callback(); - let res = self.gc_worker.physical_scan_lock( - req.take_context(), - req.get_max_ts().into(), - Key::from_raw(req.get_start_key()), - req.get_limit() as _, - cb, - ); - - let task = async move { - let res = match res { - Err(e) => Err(e), - Ok(_) => f.await?, - }; - let mut resp = PhysicalScanLockResponse::default(); - match res { - Ok(locks) => resp.set_locks(locks.into()), - Err(e) => resp.set_error(format!("{}", e)), - } - sink.success(resp).await?; - GRPC_MSG_HISTOGRAM_STATIC - .physical_scan_lock - .observe(duration_to_sec(begin_instant.saturating_elapsed())); - ServerResult::Ok(()) - } - .map_err(|e| { - log_net_error!(e, "kv rpc failed"; - "request" => "physical_scan_lock" - ); - GRPC_MSG_FAIL_COUNTER.physical_scan_lock.inc(); - }) - .map(|_| ()); - - ctx.spawn(task); - } - fn unsafe_destroy_range( &mut self, ctx: RpcContext<'_>, @@ -782,7 +590,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor sink: ClientStreamingSink, ) { let store_id = self.store_id; - let ch = self.ch.clone(); + let ch = self.storage.get_engine().raft_extension().clone(); let reject_messages_on_memory_ratio = self.reject_messages_on_memory_ratio; let res = async move { @@ -825,7 +633,7 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor ) { info!("batch_raft RPC is called, new gRPC stream established"); let store_id = self.store_id; - let ch = self.ch.clone(); + let ch = 
self.storage.get_engine().raft_extension().clone(); let reject_messages_on_memory_ratio = self.reject_messages_on_memory_ratio; let res = async move { @@ -894,7 +702,6 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor let begin_instant = Instant::now(); let region_id = req.get_context().get_region_id(); - let (cb, f) = paired_future_callback(); let mut split_keys = if req.is_raw_kv { if !req.get_split_key().is_empty() { vec![F::encode_raw_key_owned(req.take_split_key(), None).into_encoded()] @@ -915,52 +722,45 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor } }; split_keys.sort(); - let req = CasualMessage::SplitRegion { - region_epoch: req.take_context().take_region_epoch(), + let engine = self.storage.get_engine(); + let f = engine.raft_extension().split( + region_id, + req.take_context().take_region_epoch(), split_keys, - callback: Callback::write(cb), - source: ctx.peer().into(), - }; - - if let Err(e) = self.ch.send_casual_msg(region_id, req) { - // Retrun region error instead a gRPC error. 
- let mut resp = SplitRegionResponse::default(); - resp.set_region_error(raftstore_error_to_region_error(e, region_id)); - ctx.spawn( - async move { - sink.success(resp).await?; - ServerResult::Ok(()) - } - .map_err(|_| ()) - .map(|_| ()), - ); - return; - } + ctx.peer(), + ); let task = async move { - let mut res = f.await?; + let res = f.await; let mut resp = SplitRegionResponse::default(); - if res.response.get_header().has_error() { - resp.set_region_error(res.response.mut_header().take_error()); - } else { - let admin_resp = res.response.mut_admin_response(); - let regions: Vec<_> = admin_resp.mut_splits().take_regions().into(); - if regions.len() < 2 { - error!( - "invalid split response"; - "region_id" => region_id, - "resp" => ?admin_resp - ); - resp.mut_region_error().set_message(format!( - "Internal Error: invalid response: {:?}", - admin_resp - )); - } else { - if regions.len() == 2 { - resp.set_left(regions[0].clone()); - resp.set_right(regions[1].clone()); + match res { + Ok(regions) => { + if regions.len() < 2 { + error!( + "invalid split response"; + "region_id" => region_id, + "resp" => ?regions + ); + resp.mut_region_error().set_message(format!( + "Internal Error: invalid response: {:?}", + regions + )); + } else { + if regions.len() == 2 { + resp.set_left(regions[0].clone()); + resp.set_right(regions[1].clone()); + } + resp.set_regions(regions.into()); + } + } + Err(e) => { + let err: crate::storage::Result<()> = Err(e.into()); + if let Some(err) = extract_region_error(&err) { + resp.set_region_error(err) + } else { + resp.mut_region_error() + .set_message(format!("failed to split: {:?}", err)); } - resp.set_regions(regions.into()); } } sink.success(resp).await?; @@ -980,103 +780,6 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor ctx.spawn(task); } - fn read_index( - &mut self, - ctx: RpcContext<'_>, - req: ReadIndexRequest, - sink: UnarySink, - ) { - forward_unary!(self.proxy, read_index, ctx, req, sink); - let begin_instant = 
Instant::now(); - - let region_id = req.get_context().get_region_id(); - let mut cmd = RaftCmdRequest::default(); - let mut header = RaftRequestHeader::default(); - let mut inner_req = RaftRequest::default(); - inner_req.set_cmd_type(CmdType::ReadIndex); - inner_req.mut_read_index().set_start_ts(req.get_start_ts()); - for r in req.get_ranges() { - let mut range = kvproto::kvrpcpb::KeyRange::default(); - range.set_start_key(Key::from_raw(r.get_start_key()).into_encoded()); - range.set_end_key(Key::from_raw(r.get_end_key()).into_encoded()); - inner_req.mut_read_index().mut_key_ranges().push(range); - } - header.set_region_id(req.get_context().get_region_id()); - header.set_peer(req.get_context().get_peer().clone()); - header.set_region_epoch(req.get_context().get_region_epoch().clone()); - if req.get_context().get_term() != 0 { - header.set_term(req.get_context().get_term()); - } - header.set_sync_log(req.get_context().get_sync_log()); - header.set_read_quorum(true); - cmd.set_header(header); - cmd.set_requests(vec![inner_req].into()); - - let (cb, f) = paired_future_callback(); - - // We must deal with all requests which acquire read-quorum in raftstore-thread, - // so just send it as an command. - if let Err(e) = self - .ch - .send_command(cmd, Callback::read(cb), RaftCmdExtraOpts::default()) - { - // Retrun region error instead a gRPC error. 
- let mut resp = ReadIndexResponse::default(); - resp.set_region_error(raftstore_error_to_region_error(e, region_id)); - ctx.spawn( - async move { - sink.success(resp).await?; - ServerResult::Ok(()) - } - .map_err(|_| ()) - .map(|_| ()), - ); - return; - } - - let task = async move { - let mut res = f.await?; - let mut resp = ReadIndexResponse::default(); - if res.response.get_header().has_error() { - resp.set_region_error(res.response.mut_header().take_error()); - } else { - let mut raft_resps = res.response.take_responses(); - if raft_resps.len() != 1 { - error!( - "invalid read index response"; - "region_id" => region_id, - "response" => ?raft_resps - ); - resp.mut_region_error().set_message(format!( - "Internal Error: invalid response: {:?}", - raft_resps - )); - } else { - let mut read_index_resp = raft_resps[0].take_read_index(); - if read_index_resp.has_locked() { - resp.set_locked(read_index_resp.take_locked()); - } else { - resp.set_read_index(read_index_resp.get_read_index()); - } - } - } - sink.success(resp).await?; - GRPC_MSG_HISTOGRAM_STATIC - .read_index - .observe(begin_instant.saturating_elapsed_secs()); - ServerResult::Ok(()) - } - .map_err(|e| { - log_net_error!(e, "kv rpc failed"; - "request" => "read_index" - ); - GRPC_MSG_FAIL_COUNTER.read_index.inc(); - }) - .map(|_| ()); - - ctx.spawn(task); - } - fn batch_commands( &mut self, ctx: RpcContext<'_>, @@ -1093,7 +796,6 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor let copr_v2 = self.copr_v2.clone(); let pool_size = storage.get_normal_pool_size(); let batch_builder = BatcherBuilder::new(self.enable_req_batch, pool_size); - let ch = self.ch.clone(); let request_handler = stream.try_for_each(move |mut req| { let request_ids = req.take_request_ids(); let requests: Vec<_> = req.take_requests().into(); @@ -1110,7 +812,6 @@ impl + 'static, E: Engine, L: LockManager, F: KvFor id, req, &tx, - &ch, ); if let Some(batch) = batcher.as_mut() { batch.maybe_commit(&storage, &tx); @@ -1311,12 +1012,7 @@ 
fn response_batch_commands_request( poll_future_notify(task); } -fn handle_batch_commands_request< - T: RaftStoreRouter + 'static, - E: Engine, - L: LockManager, - F: KvFormat, ->( +fn handle_batch_commands_request( batcher: &mut Option, storage: &Storage, copr: &Endpoint, @@ -1325,7 +1021,6 @@ fn handle_batch_commands_request< id: u64, req: batch_commands_request::Request, tx: &Sender, - ch: &T, ) { // To simplify code and make the logic more clear. macro_rules! oneof { @@ -1428,8 +1123,8 @@ fn handle_batch_commands_request< ResolveLock, future_resolve_lock(storage), kv_resolve_lock; Gc, future_gc(), kv_gc; DeleteRange, future_delete_range(storage), kv_delete_range; - PrepareFlashbackToVersion, future_prepare_flashback_to_version(storage, ch), kv_prepare_flashback_to_version; - FlashbackToVersion, future_flashback_to_version(storage, ch), kv_flashback_to_version; + PrepareFlashbackToVersion, future_prepare_flashback_to_version(storage), kv_prepare_flashback_to_version; + FlashbackToVersion, future_flashback_to_version(storage), kv_flashback_to_version; RawBatchGet, future_raw_batch_get(storage), raw_batch_get; RawPut, future_raw_put(storage), raw_put; RawBatchPut, future_raw_batch_put(storage), raw_batch_put; @@ -1725,77 +1420,64 @@ fn future_delete_range( // Preparing the flashback for a region will "lock" the region so that // there is no any read, write or scheduling operation could be proposed before // the actual flashback operation. -fn future_prepare_flashback_to_version< - E: Engine, - L: LockManager, - F: KvFormat, - T: RaftStoreRouter + 'static, ->( +// NOTICE: the caller needs to make sure the version we want to flashback won't +// be between any transactions that have not been fully committed. +fn future_prepare_flashback_to_version( // Keep this param to hint the type of E for the compiler. 
- _storage: &Storage, - raft_router: &T, + storage: &Storage, req: PrepareFlashbackToVersionRequest, ) -> impl Future> { - let raft_router = Mutex::new(raft_router.clone()); + let storage = storage.clone(); async move { - // Send an `AdminCmdType::PrepareFlashback` to prepare the raftstore for the - // later flashback. Once invoked, we will update the persistent region meta and - // the memory state of the flashback in Peer FSM to reject all read, write - // and scheduling operations for this region when propose/apply before we - // start the actual data flashback transaction command in the next phase. - send_flashback_msg::( - &raft_router, - req.get_context(), - AdminCmdType::PrepareFlashback, - ) - .await?; - Ok(PrepareFlashbackToVersionResponse::default()) + let f = storage.get_engine().start_flashback(req.get_context()); + let mut res = f.await.map_err(storage::Error::from); + if matches!(res, Ok(())) { + // After the region is put into the flashback state, we need to do a special + // prewrite to prevent `resolved_ts` from advancing. + let (cb, f) = paired_future_callback(); + res = storage.sched_txn_command(req.clone().into(), cb); + if matches!(res, Ok(())) { + res = f.await.unwrap_or_else(|e| Err(box_err!(e))); + } + } + let mut resp = PrepareFlashbackToVersionResponse::default(); + if let Some(e) = extract_region_error(&res) { + resp.set_region_error(e); + } else if let Err(e) = res { + resp.set_error(format!("{}", e)); + } + Ok(resp) } } // Flashback the region to a specific point with the given `version`, please // make sure the region is "locked" by `PrepareFlashbackToVersion` first, // otherwise this request will fail. 
-fn future_flashback_to_version< - T: RaftStoreRouter + 'static, - E: Engine, - L: LockManager, - F: KvFormat, ->( +fn future_flashback_to_version( storage: &Storage, - raft_router: &T, req: FlashbackToVersionRequest, ) -> impl Future> { - let storage_clone = storage.clone(); - let raft_router = Mutex::new(raft_router.clone()); + let storage = storage.clone(); async move { // Perform the data flashback transaction command. We will check if the region // is in the flashback state when proposing the flashback modification. let (cb, f) = paired_future_callback(); - let res = storage_clone.sched_txn_command(req.clone().into(), cb); - // Avoid crossing `.await` to bypass the `Send` constraint. - drop(storage_clone); - let v = match res { - Err(e) => Err(e), - Ok(_) => f.await?, - }; - fail_point!("skip_finish_flashback_to_version", |_| { - Ok(FlashbackToVersionResponse::default()) - }); - // Send an `AdminCmdType::FinishFlashback` to unset the persistence state - // in `RegionLocalState` and region's meta, and when that - // admin cmd is applied, will update the memory - // state of the flashback - send_flashback_msg::( - &raft_router, - req.get_context(), - AdminCmdType::FinishFlashback, - ) - .await?; + let mut res = storage.sched_txn_command(req.clone().into(), cb); + if matches!(res, Ok(())) { + res = f.await.unwrap_or_else(|e| Err(box_err!(e))); + } + if matches!(res, Ok(())) { + // Only finish flashback when Flashback executed successfully. 
+ fail_point!("skip_finish_flashback_to_version", |_| { + Ok(FlashbackToVersionResponse::default()) + }); + let f = storage.get_engine().end_flashback(req.get_context()); + res = f.await.map_err(storage::Error::from); + } let mut resp = FlashbackToVersionResponse::default(); - if let Some(err) = extract_region_error(&v) { + if let Some(err) = extract_region_error(&res) { resp.set_region_error(err); - } else if let Err(e) = v { + } else if let Err(e) = res { resp.set_error(format!("{}", e)); } Ok(resp) @@ -2185,12 +1867,12 @@ fn future_raw_coprocessor( } macro_rules! txn_command_future { - ($fn_name: ident, $req_ty: ident, $resp_ty: ident, ($req: ident) $prelude: stmt; ($v: ident, $resp: ident, $tracker: ident) { $else_branch: expr }) => { + ($fn_name: ident, $req_ty: ident, $resp_ty: ident, ($req: ident) {$($prelude: stmt)*}; ($v: ident, $resp: ident, $tracker: ident) { $else_branch: expr }) => { fn $fn_name( storage: &Storage, $req: $req_ty, ) -> impl Future> { - $prelude + $($prelude)* let $tracker = GLOBAL_TRACKERS.insert(Tracker::new(RequestInfo::new( $req.get_context(), RequestType::Unknown, @@ -2237,22 +1919,42 @@ txn_command_future!(future_prewrite, PrewriteRequest, PrewriteResponse, (v, resp } resp.set_errors(extract_key_errors(v.map(|v| v.locks)).into()); }}); -txn_command_future!(future_acquire_pessimistic_lock, PessimisticLockRequest, PessimisticLockResponse, (v, resp, tracker) {{ - match v { - Ok(Ok(res)) => { - let (values, not_founds) = res.into_values_and_not_founds(); - resp.set_values(values.into()); - resp.set_not_founds(not_founds); - }, - Err(e) | Ok(Err(e)) => { - resp.set_errors(vec![extract_key_error(&e)].into()) - }, - } - GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { - tracker.write_scan_detail(resp.mut_exec_details_v2().mut_scan_detail_v2()); - tracker.write_write_detail(resp.mut_exec_details_v2().mut_write_detail()); - }); -}}); +txn_command_future!(future_acquire_pessimistic_lock, PessimisticLockRequest, PessimisticLockResponse, + 
(req) { + let mode = req.get_wake_up_mode() + }; + (v, resp, tracker) {{ + match v { + Ok(Ok(res)) => { + match mode { + PessimisticLockWakeUpMode::WakeUpModeForceLock => { + let (res, error) = res.into_pb(); + resp.set_results(res.into()); + if let Some(e) = error { + if let Some(region_error) = extract_region_error_from_error(&e.0) { + resp.set_region_error(region_error); + } else { + resp.set_errors(vec![extract_key_error(&e.0)].into()); + } + } + } + PessimisticLockWakeUpMode::WakeUpModeNormal => { + let (values, not_founds) = res.into_legacy_values_and_not_founds(); + resp.set_values(values.into()); + resp.set_not_founds(not_founds); + } + } + }, + Err(e) | Ok(Err(e)) => { + resp.set_errors(vec![extract_key_error(&e)].into()) + }, + } + GLOBAL_TRACKERS.with_tracker(tracker, |tracker| { + tracker.write_scan_detail(resp.mut_exec_details_v2().mut_scan_detail_v2()); + tracker.write_write_detail(resp.mut_exec_details_v2().mut_write_detail()); + }); + }} +); txn_command_future!(future_pessimistic_rollback, PessimisticRollbackRequest, PessimisticRollbackResponse, (v, resp) { resp.set_errors(extract_key_errors(v).into()) }); @@ -2425,20 +2127,6 @@ fn collect_batch_resp(v: &mut MeasuredBatchResponse, mut e: MeasuredSingleRespon v.measures.push(e.measure); } -fn raftstore_error_to_region_error(e: RaftStoreError, region_id: u64) -> RegionError { - if let RaftStoreError::Transport(DiscardReason::Disconnected) = e { - // `From::from(RaftStoreError) -> RegionError` treats `Disconnected` as `Other`. 
- let mut region_error = RegionError::default(); - let region_not_found = RegionNotFound { - region_id, - ..Default::default() - }; - region_error.set_region_not_found(region_not_found); - return region_error; - } - e.into() -} - fn needs_reject_raft_append(reject_messages_on_memory_ratio: f64) -> bool { fail_point!("needs_reject_raft_append", |_| true); if reject_messages_on_memory_ratio < f64::EPSILON { @@ -2465,61 +2153,6 @@ fn needs_reject_raft_append(reject_messages_on_memory_ratio: f64) -> bool { false } -async fn send_flashback_msg + 'static, E: Engine>( - raft_router: &Mutex, - ctx: &Context, - cmd_type: AdminCmdType, -) -> ServerResult<()> { - let region_id = ctx.get_region_id(); - let (result_tx, result_rx) = oneshot::channel(); - let cb = Callback::write(Box::new(move |resp| { - if resp.response.get_header().has_error() { - result_tx.send(false).unwrap(); - error!("exec flashback msg failed"; - "region_id" => region_id, - "type" => ?cmd_type, - "error" => ?resp.response.get_header().get_error()); - return; - } - result_tx.send(true).unwrap(); - })); - let mut admin = AdminRequest::default(); - admin.set_cmd_type(cmd_type); - let mut req = RaftCmdRequest::default(); - req.mut_header().set_region_id(region_id); - req.mut_header() - .set_region_epoch(ctx.get_region_epoch().clone()); - req.mut_header().set_peer(ctx.get_peer().clone()); - req.set_admin_request(admin); - req.mut_header() - .set_flags(WriteBatchFlags::FLASHBACK.bits()); - // call admin request directly - let raft_router = raft_router.lock().await; - if let Err(e) = raft_router.send_command( - req, - cb, - RaftCmdExtraOpts { - deadline: None, - disk_full_opt: DiskFullOpt::AllowedOnAlmostFull, - }, - ) { - return Err(Error::Other(box_err!( - "send flashback msg {:?} failed for region {}, error {:?}", - cmd_type, - region_id, - e - ))); - } - if !result_rx.await? 
{ - return Err(Error::Other(box_err!( - "wait flashback msg {:?} result failed for region {} failed", - cmd_type, - region_id - ))); - } - Ok(()) -} - #[cfg(test)] mod tests { use std::thread; diff --git a/src/server/snap.rs b/src/server/snap.rs index 49c38cb645b..8fe737c2e60 100644 --- a/src/server/snap.rs +++ b/src/server/snap.rs @@ -3,7 +3,6 @@ use std::{ fmt::{self, Display, Formatter}, io::{Read, Write}, - marker::PhantomData, pin::Pin, sync::{ atomic::{AtomicUsize, Ordering}, @@ -12,7 +11,6 @@ use std::{ time::Duration, }; -use engine_traits::KvEngine; use file_system::{IoType, WithIoType}; use futures::{ future::{Future, TryFutureExt}, @@ -29,11 +27,9 @@ use kvproto::{ tikvpb::TikvClient, }; use protobuf::Message; -use raftstore::{ - router::RaftStoreRouter, - store::{SnapEntry, SnapKey, SnapManager, Snapshot}, -}; +use raftstore::store::{SnapEntry, SnapKey, SnapManager, Snapshot}; use security::SecurityManager; +use tikv_kv::RaftExtension; use tikv_util::{ config::{Tracker, VersionTrack}, time::Instant, @@ -47,7 +43,7 @@ use crate::tikv_util::sys::thread::ThreadBuildWrapper; pub type Callback = Box) + Send>; -const DEFAULT_POOL_SIZE: usize = 4; +pub const DEFAULT_POOL_SIZE: usize = 4; /// A task for either receiving Snapshot or sending Snapshot pub enum Task { @@ -83,7 +79,7 @@ struct SnapChunk { remain_bytes: usize, } -const SNAP_CHUNK_LEN: usize = 1024 * 1024; +pub const SNAP_CHUNK_LEN: usize = 1024 * 1024; impl Stream for SnapChunk { type Item = Result<(SnapshotChunk, WriteFlags)>; @@ -260,7 +256,7 @@ impl RecvSnapContext { }) } - fn finish>(self, raft_router: R) -> Result<()> { + fn finish(self, raft_router: R) -> Result<()> { let _with_io_type = WithIoType::new(self.io_type); let key = self.key; if let Some(mut file) = self.file { @@ -271,15 +267,13 @@ impl RecvSnapContext { return Err(e); } } - if let Err(e) = raft_router.send_raft_msg(self.raft_msg) { - return Err(box_err!("{} failed to send snapshot to raft: {}", key, e)); - } + 
raft_router.feed(self.raft_msg, true); info!("saving all snapshot files"; "snap_key" => %key, "takes" => ?self.start.saturating_elapsed()); Ok(()) } } -fn recv_snap + 'static>( +fn recv_snap( stream: RequestStream, sink: ClientStreamingSink, snap_mgr: SnapManager, @@ -331,11 +325,7 @@ fn recv_snap + 'static>( } } -pub struct Runner -where - E: KvEngine, - R: RaftStoreRouter + 'static, -{ +pub struct Runner { env: Arc, snap_mgr: SnapManager, pool: Runtime, @@ -345,21 +335,16 @@ where cfg: Config, sending_count: Arc, recving_count: Arc, - engine: PhantomData, } -impl Runner -where - E: KvEngine, - R: RaftStoreRouter + 'static, -{ +impl Runner { pub fn new( env: Arc, snap_mgr: SnapManager, r: R, security_mgr: Arc, cfg: Arc>, - ) -> Runner { + ) -> Self { let cfg_tracker = cfg.clone().tracker("snap-sender".to_owned()); let snap_worker = Runner { env, @@ -377,7 +362,6 @@ where cfg: cfg.value().clone(), sending_count: Arc::new(AtomicUsize::new(0)), recving_count: Arc::new(AtomicUsize::new(0)), - engine: PhantomData, }; snap_worker } @@ -404,11 +388,7 @@ where } } -impl Runnable for Runner -where - E: KvEngine, - R: RaftStoreRouter + 'static, -{ +impl Runnable for Runner { type Task = Task; fn run(&mut self, task: Task) { diff --git a/src/server/status_server/profile.rs b/src/server/status_server/profile.rs index 3419c7df0c8..b3d91d3bea6 100644 --- a/src/server/status_server/profile.rs +++ b/src/server/status_server/profile.rs @@ -234,7 +234,7 @@ pub fn read_file(path: &str) -> Result, String> { pub fn jeprof_heap_profile(path: &str) -> Result, String> { info!("using jeprof to process {}", path); let output = Command::new("./jeprof") - .args(&["--show_bytes", "./bin/tikv-server", path, "--svg"]) + .args(["--show_bytes", "./bin/tikv-server", path, "--svg"]) .output() .map_err(|e| format!("jeprof: {}", e))?; if !output.status.success() { @@ -250,7 +250,7 @@ pub fn list_heap_profiles() -> Result, String> { None => return Ok(vec![]), }; - let dir = 
std::fs::read_dir(&path).map_err(|e| format!("read dir fail: {}", e))?; + let dir = std::fs::read_dir(path).map_err(|e| format!("read dir fail: {}", e))?; let mut profiles = Vec::new(); for item in dir { let item = match item { diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs new file mode 100644 index 00000000000..5dd83deb092 --- /dev/null +++ b/src/server/tablet_snap.rs @@ -0,0 +1,537 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + convert::{TryFrom, TryInto}, + fs::{self, File}, + io::{Read, Write}, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, + time::Duration, +}; + +use file_system::{IoType, WithIoType}; +use futures::{ + future::{Future, TryFutureExt}, + sink::{Sink, SinkExt}, + stream::{Stream, StreamExt, TryStreamExt}, +}; +use grpcio::{ + self, ChannelBuilder, ClientStreamingSink, Environment, RequestStream, RpcStatus, + RpcStatusCode, WriteFlags, +}; +use kvproto::{ + raft_serverpb::{Done, RaftMessage, RaftSnapshotData, SnapshotChunk}, + tikvpb::TikvClient, +}; +use protobuf::Message; +use raftstore::store::snap::{TabletSnapKey, TabletSnapManager}; +use security::SecurityManager; +use tikv_kv::RaftExtension; +use tikv_util::{ + config::{Tracker, VersionTrack}, + time::Instant, + worker::Runnable, +}; +use tokio::runtime::{Builder as RuntimeBuilder, Runtime}; + +use super::{ + metrics::*, + snap::{Task, DEFAULT_POOL_SIZE, SNAP_CHUNK_LEN}, + Config, Error, Result, +}; +use crate::tikv_util::{sys::thread::ThreadBuildWrapper, time::Limiter}; + +struct RecvTabletSnapContext { + key: TabletSnapKey, + raft_msg: RaftMessage, + io_type: IoType, + start: Instant, + chunk_size: usize, +} + +impl RecvTabletSnapContext { + fn new(mut head: SnapshotChunk) -> Result { + if !head.has_message() { + return Err(box_err!("no raft message in the first chunk")); + } + + let chunk_size = match head.take_data().try_into() { + Ok(buff) => usize::from_ne_bytes(buff), + Err(_) => return Err(box_err!("failed to get 
chunk size")), + }; + let meta = head.take_message(); + let key = TabletSnapKey::from_region_snap( + meta.get_region_id(), + meta.get_to_peer().get_id(), + meta.get_message().get_snapshot(), + ); + let io_type = io_type_from_raft_message(&meta)?; + + Ok(RecvTabletSnapContext { + key, + raft_msg: meta, + io_type, + start: Instant::now(), + chunk_size, + }) + } + + fn finish(self, raft_router: R) -> Result<()> { + let key = self.key; + raft_router.feed(self.raft_msg, true); + info!("saving all snapshot files"; "snap_key" => %key, "takes" => ?self.start.saturating_elapsed()); + Ok(()) + } +} + +fn io_type_from_raft_message(msg: &RaftMessage) -> Result { + let snapshot = msg.get_message().get_snapshot(); + let data = snapshot.get_data(); + let mut snapshot_data = RaftSnapshotData::default(); + snapshot_data.merge_from_bytes(data)?; + let snapshot_meta = snapshot_data.get_meta(); + if snapshot_meta.get_for_balance() { + Ok(IoType::LoadBalance) + } else { + Ok(IoType::Replication) + } +} + +async fn send_snap_files( + mgr: &TabletSnapManager, + mut sender: impl Sink<(SnapshotChunk, WriteFlags), Error = Error> + Unpin, + msg: RaftMessage, + key: TabletSnapKey, + limiter: Limiter, +) -> Result { + let path = mgr.tablet_gen_path(&key); + info!("begin to send snapshot file";"snap_key" => %key); + let files = fs::read_dir(&path)? 
+ .map(|f| Ok(f?.path())) + .filter(|f| f.is_ok() && f.as_ref().unwrap().is_file()) + .collect::>>()?; + let io_type = io_type_from_raft_message(&msg)?; + let _with_io_type = WithIoType::new(io_type); + let mut total_sent = msg.compute_size() as u64; + let mut chunk = SnapshotChunk::default(); + chunk.set_message(msg); + chunk.set_data(usize::to_ne_bytes(SNAP_CHUNK_LEN).to_vec()); + sender + .feed((chunk, WriteFlags::default().buffer_hint(true))) + .await?; + for path in files { + let name = path.file_name().unwrap().to_str().unwrap(); + let mut buffer = Vec::with_capacity(SNAP_CHUNK_LEN); + buffer.push(name.len() as u8); + buffer.extend_from_slice(name.as_bytes()); + let mut f = File::open(&path)?; + let mut off = buffer.len(); + loop { + unsafe { + buffer.set_len(SNAP_CHUNK_LEN); + } + // it should break if readed len is zero or the buffer is full. + while off < SNAP_CHUNK_LEN { + let readed = f.read(&mut buffer[off..])?; + if readed == 0 { + unsafe { + buffer.set_len(off); + } + break; + } + off += readed; + } + limiter.consume(off); + total_sent += off as u64; + let mut chunk = SnapshotChunk::default(); + chunk.set_data(buffer); + sender + .feed((chunk, WriteFlags::default().buffer_hint(true))) + .await?; + // It should switch the next file if the read buffer len is less than the + // SNAP_CHUNK_LEN. + if off < SNAP_CHUNK_LEN { + break; + } + buffer = Vec::with_capacity(SNAP_CHUNK_LEN); + off = 0 + } + } + info!("sent all snap file finish"; "snap_key" => %key); + sender.close().await?; + Ok(total_sent) +} + +/// Send the snapshot to specified address. +/// +/// It will first send the normal raft snapshot message and then send the +/// snapshot file. 
+pub fn send_snap( + env: Arc, + mgr: TabletSnapManager, + security_mgr: Arc, + cfg: &Config, + addr: &str, + msg: RaftMessage, + limiter: Limiter, +) -> Result>> { + assert!(msg.get_message().has_snapshot()); + let timer = Instant::now(); + let send_timer = SEND_SNAP_HISTOGRAM.start_coarse_timer(); + let key = TabletSnapKey::from_region_snap( + msg.get_region_id(), + msg.get_to_peer().get_id(), + msg.get_message().get_snapshot(), + ); + + let cb = ChannelBuilder::new(env) + .stream_initial_window_size(cfg.grpc_stream_initial_window_size.0 as i32) + .keepalive_time(cfg.grpc_keepalive_time.0) + .keepalive_timeout(cfg.grpc_keepalive_timeout.0) + .default_compression_algorithm(cfg.grpc_compression_algorithm()) + .default_gzip_compression_level(cfg.grpc_gzip_compression_level) + .default_grpc_min_message_size_to_compress(cfg.grpc_min_message_size_to_compress); + + let channel = security_mgr.connect(cb, addr); + let client = TikvClient::new(channel); + let (sink, receiver) = client.snapshot()?; + let send_task = async move { + let sink = sink.sink_map_err(Error::from); + let total_size = send_snap_files(&mgr, sink, msg, key.clone(), limiter).await?; + let recv_result = receiver.map_err(Error::from).await; + send_timer.observe_duration(); + drop(client); + match recv_result { + Ok(_) => { + mgr.delete_snapshot(&key); + Ok(SendStat { + key, + total_size, + elapsed: timer.saturating_elapsed(), + }) + } + Err(e) => Err(e), + } + }; + Ok(send_task) +} + +async fn recv_snap_files( + snap_mgr: TabletSnapManager, + mut stream: impl Stream> + Unpin, + limit: Limiter, +) -> Result { + let head = stream + .next() + .await + .transpose()? 
+ .ok_or_else(|| Error::Other("empty gRPC stream".into()))?; + let context = RecvTabletSnapContext::new(head)?; + let chunk_size = context.chunk_size; + let path = snap_mgr.tmp_recv_path(&context.key); + info!("begin to receive tablet snapshot files"; "file" => %path.display()); + fs::create_dir_all(&path)?; + let _with_io_type = WithIoType::new(context.io_type); + loop { + let mut chunk = match stream.next().await { + Some(Ok(mut c)) if !c.has_message() => c.take_data(), + Some(_) => { + return Err(box_err!("duplicated metadata")); + } + None => break, + }; + // the format of chunk: + // |--name_len--|--name--|--content--| + let len = chunk[0] as usize; + let file_name = box_try!(std::str::from_utf8(&chunk[1..len + 1])); + let p = path.join(file_name); + let mut f = File::create(&p)?; + let mut size = chunk.len() - len - 1; + f.write_all(&chunk[len + 1..])?; + // It should switch next file if the chunk size is less than the SNAP_CHUNK_LEN. + while chunk.len() >= chunk_size { + chunk = match stream.next().await { + Some(Ok(mut c)) if !c.has_message() => c.take_data(), + Some(_) => return Err(box_err!("duplicated metadata")), + None => return Err(box_err!("missing chunk")), + }; + f.write_all(&chunk[..])?; + limit.consume(chunk.len()); + size += chunk.len(); + } + debug!("received snap file"; "file" => %p.display(), "size" => size); + SNAP_LIMIT_TRANSPORT_BYTES_COUNTER_STATIC + .recv + .inc_by(size as u64); + f.sync_data()?; + } + info!("received all tablet snapshot file"; "snap_key" => %context.key); + let final_path = snap_mgr.final_recv_path(&context.key); + fs::rename(&path, final_path)?; + Ok(context) +} + +fn recv_snap( + stream: RequestStream, + sink: ClientStreamingSink, + snap_mgr: TabletSnapManager, + raft_router: R, + limit: Limiter, +) -> impl Future> { + let recv_task = async move { + let stream = stream.map_err(Error::from); + let context = recv_snap_files(snap_mgr, stream, limit).await?; + context.finish(raft_router) + }; + async move { + match 
recv_task.await { + Ok(()) => sink.success(Done::default()).await.map_err(Error::from), + Err(e) => { + let status = RpcStatus::with_message(RpcStatusCode::UNKNOWN, format!("{:?}", e)); + sink.fail(status).await.map_err(Error::from) + } + } + } +} + +pub struct TabletRunner { + env: Arc, + snap_mgr: TabletSnapManager, + security_mgr: Arc, + pool: Runtime, + raft_router: R, + cfg_tracker: Tracker, + cfg: Config, + sending_count: Arc, + recving_count: Arc, + limiter: Limiter, +} + +impl TabletRunner { + pub fn new( + env: Arc, + snap_mgr: TabletSnapManager, + r: R, + security_mgr: Arc, + cfg: Arc>, + ) -> Self { + let config = cfg.value().clone(); + let cfg_tracker = cfg.tracker("tablet-sender".to_owned()); + let limit = i64::try_from(config.snap_max_write_bytes_per_sec.0) + .unwrap_or_else(|_| panic!("snap_max_write_bytes_per_sec > i64::max_value")); + let limiter = Limiter::new(if limit > 0 { + limit as f64 + } else { + f64::INFINITY + }); + + let snap_worker = TabletRunner { + env, + snap_mgr, + pool: RuntimeBuilder::new_multi_thread() + .thread_name(thd_name!("tablet-snap-sender")) + .worker_threads(DEFAULT_POOL_SIZE) + .after_start_wrapper(tikv_alloc::add_thread_memory_accessor) + .before_stop_wrapper(tikv_alloc::remove_thread_memory_accessor) + .build() + .unwrap(), + raft_router: r, + security_mgr, + cfg_tracker, + cfg: config, + sending_count: Arc::new(AtomicUsize::new(0)), + recving_count: Arc::new(AtomicUsize::new(0)), + limiter, + }; + snap_worker + } + + fn refresh_cfg(&mut self) { + if let Some(incoming) = self.cfg_tracker.any_new() { + let limit = if incoming.snap_max_write_bytes_per_sec.0 > 0 { + incoming.snap_max_write_bytes_per_sec.0 as f64 + } else { + f64::INFINITY + }; + self.limiter.set_speed_limit(limit); + info!("refresh snapshot manager config"; + "speed_limit"=> limit); + self.cfg = incoming.clone(); + } + } +} + +pub struct SendStat { + key: TabletSnapKey, + total_size: u64, + elapsed: Duration, +} + +impl Runnable for TabletRunner { + type 
Task = Task; + + fn run(&mut self, task: Task) { + match task { + Task::Recv { stream, sink } => { + let task_num = self.recving_count.load(Ordering::SeqCst); + if task_num >= self.cfg.concurrent_recv_snap_limit { + warn!("too many recving snapshot tasks, ignore"); + let status = RpcStatus::with_message( + RpcStatusCode::RESOURCE_EXHAUSTED, + format!( + "the number of received snapshot tasks {} exceeded the limitation {}", + task_num, self.cfg.concurrent_recv_snap_limit + ), + ); + self.pool.spawn(sink.fail(status)); + return; + } + SNAP_TASK_COUNTER_STATIC.recv.inc(); + + let snap_mgr = self.snap_mgr.clone(); + let raft_router = self.raft_router.clone(); + let recving_count = self.recving_count.clone(); + recving_count.fetch_add(1, Ordering::SeqCst); + let limit = self.limiter.clone(); + let task = async move { + let result = recv_snap(stream, sink, snap_mgr, raft_router, limit).await; + recving_count.fetch_sub(1, Ordering::SeqCst); + if let Err(e) = result { + error!("failed to recv snapshot"; "err" => %e); + } + }; + self.pool.spawn(task); + } + Task::Send { addr, msg, cb } => { + let region_id = msg.get_region_id(); + if self.sending_count.load(Ordering::SeqCst) >= self.cfg.concurrent_send_snap_limit + { + warn!( + "too many sending snapshot tasks, drop Send Snap[to: {}, snap: {:?}]", + addr, msg + ); + cb(Err(Error::Other("Too many sending snapshot tasks".into()))); + return; + } + SNAP_TASK_COUNTER_STATIC.send.inc(); + + let env = Arc::clone(&self.env); + let mgr = self.snap_mgr.clone(); + let security_mgr = Arc::clone(&self.security_mgr); + let sending_count = Arc::clone(&self.sending_count); + sending_count.fetch_add(1, Ordering::SeqCst); + let limit = self.limiter.clone(); + let send_task = + send_snap(env, mgr, security_mgr, &self.cfg.clone(), &addr, msg, limit); + let task = async move { + let res = match send_task { + Err(e) => Err(e), + Ok(f) => f.await, + }; + match res { + Ok(stat) => { + info!( + "sent snapshot"; + "region_id" => region_id, + 
"snap_key" => %stat.key, + "size" => stat.total_size, + "duration" => ?stat.elapsed + ); + cb(Ok(())); + } + Err(e) => { + error!("failed to send snap"; "to_addr" => addr, "region_id" => region_id, "err" => ?e); + cb(Err(e)); + } + }; + sending_count.fetch_sub(1, Ordering::SeqCst); + }; + + self.pool.spawn(task); + } + Task::RefreshConfigEvent => { + self.refresh_cfg(); + } + Task::Validate(f) => { + f(&self.cfg); + } + } + } +} + +#[cfg(test)] +mod tests { + use std::{ + fs::{create_dir_all, File}, + io::Write, + }; + + use futures::{ + channel::mpsc::{self}, + executor::block_on, + sink::SinkExt, + }; + use futures_util::StreamExt; + use grpcio::WriteFlags; + use kvproto::raft_serverpb::{RaftMessage, SnapshotChunk}; + use raftstore::store::snap::{TabletSnapKey, TabletSnapManager}; + use tempfile::TempDir; + use tikv_util::{store::new_peer, time::Limiter}; + + use super::{super::Error, recv_snap_files, send_snap_files, SNAP_CHUNK_LEN}; + + #[test] + fn test_send_tablet() { + let limiter = Limiter::new(f64::INFINITY); + let snap_key = TabletSnapKey::new(1, 1, 1, 1); + let mut msg = RaftMessage::default(); + msg.set_region_id(1); + msg.set_to_peer(new_peer(1, 1)); + msg.mut_message().mut_snapshot().mut_metadata().set_index(1); + msg.mut_message().mut_snapshot().mut_metadata().set_term(1); + let send_path = TempDir::new().unwrap(); + let send_snap_mgr = + TabletSnapManager::new(send_path.path().join("snap_dir").to_str().unwrap()); + let snap_path = send_snap_mgr.tablet_gen_path(&snap_key); + create_dir_all(snap_path.as_path()).unwrap(); + // send file should skip directory + create_dir_all(snap_path.join("dir")).unwrap(); + for i in 0..2 { + let mut f = File::create(snap_path.join(i.to_string())).unwrap(); + let count = SNAP_CHUNK_LEN - 2; + let mut data = std::iter::repeat("a".as_bytes()) + .take(count) + .collect::>(); + for buffer in data.iter_mut() { + f.write_all(buffer).unwrap(); + } + f.sync_data().unwrap(); + } + + let recv_path = TempDir::new().unwrap(); + 
let recv_snap_manager = + TabletSnapManager::new(recv_path.path().join("snap_dir").to_str().unwrap()); + let (tx, rx) = mpsc::unbounded(); + let sink = tx.sink_map_err(Error::from); + block_on(send_snap_files( + &send_snap_mgr, + sink, + msg, + snap_key.clone(), + limiter.clone(), + )) + .unwrap(); + + let stream = rx.map(|x: (SnapshotChunk, WriteFlags)| Ok(x.0)); + let final_path = recv_snap_manager.final_recv_path(&snap_key); + let r = block_on(recv_snap_files(recv_snap_manager, stream, limiter)).unwrap(); + assert_eq!(r.key, snap_key); + std::thread::sleep(std::time::Duration::from_secs(1)); + let dir = std::fs::read_dir(final_path).unwrap(); + assert_eq!(2, dir.count()); + send_snap_mgr.delete_snapshot(&snap_key); + assert!(!snap_path.exists()); + } +} diff --git a/src/server/transport.rs b/src/server/transport.rs index e52bead3934..1303eff81f5 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -1,56 +1,45 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::marker::PhantomData; - -use engine_traits::KvEngine; use kvproto::raft_serverpb::RaftMessage; -use raftstore::{router::RaftStoreRouter, store::Transport, Result as RaftStoreResult}; +use raftstore::{store::Transport, Result as RaftStoreResult}; +use tikv_kv::RaftExtension; use crate::server::{raft_client::RaftClient, resolve::StoreAddrResolver}; -pub struct ServerTransport +pub struct ServerTransport where - T: RaftStoreRouter + 'static, + T: RaftExtension + 'static, S: StoreAddrResolver + 'static, - E: KvEngine, { - raft_client: RaftClient, - engine: PhantomData, + raft_client: RaftClient, } -impl Clone for ServerTransport +impl Clone for ServerTransport where - T: RaftStoreRouter + 'static, + T: RaftExtension + 'static, S: StoreAddrResolver + 'static, - E: KvEngine, { fn clone(&self) -> Self { ServerTransport { raft_client: self.raft_client.clone(), - engine: PhantomData, } } } -impl ServerTransport +impl ServerTransport where - E: KvEngine, - T: RaftStoreRouter + 'static, + T: RaftExtension + 'static, S: StoreAddrResolver + 'static, { - pub fn new(raft_client: RaftClient) -> ServerTransport { - ServerTransport { - raft_client, - engine: PhantomData, - } + pub fn new(raft_client: RaftClient) -> Self { + ServerTransport { raft_client } } } -impl Transport for ServerTransport +impl Transport for ServerTransport where - T: RaftStoreRouter + Unpin + 'static, + T: RaftExtension + Unpin + 'static, S: StoreAddrResolver + Unpin + 'static, - E: KvEngine, { fn send(&mut self, msg: RaftMessage) -> RaftStoreResult<()> { match self.raft_client.send(msg) { diff --git a/src/storage/config.rs b/src/storage/config.rs index 7f2e6820201..313f86ba048 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -29,6 +29,7 @@ const MAX_SCHED_CONCURRENCY: usize = 2 * 1024 * 1024; const DEFAULT_SCHED_PENDING_WRITE_MB: u64 = 100; const DEFAULT_RESERVED_SPACE_GB: u64 = 5; +const DEFAULT_RESERVED_RAFT_SPACE_GB: u64 = 1; #[derive(Clone, Debug, Serialize, Deserialize, 
PartialEq, OnlineConfig)] #[serde(default)] @@ -50,6 +51,8 @@ pub struct Config { // Reserve disk space to make tikv would have enough space to compact when disk is full. pub reserve_space: ReadableSize, #[online_config(skip)] + pub reserve_raft_space: ReadableSize, + #[online_config(skip)] pub enable_async_apply_prewrite: bool, #[online_config(skip)] pub api_version: u8, @@ -78,10 +81,11 @@ impl Default for Config { scheduler_worker_pool_size: if cpu_num >= 16.0 { 8 } else { - std::cmp::max(1, std::cmp::min(4, cpu_num as usize)) + cpu_num.clamp(1., 4.) as usize }, scheduler_pending_write_threshold: ReadableSize::mb(DEFAULT_SCHED_PENDING_WRITE_MB), reserve_space: ReadableSize::gb(DEFAULT_RESERVED_SPACE_GB), + reserve_raft_space: ReadableSize::gb(DEFAULT_RESERVED_RAFT_SPACE_GB), enable_async_apply_prewrite: false, api_version: 1, enable_ttl: false, diff --git a/src/storage/errors.rs b/src/storage/errors.rs index 7ce5d925dfa..2b41cf23ea2 100644 --- a/src/storage/errors.rs +++ b/src/storage/errors.rs @@ -238,45 +238,45 @@ pub fn get_tag_from_header(header: &errorpb::Error) -> &'static str { get_error_kind_from_header(header).get_str() } -pub fn extract_region_error(res: &Result) -> Option { - match *res { +pub fn extract_region_error_from_error(e: &Error) -> Option { + match e { // TODO: use `Error::cause` instead. 
- Err(Error(box ErrorInner::Kv(KvError(box KvErrorInner::Request(ref e))))) - | Err(Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Engine(KvError( + Error(box ErrorInner::Kv(KvError(box KvErrorInner::Request(ref e)))) + | Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Engine(KvError( box KvErrorInner::Request(ref e), - )))))) - | Err(Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError( + ))))) + | Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError( box MvccErrorInner::Kv(KvError(box KvErrorInner::Request(ref e))), - )))))) => Some(e.to_owned()), - Err(Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::MaxTimestampNotSynced { + ))))) => Some(e.to_owned()), + Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::MaxTimestampNotSynced { .. - })))) => { + }))) => { let mut err = errorpb::Error::default(); err.set_max_timestamp_not_synced(Default::default()); Some(err) } - Err(Error(box ErrorInner::SchedTooBusy)) => { + Error(box ErrorInner::SchedTooBusy) => { let mut err = errorpb::Error::default(); let mut server_is_busy_err = errorpb::ServerIsBusy::default(); server_is_busy_err.set_reason(SCHEDULER_IS_BUSY.to_owned()); err.set_server_is_busy(server_is_busy_err); Some(err) } - Err(Error(box ErrorInner::GcWorkerTooBusy)) => { + Error(box ErrorInner::GcWorkerTooBusy) => { let mut err = errorpb::Error::default(); let mut server_is_busy_err = errorpb::ServerIsBusy::default(); server_is_busy_err.set_reason(GC_WORKER_IS_BUSY.to_owned()); err.set_server_is_busy(server_is_busy_err); Some(err) } - Err(Error(box ErrorInner::Closed)) => { + Error(box ErrorInner::Closed) => { // TiKV is closing, return an RegionError to tell the client that this region is // unavailable temporarily, the client should retry the request in other TiKVs. 
let mut err = errorpb::Error::default(); err.set_message("TiKV is Closing".to_string()); Some(err) } - Err(Error(box ErrorInner::DeadlineExceeded)) => { + Error(box ErrorInner::DeadlineExceeded) => { let mut err = errorpb::Error::default(); let mut server_is_busy_err = errorpb::ServerIsBusy::default(); server_is_busy_err.set_reason(DEADLINE_EXCEEDED.to_owned()); @@ -287,6 +287,13 @@ pub fn extract_region_error(res: &Result) -> Option { } } +pub fn extract_region_error(res: &Result) -> Option { + match res { + Ok(_) => None, + Err(e) => extract_region_error_from_error(e), + } +} + pub fn extract_committed(err: &Error) -> Option { match *err { Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError( @@ -461,19 +468,25 @@ pub fn extract_key_errors(res: Result>>) -> Vec); +pub struct SharedError(pub Arc); + +impl SharedError { + pub fn inner(&self) -> &ErrorInner { + &self.0.0 + } +} impl From for SharedError { fn from(e: ErrorInner) -> Self { - Self(Arc::new(e)) + Self(Arc::new(Error::from(e))) } } impl From for SharedError { fn from(e: Error) -> Self { - Self(Arc::from(e.0)) + Self(Arc::new(e)) } } @@ -483,7 +496,7 @@ impl TryFrom for Error { type Error = (); fn try_from(e: SharedError) -> std::result::Result { - Arc::try_unwrap(e.0).map(Into::into).map_err(|_| ()) + Arc::try_unwrap(e.0).map_err(|_| ()) } } diff --git a/src/storage/lock_manager/lock_wait_context.rs b/src/storage/lock_manager/lock_wait_context.rs index 24a61876f44..32c99867a3f 100644 --- a/src/storage/lock_manager/lock_wait_context.rs +++ b/src/storage/lock_manager/lock_wait_context.rs @@ -11,20 +11,30 @@ //! of a single `AcquirePessimisticLock` request, and ensuring the internal //! callback for returning response through RPC is called at most only once. 
-use std::{convert::TryInto, result::Result, sync::Arc}; +use std::{ + convert::TryInto, + result::Result, + sync::{ + atomic::{AtomicBool, Ordering}, + mpsc, Arc, + }, +}; use parking_lot::Mutex; use txn_types::Key; use crate::storage::{ errors::SharedError, - lock_manager::{ - lock_waiting_queue::{LockWaitQueues, PessimisticLockKeyCallback}, - LockManager, LockWaitToken, - }, - Error as StorageError, PessimisticLockRes, ProcessResult, StorageCallback, + lock_manager::{lock_waiting_queue::LockWaitQueues, LockManager, LockWaitToken}, + types::PessimisticLockKeyResult, + Error as StorageError, PessimisticLockResults, ProcessResult, StorageCallback, }; +// The arguments are: (result, is_canceled_before_enqueueing). +pub type PessimisticLockKeyCallback = + Box, bool) + Send + 'static>; +pub type CancellationCallback = Box; + pub struct LockWaitContextInner { /// The callback for finishing the current AcquirePessimisticLock request. /// Usually, requests are accepted from RPC, and in this case calling @@ -52,6 +62,123 @@ pub struct LockWaitContextSharedState { /// The key on which lock waiting occurs. key: Key, + + /// When a lock-waiting request (allow_lock_with_conflict == true) is + /// resumed, it's theoretically possible that the request meets lock + /// again, therefore it may need to be pushed to the lock waiting queue + /// again. Since the request is popped out from the queue when resuming + /// (which means the lock wait entry doesn't exist in the lock waiting + /// queue during the resumed execution), it's possible that timeout or + /// deadlock happens from `WaiterManager` during that time, which will + /// try to cancel the request. Therefore it leads to such a corner case: + /// + /// 1. (scheduler) A request enters lock waiting state, so an entry is + /// pushed to the `LockWaitQueues`, and a message is sent to + /// `LockManager`. + /// 2. (scheduler) After a while the entry is popped out and resumed + /// from the `LockWaitQueues`. + /// 3. 
(scheduler) The request resumes execution but still finds lock + /// on the key. + /// * This is possible to be caused by delayed-waking up or encountering + /// error when writing a lock-releasing command to the engine. + /// 4. (lock_manager) At the same time, `LockManager` tries to cancel + /// the request due to timeout. But when calling `finish_request`, + /// the entry cannot be found from the `LockWaitQueues`. So it + /// believes that the entry is already popped out and resumed and does + /// nothing. + /// 5. (scheduler) An entry is pushed to the `LockWaitQueues` due to + /// encountering lock at step 3. 6. Then the request becomes unable to + /// be canceled by timeout or other possible errors. In worst cases, + /// the request may stuck in TiKV forever. + /// + /// To solve this problem, a `is_canceled` flag should be set when + /// `LockManager` tries to cancel it, before accessing the + /// `LockWaitQueues`; when an entry is pushed to the `LockWaitQueues`, + /// check if `is_canceled` is set after locking its inner map (ensures + /// exclusive access with `LockManager`), and if it's set, cancel the + /// request like how `LockManager` should have done. + /// + /// The request should be canceled with the error that occurs in + /// `LockManager`. `external_error_tx` and `external_error_rx` are used + /// to pass this error in this case. + /// + /// `is_canceled` marks if the request is canceled from outside. Usually + /// this is caused by timeout or deadlock detected. When this flag is + /// marked true, the request must not be put into the lock waiting queue + /// since nobody will wake it up for timeout and it may stuck forever. + is_canceled: AtomicBool, + + /// The sender for passing errors in some cancellation cases. See comments + /// in [`is_canceled`](LockWaitContextSharedState::is_canceled) for details. + /// It's only possible to be used in `LockManager`, so there's no contention + /// on the mutex. 
+ external_error_tx: Mutex>>, + + /// The sender for passing errors in some cancellation cases. See comments + /// in [`is_canceled`](LockWaitContextSharedState::is_canceled) for details. + /// It's only possible to be used when scheduler tries to push to + /// `LockWaitQueues`, so there's no contention on the mutex. + external_error_rx: Mutex>>, +} + +impl LockWaitContextSharedState { + fn new(lock_wait_token: LockWaitToken, key: Key, cb: StorageCallback) -> Self { + let inner = LockWaitContextInner { cb }; + let (tx, rx) = mpsc::channel(); + Self { + ctx_inner: Mutex::new(Some(inner)), + key, + lock_wait_token, + is_canceled: AtomicBool::new(false), + external_error_tx: Mutex::new(Some(tx)), + external_error_rx: Mutex::new(Some(rx)), + } + } + + #[cfg(test)] + pub fn new_dummy(lock_wait_token: LockWaitToken, key: Key) -> Self { + let (tx, rx) = mpsc::channel(); + Self { + ctx_inner: Mutex::new(None), + key, + lock_wait_token, + is_canceled: AtomicBool::new(false), + external_error_tx: Mutex::new(Some(tx)), + external_error_rx: Mutex::new(Some(rx)), + } + } + + pub fn is_canceled(&self) -> bool { + self.is_canceled.load(Ordering::Acquire) + } + + /// Gets the external error. It's assumed that the external error must have + /// been set and consumes it. This function is expected to be called at + /// most only once. Only used to handle the case that cancelling and + /// resuming happens concurrently. + pub(in crate::storage) fn get_external_error(&self) -> StorageError { + self.external_error_rx + .lock() + .take() + .unwrap() + .recv() + .unwrap() + } + + /// Stores the external error. This function is expected to be called at + /// most only once. Only used to handle the case that cancelling and + /// resuming happens concurrently. 
+ fn put_external_error(&self, error: StorageError) { + if let Err(e) = self.external_error_tx.lock().take().unwrap().send(error) { + debug!("failed to set external error"; "err" => ?e); + } + } +} + +enum FinishRequestKind { + Executed, + Canceled, + CanceledBeforeEnqueueing, } #[derive(Clone)] @@ -69,13 +196,8 @@ impl LockWaitContext { cb: StorageCallback, allow_lock_with_conflict: bool, ) -> Self { - let inner = LockWaitContextInner { cb }; Self { - shared_states: Arc::new(LockWaitContextSharedState { - ctx_inner: Mutex::new(Some(inner)), - key, - lock_wait_token, - }), + shared_states: Arc::new(LockWaitContextSharedState::new(lock_wait_token, key, cb)), lock_wait_queues, allow_lock_with_conflict, } @@ -104,8 +226,13 @@ impl LockWaitContext { /// key. pub fn get_callback_for_blocked_key(&self) -> PessimisticLockKeyCallback { let ctx = self.clone(); - Box::new(move |res| { - ctx.finish_request(res, false); + Box::new(move |res, is_canceled_before_enqueueing| { + let kind = if is_canceled_before_enqueueing { + FinishRequestKind::CanceledBeforeEnqueueing + } else { + FinishRequestKind::Executed + }; + ctx.finish_request(res, kind); }) } @@ -117,27 +244,45 @@ impl LockWaitContext { /// This function is assumed to be called when the lock-waiting request is /// queueing but canceled outside, so it includes an operation to actively /// remove the entry from the lock waiting queue. 
- pub fn get_callback_for_cancellation(&self) -> impl FnOnce(StorageError) { + pub fn get_callback_for_cancellation(&self) -> CancellationCallback { let ctx = self.clone(); - move |e| { - ctx.finish_request(Err(e.into()), true); - } + Box::new(move |e| { + ctx.finish_request(Err(e.into()), FinishRequestKind::Canceled); + }) } - fn finish_request(&self, result: Result, is_canceling: bool) { - if is_canceling { - let entry = self - .lock_wait_queues - .remove_by_token(&self.shared_states.key, self.shared_states.lock_wait_token); - if entry.is_none() { - // Already popped out from the queue so that it will be woken up normally. Do - // nothing. - return; + fn finish_request( + &self, + result: Result, + finish_kind: FinishRequestKind, + ) { + match finish_kind { + FinishRequestKind::Executed => { + self.lock_wait_queues + .get_lock_mgr() + .remove_lock_wait(self.shared_states.lock_wait_token); + } + FinishRequestKind::Canceled => { + self.shared_states + .is_canceled + .store(true, Ordering::Release); + + let entry = self + .lock_wait_queues + .remove_by_token(&self.shared_states.key, self.shared_states.lock_wait_token); + if entry.is_none() { + // It's absent in the queue infers that it's already popped out from the queue + // so that it will be woken up normally. However + // it may still meet lock and tries to enter waiting state again. In such case, + // the request should be canceled. Store the error here so + // that it can be used for cancellation in that case, where + // there will be a `finish_request(None, false)` invocation). 
+ self.shared_states + .put_external_error(result.unwrap_err().try_into().unwrap()); + return; + } } - } else { - self.lock_wait_queues - .get_lock_mgr() - .remove_lock_wait(self.shared_states.lock_wait_token); + FinishRequestKind::CanceledBeforeEnqueueing => {} } // When this is executed, the waiter is either woken up from the queue or @@ -152,9 +297,19 @@ impl LockWaitContext { return; } - // The following code is only valid after implementing the new lock-waiting - // model. - unreachable!(); + let key_res = match result { + Ok(key_res) => { + assert!(!matches!(key_res, PessimisticLockKeyResult::Waiting)); + key_res + } + Err(e) => PessimisticLockKeyResult::Failed(e), + }; + + let mut res = PessimisticLockResults::with_capacity(1); + res.push(key_res); + let pr = ProcessResult::PessimisticLockRes { res: Ok(res) }; + + ctx_inner.cb.execute(pr); } } @@ -177,7 +332,7 @@ mod tests { fn create_storage_cb() -> ( StorageCallback, - Receiver>>, + Receiver>>, ) { let (tx, rx) = channel(); let cb = StorageCallback::PessimisticLock(Box::new(move |r| tx.send(r).unwrap())); @@ -190,7 +345,7 @@ mod tests { ) -> ( LockWaitToken, LockWaitContext, - Receiver>>, + Receiver>>, ) { let (cb, rx) = create_storage_cb(); let token = lock_wait_queues.get_lock_mgr().allocate_token(); @@ -228,7 +383,7 @@ mod tests { // Nothing happens currently. 
(ctx.get_callback_for_first_write_batch()).execute(ProcessResult::Res); rx.recv_timeout(Duration::from_millis(20)).unwrap_err(); - (ctx.get_callback_for_blocked_key())(Err(SharedError::from(write_conflict()))); + (ctx.get_callback_for_blocked_key())(Err(SharedError::from(write_conflict())), false); let res = rx.recv().unwrap().unwrap_err(); assert!(matches!( &res, @@ -253,7 +408,9 @@ mod tests { for_update_ts: 1.into(), ..Default::default() }, + should_not_exist: false, lock_wait_token: token, + req_states: ctx.get_shared_states().clone(), legacy_wake_up_index: None, key_cb: None, }), diff --git a/src/storage/lock_manager/lock_waiting_queue.rs b/src/storage/lock_manager/lock_waiting_queue.rs index 16b3787bd7e..663c6729962 100644 --- a/src/storage/lock_manager/lock_waiting_queue.rs +++ b/src/storage/lock_manager/lock_waiting_queue.rs @@ -57,15 +57,14 @@ use std::{ future::Future, pin::Pin, - result::Result, sync::{ - atomic::{AtomicU64, Ordering}, + atomic::{AtomicU64, AtomicUsize, Ordering}, Arc, }, time::{Duration, Instant}, }; -use dashmap; +use dashmap::{self, mapref::entry::Entry as DashMapEntry}; use futures_util::compat::Future01CompatExt; use keyed_priority_queue::KeyedPriorityQueue; use kvproto::kvrpcpb; @@ -75,25 +74,28 @@ use tikv_util::{time::InstantExt, timer::GLOBAL_TIMER_HANDLE}; use txn_types::{Key, TimeStamp}; use crate::storage::{ - errors::SharedError, - lock_manager::{LockManager, LockWaitToken}, + lock_manager::{ + lock_wait_context::{LockWaitContextSharedState, PessimisticLockKeyCallback}, + LockManager, LockWaitToken, + }, metrics::*, mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, - txn::Error as TxnError, - types::{PessimisticLockParameters, PessimisticLockRes}, - Error as StorageError, + txn::{Error as TxnError, ErrorInner as TxnErrorInner}, + types::PessimisticLockParameters, + Error as StorageError, ErrorInner as StorageErrorInner, }; -pub type CallbackWithSharedError = Box) + Send + 'static>; -pub type 
PessimisticLockKeyCallback = CallbackWithSharedError; - /// Represents an `AcquirePessimisticLock` request that's waiting for a lock, /// and contains the request's parameters. pub struct LockWaitEntry { pub key: Key, pub lock_hash: u64, pub parameters: PessimisticLockParameters, + // `parameters` provides parameter for a request, but `should_not_exist` is specified key-wise. + // Put it in a separated field. + pub should_not_exist: bool, pub lock_wait_token: LockWaitToken, + pub req_states: Arc, pub legacy_wake_up_index: Option, pub key_cb: Option>, } @@ -215,6 +217,7 @@ pub type DelayedNotifyAllFuture = Pin { queue_map: dashmap::DashMap, id_allocated: AtomicU64, + entries_count: AtomicUsize, lock_mgr: L, } @@ -229,6 +232,7 @@ impl LockWaitQueues { inner: Arc::new(LockWaitQueueInner { queue_map: dashmap::DashMap::new(), id_allocated: AtomicU64::new(1), + entries_count: AtomicUsize::new(0), lock_mgr, }), } @@ -243,23 +247,36 @@ impl LockWaitQueues { current_lock: kvrpcpb::LockInfo, ) { let mut new_key = false; - let mut key_state = self - .inner - .queue_map - .entry(lock_wait_entry.key.clone()) - .or_insert_with(|| { - new_key = true; - KeyLockWaitState::new() - }); - key_state.current_lock = current_lock; + + let map_entry = self.inner.queue_map.entry(lock_wait_entry.key.clone()); + + // If it's not the first time the request is put into the queue, the request + // might be canceled from outside when the entry is temporarily absent + // in the queue. In this case, the cancellation operation is not done. + // Do it here. For details about this corner case, see document of + // `LockWaitContext::is_canceled` field. 
+ if lock_wait_entry.req_states.is_canceled() { + self.on_push_canceled_entry(lock_wait_entry, map_entry); + return; + } + + let mut key_state = map_entry.or_insert_with(|| { + new_key = true; + KeyLockWaitState::new() + }); + if !current_lock.key.is_empty() { + key_state.current_lock = current_lock; + } if lock_wait_entry.legacy_wake_up_index.is_none() { lock_wait_entry.legacy_wake_up_index = Some(key_state.value().legacy_wake_up_index); } + key_state .value_mut() .queue .push(lock_wait_entry.lock_wait_token, lock_wait_entry); + self.inner.entries_count.fetch_add(1, Ordering::SeqCst); let len = key_state.value_mut().queue.len(); drop(key_state); @@ -270,6 +287,32 @@ impl LockWaitQueues { } } + fn on_push_canceled_entry( + &self, + lock_wait_entry: Box, + key_state: DashMapEntry<'_, Key, KeyLockWaitState, impl std::hash::BuildHasher>, + ) { + let mut err = lock_wait_entry.req_states.get_external_error(); + + if let DashMapEntry::Occupied(key_state_entry) = key_state { + if let StorageError(box StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc( + MvccError(box MvccErrorInner::KeyIsLocked(lock_info)), + )))) = &mut err + { + // Update the lock info in the error to the latest if possible. + let latest_lock_info = &key_state_entry.get().current_lock; + if !latest_lock_info.key.is_empty() { + *lock_info = latest_lock_info.clone(); + } + } + } + + // `key_state` is dropped here, so the mutex in the queue map is released. + + let cb = lock_wait_entry.key_cb.unwrap().into_inner(); + cb(Err(err.into()), true); + } + /// Dequeues the head of the lock waiting queue of the specified key, /// assuming the popped entry will be woken up. /// @@ -305,7 +348,7 @@ impl LockWaitQueues { ) -> Option<(Box, Option)> { let mut result = None; // For statistics. - let mut removed_waiters = 0; + let mut removed_waiters = 0usize; // We don't want other threads insert any more entries between finding the // queue is empty and removing the queue from the map. 
Wrap the logic @@ -334,6 +377,10 @@ impl LockWaitQueues { } } + self.inner + .entries_count + .fetch_sub(removed_waiters, Ordering::SeqCst); + // Remove the queue if it's emptied. v.queue.is_empty() }); @@ -341,7 +388,7 @@ impl LockWaitQueues { if removed_waiters != 0 { LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC .waiters - .sub(removed_waiters); + .sub(removed_waiters as i64); } if removed_key.is_some() { LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC.keys.dec(); @@ -426,6 +473,8 @@ impl LockWaitQueues { prev_delay_ms = current_delay_ms; } + fail_point!("lock_waiting_queue_before_delayed_notify_all"); + self.delayed_notify_all(&key, notify_id) } @@ -436,7 +485,7 @@ impl LockWaitQueues { let mut conflicting_start_ts = TimeStamp::zero(); let mut conflicting_commit_ts = TimeStamp::zero(); - let mut removed_waiters = 0; + let mut removed_waiters = 0usize; // We don't want other threads insert any more entries between finding the // queue is empty and removing the queue from the map. Wrap the logic @@ -479,6 +528,10 @@ impl LockWaitQueues { popped_lock_wait_entries.push(lock_wait_entry); } + self.inner + .entries_count + .fetch_sub(removed_waiters, Ordering::SeqCst); + // If the queue is empty, remove it from the map. v.queue.is_empty() }); @@ -486,7 +539,7 @@ impl LockWaitQueues { if removed_waiters != 0 { LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC .waiters - .sub(removed_waiters); + .sub(removed_waiters as i64); } if removed_key.is_some() { LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC.keys.dec(); @@ -508,7 +561,7 @@ impl LockWaitQueues { reason: kvrpcpb::WriteConflictReason::PessimisticRetry, }, ))); - cb(Err(e.into())); + cb(Err(e.into()), false); } // Return the item to be woken up in resumable way. @@ -532,6 +585,7 @@ impl LockWaitQueues { // procedure. 
let removed_key = self.inner.queue_map.remove_if_mut(key, |_, v| { if let Some(res) = v.queue.remove(&lock_wait_token) { + self.inner.entries_count.fetch_sub(1, Ordering::SeqCst); LOCK_WAIT_QUEUE_ENTRIES_GAUGE_VEC.waiters.dec(); result = Some(res); } @@ -545,6 +599,20 @@ impl LockWaitQueues { result } + /// Gets the count of entries currently waiting in queues. + /// + /// Mind that the contents of the queues may be changed concurrently. + pub fn entry_count(&self) -> usize { + self.inner.entries_count.load(Ordering::SeqCst) + } + + /// Checks whether there's nothing at all waiting in queue. + /// + /// Mind that the contents of the queues may be changed concurrently. + pub fn is_empty(&self) -> bool { + self.entry_count() == 0 + } + #[allow(dead_code)] pub(super) fn get_lock_mgr(&self) -> &L { &self.inner.lock_mgr @@ -582,14 +650,15 @@ mod tests { use super::*; use crate::storage::{ + errors::SharedError, lock_manager::{lock_wait_context::LockWaitContext, MockLockManager, WaitTimeout}, txn::ErrorInner as TxnErrorInner, - ErrorInner as StorageErrorInner, StorageCallback, + ErrorInner as StorageErrorInner, PessimisticLockKeyResult, StorageCallback, }; struct TestLockWaitEntryHandle { token: LockWaitToken, - wake_up_rx: Receiver>, + wake_up_rx: Receiver>, cancel_cb: Box, } @@ -597,7 +666,7 @@ mod tests { fn wait_for_result_timeout( &self, timeout: Duration, - ) -> Option> { + ) -> Option> { match self.wake_up_rx.recv_timeout(timeout) { Ok(res) => Some(res), Err(RecvTimeoutError::Timeout) => None, @@ -608,7 +677,7 @@ mod tests { } } - fn wait_for_result(self) -> Result { + fn wait_for_result(self) -> Result { self.wake_up_rx .recv_timeout(Duration::from_secs(10)) .unwrap() @@ -660,6 +729,7 @@ mod tests { min_commit_ts: 0.into(), check_existence: false, is_first_lock: false, + lock_only_if_exists: false, allow_lock_with_conflict: false, }; @@ -670,9 +740,13 @@ mod tests { key, lock_hash, parameters, + should_not_exist: false, lock_wait_token: token, + req_states: 
dummy_ctx.get_shared_states().clone(), legacy_wake_up_index: None, - key_cb: Some(SyncWrapper::new(Box::new(move |res| tx.send(res).unwrap()))), + key_cb: Some(SyncWrapper::new(Box::new(move |res, _| { + tx.send(res).unwrap() + }))), }); let cancel_callback = dummy_ctx.get_callback_for_cancellation(); @@ -809,11 +883,11 @@ mod tests { } fn expect_write_conflict( - err: &StorageErrorInner, + err: &StorageError, expect_conflict_start_ts: impl Into, expect_conflict_commit_ts: impl Into, ) { - match err { + match &*err.0 { StorageErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError( box MvccErrorInner::WriteConflict { conflict_start_ts, @@ -831,9 +905,13 @@ mod tests { #[test] fn test_simple_push_pop() { let queues = LockWaitQueues::new(MockLockManager::new()); + assert_eq!(queues.entry_count(), 0); + assert_eq!(queues.is_empty(), true); queues.mock_lock_wait(b"k1", 10, 5, false); queues.mock_lock_wait(b"k2", 11, 5, false); + assert_eq!(queues.entry_count(), 2); + assert_eq!(queues.is_empty(), false); queues .must_pop(b"k1", 5, 6) @@ -841,6 +919,8 @@ mod tests { .check_start_ts(10); queues.must_pop_none(b"k1", 5, 6); queues.must_not_contain_key(b"k1"); + assert_eq!(queues.entry_count(), 1); + assert_eq!(queues.is_empty(), false); queues .must_pop(b"k2", 5, 6) @@ -848,11 +928,14 @@ mod tests { .check_start_ts(11); queues.must_pop_none(b"k2", 5, 6); queues.must_not_contain_key(b"k2"); + assert_eq!(queues.entry_count(), 0); + assert_eq!(queues.is_empty(), true); } #[test] fn test_popping_priority() { let queues = LockWaitQueues::new(MockLockManager::new()); + assert_eq!(queues.entry_count(), 0); queues.mock_lock_wait(b"k1", 10, 5, false); queues.mock_lock_wait(b"k1", 20, 5, false); @@ -860,6 +943,7 @@ mod tests { queues.mock_lock_wait(b"k1", 13, 5, false); // Duplication is possible considering network issues and RPC retrying. 
queues.mock_lock_wait(b"k1", 12, 5, false); + assert_eq!(queues.entry_count(), 5); // Ordered by start_ts for &expected_start_ts in &[10u64, 12, 12, 13, 20] { @@ -870,11 +954,13 @@ mod tests { } queues.must_not_contain_key(b"k1"); + assert_eq!(queues.entry_count(), 0); } #[test] fn test_removing_by_token() { let queues = LockWaitQueues::new(MockLockManager::new()); + assert_eq!(queues.entry_count(), 0); queues.mock_lock_wait(b"k1", 10, 5, false); let token11 = queues.mock_lock_wait(b"k1", 11, 5, false).token; @@ -882,6 +968,7 @@ mod tests { let token13 = queues.mock_lock_wait(b"k1", 13, 5, false).token; queues.mock_lock_wait(b"k1", 14, 5, false); assert_eq!(queues.get_queue_length_of_key(b"k1"), 5); + assert_eq!(queues.entry_count(), 5); queues .remove_by_token(&Key::from_raw(b"k1"), token11) @@ -894,6 +981,7 @@ mod tests { .check_key(b"k1") .check_start_ts(13); assert_eq!(queues.get_queue_length_of_key(b"k1"), 3); + assert_eq!(queues.entry_count(), 3); // Removing not-existing entry takes no effect. 
assert!( @@ -907,15 +995,19 @@ mod tests { .is_none() ); assert_eq!(queues.get_queue_length_of_key(b"k1"), 3); + assert_eq!(queues.entry_count(), 3); queues.must_pop(b"k1", 5, 6).check_start_ts(10); queues.must_pop(b"k1", 5, 6).check_start_ts(12); queues.must_pop(b"k1", 5, 6).check_start_ts(14); + queues.must_not_contain_key(b"k1"); + assert_eq!(queues.entry_count(), 0); } #[test] fn test_dropping_cancelled_entries() { let queues = LockWaitQueues::new(MockLockManager::new()); + assert_eq!(queues.entry_count(), 0); let h10 = queues.mock_lock_wait(b"k1", 10, 5, false); let h11 = queues.mock_lock_wait(b"k1", 11, 5, false); @@ -924,12 +1016,14 @@ mod tests { queues.mock_lock_wait(b"k1", 14, 5, false); assert_eq!(queues.get_queue_length_of_key(b"k1"), 5); + assert_eq!(queues.entry_count(), 5); h10.cancel(); h11.cancel(); h13.cancel(); assert_eq!(queues.get_queue_length_of_key(b"k1"), 2); + assert_eq!(queues.entry_count(), 2); for &expected_start_ts in &[12u64, 14] { queues @@ -937,11 +1031,13 @@ mod tests { .check_start_ts(expected_start_ts); } queues.must_not_contain_key(b"k1"); + assert_eq!(queues.entry_count(), 0); } #[tokio::test] async fn test_delayed_notify_all() { let queues = LockWaitQueues::new(MockLockManager::new()); + assert_eq!(queues.entry_count(), 0); queues.mock_lock_wait(b"k1", 8, 5, false); @@ -952,6 +1048,7 @@ mod tests { ]; // Current queue: [8, 11, 12, 13] + assert_eq!(queues.entry_count(), 4); let (entry, delay_wake_up_future) = queues.must_pop_with_delayed_notify(b"k1", 5, 6); entry.check_key(b"k1").check_start_ts(8); @@ -959,6 +1056,7 @@ mod tests { // Current queue: [11*, 12*, 13*] (Items marked with * means it has // legacy_wake_up_index less than that in KeyLockWaitState, so it might // be woken up when calling delayed_notify_all). 
+ assert_eq!(queues.entry_count(), 3); let handles2 = vec![ queues.mock_lock_wait(b"k1", 14, 5, false), @@ -967,6 +1065,7 @@ mod tests { ]; // Current queue: [11*, 12*, 13*, 14, 15, 16] + assert_eq!(queues.entry_count(), 6); assert!( handles1[0] @@ -988,9 +1087,11 @@ mod tests { ); // Current queue: [14, 15, 16] + assert_eq!(queues.entry_count(), 3); queues.mock_lock_wait(b"k1", 9, 5, false); // Current queue: [9, 14, 15, 16] + assert_eq!(queues.entry_count(), 4); // 9 will be woken up and delayed wake up should be scheduled. After delaying, // 14 to 16 should be all woken up later if they are all not resumable. @@ -1000,11 +1101,13 @@ mod tests { entry.check_key(b"k1").check_start_ts(9); // Current queue: [14*, 15*, 16*] + assert_eq!(queues.entry_count(), 3); queues.mock_lock_wait(b"k1", 17, 5, false); let handle18 = queues.mock_lock_wait(b"k1", 18, 5, false); // Current queue: [14*, 15*, 16*, 17, 18] + assert_eq!(queues.entry_count(), 5); // Wakes up 14, and stops at 15 which is resumable. Then, 15 should be returned // and the caller should be responsible for waking it up. @@ -1012,6 +1115,7 @@ mod tests { entry15.check_key(b"k1").check_start_ts(15); // Current queue: [16*, 17, 18] + assert_eq!(queues.entry_count(), 3); let mut it = handles2.into_iter(); // Receive 14. @@ -1050,6 +1154,7 @@ mod tests { ); // Current queue: [16*, 17, 18] + assert_eq!(queues.entry_count(), 3); let (entry, delayed_wake_up_future) = queues.must_pop_with_delayed_notify(b"k1", 7, 8); entry.check_key(b"k1").check_start_ts(16); @@ -1064,6 +1169,7 @@ mod tests { queues.must_have_next_entry(b"k1", 17); // Current queue: [17*, 18*] + assert_eq!(queues.entry_count(), 2); // Don't need to create new future if there already exists one for the key. 
let entry = queues.must_pop_with_no_delayed_notify(b"k1", 9, 10); @@ -1071,18 +1177,22 @@ mod tests { queues.must_have_next_entry(b"k1", 18); // Current queue: [18*] + assert_eq!(queues.entry_count(), 1); queues.mock_lock_wait(b"k1", 19, 5, false); // Current queue: [18*, 19] + assert_eq!(queues.entry_count(), 2); assert!(delayed_wake_up_future.await.is_none()); // 18 will be cancelled with ts of the latest wake-up event. expect_write_conflict(&handle18.wait_for_result().unwrap_err().0, 9, 10); // Current queue: [19] + assert_eq!(queues.entry_count(), 1); // Don't need to create new future if the queue is cleared. let entry = queues.must_pop_with_no_delayed_notify(b"k1", 9, 10); entry.check_key(b"k1").check_start_ts(19); // Current queue: empty + assert_eq!(queues.entry_count(), 0); queues.must_not_contain_key(b"k1"); // Calls delayed_notify_all on keys that not exists (maybe deleted due to @@ -1093,5 +1203,6 @@ mod tests { .is_none() ); queues.must_not_contain_key(b"k1"); + assert_eq!(queues.entry_count(), 0); } } diff --git a/src/storage/lock_manager/mod.rs b/src/storage/lock_manager/mod.rs index 3ba9c7f7905..75b133a808f 100644 --- a/src/storage/lock_manager/mod.rs +++ b/src/storage/lock_manager/mod.rs @@ -9,12 +9,13 @@ use std::{ time::Duration, }; -use collections::HashMap; +use collections::{HashMap, HashSet}; use kvproto::{kvrpcpb::LockInfo, metapb::RegionEpoch}; use parking_lot::Mutex; use tracker::TrackerToken; use txn_types::{Key, TimeStamp}; +pub use crate::storage::lock_manager::lock_wait_context::CancellationCallback; use crate::{ server::lock_manager::{waiter_manager, waiter_manager::Callback}, storage::{ @@ -147,7 +148,7 @@ pub trait LockManager: Clone + Send + Sync + 'static { wait_info: KeyLockWaitInfo, is_first_lock: bool, timeout: Option, - cancel_callback: Box, + cancel_callback: CancellationCallback, diag_ctx: DiagnosticContext, ); @@ -170,8 +171,7 @@ pub trait LockManager: Clone + Send + Sync + 'static { #[derive(Clone)] pub struct 
MockLockManager { allocated_token: Arc, - waiters: - Arc)>>>, + waiters: Arc>>, } impl MockLockManager { @@ -205,7 +205,7 @@ impl LockManager for MockLockManager { wait_info: KeyLockWaitInfo, _is_first_lock: bool, _timeout: Option, - cancel_callback: Box, + cancel_callback: CancellationCallback, _diag_ctx: DiagnosticContext, ) { self.waiters @@ -230,4 +230,19 @@ impl MockLockManager { cancel_callback(StorageError::from(TxnError::from(error))); } } + + pub fn simulate_timeout(&self, token: LockWaitToken) { + if let Some((wait_info, cancel_callback)) = self.waiters.lock().remove(&token) { + let error = MvccError::from(MvccErrorInner::KeyIsLocked(wait_info.lock_info)); + cancel_callback(StorageError::from(TxnError::from(error))); + } + } + + pub fn get_all_tokens(&self) -> HashSet { + self.waiters + .lock() + .iter() + .map(|(&token, _)| token) + .collect() + } } diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index 2bbe4b7b762..e84a7dfb4e9 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -126,6 +126,7 @@ make_auto_flush_static_metric! 
{ batch_get_command, prewrite, acquire_pessimistic_lock, + acquire_pessimistic_lock_resumed, commit, cleanup, rollback, diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 33d1c4ddf97..caed0f57c91 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -64,8 +64,9 @@ use std::{ borrow::Cow, iter, marker::PhantomData, + mem, sync::{ - atomic::{self, AtomicBool, AtomicU64}, + atomic::{self, AtomicBool, AtomicU64, Ordering}, Arc, }, }; @@ -87,16 +88,17 @@ use pd_client::FeatureGate; use raftstore::store::{util::build_key_range, ReadStats, TxnExt, WriteStats}; use rand::prelude::*; use resource_metering::{FutureExt, ResourceTagFactory}; -use tikv_kv::SnapshotExt; +use tikv_kv::{OnAppliedCb, SnapshotExt}; use tikv_util::{ deadline::Deadline, + future::try_poll, quota_limiter::QuotaLimiter, time::{duration_to_ms, Instant, ThreadReadId}, }; use tracker::{ clear_tls_tracker_token, set_tls_tracker_token, with_tls_tracker, TrackedFuture, TrackerToken, }; -use txn_types::{Key, KvPair, Lock, LockType, OldValues, TimeStamp, TsSet, Value}; +use txn_types::{Key, KvPair, Lock, LockType, TimeStamp, TsSet, Value}; pub use self::{ errors::{get_error_kind_from_header, get_tag_from_header, Error, ErrorHeaderKind, ErrorInner}, @@ -107,7 +109,10 @@ pub use self::{ raw::RawStore, read_pool::{build_read_pool, build_read_pool_for_test}, txn::{Latches, Lock as LatchLock, ProcessResult, Scanner, SnapshotStore, Store}, - types::{PessimisticLockRes, PrewriteResult, SecondaryLocksStatus, StorageCallback, TxnStatus}, + types::{ + PessimisticLockKeyResult, PessimisticLockResults, PrewriteResult, SecondaryLocksStatus, + StorageCallback, TxnStatus, + }, }; use self::{kv::SnapContext, test_util::latest_feature_gate}; use crate::{ @@ -1413,7 +1418,7 @@ impl Storage { callback: Callback, ) -> Result<()> { use crate::storage::txn::commands::{ - AcquirePessimisticLock, Prewrite, PrewritePessimistic, + AcquirePessimisticLock, AcquirePessimisticLockResumed, Prewrite, PrewritePessimistic, }; let 
cmd: Command = cmd.into(); @@ -1449,6 +1454,18 @@ impl Storage { )?; check_key_size!(keys, self.max_key_size, callback); } + Command::AcquirePessimisticLockResumed(AcquirePessimisticLockResumed { + items, .. + }) => { + let keys = items.iter().map(|item| item.key.as_encoded()); + Self::check_api_version( + self.api_version, + cmd.ctx().api_version, + CommandKind::acquire_pessimistic_lock_resumed, + keys.clone(), + )?; + check_key_size!(keys, self.max_key_size, callback); + } _ => {} } with_tls_tracker(|tracker| { @@ -1533,11 +1550,18 @@ impl Storage { let mut batch = WriteData::from_modifies(modifies); batch.set_allowed_on_disk_almost_full(); - self.engine.async_write( + let res = kv::write( + &self.engine, &ctx, batch, - Box::new(|res| callback(res.map_err(Error::from))), - )?; + Some(Box::new(|res| { + callback(mem::replace(res, Ok(())).map_err(Error::from)) + })), + ); + // TODO: perhaps change delete_range API to return future. + if let Some(Some(Err(e))) = try_poll(res) { + return Err(Error::from(e)); + } KV_COMMAND_COUNTER_VEC_STATIC.delete_range.inc(); Ok(()) } @@ -1936,14 +1960,12 @@ impl Storage { let mut batch = WriteData::from_modifies(vec![m]); batch.set_allowed_on_disk_almost_full(); - let (cb, f) = tikv_util::future::paired_future_callback(); - let async_ret = - engine.async_write(&ctx, batch, Box::new(|res| cb(res.map_err(Error::from)))); - let v: Result<()> = match async_ret { - Err(e) => Err(Error::from(e)), - Ok(_) => f.await.unwrap(), - }; - callback(v); + let res = kv::write(&engine, &ctx, batch, None); + callback( + res.await + .unwrap_or_else(|| Err(box_err!("stale command"))) + .map_err(Error::from), + ); KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); SCHED_STAGE_COUNTER_VEC.get(CMD).write_finish.inc(); SCHED_HISTOGRAM_VEC_STATIC @@ -2039,14 +2061,12 @@ impl Storage { let modifies = Self::raw_batch_put_requests_to_modifies(cf, pairs, ttls, ts.unwrap()); let mut batch = WriteData::from_modifies(modifies); 
batch.set_allowed_on_disk_almost_full(); - let (cb, f) = tikv_util::future::paired_future_callback(); - let async_ret = - engine.async_write(&ctx, batch, Box::new(|res| cb(res.map_err(Error::from)))); - let v: Result<()> = match async_ret { - Err(e) => Err(Error::from(e)), - Ok(_) => f.await.unwrap(), - }; - callback(v); + let res = kv::write(&engine, &ctx, batch, None); + callback( + res.await + .unwrap_or_else(|| Err(box_err!("stale command"))) + .map_err(Error::from), + ); KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); SCHED_STAGE_COUNTER_VEC.get(CMD).write_finish.inc(); SCHED_HISTOGRAM_VEC_STATIC @@ -2103,14 +2123,12 @@ impl Storage { let m = Self::raw_delete_request_to_modify(cf, key, ts.unwrap()); let mut batch = WriteData::from_modifies(vec![m]); batch.set_allowed_on_disk_almost_full(); - let (cb, f) = tikv_util::future::paired_future_callback(); - let async_ret = - engine.async_write(&ctx, batch, Box::new(|res| cb(res.map_err(Error::from)))); - let v: Result<()> = match async_ret { - Err(e) => Err(Error::from(e)), - Ok(_) => f.await.unwrap(), - }; - callback(v); + let res = kv::write(&engine, &ctx, batch, None); + callback( + res.await + .unwrap_or_else(|| Err(box_err!("stale command"))) + .map_err(Error::from), + ); KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); SCHED_STAGE_COUNTER_VEC.get(CMD).write_finish.inc(); SCHED_HISTOGRAM_VEC_STATIC @@ -2156,14 +2174,12 @@ impl Storage { batch.set_allowed_on_disk_almost_full(); // TODO: special notification channel for API V2. 
- let (cb, f) = tikv_util::future::paired_future_callback(); - let async_ret = - engine.async_write(&ctx, batch, Box::new(|res| cb(res.map_err(Error::from)))); - let v: Result<()> = match async_ret { - Err(e) => Err(Error::from(e)), - Ok(_) => f.await.unwrap(), - }; - callback(v); + let res = kv::write(&engine, &ctx, batch, None); + callback( + res.await + .unwrap_or_else(|| Err(box_err!("stale command"))) + .map_err(Error::from), + ); KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); SCHED_STAGE_COUNTER_VEC.get(CMD).write_finish.inc(); SCHED_HISTOGRAM_VEC_STATIC @@ -2216,14 +2232,12 @@ impl Storage { .collect(); let mut batch = WriteData::from_modifies(modifies); batch.set_allowed_on_disk_almost_full(); - let (cb, f) = tikv_util::future::paired_future_callback(); - let async_ret = - engine.async_write(&ctx, batch, Box::new(|res| cb(res.map_err(Error::from)))); - let v: Result<()> = match async_ret { - Err(e) => Err(Error::from(e)), - Ok(_) => f.await.unwrap(), - }; - callback(v); + let res = kv::write(&engine, &ctx, batch, None); + callback( + res.await + .unwrap_or_else(|| Err(box_err!("stale command"))) + .map_err(Error::from), + ); KV_COMMAND_COUNTER_VEC_STATIC.get(CMD).inc(); SCHED_STAGE_COUNTER_VEC.get(CMD).write_finish.inc(); SCHED_HISTOGRAM_VEC_STATIC @@ -2968,27 +2982,25 @@ impl Engine for TxnTestEngine { self.engine.modify_on_kv_engine(region_modifies) } - fn async_snapshot( - &mut self, - ctx: SnapContext<'_>, - cb: tikv_kv::Callback, - ) -> tikv_kv::Result<()> { + type SnapshotRes = impl Future> + Send; + fn async_snapshot(&mut self, ctx: SnapContext<'_>) -> Self::SnapshotRes { let txn_ext = self.txn_ext.clone(); - self.engine.async_snapshot( - ctx, - Box::new(move |snapshot| { - cb(snapshot.map(|snapshot| TxnTestSnapshot { snapshot, txn_ext })) - }), - ) + let f = self.engine.async_snapshot(ctx); + async move { + let snapshot = f.await?; + Ok(TxnTestSnapshot { snapshot, txn_ext }) + } } + type WriteRes = E::WriteRes; fn async_write( &self, ctx: &Context, 
batch: WriteData, - write_cb: tikv_kv::Callback<()>, - ) -> tikv_kv::Result<()> { - self.engine.async_write(ctx, batch, write_cb) + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes { + self.engine.async_write(ctx, batch, subscribed, on_applied) } } @@ -3090,6 +3102,12 @@ impl TestStorageBuilder { self } + pub fn wake_up_delay_duration(self, duration_ms: u64) -> Self { + self.wake_up_delay_duration_ms + .store(duration_ms, Ordering::Relaxed); + self + } + pub fn set_api_version(mut self, api_version: ApiVersion) -> Self { self.config.set_api_version(api_version); self @@ -3184,8 +3202,15 @@ pub mod test_util { }, }; + use futures_executor::block_on; + use kvproto::kvrpcpb::Op; + use super::*; - use crate::storage::{lock_manager::WaitTimeout, txn::commands}; + use crate::storage::{ + lock_manager::WaitTimeout, + txn::commands, + types::{PessimisticLockKeyResult, PessimisticLockResults}, + }; pub fn expect_none(x: Option) { assert_eq!(x, None); @@ -3251,12 +3276,65 @@ pub mod test_util { }) } + pub fn expect_value_with_checker_callback( + done: Sender, + id: i32, + check: impl FnOnce(T) + Send + 'static, + ) -> Callback { + Box::new(move |x: Result| { + check(x.unwrap()); + done.send(id).unwrap(); + }) + } + pub fn expect_pessimistic_lock_res_callback( done: Sender, - pessimistic_lock_res: PessimisticLockRes, - ) -> Callback> { - Box::new(move |res: Result>| { - assert_eq!(res.unwrap().unwrap(), pessimistic_lock_res); + pessimistic_lock_res: PessimisticLockResults, + ) -> Callback> { + fn key_res_matches_ignoring_error_content( + lhs: &PessimisticLockKeyResult, + rhs: &PessimisticLockKeyResult, + ) -> bool { + match (lhs, rhs) { + (PessimisticLockKeyResult::Empty, PessimisticLockKeyResult::Empty) => true, + (PessimisticLockKeyResult::Value(l), PessimisticLockKeyResult::Value(r)) => l == r, + ( + PessimisticLockKeyResult::Existence(l), + PessimisticLockKeyResult::Existence(r), + ) => l == r, + ( + PessimisticLockKeyResult::LockedWithConflict { + value: 
value1, + conflict_ts: ts1, + }, + PessimisticLockKeyResult::LockedWithConflict { + value: value2, + conflict_ts: ts2, + }, + ) => value1 == value2 && ts1 == ts2, + (PessimisticLockKeyResult::Waiting, PessimisticLockKeyResult::Waiting) => true, + (PessimisticLockKeyResult::Failed(_), PessimisticLockKeyResult::Failed(_)) => false, + _ => false, + } + } + + Box::new(move |res: Result>| { + let res = res.unwrap().unwrap(); + assert_eq!( + res.0.len(), + pessimistic_lock_res.0.len(), + "pessimistic lock result length not match, expected: {:?}, got: {:?}", + pessimistic_lock_res, + res + ); + for (expected, got) in pessimistic_lock_res.0.iter().zip(res.0.iter()) { + assert!( + key_res_matches_ignoring_error_content(expected, got), + "pessimistic lock result not match, expected: {:?}, got: {:?}", + pessimistic_lock_res, + res + ); + } done.send(0).unwrap(); }) } @@ -3271,7 +3349,41 @@ pub mod test_util { }) } - type PessimisticLockCommand = TypedCommand>; + type PessimisticLockCommand = TypedCommand>; + + impl PessimisticLockCommand { + pub fn allow_lock_with_conflict(mut self, v: bool) -> Self { + if let Command::AcquirePessimisticLock(commands::AcquirePessimisticLock { + allow_lock_with_conflict, + .. + }) = &mut self.cmd + { + *allow_lock_with_conflict = v; + } else { + panic!( + "expects AcquirePessimisticLock command, got: {:?}", + self.cmd + ); + } + self + } + + pub fn lock_wait_timeout(mut self, timeout: Option) -> Self { + if let Command::AcquirePessimisticLock(commands::AcquirePessimisticLock { + wait_timeout, + .. 
+ }) = &mut self.cmd + { + *wait_timeout = timeout; + } else { + panic!( + "expects AcquirePessimisticLock command, got: {:?}", + self.cmd + ); + } + self + } + } pub fn new_acquire_pessimistic_lock_command( keys: Vec<(Key, bool)>, @@ -3280,7 +3392,27 @@ pub mod test_util { return_values: bool, check_existence: bool, ) -> PessimisticLockCommand { - let primary = keys[0].0.clone().to_raw().unwrap(); + new_acquire_pessimistic_lock_command_with_pk( + keys, + None, + start_ts, + for_update_ts, + return_values, + check_existence, + ) + } + + pub fn new_acquire_pessimistic_lock_command_with_pk( + keys: Vec<(Key, bool)>, + pk: Option<&[u8]>, + start_ts: impl Into, + for_update_ts: impl Into, + return_values: bool, + check_existence: bool, + ) -> PessimisticLockCommand { + let primary = pk + .map(|k| k.to_vec()) + .unwrap_or_else(|| keys[0].0.clone().to_raw().unwrap()); let for_update_ts: TimeStamp = for_update_ts.into(); commands::AcquirePessimisticLock::new( keys, @@ -3292,9 +3424,9 @@ pub mod test_util { Some(WaitTimeout::Default), return_values, for_update_ts.next(), - OldValues::default(), check_existence, false, + false, Context::default(), ) } @@ -3383,6 +3515,46 @@ pub mod test_util { feature_gate.set_version(env!("CARGO_PKG_VERSION")).unwrap(); feature_gate } + + pub fn must_have_locks( + storage: &Storage, + ts: u64, + start_key: &[u8], + end_key: &[u8], + expected_locks: &[( + // key + &[u8], + Op, + // start_ts + u64, + // for_update_ts + u64, + )], + ) { + let locks = block_on(storage.scan_lock( + Context::default(), + ts.into(), + Some(Key::from_raw(start_key)), + Some(Key::from_raw(end_key)), + 100, + )) + .unwrap(); + assert_eq!( + locks.len(), + expected_locks.len(), + "lock count not match, expected: {:?}; got: {:?}", + expected_locks, + locks + ); + for (lock_info, (expected_key, expected_op, expected_start_ts, expected_for_update_ts)) in + locks.into_iter().zip(expected_locks.iter()) + { + assert_eq!(lock_info.get_key(), *expected_key); + 
assert_eq!(lock_info.get_lock_type(), *expected_op); + assert_eq!(lock_info.get_lock_version(), *expected_start_ts); + assert_eq!(lock_info.get_lock_for_update_ts(), *expected_for_update_ts); + } + } } /// All statistics related to KvGet/KvBatchGet. @@ -3423,7 +3595,10 @@ mod tests { use super::{ mvcc::tests::{must_unlocked, must_written}, test_util::*, - txn::FLASHBACK_BATCH_SIZE, + txn::{ + commands::{new_flashback_rollback_lock_cmd, new_flashback_write_cmd}, + FLASHBACK_BATCH_SIZE, + }, *, }; use crate::{ @@ -3435,8 +3610,8 @@ mod tests { Error as KvError, ErrorInner as EngineErrorInner, ExpectedWrite, MockEngineBuilder, }, lock_manager::{ - DiagnosticContext, KeyLockWaitInfo, LockDigest, LockWaitToken, UpdateWaitForEvent, - WaitTimeout, + CancellationCallback, DiagnosticContext, KeyLockWaitInfo, LockDigest, + LockWaitToken, UpdateWaitForEvent, WaitTimeout, }, mvcc::LockType, txn::{ @@ -3445,6 +3620,7 @@ mod tests { tests::must_rollback, Error as TxnError, ErrorInner as TxnErrorInner, }, + types::{PessimisticLockKeyResult, PessimisticLockResults}, }, }; @@ -4693,21 +4869,14 @@ mod tests { let (key, value) = write.0.clone().into_key_value(); // The version we want to flashback to. let version = write.2; - storage - .sched_txn_command( - commands::FlashbackToVersionReadPhase::new( - start_ts, - commit_ts, - version, - None, - Some(key.clone()), - Some(key.clone()), - Context::default(), - ), - expect_ok_callback(tx.clone(), 2), - ) - .unwrap(); - rx.recv().unwrap(); + run_flashback_to_version( + &storage, + start_ts, + commit_ts, + version, + key.clone(), + Key::from_raw(b"z"), + ); if let Mutation::Put(..) 
= write.0 { expect_value( value.unwrap(), @@ -4725,6 +4894,44 @@ mod tests { } } + fn run_flashback_to_version( + storage: &Storage, + start_ts: TimeStamp, + commit_ts: TimeStamp, + version: TimeStamp, + start_key: Key, + end_key: Key, + ) { + let (tx, rx) = channel(); + storage + .sched_txn_command( + new_flashback_rollback_lock_cmd( + start_ts, + version, + start_key.clone(), + end_key.clone(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + storage + .sched_txn_command( + new_flashback_write_cmd( + start_ts, + commit_ts, + version, + start_key, + end_key, + Context::default(), + ), + expect_ok_callback(tx, 1), + ) + .unwrap(); + rx.recv().unwrap(); + } + #[test] fn test_flashback_to_version_lock() { let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) @@ -4768,7 +4975,7 @@ mod tests { b"k".to_vec(), *ts.incr(), ), - expect_ok_callback(tx.clone(), 2), + expect_ok_callback(tx, 2), ) .unwrap(); rx.recv().unwrap(); @@ -4784,21 +4991,14 @@ mod tests { let start_ts = *ts.incr(); let commit_ts = *ts.incr(); - storage - .sched_txn_command( - commands::FlashbackToVersionReadPhase::new( - start_ts, - commit_ts, - 2.into(), - None, - Some(Key::from_raw(b"k")), - Some(Key::from_raw(b"k")), - Context::default(), - ), - expect_ok_callback(tx.clone(), 3), - ) - .unwrap(); - rx.recv().unwrap(); + run_flashback_to_version( + &storage, + start_ts, + commit_ts, + 2.into(), + Key::from_raw(b"k"), + Key::from_raw(b"z"), + ); expect_value( b"v@1".to_vec(), block_on(storage.get(Context::default(), Key::from_raw(b"k"), commit_ts)) @@ -4807,21 +5007,14 @@ mod tests { ); let start_ts = *ts.incr(); let commit_ts = *ts.incr(); - storage - .sched_txn_command( - commands::FlashbackToVersionReadPhase::new( - start_ts, - commit_ts, - 1.into(), - None, - Some(Key::from_raw(b"k")), - Some(Key::from_raw(b"k")), - Context::default(), - ), - expect_ok_callback(tx, 4), - ) - .unwrap(); - rx.recv().unwrap(); + 
run_flashback_to_version( + &storage, + start_ts, + commit_ts, + 1.into(), + Key::from_raw(b"k"), + Key::from_raw(b"z"), + ); expect_none( block_on(storage.get(Context::default(), Key::from_raw(b"k"), commit_ts)) .unwrap() @@ -4900,30 +5093,103 @@ mod tests { .0, ); } - // Flashback all records. + // Flashback all records multiple times to make sure the flashback operation is + // idempotent. + let flashback_start_ts = *ts.incr(); + let flashback_commit_ts = *ts.incr(); + for _ in 0..10 { + run_flashback_to_version( + &storage, + flashback_start_ts, + flashback_commit_ts, + TimeStamp::zero(), + Key::from_raw(b"k"), + Key::from_raw(b"z"), + ); + for i in 1..=FLASHBACK_BATCH_SIZE * 4 { + let key = Key::from_raw(format!("k{}", i).as_bytes()); + expect_none( + block_on(storage.get(Context::default(), key, *ts.incr())) + .unwrap() + .0, + ); + } + } + } + + #[test] + fn test_flashback_to_version_deleted_key() { + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) + .build() + .unwrap(); + let (tx, rx) = channel(); + let mut ts = TimeStamp::zero(); + let (k, v) = (Key::from_raw(b"k"), b"v".to_vec()); + // Write a key. storage .sched_txn_command( - commands::FlashbackToVersionReadPhase::new( + commands::Prewrite::with_defaults( + vec![Mutation::make_put(k.clone(), v.clone())], + k.as_encoded().to_vec(), *ts.incr(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + storage + .sched_txn_command( + commands::Commit::new(vec![k.clone()], ts, *ts.incr(), Context::default()), + expect_value_callback(tx.clone(), 1, TxnStatus::committed(ts)), + ) + .unwrap(); + rx.recv().unwrap(); + expect_value( + v, + block_on(storage.get(Context::default(), k.clone(), ts)) + .unwrap() + .0, + ); + // Delete the key. 
+ storage + .sched_txn_command( + commands::Prewrite::with_defaults( + vec![Mutation::make_delete(k.clone())], + k.as_encoded().to_vec(), *ts.incr(), - TimeStamp::zero(), - None, - Some(Key::from_raw(b"k")), - Some(Key::from_raw(b"k")), - Context::default(), ), - expect_ok_callback(tx, 2), + expect_ok_callback(tx.clone(), 2), ) .unwrap(); rx.recv().unwrap(); - for i in 1..=FLASHBACK_BATCH_SIZE * 4 { - let key = Key::from_raw(format!("k{}", i).as_bytes()); - expect_none( - block_on(storage.get(Context::default(), key, *ts.incr())) - .unwrap() - .0, - ); - } + storage + .sched_txn_command( + commands::Commit::new(vec![k.clone()], ts, *ts.incr(), Context::default()), + expect_value_callback(tx, 3, TxnStatus::committed(ts)), + ) + .unwrap(); + rx.recv().unwrap(); + expect_none( + block_on(storage.get(Context::default(), Key::from_raw(b"k"), ts)) + .unwrap() + .0, + ); + // Flashback the key. + let flashback_start_ts = *ts.incr(); + let flashback_commit_ts = *ts.incr(); + run_flashback_to_version( + &storage, + flashback_start_ts, + flashback_commit_ts, + 1.into(), + Key::from_raw(b"k"), + Key::from_raw(b"z"), + ); + expect_none( + block_on(storage.get(Context::default(), k, flashback_commit_ts)) + .unwrap() + .0, + ); } #[test] @@ -7712,16 +7978,33 @@ mod tests { let (key, val) = (Key::from_raw(b"key"), b"val".to_vec()); let (key2, val2) = (Key::from_raw(b"key2"), b"val2".to_vec()); + let results_values = |res: Vec>| { + PessimisticLockResults( + res.into_iter() + .map(|v| PessimisticLockKeyResult::Value(v)) + .collect::>(), + ) + }; + let results_existence = |res: Vec| { + PessimisticLockResults( + res.into_iter() + .map(|v| PessimisticLockKeyResult::Existence(v)) + .collect::>(), + ) + }; + let results_empty = + |len| PessimisticLockResults(vec![PessimisticLockKeyResult::Empty; len]); + // Key not exist for &(return_values, check_existence) in &[(false, false), (false, true), (true, false), (true, true)] { let pessimistic_lock_res = if return_values { - 
PessimisticLockRes::Values(vec![None]) + results_values(vec![None]) } else if check_existence { - PessimisticLockRes::Existence(vec![false]) + results_existence(vec![false]) } else { - PessimisticLockRes::Empty + results_empty(1) }; storage @@ -7769,7 +8052,7 @@ mod tests { false, false, ), - expect_pessimistic_lock_res_callback(tx.clone(), PessimisticLockRes::Empty), + expect_pessimistic_lock_res_callback(tx.clone(), results_empty(1)), ) .unwrap(); rx.recv().unwrap(); @@ -7802,8 +8085,8 @@ mod tests { rx.recv().unwrap(); } - // Needn't update max_ts when failing to read value - assert_eq!(cm.max_ts(), 10.into()); + // Always update max_ts when trying to read. + assert_eq!(cm.max_ts(), 20.into()); // Put key and key2. storage @@ -7872,19 +8155,18 @@ mod tests { rx.recv().unwrap(); } - // Needn't update max_ts when failing to read value - assert_eq!(cm.max_ts(), 10.into()); + assert_eq!(cm.max_ts(), 20.into()); // Return multiple values for &(return_values, check_existence) in &[(false, false), (false, true), (true, false), (true, true)] { let pessimistic_lock_res = if return_values { - PessimisticLockRes::Values(vec![Some(val.clone()), Some(val2.clone()), None]) + results_values(vec![Some(val.clone()), Some(val2.clone()), None]) } else if check_existence { - PessimisticLockRes::Existence(vec![true, true, false]) + results_existence(vec![true, true, false]) } else { - PessimisticLockRes::Empty + results_empty(3) }; storage .sched_txn_command( @@ -7918,6 +8200,527 @@ mod tests { test_pessimistic_lock_impl(true); } + fn test_pessimistic_lock_resumable_impl( + pipelined_pessimistic_lock: bool, + in_memory_lock: bool, + ) { + type Res = PessimisticLockKeyResult; + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) + .pipelined_pessimistic_lock(pipelined_pessimistic_lock) + .in_memory_pessimistic_lock(in_memory_lock) + .build() + .unwrap(); + let (tx, rx) = channel(); + + let results_empty = + |len| 
PessimisticLockResults(vec![PessimisticLockKeyResult::Empty; len]); + + for case_num in 0..4 { + let key = |i| vec![b'k', case_num, i]; + // Put key "k1". + storage + .sched_txn_command( + commands::Prewrite::new( + vec![Mutation::make_put(Key::from_raw(&key(1)), b"v1".to_vec())], + key(1), + 10.into(), + 3000, + false, + 1, + TimeStamp::zero(), + TimeStamp::default(), + None, + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(&key(1))], + 10.into(), + 20.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + // Put key "k2". + storage + .sched_txn_command( + commands::Prewrite::new( + vec![Mutation::make_put(Key::from_raw(&key(2)), b"v2".to_vec())], + key(2), + 30.into(), + 3000, + false, + 1, + TimeStamp::zero(), + TimeStamp::default(), + None, + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(&key(2))], + 30.into(), + 40.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + // Lock "k3", and we will pessimistic-rollback it. 
+ storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(&key(3)), false)], + 20, + 20, + false, + false, + ), + expect_pessimistic_lock_res_callback(tx.clone(), results_empty(1)), + ) + .unwrap(); + rx.recv().unwrap(); + + // Prewrite "k4", and we will commit it + storage + .sched_txn_command( + commands::Prewrite::new( + vec![Mutation::make_put(Key::from_raw(&key(4)), b"v4".to_vec())], + key(4), + 30.into(), + 3000, + false, + 1, + TimeStamp::zero(), + TimeStamp::default(), + None, + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + // Prewrite "k5", and we will roll it back + storage + .sched_txn_command( + commands::Prewrite::new( + vec![Mutation::make_put(Key::from_raw(&key(5)), b"v5".to_vec())], + key(5), + 30.into(), + 3000, + false, + 1, + TimeStamp::zero(), + TimeStamp::default(), + None, + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + + // Prewrite "k6", and it won't cause conflict after committing. 
+ storage + .sched_txn_command( + commands::Prewrite::new( + vec![Mutation::make_put(Key::from_raw(&key(6)), b"v6".to_vec())], + key(6), + 10.into(), + 3000, + false, + 1, + TimeStamp::zero(), + TimeStamp::default(), + None, + false, + AssertionLevel::Off, + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + } + + for &(case_num, return_values, check_existence) in &[ + (0, false, false), + (1, false, true), + (2, true, false), + (3, true, true), + ] { + let key = |i| vec![b'k', case_num, i]; + let expected_results = if return_values { + vec![ + Res::Value(Some(b"v1".to_vec())), + Res::LockedWithConflict { + value: Some(b"v2".to_vec()), + conflict_ts: 40.into(), + }, + Res::Value(None), + Res::LockedWithConflict { + value: Some(b"v4".to_vec()), + conflict_ts: 40.into(), + }, + Res::LockedWithConflict { + value: None, + conflict_ts: 30.into(), + }, + Res::Value(Some(b"v6".to_vec())), + ] + } else if check_existence { + vec![ + Res::Existence(true), + Res::LockedWithConflict { + value: Some(b"v2".to_vec()), + conflict_ts: 40.into(), + }, + Res::Existence(false), + Res::LockedWithConflict { + value: Some(b"v4".to_vec()), + conflict_ts: 40.into(), + }, + Res::LockedWithConflict { + value: None, + conflict_ts: 30.into(), + }, + Res::Existence(true), + ] + } else { + vec![ + Res::Empty, + Res::LockedWithConflict { + value: Some(b"v2".to_vec()), + conflict_ts: 40.into(), + }, + Res::Empty, + Res::LockedWithConflict { + value: Some(b"v4".to_vec()), + conflict_ts: 40.into(), + }, + Res::LockedWithConflict { + value: None, + conflict_ts: 30.into(), + }, + Res::Empty, + ] + }; + + // k1 & k2 + for (i, k) in &[(0, key(1)), (1, key(2))] { + let i = *i; + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(k), false)], + 25, + 25, + return_values, + check_existence, + ) + .allow_lock_with_conflict(true), + expect_pessimistic_lock_res_callback( + tx.clone(), + 
PessimisticLockResults(vec![expected_results[i].clone()]), + ), + ) + .unwrap(); + rx.recv().unwrap(); + } + + // k3 + // Report KeyIsLocked if no wait + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(&key(3)), false)], + 25, + 25, + return_values, + check_existence, + ) + .allow_lock_with_conflict(true) + .lock_wait_timeout(None), + expect_value_with_checker_callback( + tx.clone(), + 0, + |res: Result| { + let e = res.unwrap().0[0].unwrap_err(); + match e.inner() { + ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc( + mvcc::Error(box mvcc::ErrorInner::KeyIsLocked(..)), + ))) => (), + e => panic!("unexpected error chain: {:?}", e), + } + }, + ), + ) + .unwrap(); + rx.recv().unwrap(); + + // Lock wait + let (tx1, rx1) = channel(); + // k3 + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(&key(3)), false)], + 25, + 25, + return_values, + check_existence, + ) + .allow_lock_with_conflict(true) + .lock_wait_timeout(Some(WaitTimeout::Default)), + expect_pessimistic_lock_res_callback( + tx1.clone(), + PessimisticLockResults(vec![expected_results[2].clone()]), + ), + ) + .unwrap(); + rx1.recv_timeout(Duration::from_millis(100)).unwrap_err(); + + delete_pessimistic_lock(&storage, Key::from_raw(&key(3)), 20, 20); + rx1.recv().unwrap(); + + // k4 + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(&key(4)), false)], + 25, + 25, + return_values, + check_existence, + ) + .allow_lock_with_conflict(true) + .lock_wait_timeout(Some(WaitTimeout::Default)), + expect_pessimistic_lock_res_callback( + tx1.clone(), + PessimisticLockResults(vec![expected_results[3].clone()]), + ), + ) + .unwrap(); + rx1.recv_timeout(Duration::from_millis(100)).unwrap_err(); + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(&key(4))], + 30.into(), + 40.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + 
rx.recv().unwrap(); + rx1.recv().unwrap(); + + // k5 + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(&key(5)), false)], + 25, + 25, + return_values, + check_existence, + ) + .allow_lock_with_conflict(true) + .lock_wait_timeout(Some(WaitTimeout::Default)), + expect_pessimistic_lock_res_callback( + tx1.clone(), + PessimisticLockResults(vec![expected_results[4].clone()]), + ), + ) + .unwrap(); + rx1.recv_timeout(Duration::from_millis(100)).unwrap_err(); + storage + .sched_txn_command( + commands::Rollback::new( + vec![Key::from_raw(&key(5))], + 30.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + rx1.recv().unwrap(); + + // k6 + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(&key(6)), false)], + 25, + 25, + return_values, + check_existence, + ) + .allow_lock_with_conflict(true) + .lock_wait_timeout(Some(WaitTimeout::Default)), + expect_pessimistic_lock_res_callback( + tx1.clone(), + PessimisticLockResults(vec![expected_results[5].clone()]), + ), + ) + .unwrap(); + rx1.recv_timeout(Duration::from_millis(100)).unwrap_err(); + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(&key(6))], + 10.into(), + 20.into(), + Context::default(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + rx1.recv().unwrap(); + + must_have_locks( + &storage, + 50, + &key(0), + &key(10), + &[ + (&key(1), Op::PessimisticLock, 25, 25), + (&key(2), Op::PessimisticLock, 25, 40), + (&key(3), Op::PessimisticLock, 25, 25), + (&key(4), Op::PessimisticLock, 25, 40), + (&key(5), Op::PessimisticLock, 25, 30), + (&key(6), Op::PessimisticLock, 25, 25), + ], + ); + + // Test idempotency + for i in 0..6usize { + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(&key(i as u8 + 1)), false)], + 25, + 25, + return_values, + check_existence, + ) + 
.allow_lock_with_conflict(true) + .lock_wait_timeout(Some(WaitTimeout::Default)), + expect_pessimistic_lock_res_callback( + tx1.clone(), + PessimisticLockResults(vec![expected_results[i].clone()]), + ), + ) + .unwrap(); + rx1.recv().unwrap(); + } + } + + // Check the channel is clear to avoid misusing in the above test code. + tx.send(100).unwrap(); + assert_eq!(rx.recv().unwrap(), 100); + + // Test request queueing. + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(b"k21"), false)], + 10, + 10, + false, + false, + ) + .allow_lock_with_conflict(true) + .lock_wait_timeout(Some(WaitTimeout::Default)), + expect_pessimistic_lock_res_callback(tx, results_empty(1)), + ) + .unwrap(); + rx.recv().unwrap(); + + let channels: Vec<_> = (0..4).map(|_| channel()).collect(); + let start_ts = &[20, 50, 30, 40]; + for i in 0..4 { + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command( + vec![(Key::from_raw(b"k21"), false)], + start_ts[i], + start_ts[i], + false, + false, + ) + .allow_lock_with_conflict(true) + .lock_wait_timeout(Some(WaitTimeout::Default)), + expect_pessimistic_lock_res_callback(channels[i].0.clone(), results_empty(1)), + ) + .unwrap(); + channels[i] + .1 + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + } + + delete_pessimistic_lock(&storage, Key::from_raw(b"k21"), 10, 10); + channels[0].1.recv().unwrap(); + channels[2] + .1 + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + + delete_pessimistic_lock(&storage, Key::from_raw(b"k21"), 20, 20); + channels[2].1.recv().unwrap(); + channels[3] + .1 + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + + delete_pessimistic_lock(&storage, Key::from_raw(b"k21"), 30, 30); + channels[3].1.recv().unwrap(); + channels[1] + .1 + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + + delete_pessimistic_lock(&storage, Key::from_raw(b"k21"), 40, 40); + channels[1].1.recv().unwrap(); + } + + #[test] + fn 
test_pessimistic_lock_resumable() { + for &pipelined_pessimistic_lock in &[false, true] { + for &in_memory_lock in &[false, true] { + test_pessimistic_lock_resumable_impl(pipelined_pessimistic_lock, in_memory_lock); + } + } + } + #[allow(clippy::large_enum_variant)] pub enum Msg { WaitFor { @@ -7929,7 +8732,7 @@ mod tests { wait_info: KeyLockWaitInfo, is_first_lock: bool, timeout: Option, - cancel_callback: Box, + cancel_callback: CancellationCallback, diag_ctx: DiagnosticContext, }, RemoveLockWait { @@ -7969,7 +8772,7 @@ mod tests { wait_info: KeyLockWaitInfo, is_first_lock: bool, timeout: Option, - cancel_callback: Box, + cancel_callback: CancellationCallback, diag_ctx: DiagnosticContext, ) { self.tx @@ -8045,7 +8848,7 @@ mod tests { Some(WaitTimeout::Millis(100)), false, 21.into(), - OldValues::default(), + false, false, false, Context::default(), @@ -8137,7 +8940,7 @@ mod tests { Some(WaitTimeout::Millis(5000)), false, (lock_ts + 1).into(), - OldValues::default(), + false, false, false, Context::default(), @@ -8722,7 +9525,7 @@ mod tests { None, false, 0.into(), - OldValues::default(), + false, false, false, Default::default(), @@ -8745,7 +9548,7 @@ mod tests { None, false, 0.into(), - OldValues::default(), + false, false, false, Default::default(), @@ -8975,7 +9778,7 @@ mod tests { None, false, TimeStamp::new(12), - OldValues::default(), + false, false, false, Context::default(), @@ -9001,7 +9804,7 @@ mod tests { None, false, TimeStamp::new(12), - OldValues::default(), + false, false, false, Context::default(), @@ -9610,6 +10413,8 @@ mod tests { ttl: 3000, for_update_ts: 10.into(), min_commit_ts: 11.into(), + last_change_ts: TimeStamp::zero(), + versions_to_last_change: 1, }, false ) diff --git a/src/storage/mvcc/mod.rs b/src/storage/mvcc/mod.rs index 6191c2ad46d..3dca7a219f9 100644 --- a/src/storage/mvcc/mod.rs +++ b/src/storage/mvcc/mod.rs @@ -706,6 +706,16 @@ pub mod tests { assert_eq!(ts, commit_ts.into()); } + pub fn must_get_txn_source(engine: &mut E, 
key: &[u8], ts: u64, txn_source: u64) { + let snapshot = engine.snapshot(Default::default()).unwrap(); + let mut reader = SnapshotReader::new(TimeStamp::from(ts), snapshot, true); + let write = reader + .get_write(&Key::from_raw(key), TimeStamp::from(ts)) + .unwrap() + .unwrap(); + assert_eq!(write.txn_source, txn_source); + } + pub fn must_get_commit_ts_none( engine: &mut E, key: &[u8], diff --git a/src/storage/mvcc/reader/point_getter.rs b/src/storage/mvcc/reader/point_getter.rs index 012189201c5..2f215986ca9 100644 --- a/src/storage/mvcc/reader/point_getter.rs +++ b/src/storage/mvcc/reader/point_getter.rs @@ -5,6 +5,7 @@ use std::borrow::Cow; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; use kvproto::kvrpcpb::{IsolationLevel, WriteConflictReason}; +use tikv_kv::SEEK_BOUND; use txn_types::{Key, Lock, LockType, TimeStamp, TsSet, Value, WriteRef, WriteType}; use crate::storage::{ @@ -281,10 +282,9 @@ impl PointGetter { return Ok(None); } + let mut write = WriteRef::parse(self.write_cursor.value(&mut self.statistics.write))?; + let mut owned_value: Vec; // To work around lifetime problem loop { - // No need to compare user key because it uses prefix seek. - let write = WriteRef::parse(self.write_cursor.value(&mut self.statistics.write))?; - if !write.check_gc_fence_as_latest_version(self.ts) { return Ok(None); } @@ -315,13 +315,37 @@ impl PointGetter { return Ok(None); } WriteType::Lock | WriteType::Rollback => { - // Continue iterate next `write`. + if write.versions_to_last_change > 0 && write.last_change_ts.is_zero() { + return Ok(None); + } + if write.versions_to_last_change < SEEK_BOUND { + // Continue iterate next `write`. + } else { + let commit_ts = write.last_change_ts; + let key_with_ts = user_key.clone().append_ts(commit_ts); + match self.snapshot.get_cf(CF_WRITE, &key_with_ts)? 
{ + Some(v) => owned_value = v, + None => return Ok(None), + } + self.statistics.write.get += 1; + write = WriteRef::parse(&owned_value)?; + assert!( + write.write_type == WriteType::Put + || write.write_type == WriteType::Delete, + "Write record pointed by last_change_ts {} should be Put or Delete, but got {:?}", + commit_ts, + write.write_type, + ); + continue; + } } } if !self.write_cursor.next(&mut self.statistics.write) { return Ok(None); } + // No need to compare user key because it uses prefix seek. + write = WriteRef::parse(self.write_cursor.value(&mut self.statistics.write))?; } } @@ -611,7 +635,7 @@ mod tests { must_get_value(&mut getter, b"foo2", b"foo2v"); let s = getter.take_statistics(); // We have to check every version - assert_seek_next_prev(&s.write, 1, 40, 0); + assert_seek_next_prev(&s.write, 1, 0, 0); assert_eq!( s.processed_size, Key::from_raw(b"foo2").len() @@ -621,7 +645,8 @@ mod tests { // Get again must_get_value(&mut getter, b"foo2", b"foo2v"); let s = getter.take_statistics(); - assert_seek_next_prev(&s.write, 1, 40, 0); + assert_seek_next_prev(&s.write, 1, 0, 0); + assert_eq!(s.write.get, 1); assert_eq!( s.processed_size, Key::from_raw(b"foo2").len() @@ -1243,4 +1268,25 @@ mod tests { must_get_value(&mut batch_getter_ok, key4, val4); must_get_value(&mut batch_getter_ok, key5, val5); } + + #[test] + fn test_point_get_non_exist_skip_lock() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + let k = b"k"; + + // Write enough LOCK recrods + for start_ts in (1..30).into_iter().step_by(2) { + must_prewrite_lock(&mut engine, k, k, start_ts); + must_commit(&mut engine, k, start_ts, start_ts + 1); + } + + let mut getter = new_point_getter(&mut engine, 40.into()); + must_get_none(&mut getter, k); + let s = getter.take_statistics(); + // We can know the key doesn't exist without skipping all these locks according + // to last_change_ts and versions_to_last_change. 
+ assert_eq!(s.write.seek, 1); + assert_eq!(s.write.next, 0); + assert_eq!(s.write.get, 0); + } } diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 6bf712050ac..4847dbb8428 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -1,12 +1,14 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath] +use std::ops::Bound; + use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; use kvproto::{ errorpb::{self, EpochNotMatch, StaleCommand}, kvrpcpb::Context, }; -use tikv_kv::SnapshotExt; +use tikv_kv::{SnapshotExt, SEEK_BOUND}; use txn_types::{Key, Lock, OldValue, TimeStamp, Value, Write, WriteRef, WriteType}; use crate::storage::{ @@ -127,6 +129,8 @@ pub struct MvccReader { lower_bound: Option, upper_bound: Option, + hint_min_ts: Option>, + /// None means following operations are performed on a single user key, /// i.e., different versions of the same key. It can use prefix seek to /// speed up reads from the write-cf. @@ -154,6 +158,7 @@ impl MvccReader { write_cursor: None, lower_bound: None, upper_bound: None, + hint_min_ts: None, scan_mode, current_key: None, fill_cache, @@ -171,6 +176,7 @@ impl MvccReader { write_cursor: None, lower_bound: None, upper_bound: None, + hint_min_ts: None, scan_mode, current_key: None, fill_cache: !ctx.get_not_fill_cache(), @@ -179,17 +185,12 @@ impl MvccReader { } } - /// load the value associated with `key` and pointed by `write` - fn load_data(&mut self, key: &Key, write: Write) -> Result { - assert_eq!(write.write_type, WriteType::Put); - if let Some(val) = write.short_value { - return Ok(val); - } + /// get the value of a user key with the given `start_ts`. 
+ pub fn get_value(&mut self, key: &Key, start_ts: TimeStamp) -> Result> { if self.scan_mode.is_some() { self.create_data_cursor()?; } - - let k = key.clone().append_ts(write.start_ts); + let k = key.clone().append_ts(start_ts); let val = if let Some(ref mut cursor) = self.data_cursor { cursor .get(&k, &mut self.statistics.data)? @@ -198,13 +199,25 @@ impl MvccReader { self.statistics.data.get += 1; self.snapshot.get(&k)? }; + if val.is_some() { + self.statistics.data.processed_keys += 1; + } + Ok(val) + } - match val { - Some(val) => { - self.statistics.data.processed_keys += 1; - Ok(val) - } - None => Err(default_not_found_error(k.into_encoded(), "get")), + /// load the value associated with `key` and pointed by `write` + pub fn load_data(&mut self, key: &Key, write: Write) -> Result { + assert_eq!(write.write_type, WriteType::Put); + if let Some(val) = write.short_value { + return Ok(val); + } + let start_ts = write.start_ts; + match self.get_value(key, start_ts)? { + Some(val) => Ok(val), + None => Err(default_not_found_error( + key.clone().append_ts(start_ts).into_encoded(), + "get", + )), } } @@ -367,8 +380,9 @@ impl MvccReader { mut ts: TimeStamp, gc_fence_limit: Option, ) -> Result> { + let mut seek_res = self.seek_write(key, ts)?; loop { - match self.seek_write(key, ts)? { + match seek_res { Some((commit_ts, write)) => { if let Some(limit) = gc_fence_limit { if !write.as_ref().check_gc_fence_as_latest_version(limit) { @@ -382,11 +396,38 @@ impl MvccReader { WriteType::Delete => { return Ok(None); } - WriteType::Lock | WriteType::Rollback => ts = commit_ts.prev(), + WriteType::Lock | WriteType::Rollback => { + if write.versions_to_last_change > 0 && write.last_change_ts.is_zero() { + return Ok(None); + } + if write.versions_to_last_change < SEEK_BOUND { + ts = commit_ts.prev(); + } else { + let commit_ts = write.last_change_ts; + let key_with_ts = key.clone().append_ts(commit_ts); + let Some(value) = self + .snapshot + .get_cf(CF_WRITE, &key_with_ts)? 
else { + return Ok(None); + }; + self.statistics.write.get += 1; + let write = WriteRef::parse(&value)?.to_owned(); + assert!( + write.write_type == WriteType::Put + || write.write_type == WriteType::Delete, + "Write record pointed by last_change_ts {} should be Put or Delete, but got {:?}", + commit_ts, + write.write_type, + ); + seek_res = Some((commit_ts, write)); + continue; + } + } } } None => return Ok(None), } + seek_res = self.seek_write(key, ts)?; } } @@ -443,6 +484,8 @@ impl MvccReader { .prefix_seek(self.scan_mode.is_none()) .scan_mode(self.get_scan_mode(true)) .range(self.lower_bound.clone(), self.upper_bound.clone()) + // `hint_min_ts` filters data by the `commit_ts`. + .hint_min_ts(self.hint_min_ts) .build()?; self.write_cursor = Some(cursor); } @@ -531,32 +574,25 @@ impl MvccReader { Ok((locks, has_remain)) } - /// Scan the writes to get all the latest keys with their corresponding - /// PUT/DELETE write records at the given version, if the version is not - /// specified, it will scan the latest version for each key, if the key - /// does not exist or is not visible at that point, an `Option::None` will - /// be placed. The return type is: - /// * `(Vec<(key, commit_ts, Option)>, has_remain)`. - /// - `key` is the encoded key without commit ts. - /// - `commit_ts` is the latest commit ts of the key. - /// - `write` is the PUT/DELETE write record at the given version. - /// - `has_remain` indicates whether there MAY be remaining writes that + /// Scan the writes to get all the latest user keys. The return type is: + /// * `(Vec, has_remain)`. + /// - `key` is the encoded user key without `commit_ts`. + /// - `has_remain` indicates whether there MAY be remaining user keys that /// can be scanned. /// /// This function is mainly used by /// `txn::commands::FlashbackToVersionReadPhase` /// and `txn::commands::FlashbackToVersion` to achieve the MVCC /// overwriting. 
- pub fn scan_writes( + pub fn scan_latest_user_keys( &mut self, start: Option<&Key>, end: Option<&Key>, - version: Option, filter: F, limit: usize, - ) -> Result<(Vec<(Key, TimeStamp, Option)>, bool)> + ) -> Result<(Vec, bool)> where - F: Fn(&Key) -> bool, + F: Fn(&Key /* user key */, TimeStamp /* latest `commit_ts` */) -> bool, { self.create_write_cursor()?; let cursor = self.write_cursor.as_mut().unwrap(); @@ -567,10 +603,8 @@ impl MvccReader { if !ok { return Ok((vec![], false)); } - // Use the latest version as the default value if the version is not given. - let version = version.unwrap_or_else(TimeStamp::max); - let mut cur_key = None; - let mut key_writes = Vec::with_capacity(limit); + let mut cur_user_key = None; + let mut keys = Vec::with_capacity(limit); let mut has_remain = false; while cursor.valid()? { let key = Key::from_encoded_slice(cursor.key(&mut self.statistics.write)); @@ -581,57 +615,31 @@ impl MvccReader { } } let commit_ts = key.decode_ts()?; - let user_key = key.clone().truncate_ts()?; - // To make sure we only check each unique key once and `filter(&key)` returns + let user_key = key.truncate_ts()?; + // To make sure we only check each unique user key once and the filter returns // true. - if (cur_key.is_some() && cur_key.clone().unwrap() == user_key) || !filter(&key) { + let is_same_user_key = cur_user_key.as_ref() == Some(&user_key); + if !is_same_user_key { + cur_user_key = Some(user_key.clone()); + } + if is_same_user_key || !filter(&user_key, commit_ts) { cursor.next(&mut self.statistics.write); continue; } - cur_key = Some(user_key.clone()); - - let mut write = None; - let version_key = user_key.clone().append_ts(version); - // Try to seek to the key with the specified version. - if cursor.near_seek(&version_key, &mut self.statistics.write)? - && Key::is_user_key_eq( - cursor.key(&mut self.statistics.write), - user_key.as_encoded(), - ) - { - while cursor.valid()? 
{ - write = - Some(WriteRef::parse(cursor.value(&mut self.statistics.write))?.to_owned()); - // Move to the next key. - cursor.next(&mut self.statistics.write); - match write.as_ref().unwrap().write_type { - WriteType::Put | WriteType::Delete => { - break; - } - WriteType::Lock | WriteType::Rollback => { - // We should find the latest visible version after it. - let key = - Key::from_encoded_slice(cursor.key(&mut self.statistics.write)); - // Could not find the visible version, current cursor is on the next - // key, so we set both `write` and `cur_key` to `None`. - if key.truncate_ts()? != user_key { - write = None; - cur_key = None; - break; - } - } - } - } - } - key_writes.push((user_key, commit_ts, write)); - if limit > 0 && key_writes.len() == limit { + keys.push(user_key.clone()); + if limit > 0 && keys.len() == limit { has_remain = true; break; } + // Seek once to skip all the writes of the same user key. + cursor.near_seek( + &user_key.append_ts(TimeStamp::zero()), + &mut self.statistics.write, + )?; } - self.statistics.write.processed_keys += key_writes.len(); - resource_metering::record_read_keys(key_writes.len() as u32); - Ok((key_writes, has_remain)) + self.statistics.write.processed_keys += keys.len(); + resource_metering::record_read_keys(keys.len() as u32); + Ok((keys, has_remain)) } pub fn scan_keys( @@ -743,6 +751,10 @@ impl MvccReader { self.lower_bound = lower; self.upper_bound = upper; } + + pub fn set_hint_min_ts(&mut self, ts_bound: Option>) { + self.hint_min_ts = ts_bound; + } } #[cfg(test)] @@ -852,6 +864,7 @@ pub mod tests { need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, } } @@ -924,6 +937,7 @@ pub mod tests { TimeStamp::zero(), true, false, + false, ) .unwrap(); self.write(txn.into_modifies()); @@ -1394,7 +1408,8 @@ pub mod tests { let (commit_ts, write) = reader.seek_write(&k, 20.into()).unwrap().unwrap(); assert_eq!(commit_ts, 20.into()); - assert_eq!(write, 
Write::new(WriteType::Lock, 10.into(), None)); + assert_eq!(write.write_type, WriteType::Lock); + assert_eq!(write.start_ts, 10.into()); assert_eq!(reader.statistics.write.seek, 1); assert_eq!(reader.statistics.write.next, 1); @@ -1642,6 +1657,10 @@ pub mod tests { for_update_ts, 0, TimeStamp::zero(), + ) + .set_last_change( + TimeStamp::zero(), + (lock_type == LockType::Lock || lock_type == LockType::Pessimistic) as u64, ), ) }) @@ -1725,9 +1744,9 @@ pub mod tests { } #[test] - fn test_scan_writes() { + fn test_scan_latest_user_keys() { let path = tempfile::Builder::new() - .prefix("_test_storage_mvcc_reader_scan_writes") + .prefix("_test_storage_mvcc_reader_scan_latest_user_keys") .tempdir() .unwrap(); let path = path.path().to_str().unwrap(); @@ -1804,6 +1823,13 @@ pub mod tests { 8, ); engine.commit(b"k3", 8, 9); + // Prewrite and rollback k4. + engine.prewrite( + Mutation::make_put(Key::from_raw(b"k4"), b"v4@1".to_vec()), + b"k4", + 10, + ); + engine.rollback(b"k4", 10); // Current MVCC keys in `CF_WRITE` should be: // PUT k0 -> v0@999 @@ -1815,272 +1841,58 @@ pub mod tests { // PUT k3 -> v3@8 // ROLLBACK k3 -> v3@7 // PUT k3 -> v3@5 + // ROLLBACK k4 -> v4@1 struct Case { start_key: Option, end_key: Option, - version: Option, limit: usize, - expect_res: Vec<(Key, TimeStamp, Option)>, + expect_res: Vec, expect_is_remain: bool, } let cases = vec![ - // Get all latest writes with the unspecified version. 
- Case { - start_key: None, - end_key: None, - version: None, - limit: 4, - expect_res: vec![ - ( - Key::from_raw(b"k0"), - 1000.into(), - Some(Write::new( - WriteType::Put, - 999.into(), - Some(b"v0@999".to_vec()), - )), - ), - ( - Key::from_raw(b"k1"), - 4.into(), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - 4.into(), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), - ), - ( - Key::from_raw(b"k3"), - 9.into(), - Some(Write::new(WriteType::Put, 8.into(), Some(b"v3@8".to_vec()))), - ), - ], - expect_is_remain: true, - }, - // k0 is invisible at version 9. - Case { - start_key: None, - end_key: None, - version: Some(9), - limit: 4, - expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), - ( - Key::from_raw(b"k1"), - 4.into(), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - 4.into(), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), - ), - ( - Key::from_raw(b"k3"), - 9.into(), - Some(Write::new(WriteType::Put, 8.into(), Some(b"v3@8".to_vec()))), - ), - ], - expect_is_remain: true, - }, - // k3 has an old version write at version 8. 
- Case { - start_key: None, - end_key: None, - version: Some(8), - limit: 4, - expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), - ( - Key::from_raw(b"k1"), - 4.into(), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - 4.into(), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), - ), - ( - Key::from_raw(b"k3"), - 9.into(), - Some(Write::new(WriteType::Put, 5.into(), Some(b"v3@5".to_vec()))), - ), - ], - expect_is_remain: true, - }, - Case { - start_key: None, - end_key: None, - version: Some(7), - limit: 4, - expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), - ( - Key::from_raw(b"k1"), - 4.into(), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - 4.into(), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), - ), - ( - Key::from_raw(b"k3"), - 9.into(), - Some(Write::new(WriteType::Put, 5.into(), Some(b"v3@5".to_vec()))), - ), - ], - expect_is_remain: true, - }, - Case { - start_key: None, - end_key: None, - version: Some(6), - limit: 4, - expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), - ( - Key::from_raw(b"k1"), - 4.into(), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - 4.into(), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), - ), - ( - Key::from_raw(b"k3"), - 9.into(), - Some(Write::new(WriteType::Put, 5.into(), Some(b"v3@5".to_vec()))), - ), - ], - expect_is_remain: true, - }, - // k3 doesn't exist at version 5. 
- Case { - start_key: None, - end_key: None, - version: Some(5), - limit: 4, - expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), - ( - Key::from_raw(b"k1"), - 4.into(), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - 4.into(), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), - ), - (Key::from_raw(b"k3"), 9.into(), None), - ], - expect_is_remain: true, - }, + // Test the limit. Case { start_key: None, end_key: None, - version: Some(4), - limit: 4, - expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), - ( - Key::from_raw(b"k1"), - 4.into(), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v1@3".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - 4.into(), - Some(Write::new(WriteType::Put, 3.into(), Some(b"v2@3".to_vec()))), - ), - (Key::from_raw(b"k3"), 9.into(), None), - ], + limit: 1, + expect_res: vec![Key::from_raw(b"k0")], expect_is_remain: true, }, - // k1 and k2 have old version writes at version 8. Case { start_key: None, end_key: None, - version: Some(3), - limit: 4, + limit: 6, expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), - ( - Key::from_raw(b"k1"), - 4.into(), - Some(Write::new(WriteType::Put, 1.into(), Some(b"v1@1".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - 4.into(), - Some(Write::new(WriteType::Put, 1.into(), Some(b"v2@1".to_vec()))), - ), - (Key::from_raw(b"k3"), 9.into(), None), + Key::from_raw(b"k0"), + Key::from_raw(b"k1"), + Key::from_raw(b"k2"), + Key::from_raw(b"k3"), + Key::from_raw(b"k4"), ], - expect_is_remain: true, + expect_is_remain: false, }, + // Test the start/end key. 
Case { - start_key: None, + start_key: Some(Key::from_raw(b"k2")), end_key: None, - version: Some(2), limit: 4, expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), - ( - Key::from_raw(b"k1"), - 4.into(), - Some(Write::new(WriteType::Put, 1.into(), Some(b"v1@1".to_vec()))), - ), - ( - Key::from_raw(b"k2"), - 4.into(), - Some(Write::new(WriteType::Put, 1.into(), Some(b"v2@1".to_vec()))), - ), - (Key::from_raw(b"k3"), 9.into(), None), + Key::from_raw(b"k2"), + Key::from_raw(b"k3"), + Key::from_raw(b"k4"), ], - expect_is_remain: true, + expect_is_remain: false, }, - // All keys don't exist at version 1. Case { start_key: None, - end_key: None, - version: Some(1), + end_key: Some(Key::from_raw(b"k3")), limit: 4, expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), - (Key::from_raw(b"k1"), 4.into(), None), - (Key::from_raw(b"k2"), 4.into(), None), - (Key::from_raw(b"k3"), 9.into(), None), - ], - expect_is_remain: true, - }, - // Test the limit. - Case { - start_key: None, - end_key: None, - version: Some(0), - limit: 1, - expect_res: vec![(Key::from_raw(b"k0"), 1000.into(), None)], - expect_is_remain: true, - }, - Case { - start_key: None, - end_key: None, - version: Some(0), - limit: 5, - expect_res: vec![ - (Key::from_raw(b"k0"), 1000.into(), None), - (Key::from_raw(b"k1"), 4.into(), None), - (Key::from_raw(b"k2"), 4.into(), None), - (Key::from_raw(b"k3"), 9.into(), None), + Key::from_raw(b"k0"), + Key::from_raw(b"k1"), + Key::from_raw(b"k2"), ], expect_is_remain: false, }, @@ -2090,11 +1902,10 @@ pub mod tests { let snap = RegionSnapshot::::from_raw(db.clone(), region.clone()); let mut reader = MvccReader::new(snap, Some(ScanMode::Forward), false); let res = reader - .scan_writes( + .scan_latest_user_keys( case.start_key.as_ref(), case.end_key.as_ref(), - case.version.map(Into::into), - |_| true, + |_, _| true, case.limit, ) .unwrap(); @@ -2207,8 +2018,17 @@ pub mod tests { engine.write(case.modifies); let snap = 
RegionSnapshot::::from_raw(db.clone(), region.clone()); let mut reader = MvccReader::new(snap, case.scan_mode, false); - let result = reader.load_data(&case.key, case.write); + let result = reader.load_data(&case.key, case.write.clone()); assert_eq!(format!("{:?}", result), format!("{:?}", case.expected)); + if let Ok(expected) = case.expected { + if expected == long_value.to_vec() { + let result = reader + .get_value(&case.key, case.write.start_ts) + .unwrap() + .unwrap(); + assert_eq!(format!("{:?}", result), format!("{:?}", expected)); + } + } } } @@ -2518,4 +2338,101 @@ pub mod tests { assert_eq!(reader.statistics.write.seek_tombstone, *tombstones); } } + + #[test] + fn test_get_write_second_get() { + let path = tempfile::Builder::new() + .prefix("_test_storage_mvcc_reader_get_write_second_get") + .tempdir() + .unwrap(); + let path = path.path().to_str().unwrap(); + let region = make_region(1, vec![], vec![]); + let db = open_db(path, true); + let mut engine = RegionEngine::new(&db, ®ion); + + let (k, v) = (b"k", b"v"); + let m = Mutation::make_put(Key::from_raw(k), v.to_vec()); + engine.prewrite(m, k, 1); + engine.commit(k, 1, 2); + + // Write enough LOCK recrods + for start_ts in (6..30).into_iter().step_by(2) { + engine.lock(k, start_ts, start_ts + 1); + } + + let m = Mutation::make_delete(Key::from_raw(k)); + engine.prewrite(m, k, 45); + engine.commit(k, 45, 46); + + // Write enough LOCK recrods + for start_ts in (50..80).into_iter().step_by(2) { + engine.lock(k, start_ts, start_ts + 1); + } + + let snap = RegionSnapshot::::from_raw(db, region); + let mut reader = MvccReader::new(snap, None, false); + + let key = Key::from_raw(k); + // Get write record whose commit_ts = 2 + let w2 = reader + .get_write(&key, TimeStamp::new(2), None) + .unwrap() + .unwrap(); + + // Clear statistics first + reader.statistics = Statistics::default(); + let (write, commit_ts) = reader + .get_write_with_commit_ts(&key, 40.into(), None) + .unwrap() + .unwrap(); + 
assert_eq!(commit_ts, 2.into()); + assert_eq!(write, w2); + // versions_to_last_change should be large enough to trigger a second get + // instead of calling a series of next, so the count of next should be 0 instead + assert_eq!(reader.statistics.write.next, 0); + assert_eq!(reader.statistics.write.get, 1); + + // Clear statistics first + reader.statistics = Statistics::default(); + let res = reader + .get_write_with_commit_ts(&key, 80.into(), None) + .unwrap(); + // If the type is Delete, get_write_with_commit_ts should return None. + assert!(res.is_none()); + // versions_to_last_change should be large enough to trigger a second get + // instead of calling a series of next, so the count of next should be 0 instead + assert_eq!(reader.statistics.write.next, 0); + assert_eq!(reader.statistics.write.get, 1); + } + + #[test] + fn test_get_write_not_exist_skip_lock() { + let path = tempfile::Builder::new() + .prefix("_test_storage_mvcc_reader_get_write_not_exist_skip_lock") + .tempdir() + .unwrap(); + let path = path.path().to_str().unwrap(); + let region = make_region(1, vec![], vec![]); + let db = open_db(path, true); + let mut engine = RegionEngine::new(&db, ®ion); + let k = b"k"; + + // Write enough LOCK recrods + for start_ts in (6..30).into_iter().step_by(2) { + engine.lock(k, start_ts, start_ts + 1); + } + + let snap = RegionSnapshot::::from_raw(db, region); + let mut reader = MvccReader::new(snap, None, false); + + let res = reader + .get_write_with_commit_ts(&Key::from_raw(k), 40.into(), None) + .unwrap(); + // We can know the key doesn't exist without skipping all these locks according + // to last_change_ts and versions_to_last_change. 
+ assert!(res.is_none()); + assert_eq!(reader.statistics.write.seek, 1); + assert_eq!(reader.statistics.write.next, 0); + assert_eq!(reader.statistics.write.get, 0); + } } diff --git a/src/storage/mvcc/reader/scanner/forward.rs b/src/storage/mvcc/reader/scanner/forward.rs index c59c20fbe05..8828033c8a1 100644 --- a/src/storage/mvcc/reader/scanner/forward.rs +++ b/src/storage/mvcc/reader/scanner/forward.rs @@ -472,12 +472,21 @@ impl ScanPolicy for LatestKvPolicy { } WriteType::Delete => break None, WriteType::Lock | WriteType::Rollback => { - // Continue iterate next `write`. + if write.versions_to_last_change > 0 && write.last_change_ts.is_zero() { + break None; + } + if write.versions_to_last_change < SEEK_BOUND { + // Continue iterate next `write`. + cursors.write.next(&mut statistics.write); + } else { + // Seek to the expected version directly. + let commit_ts = write.last_change_ts; + let key_with_ts = current_user_key.clone().append_ts(commit_ts); + cursors.write.seek(&key_with_ts, &mut statistics.write)?; + } } } - cursors.write.next(&mut statistics.write); - if !cursors.write.valid()? { // Key space ended. Needn't move write cursor to next key. 
return Ok(HandleRes::Skip(current_user_key)); @@ -886,6 +895,8 @@ pub mod test_util { pub commit_ts: TimeStamp, pub for_update_ts: TimeStamp, pub old_value: OldValue, + pub last_change_ts: TimeStamp, + pub versions_to_last_change: u64, } impl Default for EntryBuilder { @@ -898,6 +909,8 @@ pub mod test_util { commit_ts: 0.into(), for_update_ts: 0.into(), old_value: OldValue::None, + last_change_ts: TimeStamp::zero(), + versions_to_last_change: 0, } } } @@ -931,6 +944,15 @@ pub mod test_util { self.old_value = OldValue::value(old_value.to_owned()); self } + pub fn last_change( + &mut self, + last_change_ts: TimeStamp, + versions_to_last_change: u64, + ) -> &mut Self { + self.last_change_ts = last_change_ts; + self.versions_to_last_change = versions_to_last_change; + self + } pub fn build_commit(&self, wt: WriteType, is_short_value: bool) -> TxnEntry { let write_key = Key::from_raw(&self.key).append_ts(self.commit_ts); let (key, value, short) = if is_short_value { @@ -949,7 +971,8 @@ pub mod test_util { None, ) }; - let write_value = Write::new(wt, self.start_ts, short); + let write_value = Write::new(wt, self.start_ts, short) + .set_last_change(self.last_change_ts, self.versions_to_last_change); TxnEntry::Commit { default: (key, value), write: (write_key.into_encoded(), write_value.as_ref().to_bytes()), @@ -984,7 +1007,8 @@ pub mod test_util { self.for_update_ts, 0, 0.into(), - ); + ) + .set_last_change(self.last_change_ts, self.versions_to_last_change); TxnEntry::Prewrite { default: (key, value), lock: (lock_key.into_encoded(), lock_value.to_bytes()), @@ -1586,6 +1610,86 @@ mod latest_kv_tests { ); scanner.next().unwrap_err(); } + + #[test] + fn test_skip_versions_by_seek() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + must_prewrite_put(&mut engine, b"k1", b"v11", b"k1", 1); + must_commit(&mut engine, b"k1", 1, 5); + must_prewrite_put(&mut engine, b"k1", b"v12", b"k1", 6); + must_commit(&mut engine, b"k1", 6, 8); + must_prewrite_put(&mut 
engine, b"k2", b"v21", b"k2", 2); + must_commit(&mut engine, b"k2", 2, 6); + must_prewrite_put(&mut engine, b"k4", b"v41", b"k4", 3); + must_commit(&mut engine, b"k4", 3, 7); + + for start_ts in (10..30).into_iter().step_by(2) { + must_prewrite_lock(&mut engine, b"k1", b"k1", start_ts); + must_commit(&mut engine, b"k1", start_ts, start_ts + 1); + must_prewrite_lock(&mut engine, b"k3", b"k1", start_ts); + must_commit(&mut engine, b"k3", start_ts, start_ts + 1); + must_prewrite_lock(&mut engine, b"k4", b"k1", start_ts); + must_commit(&mut engine, b"k4", start_ts, start_ts + 1); + } + + must_prewrite_put(&mut engine, b"k1", b"v13", b"k1", 40); + must_commit(&mut engine, b"k1", 40, 45); + must_prewrite_put(&mut engine, b"k2", b"v22", b"k2", 41); + must_commit(&mut engine, b"k2", 41, 46); + must_prewrite_put(&mut engine, b"k3", b"v32", b"k3", 42); + must_commit(&mut engine, b"k3", 42, 47); + + // KEY | COMMIT_TS | TYPE | VALUE + // ----|-----------|----------|------- + // k1 | 45 | PUT | v13 + // k1 | 29 | LOCK | + // k1 | 27 | LOCK | + // k1 | ... | LOCK | + // k1 | 11 | LOCK | + // k1 | 8 | PUT | v12 + // k1 | 5 | PUT | v1 + // k2 | 46 | PUT | v22 + // k2 | 6 | PUT | v21 + // k3 | 47 | PUT | v32 + // k3 | 29 | LOCK | + // k3 | 27 | LOCK | + // k3 | ... | LOCK | + // k3 | 11 | LOCK | + // k4 | 29 | LOCK | + // k4 | 27 | LOCK | + // k4 | ... 
| LOCK | + // k4 | 11 | LOCK | + // k4 | 7 | PUT | v41 + + let snapshot = engine.snapshot(Default::default()).unwrap(); + let mut scanner = ScannerBuilder::new(snapshot, 35.into()) + .range(None, None) + .build() + .unwrap(); + + assert_eq!( + scanner.next().unwrap(), + Some((Key::from_raw(b"k1"), b"v12".to_vec())) + ); + let stats = scanner.take_statistics(); + assert_eq!(stats.write.next, 3); // skip k1@45, k1@8, k1@5 + assert_eq!(stats.write.seek, 2); // seek beginning and k1@8 + + assert_eq!( + scanner.next().unwrap(), + Some((Key::from_raw(b"k2"), b"v21".to_vec())) + ); + scanner.take_statistics(); + + assert_eq!( + scanner.next().unwrap(), + Some((Key::from_raw(b"k4"), b"v41".to_vec())) + ); + let stats = scanner.take_statistics(); + assert_le!(stats.write.next, 1 + SEEK_BOUND as usize); // skip k2@6, near_seek to k4 (8 times next) + assert_eq!(stats.write.seek, 2); // seek k4, k4@7 + } } #[cfg(test)] @@ -2426,11 +2530,9 @@ mod delta_entry_tests { let mut entries_of_key = vec![]; if let Some((ts, lock_type, value)) = lock { - let max_commit_ts = writes - .last() - .cloned() - .map(|(_, commit_ts, ..)| commit_ts) - .unwrap_or(0); + let last_write = writes.last(); + let max_commit_ts = + last_write.map(|(_, commit_ts, ..)| *commit_ts).unwrap_or(0); let for_update_ts = std::cmp::max(*ts, max_commit_ts + 1); if *ts <= to_ts { @@ -2580,10 +2682,12 @@ mod delta_entry_tests { // Do assertions one by one so that if it fails it won't print too long panic // message. for i in 0..std::cmp::max(actual.len(), expected.len()) { + // We don't care about last_change_ts here. Use a trick to ignore them. 
+ let actual_erased = actual[i].erasing_last_change_ts(); assert_eq!( - actual[i], expected[i], + actual_erased, expected[i], "item {} not match: expected {:?}, but got {:?}", - i, &expected[i], &actual[i] + i, &expected[i], &actual_erased ); } }; diff --git a/src/storage/mvcc/reader/scanner/mod.rs b/src/storage/mvcc/reader/scanner/mod.rs index 664a4fed99e..5b87cca7f7a 100644 --- a/src/storage/mvcc/reader/scanner/mod.rs +++ b/src/storage/mvcc/reader/scanner/mod.rs @@ -4,6 +4,8 @@ mod backward; mod forward; +use std::ops::Bound; + use engine_traits::{CfName, CF_DEFAULT, CF_LOCK, CF_WRITE}; use kvproto::kvrpcpb::{ExtraOp, IsolationLevel}; use txn_types::{ @@ -330,8 +332,8 @@ impl ScannerConfig { .range(lower, upper) .fill_cache(self.fill_cache) .scan_mode(scan_mode) - .hint_min_ts(hint_min_ts) - .hint_max_ts(hint_max_ts) + .hint_min_ts(hint_min_ts.map(|ts| Bound::Included(ts))) + .hint_max_ts(hint_max_ts.map(|ts| Bound::Included(ts))) .build()?; Ok(cursor) } diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index a73f8b99027..4cc0ab57ffb 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -103,6 +103,10 @@ impl MvccTxn { self.write_size } + pub fn is_empty(&self) -> bool { + self.modifies.len() == 0 && self.locks_for_1pc.len() == 0 + } + pub(crate) fn put_lock(&mut self, key: Key, lock: &Lock) { let write = Modify::Put(CF_LOCK, key, lock.to_bytes()); self.write_size += write.size(); @@ -767,6 +771,7 @@ pub(crate) mod tests { need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, } } diff --git a/src/storage/raw/raw_mvcc.rs b/src/storage/raw/raw_mvcc.rs index 6d86203e8f2..8c4ad5da08b 100644 --- a/src/storage/raw/raw_mvcc.rs +++ b/src/storage/raw/raw_mvcc.rs @@ -232,11 +232,7 @@ impl Iterator for RawMvccIterator { #[cfg(test)] mod tests { - use std::{ - fmt::Debug, - iter::Iterator as StdIterator, - sync::mpsc::{channel, Sender}, - }; + use std::iter::Iterator as StdIterator; use 
api_version::{ApiV2, KvFormat, RawValue}; use engine_traits::{raw_ttl::ttl_to_expire_ts, CF_DEFAULT}; @@ -244,21 +240,13 @@ mod tests { use tikv_kv::{Engine, Iterator as EngineIterator, Modify, WriteData}; use super::*; - use crate::storage::{raw::encoded::RawEncodeSnapshot, TestEngineBuilder}; - - fn expect_ok_callback(done: Sender, id: i32) -> tikv_kv::Callback { - Box::new(move |x: tikv_kv::Result| { - x.unwrap(); - done.send(id).unwrap(); - }) - } + use crate::storage::{kv, raw::encoded::RawEncodeSnapshot, TestEngineBuilder}; #[test] fn test_raw_mvcc_snapshot() { // Use `Engine` to be independent to `Storage`. // Do not set "api version" to use `Engine` as a raw RocksDB. let mut engine = TestEngineBuilder::new().build().unwrap(); - let (tx, rx) = channel(); let ctx = Context::default(); // TODO: Consider another way other than hard coding, to generate keys' prefix @@ -291,10 +279,8 @@ mod tests { ApiV2::encode_raw_value_owned(raw_value), ); let batch = WriteData::from_modifies(vec![m]); - engine - .async_write(&ctx, batch, expect_ok_callback(tx.clone(), 0)) - .unwrap(); - rx.recv().unwrap(); + let res = futures::executor::block_on(kv::write(&engine, &ctx, batch, None)).unwrap(); + res.unwrap(); } // snapshot diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index 7c2f41d3e1b..8e7c4d95118 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -9,19 +9,30 @@ use crate::storage::{ metrics::{MVCC_CONFLICT_COUNTER, MVCC_DUPLICATE_CMD_COUNTER_VEC}, ErrorInner, MvccTxn, Result as MvccResult, SnapshotReader, }, - txn::actions::check_data_constraint::check_data_constraint, + txn::{ + actions::check_data_constraint::check_data_constraint, sched_pool::tls_can_enable, + scheduler::LAST_CHANGE_TS, + }, + types::PessimisticLockKeyResult, Snapshot, }; /// Acquires pessimistic lock on a single key. 
Optionally reads the previous /// value by the way. /// -/// When `need_value` is set, the first return value will be the previous value -/// of the key (possibly `None`). When `need_value` is not set but -/// `need_check_existence` is set, the first return value will be an empty value -/// (`Some(vec![])`) if the key exists before or `None` if not. If neither -/// `need_value` nor `need_check_existence` is set, the first return value is -/// always `None`. +/// When `need_value` is set, the first return value will be +/// `PessimisticLockKeyResult::Value`. When `need_value` is not set but +/// `need_check_existence` is set, the first return value will be +/// `PessimisticLockKeyResult::Existence`. If neither `need_value` nor +/// `need_check_existence` is set, the first return value will be +/// `PessimisticLockKeyResult::Empty`. +/// +/// If `allow_lock_with_conflict` is set, and the lock is acquired successfully +/// ignoring a write conflict, the first return value will be +/// `PessimisticLockKeyResult::LockedWithConflict` no matter how `need_value` +/// and `need_check_existence` are set, and the `for_update_ts` in +/// the actually-written lock will be equal to the `commit_ts` of the latest +/// Write record found on the key. /// /// The second return value will also contains the previous value of the key if /// `need_old_value` is set, or `OldValue::Unspecified` otherwise. 
@@ -32,13 +43,14 @@ pub fn acquire_pessimistic_lock( primary: &[u8], should_not_exist: bool, lock_ttl: u64, - for_update_ts: TimeStamp, + mut for_update_ts: TimeStamp, need_value: bool, need_check_existence: bool, min_commit_ts: TimeStamp, need_old_value: bool, lock_only_if_exists: bool, -) -> MvccResult<(Option, OldValue)> { + allow_lock_with_conflict: bool, +) -> MvccResult<(PessimisticLockKeyResult, OldValue)> { fail_point!("acquire_pessimistic_lock", |err| Err( crate::storage::mvcc::txn::make_txn_error(err, &key, reader.start_ts).into() )); @@ -54,9 +66,10 @@ pub fn acquire_pessimistic_lock( } .into()); } - // Update max_ts for Insert operation to guarantee linearizability and snapshot - // isolation - if should_not_exist { + // If any of `should_not_exist`, `need_value`, `need_check_existence` is set, + // it infers a read to the value, in which case max_ts need to be updated to + // guarantee the linearizability and snapshot isolation. + if should_not_exist || need_value || need_check_existence { txn.concurrency_manager.update_max_ts(for_update_ts); } @@ -64,7 +77,7 @@ pub fn acquire_pessimistic_lock( // `need_check_existence` and `need_old_value` are both set, we also load // the value even if `need_value` is false, so that it avoids // `load_old_value` doing repeated work. - let need_load_value = need_value || (need_check_existence && need_old_value); + let mut need_load_value = need_value || (need_check_existence && need_old_value); fn load_old_value( need_old_value: bool, @@ -90,19 +103,6 @@ pub fn acquire_pessimistic_lock( } } - /// Returns proper result according to the loaded value (if any) the - /// specified settings. - #[inline] - fn ret_val(need_value: bool, need_check_existence: bool, val: Option) -> Option { - if need_value { - val - } else if need_check_existence { - val.map(|_| vec![]) - } else { - None - } - } - let mut val = None; if let Some(lock) = reader.load_lock(&key)? 
{ if lock.ts != reader.start_ts { @@ -116,6 +116,32 @@ pub fn acquire_pessimistic_lock( } .into()); } + + let locked_with_conflict_ts = + if allow_lock_with_conflict && for_update_ts < lock.for_update_ts { + // If the key is already locked by the same transaction with larger + // for_update_ts, and the current request has + // `allow_lock_with_conflict` set, we must consider + // these possibilities: + // * If a previous request successfully locked the key with conflict, but the + // response is lost due to some errors such as RPC failures. In this case, we + // return like the current request's result is locked_with_conflict, for + // idempotency concern. + // * The key is locked by a newer request with larger for_update_ts, and the + // current request is stale. We can't distinguish this case with the above + // one, but we don't need to handle this case since no one would need the + // current request's result anymore. + + // Load value if locked_with_conflict, so that when the client (TiDB) need to + // read the value during statement retry, it will be possible to read the value + // from cache instead of RPC. + need_load_value = true; + for_update_ts = lock.for_update_ts; + Some(lock.for_update_ts) + } else { + None + }; + if need_load_value { val = reader.get(&key, for_update_ts)?; } else if need_check_existence { @@ -142,6 +168,8 @@ pub fn acquire_pessimistic_lock( ttl: lock_ttl, for_update_ts, min_commit_ts, + last_change_ts: lock.last_change_ts, + versions_to_last_change: lock.versions_to_last_change, }; txn.put_pessimistic_lock(key, lock); } else { @@ -149,11 +177,22 @@ pub fn acquire_pessimistic_lock( .acquire_pessimistic_lock .inc(); } - return Ok((ret_val(need_value, need_check_existence, val), old_value)); + return Ok(( + PessimisticLockKeyResult::new_success( + need_value, + need_check_existence, + locked_with_conflict_ts, + val, + ), + old_value, + )); } + let mut locked_with_conflict_ts = None; + // Following seek_write read the previous write. 
let (prev_write_loaded, mut prev_write) = (true, None); + let (mut last_change_ts, mut versions_to_last_change); if let Some((commit_ts, write)) = reader.seek_write(&key, TimeStamp::max())? { // Find a previous write. if need_old_value { @@ -168,15 +207,22 @@ pub fn acquire_pessimistic_lock( MVCC_CONFLICT_COUNTER .acquire_pessimistic_lock_conflict .inc(); - return Err(ErrorInner::WriteConflict { - start_ts: reader.start_ts, - conflict_start_ts: write.start_ts, - conflict_commit_ts: commit_ts, - key: key.into_raw()?, - primary: primary.to_vec(), - reason: WriteConflictReason::PessimisticRetry, + if allow_lock_with_conflict { + // TODO: New metrics. + locked_with_conflict_ts = Some(commit_ts); + for_update_ts = commit_ts; + need_load_value = true; + } else { + return Err(ErrorInner::WriteConflict { + start_ts: reader.start_ts, + conflict_start_ts: write.start_ts, + conflict_commit_ts: commit_ts, + key: key.into_raw()?, + primary: primary.to_vec(), + reason: WriteConflictReason::PessimisticRetry, + } + .into()); } - .into()); } // Handle rollback. @@ -211,10 +257,19 @@ pub fn acquire_pessimistic_lock( } } - // Check data constraint when acquiring pessimistic lock. - check_data_constraint(reader, should_not_exist, &write, commit_ts, &key)?; + // Check data constraint when acquiring pessimistic lock. But in case we are + // going to lock it with write conflict, we do not check it since the + // statement will then retry. + if locked_with_conflict_ts.is_none() { + check_data_constraint(reader, should_not_exist, &write, commit_ts, &key)?; + } + + (last_change_ts, versions_to_last_change) = write.next_last_change_info(commit_ts); - if need_value || need_check_existence { + // Load value if locked_with_conflict, so that when the client (TiDB) need to + // read the value during statement retry, it will be possible to read the value + // from cache instead of RPC. 
+ if need_value || need_check_existence || locked_with_conflict_ts.is_some() { val = match write.write_type { // If it's a valid Write, no need to read again. WriteType::Put @@ -238,6 +293,13 @@ pub fn acquire_pessimistic_lock( } }; } + } else { + // last_change_ts == 0 && versions_to_last_change > 0 means the key actually + // does not exist. + (last_change_ts, versions_to_last_change) = (TimeStamp::zero(), 1); + } + if !tls_can_enable(LAST_CHANGE_TS) { + (last_change_ts, versions_to_last_change) = (TimeStamp::zero(), 0); } let old_value = load_old_value( @@ -256,16 +318,26 @@ pub fn acquire_pessimistic_lock( ttl: lock_ttl, for_update_ts, min_commit_ts, + last_change_ts, + versions_to_last_change, }; - // When lock_only_if_exists is false, always accquire pessimitic lock, otherwise + // When lock_only_if_exists is false, always acquire pessimistic lock, otherwise // do it when val exists if !lock_only_if_exists || val.is_some() { txn.put_pessimistic_lock(key, lock); } // TODO don't we need to commit the modifies in txn? 
- Ok((ret_val(need_value, need_check_existence, val), old_value)) + Ok(( + PessimisticLockKeyResult::new_success( + need_value, + need_check_existence, + locked_with_conflict_ts, + val, + ), + old_value, + )) } pub mod tests { @@ -273,7 +345,7 @@ pub mod tests { use kvproto::kvrpcpb::Context; #[cfg(test)] use kvproto::kvrpcpb::PrewriteRequestPessimisticAction::*; - use txn_types::TimeStamp; + use txn_types::{Lock, TimeStamp}; use super::*; use crate::storage::{ @@ -292,6 +364,70 @@ pub mod tests { TestEngineBuilder, }; + #[cfg(test)] + pub fn acquire_pessimistic_lock_allow_lock_with_conflict( + engine: &mut E, + key: &[u8], + pk: &[u8], + start_ts: impl Into, + for_update_ts: impl Into, + need_value: bool, + need_check_existence: bool, + ) -> MvccResult { + let ctx = Context::default(); + let snapshot = engine.snapshot(Default::default()).unwrap(); + let cm = ConcurrencyManager::new(0.into()); + let start_ts = start_ts.into(); + let mut txn = MvccTxn::new(start_ts, cm); + let mut reader = SnapshotReader::new(start_ts, snapshot, true); + let res = acquire_pessimistic_lock( + &mut txn, + &mut reader, + Key::from_raw(key), + pk, + false, + 1, + for_update_ts.into(), + need_value, + need_check_existence, + 0.into(), + false, + false, + true, + ); + if res.is_ok() { + let modifies = txn.into_modifies(); + if !modifies.is_empty() { + engine + .write(&ctx, WriteData::from_modifies(modifies)) + .unwrap(); + } + } + res.map(|r| r.0) + } + + #[cfg(test)] + pub fn must_succeed_allow_lock_with_conflict( + engine: &mut E, + key: &[u8], + pk: &[u8], + start_ts: impl Into, + for_update_ts: impl Into, + need_value: bool, + need_check_existence: bool, + ) -> PessimisticLockKeyResult { + acquire_pessimistic_lock_allow_lock_with_conflict( + engine, + key, + pk, + start_ts, + for_update_ts, + need_value, + need_check_existence, + ) + .unwrap() + } + pub fn must_succeed_impl( engine: &mut E, key: &[u8], @@ -325,6 +461,7 @@ pub mod tests { min_commit_ts, false, lock_only_if_exists, + 
false, ) .unwrap(); let modifies = txn.into_modifies(); @@ -333,7 +470,19 @@ pub mod tests { .write(&ctx, WriteData::from_modifies(modifies)) .unwrap(); } - res.0 + // TODO: Adapt to new interface + match res.0 { + PessimisticLockKeyResult::Value(v) => v, + PessimisticLockKeyResult::Existence(e) => { + if e { + Some(vec![]) + } else { + None + } + } + PessimisticLockKeyResult::Empty => None, + res => panic!("unexpected result: {:?}", res), + } } pub fn must_succeed( @@ -494,6 +643,7 @@ pub mod tests { min_commit_ts, false, lock_only_if_exists, + false, ) .unwrap_err() } @@ -503,13 +653,14 @@ pub mod tests { key: &[u8], start_ts: impl Into, for_update_ts: impl Into, - ) { + ) -> Lock { let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new(snapshot, None, true); let lock = reader.load_lock(&Key::from_raw(key)).unwrap().unwrap(); assert_eq!(lock.ts, start_ts.into()); assert_eq!(lock.for_update_ts, for_update_ts.into()); assert_eq!(lock.lock_type, LockType::Pessimistic); + lock } #[test] @@ -1100,6 +1251,7 @@ pub mod tests { min_commit_ts, need_old_value, false, + false, ) .unwrap(); assert_eq!(old_value, OldValue::None); @@ -1151,6 +1303,7 @@ pub mod tests { min_commit_ts, need_old_value, false, + false, ) .unwrap(); assert_eq!( @@ -1185,6 +1338,7 @@ pub mod tests { min_commit_ts, true, false, + false, ) .unwrap(); assert_eq!( @@ -1228,6 +1382,7 @@ pub mod tests { min_commit_ts, need_old_value, false, + false, )?; Ok(old_value) }); @@ -1281,6 +1436,7 @@ pub mod tests { min_commit_ts, need_old_value, false, + false, ) .unwrap_err(); @@ -1315,6 +1471,7 @@ pub mod tests { min_commit_ts, need_old_value, false, + false, ) .unwrap_err(); } @@ -1460,4 +1617,219 @@ pub mod tests { } } } + + #[test] + fn test_calculate_last_change_ts() { + use engine_traits::CF_WRITE; + use pd_client::FeatureGate; + + use crate::storage::txn::sched_pool::set_tls_feature_gate; + + let mut engine = TestEngineBuilder::new().build().unwrap(); + let key = 
b"k"; + + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.4.0").unwrap(); + set_tls_feature_gate(feature_gate.clone()); + + // Latest version is a PUT, but last_change_ts is enabled with cluster version + // higher than 6.5.0. + let write = Write::new(WriteType::Put, 15.into(), Some(b"value".to_vec())); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(20.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + must_succeed(&mut engine, key, key, 10, 30); + let lock = must_pessimistic_locked(&mut engine, key, 10, 30); + assert_eq!(lock.last_change_ts, TimeStamp::zero()); + assert_eq!(lock.versions_to_last_change, 0); + pessimistic_rollback::tests::must_success(&mut engine, key, 10, 30); + // Set cluster version to 6.5.0, last_change_ts should work now. + feature_gate.set_version("6.5.0").unwrap(); + must_succeed(&mut engine, key, key, 10, 30); + let lock = must_pessimistic_locked(&mut engine, key, 10, 30); + assert_eq!(lock.last_change_ts, 20.into()); + assert_eq!(lock.versions_to_last_change, 1); + pessimistic_rollback::tests::must_success(&mut engine, key, 10, 30); + + // Latest version is a DELETE + let write = Write::new(WriteType::Delete, 40.into(), None); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(50.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + must_succeed(&mut engine, key, key, 60, 70); + let lock = must_pessimistic_locked(&mut engine, key, 60, 70); + assert_eq!(lock.last_change_ts, 50.into()); + assert_eq!(lock.versions_to_last_change, 1); + pessimistic_rollback::tests::must_success(&mut engine, key, 60, 70); + + // Latest version is a LOCK without last_change_ts + let write = Write::new(WriteType::Lock, 70.into(), None); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(75.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + must_succeed(&mut engine, key, key, 80, 80); + let lock = 
must_pessimistic_locked(&mut engine, key, 80, 80); + assert!(lock.last_change_ts.is_zero()); + assert_eq!(lock.versions_to_last_change, 0); + pessimistic_rollback::tests::must_success(&mut engine, key, 80, 80); + + // Latest version is a ROLLBACK without last_change_ts + let write = Write::new(WriteType::Lock, 90.into(), None); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(90.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + must_succeed(&mut engine, key, key, 95, 95); + let lock = must_pessimistic_locked(&mut engine, key, 95, 95); + assert!(lock.last_change_ts.is_zero()); + assert_eq!(lock.versions_to_last_change, 0); + pessimistic_rollback::tests::must_success(&mut engine, key, 95, 95); + + // Latest version is a LOCK with last_change_ts + let write = Write::new(WriteType::Lock, 100.into(), None).set_last_change(40.into(), 4); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(110.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + must_succeed(&mut engine, key, key, 120, 130); + let lock = must_pessimistic_locked(&mut engine, key, 120, 130); + assert_eq!(lock.last_change_ts, 40.into()); + assert_eq!(lock.versions_to_last_change, 5); + pessimistic_rollback::tests::must_success(&mut engine, key, 120, 130); + + // Latest version is a ROLLBACK with last_change_ts + let write = Write::new(WriteType::Rollback, 120.into(), None).set_last_change(40.into(), 5); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(120.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + must_succeed(&mut engine, key, key, 140, 140); + let lock = must_pessimistic_locked(&mut engine, key, 140, 140); + assert_eq!(lock.last_change_ts, 40.into()); + assert_eq!(lock.versions_to_last_change, 6); + pessimistic_rollback::tests::must_success(&mut engine, key, 140, 140); + + // Lock on a key with no write record + must_succeed(&mut engine, b"k2", b"k2", 150, 150); + let lock = 
must_pessimistic_locked(&mut engine, b"k2", 150, 150); + assert!(lock.last_change_ts.is_zero()); + assert_eq!(lock.versions_to_last_change, 1); + } + + #[test] + fn test_lock_with_conflict() { + use pessimistic_rollback::tests::must_success as must_pessimistic_rollback; + + let mut engine = TestEngineBuilder::new().build().unwrap(); + + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 10); + must_commit(&mut engine, b"k1", 10, 20); + + // Normal cases. + must_succeed_allow_lock_with_conflict(&mut engine, b"k1", b"k1", 10, 30, false, false) + .assert_empty(); + must_pessimistic_rollback(&mut engine, b"k1", 10, 30); + must_unlocked(&mut engine, b"k1"); + + must_succeed_allow_lock_with_conflict(&mut engine, b"k1", b"k1", 10, 30, false, true) + .assert_existence(true); + must_pessimistic_rollback(&mut engine, b"k1", 10, 30); + must_unlocked(&mut engine, b"k1"); + + must_succeed_allow_lock_with_conflict(&mut engine, b"k1", b"k1", 10, 30, true, false) + .assert_value(Some(b"v1")); + must_pessimistic_rollback(&mut engine, b"k1", 10, 30); + must_unlocked(&mut engine, b"k1"); + + must_succeed_allow_lock_with_conflict(&mut engine, b"k1", b"k1", 10, 30, true, true) + .assert_value(Some(b"v1")); + must_pessimistic_rollback(&mut engine, b"k1", 10, 30); + must_unlocked(&mut engine, b"k1"); + + // Conflicting cases. 
+ for &(need_value, need_check_existence) in + &[(false, false), (false, true), (true, false), (true, true)] + { + must_succeed_allow_lock_with_conflict( + &mut engine, + b"k1", + b"k1", + 10, + 15, + need_value, + need_check_existence, + ) + .assert_locked_with_conflict(Some(b"v1"), 20); + must_pessimistic_locked(&mut engine, b"k1", 10, 20); + must_pessimistic_rollback(&mut engine, b"k1", 10, 20); + must_unlocked(&mut engine, b"k1"); + } + + // Idempotency + must_succeed_allow_lock_with_conflict(&mut engine, b"k1", b"k1", 10, 50, false, false) + .assert_empty(); + must_succeed_allow_lock_with_conflict(&mut engine, b"k1", b"k1", 10, 40, false, false) + .assert_locked_with_conflict(Some(b"v1"), 50); + must_succeed_allow_lock_with_conflict(&mut engine, b"k1", b"k1", 10, 15, false, false) + .assert_locked_with_conflict(Some(b"v1"), 50); + must_pessimistic_locked(&mut engine, b"k1", 10, 50); + must_pessimistic_rollback(&mut engine, b"k1", 10, 50); + must_unlocked(&mut engine, b"k1"); + + // Lock waiting. 
+ must_succeed_allow_lock_with_conflict(&mut engine, b"k1", b"k1", 10, 50, false, false) + .assert_empty(); + let err = acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k1", + b"k1", + 11, + 55, + false, + false, + ) + .unwrap_err(); + assert!(matches!(err, MvccError(box ErrorInner::KeyIsLocked(_)))); + let err = acquire_pessimistic_lock_allow_lock_with_conflict( + &mut engine, + b"k1", + b"k1", + 9, + 9, + false, + false, + ) + .unwrap_err(); + assert!(matches!(err, MvccError(box ErrorInner::KeyIsLocked(_)))); + must_pessimistic_locked(&mut engine, b"k1", 10, 50); + must_pessimistic_rollback(&mut engine, b"k1", 10, 50); + must_unlocked(&mut engine, b"k1"); + } } diff --git a/src/storage/txn/actions/commit.rs b/src/storage/txn/actions/commit.rs index eb798090ba2..bfb1d39f768 100644 --- a/src/storage/txn/actions/commit.rs +++ b/src/storage/txn/actions/commit.rs @@ -91,7 +91,9 @@ pub fn commit( WriteType::from_lock_type(lock.lock_type).unwrap(), reader.start_ts, lock.short_value.take(), - ); + ) + .set_last_change(lock.last_change_ts, lock.versions_to_last_change) + .set_txn_source(lock.txn_source); for ts in &lock.rollback_ts { if *ts == commit_ts { @@ -116,7 +118,8 @@ pub mod tests { #[cfg(test)] use crate::storage::txn::tests::{ must_acquire_pessimistic_lock_for_large_txn, must_prewrite_delete, must_prewrite_lock, - must_prewrite_put, must_prewrite_put_for_large_txn, must_prewrite_put_impl, must_rollback, + must_prewrite_put, must_prewrite_put_for_large_txn, must_prewrite_put_impl, + must_prewrite_put_with_txn_soucre, must_rollback, }; #[cfg(test)] use crate::storage::{ @@ -320,4 +323,49 @@ pub mod tests { must_err(&mut engine, k, ts(60, 0), ts(65, 0)); must_succeed(&mut engine, k, ts(60, 0), ts(80, 0)); } + + #[test] + fn test_inherit_last_change_info_from_lock() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + let k = b"k"; + must_prewrite_put(&mut engine, k, b"v1", k, 5); + must_succeed(&mut engine, k, 5, 10); + + // 
WriteType is Lock + must_prewrite_lock(&mut engine, k, k, 15); + let lock = must_locked(&mut engine, k, 15); + assert_eq!(lock.last_change_ts, 10.into()); + assert_eq!(lock.versions_to_last_change, 1); + must_succeed(&mut engine, k, 15, 20); + let write = must_written(&mut engine, k, 15, 20, WriteType::Lock); + assert_eq!(write.last_change_ts, 10.into()); + assert_eq!(write.versions_to_last_change, 1); + + // WriteType is Put + must_prewrite_put(&mut engine, k, b"v2", k, 25); + let lock = must_locked(&mut engine, k, 25); + assert!(lock.last_change_ts.is_zero()); + assert_eq!(lock.versions_to_last_change, 0); + must_succeed(&mut engine, k, 25, 30); + let write = must_written(&mut engine, k, 25, 30, WriteType::Put); + assert!(write.last_change_ts.is_zero()); + assert_eq!(write.versions_to_last_change, 0); + } + + #[test] + fn test_2pc_with_txn_source() { + for source in [0x1, 0x85] { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + let k = b"k"; + // WriteType is Put + must_prewrite_put_with_txn_soucre(&mut engine, k, b"v2", k, 25, source); + let lock = must_locked(&mut engine, k, 25); + assert_eq!(lock.txn_source, source); + must_succeed(&mut engine, k, 25, 30); + let write = must_written(&mut engine, k, 25, 30, WriteType::Put); + assert_eq!(write.txn_source, source); + } + } } diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index 96f80b9389c..4b05c8eef8f 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -1,162 +1,243 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use txn_types::{Key, Lock, TimeStamp, Write, WriteType}; +use std::ops::Bound; + +use txn_types::{Key, Lock, LockType, TimeStamp, Write, WriteType}; use crate::storage::{ mvcc::{MvccReader, MvccTxn, SnapshotReader, MAX_TXN_WRITE_SIZE}, - txn::{actions::check_txn_status::rollback_lock, Error, ErrorInner, Result as TxnResult}, - Snapshot, Statistics, + txn::{actions::check_txn_status::rollback_lock, Result as TxnResult}, + Snapshot, }; pub const FLASHBACK_BATCH_SIZE: usize = 256 + 1 /* To store the next key for multiple batches */; -pub fn flashback_to_version_read_lock( - reader: &mut MvccReader, - next_lock_key: &Option, - end_key: &Option, - statistics: &mut Statistics, -) -> TxnResult<(Vec<(Key, Lock)>, bool)> { - if next_lock_key.is_none() { - return Ok((vec![], false)); - } - let key_locks_result = reader.scan_locks( - next_lock_key.as_ref(), - end_key.as_ref(), - // To flashback `CF_LOCK`, we need to delete all locks. +pub fn flashback_to_version_read_lock( + reader: &mut MvccReader, + next_lock_key: Key, + end_key: &Key, +) -> TxnResult> { + let result = reader.scan_locks( + Some(&next_lock_key), + Some(end_key), |_| true, FLASHBACK_BATCH_SIZE, ); - statistics.add(&reader.statistics); - Ok(key_locks_result?) + let (key_locks, _) = result?; + Ok(key_locks) } -pub fn flashback_to_version_read_write( - reader: &mut MvccReader, - key_locks_len: usize, - next_write_key: &Option, - end_key: &Option, +pub fn flashback_to_version_read_write( + reader: &mut MvccReader, + next_write_key: Key, + start_key: &Key, + end_key: &Key, flashback_version: TimeStamp, - flashback_start_ts: TimeStamp, flashback_commit_ts: TimeStamp, - statistics: &mut Statistics, -) -> TxnResult<(Vec<(Key, Option)>, bool)> { - if next_write_key.is_none() { - return Ok((vec![], false)); - } else if key_locks_len >= FLASHBACK_BATCH_SIZE { - // The batch is full, we need to read the writes in the next batch later. 
- return Ok((vec![], true)); - } - // To flashback the data, we need to get all the latest keys first by scanning - // every unique key in `CF_WRITE` and to get its corresponding old MVCC write - // record if exists. - let (key_ts_old_writes, has_remain_writes) = reader.scan_writes( - next_write_key.as_ref(), - end_key.as_ref(), - Some(flashback_version), - // No need to find an old version for the key if its latest `commit_ts` is smaller - // than or equal to the version. - |key| key.decode_ts().unwrap_or(TimeStamp::zero()) > flashback_version, - FLASHBACK_BATCH_SIZE - key_locks_len, - )?; - statistics.add(&reader.statistics); - let mut key_old_writes = Vec::with_capacity(FLASHBACK_BATCH_SIZE - key_locks_len); - // Check the latest commit ts to make sure there is no commit change during the - // flashback, otherwise, we need to abort the flashback. - for (key, commit_ts, old_write) in key_ts_old_writes { - if commit_ts > flashback_commit_ts { - return Err(Error::from(ErrorInner::InvalidTxnTso { - start_ts: flashback_start_ts, - commit_ts: flashback_commit_ts, - })); - } - // Since the first flashback preparation phase make sure there will be no writes - // other than flashback after it, so we need to check if there is already a - // successful flashback result, and if so, just finish the flashback ASAP. - if commit_ts == flashback_commit_ts { - key_old_writes.clear(); - return Ok((key_old_writes, false)); - } - key_old_writes.push((key, old_write)); - } - Ok((key_old_writes, has_remain_writes)) +) -> TxnResult> { + // Filter out the SST that does not have a newer version than + // `flashback_version` in `CF_WRITE`, i.e, whose latest `commit_ts` <= + // `flashback_version`. By doing this, we can only flashback those keys that + // have version changed since `flashback_version` as much as possible. 
+ reader.set_hint_min_ts(Some(Bound::Excluded(flashback_version))); + // To flashback the data, we need to get all the latest visible keys first by + // scanning every unique key in `CF_WRITE`. + let keys_result = reader.scan_latest_user_keys( + Some(&next_write_key), + Some(end_key), + |key, latest_commit_ts| { + // There is no any other write could happen after the flashback begins. + assert!(latest_commit_ts <= flashback_commit_ts); + // - Skip the `start_key` which as prewrite key. + // - No need to find an old version for the key if its latest `commit_ts` is + // smaller than or equal to the flashback version. + // - No need to flashback a key twice if its latest `commit_ts` is equal to the + // flashback `commit_ts`. + key != start_key + && latest_commit_ts > flashback_version + && latest_commit_ts < flashback_commit_ts + }, + FLASHBACK_BATCH_SIZE, + ); + let (keys, _) = keys_result?; + Ok(keys) } -pub fn flashback_to_version( +// At the very first beginning of flashback, we need to rollback all locks in +// `CF_LOCK`. +pub fn rollback_locks( txn: &mut MvccTxn, - reader: &mut SnapshotReader, - next_lock_key: &mut Option, - next_write_key: &mut Option, + snapshot: impl Snapshot, key_locks: Vec<(Key, Lock)>, - key_old_writes: Vec<(Key, Option)>, - start_ts: TimeStamp, - commit_ts: TimeStamp, -) -> TxnResult { - // To flashback the `CF_LOCK`, we need to delete all locks records whose - // `start_ts` is greater than the specified version, and if it's not a - // short-value `LockType::Put`, we need to delete the actual data from - // `CF_DEFAULT` as well. - // TODO: `resolved_ts` should be taken into account. 
+) -> TxnResult> { + let mut reader = SnapshotReader::new(txn.start_ts, snapshot, false); for (key, lock) in key_locks { if txn.write_size() >= MAX_TXN_WRITE_SIZE { - *next_lock_key = Some(key); - break; + return Ok(Some(key)); } // To guarantee rollback with start ts of the locks reader.start_ts = lock.ts; rollback_lock( txn, - reader, + &mut reader, key.clone(), &lock, lock.is_pessimistic_txn(), true, )?; } - // To flashback the `CF_WRITE` and `CF_DEFAULT`, we need to write a new MVCC - // record for each key in `self.keys` with its old value at `self.version`, - // specifically, the flashback will have the following behavior: - // - If a key doesn't exist at `self.version`, it will be put a - // `WriteType::Delete`. - // - If a key exists at `self.version`, it will be put the exact same record - // in `CF_WRITE` and `CF_DEFAULT` if needed with `self.commit_ts` and - // `self.start_ts`. - for (key, old_write) in key_old_writes { + Ok(None) +} + +// To flashback the `CF_WRITE` and `CF_DEFAULT`, we need to write a new MVCC +// record for each key in keys with its old value at `flashback_version`, +// specifically, the flashback will have the following behavior: +// - If a key doesn't exist or isn't invisible at `flashback_version`, it will +// be put a `WriteType::Delete`. +// - If a key exists and is visible at `flashback_version`, it will be put the +// exact same record in `CF_WRITE` and `CF_DEFAULT` with `self.commit_ts` +// and `self.start_ts`. 
+pub fn flashback_to_version_write( + txn: &mut MvccTxn, + reader: &mut MvccReader, + keys: Vec, + flashback_version: TimeStamp, + flashback_start_ts: TimeStamp, + flashback_commit_ts: TimeStamp, +) -> TxnResult> { + for key in keys { + #[cfg(feature = "failpoints")] + { + let should_skip = || { + fail::fail_point!("flashback_skip_1_key_in_write", |_| true); + false + }; + if should_skip() { + continue; + } + } if txn.write_size() >= MAX_TXN_WRITE_SIZE { - *next_write_key = Some(key); - break; + return Ok(Some(key.clone())); } + let old_write = reader.get_write(&key, flashback_version, None)?; let new_write = if let Some(old_write) = old_write { - // If it's not a short value and it's a `WriteType::Put`, we should put the old + // If it's a `WriteType::Put` without the short value, we should put the old // value in `CF_DEFAULT` with `self.start_ts` as well. - if old_write.short_value.is_none() && old_write.write_type == WriteType::Put { + if old_write.write_type == WriteType::Put && old_write.short_value.is_none() { txn.put_value( key.clone(), - start_ts, + flashback_start_ts, reader.load_data(&key, old_write.clone())?, ); } - Write::new(old_write.write_type, start_ts, old_write.short_value) + Write::new( + old_write.write_type, + flashback_start_ts, + old_write.short_value.clone(), + ) } else { // If the old write doesn't exist, we should put a `WriteType::Delete` record to // delete the current key when needed. - if let Some((_, latest_write)) = reader.seek_write(&key, commit_ts)? { - if latest_write.write_type == WriteType::Delete { - continue; - } - } - Write::new(WriteType::Delete, start_ts, None) + Write::new(WriteType::Delete, flashback_start_ts, None) }; - txn.put_write(key.clone(), commit_ts, new_write.as_ref().to_bytes()); + txn.put_write(key, flashback_commit_ts, new_write.as_ref().to_bytes()); } - Ok(txn.modifies.len()) + Ok(None) +} + +// Prewrite the `key_to_lock`, namely the `self.start_key`, to do a special 2PC +// transaction. 
+pub fn prewrite_flashback_key( + txn: &mut MvccTxn, + reader: &mut MvccReader, + key_to_lock: &Key, + flashback_version: TimeStamp, + flashback_start_ts: TimeStamp, +) -> TxnResult<()> { + let old_write = reader.get_write(key_to_lock, flashback_version, None)?; + // Flashback the value in `CF_DEFAULT` as well if the old write is a + // `WriteType::Put` without the short value. + if let Some(old_write) = old_write.as_ref() { + if old_write.write_type == WriteType::Put + && old_write.short_value.is_none() + // If the value with `flashback_start_ts` already exists, we don't need to write again. + && reader.get_value(key_to_lock, flashback_start_ts)?.is_none() + { + txn.put_value( + key_to_lock.clone(), + flashback_start_ts, + reader.load_data(key_to_lock, old_write.clone())?, + ); + } + } + txn.put_lock( + key_to_lock.clone(), + &Lock::new( + old_write.as_ref().map_or(LockType::Delete, |write| { + if write.write_type == WriteType::Delete { + LockType::Delete + } else { + LockType::Put + } + }), + key_to_lock.as_encoded().to_vec(), + flashback_start_ts, + 0, + old_write.and_then(|write| write.short_value), + TimeStamp::zero(), + 1, + TimeStamp::zero(), + ), + ); + Ok(()) +} + +pub fn commit_flashback_key( + txn: &mut MvccTxn, + reader: &mut MvccReader, + key_to_commit: &Key, + flashback_start_ts: TimeStamp, + flashback_commit_ts: TimeStamp, +) -> TxnResult<()> { + if let Some(mut lock) = reader.load_lock(key_to_commit)? 
{ + txn.put_write( + key_to_commit.clone(), + flashback_commit_ts, + Write::new( + WriteType::from_lock_type(lock.lock_type).unwrap(), + flashback_start_ts, + lock.short_value.take(), + ) + .set_last_change(lock.last_change_ts, lock.versions_to_last_change) + .set_txn_source(lock.txn_source) + .as_ref() + .to_bytes(), + ); + txn.unlock_key( + key_to_commit.clone(), + lock.is_pessimistic_txn(), + flashback_commit_ts, + ); + } + Ok(()) +} + +pub fn get_first_user_key( + reader: &mut MvccReader, + start_key: &Key, + end_key: &Key, +) -> TxnResult> { + let (mut keys_result, _) = + reader.scan_latest_user_keys(Some(start_key), Some(end_key), |_, _| true, 1)?; + Ok(keys_result.pop()) } #[cfg(test)] pub mod tests { use concurrency_manager::ConcurrencyManager; - use kvproto::kvrpcpb::Context; + use kvproto::kvrpcpb::{Context, PrewriteRequestPessimisticAction::DoPessimisticCheck}; use tikv_kv::ScanMode; - use txn_types::TimeStamp; + use txn_types::{TimeStamp, SHORT_VALUE_MAX_LEN}; use super::*; use crate::storage::{ @@ -172,56 +253,107 @@ pub mod tests { Engine, TestEngineBuilder, }; - fn must_flashback_to_version( + fn must_rollback_lock( + engine: &mut E, + key: &[u8], + start_ts: impl Into, + ) -> usize { + let next_key = Key::from_raw(keys::next_key(key).as_slice()); + let key = Key::from_raw(key); + let ctx = Context::default(); + let snapshot = engine.snapshot(Default::default()).unwrap(); + let mut reader = MvccReader::new_with_ctx(snapshot.clone(), Some(ScanMode::Forward), &ctx); + let key_locks = flashback_to_version_read_lock(&mut reader, key, &next_key).unwrap(); + let cm = ConcurrencyManager::new(TimeStamp::zero()); + let mut txn = MvccTxn::new(start_ts.into(), cm); + rollback_locks(&mut txn, snapshot, key_locks).unwrap(); + let rows = txn.modifies.len(); + write(engine, &ctx, txn.into_modifies()); + rows + } + + fn must_prewrite_flashback_key( + engine: &mut E, + key: &[u8], + version: impl Into, + start_ts: impl Into, + ) -> usize { + let (version, start_ts) = 
(version.into(), start_ts.into()); + let cm = ConcurrencyManager::new(TimeStamp::zero()); + let mut txn = MvccTxn::new(start_ts, cm); + let snapshot = engine.snapshot(Default::default()).unwrap(); + let ctx = Context::default(); + let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); + let prewrite_key = if let Some(first_key) = + get_first_user_key(&mut reader, &Key::from_raw(key), &Key::from_raw(b"z")).unwrap() + { + first_key + } else { + // If the key is None return directly + return 0; + }; + prewrite_flashback_key(&mut txn, &mut reader, &prewrite_key, version, start_ts).unwrap(); + let rows = txn.modifies.len(); + write(engine, &ctx, txn.into_modifies()); + rows + } + + fn must_flashback_write_to_version( engine: &mut E, key: &[u8], version: impl Into, start_ts: impl Into, commit_ts: impl Into, ) -> usize { + let next_key = Key::from_raw(keys::next_key(key).as_slice()); let key = Key::from_raw(key); let (version, start_ts, commit_ts) = (version.into(), start_ts.into(), commit_ts.into()); let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); - let mut statistics = Statistics::default(); - let (key_locks, has_remain_locks) = - flashback_to_version_read_lock(&mut reader, &Some(key.clone()), &None, &mut statistics) - .unwrap(); - assert!(!has_remain_locks); - let (key_old_writes, has_remain_writes) = flashback_to_version_read_write( + // Flashback the writes. 
+ let keys = flashback_to_version_read_write( &mut reader, - 0, - &Some(key.clone()), - &None, + key, + &Key::from_raw(b""), + &next_key, version, - start_ts, commit_ts, - &mut statistics, ) .unwrap(); - assert!(!has_remain_writes); + let cm = ConcurrencyManager::new(TimeStamp::zero()); + let mut txn = MvccTxn::new(start_ts, cm); + flashback_to_version_write(&mut txn, &mut reader, keys, version, start_ts, commit_ts) + .unwrap(); + let rows = txn.modifies.len(); + write(engine, &ctx, txn.into_modifies()); + rows + } + + fn must_commit_flashback_key( + engine: &mut E, + key: &[u8], + start_ts: impl Into, + commit_ts: impl Into, + ) -> usize { + let (start_ts, commit_ts) = (start_ts.into(), commit_ts.into()); let cm = ConcurrencyManager::new(TimeStamp::zero()); let mut txn = MvccTxn::new(start_ts, cm); let snapshot = engine.snapshot(Default::default()).unwrap(); - let mut reader = SnapshotReader::new_with_ctx(version, snapshot, &ctx); - let rows = flashback_to_version( - &mut txn, - &mut reader, - &mut None, - &mut Some(key), - key_locks, - key_old_writes, - start_ts, - commit_ts, - ) - .unwrap(); + let ctx = Context::default(); + let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); + let key_to_lock = + get_first_user_key(&mut reader, &Key::from_raw(key), &Key::from_raw(b"z")) + .unwrap() + .unwrap(); + commit_flashback_key(&mut txn, &mut reader, &key_to_lock, start_ts, commit_ts).unwrap(); + let rows = txn.modifies.len(); write(engine, &ctx, txn.into_modifies()); rows } #[test] - fn test_flashback_to_version() { + fn test_flashback_write_to_version() { let mut engine = TestEngineBuilder::new().build().unwrap(); let mut ts = TimeStamp::zero(); let k = b"k"; @@ -249,50 +381,50 @@ pub mod tests { must_get(&mut engine, k, *ts.incr(), v2); // Flashback to version 1 with start_ts = 14, commit_ts = 15. 
assert_eq!( - must_flashback_to_version(&mut engine, k, 1, *ts.incr(), *ts.incr()), + must_flashback_write_to_version(&mut engine, k, 1, *ts.incr(), *ts.incr()), 1 ); must_get_none(&mut engine, k, *ts.incr()); // Flashback to version 2 with start_ts = 17, commit_ts = 18. assert_eq!( - must_flashback_to_version(&mut engine, k, 2, *ts.incr(), *ts.incr()), + must_flashback_write_to_version(&mut engine, k, 2, *ts.incr(), *ts.incr()), 1 ); must_get(&mut engine, k, *ts.incr(), v1); // Flashback to version 5 with start_ts = 20, commit_ts = 21. assert_eq!( - must_flashback_to_version(&mut engine, k, 5, *ts.incr(), *ts.incr()), + must_flashback_write_to_version(&mut engine, k, 5, *ts.incr(), *ts.incr()), 1 ); must_get(&mut engine, k, *ts.incr(), v1); // Flashback to version 7 with start_ts = 23, commit_ts = 24. assert_eq!( - must_flashback_to_version(&mut engine, k, 7, *ts.incr(), *ts.incr()), + must_flashback_write_to_version(&mut engine, k, 7, *ts.incr(), *ts.incr()), 1 ); must_get(&mut engine, k, *ts.incr(), v1); // Flashback to version 10 with start_ts = 26, commit_ts = 27. assert_eq!( - must_flashback_to_version(&mut engine, k, 10, *ts.incr(), *ts.incr()), + must_flashback_write_to_version(&mut engine, k, 10, *ts.incr(), *ts.incr()), 1 ); must_get_none(&mut engine, k, *ts.incr()); // Flashback to version 13 with start_ts = 29, commit_ts = 30. assert_eq!( - must_flashback_to_version(&mut engine, k, 13, *ts.incr(), *ts.incr()), + must_flashback_write_to_version(&mut engine, k, 13, *ts.incr(), *ts.incr()), 1 ); must_get(&mut engine, k, *ts.incr(), v2); // Flashback to version 27 with start_ts = 32, commit_ts = 33. 
assert_eq!( - must_flashback_to_version(&mut engine, k, 27, *ts.incr(), *ts.incr()), + must_flashback_write_to_version(&mut engine, k, 27, *ts.incr(), *ts.incr()), 1 ); must_get_none(&mut engine, k, *ts.incr()); } #[test] - fn test_flashback_to_version_deleted() { + fn test_flashback_write_to_version_deleted() { let mut engine = TestEngineBuilder::new().build().unwrap(); let mut ts = TimeStamp::zero(); let (k, v) = (b"k", b"v"); @@ -301,19 +433,17 @@ pub mod tests { must_get(&mut engine, k, ts, v); must_prewrite_delete(&mut engine, k, k, *ts.incr()); must_commit(&mut engine, k, ts, *ts.incr()); - // Since the key has been deleted, flashback to version 1 should not do - // anything. + // Though the key has been deleted, flashback to version 1 still needs to write + // a new `WriteType::Delete` with the flashback `commit_ts`. assert_eq!( - must_flashback_to_version(&mut engine, k, 1, *ts.incr(), *ts.incr()), - 0 + must_flashback_write_to_version(&mut engine, k, 1, *ts.incr(), *ts.incr()), + 1 ); must_get_none(&mut engine, k, ts); } #[test] - fn test_flashback_to_version_pessimistic() { - use kvproto::kvrpcpb::PrewriteRequestPessimisticAction::*; - + fn test_flashback_write_to_version_pessimistic() { let mut engine = TestEngineBuilder::new().build().unwrap(); let k = b"k"; let (v1, v2, v3) = (b"v1", b"v2", b"v3"); @@ -330,7 +460,11 @@ pub mod tests { // Flashback to version 17 with start_ts = 35, commit_ts = 40. // Distinguish from pessimistic start_ts 30 to make sure rollback ts is by lock // ts. - assert_eq!(must_flashback_to_version(&mut engine, k, 17, 35, 40), 3); + assert_eq!(must_rollback_lock(&mut engine, k, 35), 2); + assert_eq!( + must_flashback_write_to_version(&mut engine, k, 17, 35, 40), + 1 + ); // Pessimistic Prewrite Put(k -> v3) with stat_ts = 30 will be error with // Rollback. 
@@ -339,7 +473,7 @@ pub mod tests { } #[test] - fn test_duplicated_flashback_to_version() { + fn test_duplicated_flashback_write_to_version() { let mut engine = TestEngineBuilder::new().build().unwrap(); let mut ts = TimeStamp::zero(); let (k, v) = (b"k", b"v"); @@ -349,15 +483,129 @@ pub mod tests { let start_ts = *ts.incr(); let commit_ts = *ts.incr(); assert_eq!( - must_flashback_to_version(&mut engine, k, 1, start_ts, commit_ts), + must_flashback_write_to_version(&mut engine, k, 1, start_ts, commit_ts), 1 ); must_get_none(&mut engine, k, ts); // Flashback again with the same `start_ts` and `commit_ts` should not do // anything. assert_eq!( - must_flashback_to_version(&mut engine, k, 1, start_ts, commit_ts), + must_flashback_write_to_version(&mut engine, k, 1, start_ts, commit_ts), + 0 + ); + } + + #[test] + fn test_duplicated_prewrite_flashback_key() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + let mut ts = TimeStamp::zero(); + let (k, v) = (b"k", [u8::MAX; SHORT_VALUE_MAX_LEN + 1]); + must_prewrite_put(&mut engine, k, &v, k, *ts.incr()); + must_commit(&mut engine, k, ts, *ts.incr()); + must_get(&mut engine, k, ts, &v); + + let flashback_start_ts = *ts.incr(); + // Rollback nothing. + assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 0); + // Lock and write the value of `k`. + assert_eq!( + must_prewrite_flashback_key(&mut engine, k, 2, flashback_start_ts), + 2 + ); + // Retry Prepare + // Unlock `k`, put rollback record and delete the value of `k`. + assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 3); + // Lock and write the value of `k`. + assert_eq!( + must_prewrite_flashback_key(&mut engine, k, 2, flashback_start_ts), + 2 + ); + // Retry Prepare + // Only unlock `k` since there is an overlapped rollback record. + assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 1); + // Only lock `k` since the value of `k` has already existed. 
+ assert_eq!( + must_prewrite_flashback_key(&mut engine, k, 2, flashback_start_ts), + 1 + ); + } + + #[test] + fn test_prewrite_with_special_key() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + let mut ts = TimeStamp::zero(); + let (prewrite_key, prewrite_val) = (b"b", b"val"); + must_prewrite_put( + &mut engine, + prewrite_key, + prewrite_val, + prewrite_key, + *ts.incr(), + ); + must_commit(&mut engine, prewrite_key, ts, *ts.incr()); + must_get(&mut engine, prewrite_key, ts, prewrite_val); + let (k, v1, v2) = (b"c", b"v1", b"v2"); + must_prewrite_put(&mut engine, k, v1, k, *ts.incr()); + must_commit(&mut engine, k, ts, *ts.incr()); + must_prewrite_put(&mut engine, k, v2, k, *ts.incr()); + must_commit(&mut engine, k, ts, *ts.incr()); + must_get(&mut engine, k, ts, v2); + // Check for prewrite key b"b". + let ctx = Context::default(); + let snapshot = engine.snapshot(Default::default()).unwrap(); + let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); + let first_key = get_first_user_key(&mut reader, &Key::from_raw(b""), &Key::from_raw(b"z")) + .unwrap_or_else(|_| Some(Key::from_raw(b""))) + .unwrap(); + assert_eq!(first_key, Key::from_raw(prewrite_key)); + + // case 1: start key is before all keys, flashback b"c". + let start_key = b"a"; + let (flashback_start_ts, flashback_commit_ts) = (*ts.incr(), *ts.incr()); + // Rollback nothing. + assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 0); + // Prewrite "prewrite_key" not "start_key". + assert_eq!( + must_prewrite_flashback_key(&mut engine, start_key, 4, flashback_start_ts), + 1 + ); + // Flashback (b"c", v2) to (b"c", v1). + assert_eq!( + must_flashback_write_to_version( + &mut engine, + k, + 4, + flashback_start_ts, + flashback_commit_ts + ), + 1 + ); + // Put prewrite record and Unlock, will commit "prewrite_key" not "start_key". 
+ assert_eq!( + must_commit_flashback_key( + &mut engine, + start_key, + flashback_start_ts, + flashback_commit_ts + ), + 2 + ); + must_get(&mut engine, k, ts, v1); + must_get(&mut engine, prewrite_key, ts, prewrite_val); + + // case 2: start key is after all keys, prewrite will return None. + let start_key = b"d"; + let flashback_start_ts = *ts.incr(); + // Rollback nothing. + assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 0); + // Prewrite null. + assert_eq!( + must_prewrite_flashback_key(&mut engine, start_key, 4, flashback_start_ts), 0 ); + // case 3: start key is valid, end_key is invalid, prewrite key will be None. + let first_key = get_first_user_key(&mut reader, &Key::from_raw(b"a"), &Key::from_raw(b"")) + .unwrap_or_else(|_| Some(Key::from_raw(b""))); + assert_eq!(first_key, None); } } diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index 40709032d61..f2de9df0004 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -21,7 +21,10 @@ use crate::storage::{ }, Error, ErrorInner, Lock, LockType, MvccTxn, Result, SnapshotReader, }, - txn::{actions::check_data_constraint::check_data_constraint, LockInfo}, + txn::{ + actions::check_data_constraint::check_data_constraint, sched_pool::tls_can_enable, + scheduler::LAST_CHANGE_TS, LockInfo, + }, Snapshot, }; @@ -62,7 +65,7 @@ pub fn prewrite( let lock_status = match reader.load_lock(&mutation.key)? 
{ Some(lock) => mutation.check_lock(lock, pessimistic_action)?, None if matches!(pessimistic_action, DoPessimisticCheck) => { - amend_pessimistic_lock(&mutation, reader)?; + amend_pessimistic_lock(&mut mutation, reader)?; lock_amended = true; LockStatus::None } @@ -169,6 +172,7 @@ pub struct TransactionProperties<'a> { pub need_old_value: bool, pub is_retry_request: bool, pub assertion_level: AssertionLevel, + pub txn_source: u64, } impl<'a> TransactionProperties<'a> { @@ -236,6 +240,8 @@ struct PrewriteMutation<'a> { lock_type: Option, lock_ttl: u64, + last_change_ts: TimeStamp, + versions_to_last_change: u64, should_not_exist: bool, should_not_write: bool, @@ -273,6 +279,8 @@ impl<'a> PrewriteMutation<'a> { lock_type, lock_ttl: txn_props.lock_ttl, + last_change_ts: TimeStamp::zero(), + versions_to_last_change: 0, should_not_exist, should_not_write, @@ -320,6 +328,9 @@ impl<'a> PrewriteMutation<'a> { return Err(ErrorInner::KeyIsLocked(self.lock_info(lock)?).into()); } + self.last_change_ts = lock.last_change_ts; + self.versions_to_last_change = lock.versions_to_last_change; + if lock.lock_type == LockType::Pessimistic { // TODO: remove it in future if !self.txn_props.is_pessimistic() { @@ -350,7 +361,7 @@ impl<'a> PrewriteMutation<'a> { } fn check_for_newer_version( - &self, + &mut self, reader: &mut SnapshotReader, ) -> Result> { let mut seek_ts = TimeStamp::max(); @@ -365,6 +376,10 @@ impl<'a> PrewriteMutation<'a> { // TODO: Maybe we need to add a new error for the rolled back case. 
self.write_conflict_error(&write, commit_ts, WriteConflictReason::SelfRolledBack)?; } + if seek_ts == TimeStamp::max() { + (self.last_change_ts, self.versions_to_last_change) = + write.next_last_change_info(commit_ts); + } match self.txn_props.kind { TransactionKind::Optimistic(_) => { if commit_ts > self.txn_props.start_ts { @@ -424,6 +439,12 @@ impl<'a> PrewriteMutation<'a> { return Ok(Some((write, commit_ts))); } + // If seek_ts is max and it goes here, there is no write record for this key. + if seek_ts == TimeStamp::max() { + // last_change_ts == 0 && versions_to_last_change > 0 means the key actually + // does not exist. + (self.last_change_ts, self.versions_to_last_change) = (TimeStamp::zero(), 1); + } Ok(None) } @@ -439,7 +460,13 @@ impl<'a> PrewriteMutation<'a> { self.txn_props.for_update_ts(), self.txn_props.txn_size, self.min_commit_ts, - ); + ) + .set_txn_source(self.txn_props.txn_source); + // Only Lock needs to record `last_change_ts` in its write record, Put or Delete + // records themselves are effective changes. + if tls_can_enable(LAST_CHANGE_TS) && self.lock_type == Some(LockType::Lock) { + lock = lock.set_last_change(self.last_change_ts, self.versions_to_last_change); + } if let Some(value) = self.value { if is_short_value(&value) { @@ -503,7 +530,7 @@ impl<'a> PrewriteMutation<'a> { } fn check_assertion( - &self, + &mut self, reader: &mut SnapshotReader, write: &Option<(Write, TimeStamp)>, write_loaded: bool, @@ -694,11 +721,11 @@ fn async_commit_timestamps( // If the data is not changed after acquiring the lock, we can still prewrite // the key. fn amend_pessimistic_lock( - mutation: &PrewriteMutation<'_>, + mutation: &mut PrewriteMutation<'_>, reader: &mut SnapshotReader, ) -> Result<()> { let write = reader.seek_write(&mutation.key, TimeStamp::max())?; - if let Some((commit_ts, _)) = write.as_ref() { + if let Some((commit_ts, write)) = write.as_ref() { // The invariants of pessimistic locks are: // 1. 
lock's for_update_ts >= key's latest commit_ts // 2. lock's for_update_ts >= txn's start_ts @@ -727,6 +754,12 @@ fn amend_pessimistic_lock( } .into()); } + (mutation.last_change_ts, mutation.versions_to_last_change) = + write.next_last_change_info(*commit_ts); + } else { + // last_change_ts == 0 && versions_to_last_change > 0 means the key actually + // does not exist. + (mutation.last_change_ts, mutation.versions_to_last_change) = (TimeStamp::zero(), 1); } // Used pipelined pessimistic lock acquiring in this txn but failed // Luckily no other txn modified this lock, amend it by treat it as optimistic @@ -774,6 +807,7 @@ pub mod tests { need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, } } @@ -800,6 +834,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, } } @@ -1112,6 +1147,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }, Mutation::make_check_not_exists(Key::from_raw(key)), &None, @@ -1144,6 +1180,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; // calculated commit_ts = 43 ≤ 50, ok let (_, old_value) = prewrite( @@ -1194,6 +1231,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; // calculated commit_ts = 43 ≤ 50, ok let (_, old_value) = prewrite( @@ -1303,6 +1341,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; let cases = vec![ @@ -1363,6 +1402,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; let cases: Vec<_> = vec![ @@ -1634,6 +1674,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; let 
snapshot = engine.snapshot(Default::default()).unwrap(); let cm = ConcurrencyManager::new(start_ts); @@ -1688,6 +1729,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; let snapshot = engine.snapshot(Default::default()).unwrap(); let cm = ConcurrencyManager::new(start_ts); @@ -1829,6 +1871,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; let (_, old_value) = prewrite( &mut txn, @@ -1865,6 +1908,7 @@ pub mod tests { need_old_value: true, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; let (_, old_value) = prewrite( &mut txn, @@ -2194,4 +2238,266 @@ pub mod tests { must_commit(&mut engine, key, 21, 22); must_pessimistic_prewrite_insert(&mut engine, key, value, key, 23, 23, DoConstraintCheck); } + + #[cfg(test)] + fn test_calculate_last_change_ts_from_latest_write_impl( + prewrite_func: impl Fn(&mut RocksEngine, LockType, /* start_ts */ u64), + ) { + use engine_traits::CF_WRITE; + use pd_client::FeatureGate; + + use crate::storage::txn::sched_pool::set_tls_feature_gate; + + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let key = b"k"; + + // Latest version does not exist + prewrite_func(&mut engine, LockType::Lock, 2); + let lock = must_locked(&mut engine, key, 2); + assert!(lock.last_change_ts.is_zero()); + assert_eq!(lock.versions_to_last_change, 1); + must_rollback(&mut engine, key, 2, false); + + // Latest change ts should not be enabled on TiKV 6.4 + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.4.0").unwrap(); + set_tls_feature_gate(feature_gate); + let write = Write::new(WriteType::Put, 5.into(), Some(b"value".to_vec())); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(8.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + prewrite_func(&mut engine, LockType::Lock, 10); + let 
lock = must_locked(&mut engine, key, 10); + assert_eq!(lock.last_change_ts, TimeStamp::zero()); + assert_eq!(lock.versions_to_last_change, 0); + must_rollback(&mut engine, key, 10, false); + + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.5.0").unwrap(); + set_tls_feature_gate(feature_gate); + + // Latest version is a PUT. But as we are prewriting a PUT, no need to record + // `last_change_ts`. + let write = Write::new(WriteType::Put, 15.into(), Some(b"value".to_vec())); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(20.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + prewrite_func(&mut engine, LockType::Put, 25); + let lock = must_locked(&mut engine, key, 25); + assert_eq!(lock.last_change_ts, TimeStamp::zero()); + assert_eq!(lock.versions_to_last_change, 0); + must_rollback(&mut engine, key, 25, false); + + // Latest version is a PUT + let write = Write::new(WriteType::Put, 30.into(), Some(b"value".to_vec())); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(35.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + prewrite_func(&mut engine, LockType::Lock, 40); + let lock = must_locked(&mut engine, key, 40); + assert_eq!(lock.last_change_ts, 35.into()); + assert_eq!(lock.versions_to_last_change, 1); + must_rollback(&mut engine, key, 40, false); + + // Latest version is a DELETE + let write = Write::new(WriteType::Delete, 45.into(), None); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(50.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + prewrite_func(&mut engine, LockType::Lock, 55); + let lock = must_locked(&mut engine, key, 55); + assert_eq!(lock.last_change_ts, 50.into()); + assert_eq!(lock.versions_to_last_change, 1); + must_rollback(&mut engine, key, 55, false); + + // Latest version is a LOCK without last_change_ts. Set the last_change_ts of + // the new record to zero. 
+ let write = Write::new(WriteType::Lock, 60.into(), None); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(65.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + prewrite_func(&mut engine, LockType::Lock, 70); + let lock = must_locked(&mut engine, key, 70); + assert!(lock.last_change_ts.is_zero()); + assert_eq!(lock.versions_to_last_change, 0); + must_rollback(&mut engine, key, 70, false); + + // Latest version is a ROLLBACK without last_change_ts. Set the last_change_ts + // of the new record to zero. + let write = Write::new(WriteType::Rollback, 75.into(), None); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(80.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + prewrite_func(&mut engine, LockType::Lock, 85); + let lock = must_locked(&mut engine, key, 85); + assert!(lock.last_change_ts.is_zero()); + assert_eq!(lock.versions_to_last_change, 0); + must_rollback(&mut engine, key, 85, false); + + // Latest version is a LOCK with last_change_ts + let write = Write::new(WriteType::Lock, 90.into(), None).set_last_change(20.into(), 6); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(95.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + prewrite_func(&mut engine, LockType::Lock, 100); + let lock = must_locked(&mut engine, key, 100); + assert_eq!(lock.last_change_ts, 20.into()); + assert_eq!(lock.versions_to_last_change, 7); + must_rollback(&mut engine, key, 100, false); + + // Latest version is a LOCK with last_change_ts + let write = Write::new(WriteType::Lock, 105.into(), None).set_last_change(20.into(), 8); + engine + .put_cf( + Default::default(), + CF_WRITE, + Key::from_raw(key).append_ts(110.into()), + write.as_ref().to_bytes(), + ) + .unwrap(); + prewrite_func(&mut engine, LockType::Lock, 120); + let lock = must_locked(&mut engine, key, 120); + assert_eq!(lock.last_change_ts, 20.into()); + assert_eq!(lock.versions_to_last_change, 9); + 
must_rollback(&mut engine, key, 120, false); + } + + #[test] + fn test_optimistic_txn_calculate_last_change_ts() { + test_calculate_last_change_ts_from_latest_write_impl(|engine, tp, start_ts| match tp { + LockType::Put => must_prewrite_put(engine, b"k", b"value", b"k", start_ts), + LockType::Delete => must_prewrite_delete(engine, b"k", b"k", start_ts), + LockType::Lock => must_prewrite_lock(engine, b"k", b"k", start_ts), + _ => unreachable!(), + }); + } + + #[test] + fn test_pessimistic_amend_txn_calculate_last_change_ts() { + test_calculate_last_change_ts_from_latest_write_impl(|engine, tp, start_ts| match tp { + LockType::Put => must_pessimistic_prewrite_put( + engine, + b"k", + b"value", + b"k", + start_ts, + start_ts, + DoPessimisticCheck, + ), + LockType::Delete => must_pessimistic_prewrite_delete( + engine, + b"k", + b"k", + start_ts, + start_ts, + DoPessimisticCheck, + ), + LockType::Lock => must_pessimistic_prewrite_lock( + engine, + b"k", + b"k", + start_ts, + start_ts, + DoPessimisticCheck, + ), + _ => unreachable!(), + }); + } + + #[test] + fn test_inherit_last_change_ts_from_pessimistic_lock() { + use engine_traits::CF_LOCK; + + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let key = b"k"; + let put_lock = + |engine: &mut RocksEngine, ts: u64, last_change_ts: u64, versions_to_last_change| { + let lock = Lock::new( + LockType::Pessimistic, + key.to_vec(), + ts.into(), + 100, + None, + ts.into(), + 5, + ts.into(), + ) + .set_last_change(last_change_ts.into(), versions_to_last_change); + engine + .put_cf( + Default::default(), + CF_LOCK, + Key::from_raw(key), + lock.to_bytes(), + ) + .unwrap(); + }; + + // Prewrite LOCK from pessimistic lock without `last_change_ts` + put_lock(&mut engine, 10, 0, 0); + must_pessimistic_prewrite_lock(&mut engine, key, key, 10, 10, DoPessimisticCheck); + let lock = must_locked(&mut engine, key, 10); + assert_eq!(lock.last_change_ts, TimeStamp::zero()); + 
assert_eq!(lock.versions_to_last_change, 0); + must_rollback(&mut engine, key, 10, false); + + // Prewrite LOCK from pessimistic lock with `last_change_ts` + put_lock(&mut engine, 20, 15, 3); + must_pessimistic_prewrite_lock(&mut engine, key, key, 20, 20, DoPessimisticCheck); + let lock = must_locked(&mut engine, key, 20); + assert_eq!(lock.last_change_ts, 15.into()); + assert_eq!(lock.versions_to_last_change, 3); + must_rollback(&mut engine, key, 20, false); + + // Prewrite PUT from pessimistic lock with `last_change_ts` + put_lock(&mut engine, 30, 15, 5); + must_pessimistic_prewrite_put(&mut engine, key, b"value", key, 30, 30, DoPessimisticCheck); + let lock = must_locked(&mut engine, key, 30); + assert_eq!(lock.last_change_ts, TimeStamp::zero()); + assert_eq!(lock.versions_to_last_change, 0); + must_rollback(&mut engine, key, 30, false); + + // Prewrite DELETE from pessimistic lock with `last_change_ts` + put_lock(&mut engine, 40, 15, 5); + must_pessimistic_prewrite_delete(&mut engine, key, key, 40, 30, DoPessimisticCheck); + let lock = must_locked(&mut engine, key, 40); + assert_eq!(lock.last_change_ts, TimeStamp::zero()); + assert_eq!(lock.versions_to_last_change, 0); + must_rollback(&mut engine, key, 40, false); + } } diff --git a/src/storage/txn/actions/tests.rs b/src/storage/txn/actions/tests.rs index fdf060d950d..e6872ef493f 100644 --- a/src/storage/txn/actions/tests.rs +++ b/src/storage/txn/actions/tests.rs @@ -52,6 +52,7 @@ pub fn must_prewrite_put_impl( assertion_level, false, None, + 0, ); } @@ -90,6 +91,7 @@ pub fn must_prewrite_insert_impl( assertion_level, true, None, + 0, ); } @@ -111,8 +113,10 @@ pub fn must_prewrite_put_impl_with_should_not_exist( assertion_level: AssertionLevel, should_not_exist: bool, region_id: Option, + txn_source: u64, ) { let mut ctx = Context::default(); + ctx.set_txn_source(txn_source); if let Some(region_id) = region_id { ctx.region_id = region_id; } @@ -154,6 +158,7 @@ pub fn 
must_prewrite_put_impl_with_should_not_exist( need_old_value: false, is_retry_request, assertion_level, + txn_source, }, mutation, secondary_keys, @@ -215,6 +220,37 @@ pub fn must_prewrite_put_on_region( AssertionLevel::Off, false, Some(region_id), + 0, + ); +} + +pub fn must_prewrite_put_with_txn_soucre( + engine: &mut E, + key: &[u8], + value: &[u8], + pk: &[u8], + ts: impl Into, + txn_source: u64, +) { + must_prewrite_put_impl_with_should_not_exist( + engine, + key, + value, + pk, + &None, + ts.into(), + SkipPessimisticCheck, + 0, + TimeStamp::default(), + 0, + TimeStamp::default(), + TimeStamp::default(), + false, + Assertion::None, + AssertionLevel::Off, + false, + None, + txn_source, ); } @@ -422,6 +458,7 @@ fn default_txn_props( need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, } } diff --git a/src/storage/txn/commands/acquire_pessimistic_lock.rs b/src/storage/txn/commands/acquire_pessimistic_lock.rs index 359f0abacd8..6bd147cf02e 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock.rs @@ -1,8 +1,9 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath] -use kvproto::kvrpcpb::{ExtraOp, LockInfo}; -use txn_types::{Key, OldValues, TimeStamp, TxnExtra}; +use kvproto::kvrpcpb::ExtraOp; +use tikv_kv::Modify; +use txn_types::{insert_old_value_if_resolved, Key, OldValues, TimeStamp, TxnExtra}; use crate::storage::{ kv::WriteData, @@ -16,9 +17,9 @@ use crate::storage::{ }, Error, ErrorInner, Result, }, - types::PessimisticLockParameters, - Error as StorageError, ErrorInner as StorageErrorInner, PessimisticLockRes, ProcessResult, - Result as StorageResult, Snapshot, + types::{PessimisticLockParameters, PessimisticLockResults}, + Error as StorageError, PessimisticLockKeyResult, ProcessResult, Result as StorageResult, + Snapshot, }; command! { @@ -26,7 +27,7 @@ command! 
{ /// /// This can be rolled back with a [`PessimisticRollback`](Command::PessimisticRollback) command. AcquirePessimisticLock: - cmd_ty => StorageResult, + cmd_ty => StorageResult, display => "kv::command::acquirepessimisticlock keys({:?}) @ {} {} {} {:?} {} {} {} | {:?}", (keys, start_ts, lock_ttl, for_update_ts, wait_timeout, min_commit_ts, check_existence, lock_only_if_exists, ctx), content => { @@ -46,9 +47,9 @@ command! { /// later read in the same transaction. return_values: bool, min_commit_ts: TimeStamp, - old_values: OldValues, check_existence: bool, lock_only_if_exists: bool, + allow_lock_with_conflict: bool, } } @@ -69,17 +70,15 @@ impl CommandExt for AcquirePessimisticLock { gen_lock!(keys: multiple(|x| &x.0)); } -fn extract_lock_info_from_result(res: &StorageResult) -> &LockInfo { - match res { - Err(StorageError(box StorageErrorInner::Txn(Error(box ErrorInner::Mvcc(MvccError( - box MvccErrorInner::KeyIsLocked(info), - )))))) => info, - _ => panic!("unexpected mvcc error"), - } -} - impl WriteCommand for AcquirePessimisticLock { - fn process_write(mut self, snapshot: S, context: WriteContext<'_, L>) -> Result { + fn process_write(self, snapshot: S, context: WriteContext<'_, L>) -> Result { + if self.allow_lock_with_conflict && self.keys.len() > 1 { + // Currently multiple keys with `allow_lock_with_conflict` set is not supported. 
+ return Err(Error::from(ErrorInner::Other(box_err!( + "multiple keys in a single request with allowed_lock_with_conflict set is not allowed" + )))); + } + let (start_ts, ctx, keys) = (self.start_ts, self.ctx, self.keys); let mut txn = MvccTxn::new(start_ts, context.concurrency_manager); let mut reader = ReaderWithStats::new( @@ -87,18 +86,11 @@ impl WriteCommand for AcquirePessimisticLock context.statistics, ); - let rows = keys.len(); - let mut res = if self.return_values { - Ok(PessimisticLockRes::Values(vec![])) - } else if self.check_existence { - // If return_value is set, the existence status is implicitly included in the - // result. So check_existence only need to be explicitly handled if - // `return_values` is not set. - Ok(PessimisticLockRes::Existence(vec![])) - } else { - Ok(PessimisticLockRes::Empty) - }; + let total_keys = keys.len(); + let mut res = PessimisticLockResults::with_capacity(total_keys); + let mut encountered_locks = vec![]; let need_old_value = context.extra_op == ExtraOp::ReadOldValue; + let mut old_values = OldValues::default(); for (k, should_not_exist) in keys { match acquire_pessimistic_lock( &mut txn, @@ -113,74 +105,79 @@ impl WriteCommand for AcquirePessimisticLock self.min_commit_ts, need_old_value, self.lock_only_if_exists, + self.allow_lock_with_conflict, ) { - Ok((val, old_value)) => { - if self.return_values || self.check_existence { - res.as_mut().unwrap().push(val); - } - if old_value.resolved() { - let key = k.append_ts(txn.start_ts); - // MutationType is unknown in AcquirePessimisticLock stage. - let mutation_type = None; - self.old_values.insert(key, (old_value, mutation_type)); - } + Ok((key_res, old_value)) => { + res.push(key_res); + // MutationType is unknown in AcquirePessimisticLock stage. + insert_old_value_if_resolved(&mut old_values, k, txn.start_ts, old_value, None); } - Err(e @ MvccError(box MvccErrorInner::KeyIsLocked { .. 
})) => { - res = Err(e).map_err(Error::from).map_err(StorageError::from); + Err(MvccError(box MvccErrorInner::KeyIsLocked(lock_info))) => { + let request_parameters = PessimisticLockParameters { + pb_ctx: ctx.clone(), + primary: self.primary.clone(), + start_ts, + lock_ttl: self.lock_ttl, + for_update_ts: self.for_update_ts, + wait_timeout: self.wait_timeout, + return_values: self.return_values, + min_commit_ts: self.min_commit_ts, + check_existence: self.check_existence, + is_first_lock: self.is_first_lock, + lock_only_if_exists: self.lock_only_if_exists, + allow_lock_with_conflict: self.allow_lock_with_conflict, + }; + let lock_info = WriteResultLockInfo::new( + lock_info, + request_parameters, + k, + should_not_exist, + ); + encountered_locks.push(lock_info); + // Do not lock previously succeeded keys. + txn.clear(); + res.0.clear(); + res.push(PessimisticLockKeyResult::Waiting); break; } Err(e) => return Err(Error::from(e)), } } - // Some values are read, update max_ts - match &res { - Ok(PessimisticLockRes::Values(values)) if !values.is_empty() => { - txn.concurrency_manager.update_max_ts(self.for_update_ts); - } - Ok(PessimisticLockRes::Existence(values)) if !values.is_empty() => { - txn.concurrency_manager.update_max_ts(self.for_update_ts); + let modifies = txn.into_modifies(); + + let mut res = Ok(res); + + // If encountered lock and `wait_timeout` is `None` (which means no wait), + // return error directly here. + if !encountered_locks.is_empty() && self.wait_timeout.is_none() { + // Mind the difference of the protocols of legacy requests and resumable + // requests. For resumable requests (allow_lock_with_conflict == + // true), key errors are considered key by key instead of for the + // whole request. 
+ let lock_info = encountered_locks.drain(..).next().unwrap().lock_info_pb; + let err = StorageError::from(Error::from(MvccError::from( + MvccErrorInner::KeyIsLocked(lock_info), + ))); + if self.allow_lock_with_conflict { + res.as_mut().unwrap().0[0] = PessimisticLockKeyResult::Failed(err.into()) + } else { + res = Err(err) } - _ => (), } - // no conflict - let (pr, to_be_write, rows, ctx, lock_info) = if res.is_ok() { - let pr = ProcessResult::PessimisticLockRes { res }; - let extra = TxnExtra { - old_values: self.old_values, - // One pc status is unkown AcquirePessimisticLock stage. - one_pc: false, - for_flashback: false, - }; - let write_data = WriteData::new(txn.into_modifies(), extra); - (pr, write_data, rows, ctx, None) - } else { - let request_parameters = PessimisticLockParameters { - pb_ctx: ctx.clone(), - primary: self.primary.clone(), - start_ts: self.start_ts, - lock_ttl: self.lock_ttl, - for_update_ts: self.for_update_ts, - wait_timeout: self.wait_timeout, - return_values: self.return_values, - min_commit_ts: self.min_commit_ts, - check_existence: self.check_existence, - is_first_lock: self.is_first_lock, - allow_lock_with_conflict: false, - }; - let lock_info_pb = extract_lock_info_from_result(&res); - let lock_info = WriteResultLockInfo::new(lock_info_pb.clone(), request_parameters); - let pr = ProcessResult::PessimisticLockRes { res }; - // Wait for lock released - (pr, WriteData::default(), 0, ctx, Some(lock_info)) - }; + let rows = if res.is_ok() { total_keys } else { 0 }; + + let pr = ProcessResult::PessimisticLockRes { res }; + + let to_be_write = make_write_data(modifies, old_values); + Ok(WriteResult { ctx, to_be_write, rows, pr, - lock_info, + lock_info: encountered_locks, released_locks: ReleasedLocks::new(), lock_guards: vec![], response_policy: ResponsePolicy::OnProposed, @@ -188,38 +185,16 @@ impl WriteCommand for AcquirePessimisticLock } } -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_gen_lock_info_from_result() { - 
let raw_key = b"key".to_vec(); - let key = Key::from_raw(&raw_key); - let ts = 100; - let is_first_lock = true; - let wait_timeout = WaitTimeout::from_encoded(200); - - let mut info = LockInfo::default(); - info.set_key(raw_key.clone()); - info.set_lock_version(ts); - info.set_lock_ttl(100); - let case = StorageError::from(StorageErrorInner::Txn(Error::from(ErrorInner::Mvcc( - MvccError::from(MvccErrorInner::KeyIsLocked(info.clone())), - )))); - let lock_info = WriteResultLockInfo::new( - extract_lock_info_from_result::<()>(&Err(case)).clone(), - PessimisticLockParameters { - is_first_lock, - wait_timeout, - ..Default::default() - }, - ); - assert_eq!(lock_info.lock_digest.ts, ts.into()); - assert_eq!(lock_info.lock_digest.hash, key.gen_hash()); - assert_eq!(lock_info.key.into_raw().unwrap(), raw_key); - assert_eq!(lock_info.parameters.is_first_lock, is_first_lock); - assert_eq!(lock_info.parameters.wait_timeout, wait_timeout); - assert_eq!(lock_info.lock_info_pb, info); +pub(super) fn make_write_data(modifies: Vec, old_values: OldValues) -> WriteData { + if !modifies.is_empty() { + let extra = TxnExtra { + old_values, + // One pc status is unknown in AcquirePessimisticLock stage. + one_pc: false, + for_flashback: false, + }; + WriteData::new(modifies, extra) + } else { + WriteData::default() } } diff --git a/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs b/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs new file mode 100644 index 00000000000..a66f8228755 --- /dev/null +++ b/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs @@ -0,0 +1,441 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + fmt::{Debug, Formatter}, + sync::Arc, +}; + +// #[PerformanceCriticalPath] +use kvproto::kvrpcpb::ExtraOp; +use txn_types::{insert_old_value_if_resolved, Key, OldValues}; + +use crate::storage::{ + lock_manager::{ + lock_wait_context::LockWaitContextSharedState, lock_waiting_queue::LockWaitEntry, + LockManager, LockWaitToken, + }, + mvcc::{Error as MvccError, ErrorInner as MvccErrorInner, MvccTxn, SnapshotReader}, + txn::{ + acquire_pessimistic_lock, + commands::{ + acquire_pessimistic_lock::make_write_data, Command, CommandExt, ReleasedLocks, + ResponsePolicy, TypedCommand, WriteCommand, WriteContext, WriteResult, + WriteResultLockInfo, + }, + Error, Result, + }, + types::{PessimisticLockParameters, PessimisticLockResults}, + Error as StorageError, PessimisticLockKeyResult, ProcessResult, Result as StorageResult, + Snapshot, +}; + +pub struct ResumedPessimisticLockItem { + pub key: Key, + pub should_not_exist: bool, + pub params: PessimisticLockParameters, + pub lock_wait_token: LockWaitToken, + pub req_states: Arc, +} + +impl Debug for ResumedPessimisticLockItem { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ResumedPessimisticLockItem") + .field("key", &self.key) + .field("should_not_exist", &self.should_not_exist) + .field("params", &self.params) + .field("lock_wait_token", &self.lock_wait_token) + .finish() + } +} + +command! { + /// Acquire a Pessimistic lock on the keys. + /// + /// This can be rolled back with a [`PessimisticRollback`](Command::PessimisticRollback) command. 
+ AcquirePessimisticLockResumed: + cmd_ty => StorageResult, + display => "kv::command::acquirepessimisticlockresumed {:?}", + (items), + content => { + items: Vec, + } +} + +impl CommandExt for AcquirePessimisticLockResumed { + ctx!(); + tag!(acquire_pessimistic_lock_resumed); + request_type!(KvPessimisticLock); + + property!(can_be_pipelined); + + fn write_bytes(&self) -> usize { + self.items + .iter() + .map(|item| item.key.as_encoded().len()) + .sum() + } + + gen_lock!(items: multiple(|x| &x.key)); +} + +impl WriteCommand for AcquirePessimisticLockResumed { + fn process_write(self, snapshot: S, context: WriteContext<'_, L>) -> Result { + fail_point!("acquire_pessimistic_lock_resumed_before_process_write"); + let mut modifies = vec![]; + let mut txn = None; + let mut reader: Option> = None; + + let total_keys = self.items.len(); + let mut res = PessimisticLockResults::with_capacity(total_keys); + let mut encountered_locks = vec![]; + let need_old_value = context.extra_op == ExtraOp::ReadOldValue; + let mut old_values = OldValues::default(); + + let mut new_locked_keys = Vec::with_capacity(total_keys); + + for item in self.items.into_iter() { + let ResumedPessimisticLockItem { + key, + should_not_exist, + params, + lock_wait_token, + req_states, + } = item; + + // TODO: Refine the code for rebuilding txn state. + if txn + .as_ref() + .map_or(true, |t: &MvccTxn| t.start_ts != params.start_ts) + { + if let Some(prev_txn) = txn.replace(MvccTxn::new( + params.start_ts, + context.concurrency_manager.clone(), + )) { + modifies.extend(prev_txn.into_modifies()); + } + // TODO: Is it possible to reuse the same reader but change the start_ts stored + // in it? 
+ if let Some(mut prev_reader) = reader.replace(SnapshotReader::new_with_ctx( + params.start_ts, + snapshot.clone(), + &self.ctx, + )) { + context.statistics.add(&prev_reader.take_statistics()); + } + } + let txn = txn.as_mut().unwrap(); + let reader = reader.as_mut().unwrap(); + + match acquire_pessimistic_lock( + txn, + reader, + key.clone(), + ¶ms.primary, + should_not_exist, + params.lock_ttl, + params.for_update_ts, + params.return_values, + params.check_existence, + params.min_commit_ts, + need_old_value, + params.lock_only_if_exists, + true, + ) { + Ok((key_res, old_value)) => { + res.push(key_res); + new_locked_keys.push((params.start_ts, key.clone())); + + insert_old_value_if_resolved( + &mut old_values, + key, + params.start_ts, + old_value, + None, + ); + } + Err(MvccError(box MvccErrorInner::KeyIsLocked(lock_info))) => { + let mut lock_info = + WriteResultLockInfo::new(lock_info, params, key, should_not_exist); + lock_info.lock_wait_token = lock_wait_token; + lock_info.req_states = Some(req_states); + res.push(PessimisticLockKeyResult::Waiting); + encountered_locks.push(lock_info); + } + Err(e) => { + res.push(PessimisticLockKeyResult::Failed( + StorageError::from(Error::from(e)).into(), + )); + } + }; + } + + if let Some(txn) = txn { + if !txn.is_empty() { + modifies.extend(txn.into_modifies()); + } + } + if let Some(mut reader) = reader { + context.statistics.add(&reader.take_statistics()); + } + + let pr = ProcessResult::PessimisticLockRes { res: Ok(res) }; + let to_be_write = make_write_data(modifies, old_values); + + Ok(WriteResult { + ctx: self.ctx, + to_be_write, + rows: total_keys, + pr, + lock_info: encountered_locks, + released_locks: ReleasedLocks::new(), + lock_guards: vec![], + response_policy: ResponsePolicy::OnProposed, + }) + } +} + +impl AcquirePessimisticLockResumed { + pub fn from_lock_wait_entries( + lock_wait_entries: impl IntoIterator>, + ) -> TypedCommand> { + let items: Vec<_> = lock_wait_entries + .into_iter() + .map(|item| { + 
assert!(item.key_cb.is_none()); + ResumedPessimisticLockItem { + key: item.key, + should_not_exist: item.should_not_exist, + params: item.parameters, + lock_wait_token: item.lock_wait_token, + req_states: item.req_states, + } + }) + .collect(); + + assert!(!items.is_empty()); + let ctx = items[0].params.pb_ctx.clone(); + // TODO: May it cause problem by using the first one as the pb_ctx of the + // Command? + Self::new(items, ctx) + } +} + +#[cfg(test)] +mod tests { + use concurrency_manager::ConcurrencyManager; + use kvproto::kvrpcpb::Context; + use rand::random; + use tikv_kv::Engine; + use txn_types::TimeStamp; + + use super::*; + use crate::storage::{ + lock_manager::{MockLockManager, WaitTimeout}, + mvcc::tests::{must_locked, write}, + txn::{ + commands::pessimistic_rollback::tests::must_success as must_pessimistic_rollback, + tests::{must_commit, must_pessimistic_locked, must_prewrite_put, must_rollback}, + }, + TestEngineBuilder, + }; + + #[allow(clippy::vec_box)] + fn must_success( + engine: &mut E, + lock_wait_entries: Vec>, + ) -> PessimisticLockResults { + let ctx = Context::default(); + let snapshot = engine.snapshot(Default::default()).unwrap(); + let cm = ConcurrencyManager::new(TimeStamp::zero()); + + let items_info: Vec<_> = lock_wait_entries + .iter() + .map(|item| { + ( + item.lock_wait_token, + item.key.clone(), + item.parameters.clone(), + item.should_not_exist, + ) + }) + .collect(); + + let command = AcquirePessimisticLockResumed::from_lock_wait_entries(lock_wait_entries).cmd; + let result = command + .process_write( + snapshot, + WriteContext { + lock_mgr: &MockLockManager::new(), + concurrency_manager: cm, + extra_op: Default::default(), + statistics: &mut Default::default(), + async_apply_prewrite: false, + raw_ext: None, + }, + ) + .unwrap(); + let res = if let ProcessResult::PessimisticLockRes { res } = result.pr { + res.unwrap() + } else { + panic!("unexpected process result: {:?}", result.pr); + }; + + // Check correctness of returned 
lock info. + let mut lock_info_index = 0; + for (i, res) in res.0.iter().enumerate() { + if let PessimisticLockKeyResult::Waiting = res { + let (token, key, params, should_not_exist) = &items_info[i]; + let lock_info: &WriteResultLockInfo = &result.lock_info[lock_info_index]; + lock_info_index += 1; + + assert_eq!(lock_info.lock_wait_token, *token); + assert_eq!(&lock_info.key, key); + assert_eq!(&lock_info.parameters, params); + assert_eq!(lock_info.should_not_exist, *should_not_exist); + } + } + assert_eq!(lock_info_index, result.lock_info.len()); + + write(engine, &ctx, result.to_be_write.modifies); + res + } + + fn make_lock_waiting( + key: &[u8], + start_ts: impl Into, + for_update_ts: impl Into, + return_values: bool, + check_existence: bool, + ) -> Box { + let start_ts = start_ts.into(); + let for_update_ts = for_update_ts.into(); + assert!(for_update_ts >= start_ts); + let parameters = PessimisticLockParameters { + pb_ctx: Context::default(), + primary: key.to_vec(), + start_ts, + lock_ttl: 1000, + for_update_ts, + wait_timeout: Some(WaitTimeout::Millis(1000)), + return_values, + min_commit_ts: for_update_ts.next(), + check_existence, + is_first_lock: false, + lock_only_if_exists: false, + allow_lock_with_conflict: true, + }; + + let key = Key::from_raw(key); + let lock_hash = key.gen_hash(); + let token = LockWaitToken(Some(random())); + // The tests in this file doesn't need a valid req_state. Set a dummy value + // here. 
+ let req_states = Arc::new(LockWaitContextSharedState::new_dummy(token, key.clone())); + let entry = LockWaitEntry { + key, + lock_hash, + parameters, + should_not_exist: false, + lock_wait_token: token, + legacy_wake_up_index: Some(0), + req_states, + key_cb: None, + }; + Box::new(entry) + } + + #[test] + fn test_acquire_pessimistic_lock_resumed() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + let res = must_success( + &mut engine, + vec![make_lock_waiting(b"k1", 10, 15, false, false)], + ); + assert_eq!(res.0.len(), 1); + res.0[0].assert_empty(); + must_pessimistic_locked(&mut engine, b"k1", 10, 15); + must_pessimistic_rollback(&mut engine, b"k1", 10, 15); + + let res = must_success( + &mut engine, + vec![ + make_lock_waiting(b"k1", 20, 25, false, false), + make_lock_waiting(b"k2", 20, 25, false, false), + make_lock_waiting(b"k3", 21, 26, false, false), + ], + ); + assert_eq!(res.0.len(), 3); + res.0.iter().for_each(|x| x.assert_empty()); + must_pessimistic_locked(&mut engine, b"k1", 20, 25); + must_pessimistic_locked(&mut engine, b"k2", 20, 25); + must_pessimistic_locked(&mut engine, b"k3", 21, 26); + + must_pessimistic_rollback(&mut engine, b"k1", 20, 25); + must_pessimistic_rollback(&mut engine, b"k2", 20, 25); + must_pessimistic_rollback(&mut engine, b"k3", 21, 26); + + must_prewrite_put(&mut engine, b"k1", b"v1", b"k1", 30); + must_commit(&mut engine, b"k1", 30, 35); + must_prewrite_put(&mut engine, b"k2", b"v2", b"k1", 30); + must_prewrite_put(&mut engine, b"k3", b"v3", b"k3", 28); + must_commit(&mut engine, b"k3", 28, 29); + let res = must_success( + &mut engine, + vec![ + make_lock_waiting(b"k1", 31, 31, false, false), + make_lock_waiting(b"k2", 32, 32, false, false), + make_lock_waiting(b"k3", 33, 33, true, false), + make_lock_waiting(b"k4", 34, 34, false, true), + make_lock_waiting(b"k5", 35, 35, false, false), + ], + ); + assert_eq!(res.0.len(), 5); + res.0[0].assert_locked_with_conflict(Some(b"v1"), 35); + 
res.0[1].assert_waiting(); + res.0[2].assert_value(Some(b"v3")); + res.0[3].assert_existence(false); + res.0[4].assert_empty(); + must_pessimistic_locked(&mut engine, b"k1", 31, 35); + must_locked(&mut engine, b"k2", 30); + must_pessimistic_locked(&mut engine, b"k3", 33, 33); + must_pessimistic_locked(&mut engine, b"k4", 34, 34); + must_pessimistic_locked(&mut engine, b"k5", 35, 35); + + must_pessimistic_rollback(&mut engine, b"k1", 31, 35); + must_pessimistic_rollback(&mut engine, b"k3", 33, 33); + must_pessimistic_rollback(&mut engine, b"k4", 34, 34); + must_pessimistic_rollback(&mut engine, b"k5", 35, 35); + + must_prewrite_put(&mut engine, b"k4", b"v4", b"k4", 40); + must_prewrite_put(&mut engine, b"k6", b"v6", b"k4", 40); + let res = must_success( + &mut engine, + vec![ + make_lock_waiting(b"k1", 41, 41, false, false), + make_lock_waiting(b"k2", 41, 41, false, false), + make_lock_waiting(b"k3", 42, 42, false, false), + make_lock_waiting(b"k4", 42, 42, false, false), + make_lock_waiting(b"k5", 43, 43, false, false), + make_lock_waiting(b"k6", 43, 43, false, false), + ], + ); + assert_eq!(res.0.len(), 6); + for &i in &[0, 2, 4] { + res.0[i].assert_empty(); + } + for &i in &[1, 3, 5] { + res.0[i].assert_waiting(); + } + must_pessimistic_locked(&mut engine, b"k1", 41, 41); + must_pessimistic_locked(&mut engine, b"k3", 42, 42); + must_pessimistic_locked(&mut engine, b"k5", 43, 43); + + must_pessimistic_rollback(&mut engine, b"k1", 41, 41); + must_rollback(&mut engine, b"k2", 30, false); + must_pessimistic_rollback(&mut engine, b"k3", 43, 43); + must_rollback(&mut engine, b"k2", 40, false); + must_pessimistic_rollback(&mut engine, b"k5", 45, 45); + must_rollback(&mut engine, b"k2", 40, false); + } +} diff --git a/src/storage/txn/commands/atomic_store.rs b/src/storage/txn/commands/atomic_store.rs index b935d991eea..1df5c5b2cf8 100644 --- a/src/storage/txn/commands/atomic_store.rs +++ b/src/storage/txn/commands/atomic_store.rs @@ -58,7 +58,7 @@ impl WriteCommand for 
RawAtomicStore { to_be_write, rows, pr: ProcessResult::Res, - lock_info: None, + lock_info: vec![], released_locks: ReleasedLocks::new(), lock_guards: raw_ext.into_iter().map(|r| r.key_guard).collect(), response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/check_secondary_locks.rs b/src/storage/txn/commands/check_secondary_locks.rs index 1a4b547b6d7..4802535c054 100644 --- a/src/storage/txn/commands/check_secondary_locks.rs +++ b/src/storage/txn/commands/check_secondary_locks.rs @@ -153,7 +153,7 @@ impl WriteCommand for CheckSecondaryLocks { to_be_write: write_data, rows, pr, - lock_info: None, + lock_info: vec![], released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index 58f7f557448..34948109f4b 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -132,7 +132,7 @@ impl WriteCommand for CheckTxnStatus { to_be_write: write_data, rows: 1, pr, - lock_info: None, + lock_info: vec![], released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, @@ -1163,4 +1163,27 @@ pub mod tests { must_unlocked(&mut engine, k); must_get_rollback_ts(&mut engine, k, ts(50, 0)); } + + #[test] + fn test_rollback_calculate_last_change_info() { + let mut engine = crate::storage::TestEngineBuilder::new().build().unwrap(); + let k = b"k"; + + // Below is a case explaining why we don't calculate last_change_ts for + // rollback. + + must_prewrite_put(&mut engine, k, b"v1", k, 5); + must_commit(&mut engine, k, 5, 6); + + must_prewrite_put(&mut engine, k, b"v2", k, 7); + // When we calculate last_change_ts here, we will get 6. + must_rollback(&mut engine, k, 10, true); + // But we can still commit with ts 8, then the last_change_ts of the rollback + // will be incorrect. 
+ must_commit(&mut engine, k, 7, 8); + + let rollback = must_written(&mut engine, k, 10, 10, WriteType::Rollback); + assert!(rollback.last_change_ts.is_zero()); + assert_eq!(rollback.versions_to_last_change, 0); + } } diff --git a/src/storage/txn/commands/cleanup.rs b/src/storage/txn/commands/cleanup.rs index 0b82432e3cd..a6c529420d3 100644 --- a/src/storage/txn/commands/cleanup.rs +++ b/src/storage/txn/commands/cleanup.rs @@ -74,7 +74,7 @@ impl WriteCommand for Cleanup { to_be_write: write_data, rows: 1, pr: ProcessResult::Res, - lock_info: None, + lock_info: vec![], released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/commit.rs b/src/storage/txn/commands/commit.rs index 86e1f541306..910b7832ed1 100644 --- a/src/storage/txn/commands/commit.rs +++ b/src/storage/txn/commands/commit.rs @@ -74,7 +74,7 @@ impl WriteCommand for Commit { to_be_write: write_data, rows, pr, - lock_info: None, + lock_info: vec![], released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/compare_and_swap.rs b/src/storage/txn/commands/compare_and_swap.rs index 2fff0620b27..943fc6f69d1 100644 --- a/src/storage/txn/commands/compare_and_swap.rs +++ b/src/storage/txn/commands/compare_and_swap.rs @@ -112,7 +112,7 @@ impl WriteCommand for RawCompareAndSwap { to_be_write, rows, pr, - lock_info: None, + lock_info: vec![], released_locks: ReleasedLocks::new(), lock_guards, response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs index 9b198724e3b..13de0c9b183 100644 --- a/src/storage/txn/commands/flashback_to_version.rs +++ b/src/storage/txn/commands/flashback_to_version.rs @@ -1,18 +1,25 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
// #[PerformanceCriticalPath] -use txn_types::{Key, Lock, TimeStamp, Write}; +use std::mem; + +use tikv_kv::ScanMode; +use txn_types::{Key, TimeStamp}; use crate::storage::{ kv::WriteData, lock_manager::LockManager, - mvcc::{MvccTxn, SnapshotReader}, + mvcc::{MvccReader, MvccTxn}, txn::{ + actions::flashback_to_version::{ + commit_flashback_key, flashback_to_version_write, prewrite_flashback_key, + rollback_locks, + }, commands::{ - Command, CommandExt, FlashbackToVersionReadPhase, ReaderWithStats, ReleasedLocks, - ResponsePolicy, TypedCommand, WriteCommand, WriteContext, WriteResult, + Command, CommandExt, FlashbackToVersionReadPhase, FlashbackToVersionState, + ReleasedLocks, ResponsePolicy, TypedCommand, WriteCommand, WriteContext, WriteResult, }, - flashback_to_version, latch, Result, + latch, Result, }, ProcessResult, Snapshot, }; @@ -25,11 +32,9 @@ command! { start_ts: TimeStamp, commit_ts: TimeStamp, version: TimeStamp, - end_key: Option, - next_lock_key: Option, - next_write_key: Option, - key_locks: Vec<(Key, Lock)>, - key_old_writes: Vec<(Key, Option)>, + start_key: Key, + end_key: Key, + state: FlashbackToVersionState, } } @@ -39,71 +44,120 @@ impl CommandExt for FlashbackToVersion { request_type!(KvFlashbackToVersion); fn gen_lock(&self) -> latch::Lock { - latch::Lock::new( - self.key_locks - .iter() - .map(|(key, _)| key) - .chain(self.key_old_writes.iter().map(|(key, _)| key)), - ) + match &self.state { + FlashbackToVersionState::RollbackLock { key_locks, .. } => { + latch::Lock::new(key_locks.iter().map(|(key, _)| key)) + } + FlashbackToVersionState::Prewrite { key_to_lock } => latch::Lock::new([key_to_lock]), + FlashbackToVersionState::FlashbackWrite { keys, .. 
} => latch::Lock::new(keys.iter()), + FlashbackToVersionState::Commit { key_to_commit } => latch::Lock::new([key_to_commit]), + } } fn write_bytes(&self) -> usize { - self.key_locks - .iter() - .map(|(key, _)| key.as_encoded().len()) - .chain( - self.key_old_writes - .iter() - .map(|(key, _)| key.as_encoded().len()), - ) - .sum() + match &self.state { + FlashbackToVersionState::RollbackLock { key_locks, .. } => key_locks + .iter() + .map(|(key, _)| key.as_encoded().len()) + .sum(), + FlashbackToVersionState::Prewrite { key_to_lock } => key_to_lock.as_encoded().len(), + FlashbackToVersionState::FlashbackWrite { keys, .. } => { + keys.iter().map(|key| key.as_encoded().len()).sum() + } + FlashbackToVersionState::Commit { key_to_commit } => key_to_commit.as_encoded().len(), + } } } impl WriteCommand for FlashbackToVersion { fn process_write(mut self, snapshot: S, context: WriteContext<'_, L>) -> Result { - let mut reader = ReaderWithStats::new( - SnapshotReader::new_with_ctx(self.version, snapshot, &self.ctx), - context.statistics, - ); + let mut reader = + MvccReader::new_with_ctx(snapshot.clone(), Some(ScanMode::Forward), &self.ctx); let mut txn = MvccTxn::new(TimeStamp::zero(), context.concurrency_manager); - - let mut next_lock_key = self.next_lock_key.take(); - let mut next_write_key = self.next_write_key.take(); - let rows = flashback_to_version( - &mut txn, - &mut reader, - &mut next_lock_key, - &mut next_write_key, - self.key_locks, - self.key_old_writes, - self.start_ts, - self.commit_ts, - )?; + match self.state { + FlashbackToVersionState::RollbackLock { + ref mut next_lock_key, + ref mut key_locks, + } => { + if let Some(new_next_lock_key) = + rollback_locks(&mut txn, snapshot, mem::take(key_locks))? 
+ { + *next_lock_key = new_next_lock_key; + } + } + FlashbackToVersionState::Prewrite { ref key_to_lock } => prewrite_flashback_key( + &mut txn, + &mut reader, + key_to_lock, + self.version, + self.start_ts, + )?, + FlashbackToVersionState::FlashbackWrite { + ref mut next_write_key, + ref mut keys, + } => { + if let Some(new_next_write_key) = flashback_to_version_write( + &mut txn, + &mut reader, + mem::take(keys), + self.version, + self.start_ts, + self.commit_ts, + )? { + *next_write_key = new_next_write_key; + } + } + FlashbackToVersionState::Commit { ref key_to_commit } => commit_flashback_key( + &mut txn, + &mut reader, + key_to_commit, + self.start_ts, + self.commit_ts, + )?, + } + let rows = txn.modifies.len(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); + // To let the flashback modification could be proposed and applied successfully. write_data.extra.for_flashback = true; + // To let the CDC treat the flashback modification as an 1PC transaction. + if matches!(self.state, FlashbackToVersionState::FlashbackWrite { .. }) { + write_data.extra.one_pc = true; + } + context.statistics.add(&reader.statistics); Ok(WriteResult { ctx: self.ctx.clone(), to_be_write: write_data, rows, - pr: if next_lock_key.is_none() && next_write_key.is_none() { - ProcessResult::Res - } else { - let next_cmd = FlashbackToVersionReadPhase { - ctx: self.ctx.clone(), - deadline: self.deadline, - start_ts: self.start_ts, - commit_ts: self.commit_ts, - version: self.version, - end_key: self.end_key, - next_lock_key, - next_write_key, - }; + pr: (move || { + if matches!( + self.state, + FlashbackToVersionState::Prewrite { .. } + | FlashbackToVersionState::Commit { .. } + ) { + return ProcessResult::Res; + } + + #[cfg(feature = "failpoints")] + if matches!(self.state, FlashbackToVersionState::FlashbackWrite { .. 
}) { + fail_point!("flashback_failed_after_first_batch", |_| { + ProcessResult::Res + }); + } + ProcessResult::NextCommand { - cmd: Command::FlashbackToVersionReadPhase(next_cmd), + cmd: Command::FlashbackToVersionReadPhase(FlashbackToVersionReadPhase { + ctx: self.ctx, + deadline: self.deadline, + start_ts: self.start_ts, + commit_ts: self.commit_ts, + version: self.version, + start_key: self.start_key, + end_key: self.end_key, + state: self.state, + }), } - }, - lock_info: None, + })(), + lock_info: vec![], released_locks: ReleasedLocks::new(), lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/flashback_to_version_read_phase.rs b/src/storage/txn/commands/flashback_to_version_read_phase.rs index 47348c8e188..9ac5014b7f3 100644 --- a/src/storage/txn/commands/flashback_to_version_read_phase.rs +++ b/src/storage/txn/commands/flashback_to_version_read_phase.rs @@ -1,11 +1,12 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
// #[PerformanceCriticalPath] -use txn_types::{Key, TimeStamp}; +use txn_types::{Key, Lock, TimeStamp}; use crate::storage::{ mvcc::MvccReader, txn::{ + actions::flashback_to_version::get_first_user_key, commands::{ Command, CommandExt, FlashbackToVersion, ProcessResult, ReadCommand, TypedCommand, }, @@ -13,9 +14,70 @@ use crate::storage::{ sched_pool::tls_collect_keyread_histogram_vec, Error, ErrorInner, Result, }, - ScanMode, Snapshot, Statistics, + Context, ScanMode, Snapshot, Statistics, }; +#[derive(Debug)] +pub enum FlashbackToVersionState { + RollbackLock { + next_lock_key: Key, + key_locks: Vec<(Key, Lock)>, + }, + Prewrite { + key_to_lock: Key, + }, + FlashbackWrite { + next_write_key: Key, + keys: Vec, + }, + Commit { + key_to_commit: Key, + }, +} + +pub fn new_flashback_rollback_lock_cmd( + start_ts: TimeStamp, + version: TimeStamp, + start_key: Key, + end_key: Key, + ctx: Context, +) -> TypedCommand<()> { + FlashbackToVersionReadPhase::new( + start_ts, + TimeStamp::zero(), + version, + start_key.clone(), + end_key, + FlashbackToVersionState::RollbackLock { + next_lock_key: start_key, + key_locks: Vec::new(), + }, + ctx, + ) +} + +pub fn new_flashback_write_cmd( + start_ts: TimeStamp, + commit_ts: TimeStamp, + version: TimeStamp, + start_key: Key, + end_key: Key, + ctx: Context, +) -> TypedCommand<()> { + FlashbackToVersionReadPhase::new( + start_ts, + commit_ts, + version, + start_key.clone(), + end_key, + FlashbackToVersionState::FlashbackWrite { + next_write_key: start_key, + keys: Vec::new(), + }, + ctx, + ) +} + command! { FlashbackToVersionReadPhase: cmd_ty => (), @@ -24,9 +86,9 @@ command! { start_ts: TimeStamp, commit_ts: TimeStamp, version: TimeStamp, - end_key: Option, - next_lock_key: Option, - next_write_key: Option, + start_key: Key, + end_key: Key, + state: FlashbackToVersionState, } } @@ -42,78 +104,138 @@ impl CommandExt for FlashbackToVersionReadPhase { } } -/// FlashbackToVersion contains two phases: -/// 1. 
Read phase: -/// - Scan all locks to delete them all later. -/// - Scan all the latest writes to flashback them all later. -/// 2. Write phase: -/// - Delete all locks we scanned at the read phase. -/// - Write the old MVCC version writes for the keys we scanned at the read -/// phase. +/// The whole flashback progress contains four phases: +/// 1. [PrepareFlashback] RollbackLock phase: +/// - Scan all locks. +/// - Rollback all these locks. +/// 2. [PrepareFlashback] Prewrite phase: +/// - Prewrite the `self.start_key` specifically to prevent the +/// `resolved_ts` from advancing. +/// 3. [FinishFlashback] FlashbackWrite phase: +/// - Scan all the latest writes and their corresponding values at +/// `self.version`. +/// - Write the old MVCC version writes again for all these keys with +/// `self.commit_ts` excluding the `self.start_key`. +/// 4. [FinishFlashback] Commit phase: +/// - Commit the `self.start_key` we write at the second phase to finish the +/// flashback. impl ReadCommand for FlashbackToVersionReadPhase { fn process_read(self, snapshot: S, statistics: &mut Statistics) -> Result { - if self.commit_ts <= self.start_ts { - return Err(Error::from(ErrorInner::InvalidTxnTso { - start_ts: self.start_ts, - commit_ts: self.commit_ts, - })); - } + let tag = self.tag().get_str(); let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &self.ctx); - // Scan the locks. - let (key_locks, has_remain_locks) = flashback_to_version_read_lock( - &mut reader, - &self.next_lock_key, - &self.end_key, - statistics, - )?; - // Scan the writes. 
- let (mut key_old_writes, has_remain_writes) = flashback_to_version_read_write( - &mut reader, - key_locks.len(), - &self.next_write_key, - &self.end_key, - self.version, - self.start_ts, - self.commit_ts, - statistics, - )?; - tls_collect_keyread_histogram_vec( - self.tag().get_str(), - (key_locks.len() + key_old_writes.len()) as f64, - ); - - if key_locks.is_empty() && key_old_writes.is_empty() { - Ok(ProcessResult::Res) - } else { - let next_lock_key = if has_remain_locks { - key_locks.last().map(|(key, _)| key.clone()) - } else { - None - }; - let next_write_key = if has_remain_writes && !key_old_writes.is_empty() { - key_old_writes.pop().map(|(key, _)| key) - } else if has_remain_writes && key_old_writes.is_empty() { - // We haven't read any write yet, so we need to read the writes in the next - // batch later. - self.next_write_key - } else { - None - }; - let next_cmd = FlashbackToVersion { + let mut start_key = self.start_key.clone(); + let next_state = match self.state { + FlashbackToVersionState::RollbackLock { next_lock_key, .. } => { + let mut key_locks = + flashback_to_version_read_lock(&mut reader, next_lock_key, &self.end_key)?; + if key_locks.is_empty() { + // - No more locks to rollback, continue to the Prewrite Phase. + // - The start key from the client is actually a range which is used to limit + // the upper bound of this flashback when scanning data, so it may not be a + // real key. In the Prewrite Phase, we make sure that the start key is a real + // key and take this key as a lock for the 2pc. So When overwriting the write, + // we skip the immediate write of this key and instead put it after the + // completion of the 2pc. + // - To make sure the key locked in the latch is the same as the actual key + // written, we pass it to the key in `process_write' after getting it. + let key_to_lock = if let Some(first_key) = + get_first_user_key(&mut reader, &self.start_key, &self.end_key)? 
+ { + first_key + } else { + // If the key is None return directly + statistics.add(&reader.statistics); + return Ok(ProcessResult::Res); + }; + FlashbackToVersionState::Prewrite { key_to_lock } + } else { + tls_collect_keyread_histogram_vec(tag, key_locks.len() as f64); + FlashbackToVersionState::RollbackLock { + next_lock_key: if key_locks.len() > 1 { + key_locks.pop().map(|(key, _)| key).unwrap() + } else { + key_locks.last().map(|(key, _)| key.clone()).unwrap() + }, + key_locks, + } + } + } + FlashbackToVersionState::FlashbackWrite { + mut next_write_key, .. + } => { + if self.commit_ts <= self.start_ts { + return Err(Error::from(ErrorInner::InvalidTxnTso { + start_ts: self.start_ts, + commit_ts: self.commit_ts, + })); + } + if next_write_key == self.start_key { + // The start key from the client is actually a range which is used to limit the + // upper bound of this flashback when scanning data, so it may not be a real + // key. In the Prewrite Phase, we make sure that the start + // key is a real key and take this key as a lock for the + // 2pc. So When overwriting the write, we skip the immediate + // write of this key and instead put it after the completion + // of the 2pc. + next_write_key = if let Some(first_key) = + get_first_user_key(&mut reader, &self.start_key, &self.end_key)? + { + first_key + } else { + // If the key is None return directly + statistics.add(&reader.statistics); + return Ok(ProcessResult::Res); + }; + // Commit key needs to match the Prewrite key, which is set as the first user + // key. + start_key = next_write_key.clone(); + // If the key is not locked, it means that the key has been committed before and + // we are in a retry. 
+ if reader.load_lock(&next_write_key)?.is_none() { + statistics.add(&reader.statistics); + return Ok(ProcessResult::Res); + } + } + let mut keys = flashback_to_version_read_write( + &mut reader, + next_write_key, + &start_key, + &self.end_key, + self.version, + self.commit_ts, + )?; + if keys.is_empty() { + FlashbackToVersionState::Commit { + key_to_commit: start_key.clone(), + } + } else { + tls_collect_keyread_histogram_vec(tag, keys.len() as f64); + FlashbackToVersionState::FlashbackWrite { + // DO NOT pop the last key as the next key when it's the only key to prevent + // from making flashback fall into a dead loop. + next_write_key: if keys.len() > 1 { + keys.pop().unwrap() + } else { + keys.last().unwrap().clone() + }, + keys, + } + } + } + _ => unreachable!(), + }; + statistics.add(&reader.statistics); + Ok(ProcessResult::NextCommand { + cmd: Command::FlashbackToVersion(FlashbackToVersion { ctx: self.ctx, deadline: self.deadline, start_ts: self.start_ts, commit_ts: self.commit_ts, version: self.version, + start_key, end_key: self.end_key, - key_locks, - key_old_writes, - next_lock_key, - next_write_key, - }; - Ok(ProcessResult::NextCommand { - cmd: Command::FlashbackToVersion(next_cmd), - }) - } + state: next_state, + }), + }) } } diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index f5331087ac1..7d835462acf 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -5,6 +5,7 @@ #[macro_use] mod macros; pub(crate) mod acquire_pessimistic_lock; +pub(crate) mod acquire_pessimistic_lock_resumed; pub(crate) mod atomic_store; pub(crate) mod check_secondary_locks; pub(crate) mod check_txn_status; @@ -29,9 +30,11 @@ use std::{ iter, marker::PhantomData, ops::{Deref, DerefMut}, + sync::Arc, }; pub use acquire_pessimistic_lock::AcquirePessimisticLock; +pub use acquire_pessimistic_lock_resumed::AcquirePessimisticLockResumed; pub use atomic_store::RawAtomicStore; pub use 
check_secondary_locks::CheckSecondaryLocks; pub use check_txn_status::CheckTxnStatus; @@ -40,7 +43,10 @@ pub use commit::Commit; pub use compare_and_swap::RawCompareAndSwap; use concurrency_manager::{ConcurrencyManager, KeyHandleGuard}; pub use flashback_to_version::FlashbackToVersion; -pub use flashback_to_version_read_phase::FlashbackToVersionReadPhase; +pub use flashback_to_version_read_phase::{ + new_flashback_rollback_lock_cmd, new_flashback_write_cmd, FlashbackToVersionReadPhase, + FlashbackToVersionState, +}; use kvproto::kvrpcpb::*; pub use mvcc_by_key::MvccByKey; pub use mvcc_by_start_ts::MvccByStartTs; @@ -54,16 +60,19 @@ pub use rollback::Rollback; use tikv_util::deadline::Deadline; use tracker::RequestType; pub use txn_heart_beat::TxnHeartBeat; -use txn_types::{Key, OldValues, TimeStamp, Value, Write}; +use txn_types::{Key, TimeStamp, Value, Write}; use crate::storage::{ kv::WriteData, - lock_manager::{self, LockManager, WaitTimeout}, + lock_manager::{ + self, lock_wait_context::LockWaitContextSharedState, LockManager, LockWaitToken, + WaitTimeout, + }, metrics, mvcc::{Lock as MvccLock, MvccReader, ReleasedLock, SnapshotReader}, txn::{latch, ProcessResult, Result}, types::{ - MvccInfo, PessimisticLockParameters, PessimisticLockRes, PrewriteResult, + MvccInfo, PessimisticLockParameters, PessimisticLockResults, PrewriteResult, SecondaryLocksStatus, StorageCallbackType, TxnStatus, }, Result as StorageResult, Snapshot, Statistics, @@ -81,6 +90,7 @@ pub enum Command { Prewrite(Prewrite), PrewritePessimistic(PrewritePessimistic), AcquirePessimisticLock(AcquirePessimisticLock), + AcquirePessimisticLockResumed(AcquirePessimisticLockResumed), Commit(Commit), Cleanup(Cleanup), Rollback(Rollback), @@ -193,7 +203,7 @@ impl From for TypedCommand { } } -impl From for TypedCommand> { +impl From for TypedCommand> { fn from(mut req: PessimisticLockRequest) -> Self { let keys = req .take_mutations() @@ -207,6 +217,11 @@ impl From for TypedCommand false, + 
PessimisticLockWakeUpMode::WakeUpModeForceLock => true, + }; + AcquirePessimisticLock::new( keys, req.take_primary_lock(), @@ -217,9 +232,9 @@ impl From for TypedCommand for TypedCommand> { } } +impl From for TypedCommand<()> { + fn from(mut req: PrepareFlashbackToVersionRequest) -> Self { + new_flashback_rollback_lock_cmd( + req.get_start_ts().into(), + req.get_version().into(), + Key::from_raw(req.get_start_key()), + Key::from_raw(req.get_end_key()), + req.take_context(), + ) + } +} + impl From for TypedCommand<()> { fn from(mut req: FlashbackToVersionRequest) -> Self { - FlashbackToVersionReadPhase::new( + new_flashback_write_cmd( req.get_start_ts().into(), req.get_commit_ts().into(), req.get_version().into(), - Some(Key::from_raw(req.get_end_key())), - Some(Key::from_raw(req.get_start_key())), - Some(Key::from_raw(req.get_start_key())), + Key::from_raw(req.get_start_key()), + Key::from_raw(req.get_end_key()), req.take_context(), ) } @@ -389,7 +415,7 @@ pub struct WriteResult { pub to_be_write: WriteData, pub rows: usize, pub pr: ProcessResult, - pub lock_info: Option, + pub lock_info: Vec, pub released_locks: ReleasedLocks, pub lock_guards: Vec, pub response_policy: ResponsePolicy, @@ -398,22 +424,40 @@ pub struct WriteResult { pub struct WriteResultLockInfo { pub lock_digest: lock_manager::LockDigest, pub key: Key, + pub should_not_exist: bool, pub lock_info_pb: LockInfo, pub parameters: PessimisticLockParameters, + pub hash_for_latch: u64, + /// If a request is woken up after waiting for some lock, and it encounters + /// another lock again after resuming, this field will carry the token + /// that was already allocated before. + pub lock_wait_token: LockWaitToken, + /// For resumed pessimistic lock requests, this is needed to check if it's + /// canceled outside. 
+ pub req_states: Option>, } impl WriteResultLockInfo { - pub fn new(lock_info_pb: LockInfo, parameters: PessimisticLockParameters) -> Self { + pub fn new( + lock_info_pb: LockInfo, + parameters: PessimisticLockParameters, + key: Key, + should_not_exist: bool, + ) -> Self { let lock = lock_manager::LockDigest { ts: lock_info_pb.get_lock_version().into(), - hash: Key::from_raw(lock_info_pb.get_key()).gen_hash(), + hash: key.gen_hash(), }; - let key = Key::from_raw(lock_info_pb.get_key()); + let hash_for_latch = latch::Lock::hash(&key); Self { lock_digest: lock, key, + should_not_exist, lock_info_pb, parameters, + hash_for_latch, + lock_wait_token: LockWaitToken(None), + req_states: None, } } } @@ -567,6 +611,7 @@ impl Command { Command::Prewrite(t) => t, Command::PrewritePessimistic(t) => t, Command::AcquirePessimisticLock(t) => t, + Command::AcquirePessimisticLockResumed(t) => t, Command::Commit(t) => t, Command::Cleanup(t) => t, Command::Rollback(t) => t, @@ -592,6 +637,7 @@ impl Command { Command::Prewrite(t) => t, Command::PrewritePessimistic(t) => t, Command::AcquirePessimisticLock(t) => t, + Command::AcquirePessimisticLockResumed(t) => t, Command::Commit(t) => t, Command::Cleanup(t) => t, Command::Rollback(t) => t, @@ -635,6 +681,7 @@ impl Command { Command::Prewrite(t) => t.process_write(snapshot, context), Command::PrewritePessimistic(t) => t.process_write(snapshot, context), Command::AcquirePessimisticLock(t) => t.process_write(snapshot, context), + Command::AcquirePessimisticLockResumed(t) => t.process_write(snapshot, context), Command::Commit(t) => t.process_write(snapshot, context), Command::Cleanup(t) => t.process_write(snapshot, context), Command::Rollback(t) => t.process_write(snapshot, context), diff --git a/src/storage/txn/commands/pause.rs b/src/storage/txn/commands/pause.rs index 05bbb508bdc..3dc7d06d5ef 100644 --- a/src/storage/txn/commands/pause.rs +++ b/src/storage/txn/commands/pause.rs @@ -48,7 +48,7 @@ impl WriteCommand for Pause { 
to_be_write: WriteData::default(), rows: 0, pr: ProcessResult::Res, - lock_info: None, + lock_info: vec![], released_locks: ReleasedLocks::new(), lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/pessimistic_rollback.rs b/src/storage/txn/commands/pessimistic_rollback.rs index b575787208a..c35c362f19e 100644 --- a/src/storage/txn/commands/pessimistic_rollback.rs +++ b/src/storage/txn/commands/pessimistic_rollback.rs @@ -90,7 +90,7 @@ impl WriteCommand for PessimisticRollback { to_be_write: write_data, rows, pr: ProcessResult::MultiRes { results: vec![] }, - lock_info: None, + lock_info: vec![], released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index 2b0915a5fdc..cd24f54d13b 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -14,7 +14,10 @@ use kvproto::kvrpcpb::{ PrewriteRequestPessimisticAction::{self, *}, }; use tikv_kv::SnapshotExt; -use txn_types::{Key, Mutation, OldValue, OldValues, TimeStamp, TxnExtra, Write, WriteType}; +use txn_types::{ + insert_old_value_if_resolved, Key, Mutation, OldValue, OldValues, TimeStamp, TxnExtra, Write, + WriteType, +}; use super::ReaderWithStats; use crate::storage::{ @@ -508,6 +511,7 @@ impl Prewriter { need_old_value: extra_op == ExtraOp::ReadOldValue, is_retry_request: self.ctx.is_retry_request, assertion_level: self.assertion_level, + txn_source: self.ctx.get_txn_source(), }; let async_commit_pk = self @@ -568,11 +572,13 @@ impl Prewriter { if need_min_commit_ts && final_min_commit_ts < ts { final_min_commit_ts = ts; } - if old_value.resolved() { - let key = key.append_ts(txn.start_ts); - self.old_values - .insert(key, (old_value, Some(mutation_type))); - } + insert_old_value_if_resolved( + &mut self.old_values, + key, + txn.start_ts, + old_value, + Some(mutation_type), + ); } Ok((..)) => { // If it needs 
min_commit_ts but min_commit_ts is zero, the lock @@ -680,7 +686,7 @@ impl Prewriter { to_be_write, rows, pr, - lock_info: None, + lock_info: vec![], released_locks, lock_guards, response_policy: ResponsePolicy::OnApplied, @@ -699,7 +705,7 @@ impl Prewriter { to_be_write: WriteData::default(), rows, pr, - lock_info: None, + lock_info: vec![], released_locks: ReleasedLocks::new(), lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, @@ -848,7 +854,9 @@ fn handle_1pc_locks(txn: &mut MvccTxn, commit_ts: TimeStamp) -> ReleasedLocks { WriteType::from_lock_type(lock.lock_type).unwrap(), txn.start_ts, lock.short_value, - ); + ) + .set_last_change(lock.last_change_ts, lock.versions_to_last_change) + .set_txn_source(lock.txn_source); // Transactions committed with 1PC should be impossible to overwrite rollback // records. txn.put_write(key.clone(), commit_ts, write.as_ref().to_bytes()); @@ -1072,6 +1080,42 @@ mod tests { assert_eq!(d.internal_delete_skipped_count, 0); } + #[test] + fn test_prewrite_1pc_with_txn_source() { + use crate::storage::mvcc::tests::{must_get, must_get_commit_ts, must_unlocked}; + + let mut engine = TestEngineBuilder::new().build().unwrap(); + let cm = concurrency_manager::ConcurrencyManager::new(1.into()); + + let key = b"k"; + let value = b"v"; + let mutations = vec![Mutation::make_put(Key::from_raw(key), value.to_vec())]; + + let mut statistics = Statistics::default(); + let mut ctx = Context::default(); + ctx.set_txn_source(1); + let cmd = Prewrite::new( + mutations, + key.to_vec(), + TimeStamp::from(10), + 0, + false, + 0, + TimeStamp::default(), + TimeStamp::from(15), + None, + true, + AssertionLevel::Off, + ctx, + ); + prewrite_command(&mut engine, cm, &mut statistics, cmd).unwrap(); + + must_unlocked(&mut engine, key); + must_get(&mut engine, key, 12, value); + must_get_commit_ts(&mut engine, key, 10, 11); + must_get_txn_source(&mut engine, key, 11, 1); + } + #[test] fn test_prewrite_1pc() { use 
crate::storage::mvcc::tests::{must_get, must_get_commit_ts, must_unlocked}; @@ -2505,4 +2549,187 @@ mod tests { assert_eq!(res.min_commit_ts, 18.into(), "{:?}", res); must_unlocked(&mut engine, b"k2"); } + + #[test] + fn test_1pc_calculate_last_change_ts() { + use pd_client::FeatureGate; + + use crate::storage::txn::sched_pool::set_tls_feature_gate; + + let mut engine = TestEngineBuilder::new().build().unwrap(); + let cm = concurrency_manager::ConcurrencyManager::new(1.into()); + + let key = b"k"; + let value = b"v"; + must_prewrite_put(&mut engine, key, value, key, 10); + must_commit(&mut engine, key, 10, 20); + + // 1PC write a new LOCK + let mutations = vec![Mutation::make_lock(Key::from_raw(key))]; + let mut statistics = Statistics::default(); + let res = prewrite_with_cm( + &mut engine, + cm.clone(), + &mut statistics, + mutations.clone(), + key.to_vec(), + 30, + Some(40), + ) + .unwrap(); + must_unlocked(&mut engine, key); + let write = must_written(&mut engine, key, 30, res.one_pc_commit_ts, WriteType::Lock); + assert_eq!(write.last_change_ts, 20.into()); + assert_eq!(write.versions_to_last_change, 1); + + // 1PC write another LOCK + let res = prewrite_with_cm( + &mut engine, + cm.clone(), + &mut statistics, + mutations, + key.to_vec(), + 50, + Some(60), + ) + .unwrap(); + must_unlocked(&mut engine, key); + let write = must_written(&mut engine, key, 50, res.one_pc_commit_ts, WriteType::Lock); + assert_eq!(write.last_change_ts, 20.into()); + assert_eq!(write.versions_to_last_change, 2); + + // 1PC write a PUT + let mutations = vec![Mutation::make_put(Key::from_raw(key), b"v2".to_vec())]; + let res = prewrite_with_cm( + &mut engine, + cm.clone(), + &mut statistics, + mutations, + key.to_vec(), + 70, + Some(80), + ) + .unwrap(); + must_unlocked(&mut engine, key); + let write = must_written(&mut engine, key, 70, res.one_pc_commit_ts, WriteType::Put); + assert_eq!(write.last_change_ts, TimeStamp::zero()); + assert_eq!(write.versions_to_last_change, 0); + + // 
TiKV 6.4 should not have last_change_ts. + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.4.0").unwrap(); + set_tls_feature_gate(feature_gate); + let mutations = vec![Mutation::make_lock(Key::from_raw(key))]; + let res = prewrite_with_cm( + &mut engine, + cm, + &mut statistics, + mutations, + key.to_vec(), + 80, + Some(90), + ) + .unwrap(); + must_unlocked(&mut engine, key); + let write = must_written(&mut engine, key, 80, res.one_pc_commit_ts, WriteType::Lock); + assert_eq!(write.last_change_ts, TimeStamp::zero()); + assert_eq!(write.versions_to_last_change, 0); + } + + #[test] + fn test_pessimistic_1pc_calculate_last_change_ts() { + use pd_client::FeatureGate; + + use crate::storage::txn::sched_pool::set_tls_feature_gate; + + let mut engine = TestEngineBuilder::new().build().unwrap(); + let cm = concurrency_manager::ConcurrencyManager::new(1.into()); + + let key = b"k"; + let value = b"v"; + must_prewrite_put(&mut engine, key, value, key, 10); + must_commit(&mut engine, key, 10, 20); + + // Pessimistic 1PC write a new LOCK + must_acquire_pessimistic_lock(&mut engine, key, key, 30, 30); + let mutations = vec![(Mutation::make_lock(Key::from_raw(key)), DoPessimisticCheck)]; + let mut statistics = Statistics::default(); + let res = pessimistic_prewrite_with_cm( + &mut engine, + cm.clone(), + &mut statistics, + mutations.clone(), + key.to_vec(), + 30, + 30, + Some(40), + ) + .unwrap(); + must_unlocked(&mut engine, key); + let write = must_written(&mut engine, key, 30, res.one_pc_commit_ts, WriteType::Lock); + assert_eq!(write.last_change_ts, 20.into()); + assert_eq!(write.versions_to_last_change, 1); + + // Pessimistic 1PC write another LOCK + must_acquire_pessimistic_lock(&mut engine, key, key, 50, 50); + let res = pessimistic_prewrite_with_cm( + &mut engine, + cm.clone(), + &mut statistics, + mutations, + key.to_vec(), + 50, + 50, + Some(60), + ) + .unwrap(); + must_unlocked(&mut engine, key); + let write = must_written(&mut engine, key, 
50, res.one_pc_commit_ts, WriteType::Lock); + assert_eq!(write.last_change_ts, 20.into()); + assert_eq!(write.versions_to_last_change, 2); + + // Pessimistic 1PC write a PUT + must_acquire_pessimistic_lock(&mut engine, key, key, 70, 70); + let mutations = vec![( + Mutation::make_put(Key::from_raw(key), b"v2".to_vec()), + DoPessimisticCheck, + )]; + let res = pessimistic_prewrite_with_cm( + &mut engine, + cm.clone(), + &mut statistics, + mutations, + key.to_vec(), + 70, + 70, + Some(80), + ) + .unwrap(); + must_unlocked(&mut engine, key); + let write = must_written(&mut engine, key, 70, res.one_pc_commit_ts, WriteType::Put); + assert_eq!(write.last_change_ts, TimeStamp::zero()); + assert_eq!(write.versions_to_last_change, 0); + + // TiKV 6.4 should not have last_change_ts. + let feature_gate = FeatureGate::default(); + feature_gate.set_version("6.4.0").unwrap(); + set_tls_feature_gate(feature_gate); + must_acquire_pessimistic_lock(&mut engine, key, key, 80, 80); + let mutations = vec![(Mutation::make_lock(Key::from_raw(key)), DoPessimisticCheck)]; + let res = pessimistic_prewrite_with_cm( + &mut engine, + cm, + &mut statistics, + mutations, + key.to_vec(), + 80, + 80, + Some(90), + ) + .unwrap(); + must_unlocked(&mut engine, key); + let write = must_written(&mut engine, key, 80, res.one_pc_commit_ts, WriteType::Lock); + assert_eq!(write.last_change_ts, TimeStamp::zero()); + assert_eq!(write.versions_to_last_change, 0); + } } diff --git a/src/storage/txn/commands/resolve_lock.rs b/src/storage/txn/commands/resolve_lock.rs index b89e91593f9..463275b2e1f 100644 --- a/src/storage/txn/commands/resolve_lock.rs +++ b/src/storage/txn/commands/resolve_lock.rs @@ -145,7 +145,7 @@ impl WriteCommand for ResolveLock { to_be_write: write_data, rows, pr, - lock_info: None, + lock_info: vec![], released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/resolve_lock_lite.rs b/src/storage/txn/commands/resolve_lock_lite.rs 
index a31211c564e..d336d88a9ca 100644 --- a/src/storage/txn/commands/resolve_lock_lite.rs +++ b/src/storage/txn/commands/resolve_lock_lite.rs @@ -70,7 +70,7 @@ impl WriteCommand for ResolveLockLite { to_be_write: write_data, rows, pr: ProcessResult::Res, - lock_info: None, + lock_info: vec![], released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/rollback.rs b/src/storage/txn/commands/rollback.rs index 479f29cb276..52c05ae34c7 100644 --- a/src/storage/txn/commands/rollback.rs +++ b/src/storage/txn/commands/rollback.rs @@ -65,7 +65,7 @@ impl WriteCommand for Rollback { to_be_write: write_data, rows, pr: ProcessResult::Res, - lock_info: None, + lock_info: vec![], released_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/commands/txn_heart_beat.rs b/src/storage/txn/commands/txn_heart_beat.rs index 9bfbda5c748..f965b863494 100644 --- a/src/storage/txn/commands/txn_heart_beat.rs +++ b/src/storage/txn/commands/txn_heart_beat.rs @@ -90,7 +90,7 @@ impl WriteCommand for TxnHeartBeat { to_be_write: write_data, rows: 1, pr, - lock_info: None, + lock_info: vec![], released_locks: ReleasedLocks::new(), lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, diff --git a/src/storage/txn/latch.rs b/src/storage/txn/latch.rs index 86d16858bd3..a662d9bab79 100644 --- a/src/storage/txn/latch.rs +++ b/src/storage/txn/latch.rs @@ -79,6 +79,11 @@ impl Latch { self.waiting.push_back(Some((key_hash, cid))); } + /// Pushes the cid to the front of the queue. Be careful when using it. + fn push_preemptive(&mut self, key_hash: u64, cid: u64) { + self.waiting.push_front(Some((key_hash, cid))); + } + /// For some hot keys, the waiting list maybe very long, so we should shrink /// the waiting VecDeque after pop. 
 fn maybe_shrink(&mut self) { @@ -116,14 +121,7 @@ impl Lock { I: IntoIterator, { // prevent from deadlock, so we sort and deduplicate the index - let mut required_hashes: Vec = keys - .into_iter() - .map(|key| { - let mut s = DefaultHasher::new(); - key.hash(&mut s); - s.finish() - }) - .collect(); + let mut required_hashes: Vec = keys.into_iter().map(|key| Self::hash(key)).collect(); required_hashes.sort_unstable(); required_hashes.dedup(); Lock { @@ -132,12 +130,24 @@ impl Lock { } } + pub fn hash(key: &K) -> u64 { + let mut s = DefaultHasher::new(); + key.hash(&mut s); + s.finish() + } + /// Returns true if all the required latches have be acquired, false /// otherwise. pub fn acquired(&self) -> bool { self.required_hashes.len() == self.owned_count } + /// Force set the state of the `Lock` to be already-acquired. Be careful + /// when using it. + pub fn force_assume_acquired(&mut self) { + self.owned_count = self.required_hashes.len(); + } + pub fn is_write_lock(&self) -> bool { !self.required_hashes.is_empty() } @@ -197,19 +207,62 @@ impl Latches { /// Releases all latches owned by the `lock` of command with ID `who`, /// returns the wakeup list. /// + /// Optionally, this function can release only part of the given `Lock` while + /// keeping the remaining latches held, so that some of the latches can be + /// used in another command. This can be done by passing the cid of the + /// command who will use the kept latch slots later, and the `Lock` that + /// needs to be kept via the parameter `keep_latches_for_next_cmd`. Note + /// that the lock in it is assumed to be a subset of the parameter + /// `lock` which is going to be released. + /// /// Preconditions: the caller must ensure the command is at the front of the /// latches.
- pub fn release(&self, lock: &Lock, who: u64) -> Vec { + pub fn release( + &self, + lock: &Lock, + who: u64, + keep_latches_for_next_cmd: Option<(u64, &Lock)>, + ) -> Vec { + // Used to + let dummy_vec = vec![]; + let (keep_latches_for_cid, mut keep_latches_it) = match keep_latches_for_next_cmd { + Some((cid, lock)) => (Some(cid), lock.required_hashes.iter().peekable()), + None => (None, dummy_vec.iter().peekable()), + }; + + // `keep_latches_it` must be sorted and deduped since it's retrieved from a + // `Lock` object. + let mut wakeup_list: Vec = vec![]; for &key_hash in &lock.required_hashes[..lock.owned_count] { let mut latch = self.lock_latch(key_hash); let (v, front) = latch.pop_front(key_hash).unwrap(); assert_eq!(front, who); assert_eq!(v, key_hash); - if let Some(wakeup) = latch.get_first_req_by_hash(key_hash) { - wakeup_list.push(wakeup); + + let keep_for_next_cmd = if let Some(&&next_keep_hash) = keep_latches_it.peek() { + assert!(next_keep_hash >= key_hash); + if next_keep_hash == key_hash { + keep_latches_it.next(); + true + } else { + false + } + } else { + false + }; + + if !keep_for_next_cmd { + if let Some(wakeup) = latch.get_first_req_by_hash(key_hash) { + wakeup_list.push(wakeup); + } + } else { + latch.push_preemptive(key_hash, keep_latches_for_cid.unwrap()); } } + + assert!(keep_latches_it.next().is_none()); + wakeup_list } @@ -221,6 +274,8 @@ impl Latches { #[cfg(test)] mod tests { + use std::iter::once; + use super::*; #[test] @@ -243,7 +298,7 @@ mod tests { assert_eq!(acquired_b, false); // a release lock, and get wakeup list - let wakeup = latches.release(&lock_a, cid_a); + let wakeup = latches.release(&lock_a, cid_a, None); assert_eq!(wakeup[0], cid_b); // b acquire lock success @@ -278,7 +333,7 @@ mod tests { assert_eq!(acquired_c, false); // a release lock, and get wakeup list - let wakeup = latches.release(&lock_a, cid_a); + let wakeup = latches.release(&lock_a, cid_a, None); assert_eq!(wakeup[0], cid_c); // c acquire lock failed 
again, cause b occupied slot 4 @@ -286,7 +341,7 @@ mod tests { assert_eq!(acquired_c, false); // b release lock, and get wakeup list - let wakeup = latches.release(&lock_b, cid_b); + let wakeup = latches.release(&lock_b, cid_b, None); assert_eq!(wakeup[0], cid_c); // finally c acquire lock success @@ -327,7 +382,7 @@ mod tests { assert_eq!(acquired_d, false); // a release lock, and get wakeup list - let wakeup = latches.release(&lock_a, cid_a); + let wakeup = latches.release(&lock_a, cid_a, None); assert_eq!(wakeup[0], cid_c); // c acquire lock success @@ -335,11 +390,169 @@ mod tests { assert_eq!(acquired_c, true); // b release lock, and get wakeup list - let wakeup = latches.release(&lock_b, cid_b); + let wakeup = latches.release(&lock_b, cid_b, None); assert_eq!(wakeup[0], cid_d); // finally d acquire lock success acquired_d = latches.acquire(&mut lock_d, cid_d); assert_eq!(acquired_d, true); } + + fn check_latch_holder(latches: &Latches, key: &[u8], expected_holder_cid: Option) { + let hash = Lock::hash(&key); + let actual_holder = latches.lock_latch(hash).get_first_req_by_hash(hash); + assert_eq!(actual_holder, expected_holder_cid); + } + + fn is_latches_empty(latches: &Latches) -> bool { + for i in 0..(latches.size as u64) { + if !latches.lock_latch(i).waiting.iter().all(|x| x.is_none()) { + return false; + } + } + true + } + + fn test_partially_releasing_impl(size: usize) { + let latches = Latches::new(size); + + // Single key. + let key = b"k1"; + let mut lock = Lock::new(once(key)); + assert!(latches.acquire(&mut lock, 1)); + assert!(!is_latches_empty(&latches)); + let mut lock2 = Lock::new(once(key)); + let wakeup = latches.release(&lock, 1, Some((2, &lock2))); + assert!(wakeup.is_empty()); + check_latch_holder(&latches, key, Some(2)); + lock2.force_assume_acquired(); + let wakeup = latches.release(&lock2, 2, None); + assert!(wakeup.is_empty()); + assert!(is_latches_empty(&latches)); + + // Single key with queueing commands. 
+ let mut lock = Lock::new(once(key)); + let mut queueing_lock = Lock::new(once(key)); + assert!(latches.acquire(&mut lock, 3)); + assert!(!latches.acquire(&mut queueing_lock, 4)); + let mut lock2 = Lock::new(once(key)); + let wakeup = latches.release(&lock, 3, Some((5, &lock2))); + assert!(wakeup.is_empty()); + check_latch_holder(&latches, key, Some(5)); + lock2.force_assume_acquired(); + let wakeup = latches.release(&lock2, 5, None); + assert_eq!(wakeup, vec![4u64]); + assert!(latches.acquire(&mut queueing_lock, 4)); + let wakeup = latches.release(&queueing_lock, 4, None); + assert!(wakeup.is_empty()); + assert!(is_latches_empty(&latches)); + + // Multi keys, keep all. + let keys = vec![b"k1", b"k2", b"k3", b"k4"]; + let mut lock = Lock::new(keys.iter()); + assert!(latches.acquire(&mut lock, 11)); + let mut lock2 = Lock::new(keys.iter()); + let wakeup = latches.release(&lock, 11, Some((12, &lock2))); + assert!(wakeup.is_empty()); + for &key in &keys { + check_latch_holder(&latches, key, Some(12)); + } + assert!(!is_latches_empty(&latches)); + lock2.force_assume_acquired(); + let wakeup = latches.release(&lock2, 12, None); + assert!(wakeup.is_empty()); + assert!(is_latches_empty(&latches)); + + // Multi keys, keep all, with queueing command. + let mut lock = Lock::new(keys.iter()); + assert!(latches.acquire(&mut lock, 11)); + let mut queueing_locks: Vec<_> = keys.iter().map(|k| Lock::new(once(k))).collect(); + for (cid, lock) in (12..16).zip(queueing_locks.iter_mut()) { + assert!(!latches.acquire(lock, cid)); + } + let mut lock2 = Lock::new(keys.iter()); + let wakeup = latches.release(&lock, 11, Some((17, &lock2))); + assert!(wakeup.is_empty()); + for &key in &keys { + check_latch_holder(&latches, key, Some(17)); + } + assert!(!is_latches_empty(&latches)); + lock2.force_assume_acquired(); + let mut wakeup = latches.release(&lock2, 17, None); + wakeup.sort_unstable(); + // Wake up queueing commands. 
+ assert_eq!(wakeup, vec![12u64, 13, 14, 15]); + for (cid, mut lock) in (12..16).zip(queueing_locks) { + assert!(latches.acquire(&mut lock, cid)); + let wakeup = latches.release(&lock, cid, None); + assert!(wakeup.is_empty()); + } + assert!(is_latches_empty(&latches)); + + // 4 keys, keep 2 of them. + for (i1, &k1) in keys[0..3].iter().enumerate() { + for &k2 in keys[i1 + 1..4].iter() { + let mut lock = Lock::new(keys.iter()); + assert!(latches.acquire(&mut lock, 21)); + let mut lock2 = Lock::new(vec![k1, k2]); + let wakeup = latches.release(&lock, 21, Some((22, &lock2))); + assert!(wakeup.is_empty()); + check_latch_holder(&latches, k1, Some(22)); + check_latch_holder(&latches, k2, Some(22)); + lock2.force_assume_acquired(); + let wakeup = latches.release(&lock2, 22, None); + assert!(wakeup.is_empty()); + assert!(is_latches_empty(&latches)); + } + } + + // 4 keys keep 2 of them, with queueing commands. + for (i1, &k1) in keys[0..3].iter().enumerate() { + for (i2, &k2) in keys[i1 + 1..4].iter().enumerate() { + let mut lock = Lock::new(keys.iter()); + assert!(latches.acquire(&mut lock, 21)); + + let mut queueing_locks: Vec<_> = keys.iter().map(|k| Lock::new(once(k))).collect(); + for (cid, lock) in (22..26).zip(queueing_locks.iter_mut()) { + assert!(!latches.acquire(lock, cid)); + } + + let mut lock2 = Lock::new(vec![k1, k2]); + let mut wakeup = latches.release(&lock, 21, Some((27, &lock2))); + assert_eq!(wakeup.len(), 2); + + // The latch of k1 and k2 is preempted, and queueing locks on the other two keys + // will be woken up. 
+ let preempted_cids = vec![(i1 + 22) as u64, (i1 + 1 + i2 + 22) as u64]; + let expected_wakeup_cids: Vec<_> = (22..26u64) + .filter(|x| !preempted_cids.contains(x)) + .collect(); + wakeup.sort_unstable(); + assert_eq!(wakeup, expected_wakeup_cids); + + check_latch_holder(&latches, k1, Some(27)); + check_latch_holder(&latches, k2, Some(27)); + + lock2.force_assume_acquired(); + let mut wakeup = latches.release(&lock2, 27, None); + wakeup.sort_unstable(); + assert_eq!(wakeup, preempted_cids); + + for (cid, mut lock) in (22..26).zip(queueing_locks) { + assert!(latches.acquire(&mut lock, cid)); + let wakeup = latches.release(&lock, cid, None); + assert!(wakeup.is_empty()); + } + + assert!(is_latches_empty(&latches)); + } + } + } + + #[test] + fn test_partially_releasing() { + test_partially_releasing_impl(256); + test_partially_releasing_impl(4); + test_partially_releasing_impl(2); + } } diff --git a/src/storage/txn/mod.rs b/src/storage/txn/mod.rs index 5b71d60e3bf..f6884b0efb8 100644 --- a/src/storage/txn/mod.rs +++ b/src/storage/txn/mod.rs @@ -24,8 +24,8 @@ pub use self::{ cleanup::cleanup, commit::commit, flashback_to_version::{ - flashback_to_version, flashback_to_version_read_lock, flashback_to_version_read_write, - FLASHBACK_BATCH_SIZE, + flashback_to_version_read_lock, flashback_to_version_read_write, + flashback_to_version_write, rollback_locks, FLASHBACK_BATCH_SIZE, }, gc::gc, prewrite::{prewrite, CommitKind, TransactionKind, TransactionProperties}, @@ -40,11 +40,12 @@ pub use self::{ }; use crate::storage::{ mvcc::Error as MvccError, - types::{MvccInfo, PessimisticLockRes, PrewriteResult, SecondaryLocksStatus, TxnStatus}, + types::{MvccInfo, PessimisticLockResults, PrewriteResult, SecondaryLocksStatus, TxnStatus}, Error as StorageError, Result as StorageResult, }; /// Process result of a command. 
+#[allow(clippy::large_enum_variant)] #[derive(Debug)] pub enum ProcessResult { Res, @@ -73,7 +74,7 @@ pub enum ProcessResult { err: StorageError, }, PessimisticLockRes { - res: StorageResult, + res: StorageResult, }, SecondaryLocksStatus { status: SecondaryLocksStatus, diff --git a/src/storage/txn/sched_pool.rs b/src/storage/txn/sched_pool.rs index 78a891b650e..c7c69b5bbf4 100644 --- a/src/storage/txn/sched_pool.rs +++ b/src/storage/txn/sched_pool.rs @@ -9,6 +9,7 @@ use std::{ use collections::HashMap; use file_system::{set_io_type, IoType}; use kvproto::pdpb::QueryKind; +use pd_client::{Feature, FeatureGate}; use prometheus::local::*; use raftstore::store::WriteStats; use tikv_util::{ @@ -19,6 +20,7 @@ use tikv_util::{ use crate::storage::{ kv::{destroy_tls_engine, set_tls_engine, Engine, FlowStatsReporter, Statistics}, metrics::*, + test_util::latest_feature_gate, }; pub struct SchedLocalMetrics { @@ -28,13 +30,15 @@ pub struct SchedLocalMetrics { } thread_local! { - static TLS_SCHED_METRICS: RefCell = RefCell::new( + static TLS_SCHED_METRICS: RefCell = RefCell::new( SchedLocalMetrics { local_scan_details: HashMap::default(), command_keyread_histogram_vec: KV_COMMAND_KEYREAD_HISTOGRAM_VEC.local(), local_write_stats:WriteStats::default(), } ); + + static TLS_FEATURE_GATE: RefCell = RefCell::new(latest_feature_gate()); } #[derive(Clone)] @@ -58,6 +62,7 @@ impl SchedPool { engine: E, pool_size: usize, reporter: R, + feature_gate: FeatureGate, name_prefix: &str, ) -> Self { let engine = Arc::new(Mutex::new(engine)); @@ -75,6 +80,7 @@ impl SchedPool { .after_start(move || { set_tls_engine(engine.lock().unwrap().clone()); set_io_type(IoType::ForegroundWrite); + TLS_FEATURE_GATE.with(|c| *c.borrow_mut() = feature_gate.clone()); }) .before_stop(move || unsafe { // Safety: we ensure the `set_` and `destroy_` calls use the same engine type. 
@@ -134,3 +140,12 @@ pub fn tls_collect_keyread_histogram_vec(cmd: &str, count: f64) { .observe(count); }); } + +pub fn tls_can_enable(feature: Feature) -> bool { + TLS_FEATURE_GATE.with(|feature_gate| feature_gate.borrow().can_enable(feature)) +} + +#[cfg(test)] +pub fn set_tls_feature_gate(feature_gate: FeatureGate) { + TLS_FEATURE_GATE.with(|f| *f.borrow_mut() = feature_gate); +} diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index 4ccc868f30d..bfbb860e545 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -39,7 +39,7 @@ use collections::HashMap; use concurrency_manager::{ConcurrencyManager, KeyHandleGuard}; use crossbeam::utils::CachePadded; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; -use futures::compat::Future01CompatExt; +use futures::{compat::Future01CompatExt, StreamExt}; use kvproto::{ kvrpcpb::{self, CommandPri, Context, DiskFullOpt, ExtraOp}, pdpb::QueryKind, @@ -48,7 +48,8 @@ use parking_lot::{Mutex, MutexGuard, RwLockWriteGuard}; use pd_client::{Feature, FeatureGate}; use raftstore::store::TxnExt; use resource_metering::{FutureExt, ResourceTagFactory}; -use tikv_kv::{Modify, Snapshot, SnapshotExt, WriteData}; +use smallvec::{smallvec, SmallVec}; +use tikv_kv::{Modify, Snapshot, SnapshotExt, WriteData, WriteEvent}; use tikv_util::{ deadline::Deadline, quota_limiter::QuotaLimiter, time::Instant, timer::GLOBAL_TIMER_HANDLE, }; @@ -59,20 +60,22 @@ use crate::{ server::lock_manager::waiter_manager, storage::{ config::Config, + errors::SharedError, get_causal_ts, get_priority_tag, get_raw_key_guard, kv::{ - self, with_tls_engine, Engine, ExtCallback, FlowStatsReporter, Result as EngineResult, - SnapContext, Statistics, + self, with_tls_engine, Engine, FlowStatsReporter, Result as EngineResult, SnapContext, + Statistics, }, lock_manager::{ self, - lock_wait_context::LockWaitContext, + lock_wait_context::{LockWaitContext, PessimisticLockKeyCallback}, lock_waiting_queue::{DelayedNotifyAllFuture, 
LockWaitEntry, LockWaitQueues}, DiagnosticContext, LockManager, LockWaitToken, }, metrics::*, mvcc::{Error as MvccError, ErrorInner as MvccErrorInner, ReleasedLock}, txn::{ + commands, commands::{ Command, RawExt, ReleasedLocks, ResponsePolicy, WriteContext, WriteResult, WriteResultLockInfo, @@ -84,6 +87,7 @@ use crate::{ }, types::StorageCallback, DynamicConfigs, Error as StorageError, ErrorInner as StorageErrorInner, + PessimisticLockKeyResult, PessimisticLockResults, }, }; @@ -94,6 +98,9 @@ const TASKS_SLOTS_NUM: usize = 1 << 12; // 4096 slots. pub const DEFAULT_EXECUTION_DURATION_LIMIT: Duration = Duration::from_secs(24 * 60 * 60); const IN_MEMORY_PESSIMISTIC_LOCK: Feature = Feature::require(6, 0, 0); +pub const LAST_CHANGE_TS: Feature = Feature::require(6, 5, 0); + +type SVec = SmallVec<[T; 4]>; /// Task is a running command. pub(super) struct Task { @@ -133,8 +140,9 @@ struct TaskContext { task: Option, lock: Lock, - cb: Option, + cb: Option, pr: Option, + woken_up_resumable_lock_requests: SVec>, // The one who sets `owned` from false to true is allowed to take // `cb` and `pr` safely. owned: AtomicBool, @@ -148,9 +156,11 @@ struct TaskContext { } impl TaskContext { - fn new(task: Task, cb: StorageCallback) -> TaskContext { + fn new(task: Task, cb: SchedulerTaskCallback, prepared_latches: Option) -> TaskContext { let tag = task.cmd.tag(); - let lock = task.cmd.gen_lock(); + let lock = prepared_latches.unwrap_or_else(|| task.cmd.gen_lock()); + // The initial locks should be either all acquired or all not acquired. + assert!(lock.owned_count == 0 || lock.owned_count == lock.required_hashes.len()); // Write command should acquire write lock. 
if !task.cmd.readonly() && !lock.is_write_lock() { panic!("write lock is expected for command {}", task.cmd); @@ -166,6 +176,7 @@ impl TaskContext { lock, cb: Some(cb), pr: None, + woken_up_resumable_lock_requests: smallvec![], owned: AtomicBool::new(false), write_bytes, tag, @@ -192,6 +203,42 @@ impl TaskContext { } } +pub enum SchedulerTaskCallback { + NormalRequestCallback(StorageCallback), + LockKeyCallbacks(Vec), +} + +impl SchedulerTaskCallback { + fn execute(self, pr: ProcessResult) { + match self { + Self::NormalRequestCallback(cb) => cb.execute(pr), + Self::LockKeyCallbacks(cbs) => match pr { + ProcessResult::Failed { err } + | ProcessResult::PessimisticLockRes { res: Err(err) } => { + let err = SharedError::from(err); + for cb in cbs { + cb(Err(err.clone()), false); + } + } + ProcessResult::PessimisticLockRes { res: Ok(v) } => { + assert_eq!(v.0.len(), cbs.len()); + for (res, cb) in v.0.into_iter().zip(cbs) { + cb(Ok(res), false) + } + } + _ => unreachable!(), + }, + } + } + + fn unwrap_normal_request_callback(self) -> StorageCallback { + match self { + Self::NormalRequestCallback(cb) => cb, + _ => panic!(""), + } + } +} + struct SchedulerInner { // slot_id -> { cid -> `TaskContext` } in the slot. task_slots: Vec>>>, @@ -258,8 +305,13 @@ impl SchedulerInner { self.task_slots[id_index(cid)].lock() } - fn new_task_context(&self, task: Task, callback: StorageCallback) -> TaskContext { - let tctx = TaskContext::new(task, callback); + fn new_task_context( + &self, + task: Task, + callback: SchedulerTaskCallback, + prepared_latches: Option, + ) -> TaskContext { + let tctx = TaskContext::new(task, callback, prepared_latches); let running_write_bytes = self .running_write_bytes .fetch_add(tctx.write_bytes, Ordering::AcqRel) as i64; @@ -285,21 +337,31 @@ impl SchedulerInner { /// If the task is been processing, it should be owned. /// If it has been finished, then it is not in the slot. /// In both cases, cb should be None. Otherwise, cb should be some. 
- fn try_own_and_take_cb(&self, cid: u64) -> Option { + fn try_own_and_take_cb(&self, cid: u64) -> Option { self.get_task_slot(cid) .get_mut(&cid) .and_then(|tctx| if tctx.try_own() { tctx.cb.take() } else { None }) } - fn take_task_cb_and_pr(&self, cid: u64) -> (Option, Option) { + fn take_task_cb(&self, cid: u64) -> Option { self.get_task_slot(cid) .get_mut(&cid) - .map(|tctx| (tctx.cb.take(), tctx.pr.take())) - .unwrap_or((None, None)) + .map(|tctx| tctx.cb.take()) + .unwrap_or(None) } - fn store_pr(&self, cid: u64, pr: ProcessResult) { - self.get_task_slot(cid).get_mut(&cid).unwrap().pr = Some(pr); + fn store_lock_changes( + &self, + cid: u64, + woken_up_resumable_lock_requests: SVec>, + ) { + self.get_task_slot(cid) + .get_mut(&cid) + .map(move |tctx| { + assert!(tctx.woken_up_resumable_lock_requests.is_empty()); + tctx.woken_up_resumable_lock_requests = woken_up_resumable_lock_requests; + }) + .unwrap(); } fn too_busy(&self, region_id: u64) -> bool { @@ -390,12 +452,14 @@ impl Scheduler { engine.clone(), config.scheduler_worker_pool_size, reporter.clone(), + feature_gate.clone(), "sched-worker-pool", ), high_priority_pool: SchedPool::new( engine, std::cmp::max(1, config.scheduler_worker_pool_size / 2), reporter, + feature_gate.clone(), "sched-high-pri-pool", ), control_mutex: Arc::new(tokio::sync::Mutex::new(false)), @@ -440,21 +504,41 @@ impl Scheduler { }); return; } - self.schedule_command(cmd, callback); + self.schedule_command( + None, + cmd, + SchedulerTaskCallback::NormalRequestCallback(callback), + None, + ); } /// Releases all the latches held by a command. 
- fn release_lock(&self, lock: &Lock, cid: u64) { - let wakeup_list = self.inner.latches.release(lock, cid); + fn release_latches( + &self, + lock: Lock, + cid: u64, + keep_latches_for_next_cmd: Option<(u64, &Lock)>, + ) { + let wakeup_list = self + .inner + .latches + .release(&lock, cid, keep_latches_for_next_cmd); for wcid in wakeup_list { self.try_to_wake_up(wcid); } } - fn schedule_command(&self, cmd: Command, callback: StorageCallback) { - let cid = self.inner.gen_id(); + fn schedule_command( + &self, + specified_cid: Option, + cmd: Command, + callback: SchedulerTaskCallback, + prepared_latches: Option, + ) { + let cid = specified_cid.unwrap_or_else(|| self.inner.gen_id()); let tracker = get_tls_tracker_token(); debug!("received new command"; "cid" => cid, "cmd" => ?cmd, "tracker" => ?tracker); + let tag = cmd.tag(); let priority_tag = get_priority_tag(cmd.priority()); SCHED_STAGE_COUNTER_VEC.get(tag).new.inc(); @@ -465,7 +549,7 @@ impl Scheduler { let mut task_slot = self.inner.get_task_slot(cid); let tctx = task_slot.entry(cid).or_insert_with(|| { self.inner - .new_task_context(Task::new(cid, tracker, cmd), callback) + .new_task_context(Task::new(cid, tracker, cmd), callback, prepared_latches) }); if self.inner.latches.acquire(&mut tctx.lock, cid) { @@ -563,6 +647,28 @@ impl Scheduler { } } + fn schedule_awakened_pessimistic_locks( + &self, + specified_cid: Option, + prepared_latches: Option, + mut awakened_entries: SVec>, + ) { + let key_callbacks: Vec<_> = awakened_entries + .iter_mut() + .map(|i| i.key_cb.take().unwrap().into_inner()) + .collect(); + + let cmd = commands::AcquirePessimisticLockResumed::from_lock_wait_entries(awakened_entries); + + // TODO: Make flow control take effect on this thing. 
+ self.schedule_command( + specified_cid, + cmd.into(), + SchedulerTaskCallback::LockKeyCallbacks(key_callbacks), + prepared_latches, + ); + } + // pub for test pub fn get_sched_pool(&self, priority: CommandPri) -> &SchedPool { if priority == CommandPri::High { @@ -658,7 +764,10 @@ impl Scheduler { cb.execute(pr); } - self.release_lock(&tctx.lock, cid); + if !tctx.woken_up_resumable_lock_requests.is_empty() { + self.put_back_lock_wait_entries(tctx.woken_up_resumable_lock_requests); + } + self.release_latches(tctx.lock, cid, None); } /// Event handler for the success of read. @@ -672,12 +781,12 @@ impl Scheduler { let tctx = self.inner.dequeue_task_context(cid); if let ProcessResult::NextCommand { cmd } = pr { SCHED_STAGE_COUNTER_VEC.get(tag).next_cmd.inc(); - self.schedule_command(cmd, tctx.cb.unwrap()); + self.schedule_command(None, cmd, tctx.cb.unwrap(), None); } else { tctx.cb.unwrap().execute(pr); } - self.release_lock(&tctx.lock, cid); + self.release_latches(tctx.lock, cid, None); } /// Event handler for the success of write. @@ -711,19 +820,25 @@ impl Scheduler { drop(lock_guards); let tctx = self.inner.dequeue_task_context(cid); + let mut do_wake_up = !tctx.woken_up_resumable_lock_requests.is_empty(); // If pipelined pessimistic lock or async apply prewrite takes effect, it's not // guaranteed that the proposed or committed callback is surely invoked, which // takes and invokes `tctx.cb(tctx.pr)`. 
if let Some(cb) = tctx.cb { let pr = match result { Ok(()) => pr.or(tctx.pr).unwrap(), - Err(e) => ProcessResult::Failed { - err: StorageError::from(e), - }, + Err(e) => { + if !Self::is_undetermined_error(&e) { + do_wake_up = false; + } + ProcessResult::Failed { + err: StorageError::from(e), + } + } }; if let ProcessResult::NextCommand { cmd } = pr { SCHED_STAGE_COUNTER_VEC.get(tag).next_cmd.inc(); - self.schedule_command(cmd, cb); + self.schedule_command(None, cmd, cb, None); } else { cb.execute(pr); } @@ -731,7 +846,34 @@ impl Scheduler { assert!(pipelined || async_apply_prewrite); } - self.release_lock(&tctx.lock, cid); + // TODO: Update lock wait relationships after acquiring some locks. + + if do_wake_up { + let woken_up_resumable_lock_requests = tctx.woken_up_resumable_lock_requests; + let next_cid = self.inner.gen_id(); + let mut next_latches = + Self::gen_latches_for_lock_wait_entries(woken_up_resumable_lock_requests.iter()); + + self.release_latches(tctx.lock, cid, Some((next_cid, &next_latches))); + + next_latches.force_assume_acquired(); + self.schedule_awakened_pessimistic_locks( + Some(next_cid), + Some(next_latches), + woken_up_resumable_lock_requests, + ); + } else { + if !tctx.woken_up_resumable_lock_requests.is_empty() { + self.put_back_lock_wait_entries(tctx.woken_up_resumable_lock_requests); + } + self.release_latches(tctx.lock, cid, None); + } + } + + fn gen_latches_for_lock_wait_entries<'a>( + entries: impl IntoIterator>, + ) -> Lock { + Lock::new(entries.into_iter().map(|entry| &entry.key)) } /// Event handler for the request of waiting for lock @@ -784,14 +926,24 @@ impl Scheduler { wait_info, is_first_lock, wait_timeout, - Box::new(lock_req_ctx.get_callback_for_cancellation()), + lock_req_ctx.get_callback_for_cancellation(), diag_ctx, ); } - fn on_release_locks(&self, released_locks: ReleasedLocks) { - let mut legacy_wake_up_list = vec![]; - let mut delay_wake_up_futures = vec![]; + fn on_release_locks(&self, released_locks: ReleasedLocks) 
-> SVec> { + // This function is always called when holding the latch of the involved keys. + // So if we found the lock waiting queues are empty, there's no chance + // that other threads/commands adds new lock-wait entries to the keys + // concurrently. Therefore it's safe to skip waking up when we found the + // lock waiting queues are empty. + if self.inner.lock_wait_queues.is_empty() { + return smallvec![]; + } + + let mut legacy_wake_up_list = SVec::new(); + let mut delay_wake_up_futures = SVec::new(); + let mut resumable_wake_up_list = SVec::new(); let wake_up_delay_duration_ms = self .inner .pessimistic_lock_wake_up_delay_duration_ms @@ -809,21 +961,29 @@ impl Scheduler { None => return, }; - // TODO: Currently there are only legacy requests. When resumable requests are - // supported, do not put them to the `legacy_wake_up_list`. - legacy_wake_up_list.push((lock_wait_entry, released_lock)); + if lock_wait_entry.parameters.allow_lock_with_conflict { + resumable_wake_up_list.push(lock_wait_entry); + } else { + legacy_wake_up_list.push((lock_wait_entry, released_lock)); + } if let Some(f) = delay_wake_up_future { delay_wake_up_futures.push(f); } }); - self.wake_up_legacy_pessimistic_locks(legacy_wake_up_list, delay_wake_up_futures); + if !legacy_wake_up_list.is_empty() || !delay_wake_up_futures.is_empty() { + self.wake_up_legacy_pessimistic_locks(legacy_wake_up_list, delay_wake_up_futures); + } + + resumable_wake_up_list } fn wake_up_legacy_pessimistic_locks( &self, - legacy_wake_up_list: Vec<(Box, ReleasedLock)>, - delayed_wake_up_futures: Vec, + legacy_wake_up_list: impl IntoIterator, ReleasedLock)> + + Send + + 'static, + delayed_wake_up_futures: impl IntoIterator + Send + 'static, ) { let self1 = self.clone(); self.get_sched_pool(CommandPri::High) @@ -841,19 +1001,23 @@ impl Scheduler { reason: kvrpcpb::WriteConflictReason::PessimisticRetry, }, ))); - cb(Err(e.into())); + cb(Err(e.into()), false); } for f in delayed_wake_up_futures { + let self2 = 
self1.clone(); self1 .get_sched_pool(CommandPri::High) .pool .spawn(async move { let res = f.await; - // It returns only None currently. - // TODO: Handle not-none case when supporting resumable pessimistic lock - // requests. - assert!(res.is_none()); + if let Some(resumable_lock_wait_entry) = res { + self2.schedule_awakened_pessimistic_locks( + None, + None, + smallvec![resumable_lock_wait_entry], + ); + } }) .unwrap(); } @@ -861,9 +1025,15 @@ impl Scheduler { .unwrap(); } + fn is_undetermined_error(_e: &tikv_kv::Error) -> bool { + // TODO: If there's some cases that `engine.async_write` returns error but it's + // still possible that the data is successfully written, return true. + false + } + fn early_response( cid: u64, - cb: StorageCallback, + cb: SchedulerTaskCallback, pr: ProcessResult, tag: CommandKind, stage: CommandStageKind, @@ -957,7 +1127,6 @@ impl Scheduler { let write_bytes = task.cmd.write_bytes(); let tag = task.cmd.tag(); let cid = task.cid; - let priority = task.cmd.priority(); let tracker = task.tracker; let scheduler = self.clone(); let quota_limiter = self.inner.quota_limiter.clone(); @@ -1056,23 +1225,51 @@ impl Scheduler { let mut pr = Some(pr); - // TODO: Lock wait handling here. - if let Some(lock_info) = lock_info { - // Only handle lock waiting if `wait_timeout` is set. Otherwise it indicates - // that it's a lock-no-wait request and we need to report error - // immediately. - if lock_info.parameters.wait_timeout.is_some() { - assert_eq!(to_be_write.size(), 0); - pr = Some(ProcessResult::Res); - // allow_lock_with_conflict is not supported yet in this version. - assert!(!lock_info.parameters.allow_lock_with_conflict); - - scheduler.on_wait_for_lock(&ctx, cid, lock_info, tracker); + if !lock_info.is_empty() { + if tag == CommandKind::acquire_pessimistic_lock { + assert_eq!(lock_info.len(), 1); + let lock_info = lock_info.into_iter().next().unwrap(); + + // Only handle lock waiting if `wait_timeout` is set. 
Otherwise it indicates + // that it's a lock-no-wait request and we need to report error + // immediately. + if lock_info.parameters.wait_timeout.is_some() { + assert_eq!(to_be_write.size(), 0); + pr = Some(ProcessResult::Res); + + scheduler.on_wait_for_lock(&ctx, cid, lock_info, tracker); + } else { + // For requests with `allow_lock_with_conflict`, key errors are set key-wise. + // TODO: It's better to return this error from + // `commands::AcquirePessimisticLocks::process_write`. + if lock_info.parameters.allow_lock_with_conflict { + pr = Some(ProcessResult::PessimisticLockRes { + res: Err(StorageError::from(Error::from(MvccError::from( + MvccErrorInner::KeyIsLocked(lock_info.lock_info_pb), + )))), + }); + } + } + } else if tag == CommandKind::acquire_pessimistic_lock_resumed { + // Some requests meets lock again after waiting and resuming. + scheduler.on_wait_for_lock_after_resuming(cid, pr.as_mut().unwrap(), lock_info); + } else { + // WriteResult returning lock info is only expected to exist for pessimistic + // lock requests. 
+ unreachable!(); } } - if !released_locks.is_empty() { - scheduler.on_release_locks(released_locks); + let woken_up_resumable_entries = if !released_locks.is_empty() { + scheduler.on_release_locks(released_locks) + } else { + smallvec![] + }; + + if !woken_up_resumable_entries.is_empty() { + scheduler + .inner + .store_lock_changes(cid, woken_up_resumable_entries); } if to_be_write.modifies.is_empty() { @@ -1080,7 +1277,8 @@ impl Scheduler { return; } - if tag == CommandKind::acquire_pessimistic_lock + if (tag == CommandKind::acquire_pessimistic_lock + || tag == CommandKind::acquire_pessimistic_lock_resumed) && pessimistic_lock_mode == PessimisticLockMode::InMemory && self.try_write_in_memory_pessimistic_locks( txn_ext.as_deref(), @@ -1108,65 +1306,16 @@ impl Scheduler { to_be_write.deadline = Some(deadline); let sched = scheduler.clone(); - let sched_pool = scheduler.get_sched_pool(priority).pool.clone(); - - let (proposed_cb, committed_cb): (Option, Option) = - match response_policy { - ResponsePolicy::OnApplied => (None, None), - ResponsePolicy::OnCommitted => { - self.inner.store_pr(cid, pr.take().unwrap()); - let sched = scheduler.clone(); - // Currently, the only case that response is returned after finishing - // commit is async applying prewrites for async commit transactions. - // The committed callback is not guaranteed to be invoked. So store - // the `pr` to the tctx instead of capturing it to the closure. - let committed_cb = Box::new(move || { - fail_point!("before_async_apply_prewrite_finish", |_| {}); - let (cb, pr) = sched.inner.take_task_cb_and_pr(cid); - Self::early_response( - cid, - cb.unwrap(), - pr.unwrap(), - tag, - CommandStageKind::async_apply_prewrite, - ); - }); - is_async_apply_prewrite = true; - (None, Some(committed_cb)) - } - ResponsePolicy::OnProposed => { - if pipelined { - // The normal write process is respond to clients and release - // latches after async write finished. 
If pipelined pessimistic - // locking is enabled, the process becomes parallel and there are - // two msgs for one command: - // 1. Msg::PipelinedWrite: respond to clients - // 2. Msg::WriteFinished: deque context and release latches - // The proposed callback is not guaranteed to be invoked. So store - // the `pr` to the tctx instead of capturing it to the closure. - self.inner.store_pr(cid, pr.take().unwrap()); - let sched = scheduler.clone(); - // Currently, the only case that response is returned after finishing - // proposed phase is pipelined pessimistic lock. - // TODO: Unify the code structure of pipelined pessimistic lock and - // async apply prewrite. - let proposed_cb = Box::new(move || { - fail_point!("before_pipelined_write_finish", |_| {}); - let (cb, pr) = sched.inner.take_task_cb_and_pr(cid); - Self::early_response( - cid, - cb.unwrap(), - pr.unwrap(), - tag, - CommandStageKind::pipelined_write, - ); - }); - (Some(proposed_cb), None) - } else { - (None, None) - } - } - }; + + let mut subscribed = WriteEvent::BASIC_EVENT; + match response_policy { + ResponsePolicy::OnCommitted => { + subscribed |= WriteEvent::EVENT_COMMITTED; + is_async_apply_prewrite = true; + } + ResponsePolicy::OnProposed if pipelined => subscribed |= WriteEvent::EVENT_PROPOSED, + _ => (), + } if self.inner.flow_controller.enabled() { if self.inner.flow_controller.is_unlimited(region_id) { @@ -1242,15 +1391,11 @@ impl Scheduler { // transfer leader command must be later than this write command because this // write command has been sent to the raftstore. Then, we don't need to worry // this request will fail due to the voluntary leader transfer. - let _downgraded_guard = pessimistic_locks_guard.and_then(|guard| { + let downgraded_guard = pessimistic_locks_guard.and_then(|guard| { (!removed_pessimistic_locks.is_empty()).then(|| RwLockWriteGuard::downgrade(guard)) }); - - // The callback to receive async results of write prepare from the storage - // engine. 
- let engine_cb = Box::new(move |result: EngineResult<()>| { - let ok = result.is_ok(); - if ok && !removed_pessimistic_locks.is_empty() { + let on_applied = Box::new(move |res: &mut kv::Result<()>| { + if res.is_ok() && !removed_pessimistic_locks.is_empty() { // Removing pessimistic locks when it succeeds to apply. This should be done in // the apply thread, to make sure it happens before other admin commands are // executed. @@ -1267,15 +1412,69 @@ impl Scheduler { } } } + }); - sched_pool - .spawn(async move { + let mut res = unsafe { + with_tls_engine(|e: &mut E| { + e.async_write(&ctx, to_be_write, subscribed, Some(on_applied)) + }) + }; + drop(downgraded_guard); + + while let Some(ev) = res.next().await { + match ev { + WriteEvent::Committed => { + let early_return = (|| { + fail_point!("before_async_apply_prewrite_finish", |_| false); + true + })(); + if WriteEvent::subscribed_committed(subscribed) && early_return { + // Currently, the only case that response is returned after finishing + // commit is async applying prewrites for async commit transactions. + let cb = scheduler.inner.take_task_cb(cid); + Self::early_response( + cid, + cb.unwrap(), + pr.take().unwrap(), + tag, + CommandStageKind::async_apply_prewrite, + ); + } + } + WriteEvent::Proposed => { + let early_return = (|| { + fail_point!("before_pipelined_write_finish", |_| false); + true + })(); + if WriteEvent::subscribed_proposed(subscribed) && early_return { + // The normal write process is respond to clients and release + // latches after async write finished. If pipelined pessimistic + // locking is enabled, the process becomes parallel and there are + // two msgs for one command: + // 1. Msg::PipelinedWrite: respond to clients + // 2. Msg::WriteFinished: deque context and release latches + // Currently, the only case that response is returned after finishing + // proposed phase is pipelined pessimistic lock. 
+ // TODO: Unify the code structure of pipelined pessimistic lock and + // async apply prewrite. + let cb = scheduler.inner.take_task_cb(cid); + Self::early_response( + cid, + cb.unwrap(), + pr.take().unwrap(), + tag, + CommandStageKind::pipelined_write, + ); + } + } + WriteEvent::Finished(res) => { fail_point!("scheduler_async_write_finish"); + let ok = res.is_ok(); sched.on_write_finished( cid, pr, - result, + res, lock_guards, pipelined, is_async_apply_prewrite, @@ -1293,23 +1492,14 @@ impl Scheduler { sched.inner.flow_controller.unconsume(region_id, write_size); } } - }) - .unwrap() - }); - - // Safety: `self.sched_pool` ensures a TLS engine exists. - unsafe { - with_tls_engine(|engine: &mut E| { - if let Err(e) = - engine.async_write_ext(&ctx, to_be_write, engine_cb, proposed_cb, committed_cb) - { - SCHED_STAGE_COUNTER_VEC.get(tag).async_write_err.inc(); - - info!("engine async_write failed"; "cid" => cid, "err" => ?e); - scheduler.finish_with_err(cid, e); + return; } - }) + } } + // If the stream is closed before a `Finished` event is received, the + // write result is undetermined. In this case, we don't know whether the + // request is finished or not, so we should not release the latch, as + // doing so may break correctness. 
} /// Returns whether it succeeds to write pessimistic locks to the in-memory @@ -1397,24 +1587,120 @@ impl Scheduler { lock_info.key.clone(), self.inner.lock_wait_queues.clone(), lock_wait_token, - cb, + cb.unwrap_normal_request_callback(), lock_info.parameters.allow_lock_with_conflict, ); let first_batch_cb = ctx.get_callback_for_first_write_batch(); - task_ctx.cb = Some(first_batch_cb); + task_ctx.cb = Some(SchedulerTaskCallback::NormalRequestCallback(first_batch_cb)); drop(slot); + assert!(lock_info.req_states.is_none()); + let lock_wait_entry = Box::new(LockWaitEntry { key: lock_info.key, lock_hash: lock_info.lock_digest.hash, parameters: lock_info.parameters, + should_not_exist: lock_info.should_not_exist, lock_wait_token, + req_states: ctx.get_shared_states().clone(), legacy_wake_up_index: None, key_cb: Some(ctx.get_callback_for_blocked_key().into()), }); (ctx, lock_wait_entry, lock_info.lock_info_pb) } + + fn make_lock_waiting_after_resuming( + &self, + lock_info: WriteResultLockInfo, + cb: PessimisticLockKeyCallback, + ) -> Box { + Box::new(LockWaitEntry { + key: lock_info.key, + lock_hash: lock_info.lock_digest.hash, + parameters: lock_info.parameters, + should_not_exist: lock_info.should_not_exist, + lock_wait_token: lock_info.lock_wait_token, + // This must be called after an execution of AcquirePessimisticLockResumed, in which + // case there must be valid req_states. + req_states: lock_info.req_states.unwrap(), + legacy_wake_up_index: None, + key_cb: Some(cb.into()), + }) + } + + fn on_wait_for_lock_after_resuming( + &self, + cid: u64, + pr: &mut ProcessResult, + lock_info: Vec, + ) { + if lock_info.is_empty() { + return; + } + + // TODO: Update lock wait relationship. 
+ + let results = match pr { + ProcessResult::PessimisticLockRes { + res: Ok(PessimisticLockResults(res)), + } => res, + _ => unreachable!(), + }; + + let mut slot = self.inner.get_task_slot(cid); + let task_ctx = slot.get_mut(&cid).unwrap(); + let cbs = match task_ctx.cb { + Some(SchedulerTaskCallback::LockKeyCallbacks(ref mut v)) => v, + _ => unreachable!(), + }; + assert_eq!(results.len(), cbs.len()); + + let finished_len = results.len() - lock_info.len(); + + let original_results = std::mem::replace(results, Vec::with_capacity(finished_len)); + let original_cbs = std::mem::replace(cbs, Vec::with_capacity(finished_len)); + let mut lock_wait_entries = SmallVec::<[_; 10]>::with_capacity(lock_info.len()); + let mut lock_info_it = lock_info.into_iter(); + + for (result, cb) in original_results.into_iter().zip(original_cbs) { + if let PessimisticLockKeyResult::Waiting = &result { + let lock_info = lock_info_it.next().unwrap(); + let lock_info_pb = lock_info.lock_info_pb.clone(); + let entry = self.make_lock_waiting_after_resuming(lock_info, cb); + lock_wait_entries.push((entry, lock_info_pb)); + } else { + results.push(result); + cbs.push(cb); + } + } + + assert!(lock_info_it.next().is_none()); + assert_eq!(results.len(), cbs.len()); + + // Release the mutex in the latch slot. + drop(slot); + + // Add to the lock waiting queue. + // TODO: the request may be canceled from lock manager at this time. If so, it + // should not be added to the queue. + for (entry, lock_info_pb) in lock_wait_entries { + self.inner + .lock_wait_queues + .push_lock_wait(entry, lock_info_pb); + } + } + + fn put_back_lock_wait_entries(&self, entries: impl IntoIterator>) { + for entry in entries.into_iter() { + // TODO: Do not pass `default` as the lock info. Here we need another method + // `put_back_lock_wait`, which doesn't require updating lock info and + // additionally checks if the lock wait entry is already canceled. 
+ self.inner + .lock_wait_queues + .push_lock_wait(entry, Default::default()); + } + } } pub async fn get_raw_ext( @@ -1473,7 +1759,7 @@ mod tests { use kvproto::kvrpcpb::{BatchRollbackRequest, CheckTxnStatusRequest, Context}; use raftstore::store::{ReadStats, WriteStats}; use tikv_util::{config::ReadableSize, future::paired_future_callback}; - use txn_types::{Key, OldValues, TimeStamp}; + use txn_types::{Key, TimeStamp}; use super::*; use crate::storage::{ @@ -1556,7 +1842,7 @@ mod tests { Some(WaitTimeout::Default), false, TimeStamp::default(), - OldValues::default(), + false, false, false, Context::default(), @@ -1638,8 +1924,8 @@ mod tests { if id != 0 { assert!(latches.acquire(&mut lock, id)); } - let unlocked = latches.release(&lock, id); - if id as u64 == max_id { + let unlocked = latches.release(&lock, id, None); + if id == max_id { assert!(unlocked.is_empty()); } else { assert_eq!(unlocked, vec![id + 1]); @@ -1695,7 +1981,7 @@ mod tests { block_on(f).unwrap(), Err(StorageError(box StorageErrorInner::DeadlineExceeded)) )); - scheduler.release_lock(&lock, cid); + scheduler.release_latches(lock, cid, None); // A new request should not be blocked. let mut req = BatchRollbackRequest::default(); @@ -1930,7 +2216,7 @@ mod tests { // When releasing the lock, the queuing tasks should be all waken up without // stack overflow. - scheduler.release_lock(&lock, cid); + scheduler.release_latches(lock, cid, None); // A new request should not be blocked. let mut req = BatchRollbackRequest::default(); diff --git a/src/storage/txn/store.rs b/src/storage/txn/store.rs index b2f25cff640..46879d38e9f 100644 --- a/src/storage/txn/store.rs +++ b/src/storage/txn/store.rs @@ -1,7 +1,7 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. 
use kvproto::kvrpcpb::IsolationLevel; -use txn_types::{Key, KvPair, OldValue, TimeStamp, TsSet, Value, WriteRef}; +use txn_types::{Key, KvPair, Lock, OldValue, TimeStamp, TsSet, Value, WriteRef}; use super::{Error, ErrorInner, Result}; use crate::storage::{ @@ -159,6 +159,27 @@ impl TxnEntry { } => old_value, } } + + pub fn erasing_last_change_ts(&self) -> TxnEntry { + let mut e = self.clone(); + match &mut e { + TxnEntry::Prewrite { + lock: (_, value), .. + } => { + let l = Lock::parse(value).unwrap(); + *value = l.set_last_change(TimeStamp::zero(), 0).to_bytes(); + } + TxnEntry::Commit { + write: (_, value), .. + } => { + let mut w = WriteRef::parse(value).unwrap(); + w.last_change_ts = TimeStamp::zero(); + w.versions_to_last_change = 0; + *value = w.to_bytes(); + } + } + e + } } impl TxnEntry { @@ -705,6 +726,7 @@ mod tests { need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }, Mutation::make_put(Key::from_raw(key), key.to_vec()), &None, diff --git a/src/storage/types.rs b/src/storage/types.rs index c8303787a41..b4e91811843 100644 --- a/src/storage/types.rs +++ b/src/storage/types.rs @@ -8,6 +8,7 @@ use kvproto::kvrpcpb; use txn_types::{Key, Value}; use crate::storage::{ + errors::SharedError, lock_manager::WaitTimeout, mvcc::{Lock, LockType, TimeStamp, Write, WriteType}, txn::ProcessResult, @@ -52,6 +53,10 @@ impl MvccInfo { write_info.set_start_ts(write.start_ts.into_inner()); write_info.set_commit_ts(commit_ts.into_inner()); write_info.set_short_value(write.short_value.unwrap_or_default()); + if !write.last_change_ts.is_zero() { + write_info.set_last_change_ts(write.last_change_ts.into_inner()); + write_info.set_versions_to_last_change(write.versions_to_last_change); + } write_info }) .collect() @@ -70,6 +75,10 @@ impl MvccInfo { lock_info.set_start_ts(lock.ts.into_inner()); lock_info.set_primary(lock.primary); lock_info.set_short_value(lock.short_value.unwrap_or_default()); + if 
!lock.last_change_ts.is_zero() { + lock_info.set_last_change_ts(lock.last_change_ts.into_inner()); + lock_info.set_versions_to_last_change(lock.versions_to_last_change); + } mvcc_info.set_lock(lock_info); } let vv = extract_2pc_values(self.values); @@ -122,6 +131,7 @@ pub struct PrewriteResult { pub one_pc_commit_ts: TimeStamp, } +#[derive(Clone, Debug, PartialEq)] #[cfg_attr(test, derive(Default))] pub struct PessimisticLockParameters { pub pb_ctx: kvrpcpb::Context, @@ -134,6 +144,7 @@ pub struct PessimisticLockParameters { pub min_commit_ts: TimeStamp, pub check_existence: bool, pub is_first_lock: bool, + pub lock_only_if_exists: bool, /// Whether it's allowed for an pessimistic lock request to acquire the lock /// even there is write conflict (i.e. the latest version's `commit_ts` is @@ -147,42 +158,221 @@ pub struct PessimisticLockParameters { pub allow_lock_with_conflict: bool, } -#[derive(Clone, Debug, PartialEq)] -pub enum PessimisticLockRes { - /// The previous value is loaded while handling the `AcquirePessimisticLock` - /// command. The i-th item is the value of the i-th key in the - /// `AcquirePessimisticLock` command. - Values(Vec>), - /// Checked whether the key exists while handling the - /// `AcquirePessimisticLock` command. The i-th item is true if the i-th key - /// in the `AcquirePessimisticLock` command exists. - Existence(Vec), +/// Represents the result of pessimistic lock on a single key. +#[derive(Debug, Clone)] +pub enum PessimisticLockKeyResult { + /// The lock is acquired successfully, returning no additional information. Empty, + /// The lock is acquired successfully, and the previous value is read and + /// returned. + Value(Option), + /// The lock is acquired successfully, and also checked if the key exists + /// previously. + Existence(bool), + /// There is a write conflict, but the lock is acquired ignoring the write + /// conflict. + LockedWithConflict { + /// The previous value of the key. 
+ value: Option, + /// The `commit_ts` of the latest Write record found on this key. This + /// is also the actual `for_update_ts` written to the lock. + conflict_ts: TimeStamp, + }, + /// The key is already locked and lock-waiting is needed. + Waiting, + /// Failed to acquire the lock due to some error. + Failed(SharedError), } -impl PessimisticLockRes { - pub fn push(&mut self, value: Option) { +impl PessimisticLockKeyResult { + pub fn new_success( + need_value: bool, + need_check_existence: bool, + locked_with_conflict_ts: Option, + value: Option, + ) -> Self { + if let Some(conflict_ts) = locked_with_conflict_ts { + Self::LockedWithConflict { value, conflict_ts } + } else if need_value { + Self::Value(value) + } else if need_check_existence { + Self::Existence(value.is_some()) + } else { + Self::Empty + } + } + + pub fn unwrap_value(self) -> Option { match self { - PessimisticLockRes::Values(v) => v.push(value), - PessimisticLockRes::Existence(v) => v.push(value.is_some()), - _ => panic!("unexpected PessimisticLockRes"), + Self::Value(v) => v, + x => panic!( + "pessimistic lock key result expected to be a value, got {:?}", + x + ), } } - pub fn into_values_and_not_founds(self) -> (Vec, Vec) { + pub fn unwrap_existence(self) -> bool { match self { - PessimisticLockRes::Values(vals) => vals - .into_iter() - .map(|v| { - let is_not_found = v.is_none(); - (v.unwrap_or_default(), is_not_found) - }) - .unzip(), - PessimisticLockRes::Existence(mut vals) => { - vals.iter_mut().for_each(|x| *x = !*x); - (vec![], vals) + Self::Existence(e) => e, + x => panic!( + "pessimistic lock key result expected to be existence, got {:?}", + x + ), + } + } + + pub fn assert_empty(&self) { + assert!(matches!(self, Self::Empty)); + } + + #[cfg(test)] + pub fn assert_value(&self, expected_value: Option<&[u8]>) { + match self { + Self::Value(v) if v.as_ref().map(|v| v.as_slice()) == expected_value => (), + x => panic!( + "pessimistic lock key result not match, expected Value({:?}), got 
{:?}", + expected_value, x + ), + } + } + + #[cfg(test)] + pub fn assert_existence(&self, expected_existence: bool) { + match self { + Self::Existence(e) if *e == expected_existence => (), + x => panic!( + "pessimistic lock key result not match, expected Existence({:?}), got {:?}", + expected_existence, x + ), + } + } + + #[cfg(test)] + pub fn assert_locked_with_conflict( + &self, + expected_value: Option<&[u8]>, + expected_conflict_ts: impl Into, + ) { + let expected_conflict_ts = expected_conflict_ts.into(); + match self { + Self::LockedWithConflict { value, conflict_ts } + if value.as_ref().map(|v| v.as_slice()) == expected_value + && *conflict_ts == expected_conflict_ts => {} + x => panic!( + "pessimistic lock key result not match, expected LockedWithConflict{{ value: {:?}, conflict_ts: {} }}, got {:?}", + expected_value, expected_conflict_ts, x + ), + } + } + + #[cfg(test)] + pub fn assert_waiting(&self) { + assert!(matches!(self, Self::Waiting)); + } + + pub fn unwrap_err(&self) -> SharedError { + match self { + Self::Failed(e) => e.clone(), + x => panic!( + "pessimistic lock key result not match expected Failed, got {:?}", + x, + ), + } + } +} + +#[derive(Clone, Debug, Default)] +pub struct PessimisticLockResults(pub Vec); + +impl PessimisticLockResults { + pub fn new() -> Self { + Self(vec![]) + } + + pub fn with_capacity(capacity: usize) -> Self { + Self(Vec::with_capacity(capacity)) + } + + pub fn push(&mut self, key_res: PessimisticLockKeyResult) { + self.0.push(key_res); + } + + pub fn into_pb(self) -> (Vec, Option) { + let mut error = None; + let res = self + .0 + .into_iter() + .map(|res| { + let mut res_pb = kvrpcpb::PessimisticLockKeyResult::default(); + match res { + PessimisticLockKeyResult::Empty => { + res_pb.set_type(kvrpcpb::PessimisticLockKeyResultType::LockResultNormal) + } + PessimisticLockKeyResult::Value(v) => { + res_pb.set_type(kvrpcpb::PessimisticLockKeyResultType::LockResultNormal); + res_pb.set_existence(v.is_some()); + 
res_pb.set_value(v.unwrap_or_default()); + } + PessimisticLockKeyResult::Existence(e) => { + res_pb.set_type(kvrpcpb::PessimisticLockKeyResultType::LockResultNormal); + res_pb.set_existence(e); + } + PessimisticLockKeyResult::LockedWithConflict { value, conflict_ts } => { + res_pb.set_type( + kvrpcpb::PessimisticLockKeyResultType::LockResultLockedWithConflict, + ); + res_pb.set_existence(value.is_some()); + res_pb.set_value(value.unwrap_or_default()); + res_pb.set_locked_with_conflict_ts(conflict_ts.into_inner()); + } + PessimisticLockKeyResult::Waiting => unreachable!(), + PessimisticLockKeyResult::Failed(e) => { + if error.is_none() { + error = Some(e) + } + res_pb.set_type(kvrpcpb::PessimisticLockKeyResultType::LockResultFailed); + } + } + res_pb + }) + .collect(); + (res, error) + } + + pub fn into_legacy_values_and_not_founds(self) -> (Vec, Vec) { + if self.0.is_empty() { + return (vec![], vec![]); + } + + match &self.0[0] { + PessimisticLockKeyResult::Empty => { + self.0.into_iter().for_each(|res| res.assert_empty()); + (vec![], vec![]) + } + PessimisticLockKeyResult::Existence(_) => { + let not_founds = self.0.into_iter().map(|x| !x.unwrap_existence()).collect(); + (vec![], not_founds) + } + PessimisticLockKeyResult::Value(_) => { + let mut not_founds = Vec::with_capacity(self.0.len()); + let mut values = Vec::with_capacity(self.0.len()); + self.0.into_iter().for_each(|x| { + let v = x.unwrap_value(); + match v { + Some(v) => { + not_founds.push(false); + values.push(v); + } + None => { + not_founds.push(true); + values.push(vec![]); + } + } + }); + (values, not_founds) } - PessimisticLockRes::Empty => (vec![], vec![]), + _ => unreachable!(), } } } @@ -238,7 +428,7 @@ storage_callback! 
{ Locks(Vec) ProcessResult::Locks { locks } => locks, TxnStatus(TxnStatus) ProcessResult::TxnStatus { txn_status } => txn_status, Prewrite(PrewriteResult) ProcessResult::PrewriteResult { result } => result, - PessimisticLock(Result) ProcessResult::PessimisticLockRes { res } => res, + PessimisticLock(Result) ProcessResult::PessimisticLockRes { res } => res, SecondaryLocksStatus(SecondaryLocksStatus) ProcessResult::SecondaryLocksStatus { status } => status, RawCompareAndSwap((Option, bool)) ProcessResult::RawCompareAndSwapRes { previous_value, succeed } => (previous_value, succeed), } diff --git a/tests/Cargo.toml b/tests/Cargo.toml index 043e3ad2d23..ae6c6984487 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -41,11 +41,11 @@ path = "benches/deadlock_detector/mod.rs" [features] default = ["failpoints", "testexport", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine", "cloud-aws", "cloud-gcp", "cloud-azure"] -failpoints = ["fail/failpoints", "tikv/failpoints"] +failpoints = ["fail/failpoints", "tikv/failpoints", "pd_client/failpoints"] cloud-aws = ["external_storage_export/cloud-aws"] cloud-gcp = ["external_storage_export/cloud-gcp"] cloud-azure = ["external_storage_export/cloud-azure"] -testexport = ["raftstore/testexport", "tikv/testexport"] +testexport = ["raftstore/testexport", "tikv/testexport", "pd_client/testexport"] profiling = ["profiler/profiling"] test-engine-kv-rocksdb = [ @@ -83,7 +83,7 @@ file_system = { workspace = true } futures = "0.3" grpcio = { workspace = true } grpcio-health = { version = "0.10", default-features = false } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { workspace = true } libc = "0.2" log_wrappers = { workspace = true } more-asserts = "0.2" @@ -106,7 +106,7 @@ tidb_query_expr = { workspace = true } tikv = { workspace = true } tikv_util = { workspace = true } time = "0.1" -tipb = { git = "https://github.com/pingcap/tipb.git" } +tipb = { workspace = true } toml = "0.5" txn_types = { 
workspace = true } uuid = { version = "0.8.1", features = ["serde", "v4"] } @@ -146,6 +146,7 @@ test_sst_importer = { workspace = true } test_storage = { workspace = true } test_util = { workspace = true } tidb_query_datatype = { workspace = true } +tikv_kv = { workspace = true } tipb_helper = { workspace = true } tokio = { version = "1.5", features = ["rt-multi-thread"] } diff --git a/tests/benches/hierarchy/mvcc/mod.rs b/tests/benches/hierarchy/mvcc/mod.rs index f57946a11cf..7a79b984aaf 100644 --- a/tests/benches/hierarchy/mvcc/mod.rs +++ b/tests/benches/hierarchy/mvcc/mod.rs @@ -47,6 +47,7 @@ where need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; prewrite( &mut txn, @@ -59,7 +60,7 @@ where .unwrap(); } let write_data = WriteData::from_modifies(txn.into_modifies()); - let _ = engine.async_write(&ctx, write_data, Box::new(move |_| {})); + let _ = tikv_kv::write(engine, &ctx, write_data, None); let keys: Vec = kvs.iter().map(|(k, _)| Key::from_raw(k)).collect(); let snapshot = engine.snapshot(Default::default()).unwrap(); (snapshot, keys) @@ -97,6 +98,7 @@ fn mvcc_prewrite>(b: &mut Bencher<'_>, config: &B need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; prewrite( &mut txn, diff --git a/tests/benches/hierarchy/txn/mod.rs b/tests/benches/hierarchy/txn/mod.rs index 0bdb7ae8870..404266e2c6f 100644 --- a/tests/benches/hierarchy/txn/mod.rs +++ b/tests/benches/hierarchy/txn/mod.rs @@ -43,6 +43,7 @@ where need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; prewrite( &mut txn, @@ -90,6 +91,7 @@ fn txn_prewrite>(b: &mut Bencher<'_>, config: &Be need_old_value: false, is_retry_request: false, assertion_level: AssertionLevel::Off, + txn_source: 0, }; prewrite( &mut txn, diff --git a/tests/benches/misc/coprocessor/codec/mysql/json/mod.rs b/tests/benches/misc/coprocessor/codec/mysql/json/mod.rs index 
2fcc3915125..7796be6c53b 100644 --- a/tests/benches/misc/coprocessor/codec/mysql/json/mod.rs +++ b/tests/benches/misc/coprocessor/codec/mysql/json/mod.rs @@ -18,7 +18,7 @@ fn download_and_extract_file(url: &str) -> io::Result { .stderr(Stdio::null()) .spawn()?; let mut tar_child = Command::new("tar") - .args(&["xzf", "-", "--to-stdout"]) + .args(["xzf", "-", "--to-stdout"]) .stdin(Stdio::piped()) .stdout(Stdio::piped()) .stderr(Stdio::null()) diff --git a/tests/benches/misc/raftkv/mod.rs b/tests/benches/misc/raftkv/mod.rs index a949570ebe1..d567edd5add 100644 --- a/tests/benches/misc/raftkv/mod.rs +++ b/tests/benches/misc/raftkv/mod.rs @@ -6,6 +6,7 @@ use collections::HashSet; use crossbeam::channel::TrySendError; use engine_rocks::{RocksEngine, RocksSnapshot}; use engine_traits::{KvEngine, ALL_CFS, CF_DEFAULT}; +use futures::future::FutureExt; use kvproto::{ kvrpcpb::{Context, ExtraOp as TxnExtraOp}, metapb::Region, @@ -191,14 +192,15 @@ fn bench_async_snapshot(b: &mut test::Bencher) { ctx.set_region_epoch(region.get_region_epoch().clone()); ctx.set_peer(leader); b.iter(|| { - let on_finished: EngineCallback> = Box::new(move |results| { - let _ = test::black_box(results); - }); let snap_ctx = SnapContext { pb_ctx: &ctx, ..Default::default() }; - kv.async_snapshot(snap_ctx, on_finished).unwrap(); + let f = kv.async_snapshot(snap_ctx); + let res = f.map(|res| { + let _ = test::black_box(res); + }); + let _ = test::black_box(res); }); } @@ -224,17 +226,18 @@ fn bench_async_write(b: &mut test::Bencher) { ctx.set_region_epoch(region.get_region_epoch().clone()); ctx.set_peer(leader); b.iter(|| { - let on_finished: EngineCallback<()> = Box::new(|_| { - test::black_box(()); - }); - kv.async_write( + let f = tikv_kv::write( + &kv, &ctx, WriteData::from_modifies(vec![Modify::Delete( CF_DEFAULT, Key::from_encoded(b"fooo".to_vec()), )]), - on_finished, - ) - .unwrap(); + None, + ); + let res = f.map(|res| { + let _ = test::black_box(res); + }); + let _ = test::black_box(res); 
}); } diff --git a/tests/failpoints/cases/mod.rs b/tests/failpoints/cases/mod.rs index b291e86b88c..9c90211c073 100644 --- a/tests/failpoints/cases/mod.rs +++ b/tests/failpoints/cases/mod.rs @@ -20,6 +20,7 @@ mod test_memory_usage_limit; mod test_merge; mod test_metrics_overflow; mod test_pd_client; +mod test_pd_client_legacy; mod test_pending_peers; mod test_rawkv; mod test_read_execution_tracker; @@ -38,3 +39,4 @@ mod test_transaction; mod test_transfer_leader; mod test_ttl; mod test_unsafe_recovery; +mod test_witness; diff --git a/tests/failpoints/cases/test_coprocessor.rs b/tests/failpoints/cases/test_coprocessor.rs index 481e533a879..c515b8d66cb 100644 --- a/tests/failpoints/cases/test_coprocessor.rs +++ b/tests/failpoints/cases/test_coprocessor.rs @@ -138,10 +138,10 @@ fn test_snapshot_failed() { #[test] fn test_snapshot_failed_2() { let product = ProductTable::new(); - let (_, endpoint) = init_with_data(&product, &[]); + let (store, endpoint) = init_with_data(&product, &[]); let req = DagSelect::from(&product).build(); - fail::cfg("rockskv_async_snapshot_not_leader", "return()").unwrap(); + store.get_engine().trigger_not_leader(); let resp = handle_request(&endpoint, req); assert!(resp.get_region_error().has_not_leader()); diff --git a/tests/failpoints/cases/test_gc_metrics.rs b/tests/failpoints/cases/test_gc_metrics.rs index e698031f0bc..348b81aaea7 100644 --- a/tests/failpoints/cases/test_gc_metrics.rs +++ b/tests/failpoints/cases/test_gc_metrics.rs @@ -19,7 +19,6 @@ use raftstore::{ coprocessor::{ region_info_accessor::MockRegionInfoProvider, CoprocessorHost, RegionChangeEvent, }, - router::RaftStoreBlackHole, RegionInfoAccessor, }; use tikv::{ @@ -142,7 +141,6 @@ fn test_txn_gc_keys_handled() { feature_gate.set_version("5.0.0").unwrap(); let mut gc_worker = GcWorker::new( prefixed_engine.clone(), - RaftStoreBlackHole, tx, GcConfig::default(), feature_gate, @@ -286,7 +284,6 @@ fn test_raw_gc_keys_handled() { let feature_gate = FeatureGate::default(); let 
mut gc_worker = GcWorker::new( prefixed_engine, - RaftStoreBlackHole, tx, GcConfig::default(), feature_gate, diff --git a/tests/failpoints/cases/test_gc_worker.rs b/tests/failpoints/cases/test_gc_worker.rs index 5845d4d4eb7..3dbb7ffc7b0 100644 --- a/tests/failpoints/cases/test_gc_worker.rs +++ b/tests/failpoints/cases/test_gc_worker.rs @@ -6,7 +6,6 @@ use std::{ time::Duration, }; -use collections::HashMap; use engine_traits::{Peekable, WriteBatch}; use grpcio::{ChannelBuilder, Environment}; use keys::data_key; @@ -28,289 +27,6 @@ use tikv::{ use tikv_util::HandyRwLock; use txn_types::{Key, TimeStamp}; -// In theory, raft can propose conf change as long as there is no pending one. -// Replicas don't apply logs synchronously, so it's possible the old leader is -// removed before the new leader applies all logs. -// In the current implementation, the new leader rejects conf change until it -// applies all logs. It guarantees the correctness of green GC. This test is to -// prevent breaking it in the future. -#[test] -fn test_collect_lock_from_stale_leader() { - let mut cluster = new_server_cluster(0, 2); - cluster.pd_client.disable_default_operator(); - let region_id = cluster.run_conf_change(); - let leader = cluster.leader_of_region(region_id).unwrap(); - - // Create clients. - let env = Arc::new(Environment::new(1)); - let mut clients = HashMap::default(); - for node_id in cluster.get_node_ids() { - let channel = - ChannelBuilder::new(Arc::clone(&env)).connect(&cluster.sim.rl().get_addr(node_id)); - let client = TikvClient::new(channel); - clients.insert(node_id, client); - } - - // Start transferring the region to store 2. - let new_peer = new_peer(2, 1003); - cluster.pd_client.must_add_peer(region_id, new_peer.clone()); - - // Create the ctx of the first region. 
- let leader_client = clients.get(&leader.get_store_id()).unwrap(); - let mut ctx = Context::default(); - ctx.set_region_id(region_id); - ctx.set_peer(leader.clone()); - ctx.set_region_epoch(cluster.get_region_epoch(region_id)); - - // Pause the new peer applying so that when it becomes the leader, it doesn't - // apply all logs. - let new_leader_apply_fp = "on_handle_apply_1003"; - fail::cfg(new_leader_apply_fp, "pause").unwrap(); - must_kv_prewrite( - leader_client, - ctx, - vec![new_mutation(Op::Put, b"k1", b"v")], - b"k1".to_vec(), - 10, - ); - - // Leader election only considers the progress of appending logs, so it can - // succeed. - cluster.must_transfer_leader(region_id, new_peer.clone()); - // It shouldn't succeed in the current implementation. - cluster.pd_client.remove_peer(region_id, leader.clone()); - std::thread::sleep(Duration::from_secs(1)); - cluster.pd_client.must_have_peer(region_id, leader); - - // Must scan the lock from the old leader. - let locks = must_physical_scan_lock(leader_client, Context::default(), 100, b"", 10); - assert_eq!(locks.len(), 1); - assert_eq!(locks[0].get_key(), b"k1"); - - // Can't scan the lock from the new leader. 
- let leader_client = clients.get(&new_peer.get_store_id()).unwrap(); - must_register_lock_observer(leader_client, 100); - let locks = must_check_lock_observer(leader_client, 100, true); - assert!(locks.is_empty()); - let locks = must_physical_scan_lock(leader_client, Context::default(), 100, b"", 10); - assert!(locks.is_empty()); - - fail::remove(new_leader_apply_fp); -} - -#[test] -fn test_observer_send_error() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); - - let max_ts = 100; - must_register_lock_observer(&client, max_ts); - must_kv_prewrite( - &client, - ctx.clone(), - vec![new_mutation(Op::Put, b"k1", b"v")], - b"k1".to_vec(), - 10, - ); - assert_eq!(must_check_lock_observer(&client, max_ts, true).len(), 1); - - let observer_send_fp = "lock_observer_send_full"; - fail::cfg(observer_send_fp, "return").unwrap(); - must_kv_prewrite( - &client, - ctx, - vec![new_mutation(Op::Put, b"k2", b"v")], - b"k1".to_vec(), - 10, - ); - let resp = check_lock_observer(&client, max_ts); - assert!(resp.get_error().is_empty(), "{:?}", resp.get_error()); - // Should mark dirty if fails to send locks. - assert!(!resp.get_is_clean()); -} - -#[test] -fn test_notify_observer_after_apply() { - fn retry_until(mut f: impl FnMut() -> bool) { - for _ in 0..100 { - sleep_ms(10); - if f() { - break; - } - } - } - - let (mut cluster, client, ctx) = must_new_cluster_and_kv_client(); - cluster.pd_client.disable_default_operator(); - let post_apply_query_fp = "notify_lock_observer_query"; - let apply_plain_kvs_fp = "notify_lock_observer_snapshot"; - - // Write a lock and pause before notifying the lock observer. 
- let max_ts = 100; - must_register_lock_observer(&client, max_ts); - fail::cfg(post_apply_query_fp, "pause").unwrap(); - let key = b"k"; - let (client_clone, ctx_clone) = (client.clone(), ctx.clone()); - let handle = std::thread::spawn(move || { - must_kv_prewrite( - &client_clone, - ctx_clone, - vec![new_mutation(Op::Put, key, b"v")], - key.to_vec(), - 10, - ); - }); - // We can use physical_scan_lock to get the lock because we notify the lock - // observer after writing data to the rocskdb. - let mut locks = vec![]; - retry_until(|| { - assert!(must_check_lock_observer(&client, max_ts, true).is_empty()); - locks.extend(must_physical_scan_lock( - &client, - ctx.clone(), - max_ts, - b"", - 100, - )); - !locks.is_empty() - }); - assert_eq!(locks.len(), 1); - assert_eq!(locks[0].get_key(), key); - assert!(must_check_lock_observer(&client, max_ts, true).is_empty()); - fail::remove(post_apply_query_fp); - handle.join().unwrap(); - assert_eq!(must_check_lock_observer(&client, max_ts, true).len(), 1); - - // Add a new store. - let store_id = cluster.add_new_engine(); - let channel = ChannelBuilder::new(Arc::new(Environment::new(1))) - .connect(&cluster.sim.rl().get_addr(store_id)); - let replica_client = TikvClient::new(channel); - - // Add a new peer and pause before notifying the lock observer. - must_register_lock_observer(&replica_client, max_ts); - fail::cfg(apply_plain_kvs_fp, "pause").unwrap(); - cluster - .pd_client - .must_add_peer(ctx.get_region_id(), new_peer(store_id, store_id)); - // We can use physical_scan_lock to get the lock because we notify the lock - // observer after writing data to the rocksdb. 
- let mut locks = vec![]; - retry_until(|| { - assert!(must_check_lock_observer(&replica_client, max_ts, true).is_empty()); - locks.extend(must_physical_scan_lock( - &replica_client, - ctx.clone(), - max_ts, - b"", - 100, - )); - !locks.is_empty() - }); - assert_eq!(locks.len(), 1); - assert_eq!(locks[0].get_key(), key); - assert!(must_check_lock_observer(&replica_client, max_ts, true).is_empty()); - fail::remove(apply_plain_kvs_fp); - retry_until(|| !must_check_lock_observer(&replica_client, max_ts, true).is_empty()); - assert_eq!( - must_check_lock_observer(&replica_client, max_ts, true).len(), - 1 - ); -} - -// It may cause locks missing during green GC if the raftstore notifies the lock -// observer before writing data to the rocksdb: -// - Store-1 transfers a region to store-2 and store-2 is applying logs. -// - GC worker registers lock observer on store-2 after calling lock observer's -// callback and before finishing applying which means the lock won't be -// observed. -// - GC worker scans locks on each store independently. It's possible GC worker -// has scanned all locks on store-2 and hasn't scanned locks on store-1. -// - Store-2 applies all logs and removes the peer on store-1. -// - GC worker can't scan the lock on store-1 because the peer has been -// destroyed. -// - GC worker can't get the lock from store-2 because it can't observe the lock -// and has scanned it. -#[test] -fn test_collect_applying_locks() { - let mut cluster = new_server_cluster(0, 2); - cluster.pd_client.disable_default_operator(); - let region_id = cluster.run_conf_change(); - let leader = cluster.leader_of_region(region_id).unwrap(); - - // Create clients. 
- let env = Arc::new(Environment::new(1)); - let mut clients = HashMap::default(); - for node_id in cluster.get_node_ids() { - let channel = - ChannelBuilder::new(Arc::clone(&env)).connect(&cluster.sim.rl().get_addr(node_id)); - let client = TikvClient::new(channel); - clients.insert(node_id, client); - } - - // Start transferring the region to store 2. - let new_peer = new_peer(2, 1003); - cluster.pd_client.must_add_peer(region_id, new_peer.clone()); - - // Create the ctx of the first region. - let store_1_client = clients.get(&leader.get_store_id()).unwrap(); - let mut ctx = Context::default(); - ctx.set_region_id(region_id); - ctx.set_peer(leader.clone()); - ctx.set_region_epoch(cluster.get_region_epoch(region_id)); - - // Pause store-2 after calling observer callbacks and before writing to the - // rocksdb. - let new_leader_apply_fp = "post_handle_apply_1003"; - fail::cfg(new_leader_apply_fp, "pause").unwrap(); - - // Write 1 lock. - must_kv_prewrite( - store_1_client, - ctx, - vec![new_mutation(Op::Put, b"k1", b"v")], - b"k1".to_vec(), - 10, - ); - // Wait for store-2 applying. - std::thread::sleep(Duration::from_secs(3)); - - // Starting the process of green GC at safe point 20: - // 1. Register lock observers on all stores. - // 2. Scan locks physically on each store independently. - // 3. Get locks from all observers. - let safe_point = 20; - - // Register lock observers. - clients.iter().for_each(|(_, c)| { - must_register_lock_observer(c, safe_point); - }); - - // Finish scanning locks on store-2 and find nothing. - let store_2_client = clients.get(&new_peer.get_store_id()).unwrap(); - let locks = must_physical_scan_lock(store_2_client, Context::default(), safe_point, b"", 1); - assert!(locks.is_empty(), "{:?}", locks); - - // Transfer the region from store-1 to store-2. 
- fail::remove(new_leader_apply_fp); - cluster.must_transfer_leader(region_id, new_peer); - cluster.pd_client.must_remove_peer(region_id, leader); - // Wait for store-1 desroying the region. - std::thread::sleep(Duration::from_secs(3)); - - // Scan locks on store-1 after the region has been destroyed. - let locks = must_physical_scan_lock(store_1_client, Context::default(), safe_point, b"", 1); - assert!(locks.is_empty(), "{:?}", locks); - - // Check lock observers. - let mut locks = vec![]; - clients.iter().for_each(|(_, c)| { - locks.extend(must_check_lock_observer(c, safe_point, true)); - }); - // Must observe the applying lock even through we can't use scan to get it. - assert_eq!(locks.len(), 1); - assert_eq!(locks[0].get_key(), b"k1"); -} - // Test write CF's compaction filter can call `orphan_versions_handler` // correctly. #[test] diff --git a/tests/failpoints/cases/test_kv_service.rs b/tests/failpoints/cases/test_kv_service.rs index 1f7e35b5691..b81673af0e2 100644 --- a/tests/failpoints/cases/test_kv_service.rs +++ b/tests/failpoints/cases/test_kv_service.rs @@ -42,95 +42,6 @@ fn test_kv_scan_memory_lock() { fail::remove("raftkv_async_snapshot_err"); } -#[test] -fn test_scan_lock_push_async_commit() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); - - for (use_green_gc, ts) in &[(false, 100), (true, 200)] { - // We will perform a async commit transaction with start_ts == `ts`. - // First, try pushing max_ts to `ts + 10`. 
- if *use_green_gc { - let mut req = RegisterLockObserverRequest::default(); - req.set_max_ts(ts + 10); - let resp = client.register_lock_observer(&req).unwrap(); - assert_eq!(resp.error.len(), 0); - } else { - let mut req = ScanLockRequest::default(); - req.set_context(ctx.clone()); - req.set_max_version(ts + 10); - let resp = client.kv_scan_lock(&req).unwrap(); - assert!(!resp.has_region_error()); - assert!(!resp.has_error()); - } - - let k1 = b"k1"; - let v1 = b"v1"; - - // The following code simulates another case: prewrite is locking the memlock, - // and then another scan lock operation request meets the memlock. - - fail::cfg("before-set-lock-in-memory", "pause").unwrap(); - let client1 = client.clone(); - let ctx1 = ctx.clone(); - let handle1 = std::thread::spawn(move || { - let mut prewrite = PrewriteRequest::default(); - prewrite.set_context(ctx1); - let mut mutation = Mutation::default(); - mutation.set_op(Op::Put); - mutation.set_key(k1.to_vec()); - mutation.set_value(v1.to_vec()); - prewrite.set_mutations(vec![mutation].into()); - prewrite.set_primary_lock(k1.to_vec()); - prewrite.set_start_version(*ts); - prewrite.set_lock_ttl(1000); - prewrite.set_use_async_commit(true); - - let resp = client1.kv_prewrite(&prewrite).unwrap(); - assert!(!resp.has_region_error()); - assert_eq!(resp.get_errors(), &[]); - // min_commit_ts should be the last scan_lock ts + 1. 
- assert_eq!(resp.min_commit_ts, ts + 11); - }); - - // Wait for the prewrite acquires the memlock - std::thread::sleep(Duration::from_millis(200)); - - let client1 = client.clone(); - let ctx1 = ctx.clone(); - let handle2 = std::thread::spawn(move || { - if *use_green_gc { - let mut req = RegisterLockObserverRequest::default(); - req.set_max_ts(ts + 20); - let resp = client1.register_lock_observer(&req).unwrap(); - assert!(!resp.error.is_empty()); - } else { - let mut req = ScanLockRequest::default(); - req.set_context(ctx1); - req.set_max_version(ts + 20); - let resp = client1.kv_scan_lock(&req).unwrap(); - assert!(!resp.has_region_error()); - assert!(resp.has_error()); - } - }); - - fail::remove("before-set-lock-in-memory"); - - handle1.join().unwrap(); - handle2.join().unwrap(); - - // Commit the key so that next turn of test will work. - let mut req = CommitRequest::default(); - req.set_context(ctx.clone()); - req.set_start_version(*ts); - req.set_commit_version(ts + 11); - req.set_keys(vec![k1.to_vec()].into()); - let resp = client.kv_commit(&req).unwrap(); - assert!(!resp.has_region_error()); - assert!(!resp.has_error()); - assert_eq!(resp.commit_version, ts + 11); - } -} - #[test] fn test_snapshot_not_block_grpc() { let (cluster, leader, ctx) = must_new_cluster_mul(1); diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index c602fc6e4f7..fa4f6e9cb42 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -1346,6 +1346,8 @@ fn test_merge_with_concurrent_pessimistic_locking() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 15.into(), + versions_to_last_change: 3, }, )]) .unwrap(); @@ -1433,6 +1435,8 @@ fn test_merge_pessimistic_locks_with_concurrent_prewrite() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 15.into(), + versions_to_last_change: 3, }; txn_ext .pessimistic_locks @@ -1512,6 +1516,8 @@ fn 
test_retry_pending_prepare_merge_fail() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 15.into(), + versions_to_last_change: 3, }; txn_ext .pessimistic_locks @@ -1586,6 +1592,8 @@ fn test_merge_pessimistic_locks_propose_fail() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 15.into(), + versions_to_last_change: 3, }; txn_ext .pessimistic_locks diff --git a/tests/failpoints/cases/test_pd_client.rs b/tests/failpoints/cases/test_pd_client.rs index eb22ac29e45..635b199291b 100644 --- a/tests/failpoints/cases/test_pd_client.rs +++ b/tests/failpoints/cases/test_pd_client.rs @@ -1,4 +1,4 @@ -// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use std::{ sync::{mpsc, Arc}, @@ -6,34 +6,35 @@ use std::{ time::Duration, }; +use futures::executor::block_on; use grpcio::EnvBuilder; use kvproto::metapb::*; -use pd_client::{PdClient, RegionInfo, RegionStat, RpcClient}; +use pd_client::{PdClientV2, RegionInfo, RpcClientV2}; use security::{SecurityConfig, SecurityManager}; use test_pd::{mocker::*, util::*, Server as MockServer}; use tikv_util::config::ReadableDuration; fn new_test_server_and_client( update_interval: ReadableDuration, -) -> (MockServer, RpcClient) { +) -> (MockServer, RpcClientV2) { let server = MockServer::new(1); let eps = server.bind_addrs(); - let client = new_client_with_update_interval(eps, None, update_interval); + let client = new_client_v2_with_update_interval(eps, None, update_interval); (server, client) } macro_rules! 
request { ($client: ident => block_on($func: tt($($arg: expr),*))) => { (stringify!($func), { - let client = $client.clone(); + let mut client = $client.clone(); Box::new(move || { - let _ = futures::executor::block_on(client.$func($($arg),*)); + let _ = block_on(client.$func($($arg),*)); }) }) }; ($client: ident => $func: tt($($arg: expr),*)) => { (stringify!($func), { - let client = $client.clone(); + let mut client = $client.clone(); Box::new(move || { let _ = client.$func($($arg),*); }) @@ -44,13 +45,12 @@ macro_rules! request { #[test] fn test_pd_client_deadlock() { let (_server, client) = new_test_server_and_client(ReadableDuration::millis(100)); - let client = Arc::new(client); let pd_client_reconnect_fp = "pd_client_reconnect"; // It contains all interfaces of PdClient. let test_funcs: Vec<(_, Box)> = vec![ request!(client => reconnect()), - request!(client => get_cluster_id()), + request!(client => fetch_cluster_id()), request!(client => bootstrap_cluster(Store::default(), Region::default())), request!(client => is_cluster_bootstrapped()), request!(client => alloc_id()), @@ -60,19 +60,15 @@ fn test_pd_client_deadlock() { request!(client => get_cluster_config()), request!(client => get_region(b"")), request!(client => get_region_info(b"")), - request!(client => block_on(get_region_async(b""))), - request!(client => block_on(get_region_info_async(b""))), request!(client => block_on(get_region_by_id(0))), - request!(client => block_on(region_heartbeat(0, Region::default(), Peer::default(), RegionStat::default(), None))), request!(client => block_on(ask_split(Region::default()))), request!(client => block_on(ask_batch_split(Region::default(), 1))), request!(client => block_on(store_heartbeat(Default::default(), None, None))), request!(client => block_on(report_batch_split(vec![]))), request!(client => scatter_region(RegionInfo::new(Region::default(), None))), request!(client => block_on(get_gc_safe_point())), - request!(client => 
block_on(get_store_stats_async(0))), + request!(client => block_on(get_store_and_stats(0))), request!(client => get_operator(0)), - request!(client => block_on(get_tso())), request!(client => load_global_config(vec![])), ]; @@ -87,10 +83,6 @@ fn test_pd_client_deadlock() { func(); tx.send(()).unwrap(); }); - // Only allow to reconnect once for a func. - client.handle_reconnect(move || { - fail::cfg(pd_client_reconnect_fp, "return").unwrap(); - }); // Remove the fail point to let the PD client thread go on. fail::remove(pd_client_reconnect_fp); @@ -107,7 +99,7 @@ fn test_pd_client_deadlock() { #[test] fn test_load_global_config() { - let (mut _server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let (mut _server, mut client) = new_test_server_and_client(ReadableDuration::millis(100)); let res = futures::executor::block_on(async move { client .load_global_config( @@ -125,12 +117,11 @@ fn test_load_global_config() { #[test] fn test_watch_global_config_on_closed_server() { - let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); - let client = Arc::new(client); + let (mut server, mut client) = new_test_server_and_client(ReadableDuration::millis(100)); use futures::StreamExt; let j = std::thread::spawn(move || { - futures::executor::block_on(async move { - let mut r = client.watch_global_config().unwrap(); + let mut r = client.watch_global_config().unwrap(); + block_on(async move { let mut i: usize = 0; while let Some(r) = r.next().await { match r { @@ -181,11 +172,11 @@ fn test_slow_periodical_update() { // client1 updates leader frequently (100ms). cfg.update_interval = ReadableDuration(Duration::from_millis(100)); - let _client1 = RpcClient::new(&cfg, Some(env.clone()), mgr.clone()).unwrap(); + let _client1 = RpcClientV2::new(&cfg, Some(env.clone()), mgr.clone()).unwrap(); // client2 never updates leader in the test. 
cfg.update_interval = ReadableDuration(Duration::from_secs(100)); - let client2 = RpcClient::new(&cfg, Some(env), mgr).unwrap(); + let mut client2 = RpcClientV2::new(&cfg, Some(env), mgr).unwrap(); fail::cfg(pd_client_reconnect_fp, "pause").unwrap(); // Wait for the PD client thread blocking on the fail point. @@ -208,23 +199,95 @@ fn test_slow_periodical_update() { handle.join().unwrap(); } -// Reconnection will be speed limited. +fn run_on_bad_connection(client: &mut RpcClientV2, mut f: F) +where + F: FnMut(&mut RpcClientV2), +{ + let pd_client_force_reconnect_fp = "pd_client_force_reconnect"; + if !client.initialized() { + client.is_cluster_bootstrapped().unwrap(); + } + client.reset_to_lame_client(); + fail::cfg(pd_client_force_reconnect_fp, "return").unwrap(); + f(client); + fail::remove(pd_client_force_reconnect_fp); +} + #[test] -fn test_reconnect_limit() { - let pd_client_reconnect_fp = "pd_client_reconnect"; - let (_server, client) = new_test_server_and_client(ReadableDuration::secs(100)); +fn test_backoff() { + let pd_client_v2_timeout_fp = "pd_client_v2_request_timeout"; + fail::cfg(pd_client_v2_timeout_fp, "return(5ms)").unwrap(); + // Backoff larger than timeout, so that the second request following the failed + // one can hit backoff. + let pd_client_v2_backoff_fp = "pd_client_v2_backoff"; + fail::cfg(pd_client_v2_backoff_fp, "return(100ms)").unwrap(); + let (_server, mut client) = new_test_server_and_client(ReadableDuration::secs(100)); - // The GLOBAL_RECONNECT_INTERVAL is 0.1s so sleeps 0.2s here. - thread::sleep(Duration::from_millis(200)); + run_on_bad_connection(&mut client, |c| { + c.is_cluster_bootstrapped().unwrap_err(); + if c.is_cluster_bootstrapped().is_ok() { + // try again in case the first connect is too early. 
+ run_on_bad_connection(c, |c2| { + c2.is_cluster_bootstrapped().unwrap_err(); + c2.is_cluster_bootstrapped().unwrap_err(); + std::thread::sleep(Duration::from_millis(100)); + c2.is_cluster_bootstrapped().unwrap(); + }); + return; + } + std::thread::sleep(Duration::from_millis(100)); + c.is_cluster_bootstrapped().unwrap(); + }); + + fail::remove(pd_client_v2_timeout_fp); + fail::remove(pd_client_v2_backoff_fp); +} + +#[test] +fn test_retry() { + let pd_client_v2_timeout_fp = "pd_client_v2_request_timeout"; + fail::cfg(pd_client_v2_timeout_fp, "return(10ms)").unwrap(); + // Disable backoff. + let pd_client_v2_backoff_fp = "pd_client_v2_backoff"; + fail::cfg(pd_client_v2_backoff_fp, "return(0s)").unwrap(); + let (_server, mut client) = new_test_server_and_client(ReadableDuration::secs(100)); - // The first reconnection will succeed, and the last_update will not be updated. - fail::cfg(pd_client_reconnect_fp, "return").unwrap(); - client.reconnect().unwrap(); - // The subsequent reconnection will be cancelled. 
- for _ in 0..10 { - let ret = client.reconnect(); - assert!(format!("{:?}", ret.unwrap_err()).contains("cancel reconnection")); + fn test_retry_success(client: &mut RpcClientV2, mut f: F) + where + F: FnMut(&mut RpcClientV2) -> pd_client::Result, + R: std::fmt::Debug, + { + run_on_bad_connection(client, |c| { + f(c).unwrap_err(); + f(c).unwrap(); + }); } - fail::remove(pd_client_reconnect_fp); + test_retry_success(&mut client, |c| { + c.bootstrap_cluster(Store::default(), Region::default()) + }); + test_retry_success(&mut client, |c| c.is_cluster_bootstrapped()); + test_retry_success(&mut client, |c| c.alloc_id()); + test_retry_success(&mut client, |c| c.put_store(Store::default())); + test_retry_success(&mut client, |c| c.get_store(0)); + test_retry_success(&mut client, |c| c.get_all_stores(false)); + test_retry_success(&mut client, |c| c.get_cluster_config()); + test_retry_success(&mut client, |c| c.get_region_info(b"")); + test_retry_success(&mut client, |c| block_on(c.get_region_by_id(0))); + test_retry_success(&mut client, |c| { + block_on(c.ask_batch_split(Region::default(), 1)) + }); + test_retry_success(&mut client, |c| { + block_on(c.store_heartbeat(Default::default(), None, None)) + }); + test_retry_success(&mut client, |c| block_on(c.report_batch_split(vec![]))); + test_retry_success(&mut client, |c| { + c.scatter_region(RegionInfo::new(Region::default(), None)) + }); + test_retry_success(&mut client, |c| block_on(c.get_gc_safe_point())); + test_retry_success(&mut client, |c| c.get_operator(0)); + test_retry_success(&mut client, |c| block_on(c.load_global_config(vec![]))); + + fail::remove(pd_client_v2_timeout_fp); + fail::remove(pd_client_v2_backoff_fp); } diff --git a/tests/failpoints/cases/test_pd_client_legacy.rs b/tests/failpoints/cases/test_pd_client_legacy.rs new file mode 100644 index 00000000000..eb22ac29e45 --- /dev/null +++ b/tests/failpoints/cases/test_pd_client_legacy.rs @@ -0,0 +1,230 @@ +// Copyright 2020 TiKV Project Authors. 
Licensed under Apache-2.0. + +use std::{ + sync::{mpsc, Arc}, + thread, + time::Duration, +}; + +use grpcio::EnvBuilder; +use kvproto::metapb::*; +use pd_client::{PdClient, RegionInfo, RegionStat, RpcClient}; +use security::{SecurityConfig, SecurityManager}; +use test_pd::{mocker::*, util::*, Server as MockServer}; +use tikv_util::config::ReadableDuration; + +fn new_test_server_and_client( + update_interval: ReadableDuration, +) -> (MockServer, RpcClient) { + let server = MockServer::new(1); + let eps = server.bind_addrs(); + let client = new_client_with_update_interval(eps, None, update_interval); + (server, client) +} + +macro_rules! request { + ($client: ident => block_on($func: tt($($arg: expr),*))) => { + (stringify!($func), { + let client = $client.clone(); + Box::new(move || { + let _ = futures::executor::block_on(client.$func($($arg),*)); + }) + }) + }; + ($client: ident => $func: tt($($arg: expr),*)) => { + (stringify!($func), { + let client = $client.clone(); + Box::new(move || { + let _ = client.$func($($arg),*); + }) + }) + }; +} + +#[test] +fn test_pd_client_deadlock() { + let (_server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let client = Arc::new(client); + let pd_client_reconnect_fp = "pd_client_reconnect"; + + // It contains all interfaces of PdClient. 
+ let test_funcs: Vec<(_, Box)> = vec![ + request!(client => reconnect()), + request!(client => get_cluster_id()), + request!(client => bootstrap_cluster(Store::default(), Region::default())), + request!(client => is_cluster_bootstrapped()), + request!(client => alloc_id()), + request!(client => put_store(Store::default())), + request!(client => get_store(0)), + request!(client => get_all_stores(false)), + request!(client => get_cluster_config()), + request!(client => get_region(b"")), + request!(client => get_region_info(b"")), + request!(client => block_on(get_region_async(b""))), + request!(client => block_on(get_region_info_async(b""))), + request!(client => block_on(get_region_by_id(0))), + request!(client => block_on(region_heartbeat(0, Region::default(), Peer::default(), RegionStat::default(), None))), + request!(client => block_on(ask_split(Region::default()))), + request!(client => block_on(ask_batch_split(Region::default(), 1))), + request!(client => block_on(store_heartbeat(Default::default(), None, None))), + request!(client => block_on(report_batch_split(vec![]))), + request!(client => scatter_region(RegionInfo::new(Region::default(), None))), + request!(client => block_on(get_gc_safe_point())), + request!(client => block_on(get_store_stats_async(0))), + request!(client => get_operator(0)), + request!(client => block_on(get_tso())), + request!(client => load_global_config(vec![])), + ]; + + for (name, func) in test_funcs { + fail::cfg(pd_client_reconnect_fp, "pause").unwrap(); + // Wait for the PD client thread blocking on the fail point. + // The GLOBAL_RECONNECT_INTERVAL is 0.1s so sleeps 0.2s here. + thread::sleep(Duration::from_millis(200)); + + let (tx, rx) = mpsc::channel(); + let handle = thread::spawn(move || { + func(); + tx.send(()).unwrap(); + }); + // Only allow to reconnect once for a func. 
+ client.handle_reconnect(move || { + fail::cfg(pd_client_reconnect_fp, "return").unwrap(); + }); + // Remove the fail point to let the PD client thread go on. + fail::remove(pd_client_reconnect_fp); + + let timeout = Duration::from_millis(500); + if rx.recv_timeout(timeout).is_err() { + panic!("PdClient::{}() hangs", name); + } + handle.join().unwrap(); + } + + drop(client); + fail::remove(pd_client_reconnect_fp); +} + +#[test] +fn test_load_global_config() { + let (mut _server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let res = futures::executor::block_on(async move { + client + .load_global_config( + ["abc", "123", "xyz"] + .iter() + .map(|x| x.to_string()) + .collect::>(), + ) + .await + }); + for (k, v) in res.unwrap() { + assert_eq!(k, format!("/global/config/{}", v)) + } +} + +#[test] +fn test_watch_global_config_on_closed_server() { + let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let client = Arc::new(client); + use futures::StreamExt; + let j = std::thread::spawn(move || { + futures::executor::block_on(async move { + let mut r = client.watch_global_config().unwrap(); + let mut i: usize = 0; + while let Some(r) = r.next().await { + match r { + Ok(res) => { + let change = &res.get_changes()[0]; + assert_eq!( + change + .get_name() + .split('/') + .collect::>() + .last() + .unwrap() + .to_owned(), + format!("{:?}", i) + ); + assert_eq!(change.get_value().to_owned(), format!("{:?}", i)); + i += 1; + } + Err(e) => { + if let grpcio::Error::RpcFailure(e) = e { + // 14-UNAVAILABLE + assert_eq!(e.code(), grpcio::RpcStatusCode::from(14)); + break; + } else { + panic!("other error occur {:?}", e) + } + } + } + } + }); + }); + thread::sleep(Duration::from_millis(200)); + server.stop(); + j.join().unwrap(); +} + +// Updating pd leader may be slow, we need to make sure it does not block other +// RPC in the same gRPC Environment. 
+#[test] +fn test_slow_periodical_update() { + let pd_client_reconnect_fp = "pd_client_reconnect"; + let server = MockServer::new(1); + let eps = server.bind_addrs(); + + let mut cfg = new_config(eps); + let env = Arc::new(EnvBuilder::new().cq_count(1).build()); + let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); + + // client1 updates leader frequently (100ms). + cfg.update_interval = ReadableDuration(Duration::from_millis(100)); + let _client1 = RpcClient::new(&cfg, Some(env.clone()), mgr.clone()).unwrap(); + + // client2 never updates leader in the test. + cfg.update_interval = ReadableDuration(Duration::from_secs(100)); + let client2 = RpcClient::new(&cfg, Some(env), mgr).unwrap(); + + fail::cfg(pd_client_reconnect_fp, "pause").unwrap(); + // Wait for the PD client thread blocking on the fail point. + // The GLOBAL_RECONNECT_INTERVAL is 0.1s so sleeps 0.2s here. + thread::sleep(Duration::from_millis(200)); + + let (tx, rx) = mpsc::channel(); + let handle = thread::spawn(move || { + client2.alloc_id().unwrap(); + tx.send(()).unwrap(); + }); + + let timeout = Duration::from_millis(500); + if rx.recv_timeout(timeout).is_err() { + panic!("pd client2 is blocked"); + } + + // Clean up the fail point. + fail::remove(pd_client_reconnect_fp); + handle.join().unwrap(); +} + +// Reconnection will be speed limited. +#[test] +fn test_reconnect_limit() { + let pd_client_reconnect_fp = "pd_client_reconnect"; + let (_server, client) = new_test_server_and_client(ReadableDuration::secs(100)); + + // The GLOBAL_RECONNECT_INTERVAL is 0.1s so sleeps 0.2s here. + thread::sleep(Duration::from_millis(200)); + + // The first reconnection will succeed, and the last_update will not be updated. + fail::cfg(pd_client_reconnect_fp, "return").unwrap(); + client.reconnect().unwrap(); + // The subsequent reconnection will be cancelled. 
+ for _ in 0..10 { + let ret = client.reconnect(); + assert!(format!("{:?}", ret.unwrap_err()).contains("cancel reconnection")); + } + + fail::remove(pd_client_reconnect_fp); +} diff --git a/tests/failpoints/cases/test_snap.rs b/tests/failpoints/cases/test_snap.rs index 93acfffc258..dde25bff636 100644 --- a/tests/failpoints/cases/test_snap.rs +++ b/tests/failpoints/cases/test_snap.rs @@ -163,7 +163,7 @@ fn assert_snapshot(snap_dir: &str, region_id: u64, exist: bool) { let region_id = format!("{}", region_id); let timer = Instant::now(); loop { - for p in fs::read_dir(&snap_dir).unwrap() { + for p in fs::read_dir(snap_dir).unwrap() { let name = p.unwrap().file_name().into_string().unwrap(); let mut parts = name.split('_'); parts.next(); @@ -354,12 +354,12 @@ fn test_shutdown_when_snap_gc() { pd_client.must_add_peer(r1, new_learner_peer(2, 2)); // Snapshot directory on store 2 shouldn't be empty. - let snap_dir = cluster.get_snap_dir(2); + let snap_dir = &cluster.get_snap_dir(2); for i in 0..=100 { if i == 100 { panic!("store 2 snap dir must not be empty"); } - let dir = fs::read_dir(&snap_dir).unwrap(); + let dir = fs::read_dir(snap_dir).unwrap(); if dir.count() > 0 { break; } @@ -377,7 +377,7 @@ fn test_shutdown_when_snap_gc() { cluster.stop_node(2); let snap_dir = cluster.get_snap_dir(2); - let dir = fs::read_dir(&snap_dir).unwrap(); + let dir = fs::read_dir(snap_dir).unwrap(); if dir.count() == 0 { panic!("store 2 snap dir must not be empty"); } @@ -591,7 +591,7 @@ fn test_snapshot_gc_after_failed() { let mut snap_file_path = PathBuf::from(&snap_dir); snap_file_path.push(&f); let snap_file_path = snap_file_path.as_path(); - let mut file = match File::create(&snap_file_path) { + let mut file = match File::create(snap_file_path) { Err(why) => panic!("couldn't create {:?}: {}", snap_file_path, why), Ok(file) => file, }; diff --git a/tests/failpoints/cases/test_split_region.rs b/tests/failpoints/cases/test_split_region.rs index 9ed57b94091..416116c833b 100644 --- 
a/tests/failpoints/cases/test_split_region.rs +++ b/tests/failpoints/cases/test_split_region.rs @@ -943,6 +943,8 @@ fn test_split_pessimistic_locks_with_concurrent_prewrite() { ttl: 3000, for_update_ts: (commit_ts + 10).into(), min_commit_ts: (commit_ts + 10).into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; let lock_c = PessimisticLock { primary: b"c".to_vec().into_boxed_slice(), @@ -950,6 +952,8 @@ fn test_split_pessimistic_locks_with_concurrent_prewrite() { ttl: 3000, for_update_ts: (commit_ts + 10).into(), min_commit_ts: (commit_ts + 10).into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; { let mut locks = txn_ext.pessimistic_locks.write(); diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index e0f68b721b5..40c78dfabde 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -4,7 +4,7 @@ use std::{ sync::{ atomic::{AtomicBool, Ordering}, mpsc::{channel, RecvTimeoutError}, - Arc, + Arc, Mutex, }, thread, time::Duration, @@ -42,7 +42,7 @@ use tikv::{ }, }; use tikv_util::{future::paired_future_callback, worker::dummy_scheduler, HandyRwLock}; -use txn_types::{Key, Mutation, OldValues, TimeStamp}; +use txn_types::{Key, Mutation, TimeStamp}; #[test] fn test_scheduler_leader_change_twice() { @@ -389,7 +389,7 @@ fn test_pipelined_pessimistic_lock() { new_acquire_pessimistic_lock_command(vec![(key.clone(), false)], 10, 10, true, false), expect_pessimistic_lock_res_callback( tx.clone(), - PessimisticLockRes::Values(vec![None]), + PessimisticLockResults(vec![PessimisticLockKeyResult::Value(None)]), ), ) .unwrap(); @@ -452,7 +452,9 @@ fn test_pipelined_pessimistic_lock() { ), expect_pessimistic_lock_res_callback( tx.clone(), - PessimisticLockRes::Values(vec![Some(val.clone())]), + PessimisticLockResults(vec![PessimisticLockKeyResult::Value(Some( + val.clone(), + ))]), ), ) .unwrap(); @@ -475,7 +477,7 @@ fn test_pipelined_pessimistic_lock() { 
new_acquire_pessimistic_lock_command(vec![(key.clone(), false)], 50, 50, true, false), expect_pessimistic_lock_res_callback( tx.clone(), - PessimisticLockRes::Values(vec![Some(val.clone())]), + PessimisticLockResults(vec![PessimisticLockKeyResult::Value(Some(val.clone()))]), ), ) .unwrap(); @@ -498,7 +500,10 @@ fn test_pipelined_pessimistic_lock() { ), expect_pessimistic_lock_res_callback( tx, - PessimisticLockRes::Values(vec![Some(val), None]), + PessimisticLockResults(vec![ + PessimisticLockKeyResult::Value(Some(val)), + PessimisticLockKeyResult::Value(None), + ]), ), ) .unwrap(); @@ -507,6 +512,170 @@ fn test_pipelined_pessimistic_lock() { delete_pessimistic_lock(&storage, key, 60, 60); } +fn test_pessimistic_lock_resumable_blocked_twice_impl(canceled_when_resumed: bool) { + let lock_mgr = MockLockManager::new(); + let storage = TestStorageBuilderApiV1::new(lock_mgr.clone()) + .wake_up_delay_duration(100) + .build() + .unwrap(); + let (tx, rx) = channel(); + + let empty = PessimisticLockResults(vec![PessimisticLockKeyResult::Empty]); + + fail::cfg("lock_waiting_queue_before_delayed_notify_all", "pause").unwrap(); + let (first_resume_tx, first_resume_rx) = channel(); + let (first_resume_continue_tx, first_resume_continue_rx) = channel(); + let first_resume_tx = Mutex::new(first_resume_tx); + let first_resume_continue_rx = Mutex::new(first_resume_continue_rx); + fail::cfg_callback( + "acquire_pessimistic_lock_resumed_before_process_write", + move || { + // Notify that the failpoint is reached, and block until it receives a continue + // signal. + first_resume_tx.lock().unwrap().send(()).unwrap(); + first_resume_continue_rx.lock().unwrap().recv().unwrap(); + }, + ) + .unwrap(); + + let key = Key::from_raw(b"key"); + + // Lock the key. 
+ storage + .sched_txn_command( + new_acquire_pessimistic_lock_command(vec![(key.clone(), false)], 10, 10, false, false), + expect_pessimistic_lock_res_callback(tx, empty.clone()), + ) + .unwrap(); + rx.recv_timeout(Duration::from_secs(1)).unwrap(); + + // Another non-resumable request blocked. + let (tx_blocked_1, rx_blocked_1) = channel(); + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command(vec![(key.clone(), false)], 11, 11, false, false), + expect_fail_callback(tx_blocked_1, 0, |e| match e { + Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(mvcc::Error( + box mvcc::ErrorInner::WriteConflict { .. }, + ))))) => (), + e => panic!("unexpected error chain: {:?}", e), + }), + ) + .unwrap(); + rx_blocked_1 + .recv_timeout(Duration::from_millis(50)) + .unwrap_err(); + + let tokens_before = lock_mgr.get_all_tokens(); + // Another resumable request blocked, and is queued behind the above one. + let (tx_blocked_2, rx_blocked_2) = channel(); + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command(vec![(key.clone(), false)], 12, 12, false, false) + .allow_lock_with_conflict(true), + if !canceled_when_resumed { + expect_pessimistic_lock_res_callback(tx_blocked_2, empty.clone()) + } else { + expect_value_with_checker_callback( + tx_blocked_2, + 0, + |res: storage::Result| { + let res = res.unwrap().0; + assert_eq!(res.len(), 1); + let e = res[0].unwrap_err(); + match e.inner() { + ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(mvcc::Error( + box mvcc::ErrorInner::KeyIsLocked(_), + )))) => (), + e => panic!("unexpected error chain: {:?}", e), + } + }, + ) + }, + ) + .unwrap(); + rx_blocked_2 + .recv_timeout(Duration::from_millis(50)) + .unwrap_err(); + // Find the lock wait token of the above request. 
+ let tokens_after = lock_mgr.get_all_tokens(); + let token_of_12 = { + use std::ops::Sub; + let diff = tokens_after.sub(&tokens_before); + assert_eq!(diff.len(), 1); + diff.into_iter().next().unwrap() + }; + + // Release the lock, so that the former (non-resumable) request will be woken + // up, and the other one (resumable) will be woken up after delaying for + // `wake_up_delay_duration`. + delete_pessimistic_lock(&storage, key.clone(), 10, 10); + rx_blocked_1.recv_timeout(Duration::from_secs(1)).unwrap(); + + // The key should be unlocked at this time. + must_have_locks(&storage, 100, b"", b"\xff\xff\xff", &[]); + + // Simulate the transaction at ts=11 retries the pessimistic lock request, and + // succeeds. + let (tx, rx) = channel(); + storage + .sched_txn_command( + new_acquire_pessimistic_lock_command(vec![(key.clone(), false)], 11, 11, false, false), + expect_pessimistic_lock_res_callback(tx, empty), + ) + .unwrap(); + rx.recv_timeout(Duration::from_secs(1)).unwrap(); + + // Remove `pause` in delayed wake up, so that the request of txn 12 can be woken + // up. 
+ fail::remove("lock_waiting_queue_before_delayed_notify_all"); + first_resume_rx.recv().unwrap(); + + if canceled_when_resumed { + lock_mgr.simulate_timeout(token_of_12); + } + + fail::remove("acquire_pessimistic_lock_resumed_before_process_write"); + first_resume_continue_tx.send(()).unwrap(); + + if canceled_when_resumed { + rx_blocked_2.recv_timeout(Duration::from_secs(1)).unwrap(); + must_have_locks( + &storage, + 100, + b"", + b"\xff\xff\xff", + &[(&key.to_raw().unwrap(), Op::PessimisticLock, 11, 11)], + ); + } else { + rx_blocked_2 + .recv_timeout(Duration::from_millis(100)) + .unwrap_err(); + must_have_locks( + &storage, + 100, + b"", + b"\xff\xff\xff", + &[(&key.to_raw().unwrap(), Op::PessimisticLock, 11, 11)], + ); + delete_pessimistic_lock(&storage, key.clone(), 11, 11); + rx_blocked_2.recv_timeout(Duration::from_secs(1)).unwrap(); + must_have_locks( + &storage, + 100, + b"", + b"\xff\xff\xff", + &[(&key.to_raw().unwrap(), Op::PessimisticLock, 12, 12)], + ); + } +} + +#[test] +fn test_pessimistic_lock_resumable_blocked_twice() { + test_pessimistic_lock_resumable_blocked_twice_impl(false); + test_pessimistic_lock_resumable_blocked_twice_impl(true); +} + #[test] fn test_async_commit_prewrite_with_stale_max_ts() { test_async_commit_prewrite_with_stale_max_ts_impl::(); @@ -674,7 +843,7 @@ fn test_async_apply_prewrite_impl( None, false, 0.into(), - OldValues::default(), + false, false, false, ctx.clone(), @@ -1013,7 +1182,7 @@ fn test_async_apply_prewrite_1pc_impl( None, false, 0.into(), - OldValues::default(), + false, false, false, ctx.clone(), diff --git a/tests/failpoints/cases/test_transaction.rs b/tests/failpoints/cases/test_transaction.rs index e42a44047a4..564b5f393ec 100644 --- a/tests/failpoints/cases/test_transaction.rs +++ b/tests/failpoints/cases/test_transaction.rs @@ -566,6 +566,8 @@ fn test_concurrent_write_after_transfer_leader_invalidates_locks() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 5.into(), + 
versions_to_last_change: 3, }; txn_ext .pessimistic_locks diff --git a/tests/failpoints/cases/test_transfer_leader.rs b/tests/failpoints/cases/test_transfer_leader.rs index cc6b043f0e5..ed4a8501188 100644 --- a/tests/failpoints/cases/test_transfer_leader.rs +++ b/tests/failpoints/cases/test_transfer_leader.rs @@ -134,6 +134,8 @@ fn test_delete_lock_proposed_after_proposing_locks_impl(transfer_msg_count: usiz ttl: 1000, for_update_ts: 10.into(), min_commit_ts: 20.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }, )]) .unwrap(); @@ -211,6 +213,8 @@ fn test_delete_lock_proposed_before_proposing_locks() { ttl: 1000, for_update_ts: 10.into(), min_commit_ts: 20.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }, )]) .unwrap(); @@ -293,6 +297,8 @@ fn test_read_lock_after_become_follower() { ttl: 1000, for_update_ts, min_commit_ts: for_update_ts, + last_change_ts: start_ts.prev(), + versions_to_last_change: 1, }, )]) .unwrap(); diff --git a/tests/failpoints/cases/test_witness.rs b/tests/failpoints/cases/test_witness.rs new file mode 100644 index 00000000000..cee75ff44b9 --- /dev/null +++ b/tests/failpoints/cases/test_witness.rs @@ -0,0 +1,71 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{iter::FromIterator, sync::Arc, time::Duration}; + +use futures::executor::block_on; +use kvproto::metapb; +use pd_client::PdClient; +use test_raftstore::*; +use tikv_util::store::find_peer; + +fn become_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { + peer.set_role(metapb::PeerRole::Learner); + cluster.pd_client.must_add_peer(region_id, peer.clone()); + cluster.pd_client.must_remove_peer(region_id, peer.clone()); + peer.set_is_witness(true); + peer.set_id(peer.get_id() + 10); + cluster.pd_client.must_add_peer(region_id, peer.clone()); + peer.set_role(metapb::PeerRole::Voter); + cluster.pd_client.must_add_peer(region_id, peer.clone()); +} + +// Test the case local reader works well with witness peer. +#[test] +fn test_witness_update_region_in_local_reader() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + become_witness(&cluster, region.get_id(), &mut peer_on_store3); + + cluster.must_put(b"k0", b"v0"); + + // update region but the peer is not destroyed yet + fail::cfg("change_peer_after_update_region_store_3", "pause").unwrap(); + + cluster + .pd_client + .must_remove_peer(region.get_id(), peer_on_store3.clone()); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_get_cmd(b"k0")], + false, + ); + request.mut_header().set_peer(peer_on_store3); + request.mut_header().set_replica_read(true); + + let resp = cluster + 
.read(None, request.clone(), Duration::from_millis(100)) + .unwrap(); + assert_eq!( + resp.get_header().get_error().get_recovery_in_progress(), + &kvproto::errorpb::RecoveryInProgress { + region_id: region.get_id(), + ..Default::default() + } + ); + + fail::remove("change_peer_after_update_region_store_3"); +} diff --git a/tests/integrations/backup/mod.rs b/tests/integrations/backup/mod.rs index ff07d8a712a..f432fd72246 100644 --- a/tests/integrations/backup/mod.rs +++ b/tests/integrations/backup/mod.rs @@ -499,7 +499,7 @@ fn test_invalid_external_storage() { // Set backup directory read-only. TiKV fails to backup. let tmp = Builder::new().tempdir().unwrap(); - let f = File::open(&tmp.path()).unwrap(); + let f = File::open(tmp.path()).unwrap(); let mut perms = f.metadata().unwrap().permissions(); perms.set_readonly(true); f.set_permissions(perms.clone()).unwrap(); diff --git a/tests/integrations/config/dynamic/gc_worker.rs b/tests/integrations/config/dynamic/gc_worker.rs index e8b437f941a..623833c3b27 100644 --- a/tests/integrations/config/dynamic/gc_worker.rs +++ b/tests/integrations/config/dynamic/gc_worker.rs @@ -5,9 +5,7 @@ use std::{ time::Duration, }; -use raftstore::{ - coprocessor::region_info_accessor::MockRegionInfoProvider, router::RaftStoreBlackHole, -}; +use raftstore::coprocessor::region_info_accessor::MockRegionInfoProvider; use tikv::{ config::{ConfigController, Module, TikvConfig}, server::gc_worker::{GcConfig, GcTask, GcWorker}, @@ -27,15 +25,11 @@ fn test_gc_config_validate() { fn setup_cfg_controller( cfg: TikvConfig, -) -> ( - GcWorker, - ConfigController, -) { +) -> (GcWorker, ConfigController) { let engine = TestEngineBuilder::new().build().unwrap(); let (tx, _rx) = std::sync::mpsc::channel(); let mut gc_worker = GcWorker::new( engine, - RaftStoreBlackHole, tx, cfg.gc.clone(), Default::default(), diff --git a/tests/integrations/config/dynamic/raftstore.rs b/tests/integrations/config/dynamic/raftstore.rs index 38fdf5c175c..70e70b3cbe6 100644 
--- a/tests/integrations/config/dynamic/raftstore.rs +++ b/tests/integrations/config/dynamic/raftstore.rs @@ -76,7 +76,7 @@ fn start_raftstore( .as_path() .display() .to_string(); - Arc::new(SstImporter::new(&cfg.import, &p, None, cfg.storage.api_version()).unwrap()) + Arc::new(SstImporter::new(&cfg.import, p, None, cfg.storage.api_version()).unwrap()) }; let snap_mgr = { let p = dir @@ -162,6 +162,7 @@ fn test_update_raftstore_config() { ("raftstore.apply-max-batch-size", "1234"), ("raftstore.store-max-batch-size", "4321"), ("raftstore.raft-entry-max-size", "32MiB"), + ("raftstore.apply-yield-write-size", "10KiB"), ]); cfg_controller.update(change).unwrap(); @@ -169,6 +170,7 @@ fn test_update_raftstore_config() { // config should be updated let mut raft_store = config.raft_store; raft_store.messages_per_tick = 12345; + raft_store.apply_yield_write_size = ReadableSize::kb(10); raft_store.raft_log_gc_threshold = 54321; raft_store.apply_batch_system.max_batch_size = Some(1234); raft_store.store_batch_system.max_batch_size = Some(4321); diff --git a/tests/integrations/config/dynamic/snap.rs b/tests/integrations/config/dynamic/snap.rs index 5b9ef72b4c3..1a82ec8005e 100644 --- a/tests/integrations/config/dynamic/snap.rs +++ b/tests/integrations/config/dynamic/snap.rs @@ -15,6 +15,7 @@ use tikv::{ config::{ConfigController, TikvConfig}, server::{ config::{Config as ServerConfig, ServerConfigManager}, + raftkv::RaftRouterWrap, snap::{Runner as SnapHandler, Task as SnapTask}, }, }; @@ -60,7 +61,7 @@ fn start_server( let snap_runner = SnapHandler::new( Arc::clone(&env), snap_mgr.clone(), - raft_router, + RaftRouterWrap::new(raft_router), security_mgr, Arc::clone(&server_config), ); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 90524079bfa..ff01788c370 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -88,6 +88,8 @@ fn test_serde_custom_tikv_config() { max_grpc_send_msg_len: 6 * (1 << 20), 
raft_client_grpc_send_msg_buffer: 1234 * 1024, raft_client_queue_size: 1234, + raft_client_max_backoff: ReadableDuration::secs(5), + raft_client_initial_reconnect_backoff: ReadableDuration::secs(1), raft_msg_max_batch_size: 123, concurrent_send_snap_limit: 4, concurrent_recv_snap_limit: 4, @@ -117,7 +119,6 @@ fn test_serde_custom_tikv_config() { heavy_load_wait_duration: Some(ReadableDuration::millis(2)), enable_request_batch: false, background_thread_count: 999, - raft_client_backoff_step: ReadableDuration::secs(1), end_point_slow_log_threshold: ReadableDuration::secs(1), forward_max_connections_per_address: 5, reject_messages_on_memory_ratio: 0.8, @@ -230,6 +231,7 @@ fn test_serde_custom_tikv_config() { hibernate_regions: false, dev_assert: true, apply_yield_duration: ReadableDuration::millis(333), + apply_yield_write_size: ReadableSize(12345), perf_level: PerfLevel::Disable, evict_cache_on_memory_ratio: 0.8, cmd_batch: false, @@ -669,6 +671,7 @@ fn test_serde_custom_tikv_config() { scheduler_worker_pool_size: 1, scheduler_pending_write_threshold: ReadableSize::kb(123), reserve_space: ReadableSize::gb(10), + reserve_raft_space: ReadableSize::gb(2), enable_async_apply_prewrite: true, api_version: 1, enable_ttl: true, @@ -762,6 +765,7 @@ fn test_serde_custom_tikv_config() { num_threads: 123, stream_channel_window: 123, import_mode_timeout: ReadableDuration::secs(1453), + memory_use_ratio: 0.3, }; value.panic_when_unexpected_key_or_data = true; value.gc = GcConfig { @@ -788,7 +792,6 @@ fn test_serde_custom_tikv_config() { tso_worker_threads: 2, old_value_cache_memory_quota: ReadableSize::mb(14), sink_memory_quota: ReadableSize::mb(7), - raw_min_ts_outlier_threshold: ReadableDuration::secs(60), }; value.resolved_ts = ResolvedTsConfig { enable: true, diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index 17f82f9eb87..e5c896238bc 100644 --- a/tests/integrations/config/test-custom.toml +++ 
b/tests/integrations/config/test-custom.toml @@ -54,6 +54,7 @@ status-thread-pool-size = 1 max-grpc-send-msg-len = 6291456 raft-client-grpc-send-msg-buffer = 1263616 raft-client-queue-size = 1234 +raft-client-max-backoff = "5s" raft-msg-max-batch-size = 123 grpc-compression-type = "gzip" grpc-concurrency = 123 @@ -96,6 +97,7 @@ scheduler-worker-pool-size = 1 scheduler-pending-write-threshold = "123KB" enable-async-apply-prewrite = true reserve-space = "10GB" +reserve-raft-space = "2GB" enable-ttl = true ttl-check-poll-interval = "0s" @@ -189,6 +191,7 @@ merge-check-tick-interval = "11s" use-delete-range = true cleanup-import-sst-interval = "12m" local-read-batch-size = 33 +apply-yield-write-size = "12345B" apply-max-batch-size = 22 apply-pool-size = 4 apply-reschedule-duration = "3s" diff --git a/tests/integrations/config/test_config_client.rs b/tests/integrations/config/test_config_client.rs index 6faa68f3932..b56987fa1dc 100644 --- a/tests/integrations/config/test_config_client.rs +++ b/tests/integrations/config/test_config_client.rs @@ -149,7 +149,7 @@ blob-run-mode = "normal" cfg_controller.update(change).unwrap(); let res = { let mut buf = Vec::new(); - let mut f = File::open(&cfg_controller.get_current().cfg_path).unwrap(); + let mut f = File::open(cfg_controller.get_current().cfg_path).unwrap(); f.read_to_end(&mut buf).unwrap(); buf }; diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index 952516daf35..ad195f62774 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -2,13 +2,16 @@ use std::{cmp, thread, time::Duration}; +use engine_traits::CF_LOCK; use kvproto::{ - coprocessor::{Request, Response}, - kvrpcpb::{Context, IsolationLevel}, + coprocessor::{Request, Response, StoreBatchTask}, + errorpb, + kvrpcpb::{Context, IsolationLevel, LockInfo}, }; -use protobuf::Message; +use protobuf::{Message, SingularPtrField}; use 
raftstore::store::Bucket; use test_coprocessor::*; +use test_raftstore::{Cluster, ServerCluster}; use test_storage::*; use tidb_query_datatype::{ codec::{datum, Datum}, @@ -24,7 +27,7 @@ use tipb::{ AnalyzeColumnsReq, AnalyzeReq, AnalyzeType, ChecksumRequest, Chunk, Expr, ExprType, ScalarFuncSig, SelectResponse, }; -use txn_types::TimeStamp; +use txn_types::{Key, Lock, LockType, TimeStamp}; const FLAG_IGNORE_TRUNCATE: u64 = 1; const FLAG_TRUNCATE_AS_WARNING: u64 = 1 << 1; @@ -764,7 +767,7 @@ fn test_order_by_pk_with_select_from_index() { let name_datum = name.map(|s| s.as_bytes()).into(); let expected_encoded = datum::encode_value( &mut EvalContext::default(), - &[name_datum, (cnt as i64).into(), (id as i64).into()], + &[name_datum, cnt.into(), id.into()], ) .unwrap(); let result_encoded = datum::encode_value(&mut EvalContext::default(), &row).unwrap(); @@ -2006,3 +2009,253 @@ fn test_buckets() { wait_refresh_buckets(0); } + +#[test] +fn test_batch_request() { + let data = vec![ + (1, Some("name:0"), 2), + (2, Some("name:4"), 3), + (4, Some("name:3"), 1), + (5, Some("name:1"), 4), + (9, Some("name:8"), 7), + (10, Some("name:6"), 8), + ]; + + let product = ProductTable::new(); + let (mut cluster, raft_engine, ctx) = new_raft_engine(1, ""); + let (_, endpoint, _) = + init_data_with_engine_and_commit(ctx.clone(), raft_engine, &product, &data, true); + + // Split the region into [1, 2], [4, 5], [9, 10]. 
+ let region = + cluster.get_region(Key::from_raw(&product.get_record_range(1, 1).start).as_encoded()); + let split_key = Key::from_raw(&product.get_record_range(3, 3).start); + cluster.must_split(®ion, split_key.as_encoded()); + let second_region = + cluster.get_region(Key::from_raw(&product.get_record_range(4, 4).start).as_encoded()); + let second_split_key = Key::from_raw(&product.get_record_range(8, 8).start); + cluster.must_split(&second_region, second_split_key.as_encoded()); + + struct HandleRange { + start: i64, + end: i64, + } + + enum QueryResult { + Valid(Vec<(i64, Option<&'static str>, i64)>), + ErrRegion, + ErrLocked, + ErrOther, + } + + // Each case has four fields: + // 1. The input scan handle range. + // 2. The expected output results. + // 3. Should the coprocessor request contain invalid region epoch. + // 4. Should the scanned key be locked. + let cases = vec![ + // Basic valid case. + ( + vec![ + HandleRange { start: 1, end: 2 }, + HandleRange { start: 3, end: 5 }, + ], + vec![ + QueryResult::Valid(vec![(1_i64, Some("name:0"), 2_i64), (2, Some("name:4"), 3)]), + QueryResult::Valid(vec![(4, Some("name:3"), 1), (5, Some("name:1"), 4)]), + ], + false, + false, + ), + // Original task is valid, batch tasks are not all valid. + ( + vec![ + HandleRange { start: 1, end: 2 }, + HandleRange { start: 4, end: 6 }, + HandleRange { start: 9, end: 11 }, + HandleRange { start: 1, end: 3 }, // Input range [1, 4) crosses two region ranges. + HandleRange { start: 4, end: 8 }, // Input range [4, 9] crosses two region ranges. + ], + vec![ + QueryResult::Valid(vec![(1, Some("name:0"), 2), (2, Some("name:4"), 3)]), + QueryResult::Valid(vec![(4, Some("name:3"), 1), (5, Some("name:1"), 4)]), + QueryResult::Valid(vec![(9, Some("name:8"), 7), (10, Some("name:6"), 8)]), + QueryResult::ErrOther, + QueryResult::ErrOther, + ], + false, + false, + ), + // Original task is invalid, batch tasks are not all valid. 
+ ( + vec![HandleRange { start: 1, end: 3 }], + vec![QueryResult::ErrOther], + false, + false, + ), + // Invalid epoch case. + ( + vec![ + HandleRange { start: 1, end: 3 }, + HandleRange { start: 4, end: 6 }, + ], + vec![QueryResult::ErrRegion, QueryResult::ErrRegion], + true, + false, + ), + // Locked error case. + ( + vec![ + HandleRange { start: 1, end: 2 }, + HandleRange { start: 4, end: 6 }, + ], + vec![QueryResult::ErrLocked, QueryResult::ErrLocked], + false, + true, + ), + ]; + let prepare_req = + |cluster: &mut Cluster, ranges: &Vec| -> Request { + let original_range = ranges.get(0).unwrap(); + let key_range = product.get_record_range(original_range.start, original_range.end); + let region_key = Key::from_raw(&key_range.start); + let mut req = DagSelect::from(&product) + .key_ranges(vec![key_range]) + .build_with(ctx.clone(), &[0]); + let mut new_ctx = Context::default(); + let new_region = cluster.get_region(region_key.as_encoded()); + let leader = cluster.leader_of_region(new_region.get_id()).unwrap(); + new_ctx.set_region_id(new_region.get_id()); + new_ctx.set_region_epoch(new_region.get_region_epoch().clone()); + new_ctx.set_peer(leader); + req.set_context(new_ctx); + req.set_start_ts(100); + + let batch_handle_ranges = &ranges.as_slice()[1..]; + for handle_range in batch_handle_ranges.iter() { + let range_start_key = Key::from_raw( + &product + .get_record_range(handle_range.start, handle_range.end) + .start, + ); + let batch_region = cluster.get_region(range_start_key.as_encoded()); + let batch_leader = cluster.leader_of_region(batch_region.get_id()).unwrap(); + let batch_key_ranges = + vec![product.get_record_range(handle_range.start, handle_range.end)]; + let mut store_batch_task = StoreBatchTask::new(); + store_batch_task.set_region_id(batch_region.get_id()); + store_batch_task.set_region_epoch(batch_region.get_region_epoch().clone()); + store_batch_task.set_peer(batch_leader); + store_batch_task.set_ranges(batch_key_ranges.into()); + 
req.tasks.push(store_batch_task); + } + req + }; + let verify_response = |result: &QueryResult, + data: &[u8], + region_err: &SingularPtrField, + locked: &SingularPtrField, + other_err: &String| { + match result { + QueryResult::Valid(res) => { + let expected_len = res.len(); + let mut sel_resp = SelectResponse::default(); + sel_resp.merge_from_bytes(data).unwrap(); + let mut row_count = 0; + let spliter = DagChunkSpliter::new(sel_resp.take_chunks().into(), 3); + for (row, (id, name, cnt)) in spliter.zip(res) { + let name_datum = name.map(|s| s.as_bytes()).into(); + let expected_encoded = datum::encode_value( + &mut EvalContext::default(), + &[Datum::I64(*id), name_datum, Datum::I64(*cnt)], + ) + .unwrap(); + let result_encoded = + datum::encode_value(&mut EvalContext::default(), &row).unwrap(); + assert_eq!(result_encoded, &*expected_encoded); + row_count += 1; + } + assert_eq!(row_count, expected_len); + assert!(region_err.is_none()); + assert!(locked.is_none()); + assert!(other_err.is_empty()); + } + QueryResult::ErrRegion => { + assert!(region_err.is_some()); + assert!(locked.is_none()); + assert!(other_err.is_empty()); + } + QueryResult::ErrLocked => { + assert!(region_err.is_none()); + assert!(locked.is_some()); + assert!(other_err.is_empty()); + } + QueryResult::ErrOther => { + assert!(region_err.is_none()); + assert!(locked.is_none()); + assert!(!other_err.is_empty()) + } + } + }; + + for (ranges, results, invalid_epoch, key_is_locked) in cases.iter() { + let mut req = prepare_req(&mut cluster, ranges); + if *invalid_epoch { + req.context + .as_mut() + .unwrap() + .region_epoch + .as_mut() + .unwrap() + .version -= 1; + for batch_task in req.tasks.iter_mut() { + batch_task.region_epoch.as_mut().unwrap().version -= 1; + } + } else if *key_is_locked { + for range in ranges.iter() { + let lock_key = + Key::from_raw(&product.get_record_range(range.start, range.start).start); + let lock = Lock::new( + LockType::Put, + lock_key.as_encoded().clone(), + 10.into(), 
+ 10, + None, + TimeStamp::zero(), + 1, + TimeStamp::zero(), + ); + cluster.must_put_cf(CF_LOCK, lock_key.as_encoded(), lock.to_bytes().as_slice()); + } + } + let mut resp = handle_request(&endpoint, req); + let batch_results = resp.take_batch_responses().to_vec(); + for (i, result) in results.iter().enumerate() { + if i == 0 { + verify_response( + result, + resp.get_data(), + &resp.region_error, + &resp.locked, + &resp.other_error, + ); + } else { + let batch_resp = batch_results.get(i - 1).unwrap(); + verify_response( + result, + batch_resp.get_data(), + &batch_resp.region_error, + &batch_resp.locked, + &batch_resp.other_error, + ); + }; + } + if *key_is_locked { + for range in ranges.iter() { + let lock_key = + Key::from_raw(&product.get_record_range(range.start, range.start).start); + cluster.must_delete_cf(CF_LOCK, lock_key.as_encoded()); + } + } + } +} diff --git a/tests/integrations/pd/mod.rs b/tests/integrations/pd/mod.rs index 2cadf7db2b5..eb9b6cc092a 100644 --- a/tests/integrations/pd/mod.rs +++ b/tests/integrations/pd/mod.rs @@ -1,3 +1,4 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. mod test_rpc_client; +mod test_rpc_client_legacy; diff --git a/tests/integrations/pd/test_rpc_client.rs b/tests/integrations/pd/test_rpc_client.rs index 5f44cc0137b..23841ba5dfd 100644 --- a/tests/integrations/pd/test_rpc_client.rs +++ b/tests/integrations/pd/test_rpc_client.rs @@ -1,26 +1,40 @@ -// Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{ - sync::{ - atomic::{AtomicUsize, Ordering}, - mpsc, Arc, - }, - thread, - time::Duration, -}; +use std::{sync::Arc, thread, time::Duration}; use error_code::ErrorCodeExt; -use futures::executor::block_on; +use futures::{executor::block_on, StreamExt}; use grpcio::{EnvBuilder, Error as GrpcError, RpcStatus, RpcStatusCode}; use kvproto::{metapb, pdpb}; -use pd_client::{Error as PdError, Feature, PdClient, PdConnector, RegionStat, RpcClient}; -use raftstore::store; +use pd_client::{Error as PdError, Feature, PdClientV2, PdConnector, RpcClientV2}; use security::{SecurityConfig, SecurityManager}; use test_pd::{mocker::*, util::*, Server as MockServer}; -use tikv_util::config::ReadableDuration; -use tokio::runtime::Builder; +use tikv_util::{config::ReadableDuration, mpsc::future::WakePolicy, thd_name}; +use tokio::runtime::{Builder, Runtime}; use txn_types::TimeStamp; +fn setup_runtime() -> Runtime { + Builder::new_multi_thread() + .thread_name(thd_name!("poller")) + .worker_threads(1) + .enable_all() + .build() + .unwrap() +} + +fn must_get_tso(client: &mut RpcClientV2, count: u32) -> TimeStamp { + let (tx, mut responses) = client.create_tso_stream(WakePolicy::Immediately).unwrap(); + let mut req = pdpb::TsoRequest::default(); + req.mut_header().cluster_id = client.fetch_cluster_id().unwrap(); + req.count = count; + tx.send(req).unwrap(); + let resp = block_on(responses.next()).unwrap().unwrap(); + let ts = resp.timestamp.unwrap(); + let physical = ts.physical as u64; + let logical = ts.logical as u64; + TimeStamp::compose(physical, logical) +} + #[test] fn test_retry_rpc_client() { let eps_count = 1; @@ -32,7 +46,7 @@ fn test_retry_rpc_client() { server.stop(); let child = thread::spawn(move || { let cfg = new_config(m_eps); - RpcClient::new(&cfg, None, m_mgr).unwrap(); + RpcClientV2::new(&cfg, None, m_mgr).unwrap(); }); thread::sleep(Duration::from_millis(500)); server.start(&mgr, eps); @@ -41,12 +55,14 @@ fn test_retry_rpc_client() { #[test] fn 
test_rpc_client() { + let rt = setup_runtime(); + let _g = rt.enter(); let eps_count = 1; let server = MockServer::new(eps_count); let eps = server.bind_addrs(); - let client = new_client(eps.clone(), None); - assert_ne!(client.get_cluster_id().unwrap(), 0); + let mut client = new_client_v2(eps.clone(), None); + assert_ne!(client.fetch_cluster_id().unwrap(), 0); let store_id = client.alloc_id().unwrap(); let mut store = metapb::Store::default(); @@ -89,38 +105,32 @@ fn test_rpc_client() { .unwrap(); assert_eq!(tmp_region.get_id(), region.get_id()); - let ts = block_on(client.get_tso()).unwrap(); + let ts = must_get_tso(&mut client, 1); assert_ne!(ts, TimeStamp::zero()); - let ts100 = block_on(client.batch_get_tso(100)).unwrap(); + let ts100 = must_get_tso(&mut client, 100); assert_eq!(ts.logical() + 100, ts100.logical()); let mut prev_id = 0; for _ in 0..100 { - let client = new_client(eps.clone(), None); + let mut client = new_client_v2(eps.clone(), None); let alloc_id = client.alloc_id().unwrap(); assert!(alloc_id > prev_id); prev_id = alloc_id; } - let poller = Builder::new_multi_thread() - .thread_name(thd_name!("poller")) - .worker_threads(1) - .build() + let (tx, mut responses) = client + .create_region_heartbeat_stream(WakePolicy::Immediately) .unwrap(); - let (tx, rx) = mpsc::channel(); - let f = client.handle_region_heartbeat_response(1, move |resp| { - let _ = tx.send(resp); - }); - poller.spawn(f); - poller.spawn(client.region_heartbeat( - store::RAFT_INIT_LOG_TERM, - region.clone(), - peer.clone(), - RegionStat::default(), - None, - )); - rx.recv_timeout(Duration::from_secs(3)).unwrap(); + let mut req = pdpb::RegionHeartbeatRequest::default(); + req.set_region(region.clone()); + req.set_leader(peer.clone()); + tx.send(req).unwrap(); + block_on(tokio::time::timeout( + Duration::from_secs(3), + responses.next(), + )) + .unwrap(); let region_info = client.get_region_info(region_key).unwrap(); assert_eq!(region_info.region, region); @@ -150,26 +160,14 @@ fn 
test_connect_follower() { // test switch cfg.enable_forwarding = false; let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); - let client1 = RpcClient::new(&cfg, None, mgr).unwrap(); + let mut client1 = RpcClientV2::new(&cfg, None, mgr).unwrap(); fail::cfg(connect_leader_fp, "return").unwrap(); - // RECONNECT_INTERVAL_SEC is 1s. - thread::sleep(Duration::from_secs(1)); - let res = format!("{}", client1.alloc_id().unwrap_err()); - let err = format!( - "{}", - PdError::Grpc(GrpcError::RpcFailure(RpcStatus::with_message( - RpcStatusCode::UNAVAILABLE, - "".to_string(), - ))) - ); - assert_eq!(res, err); + client1.alloc_id().unwrap_err(); cfg.enable_forwarding = true; let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); - let client = RpcClient::new(&cfg, None, mgr).unwrap(); - // RECONNECT_INTERVAL_SEC is 1s. - thread::sleep(Duration::from_secs(1)); - let leader_addr = client1.get_leader().get_client_urls()[0].clone(); + let mut client = RpcClientV2::new(&cfg, None, mgr).unwrap(); + let leader_addr = client.get_leader().get_client_urls()[0].clone(); let res = format!("{}", client.alloc_id().unwrap_err()); let err = format!( "{}", @@ -188,7 +186,7 @@ fn test_get_tombstone_stores() { let eps_count = 1; let server = MockServer::new(eps_count); let eps = server.bind_addrs(); - let client = new_client(eps, None); + let mut client = new_client_v2(eps, None); let mut all_stores = vec![]; let store_id = client.alloc_id().unwrap(); @@ -242,7 +240,7 @@ fn test_get_tombstone_store() { let eps_count = 1; let server = MockServer::new(eps_count); let eps = server.bind_addrs(); - let client = new_client(eps, None); + let mut client = new_client_v2(eps, None); let mut all_stores = vec![]; let store_id = client.alloc_id().unwrap(); @@ -264,7 +262,7 @@ fn test_get_tombstone_store() { store99.set_state(metapb::StoreState::Tombstone); server.default_handler().add_store(store99.clone()); - let r = block_on(client.get_store_async(99)); + 
let r = client.get_store(99); assert_eq!(r.unwrap_err().error_code(), error_code::pd::STORE_TOMBSTONE); } @@ -273,7 +271,7 @@ fn test_reboot() { let eps_count = 1; let server = MockServer::with_case(eps_count, Arc::new(AlreadyBootstrapped)); let eps = server.bind_addrs(); - let client = new_client(eps, None); + let mut client = new_client_v2(eps, None); assert!(!client.is_cluster_bootstrapped().unwrap()); @@ -299,7 +297,7 @@ fn test_validate_endpoints() { let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); let connector = PdConnector::new(env, mgr); - assert!(block_on(connector.validate_endpoints(&new_config(eps))).is_err()); + assert!(block_on(connector.validate_endpoints(&new_config(eps), true)).is_err()); } #[test] @@ -318,66 +316,7 @@ fn test_validate_endpoints_retry() { eps.pop(); let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); let connector = PdConnector::new(env, mgr); - assert!(block_on(connector.validate_endpoints(&new_config(eps))).is_err()); -} - -fn test_retry(func: F) { - let eps_count = 1; - // Retry mocker returns `Err(_)` for most request, here two thirds are `Err(_)`. - let retry = Arc::new(Retry::new(3)); - let server = MockServer::with_case(eps_count, retry); - let eps = server.bind_addrs(); - - let client = new_client(eps, None); - - for _ in 0..3 { - func(&client); - } -} - -#[test] -fn test_retry_async() { - let r#async = |client: &RpcClient| { - block_on(client.get_region_by_id(1)).unwrap(); - }; - test_retry(r#async); -} - -#[test] -fn test_retry_sync() { - let sync = |client: &RpcClient| { - client.get_store(1).unwrap(); - }; - test_retry(sync) -} - -fn test_not_retry(func: F) { - let eps_count = 1; - // NotRetry mocker returns Ok() with error header first, and next returns Ok() - // without any error header. 
- let not_retry = Arc::new(NotRetry::new()); - let server = MockServer::with_case(eps_count, not_retry); - let eps = server.bind_addrs(); - - let client = new_client(eps, None); - - func(&client); -} - -#[test] -fn test_not_retry_async() { - let r#async = |client: &RpcClient| { - block_on(client.get_region_by_id(1)).unwrap_err(); - }; - test_not_retry(r#async); -} - -#[test] -fn test_not_retry_sync() { - let sync = |client: &RpcClient| { - client.get_store(1).unwrap_err(); - }; - test_not_retry(sync); + assert!(block_on(connector.validate_endpoints(&new_config(eps), true)).is_err()); } #[test] @@ -386,7 +325,7 @@ fn test_incompatible_version() { let server = MockServer::with_case(1, incompatible); let eps = server.bind_addrs(); - let client = new_client(eps, None); + let mut client = new_client_v2(eps, None); let resp = block_on(client.ask_batch_split(metapb::Region::default(), 2)); assert_eq!( @@ -402,7 +341,7 @@ fn restart_leader(mgr: SecurityManager) { MockServer::::with_configuration(&mgr, vec![("127.0.0.1".to_owned(), 0); 3], None); let eps = server.bind_addrs(); - let client = new_client(eps.clone(), Some(Arc::clone(&mgr))); + let mut client = new_client_v2(eps.clone(), Some(Arc::clone(&mgr))); // Put a region. 
let store_id = client.alloc_id().unwrap(); let mut store = metapb::Store::default(); @@ -453,12 +392,8 @@ fn test_change_leader_async() { let server = MockServer::with_case(eps_count, Arc::new(LeaderChange::new())); let eps = server.bind_addrs(); - let counter = Arc::new(AtomicUsize::new(0)); - let client = new_client(eps, None); - let counter1 = Arc::clone(&counter); - client.handle_reconnect(move || { - counter1.fetch_add(1, Ordering::SeqCst); - }); + let mut client = new_client_v2(eps, None); + let mut reconnect_recv = client.subscribe_reconnect(); let leader = client.get_leader(); for _ in 0..5 { @@ -467,7 +402,10 @@ fn test_change_leader_async() { let new = client.get_leader(); if new != leader { - assert!(counter.load(Ordering::SeqCst) >= 1); + assert!(matches!( + reconnect_recv.try_recv(), + Ok(_) | Err(tokio::sync::broadcast::error::TryRecvError::Lagged(_)) + )); return; } thread::sleep(LeaderChange::get_leader_interval()); @@ -482,7 +420,7 @@ fn test_pd_client_ok_when_cluster_not_ready() { let server = MockServer::with_case(3, Arc::new(AlreadyBootstrapped)); let eps = server.bind_addrs(); - let client = new_client(eps, None); + let mut client = new_client_v2(eps, None); fail::cfg(pd_client_cluster_id_zero, "return()").unwrap(); // wait 100ms to let client load member. 
thread::sleep(Duration::from_millis(101)); @@ -492,36 +430,33 @@ fn test_pd_client_ok_when_cluster_not_ready() { #[test] fn test_pd_client_heartbeat_send_failed() { + let rt = setup_runtime(); + let _g = rt.enter(); let pd_client_send_fail_fp = "region_heartbeat_send_failed"; fail::cfg(pd_client_send_fail_fp, "return()").unwrap(); let server = MockServer::with_case(1, Arc::new(AlreadyBootstrapped)); let eps = server.bind_addrs(); - let client = new_client(eps, None); - let poller = Builder::new_multi_thread() - .thread_name(thd_name!("poller")) - .worker_threads(1) - .build() + let mut client = new_client_v2(eps, None); + + let (tx, mut responses) = client + .create_region_heartbeat_stream(WakePolicy::Immediately) .unwrap(); - let (tx, rx) = mpsc::channel(); - let f = - client.handle_region_heartbeat_response(1, move |resp| tx.send(resp).unwrap_or_default()); - poller.spawn(f); - let heartbeat_send_fail = |ok| { + let mut heartbeat_send_fail = |ok| { let mut region = metapb::Region::default(); region.set_id(1); - poller.spawn(client.region_heartbeat( - store::RAFT_INIT_LOG_TERM, - region, - metapb::Peer::default(), - RegionStat::default(), - None, + let mut req = pdpb::RegionHeartbeatRequest::default(); + req.set_region(region); + tx.send(req).unwrap(); + + let rsp = block_on(tokio::time::timeout( + Duration::from_millis(100), + responses.next(), )); - let rsp = rx.recv_timeout(Duration::from_millis(100)); if ok { assert!(rsp.is_ok()); - assert_eq!(rsp.unwrap().get_region_id(), 1); + assert_eq!(rsp.unwrap().unwrap().unwrap().get_region_id(), 1); } else { rsp.unwrap_err(); } @@ -545,35 +480,28 @@ fn test_pd_client_heartbeat_send_failed() { #[test] fn test_region_heartbeat_on_leader_change() { + let rt = setup_runtime(); + let _g = rt.enter(); let eps_count = 3; let server = MockServer::with_case(eps_count, Arc::new(LeaderChange::new())); let eps = server.bind_addrs(); - let client = new_client(eps, None); - let poller = Builder::new_multi_thread() - 
.thread_name(thd_name!("poller")) - .worker_threads(1) - .build() - .unwrap(); - let (tx, rx) = mpsc::channel(); - let f = client.handle_region_heartbeat_response(1, move |resp| { - tx.send(resp).unwrap(); - }); - poller.spawn(f); - let region = metapb::Region::default(); - let peer = metapb::Peer::default(); - let stat = RegionStat::default(); - poller.spawn(client.region_heartbeat( - store::RAFT_INIT_LOG_TERM, - region.clone(), - peer.clone(), - stat.clone(), - None, - )); - rx.recv_timeout(LeaderChange::get_leader_interval()) + let mut client = new_client_v2(eps, None); + + let (tx, mut responses) = client + .create_region_heartbeat_stream(WakePolicy::Immediately) .unwrap(); - let heartbeat_on_leader_change = |count| { + tx.send(pdpb::RegionHeartbeatRequest::default()).unwrap(); + block_on(tokio::time::timeout( + LeaderChange::get_leader_interval(), + responses.next(), + )) + .unwrap() + .unwrap() + .unwrap(); + + let mut heartbeat_on_leader_change = |count| { let mut leader = client.get_leader(); for _ in 0..count { loop { @@ -587,15 +515,14 @@ fn test_region_heartbeat_on_leader_change() { thread::sleep(LeaderChange::get_leader_interval()); } } - poller.spawn(client.region_heartbeat( - store::RAFT_INIT_LOG_TERM, - region.clone(), - peer.clone(), - stat.clone(), - None, - )); - rx.recv_timeout(LeaderChange::get_leader_interval()) - .unwrap(); + tx.send(pdpb::RegionHeartbeatRequest::default()).unwrap(); + block_on(tokio::time::timeout( + LeaderChange::get_leader_interval(), + responses.next(), + )) + .unwrap() + .unwrap() + .unwrap(); }; // Change PD leader once then heartbeat PD. 
@@ -612,18 +539,17 @@ fn test_periodical_update() { let server = MockServer::with_case(eps_count, Arc::new(LeaderChange::new())); let eps = server.bind_addrs(); - let counter = Arc::new(AtomicUsize::new(0)); - let client = new_client_with_update_interval(eps, None, ReadableDuration::secs(3)); - let counter1 = Arc::clone(&counter); - client.handle_reconnect(move || { - counter1.fetch_add(1, Ordering::SeqCst); - }); + let mut client = new_client_v2_with_update_interval(eps, None, ReadableDuration::secs(3)); + let mut reconnect_recv = client.subscribe_reconnect(); let leader = client.get_leader(); for _ in 0..5 { let new = client.get_leader(); if new != leader { - assert!(counter.load(Ordering::SeqCst) >= 1); + assert!(matches!( + reconnect_recv.try_recv(), + Ok(_) | Err(tokio::sync::broadcast::error::TryRecvError::Lagged(_)) + )); return; } thread::sleep(LeaderChange::get_leader_interval()); @@ -641,13 +567,14 @@ fn test_cluster_version() { let feature_b = Feature::require(5, 0, 0); let feature_c = Feature::require(5, 0, 1); - let client = new_client(eps, None); - let feature_gate = client.feature_gate(); + let mut client = new_client_v2(eps, None); + let feature_gate = client.feature_gate().clone(); assert!(!feature_gate.can_enable(feature_a)); - let emit_heartbeat = || { + let mut client_clone = client.clone(); + let mut emit_heartbeat = || { let req = pdpb::StoreStats::default(); - block_on(client.store_heartbeat(req, /* store_report= */ None, None)).unwrap(); + block_on(client_clone.store_heartbeat(req, /* store_report= */ None, None)).unwrap(); }; let set_cluster_version = |version: &str| { diff --git a/tests/integrations/pd/test_rpc_client_legacy.rs b/tests/integrations/pd/test_rpc_client_legacy.rs new file mode 100644 index 00000000000..d2ff6d6ac11 --- /dev/null +++ b/tests/integrations/pd/test_rpc_client_legacy.rs @@ -0,0 +1,691 @@ +// Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + sync::{ + atomic::{AtomicUsize, Ordering}, + mpsc, Arc, + }, + thread, + time::Duration, +}; + +use error_code::ErrorCodeExt; +use futures::executor::block_on; +use grpcio::{EnvBuilder, Error as GrpcError, RpcStatus, RpcStatusCode}; +use kvproto::{metapb, pdpb}; +use pd_client::{Error as PdError, Feature, PdClient, PdConnector, RegionStat, RpcClient}; +use raftstore::store; +use security::{SecurityConfig, SecurityManager}; +use test_pd::{mocker::*, util::*, Server as MockServer}; +use tikv_util::config::ReadableDuration; +use tokio::runtime::Builder; +use txn_types::TimeStamp; + +#[test] +fn test_retry_rpc_client() { + let eps_count = 1; + let mut server = MockServer::new(eps_count); + let eps = server.bind_addrs(); + let m_eps = eps.clone(); + let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); + let m_mgr = mgr.clone(); + server.stop(); + let child = thread::spawn(move || { + let cfg = new_config(m_eps); + RpcClient::new(&cfg, None, m_mgr).unwrap(); + }); + thread::sleep(Duration::from_millis(500)); + server.start(&mgr, eps); + child.join().unwrap(); +} + +#[test] +fn test_rpc_client() { + let eps_count = 1; + let server = MockServer::new(eps_count); + let eps = server.bind_addrs(); + + let client = new_client(eps.clone(), None); + assert_ne!(client.get_cluster_id().unwrap(), 0); + + let store_id = client.alloc_id().unwrap(); + let mut store = metapb::Store::default(); + store.set_id(store_id); + debug!("bootstrap store {:?}", store); + + let peer_id = client.alloc_id().unwrap(); + let mut peer = metapb::Peer::default(); + peer.set_id(peer_id); + peer.set_store_id(store_id); + + let region_id = client.alloc_id().unwrap(); + let mut region = metapb::Region::default(); + region.set_id(region_id); + region.mut_peers().push(peer.clone()); + debug!("bootstrap region {:?}", region); + + client + .bootstrap_cluster(store.clone(), region.clone()) + .unwrap(); + assert_eq!(client.is_cluster_bootstrapped().unwrap(), true); + + let 
tmp_stores = client.get_all_stores(false).unwrap(); + assert_eq!(tmp_stores.len(), 1); + assert_eq!(tmp_stores[0], store); + + let tmp_store = client.get_store(store_id).unwrap(); + assert_eq!(tmp_store.get_id(), store.get_id()); + + let region_key = region.get_start_key(); + let tmp_region = client.get_region(region_key).unwrap(); + assert_eq!(tmp_region.get_id(), region.get_id()); + + let region_info = client.get_region_info(region_key).unwrap(); + assert_eq!(region_info.region, region); + assert_eq!(region_info.leader, None); + + let tmp_region = block_on(client.get_region_by_id(region_id)) + .unwrap() + .unwrap(); + assert_eq!(tmp_region.get_id(), region.get_id()); + + let ts = block_on(client.get_tso()).unwrap(); + assert_ne!(ts, TimeStamp::zero()); + + let ts100 = block_on(client.batch_get_tso(100)).unwrap(); + assert_eq!(ts.logical() + 100, ts100.logical()); + + let mut prev_id = 0; + for _ in 0..100 { + let client = new_client(eps.clone(), None); + let alloc_id = client.alloc_id().unwrap(); + assert!(alloc_id > prev_id); + prev_id = alloc_id; + } + + let poller = Builder::new_multi_thread() + .thread_name(thd_name!("poller")) + .worker_threads(1) + .build() + .unwrap(); + let (tx, rx) = mpsc::channel(); + let f = client.handle_region_heartbeat_response(1, move |resp| { + let _ = tx.send(resp); + }); + poller.spawn(f); + poller.spawn(client.region_heartbeat( + store::RAFT_INIT_LOG_TERM, + region.clone(), + peer.clone(), + RegionStat::default(), + None, + )); + rx.recv_timeout(Duration::from_secs(3)).unwrap(); + + let region_info = client.get_region_info(region_key).unwrap(); + assert_eq!(region_info.region, region); + assert_eq!(region_info.leader.unwrap(), peer); + + block_on(client.store_heartbeat( + pdpb::StoreStats::default(), + None, // store_report + None, + )) + .unwrap(); + block_on(client.ask_batch_split(metapb::Region::default(), 1)).unwrap(); + block_on(client.report_batch_split(vec![metapb::Region::default(), metapb::Region::default()])) + 
.unwrap(); + + let region_info = client.get_region_info(region_key).unwrap(); + client.scatter_region(region_info).unwrap(); +} + +#[test] +fn test_connect_follower() { + let connect_leader_fp = "connect_leader"; + let server = MockServer::new(2); + let eps = server.bind_addrs(); + let mut cfg = new_config(eps); + + // test switch + cfg.enable_forwarding = false; + let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); + let client1 = RpcClient::new(&cfg, None, mgr).unwrap(); + fail::cfg(connect_leader_fp, "return").unwrap(); + // RECONNECT_INTERVAL_SEC is 1s. + thread::sleep(Duration::from_secs(1)); + let res = format!("{}", client1.alloc_id().unwrap_err()); + let err = format!( + "{}", + PdError::Grpc(GrpcError::RpcFailure(RpcStatus::with_message( + RpcStatusCode::UNAVAILABLE, + "".to_string(), + ))) + ); + assert_eq!(res, err); + + cfg.enable_forwarding = true; + let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); + let client = RpcClient::new(&cfg, None, mgr).unwrap(); + // RECONNECT_INTERVAL_SEC is 1s. 
+ thread::sleep(Duration::from_secs(1)); + let leader_addr = client1.get_leader().get_client_urls()[0].clone(); + let res = format!("{}", client.alloc_id().unwrap_err()); + let err = format!( + "{}", + PdError::Grpc(GrpcError::RpcFailure(RpcStatus::with_message( + RpcStatusCode::UNAVAILABLE, + leader_addr, + ))) + ); + assert_eq!(res, err); + + fail::remove(connect_leader_fp); +} + +#[test] +fn test_get_tombstone_stores() { + let eps_count = 1; + let server = MockServer::new(eps_count); + let eps = server.bind_addrs(); + let client = new_client(eps, None); + + let mut all_stores = vec![]; + let store_id = client.alloc_id().unwrap(); + let mut store = metapb::Store::default(); + store.set_id(store_id); + let region_id = client.alloc_id().unwrap(); + let mut region = metapb::Region::default(); + region.set_id(region_id); + client.bootstrap_cluster(store.clone(), region).unwrap(); + + all_stores.push(store); + assert_eq!(client.is_cluster_bootstrapped().unwrap(), true); + let s = client.get_all_stores(false).unwrap(); + assert_eq!(s, all_stores); + + // Add tombstone store. + let mut store99 = metapb::Store::default(); + store99.set_id(99); + store99.set_state(metapb::StoreState::Tombstone); + server.default_handler().add_store(store99.clone()); + + // do not include tombstone. + let s = client.get_all_stores(true).unwrap(); + assert_eq!(s, all_stores); + + all_stores.push(store99.clone()); + all_stores.sort_by_key(|a| a.get_id()); + // include tombstone, there should be 2 stores. + let mut s = client.get_all_stores(false).unwrap(); + s.sort_by_key(|a| a.get_id()); + assert_eq!(s, all_stores); + + // Add another tombstone store. 
+ let mut store199 = store99; + store199.set_id(199); + server.default_handler().add_store(store199.clone()); + + all_stores.push(store199); + all_stores.sort_by_key(|a| a.get_id()); + let mut s = client.get_all_stores(false).unwrap(); + s.sort_by_key(|a| a.get_id()); + assert_eq!(s, all_stores); + + client.get_store(store_id).unwrap(); + client.get_store(99).unwrap_err(); + client.get_store(199).unwrap_err(); +} + +#[test] +fn test_get_tombstone_store() { + let eps_count = 1; + let server = MockServer::new(eps_count); + let eps = server.bind_addrs(); + let client = new_client(eps, None); + + let mut all_stores = vec![]; + let store_id = client.alloc_id().unwrap(); + let mut store = metapb::Store::default(); + store.set_id(store_id); + let region_id = client.alloc_id().unwrap(); + let mut region = metapb::Region::default(); + region.set_id(region_id); + client.bootstrap_cluster(store.clone(), region).unwrap(); + + all_stores.push(store); + assert_eq!(client.is_cluster_bootstrapped().unwrap(), true); + let s = client.get_all_stores(false).unwrap(); + assert_eq!(s, all_stores); + + // Add tombstone store. 
+ let mut store99 = metapb::Store::default(); + store99.set_id(99); + store99.set_state(metapb::StoreState::Tombstone); + server.default_handler().add_store(store99.clone()); + + let r = block_on(client.get_store_async(99)); + assert_eq!(r.unwrap_err().error_code(), error_code::pd::STORE_TOMBSTONE); +} + +#[test] +fn test_reboot() { + let eps_count = 1; + let server = MockServer::with_case(eps_count, Arc::new(AlreadyBootstrapped)); + let eps = server.bind_addrs(); + let client = new_client(eps, None); + + assert!(!client.is_cluster_bootstrapped().unwrap()); + + match client.bootstrap_cluster(metapb::Store::default(), metapb::Region::default()) { + Err(PdError::ClusterBootstrapped(_)) => (), + _ => { + panic!("failed, should return ClusterBootstrapped"); + } + } +} + +#[test] +fn test_validate_endpoints() { + let eps_count = 3; + let server = MockServer::with_case(eps_count, Arc::new(Split::new())); + let env = Arc::new( + EnvBuilder::new() + .cq_count(1) + .name_prefix(thd_name!("test-pd")) + .build(), + ); + let eps = server.bind_addrs(); + + let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); + let connector = PdConnector::new(env, mgr); + assert!(block_on(connector.validate_endpoints(&new_config(eps), false)).is_err()); +} + +#[test] +fn test_validate_endpoints_retry() { + let eps_count = 3; + let server = MockServer::with_case(eps_count, Arc::new(Split::new())); + let env = Arc::new( + EnvBuilder::new() + .cq_count(1) + .name_prefix(thd_name!("test-pd")) + .build(), + ); + let mut eps = server.bind_addrs(); + let mock_port = 65535; + eps.insert(0, ("127.0.0.1".to_string(), mock_port)); + eps.pop(); + let mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); + let connector = PdConnector::new(env, mgr); + assert!(block_on(connector.validate_endpoints(&new_config(eps), false)).is_err()); +} + +fn test_retry(func: F) { + let eps_count = 1; + // Retry mocker returns `Err(_)` for most request, here two thirds are 
`Err(_)`. + let retry = Arc::new(Retry::new(3)); + let server = MockServer::with_case(eps_count, retry); + let eps = server.bind_addrs(); + + let client = new_client(eps, None); + + for _ in 0..3 { + func(&client); + } +} + +#[test] +fn test_retry_async() { + let r#async = |client: &RpcClient| { + block_on(client.get_region_by_id(1)).unwrap(); + }; + test_retry(r#async); +} + +#[test] +fn test_retry_sync() { + let sync = |client: &RpcClient| { + client.get_store(1).unwrap(); + }; + test_retry(sync) +} + +fn test_not_retry(func: F) { + let eps_count = 1; + // NotRetry mocker returns Ok() with error header first, and next returns Ok() + // without any error header. + let not_retry = Arc::new(NotRetry::new()); + let server = MockServer::with_case(eps_count, not_retry); + let eps = server.bind_addrs(); + + let client = new_client(eps, None); + + func(&client); +} + +#[test] +fn test_not_retry_async() { + let r#async = |client: &RpcClient| { + block_on(client.get_region_by_id(1)).unwrap_err(); + }; + test_not_retry(r#async); +} + +#[test] +fn test_not_retry_sync() { + let sync = |client: &RpcClient| { + client.get_store(1).unwrap_err(); + }; + test_not_retry(sync); +} + +#[test] +fn test_incompatible_version() { + let incompatible = Arc::new(Incompatible); + let server = MockServer::with_case(1, incompatible); + let eps = server.bind_addrs(); + + let client = new_client(eps, None); + + let resp = block_on(client.ask_batch_split(metapb::Region::default(), 2)); + assert_eq!( + resp.unwrap_err().to_string(), + PdError::Incompatible.to_string() + ); +} + +fn restart_leader(mgr: SecurityManager) { + let mgr = Arc::new(mgr); + // Service has only one GetMembersResponse, so the leader never changes. + let mut server = + MockServer::::with_configuration(&mgr, vec![("127.0.0.1".to_owned(), 0); 3], None); + let eps = server.bind_addrs(); + + let client = new_client(eps.clone(), Some(Arc::clone(&mgr))); + // Put a region. 
+ let store_id = client.alloc_id().unwrap(); + let mut store = metapb::Store::default(); + store.set_id(store_id); + + let peer_id = client.alloc_id().unwrap(); + let mut peer = metapb::Peer::default(); + peer.set_id(peer_id); + peer.set_store_id(store_id); + + let region_id = client.alloc_id().unwrap(); + let mut region = metapb::Region::default(); + region.set_id(region_id); + region.mut_peers().push(peer); + client.bootstrap_cluster(store, region.clone()).unwrap(); + + let region = block_on(client.get_region_by_id(region.get_id())) + .unwrap() + .unwrap(); + + // Stop servers and restart them again. + server.stop(); + server.start(&mgr, eps); + + // The GLOBAL_RECONNECT_INTERVAL is 0.1s so sleeps 0.2s here. + thread::sleep(Duration::from_millis(200)); + + let region = block_on(client.get_region_by_id(region.get_id())).unwrap(); + assert_eq!(region.unwrap().get_id(), region_id); +} + +#[test] +fn test_restart_leader_insecure() { + let mgr = SecurityManager::new(&SecurityConfig::default()).unwrap(); + restart_leader(mgr) +} + +#[test] +fn test_restart_leader_secure() { + let security_cfg = test_util::new_security_cfg(None); + let mgr = SecurityManager::new(&security_cfg).unwrap(); + restart_leader(mgr) +} + +#[test] +fn test_change_leader_async() { + let eps_count = 3; + let server = MockServer::with_case(eps_count, Arc::new(LeaderChange::new())); + let eps = server.bind_addrs(); + + let counter = Arc::new(AtomicUsize::new(0)); + let client = new_client(eps, None); + let counter1 = Arc::clone(&counter); + client.handle_reconnect(move || { + counter1.fetch_add(1, Ordering::SeqCst); + }); + let leader = client.get_leader(); + + for _ in 0..5 { + let region = block_on(client.get_region_by_id(1)); + region.ok(); + + let new = client.get_leader(); + if new != leader { + assert!(counter.load(Ordering::SeqCst) >= 1); + return; + } + thread::sleep(LeaderChange::get_leader_interval()); + } + + panic!("failed, leader should changed"); +} + +#[test] +fn 
test_pd_client_ok_when_cluster_not_ready() { + let pd_client_cluster_id_zero = "cluster_id_is_not_ready"; + let server = MockServer::with_case(3, Arc::new(AlreadyBootstrapped)); + let eps = server.bind_addrs(); + + let client = new_client(eps, None); + fail::cfg(pd_client_cluster_id_zero, "return()").unwrap(); + // wait 100ms to let client load member. + thread::sleep(Duration::from_millis(101)); + assert_eq!(client.reconnect().is_err(), true); + fail::remove(pd_client_cluster_id_zero); +} + +#[test] +fn test_pd_client_heartbeat_send_failed() { + let pd_client_send_fail_fp = "region_heartbeat_send_failed"; + fail::cfg(pd_client_send_fail_fp, "return()").unwrap(); + let server = MockServer::with_case(1, Arc::new(AlreadyBootstrapped)); + let eps = server.bind_addrs(); + + let client = new_client(eps, None); + let poller = Builder::new_multi_thread() + .thread_name(thd_name!("poller")) + .worker_threads(1) + .build() + .unwrap(); + let (tx, rx) = mpsc::channel(); + let f = + client.handle_region_heartbeat_response(1, move |resp| tx.send(resp).unwrap_or_default()); + poller.spawn(f); + + let heartbeat_send_fail = |ok| { + let mut region = metapb::Region::default(); + region.set_id(1); + poller.spawn(client.region_heartbeat( + store::RAFT_INIT_LOG_TERM, + region, + metapb::Peer::default(), + RegionStat::default(), + None, + )); + let rsp = rx.recv_timeout(Duration::from_millis(100)); + if ok { + assert!(rsp.is_ok()); + assert_eq!(rsp.unwrap().get_region_id(), 1); + } else { + rsp.unwrap_err(); + } + + let region = block_on(client.get_region_by_id(1)); + if ok { + assert!(region.is_ok()); + let r = region.unwrap(); + assert!(r.is_some()); + assert_eq!(1, r.unwrap().get_id()); + } else { + region.unwrap_err(); + } + }; + // send fail if network is block. + heartbeat_send_fail(false); + fail::remove(pd_client_send_fail_fp); + // send success after network recovered. 
+ heartbeat_send_fail(true); +} + +#[test] +fn test_region_heartbeat_on_leader_change() { + let eps_count = 3; + let server = MockServer::with_case(eps_count, Arc::new(LeaderChange::new())); + let eps = server.bind_addrs(); + + let client = new_client(eps, None); + let poller = Builder::new_multi_thread() + .thread_name(thd_name!("poller")) + .worker_threads(1) + .build() + .unwrap(); + let (tx, rx) = mpsc::channel(); + let f = client.handle_region_heartbeat_response(1, move |resp| { + tx.send(resp).unwrap(); + }); + poller.spawn(f); + let region = metapb::Region::default(); + let peer = metapb::Peer::default(); + let stat = RegionStat::default(); + poller.spawn(client.region_heartbeat( + store::RAFT_INIT_LOG_TERM, + region.clone(), + peer.clone(), + stat.clone(), + None, + )); + rx.recv_timeout(LeaderChange::get_leader_interval()) + .unwrap(); + + let heartbeat_on_leader_change = |count| { + let mut leader = client.get_leader(); + for _ in 0..count { + loop { + let _ = block_on(client.get_region_by_id(1)); + let new = client.get_leader(); + if leader != new { + leader = new; + info!("leader changed!"); + break; + } + thread::sleep(LeaderChange::get_leader_interval()); + } + } + poller.spawn(client.region_heartbeat( + store::RAFT_INIT_LOG_TERM, + region.clone(), + peer.clone(), + stat.clone(), + None, + )); + rx.recv_timeout(LeaderChange::get_leader_interval()) + .unwrap(); + }; + + // Change PD leader once then heartbeat PD. + heartbeat_on_leader_change(1); + + // Change PD leader twice without update the heartbeat sender, then heartbeat + // PD. 
+ heartbeat_on_leader_change(2); +} + +#[test] +fn test_periodical_update() { + let eps_count = 3; + let server = MockServer::with_case(eps_count, Arc::new(LeaderChange::new())); + let eps = server.bind_addrs(); + + let counter = Arc::new(AtomicUsize::new(0)); + let client = new_client_with_update_interval(eps, None, ReadableDuration::secs(3)); + let counter1 = Arc::clone(&counter); + client.handle_reconnect(move || { + counter1.fetch_add(1, Ordering::SeqCst); + }); + let leader = client.get_leader(); + + for _ in 0..5 { + let new = client.get_leader(); + if new != leader { + assert!(counter.load(Ordering::SeqCst) >= 1); + return; + } + thread::sleep(LeaderChange::get_leader_interval()); + } + + panic!("failed, leader should changed"); +} + +#[test] +fn test_cluster_version() { + let server = MockServer::::new(3); + let eps = server.bind_addrs(); + + let feature_a = Feature::require(0, 0, 1); + let feature_b = Feature::require(5, 0, 0); + let feature_c = Feature::require(5, 0, 1); + + let client = new_client(eps, None); + let feature_gate = client.feature_gate(); + assert!(!feature_gate.can_enable(feature_a)); + + let emit_heartbeat = || { + let req = pdpb::StoreStats::default(); + block_on(client.store_heartbeat(req, /* store_report= */ None, None)).unwrap(); + }; + + let set_cluster_version = |version: &str| { + let h = server.default_handler(); + h.set_cluster_version(version.to_owned()); + }; + + // Empty version string will be treated as invalid. + emit_heartbeat(); + assert!(!feature_gate.can_enable(feature_a)); + + // Explicitly invalid version string. + set_cluster_version("invalid-version"); + emit_heartbeat(); + assert!(!feature_gate.can_enable(feature_a)); + + // Correct version string. + set_cluster_version("5.0.0"); + emit_heartbeat(); + assert!(feature_gate.can_enable(feature_a)); + assert!(feature_gate.can_enable(feature_b)); + assert!(!feature_gate.can_enable(feature_c)); + + // Version can't go backwards. 
+ set_cluster_version("4.99"); + emit_heartbeat(); + assert!(feature_gate.can_enable(feature_b)); + assert!(!feature_gate.can_enable(feature_c)); + + // After reconnect the version should be still accessable. + // The GLOBAL_RECONNECT_INTERVAL is 0.1s so sleeps 0.2s here. + thread::sleep(Duration::from_millis(200)); + client.reconnect().unwrap(); + assert!(feature_gate.can_enable(feature_b)); + assert!(!feature_gate.can_enable(feature_c)); + + // Version can go forwards. + set_cluster_version("5.0.1"); + emit_heartbeat(); + assert!(feature_gate.can_enable(feature_c)); +} diff --git a/tests/integrations/raftstore/mod.rs b/tests/integrations/raftstore/mod.rs index 9d648c06c8c..08657f7e75a 100644 --- a/tests/integrations/raftstore/mod.rs +++ b/tests/integrations/raftstore/mod.rs @@ -32,3 +32,4 @@ mod test_transfer_leader; mod test_transport; mod test_unsafe_recovery; mod test_update_region_size; +mod test_witness; diff --git a/tests/integrations/raftstore/test_flashback.rs b/tests/integrations/raftstore/test_flashback.rs index 5227e7ea6bc..89a61223fa2 100644 --- a/tests/integrations/raftstore/test_flashback.rs +++ b/tests/integrations/raftstore/test_flashback.rs @@ -9,17 +9,19 @@ use futures::{channel::oneshot, executor::block_on}; use kvproto::{ errorpb::FlashbackInProgress, metapb, - raft_cmdpb::{AdminCmdType, CmdType, Request}, + raft_cmdpb::{AdminCmdType, RaftCmdResponse, Request}, }; use raftstore::store::Callback; use test_raftstore::*; use txn_types::WriteBatchFlags; +const TEST_KEY: &[u8] = b"k1"; +const TEST_VALUE: &[u8] = b"v1"; + #[test] fn test_prepare_flashback_after_split() { let mut cluster = new_node_cluster(0, 3); cluster.run(); - cluster.must_transfer_leader(1, new_peer(1, 1)); let old_region = cluster.get_region(b"a"); @@ -126,56 +128,42 @@ fn test_prepare_flashback_after_conf_change() { fn test_flashback_unprepared() { let mut cluster = new_node_cluster(0, 3); cluster.run(); - - cluster.must_transfer_leader(1, new_peer(2, 2)); 
cluster.must_transfer_leader(1, new_peer(1, 1)); - let mut region = cluster.get_region(b"k1"); - let mut cmd = Request::default(); - cmd.set_cmd_type(CmdType::Put); - let mut req = new_request( - region.get_id(), - region.take_region_epoch(), - vec![cmd], - false, + let mut region = cluster.get_region(TEST_KEY); + must_get_flashback_not_prepared_error( + &mut cluster, + &mut region, + new_put_cmd(TEST_KEY, TEST_VALUE), ); - let new_leader = cluster.query_leader(1, region.get_id(), Duration::from_secs(1)); - req.mut_header().set_peer(new_leader.unwrap()); - req.mut_header() - .set_flags(WriteBatchFlags::FLASHBACK.bits()); - let resp = cluster.call_command(req, Duration::from_secs(3)).unwrap(); - assert!(resp.get_header().get_error().has_flashback_not_prepared()); } #[test] fn test_flashback_for_schedule() { let mut cluster = new_node_cluster(0, 3); cluster.run(); - cluster.must_transfer_leader(1, new_peer(2, 2)); cluster.must_transfer_leader(1, new_peer(1, 1)); - // Prepare for flashback - let region = cluster.get_region(b"k1"); + // Prepare flashback. + let region = cluster.get_region(TEST_KEY); cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); - - // Verify the schedule is disabled. - let mut region = cluster.get_region(b"k3"); + // Make sure the schedule is disabled. + let mut region = cluster.get_region(TEST_KEY); let admin_req = new_transfer_leader_cmd(new_peer(2, 2)); let transfer_leader = new_admin_request(region.get_id(), ®ion.take_region_epoch(), admin_req); let resp = cluster .call_command_on_leader(transfer_leader, Duration::from_secs(3)) .unwrap(); - let e = resp.get_header().get_error(); assert_eq!( - e.get_flashback_in_progress(), + resp.get_header().get_error().get_flashback_in_progress(), &FlashbackInProgress { region_id: region.get_id(), ..Default::default() } ); - + // Finish flashback. 
cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); // Transfer leader to (2, 2) should succeed. cluster.must_transfer_leader(1, new_peer(2, 2)); @@ -187,27 +175,33 @@ fn test_flashback_for_write() { cluster.run(); cluster.must_transfer_leader(1, new_peer(1, 1)); - // Write for cluster - let value = vec![1_u8; 8096]; - multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); - - // Prepare for flashback - let region = cluster.get_region(b"k1"); + // Write without flashback flag. + let mut region = cluster.get_region(TEST_KEY); + must_request_without_flashback_flag( + &mut cluster, + &mut region.clone(), + new_put_cmd(TEST_KEY, TEST_VALUE), + ); + // Prepare flashback. cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); - // Write will be blocked - let value = vec![1_u8; 8096]; - must_get_error_flashback_in_progress(&mut cluster, ®ion, new_put_cmd(b"k1", &value)); - // Write with flashback flag will succeed - must_do_cmd_with_flashback_flag( + must_get_flashback_in_progress_error( &mut cluster, &mut region.clone(), - new_put_cmd(b"k1", &value), + new_put_cmd(TEST_KEY, TEST_VALUE), + ); + // Write with flashback flag will succeed. 
+ must_request_with_flashback_flag( + &mut cluster, + &mut region.clone(), + new_put_cmd(TEST_KEY, TEST_VALUE), ); - cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); - - multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); + must_request_without_flashback_flag( + &mut cluster, + &mut region, + new_put_cmd(TEST_KEY, TEST_VALUE), + ); } #[test] @@ -216,30 +210,18 @@ fn test_flashback_for_read() { cluster.run(); cluster.must_transfer_leader(1, new_peer(1, 1)); - // Write for cluster - let value = vec![1_u8; 8096]; - multi_do_cmd(&mut cluster, new_put_cf_cmd("write", b"k1", &value)); - // read for cluster - multi_do_cmd(&mut cluster, new_get_cf_cmd("write", b"k1")); - - // Prepare for flashback - let region = cluster.get_region(b"k1"); + // Read without flashback flag. + let mut region = cluster.get_region(TEST_KEY); + must_request_without_flashback_flag(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); + // Prepare flashback. cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); - - // read will be blocked - must_get_error_flashback_in_progress(&mut cluster, ®ion, new_get_cf_cmd("write", b"k1")); - - // Verify the read can be executed if add flashback flag in request's - // header. - must_do_cmd_with_flashback_flag( - &mut cluster, - &mut region.clone(), - new_get_cf_cmd("write", b"k1"), - ); - + // Read will be blocked. + must_get_flashback_in_progress_error(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); + // Read with flashback flag will succeed. + must_request_with_flashback_flag(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); + // Finish flashback. cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); - - multi_do_cmd(&mut cluster, new_get_cf_cmd("write", b"k1")); + must_request_without_flashback_flag(&mut cluster, &mut region, new_get_cmd(TEST_KEY)); } // LocalReader will attempt to renew the lease. 
@@ -249,62 +231,44 @@ fn test_flashback_for_read() { fn test_flashback_for_local_read() { let mut cluster = new_node_cluster(0, 3); let election_timeout = configure_for_lease_read(&mut cluster, Some(50), None); - // Avoid triggering the log compaction in this test case. cluster.cfg.raft_store.raft_log_gc_threshold = 100; - + cluster.run(); + cluster.must_put(TEST_KEY, TEST_VALUE); + let mut region = cluster.get_region(TEST_KEY); let store_id = 3; let peer = new_peer(store_id, 3); - cluster.run(); - - cluster.must_put(b"k1", b"v1"); - let region = cluster.get_region(b"k1"); - cluster.must_transfer_leader(region.get_id(), peer.clone()); + cluster.must_transfer_leader(region.get_id(), peer); // Check local read before prepare flashback let state = cluster.raft_local_state(region.get_id(), store_id); let last_index = state.get_last_index(); // Make sure the leader transfer procedure timeouts. sleep(election_timeout * 2); - must_read_on_peer(&mut cluster, peer.clone(), region.clone(), b"k1", b"v1"); + must_request_without_flashback_flag(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); // Check the leader does a local read. let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index); - // Prepare for flashback + // Prepare flashback. cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); - // Check the leader does a local read. let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index + 1); // Wait for apply_res to set leader lease. sleep_ms(500); - - must_error_read_on_peer( - &mut cluster, - peer.clone(), - region.clone(), - b"k1", - Duration::from_secs(1), - ); - + // Read should fail. + must_get_flashback_in_progress_error(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); // Wait for the leader's lease to expire to ensure that a renew lease interval // has elapsed. 
sleep(election_timeout * 2); - must_error_read_on_peer( - &mut cluster, - peer.clone(), - region.clone(), - b"k1", - Duration::from_secs(1), - ); - + // Read should fail. + must_get_flashback_in_progress_error(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); // Also check read by propose was blocked let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index + 1); - + // Finish flashback. cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); - let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index + 2); @@ -313,11 +277,12 @@ fn test_flashback_for_local_read() { let last_index = state.get_last_index(); // Make sure the leader transfer procedure timeouts. sleep(election_timeout * 2); - must_read_on_peer(&mut cluster, peer, region.clone(), b"k1", b"v1"); - + must_request_without_flashback_flag(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); // Check the leader does a local read. let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index); + // A local read with flashback flag will also be blocked. 
+ must_get_flashback_not_prepared_error(&mut cluster, &mut region, new_get_cmd(TEST_KEY)); } #[test] @@ -326,7 +291,7 @@ fn test_flashback_for_status_cmd_as_region_detail() { cluster.run(); let leader = cluster.leader_of_region(1).unwrap(); - let region = cluster.get_region(b"k1"); + let region = cluster.get_region(TEST_KEY); cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); let region_detail = cluster.region_detail(region.get_id(), leader.get_store_id()); @@ -420,58 +385,63 @@ fn must_check_flashback_state( ); } -fn multi_do_cmd(cluster: &mut Cluster, cmd: Request) { - for _ in 0..100 { - let mut reqs = vec![]; - for _ in 0..100 { - reqs.push(cmd.clone()); - } - cluster.batch_put(b"k1", reqs).unwrap(); - } -} - -fn must_do_cmd_with_flashback_flag( +fn request( cluster: &mut Cluster, region: &mut metapb::Region, - cmd: Request, -) { - // Verify the read can be executed if add flashback flag in request's - // header. - let mut req = new_request( + req: Request, + with_flashback_flag: bool, +) -> RaftCmdResponse { + let mut cmd_req = new_request( region.get_id(), region.take_region_epoch(), - vec![cmd], + vec![req], false, ); let new_leader = cluster.query_leader(1, region.get_id(), Duration::from_secs(1)); - req.mut_header().set_peer(new_leader.unwrap()); - req.mut_header() - .set_flags(WriteBatchFlags::FLASHBACK.bits()); - let resp = cluster.call_command(req, Duration::from_secs(3)).unwrap(); + let header = cmd_req.mut_header(); + header.set_peer(new_leader.unwrap()); + if with_flashback_flag { + header.set_flags(WriteBatchFlags::FLASHBACK.bits()); + } + cluster + .call_command(cmd_req, Duration::from_secs(3)) + .unwrap() +} + +// Make sure the request could be executed with flashback flag. 
+fn must_request_with_flashback_flag( + cluster: &mut Cluster, + region: &mut metapb::Region, + req: Request, +) { + let resp = request(cluster, region, req, true); assert!(!resp.get_header().has_error()); } -fn must_get_error_flashback_in_progress( +fn must_get_flashback_not_prepared_error( cluster: &mut Cluster, - region: &metapb::Region, - cmd: Request, + region: &mut metapb::Region, + req: Request, ) { - for _ in 0..100 { - let mut reqs = vec![]; - for _ in 0..100 { - reqs.push(cmd.clone()); - } - match cluster.batch_put(b"k1", reqs) { - Ok(_) => {} - Err(e) => { - assert_eq!( - e.get_flashback_in_progress(), - &FlashbackInProgress { - region_id: region.get_id(), - ..Default::default() - } - ); - } - } - } + let resp = request(cluster, region, req, true); + assert!(resp.get_header().get_error().has_flashback_not_prepared()); +} + +// Make sure the request could be executed without flashback flag. +fn must_request_without_flashback_flag( + cluster: &mut Cluster, + region: &mut metapb::Region, + req: Request, +) { + let resp = request(cluster, region, req, false); + assert!(!resp.get_header().has_error()); +} + +fn must_get_flashback_in_progress_error( + cluster: &mut Cluster, + region: &mut metapb::Region, + req: Request, +) { + let resp = request(cluster, region, req, false); + assert!(resp.get_header().get_error().has_flashback_in_progress()); } diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index 48adb2eb84c..c72ba5ac595 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -1298,6 +1298,8 @@ fn test_propose_in_memory_pessimistic_locks() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; txn_ext .pessimistic_locks @@ -1314,6 +1316,8 @@ fn test_propose_in_memory_pessimistic_locks() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 5.into(), + 
versions_to_last_change: 3, }; txn_ext .pessimistic_locks @@ -1421,6 +1425,8 @@ fn test_merge_pessimistic_locks_repeated_merge() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; txn_ext .pessimistic_locks diff --git a/tests/integrations/raftstore/test_multi.rs b/tests/integrations/raftstore/test_multi.rs index 2cda3b8a0b8..ef368bbe0cb 100644 --- a/tests/integrations/raftstore/test_multi.rs +++ b/tests/integrations/raftstore/test_multi.rs @@ -833,6 +833,8 @@ fn test_leader_drop_with_pessimistic_lock() { ttl: 1000, for_update_ts: 10.into(), min_commit_ts: 10.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }, )]) .unwrap(); diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index 6ac72f668db..10771c57863 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -963,6 +963,8 @@ fn test_split_with_in_memory_pessimistic_locks() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; let lock_c = PessimisticLock { primary: b"c".to_vec().into_boxed_slice(), @@ -970,6 +972,8 @@ fn test_split_with_in_memory_pessimistic_locks() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; { let mut locks = txn_ext.pessimistic_locks.write(); diff --git a/tests/integrations/raftstore/test_transfer_leader.rs b/tests/integrations/raftstore/test_transfer_leader.rs index b0fade84d8b..b4f8c33d54d 100644 --- a/tests/integrations/raftstore/test_transfer_leader.rs +++ b/tests/integrations/raftstore/test_transfer_leader.rs @@ -304,6 +304,8 @@ fn test_propose_in_memory_pessimistic_locks() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; // Write a pessimistic 
lock to the in-memory pessimistic lock table. { @@ -344,6 +346,8 @@ fn test_memory_pessimistic_locks_status_after_transfer_leader_failure() { ttl: 3000, for_update_ts: 20.into(), min_commit_ts: 30.into(), + last_change_ts: 5.into(), + versions_to_last_change: 3, }; // Write a pessimistic lock to the in-memory pessimistic lock table. txn_ext diff --git a/tests/integrations/raftstore/test_unsafe_recovery.rs b/tests/integrations/raftstore/test_unsafe_recovery.rs index 505bd3bd0e4..a2c2ea75c64 100644 --- a/tests/integrations/raftstore/test_unsafe_recovery.rs +++ b/tests/integrations/raftstore/test_unsafe_recovery.rs @@ -677,7 +677,6 @@ fn test_force_leader_on_hibernated_leader() { // previous follower. #[test] fn test_force_leader_on_hibernated_follower() { - test_util::init_log_for_test(); let mut cluster = new_node_cluster(0, 5); cluster.pd_client.disable_default_operator(); diff --git a/tests/integrations/raftstore/test_witness.rs b/tests/integrations/raftstore/test_witness.rs new file mode 100644 index 00000000000..a2518cc64ae --- /dev/null +++ b/tests/integrations/raftstore/test_witness.rs @@ -0,0 +1,537 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{iter::FromIterator, sync::Arc, time::Duration}; + +use futures::executor::block_on; +use kvproto::{metapb, raft_cmdpb::ChangePeerRequest, raft_serverpb::PeerState}; +use pd_client::PdClient; +use raft::eraftpb::ConfChangeType; +use test_raftstore::*; +use tikv_util::store::find_peer; + +fn become_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { + peer.set_role(metapb::PeerRole::Learner); + cluster.pd_client.must_add_peer(region_id, peer.clone()); + cluster.pd_client.must_remove_peer(region_id, peer.clone()); + peer.set_is_witness(true); + peer.set_id(peer.get_id() + 10); + cluster.pd_client.must_add_peer(region_id, peer.clone()); + peer.set_role(metapb::PeerRole::Voter); + cluster.pd_client.must_add_peer(region_id, peer.clone()); +} + +fn become_non_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { + peer.set_role(metapb::PeerRole::Learner); + cluster.pd_client.must_add_peer(region_id, peer.clone()); + cluster.pd_client.must_remove_peer(region_id, peer.clone()); + peer.set_is_witness(false); + peer.set_id(peer.get_id() + 10); + cluster.pd_client.must_add_peer(region_id, peer.clone()); + peer.set_role(metapb::PeerRole::Voter); + cluster.pd_client.must_add_peer(region_id, peer.clone()); +} + +// Test the case that region split or merge with witness peer +#[test] +fn test_witness_split_merge() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + // nonwitness -> witness + let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + become_witness(&cluster, region.get_id(), &mut peer_on_store3); + + let before = cluster + .apply_state(region.get_id(), nodes[2]) + .get_applied_index(); + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k2", 
b"v2"); + cluster.must_split(®ion, b"k2"); + must_get_none(&cluster.get_engine(3), b"k1"); + must_get_none(&cluster.get_engine(3), b"k2"); + // applied index of witness is updated + let after = cluster + .apply_state(region.get_id(), nodes[2]) + .get_applied_index(); + assert!(after - before >= 3); + + // the newly split peer should be witness as well + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k2"); + assert_ne!(left.get_id(), right.get_id()); + assert!(find_peer(&left, nodes[2]).unwrap().is_witness); + assert!(find_peer(&right, nodes[2]).unwrap().is_witness); + + // merge + pd_client.must_merge(left.get_id(), right.get_id()); + let after_merge = cluster.get_region(b"k1"); + assert!(find_peer(&after_merge, nodes[2]).unwrap().is_witness); + must_get_none(&cluster.get_engine(3), b"k1"); + must_get_none(&cluster.get_engine(3), b"k2"); + // epoch of witness is updated + assert_eq!( + cluster + .region_local_state(after_merge.get_id(), nodes[2]) + .get_region() + .get_region_epoch(), + after_merge.get_region_epoch() + ); + + // split again + cluster.must_split(&after_merge, b"k2"); + let left = cluster.get_region(b"k1"); + let right = cluster.get_region(b"k2"); + assert!(find_peer(&left, nodes[2]).unwrap().is_witness); + assert!(find_peer(&right, nodes[2]).unwrap().is_witness); + + // can't merge with different witness location + let mut peer_on_store3 = find_peer(&left, nodes[2]).unwrap().clone(); + become_non_witness(&cluster, left.get_id(), &mut peer_on_store3); + let left = cluster.get_region(b"k1"); + let req = new_admin_request( + left.get_id(), + left.get_region_epoch(), + new_prepare_merge(right), + ); + let resp = cluster + .call_command_on_leader(req, Duration::from_millis(100)) + .unwrap(); + assert!( + resp.get_header() + .get_error() + .get_message() + .contains("peers doesn't match") + ); +} + +// Test flow of witness conf change +#[test] +fn test_witness_conf_change() { + let mut cluster = new_server_cluster(0, 3); + 
cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k1", b"v1"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + // can't switch witness by conf change + let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + let mut peer = peer_on_store3.clone(); + peer.set_is_witness(true); + let mut cp = ChangePeerRequest::default(); + cp.set_change_type(ConfChangeType::AddLearnerNode); + cp.set_peer(peer); + let req = new_admin_request( + region.get_id(), + region.get_region_epoch(), + new_change_peer_v2_request(vec![cp]), + ); + let resp = cluster + .call_command_on_leader(req, Duration::from_millis(100)) + .unwrap(); + assert!(resp.get_header().has_error()); + + // add a new witness peer + cluster + .pd_client + .must_remove_peer(region.get_id(), peer_on_store3.clone()); + peer_on_store3.set_is_witness(true); + let applied_index = cluster.apply_state(1, 2).applied_index; + cluster + .pd_client + .must_add_peer(region.get_id(), peer_on_store3.clone()); + must_get_none(&cluster.get_engine(3), b"k1"); + let region = cluster.get_region(b"k1"); + cluster.wait_applied_index(region.get_id(), nodes[2], applied_index + 1); + assert_eq!( + cluster + .region_local_state(region.get_id(), nodes[2]) + .get_region(), + ®ion + ); + + // remove a witness peer + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster + .pd_client + .must_remove_peer(region.get_id(), peer_on_store3); + + assert_eq!( + cluster + .region_local_state(region.get_id(), nodes[2]) + .get_state(), + PeerState::Tombstone + ); +} + +// #[test] +// // Test flow of switch witness +// fn test_witness_switch_witness() { +// let mut cluster = new_server_cluster(0, 3); +// 
cluster.run(); +// let nodes = Vec::from_iter(cluster.get_node_ids()); +// assert_eq!(nodes.len(), 3); + +// let pd_client = Arc::clone(&cluster.pd_client); +// pd_client.disable_default_operator(); + +// cluster.must_put(b"k1", b"v1"); + +// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); +// let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); +// cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + +// // nonwitness -> witness +// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); +// become_witness(&cluster, region.get_id(), &mut peer_on_store3); + +// std::thread::sleep(Duration::from_millis(100)); +// must_get_none(&cluster.get_engine(3), b"k1"); + +// // witness -> nonwitness +// peer_on_store3.set_role(metapb::PeerRole::Learner); +// cluster +// .pd_client +// .must_add_peer(region.get_id(), peer_on_store3.clone()); +// cluster +// .pd_client +// .must_remove_peer(region.get_id(), peer_on_store3.clone()); +// peer_on_store3.set_is_witness(false); +// cluster +// .pd_client +// .must_add_peer(region.get_id(), peer_on_store3.clone()); +// std::thread::sleep(Duration::from_millis(100)); +// must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); +// } + +// TODO: add back when switch witness is supported +// // Test the case that leader is forbidden to become witness +// #[test] +// fn test_witness_leader() { +// let mut cluster = new_server_cluster(0, 3); +// cluster.run(); +// let nodes = Vec::from_iter(cluster.get_node_ids()); +// assert_eq!(nodes.len(), 3); + +// let pd_client = Arc::clone(&cluster.pd_client); +// pd_client.disable_default_operator(); + +// cluster.must_put(b"k1", b"v1"); + +// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); +// let mut peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); +// cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + +// // can't make leader to witness +// peer_on_store1.set_is_witness(true); +// 
cluster +// .pd_client +// .add_peer(region.get_id(), peer_on_store1.clone()); + +// std::thread::sleep(Duration::from_millis(100)); +// assert_eq!( +// cluster.leader_of_region(region.get_id()).unwrap().store_id, +// 1 +// ); +// // leader changes to witness failed, so still can get the value +// must_get_equal(&cluster.get_engine(nodes[0]), b"k1", b"v1"); + +// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); +// // can't transfer leader to witness +// cluster.transfer_leader(region.get_id(), &mut peer_on_store3); +// assert_eq!( +// cluster.leader_of_region(region.get_id()).unwrap().store_id, +// nodes[0], +// ); +// } + +// TODO: add back when election priority is supported +// // Test the case that witness can't be elected as leader based on election +// // priority when there is no log gap +// #[test] +// fn test_witness_election_priority() { +// let mut cluster = new_server_cluster(0, 3); +// cluster.run(); +// let nodes = Vec::from_iter(cluster.get_node_ids()); +// assert_eq!(nodes.len(), 3); + +// let pd_client = Arc::clone(&cluster.pd_client); +// pd_client.disable_default_operator(); + +// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); +// // nonwitness -> witness +// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); +// become_witness(&cluster, region.get_id(), &mut peer_on_store3); +// cluster.must_put(b"k0", b"v0"); + +// // make sure logs are replicated to the witness +// std::thread::sleep(Duration::from_millis(100)); + +// for i in 1..10 { +// let node = +// cluster.leader_of_region(region.get_id()).unwrap().store_id; cluster. 
+// stop_node(node); let (k, v) = (format!("k{}", i), format!("v{}", i)); +// let key = k.as_bytes(); +// let value = v.as_bytes(); +// cluster.must_put(key, value); +// // the witness can't be elected as the leader when there is no log +// gap assert_ne!( +// cluster.leader_of_region(region.get_id()).unwrap().store_id, +// nodes[2], +// ); +// cluster.run_node(node).unwrap(); +// } +// } + +// TODO: add back when raft log gc logic is updated for witness +// // Test the case that truncated index won't advance when there is a witness +// even // if the gap gap exceeds the gc count limit +// #[test] +// fn test_witness_raftlog_gc_lagged_follower() { +// let mut cluster = new_server_cluster(0, 3); +// cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); +// cluster.run(); +// let nodes = Vec::from_iter(cluster.get_node_ids()); +// assert_eq!(nodes.len(), 3); + +// let pd_client = Arc::clone(&cluster.pd_client); +// pd_client.disable_default_operator(); + +// cluster.must_put(b"k0", b"v0"); + +// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); +// let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); +// cluster.must_transfer_leader(region.get_id(), peer_on_store1); +// // nonwitness -> witness +// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); +// become_witness(&cluster, region.get_id(), &mut peer_on_store3); + +// // make sure raft log gc is triggered +// std::thread::sleep(Duration::from_millis(200)); +// let mut before_states = HashMap::default(); +// for (&id, engines) in &cluster.engines { +// let mut state: RaftApplyState = get_raft_msg_or_default(engines, +// &keys::apply_state_key(1)); before_states.insert(id, +// state.take_truncated_state()); } + +// // one follower is down +// cluster.stop_node(nodes[1]); + +// // write some data to make log gap exceeds the gc limit +// for i in 1..1000 { +// let (k, v) = (format!("k{}", i), format!("v{}", i)); +// let key = k.as_bytes(); +// let value = 
v.as_bytes(); +// cluster.must_put(key, value); +// } + +// // the truncated index is not advanced +// for (&id, engines) in &cluster.engines { +// let state: RaftApplyState = get_raft_msg_or_default(engines, +// &keys::apply_state_key(1)); assert!(state.get_truncated_state(). +// get_index() - before_states[&id].get_index() < 10); } + +// // the follower is back online +// cluster.run_node(nodes[1]).unwrap(); +// cluster.must_put(b"k00", b"v00"); +// must_get_equal(&cluster.get_engine(nodes[1]), b"k00", b"v00"); +// // make sure raft log gc is triggered +// std::thread::sleep(Duration::from_millis(300)); + +// // the truncated index is advanced now, as all the peers has replicated +// for (&id, engines) in &cluster.engines { +// let state: RaftApplyState = get_raft_msg_or_default(engines, +// &keys::apply_state_key(1)); assert_ge!( +// state.get_truncated_state().get_index() - +// before_states[&id].get_index(), 900 +// ); +// } +// } + +// TODO: add back when raft log gc logic is updated for witness +// // Test the case that truncated index is advance when there is a lagged +// witness #[test] +// fn test_witness_raftlog_gc_lagged_witness() { +// let mut cluster = new_server_cluster(0, 3); +// cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); +// cluster.run(); +// let nodes = Vec::from_iter(cluster.get_node_ids()); +// assert_eq!(nodes.len(), 3); + +// let pd_client = Arc::clone(&cluster.pd_client); +// pd_client.disable_default_operator(); + +// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); +// let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); +// cluster.must_transfer_leader(region.get_id(), peer_on_store1); +// // nonwitness -> witness +// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); +// become_witness(&cluster, region.get_id(), &mut peer_on_store3); +// cluster.must_put(b"k0", b"v0"); + +// // make sure raft log gc is triggered +// std::thread::sleep(Duration::from_millis(200)); +// 
let mut before_states = HashMap::default(); +// for (&id, engines) in &cluster.engines { +// let mut state: RaftApplyState = get_raft_msg_or_default(engines, +// &keys::apply_state_key(1)); before_states.insert(id, +// state.take_truncated_state()); } + +// // the witness is down +// cluster.stop_node(nodes[2]); + +// // write some data to make log gap exceeds the gc limit +// for i in 1..1000 { +// let (k, v) = (format!("k{}", i), format!("v{}", i)); +// let key = k.as_bytes(); +// let value = v.as_bytes(); +// cluster.must_put(key, value); +// } + +// // the witness is back online +// cluster.run_node(nodes[2]).unwrap(); + +// cluster.must_put(b"k00", b"v00"); +// std::thread::sleep(Duration::from_millis(200)); + +// // the truncated index is advanced +// for (&id, engines) in &cluster.engines { +// let state: RaftApplyState = get_raft_msg_or_default(engines, +// &keys::apply_state_key(1)); println!("{} {}", id, +// state.get_truncated_state().get_index()); assert_ge!( +// state.get_truncated_state().get_index() - +// before_states[&id].get_index(), 900 +// ); +// } +// } + +// Test the case replica read can't be performed on witness peer. 
+#[test] +fn test_witness_replica_read() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k0", b"v0"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + become_witness(&cluster, region.get_id(), &mut peer_on_store3); + + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_get_cmd(b"k0")], + false, + ); + request.mut_header().set_peer(peer_on_store3); + request.mut_header().set_replica_read(true); + + let resp = cluster + .read(None, request, Duration::from_millis(100)) + .unwrap(); + assert_eq!( + resp.get_header().get_error().get_recovery_in_progress(), + &kvproto::errorpb::RecoveryInProgress { + region_id: region.get_id(), + ..Default::default() + } + ); +} + +fn must_get_error_recovery_in_progress( + cluster: &mut Cluster, + region: &metapb::Region, + cmd: kvproto::raft_cmdpb::Request, +) { + let req = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![cmd], + true, + ); + let resp = cluster + .call_command_on_leader(req, Duration::from_millis(100)) + .unwrap(); + assert_eq!( + resp.get_header().get_error().get_recovery_in_progress(), + &kvproto::errorpb::RecoveryInProgress { + region_id: region.get_id(), + ..Default::default() + }, + "{:?}", + resp + ); +} + +// Test the case that witness replicate logs to lagging behind follower when +// leader is down +#[test] +fn test_witness_leader_down() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + + let pd_client 
= Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k0", b"v0"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + + let mut peer_on_store2 = find_peer(®ion, nodes[1]).unwrap().clone(); + // nonwitness -> witness + become_witness(&cluster, region.get_id(), &mut peer_on_store2); + + // the other follower is isolated + cluster.add_send_filter(IsolationFilterFactory::new(3)); + for i in 1..10 { + cluster.must_put(format!("k{}", i).as_bytes(), format!("v{}", i).as_bytes()); + } + // the leader is down + cluster.stop_node(1); + + // witness would help to replicate the logs + cluster.clear_send_filters(); + + // forbid writes + let put = new_put_cmd(b"k3", b"v3"); + must_get_error_recovery_in_progress(&mut cluster, ®ion, put); + // forbid reads + let get = new_get_cmd(b"k1"); + must_get_error_recovery_in_progress(&mut cluster, ®ion, get); + // forbid read index + let read_index = new_read_index_cmd(); + must_get_error_recovery_in_progress(&mut cluster, ®ion, read_index); + + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store3); + cluster.must_put(b"k1", b"v1"); + assert_eq!( + cluster.leader_of_region(region.get_id()).unwrap().store_id, + nodes[2], + ); + assert_eq!(cluster.must_get(b"k9"), Some(b"v9".to_vec())); +} diff --git a/tests/integrations/resource_metering/test_read_keys.rs b/tests/integrations/resource_metering/test_read_keys.rs index 87ad50024ad..35ef0e2ba88 100644 --- a/tests/integrations/resource_metering/test_read_keys.rs +++ b/tests/integrations/resource_metering/test_read_keys.rs @@ -50,31 +50,7 @@ pub fn test_read_keys() { let (k, v) = (n.clone(), n); // Prewrite. 
- ts += 1; - let prewrite_start_version = ts; - let mut mutation = Mutation::default(); - mutation.set_op(Op::Put); - mutation.set_key(k.clone()); - mutation.set_value(v.clone()); - must_kv_prewrite( - &client, - ctx.clone(), - vec![mutation], - k.clone(), - prewrite_start_version, - ); - - // Commit. - ts += 1; - let commit_version = ts; - must_kv_commit( - &client, - ctx.clone(), - vec![k.clone()], - prewrite_start_version, - commit_version, - commit_version, - ); + write_and_read_key(&client, &ctx, &mut ts, k.clone(), v.clone()); } // PointGet diff --git a/tests/integrations/server/gc_worker.rs b/tests/integrations/server/gc_worker.rs index 36f9eed9ca8..cfadde84405 100644 --- a/tests/integrations/server/gc_worker.rs +++ b/tests/integrations/server/gc_worker.rs @@ -2,271 +2,15 @@ use std::sync::Arc; -use collections::HashMap; use engine_traits::{Peekable, CF_WRITE}; use grpcio::{ChannelBuilder, Environment}; use keys::data_key; -use kvproto::{kvrpcpb::*, metapb, tikvpb::TikvClient}; +use kvproto::{kvrpcpb::*, tikvpb::TikvClient}; use test_raftstore::*; use tikv::server::gc_worker::sync_gc; use tikv_util::HandyRwLock; use txn_types::Key; -#[test] -fn test_physical_scan_lock() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); - - // Generate kvs like k10, v10, ts=10; k11, v11, ts=11; ... - let kv: Vec<_> = (10..20) - .map(|i| (i, vec![b'k', i as u8], vec![b'v', i as u8])) - .collect(); - - for (ts, k, v) in &kv { - let mut mutation = Mutation::default(); - mutation.set_op(Op::Put); - mutation.set_key(k.clone()); - mutation.set_value(v.clone()); - must_kv_prewrite(&client, ctx.clone(), vec![mutation], k.clone(), *ts); - } - - let all_locks: Vec<_> = kv - .into_iter() - .map(|(ts, k, _)| { - // Create a LockInfo that matches the prewrite request in `must_kv_prewrite`. 
- let mut lock_info = LockInfo::default(); - lock_info.set_primary_lock(k.clone()); - lock_info.set_lock_version(ts); - lock_info.set_key(k); - lock_info.set_lock_ttl(3000); - lock_info.set_lock_type(Op::Put); - lock_info.set_min_commit_ts(ts + 1); - lock_info - }) - .collect(); - - let check_result = |got_locks: &[_], expected_locks: &[_]| { - for i in 0..std::cmp::max(got_locks.len(), expected_locks.len()) { - assert_eq!(got_locks[i], expected_locks[i], "lock {} mismatch", i); - } - }; - - check_result( - &must_physical_scan_lock(&client, ctx.clone(), 30, b"", 100), - &all_locks, - ); - check_result( - &must_physical_scan_lock(&client, ctx.clone(), 15, b"", 100), - &all_locks[0..=5], - ); - check_result( - &must_physical_scan_lock(&client, ctx.clone(), 10, b"", 100), - &all_locks[0..1], - ); - check_result( - &must_physical_scan_lock(&client, ctx.clone(), 9, b"", 100), - &[], - ); - check_result( - &must_physical_scan_lock(&client, ctx, 30, &[b'k', 13], 5), - &all_locks[3..8], - ); -} - -#[test] -fn test_applied_lock_collector() { - let mut cluster = new_server_cluster(0, 3); - cluster.pd_client.disable_default_operator(); - cluster.run(); - - // Create all stores' clients. - let env = Arc::new(Environment::new(1)); - let mut clients = HashMap::default(); - for node_id in cluster.get_node_ids() { - let channel = - ChannelBuilder::new(Arc::clone(&env)).connect(&cluster.sim.rl().get_addr(node_id)); - let client = TikvClient::new(channel); - clients.insert(node_id, client); - } - - // Create the ctx of the first region. 
- let region = cluster.get_region(b""); - let region_id = region.get_id(); - let leader_peer = cluster.leader_of_region(region_id).unwrap(); - let leader_store_id = leader_peer.get_store_id(); - let leader_client = clients.get(&leader_store_id).unwrap(); - let mut ctx = Context::default(); - ctx.set_region_id(region_id); - ctx.set_peer(leader_peer); - ctx.set_region_epoch(cluster.get_region_epoch(region_id)); - - // It's used to make sure all stores applies all logs. - let wait_for_apply = |cluster: &mut Cluster<_>, region: &metapb::Region| { - let cluster = &mut *cluster; - region.get_peers().iter().for_each(|p| { - let mut retry_times = 1; - loop { - let resp = - async_read_on_peer(cluster, p.clone(), region.clone(), b"key", true, true) - .recv() - .unwrap(); - if !resp.get_header().has_error() { - return; - } - if retry_times >= 50 { - panic!("failed to read on {:?}: {:?}", p, resp); - } - retry_times += 1; - sleep_ms(20); - } - }); - }; - - let check_lock = |lock: &LockInfo, k: &[u8], pk: &[u8], ts| { - assert_eq!(lock.get_key(), k); - assert_eq!(lock.get_primary_lock(), pk); - assert_eq!(lock.get_lock_version(), ts); - }; - - // Register lock observer at safe point 10000. - let mut safe_point = 10000; - clients.iter().for_each(|(_, c)| { - // Should report error when checking non-existent observer. - assert!(!check_lock_observer(c, safe_point).get_error().is_empty()); - must_register_lock_observer(c, safe_point); - assert!(must_check_lock_observer(c, safe_point, true).is_empty()); - }); - - // Lock observer should only collect values in lock CF. 
- let key = b"key0"; - must_kv_prewrite( - leader_client, - ctx.clone(), - vec![new_mutation(Op::Put, key, &b"v".repeat(1024))], - key.to_vec(), - 1, - ); - must_kv_commit(leader_client, ctx.clone(), vec![key.to_vec()], 1, 2, 2); - wait_for_apply(&mut cluster, ®ion); - clients.iter().for_each(|(_, c)| { - let locks = must_check_lock_observer(c, safe_point, true); - assert_eq!(locks.len(), 1); - check_lock(&locks[0], key, key, 1); - }); - - // Lock observer shouldn't collect locks after the safe point. - must_kv_prewrite( - leader_client, - ctx.clone(), - vec![new_mutation(Op::Put, key, b"v")], - key.to_vec(), - safe_point + 1, - ); - wait_for_apply(&mut cluster, ®ion); - clients.iter().for_each(|(_, c)| { - let locks = must_check_lock_observer(c, safe_point, true); - assert_eq!(locks.len(), 1); - check_lock(&locks[0], key, key, 1); - }); - - // Write 999 locks whose timestamp is less than the safe point. - let mutations = (1..1000) - .map(|i| new_mutation(Op::Put, format!("key{}", i).as_bytes(), b"v")) - .collect(); - must_kv_prewrite(leader_client, ctx.clone(), mutations, b"key1".to_vec(), 10); - wait_for_apply(&mut cluster, ®ion); - clients.iter().for_each(|(_, c)| { - let locks = must_check_lock_observer(c, safe_point, true); - // Plus the first lock. - assert_eq!(locks.len(), 1000); - }); - - // Add a new store and register lock observer. - let store_id = cluster.add_new_engine(); - let channel = - ChannelBuilder::new(Arc::clone(&env)).connect(&cluster.sim.rl().get_addr(store_id)); - let client = TikvClient::new(channel); - must_register_lock_observer(&client, safe_point); - - // Add a new peer. Lock observer should collect all locks from snapshot. 
- let peer = new_peer(store_id, store_id); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - cluster.pd_client.must_none_pending_peer(peer); - wait_for_apply(&mut cluster, ®ion); - let locks = must_check_lock_observer(&client, safe_point, true); - assert_eq!(locks.len(), 999); - - // Should be dirty when collects too many locks. - let mutations = (1000..1100) - .map(|i| new_mutation(Op::Put, format!("key{}", i).as_bytes(), b"v")) - .collect(); - must_kv_prewrite( - leader_client, - ctx.clone(), - mutations, - b"key1000".to_vec(), - 100, - ); - wait_for_apply(&mut cluster, ®ion); - clients.insert(store_id, client); - clients.iter().for_each(|(_, c)| { - let resp = check_lock_observer(c, safe_point); - assert!(resp.get_error().is_empty(), "{:?}", resp.get_error()); - assert!(!resp.get_is_clean()); - // MAX_COLLECT_SIZE is 1024. - assert_eq!(resp.get_locks().len(), 1024); - }); - - // Reregister and check. It shouldn't clean up state. - clients.iter().for_each(|(_, c)| { - must_register_lock_observer(c, safe_point); - let resp = check_lock_observer(c, safe_point); - assert!(resp.get_error().is_empty(), "{:?}", resp.get_error()); - assert!(!resp.get_is_clean()); - // MAX_COLLECT_SIZE is 1024. - assert_eq!(resp.get_locks().len(), 1024); - }); - - // Register lock observer at a later safe point. Lock observer should reset its - // state. - safe_point += 1; - clients.iter().for_each(|(_, c)| { - must_register_lock_observer(c, safe_point); - assert!(must_check_lock_observer(c, safe_point, true).is_empty()); - // Can't register observer with smaller max_ts. 
- assert!( - !register_lock_observer(c, safe_point - 1) - .get_error() - .is_empty() - ); - assert!(must_check_lock_observer(c, safe_point, true).is_empty()); - }); - let leader_client = clients.get(&leader_store_id).unwrap(); - must_kv_prewrite( - leader_client, - ctx, - vec![new_mutation(Op::Put, b"key1100", b"v")], - b"key1100".to_vec(), - safe_point, - ); - wait_for_apply(&mut cluster, ®ion); - clients.iter().for_each(|(_, c)| { - // Should collect locks according to the new max ts. - let locks = must_check_lock_observer(c, safe_point, true); - assert_eq!(locks.len(), 1, "{:?}", locks); - // Shouldn't remove it with a wrong max ts. - assert!( - !remove_lock_observer(c, safe_point - 1) - .get_error() - .is_empty() - ); - let locks = must_check_lock_observer(c, safe_point, true); - assert_eq!(locks.len(), 1, "{:?}", locks); - // Remove lock observers. - must_remove_lock_observer(c, safe_point); - assert!(!check_lock_observer(c, safe_point).get_error().is_empty()); - }); -} - // Since v5.0 GC bypasses Raft, which means GC scans/deletes records with // `keys::DATA_PREFIX`. This case ensures it's performed correctly. #[test] diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index f3e3bda8a24..12cff74861d 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -1,6 +1,7 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ + char::from_u32, path::Path, sync::*, thread, @@ -19,7 +20,7 @@ use grpcio_health::{proto::HealthCheckRequest, *}; use kvproto::{ coprocessor::*, debugpb, - kvrpcpb::{self, PrewriteRequestPessimisticAction::*, *}, + kvrpcpb::{PrewriteRequestPessimisticAction::*, *}, metapb, raft_serverpb, raft_serverpb::*, tikvpb::*, @@ -42,6 +43,7 @@ use tikv::{ gc_worker::sync_gc, service::{batch_commands_request, batch_commands_response}, }, + storage::txn::FLASHBACK_BATCH_SIZE, }; use tikv_util::{ config::ReadableSize, @@ -274,33 +276,7 @@ fn test_mvcc_basic() { let (k, v) = (b"key".to_vec(), b"value".to_vec()); let mut ts = 0; - - // Prewrite - ts += 1; - let prewrite_start_version = ts; - let mut mutation = Mutation::default(); - mutation.set_op(Op::Put); - mutation.set_key(k.clone()); - mutation.set_value(v.clone()); - must_kv_prewrite( - &client, - ctx.clone(), - vec![mutation], - k.clone(), - prewrite_start_version, - ); - - // Commit - ts += 1; - let commit_version = ts; - must_kv_commit( - &client, - ctx.clone(), - vec![k.clone()], - prewrite_start_version, - commit_version, - commit_version, - ); + write_and_read_key(&client, &ctx, &mut ts, k.clone(), v.clone()); // Get ts += 1; @@ -363,33 +339,7 @@ fn test_mvcc_rollback_and_cleanup() { let (k, v) = (b"key".to_vec(), b"value".to_vec()); let mut ts = 0; - - // Prewrite - ts += 1; - let prewrite_start_version = ts; - let mut mutation = Mutation::default(); - mutation.set_op(Op::Put); - mutation.set_key(k.clone()); - mutation.set_value(v); - must_kv_prewrite( - &client, - ctx.clone(), - vec![mutation], - k.clone(), - prewrite_start_version, - ); - - // Commit - ts += 1; - let commit_version = ts; - must_kv_commit( - &client, - ctx.clone(), - vec![k.clone()], - prewrite_start_version, - commit_version, - commit_version, - ); + write_and_read_key(&client, &ctx, &mut ts, k.clone(), v); // Prewrite puts some locks. 
ts += 1; @@ -597,43 +547,139 @@ fn test_mvcc_resolve_lock_gc_and_delete() { assert!(del_resp.error.is_empty()); } +#[test] +#[cfg(feature = "failpoints")] +fn test_mvcc_flashback_failed_after_first_batch() { + let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let mut ts = 0; + for i in 0..FLASHBACK_BATCH_SIZE * 2 { + // Meet the constraints of the alphabetical order for test + let k = format!("key@{}", from_u32(i as u32).unwrap()).into_bytes(); + write_and_read_key(&client, &ctx, &mut ts, k.clone(), b"value@0".to_vec()); + ts -= 3; + } + ts += 3; + let check_ts = ts; + for i in 0..FLASHBACK_BATCH_SIZE * 2 { + let k = format!("key@{}", from_u32(i as u32).unwrap()).into_bytes(); + write_and_read_key(&client, &ctx, &mut ts, k.clone(), b"value@1".to_vec()); + ts -= 3; + } + ts += 3; + // Flashback + fail::cfg("flashback_failed_after_first_batch", "return").unwrap(); + fail::cfg("flashback_skip_1_key_in_write", "1*return").unwrap(); + must_flashback_to_version(&client, ctx.clone(), check_ts, ts + 1, ts + 2); + fail::remove("flashback_skip_1_key_in_write"); + fail::remove("flashback_failed_after_first_batch"); + // skip for key@1 + must_kv_read_equal( + &client, + ctx.clone(), + format!("key@{}", from_u32(1_u32).unwrap()) + .as_bytes() + .to_vec(), + b"value@1".to_vec(), + ts + 2, + ); + // The first batch of writes are flashbacked. + must_kv_read_equal( + &client, + ctx.clone(), + format!("key@{}", from_u32(2_u32).unwrap()) + .as_bytes() + .to_vec(), + b"value@0".to_vec(), + ts + 2, + ); + // Subsequent batches of writes are not flashbacked. + must_kv_read_equal( + &client, + ctx.clone(), + format!("key@{}", from_u32(FLASHBACK_BATCH_SIZE as u32).unwrap()) + .as_bytes() + .to_vec(), + b"value@1".to_vec(), + ts + 2, + ); + // Flashback batch 2. 
+    fail::cfg("flashback_failed_after_first_batch", "return").unwrap();
+    must_flashback_to_version(&client, ctx.clone(), check_ts, ts + 1, ts + 2);
+    fail::remove("flashback_failed_after_first_batch");
+    // key@1 must be flashbacked first in the second batch.
+    must_kv_read_equal(
+        &client,
+        ctx.clone(),
+        format!("key@{}", from_u32(1_u32).unwrap())
+            .as_bytes()
+            .to_vec(),
+        b"value@0".to_vec(),
+        ts + 2,
+    );
+    must_kv_read_equal(
+        &client,
+        ctx.clone(),
+        format!("key@{}", from_u32(FLASHBACK_BATCH_SIZE as u32).unwrap())
+            .as_bytes()
+            .to_vec(),
+        b"value@0".to_vec(),
+        ts + 2,
+    );
+    // 2 * (FLASHBACK_BATCH_SIZE - 1) keys are flashbacked.
+    must_kv_read_equal(
+        &client,
+        ctx.clone(),
+        format!(
+            "key@{}",
+            from_u32(2 * FLASHBACK_BATCH_SIZE as u32 - 2).unwrap()
+        )
+        .as_bytes()
+        .to_vec(),
+        b"value@1".to_vec(),
+        ts + 2,
+    );
+    // Flashback needs to be continued.
+    must_flashback_to_version(&client, ctx.clone(), check_ts, ts + 1, ts + 2);
+    // Flashback again to check if any error occurs :)
+    must_flashback_to_version(&client, ctx.clone(), check_ts, ts + 1, ts + 2);
+    ts += 2;
+    // Subsequent batches of writes are flashbacked.
+    must_kv_read_equal(
+        &client,
+        ctx.clone(),
+        format!(
+            "key@{}",
+            from_u32(2 * FLASHBACK_BATCH_SIZE as u32 - 2).unwrap()
+        )
+        .as_bytes()
+        .to_vec(),
+        b"value@0".to_vec(),
+        ts,
+    );
+    // key@0, which is used as the prewrite lock, also needs to be flashbacked.
+    must_kv_read_equal(
+        &client,
+        ctx,
+        format!("key@{}", from_u32(0_u32).unwrap())
+            .as_bytes()
+            .to_vec(),
+        b"value@0".to_vec(),
+        ts,
+    );
+}
+
 #[test]
 fn test_mvcc_flashback() {
     let (_cluster, client, ctx) = must_new_cluster_and_kv_client();
     let mut ts = 0;
-    let k = b"key".to_vec();
-    for i in 0..10 {
+    // Need to write many batches.
+ for i in 0..2000 { let v = format!("value@{}", i).into_bytes(); - // Prewrite - ts += 1; - let prewrite_start_version = ts; - let mut mutation = Mutation::default(); - mutation.set_op(Op::Put); - mutation.set_key(k.clone()); - mutation.set_value(v.clone()); - must_kv_prewrite( - &client, - ctx.clone(), - vec![mutation], - k.clone(), - prewrite_start_version, - ); - // Commit - ts += 1; - let commit_version = ts; - must_kv_commit( - &client, - ctx.clone(), - vec![k.clone()], - prewrite_start_version, - commit_version, - commit_version, - ); - // Get - ts += 1; - must_kv_read_equal(&client, ctx.clone(), k.clone(), v.clone(), ts) + let k = format!("key@{}", i % 1000).into_bytes(); + write_and_read_key(&client, &ctx, &mut ts, k.clone(), v.clone()); } // Prewrite to leave a lock. + let k = b"key@1".to_vec(); ts += 1; let prewrite_start_version = ts; let mut mutation = Mutation::default(); @@ -651,19 +697,17 @@ fn test_mvcc_flashback() { let get_version = ts; let mut get_req = GetRequest::default(); get_req.set_context(ctx.clone()); - get_req.key = k.clone(); + get_req.key = k; get_req.version = get_version; let get_resp = client.kv_get(&get_req).unwrap(); assert!(!get_resp.has_region_error()); assert!(get_resp.get_error().has_locked()); assert!(get_resp.value.is_empty()); // Flashback - let flashback_resp = must_flashback_to_version(&client, ctx.clone(), 5, ts + 1, ts + 2); + must_flashback_to_version(&client, ctx.clone(), 5, ts + 1, ts + 2); ts += 2; - assert!(!flashback_resp.has_region_error()); - assert!(flashback_resp.get_error().is_empty()); // Should not meet the lock and can not get the latest data any more. 
- must_kv_read_equal(&client, ctx, k, b"value@1".to_vec(), ts); + must_kv_read_equal(&client, ctx, b"key@1".to_vec(), b"value@1".to_vec(), ts); } #[test] @@ -672,9 +716,7 @@ fn test_mvcc_flashback_block_rw() { let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); fail::cfg("skip_finish_flashback_to_version", "return").unwrap(); // Flashback - let flashback_resp = must_flashback_to_version(&client, ctx.clone(), 0, 1, 2); - assert!(!flashback_resp.has_region_error()); - assert!(flashback_resp.get_error().is_empty()); + must_flashback_to_version(&client, ctx.clone(), 0, 1, 2); // Try to read. let (k, v) = (b"key".to_vec(), b"value".to_vec()); // Get @@ -712,9 +754,7 @@ fn test_mvcc_flashback_block_scheduling() { let (mut cluster, client, ctx) = must_new_cluster_and_kv_client(); fail::cfg("skip_finish_flashback_to_version", "return").unwrap(); // Flashback - let flashback_resp = must_flashback_to_version(&client, ctx, 0, 1, 2); - assert!(!flashback_resp.has_region_error()); - assert!(flashback_resp.get_error().is_empty()); + must_flashback_to_version(&client, ctx, 0, 1, 2); // Try to transfer leader. let transfer_leader_resp = cluster.try_transfer_leader(1, new_peer(2, 2)); assert!( @@ -730,15 +770,8 @@ fn test_mvcc_flashback_block_scheduling() { fn test_mvcc_flashback_unprepared() { let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); let (k, v) = (b"key".to_vec(), b"value".to_vec()); - // Prewrite - let mut mutation = Mutation::default(); - mutation.set_op(Op::Put); - mutation.set_key(k.clone()); - mutation.set_value(v.clone()); - must_kv_prewrite(&client, ctx.clone(), vec![mutation], k.clone(), 1); - // Commit - must_kv_commit(&client, ctx.clone(), vec![k.clone()], 1, 2, 2); - must_kv_read_equal(&client, ctx.clone(), k.clone(), v.clone(), 3); + let mut ts = 0; + write_and_read_key(&client, &ctx, &mut ts, k.clone(), v.clone()); // Try to flashback without preparing first. 
let mut req = FlashbackToVersionRequest::default(); req.set_context(ctx.clone()); @@ -829,32 +862,6 @@ fn test_split_region_impl(is_raw_kv: bool) { ); } -#[test] -fn test_read_index() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); - - // Read index - let mut req = ReadIndexRequest::default(); - req.set_context(ctx.clone()); - let mut resp = client.read_index(&req).unwrap(); - let last_index = resp.get_read_index(); - assert_eq!(last_index > 0, true); - - // Raw put - let (k, v) = (b"key".to_vec(), b"value".to_vec()); - let mut put_req = RawPutRequest::default(); - put_req.set_context(ctx); - put_req.key = k; - put_req.value = v; - let put_resp = client.raw_put(&put_req).unwrap(); - assert!(!put_resp.has_region_error()); - assert!(put_resp.error.is_empty()); - - // Read index again - resp = client.read_index(&req).unwrap(); - assert_eq!(last_index + 1, resp.get_read_index()); -} - #[test] fn test_debug_get() { let (cluster, debug_client, store_id) = must_new_cluster_and_debug_client(); @@ -1201,7 +1208,229 @@ fn test_pessimistic_lock() { assert_eq!(resp.get_values().to_vec(), vec![v.clone(), vec![]]); assert_eq!(resp.get_not_founds().to_vec(), vec![false, true]); } - must_kv_pessimistic_rollback(&client, ctx.clone(), k.clone(), 40); + must_kv_pessimistic_rollback(&client, ctx.clone(), k.clone(), 40, 40); + } +} + +#[test] +fn test_pessimistic_lock_resumable() { + let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + + // Resumable pessimistic lock request with multi-key is not supported yet. 
+ let resp = kv_pessimistic_lock_resumable( + &client, + ctx.clone(), + vec![b"k1".to_vec(), b"k2".to_vec()], + 1, + 1, + None, + false, + false, + ); + assert_eq!(resp.get_results(), &[]); + assert_ne!(resp.get_errors().len(), 0); + + let (k, v) = (b"key".to_vec(), b"value".to_vec()); + + // Prewrite + let mut mutation = Mutation::default(); + mutation.set_op(Op::Put); + mutation.set_key(k.clone()); + mutation.set_value(v.clone()); + must_kv_prewrite(&client, ctx.clone(), vec![mutation.clone()], k.clone(), 5); + + // No wait + let start_time = Instant::now(); + let resp = kv_pessimistic_lock_resumable( + &client, + ctx.clone(), + vec![k.clone()], + 8, + 8, + None, + false, + false, + ); + assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); + assert!(start_time.elapsed() < Duration::from_millis(200)); + assert_eq!(resp.errors.len(), 1); + assert!(resp.errors[0].has_locked()); + assert_eq!(resp.get_results().len(), 1); + assert_eq!( + resp.get_results()[0].get_type(), + PessimisticLockKeyResultType::LockResultFailed + ); + + // Wait Timeout + let resp = kv_pessimistic_lock_resumable( + &client, + ctx.clone(), + vec![k.clone()], + 8, + 8, + Some(1), + false, + false, + ); + assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); + assert_eq!(resp.errors.len(), 1); + assert!(resp.errors[0].has_locked()); + assert_eq!(resp.get_results().len(), 1); + assert_eq!( + resp.get_results()[0].get_type(), + PessimisticLockKeyResultType::LockResultFailed + ); + + must_kv_commit(&client, ctx.clone(), vec![k.clone()], 5, 9, 9); + + let mut curr_ts = 10; + + for &(return_values, check_existence) in + &[(false, false), (false, true), (true, false), (true, true)] + { + let prewrite_start_ts = curr_ts; + let commit_ts = curr_ts + 5; + let test_lock_ts = curr_ts + 10; + curr_ts += 20; + + // Prewrite + must_kv_prewrite( + &client, + ctx.clone(), + vec![mutation.clone()], + k.clone(), + prewrite_start_ts, + ); + + let (tx, rx) = std::sync::mpsc::channel(); + 
let handle = { + let client = client.clone(); + let k = k.clone(); + let ctx = ctx.clone(); + thread::spawn(move || { + let res = kv_pessimistic_lock_resumable( + &client, + ctx, + vec![k], + test_lock_ts, + test_lock_ts, + Some(1000), + return_values, + check_existence, + ); + tx.send(()).unwrap(); + res + }) + }; + // Blocked for lock waiting. + rx.recv_timeout(Duration::from_millis(100)).unwrap_err(); + + must_kv_commit( + &client, + ctx.clone(), + vec![k.clone()], + prewrite_start_ts, + commit_ts, + commit_ts, + ); + rx.recv_timeout(Duration::from_millis(1000)).unwrap(); + let resp = handle.join().unwrap(); + assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); + assert_eq!(resp.errors.len(), 0); + assert_eq!(resp.get_results().len(), 1); + let res = &resp.get_results()[0]; + if return_values { + assert_eq!( + res.get_type(), + PessimisticLockKeyResultType::LockResultNormal + ); + assert_eq!(res.get_value(), b"value"); + assert_eq!(res.get_existence(), true); + assert_eq!(res.get_locked_with_conflict_ts(), 0); + } else if check_existence { + assert_eq!( + res.get_type(), + PessimisticLockKeyResultType::LockResultNormal + ); + assert_eq!(res.get_value(), b""); + assert_eq!(res.get_existence(), true); + assert_eq!(res.get_locked_with_conflict_ts(), 0); + } else { + assert_eq!( + res.get_type(), + PessimisticLockKeyResultType::LockResultNormal + ); + assert_eq!(res.get_value(), b""); + assert_eq!(res.get_existence(), false); + assert_eq!(res.get_locked_with_conflict_ts(), 0); + } + + must_kv_pessimistic_rollback(&client, ctx.clone(), k.clone(), test_lock_ts, test_lock_ts); + } + + for &(return_values, check_existence) in + &[(false, false), (false, true), (true, false), (true, true)] + { + let test_lock_ts = curr_ts; + let prewrite_start_ts = curr_ts + 10; + let commit_ts = curr_ts + 11; + curr_ts += 20; + // Prewrite + must_kv_prewrite( + &client, + ctx.clone(), + vec![mutation.clone()], + k.clone(), + prewrite_start_ts, + ); + + let (tx, rx) = 
std::sync::mpsc::channel(); + let handle = { + let client = client.clone(); + let k = k.clone(); + let ctx = ctx.clone(); + thread::spawn(move || { + let res = kv_pessimistic_lock_resumable( + &client, + ctx, + vec![k], + test_lock_ts, + test_lock_ts, + Some(1000), + return_values, + check_existence, + ); + tx.send(()).unwrap(); + res + }) + }; + // Blocked for lock waiting. + rx.recv_timeout(Duration::from_millis(100)).unwrap_err(); + must_kv_commit( + &client, + ctx.clone(), + vec![k.clone()], + prewrite_start_ts, + commit_ts, + commit_ts, + ); + rx.recv_timeout(Duration::from_millis(1000)).unwrap(); + let resp = handle.join().unwrap(); + assert!(!resp.has_region_error(), "{:?}", resp.get_region_error()); + assert_eq!(resp.errors.len(), 0); + assert_eq!(resp.get_results().len(), 1); + assert_eq!( + resp.get_results()[0].get_type(), + PessimisticLockKeyResultType::LockResultLockedWithConflict + ); + assert_eq!(resp.get_results()[0].get_value(), v); + assert_eq!(resp.get_results()[0].get_existence(), true); + assert_eq!( + resp.get_results()[0].get_locked_with_conflict_ts(), + commit_ts + ); + + must_kv_pessimistic_rollback(&client, ctx.clone(), k.clone(), test_lock_ts, commit_ts); } } @@ -1350,90 +1579,6 @@ fn test_async_commit_check_txn_status() { assert_ne!(resp.get_action(), Action::MinCommitTsPushed); } -#[test] -fn test_read_index_check_memory_locks() { - let mut cluster = new_server_cluster(0, 3); - cluster.cfg.raft_store.hibernate_regions = false; - cluster.run(); - - // make sure leader has been elected. 
- assert_eq!(cluster.must_get(b"k"), None); - - let region = cluster.get_region(b""); - let leader = cluster.leader_of_region(region.get_id()).unwrap(); - let leader_cm = cluster.sim.rl().get_concurrency_manager(leader.get_id()); - - let keys: Vec<_> = vec![b"k", b"l"] - .into_iter() - .map(|k| Key::from_raw(k)) - .collect(); - let guards = block_on(leader_cm.lock_keys(keys.iter())); - let lock = Lock::new( - LockType::Put, - b"k".to_vec(), - 1.into(), - 20000, - None, - 1.into(), - 1, - 2.into(), - ); - guards[0].with_lock(|l| *l = Some(lock.clone())); - - // read on follower - let mut follower_peer = None; - let peers = region.get_peers(); - for p in peers { - if p.get_id() != leader.get_id() { - follower_peer = Some(p.clone()); - break; - } - } - let follower_peer = follower_peer.unwrap(); - let addr = cluster.sim.rl().get_addr(follower_peer.get_store_id()); - - let env = Arc::new(Environment::new(1)); - let channel = ChannelBuilder::new(env).connect(&addr); - let client = TikvClient::new(channel); - - let mut ctx = Context::default(); - ctx.set_region_id(region.get_id()); - ctx.set_region_epoch(region.get_region_epoch().clone()); - ctx.set_peer(follower_peer); - - let read_index = |ranges: &[(&[u8], &[u8])]| { - let mut req = ReadIndexRequest::default(); - let start_ts = block_on(cluster.pd_client.get_tso()).unwrap(); - req.set_context(ctx.clone()); - req.set_start_ts(start_ts.into_inner()); - for &(start_key, end_key) in ranges { - let mut range = kvrpcpb::KeyRange::default(); - range.set_start_key(start_key.to_vec()); - range.set_end_key(end_key.to_vec()); - req.mut_ranges().push(range); - } - let resp = client.read_index(&req).unwrap(); - (resp, start_ts) - }; - - // wait a while until the node updates its own max ts - thread::sleep(Duration::from_millis(300)); - - let (resp, start_ts) = read_index(&[(b"l", b"yz")]); - assert!(!resp.has_locked()); - assert_eq!(leader_cm.max_ts(), start_ts); - - let (resp, start_ts) = read_index(&[(b"a", b"b"), (b"j", 
b"k0")]); - assert_eq!(resp.get_locked(), &lock.into_lock_info(b"k".to_vec())); - assert_eq!(leader_cm.max_ts(), start_ts); - - drop(guards); - - let (resp, start_ts) = read_index(&[(b"a", b"z")]); - assert!(!resp.has_locked()); - assert_eq!(leader_cm.max_ts(), start_ts); -} - #[test] fn test_prewrite_check_max_commit_ts() { let mut cluster = new_server_cluster(0, 1); @@ -1775,7 +1920,6 @@ fn test_tikv_forwarding() { req.set_split_key(b"k1".to_vec()); req }); - test_func_init!(client, ctx, call_opt, read_index, ReadIndexRequest); // Test if duplex can be redirect correctly. let cases = vec![ @@ -1904,7 +2048,7 @@ fn test_get_lock_wait_info_api() { entries[0].resource_group_tag, b"resource_group_tag2".to_vec() ); - must_kv_pessimistic_rollback(&client, ctx, b"a".to_vec(), 20); + must_kv_pessimistic_rollback(&client, ctx, b"a".to_vec(), 20, 20); handle.join().unwrap(); } diff --git a/tests/integrations/server/lock_manager.rs b/tests/integrations/server/lock_manager.rs index d796d9c1f66..43032dd8cc3 100644 --- a/tests/integrations/server/lock_manager.rs +++ b/tests/integrations/server/lock_manager.rs @@ -42,8 +42,9 @@ fn deadlock(client: &TikvClient, ctx: Context, key1: &[u8], ts: u64) -> bool { handle.join().unwrap(); // Clean up - must_kv_pessimistic_rollback(client, ctx.clone(), key1.clone(), ts); - must_kv_pessimistic_rollback(client, ctx, key2.clone(), ts + 1); + + must_kv_pessimistic_rollback(client, ctx.clone(), key1.clone(), ts, ts); + must_kv_pessimistic_rollback(client, ctx, key2.clone(), ts + 1, ts + 1); assert_eq!(resp.errors.len(), 1); if resp.errors[0].has_deadlock() { diff --git a/tests/integrations/server/raft_client.rs b/tests/integrations/server/raft_client.rs index 7ee38a72c87..fa7a86f12c4 100644 --- a/tests/integrations/server/raft_client.rs +++ b/tests/integrations/server/raft_client.rs @@ -9,7 +9,6 @@ use std::{ time::Duration, }; -use engine_rocks::RocksEngine; use futures::{FutureExt, StreamExt, TryStreamExt}; use grpcio::{ ClientStreamingSink, 
Environment, RequestStream, RpcContext, RpcStatus, RpcStatusCode, Server, @@ -20,17 +19,16 @@ use kvproto::{ tikvpb::BatchRaftMessage, }; use raft::eraftpb::Entry; -use raftstore::{ - errors::DiscardReason, - router::{RaftStoreBlackHole, RaftStoreRouter}, -}; +use raftstore::{errors::DiscardReason, store::StoreMsg}; use tikv::server::{ - self, load_statistics::ThreadLoadPool, resolve, resolve::Callback, Config, ConnectionBuilder, - RaftClient, StoreAddrResolver, TestRaftStoreRouter, + self, load_statistics::ThreadLoadPool, raftkv::RaftRouterWrap, resolve, resolve::Callback, + Config, ConnectionBuilder, RaftClient, StoreAddrResolver, TestRaftStoreRouter, }; +use tikv_kv::{FakeExtension, RaftExtension}; use tikv_util::{ - config::VersionTrack, + config::{ReadableDuration, VersionTrack}, worker::{Builder as WorkerBuilder, LazyWorker}, + Either, }; use super::*; @@ -53,13 +51,16 @@ impl StoreAddrResolver for StaticResolver { } } -fn get_raft_client(router: R, resolver: T) -> RaftClient +fn get_raft_client(router: R, resolver: T) -> RaftClient where - R: RaftStoreRouter + Unpin + 'static, + R: RaftExtension + Unpin + 'static, T: StoreAddrResolver + 'static, { let env = Arc::new(Environment::new(2)); - let cfg = Arc::new(VersionTrack::new(Config::default())); + let mut config = Config::default(); + config.raft_client_max_backoff = ReadableDuration::millis(100); + config.raft_client_initial_reconnect_backoff = ReadableDuration::millis(100); + let cfg = Arc::new(VersionTrack::new(config)); let security_mgr = Arc::new(SecurityManager::new(&SecurityConfig::default()).unwrap()); let worker = LazyWorker::new("test-raftclient"); let loads = Arc::new(ThreadLoadPool::with_threshold(1000)); @@ -75,10 +76,8 @@ where RaftClient::new(builder) } -fn get_raft_client_by_port( - port: u16, -) -> RaftClient { - get_raft_client(RaftStoreBlackHole, StaticResolver::new(port)) +fn get_raft_client_by_port(port: u16) -> RaftClient { + get_raft_client(FakeExtension, StaticResolver::new(port)) } 
#[derive(Clone)] @@ -178,7 +177,8 @@ fn test_raft_client_reconnect() { let (tx, rx) = mpsc::channel(); let (significant_msg_sender, _significant_msg_receiver) = mpsc::channel(); let router = TestRaftStoreRouter::new(tx, significant_msg_sender); - let mut raft_client = get_raft_client(router, StaticResolver::new(port)); + let wrap = RaftRouterWrap::new(router); + let mut raft_client = get_raft_client(wrap, StaticResolver::new(port)); (0..50).for_each(|_| raft_client.send(RaftMessage::default()).unwrap()); raft_client.flush(); @@ -194,7 +194,6 @@ fn test_raft_client_reconnect() { raft_client.send(RaftMessage::default()).unwrap(); } raft_client.flush(); - rx.recv_timeout(Duration::from_secs(3)).unwrap(); // `send` should success after the mock server restarted. let service = MockKvForRaft::new(Arc::clone(&msg_count), batch_msg_count, true); @@ -207,6 +206,59 @@ fn test_raft_client_reconnect() { drop(mock_server); } +#[test] +// Test raft_client reports store unreachable only once until being connected +// again +fn test_raft_client_report_unreachable() { + let msg_count = Arc::new(AtomicUsize::new(0)); + let batch_msg_count = Arc::new(AtomicUsize::new(0)); + let service = MockKvForRaft::new(Arc::clone(&msg_count), Arc::clone(&batch_msg_count), true); + let (mut mock_server, port) = create_mock_server(service, 60100, 60200).unwrap(); + + let (tx, rx) = mpsc::channel(); + let (significant_msg_sender, _significant_msg_receiver) = mpsc::channel(); + let router = TestRaftStoreRouter::new(tx, significant_msg_sender); + let wrap = RaftRouterWrap::new(router); + let mut raft_client = get_raft_client(wrap, StaticResolver::new(port)); + + // server is disconnected + mock_server.shutdown(); + drop(mock_server); + + raft_client.send(RaftMessage::default()).unwrap(); + let msg = rx.recv_timeout(Duration::from_millis(200)).unwrap(); + if let Either::Right(StoreMsg::StoreUnreachable { store_id }) = msg { + assert_eq!(store_id, 0); + } else { + panic!("expect StoreUnreachable"); + } 
+ // no more unreachable message is sent until it's connected again. + rx.recv_timeout(Duration::from_millis(200)).unwrap_err(); + + // restart the mock server. + let service = MockKvForRaft::new(Arc::clone(&msg_count), batch_msg_count, true); + let mut mock_server = create_mock_server_on(service, port); + + // make sure the connection is connected, otherwise the following sent messages + // may be dropped + std::thread::sleep(Duration::from_millis(200)); + (0..50).for_each(|_| raft_client.send(RaftMessage::default()).unwrap()); + raft_client.flush(); + check_msg_count(500, &msg_count, 50); + + // server is disconnected + mock_server.take().unwrap().shutdown(); + + let msg = rx.recv_timeout(Duration::from_millis(200)).unwrap(); + if let Either::Right(StoreMsg::StoreUnreachable { store_id }) = msg { + assert_eq!(store_id, 0); + } else { + panic!("expect StoreUnreachable"); + } + // no more unreachable message is sent until it's connected again. + rx.recv_timeout(Duration::from_millis(200)).unwrap_err(); +} + #[test] fn test_batch_size_limit() { let msg_count = Arc::new(AtomicUsize::new(0)); @@ -330,15 +382,14 @@ fn test_tombstone_block_list() { let bg_worker = WorkerBuilder::new(thd_name!("background")) .thread_count(2) .create(); - let resolver = - resolve::new_resolver::<_, _, RocksEngine>(pd_client, &bg_worker, RaftStoreBlackHole).0; + let resolver = resolve::new_resolver(pd_client, &bg_worker, FakeExtension).0; let msg_count = Arc::new(AtomicUsize::new(0)); let batch_msg_count = Arc::new(AtomicUsize::new(0)); let service = MockKvForRaft::new(Arc::clone(&msg_count), Arc::clone(&batch_msg_count), true); let (_mock_server, port) = create_mock_server(service, 60200, 60300).unwrap(); - let mut raft_client = get_raft_client(RaftStoreBlackHole, resolver); + let mut raft_client = get_raft_client(FakeExtension, resolver); let mut store1 = metapb::Store::default(); store1.set_id(1); @@ -387,9 +438,8 @@ fn test_store_allowlist() { let bg_worker = 
WorkerBuilder::new(thd_name!("background")) .thread_count(2) .create(); - let resolver = - resolve::new_resolver::<_, _, RocksEngine>(pd_client, &bg_worker, RaftStoreBlackHole).0; - let mut raft_client = get_raft_client(RaftStoreBlackHole, resolver); + let resolver = resolve::new_resolver(pd_client, &bg_worker, FakeExtension).0; + let mut raft_client = get_raft_client(FakeExtension, resolver); let msg_count1 = Arc::new(AtomicUsize::new(0)); let batch_msg_count1 = Arc::new(AtomicUsize::new(0)); From 660eb73733aa490181477f9902c949c3059353b9 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Tue, 13 Dec 2022 10:48:32 +0800 Subject: [PATCH 018/115] add test for paused Signed-off-by: CalvinNeo --- engine_store_ffi/src/observer.rs | 37 ++++++++++++++++-------------- proxy_scripts/ci_check.sh | 1 - proxy_tests/proxy/fast_add_peer.rs | 36 +++++++++++++++++++++++------ 3 files changed, 49 insertions(+), 25 deletions(-) diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 01acf7f4716..9c3d74cfdc0 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -288,14 +288,15 @@ impl TiFlashObserver { is_replicated = o.get().replicated_or_created.load(Ordering::SeqCst); if is_first { // TODO Maybe too much printing - info!("fast path: ongoing {}:{}, skip MsgAppend", self.store_id, region_id; - "to_peer_id" => msg.get_to_peer().get_id(), - "from_peer_id" => msg.get_from_peer().get_id(), - "inner_msg" => ?inner_msg, - "is_replicated" => is_replicated, - "has_already_inited" => has_already_inited, - "is_first" => is_first, - ); + // info!("fast path: ongoing {}:{}, skip MsgAppend", + // self.store_id, region_id; + // "to_peer_id" => msg.get_to_peer().get_id(), + // "from_peer_id" => msg.get_from_peer().get_id(), + // "inner_msg" => ?inner_msg, + // "is_replicated" => is_replicated, + // "has_already_inited" => has_already_inited, + // "is_first" => is_first, + // ); } } MapEntry::Vacant(v) => { @@ -314,13 +315,13 @@ impl 
TiFlashObserver { if !is_first { // TODO avoid too much log - info!( - "fast path: normal MsgAppend of {}:{}", - self.store_id, region_id; - "to_peer_id" => msg.get_to_peer().get_id(), - "from_peer_id" => msg.get_from_peer().get_id(), - "inner_msg" => ?inner_msg, - ); + // info!( + // "fast path: normal MsgAppend of {}:{}", + // self.store_id, region_id; + // "to_peer_id" => msg.get_to_peer().get_id(), + // "from_peer_id" => msg.get_from_peer().get_id(), + // "inner_msg" => ?inner_msg, + // ); return false; } @@ -342,6 +343,7 @@ impl TiFlashObserver { "from_peer_id" => msg.get_from_peer().get_id(), ); fail::fail_point!("go_fast_path_not_allow", |e| { return false }); + fail::fail_point!("fi_fast_add_peer_pause", |e| { return false }); // Feed data let res = self .engine_store_server_helper @@ -349,7 +351,7 @@ impl TiFlashObserver { match res.status { crate::FastAddPeerStatus::Ok => (), crate::FastAddPeerStatus::WaitForData => { - error!( + info!( "fast path: ongoing {}:{}. remote peer preparing data, wait", self.store_id, region_id ); @@ -1310,7 +1312,8 @@ impl ApplySnapshotObserver for TiFlashOb } } MapEntry::Vacant(_) => { - panic!("unknown snapshot!"); + // Compat no fast add peer logic + // panic!("unknown snapshot!"); } }, ).is_err() { diff --git a/proxy_scripts/ci_check.sh b/proxy_scripts/ci_check.sh index d443d70771a..70dbfdfa1f6 100755 --- a/proxy_scripts/ci_check.sh +++ b/proxy_scripts/ci_check.sh @@ -1,7 +1,6 @@ set -uxeo pipefail if [[ $M == "fmt" ]]; then make gen_proxy_ffi - git status -s GIT_STATUS=$(git status -s) && if [[ ${GIT_STATUS} ]]; then echo "Error: found illegal git status"; echo ${GIT_STATUS}; [[ -z ${GIT_STATUS} ]]; fi cargo fmt -- --check >/dev/null elif [[ $M == "testold" ]]; then diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 3dea073c9fa..7123274950f 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -9,7 +9,7 @@ enum SourceType { InvalidSource, } -fn 
simple_fast_add_peer(source_type: SourceType, block_wait: bool) { +fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) { tikv_util::set_panic_hook(true, "./"); let (mut cluster, pd_client) = new_mock_cluster(0, 3); cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; @@ -44,6 +44,9 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool) { _ => (), }; + if pause { + fail::cfg("fi_fast_add_peer_pause", "pause").unwrap(); + } pd_client.must_add_peer(1, new_learner_peer(3, 3)); // std::thread::sleep(std::time::Duration::from_millis(2000)); // match source_type { @@ -79,6 +82,11 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool) { _ => (), }; + if pause { + std::thread::sleep(std::time::Duration::from_millis(3000)); + fail::remove("fi_fast_add_peer_pause"); + } + match source_type { SourceType::DelayedLearner => { check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![1, 3])); @@ -122,7 +130,7 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool) { #[test] fn test_fast_add_peer_from_leader() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::Leader, false); + simple_fast_add_peer(SourceType::Leader, false, false); fail::remove("on_pre_persist_with_finish"); } @@ -130,7 +138,7 @@ fn test_fast_add_peer_from_leader() { #[test] fn test_fast_add_peer_from_learner() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::Learner, false); + simple_fast_add_peer(SourceType::Learner, false, false); fail::remove("on_pre_persist_with_finish"); } @@ -138,7 +146,7 @@ fn test_fast_add_peer_from_learner() { #[test] fn test_fast_add_peer_from_delayed_learner() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::DelayedLearner, false); + simple_fast_add_peer(SourceType::DelayedLearner, false, false); fail::remove("on_pre_persist_with_finish"); } @@ 
-146,20 +154,34 @@ fn test_fast_add_peer_from_delayed_learner() { /// normal. #[test] fn test_fast_add_peer_from_invalid_source() { - simple_fast_add_peer(SourceType::InvalidSource, false); + simple_fast_add_peer(SourceType::InvalidSource, false, false); } #[test] fn test_fast_add_peer_from_learner_blocked() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::Learner, true); + simple_fast_add_peer(SourceType::Learner, true, false); fail::remove("on_pre_persist_with_finish"); } #[test] fn test_fast_add_peer_from_delayed_learner_blocked() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::DelayedLearner, true); + simple_fast_add_peer(SourceType::DelayedLearner, true, false); + fail::remove("on_pre_persist_with_finish"); +} + +#[test] +fn test_fast_add_peer_from_learner_blocked_paused() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::Learner, true, true); + fail::remove("on_pre_persist_with_finish"); +} + +#[test] +fn test_fast_add_peer_from_delayed_learner_blocked_paused() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::DelayedLearner, true, true); fail::remove("on_pre_persist_with_finish"); } From 39f6a147d7051e9f023909ee5109578e9f6177d6 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Tue, 13 Dec 2022 10:57:12 +0800 Subject: [PATCH 019/115] fix ci Signed-off-by: CalvinNeo --- .github/workflows/pr-ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-ci.yml b/.github/workflows/pr-ci.yml index e2a05af7a74..2ef30934211 100644 --- a/.github/workflows/pr-ci.yml +++ b/.github/workflows/pr-ci.yml @@ -27,7 +27,7 @@ jobs: restore-keys: | ${{ runner.os }}-cargo- - name: Install dependencies (protocol buffers compiler) - uses: arduino/setup-protoc@v1 + uses: arduino/setup-protoc@v1.1.0 with: version: '3.8.0' - name: install rust 
@@ -74,9 +74,10 @@ jobs: restore-keys: | ${{ runner.os }}-cargo- - name: Install dependencies (protocol buffers compiler) - uses: arduino/setup-protoc@v1 + uses: arduino/setup-protoc@v1.1.0 with: version: '3.8.0' + repo-token: ${{ secrets.GITHUB_TOKEN }} - name: install rust if: steps.cache-cargo.outputs.cache-hit != 'true' run: | From fffc34541fe99d9f4c1d25aa2d177599b572c9d4 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Tue, 13 Dec 2022 11:02:51 +0800 Subject: [PATCH 020/115] fix ci Signed-off-by: CalvinNeo --- .github/workflows/pr-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-ci.yml b/.github/workflows/pr-ci.yml index 2ef30934211..2dd22278d4c 100644 --- a/.github/workflows/pr-ci.yml +++ b/.github/workflows/pr-ci.yml @@ -27,7 +27,7 @@ jobs: restore-keys: | ${{ runner.os }}-cargo- - name: Install dependencies (protocol buffers compiler) - uses: arduino/setup-protoc@v1.1.0 + uses: arduino/setup-protoc@v1 with: version: '3.8.0' - name: install rust @@ -74,7 +74,7 @@ jobs: restore-keys: | ${{ runner.os }}-cargo- - name: Install dependencies (protocol buffers compiler) - uses: arduino/setup-protoc@v1.1.0 + uses: arduino/setup-protoc@v1 with: version: '3.8.0' repo-token: ${{ secrets.GITHUB_TOKEN }} From 59e1616556ec9981da3f459085beb444f98ece35 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Tue, 13 Dec 2022 12:13:55 +0800 Subject: [PATCH 021/115] fix tests Signed-off-by: CalvinNeo --- engine_store_ffi/src/observer.rs | 42 ++++++++++++++++-------------- proxy_tests/proxy/fast_add_peer.rs | 23 ++++++++++++++++ 2 files changed, 45 insertions(+), 20 deletions(-) diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 9c3d74cfdc0..1c6446a821d 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -1298,27 +1298,29 @@ impl ApplySnapshotObserver for TiFlashOb ); let region_id = ob_ctx.region().get_id(); let mut should_skip = false; - if 
self.access_cached_region_info_mut( - region_id, - |info: MapEntry>| match info { - MapEntry::Occupied(mut o) => { - let is_first_snapsot = !o.get().inited_or_fallback.load(Ordering::SeqCst); - if is_first_snapsot { - info!("fast path: applied first snapshot {}:{}, recover MsgAppend", self.store_id, region_id; - "snap_key" => ?snap_key, - ); - should_skip = true; - o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); + if self.engine_store_cfg.enable_fast_add_peer { + if self.access_cached_region_info_mut( + region_id, + |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + let is_first_snapsot = !o.get().inited_or_fallback.load(Ordering::SeqCst); + if is_first_snapsot { + info!("fast path: applied first snapshot {}:{}, recover MsgAppend", self.store_id, region_id; + "snap_key" => ?snap_key, + ); + should_skip = true; + o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); + } } - } - MapEntry::Vacant(_) => { - // Compat no fast add peer logic - // panic!("unknown snapshot!"); - } - }, - ).is_err() { - fatal!("post_apply_snapshot poisoned") - }; + MapEntry::Vacant(_) => { + // Compat no fast add peer logic + // panic!("unknown snapshot!"); + } + }, + ).is_err() { + fatal!("post_apply_snapshot poisoned") + }; + } let snap = match snap { None => return, Some(s) => s, diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 7123274950f..429d356ea61 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -121,6 +121,29 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) _ => (), }; + // Destroy peer + pd_client.must_remove_peer(1, new_learner_peer(3, 3)); + std::thread::sleep(std::time::Duration::from_millis(1000)); + iter_ffi_helpers( + &cluster, + Some(vec![3]), + &mut |_, _, ffi: &mut FFIHelperSet| { + let server = &ffi.engine_store_server; + assert!(!server.kvstore.contains_key(&1)); + }, + ); + cluster.must_put(b"k5", b"v5"); + 
pd_client.must_add_peer(1, new_learner_peer(3, 4)); + cluster.must_put(b"k6", b"v6"); + check_key( + &cluster, + b"k6", + b"v6", + Some(true), + None, + Some(vec![1, 2, 3]), + ); + fail::remove("ffi_fast_add_peer_from_id"); fail::remove("on_pre_persist_with_finish"); fail::remove("ffi_fast_add_peer_block_wait"); From eb2c930136affc10a3681945f1cde26112a39c9d Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Tue, 13 Dec 2022 12:58:49 +0800 Subject: [PATCH 022/115] fix tests Signed-off-by: CalvinNeo --- engine_store_ffi/src/observer.rs | 1 + new-mock-engine-store/src/mock_store.rs | 2 -- proxy_tests/proxy/fast_add_peer.rs | 24 +++++++++++++++++------- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 1c6446a821d..bc18cd3c428 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -1298,6 +1298,7 @@ impl ApplySnapshotObserver for TiFlashOb ); let region_id = ob_ctx.region().get_id(); let mut should_skip = false; + #[allow(clippy::collapsible_if)] if self.engine_store_cfg.enable_fast_add_peer { if self.access_cached_region_info_mut( region_id, diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index 9cb56d119fb..3397f27ad67 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -208,13 +208,11 @@ pub fn write_kv_in_mem(region: &mut Region, cf_index: usize, k: &[u8], v: &[u8]) let pending_delete = &mut region.pending_delete[cf_index]; let pending_write = &mut region.pending_write[cf_index]; pending_delete.remove(k); - debug!("!!!! write_kv_in_mem {:?}", k); data.insert(k.to_vec(), v.to_vec()); pending_write.insert(k.to_vec(), v.to_vec()); } fn delete_kv_in_mem(region: &mut Region, cf_index: usize, k: &[u8]) { - debug!("!!!! 
delete_kv_in_mem {:?}", k); let data = &mut region.data[cf_index]; let pending_delete = &mut region.pending_delete[cf_index]; pending_delete.insert(k.to_vec()); diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 429d356ea61..9f54fbfbc97 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -122,7 +122,11 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) }; // Destroy peer + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); pd_client.must_remove_peer(1, new_learner_peer(3, 3)); + must_wait_until_cond_node(&cluster, 1, Some(vec![1]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 3).is_none() + }); std::thread::sleep(std::time::Duration::from_millis(1000)); iter_ffi_helpers( &cluster, @@ -134,6 +138,11 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) ); cluster.must_put(b"k5", b"v5"); pd_client.must_add_peer(1, new_learner_peer(3, 4)); + // Wait until Learner has applied ConfChange + std::thread::sleep(std::time::Duration::from_millis(1000)); + must_wait_until_cond_node(&cluster, 1, Some(vec![3]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 4).is_some() + }); cluster.must_put(b"k6", b"v6"); check_key( &cluster, @@ -143,6 +152,7 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) None, Some(vec![1, 2, 3]), ); + fail::remove("fallback_to_slow_path_not_allow"); fail::remove("ffi_fast_add_peer_from_id"); fail::remove("on_pre_persist_with_finish"); @@ -154,7 +164,7 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) fn test_fast_add_peer_from_leader() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); simple_fast_add_peer(SourceType::Leader, false, false); - fail::remove("on_pre_persist_with_finish"); + fail::remove("fallback_to_slow_path_not_allow"); } 
/// Fast path by learner snapshot. @@ -162,7 +172,7 @@ fn test_fast_add_peer_from_leader() { fn test_fast_add_peer_from_learner() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); simple_fast_add_peer(SourceType::Learner, false, false); - fail::remove("on_pre_persist_with_finish"); + fail::remove("fallback_to_slow_path_not_allow"); } /// If a learner is delayed, but already applied ConfChange. @@ -170,7 +180,7 @@ fn test_fast_add_peer_from_learner() { fn test_fast_add_peer_from_delayed_learner() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); simple_fast_add_peer(SourceType::DelayedLearner, false, false); - fail::remove("on_pre_persist_with_finish"); + fail::remove("fallback_to_slow_path_not_allow"); } /// If we select a wrong source, or we can't run fast path, we can fallback to @@ -184,28 +194,28 @@ fn test_fast_add_peer_from_invalid_source() { fn test_fast_add_peer_from_learner_blocked() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); simple_fast_add_peer(SourceType::Learner, true, false); - fail::remove("on_pre_persist_with_finish"); + fail::remove("fallback_to_slow_path_not_allow"); } #[test] fn test_fast_add_peer_from_delayed_learner_blocked() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); simple_fast_add_peer(SourceType::DelayedLearner, true, false); - fail::remove("on_pre_persist_with_finish"); + fail::remove("fallback_to_slow_path_not_allow"); } #[test] fn test_fast_add_peer_from_learner_blocked_paused() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); simple_fast_add_peer(SourceType::Learner, true, true); - fail::remove("on_pre_persist_with_finish"); + fail::remove("fallback_to_slow_path_not_allow"); } #[test] fn test_fast_add_peer_from_delayed_learner_blocked_paused() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); simple_fast_add_peer(SourceType::DelayedLearner, true, true); - fail::remove("on_pre_persist_with_finish"); + 
fail::remove("fallback_to_slow_path_not_allow"); } #[test] From a9ff447bef1971e13e73fcd26ecebb6c390177c7 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Tue, 13 Dec 2022 13:35:47 +0800 Subject: [PATCH 023/115] fix tests Signed-off-by: CalvinNeo --- engine_store_ffi/src/observer.rs | 1 + new-mock-engine-store/src/mock_store.rs | 7 +++---- proxy_tests/proxy/fast_add_peer.rs | 3 +++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index bc18cd3c428..2190bb0a8a7 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -322,6 +322,7 @@ impl TiFlashObserver { // "from_peer_id" => msg.get_from_peer().get_id(), // "inner_msg" => ?inner_msg, // ); + fail::fail_point!("fast_path_is_not_first", |_| {}); return false; } diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index 3397f27ad67..4de240344b1 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -1236,7 +1236,7 @@ unsafe fn create_cpp_str(s: Option>) -> ffi_interfaces::CppStrWithView { Some(s) => { let len = s.len() as u64; let ptr = Box::into_raw(Box::new(s.clone())); // leak - let s = ffi_interfaces::CppStrWithView { + ffi_interfaces::CppStrWithView { inner: ffi_interfaces::RawCppPtr { ptr: ptr as RawVoidPtr, type_: RawCppPtrTypeImpl::String.into(), @@ -1245,8 +1245,7 @@ unsafe fn create_cpp_str(s: Option>) -> ffi_interfaces::CppStrWithView { data: (*ptr).as_ptr() as *const _, len, }, - }; - s + } } None => ffi_interfaces::CppStrWithView { inner: ffi_interfaces::RawCppPtr { @@ -1428,5 +1427,5 @@ unsafe extern "C" fn ffi_fast_add_peer( }; } error!("recover from remote peer: failed after retry"; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); + failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData) } diff --git a/proxy_tests/proxy/fast_add_peer.rs 
b/proxy_tests/proxy/fast_add_peer.rs index 9f54fbfbc97..44c68e9b6a9 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -122,6 +122,8 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) }; // Destroy peer + // These failpoints make sure we will cause again a fast path. + fail::cfg("fast_path_is_not_first", "panic").unwrap(); fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); pd_client.must_remove_peer(1, new_learner_peer(3, 3)); must_wait_until_cond_node(&cluster, 1, Some(vec![1]), &|states: &States| -> bool { @@ -153,6 +155,7 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) Some(vec![1, 2, 3]), ); fail::remove("fallback_to_slow_path_not_allow"); + fail::remove("fast_path_is_not_first"); fail::remove("ffi_fast_add_peer_from_id"); fail::remove("on_pre_persist_with_finish"); From 492974938054d53520fd8689bb52f0ccc75d25a9 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Tue, 13 Dec 2022 14:55:05 +0800 Subject: [PATCH 024/115] support re add peer Signed-off-by: CalvinNeo --- engine_store_ffi/src/observer.rs | 86 ++++++++++++++++--------- new-mock-engine-store/src/mock_store.rs | 4 ++ proxy_tests/proxy/fast_add_peer.rs | 22 +++++-- 3 files changed, 75 insertions(+), 37 deletions(-) diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 2190bb0a8a7..e666c43d021 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -17,7 +17,7 @@ use engine_traits::{RaftEngine, SstMetaInfo, CF_RAFT}; use kvproto::{ metapb::Region, raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, CmdType, RaftCmdRequest}, - raft_serverpb::{RaftApplyState, RaftMessage, RegionLocalState}, + raft_serverpb::{PeerState, RaftApplyState, RaftMessage, RegionLocalState}, }; use protobuf::Message; use raft::{eraftpb, eraftpb::MessageType, StateRole}; @@ -220,6 +220,21 @@ impl TiFlashObserver { Ok(()) } + pub fn 
remove_cached_region_info(&self, region_id: u64) { + let slot_id = Self::slot_index(region_id); + match self.cached_region_info.get(slot_id).unwrap().write() { + Ok(mut g) => { + info!( + "remove_cached_region_info"; + "region_id" => region_id, + "store_id" => self.store_id, + ); + let _ = g.remove(®ion_id); + } + Err(_) => (), + }; + } + pub fn set_inited_or_fallback(&self, region_id: u64, v: bool) -> RaftStoreResult<()> { self.access_cached_region_info_mut( region_id, @@ -246,7 +261,10 @@ impl TiFlashObserver { pub fn is_initialized(&self, region_id: u64) -> bool { match get_region_local_state(&self.engine, region_id) { None => false, - Some(r) => raftstore::store::util::is_region_initialized(r.get_region()), + Some(r) => { + raftstore::store::util::is_region_initialized(r.get_region()) + && (r.get_state() != PeerState::Tombstone) + } } } @@ -289,18 +307,19 @@ impl TiFlashObserver { if is_first { // TODO Maybe too much printing // info!("fast path: ongoing {}:{}, skip MsgAppend", - // self.store_id, region_id; - // "to_peer_id" => msg.get_to_peer().get_id(), - // "from_peer_id" => msg.get_from_peer().get_id(), - // "inner_msg" => ?inner_msg, - // "is_replicated" => is_replicated, - // "has_already_inited" => has_already_inited, - // "is_first" => is_first, + // self.store_id, region_id; + // "to_peer_id" => msg.get_to_peer().get_id(), + // "from_peer_id" => + // msg.get_from_peer().get_id(), + // "inner_msg" => ?inner_msg, + // "is_replicated" => is_replicated, + // "has_already_inited" => has_already_inited, + // "is_first" => is_first, // ); } } MapEntry::Vacant(v) => { - info!("fast path: ongoing {}:{}, first message", self.store_id, region_id; + info!("fast path: ongoing {}:{} {}, first message", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), "inner_msg" => ?inner_msg, @@ -316,13 +335,12 @@ impl TiFlashObserver { if !is_first { // TODO avoid too much log // info!( - // "fast 
path: normal MsgAppend of {}:{}", - // self.store_id, region_id; + // "fast path: normal MsgAppend of {}:{} {}", + // self.store_id, region_id, new_peer_id; // "to_peer_id" => msg.get_to_peer().get_id(), // "from_peer_id" => msg.get_from_peer().get_id(), // "inner_msg" => ?inner_msg, // ); - fail::fail_point!("fast_path_is_not_first", |_| {}); return false; } @@ -330,7 +348,7 @@ impl TiFlashObserver { // Peer is not created by Peer::replicate, will cause RegionNotRegistered error, // see `check_msg`. if !is_replicated { - info!("fast path: ongoing {}:{}, wait replicating peer", self.store_id, region_id; + info!("fast path: ongoing {}:{} {}, wait replicating peer", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), "inner_msg" => ?inner_msg, @@ -339,12 +357,12 @@ impl TiFlashObserver { } } - info!("fast path: ongoing {}:{}, fetch data from remote peer", self.store_id, region_id; + info!("fast path: ongoing {}:{} {}, fetch data from remote peer", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), ); - fail::fail_point!("go_fast_path_not_allow", |e| { return false }); - fail::fail_point!("fi_fast_add_peer_pause", |e| { return false }); + fail::fail_point!("go_fast_path_not_allow", |_| { return false }); + fail::fail_point!("ffi_fast_add_peer_pause", |_| { return false }); // Feed data let res = self .engine_store_server_helper @@ -353,22 +371,22 @@ impl TiFlashObserver { crate::FastAddPeerStatus::Ok => (), crate::FastAddPeerStatus::WaitForData => { info!( - "fast path: ongoing {}:{}. remote peer preparing data, wait", - self.store_id, region_id + "fast path: ongoing {}:{} {}. remote peer preparing data, wait", + self.store_id, region_id, new_peer_id ); return true; } _ => { error!( - "fast path: ongoing {}:{} failed. 
fetch and replace error {:?}, fallback to normal", - self.store_id, region_id, res + "fast path: ongoing {}:{}{}failed. fetch and replace error {:?}, fallback to normal", + self.store_id, region_id, new_peer_id, res ); self.fallback_to_slow_path(region_id); return false; } }; - info!("fast path: ongoing {}:{}, parse", self.store_id, region_id; + info!("fast path: ongoing {}:{} {}, parse", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), ); @@ -378,7 +396,7 @@ impl TiFlashObserver { let mut new_region = kvproto::metapb::Region::default(); apply_state.merge_from_bytes(apply_state_str).unwrap(); new_region.merge_from_bytes(region_str).unwrap(); - info!("fast path: ongoing {}:{}, start build and send", self.store_id, region_id; + info!("fast path: ongoing {}:{} {}, start build and send", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), "new_region" => ?new_region, @@ -388,20 +406,20 @@ impl TiFlashObserver { Ok(s) => { match s { crate::FastAddPeerStatus::Ok => { - info!("fast path: ongoing {}:{}, finish build and send", self.store_id, region_id; + info!("fast path: ongoing {}:{} {}, finish build and send", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), ); } crate::FastAddPeerStatus::WaitForData => { error!( - "fast path: ongoing {}:{}. remote peer preparing data, wait", - self.store_id, region_id + "fast path: ongoing {}:{} {}. remote peer preparing data, wait", + new_peer_id, self.store_id, region_id ); return true; } _ => { - error!("fast path: ongoing {}:{} failed. build and sent snapshot code {:?}", self.store_id, region_id, s; + error!("fast path: ongoing {}:{} {} failed. 
build and sent snapshot code {:?}", self.store_id, region_id, new_peer_id, s; "is_first" => is_first,); self.fallback_to_slow_path(region_id); return false; @@ -409,7 +427,7 @@ impl TiFlashObserver { }; } Err(e) => { - error!("fast path: ongoing {}:{} failed. build and sent snapshot error {:?}", self.store_id, region_id, e; + error!("fast path: ongoing {}:{} {} failed. build and sent snapshot error {:?}", self.store_id, region_id, new_peer_id, e; "is_first" => is_first,); self.fallback_to_slow_path(region_id); return false; @@ -451,14 +469,14 @@ impl TiFlashObserver { // check if the source already knows the know peer if !validate_remote_peer_region(&new_region, self.store_id, new_peer_id) { info!( - "fast path: ongoing {}:{}. remote peer has not applied conf change for {}", + "fast path: ongoing {}:{} {}. remote peer has not applied conf change", self.store_id, region_id, new_peer_id; "region" => ?new_region, ); return Ok(crate::FastAddPeerStatus::WaitForData); } else { info!( - "fast path: ongoing {}:{}. remote peer has applied conf change for {}", + "fast path: ongoing {}:{} {}. 
remote peer has applied conf change", self.store_id, region_id, new_peer_id ); } @@ -1052,14 +1070,18 @@ impl RegionChangeObserver for TiFlashObs e: RegionChangeEvent, _: StateRole, ) { + let region_id = ob_ctx.region().get_id(); if e == RegionChangeEvent::Destroy { info!( "observe destroy"; - "region_id" => ob_ctx.region().get_id(), + "region_id" => region_id, "store_id" => self.store_id, ); self.engine_store_server_helper .handle_destroy(ob_ctx.region().get_id()); + if self.engine_store_cfg.enable_fast_add_peer { + self.remove_cached_region_info(region_id); + } } } @@ -1307,7 +1329,7 @@ impl ApplySnapshotObserver for TiFlashOb MapEntry::Occupied(mut o) => { let is_first_snapsot = !o.get().inited_or_fallback.load(Ordering::SeqCst); if is_first_snapsot { - info!("fast path: applied first snapshot {}:{}, recover MsgAppend", self.store_id, region_id; + info!("fast path: applied first snapshot {}:{} {}, recover MsgAppend", self.store_id, region_id, peer_id; "snap_key" => ?snap_key, ); should_skip = true; diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index 4de240344b1..34a4eadfcbc 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -75,6 +75,7 @@ impl Region { #[derive(Default)] pub struct RegionStats { pub pre_handle_count: AtomicU64, + pub fast_add_peer_count: AtomicU64, } pub struct EngineStoreServer { @@ -1268,6 +1269,9 @@ unsafe extern "C" fn ffi_fast_add_peer( let store = into_engine_store_server_wrap(arg1); let cluster = &*(store.cluster_ptr as *const mock_cluster::Cluster); let store_id = (*store.engine_store_server).id; + (*store.engine_store_server).mutate_region_states(region_id, |e: &mut RegionStats| { + e.fast_add_peer_count.fetch_add(1, Ordering::SeqCst); + }); let failed_add_peer_res = |status: ffi_interfaces::FastAddPeerStatus| ffi_interfaces::FastAddPeerRes { diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 
44c68e9b6a9..8ec71e1f036 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -45,7 +45,7 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) }; if pause { - fail::cfg("fi_fast_add_peer_pause", "pause").unwrap(); + fail::cfg("ffi_fast_add_peer_pause", "pause").unwrap(); } pd_client.must_add_peer(1, new_learner_peer(3, 3)); // std::thread::sleep(std::time::Duration::from_millis(2000)); @@ -84,7 +84,7 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) if pause { std::thread::sleep(std::time::Duration::from_millis(3000)); - fail::remove("fi_fast_add_peer_pause"); + fail::remove("ffi_fast_add_peer_pause"); } match source_type { @@ -122,9 +122,6 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) }; // Destroy peer - // These failpoints make sure we will cause again a fast path. - fail::cfg("fast_path_is_not_first", "panic").unwrap(); - fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); pd_client.must_remove_peer(1, new_learner_peer(3, 3)); must_wait_until_cond_node(&cluster, 1, Some(vec![1]), &|states: &States| -> bool { find_peer_by_id(states.in_disk_region_state.get_region(), 3).is_none() @@ -136,15 +133,30 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) &mut |_, _, ffi: &mut FFIHelperSet| { let server = &ffi.engine_store_server; assert!(!server.kvstore.contains_key(&1)); + (*ffi.engine_store_server).mutate_region_states(1, |e: &mut RegionStats| { + e.fast_add_peer_count.store(0, Ordering::SeqCst); + }); }, ); cluster.must_put(b"k5", b"v5"); + // These failpoints make sure we will cause again a fast path. 
+ fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); pd_client.must_add_peer(1, new_learner_peer(3, 4)); // Wait until Learner has applied ConfChange std::thread::sleep(std::time::Duration::from_millis(1000)); must_wait_until_cond_node(&cluster, 1, Some(vec![3]), &|states: &States| -> bool { find_peer_by_id(states.in_disk_region_state.get_region(), 4).is_some() }); + // If we re-add peer, we can still go fast path. + iter_ffi_helpers( + &cluster, + Some(vec![3]), + &mut |id: u64, engine: &engine_rocks::RocksEngine, ffi: &mut FFIHelperSet| { + (*ffi.engine_store_server).mutate_region_states(1, |e: &mut RegionStats| { + assert!(e.fast_add_peer_count.load(Ordering::SeqCst) > 0); + }); + }, + ); cluster.must_put(b"k6", b"v6"); check_key( &cluster, From e4d6453d36e7385846170e0b9895f20d094dabf0 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Tue, 13 Dec 2022 16:53:13 +0800 Subject: [PATCH 025/115] re fmt Signed-off-by: CalvinNeo --- engine_store_ffi/src/observer.rs | 82 +++++++++++++------------ new-mock-engine-store/src/mock_store.rs | 13 +--- 2 files changed, 43 insertions(+), 52 deletions(-) diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index e666c43d021..3e22b97a407 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -222,17 +222,14 @@ impl TiFlashObserver { pub fn remove_cached_region_info(&self, region_id: u64) { let slot_id = Self::slot_index(region_id); - match self.cached_region_info.get(slot_id).unwrap().write() { - Ok(mut g) => { - info!( - "remove_cached_region_info"; - "region_id" => region_id, - "store_id" => self.store_id, - ); - let _ = g.remove(®ion_id); - } - Err(_) => (), - }; + if let Ok(mut g) = self.cached_region_info.get(slot_id).unwrap().write() { + info!( + "remove_cached_region_info"; + "region_id" => region_id, + "store_id" => self.store_id, + ); + let _ = g.remove(®ion_id); + } } pub fn set_inited_or_fallback(&self, region_id: u64, v: bool) -> 
RaftStoreResult<()> { @@ -378,7 +375,7 @@ impl TiFlashObserver { } _ => { error!( - "fast path: ongoing {}:{}{}failed. fetch and replace error {:?}, fallback to normal", + "fast path: ongoing {}:{} {} failed. fetch and replace error {:?}, fallback to normal", self.store_id, region_id, new_peer_id, res ); self.fallback_to_slow_path(region_id); @@ -386,16 +383,24 @@ impl TiFlashObserver { } }; - info!("fast path: ongoing {}:{} {}, parse", self.store_id, region_id, new_peer_id; - "to_peer_id" => msg.get_to_peer().get_id(), - "from_peer_id" => msg.get_from_peer().get_id(), - ); let apply_state_str = res.apply_state.view.to_slice(); let region_str = res.region.view.to_slice(); let mut apply_state = RaftApplyState::default(); let mut new_region = kvproto::metapb::Region::default(); apply_state.merge_from_bytes(apply_state_str).unwrap(); new_region.merge_from_bytes(region_str).unwrap(); + + // Validate + // check if the source already knows the know peer + if !validate_remote_peer_region(&new_region, self.store_id, new_peer_id) { + info!( + "fast path: ongoing {}:{} {}. remote peer has not applied conf change", + self.store_id, region_id, new_peer_id; + "region" => ?new_region, + ); + return Ok(crate::FastAddPeerStatus::WaitForData); + } + info!("fast path: ongoing {}:{} {}, start build and send", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), @@ -412,23 +417,27 @@ impl TiFlashObserver { ); } crate::FastAddPeerStatus::WaitForData => { - error!( + info!( "fast path: ongoing {}:{} {}. remote peer preparing data, wait", new_peer_id, self.store_id, region_id ); return true; } _ => { - error!("fast path: ongoing {}:{} {} failed. build and sent snapshot code {:?}", self.store_id, region_id, new_peer_id, s; - "is_first" => is_first,); + error!( + "fast path: ongoing {}:{} {} failed. 
build and sent snapshot code {:?}", + self.store_id, region_id, new_peer_id, s + ); self.fallback_to_slow_path(region_id); return false; } }; } Err(e) => { - error!("fast path: ongoing {}:{} {} failed. build and sent snapshot error {:?}", self.store_id, region_id, new_peer_id, e; - "is_first" => is_first,); + error!( + "fast path: ongoing {}:{} {} failed. build and sent snapshot error {:?}", + self.store_id, region_id, new_peer_id, e + ); self.fallback_to_slow_path(region_id); return false; } @@ -441,15 +450,17 @@ impl TiFlashObserver { region_id: u64, index: u64, peer_id: u64, + tag: &str, ) -> RaftStoreResult { match self.raft_engine.get_entry(region_id, index)? { Some(entry) => Ok(entry.get_term()), None => { return Err(box_err!( - "can't find entry for index {} of region {}, peer_id: {}", + "can't find entry for index {} of region {}, peer_id: {}, tag {}", index, region_id, - peer_id + peer_id, + tag )); } } @@ -466,26 +477,17 @@ impl TiFlashObserver { let inner_msg = msg.get_message(); // Build snapshot by get_snapshot_for_building let (snap, key) = { - // check if the source already knows the know peer - if !validate_remote_peer_region(&new_region, self.store_id, new_peer_id) { - info!( - "fast path: ongoing {}:{} {}. remote peer has not applied conf change", - self.store_id, region_id, new_peer_id; - "region" => ?new_region, - ); - return Ok(crate::FastAddPeerStatus::WaitForData); - } else { - info!( - "fast path: ongoing {}:{} {}. remote peer has applied conf change", - self.store_id, region_id, new_peer_id - ); - } - // Find term of entry at applied_index. let applied_index = apply_state.get_applied_index(); - let applied_term = self.check_entry_at_index(region_id, applied_index, new_peer_id)?; + let applied_term = + self.check_entry_at_index(region_id, applied_index, new_peer_id, "applied_index")?; // Will otherwise cause "got message with lower index than committed" loop. 
- self.check_entry_at_index(region_id, apply_state.get_commit_index(), new_peer_id)?; + self.check_entry_at_index( + region_id, + apply_state.get_commit_index(), + new_peer_id, + "commit_index", + )?; let key = SnapKey::new(region_id, applied_term, applied_index); self.snap_mgr.register(key.clone(), SnapEntry::Generating); diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index 34a4eadfcbc..41d172d72fd 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -1324,18 +1324,7 @@ unsafe extern "C" fn ffi_fast_add_peer( }; // TODO We must ask the remote peer to persist before get a snapshot. - // { - // if let Some(s) = source_server.kvstore.get_mut(®ion_id) { - // write_to_db_data_by_engine(0, &source_engines.kv, s, "fast add - // peer".to_string()); } else { - // error!("recover from remote peer: failed persist source region"; - // "region_id" => region_id); return ffi_interfaces::FastAddPeerRes - // { status: ffi_interfaces::FastAddPeerStatus::BadData, - // apply_state: create_cpp_str(None), - // region: create_cpp_str(None), - // }; - // } - // } + let source_region = match source_server.kvstore.get(®ion_id) { Some(s) => s, None => { From 391ba67b4deca904514a6c93831db10f3d06075b Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Wed, 14 Dec 2022 10:48:59 +0800 Subject: [PATCH 026/115] Add support for recover and re-add peer (#231) --- .github/workflows/pr-ci.yml | 1 + engine_store_ffi/src/observer.rs | 249 ++++++++++++++++-------- new-mock-engine-store/src/mock_store.rs | 41 ++-- new-mock-engine-store/src/node.rs | 2 +- proxy_tests/proxy/fast_add_peer.rs | 122 ++++++++++-- proxy_tests/proxy/flashback.rs | 2 - proxy_tests/proxy/region.rs | 2 +- 7 files changed, 301 insertions(+), 118 deletions(-) diff --git a/.github/workflows/pr-ci.yml b/.github/workflows/pr-ci.yml index 4480af5f7ba..34a1cbc3de2 100644 --- a/.github/workflows/pr-ci.yml +++ b/.github/workflows/pr-ci.yml @@ -77,6 
+77,7 @@ jobs: uses: arduino/setup-protoc@v1 with: version: '3.8.0' + repo-token: ${{ secrets.GITHUB_TOKEN }} - name: install rust if: steps.cache-cargo.outputs.cache-hit != 'true' run: | diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index e41707e8717..20d5c71b0d8 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -13,11 +13,11 @@ use std::{ use collections::HashMap; use engine_tiflash::FsStatsExt; -use engine_traits::{RaftEngine, SstMetaInfo}; +use engine_traits::{RaftEngine, SstMetaInfo, CF_RAFT}; use kvproto::{ metapb::Region, raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, CmdType, RaftCmdRequest}, - raft_serverpb::{RaftApplyState, RaftMessage}, + raft_serverpb::{PeerState, RaftApplyState, RaftMessage, RegionLocalState}, }; use protobuf::Message; use raft::{eraftpb, eraftpb::MessageType, StateRole}; @@ -189,6 +189,16 @@ pub fn validate_remote_peer_region( } } +pub fn get_region_local_state( + engine: &EK, + region_id: u64, +) -> Option { + let region_state_key = keys::region_state_key(region_id); + engine + .get_msg_cf::(CF_RAFT, ®ion_state_key) + .unwrap_or(None) +} + impl TiFlashObserver { #[inline] fn slot_index(id: u64) -> usize { @@ -210,6 +220,18 @@ impl TiFlashObserver { Ok(()) } + pub fn remove_cached_region_info(&self, region_id: u64) { + let slot_id = Self::slot_index(region_id); + if let Ok(mut g) = self.cached_region_info.get(slot_id).unwrap().write() { + info!( + "remove_cached_region_info"; + "region_id" => region_id, + "store_id" => self.store_id, + ); + let _ = g.remove(®ion_id); + } + } + pub fn set_inited_or_fallback(&self, region_id: u64, v: bool) -> RaftStoreResult<()> { self.access_cached_region_info_mut( region_id, @@ -233,6 +255,16 @@ impl TiFlashObserver { } } + pub fn is_initialized(&self, region_id: u64) -> bool { + match get_region_local_state(&self.engine, region_id) { + None => false, + Some(r) => { + 
raftstore::store::util::is_region_initialized(r.get_region()) + && (r.get_state() != PeerState::Tombstone) + } + } + } + // Returns whether we need to ignore this message and run fast path instead. pub fn maybe_fast_path(&self, msg: &RaftMessage) -> bool { if !self.engine_store_cfg.enable_fast_add_peer { @@ -249,24 +281,42 @@ impl TiFlashObserver { let new_peer_id = msg.get_to_peer().get_id(); let mut is_first = false; let mut is_replicated = false; + let mut has_already_inited = None; let f = |info: MapEntry>| { match info { - MapEntry::Occupied(o) => { - is_first = !o.get().inited_or_fallback.load(Ordering::SeqCst); + MapEntry::Occupied(mut o) => { + (is_first, has_already_inited) = + if !o.get().inited_or_fallback.load(Ordering::SeqCst) { + // If `has_already_inited` is true, usually means we recover from a + // restart. So we have data in disk, but not + // in memory. TODO maybe only check once, or + // we can remove apply snapshot. + let has_already_inited = self.is_initialized(region_id); + if has_already_inited { + o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); + } + (!has_already_inited, Some(has_already_inited)) + } else { + (false, None) + }; // TODO include create is_replicated = o.get().replicated_or_created.load(Ordering::SeqCst); if is_first { // TODO Maybe too much printing - info!("fast path: ongoing {}:{}, skip MsgAppend", self.store_id, region_id; - "to_peer_id" => msg.get_to_peer().get_id(), - "from_peer_id" => msg.get_from_peer().get_id(), - "inner_msg" => ?inner_msg, - "is_replicated" => is_replicated, - ); + // info!("fast path: ongoing {}:{}, skip MsgAppend", + // self.store_id, region_id; + // "to_peer_id" => msg.get_to_peer().get_id(), + // "from_peer_id" => + // msg.get_from_peer().get_id(), + // "inner_msg" => ?inner_msg, + // "is_replicated" => is_replicated, + // "has_already_inited" => has_already_inited, + // "is_first" => is_first, + // ); } } MapEntry::Vacant(v) => { - info!("fast path: ongoing {}:{}, first message", 
self.store_id, region_id; + info!("fast path: ongoing {}:{} {}, first message", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), "inner_msg" => ?inner_msg, @@ -280,10 +330,14 @@ impl TiFlashObserver { self.access_cached_region_info_mut(region_id, f).unwrap(); if !is_first { - info!( - "fast path: normal MsgAppend of {}:{}", - self.store_id, region_id - ); + // TODO avoid too much log + // info!( + // "fast path: normal MsgAppend of {}:{} {}", + // self.store_id, region_id, new_peer_id; + // "to_peer_id" => msg.get_to_peer().get_id(), + // "from_peer_id" => msg.get_from_peer().get_id(), + // "inner_msg" => ?inner_msg, + // ); return false; } @@ -291,7 +345,7 @@ impl TiFlashObserver { // Peer is not created by Peer::replicate, will cause RegionNotRegistered error, // see `check_msg`. if !is_replicated { - info!("fast path: ongoing {}:{}, wait replicating peer", self.store_id, region_id; + info!("fast path: ongoing {}:{} {}, wait replicating peer", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), "inner_msg" => ?inner_msg, @@ -300,11 +354,12 @@ impl TiFlashObserver { } } - info!("fast path: ongoing {}:{}, fetch data from remote peer", self.store_id, region_id; + info!("fast path: ongoing {}:{} {}, fetch data from remote peer", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), ); - fail::fail_point!("go_fast_path_not_allow", |e| { return false }); + fail::fail_point!("go_fast_path_not_allow", |_| { return false }); + fail::fail_point!("ffi_fast_add_peer_pause", |_| { return false }); // Feed data let res = self .engine_store_server_helper @@ -312,33 +367,42 @@ impl TiFlashObserver { match res.status { crate::FastAddPeerStatus::Ok => (), crate::FastAddPeerStatus::WaitForData => { - error!( - "fast path: ongoing {}:{}. 
remote peer preparing data, wait", - self.store_id, region_id + info!( + "fast path: ongoing {}:{} {}. remote peer preparing data, wait", + self.store_id, region_id, new_peer_id ); return true; } _ => { error!( - "fast path: ongoing {}:{} failed. fetch and replace error {:?}, fallback to normal", - self.store_id, region_id, res + "fast path: ongoing {}:{} {} failed. fetch and replace error {:?}, fallback to normal", + self.store_id, region_id, new_peer_id, res ); self.fallback_to_slow_path(region_id); return false; } }; - info!("fast path: ongoing {}:{}, parse", self.store_id, region_id; - "to_peer_id" => msg.get_to_peer().get_id(), - "from_peer_id" => msg.get_from_peer().get_id(), - ); let apply_state_str = res.apply_state.view.to_slice(); let region_str = res.region.view.to_slice(); let mut apply_state = RaftApplyState::default(); let mut new_region = kvproto::metapb::Region::default(); apply_state.merge_from_bytes(apply_state_str).unwrap(); new_region.merge_from_bytes(region_str).unwrap(); - info!("fast path: ongoing {}:{}, start build and send", self.store_id, region_id; + + // Validate + // check if the source already knows the know peer + if !validate_remote_peer_region(&new_region, self.store_id, new_peer_id) { + info!( + "fast path: ongoing {}:{} {}. 
failed remote peer has not applied conf change", + self.store_id, region_id, new_peer_id; + "region" => ?new_region, + ); + self.fallback_to_slow_path(region_id); + return false; + } + + info!("fast path: ongoing {}:{} {}, start build and send", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), "new_region" => ?new_region, @@ -348,29 +412,33 @@ impl TiFlashObserver { Ok(s) => { match s { crate::FastAddPeerStatus::Ok => { - info!("fast path: ongoing {}:{}, finish build and send", self.store_id, region_id; + info!("fast path: ongoing {}:{} {}, finish build and send", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), ); } crate::FastAddPeerStatus::WaitForData => { - error!( - "fast path: ongoing {}:{}. remote peer preparing data, wait", - self.store_id, region_id + info!( + "fast path: ongoing {}:{} {}. remote peer preparing data, wait", + new_peer_id, self.store_id, region_id ); return true; } _ => { - error!("fast path: ongoing {}:{} failed. build and sent snapshot code {:?}", self.store_id, region_id, s; - "is_first" => is_first,); + error!( + "fast path: ongoing {}:{} {} failed. build and sent snapshot code {:?}", + self.store_id, region_id, new_peer_id, s + ); self.fallback_to_slow_path(region_id); return false; } }; } Err(e) => { - error!("fast path: ongoing {}:{} failed. build and sent snapshot error {:?}", self.store_id, region_id, e; - "is_first" => is_first,); + error!( + "fast path: ongoing {}:{} {} failed. build and sent snapshot error {:?}", + self.store_id, region_id, new_peer_id, e + ); self.fallback_to_slow_path(region_id); return false; } @@ -378,6 +446,27 @@ impl TiFlashObserver { is_first } + fn check_entry_at_index( + &self, + region_id: u64, + index: u64, + peer_id: u64, + tag: &str, + ) -> RaftStoreResult { + match self.raft_engine.get_entry(region_id, index)? 
{ + Some(entry) => Ok(entry.get_term()), + None => { + return Err(box_err!( + "can't find entry for index {} of region {}, peer_id: {}, tag {}", + index, + region_id, + peer_id, + tag + )); + } + } + } + fn build_and_send_snapshot( &self, region_id: u64, @@ -389,34 +478,18 @@ impl TiFlashObserver { let inner_msg = msg.get_message(); // Build snapshot by get_snapshot_for_building let (snap, key) = { - // check if the source already knows the know peer - if !validate_remote_peer_region(&new_region, self.store_id, new_peer_id) { - info!( - "fast path: ongoing {}:{}. remote peer has not applied conf change for {}", - self.store_id, region_id, new_peer_id; - "region" => ?new_region, - ); - return Ok(crate::FastAddPeerStatus::WaitForData); - } else { - info!( - "fast path: ongoing {}:{}. remote peer has applied conf change for {}", - self.store_id, region_id, new_peer_id - ); - } - // Find term of entry at applied_index. let applied_index = apply_state.get_applied_index(); - let applied_term = match self.raft_engine.get_entry(region_id, applied_index)? { - Some(apply_entry) => apply_entry.get_term(), - None => { - return Err(box_err!( - "can't find entry for applied_index {} of region {}, peer_id: {}", - applied_index, - region_id, - new_peer_id - )); - } - }; + let applied_term = + self.check_entry_at_index(region_id, applied_index, new_peer_id, "applied_index")?; + // Will otherwise cause "got message with lower index than committed" loop. + self.check_entry_at_index( + region_id, + apply_state.get_commit_index(), + new_peer_id, + "commit_index", + )?; + let key = SnapKey::new(region_id, applied_term, applied_index); self.snap_mgr.register(key.clone(), SnapEntry::Generating); defer!(self.snap_mgr.deregister(&key, &SnapEntry::Generating)); @@ -491,8 +564,8 @@ impl TiFlashObserver { response.mut_message().set_term(inner_msg.get_term()); response.mut_message().set_snapshot(pb_snapshot); debug!( - "!!!! 
send snapshot key {} raft message {:?} snap data {:?}", - key, response, snap_data + "!!!! send snapshot key {} raft message {:?} snap data {:?} apply_state {:?}", + key, response, snap_data, apply_state ); match self.trans.lock() { Ok(mut trans) => match trans.send(response) { @@ -1000,14 +1073,18 @@ impl RegionChangeObserver for TiFlashObs e: RegionChangeEvent, _: StateRole, ) { + let region_id = ob_ctx.region().get_id(); if e == RegionChangeEvent::Destroy { info!( "observe destroy"; - "region_id" => ob_ctx.region().get_id(), + "region_id" => region_id, "store_id" => self.store_id, ); self.engine_store_server_helper .handle_destroy(ob_ctx.region().get_id()); + if self.engine_store_cfg.enable_fast_add_peer { + self.remove_cached_region_info(region_id); + } } } @@ -1247,26 +1324,30 @@ impl ApplySnapshotObserver for TiFlashOb ); let region_id = ob_ctx.region().get_id(); let mut should_skip = false; - if self.access_cached_region_info_mut( - region_id, - |info: MapEntry>| match info { - MapEntry::Occupied(mut o) => { - if !o.get().inited_or_fallback.load(Ordering::SeqCst) { - info!("fast path: applied first snapshot {}:{}, recover MsgAppend", self.store_id, region_id; - "snap_key" => ?snap_key, - ); + #[allow(clippy::collapsible_if)] + if self.engine_store_cfg.enable_fast_add_peer { + if self.access_cached_region_info_mut( + region_id, + |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + let is_first_snapsot = !o.get().inited_or_fallback.load(Ordering::SeqCst); + if is_first_snapsot { + info!("fast path: applied first snapshot {}:{} {}, recover MsgAppend", self.store_id, region_id, peer_id; + "snap_key" => ?snap_key, + ); + should_skip = true; + o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); + } } - should_skip = o.get().inited_or_fallback.load(Ordering::SeqCst); - o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); - } - MapEntry::Vacant(_) => { - // Compat no fast add peer logic - // panic!("unknown snapshot!"); - } - }, - 
).is_err() { - fatal!("post_apply_snapshot poisoned") - }; + MapEntry::Vacant(_) => { + // Compat no fast add peer logic + // panic!("unknown snapshot!"); + } + }, + ).is_err() { + fatal!("post_apply_snapshot poisoned") + }; + } let snap = match snap { None => return, Some(s) => s, diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index 1b90d1749ea..9712cd5e018 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -21,7 +21,7 @@ pub use engine_traits::{ }; pub use kvproto::{ raft_cmdpb::AdminCmdType, - raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState}, + raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, }; pub use protobuf::Message; pub use tikv_util::{box_err, box_try, debug, error, info, warn}; @@ -75,6 +75,7 @@ impl Region { #[derive(Default)] pub struct RegionStats { pub pre_handle_count: AtomicU64, + pub fast_add_peer_count: AtomicU64, } pub struct EngineStoreServer { @@ -1250,7 +1251,7 @@ unsafe fn create_cpp_str(s: Option>) -> ffi_interfaces::CppStrWithView { Some(s) => { let len = s.len() as u64; let ptr = Box::into_raw(Box::new(s.clone())); // leak - let s = ffi_interfaces::CppStrWithView { + ffi_interfaces::CppStrWithView { inner: ffi_interfaces::RawCppPtr { ptr: ptr as RawVoidPtr, type_: RawCppPtrTypeImpl::String.into(), @@ -1259,8 +1260,7 @@ unsafe fn create_cpp_str(s: Option>) -> ffi_interfaces::CppStrWithView { data: (*ptr).as_ptr() as *const _, len, }, - }; - s + } } None => ffi_interfaces::CppStrWithView { inner: ffi_interfaces::RawCppPtr { @@ -1283,6 +1283,9 @@ unsafe extern "C" fn ffi_fast_add_peer( let store = into_engine_store_server_wrap(arg1); let cluster = &*(store.cluster_ptr as *const mock_cluster::Cluster); let store_id = (*store.engine_store_server).id; + (*store.engine_store_server).mutate_region_states(region_id, |e: &mut RegionStats| { + e.fast_add_peer_count.fetch_add(1, Ordering::SeqCst); + }); let 
failed_add_peer_res = |status: ffi_interfaces::FastAddPeerStatus| ffi_interfaces::FastAddPeerRes { @@ -1335,18 +1338,7 @@ unsafe extern "C" fn ffi_fast_add_peer( }; // TODO We must ask the remote peer to persist before get a snapshot. - // { - // if let Some(s) = source_server.kvstore.get_mut(®ion_id) { - // write_to_db_data_by_engine(0, &source_engines.kv, s, "fast add - // peer".to_string()); } else { - // error!("recover from remote peer: failed persist source region"; - // "region_id" => region_id); return ffi_interfaces::FastAddPeerRes - // { status: ffi_interfaces::FastAddPeerStatus::BadData, - // apply_state: create_cpp_str(None), - // region: create_cpp_str(None), - // }; - // } - // } + let source_region = match source_server.kvstore.get(®ion_id) { Some(s) => s, None => { @@ -1360,7 +1352,7 @@ unsafe extern "C" fn ffi_fast_add_peer( ) { Some(x) => x, None => { - debug!("recover from remote peer: preparing from {} to {}, not region state {}", from_store, store_id, new_peer_id; "region_id" => region_id); + debug!("recover from remote peer: preparing from {} to {}:{}, not region state", from_store, store_id, new_peer_id; "region_id" => region_id); // We don't return BadData here, since the data may not be persisted. 
if block_wait { continue; @@ -1369,7 +1361,16 @@ unsafe extern "C" fn ffi_fast_add_peer( } }; let new_region_meta = region_local_state.get_region(); + let peer_state = region_local_state.get_state(); + // Validation + match peer_state { + PeerState::Tombstone | PeerState::Applying => { + info!("recover from remote peer: preparing from {} to {}:{}, error peer state {:?}", from_store, store_id, new_peer_id, peer_state; "region_id" => region_id); + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData); + } + _ => {} + }; if !engine_store_ffi::observer::validate_remote_peer_region( new_region_meta, store_id, @@ -1381,6 +1382,7 @@ unsafe extern "C" fn ffi_fast_add_peer( } return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData); } + // TODO check commit_index and applied_index here debug!("recover from remote peer: preparing from {} to {}, check target", from_store, store_id; "region_id" => region_id); let new_region = make_new_region( @@ -1431,6 +1433,9 @@ unsafe extern "C" fn ffi_fast_add_peer( let region_bytes = region_local_state.get_region().write_to_bytes().unwrap(); let apply_state_ptr = create_cpp_str(Some(apply_state_bytes)); let region_ptr = create_cpp_str(Some(region_bytes)); + + // Check if we have commit_index. 
+ debug!("recover from remote peer: ok from {} to {}", from_store, store_id; "region_id" => region_id); return ffi_interfaces::FastAddPeerRes { status: ffi_interfaces::FastAddPeerStatus::Ok, @@ -1439,5 +1444,5 @@ unsafe extern "C" fn ffi_fast_add_peer( }; } error!("recover from remote peer: failed after retry"; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); + failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData) } diff --git a/new-mock-engine-store/src/node.rs b/new-mock-engine-store/src/node.rs index 7f88e47a613..e88b5a8acac 100644 --- a/new-mock-engine-store/src/node.rs +++ b/new-mock-engine-store/src/node.rs @@ -145,7 +145,7 @@ impl Transport for ChannelTransport { SnapshotStatus::Finish, ), None => return Err(box_err!("Find no from_store {}", from_store)), - }; + }?; } Ok(()) } diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 962abcbe0b9..8ec71e1f036 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -9,7 +9,7 @@ enum SourceType { InvalidSource, } -fn simple_fast_add_peer(source_type: SourceType, block_wait: bool) { +fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) { tikv_util::set_panic_hook(true, "./"); let (mut cluster, pd_client) = new_mock_cluster(0, 3); cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; @@ -44,6 +44,9 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool) { _ => (), }; + if pause { + fail::cfg("ffi_fast_add_peer_pause", "pause").unwrap(); + } pd_client.must_add_peer(1, new_learner_peer(3, 3)); // std::thread::sleep(std::time::Duration::from_millis(2000)); // match source_type { @@ -79,6 +82,11 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool) { _ => (), }; + if pause { + std::thread::sleep(std::time::Duration::from_millis(3000)); + fail::remove("ffi_fast_add_peer_pause"); + } + match source_type { 
SourceType::DelayedLearner => { check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![1, 3])); @@ -113,6 +121,54 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool) { _ => (), }; + // Destroy peer + pd_client.must_remove_peer(1, new_learner_peer(3, 3)); + must_wait_until_cond_node(&cluster, 1, Some(vec![1]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 3).is_none() + }); + std::thread::sleep(std::time::Duration::from_millis(1000)); + iter_ffi_helpers( + &cluster, + Some(vec![3]), + &mut |_, _, ffi: &mut FFIHelperSet| { + let server = &ffi.engine_store_server; + assert!(!server.kvstore.contains_key(&1)); + (*ffi.engine_store_server).mutate_region_states(1, |e: &mut RegionStats| { + e.fast_add_peer_count.store(0, Ordering::SeqCst); + }); + }, + ); + cluster.must_put(b"k5", b"v5"); + // These failpoints make sure we will cause again a fast path. + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + pd_client.must_add_peer(1, new_learner_peer(3, 4)); + // Wait until Learner has applied ConfChange + std::thread::sleep(std::time::Duration::from_millis(1000)); + must_wait_until_cond_node(&cluster, 1, Some(vec![3]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 4).is_some() + }); + // If we re-add peer, we can still go fast path. 
+ iter_ffi_helpers( + &cluster, + Some(vec![3]), + &mut |id: u64, engine: &engine_rocks::RocksEngine, ffi: &mut FFIHelperSet| { + (*ffi.engine_store_server).mutate_region_states(1, |e: &mut RegionStats| { + assert!(e.fast_add_peer_count.load(Ordering::SeqCst) > 0); + }); + }, + ); + cluster.must_put(b"k6", b"v6"); + check_key( + &cluster, + b"k6", + b"v6", + Some(true), + None, + Some(vec![1, 2, 3]), + ); + fail::remove("fallback_to_slow_path_not_allow"); + fail::remove("fast_path_is_not_first"); + fail::remove("ffi_fast_add_peer_from_id"); fail::remove("on_pre_persist_with_finish"); fail::remove("ffi_fast_add_peer_block_wait"); @@ -122,43 +178,85 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool) { #[test] fn test_fast_add_peer_from_leader() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::Leader, false); - fail::remove("on_pre_persist_with_finish"); + simple_fast_add_peer(SourceType::Leader, false, false); + fail::remove("fallback_to_slow_path_not_allow"); } /// Fast path by learner snapshot. #[test] fn test_fast_add_peer_from_learner() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::Learner, false); - fail::remove("on_pre_persist_with_finish"); + simple_fast_add_peer(SourceType::Learner, false, false); + fail::remove("fallback_to_slow_path_not_allow"); } /// If a learner is delayed, but already applied ConfChange. #[test] fn test_fast_add_peer_from_delayed_learner() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::DelayedLearner, false); - fail::remove("on_pre_persist_with_finish"); + simple_fast_add_peer(SourceType::DelayedLearner, false, false); + fail::remove("fallback_to_slow_path_not_allow"); } /// If we select a wrong source, or we can't run fast path, we can fallback to /// normal. 
#[test] fn test_fast_add_peer_from_invalid_source() { - simple_fast_add_peer(SourceType::InvalidSource, false); + simple_fast_add_peer(SourceType::InvalidSource, false, false); } #[test] fn test_fast_add_peer_from_learner_blocked() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::Learner, true); - fail::remove("on_pre_persist_with_finish"); + simple_fast_add_peer(SourceType::Learner, true, false); + fail::remove("fallback_to_slow_path_not_allow"); } #[test] fn test_fast_add_peer_from_delayed_learner_blocked() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::DelayedLearner, true); - fail::remove("on_pre_persist_with_finish"); + simple_fast_add_peer(SourceType::DelayedLearner, true, false); + fail::remove("fallback_to_slow_path_not_allow"); +} + +#[test] +fn test_fast_add_peer_from_learner_blocked_paused() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::Learner, true, true); + fail::remove("fallback_to_slow_path_not_allow"); +} + +#[test] +fn test_fast_add_peer_from_delayed_learner_blocked_paused() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::DelayedLearner, true, true); + fail::remove("fallback_to_slow_path_not_allow"); +} + +#[test] +fn test_existing_peer() { + fail::cfg("before_tiflash_check_double_write", "return").unwrap(); + + tikv_util::set_panic_hook(true, "./"); + let (mut cluster, pd_client) = new_mock_cluster(0, 2); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + // fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + disable_auto_gen_compact_log(&mut cluster); + // Disable auto generate peer. 
+ pd_client.disable_default_operator(); + let _ = cluster.run_conf_change(); + must_put_and_check_key(&mut cluster, 1, 2, Some(true), None, Some(vec![1])); + + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + pd_client.must_add_peer(1, new_learner_peer(2, 2)); + must_put_and_check_key(&mut cluster, 3, 4, Some(true), None, None); + fail::remove("fallback_to_slow_path_not_allow"); + + stop_tiflash_node(&mut cluster, 2); + fail::cfg("go_fast_path_not_allow", "panic").unwrap(); + restart_tiflash_node(&mut cluster, 2); + must_put_and_check_key(&mut cluster, 5, 6, Some(true), None, None); + + cluster.shutdown(); + fail::remove("go_fast_path_not_allow"); } diff --git a/proxy_tests/proxy/flashback.rs b/proxy_tests/proxy/flashback.rs index b6d115376b5..c6286f3ae18 100644 --- a/proxy_tests/proxy/flashback.rs +++ b/proxy_tests/proxy/flashback.rs @@ -1,7 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::ops::DerefMut; - use futures::executor::block_on; use tikv_util::time::Duration; use txn_types::WriteBatchFlags; diff --git a/proxy_tests/proxy/region.rs b/proxy_tests/proxy/region.rs index f93834c6423..51362ef3917 100644 --- a/proxy_tests/proxy/region.rs +++ b/proxy_tests/proxy/region.rs @@ -227,7 +227,7 @@ fn test_add_absent_learner_peer_by_joint() { } use engine_traits::{Engines, KvEngine, RaftEngine}; -use raftstore::store::{write_initial_apply_state, write_initial_raft_state, RAFT_INIT_LOG_INDEX}; +use raftstore::store::{write_initial_apply_state, write_initial_raft_state}; pub fn prepare_bootstrap_cluster_with( engines: &Engines, From d6617f95f0d4c88032777ec01fdab9545e9dfe41 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Wed, 14 Dec 2022 11:39:23 +0800 Subject: [PATCH 027/115] add tests for apply snapshot Signed-off-by: CalvinNeo --- engine_store_ffi/src/observer.rs | 14 ++--- new-mock-engine-store/src/mock_store.rs | 16 +++++- proxy_tests/proxy/fast_add_peer.rs | 69 +++++++++++++++++++++---- 3 files changed, 77 
insertions(+), 22 deletions(-) diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 3e22b97a407..0e684dc22ab 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -394,11 +394,12 @@ impl TiFlashObserver { // check if the source already knows the know peer if !validate_remote_peer_region(&new_region, self.store_id, new_peer_id) { info!( - "fast path: ongoing {}:{} {}. remote peer has not applied conf change", + "fast path: ongoing {}:{} {}. failed remote peer has not applied conf change", self.store_id, region_id, new_peer_id; "region" => ?new_region, ); - return Ok(crate::FastAddPeerStatus::WaitForData); + self.fallback_to_slow_path(region_id); + return false; } info!("fast path: ongoing {}:{} {}, start build and send", self.store_id, region_id, new_peer_id; @@ -411,6 +412,7 @@ impl TiFlashObserver { Ok(s) => { match s { crate::FastAddPeerStatus::Ok => { + fail::fail_point!("go_fast_path_succeed", |_| { return false }); info!("fast path: ongoing {}:{} {}, finish build and send", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), @@ -538,13 +540,7 @@ impl TiFlashObserver { // TODO The rest is test, please remove it after we can fetch the real data. 
pb_snapshot_metadata - .mut_conf_state() - .mut_voters() - .push(msg.get_from_peer().get_id()); - pb_snapshot_metadata - .mut_conf_state() - .mut_learners() - .push(msg.get_to_peer().get_id()); + .set_conf_state(raftstore::store::util::conf_state_from_region(&new_region)); pb_snapshot_metadata.set_index(key.idx); pb_snapshot_metadata.set_term(key.term); diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index 41d172d72fd..80c77b565a0 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -21,7 +21,7 @@ pub use engine_traits::{ }; pub use kvproto::{ raft_cmdpb::AdminCmdType, - raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState}, + raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, }; pub use protobuf::Message; pub use tikv_util::{box_err, box_try, debug, error, info, warn}; @@ -1293,6 +1293,7 @@ unsafe extern "C" fn ffi_fast_add_peer( }); 0 })() != 0; + debug!("recover from remote peer: enter from {} to {}", from_store, store_id; "region_id" => region_id); for retry in 0..300 { @@ -1338,7 +1339,7 @@ unsafe extern "C" fn ffi_fast_add_peer( ) { Some(x) => x, None => { - debug!("recover from remote peer: preparing from {} to {}, not region state {}", from_store, store_id, new_peer_id; "region_id" => region_id); + debug!("recover from remote peer: preparing from {} to {}:{}, not region state", from_store, store_id, new_peer_id; "region_id" => region_id); // We don't return BadData here, since the data may not be persisted. if block_wait { continue; @@ -1347,7 +1348,17 @@ unsafe extern "C" fn ffi_fast_add_peer( } }; let new_region_meta = region_local_state.get_region(); + let peer_state = region_local_state.get_state(); + // Validation + match peer_state { + PeerState::Tombstone | PeerState::Applying => { + // Note in real implementation, we will avoid selecting this peer. 
+ error!("recover from remote peer: preparing from {} to {}:{}, error peer state {:?}", from_store, store_id, new_peer_id, peer_state; "region_id" => region_id); + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); + } + _ => (), + }; if !engine_store_ffi::observer::validate_remote_peer_region( new_region_meta, store_id, @@ -1359,6 +1370,7 @@ unsafe extern "C" fn ffi_fast_add_peer( } return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData); } + // TODO check commit_index and applied_index here debug!("recover from remote peer: preparing from {} to {}, check target", from_store, store_id; "region_id" => region_id); let new_region = make_new_region( diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 8ec71e1f036..9da2dd37441 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -48,17 +48,6 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) fail::cfg("ffi_fast_add_peer_pause", "pause").unwrap(); } pd_client.must_add_peer(1, new_learner_peer(3, 3)); - // std::thread::sleep(std::time::Duration::from_millis(2000)); - // match source_type { - // SourceType::Learner => { - // // Wait until Learner has applied ConfChange - // must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: - // &States| -> bool { - // find_peer_by_id(states.in_disk_region_state.get_region(), 3).is_some() - // }); - // } - // _ => {}, - // } cluster.must_put(b"k2", b"v2"); match source_type { @@ -259,4 +248,62 @@ fn test_existing_peer() { cluster.shutdown(); fail::remove("go_fast_path_not_allow"); + fail::remove("before_tiflash_check_double_write"); +} + +#[test] +fn test_apply_snapshot() { + fail::cfg("before_tiflash_check_double_write", "return").unwrap(); + + tikv_util::set_panic_hook(true, "./"); + let (mut cluster, pd_client) = new_mock_cluster(0, 3); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + // 
fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + disable_auto_gen_compact_log(&mut cluster); + // Disable auto generate peer. + pd_client.disable_default_operator(); + let _ = cluster.run_conf_change(); + + pd_client.must_add_peer(1, new_learner_peer(2, 2)); + must_put_and_check_key(&mut cluster, 1, 2, Some(true), None, Some(vec![1])); + + // We add peer 3, it will be paused before fetching peer 2's data. + // However, peer 2 will apply conf change. + fail::cfg("ffi_fast_add_peer_from_id", "return(2)").unwrap(); + fail::cfg("ffi_fast_add_peer_pause", "pause").unwrap(); + pd_client.must_add_peer(1, new_learner_peer(3, 3)); + std::thread::sleep(std::time::Duration::from_millis(1000)); + must_put_and_check_key(&mut cluster, 2, 3, Some(true), None, Some(vec![1, 2])); + must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 3).is_some() + }); + + // peer 2 can't apply new kvs. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(1, 2) + .msg_type(MessageType::MsgAppend) + .direction(Direction::Recv), + )); + cluster.must_put(b"k3", b"v3"); + cluster.must_put(b"k4", b"v4"); + force_compact_log(&mut cluster, b"k2", Some(vec![1])); + // Log compacted, peer 2 will get snapshot, however, we pause when applying + // snapshot. + fail::cfg("on_ob_post_apply_snapshot", "pause").unwrap(); + // Trigger a snapshot to 2. + cluster.clear_send_filters(); + + std::thread::sleep(std::time::Duration::from_millis(300)); + // Now if we continue fast path, peer 2 will be in Applying state. + // We will end up going slow path. 
+ fail::remove("ffi_fast_add_peer_pause"); + fail::cfg("go_fast_path_succeed", "panic").unwrap(); + std::thread::sleep(std::time::Duration::from_millis(300)); + // Resume applying snapshot + fail::remove("on_ob_post_apply_snapshot"); + check_key(&cluster, b"k4", b"v4", Some(true), None, Some(vec![1, 3])); + cluster.shutdown(); + fail::remove("go_fast_path_succeed"); + fail::remove("ffi_fast_add_peer_from_id"); + fail::remove("before_tiflash_check_double_write"); } From 8b7abda347b25b812b71eb3ec5bd218d642e7f46 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Wed, 14 Dec 2022 15:29:35 +0800 Subject: [PATCH 028/115] add split and merge test Signed-off-by: CalvinNeo --- components/raftstore/src/store/peer.rs | 2 + new-mock-engine-store/src/lib.rs | 1 + new-mock-engine-store/src/mock_store.rs | 83 ++++++++++++++++++++----- proxy_tests/proxy/fast_add_peer.rs | 46 ++++++++++++++ proxy_tests/proxy/snapshot.rs | 5 +- 5 files changed, 119 insertions(+), 18 deletions(-) diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index cffb7e40a9a..2f7d4604d6a 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -4620,6 +4620,8 @@ where "there is a pending conf change, try later"; "region_id" => self.region_id, "peer_id" => self.peer.get_id(), + "!!!! conf_index" => self.raft_group.raft.pending_conf_index, + "!!!! 
applied_index" => self.get_store().applied_index(), ); return Err(box_err!( "{} there is a pending conf change, try later", diff --git a/new-mock-engine-store/src/lib.rs b/new-mock-engine-store/src/lib.rs index 58db2bb0f2c..5e5020a6309 100644 --- a/new-mock-engine-store/src/lib.rs +++ b/new-mock-engine-store/src/lib.rs @@ -74,6 +74,7 @@ pub fn copy_data_from( // kv data in memory for cf in 0..3 { for (k, v) in &source.data[cf] { + debug!("copy_data_from region {} {:?} {:?}", region_id, k, v); write_kv_in_mem(target, cf, k.as_slice(), v.as_slice()); } } diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index 80c77b565a0..a6ed8a47be2 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -220,23 +220,10 @@ fn delete_kv_in_mem(region: &mut Region, cf_index: usize, k: &[u8]) { data.remove(k); } -unsafe fn load_from_db(store: &mut EngineStoreServer, region_id: u64) { +unsafe fn load_data_from_db(store: &mut EngineStoreServer, region_id: u64) { let store_id = store.id; let engine = &mut store.engines.as_mut().unwrap().kv; - let apply_state: RaftApplyState = engine - .get_msg_cf(CF_RAFT, &keys::apply_state_key(region_id)) - .unwrap() - .unwrap(); - let region_state: RegionLocalState = engine - .get_msg_cf(CF_RAFT, &keys::region_state_key(region_id)) - .unwrap() - .unwrap(); - let region = store.kvstore.get_mut(®ion_id).unwrap(); - region.apply_state = apply_state; - region.region = region_state.get_region().clone(); - set_new_region_peer(region, store.id); - for cf in 0..3 { let cf_name = cf_to_name(cf.into()); region.data[cf].clear(); @@ -265,6 +252,19 @@ unsafe fn load_from_db(store: &mut EngineStoreServer, region_id: u64) { } } +unsafe fn load_from_db(store: &mut EngineStoreServer, region_id: u64) { + let engine = &mut store.engines.as_mut().unwrap().kv; + let apply_state: RaftApplyState = general_get_apply_state(engine, region_id).unwrap(); + let region_state: RegionLocalState 
= general_get_region_local_state(engine, region_id).unwrap(); + + let region = store.kvstore.get_mut(®ion_id).unwrap(); + region.apply_state = apply_state; + region.region = region_state.get_region().clone(); + set_new_region_peer(region, store.id); + + load_data_from_db(store, region_id); +} + unsafe fn write_to_db_data( store: &mut EngineStoreServer, region: &mut Box, @@ -417,6 +417,13 @@ impl EngineStoreServerWrap { .insert(region_meta.id, Box::new(new_region)); } } + + { + // Move data + let region_ids = + regions.iter().map(|r| r.get_id()).collect::>(); + move_data_from(engine_store_server, region_id, region_ids.as_slice()); + } } AdminCmdType::PrepareMerge => { let tikv_region = resp.get_split().get_left(); @@ -445,11 +452,12 @@ impl EngineStoreServerWrap { // We don't handle MergeState and PeerState here } AdminCmdType::CommitMerge => { + let (target_id, source_id) = + { (region_id, req.get_commit_merge().get_source().get_id()) }; { - let tikv_target_region_meta = resp.get_split().get_left(); - let target_region = &mut (engine_store_server.kvstore.get_mut(®ion_id).unwrap()); + let target_region_meta = &mut target_region.region; let target_version = target_region_meta.get_region_epoch().get_version(); @@ -477,6 +485,8 @@ impl EngineStoreServerWrap { == std::cmp::Ordering::Equal }; + // The validation of applied result on TiFlash's side. 
+ let tikv_target_region_meta = resp.get_split().get_left(); if source_at_left { target_region_meta .set_start_key(source_region.get_start_key().to_vec()); @@ -494,6 +504,9 @@ impl EngineStoreServerWrap { } target_region.set_applied(header.index, header.term); } + { + move_data_from(engine_store_server, source_id, &[target_id]); + } let to_remove = req.get_commit_merge().get_source().get_id(); engine_store_server.kvstore.remove(&to_remove); } @@ -1434,3 +1447,41 @@ unsafe extern "C" fn ffi_fast_add_peer( error!("recover from remote peer: failed after retry"; "region_id" => region_id); failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData) } + +pub fn move_data_from( + engine_store_server: &mut EngineStoreServer, + old_region_id: u64, + new_region_ids: &[u64], +) { + let kvs = { + let old_region = engine_store_server.kvstore.get_mut(&old_region_id).unwrap(); + let res = old_region.data.clone(); + old_region.data = Default::default(); + res + }; + for new_region_id in new_region_ids { + let new_region = engine_store_server.kvstore.get_mut(&new_region_id).unwrap(); + let new_region_meta = new_region.region.clone(); + let start_key = new_region_meta.get_start_key(); + let end_key = new_region_meta.get_end_key(); + for cf in &[ffi_interfaces::ColumnFamilyType::Default] { + let cf = (*cf) as usize; + for (k, v) in &kvs[cf] { + let k = k.as_slice(); + let v = v.as_slice(); + match k { + keys::PREPARE_BOOTSTRAP_KEY | keys::STORE_IDENT_KEY => {} + _ => { + if k >= start_key && (end_key.is_empty() || k < end_key) { + debug!( + "move region data {:?} {:?} from {} to {}", + k, v, old_region_id, new_region_id + ); + write_kv_in_mem(new_region, cf, k, v); + } + } + }; + } + } + } +} diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 9da2dd37441..196be5614eb 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -307,3 +307,49 @@ fn test_apply_snapshot() { 
fail::remove("ffi_fast_add_peer_from_id"); fail::remove("before_tiflash_check_double_write"); } + +#[test] +fn test_split_merge() { + let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); + pd_client.disable_default_operator(); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + + tikv_util::set_panic_hook(true, "./"); + // Can always apply snapshot immediately + fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); + cluster.cfg.raft_store.right_derive_when_split = true; + + let _ = cluster.run_conf_change(); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1])); + check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![1])); + + let r1 = cluster.get_region(b"k1"); + let r3 = cluster.get_region(b"k3"); + assert_eq!(r1.get_id(), r3.get_id()); + + cluster.must_split(&r1, b"k2"); + let r1_new = cluster.get_region(b"k1"); // 1000 + let r3_new = cluster.get_region(b"k3"); // 1 + debug!("r1_new {} r3_new {}", r1_new.get_id(), r3_new.get_id()); + + pd_client.must_add_peer(r1_new.get_id(), new_learner_peer(2, 2000)); + std::thread::sleep(std::time::Duration::from_millis(1000)); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![2])); + check_key(&cluster, b"k3", b"v3", Some(false), None, Some(vec![2])); + pd_client.must_add_peer(r3_new.get_id(), new_learner_peer(2, 2001)); + std::thread::sleep(std::time::Duration::from_millis(1000)); + check_key(&cluster, b"k1", b"v1", Some(false), None, Some(vec![2])); + check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![2])); + + pd_client.must_merge(r1_new.get_id(), r3_new.get_id()); + pd_client.must_add_peer(r3_new.get_id(), new_learner_peer(3, 3000)); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![3])); + check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![3])); + + fail::remove("on_can_apply_snapshot"); + cluster.shutdown(); +} diff --git 
a/proxy_tests/proxy/snapshot.rs b/proxy_tests/proxy/snapshot.rs index 69211e2bdfd..628fb06811d 100644 --- a/proxy_tests/proxy/snapshot.rs +++ b/proxy_tests/proxy/snapshot.rs @@ -292,6 +292,7 @@ fn test_prehandle_fail() { #[test] fn test_split_merge() { let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); + pd_client.disable_default_operator(); assert_eq!(cluster.cfg.proxy_cfg.raft_store.snap_handle_pool_size, 2); // Can always apply snapshot immediately @@ -331,7 +332,7 @@ fn test_split_merge() { assert_eq!(server.kvstore.get(&r1_new.get_id()).unwrap().region, r1_new); assert_eq!(server.kvstore.get(&r3_new.get_id()).unwrap().region, r3_new); - // Can get from disk + // Can get from disk, note in old version, we don't support migrate memory data check_key(&cluster, b"k1", b"v1", None, Some(true), None); check_key(&cluster, b"k3", b"v3", None, Some(true), None); // TODO Region in memory data must not contradict, but now we do not @@ -358,7 +359,7 @@ fn test_split_merge() { r3_new2 ); - // Can get from disk + // Can get from disk, note in old version, we don't support migrate memory data check_key(&cluster, b"k1", b"v1", None, Some(true), None); check_key(&cluster, b"k3", b"v3", None, Some(true), None); // TODO Region in memory data must not contradict, but now we do not delete data From 042fc6f1c7640cc61ef63499e6576a30fc1162ec Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Wed, 14 Dec 2022 17:01:58 +0800 Subject: [PATCH 029/115] add test for split and merged Signed-off-by: CalvinNeo --- new-mock-engine-store/src/mock_store.rs | 7 ++++++- proxy_tests/proxy/fast_add_peer.rs | 20 ++++++++++++++------ proxy_tests/proxy/proxy.rs | 16 ++++++++++++++++ 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index a6ed8a47be2..b8aa688c3ee 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -452,6 +452,9 @@ impl 
EngineStoreServerWrap { // We don't handle MergeState and PeerState here } AdminCmdType::CommitMerge => { + fail::fail_point!("ffi_before_commit_merge", |_| { + return ffi_interfaces::EngineStoreApplyRes::Persist; + }); let (target_id, source_id) = { (region_id, req.get_commit_merge().get_source().get_id()) }; { @@ -1370,7 +1373,9 @@ unsafe extern "C" fn ffi_fast_add_peer( error!("recover from remote peer: preparing from {} to {}:{}, error peer state {:?}", from_store, store_id, new_peer_id, peer_state; "region_id" => region_id); return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); } - _ => (), + _ => { + info!("recover from remote peer: preparing from {} to {}:{}, ok peer state {:?}", from_store, store_id, new_peer_id, peer_state; "region_id" => region_id); + } }; if !engine_store_ffi::observer::validate_remote_peer_region( new_region_meta, diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 196be5614eb..baca9531d6c 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -334,21 +334,29 @@ fn test_split_merge() { cluster.must_split(&r1, b"k2"); let r1_new = cluster.get_region(b"k1"); // 1000 let r3_new = cluster.get_region(b"k3"); // 1 - debug!("r1_new {} r3_new {}", r1_new.get_id(), r3_new.get_id()); - pd_client.must_add_peer(r1_new.get_id(), new_learner_peer(2, 2000)); + let r1_id = r1_new.get_id(); + let r3_id = r3_new.get_id(); + debug!("r1_new {} r3_new {}", r1_id, r3_id); + + // Test add peer after split + pd_client.must_add_peer(r1_id, new_learner_peer(2, 2001)); std::thread::sleep(std::time::Duration::from_millis(1000)); check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![2])); check_key(&cluster, b"k3", b"v3", Some(false), None, Some(vec![2])); - pd_client.must_add_peer(r3_new.get_id(), new_learner_peer(2, 2001)); + pd_client.must_add_peer(r3_id, new_learner_peer(2, 2003)); std::thread::sleep(std::time::Duration::from_millis(1000)); check_key(&cluster, 
b"k1", b"v1", Some(false), None, Some(vec![2])); check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![2])); - pd_client.must_merge(r1_new.get_id(), r3_new.get_id()); - pd_client.must_add_peer(r3_new.get_id(), new_learner_peer(3, 3000)); - check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![3])); + // Test merge + pd_client.must_add_peer(r3_id, new_learner_peer(3, 3003)); + pd_client.merge_region(r1_id, r3_id); + must_not_merged(pd_client.clone(), r1_id, Duration::from_millis(1000)); + pd_client.must_add_peer(r1_id, new_learner_peer(3, 3001)); + pd_client.must_merge(r1_id, r3_id); check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![3])); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![3])); fail::remove("on_can_apply_snapshot"); cluster.shutdown(); diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index 724b9418807..ea441de5fd5 100644 --- a/proxy_tests/proxy/proxy.rs +++ b/proxy_tests/proxy/proxy.rs @@ -35,6 +35,7 @@ pub use new_mock_engine_store::{ }, write_kv_in_mem, Cluster, ProxyConfig, RegionStats, Simulator, TestPdClient, }; +pub use pd_client::PdClient; pub use raft::eraftpb::{ConfChangeType, MessageType}; pub use raftstore::coprocessor::ConsistencyCheckMethod; pub use test_raftstore::{new_learner_peer, new_peer}; @@ -652,3 +653,18 @@ pub fn restart_tiflash_node(cluster: &mut Cluster, node_id: u64) { } cluster.run_node(node_id).unwrap(); } + +pub fn must_not_merged(pd_client: Arc, from: u64, duration: Duration) { + let timer = tikv_util::time::Instant::now(); + loop { + let region = futures::executor::block_on(pd_client.get_region_by_id(from)).unwrap(); + if let Some(r) = region { + if timer.saturating_elapsed() > duration { + return; + } + } else { + panic!("region {} is merged.", from); + } + std::thread::sleep_ms(10); + } +} From 5ca79058585c99557ca27fa8f44dfebf9b230d44 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Wed, 14 Dec 2022 19:02:11 +0800 Subject: [PATCH 030/115] Add support for 
merge split (#232) --- engine_store_ffi/src/observer.rs | 9 +- new-mock-engine-store/src/lib.rs | 1 + new-mock-engine-store/src/mock_store.rs | 94 ++++++++++++++---- proxy_tests/proxy/fast_add_peer.rs | 123 +++++++++++++++++++++--- proxy_tests/proxy/proxy.rs | 16 +++ proxy_tests/proxy/snapshot.rs | 5 +- 6 files changed, 209 insertions(+), 39 deletions(-) diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 20d5c71b0d8..0e684dc22ab 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -412,6 +412,7 @@ impl TiFlashObserver { Ok(s) => { match s { crate::FastAddPeerStatus::Ok => { + fail::fail_point!("go_fast_path_succeed", |_| { return false }); info!("fast path: ongoing {}:{} {}, finish build and send", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), @@ -539,13 +540,7 @@ impl TiFlashObserver { // TODO The rest is test, please remove it after we can fetch the real data. 
pb_snapshot_metadata - .mut_conf_state() - .mut_voters() - .push(msg.get_from_peer().get_id()); - pb_snapshot_metadata - .mut_conf_state() - .mut_learners() - .push(msg.get_to_peer().get_id()); + .set_conf_state(raftstore::store::util::conf_state_from_region(&new_region)); pb_snapshot_metadata.set_index(key.idx); pb_snapshot_metadata.set_term(key.term); diff --git a/new-mock-engine-store/src/lib.rs b/new-mock-engine-store/src/lib.rs index 58db2bb0f2c..5e5020a6309 100644 --- a/new-mock-engine-store/src/lib.rs +++ b/new-mock-engine-store/src/lib.rs @@ -74,6 +74,7 @@ pub fn copy_data_from( // kv data in memory for cf in 0..3 { for (k, v) in &source.data[cf] { + debug!("copy_data_from region {} {:?} {:?}", region_id, k, v); write_kv_in_mem(target, cf, k.as_slice(), v.as_slice()); } } diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index 9712cd5e018..fa89ed431fb 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -220,23 +220,10 @@ fn delete_kv_in_mem(region: &mut Region, cf_index: usize, k: &[u8]) { data.remove(k); } -unsafe fn load_from_db(store: &mut EngineStoreServer, region_id: u64) { +unsafe fn load_data_from_db(store: &mut EngineStoreServer, region_id: u64) { let store_id = store.id; let engine = &mut store.engines.as_mut().unwrap().kv; - let apply_state: RaftApplyState = engine - .get_msg_cf(CF_RAFT, &keys::apply_state_key(region_id)) - .unwrap() - .unwrap(); - let region_state: RegionLocalState = engine - .get_msg_cf(CF_RAFT, &keys::region_state_key(region_id)) - .unwrap() - .unwrap(); - let region = store.kvstore.get_mut(®ion_id).unwrap(); - region.apply_state = apply_state; - region.region = region_state.get_region().clone(); - set_new_region_peer(region, store.id); - for cf in 0..3 { let cf_name = cf_to_name(cf.into()); region.data[cf].clear(); @@ -265,6 +252,19 @@ unsafe fn load_from_db(store: &mut EngineStoreServer, region_id: u64) { } } +unsafe fn 
load_from_db(store: &mut EngineStoreServer, region_id: u64) { + let engine = &mut store.engines.as_mut().unwrap().kv; + let apply_state: RaftApplyState = general_get_apply_state(engine, region_id).unwrap(); + let region_state: RegionLocalState = general_get_region_local_state(engine, region_id).unwrap(); + + let region = store.kvstore.get_mut(®ion_id).unwrap(); + region.apply_state = apply_state; + region.region = region_state.get_region().clone(); + set_new_region_peer(region, store.id); + + load_data_from_db(store, region_id); +} + unsafe fn write_to_db_data( store: &mut EngineStoreServer, region: &mut Box, @@ -417,6 +417,12 @@ impl EngineStoreServerWrap { .insert(region_meta.id, Box::new(new_region)); } } + { + // Move data + let region_ids = + regions.iter().map(|r| r.get_id()).collect::>(); + move_data_from(engine_store_server, region_id, region_ids.as_slice()); + } } AdminCmdType::PrepareMerge => { let tikv_region = resp.get_split().get_left(); @@ -445,11 +451,15 @@ impl EngineStoreServerWrap { // We don't handle MergeState and PeerState here } AdminCmdType::CommitMerge => { + fail::fail_point!("ffi_before_commit_merge", |_| { + return ffi_interfaces::EngineStoreApplyRes::Persist; + }); + let (target_id, source_id) = + { (region_id, req.get_commit_merge().get_source().get_id()) }; { - let tikv_target_region_meta = resp.get_split().get_left(); - let target_region = &mut (engine_store_server.kvstore.get_mut(®ion_id).unwrap()); + let target_region_meta = &mut target_region.region; let target_version = target_region_meta.get_region_epoch().get_version(); @@ -477,6 +487,8 @@ impl EngineStoreServerWrap { == std::cmp::Ordering::Equal }; + // The validation of applied result on TiFlash's side. 
+ let tikv_target_region_meta = resp.get_split().get_left(); if source_at_left { target_region_meta .set_start_key(source_region.get_start_key().to_vec()); @@ -494,6 +506,9 @@ impl EngineStoreServerWrap { } target_region.set_applied(header.index, header.term); } + { + move_data_from(engine_store_server, source_id, &[target_id]); + } let to_remove = req.get_commit_merge().get_source().get_id(); engine_store_server.kvstore.remove(&to_remove); } @@ -1366,10 +1381,13 @@ unsafe extern "C" fn ffi_fast_add_peer( // Validation match peer_state { PeerState::Tombstone | PeerState::Applying => { - info!("recover from remote peer: preparing from {} to {}:{}, error peer state {:?}", from_store, store_id, new_peer_id, peer_state; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData); + // Note in real implementation, we will avoid selecting this peer. + error!("recover from remote peer: preparing from {} to {}:{}, error peer state {:?}", from_store, store_id, new_peer_id, peer_state; "region_id" => region_id); + return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); + } + _ => { + info!("recover from remote peer: preparing from {} to {}:{}, ok peer state {:?}", from_store, store_id, new_peer_id, peer_state; "region_id" => region_id); } - _ => {} }; if !engine_store_ffi::observer::validate_remote_peer_region( new_region_meta, @@ -1446,3 +1464,41 @@ unsafe extern "C" fn ffi_fast_add_peer( error!("recover from remote peer: failed after retry"; "region_id" => region_id); failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData) } + +pub fn move_data_from( + engine_store_server: &mut EngineStoreServer, + old_region_id: u64, + new_region_ids: &[u64], +) { + let kvs = { + let old_region = engine_store_server.kvstore.get_mut(&old_region_id).unwrap(); + let res = old_region.data.clone(); + old_region.data = Default::default(); + res + }; + for new_region_id in new_region_ids { + let new_region = 
engine_store_server.kvstore.get_mut(&new_region_id).unwrap(); + let new_region_meta = new_region.region.clone(); + let start_key = new_region_meta.get_start_key(); + let end_key = new_region_meta.get_end_key(); + for cf in &[ffi_interfaces::ColumnFamilyType::Default] { + let cf = (*cf) as usize; + for (k, v) in &kvs[cf] { + let k = k.as_slice(); + let v = v.as_slice(); + match k { + keys::PREPARE_BOOTSTRAP_KEY | keys::STORE_IDENT_KEY => {} + _ => { + if k >= start_key && (end_key.is_empty() || k < end_key) { + debug!( + "move region data {:?} {:?} from {} to {}", + k, v, old_region_id, new_region_id + ); + write_kv_in_mem(new_region, cf, k, v); + } + } + }; + } + } + } +} diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 8ec71e1f036..baca9531d6c 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -48,17 +48,6 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) fail::cfg("ffi_fast_add_peer_pause", "pause").unwrap(); } pd_client.must_add_peer(1, new_learner_peer(3, 3)); - // std::thread::sleep(std::time::Duration::from_millis(2000)); - // match source_type { - // SourceType::Learner => { - // // Wait until Learner has applied ConfChange - // must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: - // &States| -> bool { - // find_peer_by_id(states.in_disk_region_state.get_region(), 3).is_some() - // }); - // } - // _ => {}, - // } cluster.must_put(b"k2", b"v2"); match source_type { @@ -259,4 +248,116 @@ fn test_existing_peer() { cluster.shutdown(); fail::remove("go_fast_path_not_allow"); + fail::remove("before_tiflash_check_double_write"); +} + +#[test] +fn test_apply_snapshot() { + fail::cfg("before_tiflash_check_double_write", "return").unwrap(); + + tikv_util::set_panic_hook(true, "./"); + let (mut cluster, pd_client) = new_mock_cluster(0, 3); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + // 
fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + disable_auto_gen_compact_log(&mut cluster); + // Disable auto generate peer. + pd_client.disable_default_operator(); + let _ = cluster.run_conf_change(); + + pd_client.must_add_peer(1, new_learner_peer(2, 2)); + must_put_and_check_key(&mut cluster, 1, 2, Some(true), None, Some(vec![1])); + + // We add peer 3, it will be paused before fetching peer 2's data. + // However, peer 2 will apply conf change. + fail::cfg("ffi_fast_add_peer_from_id", "return(2)").unwrap(); + fail::cfg("ffi_fast_add_peer_pause", "pause").unwrap(); + pd_client.must_add_peer(1, new_learner_peer(3, 3)); + std::thread::sleep(std::time::Duration::from_millis(1000)); + must_put_and_check_key(&mut cluster, 2, 3, Some(true), None, Some(vec![1, 2])); + must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 3).is_some() + }); + + // peer 2 can't apply new kvs. + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(1, 2) + .msg_type(MessageType::MsgAppend) + .direction(Direction::Recv), + )); + cluster.must_put(b"k3", b"v3"); + cluster.must_put(b"k4", b"v4"); + force_compact_log(&mut cluster, b"k2", Some(vec![1])); + // Log compacted, peer 2 will get snapshot, however, we pause when applying + // snapshot. + fail::cfg("on_ob_post_apply_snapshot", "pause").unwrap(); + // Trigger a snapshot to 2. + cluster.clear_send_filters(); + + std::thread::sleep(std::time::Duration::from_millis(300)); + // Now if we continue fast path, peer 2 will be in Applying state. + // We will end up going slow path. 
+ fail::remove("ffi_fast_add_peer_pause"); + fail::cfg("go_fast_path_succeed", "panic").unwrap(); + std::thread::sleep(std::time::Duration::from_millis(300)); + // Resume applying snapshot + fail::remove("on_ob_post_apply_snapshot"); + check_key(&cluster, b"k4", b"v4", Some(true), None, Some(vec![1, 3])); + cluster.shutdown(); + fail::remove("go_fast_path_succeed"); + fail::remove("ffi_fast_add_peer_from_id"); + fail::remove("before_tiflash_check_double_write"); +} + +#[test] +fn test_split_merge() { + let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); + pd_client.disable_default_operator(); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + + tikv_util::set_panic_hook(true, "./"); + // Can always apply snapshot immediately + fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); + cluster.cfg.raft_store.right_derive_when_split = true; + + let _ = cluster.run_conf_change(); + + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1])); + check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![1])); + + let r1 = cluster.get_region(b"k1"); + let r3 = cluster.get_region(b"k3"); + assert_eq!(r1.get_id(), r3.get_id()); + + cluster.must_split(&r1, b"k2"); + let r1_new = cluster.get_region(b"k1"); // 1000 + let r3_new = cluster.get_region(b"k3"); // 1 + + let r1_id = r1_new.get_id(); + let r3_id = r3_new.get_id(); + debug!("r1_new {} r3_new {}", r1_id, r3_id); + + // Test add peer after split + pd_client.must_add_peer(r1_id, new_learner_peer(2, 2001)); + std::thread::sleep(std::time::Duration::from_millis(1000)); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![2])); + check_key(&cluster, b"k3", b"v3", Some(false), None, Some(vec![2])); + pd_client.must_add_peer(r3_id, new_learner_peer(2, 2003)); + std::thread::sleep(std::time::Duration::from_millis(1000)); + check_key(&cluster, b"k1", b"v1", Some(false), None, Some(vec![2])); + 
check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![2])); + + // Test merge + pd_client.must_add_peer(r3_id, new_learner_peer(3, 3003)); + pd_client.merge_region(r1_id, r3_id); + must_not_merged(pd_client.clone(), r1_id, Duration::from_millis(1000)); + pd_client.must_add_peer(r1_id, new_learner_peer(3, 3001)); + pd_client.must_merge(r1_id, r3_id); + check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![3])); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![3])); + + fail::remove("on_can_apply_snapshot"); + cluster.shutdown(); } diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index 2fd0de1ef72..3ab7d201217 100644 --- a/proxy_tests/proxy/proxy.rs +++ b/proxy_tests/proxy/proxy.rs @@ -35,6 +35,7 @@ pub use new_mock_engine_store::{ }, write_kv_in_mem, Cluster, ProxyConfig, RegionStats, Simulator, TestPdClient, }; +pub use pd_client::PdClient; pub use raft::eraftpb::{ConfChangeType, MessageType}; pub use raftstore::coprocessor::ConsistencyCheckMethod; pub use test_raftstore::{new_learner_peer, new_peer}; @@ -653,3 +654,18 @@ pub fn restart_tiflash_node(cluster: &mut Cluster, node_id: u64) { } cluster.run_node(node_id).unwrap(); } + +pub fn must_not_merged(pd_client: Arc, from: u64, duration: Duration) { + let timer = tikv_util::time::Instant::now(); + loop { + let region = futures::executor::block_on(pd_client.get_region_by_id(from)).unwrap(); + if let Some(r) = region { + if timer.saturating_elapsed() > duration { + return; + } + } else { + panic!("region {} is merged.", from); + } + std::thread::sleep_ms(10); + } +} diff --git a/proxy_tests/proxy/snapshot.rs b/proxy_tests/proxy/snapshot.rs index 69211e2bdfd..628fb06811d 100644 --- a/proxy_tests/proxy/snapshot.rs +++ b/proxy_tests/proxy/snapshot.rs @@ -292,6 +292,7 @@ fn test_prehandle_fail() { #[test] fn test_split_merge() { let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); + pd_client.disable_default_operator(); 
 assert_eq!(cluster.cfg.proxy_cfg.raft_store.snap_handle_pool_size, 2); // Can always apply snapshot immediately @@ -331,7 +332,7 @@ fn test_split_merge() { assert_eq!(server.kvstore.get(&r1_new.get_id()).unwrap().region, r1_new); assert_eq!(server.kvstore.get(&r3_new.get_id()).unwrap().region, r3_new); - // Can get from disk + // Can get from disk, note in old version, we don't support migrating memory data check_key(&cluster, b"k1", b"v1", None, Some(true), None); check_key(&cluster, b"k3", b"v3", None, Some(true), None); // TODO Region in memory data must not contradict, but now we do not @@ -358,7 +359,7 @@ fn test_split_merge() { r3_new2 ); - // Can get from disk + // Can get from disk, note in old version, we don't support migrating memory data check_key(&cluster, b"k1", b"v1", None, Some(true), None); check_key(&cluster, b"k3", b"v3", None, Some(true), None); // TODO Region in memory data must not contradict, but now we do not delete data From cf4da9fc663bbdbfac479d8dcc7d0e1633def0b3 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Thu, 15 Dec 2022 14:34:09 +0800 Subject: [PATCH 031/115] record inflight message Signed-off-by: CalvinNeo --- components/raftstore/src/store/peer.rs | 2 - engine_store_ffi/src/lib.rs | 1 + engine_store_ffi/src/observer.rs | 112 ++++++++++++++++------ new-mock-engine-store/src/mock_store.rs | 1 + proxy_tests/proxy/fast_add_peer.rs | 119 +++++++++++++++++++----- proxy_tests/proxy/proxy.rs | 26 +++++- 6 files changed, 201 insertions(+), 60 deletions(-) diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 2f7d4604d6a..cffb7e40a9a 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -4620,8 +4620,6 @@ where "there is a pending conf change, try later"; "region_id" => self.region_id, "peer_id" => self.peer.get_id(), - "!!!! conf_index" => self.raft_group.raft.pending_conf_index, - "!!!! 
applied_index" => self.get_store().applied_index(), ); return Err(box_err!( "{} there is a pending conf change, try later", diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index 9ee547cc3cd..b25079ada90 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -1,5 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. #![feature(drain_filter)] +#![feature(integer_atomics)] #[allow(dead_code)] pub mod interfaces; diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 0e684dc22ab..34316b1ec07 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -6,9 +6,10 @@ use std::{ path::PathBuf, str::FromStr, sync::{ - atomic::{AtomicBool, Ordering}, + atomic::{AtomicBool, AtomicU64, AtomicU128, Ordering}, mpsc, Arc, Mutex, RwLock, }, + time::SystemTime, }; use collections::HashMap; @@ -117,6 +118,7 @@ pub struct CachedRegionInfo { // NOTE If we want a fallback, then we must set inited_or_fallback to true, // Otherwise, a normal snapshot will be neglect in `post_apply_snapshot` and cause data loss. pub inited_or_fallback: AtomicBool, + pub snapshot_inflight: AtomicU128, } pub type CachedRegionInfoMap = HashMap>; @@ -246,6 +248,20 @@ impl TiFlashObserver { ) } + pub fn set_snapshot_inflight(&self, region_id: u64, v: u128) -> RaftStoreResult<()> { + self.access_cached_region_info_mut( + region_id, + |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + o.get_mut().snapshot_inflight.store(v, Ordering::SeqCst); + } + MapEntry::Vacant(_) => { + tikv_util::safe_panic!("not inited!"); + } + }, + ) + } + fn fallback_to_slow_path(&self, region_id: u64) { // TODO clean local, and prepare to request snapshot from TiKV as a trivial // procedure. 
@@ -282,6 +298,7 @@ impl TiFlashObserver { let mut is_first = false; let mut is_replicated = false; let mut has_already_inited = None; + let mut early_skip = false; let f = |info: MapEntry>| { match info { MapEntry::Occupied(mut o) => { @@ -302,17 +319,35 @@ impl TiFlashObserver { // TODO include create is_replicated = o.get().replicated_or_created.load(Ordering::SeqCst); if is_first { - // TODO Maybe too much printing - // info!("fast path: ongoing {}:{}, skip MsgAppend", - // self.store_id, region_id; - // "to_peer_id" => msg.get_to_peer().get_id(), - // "from_peer_id" => - // msg.get_from_peer().get_id(), - // "inner_msg" => ?inner_msg, - // "is_replicated" => is_replicated, - // "has_already_inited" => has_already_inited, - // "is_first" => is_first, - // ); + #[cfg(any(test, feature = "testexport"))] + { + info!("fast path: ongoing {}:{} {}, MsgAppend skipped", + self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, + "is_replicated" => is_replicated, + "has_already_inited" => has_already_inited, + "is_first" => is_first, + ); + } + } + let last = o.get().snapshot_inflight.load(Ordering::SeqCst); + if last != 0 { + let current = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap(); + info!("fast path: ongoing {}:{} {}, MsgAppend duplicated", + self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, + "is_replicated" => is_replicated, + "has_already_inited" => has_already_inited, + "is_first" => is_first, + "elapsed" => current.as_millis() - last, + ); + early_skip = true; } } MapEntry::Vacant(v) => { @@ -330,17 +365,23 @@ impl TiFlashObserver { self.access_cached_region_info_mut(region_id, f).unwrap(); if !is_first { - // TODO avoid too much log - // info!( - // "fast path: normal MsgAppend of {}:{} {}", - // self.store_id, 
 region_id, new_peer_id; - // "to_peer_id" => msg.get_to_peer().get_id(), - // "from_peer_id" => msg.get_from_peer().get_id(), - // "inner_msg" => ?inner_msg, - // ); + #[cfg(any(test, feature = "testexport"))] + { + info!( + "fast path: normal MsgAppend of {}:{} {}", + self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, + ); + } return false; } + if early_skip { + return true; + } + { // Peer is not created by Peer::replicate, will cause RegionNotRegistered error, // see `check_msg`. @@ -553,18 +594,32 @@ impl TiFlashObserver { response.set_region_id(region_id); response.set_from_peer(msg.get_from_peer().clone()); response.set_to_peer(msg.get_to_peer().clone()); - response - .mut_message() - .set_msg_type(MessageType::MsgSnapshot); - response.mut_message().set_term(inner_msg.get_term()); - response.mut_message().set_snapshot(pb_snapshot); + + let message = response.mut_message(); + message.set_msg_type(MessageType::MsgSnapshot); + message.set_term(inner_msg.get_term()); + message.set_snapshot(pb_snapshot); + // If not set, this will result in a MsgResponse to peer 0. + message.set_from(msg.get_from_peer().get_id()); + message.set_to(msg.get_to_peer().get_id()); debug!( - "!!!! send snapshot key {} raft message {:?} snap data {:?} apply_state {:?}", - key, response, snap_data, apply_state + "!!!! 
send snapshot to {} key {} raft message {:?} snap data {:?} apply_state {:?}", + msg.get_to_peer().get_id(), + key, + response, + snap_data, + apply_state ); match self.trans.lock() { Ok(mut trans) => match trans.send(response) { - Ok(_) | Err(RaftStoreError::RegionNotFound(_)) => (), + Ok(_) => { + let current = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap(); + self.set_snapshot_inflight(region_id, current.as_millis()) + .unwrap(); + } + Err(RaftStoreError::RegionNotFound(_)) => (), _ => return Ok(crate::FastAddPeerStatus::OtherError), }, Err(e) => return Err(box_err!("send snapshot meets error {:?}", e)), @@ -1331,6 +1386,7 @@ impl ApplySnapshotObserver for TiFlashOb "snap_key" => ?snap_key, ); should_skip = true; + o.get_mut().snapshot_inflight.store(0, Ordering::SeqCst); o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); } } diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index b8aa688c3ee..8d531a9d04e 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -1277,6 +1277,7 @@ unsafe fn create_cpp_str(s: Option>) -> ffi_interfaces::CppStrWithView { } } +#[allow(clippy::redundant_closure_call)] unsafe extern "C" fn ffi_fast_add_peer( arg1: *mut ffi_interfaces::EngineStoreServerWrap, region_id: u64, diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index baca9531d6c..b556e7dc807 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -9,7 +9,33 @@ enum SourceType { InvalidSource, } -fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) { +enum PauseType { + None, + Build, + ApplySnapshot, +} + +#[test] +fn basic_fast_add_peer() { + tikv_util::set_panic_hook(true, "./"); + let (mut cluster, pd_client) = new_mock_cluster(0, 2); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + // fail::cfg("on_pre_persist_with_finish", 
 "return").unwrap(); + fail::cfg("before_tiflash_check_double_write", "return").unwrap(); + disable_auto_gen_compact_log(&mut cluster); + // Disable auto generate peer. + pd_client.disable_default_operator(); + let _ = cluster.run_conf_change(); + + cluster.must_put(b"k0", b"v0"); + pd_client.must_add_peer(1, new_learner_peer(2, 2)); + cluster.must_put(b"k1", b"v1"); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1, 2])); + + cluster.shutdown(); +} + +fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: PauseType) { tikv_util::set_panic_hook(true, "./"); let (mut cluster, pd_client) = new_mock_cluster(0, 3); cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; @@ -33,6 +59,11 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) cluster.must_put(b"k1", b"v1"); check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1, 2])); + // Getting (k1,v1) does not necessarily mean peer 2 is ready. + must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 2).is_some() + }); + + // Add learner 3 according to source_type match source_type { SourceType::Learner | SourceType::DelayedLearner => { @@ -44,26 +75,24 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) _ => (), }; - if pause { - fail::cfg("ffi_fast_add_peer_pause", "pause").unwrap(); + match pause { + PauseType::Build => fail::cfg("ffi_fast_add_peer_pause", "pause").unwrap(), + PauseType::ApplySnapshot => fail::cfg("on_can_apply_snapshot", "return(false)").unwrap(), + _ => (), } + + // Add peer 3 pd_client.must_add_peer(1, new_learner_peer(3, 3)); cluster.must_put(b"k2", b"v2"); match source_type { SourceType::DelayedLearner => { - // Make sure conf change is applied. - check_key( - &cluster, - b"k2", - b"v2", - Some(true), - None, - Some(vec![1, 2, 3]), - ); + // Make sure conf change is applied in peer 2. 
+ check_key(&cluster, b"k2", b"v2", Some(true), None, Some(vec![1, 2])); cluster.add_send_filter(CloneFilterFactory( RegionPacketFilter::new(1, 2) .msg_type(MessageType::MsgAppend) + .msg_type(MessageType::MsgSnapshot) .direction(Direction::Recv), )); cluster.must_put(b"k3", b"v3"); @@ -71,9 +100,18 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) _ => (), }; - if pause { - std::thread::sleep(std::time::Duration::from_millis(3000)); - fail::remove("ffi_fast_add_peer_pause"); + match pause { + PauseType::Build => { + std::thread::sleep(std::time::Duration::from_millis(3000)); + fail::remove("ffi_fast_add_peer_pause"); + } + PauseType::ApplySnapshot => { + std::thread::sleep(std::time::Duration::from_millis(4000)); + fail::remove("on_can_apply_snapshot"); + fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); + std::thread::sleep(std::time::Duration::from_millis(5000)); + } + _ => (), } match source_type { @@ -103,6 +141,22 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) } }; + match pause { + PauseType::ApplySnapshot => { + iter_ffi_helpers( + &cluster, + Some(vec![3]), + &mut |_, _, ffi: &mut FFIHelperSet| { + let server = &ffi.engine_store_server; + (*ffi.engine_store_server).mutate_region_states(1, |e: &mut RegionStats| { + assert_eq!(1, e.fast_add_peer_count.load(Ordering::SeqCst)); + }); + }, + ); + } + _ => (), + } + match source_type { SourceType::DelayedLearner => { cluster.clear_send_filters(); @@ -158,6 +212,7 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) fail::remove("fallback_to_slow_path_not_allow"); fail::remove("fast_path_is_not_first"); + fail::remove("on_can_apply_snapshot"); fail::remove("ffi_fast_add_peer_from_id"); fail::remove("on_pre_persist_with_finish"); fail::remove("ffi_fast_add_peer_block_wait"); @@ -167,7 +222,7 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) #[test] fn 
test_fast_add_peer_from_leader() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::Leader, false, false); + simple_fast_add_peer(SourceType::Leader, false, PauseType::None); fail::remove("fallback_to_slow_path_not_allow"); } @@ -175,7 +230,7 @@ fn test_fast_add_peer_from_leader() { #[test] fn test_fast_add_peer_from_learner() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::Learner, false, false); + simple_fast_add_peer(SourceType::Learner, false, PauseType::None); fail::remove("fallback_to_slow_path_not_allow"); } @@ -183,7 +238,7 @@ fn test_fast_add_peer_from_learner() { #[test] fn test_fast_add_peer_from_delayed_learner() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::DelayedLearner, false, false); + simple_fast_add_peer(SourceType::DelayedLearner, false, PauseType::None); fail::remove("fallback_to_slow_path_not_allow"); } @@ -191,34 +246,48 @@ fn test_fast_add_peer_from_delayed_learner() { /// normal. 
#[test] fn test_fast_add_peer_from_invalid_source() { - simple_fast_add_peer(SourceType::InvalidSource, false, false); + simple_fast_add_peer(SourceType::InvalidSource, false, PauseType::None); } #[test] fn test_fast_add_peer_from_learner_blocked() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::Learner, true, false); + simple_fast_add_peer(SourceType::Learner, true, PauseType::None); fail::remove("fallback_to_slow_path_not_allow"); } #[test] fn test_fast_add_peer_from_delayed_learner_blocked() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::DelayedLearner, true, false); + simple_fast_add_peer(SourceType::DelayedLearner, true, PauseType::None); + fail::remove("fallback_to_slow_path_not_allow"); +} + +#[test] +fn test_fast_add_peer_from_learner_blocked_paused_build() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::Learner, true, PauseType::Build); + fail::remove("fallback_to_slow_path_not_allow"); +} + +#[test] +fn test_fast_add_peer_from_delayed_learner_blocked_paused_build() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::DelayedLearner, true, PauseType::Build); fail::remove("fallback_to_slow_path_not_allow"); } #[test] -fn test_fast_add_peer_from_learner_blocked_paused() { +fn test_fast_add_peer_from_learner_blocked_paused_apply() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::Learner, true, true); + simple_fast_add_peer(SourceType::Learner, true, PauseType::ApplySnapshot); fail::remove("fallback_to_slow_path_not_allow"); } #[test] -fn test_fast_add_peer_from_delayed_learner_blocked_paused() { +fn test_fast_add_peer_from_delayed_learner_blocked_paused_apply() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::DelayedLearner, true, true); + 
simple_fast_add_peer(SourceType::DelayedLearner, true, PauseType::ApplySnapshot); fail::remove("fallback_to_slow_path_not_allow"); } diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index ea441de5fd5..bbc894a1d8d 100644 --- a/proxy_tests/proxy/proxy.rs +++ b/proxy_tests/proxy/proxy.rs @@ -98,14 +98,26 @@ pub fn maybe_collect_states( Ok(Some(i)) => i, _ => unreachable!(), }; + let apply_state = get_apply_state(&engine, region_id); + let region_state = get_region_local_state(&engine, region_id); + let raft_state = get_raft_local_state(raft_engine, region_id); + if apply_state.is_none() { + return; + } + if region_state.is_none() { + return; + } + if raft_state.is_none() { + return; + } prev_state.insert( id, States { in_memory_apply_state: region.apply_state.clone(), in_memory_applied_term: region.applied_term, - in_disk_apply_state: get_apply_state(&engine, region_id).unwrap(), - in_disk_region_state: get_region_local_state(&engine, region_id).unwrap(), - in_disk_raft_state: get_raft_local_state(raft_engine, region_id).unwrap(), + in_disk_apply_state: apply_state.unwrap(), + in_disk_region_state: region_state.unwrap(), + in_disk_raft_state: raft_state.unwrap(), ident, }, ); @@ -587,8 +599,12 @@ pub fn must_wait_until_cond_node( } let mut ok = true; for i in new_states.keys() { - let new = new_states.get(i).unwrap(); - if !pred(new) { + if let Some(new) = new_states.get(i) { + if !pred(new) { + ok = false; + break; + } + } else { ok = false; break; } From cc2408754c7f457c051ea8206a7f1df421edfd02 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Thu, 15 Dec 2022 16:18:06 +0800 Subject: [PATCH 032/115] enhance tests Signed-off-by: CalvinNeo --- proxy_tests/proxy/fast_add_peer.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index b556e7dc807..9aac9cdd378 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -140,6 +140,9 @@ fn 
simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: PauseT ); } }; + must_wait_until_cond_node(&cluster, 1, Some(vec![3]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 3).is_some() + }); match pause { PauseType::ApplySnapshot => { From ccb539c1f3f3d0986db1034673941b5fe6dd2574 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Thu, 15 Dec 2022 16:34:35 +0800 Subject: [PATCH 033/115] portable-atomic Signed-off-by: CalvinNeo --- Cargo.lock | 7 +++++++ engine_store_ffi/Cargo.toml | 1 + engine_store_ffi/src/lib.rs | 1 - engine_store_ffi/src/observer.rs | 4 ++-- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ec5469e991a..3d2fcca5501 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1632,6 +1632,7 @@ dependencies = [ "panic_hook", "parking_lot 0.12.0", "pd_client", + "portable-atomic", "prometheus", "prometheus-static-metric", "protobuf", @@ -4104,6 +4105,12 @@ dependencies = [ "ws2_32-sys", ] +[[package]] +name = "portable-atomic" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ef946e2f765276038550e74abfbda40c84d73278417c071e0f19f8af6ba100b" + [[package]] name = "pprof" version = "0.11.0" diff --git a/engine_store_ffi/Cargo.toml b/engine_store_ffi/Cargo.toml index 4fef4837a8b..dafec6ab7b2 100644 --- a/engine_store_ffi/Cargo.toml +++ b/engine_store_ffi/Cargo.toml @@ -62,6 +62,7 @@ online_config = { workspace = true } ordered-float = "2.6" parking_lot = "0.12" pd_client = { workspace = true, default-features = false } +portable-atomic = "0.3" prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = { version = "2.8", features = ["bytes"] } diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index b25079ada90..9ee547cc3cd 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -1,6 +1,5 @@ // Copyright 2022 TiKV Project Authors. 
Licensed under Apache-2.0. #![feature(drain_filter)] -#![feature(integer_atomics)] #[allow(dead_code)] pub mod interfaces; diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 34316b1ec07..cb1e2901c62 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -6,7 +6,7 @@ use std::{ path::PathBuf, str::FromStr, sync::{ - atomic::{AtomicBool, AtomicU64, AtomicU128, Ordering}, + atomic::{AtomicBool, Ordering}, mpsc, Arc, Mutex, RwLock, }, time::SystemTime, @@ -118,7 +118,7 @@ pub struct CachedRegionInfo { // NOTE If we want a fallback, then we must set inited_or_fallback to true, // Otherwise, a normal snapshot will be neglect in `post_apply_snapshot` and cause data loss. pub inited_or_fallback: AtomicBool, - pub snapshot_inflight: AtomicU128, + pub snapshot_inflight: portable_atomic::AtomicU128, } pub type CachedRegionInfoMap = HashMap>; From 50e4a559d0d4a52db86de301d0773c3bc0e441dd Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Thu, 15 Dec 2022 18:06:48 +0800 Subject: [PATCH 034/115] Add record and reject inflight logic (#233) --- Cargo.lock | 7 ++ engine_store_ffi/Cargo.toml | 1 + engine_store_ffi/src/observer.rs | 110 +++++++++++++++------ new-mock-engine-store/src/mock_store.rs | 1 + proxy_tests/proxy/fast_add_peer.rs | 122 +++++++++++++++++++----- proxy_tests/proxy/proxy.rs | 26 ++++- 6 files changed, 210 insertions(+), 57 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ec5469e991a..3d2fcca5501 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1632,6 +1632,7 @@ dependencies = [ "panic_hook", "parking_lot 0.12.0", "pd_client", + "portable-atomic", "prometheus", "prometheus-static-metric", "protobuf", @@ -4104,6 +4105,12 @@ dependencies = [ "ws2_32-sys", ] +[[package]] +name = "portable-atomic" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ef946e2f765276038550e74abfbda40c84d73278417c071e0f19f8af6ba100b" + [[package]] name = "pprof" version = 
"0.11.0" diff --git a/engine_store_ffi/Cargo.toml b/engine_store_ffi/Cargo.toml index 4fef4837a8b..dafec6ab7b2 100644 --- a/engine_store_ffi/Cargo.toml +++ b/engine_store_ffi/Cargo.toml @@ -62,6 +62,7 @@ online_config = { workspace = true } ordered-float = "2.6" parking_lot = "0.12" pd_client = { workspace = true, default-features = false } +portable-atomic = "0.3" prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = { version = "2.8", features = ["bytes"] } diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 0e684dc22ab..cb1e2901c62 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -9,6 +9,7 @@ use std::{ atomic::{AtomicBool, Ordering}, mpsc, Arc, Mutex, RwLock, }, + time::SystemTime, }; use collections::HashMap; @@ -117,6 +118,7 @@ pub struct CachedRegionInfo { // NOTE If we want a fallback, then we must set inited_or_fallback to true, // Otherwise, a normal snapshot will be neglect in `post_apply_snapshot` and cause data loss. pub inited_or_fallback: AtomicBool, + pub snapshot_inflight: portable_atomic::AtomicU128, } pub type CachedRegionInfoMap = HashMap>; @@ -246,6 +248,20 @@ impl TiFlashObserver { ) } + pub fn set_snapshot_inflight(&self, region_id: u64, v: u128) -> RaftStoreResult<()> { + self.access_cached_region_info_mut( + region_id, + |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + o.get_mut().snapshot_inflight.store(v, Ordering::SeqCst); + } + MapEntry::Vacant(_) => { + tikv_util::safe_panic!("not inited!"); + } + }, + ) + } + fn fallback_to_slow_path(&self, region_id: u64) { // TODO clean local, and prepare to request snapshot from TiKV as a trivial // procedure. 
@@ -282,6 +298,7 @@ impl TiFlashObserver { let mut is_first = false; let mut is_replicated = false; let mut has_already_inited = None; + let mut early_skip = false; let f = |info: MapEntry>| { match info { MapEntry::Occupied(mut o) => { @@ -302,17 +319,35 @@ impl TiFlashObserver { // TODO include create is_replicated = o.get().replicated_or_created.load(Ordering::SeqCst); if is_first { - // TODO Maybe too much printing - // info!("fast path: ongoing {}:{}, skip MsgAppend", - // self.store_id, region_id; - // "to_peer_id" => msg.get_to_peer().get_id(), - // "from_peer_id" => - // msg.get_from_peer().get_id(), - // "inner_msg" => ?inner_msg, - // "is_replicated" => is_replicated, - // "has_already_inited" => has_already_inited, - // "is_first" => is_first, - // ); + #[cfg(any(test, feature = "testexport"))] + { + info!("fast path: ongoing {}:{} {}, MsgAppend skipped", + self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, + "is_replicated" => is_replicated, + "has_already_inited" => has_already_inited, + "is_first" => is_first, + ); + } + } + let last = o.get().snapshot_inflight.load(Ordering::SeqCst); + if last != 0 { + let current = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap(); + info!("fast path: ongoing {}:{} {}, MsgAppend duplicated", + self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, + "is_replicated" => is_replicated, + "has_already_inited" => has_already_inited, + "is_first" => is_first, + "elapsed" => current.as_millis() - last, + ); + early_skip = true; } } MapEntry::Vacant(v) => { @@ -330,17 +365,23 @@ impl TiFlashObserver { self.access_cached_region_info_mut(region_id, f).unwrap(); if !is_first { - // TODO avoid too much log - // info!( - // "fast path: normal MsgAppend of {}:{} {}", - // self.store_id, 
region_id, new_peer_id; - // "to_peer_id" => msg.get_to_peer().get_id(), - // "from_peer_id" => msg.get_from_peer().get_id(), - // "inner_msg" => ?inner_msg, - // ); + #[cfg(any(test, feature = "testexport"))] + { + info!( + "fast path: normal MsgAppend of {}:{} {}", + self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, + ); + } return false; } + if early_skip { + return true; + } + { // Peer is not created by Peer::replicate, will cause RegionNotRegistered error, // see `check_msg`. @@ -553,18 +594,32 @@ impl TiFlashObserver { response.set_region_id(region_id); response.set_from_peer(msg.get_from_peer().clone()); response.set_to_peer(msg.get_to_peer().clone()); - response - .mut_message() - .set_msg_type(MessageType::MsgSnapshot); - response.mut_message().set_term(inner_msg.get_term()); - response.mut_message().set_snapshot(pb_snapshot); + + let message = response.mut_message(); + message.set_msg_type(MessageType::MsgSnapshot); + message.set_term(inner_msg.get_term()); + message.set_snapshot(pb_snapshot); + // If no set, will result in a MsgResponse to peer 0. + message.set_from(msg.get_from_peer().get_id()); + message.set_to(msg.get_to_peer().get_id()); debug!( - "!!!! send snapshot key {} raft message {:?} snap data {:?} apply_state {:?}", - key, response, snap_data, apply_state + "!!!! 
send snapshot to {} key {} raft message {:?} snap data {:?} apply_state {:?}", + msg.get_to_peer().get_id(), + key, + response, + snap_data, + apply_state ); match self.trans.lock() { Ok(mut trans) => match trans.send(response) { - Ok(_) | Err(RaftStoreError::RegionNotFound(_)) => (), + Ok(_) => { + let current = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap(); + self.set_snapshot_inflight(region_id, current.as_millis()) + .unwrap(); + } + Err(RaftStoreError::RegionNotFound(_)) => (), _ => return Ok(crate::FastAddPeerStatus::OtherError), }, Err(e) => return Err(box_err!("send snapshot meets error {:?}", e)), @@ -1331,6 +1386,7 @@ impl ApplySnapshotObserver for TiFlashOb "snap_key" => ?snap_key, ); should_skip = true; + o.get_mut().snapshot_inflight.store(0, Ordering::SeqCst); o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); } } diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index fa89ed431fb..a482d72f273 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -1290,6 +1290,7 @@ unsafe fn create_cpp_str(s: Option>) -> ffi_interfaces::CppStrWithView { } } +#[allow(clippy::redundant_closure_call)] unsafe extern "C" fn ffi_fast_add_peer( arg1: *mut ffi_interfaces::EngineStoreServerWrap, region_id: u64, diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index baca9531d6c..9aac9cdd378 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -9,7 +9,33 @@ enum SourceType { InvalidSource, } -fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) { +enum PauseType { + None, + Build, + ApplySnapshot, +} + +#[test] +fn basic_fast_add_peer() { + tikv_util::set_panic_hook(true, "./"); + let (mut cluster, pd_client) = new_mock_cluster(0, 2); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + // fail::cfg("on_pre_persist_with_finish", 
"return").unwrap(); + fail::cfg("before_tiflash_check_double_write", "return").unwrap(); + disable_auto_gen_compact_log(&mut cluster); + // Disable auto generate peer. + pd_client.disable_default_operator(); + let _ = cluster.run_conf_change(); + + cluster.must_put(b"k0", b"v0"); + pd_client.must_add_peer(1, new_learner_peer(2, 2)); + cluster.must_put(b"k1", b"v1"); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1, 2])); + + cluster.shutdown(); +} + +fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: PauseType) { tikv_util::set_panic_hook(true, "./"); let (mut cluster, pd_client) = new_mock_cluster(0, 3); cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; @@ -33,6 +59,11 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) cluster.must_put(b"k1", b"v1"); check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1, 2])); + // Getting (k1,v1) not necessarily means peer 2 is ready. + must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 2).is_some() + }); + // Add learner 3 according to source_type match source_type { SourceType::Learner | SourceType::DelayedLearner => { @@ -44,26 +75,24 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) _ => (), }; - if pause { - fail::cfg("ffi_fast_add_peer_pause", "pause").unwrap(); + match pause { + PauseType::Build => fail::cfg("ffi_fast_add_peer_pause", "pause").unwrap(), + PauseType::ApplySnapshot => fail::cfg("on_can_apply_snapshot", "return(false)").unwrap(), + _ => (), } + + // Add peer 3 pd_client.must_add_peer(1, new_learner_peer(3, 3)); cluster.must_put(b"k2", b"v2"); match source_type { SourceType::DelayedLearner => { - // Make sure conf change is applied. - check_key( - &cluster, - b"k2", - b"v2", - Some(true), - None, - Some(vec![1, 2, 3]), - ); + // Make sure conf change is applied in peer 2. 
+ check_key(&cluster, b"k2", b"v2", Some(true), None, Some(vec![1, 2])); cluster.add_send_filter(CloneFilterFactory( RegionPacketFilter::new(1, 2) .msg_type(MessageType::MsgAppend) + .msg_type(MessageType::MsgSnapshot) .direction(Direction::Recv), )); cluster.must_put(b"k3", b"v3"); @@ -71,9 +100,18 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) _ => (), }; - if pause { - std::thread::sleep(std::time::Duration::from_millis(3000)); - fail::remove("ffi_fast_add_peer_pause"); + match pause { + PauseType::Build => { + std::thread::sleep(std::time::Duration::from_millis(3000)); + fail::remove("ffi_fast_add_peer_pause"); + } + PauseType::ApplySnapshot => { + std::thread::sleep(std::time::Duration::from_millis(4000)); + fail::remove("on_can_apply_snapshot"); + fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); + std::thread::sleep(std::time::Duration::from_millis(5000)); + } + _ => (), } match source_type { @@ -102,6 +140,25 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) ); } }; + must_wait_until_cond_node(&cluster, 1, Some(vec![3]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 3).is_some() + }); + + match pause { + PauseType::ApplySnapshot => { + iter_ffi_helpers( + &cluster, + Some(vec![3]), + &mut |_, _, ffi: &mut FFIHelperSet| { + let server = &ffi.engine_store_server; + (*ffi.engine_store_server).mutate_region_states(1, |e: &mut RegionStats| { + assert_eq!(1, e.fast_add_peer_count.load(Ordering::SeqCst)); + }); + }, + ); + } + _ => (), + } match source_type { SourceType::DelayedLearner => { @@ -158,6 +215,7 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) fail::remove("fallback_to_slow_path_not_allow"); fail::remove("fast_path_is_not_first"); + fail::remove("on_can_apply_snapshot"); fail::remove("ffi_fast_add_peer_from_id"); fail::remove("on_pre_persist_with_finish"); 
fail::remove("ffi_fast_add_peer_block_wait"); @@ -167,7 +225,7 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: bool) #[test] fn test_fast_add_peer_from_leader() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::Leader, false, false); + simple_fast_add_peer(SourceType::Leader, false, PauseType::None); fail::remove("fallback_to_slow_path_not_allow"); } @@ -175,7 +233,7 @@ fn test_fast_add_peer_from_leader() { #[test] fn test_fast_add_peer_from_learner() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::Learner, false, false); + simple_fast_add_peer(SourceType::Learner, false, PauseType::None); fail::remove("fallback_to_slow_path_not_allow"); } @@ -183,7 +241,7 @@ fn test_fast_add_peer_from_learner() { #[test] fn test_fast_add_peer_from_delayed_learner() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::DelayedLearner, false, false); + simple_fast_add_peer(SourceType::DelayedLearner, false, PauseType::None); fail::remove("fallback_to_slow_path_not_allow"); } @@ -191,34 +249,48 @@ fn test_fast_add_peer_from_delayed_learner() { /// normal. 
#[test] fn test_fast_add_peer_from_invalid_source() { - simple_fast_add_peer(SourceType::InvalidSource, false, false); + simple_fast_add_peer(SourceType::InvalidSource, false, PauseType::None); } #[test] fn test_fast_add_peer_from_learner_blocked() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::Learner, true, false); + simple_fast_add_peer(SourceType::Learner, true, PauseType::None); fail::remove("fallback_to_slow_path_not_allow"); } #[test] fn test_fast_add_peer_from_delayed_learner_blocked() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::DelayedLearner, true, false); + simple_fast_add_peer(SourceType::DelayedLearner, true, PauseType::None); + fail::remove("fallback_to_slow_path_not_allow"); +} + +#[test] +fn test_fast_add_peer_from_learner_blocked_paused_build() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::Learner, true, PauseType::Build); + fail::remove("fallback_to_slow_path_not_allow"); +} + +#[test] +fn test_fast_add_peer_from_delayed_learner_blocked_paused_build() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::DelayedLearner, true, PauseType::Build); fail::remove("fallback_to_slow_path_not_allow"); } #[test] -fn test_fast_add_peer_from_learner_blocked_paused() { +fn test_fast_add_peer_from_learner_blocked_paused_apply() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::Learner, true, true); + simple_fast_add_peer(SourceType::Learner, true, PauseType::ApplySnapshot); fail::remove("fallback_to_slow_path_not_allow"); } #[test] -fn test_fast_add_peer_from_delayed_learner_blocked_paused() { +fn test_fast_add_peer_from_delayed_learner_blocked_paused_apply() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); - simple_fast_add_peer(SourceType::DelayedLearner, true, true); + 
simple_fast_add_peer(SourceType::DelayedLearner, true, PauseType::ApplySnapshot); fail::remove("fallback_to_slow_path_not_allow"); } diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index 3ab7d201217..5b9e373a882 100644 --- a/proxy_tests/proxy/proxy.rs +++ b/proxy_tests/proxy/proxy.rs @@ -98,14 +98,26 @@ pub fn maybe_collect_states( Ok(Some(i)) => i, _ => unreachable!(), }; + let apply_state = get_apply_state(&engine, region_id); + let region_state = get_region_local_state(&engine, region_id); + let raft_state = get_raft_local_state(raft_engine, region_id); + if apply_state.is_none() { + return; + } + if region_state.is_none() { + return; + } + if raft_state.is_none() { + return; + } prev_state.insert( id, States { in_memory_apply_state: region.apply_state.clone(), in_memory_applied_term: region.applied_term, - in_disk_apply_state: get_apply_state(&engine, region_id).unwrap(), - in_disk_region_state: get_region_local_state(&engine, region_id).unwrap(), - in_disk_raft_state: get_raft_local_state(raft_engine, region_id).unwrap(), + in_disk_apply_state: apply_state.unwrap(), + in_disk_region_state: region_state.unwrap(), + in_disk_raft_state: raft_state.unwrap(), ident, }, ); @@ -588,8 +600,12 @@ pub fn must_wait_until_cond_node( } let mut ok = true; for i in new_states.keys() { - let new = new_states.get(i).unwrap(); - if !pred(new) { + if let Some(new) = new_states.get(i) { + if !pred(new) { + ok = false; + break; + } + } else { ok = false; break; } From 4d3dd53ab50ac69a2c8527c9285058890e90629b Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Fri, 16 Dec 2022 15:37:23 +0800 Subject: [PATCH 035/115] fix tests Signed-off-by: CalvinNeo --- engine_store_ffi/src/observer.rs | 13 +++++---- new-mock-engine-store/src/mock_store.rs | 35 ++++++++++++++++++++++-- proxy_tests/proxy/fast_add_peer.rs | 36 +++++++++++++++++++++++++ proxy_tests/proxy/proxy.rs | 23 ++++++++-------- 4 files changed, 89 insertions(+), 18 deletions(-) diff --git 
a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index cb1e2901c62..a8381e43941 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -287,7 +287,8 @@ impl TiFlashObserver { // fast path not enabled return false; } - // TODO Need to recover all region infomation from restart. + // TODO We don't need to recover all region infomation from restart, + // since we have `has_already_inited`. let inner_msg = msg.get_message(); if inner_msg.get_msg_type() != MessageType::MsgAppend { // we only handles the first MsgAppend @@ -305,9 +306,12 @@ impl TiFlashObserver { (is_first, has_already_inited) = if !o.get().inited_or_fallback.load(Ordering::SeqCst) { // If `has_already_inited` is true, usually means we recover from a - // restart. So we have data in disk, but not - // in memory. TODO maybe only check once, or - // we can remove apply snapshot. + // restart. So we have data in disk, but not in memory. + // TODO Maybe only check once if we are not from recover. + // If we do not, we can then remove logics in apply snapshot. + // This is because if the next maybe_fast_path after apply snapshot + // will have has_already_inited == true, which leads to normal + // MsgAppend. let has_already_inited = self.is_initialized(region_id); if has_already_inited { o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); @@ -579,7 +583,6 @@ impl TiFlashObserver { snap_data.set_meta(snapshot_meta); } - // TODO The rest is test, please remove it after we can fetch the real data. 
pb_snapshot_metadata .set_conf_state(raftstore::store::util::conf_state_from_region(&new_region)); pb_snapshot_metadata.set_index(key.idx); diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index 8d531a9d04e..3586d9632ca 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -15,6 +15,7 @@ pub use engine_store_ffi::{ interfaces::root::DB as ffi_interfaces, EngineStoreServerHelper, RaftStoreProxyFFIHelper, RawCppPtr, RawVoidPtr, UnwrapExternCFunc, }; +use engine_traits::RaftEngineReadOnly; pub use engine_traits::{ Engines, Iterable, KvEngine, Mutable, Peekable, RaftEngine, RaftLogBatch, SyncMutable, WriteBatch, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, @@ -209,6 +210,7 @@ pub fn write_kv_in_mem(region: &mut Region, cf_index: usize, k: &[u8], v: &[u8]) let pending_delete = &mut region.pending_delete[cf_index]; let pending_write = &mut region.pending_write[cf_index]; pending_delete.remove(k); + debug!("write into {} k {:?} v {:?}", region.region.get_id(), k, v); data.insert(k.to_vec(), v.to_vec()); pending_write.insert(k.to_vec(), v.to_vec()); } @@ -1310,7 +1312,13 @@ unsafe extern "C" fn ffi_fast_add_peer( }); 0 })() != 0; - + let fail_after_write: bool = (|| { + fail::fail_point!("ffi_fast_add_peer_fail_after_write", |t| { + let t = t.unwrap().parse::().unwrap(); + t + }); + 0 + })() != 0; debug!("recover from remote peer: enter from {} to {}", from_store, store_id; "region_id" => region_id); for retry in 0..300 { @@ -1322,7 +1330,7 @@ unsafe extern "C" fn ffi_fast_add_peer( let mut guard = match lock { Ok(e) => e, Err(_) => { - error!("ffi_debug_func failed to lock"); + error!("ffi_fast_add_peer failed to lock"); return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::OtherError); } }; @@ -1426,6 +1434,7 @@ unsafe extern "C" fn ffi_fast_add_peer( }; debug!("recover from remote peer: data from {} to {}", from_store, store_id; "region_id" => region_id); + // TODO In 
TiFlash we should take care of write batch size if let Err(e) = copy_data_from( &source_engines, &target_engines, @@ -1436,6 +1445,28 @@ unsafe extern "C" fn ffi_fast_add_peer( return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::FailedInject); } + if fail_after_write { + let mut raft_wb = target_engines.raft.log_batch(1024); + let mut entries: Vec = Default::default(); + target_engines + .raft + .get_all_entries_to(region_id, &mut entries) + .unwrap(); + + let l = entries.len(); + // Manually delete one raft log + // let from = entries.get(l - 2).unwrap().get_index(); + let from = 7; + let to = entries.get(l - 1).unwrap().get_index() + 1; + debug!("recover from remote peer: simulate error from {} to {}", from_store, store_id; + "region_id" => region_id, + "from" => from, + "to" => to, + ); + raft_wb.cut_logs(region_id, from, to); + target_engines.raft.consume(&mut raft_wb, true).unwrap(); + } + let apply_state_bytes = apply_state.write_to_bytes().unwrap(); let region_bytes = region_local_state.get_region().write_to_bytes().unwrap(); let apply_state_ptr = create_cpp_str(Some(apply_state_bytes)); diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 9aac9cdd378..4f520344efa 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -266,6 +266,7 @@ fn test_fast_add_peer_from_delayed_learner_blocked() { fail::remove("fallback_to_slow_path_not_allow"); } +// Delay when fetch and build data #[test] fn test_fast_add_peer_from_learner_blocked_paused_build() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); @@ -280,6 +281,9 @@ fn test_fast_add_peer_from_delayed_learner_blocked_paused_build() { fail::remove("fallback_to_slow_path_not_allow"); } +// Delay when applying snapshot +// This test is origianlly aimed to test multiple MsgSnapshot. 
+// However, #[test] fn test_fast_add_peer_from_learner_blocked_paused_apply() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); @@ -323,6 +327,7 @@ fn test_existing_peer() { fail::remove("before_tiflash_check_double_write"); } +// We will reject remote peer in Applying state. #[test] fn test_apply_snapshot() { fail::cfg("before_tiflash_check_double_write", "return").unwrap(); @@ -433,3 +438,34 @@ fn test_split_merge() { fail::remove("on_can_apply_snapshot"); cluster.shutdown(); } + +#[test] +fn test_fall_back_to_slow_path() { + let (mut cluster, pd_client) = new_mock_cluster_snap(0, 2); + pd_client.disable_default_operator(); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + + tikv_util::set_panic_hook(true, "./"); + // Can always apply snapshot immediately + fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); + fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + fail::cfg("ffi_fast_add_peer_fail_after_write", "return(1)").unwrap(); + fail::cfg("go_fast_path_succeed", "panic").unwrap(); + + let _ = cluster.run_conf_change(); + + cluster.must_put(b"k1", b"v1"); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1])); + cluster.must_put(b"k2", b"v2"); + pd_client.must_add_peer(1, new_learner_peer(2, 2)); + check_key(&cluster, b"k2", b"v2", Some(true), None, Some(vec![1, 2])); + must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 2).is_some() + }); + + fail::remove("ffi_fast_add_peer_fail_after_write"); + fail::remove("on_can_apply_snapshot"); + fail::remove("on_pre_persist_with_finish"); + fail::remove("go_fast_path_succeed"); + cluster.shutdown(); +} diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index bbc894a1d8d..dc1d9cbc7d8 100644 --- a/proxy_tests/proxy/proxy.rs +++ b/proxy_tests/proxy/proxy.rs @@ -594,19 +594,20 @@ pub fn must_wait_until_cond_node( let mut retry = 0; loop { let 
new_states = maybe_collect_states(&cluster, region_id, store_ids.clone()); - if let Some(ref e) = store_ids { - assert_eq!(e.len(), new_states.len()); - } let mut ok = true; - for i in new_states.keys() { - if let Some(new) = new_states.get(i) { - if !pred(new) { - ok = false; - break; + if let Some(ref e) = store_ids { + if e.len() == new_states.len() { + for i in new_states.keys() { + if let Some(new) = new_states.get(i) { + if !pred(new) { + ok = false; + break; + } + } else { + ok = false; + break; + } } - } else { - ok = false; - break; } } if ok { From c4b01d2accfa11f46cd40a22298b0913aa9429c0 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Tue, 20 Dec 2022 14:37:42 +0800 Subject: [PATCH 036/115] hide ffi_helper_set Signed-off-by: CalvinNeo --- new-mock-engine-store/src/mock_cluster.rs | 12 +- new-mock-engine-store/src/mock_store.rs | 303 +++++++++++----------- 2 files changed, 164 insertions(+), 151 deletions(-) diff --git a/new-mock-engine-store/src/mock_cluster.rs b/new-mock-engine-store/src/mock_cluster.rs index d1a6e175287..9a2353afcc6 100644 --- a/new-mock-engine-store/src/mock_cluster.rs +++ b/new-mock-engine-store/src/mock_cluster.rs @@ -93,7 +93,7 @@ pub struct TestData { pub struct Cluster> { // Helper to set ffi_helper_set. pub ffi_helper_lst: Vec, - pub ffi_helper_set: Arc>>, + ffi_helper_set: Arc>>, pub cfg: Config, leaders: HashMap, @@ -254,6 +254,16 @@ impl> Cluster { } } + pub fn access_ffi_helpers(&self, f: &mut dyn FnMut(&mut HashMap)) { + let lock = self.ffi_helper_set.lock(); + match lock { + Ok(mut l) => { + f(&mut l); + } + Err(_) => std::process::exit(1), + } + } + pub fn create_engines(&mut self) { self.io_rate_limiter = Some(Arc::new( self.cfg diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index 3586d9632ca..aa6af5ce677 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -1,8 +1,9 @@ // Copyright 2022 TiKV Project Authors. 
Licensed under Apache-2.0. +use std::sync::Arc; pub use std::{ cell::RefCell, - collections::{BTreeMap, HashMap, HashSet}, + collections::BTreeMap, pin::Pin, sync::{ atomic::{AtomicU64, Ordering}, @@ -11,6 +12,7 @@ pub use std::{ time::Duration, }; +use collections::{HashMap, HashSet}; pub use engine_store_ffi::{ interfaces::root::DB as ffi_interfaces, EngineStoreServerHelper, RaftStoreProxyFFIHelper, RawCppPtr, RawVoidPtr, UnwrapExternCFunc, @@ -1322,164 +1324,165 @@ unsafe extern "C" fn ffi_fast_add_peer( debug!("recover from remote peer: enter from {} to {}", from_store, store_id; "region_id" => region_id); for retry in 0..300 { + let mut ret: Option = None; if retry > 0 { std::thread::sleep(std::time::Duration::from_millis(30)); } - - let lock = cluster.ffi_helper_set.lock(); - let mut guard = match lock { - Ok(e) => e, - Err(_) => { - error!("ffi_fast_add_peer failed to lock"); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::OtherError); - } - }; - debug!("recover from remote peer: preparing from {} to {}, persist and check source", from_store, store_id; "region_id" => region_id); - let source_server = match guard.get_mut(&from_store) { - Some(s) => &mut s.engine_store_server, - None => { - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::NoSuitable); - } - }; - let source_engines = match source_server.engines.clone() { - Some(s) => s, - None => { - error!("recover from remote peer: failed get source engine"; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); - } - }; - - // TODO We must ask the remote peer to persist before get a snapshot. 
- - let source_region = match source_server.kvstore.get(®ion_id) { - Some(s) => s, - None => { - error!("recover from remote peer: failed read source region info"; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); - } - }; - let region_local_state: RegionLocalState = match general_get_region_local_state( - &source_engines.kv, - region_id, - ) { - Some(x) => x, - None => { - debug!("recover from remote peer: preparing from {} to {}:{}, not region state", from_store, store_id, new_peer_id; "region_id" => region_id); - // We don't return BadData here, since the data may not be persisted. - if block_wait { - continue; + cluster.access_ffi_helpers(&mut |guard: &mut HashMap| { + debug!("recover from remote peer: preparing from {} to {}, persist and check source", from_store, store_id; "region_id" => region_id); + let source_server = match guard.get_mut(&from_store) { + Some(s) => &mut s.engine_store_server, + None => { + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::NoSuitable)); + return; } - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData); - } - }; - let new_region_meta = region_local_state.get_region(); - let peer_state = region_local_state.get_state(); - - // Validation - match peer_state { - PeerState::Tombstone | PeerState::Applying => { - // Note in real implementation, we will avoid selecting this peer. 
- error!("recover from remote peer: preparing from {} to {}:{}, error peer state {:?}", from_store, store_id, new_peer_id, peer_state; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); - } - _ => { - info!("recover from remote peer: preparing from {} to {}:{}, ok peer state {:?}", from_store, store_id, new_peer_id, peer_state; "region_id" => region_id); - } - }; - if !engine_store_ffi::observer::validate_remote_peer_region( - new_region_meta, - store_id, - new_peer_id, - ) { - debug!("recover from remote peer: preparing from {} to {}, not applied conf change {}", from_store, store_id, new_peer_id; "region_id" => region_id); - if block_wait { - continue; + }; + let source_engines = match source_server.engines.clone() { + Some(s) => s, + None => { + error!("recover from remote peer: failed get source engine"; "region_id" => region_id); + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData)); + return + } + }; + // TODO We must ask the remote peer to persist before get a snapshot. + let source_region = match source_server.kvstore.get(®ion_id) { + Some(s) => s, + None => { + error!("recover from remote peer: failed read source region info"; "region_id" => region_id); + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData)); + return; + } + }; + let region_local_state: RegionLocalState = match general_get_region_local_state( + &source_engines.kv, + region_id, + ) { + Some(x) => x, + None => { + debug!("recover from remote peer: preparing from {} to {}:{}, not region state", from_store, store_id, new_peer_id; "region_id" => region_id); + // We don't return BadData here, since the data may not be persisted. 
+ ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData)); + return; + } + }; + let new_region_meta = region_local_state.get_region(); + let peer_state = region_local_state.get_state(); + // Validation + match peer_state { + PeerState::Tombstone | PeerState::Applying => { + // Note in real implementation, we will avoid selecting this peer. + error!("recover from remote peer: preparing from {} to {}:{}, error peer state {:?}", from_store, store_id, new_peer_id, peer_state; "region_id" => region_id); + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData)); + return; + } + _ => { + info!("recover from remote peer: preparing from {} to {}:{}, ok peer state {:?}", from_store, store_id, new_peer_id, peer_state; "region_id" => region_id); + } + }; + if !engine_store_ffi::observer::validate_remote_peer_region( + new_region_meta, + store_id, + new_peer_id, + ) { + debug!("recover from remote peer: preparing from {} to {}, not applied conf change {}", from_store, store_id, new_peer_id; "region_id" => region_id); + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData)); + return; } - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::WaitForData); - } - // TODO check commit_index and applied_index here - - debug!("recover from remote peer: preparing from {} to {}, check target", from_store, store_id; "region_id" => region_id); - let new_region = make_new_region( - Some(new_region_meta.clone()), - Some((*store.engine_store_server).id), - ); - (*store.engine_store_server) - .kvstore - .insert(region_id, Box::new(new_region)); - let target_engines = match (*store.engine_store_server).engines.clone() { - Some(s) => s, - None => { - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::OtherError); + // TODO check commit_index and applied_index here + debug!("recover from remote peer: preparing from {} to {}, check target", from_store, store_id; "region_id" => region_id); + let new_region = 
make_new_region( + Some(new_region_meta.clone()), + Some((*store.engine_store_server).id), + ); + (*store.engine_store_server) + .kvstore + .insert(region_id, Box::new(new_region)); + let target_engines = match (*store.engine_store_server).engines.clone() { + Some(s) => s, + None => { + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::OtherError)); + return; + } + }; + let target_region = match (*store.engine_store_server).kvstore.get_mut(®ion_id) { + Some(s) => s, + None => { + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData)); + return; + } + }; + debug!("recover from remote peer: meta from {} to {}", from_store, store_id; "region_id" => region_id); + // Must first dump meta then data, otherwise data may lag behind. + // We can see a raft log hole at applied_index otherwise. + let apply_state: RaftApplyState = match general_get_apply_state( + &source_engines.kv, + region_id, + ) { + Some(x) => x, + None => { + error!("recover from remote peer: failed read apply state"; "region_id" => region_id); + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData)); + return; + } + }; + debug!("recover from remote peer: data from {} to {}", from_store, store_id; "region_id" => region_id); + // TODO In TiFlash we should take care of write batch size + if let Err(e) = copy_data_from( + &source_engines, + &target_engines, + &source_region, + target_region, + ) { + error!("recover from remote peer: inject error {:?}", e; "region_id" => region_id); + ret = Some(failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::FailedInject)); + return; } - }; - let target_region = match (*store.engine_store_server).kvstore.get_mut(®ion_id) { - Some(s) => s, - None => { - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); + if fail_after_write { + let mut raft_wb = target_engines.raft.log_batch(1024); + let mut entries: Vec = Default::default(); + target_engines + .raft + .get_all_entries_to(region_id, &mut 
entries) + .unwrap(); + let l = entries.len(); + // Manually delete one raft log + // let from = entries.get(l - 2).unwrap().get_index(); + let from = 7; + let to = entries.get(l - 1).unwrap().get_index() + 1; + debug!("recover from remote peer: simulate error from {} to {}", from_store, store_id; + "region_id" => region_id, + "from" => from, + "to" => to, + ); + raft_wb.cut_logs(region_id, from, to); + target_engines.raft.consume(&mut raft_wb, true).unwrap(); } - }; - debug!("recover from remote peer: meta from {} to {}", from_store, store_id; "region_id" => region_id); - // Must first dump meta then data, otherwise data may lag behind. - // We can see a raft log hole at applied_index otherwise. - let apply_state: RaftApplyState = match general_get_apply_state( - &source_engines.kv, - region_id, - ) { - Some(x) => x, - None => { - error!("recover from remote peer: failed read apply state"; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData); + let apply_state_bytes = apply_state.write_to_bytes().unwrap(); + let region_bytes = region_local_state.get_region().write_to_bytes().unwrap(); + let apply_state_ptr = create_cpp_str(Some(apply_state_bytes)); + let region_ptr = create_cpp_str(Some(region_bytes)); + // Check if we have commit_index. 
+ debug!("recover from remote peer: ok from {} to {}", from_store, store_id; "region_id" => region_id); + ret = Some(ffi_interfaces::FastAddPeerRes { + status: ffi_interfaces::FastAddPeerStatus::Ok, + apply_state: apply_state_ptr, + region: region_ptr, + }); + return; + }); + if let Some(r) = ret { + match r.status { + ffi_interfaces::FastAddPeerStatus::WaitForData => { + if block_wait { + continue; + } else { + return r; + } + } + _ => return r, } - }; - - debug!("recover from remote peer: data from {} to {}", from_store, store_id; "region_id" => region_id); - // TODO In TiFlash we should take care of write batch size - if let Err(e) = copy_data_from( - &source_engines, - &target_engines, - &source_region, - target_region, - ) { - error!("recover from remote peer: inject error {:?}", e; "region_id" => region_id); - return failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::FailedInject); } - - if fail_after_write { - let mut raft_wb = target_engines.raft.log_batch(1024); - let mut entries: Vec = Default::default(); - target_engines - .raft - .get_all_entries_to(region_id, &mut entries) - .unwrap(); - - let l = entries.len(); - // Manually delete one raft log - // let from = entries.get(l - 2).unwrap().get_index(); - let from = 7; - let to = entries.get(l - 1).unwrap().get_index() + 1; - debug!("recover from remote peer: simulate error from {} to {}", from_store, store_id; - "region_id" => region_id, - "from" => from, - "to" => to, - ); - raft_wb.cut_logs(region_id, from, to); - target_engines.raft.consume(&mut raft_wb, true).unwrap(); - } - - let apply_state_bytes = apply_state.write_to_bytes().unwrap(); - let region_bytes = region_local_state.get_region().write_to_bytes().unwrap(); - let apply_state_ptr = create_cpp_str(Some(apply_state_bytes)); - let region_ptr = create_cpp_str(Some(region_bytes)); - - // Check if we have commit_index. 
- - debug!("recover from remote peer: ok from {} to {}", from_store, store_id; "region_id" => region_id); - return ffi_interfaces::FastAddPeerRes { - status: ffi_interfaces::FastAddPeerStatus::Ok, - apply_state: apply_state_ptr, - region: region_ptr, - }; } error!("recover from remote peer: failed after retry"; "region_id" => region_id); failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData) From 152757499ce49083ec5e546d3621ad30893ddc76 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Tue, 20 Dec 2022 17:52:17 +0800 Subject: [PATCH 037/115] remove Signed-off-by: CalvinNeo --- new-mock-engine-store/src/mock_store.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index aa6af5ce677..e2f43a93b1c 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -212,7 +212,6 @@ pub fn write_kv_in_mem(region: &mut Region, cf_index: usize, k: &[u8], v: &[u8]) let pending_delete = &mut region.pending_delete[cf_index]; let pending_write = &mut region.pending_write[cf_index]; pending_delete.remove(k); - debug!("write into {} k {:?} v {:?}", region.region.get_id(), k, v); data.insert(k.to_vec(), v.to_vec()); pending_write.insert(k.to_vec(), v.to_vec()); } From 9e55c603e6392aea44769757b851859dd4c525f0 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Wed, 21 Dec 2022 12:11:39 +0800 Subject: [PATCH 038/115] address merge problems Signed-off-by: CalvinNeo --- new-mock-engine-store/src/mock_store.rs | 5 ++- proxy_tests/proxy/proxy.rs | 49 ------------------------- 2 files changed, 4 insertions(+), 50 deletions(-) diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index b10c40e145a..a482d72f273 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -23,11 +23,14 @@ pub use kvproto::{ raft_cmdpb::AdminCmdType, raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, 
RegionLocalState}, }; +pub use protobuf::Message; +pub use tikv_util::{box_err, box_try, debug, error, info, warn}; use crate::node::NodeCluster; pub use crate::{ config::MockConfig, - mock_cluster, + copy_data_from, copy_meta_from, general_get_apply_state, general_get_region_local_state, + get_apply_state, get_raft_local_state, get_region_local_state, mock_cluster, mock_cluster::{ must_get_equal, must_get_none, Cluster, ProxyConfig, Simulator, TestPdClient, TiFlashEngine, }, diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index a36e21c3ffb..087d8c16dc6 100644 --- a/proxy_tests/proxy/proxy.rs +++ b/proxy_tests/proxy/proxy.rs @@ -255,55 +255,6 @@ pub fn must_put_and_check_key( ); } -pub fn must_put_and_check_key_with_generator (String, String)>( - cluster: &mut Cluster, - gen: F, - from: u64, - to: u64, - in_mem: Option, - in_disk: Option, - engines: Option>, -) { - for i in from..to { - let (k, v) = gen(i); - cluster.must_put(k.as_bytes(), v.as_bytes()); - } - for i in from..to { - let (k, v) = gen(i); - check_key( - &cluster, - k.as_bytes(), - v.as_bytes(), - in_mem, - in_disk, - engines.clone(), - ); - } -} - -pub fn must_put_and_check_key( - cluster: &mut Cluster, - from: u64, - to: u64, - in_mem: Option, - in_disk: Option, - engines: Option>, -) { - must_put_and_check_key_with_generator( - cluster, - |i: u64| { - let k = format!("k{}", i); - let v = format!("v{}", i); - (k, v) - }, - from, - to, - in_mem, - in_disk, - engines.clone(), - ); -} - pub fn check_key( cluster: &Cluster, k: &[u8], From fc54c32173058ff02c9592062b9a2bc1f7027035 Mon Sep 17 00:00:00 2001 From: lidezhu <47731263+lidezhu@users.noreply.github.com> Date: Wed, 21 Dec 2022 20:48:24 +0800 Subject: [PATCH 039/115] store raft meta in ps (#238) * store raft meta in ps * fmt * more fix * fix test * fix test * fix test --- engine_store_ffi/src/interfaces.rs | 20 +- engine_store_ffi/src/lib.rs | 19 +- engine_store_ffi/src/observer.rs | 82 +++- engine_store_ffi/src/ps_engine.rs | 
10 +- engine_tiflash/src/engine.rs | 190 ++++++++- engine_tiflash/src/lib.rs | 9 + engine_tiflash/src/ps_write_batch.rs | 368 ++++++++++++++++++ engine_tiflash/src/raft_engine.rs | 6 +- engine_tiflash/src/write_batch.rs | 47 ++- new-mock-engine-store/src/mock_store.rs | 2 +- .../ffi/src/RaftStoreProxyFFI/@version | 2 +- .../ffi/src/RaftStoreProxyFFI/ProxyFFI.h | 18 +- 12 files changed, 736 insertions(+), 37 deletions(-) create mode 100644 engine_tiflash/src/ps_write_batch.rs diff --git a/engine_store_ffi/src/interfaces.rs b/engine_store_ffi/src/interfaces.rs index 46dc3152ea2..abd201d588b 100644 --- a/engine_store_ffi/src/interfaces.rs +++ b/engine_store_ffi/src/interfaces.rs @@ -144,8 +144,16 @@ pub mod root { } #[repr(C)] #[derive(Debug)] - pub struct PageWithViewVec { - pub inner: *mut root::DB::PageWithView, + pub struct PageAndCppStrWithView { + pub page: root::DB::RawCppPtr, + pub key: root::DB::RawCppPtr, + pub page_view: root::DB::BaseBuffView, + pub key_view: root::DB::BaseBuffView, + } + #[repr(C)] + #[derive(Debug)] + pub struct PageAndCppStrWithViewVec { + pub inner: *mut root::DB::PageAndCppStrWithView, pub len: u64, } #[repr(u8)] @@ -435,10 +443,10 @@ pub mod root { arg1: *const root::DB::EngineStoreServerWrap, arg2: root::DB::BaseBuffView, arg3: root::DB::BaseBuffView, - ) -> root::DB::PageWithViewVec, + ) -> root::DB::PageAndCppStrWithViewVec, >, - pub fn_gc_page_with_view_vec: ::std::option::Option< - unsafe extern "C" fn(inner: *mut root::DB::PageWithView, len: u64), + pub fn_gc_page_and_cpp_str_with_view_vec: ::std::option::Option< + unsafe extern "C" fn(arg1: *mut root::DB::PageAndCppStrWithView, arg2: u64), >, pub fn_handle_purge_pagestorage: ::std::option::Option< unsafe extern "C" fn(arg1: *const root::DB::EngineStoreServerWrap), @@ -543,7 +551,7 @@ pub mod root { ) -> root::DB::FastAddPeerRes, >, } - pub const RAFT_STORE_PROXY_VERSION: u64 = 4954147441045435430; + pub const RAFT_STORE_PROXY_VERSION: u64 = 3525220209235231360; pub const 
RAFT_STORE_PROXY_MAGIC_NUMBER: u32 = 324508639; } } diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index bfb08b84f11..6924024b342 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -37,9 +37,10 @@ pub use sst_reader_impls::*; pub use self::interfaces::root::DB::{ BaseBuffView, ColumnFamilyType, CppStrVecView, CppStrWithView, EngineStoreApplyRes, EngineStoreServerHelper, EngineStoreServerStatus, FastAddPeerRes, FastAddPeerStatus, - FileEncryptionRes, FsStats, HttpRequestRes, HttpRequestStatus, KVGetStatus, PageWithView, - PageWithViewVec, RaftCmdHeader, RaftProxyStatus, RaftStoreProxyFFIHelper, RawCppPtr, - RawCppStringPtr, RawVoidPtr, SSTReaderPtr, StoreStats, WriteCmdType, WriteCmdsView, + FileEncryptionRes, FsStats, HttpRequestRes, HttpRequestStatus, KVGetStatus, + PageAndCppStrWithView, PageAndCppStrWithViewVec, PageWithView, RaftCmdHeader, RaftProxyStatus, + RaftStoreProxyFFIHelper, RawCppPtr, RawCppStringPtr, RawVoidPtr, SSTReaderPtr, StoreStats, + WriteCmdType, WriteCmdsView, }; use self::interfaces::root::DB::{ ConstRawVoidPtr, RaftStoreProxyPtr, RawCppPtrType, RawRustPtr, SSTReaderInterfaces, SSTView, @@ -377,11 +378,11 @@ impl Drop for RawCppPtr { } } -impl Drop for PageWithViewVec { +impl Drop for PageAndCppStrWithViewVec { fn drop(&mut self) { if self.inner != std::ptr::null_mut() { let helper = get_engine_store_server_helper(); - helper.gc_page_with_view_vec(self.inner, self.len); + helper.gc_page_and_cpp_str_with_view_vec(self.inner, self.len); self.inner = std::ptr::null_mut(); self.len = 0; } @@ -570,14 +571,14 @@ impl EngineStoreServerHelper { &self, start_page_id: BaseBuffView, end_page_id: BaseBuffView, - ) -> PageWithViewVec { + ) -> PageAndCppStrWithViewVec { debug_assert!(self.fn_handle_scan_page.is_some()); unsafe { (self.fn_handle_scan_page.into_inner())(self.inner, start_page_id, end_page_id) } } - pub fn gc_page_with_view_vec(&self, arg1: *mut PageWithView, arg2: u64) { - 
debug_assert!(self.fn_gc_page_with_view_vec.is_some()); - unsafe { (self.fn_gc_page_with_view_vec.into_inner())(arg1, arg2) } + pub fn gc_page_and_cpp_str_with_view_vec(&self, arg1: *mut PageAndCppStrWithView, arg2: u64) { + debug_assert!(self.fn_gc_page_and_cpp_str_with_view_vec.is_some()); + unsafe { (self.fn_gc_page_and_cpp_str_with_view_vec.into_inner())(arg1, arg2) } } pub fn purge_pagestorage(&self) { diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index cb1e2901c62..4d50a779304 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -13,7 +13,7 @@ use std::{ }; use collections::HashMap; -use engine_tiflash::FsStatsExt; +use engine_tiflash::{FsStatsExt, RawPSWriteBatchPtr, RawPSWriteBatchWrapper}; use engine_traits::{RaftEngine, SstMetaInfo, CF_RAFT}; use kvproto::{ metapb::Region, @@ -69,6 +69,18 @@ impl Into for ffi_interfaces::StoreStats { } } +impl From for RawPSWriteBatchWrapper { + fn from(src: RawCppPtr) -> Self { + let result = RawPSWriteBatchWrapper { + ptr: src.ptr, + type_: src.type_, + }; + let mut src = src; + src.ptr = std::ptr::null_mut(); + result + } +} + pub struct TiFlashFFIHub { pub engine_store_server_helper: &'static EngineStoreServerHelper, } @@ -80,6 +92,74 @@ impl engine_tiflash::FFIHubInner for TiFlashFFIHub { .handle_compute_store_stats() .into() } + + fn create_write_batch(&self) -> RawPSWriteBatchWrapper { + self.engine_store_server_helper.create_write_batch().into() + } + + fn destroy_write_batch(&self, wb_wrapper: &RawPSWriteBatchWrapper) { + self.engine_store_server_helper + .gc_raw_cpp_ptr(wb_wrapper.ptr, wb_wrapper.type_); + } + + fn consume_write_batch(&self, wb: RawPSWriteBatchPtr) { + self.engine_store_server_helper.consume_write_batch(wb) + } + + fn write_batch_size(&self, wb: RawPSWriteBatchPtr) -> usize { + self.engine_store_server_helper.write_batch_size(wb) as usize + } + + fn write_batch_is_empty(&self, wb: RawPSWriteBatchPtr) -> bool { + 
self.engine_store_server_helper.write_batch_is_empty(wb) != 0 + } + + fn write_batch_merge(&self, lwb: RawPSWriteBatchPtr, rwb: RawPSWriteBatchPtr) { + self.engine_store_server_helper.write_batch_merge(lwb, rwb) + } + + fn write_batch_clear(&self, wb: RawPSWriteBatchPtr) { + self.engine_store_server_helper.write_batch_clear(wb) + } + + fn write_batch_put_page(&self, wb: RawPSWriteBatchPtr, page_id: &[u8], page: &[u8]) { + self.engine_store_server_helper + .write_batch_put_page(wb, page_id.into(), page.into()) + } + + fn write_batch_del_page(&self, wb: RawPSWriteBatchPtr, page_id: &[u8]) { + self.engine_store_server_helper + .write_batch_del_page(wb, page_id.into()) + } + + fn read_page(&self, page_id: &[u8]) -> Option> { + let value = self.engine_store_server_helper.read_page(page_id.into()); + return if value.view.len == 0 { + None + } else { + Some(value.view.to_slice().to_vec()) + }; + } + + fn scan_page( + &self, + start_page_id: &[u8], + end_page_id: &[u8], + f: &mut dyn FnMut(&[u8], &[u8]) -> engine_traits::Result, + ) { + let values = self + .engine_store_server_helper + .scan_page(start_page_id.into(), end_page_id.into()); + for i in 0..values.len { + let value = unsafe { &*values.inner.offset(i as isize) }; + if value.page_view.len != 0 { + f( + &value.key_view.to_slice().to_vec(), + &value.page_view.to_slice().to_vec(), + ); + } + } + } } pub struct PtrWrapper(RawCppPtr); diff --git a/engine_store_ffi/src/ps_engine.rs b/engine_store_ffi/src/ps_engine.rs index 89d50ba9eb7..dd555dbaf18 100644 --- a/engine_store_ffi/src/ps_engine.rs +++ b/engine_store_ffi/src/ps_engine.rs @@ -268,8 +268,11 @@ impl PSEngine { let values = helper.scan_page(start_key.into(), end_key.into()); for i in 0..values.len { let value = unsafe { &*values.inner.offset(i as isize) }; - if value.view.len != 0 { - if !f(&[], &value.view.to_slice().to_vec())? { + if value.page_view.len != 0 { + if !f( + &value.key_view.to_slice().to_vec(), + &value.page_view.to_slice().to_vec(), + )? 
{ break; } } @@ -335,8 +338,6 @@ impl RaftEngineReadOnly for PSEngine { let start_key = keys::raft_log_key(region_id, low); let end_key = keys::raft_log_key(region_id, high); - let mut count = 1; - self.scan(&start_key, &end_key, |_, page| { let mut entry = Entry::default(); entry.merge_from_bytes(page)?; @@ -362,7 +363,6 @@ impl RaftEngineReadOnly for PSEngine { } fn is_empty(&self) -> Result { - let mut is_empty = true; Ok(self.is_empty()) } diff --git a/engine_tiflash/src/engine.rs b/engine_tiflash/src/engine.rs index 29118a22023..ee6d9d68e86 100644 --- a/engine_tiflash/src/engine.rs +++ b/engine_tiflash/src/engine.rs @@ -3,8 +3,9 @@ #![allow(dead_code)] #![allow(unused_variables)] use std::{ - fmt::Formatter, + fmt::{self, Debug, Formatter}, fs, + ops::Deref, path::Path, sync::{ atomic::{AtomicUsize, Ordering}, @@ -14,8 +15,8 @@ use std::{ use engine_rocks::{RocksDbVector, RocksEngineIterator, RocksSnapshot}; use engine_traits::{ - Checkpointable, Checkpointer, Error, IterOptions, Iterable, KvEngine, Peekable, ReadOptions, - Result, SyncMutable, + Checkpointable, Checkpointer, DbVector, Error, IterOptions, Iterable, KvEngine, Peekable, + ReadOptions, Result, SyncMutable, }; use rocksdb::{Writable, DB}; use tikv_util::box_err; @@ -28,8 +29,48 @@ pub struct FsStatsExt { pub available: u64, } +pub type RawPSWriteBatchPtr = *mut ::std::os::raw::c_void; +pub type RawPSWriteBatchWrapperTag = u32; + +// This is just a copy from engine_store_ffi::RawCppPtr +#[repr(C)] +#[derive(Debug)] +pub struct RawPSWriteBatchWrapper { + pub ptr: RawPSWriteBatchPtr, + pub type_: RawPSWriteBatchWrapperTag, +} + +unsafe impl Send for RawPSWriteBatchWrapper {} + pub trait FFIHubInner { fn get_store_stats(&self) -> FsStatsExt; + + fn create_write_batch(&self) -> RawPSWriteBatchWrapper; + + fn destroy_write_batch(&self, wb_wrapper: &RawPSWriteBatchWrapper); + + fn consume_write_batch(&self, wb: RawPSWriteBatchPtr); + + fn write_batch_size(&self, wb: RawPSWriteBatchPtr) -> usize; + + fn 
write_batch_is_empty(&self, wb: RawPSWriteBatchPtr) -> bool; + + fn write_batch_merge(&self, lwb: RawPSWriteBatchPtr, rwb: RawPSWriteBatchPtr); + + fn write_batch_clear(&self, wb: RawPSWriteBatchPtr); + + fn write_batch_put_page(&self, wb: RawPSWriteBatchPtr, page_id: &[u8], page: &[u8]); + + fn write_batch_del_page(&self, wb: RawPSWriteBatchPtr, page_id: &[u8]); + + fn read_page(&self, page_id: &[u8]) -> Option>; + + fn scan_page( + &self, + start_page_id: &[u8], + end_page_id: &[u8], + f: &mut dyn FnMut(&[u8], &[u8]) -> Result, + ); } pub trait FFIHub: FFIHubInner + Send + Sync {} @@ -176,18 +217,71 @@ impl KvEngine for RocksEngine { impl Iterable for RocksEngine { type Iterator = RocksEngineIterator; + #[cfg(not(any(test, feature = "testexport")))] + fn scan( + &self, + cf: &str, + start_key: &[u8], + end_key: &[u8], + fill_cache: bool, + f: F, + ) -> Result<()> + where + F: FnMut(&[u8], &[u8]) -> Result, + { + let mut f = f; + self.ffi_hub + .as_ref() + .unwrap() + .scan_page(start_key.into(), end_key.into(), &mut f); + Ok(()) + } + fn iterator_opt(&self, cf: &str, opts: IterOptions) -> Result { self.rocks.iterator_opt(cf, opts) } } +pub struct PsDbVector(Vec); + +impl PsDbVector { + pub fn from_raw(raw: Vec) -> PsDbVector { + PsDbVector(raw) + } +} + +impl DbVector for PsDbVector {} + +impl Deref for PsDbVector { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + &self.0 + } +} + +impl Debug for PsDbVector { + fn fmt(&self, formatter: &mut Formatter<'_>) -> fmt::Result { + write!(formatter, "{:?}", &**self) + } +} + +impl<'a> PartialEq<&'a [u8]> for PsDbVector { + fn eq(&self, rhs: &&[u8]) -> bool { + **rhs == **self + } +} + impl Peekable for RocksEngine { + #[cfg(any(test, feature = "testexport"))] type DbVector = RocksDbVector; + #[cfg(any(test, feature = "testexport"))] fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { self.rocks.get_value_opt(opts, key) } + #[cfg(any(test, feature = "testexport"))] fn get_value_cf_opt( &self, opts: 
&ReadOptions, @@ -196,6 +290,28 @@ impl Peekable for RocksEngine { ) -> Result> { self.rocks.get_value_cf_opt(opts, cf, key) } + + #[cfg(not(any(test, feature = "testexport")))] + type DbVector = PsDbVector; + + #[cfg(not(any(test, feature = "testexport")))] + fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { + let result = self.ffi_hub.as_ref().unwrap().read_page(key); + return match result { + None => Ok(None), + Some(v) => Ok(Some(PsDbVector::from_raw(v))), + }; + } + + #[cfg(not(any(test, feature = "testexport")))] + fn get_value_cf_opt( + &self, + opts: &ReadOptions, + cf: &str, + key: &[u8], + ) -> Result> { + self.get_value_opt(opts, key) + } } impl RocksEngine { @@ -205,6 +321,7 @@ impl RocksEngine { } impl SyncMutable for RocksEngine { + #[cfg(any(test, feature = "testexport"))] fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { if self.do_write(engine_traits::CF_DEFAULT, key) { return self.rocks.get_sync_db().put(key, value).map_err(r2e); @@ -212,6 +329,7 @@ impl SyncMutable for RocksEngine { Ok(()) } + #[cfg(any(test, feature = "testexport"))] fn put_cf(&self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { if self.do_write(cf, key) { let db = self.rocks.get_sync_db(); @@ -225,6 +343,7 @@ impl SyncMutable for RocksEngine { Ok(()) } + #[cfg(any(test, feature = "testexport"))] fn delete(&self, key: &[u8]) -> Result<()> { if self.do_write(engine_traits::CF_DEFAULT, key) { return self.rocks.get_sync_db().delete(key).map_err(r2e); @@ -232,6 +351,7 @@ impl SyncMutable for RocksEngine { Ok(()) } + #[cfg(any(test, feature = "testexport"))] fn delete_cf(&self, cf: &str, key: &[u8]) -> Result<()> { if self.do_write(cf, key) { let db = self.rocks.get_sync_db(); @@ -241,6 +361,70 @@ impl SyncMutable for RocksEngine { Ok(()) } + #[cfg(not(any(test, feature = "testexport")))] + fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { + if self.do_write(engine_traits::CF_DEFAULT, key) { + let ps_wb = 
self.ffi_hub.as_ref().unwrap().create_write_batch(); + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_put_page(ps_wb.ptr, key, value); + self.ffi_hub + .as_ref() + .unwrap() + .consume_write_batch(ps_wb.ptr); + } + Ok(()) + } + + #[cfg(not(any(test, feature = "testexport")))] + fn put_cf(&self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { + if self.do_write(cf, key) { + let ps_wb = self.ffi_hub.as_ref().unwrap().create_write_batch(); + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_put_page(ps_wb.ptr, key, value); + self.ffi_hub + .as_ref() + .unwrap() + .consume_write_batch(ps_wb.ptr); + } + Ok(()) + } + + #[cfg(not(any(test, feature = "testexport")))] + fn delete(&self, key: &[u8]) -> Result<()> { + if self.do_write(engine_traits::CF_DEFAULT, key) { + let ps_wb = self.ffi_hub.as_ref().unwrap().create_write_batch(); + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_del_page(ps_wb.ptr, key); + self.ffi_hub + .as_ref() + .unwrap() + .consume_write_batch(ps_wb.ptr); + } + Ok(()) + } + + #[cfg(not(any(test, feature = "testexport")))] + fn delete_cf(&self, cf: &str, key: &[u8]) -> Result<()> { + if self.do_write(cf, key) { + let ps_wb = self.ffi_hub.as_ref().unwrap().create_write_batch(); + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_del_page(ps_wb.ptr, key); + self.ffi_hub + .as_ref() + .unwrap() + .consume_write_batch(ps_wb.ptr); + } + Ok(()) + } + fn delete_range(&self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { // do nothing Ok(()) diff --git a/engine_tiflash/src/lib.rs b/engine_tiflash/src/lib.rs index c2e5e5ef838..23c6014cdb6 100644 --- a/engine_tiflash/src/lib.rs +++ b/engine_tiflash/src/lib.rs @@ -54,8 +54,17 @@ mod status; pub use crate::status::*; mod table_properties; pub use crate::table_properties::*; + +#[cfg(any(test, feature = "testexport"))] mod write_batch; +#[cfg(any(test, feature = "testexport"))] pub use crate::write_batch::*; + +#[cfg(not(any(test, feature = "testexport")))] +mod ps_write_batch; 
+#[cfg(not(any(test, feature = "testexport")))] +pub use crate::ps_write_batch::*; + pub mod mvcc_properties; pub use crate::mvcc_properties::*; pub mod perf_context; diff --git a/engine_tiflash/src/ps_write_batch.rs b/engine_tiflash/src/ps_write_batch.rs new file mode 100644 index 00000000000..e76a0daa468 --- /dev/null +++ b/engine_tiflash/src/ps_write_batch.rs @@ -0,0 +1,368 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::Arc; + +use engine_traits::{self, Mutable, Result, WriteBatchExt, WriteOptions}; +use rocksdb::{Writable, WriteBatch as RawWriteBatch, DB}; + +use crate::{ + engine::RocksEngine, options::RocksWriteOptions, r2e, util::get_cf_handle, FFIHubInner, + RawPSWriteBatchWrapper, +}; + +const WRITE_BATCH_MAX_BATCH: usize = 16; +const WRITE_BATCH_LIMIT: usize = 16; + +impl WriteBatchExt for RocksEngine { + type WriteBatch = RocksWriteBatchVec; + + const WRITE_BATCH_MAX_KEYS: usize = 256; + + fn write_batch(&self) -> RocksWriteBatchVec { + RocksWriteBatchVec::new( + Arc::clone(self.as_inner()), + self.ffi_hub.clone(), + self.ffi_hub.as_ref().unwrap().create_write_batch(), + WRITE_BATCH_LIMIT, + 1, + self.support_multi_batch_write(), + ) + } + + fn write_batch_with_cap(&self, cap: usize) -> RocksWriteBatchVec { + RocksWriteBatchVec::with_unit_capacity( + self, + self.ffi_hub.as_ref().unwrap().create_write_batch(), + cap, + ) + } +} + +/// `RocksWriteBatchVec` is for method `MultiBatchWrite` of RocksDB, which +/// splits a large WriteBatch into many smaller ones and then any thread could +/// help to deal with these small WriteBatch when it is calling +/// `MultiBatchCommit` and wait the front writer to finish writing. +/// `MultiBatchWrite` will perform much better than traditional +/// `pipelined_write` when TiKV writes very large data into RocksDB. +/// We will remove this feature when `unordered_write` of RocksDB becomes more +/// stable and becomes compatible with Titan. 
+pub struct RocksWriteBatchVec { + pub db: Arc, + pub wbs: Vec, + pub ffi_hub: Option>, + pub ps_wb: RawPSWriteBatchWrapper, + save_points: Vec, + index: usize, + batch_size_limit: usize, + support_write_batch_vec: bool, +} + +impl Drop for RocksWriteBatchVec { + fn drop(&mut self) { + if !self.ps_wb.ptr.is_null() { + self.ffi_hub + .as_ref() + .unwrap() + .destroy_write_batch(&self.ps_wb); + } + self.ps_wb.ptr = std::ptr::null_mut(); + } +} + +impl RocksWriteBatchVec { + pub fn new( + db: Arc, + ffi_hub: Option>, + ps_wb: RawPSWriteBatchWrapper, + batch_size_limit: usize, + cap: usize, + support_write_batch_vec: bool, + ) -> RocksWriteBatchVec { + let wb = RawWriteBatch::with_capacity(cap); + RocksWriteBatchVec { + db, + wbs: vec![wb], + ffi_hub, + ps_wb, + save_points: vec![], + index: 0, + batch_size_limit, + support_write_batch_vec, + } + } + + pub fn with_unit_capacity( + engine: &RocksEngine, + ps_wb: RawPSWriteBatchWrapper, + cap: usize, + ) -> RocksWriteBatchVec { + Self::new( + engine.as_inner().clone(), + engine.ffi_hub.clone(), + ps_wb, + WRITE_BATCH_LIMIT, + cap, + engine.support_multi_batch_write(), + ) + } + + pub fn as_inner(&self) -> &[RawWriteBatch] { + &self.wbs[0..=self.index] + } + + pub fn get_db(&self) -> &DB { + self.db.as_ref() + } + + /// `check_switch_batch` will split a large WriteBatch into many smaller + /// ones. This is to avoid a large WriteBatch blocking write_thread too + /// long. 
+ #[inline(always)] + fn check_switch_batch(&mut self) { + if self.support_write_batch_vec + && self.batch_size_limit > 0 + && self.wbs[self.index].count() >= self.batch_size_limit + { + self.index += 1; + if self.index >= self.wbs.len() { + self.wbs.push(RawWriteBatch::default()); + } + } + } +} + +impl engine_traits::WriteBatch for RocksWriteBatchVec { + fn write_opt(&mut self, opts: &WriteOptions) -> Result { + // write into ps + self.ffi_hub + .as_ref() + .unwrap() + .consume_write_batch(self.ps_wb.ptr); + Ok(self + .ffi_hub + .as_ref() + .unwrap() + .write_batch_size(self.ps_wb.ptr) as u64) + } + + fn data_size(&self) -> usize { + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_size(self.ps_wb.ptr) + } + + fn count(&self) -> usize { + // FIXME + 0 + } + + fn is_empty(&self) -> bool { + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_is_empty(self.ps_wb.ptr) + } + + fn should_write_to_engine(&self) -> bool { + // Disable TiKV's logic, and using Proxy's instead. + false + } + + fn clear(&mut self) { + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_clear(self.ps_wb.ptr); + } + + fn set_save_point(&mut self) { + self.wbs[self.index].set_save_point(); + self.save_points.push(self.index); + } + + fn pop_save_point(&mut self) -> Result<()> { + if let Some(x) = self.save_points.pop() { + return self.wbs[x].pop_save_point().map_err(r2e); + } + Err(r2e("no save point")) + } + + fn rollback_to_save_point(&mut self) -> Result<()> { + if let Some(x) = self.save_points.pop() { + for i in x + 1..=self.index { + self.wbs[i].clear(); + } + self.index = x; + return self.wbs[x].rollback_to_save_point().map_err(r2e); + } + Err(r2e("no save point")) + } + + fn merge(&mut self, other: Self) -> Result<()> { + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_merge(self.ps_wb.ptr, other.ps_wb.ptr); + Ok(()) + } +} + +impl RocksWriteBatchVec { + fn do_write(&self, cf: &str, key: &[u8]) -> bool { + crate::do_write(cf, key) + } +} + +impl Mutable for RocksWriteBatchVec { 
+ fn put(&mut self, key: &[u8], value: &[u8]) -> Result<()> { + if !self.do_write(engine_traits::CF_DEFAULT, key) { + return Ok(()); + } + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_put_page(self.ps_wb.ptr, key, value); + Ok(()) + } + + fn put_cf(&mut self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { + if !self.do_write(cf, key) { + return Ok(()); + } + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_put_page(self.ps_wb.ptr, key, value); + Ok(()) + } + + fn delete(&mut self, key: &[u8]) -> Result<()> { + if !self.do_write(engine_traits::CF_DEFAULT, key) { + return Ok(()); + } + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_del_page(self.ps_wb.ptr, key); + Ok(()) + } + + fn delete_cf(&mut self, cf: &str, key: &[u8]) -> Result<()> { + if !self.do_write(cf, key) { + return Ok(()); + } + self.ffi_hub + .as_ref() + .unwrap() + .write_batch_del_page(self.ps_wb.ptr, key); + Ok(()) + } + + fn delete_range(&mut self, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + Ok(()) + } + + fn delete_range_cf(&mut self, cf: &str, begin_key: &[u8], end_key: &[u8]) -> Result<()> { + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use engine_traits::{Peekable, WriteBatch, CF_DEFAULT}; + use rocksdb::DBOptions as RawDBOptions; + use tempfile::Builder; + + use super::{ + super::{util::new_engine_opt, RocksDbOptions}, + *, + }; + use crate::RocksCfOptions; + + #[test] + fn test_should_write_to_engine_with_pipeline_write_mode() { + let path = Builder::new() + .prefix("test-should-write-to-engine") + .tempdir() + .unwrap(); + let opt = RawDBOptions::default(); + opt.enable_unordered_write(false); + opt.enable_pipelined_write(true); + opt.enable_multi_batch_write(false); + let engine = new_engine_opt( + path.path().join("db").to_str().unwrap(), + RocksDbOptions::from_raw(opt), + vec![(CF_DEFAULT, RocksCfOptions::default())], + ) + .unwrap(); + assert!( + !engine + .as_inner() + .get_db_options() + .is_enable_multi_batch_write() + ); + let mut wb = 
engine.write_batch(); + for _i in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + wb.write().unwrap(); + + let v = engine.get_value(b"aaa").unwrap(); + + assert!(v.is_some()); + assert_eq!(v.unwrap(), b"bbb"); + let mut wb = RocksWriteBatchVec::with_unit_capacity(&engine, 1024); + for _i in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + wb.clear(); + assert!(!wb.should_write_to_engine()); + } + + #[test] + fn test_should_write_to_engine_with_multi_batch_write_mode() { + let path = Builder::new() + .prefix("test-should-write-to-engine") + .tempdir() + .unwrap(); + let opt = RawDBOptions::default(); + opt.enable_unordered_write(false); + opt.enable_pipelined_write(false); + opt.enable_multi_batch_write(true); + let engine = new_engine_opt( + path.path().join("db").to_str().unwrap(), + RocksDbOptions::from_raw(opt), + vec![(CF_DEFAULT, RocksCfOptions::default())], + ) + .unwrap(); + assert!( + engine + .as_inner() + .get_db_options() + .is_enable_multi_batch_write() + ); + let mut wb = engine.write_batch(); + for _i in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + let mut wb = RocksWriteBatchVec::with_unit_capacity(&engine, 1024); + for _i in 0..WRITE_BATCH_MAX_BATCH * WRITE_BATCH_LIMIT { + wb.put(b"aaa", b"bbb").unwrap(); + } + assert!(!wb.should_write_to_engine()); + wb.put(b"aaa", b"bbb").unwrap(); + assert!(wb.should_write_to_engine()); + wb.clear(); + assert!(!wb.should_write_to_engine()); + } +} diff --git a/engine_tiflash/src/raft_engine.rs b/engine_tiflash/src/raft_engine.rs index da15b1708b8..dce56148ae6 100644 --- 
a/engine_tiflash/src/raft_engine.rs +++ b/engine_tiflash/src/raft_engine.rs @@ -220,7 +220,11 @@ impl RaftEngine for RocksEngine { type LogBatch = RocksWriteBatchVec; fn log_batch(&self, capacity: usize) -> Self::LogBatch { - RocksWriteBatchVec::with_unit_capacity(self, capacity) + RocksWriteBatchVec::with_unit_capacity( + self, + self.ffi_hub.as_ref().unwrap().create_write_batch(), + capacity, + ) } fn sync(&self) -> Result<()> { diff --git a/engine_tiflash/src/write_batch.rs b/engine_tiflash/src/write_batch.rs index 2f62271e3b3..825e6cf1d33 100644 --- a/engine_tiflash/src/write_batch.rs +++ b/engine_tiflash/src/write_batch.rs @@ -5,7 +5,10 @@ use std::sync::Arc; use engine_traits::{self, Mutable, Result, WriteBatchExt, WriteOptions}; use rocksdb::{Writable, WriteBatch as RawWriteBatch, DB}; -use crate::{engine::RocksEngine, options::RocksWriteOptions, r2e, util::get_cf_handle}; +use crate::{ + engine::RocksEngine, options::RocksWriteOptions, r2e, util::get_cf_handle, FFIHubInner, + RawPSWriteBatchWrapper, +}; const WRITE_BATCH_MAX_BATCH: usize = 16; const WRITE_BATCH_LIMIT: usize = 16; @@ -18,6 +21,11 @@ impl WriteBatchExt for RocksEngine { fn write_batch(&self) -> RocksWriteBatchVec { RocksWriteBatchVec::new( Arc::clone(self.as_inner()), + self.ffi_hub.clone(), + RawPSWriteBatchWrapper { + ptr: std::ptr::null_mut(), + type_: 0, + }, WRITE_BATCH_LIMIT, 1, self.support_multi_batch_write(), @@ -25,7 +33,14 @@ impl WriteBatchExt for RocksEngine { } fn write_batch_with_cap(&self, cap: usize) -> RocksWriteBatchVec { - RocksWriteBatchVec::with_unit_capacity(self, cap) + RocksWriteBatchVec::with_unit_capacity( + self, + RawPSWriteBatchWrapper { + ptr: std::ptr::null_mut(), + type_: 0, + }, + cap, + ) } } @@ -49,6 +64,8 @@ pub struct RocksWriteBatchVec { impl RocksWriteBatchVec { pub fn new( db: Arc, + ffi_hub: Option>, + ps_wb: RawPSWriteBatchWrapper, batch_size_limit: usize, cap: usize, support_write_batch_vec: bool, @@ -64,9 +81,15 @@ impl RocksWriteBatchVec { } } - 
pub fn with_unit_capacity(engine: &RocksEngine, cap: usize) -> RocksWriteBatchVec { + pub fn with_unit_capacity( + engine: &RocksEngine, + ps_wb: RawPSWriteBatchWrapper, + cap: usize, + ) -> RocksWriteBatchVec { Self::new( engine.as_inner().clone(), + engine.ffi_hub.clone(), + ps_wb, WRITE_BATCH_LIMIT, cap, engine.support_multi_batch_write(), @@ -278,7 +301,14 @@ mod tests { assert!(v.is_some()); assert_eq!(v.unwrap(), b"bbb"); - let mut wb = RocksWriteBatchVec::with_unit_capacity(&engine, 1024); + let mut wb = RocksWriteBatchVec::with_unit_capacity( + &engine, + RawPSWriteBatchWrapper { + ptr: std::ptr::null_mut(), + type_: 0, + }, + 1024, + ); for _i in 0..RocksEngine::WRITE_BATCH_MAX_KEYS { wb.put(b"aaa", b"bbb").unwrap(); } @@ -318,7 +348,14 @@ mod tests { assert!(!wb.should_write_to_engine()); wb.put(b"aaa", b"bbb").unwrap(); assert!(wb.should_write_to_engine()); - let mut wb = RocksWriteBatchVec::with_unit_capacity(&engine, 1024); + let mut wb = RocksWriteBatchVec::with_unit_capacity( + &engine, + RawPSWriteBatchWrapper { + ptr: std::ptr::null_mut(), + type_: 0, + }, + 1024, + ); for _i in 0..WRITE_BATCH_MAX_BATCH * WRITE_BATCH_LIMIT { wb.put(b"aaa", b"bbb").unwrap(); } diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index a482d72f273..71e700e96e0 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -723,7 +723,7 @@ pub fn gen_engine_store_server_helper( fn_write_batch_clear: None, fn_consume_write_batch: None, fn_handle_read_page: None, - fn_gc_page_with_view_vec: None, + fn_gc_page_and_cpp_str_with_view_vec: None, fn_handle_purge_pagestorage: None, fn_handle_scan_page: None, fn_handle_seek_ps_key: None, diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version index c795931123c..e93b27a0a63 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version +++ 
b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version @@ -1,3 +1,3 @@ #pragma once #include -namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 4954147441045435430ull; } \ No newline at end of file +namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 3525220209235231360ull; } \ No newline at end of file diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h index 190de54c6c5..cb2e76d5b66 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h @@ -91,8 +91,15 @@ struct PageWithView { BaseBuffView view; }; -struct PageWithViewVec { - PageWithView *inner; +struct PageAndCppStrWithView { + RawCppPtr page; + RawCppPtr key; + BaseBuffView page_view; + BaseBuffView key_view; +}; + +struct PageAndCppStrWithViewVec { + PageAndCppStrWithView *inner; const uint64_t len; }; @@ -225,9 +232,10 @@ struct EngineStoreServerHelper { void (*fn_consume_write_batch)(const EngineStoreServerWrap *, RawVoidPtr); PageWithView (*fn_handle_read_page)(const EngineStoreServerWrap *, BaseBuffView); - PageWithViewVec (*fn_handle_scan_page)(const EngineStoreServerWrap *, - BaseBuffView, BaseBuffView); - void (*fn_gc_page_with_view_vec)(PageWithView *inner, uint64_t len); + PageAndCppStrWithViewVec (*fn_handle_scan_page)(const EngineStoreServerWrap *, + BaseBuffView, BaseBuffView); + void (*fn_gc_page_and_cpp_str_with_view_vec)(PageAndCppStrWithView *, + uint64_t); void (*fn_handle_purge_pagestorage)(const EngineStoreServerWrap *); CppStrWithView (*fn_handle_seek_ps_key)(const EngineStoreServerWrap *, BaseBuffView); From f228c10bb496a6cd50d5b831d58b927110d799be Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Wed, 21 Dec 2022 23:27:55 +0800 Subject: [PATCH 040/115] addr err Signed-off-by: CalvinNeo --- new-mock-engine-store/src/mock_store.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/new-mock-engine-store/src/mock_store.rs 
b/new-mock-engine-store/src/mock_store.rs index e2f43a93b1c..dc992262fb6 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -1,6 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::sync::Arc; pub use std::{ cell::RefCell, collections::BTreeMap, @@ -1468,7 +1467,6 @@ unsafe extern "C" fn ffi_fast_add_peer( apply_state: apply_state_ptr, region: region_ptr, }); - return; }); if let Some(r) = ret { match r.status { @@ -1487,6 +1485,7 @@ unsafe extern "C" fn ffi_fast_add_peer( failed_add_peer_res(ffi_interfaces::FastAddPeerStatus::BadData) } +#[allow(clippy::single_element_loop)] pub fn move_data_from( engine_store_server: &mut EngineStoreServer, old_region_id: u64, From 3a254aad87a7a7d63aa4541cd028d905f639328f Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Fri, 23 Dec 2022 10:45:56 +0800 Subject: [PATCH 041/115] add tests for split Signed-off-by: CalvinNeo --- engine_store_ffi/src/observer.rs | 32 +++++++++++++++++------ proxy_tests/proxy/fast_add_peer.rs | 41 +++++++++++++++++++++++++++--- 2 files changed, 62 insertions(+), 11 deletions(-) diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index a8381e43941..e90b925c687 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -305,10 +305,13 @@ impl TiFlashObserver { MapEntry::Occupied(mut o) => { (is_first, has_already_inited) = if !o.get().inited_or_fallback.load(Ordering::SeqCst) { - // If `has_already_inited` is true, usually means we recover from a - // restart. So we have data in disk, but not in memory. - // TODO Maybe only check once if we are not from recover. - // If we do not, we can then remove logics in apply snapshot. + // If `has_already_inited` is true: + // 1. We recover from a restart, + // 2. The peer is created by TiKV like split; + // So we have data in disk, but not in memory. + // In these cases, we need to check everytime. 
+ + // TODO We can then remove logics in apply snapshot. // This is because if the next maybe_fast_path after apply snapshot // will have has_already_inited == true, which leads to normal // MsgAppend. @@ -322,9 +325,9 @@ impl TiFlashObserver { }; // TODO include create is_replicated = o.get().replicated_or_created.load(Ordering::SeqCst); - if is_first { - #[cfg(any(test, feature = "testexport"))] - { + #[cfg(any(test, feature = "testexport"))] + { + if is_first { info!("fast path: ongoing {}:{} {}, MsgAppend skipped", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), @@ -334,6 +337,16 @@ impl TiFlashObserver { "has_already_inited" => has_already_inited, "is_first" => is_first, ); + } else { + info!("fast path: ongoing {}:{} {}, MsgAppend accepted", + self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, + "is_replicated" => is_replicated, + "has_already_inited" => has_already_inited, + "is_first" => is_first, + ); } } let last = o.get().snapshot_inflight.load(Ordering::SeqCst); @@ -372,7 +385,7 @@ impl TiFlashObserver { #[cfg(any(test, feature = "testexport"))] { info!( - "fast path: normal MsgAppend of {}:{} {}", + "fast path: ongoing {}:{} {}, normal MsgAppend", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), @@ -1206,6 +1219,9 @@ impl RegionChangeObserver for TiFlashObs v.insert(Arc::new(c)); } }; + info!("fast path: ongoing {}:{} {}, peer created", + self.store_id, region_id, "NA"; + ); // TODO remove unwrap self.access_cached_region_info_mut(region_id, f).unwrap(); } diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 4f520344efa..6c417916ac4 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -385,6 +385,43 @@ fn test_apply_snapshot() { 
fail::remove("before_tiflash_check_double_write"); } +#[test] +fn test_split_no_fast_add() { + let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); + pd_client.disable_default_operator(); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + + tikv_util::set_panic_hook(true, "./"); + // Can always apply snapshot immediately + fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); + cluster.cfg.raft_store.right_derive_when_split = true; + + let _ = cluster.run(); + + // Compose split keys + cluster.must_put(b"k1", b"v1"); + cluster.must_put(b"k3", b"v3"); + check_key(&cluster, b"k1", b"v1", Some(true), None, None); + check_key(&cluster, b"k3", b"v3", Some(true), None, None); + let r1 = cluster.get_region(b"k1"); + let r3 = cluster.get_region(b"k3"); + assert_eq!(r1.get_id(), r3.get_id()); + + fail::cfg("go_fast_path_succeed", "panic").unwrap(); + cluster.must_split(&r1, b"k2"); + must_wait_until_cond_node(&cluster, 1000, None, &|states: &States| -> bool { + states.in_disk_region_state.get_region().get_peers().len() == 3 + }); + let r1_new = cluster.get_region(b"k1"); // 1000 + let r3_new = cluster.get_region(b"k3"); // 1 + cluster.must_put(b"k0", b"v0"); + check_key(&cluster, b"k0", b"v0", Some(true), None, None); + + fail::remove("go_fast_path_succeed"); + fail::remove("on_can_apply_snapshot"); + cluster.shutdown(); +} + #[test] fn test_split_merge() { let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); @@ -398,12 +435,11 @@ fn test_split_merge() { let _ = cluster.run_conf_change(); + // Compose split keys cluster.must_put(b"k1", b"v1"); cluster.must_put(b"k3", b"v3"); - check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1])); check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![1])); - let r1 = cluster.get_region(b"k1"); let r3 = cluster.get_region(b"k3"); assert_eq!(r1.get_id(), r3.get_id()); @@ -411,7 +447,6 @@ fn test_split_merge() { cluster.must_split(&r1, b"k2"); let r1_new = cluster.get_region(b"k1"); // 
1000 let r3_new = cluster.get_region(b"k3"); // 1 - let r1_id = r1_new.get_id(); let r3_id = r3_new.get_id(); debug!("r1_new {} r3_new {}", r1_id, r3_id); From ddb675943d9b8562ad31d4e55710797608bd3cae Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Fri, 23 Dec 2022 11:05:11 +0800 Subject: [PATCH 042/115] add immutable Signed-off-by: CalvinNeo --- engine_store_ffi/src/observer.rs | 85 +++++++++++++++++++------------- 1 file changed, 51 insertions(+), 34 deletions(-) diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index e90b925c687..40a8e55357e 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -222,6 +222,30 @@ impl TiFlashObserver { Ok(()) } + pub fn access_cached_region_info>)>( + &self, + region_id: u64, + mut f: F, + ) -> RaftStoreResult<()> { + let slot_id = Self::slot_index(region_id); + let mut guard = match self.cached_region_info.get(slot_id).unwrap().read() { + Ok(g) => g, + Err(_) => return Err(box_err!("access_cached_region_info_mut poisoned")), + }; + f(guard.entry(region_id)); + Ok(()) + } + + pub fn get_is_first_or_fallback(&self, region_id: u64) -> bool { + let mut is_first = false; + let f = |info: MapEntry>| match info { + MapEntry::Occupied(o) => is_first = o.get().inited_or_fallback.load(vOrdering::SeqCst), + _ => (), + }; + self.access_cached_region_info(region_id, f); + is_first + } + pub fn remove_cached_region_info(&self, region_id: u64) { let slot_id = Self::slot_index(region_id); if let Ok(mut g) = self.cached_region_info.get(slot_id).unwrap().write() { @@ -325,30 +349,6 @@ impl TiFlashObserver { }; // TODO include create is_replicated = o.get().replicated_or_created.load(Ordering::SeqCst); - #[cfg(any(test, feature = "testexport"))] - { - if is_first { - info!("fast path: ongoing {}:{} {}, MsgAppend skipped", - self.store_id, region_id, new_peer_id; - "to_peer_id" => msg.get_to_peer().get_id(), - "from_peer_id" => msg.get_from_peer().get_id(), - "inner_msg" => ?inner_msg, 
- "is_replicated" => is_replicated, - "has_already_inited" => has_already_inited, - "is_first" => is_first, - ); - } else { - info!("fast path: ongoing {}:{} {}, MsgAppend accepted", - self.store_id, region_id, new_peer_id; - "to_peer_id" => msg.get_to_peer().get_id(), - "from_peer_id" => msg.get_from_peer().get_id(), - "inner_msg" => ?inner_msg, - "is_replicated" => is_replicated, - "has_already_inited" => has_already_inited, - "is_first" => is_first, - ); - } - } let last = o.get().snapshot_inflight.load(Ordering::SeqCst); if last != 0 { let current = SystemTime::now() @@ -378,20 +378,37 @@ impl TiFlashObserver { } } }; - // Can use immutable version. - self.access_cached_region_info_mut(region_id, f).unwrap(); - if !is_first { - #[cfg(any(test, feature = "testexport"))] - { - info!( - "fast path: ongoing {}:{} {}, normal MsgAppend", + if self.get_is_first_or_fallback(region_id) { + self.access_cached_region_info_mut(region_id, f).unwrap(); + } + + #[cfg(any(test, feature = "testexport"))] + { + if is_first { + info!("fast path: ongoing {}:{} {}, MsgAppend skipped", self.store_id, region_id, new_peer_id; - "to_peer_id" => msg.get_to_peer().get_id(), - "from_peer_id" => msg.get_from_peer().get_id(), - "inner_msg" => ?inner_msg, + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, + "is_replicated" => is_replicated, + "has_already_inited" => has_already_inited, + "is_first" => is_first, + ); + } else { + info!("fast path: ongoing {}:{} {}, MsgAppend accepted", + self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, + "is_replicated" => is_replicated, + "has_already_inited" => has_already_inited, + "is_first" => is_first, ); } + } + + if !is_first { return false; } From 0a58ca762c1bf0591656e28dd3b34583f05c669e Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Fri, 23 Dec 2022 11:42:59 +0800 
Subject: [PATCH 043/115] fix Signed-off-by: CalvinNeo --- engine_store_ffi/src/observer.rs | 34 ++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 59503dea63a..029f57de24a 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -13,7 +13,7 @@ use std::{ }; use collections::HashMap; -use engine_tiflash::FsStatsExt; +use engine_tiflash::{FsStatsExt, RawPSWriteBatchPtr, RawPSWriteBatchWrapper}; use engine_traits::{RaftEngine, SstMetaInfo, CF_RAFT}; use kvproto::{ metapb::Region, @@ -302,25 +302,26 @@ impl TiFlashObserver { Ok(()) } - pub fn access_cached_region_info>)>( + pub fn access_cached_region_info)>( &self, region_id: u64, mut f: F, - ) -> RaftStoreResult<()> { + ) { let slot_id = Self::slot_index(region_id); - let mut guard = match self.cached_region_info.get(slot_id).unwrap().read() { + let guard = match self.cached_region_info.get(slot_id).unwrap().read() { Ok(g) => g, - Err(_) => return Err(box_err!("access_cached_region_info_mut poisoned")), + Err(_) => panic!("access_cached_region_info poisoned!"), }; - f(guard.entry(region_id)); - Ok(()) + match guard.get(®ion_id) { + Some(g) => f(g.clone()), + None => (), + } } - pub fn get_is_first_or_fallback(&self, region_id: u64) -> bool { - let mut is_first = false; - let f = |info: MapEntry>| match info { - MapEntry::Occupied(o) => is_first = o.get().inited_or_fallback.load(vOrdering::SeqCst), - _ => (), + pub fn get_inited_or_fallback(&self, region_id: u64) -> Option { + let mut is_first: Option = None; + let f = |info: Arc| { + is_first = Some(info.inited_or_fallback.load(Ordering::SeqCst)); }; self.access_cached_region_info(region_id, f); is_first @@ -459,9 +460,12 @@ impl TiFlashObserver { } }; - if self.get_is_first_or_fallback(region_id) { - self.access_cached_region_info_mut(region_id, f).unwrap(); - } + match self.get_inited_or_fallback(region_id) { + 
Some(true) => { + is_first = false; + } + None | Some(false) => self.access_cached_region_info_mut(region_id, f).unwrap(), + }; #[cfg(any(test, feature = "testexport"))] { From c589200f75f5984317a14b881dfdbea393ba3380 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Fri, 23 Dec 2022 21:38:48 +0800 Subject: [PATCH 044/115] add part of mock pagestorage Signed-off-by: CalvinNeo --- Cargo.lock | 40 +++++ engine_tiflash/src/write_batch.rs | 10 +- new-mock-engine-store/Cargo.toml | 2 + new-mock-engine-store/src/lib.rs | 1 + .../src/mock_page_storage.rs | 137 ++++++++++++++++++ new-mock-engine-store/src/mock_store.rs | 69 ++++----- 6 files changed, 217 insertions(+), 42 deletions(-) create mode 100644 new-mock-engine-store/src/mock_page_storage.rs diff --git a/Cargo.lock b/Cargo.lock index 3d2fcca5501..f886d100ff6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -137,6 +137,12 @@ dependencies = [ "serde_json", ] +[[package]] +name = "assert-type-eq" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd49a41856ee21a0cfb2b1cfbfcca0f1d3e6c257c38939f0d6ecfaf177f2ea47" + [[package]] name = "async-channel" version = "1.6.1" @@ -2756,6 +2762,27 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "int-enum" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cff87d3cc4b79b4559e3c75068d64247284aceb6a038bd4bb38387f3f164476d" +dependencies = [ + "int-enum-impl", +] + +[[package]] +name = "int-enum-impl" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df1f2f068675add1a3fc77f5f5ab2e29290c841ee34d151abc007bce902e5d34" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "into_other" version = "0.0.1" @@ -3379,6 +3406,7 @@ name = "new-mock-engine-store" version = "0.0.1" dependencies = [ "api_version", + "assert-type-eq", "causal_ts", "collections", "concurrency_manager", @@ -3395,6 +3423,7 @@ 
dependencies = [ "futures 0.3.15", "grpcio", "grpcio-health", + "int-enum", "keys", "kvproto", "lazy_static", @@ -4150,6 +4179,17 @@ dependencies = [ "syn", ] +[[package]] +name = "proc-macro-crate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eda0fc3b0fb7c975631757e14d9049da17374063edb6ebbcbc54d880d4fe94e9" +dependencies = [ + "once_cell", + "thiserror", + "toml", +] + [[package]] name = "proc-macro-error" version = "1.0.4" diff --git a/engine_tiflash/src/write_batch.rs b/engine_tiflash/src/write_batch.rs index 825e6cf1d33..ea4143fb74a 100644 --- a/engine_tiflash/src/write_batch.rs +++ b/engine_tiflash/src/write_batch.rs @@ -22,10 +22,7 @@ impl WriteBatchExt for RocksEngine { RocksWriteBatchVec::new( Arc::clone(self.as_inner()), self.ffi_hub.clone(), - RawPSWriteBatchWrapper { - ptr: std::ptr::null_mut(), - type_: 0, - }, + self.ffi_hub.as_ref().unwrap().create_write_batch(), WRITE_BATCH_LIMIT, 1, self.support_multi_batch_write(), @@ -35,10 +32,7 @@ impl WriteBatchExt for RocksEngine { fn write_batch_with_cap(&self, cap: usize) -> RocksWriteBatchVec { RocksWriteBatchVec::with_unit_capacity( self, - RawPSWriteBatchWrapper { - ptr: std::ptr::null_mut(), - type_: 0, - }, + self.ffi_hub.as_ref().unwrap().create_write_batch(), cap, ) } diff --git a/new-mock-engine-store/Cargo.toml b/new-mock-engine-store/Cargo.toml index a3bdbf6a7a7..f34adc256b6 100644 --- a/new-mock-engine-store/Cargo.toml +++ b/new-mock-engine-store/Cargo.toml @@ -17,6 +17,7 @@ protobuf-codec = [ [dependencies] api_version = { workspace = true, default-features = false } +assert-type-eq = "0.1.0" causal_ts = { workspace = true } collections = { workspace = true } concurrency_manager = { workspace = true, default-features = false } @@ -33,6 +34,7 @@ file_system = { workspace = true, default-features = false } futures = { version = "0.3", features = ["thread-pool", "compat"] } grpcio = { version = "0.10", default-features = false, features = 
["openssl-vendored", "protobuf-codec"] } grpcio-health = { version = "0.10", default-features = false, features = ["protobuf-codec"] } +int-enum = "0.5" keys = { workspace = true, default-features = false } kvproto = { git = "https://github.com/pingcap/kvproto.git", default-features = false } diff --git a/new-mock-engine-store/src/lib.rs b/new-mock-engine-store/src/lib.rs index 5e5020a6309..e20fc61e261 100644 --- a/new-mock-engine-store/src/lib.rs +++ b/new-mock-engine-store/src/lib.rs @@ -3,6 +3,7 @@ #![feature(slice_take)] pub mod config; pub mod mock_cluster; +pub mod mock_page_storage; pub mod mock_store; pub mod node; pub mod server; diff --git a/new-mock-engine-store/src/mock_page_storage.rs b/new-mock-engine-store/src/mock_page_storage.rs new file mode 100644 index 00000000000..61328e41a5c --- /dev/null +++ b/new-mock-engine-store/src/mock_page_storage.rs @@ -0,0 +1,137 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{collections::btree_map::OccupiedEntry, sync::RwLock}; + +use collections::HashMap; +pub use engine_store_ffi::{ + interfaces::root::DB as ffi_interfaces, BaseBuffView, CppStrWithView, EngineStoreServerHelper, + PageAndCppStrWithView, PageAndCppStrWithViewVec, PageWithView, RaftStoreProxyFFIHelper, + RawCppPtr, RawVoidPtr, +}; + +use crate::mock_store::{into_engine_store_server_wrap, EngineStoreServerWrap, RawCppPtrTypeImpl}; + +#[derive(Default)] +pub struct MockPSWriteBatch { + pub data: HashMap, MockPSUniversalPage>, +} + +pub struct MockPSUniversalPage { + data: Vec, +} + +impl Into for BaseBuffView { + fn into(self) -> MockPSUniversalPage { + MockPSUniversalPage { + data: self.to_slice().to_owned(), + } + } +} + +#[derive(Default)] +pub struct MockPageStorage { + pub data: RwLock, MockPSUniversalPage>>, +} + +pub unsafe extern "C" fn ffi_mockps_create_write_batch() -> RawCppPtr { + let ptr = Box::into_raw(Box::new(MockPSWriteBatch::default())); + RawCppPtr { + ptr: ptr as RawVoidPtr, + type_: 
RawCppPtrTypeImpl::PSWriteBatch.into(), + } +} + +impl From for &mut MockPSWriteBatch { + fn from(value: RawVoidPtr) -> Self { + unsafe { &mut *(value as *mut MockPSWriteBatch) } + } +} + +pub unsafe extern "C" fn ffi_mockps_write_batch_put_page( + wb: RawVoidPtr, + page_id: BaseBuffView, + page: BaseBuffView, +) { + let wb: _ = <&mut MockPSWriteBatch as From>::from(wb); + wb.data.insert(page_id.to_slice().to_owned(), page.into()); +} + +pub unsafe extern "C" fn ffi_mockps_write_batch_del_page(wb: RawVoidPtr, page_id: BaseBuffView) { + let wb: _ = <&mut MockPSWriteBatch as From>::from(wb); + wb.data.remove(page_id.to_slice()); +} + +pub unsafe extern "C" fn ffi_mockps_write_batch_size(wb: RawVoidPtr) -> u64 { + let wb: _ = <&mut MockPSWriteBatch as From>::from(wb); + wb.data.len() +} + +pub unsafe extern "C" fn ffi_mockps_write_batch_is_empty(wb: RawVoidPtr) -> u8 { + let wb: _ = <&mut MockPSWriteBatch as From>::from(wb); + if wb.data.is_empty() { 1 } else { 0 } +} + +pub unsafe extern "C" fn ffi_mockps_write_batch_merge(lwb: RawVoidPtr, rwb: RawVoidPtr) { + let lwb: _ = <&mut MockPSWriteBatch as From>::from(lwb); + let rwb: _ = <&mut MockPSWriteBatch as From>::from(rwb); + lwb.data.extend(rwb.data.into_iter()); +} + +pub unsafe extern "C" fn ffi_mockps_write_batch_clear(wb: RawVoidPtr) { + let wb: _ = <&mut MockPSWriteBatch as From>::from(wb); + wb.data.clear(); +} + +pub unsafe extern "C" fn ffi_mockps_consume_write_batch( + wrap: *const ffi_interfaces::EngineStoreServerWrap, + wb: RawVoidPtr, +) { + let store = into_engine_store_server_wrap(wrap); + let wb: _ = <&mut MockPSWriteBatch as From>::from(wb); + let guard = store.engine_store_server.page_storage.data.write().unwrap(); + match guard.entry(key) { + std::collections::hash_map::Entry::Occupied(mut o) => o.insert(wb), + std::collections::hash_map::Entry::Vacant(v) => v.insert(wb), + } +} + +pub unsafe extern "C" fn ffi_mockps_handle_read_page( + wrap: *const ffi_interfaces::EngineStoreServerWrap, + page_id: 
BaseBuffView, +) -> PageWithView { + todo!() +} + +pub unsafe extern "C" fn ffi_mockps_handle_scan_page( + wrap: *const ffi_interfaces::EngineStoreServerWrap, + start_page_id: BaseBuffView, + end_page_id: BaseBuffView, +) -> PageAndCppStrWithViewVec { + todo!() +} + +pub unsafe extern "C" fn ffi_mockps_gc_page_and_cpp_str_with_view_vec( + arg1: *mut PageAndCppStrWithView, + arg2: u64, +) { + todo!() +} + +pub unsafe extern "C" fn ffi_mockps_handle_purge_pagestorage( + wrap: *const ffi_interfaces::EngineStoreServerWrap, +) { + todo!() +} + +pub unsafe extern "C" fn ffi_mockps_handle_seek_ps_key( + wrap: *const ffi_interfaces::EngineStoreServerWrap, + page_id: BaseBuffView, +) -> CppStrWithView { + todo!() +} + +pub unsafe extern "C" fn ffi_mockps_ps_is_empty( + wrap: *const ffi_interfaces::EngineStoreServerWrap, +) -> u8 { + todo!() +} diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index ce4abf48b4a..4cb4482ea71 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -11,6 +11,7 @@ pub use std::{ time::Duration, }; +use assert_type_eq; use collections::{HashMap, HashSet}; pub use engine_store_ffi::{ interfaces::root::DB as ffi_interfaces, EngineStoreServerHelper, RaftStoreProxyFFIHelper, @@ -21,6 +22,7 @@ pub use engine_traits::{ Engines, Iterable, KvEngine, Mutable, Peekable, RaftEngine, RaftLogBatch, SyncMutable, WriteBatch, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; +use int_enum::IntEnum; pub use kvproto::{ raft_cmdpb::AdminCmdType, raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, @@ -36,6 +38,7 @@ pub use crate::{ mock_cluster::{ must_get_equal, must_get_none, Cluster, ProxyConfig, Simulator, TestPdClient, TiFlashEngine, }, + mock_page_storage::*, server::ServerCluster, }; @@ -87,6 +90,7 @@ pub struct EngineStoreServer { pub proxy_compat: bool, pub mock_cfg: MockConfig, pub region_states: RefCell>, + pub page_storage: MockPageStorage, } 
impl EngineStoreServer { @@ -101,6 +105,7 @@ impl EngineStoreServer { proxy_compat: false, mock_cfg: MockConfig::default(), region_states: RefCell::new(Default::default()), + page_storage: Default::default(), } } @@ -717,13 +722,13 @@ pub fn gen_engine_store_server_helper( fn_set_pb_msg_by_bytes: Some(ffi_set_pb_msg_by_bytes), fn_handle_safe_ts_update: Some(ffi_handle_safe_ts_update), fn_fast_add_peer: Some(ffi_fast_add_peer), - fn_create_write_batch: None, - fn_write_batch_put_page: None, - fn_write_batch_del_page: None, - fn_write_batch_size: None, - fn_write_batch_is_empty: None, - fn_write_batch_merge: None, - fn_write_batch_clear: None, + fn_create_write_batch: Some(ffi_mockps_create_write_batch), + fn_write_batch_put_page: Some(ffi_mockps_write_batch_put_page), + fn_write_batch_del_page: Some(ffi_mockps_write_batch_del_page), + fn_write_batch_size: Some(ffi_mockps_write_batch_size), + fn_write_batch_is_empty: Some(ffi_mockps_write_batch_is_empty), + fn_write_batch_merge: Some(ffi_mockps_write_batch_merge), + fn_write_batch_clear: Some(ffi_mockps_write_batch_clear), fn_consume_write_batch: None, fn_handle_read_page: None, fn_gc_page_and_cpp_str_with_view_vec: None, @@ -734,7 +739,7 @@ pub fn gen_engine_store_server_helper( } } -unsafe fn into_engine_store_server_wrap( +pub unsafe fn into_engine_store_server_wrap( arg1: *const ffi_interfaces::EngineStoreServerWrap, ) -> &'static mut EngineStoreServerWrap { &mut *(arg1 as *mut EngineStoreServerWrap) @@ -763,37 +768,27 @@ unsafe extern "C" fn ffi_handle_write_raft_cmd( store.handle_write_raft_cmd(arg2, arg3) } -enum RawCppPtrTypeImpl { +#[repr(u32)] +#[derive(IntEnum, Copy, Clone)] +pub enum RawCppPtrTypeImpl { None = 0, - String, - PreHandledSnapshotWithBlock, - WakerNotifier, + String = 1, + PreHandledSnapshotWithBlock = 2, + WakerNotifier = 3, + PSWriteBatch = 4, + PSUniversalPage = 5, } -// TODO -#[allow(clippy::from_over_into)] -impl From for RawCppPtrTypeImpl { - fn from(o: ffi_interfaces::RawCppPtrType) -> 
Self { - match o { - 0 => RawCppPtrTypeImpl::None, - 1 => RawCppPtrTypeImpl::String, - 2 => RawCppPtrTypeImpl::PreHandledSnapshotWithBlock, - 3 => RawCppPtrTypeImpl::WakerNotifier, - _ => unreachable!(), - } +impl From for ffi_interfaces::RawCppPtrType { + fn from(value: RawCppPtrTypeImpl) -> Self { + assert_type_eq::assert_type_eq!(ffi_interfaces::RawCppPtrType, u32); + value.int_value() } } -// TODO remove this warn. -#[allow(clippy::from_over_into)] -impl Into for RawCppPtrTypeImpl { - fn into(self) -> ffi_interfaces::RawCppPtrType { - match self { - RawCppPtrTypeImpl::None => 0, - RawCppPtrTypeImpl::String => 1, - RawCppPtrTypeImpl::PreHandledSnapshotWithBlock => 2, - RawCppPtrTypeImpl::WakerNotifier => 3, - } +impl From for RawCppPtrTypeImpl { + fn from(value: ffi_interfaces::RawCppPtrType) -> Self { + RawCppPtrTypeImpl::from_int(value).unwrap() } } @@ -952,7 +947,7 @@ extern "C" fn ffi_gc_raw_cpp_ptr( ptr: ffi_interfaces::RawVoidPtr, tp: ffi_interfaces::RawCppPtrType, ) { - match RawCppPtrTypeImpl::from(tp) { + match tp.into() { RawCppPtrTypeImpl::None => {} RawCppPtrTypeImpl::String => unsafe { drop(Box::>::from_raw(ptr as *mut _)); @@ -963,6 +958,12 @@ extern "C" fn ffi_gc_raw_cpp_ptr( RawCppPtrTypeImpl::WakerNotifier => unsafe { drop(Box::from_raw(ptr as *mut ProxyNotifier)); }, + RawCppPtrTypeImpl::PSWriteBatch => unsafe { + drop(Box::from_raw(ptr as *mut MockPSWriteBatch)); + }, + RawCppPtrTypeImpl::PSUniversalPage => unsafe { + drop(Box::from_raw(ptr as *mut MockPSUniversalPage)); + }, } } From a8a471162f56d6530926a9c69b029f9f9590d3ea Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Fri, 23 Dec 2022 22:26:07 +0800 Subject: [PATCH 045/115] p Signed-off-by: CalvinNeo --- engine_store_ffi/src/interfaces.rs | 15 ++++++++- engine_store_ffi/src/lib.rs | 32 ++++++++++++++++++- engine_store_ffi/src/ps_engine.rs | 5 +-- .../ffi/src/RaftStoreProxyFFI/@version | 2 +- .../ffi/src/RaftStoreProxyFFI/ProxyFFI.h | 6 ++++ 5 files changed, 53 insertions(+), 7 deletions(-) 
diff --git a/engine_store_ffi/src/interfaces.rs b/engine_store_ffi/src/interfaces.rs index abd201d588b..4e4d40540c7 100644 --- a/engine_store_ffi/src/interfaces.rs +++ b/engine_store_ffi/src/interfaces.rs @@ -156,6 +156,12 @@ pub mod root { pub inner: *mut root::DB::PageAndCppStrWithView, pub len: u64, } + #[repr(C)] + #[derive(Debug)] + pub struct RawCppPtrArr { + pub inner: *mut root::DB::RawCppPtr, + pub len: u64, + } #[repr(u8)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub enum HttpRequestStatus { @@ -516,6 +522,13 @@ pub mod root { pub fn_gc_raw_cpp_ptr: ::std::option::Option< unsafe extern "C" fn(arg1: root::DB::RawVoidPtr, arg2: root::DB::RawCppPtrType), >, + pub fn_gc_raw_cpp_ptr_arr: ::std::option::Option< + unsafe extern "C" fn( + head: root::DB::RawVoidPtr, + arg1: root::DB::RawCppPtrType, + len: u64, + ), + >, pub fn_get_config: ::std::option::Option< unsafe extern "C" fn( arg1: *mut root::DB::EngineStoreServerWrap, @@ -551,7 +564,7 @@ pub mod root { ) -> root::DB::FastAddPeerRes, >, } - pub const RAFT_STORE_PROXY_VERSION: u64 = 3525220209235231360; + pub const RAFT_STORE_PROXY_VERSION: u64 = 13243545128523171780; pub const RAFT_STORE_PROXY_MAGIC_NUMBER: u32 = 324508639; } } diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index 6924024b342..cca8bf5328c 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -40,7 +40,7 @@ pub use self::interfaces::root::DB::{ FileEncryptionRes, FsStats, HttpRequestRes, HttpRequestStatus, KVGetStatus, PageAndCppStrWithView, PageAndCppStrWithViewVec, PageWithView, RaftCmdHeader, RaftProxyStatus, RaftStoreProxyFFIHelper, RawCppPtr, RawCppStringPtr, RawVoidPtr, SSTReaderPtr, StoreStats, - WriteCmdType, WriteCmdsView, + WriteCmdType, WriteCmdsView, RawCppPtrArr }; use self::interfaces::root::DB::{ ConstRawVoidPtr, RaftStoreProxyPtr, RawCppPtrType, RawRustPtr, SSTReaderInterfaces, SSTView, @@ -378,6 +378,29 @@ impl Drop for RawCppPtr { } } +impl RawCppPtrArr { + pub fn 
is_null(&self) -> bool { + unsafe { + (*self.inner).ptr.is_null() + } + } +} + +unsafe impl Send for RawCppPtrArr {} + +impl Drop for RawCppPtrArr { + fn drop(&mut self) { + unsafe { + if !self.is_null() { + let helper = get_engine_store_server_helper(); + helper.gc_raw_cpp_ptr_arr((*self.inner).ptr, (*self.inner).type_, self.len); + (*self.inner).ptr = std::ptr::null_mut(); + self.len = 0; + } + } + } +} + impl Drop for PageAndCppStrWithViewVec { fn drop(&mut self) { if self.inner != std::ptr::null_mut() { @@ -428,6 +451,13 @@ impl EngineStoreServerHelper { } } + fn gc_raw_cpp_ptr_arr(&self, head: *mut ::std::os::raw::c_void, tp: RawCppPtrType, len: u64) { + debug_assert!(self.fn_gc_raw_cpp_ptr_arr.is_some()); + unsafe { + (self.fn_gc_raw_cpp_ptr_arr.into_inner())(head, tp, len); + } + } + pub fn handle_compute_store_stats(&self) -> StoreStats { debug_assert!(self.fn_handle_compute_store_stats.is_some()); unsafe { (self.fn_handle_compute_store_stats.into_inner())(self.inner) } diff --git a/engine_store_ffi/src/ps_engine.rs b/engine_store_ffi/src/ps_engine.rs index dd555dbaf18..21018c9bb2a 100644 --- a/engine_store_ffi/src/ps_engine.rs +++ b/engine_store_ffi/src/ps_engine.rs @@ -4,12 +4,9 @@ #![allow(unused_variables)] use std::{ - fmt, - fmt::{Debug, Formatter}, - mem, slice, + fmt::{Debug, Formatter}, slice }; -use byteorder::{BigEndian, ByteOrder}; use engine_traits::{ Error, PerfContext, PerfContextExt, PerfContextKind, PerfLevel, RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch, RaftLogGcTask, Result, diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version index e93b27a0a63..42ad9c24dde 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version @@ -1,3 +1,3 @@ #pragma once #include -namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 3525220209235231360ull; } \ No newline at end of file +namespace DB { constexpr 
uint64_t RAFT_STORE_PROXY_VERSION = 13243545128523171780ull; } \ No newline at end of file diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h index cb2e76d5b66..3d497f7922d 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h @@ -103,6 +103,11 @@ struct PageAndCppStrWithViewVec { const uint64_t len; }; +struct RawCppPtrArr { + RawCppPtr *inner; + const uint64_t len; +}; + enum class HttpRequestStatus : uint8_t { Ok = 0, ErrorParam, @@ -258,6 +263,7 @@ struct EngineStoreServerHelper { BaseBuffView body); uint8_t (*fn_check_http_uri_available)(BaseBuffView); void (*fn_gc_raw_cpp_ptr)(RawVoidPtr, RawCppPtrType); + void (*fn_gc_raw_cpp_ptr_arr)(RawVoidPtr head, RawCppPtrType, uint64_t len); CppStrWithView (*fn_get_config)(EngineStoreServerWrap *, uint8_t full); void (*fn_set_store)(EngineStoreServerWrap *, BaseBuffView); void (*fn_set_pb_msg_by_bytes)(MsgPBType type, RawVoidPtr ptr, From 069d4c884f6fe6fcd8a801450e2b1f511048d612 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Fri, 23 Dec 2022 22:38:19 +0800 Subject: [PATCH 046/115] fix compile Signed-off-by: CalvinNeo --- .../src/mock_page_storage.rs | 23 ++++++++++++------- new-mock-engine-store/src/mock_store.rs | 4 ++-- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/new-mock-engine-store/src/mock_page_storage.rs b/new-mock-engine-store/src/mock_page_storage.rs index 61328e41a5c..e13ddd2c801 100644 --- a/new-mock-engine-store/src/mock_page_storage.rs +++ b/new-mock-engine-store/src/mock_page_storage.rs @@ -63,7 +63,7 @@ pub unsafe extern "C" fn ffi_mockps_write_batch_del_page(wb: RawVoidPtr, page_id pub unsafe extern "C" fn ffi_mockps_write_batch_size(wb: RawVoidPtr) -> u64 { let wb: _ = <&mut MockPSWriteBatch as From>::from(wb); - wb.data.len() + wb.data.len() as u64 } pub unsafe extern "C" fn ffi_mockps_write_batch_is_empty(wb: RawVoidPtr) -> u8 { @@ -74,7 
+74,7 @@ pub unsafe extern "C" fn ffi_mockps_write_batch_is_empty(wb: RawVoidPtr) -> u8 { pub unsafe extern "C" fn ffi_mockps_write_batch_merge(lwb: RawVoidPtr, rwb: RawVoidPtr) { let lwb: _ = <&mut MockPSWriteBatch as From>::from(lwb); let rwb: _ = <&mut MockPSWriteBatch as From>::from(rwb); - lwb.data.extend(rwb.data.into_iter()); + lwb.data.extend(rwb.data.drain()); } pub unsafe extern "C" fn ffi_mockps_write_batch_clear(wb: RawVoidPtr) { @@ -88,11 +88,12 @@ pub unsafe extern "C" fn ffi_mockps_consume_write_batch( ) { let store = into_engine_store_server_wrap(wrap); let wb: _ = <&mut MockPSWriteBatch as From>::from(wb); - let guard = store.engine_store_server.page_storage.data.write().unwrap(); - match guard.entry(key) { - std::collections::hash_map::Entry::Occupied(mut o) => o.insert(wb), - std::collections::hash_map::Entry::Vacant(v) => v.insert(wb), - } + let mut guard = (*store.engine_store_server) + .page_storage + .data + .write() + .unwrap(); + guard.extend(wb.data.drain()); } pub unsafe extern "C" fn ffi_mockps_handle_read_page( @@ -133,5 +134,11 @@ pub unsafe extern "C" fn ffi_mockps_handle_seek_ps_key( pub unsafe extern "C" fn ffi_mockps_ps_is_empty( wrap: *const ffi_interfaces::EngineStoreServerWrap, ) -> u8 { - todo!() + let store = into_engine_store_server_wrap(wrap); + let guard = (*store.engine_store_server) + .page_storage + .data + .read() + .unwrap(); + if guard.is_empty() { 1 } else { 0 } } diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index 4cb4482ea71..fea69546c2f 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -729,13 +729,13 @@ pub fn gen_engine_store_server_helper( fn_write_batch_is_empty: Some(ffi_mockps_write_batch_is_empty), fn_write_batch_merge: Some(ffi_mockps_write_batch_merge), fn_write_batch_clear: Some(ffi_mockps_write_batch_clear), - fn_consume_write_batch: None, + fn_consume_write_batch: Some(ffi_mockps_consume_write_batch), 
fn_handle_read_page: None, fn_gc_page_and_cpp_str_with_view_vec: None, fn_handle_purge_pagestorage: None, fn_handle_scan_page: None, fn_handle_seek_ps_key: None, - fn_ps_is_empty: None, + fn_ps_is_empty: Some(ffi_mockps_ps_is_empty), } } From 51a2b42dfa9a53e2eb16b54a24c444af4b23df52 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Sat, 24 Dec 2022 03:10:28 +0800 Subject: [PATCH 047/115] add RawCppPtrArr Signed-off-by: CalvinNeo --- engine_store_ffi/src/interfaces.rs | 16 ++++--- engine_store_ffi/src/lib.rs | 42 ++++++++++++++----- engine_store_ffi/src/observer.rs | 1 + engine_store_ffi/src/ps_engine.rs | 3 +- new-mock-engine-store/src/mock_store.rs | 23 ++++++++-- proxy_tests/proxy/ffi.rs | 41 ++++++++++++++++++ proxy_tests/proxy/mod.rs | 3 ++ .../ffi/src/RaftStoreProxyFFI/@version | 2 +- .../ffi/src/RaftStoreProxyFFI/ProxyFFI.h | 8 +++- 9 files changed, 116 insertions(+), 23 deletions(-) create mode 100644 proxy_tests/proxy/ffi.rs diff --git a/engine_store_ffi/src/interfaces.rs b/engine_store_ffi/src/interfaces.rs index 4e4d40540c7..7b9760b4df7 100644 --- a/engine_store_ffi/src/interfaces.rs +++ b/engine_store_ffi/src/interfaces.rs @@ -51,6 +51,12 @@ pub mod root { } #[repr(u32)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] + pub enum SpecialCppPtrType { + None = 0, + ArrayOfRawCppPtr = 1, + } + #[repr(u32)] + #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub enum EngineStoreApplyRes { None = 0, Persist = 1, @@ -522,11 +528,11 @@ pub mod root { pub fn_gc_raw_cpp_ptr: ::std::option::Option< unsafe extern "C" fn(arg1: root::DB::RawVoidPtr, arg2: root::DB::RawCppPtrType), >, - pub fn_gc_raw_cpp_ptr_arr: ::std::option::Option< + pub fn_gc_special_raw_cpp_ptr: ::std::option::Option< unsafe extern "C" fn( - head: root::DB::RawVoidPtr, - arg1: root::DB::RawCppPtrType, - len: u64, + arg1: root::DB::RawVoidPtr, + arg2: u64, + arg3: root::DB::SpecialCppPtrType, ), >, pub fn_get_config: ::std::option::Option< @@ -564,7 +570,7 @@ pub mod root { ) -> 
root::DB::FastAddPeerRes, >, } - pub const RAFT_STORE_PROXY_VERSION: u64 = 13243545128523171780; + pub const RAFT_STORE_PROXY_VERSION: u64 = 1634948592330095866; pub const RAFT_STORE_PROXY_MAGIC_NUMBER: u32 = 324508639; } } diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index cca8bf5328c..a417854bf04 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -28,6 +28,7 @@ pub use domain_impls::*; use encryption::DataKeyManager; pub use encryption_impls::*; use engine_traits::{Peekable, CF_LOCK}; +use interfaces::root::DB::SpecialCppPtrType; use kvproto::{kvrpcpb, metapb, raft_cmdpb}; use lazy_static::lazy_static; use protobuf::Message; @@ -39,8 +40,8 @@ pub use self::interfaces::root::DB::{ EngineStoreServerHelper, EngineStoreServerStatus, FastAddPeerRes, FastAddPeerStatus, FileEncryptionRes, FsStats, HttpRequestRes, HttpRequestStatus, KVGetStatus, PageAndCppStrWithView, PageAndCppStrWithViewVec, PageWithView, RaftCmdHeader, RaftProxyStatus, - RaftStoreProxyFFIHelper, RawCppPtr, RawCppStringPtr, RawVoidPtr, SSTReaderPtr, StoreStats, - WriteCmdType, WriteCmdsView, RawCppPtrArr + RaftStoreProxyFFIHelper, RawCppPtr, RawCppPtrArr, RawCppStringPtr, RawVoidPtr, SSTReaderPtr, + StoreStats, WriteCmdType, WriteCmdsView, }; use self::interfaces::root::DB::{ ConstRawVoidPtr, RaftStoreProxyPtr, RawCppPtrType, RawRustPtr, SSTReaderInterfaces, SSTView, @@ -380,9 +381,7 @@ impl Drop for RawCppPtr { impl RawCppPtrArr { pub fn is_null(&self) -> bool { - unsafe { - (*self.inner).ptr.is_null() - } + unsafe { (*self.inner).ptr.is_null() } } } @@ -392,9 +391,25 @@ impl Drop for RawCppPtrArr { fn drop(&mut self) { unsafe { if !self.is_null() { + println!("!!!! RawCppPtrArr 1"); let helper = get_engine_store_server_helper(); - helper.gc_raw_cpp_ptr_arr((*self.inner).ptr, (*self.inner).type_, self.len); - (*self.inner).ptr = std::ptr::null_mut(); + let len = self.len; + println!("!!!! 
RawCppPtrArr 2"); + // Delete all `T**` + for i in 0..len { + let i = i as usize; + let inner_i = self.inner.add(i); + helper.gc_raw_cpp_ptr((*inner_i).ptr, (*inner_i).type_); + } + println!("!!!! RawCppPtrArr 3"); + // Delete `T*` + helper.gc_special_raw_cpp_ptr( + self.inner as RawVoidPtr, + self.len, + SpecialCppPtrType::ArrayOfRawCppPtr, + ); + println!("!!!! RawCppPtrArr 4"); + self.inner = std::ptr::null_mut(); self.len = 0; } } @@ -418,7 +433,7 @@ pub fn get_engine_store_server_helper_ptr() -> isize { unsafe { ENGINE_STORE_SERVER_HELPER_PTR } } -fn get_engine_store_server_helper() -> &'static EngineStoreServerHelper { +pub fn get_engine_store_server_helper() -> &'static EngineStoreServerHelper { gen_engine_store_server_helper(unsafe { ENGINE_STORE_SERVER_HELPER_PTR }) } @@ -451,10 +466,15 @@ impl EngineStoreServerHelper { } } - fn gc_raw_cpp_ptr_arr(&self, head: *mut ::std::os::raw::c_void, tp: RawCppPtrType, len: u64) { - debug_assert!(self.fn_gc_raw_cpp_ptr_arr.is_some()); + fn gc_special_raw_cpp_ptr( + &self, + ptr: *mut ::std::os::raw::c_void, + hint_len: u64, + tp: SpecialCppPtrType, + ) { + debug_assert!(self.fn_gc_special_raw_cpp_ptr.is_some()); unsafe { - (self.fn_gc_raw_cpp_ptr_arr.into_inner())(head, tp, len); + (self.fn_gc_special_raw_cpp_ptr.into_inner())(ptr, hint_len, tp); } } diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 029f57de24a..6b7f5d45b8c 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -133,6 +133,7 @@ impl engine_tiflash::FFIHubInner for TiFlashFFIHub { } fn read_page(&self, page_id: &[u8]) -> Option> { + // TODO maybe we can steal memory from C++ here to reduce redundant copy? 
let value = self.engine_store_server_helper.read_page(page_id.into()); return if value.view.len == 0 { None diff --git a/engine_store_ffi/src/ps_engine.rs b/engine_store_ffi/src/ps_engine.rs index 21018c9bb2a..d0ef2d97082 100644 --- a/engine_store_ffi/src/ps_engine.rs +++ b/engine_store_ffi/src/ps_engine.rs @@ -4,7 +4,8 @@ #![allow(unused_variables)] use std::{ - fmt::{Debug, Formatter}, slice + fmt::{Debug, Formatter}, + slice, }; use engine_traits::{ diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index fea69546c2f..f45be832cba 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -717,6 +717,7 @@ pub fn gen_engine_store_server_helper( fn_handle_http_request: None, fn_check_http_uri_available: None, fn_gc_raw_cpp_ptr: Some(ffi_gc_raw_cpp_ptr), + fn_gc_special_raw_cpp_ptr: Some(ffi_gc_special_raw_cpp_ptr), fn_get_config: None, fn_set_store: None, fn_set_pb_msg_by_bytes: Some(ffi_set_pb_msg_by_bytes), @@ -773,10 +774,10 @@ unsafe extern "C" fn ffi_handle_write_raft_cmd( pub enum RawCppPtrTypeImpl { None = 0, String = 1, - PreHandledSnapshotWithBlock = 2, - WakerNotifier = 3, - PSWriteBatch = 4, - PSUniversalPage = 5, + PreHandledSnapshotWithBlock = 11, + WakerNotifier = 12, + PSWriteBatch = 13, + PSUniversalPage = 14, } impl From for ffi_interfaces::RawCppPtrType { @@ -943,6 +944,20 @@ impl ProxyNotifier { } } +extern "C" fn ffi_gc_special_raw_cpp_ptr( + ptr: ffi_interfaces::RawVoidPtr, + hint_len: u64, + tp: ffi_interfaces::SpecialCppPtrType, +) { + match tp { + ffi_interfaces::SpecialCppPtrType::None => (), + ffi_interfaces::SpecialCppPtrType::ArrayOfRawCppPtr => unsafe { + let p = std::slice::from_raw_parts_mut(ptr as *mut RawCppPtr, hint_len as usize); + drop(p); + }, + } +} + extern "C" fn ffi_gc_raw_cpp_ptr( ptr: ffi_interfaces::RawVoidPtr, tp: ffi_interfaces::RawCppPtrType, diff --git a/proxy_tests/proxy/ffi.rs b/proxy_tests/proxy/ffi.rs new file mode 100644 index 
00000000000..dfb66bf6248 --- /dev/null +++ b/proxy_tests/proxy/ffi.rs @@ -0,0 +1,41 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use core::slice::SlicePattern; + +use engine_store_ffi::{ + get_engine_store_server_helper, EngineStoreServerHelper, RawCppPtr, RawCppPtrArr, + UnwrapExternCFunc, +}; +use new_mock_engine_store::mock_cluster::init_global_ffi_helper_set; + +#[test] +fn test_array_of_raw_cpp_ptr() { + println!("!!!! A 0"); + tikv_util::set_panic_hook(true, "./"); + unsafe { + println!("!!!! A 1"); + init_global_ffi_helper_set(); + println!("!!!! A 2"); + let helper = get_engine_store_server_helper(); + + let len = 10; + let mut v: Vec = vec![]; + + println!("!!!! A 3"); + for i in 0..len { + let s = format!("s{}", i); + v.push((helper.fn_gen_cpp_string.into_inner())(s.as_bytes().into())); + } + + println!("!!!! A 4"); + + let (ptr_v, l, cap) = v.into_raw_parts(); + let cpp_ptr_arr = RawCppPtrArr { + inner: ptr_v, + len: cap as u64, + }; + + println!("!!!! 
A 5"); + drop(cpp_ptr_arr); + } +} diff --git a/proxy_tests/proxy/mod.rs b/proxy_tests/proxy/mod.rs index 1d7edced540..c464315afdc 100644 --- a/proxy_tests/proxy/mod.rs +++ b/proxy_tests/proxy/mod.rs @@ -3,12 +3,15 @@ #![feature(custom_test_frameworks)] #![test_runner(test_util::run_failpoint_tests)] #![recursion_limit = "100"] +#![feature(vec_into_raw_parts)] +#![feature(slice_pattern)] #[macro_use] extern crate slog_global; mod config; mod fast_add_peer; +mod ffi; mod flashback; mod normal; mod proxy; diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version index 42ad9c24dde..03cc5790a73 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version @@ -1,3 +1,3 @@ #pragma once #include -namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 13243545128523171780ull; } \ No newline at end of file +namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 1634948592330095866ull; } \ No newline at end of file diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h index 3d497f7922d..f0baaf34847 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h @@ -6,6 +6,11 @@ namespace DB { struct EngineStoreServerWrap; +enum class SpecialCppPtrType : uint32_t { + None = 0, + ArrayOfRawCppPtr = 1, +}; + enum class EngineStoreApplyRes : uint32_t { None = 0, Persist, @@ -103,6 +108,7 @@ struct PageAndCppStrWithViewVec { const uint64_t len; }; +// An array of pointers, like `T **` struct RawCppPtrArr { RawCppPtr *inner; const uint64_t len; @@ -263,7 +269,7 @@ struct EngineStoreServerHelper { BaseBuffView body); uint8_t (*fn_check_http_uri_available)(BaseBuffView); void (*fn_gc_raw_cpp_ptr)(RawVoidPtr, RawCppPtrType); - void (*fn_gc_raw_cpp_ptr_arr)(RawVoidPtr head, RawCppPtrType, uint64_t len); + void 
(*fn_gc_special_raw_cpp_ptr)(RawVoidPtr, uint64_t, SpecialCppPtrType); CppStrWithView (*fn_get_config)(EngineStoreServerWrap *, uint8_t full); void (*fn_set_store)(EngineStoreServerWrap *, BaseBuffView); void (*fn_set_pb_msg_by_bytes)(MsgPBType type, RawVoidPtr ptr, From 9fa6b4a0fed4b76faf732c967d5eb3949a196778 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Sun, 25 Dec 2022 00:11:49 +0800 Subject: [PATCH 048/115] add RawCppPtrTuple Signed-off-by: CalvinNeo --- engine_store_ffi/src/interfaces.rs | 14 +++-- engine_store_ffi/src/lib.rs | 53 +++++++++++++++---- new-mock-engine-store/src/mock_store.rs | 4 ++ proxy_scripts/ci_check.sh | 1 + proxy_tests/proxy/ffi.rs | 45 ++++++++++++---- .../ffi/src/RaftStoreProxyFFI/@version | 2 +- .../ffi/src/RaftStoreProxyFFI/ProxyFFI.h | 16 ++++-- 7 files changed, 108 insertions(+), 27 deletions(-) diff --git a/engine_store_ffi/src/interfaces.rs b/engine_store_ffi/src/interfaces.rs index 7b9760b4df7..5a3acd21ad9 100644 --- a/engine_store_ffi/src/interfaces.rs +++ b/engine_store_ffi/src/interfaces.rs @@ -53,7 +53,8 @@ pub mod root { #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub enum SpecialCppPtrType { None = 0, - ArrayOfRawCppPtr = 1, + TupleOfRawCppPtr = 1, + ArrayOfRawCppPtr = 2, } #[repr(u32)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] @@ -164,10 +165,17 @@ pub mod root { } #[repr(C)] #[derive(Debug)] - pub struct RawCppPtrArr { + pub struct RawCppPtrTuple { pub inner: *mut root::DB::RawCppPtr, pub len: u64, } + #[repr(C)] + #[derive(Debug)] + pub struct RawCppPtrArr { + pub inner: *mut root::DB::RawVoidPtr, + pub len: u64, + pub type_: root::DB::RawCppPtrType, + } #[repr(u8)] #[derive(Debug, Copy, Clone, Hash, PartialEq, Eq)] pub enum HttpRequestStatus { @@ -570,7 +578,7 @@ pub mod root { ) -> root::DB::FastAddPeerRes, >, } - pub const RAFT_STORE_PROXY_VERSION: u64 = 1634948592330095866; + pub const RAFT_STORE_PROXY_VERSION: u64 = 4326611643816778519; pub const RAFT_STORE_PROXY_MAGIC_NUMBER: u32 = 324508639; } } 
diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index a417854bf04..e4d61145423 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -40,8 +40,8 @@ pub use self::interfaces::root::DB::{ EngineStoreServerHelper, EngineStoreServerStatus, FastAddPeerRes, FastAddPeerStatus, FileEncryptionRes, FsStats, HttpRequestRes, HttpRequestStatus, KVGetStatus, PageAndCppStrWithView, PageAndCppStrWithViewVec, PageWithView, RaftCmdHeader, RaftProxyStatus, - RaftStoreProxyFFIHelper, RawCppPtr, RawCppPtrArr, RawCppStringPtr, RawVoidPtr, SSTReaderPtr, - StoreStats, WriteCmdType, WriteCmdsView, + RaftStoreProxyFFIHelper, RawCppPtr, RawCppPtrArr, RawCppPtrTuple, RawCppStringPtr, RawVoidPtr, + SSTReaderPtr, StoreStats, WriteCmdType, WriteCmdsView, }; use self::interfaces::root::DB::{ ConstRawVoidPtr, RaftStoreProxyPtr, RawCppPtrType, RawRustPtr, SSTReaderInterfaces, SSTView, @@ -354,7 +354,7 @@ impl RaftStoreProxyFFIHelper { } impl RawCppPtr { - fn into_raw(mut self) -> RawVoidPtr { + pub fn into_raw(mut self) -> RawVoidPtr { let ptr = self.ptr; self.ptr = std::ptr::null_mut(); ptr @@ -379,36 +379,67 @@ impl Drop for RawCppPtr { } } -impl RawCppPtrArr { +impl RawCppPtrTuple { pub fn is_null(&self) -> bool { unsafe { (*self.inner).ptr.is_null() } } } +unsafe impl Send for RawCppPtrTuple {} + +impl Drop for RawCppPtrTuple { + fn drop(&mut self) { + unsafe { + if !self.is_null() { + let helper = get_engine_store_server_helper(); + let len = self.len; + // Delete all `void *` + for i in 0..len { + let i = i as usize; + let inner_i = self.inner.add(i); + helper.gc_raw_cpp_ptr((*inner_i).ptr, (*inner_i).type_); + } + // Delete `void **` + helper.gc_special_raw_cpp_ptr( + self.inner as RawVoidPtr, + self.len, + SpecialCppPtrType::TupleOfRawCppPtr, + ); + self.inner = std::ptr::null_mut(); + self.len = 0; + } + } + } +} + +impl RawCppPtrArr { + pub fn is_null(&self) -> bool { + unsafe { self.inner.is_null() } + } +} + unsafe impl Send for 
RawCppPtrArr {} impl Drop for RawCppPtrArr { fn drop(&mut self) { unsafe { if !self.is_null() { - println!("!!!! RawCppPtrArr 1"); let helper = get_engine_store_server_helper(); let len = self.len; - println!("!!!! RawCppPtrArr 2"); - // Delete all `T**` + // Delete all `T *` for i in 0..len { let i = i as usize; let inner_i = self.inner.add(i); - helper.gc_raw_cpp_ptr((*inner_i).ptr, (*inner_i).type_); + if !(*inner_i).is_null() { + helper.gc_raw_cpp_ptr(*inner_i, self.type_); + } } - println!("!!!! RawCppPtrArr 3"); - // Delete `T*` + // Delete `T **` helper.gc_special_raw_cpp_ptr( self.inner as RawVoidPtr, self.len, SpecialCppPtrType::ArrayOfRawCppPtr, ); - println!("!!!! RawCppPtrArr 4"); self.inner = std::ptr::null_mut(); self.len = 0; } diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index f45be832cba..d7c56fb4ad0 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -951,6 +951,10 @@ extern "C" fn ffi_gc_special_raw_cpp_ptr( ) { match tp { ffi_interfaces::SpecialCppPtrType::None => (), + ffi_interfaces::SpecialCppPtrType::TupleOfRawCppPtr => unsafe { + let p = std::slice::from_raw_parts_mut(ptr as *mut RawCppPtr, hint_len as usize); + drop(p); + }, ffi_interfaces::SpecialCppPtrType::ArrayOfRawCppPtr => unsafe { let p = std::slice::from_raw_parts_mut(ptr as *mut RawCppPtr, hint_len as usize); drop(p); diff --git a/proxy_scripts/ci_check.sh b/proxy_scripts/ci_check.sh index 4cda63cf674..8483d2bbb0b 100755 --- a/proxy_scripts/ci_check.sh +++ b/proxy_scripts/ci_check.sh @@ -44,6 +44,7 @@ elif [[ $M == "testnew" ]]; then cargo test --package proxy_tests --test proxy region cargo test --package proxy_tests --test proxy flashback cargo test --package proxy_tests --test proxy server_cluster_test + cargo test --package proxy_tests --test proxy ffi elif [[ $M == "debug" ]]; then # export RUSTC_WRAPPER=~/.cargo/bin/sccache export ENGINE_LABEL_VALUE=tiflash diff --git 
a/proxy_tests/proxy/ffi.rs b/proxy_tests/proxy/ffi.rs index dfb66bf6248..4d1fb76efb2 100644 --- a/proxy_tests/proxy/ffi.rs +++ b/proxy_tests/proxy/ffi.rs @@ -4,38 +4,65 @@ use core::slice::SlicePattern; use engine_store_ffi::{ get_engine_store_server_helper, EngineStoreServerHelper, RawCppPtr, RawCppPtrArr, - UnwrapExternCFunc, + RawCppPtrTuple, RawVoidPtr, UnwrapExternCFunc, +}; +use new_mock_engine_store::{ + mock_cluster::init_global_ffi_helper_set, mock_store::RawCppPtrTypeImpl, }; -use new_mock_engine_store::mock_cluster::init_global_ffi_helper_set; #[test] -fn test_array_of_raw_cpp_ptr() { - println!("!!!! A 0"); +fn test_tuple_of_raw_cpp_ptr() { tikv_util::set_panic_hook(true, "./"); unsafe { - println!("!!!! A 1"); init_global_ffi_helper_set(); - println!("!!!! A 2"); let helper = get_engine_store_server_helper(); let len = 10; let mut v: Vec = vec![]; - println!("!!!! A 3"); for i in 0..len { let s = format!("s{}", i); v.push((helper.fn_gen_cpp_string.into_inner())(s.as_bytes().into())); } - println!("!!!! A 4"); + let (ptr_v, l, cap) = v.into_raw_parts(); + let cpp_ptr_tp = RawCppPtrTuple { + inner: ptr_v, + len: cap as u64, + }; + drop(cpp_ptr_tp); + } +} + +#[test] +fn test_array_of_raw_cpp_ptr() { + tikv_util::set_panic_hook(true, "./"); + unsafe { + init_global_ffi_helper_set(); + let helper = get_engine_store_server_helper(); + + let len = 10; + let mut v: Vec = vec![]; + + println!("AAAA 1"); + for i in 0..len { + let s = format!("s{}", i); + let raw_cpp_ptr = (helper.fn_gen_cpp_string.into_inner())(s.as_bytes().into()); + let raw_void_ptr = raw_cpp_ptr.into_raw(); + v.push(raw_void_ptr); + } + println!("AAAA 2"); let (ptr_v, l, cap) = v.into_raw_parts(); + println!("AAAA cap {}", cap); let cpp_ptr_arr = RawCppPtrArr { inner: ptr_v, + type_: RawCppPtrTypeImpl::String.into(), len: cap as u64, }; - println!("!!!! 
A 5"); + println!("AAAA 3"); drop(cpp_ptr_arr); + println!("AAAA 4"); } } diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version index 03cc5790a73..10338141dea 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version @@ -1,3 +1,3 @@ #pragma once #include -namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 1634948592330095866ull; } \ No newline at end of file +namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 4326611643816778519ull; } \ No newline at end of file diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h index f0baaf34847..e55bb786a4f 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h @@ -8,7 +8,8 @@ struct EngineStoreServerWrap; enum class SpecialCppPtrType : uint32_t { None = 0, - ArrayOfRawCppPtr = 1, + TupleOfRawCppPtr = 1, + ArrayOfRawCppPtr = 2, }; enum class EngineStoreApplyRes : uint32_t { @@ -108,12 +109,21 @@ struct PageAndCppStrWithViewVec { const uint64_t len; }; -// An array of pointers, like `T **` -struct RawCppPtrArr { +// An tuple of pointers, like `void **`, +// Can be used to represent structures. +struct RawCppPtrTuple { RawCppPtr *inner; const uint64_t len; }; +// An array of pointers(same type), like `T **`, +// Can be used to represent arrays. 
+struct RawCppPtrArr { + RawVoidPtr *inner; + const uint64_t len; + RawCppPtrType type; +}; + enum class HttpRequestStatus : uint8_t { Ok = 0, ErrorParam, From e1fe58303c8adbf58653f32955316bf92d815628 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Sun, 25 Dec 2022 00:14:16 +0800 Subject: [PATCH 049/115] fix fmt Signed-off-by: CalvinNeo --- proxy_tests/proxy/ffi.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/proxy_tests/proxy/ffi.rs b/proxy_tests/proxy/ffi.rs index 4d1fb76efb2..8bccae10bf0 100644 --- a/proxy_tests/proxy/ffi.rs +++ b/proxy_tests/proxy/ffi.rs @@ -1,7 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use core::slice::SlicePattern; - use engine_store_ffi::{ get_engine_store_server_helper, EngineStoreServerHelper, RawCppPtr, RawCppPtrArr, RawCppPtrTuple, RawVoidPtr, UnwrapExternCFunc, @@ -44,7 +42,6 @@ fn test_array_of_raw_cpp_ptr() { let len = 10; let mut v: Vec = vec![]; - println!("AAAA 1"); for i in 0..len { let s = format!("s{}", i); let raw_cpp_ptr = (helper.fn_gen_cpp_string.into_inner())(s.as_bytes().into()); @@ -52,17 +49,12 @@ fn test_array_of_raw_cpp_ptr() { v.push(raw_void_ptr); } - println!("AAAA 2"); let (ptr_v, l, cap) = v.into_raw_parts(); - println!("AAAA cap {}", cap); let cpp_ptr_arr = RawCppPtrArr { inner: ptr_v, type_: RawCppPtrTypeImpl::String.into(), len: cap as u64, }; - - println!("AAAA 3"); drop(cpp_ptr_arr); - println!("AAAA 4"); } } From 86168dcf63c7ad89df37da167f29f9d1b8c9055b Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Sun, 25 Dec 2022 00:45:56 +0800 Subject: [PATCH 050/115] enhance Signed-off-by: CalvinNeo --- engine_store_ffi/src/lib.rs | 15 ++++++++++++--- proxy_tests/proxy/ffi.rs | 8 +++++++- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index e4d61145423..5050669cd92 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -393,13 +393,19 @@ impl Drop for RawCppPtrTuple { if 
!self.is_null() { let helper = get_engine_store_server_helper(); let len = self.len; - // Delete all `void *` + // Delete all `void *`. for i in 0..len { let i = i as usize; let inner_i = self.inner.add(i); - helper.gc_raw_cpp_ptr((*inner_i).ptr, (*inner_i).type_); + // Will not fire even without the if in tests, + // since type must be 0 which is None. + if !inner_i.is_null() { + helper.gc_raw_cpp_ptr((*inner_i).ptr, (*inner_i).type_); + // We still set to nullptr, even though we will immediately delete it. + (*inner_i).ptr = std::ptr::null_mut(); + } } - // Delete `void **` + // Delete `void **`. helper.gc_special_raw_cpp_ptr( self.inner as RawVoidPtr, self.len, @@ -430,8 +436,11 @@ impl Drop for RawCppPtrArr { for i in 0..len { let i = i as usize; let inner_i = self.inner.add(i); + // Will fire even without the if in tests, since type is not 0. if !(*inner_i).is_null() { helper.gc_raw_cpp_ptr(*inner_i, self.type_); + // We still set to nullptr, even though we will immediately delete it. 
+ *inner_i = std::ptr::null_mut(); } } // Delete `T **` diff --git a/proxy_tests/proxy/ffi.rs b/proxy_tests/proxy/ffi.rs index 8bccae10bf0..93f35037366 100644 --- a/proxy_tests/proxy/ffi.rs +++ b/proxy_tests/proxy/ffi.rs @@ -20,14 +20,19 @@ fn test_tuple_of_raw_cpp_ptr() { for i in 0..len { let s = format!("s{}", i); - v.push((helper.fn_gen_cpp_string.into_inner())(s.as_bytes().into())); + let raw_cpp_ptr = (helper.fn_gen_cpp_string.into_inner())(s.as_bytes().into()); + v.push(raw_cpp_ptr); } let (ptr_v, l, cap) = v.into_raw_parts(); + assert_ne!(l, cap); let cpp_ptr_tp = RawCppPtrTuple { inner: ptr_v, len: cap as u64, }; + for i in 0..cap { + let inner_i = cpp_ptr_tp.inner.add(i); + } drop(cpp_ptr_tp); } } @@ -50,6 +55,7 @@ fn test_array_of_raw_cpp_ptr() { } let (ptr_v, l, cap) = v.into_raw_parts(); + assert_ne!(l, cap); let cpp_ptr_arr = RawCppPtrArr { inner: ptr_v, type_: RawCppPtrTypeImpl::String.into(), From 88d31dfa101d5273bfad5feca73c597c0be506e9 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Mon, 26 Dec 2022 11:12:29 +0800 Subject: [PATCH 051/115] fix fmt Signed-off-by: CalvinNeo --- engine_store_ffi/src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index 5050669cd92..72e090776bc 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -28,7 +28,6 @@ pub use domain_impls::*; use encryption::DataKeyManager; pub use encryption_impls::*; use engine_traits::{Peekable, CF_LOCK}; -use interfaces::root::DB::SpecialCppPtrType; use kvproto::{kvrpcpb, metapb, raft_cmdpb}; use lazy_static::lazy_static; use protobuf::Message; @@ -41,7 +40,7 @@ pub use self::interfaces::root::DB::{ FileEncryptionRes, FsStats, HttpRequestRes, HttpRequestStatus, KVGetStatus, PageAndCppStrWithView, PageAndCppStrWithViewVec, PageWithView, RaftCmdHeader, RaftProxyStatus, RaftStoreProxyFFIHelper, RawCppPtr, RawCppPtrArr, RawCppPtrTuple, RawCppStringPtr, RawVoidPtr, - SSTReaderPtr, 
StoreStats, WriteCmdType, WriteCmdsView, + SSTReaderPtr, SpecialCppPtrType, StoreStats, WriteCmdType, WriteCmdsView, }; use self::interfaces::root::DB::{ ConstRawVoidPtr, RaftStoreProxyPtr, RawCppPtrType, RawRustPtr, SSTReaderInterfaces, SSTView, From 6b6fa7d6b00e8f423d0e92e22757a81c28a3b2d1 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Mon, 26 Dec 2022 12:16:57 +0800 Subject: [PATCH 052/115] f Signed-off-by: CalvinNeo --- proxy_scripts/ci_check.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy_scripts/ci_check.sh b/proxy_scripts/ci_check.sh index 8483d2bbb0b..1a95bfc69ab 100755 --- a/proxy_scripts/ci_check.sh +++ b/proxy_scripts/ci_check.sh @@ -44,7 +44,7 @@ elif [[ $M == "testnew" ]]; then cargo test --package proxy_tests --test proxy region cargo test --package proxy_tests --test proxy flashback cargo test --package proxy_tests --test proxy server_cluster_test - cargo test --package proxy_tests --test proxy ffi + cargo test --package proxy_tests --test proxy ffi -- --test-threads 1 elif [[ $M == "debug" ]]; then # export RUSTC_WRAPPER=~/.cargo/bin/sccache export ENGINE_LABEL_VALUE=tiflash From 09f9aac35ecdc6dc8aee2eb11d6aaccaeddd3e23 Mon Sep 17 00:00:00 2001 From: Zwb Date: Wed, 28 Dec 2022 12:00:16 +0800 Subject: [PATCH 053/115] modify raft gc log impl for witness (#13869) ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang Signed-off-by: Zwb Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/raftstore/src/store/fsm/apply.rs | 210 ++++++++++++++++- components/raftstore/src/store/fsm/peer.rs | 118 +++++++++- components/raftstore/src/store/fsm/store.rs | 3 + components/raftstore/src/store/msg.rs | 3 + components/raftstore/src/store/peer.rs | 15 ++ tests/failpoints/cases/test_witness.rs | 167 +++++++++++++- tests/integrations/raftstore/test_witness.rs | 224 ++++++++++--------- 8 files changed, 628 insertions(+), 114 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4c510da6d77..8433f54c512 100644 --- a/Cargo.lock +++ 
b/Cargo.lock @@ -2747,7 +2747,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#e53d558bc6d7d8b7bb2d283cdf6dda52a2615632" +source = "git+https://github.com/pingcap/kvproto.git#ae3b086b09afbb26cebcd4c1fe14b82bbe1f0796" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index affa0205e8f..ec2d7bf72a8 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -151,6 +151,7 @@ impl HeapSize for PendingCmd {} pub struct PendingCmdQueue { normals: VecDeque>, conf_change: Option>, + compacts: VecDeque>, } impl PendingCmdQueue { @@ -158,6 +159,7 @@ impl PendingCmdQueue { PendingCmdQueue { normals: VecDeque::new(), conf_change: None, + compacts: VecDeque::new(), } } @@ -190,6 +192,23 @@ impl PendingCmdQueue { fn set_conf_change(&mut self, cmd: PendingCmd) { self.conf_change = Some(cmd); } + + fn push_compact(&mut self, cmd: PendingCmd) { + self.compacts.push_back(cmd); + } + + fn pop_compact(&mut self, index: u64) -> Option> { + let mut front = None; + while self.compacts.front().map_or(false, |c| c.index < index) { + front = self.compacts.pop_front(); + front.as_mut().unwrap().cb.take().unwrap(); + } + front + } + + fn has_compact(&mut self) -> bool { + !self.compacts.is_empty() + } } #[derive(Default, Debug)] @@ -281,6 +300,7 @@ pub enum ExecResult { SetFlashbackState { region: Region, }, + PendingCompactCmd, } /// The possible returned value when applying logs. @@ -1488,7 +1508,8 @@ where | ExecResult::CompactLog { .. } | ExecResult::DeleteRange { .. } | ExecResult::IngestSst { .. } - | ExecResult::TransferLeader { .. } => {} + | ExecResult::TransferLeader { .. } + | ExecResult::PendingCompactCmd => {} ExecResult::SplitRegion { ref derived, .. 
} => { self.region = derived.clone(); self.metrics.size_diff_hint = 0; @@ -1545,6 +1566,9 @@ where if let Some(cmd) = self.pending_cmds.conf_change.take() { notify_region_removed(self.region.get_id(), id, cmd); } + for cmd in self.pending_cmds.compacts.drain(..) { + notify_region_removed(self.region.get_id(), id, cmd); + } self.yield_state = None; let mut event = TraceEvent::default(); @@ -1562,6 +1586,9 @@ where if let Some(cmd) = self.pending_cmds.conf_change.take() { notify_stale_command(region_id, peer_id, self.term, cmd); } + for cmd in self.pending_cmds.compacts.drain(..) { + notify_region_removed(self.region.get_id(), peer_id, cmd); + } } fn clear_all_commands_silently(&mut self) { @@ -1571,6 +1598,9 @@ where if let Some(mut cmd) = self.pending_cmds.conf_change.take() { cmd.cb.take(); } + for mut cmd in self.pending_cmds.compacts.drain(..) { + cmd.cb.take(); + } } } @@ -2937,13 +2967,86 @@ where )) } + fn try_compact_log( + &mut self, + voter_replicated_index: u64, + voter_replicated_term: u64, + ) -> Result>> { + PEER_ADMIN_CMD_COUNTER.compact.all.inc(); + let first_index = entry_storage::first_index(&self.apply_state); + + if self.is_merging { + info!( + "in merging mode, skip compact"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "voter_replicated_index" => voter_replicated_index, + ); + return Ok(None); + } + + // When the witness restarted, the pending compact cmd has been lost, so use + // `voter_replicated_index` for gc to avoid log accumulation. 
+ if !self.pending_cmds.has_compact() { + if voter_replicated_index <= first_index { + debug!( + "voter_replicated_index <= first index, no need to compact"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "compact_index" => voter_replicated_index, + "first_index" => first_index, + ); + return Ok(Some(TaskRes::Compact { + state: self.apply_state.get_truncated_state().clone(), + first_index: 0, + has_pending: false, + })); + } + // compact failure is safe to be omitted, no need to assert. + compact_raft_log( + &self.tag, + &mut self.apply_state, + voter_replicated_index, + voter_replicated_term, + )?; + PEER_ADMIN_CMD_COUNTER.compact.success.inc(); + return Ok(Some(TaskRes::Compact { + state: self.apply_state.get_truncated_state().clone(), + first_index, + has_pending: false, + })); + } + + match self.pending_cmds.pop_compact(voter_replicated_index) { + Some(cmd) => { + // compact failure is safe to be omitted, no need to assert. + compact_raft_log(&self.tag, &mut self.apply_state, cmd.index, cmd.term)?; + PEER_ADMIN_CMD_COUNTER.compact.success.inc(); + Ok(Some(TaskRes::Compact { + state: self.apply_state.get_truncated_state().clone(), + first_index, + has_pending: self.pending_cmds.has_compact(), + })) + } + None => { + info!( + "latest voter_replicated_index < compact_index, skip"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "voter_replicated_index" => voter_replicated_index, + ); + Ok(None) + } + } + } + fn exec_compact_log( &mut self, req: &AdminRequest, ) -> Result<(AdminResponse, ApplyResult)> { PEER_ADMIN_CMD_COUNTER.compact.all.inc(); - let compact_index = req.get_compact_log().get_compact_index(); + let mut compact_index = req.get_compact_log().get_compact_index(); let resp = AdminResponse::default(); let first_index = entry_storage::first_index(&self.apply_state); if compact_index <= first_index { @@ -2966,7 +3069,7 @@ where return Ok((resp, ApplyResult::None)); } - let compact_term = 
req.get_compact_log().get_compact_term(); + let mut compact_term = req.get_compact_log().get_compact_term(); // TODO: add unit tests to cover all the message integrity checks. if compact_term == 0 { info!( @@ -2981,6 +3084,41 @@ where )); } + let voter_replicated_index = req.get_compact_log().get_voter_replicated_index(); + // If there is any voter lagging behind, the log truncation of the witness + // shouldn't be triggered even if it's force mode(raft log size/count exceeds + // the threshold or raft engine purge), otherwise the witness can't help the + // lagging voter catch up logs when leader is down. In this situation Compact + // index should be queued. If witness receives a voter_replicated_index + // that is larger than the pending compact index, logs can be deleted. + if self.peer.is_witness { + if voter_replicated_index < compact_index { + self.pending_cmds.push_compact(PendingCmd::new( + compact_index, + compact_term, + Callback::None, + )); + match self.pending_cmds.pop_compact(voter_replicated_index) { + Some(cmd) => { + compact_index = cmd.index; + compact_term = cmd.term; + } + None => { + info!( + "voter_replicated_index < compact_index, skip"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "command" => ?req.get_compact_log() + ); + return Ok((resp, ApplyResult::Res(ExecResult::PendingCompactCmd))); + } + } + } else { + for mut cmd in self.pending_cmds.compacts.drain(..) { + cmd.cb.take().unwrap(); + } + } + } // compact failure is safe to be omitted, no need to assert. 
compact_raft_log( &self.tag, @@ -3451,6 +3589,11 @@ where #[cfg(any(test, feature = "testexport"))] #[allow(clippy::type_complexity)] Validate(u64, Box), + CheckCompact { + region_id: u64, + voter_replicated_index: u64, + voter_replicated_term: u64, + }, } impl Msg @@ -3498,6 +3641,17 @@ where } => write!(f, "[region {}] change cmd", region_id), #[cfg(any(test, feature = "testexport"))] Msg::Validate(region_id, _) => write!(f, "[region {}] validate", region_id), + Msg::CheckCompact { + region_id, + voter_replicated_index, + voter_replicated_term, + } => { + write!( + f, + "[region {}] check compact, voter_replicated_index: {}, voter_replicated_term: {}", + region_id, voter_replicated_index, voter_replicated_term + ) + } } } } @@ -3542,6 +3696,11 @@ where // Whether destroy request is from its target region's snapshot merge_from_snapshot: bool, }, + Compact { + state: RaftTruncatedState, + first_index: u64, + has_pending: bool, + }, } pub struct ApplyFsm @@ -3947,6 +4106,34 @@ where cb.invoke_read(resp); } + fn check_pending_compact_log( + &mut self, + ctx: &mut ApplyContext, + voter_replicated_index: u64, + voter_replicated_term: u64, + ) { + let res = self + .delegate + .try_compact_log(voter_replicated_index, voter_replicated_term); + match res { + Ok(res) => { + if let Some(res) = res { + ctx.prepare_for(&mut self.delegate); + self.delegate.write_apply_state(ctx.kv_wb_mut()); + ctx.commit_opt(&mut self.delegate, true); + ctx.finish_for(&mut self.delegate, VecDeque::new()); + ctx.notifier + .notify_one(self.delegate.region_id(), PeerMsg::ApplyRes { res }); + } + } + Err(e) => error!(?e; + "failed to compact log"; + "region_id" => self.delegate.region.get_id(), + "peer_id" => self.delegate.id(), + ), + } + } + fn handle_tasks(&mut self, apply_ctx: &mut ApplyContext, msgs: &mut Vec>) { let mut drainer = msgs.drain(..); let mut batch_apply = None; @@ -4019,6 +4206,17 @@ where let delegate = &self.delegate as *const ApplyDelegate as *const u8; f(delegate) } + 
Msg::CheckCompact { + voter_replicated_index, + voter_replicated_term, + .. + } => { + self.check_pending_compact_log( + apply_ctx, + voter_replicated_index, + voter_replicated_term, + ); + } } } } @@ -4429,6 +4627,11 @@ where } #[cfg(any(test, feature = "testexport"))] Msg::Validate(..) => return, + Msg::CheckCompact { region_id, .. } => { + info!("target region is not found"; + "region_id" => region_id); + return; + } }, Either::Left(Err(TrySendError::Full(_))) => unreachable!(), }; @@ -4561,6 +4764,7 @@ mod memtrace { | Msg::Change { .. } => 0, #[cfg(any(test, feature = "testexport"))] Msg::Validate(..) => 0, + Msg::CheckCompact { .. } => 0, } } } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 1b484df5316..abd8fd84771 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -53,7 +53,7 @@ use tikv_alloc::trace::TraceEvent; use tikv_util::{ box_err, debug, defer, error, escape, info, is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, - store::{find_peer, is_learner, region_on_same_stores}, + store::{find_peer, find_peer_by_id, is_learner, region_on_same_stores}, sys::disk::DiskUsage, time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant}, trace, warn, @@ -1195,6 +1195,7 @@ where PeerTick::ReportBuckets => self.on_report_region_buckets_tick(), PeerTick::CheckLongUncommitted => self.on_check_long_uncommitted_tick(), PeerTick::CheckPeersAvailability => self.on_check_peers_availability(), + PeerTick::RequestVoterReplicatedIndex => self.on_request_voter_replicated_index(), } } @@ -1217,6 +1218,9 @@ where self.fsm.has_ready = true; } self.fsm.peer.maybe_gen_approximate_buckets(self.ctx); + if self.fsm.peer.is_witness() { + self.register_pull_voter_replicated_index_tick(); + } } fn on_gc_snap(&mut self, snaps: Vec<(SnapKey, bool)>) { @@ -2310,6 +2314,21 @@ where *is_ready = true; } } + ApplyTaskRes::Compact { + state, + first_index, + 
has_pending, + } => { + self.fsm.peer.has_pending_compact_cmd = has_pending; + // When the witness restarts, the pending compact cmds will be lost. We will try + // to use `voter_replicated_index` as the `compact index` to avoid log + // accumulation, but if `voter_replicated_index` is less than `first_index`, + // then gc is not needed. In this case, the `first_index` we pass back will be + // 0, and `has_pending` set to false. + if first_index != 0 { + self.on_ready_compact_log(first_index, state); + } + } } if self.fsm.peer.unsafe_recovery_state.is_some() { self.check_unsafe_recovery_state(); @@ -2667,6 +2686,53 @@ where ); } + fn on_voter_replicated_index_request(&mut self, from: &metapb::Peer) { + if !self.fsm.peer.is_leader() { + return; + } + let mut voter_replicated_idx = self.fsm.peer.get_store().last_index(); + for (peer_id, p) in self.fsm.peer.raft_group.raft.prs().iter() { + let peer = find_peer_by_id(self.region(), *peer_id).unwrap(); + if voter_replicated_idx > p.matched && !is_learner(peer) { + voter_replicated_idx = p.matched; + } + } + let first_index = self.fsm.peer.get_store().first_index(); + if voter_replicated_idx > first_index { + voter_replicated_idx = first_index; + } + let mut resp = ExtraMessage::default(); + resp.set_type(ExtraMessageType::MsgVoterReplicatedIndexResponse); + resp.voter_replicated_index = voter_replicated_idx; + self.fsm + .peer + .send_extra_message(resp, &mut self.ctx.trans, from); + debug!( + "leader responses voter_replicated_index to witness"; + "region_id" => self.region().get_id(), + "witness_id" => from.id, + "leader_id" => self.fsm.peer.peer.get_id(), + "voter_replicated_index" => voter_replicated_idx, + ); + } + + fn on_voter_replicated_index_response(&mut self, msg: &ExtraMessage) { + if self.fsm.peer.is_leader() || !self.fsm.peer.is_witness() { + return; + } + let voter_replicated_index = msg.voter_replicated_index; + if let Ok(voter_replicated_term) = self.fsm.peer.get_store().term(voter_replicated_index) { + 
self.ctx.apply_router.schedule_task( + self.region_id(), + ApplyTask::CheckCompact { + region_id: self.region_id(), + voter_replicated_index, + voter_replicated_term, + }, + ) + } + } + fn on_extra_message(&mut self, mut msg: RaftMessage) { match msg.get_extra_msg().get_type() { ExtraMessageType::MsgRegionWakeUp | ExtraMessageType::MsgCheckStalePeer => { @@ -2716,6 +2782,12 @@ where ExtraMessageType::MsgAvailabilityResponse => { self.on_availability_response(msg.get_from_peer(), msg.get_extra_msg()); } + ExtraMessageType::MsgVoterReplicatedIndexRequest => { + self.on_voter_replicated_index_request(msg.get_from_peer()); + } + ExtraMessageType::MsgVoterReplicatedIndexResponse => { + self.on_voter_replicated_index_response(msg.get_extra_msg()); + } } } @@ -3871,6 +3943,9 @@ where self.fsm.peer.schedule_raftlog_gc(self.ctx, compact_to); self.fsm.peer.last_compacted_idx = compact_to; self.fsm.peer.mut_store().on_compact_raftlog(compact_to); + if self.fsm.peer.is_witness() { + self.fsm.peer.last_compacted_time = Instant::now(); + } } fn on_ready_split_region( @@ -4897,6 +4972,10 @@ where ExecResult::IngestSst { ssts } => self.on_ingest_sst_result(ssts), ExecResult::TransferLeader { term } => self.on_transfer_leader(term), ExecResult::SetFlashbackState { region } => self.on_set_flashback_state(region), + ExecResult::PendingCompactCmd => { + self.fsm.peer.has_pending_compact_cmd = true; + self.register_pull_voter_replicated_index_tick(); + } } } @@ -5315,8 +5394,13 @@ where let first_idx = self.fsm.peer.get_store().first_index(); let last_idx = self.fsm.peer.get_store().last_index(); + let mut voter_replicated_idx = last_idx; let (mut replicated_idx, mut alive_cache_idx) = (last_idx, last_idx); for (peer_id, p) in self.fsm.peer.raft_group.raft.prs().iter() { + let peer = find_peer_by_id(self.region(), *peer_id).unwrap(); + if !is_learner(peer) && voter_replicated_idx > p.matched { + voter_replicated_idx = p.matched; + } if replicated_idx > p.matched { replicated_idx = 
p.matched; } @@ -5405,7 +5489,8 @@ where let region_id = self.fsm.peer.region().get_id(); let peer = self.fsm.peer.peer.clone(); let term = self.fsm.peer.get_index_term(compact_idx); - let request = new_compact_log_request(region_id, peer, compact_idx, term); + let request = + new_compact_log_request(region_id, peer, compact_idx, term, voter_replicated_idx); self.propose_raft_command_internal( request, Callback::None, @@ -5444,6 +5529,27 @@ where self.register_check_long_uncommitted_tick(); } + fn on_request_voter_replicated_index(&mut self) { + if !self.fsm.peer.is_witness() || !self.fsm.peer.has_pending_compact_cmd { + return; + } + // TODO: make it configurable + if self.fsm.peer.last_compacted_time.elapsed() + > self.ctx.cfg.raft_log_gc_tick_interval.0 * 2 + { + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgVoterReplicatedIndexRequest); + let leader_id = self.fsm.peer.leader_id(); + let leader = self.fsm.peer.get_peer_from_cache(leader_id); + if let Some(leader) = leader { + self.fsm + .peer + .send_extra_message(msg, &mut self.ctx.trans, &leader); + } + } + self.register_pull_voter_replicated_index_tick(); + } + fn register_check_leader_lease_tick(&mut self) { self.schedule_tick(PeerTick::CheckLeaderLease) } @@ -5979,6 +6085,10 @@ where } } + fn register_pull_voter_replicated_index_tick(&mut self) { + self.schedule_tick(PeerTick::RequestVoterReplicatedIndex); + } + fn on_check_peer_stale_state_tick(&mut self) { if self.fsm.peer.pending_remove { return; @@ -6421,6 +6531,7 @@ fn new_compact_log_request( peer: metapb::Peer, compact_index: u64, compact_term: u64, + voter_replicated_index: u64, ) -> RaftCmdRequest { let mut request = new_admin_request(region_id, peer); @@ -6428,6 +6539,9 @@ fn new_compact_log_request( admin.set_cmd_type(AdminCmdType::CompactLog); admin.mut_compact_log().set_compact_index(compact_index); admin.mut_compact_log().set_compact_term(compact_term); + admin + .mut_compact_log() + 
.set_voter_replicated_index(voter_replicated_index); request.set_admin_request(admin); request } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 54bb7d0cc0b..ceb8858046d 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -594,6 +594,9 @@ where self.cfg.check_long_uncommitted_interval.0; self.tick_batch[PeerTick::CheckPeersAvailability as usize].wait_duration = self.cfg.check_peers_availability_interval.0; + // TODO: make it reasonable + self.tick_batch[PeerTick::RequestVoterReplicatedIndex as usize].wait_duration = + self.cfg.raft_log_gc_tick_interval.0 * 2; } } diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index a4c6c435741..08b0e9367dc 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -375,6 +375,7 @@ pub enum PeerTick { ReportBuckets = 9, CheckLongUncommitted = 10, CheckPeersAvailability = 11, + RequestVoterReplicatedIndex = 12, } impl PeerTick { @@ -395,6 +396,7 @@ impl PeerTick { PeerTick::ReportBuckets => "report_buckets", PeerTick::CheckLongUncommitted => "check_long_uncommitted", PeerTick::CheckPeersAvailability => "check_peers_availability", + PeerTick::RequestVoterReplicatedIndex => "request_voter_replicated_index", } } @@ -412,6 +414,7 @@ impl PeerTick { PeerTick::ReportBuckets, PeerTick::CheckLongUncommitted, PeerTick::CheckPeersAvailability, + PeerTick::RequestVoterReplicatedIndex, ]; TICKS } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index a72bb59d8bf..7752a0a1b0e 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -939,6 +939,15 @@ where /// The index of last compacted raft log. It is used for the next compact /// log task. 
pub last_compacted_idx: u64, + /// Record the time of the last raft log compact, the witness should query + /// the leader periodically whether `voter_replicated_index` is updated + /// if CompactLog admin command isn't triggered for a while. + pub last_compacted_time: Instant, + /// When the peer is witness, and there is any voter lagging behind, the + /// log truncation of the witness shouldn't be triggered even if it's + /// force mode, and this item will be set to `true`, after all pending + /// compact cmds have been handled, it will be set to `false`. + pub has_pending_compact_cmd: bool, /// The index of the latest urgent proposal index. last_urgent_proposal_idx: u64, /// The index of the latest committed split command. @@ -1083,6 +1092,10 @@ where let logger = slog_global::get_global().new(slog::o!("region_id" => region.get_id())); let raft_group = RawNode::new(&raft_cfg, ps, &logger)?; + // In order to avoid excessive log accumulation due to the loss of pending + // compaction cmds after the witness is restarted, it will actively pull + // voter_request_index once at start. 
+ let has_pending_compact_cmd = peer.is_witness; let mut peer = Peer { peer, @@ -1118,6 +1131,8 @@ where tag: tag.clone(), last_applying_idx: applied_index, last_compacted_idx: 0, + last_compacted_time: Instant::now(), + has_pending_compact_cmd, last_urgent_proposal_idx: u64::MAX, last_committed_split_idx: 0, last_sent_snapshot_idx: 0, diff --git a/tests/failpoints/cases/test_witness.rs b/tests/failpoints/cases/test_witness.rs index cee75ff44b9..98a845b7016 100644 --- a/tests/failpoints/cases/test_witness.rs +++ b/tests/failpoints/cases/test_witness.rs @@ -2,11 +2,12 @@ use std::{iter::FromIterator, sync::Arc, time::Duration}; +use collections::HashMap; use futures::executor::block_on; -use kvproto::metapb; +use kvproto::{metapb, raft_serverpb::RaftApplyState}; use pd_client::PdClient; use test_raftstore::*; -use tikv_util::store::find_peer; +use tikv_util::{config::ReadableDuration, store::find_peer}; fn become_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { peer.set_role(metapb::PeerRole::Learner); @@ -69,3 +70,165 @@ fn test_witness_update_region_in_local_reader() { fail::remove("change_peer_after_update_region_store_3"); } + +// Test the case witness pull voter_replicated_index when has pending compact +// cmd. 
+#[test] +fn test_witness_raftlog_gc_pull_voter_replicated_index() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(50); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k0", b"v0"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + become_witness(&cluster, region.get_id(), &mut peer_on_store3); + + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(200)); + let mut before_states = HashMap::default(); + for (&id, engines) in &cluster.engines { + let mut state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + before_states.insert(id, state.take_truncated_state()); + } + + // one follower is down + cluster.stop_node(nodes[1]); + + // write some data to make log gap exceeds the gc limit + for i in 1..1000 { + let (k, v) = (format!("k{}", i), format!("v{}", i)); + let key = k.as_bytes(); + let value = v.as_bytes(); + cluster.must_put(key, value); + } + + // the witness truncated index is not advanced + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + if id == 2 { + assert_eq!( + state.get_truncated_state().get_index() - before_states[&id].get_index(), + 0 + ); + } else { + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } + } + + fail::cfg("on_raft_gc_log_tick", "return").unwrap(); + + // the follower is back online + 
cluster.run_node(nodes[1]).unwrap(); + cluster.must_put(b"k00", b"v00"); + must_get_equal(&cluster.get_engine(nodes[1]), b"k00", b"v00"); + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(300)); + + // the truncated index is advanced now, as all the peers has replicated + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } + fail::remove("on_raft_gc_log_tick"); +} + +// Test the case witness gc raftlog after reboot. +#[test] +fn test_witness_raftlog_gc_after_reboot() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); + cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(50); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k0", b"v0"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + become_witness(&cluster, region.get_id(), &mut peer_on_store3); + + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(200)); + let mut before_states = HashMap::default(); + for (&id, engines) in &cluster.engines { + let mut state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + before_states.insert(id, state.take_truncated_state()); + } + + // one follower is down + cluster.stop_node(nodes[1]); + + // write some data to make log gap exceeds the gc limit + for i in 1..1000 { + let (k, v) = (format!("k{}", i), 
format!("v{}", i)); + let key = k.as_bytes(); + let value = v.as_bytes(); + cluster.must_put(key, value); + } + + // the witness truncated index is not advanced + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + if id == 2 { + assert_eq!( + state.get_truncated_state().get_index() - before_states[&id].get_index(), + 0 + ); + } else { + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } + } + + fail::cfg("on_raft_gc_log_tick", "return").unwrap(); + + // the follower is back online + cluster.run_node(nodes[1]).unwrap(); + cluster.must_put(b"k00", b"v00"); + must_get_equal(&cluster.get_engine(nodes[1]), b"k00", b"v00"); + + // the witness is down + cluster.stop_node(nodes[2]); + std::thread::sleep(Duration::from_millis(100)); + // the witness is back online + cluster.run_node(nodes[2]).unwrap(); + + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(300)); + + // the truncated index is advanced now, as all the peers has replicated + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } + fail::remove("on_raft_gc_log_tick"); +} diff --git a/tests/integrations/raftstore/test_witness.rs b/tests/integrations/raftstore/test_witness.rs index a2518cc64ae..301a743588e 100644 --- a/tests/integrations/raftstore/test_witness.rs +++ b/tests/integrations/raftstore/test_witness.rs @@ -2,8 +2,13 @@ use std::{iter::FromIterator, sync::Arc, time::Duration}; +use collections::HashMap; use futures::executor::block_on; -use kvproto::{metapb, raft_cmdpb::ChangePeerRequest, raft_serverpb::PeerState}; +use kvproto::{ + metapb, + raft_cmdpb::ChangePeerRequest, + raft_serverpb::{PeerState, RaftApplyState}, +}; use pd_client::PdClient; use 
raft::eraftpb::ConfChangeType; use test_raftstore::*; @@ -296,127 +301,134 @@ fn test_witness_conf_change() { // } // } -// TODO: add back when raft log gc logic is updated for witness -// // Test the case that truncated index won't advance when there is a witness -// even // if the gap gap exceeds the gc count limit -// #[test] -// fn test_witness_raftlog_gc_lagged_follower() { -// let mut cluster = new_server_cluster(0, 3); -// cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); -// cluster.run(); -// let nodes = Vec::from_iter(cluster.get_node_ids()); -// assert_eq!(nodes.len(), 3); +// Test the case that truncated index won't advance when there is a witness even +// if the gap gap exceeds the gc count limit +#[test] +fn test_witness_raftlog_gc_lagged_follower() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); -// let pd_client = Arc::clone(&cluster.pd_client); -// pd_client.disable_default_operator(); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); -// cluster.must_put(b"k0", b"v0"); + cluster.must_put(b"k0", b"v0"); -// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); -// let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); -// cluster.must_transfer_leader(region.get_id(), peer_on_store1); -// // nonwitness -> witness -// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); -// become_witness(&cluster, region.get_id(), &mut peer_on_store3); + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + become_witness(&cluster, region.get_id(), &mut 
peer_on_store3); -// // make sure raft log gc is triggered -// std::thread::sleep(Duration::from_millis(200)); -// let mut before_states = HashMap::default(); -// for (&id, engines) in &cluster.engines { -// let mut state: RaftApplyState = get_raft_msg_or_default(engines, -// &keys::apply_state_key(1)); before_states.insert(id, -// state.take_truncated_state()); } + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(200)); + let mut before_states = HashMap::default(); + for (&id, engines) in &cluster.engines { + let mut state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + before_states.insert(id, state.take_truncated_state()); + } -// // one follower is down -// cluster.stop_node(nodes[1]); + // one follower is down + cluster.stop_node(nodes[1]); -// // write some data to make log gap exceeds the gc limit -// for i in 1..1000 { -// let (k, v) = (format!("k{}", i), format!("v{}", i)); -// let key = k.as_bytes(); -// let value = v.as_bytes(); -// cluster.must_put(key, value); -// } + // write some data to make log gap exceeds the gc limit + for i in 1..1000 { + let (k, v) = (format!("k{}", i), format!("v{}", i)); + let key = k.as_bytes(); + let value = v.as_bytes(); + cluster.must_put(key, value); + } -// // the truncated index is not advanced -// for (&id, engines) in &cluster.engines { -// let state: RaftApplyState = get_raft_msg_or_default(engines, -// &keys::apply_state_key(1)); assert!(state.get_truncated_state(). 
-// get_index() - before_states[&id].get_index() < 10); } - -// // the follower is back online -// cluster.run_node(nodes[1]).unwrap(); -// cluster.must_put(b"k00", b"v00"); -// must_get_equal(&cluster.get_engine(nodes[1]), b"k00", b"v00"); -// // make sure raft log gc is triggered -// std::thread::sleep(Duration::from_millis(300)); - -// // the truncated index is advanced now, as all the peers has replicated -// for (&id, engines) in &cluster.engines { -// let state: RaftApplyState = get_raft_msg_or_default(engines, -// &keys::apply_state_key(1)); assert_ge!( -// state.get_truncated_state().get_index() - -// before_states[&id].get_index(), 900 -// ); -// } -// } + // the witness truncated index is not advanced + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + if id == 2 { + assert_eq!( + state.get_truncated_state().get_index() - before_states[&id].get_index(), + 0 + ); + } else { + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } + } -// TODO: add back when raft log gc logic is updated for witness -// // Test the case that truncated index is advance when there is a lagged -// witness #[test] -// fn test_witness_raftlog_gc_lagged_witness() { -// let mut cluster = new_server_cluster(0, 3); -// cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); -// cluster.run(); -// let nodes = Vec::from_iter(cluster.get_node_ids()); -// assert_eq!(nodes.len(), 3); + // the follower is back online + cluster.run_node(nodes[1]).unwrap(); + cluster.must_put(b"k00", b"v00"); + must_get_equal(&cluster.get_engine(nodes[1]), b"k00", b"v00"); + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(300)); + + // the truncated index is advanced now, as all the peers has replicated + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + 
assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } +} -// let pd_client = Arc::clone(&cluster.pd_client); -// pd_client.disable_default_operator(); +// Test the case that truncated index is advance when there is a lagged witness +#[test] +fn test_witness_raftlog_gc_lagged_witness() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); -// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); -// let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); -// cluster.must_transfer_leader(region.get_id(), peer_on_store1); -// // nonwitness -> witness -// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); -// become_witness(&cluster, region.get_id(), &mut peer_on_store3); -// cluster.must_put(b"k0", b"v0"); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); -// // make sure raft log gc is triggered -// std::thread::sleep(Duration::from_millis(200)); -// let mut before_states = HashMap::default(); -// for (&id, engines) in &cluster.engines { -// let mut state: RaftApplyState = get_raft_msg_or_default(engines, -// &keys::apply_state_key(1)); before_states.insert(id, -// state.take_truncated_state()); } + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + become_witness(&cluster, region.get_id(), &mut peer_on_store3); + cluster.must_put(b"k0", b"v0"); -// // the witness is down -// cluster.stop_node(nodes[2]); + // make sure raft log gc is triggered + std::thread::sleep(Duration::from_millis(200)); + let mut before_states = 
HashMap::default(); + for (&id, engines) in &cluster.engines { + let mut state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + before_states.insert(id, state.take_truncated_state()); + } -// // write some data to make log gap exceeds the gc limit -// for i in 1..1000 { -// let (k, v) = (format!("k{}", i), format!("v{}", i)); -// let key = k.as_bytes(); -// let value = v.as_bytes(); -// cluster.must_put(key, value); -// } + // the witness is down + cluster.stop_node(nodes[2]); -// // the witness is back online -// cluster.run_node(nodes[2]).unwrap(); + // write some data to make log gap exceeds the gc limit + for i in 1..1000 { + let (k, v) = (format!("k{}", i), format!("v{}", i)); + let key = k.as_bytes(); + let value = v.as_bytes(); + cluster.must_put(key, value); + } -// cluster.must_put(b"k00", b"v00"); -// std::thread::sleep(Duration::from_millis(200)); + // the witness is back online + cluster.run_node(nodes[2]).unwrap(); -// // the truncated index is advanced -// for (&id, engines) in &cluster.engines { -// let state: RaftApplyState = get_raft_msg_or_default(engines, -// &keys::apply_state_key(1)); println!("{} {}", id, -// state.get_truncated_state().get_index()); assert_ge!( -// state.get_truncated_state().get_index() - -// before_states[&id].get_index(), 900 -// ); -// } -// } + cluster.must_put(b"k00", b"v00"); + std::thread::sleep(Duration::from_millis(200)); + + // the truncated index is advanced + for (&id, engines) in &cluster.engines { + let state: RaftApplyState = get_raft_msg_or_default(engines, &keys::apply_state_key(1)); + assert_ne!( + 900, + state.get_truncated_state().get_index() - before_states[&id].get_index() + ); + } +} // Test the case replica read can't be performed on witness peer. 
#[test] From 177efafee39a7f1cf7cbc6330d834cdbbe42a657 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Tue, 27 Dec 2022 23:52:16 -0800 Subject: [PATCH 054/115] raftstore-v2: a few panic fix (#13996) ref tikv/tikv#12842 a few panic fix 1) update_approximate_raft_log_size may run into divid by zero error 2) appy_delete may have None write_batch 3) StoreMeta::set_region may run into region corruption error if it's destroyed and re-created. 4) TabletSnapManager's snapshot size calculation may throw Other error. Signed-off-by: qi.xu Signed-off-by: Jay Lee Co-authored-by: qi.xu Co-authored-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/batch/store.rs | 1 + components/raftstore-v2/src/fsm/store.rs | 14 ++++++++++++++ .../raftstore-v2/src/operation/command/mod.rs | 5 +++++ .../src/operation/command/write/mod.rs | 1 + components/raftstore-v2/src/operation/life.rs | 6 ++++++ components/raftstore-v2/src/worker/pd/mod.rs | 8 +++++++- .../raftstore-v2/src/worker/pd/store_heartbeat.rs | 7 +++++-- 7 files changed, 39 insertions(+), 3 deletions(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index a3800085522..800dbc98f91 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -535,6 +535,7 @@ impl StoreSystem { causal_ts_provider, self.logger.clone(), self.shutdown.clone(), + cfg.clone(), )); let split_check_scheduler = workers.background.start( diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index cb7aa99b179..f107715a535 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -80,6 +80,20 @@ impl StoreMeta { ); } } + + pub fn remove_region(&mut self, region_id: u64) { + let prev = self.regions.remove(®ion_id); + if let Some((prev, initialized)) = prev { + if initialized { + let key = ( + data_end_key(prev.get_end_key()), + 
prev.get_region_epoch().get_version(), + ); + let prev_id = self.region_ranges.remove(&key); + assert_eq!(prev_id, Some(prev.get_id())); + } + } + } } impl StoreRegionMeta for StoreMeta { diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 35b4ec1918e..0a58bb64016 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -245,6 +245,11 @@ impl Peer { let apply = CommittedEntries { entry_and_proposals, }; + assert!( + self.apply_scheduler().is_some(), + "apply_scheduler should be something. region_id {}", + self.region_id() + ); self.apply_scheduler() .unwrap() .send(ApplyTask::CommittedEntries(apply)); diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index ad6e537b956..af806e3024e 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -177,6 +177,7 @@ impl Apply { } util::check_key_in_region(key, self.region_state().get_region())?; keys::data_key_with_buffer(key, &mut self.key_buffer); + self.ensure_write_buffer(); let res = if cf.is_empty() || cf == CF_DEFAULT { // TODO: use write_vector self.write_batch.as_mut().unwrap().delete(&self.key_buffer) diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index ea42832eaea..0f2e72061ef 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -331,6 +331,12 @@ impl Peer { pub fn finish_destroy(&mut self, ctx: &mut StoreContext) { info!(self.logger, "peer destroyed"); ctx.router.close(self.region_id()); + { + ctx.store_meta + .lock() + .unwrap() + .remove_region(self.region_id()); + } if let Some(msg) = self.destroy_progress_mut().finish() { // The message will be dispatched to store fsm, which will create 
a // new peer. Ignore error as it's just a best effort. diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index cc977e68236..bfcf3389754 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -11,9 +11,12 @@ use concurrency_manager::ConcurrencyManager; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{metapb, pdpb}; use pd_client::PdClient; -use raftstore::store::{util::KeysInfoFormatter, FlowStatsReporter, ReadStats, TxnExt, WriteStats}; +use raftstore::store::{ + util::KeysInfoFormatter, Config, FlowStatsReporter, ReadStats, TxnExt, WriteStats, +}; use slog::{error, info, Logger}; use tikv_util::{ + config::VersionTrack, time::UnixSecs, worker::{Runnable, Scheduler}, }; @@ -122,6 +125,7 @@ where logger: Logger, shutdown: Arc, + cfg: Arc>, } impl Runner @@ -141,6 +145,7 @@ where causal_ts_provider: Option>, // used for rawkv apiv2 logger: Logger, shutdown: Arc, + cfg: Arc>, ) -> Self { Self { store_id, @@ -158,6 +163,7 @@ where causal_ts_provider, logger, shutdown, + cfg, } } } diff --git a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs index 2fbe378cff8..22bee3cbf26 100644 --- a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs +++ b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs @@ -272,8 +272,11 @@ where Ok(stats) => stats, }; let disk_cap = disk_stats.total_space(); - // TODO: custom capacity. - let capacity = disk_cap; + let capacity = if self.cfg.value().capacity.0 == 0 { + disk_cap + } else { + std::cmp::min(disk_cap, self.cfg.value().capacity.0) + }; // TODO: accurate snapshot size and kv engines size. 
let snap_size = 0; let kv_size = 0; From 3253bfc37a0a646163f895ae1b7de59064074227 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Thu, 29 Dec 2022 19:10:02 +0800 Subject: [PATCH 055/115] Fix mem leak in mock for RawCppPtrArr(Tuple) (#245) --- engine_store_ffi/src/lib.rs | 2 +- engine_store_ffi/src/observer.rs | 19 ++++++++++++++++--- engine_store_ffi/src/ps_engine.rs | 2 +- engine_tiflash/src/engine.rs | 1 - engine_tiflash/src/lib.rs | 2 -- new-mock-engine-store/src/mock_store.rs | 16 +++++++++++++--- 6 files changed, 31 insertions(+), 11 deletions(-) diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index 72e090776bc..4cd8c58932c 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -419,7 +419,7 @@ impl Drop for RawCppPtrTuple { impl RawCppPtrArr { pub fn is_null(&self) -> bool { - unsafe { self.inner.is_null() } + self.inner.is_null() } } diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 6b7f5d45b8c..c7677230699 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -157,7 +157,8 @@ impl engine_tiflash::FFIHubInner for TiFlashFFIHub { f( &value.key_view.to_slice().to_vec(), &value.page_view.to_slice().to_vec(), - ); + ) + .unwrap(); } } } @@ -547,8 +548,20 @@ impl TiFlashObserver { let region_str = res.region.view.to_slice(); let mut apply_state = RaftApplyState::default(); let mut new_region = kvproto::metapb::Region::default(); - apply_state.merge_from_bytes(apply_state_str).unwrap(); - new_region.merge_from_bytes(region_str).unwrap(); + if let Err(e) = apply_state.merge_from_bytes(apply_state_str) { + error!( + "fast path: ongoing {}:{} {} failed. parse apply_state {:?}, fallback to normal", + self.store_id, region_id, new_peer_id, res + ); + self.fallback_to_slow_path(region_id); + } + if let Err(e) = new_region.merge_from_bytes(region_str) { + error!( + "fast path: ongoing {}:{} {} failed. 
parse region {:?}, fallback to normal", + self.store_id, region_id, new_peer_id, res + ); + self.fallback_to_slow_path(region_id); + } // Validate // check if the source already knows the know peer diff --git a/engine_store_ffi/src/ps_engine.rs b/engine_store_ffi/src/ps_engine.rs index d0ef2d97082..386d5e88d6a 100644 --- a/engine_store_ffi/src/ps_engine.rs +++ b/engine_store_ffi/src/ps_engine.rs @@ -20,7 +20,7 @@ use kvproto::{ }; use protobuf::Message; use raft::eraftpb::Entry; -use tikv_util::{box_err, box_try, info}; +use tikv_util::{box_try, info}; use tracker::TrackerToken; use crate::{gen_engine_store_server_helper, RawCppPtr}; diff --git a/engine_tiflash/src/engine.rs b/engine_tiflash/src/engine.rs index ee6d9d68e86..dfe590af323 100644 --- a/engine_tiflash/src/engine.rs +++ b/engine_tiflash/src/engine.rs @@ -19,7 +19,6 @@ use engine_traits::{ ReadOptions, Result, SyncMutable, }; use rocksdb::{Writable, DB}; -use tikv_util::box_err; use crate::{r2e, util::get_cf_handle}; diff --git a/engine_tiflash/src/lib.rs b/engine_tiflash/src/lib.rs index 23c6014cdb6..18e73b1ee13 100644 --- a/engine_tiflash/src/lib.rs +++ b/engine_tiflash/src/lib.rs @@ -15,9 +15,7 @@ //! //! Please read the engine_trait crate docs before hacking. 
#![allow(dead_code)] -#![feature(backtrace)] #![cfg_attr(test, feature(test))] -#![feature(generic_associated_types)] #[allow(unused_extern_crates)] extern crate tikv_alloc; diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index d7c56fb4ad0..790eafc1074 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -789,7 +789,11 @@ impl From for ffi_interfaces::RawCppPtrType { impl From for RawCppPtrTypeImpl { fn from(value: ffi_interfaces::RawCppPtrType) -> Self { - RawCppPtrTypeImpl::from_int(value).unwrap() + if let Ok(s) = RawCppPtrTypeImpl::from_int(value) { + s + } else { + panic!("unknown RawCppPtrType {:?}", value); + } } } @@ -952,11 +956,17 @@ extern "C" fn ffi_gc_special_raw_cpp_ptr( match tp { ffi_interfaces::SpecialCppPtrType::None => (), ffi_interfaces::SpecialCppPtrType::TupleOfRawCppPtr => unsafe { - let p = std::slice::from_raw_parts_mut(ptr as *mut RawCppPtr, hint_len as usize); + let p = Box::from_raw(std::slice::from_raw_parts_mut( + ptr as *mut RawCppPtr, + hint_len as usize, + )); drop(p); }, ffi_interfaces::SpecialCppPtrType::ArrayOfRawCppPtr => unsafe { - let p = std::slice::from_raw_parts_mut(ptr as *mut RawCppPtr, hint_len as usize); + let p = Box::from_raw(std::slice::from_raw_parts_mut( + ptr as *mut RawVoidPtr, + hint_len as usize, + )); drop(p); }, } From 06bfaa42a120d1c2cefa5515810a699b3abd458b Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 30 Dec 2022 10:40:19 +0800 Subject: [PATCH 056/115] raftstore-v2: avoid ticking when there are many unapplied logs (#13995) ref tikv/tikv#12842 Whenever timeout, the peer will check for unapplied logs whether there are pending conf change and trigger heavy reads. So we wait till most logs are applied before ticking. 
It also fix following issues: - PersistenceListener is not installed - implementation of persisted_apply_index is wrong - parse tablet name is wrong Signed-off-by: Jay Lee --- components/engine_rocks/src/event_listener.rs | 2 +- components/engine_traits/src/flush.rs | 8 +- components/engine_traits/src/tablet.rs | 23 +++++- components/raftstore-v2/src/fsm/apply.rs | 3 + components/raftstore-v2/src/fsm/peer.rs | 19 ++--- .../operation/command/admin/compact_log.rs | 42 +++++----- .../operation/command/admin/conf_change.rs | 2 +- .../src/operation/command/admin/mod.rs | 10 ++- .../src/operation/command/admin/split.rs | 21 +++-- .../src/operation/command/control.rs | 5 ++ .../raftstore-v2/src/operation/command/mod.rs | 28 ++++++- .../src/operation/ready/apply_trace.rs | 55 ++++++++++--- .../raftstore-v2/src/operation/ready/mod.rs | 78 ++++++++++++++---- .../src/operation/ready/snapshot.rs | 51 +++++++++--- components/raftstore-v2/src/raft/apply.rs | 12 ++- components/raftstore-v2/src/raft/peer.rs | 44 ++++++++--- components/raftstore-v2/src/raft/storage.rs | 21 ++--- .../src/router/internal_message.rs | 1 + .../integrations/test_transfer_leader.rs | 63 ++++++++------- components/raftstore/src/store/metrics.rs | 1 + components/raftstore/src/store/snap.rs | 7 +- components/server/src/server2.rs | 79 ++++++++++--------- src/config/mod.rs | 3 + src/server/raftkv2/node.rs | 34 ++++---- 24 files changed, 417 insertions(+), 195 deletions(-) diff --git a/components/engine_rocks/src/event_listener.rs b/components/engine_rocks/src/event_listener.rs index b940fcb39f3..3bbf03cb77f 100644 --- a/components/engine_rocks/src/event_listener.rs +++ b/components/engine_rocks/src/event_listener.rs @@ -261,7 +261,7 @@ mod tests { let (region_id, tablet_index) = (2, 3); let storage = Arc::new(MemStorage::default()); - let state = Arc::new(FlushState::default()); + let state = Arc::new(FlushState::new(0)); let listener = PersistenceListener::new(region_id, tablet_index, state.clone(), 
storage.clone()); let mut db_opt = RocksDbOptions::default(); diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index cfed95f0426..b3a827c234e 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -50,12 +50,18 @@ impl FlushProgress { /// raftstore will update state changes and corresponding apply index, when /// flush, `PersistenceListener` will query states related to the memtable /// and persist the relation to raft engine. -#[derive(Default, Debug)] +#[derive(Debug)] pub struct FlushState { applied_index: AtomicU64, } impl FlushState { + pub fn new(applied_index: u64) -> Self { + Self { + applied_index: AtomicU64::new(applied_index), + } + } + /// Set the latest applied index. #[inline] pub fn set_applied_index(&self, index: u64) { diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs index edc0bd99870..f552fbc01aa 100644 --- a/components/engine_traits/src/tablet.rs +++ b/components/engine_traits/src/tablet.rs @@ -222,10 +222,20 @@ impl TabletRegistry { }) } + /// Format the name as {prefix}_{id}_{suffix}. If prefix is empty, it will + /// be format as {id}_{suffix}. 
pub fn tablet_name(&self, prefix: &str, id: u64, suffix: u64) -> String { - format!("{}{}_{}", prefix, id, suffix) + format!( + "{}{:_(&self, path: &'a Path) -> Option<(&'a str, u64, u64)> { let name = path.file_name().unwrap().to_str().unwrap(); let mut parts = name.rsplit('_'); @@ -463,10 +473,19 @@ mod tests { }); assert_eq!(count, 1); - let name = registry.tablet_name("prefix_", 12, 30); + let name = registry.tablet_name("prefix", 12, 30); assert_eq!(name, "prefix_12_30"); let normal_name = registry.tablet_name("", 20, 15); let normal_tablet_path = registry.tablet_path(20, 15); assert_eq!(registry.tablet_root().join(normal_name), normal_tablet_path); + + let full_prefix_path = registry.tablet_root().join(name); + let res = registry.parse_tablet_name(&full_prefix_path); + assert_eq!(res, Some(("prefix", 12, 30))); + let res = registry.parse_tablet_name(&normal_tablet_path); + assert_eq!(res, Some(("", 20, 15))); + let invalid_path = registry.tablet_root().join("invalid_12"); + let res = registry.parse_tablet_name(&invalid_path); + assert_eq!(res, None); } } diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index c0eabd2120e..b81d31329cb 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -65,6 +65,7 @@ impl ApplyFsm { read_scheduler: Scheduler>, flush_state: Arc, log_recovery: Option>, + applied_term: u64, logger: Logger, ) -> (ApplyScheduler, Self) { let (tx, rx) = future::unbounded(WakePolicy::Immediately); @@ -76,6 +77,7 @@ impl ApplyFsm { read_scheduler, flush_state, log_recovery, + applied_term, logger, ); ( @@ -114,6 +116,7 @@ impl ApplyFsm { ApplyTask::CommittedEntries(ce) => self.apply.apply_committed_entries(ce).await, ApplyTask::Snapshot(snap_task) => self.apply.schedule_gen_snapshot(snap_task), ApplyTask::UnsafeWrite(raw_write) => self.apply.apply_unsafe_write(raw_write), + ApplyTask::ManualFlush => self.apply.on_manual_flush(), } // TODO: yield after 
some time. diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 49f1efcb760..8b05435246b 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -43,7 +43,11 @@ impl PeerFsm { storage: Storage, ) -> Result> { let peer = Peer::new(cfg, tablet_registry, snap_mgr, storage)?; - info!(peer.logger, "create peer"); + info!(peer.logger, "create peer"; + "raft_state" => ?peer.storage().raft_state(), + "apply_state" => ?peer.storage().apply_state(), + "region_state" => ?peer.storage().region_state() + ); let (tx, rx) = mpsc::loose_bounded(cfg.notify_capacity); let fsm = Box::new(PeerFsm { peer, @@ -187,20 +191,17 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } fn on_start(&mut self) { - self.schedule_tick(PeerTick::Raft); + if !self.fsm.peer.maybe_pause_for_recovery() { + self.schedule_tick(PeerTick::Raft); + } self.schedule_tick(PeerTick::SplitRegionCheck); self.schedule_tick(PeerTick::PdHeartbeat); self.schedule_tick(PeerTick::CompactLog); if self.fsm.peer.storage().is_initialized() { self.fsm.peer.schedule_apply_fsm(self.store_ctx); } - // Unlike v1, it's a must to set ready when there are pending entries. Otherwise - // it may block for ever when there is unapplied conf change. - let entry_storage = self.fsm.peer.storage().entry_storage(); - if entry_storage.commit_index() > entry_storage.applied_index() - // Speed up setup if there is only one peer. - || self.fsm.peer.is_leader() - { + // Speed up setup if there is only one peer. 
+ if self.fsm.peer.is_leader() { self.fsm.peer.set_has_ready(); } } diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index d1d10d366bf..c36c7353871 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -21,7 +21,7 @@ use raftstore::{ Result, }; use slog::{debug, error, info}; -use tikv_util::{box_err, Either}; +use tikv_util::box_err; use crate::{ batch::StoreContext, @@ -255,7 +255,15 @@ impl Peer { .unwrap(); self.set_has_extra_write(); - self.maybe_compact_log_from_engine(store_ctx, Either::Right(old_truncated)); + // All logs < perssited_apply will be deleted, so should check with +1. + if old_truncated + 1 < self.storage().apply_trace().persisted_apply_index() { + self.compact_log_from_engine(store_ctx); + } + + let applied = *self.last_applying_index_mut(); + let total_cnt = applied - old_truncated; + let remain_cnt = applied - res.compact_index; + self.update_approximate_raft_log_size(|s| s * remain_cnt / total_cnt); } #[inline] @@ -278,7 +286,9 @@ impl Peer { } else { self.set_has_extra_write(); } - self.maybe_compact_log_from_engine(store_ctx, Either::Left(old_persisted)); + if old_persisted < self.entry_storage().truncated_index() + 1 { + self.compact_log_from_engine(store_ctx); + } if self.remove_tombstone_tablets_before(new_persisted) { let sched = store_ctx.schedulers.tablet_gc.clone(); task.persisted_cbs.push(Box::new(move || { @@ -288,19 +298,10 @@ impl Peer { } } - pub fn maybe_compact_log_from_engine( - &mut self, - store_ctx: &mut StoreContext, - old_index: Either, - ) { - let truncated = self.entry_storage().truncated_index(); - let persisted = self.storage().apply_trace().persisted_apply_index(); - match old_index { - Either::Left(old_persisted) if old_persisted >= truncated => return, - Either::Right(old_truncated) if old_truncated >= persisted 
=> return, - _ => {} - } - let compact_index = std::cmp::min(truncated, persisted); + fn compact_log_from_engine(&mut self, store_ctx: &mut StoreContext) { + let truncated = self.entry_storage().truncated_index() + 1; + let persisted_applied = self.storage().apply_trace().persisted_apply_index(); + let compact_index = std::cmp::min(truncated, persisted_applied); // Raft Engine doesn't care about first index. if let Err(e) = store_ctx @@ -309,11 +310,12 @@ impl Peer { { error!(self.logger, "failed to compact raft logs"; "err" => ?e); } else { + // TODO: make this debug when stable. + info!(self.logger, "compact log"; + "index" => compact_index, + "apply_trace" => ?self.storage().apply_trace(), + "truncated" => ?self.entry_storage().apply_state()); self.set_has_extra_write(); - let applied = self.storage().apply_state().get_applied_index(); - let total_cnt = applied - self.storage().entry_storage().first_index() + 1; - let remain_cnt = applied - compact_index; - self.update_approximate_raft_log_size(|s| s * remain_cnt / total_cnt); } } } diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 5a6c91d3567..72b582d775d 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -232,7 +232,7 @@ impl Apply { ) -> Result<(AdminResponse, AdminCmdResult)> { let region = self.region_state().get_region(); let change_kind = ConfChangeKind::confchange_kind(changes.len()); - info!(self.logger, "exec ConfChangeV2"; "kind" => ?change_kind, "legacy" => legacy, "epoch" => ?region.get_region_epoch()); + info!(self.logger, "exec ConfChangeV2"; "kind" => ?change_kind, "legacy" => legacy, "epoch" => ?region.get_region_epoch(), "index" => index); let mut new_region = region.clone(); match change_kind { ConfChangeKind::LeaveJoint => self.apply_leave_joint(&mut new_region), diff --git 
a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 977e26e0675..4f2abb9c65e 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -110,9 +110,13 @@ impl Peer { } }; match &res { - Ok(index) => self - .proposal_control_mut() - .record_proposed_admin(cmd_type, *index), + Ok(index) => { + self.proposal_control_mut() + .record_proposed_admin(cmd_type, *index); + if self.proposal_control_mut().has_uncommitted_admin() { + self.raft_group_mut().skip_bcast_commit(false); + } + } Err(e) => { info!( self.logger, diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index e1f4ae552f6..faf059b3871 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -65,7 +65,7 @@ use crate::{ Error, }; -pub const SPLIT_PREFIX: &str = "split_"; +pub const SPLIT_PREFIX: &str = "split"; #[derive(Debug)] pub struct SplitResult { @@ -171,6 +171,9 @@ impl Peer { pub fn update_split_flow_control(&mut self, metrics: &ApplyMetrics) { let control = self.split_flow_control_mut(); control.size_diff_hint += metrics.size_diff_hint; + if self.is_leader() { + self.add_pending_tick(PeerTick::SplitRegionCheck); + } } pub fn on_request_split( @@ -265,6 +268,7 @@ impl Apply { self.logger, "split region"; "region" => ?region, + "index" => log_index, "boundaries" => %KeysInfoFormatter(boundaries.iter()), ); @@ -449,6 +453,8 @@ impl Peer { // Now pd only uses ReportBatchSplit for history operation show, // so we send it independently here. self.report_batch_split_pd(store_ctx, res.regions.to_vec()); + // After split, the peer may need to update its metrics. 
+ self.split_flow_control_mut().may_skip_split_check = false; self.add_pending_tick(PeerTick::SplitRegionCheck); } @@ -629,7 +635,7 @@ mod test { kv::TestTabletFactory, }; use engine_traits::{ - Peekable, TabletContext, TabletRegistry, WriteBatch, CF_DEFAULT, DATA_CFS, + FlushState, Peekable, TabletContext, TabletRegistry, WriteBatch, CF_DEFAULT, DATA_CFS, }; use kvproto::{ metapb::RegionEpoch, @@ -787,8 +793,9 @@ mod test { reporter, reg, read_scheduler, - Arc::default(), + Arc::new(FlushState::new(5)), None, + 5, logger.clone(), ); @@ -803,7 +810,7 @@ mod test { splits.mut_requests().clear(); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 0).unwrap_err(); + let err = apply.apply_batch_split(&req, 6).unwrap_err(); // Empty requests should be rejected. assert!(err.to_string().contains("missing split requests")); @@ -824,7 +831,7 @@ mod test { .mut_requests() .push(new_split_req(b"", 1, vec![11, 12, 13])); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 0).unwrap_err(); + let err = apply.apply_batch_split(&req, 7).unwrap_err(); // Empty key will not in any region exclusively. assert!(err.to_string().contains("missing split key"), "{:?}", err); @@ -836,7 +843,7 @@ mod test { .mut_requests() .push(new_split_req(b"k1", 1, vec![11, 12, 13])); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 0).unwrap_err(); + let err = apply.apply_batch_split(&req, 8).unwrap_err(); // keys should be in ascend order. assert!( err.to_string().contains("invalid split request"), @@ -852,7 +859,7 @@ mod test { .mut_requests() .push(new_split_req(b"k2", 1, vec![11, 12])); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 0).unwrap_err(); + let err = apply.apply_batch_split(&req, 9).unwrap_err(); // All requests should be checked. 
assert!(err.to_string().contains("id count"), "{:?}", err); diff --git a/components/raftstore-v2/src/operation/command/control.rs b/components/raftstore-v2/src/operation/command/control.rs index b330d0093fe..fd53090fd65 100644 --- a/components/raftstore-v2/src/operation/command/control.rs +++ b/components/raftstore-v2/src/operation/command/control.rs @@ -181,6 +181,11 @@ impl ProposalControl { } } + #[inline] + pub fn has_uncommitted_admin(&self) -> bool { + !self.proposed_admin_cmd.is_empty() && !self.proposed_admin_cmd.back().unwrap().committed + } + pub fn advance_apply(&mut self, index: u64, term: u64, region: &metapb::Region) { while !self.proposed_admin_cmd.is_empty() { let cmd = self.proposed_admin_cmd.front_mut().unwrap(); diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 0a58bb64016..a533ae9af87 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -38,13 +38,14 @@ use raftstore::{ }, Error, Result, }; +use slog::{info, warn}; use tikv_util::{box_err, time::monotonic_raw_now}; use crate::{ batch::StoreContext, fsm::{ApplyFsm, ApplyResReporter}, raft::{Apply, Peer}, - router::{ApplyRes, ApplyTask, CmdResChannel}, + router::{ApplyRes, ApplyTask, CmdResChannel, PeerTick}, }; mod admin; @@ -111,6 +112,7 @@ impl Peer { read_scheduler, self.flush_state().clone(), self.storage().apply_trace().log_recovery(), + self.entry_storage().applied_term(), logger, ); @@ -306,6 +308,22 @@ impl Peer { apply_res.applied_index, progress_to_be_updated, ); + if self.pause_for_recovery() + && self.storage().entry_storage().commit_index() <= apply_res.applied_index + { + info!(self.logger, "recovery completed"; "apply_index" => apply_res.applied_index); + self.set_pause_for_recovery(false); + // Flush to avoid recover again and again. 
+ if let Some(scheduler) = self.apply_scheduler() { + scheduler.send(ApplyTask::ManualFlush); + } + self.add_pending_tick(PeerTick::Raft); + } + if !self.pause_for_recovery() && self.storage_mut().apply_trace_mut().should_flush() { + if let Some(scheduler) = self.apply_scheduler() { + scheduler.send(ApplyTask::ManualFlush); + } + } } } @@ -347,6 +365,13 @@ impl Apply { } } + pub fn on_manual_flush(&mut self) { + self.flush(); + if let Err(e) = self.tablet().flush_cfs(&[], false) { + warn!(self.logger, "failed to flush: {:?}", e); + } + } + #[inline] pub async fn apply_committed_entries(&mut self, ce: CommittedEntries) { fail::fail_point!("APPLY_COMMITTED_ENTRIES"); @@ -512,6 +537,7 @@ impl Apply { #[inline] pub fn flush(&mut self) { + // TODO: maybe we should check whether there is anything to flush. let (index, term) = self.apply_progress(); let flush_state = self.flush_state().clone(); if let Some(wb) = &mut self.write_batch && !wb.is_empty() { diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index d6a83b7933b..1e9d1ef4221 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -40,7 +40,7 @@ use kvproto::{ use raftstore::store::{ ReadTask, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; -use slog::Logger; +use slog::{trace, Logger}; use tikv_util::{box_err, worker::Scheduler}; use crate::{ @@ -130,7 +130,7 @@ impl engine_traits::StateStorage for StateStorage< /// Mapping from data cf to an u64 index. pub type DataTrace = [u64; DATA_CFS_LEN]; -#[derive(Clone, Copy, Default)] +#[derive(Clone, Copy, Default, Debug)] struct Progress { flushed: u64, /// The index of last entry that has modification to the CF. @@ -154,7 +154,7 @@ pub fn cf_offset(cf: &str) -> usize { /// interact with other peers will be traced. 
/// - support query the flushed progress without actually scanning raft engine, /// which is useful for cleaning up stale flush records. -#[derive(Default)] +#[derive(Default, Debug)] pub struct ApplyTrace { /// The modified indexes and flushed index of each data CF. data_cfs: Box<[Progress; DATA_CFS_LEN]>, @@ -168,6 +168,10 @@ pub struct ApplyTrace { admin: Progress, /// Index that is issued to be written. It may not be truely persisted. persisted_applied: u64, + /// Flush will be triggered explicitly when there are too many pending + /// writes. It marks the last index that is flushed to avoid too many + /// flushes. + last_flush_trigger: u64, /// `true` means the raft cf record should be persisted in next ready. try_persist: bool, } @@ -187,6 +191,7 @@ impl ApplyTrace { trace.admin.flushed = i; trace.admin.last_modified = i; trace.persisted_applied = i; + trace.last_flush_trigger = i; let applied_region_state = engine .get_region_state(region_id, trace.admin.flushed)? .unwrap(); @@ -218,7 +223,31 @@ impl ApplyTrace { } pub fn persisted_apply_index(&self) -> u64 { - self.admin.flushed + self.persisted_applied + } + + pub fn should_flush(&mut self) -> bool { + if self.admin.flushed < self.admin.last_modified { + // It's waiting for other peers, flush will not help. + return false; + } + let last_modified = self + .data_cfs + .iter() + .filter_map(|pr| { + if pr.last_modified != pr.flushed { + Some(pr.last_modified) + } else { + None + } + }) + .max(); + if let Some(m) = last_modified && m >= self.admin.flushed + 4096 && m >= self.last_flush_trigger + 4096 { + self.last_flush_trigger = m; + true + } else { + false + } } // All events before `mem_index` must be consumed before calling this function. 
@@ -456,6 +485,7 @@ impl Storage { impl Peer { pub fn on_data_flushed(&mut self, cf: &str, tablet_index: u64, index: u64) { + trace!(self.logger, "data flushed"; "cf" => cf, "tablet_index" => tablet_index, "index" => index, "trace" => ?self.storage().apply_trace()); if tablet_index < self.storage().tablet_index() { // Stale tablet. return; @@ -467,6 +497,7 @@ impl Peer { } pub fn on_data_modified(&mut self, modification: DataTrace) { + trace!(self.logger, "on data modified"; "modification" => ?modification, "trace" => ?self.storage().apply_trace()); let apply_index = self.storage().entry_storage().applied_index(); let apply_trace = self.storage_mut().apply_trace_mut(); for (cf, index) in DATA_CFS.iter().zip(modification) { @@ -556,22 +587,22 @@ mod tests { #[test] fn test_apply_trace() { let mut trace = ApplyTrace::default(); - assert_eq!(0, trace.persisted_apply_index()); + assert_eq!(0, trace.admin.flushed); // If there is no modifications, index should be advanced anyway. trace.maybe_advance_admin_flushed(2); - assert_eq!(2, trace.persisted_apply_index()); + assert_eq!(2, trace.admin.flushed); for cf in DATA_CFS { trace.on_modify(cf, 3); } trace.maybe_advance_admin_flushed(3); // Modification is not flushed. - assert_eq!(2, trace.persisted_apply_index()); + assert_eq!(2, trace.admin.flushed); for cf in DATA_CFS { trace.on_flush(cf, 3); } trace.maybe_advance_admin_flushed(3); // No admin is recorded, index should be advanced. - assert_eq!(3, trace.persisted_apply_index()); + assert_eq!(3, trace.admin.flushed); trace.on_admin_modify(4); for cf in DATA_CFS { trace.on_flush(cf, 4); @@ -581,25 +612,25 @@ mod tests { } trace.maybe_advance_admin_flushed(4); // Unflushed admin modification should hold index. - assert_eq!(3, trace.persisted_apply_index()); + assert_eq!(3, trace.admin.flushed); trace.on_admin_flush(4); trace.maybe_advance_admin_flushed(4); // Admin is flushed, index should be advanced. 
- assert_eq!(4, trace.persisted_apply_index()); + assert_eq!(4, trace.admin.flushed); for cf in DATA_CFS { trace.on_flush(cf, 5); } trace.maybe_advance_admin_flushed(4); // Though all data CFs are flushed, but index should not be // advanced as we don't know whether there is admin modification. - assert_eq!(4, trace.persisted_apply_index()); + assert_eq!(4, trace.admin.flushed); for cf in DATA_CFS { trace.on_modify(cf, 5); } trace.maybe_advance_admin_flushed(5); // Because modify is recorded, so we know there should be no admin // modification and index can be advanced. - assert_eq!(5, trace.persisted_apply_index()); + assert_eq!(5, trace.admin.flushed); } #[test] diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 8b125844d0e..8a0e0770b1f 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -32,7 +32,7 @@ use raftstore::{ coprocessor::{RegionChangeEvent, RoleChange}, store::{needs_evict_entry_cache, util, FetchedLogs, ReadProgress, Transport, WriteTask}, }; -use slog::{debug, error, trace, warn}; +use slog::{debug, error, info, trace, warn}; use tikv_util::{ store::find_peer, time::{duration_to_sec, monotonic_raw_now}, @@ -50,6 +50,8 @@ use crate::{ router::{ApplyTask, PeerMsg, PeerTick}, }; +const PAUSE_FOR_RECOVERY_GAP: u64 = 128; + impl Store { pub fn on_store_unreachable( &mut self, @@ -76,9 +78,33 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } impl Peer { + pub fn maybe_pause_for_recovery(&mut self) -> bool { + let entry_storage = self.storage().entry_storage(); + let committed_index = entry_storage.commit_index(); + let applied_index = entry_storage.applied_index(); + if committed_index > applied_index { + // Unlike v1, it's a must to set ready when there are pending entries. Otherwise + // it may block for ever when there is unapplied conf change. 
+ self.set_has_ready(); + } + if committed_index > applied_index + PAUSE_FOR_RECOVERY_GAP { + // If there are too many the missing logs, we need to skip ticking otherwise + // it may block the raftstore thread for a long time in reading logs for + // election timeout. + info!(self.logger, "pause for recovery"; "applied" => applied_index, "committed" => committed_index); + self.set_pause_for_recovery(true); + true + } else { + false + } + } + #[inline] fn tick(&mut self) -> bool { - self.raft_group_mut().tick() + // When it's handling snapshot, it's pointless to tick as all the side + // affects have to wait till snapshot is applied. On the other hand, ticking + // will bring other corner cases like elections. + !self.is_handling_snapshot() && self.raft_group_mut().tick() } pub fn on_peer_unreachable(&mut self, to_peer_id: u64) { @@ -107,6 +133,10 @@ impl Peer { "from_peer_id" => msg.get_from_peer().get_id(), "to_peer_id" => msg.get_to_peer().get_id(), ); + if self.pause_for_recovery() && msg.get_message().get_msg_type() == MessageType::MsgAppend { + ctx.raft_metrics.message_dropped.recovery.inc(); + return; + } if !self.serving() { return; } @@ -273,31 +303,44 @@ impl Peer { ) { // TODO: skip handling committed entries if a snapshot is being applied // asynchronously. - if self.is_leader() { + let mut update_lease = self.is_leader(); + if update_lease { for entry in committed_entries.iter().rev() { self.update_approximate_raft_log_size(|s| s + entry.get_data().len() as u64); - let propose_time = self - .proposals() - .find_propose_time(entry.get_term(), entry.get_index()); - if let Some(propose_time) = propose_time { - // We must renew current_time because this value may be created a long time ago. - // If we do not renew it, this time may be smaller than propose_time of a - // command, which was proposed in another thread while this thread receives its - // AppendEntriesResponse and is ready to calculate its commit-log-duration. 
- ctx.current_time.replace(monotonic_raw_now()); - ctx.raft_metrics.commit_log.observe(duration_to_sec( - (ctx.current_time.unwrap() - propose_time).to_std().unwrap(), - )); - self.maybe_renew_leader_lease(propose_time, &ctx.store_meta, None); - break; + if update_lease { + let propose_time = self + .proposals() + .find_propose_time(entry.get_term(), entry.get_index()); + if let Some(propose_time) = propose_time { + // We must renew current_time because this value may be created a long time + // ago. If we do not renew it, this time may be + // smaller than propose_time of a command, which was + // proposed in another thread while this thread receives its + // AppendEntriesResponse and is ready to calculate its commit-log-duration. + ctx.current_time.replace(monotonic_raw_now()); + ctx.raft_metrics.commit_log.observe(duration_to_sec( + (ctx.current_time.unwrap() - propose_time).to_std().unwrap(), + )); + self.maybe_renew_leader_lease(propose_time, &ctx.store_meta, None); + update_lease = false; + } } } } + let applying_index = committed_entries.last().unwrap().index; + let commit_to_current_term = committed_entries.last().unwrap().term == self.term(); + *self.last_applying_index_mut() = applying_index; if needs_evict_entry_cache(ctx.cfg.evict_cache_on_memory_ratio) { // Compact all cached entries instead of half evict. self.entry_storage_mut().evict_entry_cache(false); } self.schedule_apply_committed_entries(committed_entries); + if self.is_leader() + && commit_to_current_term + && !self.proposal_control().has_uncommitted_admin() + { + self.raft_group_mut().skip_bcast_commit(true); + } } /// Processing the ready of raft. 
A detail description of how it's handled @@ -525,6 +568,7 @@ impl Peer { self.region_heartbeat_pd(ctx); self.add_pending_tick(PeerTick::CompactLog); + self.add_pending_tick(PeerTick::SplitRegionCheck); } StateRole::Follower => { self.leader_lease_mut().expire(); diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 41dc0d39429..c040bdcbb3b 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -37,7 +37,7 @@ use raftstore::{ coprocessor::RegionChangeEvent, store::{ metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, GenSnapRes, ReadTask, TabletSnapKey, - TabletSnapManager, Transport, WriteTask, RAFT_INIT_LOG_INDEX, + TabletSnapManager, Transport, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }, }; use slog::{error, info, warn}; @@ -197,19 +197,24 @@ impl Peer { StateRole::Follower, ); let persisted_index = self.persisted_index(); - let first_index = self.storage().entry_storage().first_index(); - if first_index == persisted_index + 1 { + *self.last_applying_index_mut() = persisted_index; + let snapshot_index = self.entry_storage().truncated_index(); + assert!(snapshot_index >= RAFT_INIT_LOG_INDEX, "{:?}", self.logger); + // If leader sends a message append to the follower while it's applying + // snapshot (via split init for example), the persisted_index may be larger + // than the first index. But as long as first index is not larger, the + // latest snapshot should be applied. + if snapshot_index <= persisted_index { let region_id = self.region_id(); - self.reset_flush_state(); + self.reset_flush_state(snapshot_index); let flush_state = self.flush_state().clone(); - let mut tablet_ctx = TabletContext::new(self.region(), Some(persisted_index)); + let mut tablet_ctx = TabletContext::new(self.region(), Some(snapshot_index)); // Use a new FlushState to avoid conflicts with the old one. 
tablet_ctx.flush_state = Some(flush_state); ctx.tablet_registry.load(tablet_ctx, false).unwrap(); - self.record_tablet_as_tombstone_and_refresh(persisted_index, ctx); - self.schedule_apply_fsm(ctx); + self.record_tablet_as_tombstone_and_refresh(snapshot_index, ctx); self.storage_mut().on_applied_snapshot(); - self.raft_group_mut().advance_apply_to(persisted_index); + self.raft_group_mut().advance_apply_to(snapshot_index); { let mut meta = ctx.store_meta.lock().unwrap(); meta.set_region(self.region(), true, &self.logger); @@ -218,18 +223,18 @@ impl Peer { meta.region_read_progress .insert(region_id, self.read_progress().clone()); } - self.read_progress_mut() - .update_applied_core(persisted_index); + self.read_progress_mut().update_applied_core(snapshot_index); let split = self.storage_mut().split_init_mut().take(); if split.as_ref().map_or(true, |s| { - !s.scheduled || persisted_index != RAFT_INIT_LOG_INDEX + !s.scheduled || snapshot_index != RAFT_INIT_LOG_INDEX }) { info!(self.logger, "apply tablet snapshot completely"); } if let Some(init) = split { - info!(self.logger, "init with snapshot finished"); + info!(self.logger, "init split with snapshot finished"); self.post_split_init(ctx, init); } + self.schedule_apply_fsm(ctx); } } } @@ -343,6 +348,15 @@ impl Storage { /// Validate the snapshot. Returns true if it's valid. fn validate_snap(&self, snap: &Snapshot, request_index: u64) -> bool { let idx = snap.get_metadata().get_index(); + if idx < RAFT_INIT_LOG_INDEX || snap.get_metadata().get_term() < RAFT_INIT_LOG_TERM { + info!( + self.logger(), + "corrupted snapshot detected, generate again"; + "snap" => ?snap, + "request_index" => request_index, + ); + return false; + } // TODO(nolouch): check tuncated index if idx < request_index { // stale snapshot, should generate again. 
@@ -489,8 +503,21 @@ impl Storage { )); } + let old_last_index = self.entry_storage().last_index(); + if self.entry_storage().first_index() <= old_last_index { + // All states are rewritten in the following blocks. Stale states will be + // cleaned up by compact worker. + task.cut_logs = Some((0, old_last_index + 1)); + self.entry_storage_mut().clear(); + } + let last_index = snap.get_metadata().get_index(); let last_term = snap.get_metadata().get_term(); + assert!( + last_index >= RAFT_INIT_LOG_INDEX && last_term >= RAFT_INIT_LOG_TERM, + "{:?}", + self.logger().list() + ); let region_state = self.region_state_mut(); region_state.set_state(PeerState::Normal); region_state.set_region(region); diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 666f3adb699..6818d7ae0d9 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -34,6 +34,9 @@ pub struct Apply { /// command. tombstone: bool, applied_term: u64, + // Apply progress is set after every command in case there is a flush. But it's + // wrong to update flush_state immediately as a manual flush from other thread + // can fetch the wrong apply index from flush_state. applied_index: u64, /// The largest index that have modified each column family. 
modifications: DataTrace, @@ -64,11 +67,15 @@ impl Apply { read_scheduler: Scheduler>, flush_state: Arc, log_recovery: Option>, + applied_term: u64, logger: Logger, ) -> Self { let mut remote_tablet = tablet_registry .get(region_state.get_region().get_id()) .unwrap(); + assert_ne!(applied_term, 0, "{:?}", logger.list()); + let applied_index = flush_state.applied_index(); + assert_ne!(applied_index, 0, "{:?}", logger.list()); Apply { peer, tablet: remote_tablet.latest().unwrap().clone(), @@ -76,7 +83,7 @@ impl Apply { write_batch: None, callbacks: vec![], tombstone: false, - applied_term: 0, + applied_term, applied_index: flush_state.applied_index(), modifications: [0; DATA_CFS_LEN], admin_cmd_result: vec![], @@ -125,9 +132,6 @@ impl Apply { let log_recovery = self.log_recovery.as_ref().unwrap(); if log_recovery.iter().all(|v| index >= *v) { self.log_recovery.take(); - // Now all logs are recovered, flush them to avoid recover again - // and again. - let _ = self.tablet.flush_cfs(&[], false); } } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 668b0ebf41d..f3734b6821d 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -62,6 +62,7 @@ pub struct Peer { /// For raft log compaction. skip_compact_log_ticks: usize, approximate_raft_log_size: u64, + last_applying_index: u64, /// Encoder for batching proposals and encoding them in a more efficient way /// than protobuf. @@ -73,6 +74,7 @@ pub struct Peer { has_ready: bool, /// Sometimes there is no ready at all, but we need to trigger async write. has_extra_write: bool, + pause_for_recovery: bool, /// Writer for persisting side effects asynchronously. 
pub(crate) async_writer: AsyncWriter, @@ -133,7 +135,7 @@ impl Peer { let raft_group = RawNode::new(&raft_cfg, storage, &logger)?; let region = raft_group.store().region_state().get_region().clone(); - let flush_state: Arc = Arc::default(); + let flush_state: Arc = Arc::new(FlushState::new(applied_index)); // We can't create tablet if tablet index is 0. It can introduce race when gc // old tablet and create new peer. We also can't get the correct range of the // region, which is required for kv data gc. @@ -155,12 +157,14 @@ impl Peer { peer_heartbeats: HashMap::default(), skip_compact_log_ticks: 0, approximate_raft_log_size: 0, + last_applying_index: raft_group.store().apply_state().get_applied_index(), raw_write_encoder: None, proposals: ProposalQueue::new(region_id, raft_group.raft.id), async_writer: AsyncWriter::new(region_id, peer_id), apply_scheduler: None, has_ready: false, has_extra_write: false, + pause_for_recovery: false, destroy_progress: DestroyProgress::None, raft_group, logger, @@ -366,14 +370,17 @@ impl Peer { /// Returns if there's any tombstone being removed. 
#[inline] pub fn remove_tombstone_tablets_before(&mut self, persisted: u64) -> bool { - let mut removed = 0; - while let Some(i) = self.pending_tombstone_tablets.first() - && *i <= persisted - { - removed += 1; + let removed = self + .pending_tombstone_tablets + .iter() + .take_while(|i| **i <= persisted) + .count(); + if removed > 0 { + self.pending_tombstone_tablets.drain(..removed); + true + } else { + false } - self.pending_tombstone_tablets.drain(..removed); - removed > 0 } #[inline] @@ -431,6 +438,16 @@ impl Peer { mem::take(&mut self.has_extra_write) } + #[inline] + pub fn set_pause_for_recovery(&mut self, pause: bool) { + self.pause_for_recovery = pause; + } + + #[inline] + pub fn pause_for_recovery(&self) -> bool { + self.pause_for_recovery + } + #[inline] pub fn insert_peer_cache(&mut self, peer: metapb::Peer) { for p in self.raft_group.store().region().get_peers() { @@ -551,6 +568,10 @@ impl Peer { self.approximate_raft_log_size = f(self.approximate_raft_log_size); } + pub fn last_applying_index_mut(&mut self) -> &mut u64 { + &mut self.last_applying_index + } + #[inline] pub fn state_role(&self) -> StateRole { self.raft_group.raft.state @@ -654,8 +675,7 @@ impl Peer { /// See the comments of `check_snap_status` for more details. #[inline] pub fn is_handling_snapshot(&self) -> bool { - // todo: This method may be unnecessary now? - false + self.persisted_index() < self.entry_storage().truncated_index() } /// Returns `true` if the raft group has replicated a snapshot but not @@ -774,8 +794,8 @@ impl Peer { &self.flush_state } - pub fn reset_flush_state(&mut self) { - self.flush_state = Arc::default(); + pub fn reset_flush_state(&mut self, index: u64) { + self.flush_state = Arc::new(FlushState::new(index)); } // Note: Call `set_has_extra_write` after adding new state changes. 
diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 636970c0ad1..51bd41ba253 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -298,7 +298,9 @@ mod tests { ctor::{CfOptions, DbOptions}, kv::TestTabletFactory, }; - use engine_traits::{RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, DATA_CFS}; + use engine_traits::{ + FlushState, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, DATA_CFS, + }; use kvproto::{ metapb::{Peer, Region}, raft_serverpb::PeerState, @@ -379,25 +381,25 @@ mod tests { .unwrap() .unwrap(); - let snapshot = new_empty_snapshot(region.clone(), 10, 1, false); + let snapshot = new_empty_snapshot(region.clone(), 10, 9, false); let mut task = WriteTask::new(region.get_id(), 5, 0); s.apply_snapshot(&snapshot, &mut task, mgr, reg).unwrap(); // It can be set before load tablet. assert_eq!(PeerState::Normal, s.region_state().get_state()); assert_eq!(10, s.entry_storage().truncated_index()); - assert_eq!(1, s.entry_storage().truncated_term()); - assert_eq!(1, s.entry_storage().last_term()); + assert_eq!(9, s.entry_storage().truncated_term()); + assert_eq!(9, s.entry_storage().last_term()); assert_eq!(10, s.entry_storage().raft_state().last_index); // This index can't be set before load tablet. 
assert_ne!(10, s.entry_storage().applied_index()); - assert_ne!(1, s.entry_storage().applied_term()); + assert_ne!(9, s.entry_storage().applied_term()); assert_eq!(10, s.region_state().get_tablet_index()); assert!(!task.persisted_cbs.is_empty()); s.on_applied_snapshot(); assert_eq!(10, s.entry_storage().applied_index()); - assert_eq!(1, s.entry_storage().applied_term()); + assert_eq!(9, s.entry_storage().applied_term()); assert_eq!(10, s.region_state().get_tablet_index()); } @@ -440,8 +442,9 @@ mod tests { router, reg, sched, - Arc::default(), + Arc::new(FlushState::new(5)), None, + 5, logger, ); @@ -460,8 +463,8 @@ mod tests { SnapState::Generated(ref snap) => *snap.clone(), ref s => panic!("unexpected state: {:?}", s), }; - assert_eq!(snap.get_metadata().get_index(), 0); - assert_eq!(snap.get_metadata().get_term(), 0); + assert_eq!(snap.get_metadata().get_index(), 5); + assert_eq!(snap.get_metadata().get_term(), 5); assert_eq!(snap.get_data().is_empty(), false); let snap_key = TabletSnapKey::from_region_snap(4, 7, &snap); let checkpointer_path = mgr.tablet_gen_path(&snap_key); diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index 05e1baea1cf..092e7e21b5f 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -10,6 +10,7 @@ pub enum ApplyTask { Snapshot(GenSnapTask), /// Writes that doesn't care consistency. UnsafeWrite(Box<[u8]>), + ManualFlush, } #[derive(Debug, Default)] diff --git a/components/raftstore-v2/tests/integrations/test_transfer_leader.rs b/components/raftstore-v2/tests/integrations/test_transfer_leader.rs index d031d6b1eba..18d81ef16aa 100644 --- a/components/raftstore-v2/tests/integrations/test_transfer_leader.rs +++ b/components/raftstore-v2/tests/integrations/test_transfer_leader.rs @@ -1,6 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::time::Duration; +use std::{assert_matches::assert_matches, time::Duration}; use engine_traits::{Peekable, CF_DEFAULT}; use futures::executor::block_on; @@ -9,35 +9,32 @@ use kvproto::{ raft_cmdpb::{AdminCmdType, TransferLeaderRequest}, }; use raft::prelude::ConfChangeType; -use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; +use raftstore_v2::{ + router::{PeerMsg, PeerTick}, + SimpleWriteEncoder, +}; use tikv_util::store::new_peer; use crate::cluster::Cluster; fn put_data( region_id: u64, - cluster: &Cluster, + cluster: &mut Cluster, node_off: usize, node_off_for_verify: usize, key: &[u8], ) { - let router = &cluster.routers[node_off]; + let mut router = &mut cluster.routers[node_off]; router.wait_applied_to_current_term(region_id, Duration::from_secs(3)); // router.wait_applied_to_current_term(2, Duration::from_secs(3)); - let tablet_registry = cluster.node(node_off).tablet_registry(); - let tablet = tablet_registry - .get(region_id) - .unwrap() - .latest() - .unwrap() - .clone(); - assert!(tablet.get_value(key).unwrap().is_none()); + let snap = router.stale_snapshot(region_id); + assert_matches!(snap.get_value(key), Ok(None)); let header = Box::new(router.new_request_for(region_id).take_header()); let mut put = SimpleWriteEncoder::with_capacity(64); - put.put(CF_DEFAULT, &key[1..], b"value"); + put.put(CF_DEFAULT, key, b"value"); let (msg, mut sub) = PeerMsg::simple_write(header, put.encode()); router.send(region_id, msg).unwrap(); std::thread::sleep(std::time::Duration::from_millis(10)); @@ -53,17 +50,29 @@ fn put_data( let resp = block_on(sub.result()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); - assert_eq!(tablet.get_value(key).unwrap().unwrap(), b"value"); - - // Verify the data is ready in the other node - let tablet_registry = cluster.node(node_off_for_verify).tablet_registry(); - let tablet = tablet_registry - .get(region_id) - .unwrap() - .latest() - .unwrap() - .clone(); - 
assert_eq!(tablet.get_value(key).unwrap().unwrap(), b"value"); + router = &mut cluster.routers[node_off]; + let snap = router.stale_snapshot(region_id); + assert_eq!(snap.get_value(key).unwrap().unwrap(), b"value"); + + // Because of skip bcast commit, the data should not be applied yet. + router = &mut cluster.routers[node_off_for_verify]; + let snap = router.stale_snapshot(region_id); + assert_matches!(snap.get_value(key), Ok(None)); + // Trigger heartbeat explicitly to commit on follower. + router = &mut cluster.routers[node_off]; + for _ in 0..2 { + router + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + router + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + } + cluster.dispatch(region_id, vec![]); + std::thread::sleep(std::time::Duration::from_millis(100)); + router = &mut cluster.routers[node_off_for_verify]; + let snap = router.stale_snapshot(region_id); + assert_eq!(snap.get_value(key).unwrap().unwrap(), b"value"); } pub fn must_transfer_leader( @@ -97,7 +106,7 @@ pub fn must_transfer_leader( #[test] fn test_transfer_leader() { - let cluster = Cluster::with_node_count(3, None); + let mut cluster = Cluster::with_node_count(3, None); let region_id = 2; let router0 = &cluster.routers[0]; @@ -137,13 +146,13 @@ fn test_transfer_leader() { cluster.dispatch(region_id, vec![]); // Ensure follower has latest entries before transfer leader. - put_data(region_id, &cluster, 0, 1, b"zkey1"); + put_data(region_id, &mut cluster, 0, 1, b"key1"); // Perform transfer leader must_transfer_leader(&cluster, region_id, 0, 1, peer1); // Before transfer back to peer0, put some data again. 
- put_data(region_id, &cluster, 1, 0, b"zkey2"); + put_data(region_id, &mut cluster, 1, 0, b"key2"); // Perform transfer leader let store_id = cluster.node(0).id(); diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index b0f44c30c0f..ce4f099610e 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -177,6 +177,7 @@ make_static_metric! { region_nonexistent, applying_snap, disk_full, + recovery, } pub label_enum ProposalType { diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 05decd62815..939bc2a1078 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1998,7 +1998,12 @@ impl TabletSnapManager { { continue; } - for e in file_system::read_dir(path)? { + let entries = match file_system::read_dir(path) { + Ok(entries) => entries, + Err(e) if e.kind() == ErrorKind::NotFound => continue, + Err(e) => return Err(Error::from(e)), + }; + for e in entries { match e.and_then(|e| e.metadata()) { Ok(m) => total_size += m.len(), Err(e) if e.kind() == ErrorKind::NotFound => continue, diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 5beddf60151..4d4e283ea7e 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -61,6 +61,7 @@ use raftstore::{ }, RegionInfoAccessor, }; +use raftstore_v2::{router::RaftRouter, StateStorage}; use security::SecurityManager; use tikv::{ config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, @@ -136,8 +137,7 @@ fn run_impl(config: TikvConfig) { tikv.init_encryption(); let fetcher = tikv.init_io_utility(); let listener = tikv.init_flow_receiver(); - let (raft_engine, engines_info) = tikv.init_raw_engines(listener); - tikv.init_engines(raft_engine); + let engines_info = tikv.init_engines(listener); let server_config = tikv.init_servers::(); tikv.register_services(); 
tikv.init_metrics_flusher(fetcher, engines_info); @@ -201,6 +201,7 @@ struct TikvServer { pd_client: Arc, flow_info_sender: Option>, flow_info_receiver: Option>, + router: Option>, node: Option>, resolver: Option, store_path: PathBuf, @@ -310,6 +311,7 @@ where cfg_controller: Some(cfg_controller), security_mgr, pd_client, + router: None, node: None, resolver: None, store_path, @@ -567,36 +569,6 @@ where engine_rocks::FlowListener::new(tx) } - fn init_engines(&mut self, raft_engine: ER) { - let tablet_registry = self.tablet_registry.clone().unwrap(); - let mut node = NodeV2::new( - &self.config.server, - self.pd_client.clone(), - None, - tablet_registry, - ); - node.try_bootstrap_store(&self.config.raft_store, &raft_engine) - .unwrap_or_else(|e| fatal!("failed to bootstrap store: {:?}", e)); - assert_ne!(node.id(), 0); - - let router = node.router(); - let mut coprocessor_host: CoprocessorHost = CoprocessorHost::new( - router.store_router().clone(), - self.config.coprocessor.clone(), - ); - let region_info_accessor = RegionInfoAccessor::new(&mut coprocessor_host); - - let engine = RaftKv2::new(router.clone(), region_info_accessor.region_leaders()); - - self.engines = Some(TikvEngines { - raft_engine, - engine, - }); - self.node = Some(node); - self.coprocessor_host = Some(coprocessor_host); - self.region_info_accessor = Some(region_info_accessor); - } - fn init_gc_worker(&mut self) -> GcWorker> { let engines = self.engines.as_ref().unwrap(); let gc_worker = GcWorker::new( @@ -774,7 +746,7 @@ where }; let check_leader_runner = CheckLeaderRunner::new( - self.node.as_ref().unwrap().router().store_meta().clone(), + self.router.as_ref().unwrap().store_meta().clone(), self.coprocessor_host.clone().unwrap(), ); let check_leader_scheduler = self @@ -855,6 +827,8 @@ where .unwrap() .start( engines.raft_engine.clone(), + self.tablet_registry.clone().unwrap(), + self.router.as_ref().unwrap(), server.transport(), snap_mgr, self.concurrency_manager.clone(), @@ -1392,10 +1366,10 
@@ impl ConfiguredRaftEngine for RaftLogEngine { } impl TikvServer { - fn init_raw_engines( + fn init_engines( &mut self, flow_listener: engine_rocks::FlowListener, - ) -> (CER, Arc) { + ) -> Arc { let block_cache = self.config.storage.block_cache.build_shared_cache(); let env = self .config @@ -1415,6 +1389,19 @@ impl TikvServer { let builder = KvEngineFactoryBuilder::new(env, &self.config, block_cache) .sst_recovery_sender(self.init_sst_recovery_sender()) .flow_listener(flow_listener); + + let mut node = NodeV2::new(&self.config.server, self.pd_client.clone(), None); + node.try_bootstrap_store(&self.config.raft_store, &raft_engine) + .unwrap_or_else(|e| fatal!("failed to bootstrap store: {:?}", e)); + assert_ne!(node.id(), 0); + + let router = node.router().clone(); + + // Create kv engine. + let builder = builder.state_storage(Arc::new(StateStorage::new( + raft_engine.clone(), + router.clone(), + ))); let factory = Box::new(builder.build()); self.kv_statistics = Some(factory.rocks_statistics()); let registry = TabletRegistry::new(factory, self.store_path.join("tablets")) @@ -1428,12 +1415,30 @@ impl TikvServer { raft_engine.register_config(cfg_controller); let engines_info = Arc::new(EnginesResourceInfo::new( - registry, + registry.clone(), raft_engine.as_rocks_engine().cloned(), 180, // max_samples_to_preserve )); - (raft_engine, engines_info) + let router = RaftRouter::new(node.id(), registry, router); + let mut coprocessor_host: CoprocessorHost = CoprocessorHost::new( + router.store_router().clone(), + self.config.coprocessor.clone(), + ); + let region_info_accessor = RegionInfoAccessor::new(&mut coprocessor_host); + + let engine = RaftKv2::new(router.clone(), region_info_accessor.region_leaders()); + + self.engines = Some(TikvEngines { + raft_engine, + engine, + }); + self.router = Some(router); + self.node = Some(node); + self.coprocessor_host = Some(coprocessor_host); + self.region_info_accessor = Some(region_info_accessor); + + engines_info } } diff --git 
a/src/config/mod.rs b/src/config/mod.rs index 6ed8da3f111..c78ec02182f 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -3122,6 +3122,9 @@ impl TikvConfig { if self.storage.engine == EngineType::RaftKv2 { self.raft_store.store_io_pool_size = cmp::max(self.raft_store.store_io_pool_size, 1); + if !self.raft_engine.enable { + panic!("raft-kv2 only supports raft log engine."); + } } self.raft_store.raftdb_path = self.infer_raft_db_path(None)?; diff --git a/src/server/raftkv2/node.rs b/src/server/raftkv2/node.rs index ed6f16e8bec..b876951894c 100644 --- a/src/server/raftkv2/node.rs +++ b/src/server/raftkv2/node.rs @@ -11,7 +11,7 @@ use raftstore::{ coprocessor::CoprocessorHost, store::{GlobalReplicationState, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX}, }; -use raftstore_v2::{router::RaftRouter, Bootstrap, PdTask, StoreSystem}; +use raftstore_v2::{router::RaftRouter, Bootstrap, PdTask, StoreRouter, StoreSystem}; use slog::{info, o, Logger}; use tikv_util::{ config::VersionTrack, @@ -24,11 +24,10 @@ use crate::server::{node::init_store, Result}; pub struct NodeV2 { cluster_id: u64, store: metapb::Store, - system: Option<(RaftRouter, StoreSystem)>, + system: Option<(StoreRouter, StoreSystem)>, has_started: bool, pd_client: Arc, - registry: TabletRegistry, logger: Logger, } @@ -43,7 +42,6 @@ where cfg: &crate::server::Config, pd_client: Arc, store: Option, - registry: TabletRegistry, ) -> NodeV2 { let store = init_store(store, cfg); @@ -53,7 +51,6 @@ where pd_client, system: None, has_started: false, - registry, logger: slog_global::borrow_global().new(o!()), } } @@ -71,16 +68,14 @@ where ) .bootstrap_store()?; self.store.set_id(store_id); + let (router, system) = raftstore_v2::create_store_batch_system(cfg, store_id, self.logger.clone()); - self.system = Some(( - RaftRouter::new(store_id, self.registry.clone(), router), - system, - )); + self.system = Some((router, system)); Ok(()) } - pub fn router(&self) -> &RaftRouter { + pub fn router(&self) -> 
&StoreRouter { &self.system.as_ref().unwrap().0 } @@ -90,6 +85,8 @@ where pub fn start( &mut self, raft_engine: ER, + registry: TabletRegistry, + router: &RaftRouter, trans: T, snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, @@ -112,15 +109,10 @@ where ) .bootstrap_first_region(&self.store, store_id)? { - let path = self - .registry - .tablet_path(region.get_id(), RAFT_INIT_LOG_INDEX); + let path = registry.tablet_path(region.get_id(), RAFT_INIT_LOG_INDEX); let ctx = TabletContext::new(®ion, Some(RAFT_INIT_LOG_INDEX)); // TODO: make follow line can recover from abort. - self.registry - .tablet_factory() - .open_tablet(ctx, &path) - .unwrap(); + registry.tablet_factory().open_tablet(ctx, &path).unwrap(); } // Put store only if the cluster is bootstrapped. @@ -130,6 +122,8 @@ where self.start_store( raft_engine, + registry, + router, trans, snap_mgr, concurrency_manager, @@ -187,6 +181,8 @@ where fn start_store( &mut self, raft_engine: ER, + registry: TabletRegistry, + router: &RaftRouter, trans: T, snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, @@ -207,13 +203,13 @@ where } self.has_started = true; - let (router, system) = self.system.as_mut().unwrap(); + let system = &mut self.system.as_mut().unwrap().1; system.start( store_id, store_cfg, raft_engine, - self.registry.clone(), + registry, trans, self.pd_client.clone(), router.store_router(), From 64293cb434c42c30fc37daeaaeae5c963aea26ea Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Fri, 30 Dec 2022 17:02:17 +0800 Subject: [PATCH 057/115] add commit/apply duration for raft store (#13946) ref tikv/tikv#12842 Signed-off-by: bufferflies <1045931706@qq.com> --- components/raftstore-v2/src/batch/store.rs | 1 + components/raftstore-v2/src/operation/command/mod.rs | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 800dbc98f91..72f05801a0e 
100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -153,6 +153,7 @@ impl StorePoller { fn flush_events(&mut self) { self.schedule_ticks(); + self.poll_ctx.raft_metrics.maybe_flush(); } fn schedule_ticks(&mut self) { diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index a533ae9af87..4831c4abf9f 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -33,13 +33,17 @@ use raftstore::{ Proposal, }, local_metrics::RaftMetrics, + metrics::APPLY_TASK_WAIT_TIME_HISTOGRAM, msg::ErrorCallback, util, WriteCallback, }, Error, Result, }; use slog::{info, warn}; -use tikv_util::{box_err, time::monotonic_raw_now}; +use tikv_util::{ + box_err, + time::{duration_to_sec, monotonic_raw_now, Instant}, +}; use crate::{ batch::StoreContext, @@ -81,6 +85,7 @@ pub struct CommittedEntries { /// Entries need to be applied. Note some entries may not be included for /// flow control. entry_and_proposals: Vec<(Entry, Vec)>, + committed_time: Instant, } fn new_response(header: &RaftRequestHeader) -> RaftCmdResponse { @@ -246,6 +251,7 @@ impl Peer { // memtables in kv engine is flushed. 
let apply = CommittedEntries { entry_and_proposals, + committed_time: Instant::now(), }; assert!( self.apply_scheduler().is_some(), @@ -375,6 +381,8 @@ impl Apply { #[inline] pub async fn apply_committed_entries(&mut self, ce: CommittedEntries) { fail::fail_point!("APPLY_COMMITTED_ENTRIES"); + APPLY_TASK_WAIT_TIME_HISTOGRAM + .observe(duration_to_sec(ce.committed_time.saturating_elapsed())); for (e, ch) in ce.entry_and_proposals { if self.tombstone() { apply::notify_req_region_removed(self.region_state().get_region().get_id(), ch); From a6afe78c43e293addd18251dee209d630322dd9e Mon Sep 17 00:00:00 2001 From: hongyunyan <649330952@qq.com> Date: Tue, 3 Jan 2023 11:02:19 +0800 Subject: [PATCH 058/115] extend evict_entry_cache for restart (#13998) close tikv/tikv#13997 Support to use evict_entry_cache when restart node. Signed-off-by: tabokie Signed-off-by: hongyunyan <649330952@qq.com> Signed-off-by: Xinye Tao Signed-off-by: Jay Lee Signed-off-by: Wenbo Zhang Signed-off-by: Zwb Co-authored-by: Xinye Tao Co-authored-by: Jay Co-authored-by: Zwb Co-authored-by: Ti Chi Robot --- components/raftstore/src/store/entry_storage.rs | 4 ++++ components/raftstore/src/store/peer.rs | 3 +++ 2 files changed, 7 insertions(+) diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index c6278c890f7..4d6372dd582 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -1227,6 +1227,10 @@ impl EntryStorage { let idx = cache.cache[drain_to].index; let mem_size_change = cache.compact_to(idx + 1); RAFT_ENTRIES_EVICT_BYTES.inc_by(mem_size_change); + } else if !half { + let cache = &mut self.cache; + let mem_size_change = cache.compact_to(u64::MAX); + RAFT_ENTRIES_EVICT_BYTES.inc_by(mem_size_change); } } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 7752a0a1b0e..9384a4940c7 100644 --- a/components/raftstore/src/store/peer.rs 
+++ b/components/raftstore/src/store/peer.rs @@ -1188,6 +1188,9 @@ where peer.raft_group.campaign()?; } + let persisted_index = peer.raft_group.raft.raft_log.persisted; + peer.mut_store().update_cache_persisted(persisted_index); + Ok(peer) } From 5de5fd24da76d35060fab0ac6e85b903a7b32af2 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 3 Jan 2023 15:36:19 +0800 Subject: [PATCH 059/115] raft-engine: remove confusing API cut logs (#14010) ref tikv/tikv#12842 The API is supposed to be used with `append` but nowhere can we find the clue. This PR merges `cut_logs` and `append` to reduce confusion and mistakes. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/engine_panic/src/raft_engine.rs | 11 ++++---- components/engine_rocks/src/raft_engine.rs | 21 +++++++++------ components/engine_traits/src/raft_engine.rs | 20 +++++++++----- components/raft_log_engine/src/engine.rs | 13 ++++----- components/raftstore-v2/src/operation/life.rs | 16 +++-------- .../src/operation/ready/snapshot.rs | 1 - .../raftstore/src/store/async_io/write.rs | 27 +++++++++++++------ .../src/store/async_io/write_tests.rs | 20 +++++--------- .../raftstore/src/store/entry_storage.rs | 3 +-- .../raftstore/src/store/peer_storage.rs | 12 ++++----- components/raftstore/src/store/snap.rs | 2 +- .../raftstore/src/store/worker/raftlog_gc.rs | 2 +- components/server/src/raft_engine_switch.rs | 8 +++--- tests/integrations/server/kv_service.rs | 2 +- 14 files changed, 83 insertions(+), 75 deletions(-) diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index c3de53b4932..854b75fe30d 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -167,11 +167,12 @@ impl RaftEngine for PanicEngine { } impl RaftLogBatch for PanicWriteBatch { - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { - panic!() - } - - fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64) { + fn append( + 
&mut self, + raft_group_id: u64, + overwrite_to: Option, + entries: Vec, + ) -> Result<()> { panic!() } diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index d5331a2ce29..d566ac3821b 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -361,7 +361,19 @@ impl RaftEngine for RocksEngine { } impl RaftLogBatch for RocksWriteBatchVec { - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { + fn append( + &mut self, + raft_group_id: u64, + overwrite_to: Option, + entries: Vec, + ) -> Result<()> { + let overwrite_to = overwrite_to.unwrap_or(0); + if let Some(last) = entries.last() && last.get_index() + 1 < overwrite_to { + for index in last.get_index() + 1..overwrite_to { + let key = keys::raft_log_key(raft_group_id, index); + self.delete(&key).unwrap(); + } + } if let Some(max_size) = entries.iter().map(|e| e.compute_size()).max() { let ser_buf = Vec::with_capacity(max_size as usize); return self.append_impl(raft_group_id, &entries, ser_buf); @@ -369,13 +381,6 @@ impl RaftLogBatch for RocksWriteBatchVec { Ok(()) } - fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64) { - for index in from..to { - let key = keys::raft_log_key(raft_group_id, index); - self.delete(&key).unwrap(); - } - } - fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { self.put_msg(&keys::raft_state_key(raft_group_id), state) } diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index 9e95ae95e14..68036eae1eb 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -66,7 +66,7 @@ pub trait RaftEngineDebug: RaftEngine + Sync + Send + 'static { Ok(true) }) .unwrap(); - batch.append(region_id, entries).unwrap(); + batch.append(region_id, None, entries).unwrap(); if let Some(state) = self.get_raft_state(region_id).unwrap() { 
batch.put_raft_state(region_id, &state).unwrap(); } @@ -150,11 +150,19 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send } pub trait RaftLogBatch: Send { - /// Note: `RaftLocalState` won't be updated in this call. - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()>; - - /// Remove Raft logs in [`from`, `to`) which will be overwritten later. - fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64); + /// Append continuous entries to the batch. + /// + /// All existing entries with same index will be overwritten. If + /// `overwrite_to` is set to a larger value, then entries in + /// `[entries.last().get_index(), overwrite_to)` will be deleted. + /// Nothing will be deleted if entries is empty. Note: `RaftLocalState` + /// won't be updated in this call. + fn append( + &mut self, + raft_group_id: u64, + overwrite_to: Option, + entries: Vec, + ) -> Result<()>; fn put_store_ident(&mut self, ident: &StoreIdent) -> Result<()>; diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 7c98adf325f..1ae148ba41c 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -385,17 +385,18 @@ const FLUSH_STATE_KEY: &[u8] = &[0x06]; const KEY_PREFIX_LEN: usize = RAFT_LOG_STATE_KEY.len(); impl RaftLogBatchTrait for RaftLogBatch { - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { + fn append( + &mut self, + raft_group_id: u64, + _overwrite_to: Option, + entries: Vec, + ) -> Result<()> { + // overwrite is handled within raft log engine. self.0 .add_entries::(raft_group_id, &entries) .map_err(transfer_error) } - fn cut_logs(&mut self, _: u64, _: u64, _: u64) { - // It's unnecessary because overlapped entries can be handled in - // `append`. 
- } - fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { self.0 .put_message(raft_group_id, RAFT_LOG_STATE_KEY.to_vec(), state) diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 0f2e72061ef..954c6992cf9 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -10,8 +10,6 @@ //! sending a message to store fsm first, and then using split to initialized //! the peer. -use std::cmp; - use batch_system::BasicMailbox; use crossbeam::channel::{SendError, TrySendError}; use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; @@ -296,32 +294,24 @@ impl Peer { /// After destroy is finished, `finish_destroy` should be called to clean up /// memory states. pub fn start_destroy(&mut self, write_task: &mut WriteTask) { - let entry_storage = self.storage().entry_storage(); if self.postponed_destroy() { return; } - let first_index = entry_storage.first_index(); - let last_index = entry_storage.last_index(); - if first_index <= last_index { - write_task.cut_logs = match write_task.cut_logs { - None => Some((first_index, last_index)), - Some((f, l)) => Some((cmp::min(first_index, f), cmp::max(last_index, l))), - }; - } let raft_engine = self.entry_storage().raft_engine(); let mut region_state = self.storage().region_state().clone(); let region_id = region_state.get_region().get_id(); + // Use extra write to ensure these writes are the last writes to raft engine. let lb = write_task .extra_write .ensure_v2(|| raft_engine.log_batch(2)); - // We only use raft-log-engine for v2, first index is not important. + // We only use raft-log-engine for v2, first index and state are not important. let raft_state = self.entry_storage().raft_state(); raft_engine.clean(region_id, 0, raft_state, lb).unwrap(); - // Write worker will do the clean up when meeting tombstone state. 
region_state.set_state(PeerState::Tombstone); let applied_index = self.entry_storage().applied_index(); lb.put_region_state(region_id, applied_index, ®ion_state) .unwrap(); + self.set_has_extra_write(); self.destroy_progress_mut().start(); } diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index c040bdcbb3b..76a5b4297b3 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -507,7 +507,6 @@ impl Storage { if self.entry_storage().first_index() <= old_last_index { // All states are rewritten in the following blocks. Stale states will be // cleaned up by compact worker. - task.cut_logs = Some((0, old_last_index + 1)); self.entry_storage_mut().clear(); } diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index b4cceb96a82..56d0f93a11d 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -186,8 +186,8 @@ where pub raft_wb: Option, // called after writing to kvdb and raftdb. pub persisted_cbs: Vec>, - pub entries: Vec, - pub cut_logs: Option<(u64, u64)>, + overwrite_to: Option, + entries: Vec, pub raft_state: Option, pub extra_write: ExtraWrite, pub messages: Vec, @@ -207,8 +207,8 @@ where ready_number, send_time: Instant::now(), raft_wb: None, + overwrite_to: None, entries: vec![], - cut_logs: None, raft_state: None, extra_write: ExtraWrite::None, messages: vec![], @@ -221,11 +221,21 @@ where pub fn has_data(&self) -> bool { !(self.raft_state.is_none() && self.entries.is_empty() - && self.cut_logs.is_none() && self.extra_write.is_empty() && self.raft_wb.as_ref().map_or(true, |wb| wb.is_empty())) } + /// Append continous entries. + /// + /// All existing entries with same index will be overwritten. 
If + /// `overwrite_to` is set to a larger value, then entries in + /// `[entries.last().get_index(), overwrite_to)` will be deleted. If + /// entries is empty, nothing will be deleted. + pub fn set_append(&mut self, overwrite_to: Option, entries: Vec) { + self.entries = entries; + self.overwrite_to = overwrite_to; + } + #[inline] pub fn ready_number(&self) -> u64 { self.ready_number @@ -387,11 +397,12 @@ where raft_wb.merge(wb).unwrap(); } raft_wb - .append(task.region_id, std::mem::take(&mut task.entries)) + .append( + task.region_id, + task.overwrite_to, + std::mem::take(&mut task.entries), + ) .unwrap(); - if let Some((from, to)) = task.cut_logs { - raft_wb.cut_logs(task.region_id, from, to); - } if let Some(raft_state) = task.raft_state.take() && self.raft_states.insert(task.region_id, raft_state).is_none() { diff --git a/components/raftstore/src/store/async_io/write_tests.rs b/components/raftstore/src/store/async_io/write_tests.rs index 6007b39489e..d1861a8903c 100644 --- a/components/raftstore/src/store/async_io/write_tests.rs +++ b/components/raftstore/src/store/async_io/write_tests.rs @@ -167,7 +167,9 @@ fn delete_kv(wb: Option<&mut TestKvWriteBatch>, key: &[u8]) { /// Simulate kv puts on raft engine. 
fn put_raft_kv(wb: Option<&mut TestRaftLogBatch>, key: u64) { - wb.unwrap().append(key, vec![new_entry(key, key)]).unwrap(); + wb.unwrap() + .append(key, None, vec![new_entry(key, key)]) + .unwrap(); } fn delete_raft_kv(engine: &RaftTestEngine, wb: Option<&mut TestRaftLogBatch>, key: u64) { @@ -294,10 +296,7 @@ fn test_worker() { put_kv(task_3.extra_write.v1_mut(), b"kv_k3", b"kv_v3"); put_raft_kv(task_3.raft_wb.as_mut(), 37); delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); - task_3 - .entries - .append(&mut vec![new_entry(6, 6), new_entry(7, 7)]); - task_3.cut_logs = Some((8, 9)); + task_3.set_append(Some(9), vec![new_entry(6, 6), new_entry(7, 7)]); task_3.raft_state = Some(new_raft_state(7, 124, 6, 7)); task_3 .messages @@ -392,10 +391,7 @@ fn test_worker_split_raft_wb() { lb.put_apply_state(region_1, 25, &apply_state_3).unwrap(); put_raft_kv(task_3.raft_wb.as_mut(), raft_key_3); delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), raft_key_1); - task_3 - .entries - .append(&mut vec![new_entry(6, 6), new_entry(7, 7)]); - task_3.cut_logs = Some((8, 9)); + task_3.set_append(Some(9), vec![new_entry(6, 6), new_entry(7, 7)]); task_3.raft_state = Some(new_raft_state(7, 124, 6, 7)); if split.1 { expected_wbs += 1; @@ -500,8 +496,7 @@ fn test_basic_flow() { delete_kv(task_3.extra_write.v1_mut(), b"kv_k1"); put_raft_kv(task_3.raft_wb.as_mut(), 37); delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); - task_3.entries.append(&mut vec![new_entry(6, 6)]); - task_3.cut_logs = Some((7, 8)); + task_3.set_append(Some(8), vec![new_entry(6, 6)]); task_3.raft_state = Some(new_raft_state(6, 345, 6, 6)); task_3 .messages @@ -603,8 +598,7 @@ fn test_basic_flow_with_states() { lb.put_apply_state(region_1, 5, &apply_state_3).unwrap(); put_raft_kv(task_3.raft_wb.as_mut(), 37); delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); - task_3.entries.append(&mut vec![new_entry(6, 6)]); - task_3.cut_logs = Some((7, 8)); + task_3.set_append(Some(8), vec![new_entry(6, 
6)]); task_3.raft_state = Some(new_raft_state(6, 345, 6, 6)); task_3 .messages diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index 4d6372dd582..bc85ecedc34 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -1075,9 +1075,8 @@ impl EntryStorage { self.cache.append(self.region_id, self.peer_id, &entries); - task.entries = entries; // Delete any previously appended log entries which never committed. - task.cut_logs = Some((last_index + 1, prev_last_index + 1)); + task.set_append(Some(prev_last_index + 1), entries); self.raft_state.set_last_index(last_index); self.last_term = last_term; diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index c9e460d1cbc..b060a866d71 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -2082,7 +2082,7 @@ pub mod tests { let mut lb = engines.raft.log_batch(4096); // last_index < commit_index is invalid. 
raft_state.set_last_index(11); - lb.append(1, vec![new_entry(11, RAFT_INIT_LOG_TERM)]) + lb.append(1, None, vec![new_entry(11, RAFT_INIT_LOG_TERM)]) .unwrap(); raft_state.mut_hard_state().set_commit(12); lb.put_raft_state(1, &raft_state).unwrap(); @@ -2093,7 +2093,7 @@ pub mod tests { let entries = (12..=20) .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); - lb.append(1, entries).unwrap(); + lb.append(1, None, entries).unwrap(); lb.put_raft_state(1, &raft_state).unwrap(); engines.raft.consume(&mut lb, false).unwrap(); s = build_storage().unwrap(); @@ -2138,7 +2138,7 @@ pub mod tests { .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); engines.raft.gc(1, 0, 21, &mut lb).unwrap(); - lb.append(1, entries).unwrap(); + lb.append(1, None, entries).unwrap(); engines.raft.consume(&mut lb, false).unwrap(); raft_state.mut_hard_state().set_commit(14); s = build_storage().unwrap(); @@ -2150,7 +2150,7 @@ pub mod tests { .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); entries[0].set_term(RAFT_INIT_LOG_TERM - 1); - lb.append(1, entries).unwrap(); + lb.append(1, None, entries).unwrap(); engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); @@ -2158,7 +2158,7 @@ pub mod tests { let entries = (14..=20) .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); - lb.append(1, entries).unwrap(); + lb.append(1, None, entries).unwrap(); raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM - 1); lb.put_raft_state(1, &raft_state).unwrap(); engines.raft.consume(&mut lb, false).unwrap(); @@ -2168,7 +2168,7 @@ pub mod tests { engines.raft.gc(1, 0, 21, &mut lb).unwrap(); raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); raft_state.set_last_index(13); - lb.append(1, vec![new_entry(13, RAFT_INIT_LOG_TERM)]) + lb.append(1, None, vec![new_entry(13, RAFT_INIT_LOG_TERM)]) .unwrap(); lb.put_raft_state(1, &raft_state).unwrap(); engines.raft.consume(&mut lb, false).unwrap(); diff --git 
a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 939bc2a1078..a9ef7df8c62 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -2154,7 +2154,7 @@ pub mod tests { apply_entry.set_term(0); apply_state.mut_truncated_state().set_index(10); kv.put_msg_cf(CF_RAFT, &keys::apply_state_key(region_id), &apply_state)?; - lb.append(region_id, vec![apply_entry])?; + lb.append(region_id, None, vec![apply_entry])?; // Put region info into kv engine. let region = gen_test_region(region_id, 1, 1); diff --git a/components/raftstore/src/store/worker/raftlog_gc.rs b/components/raftstore/src/store/worker/raftlog_gc.rs index ce829ed61b2..3edabae71a0 100644 --- a/components/raftstore/src/store/worker/raftlog_gc.rs +++ b/components/raftstore/src/store/worker/raftlog_gc.rs @@ -214,7 +214,7 @@ mod tests { for i in 0..100 { let mut e = Entry::new(); e.set_index(i); - raft_wb.append(region_id, vec![e]).unwrap(); + raft_wb.append(region_id, None, vec![e]).unwrap(); } raft_db.consume(&mut raft_wb, false /* sync */).unwrap(); diff --git a/components/server/src/raft_engine_switch.rs b/components/server/src/raft_engine_switch.rs index d0637a04b0a..bfaa2a6587e 100644 --- a/components/server/src/raft_engine_switch.rs +++ b/components/server/src/raft_engine_switch.rs @@ -161,7 +161,7 @@ fn run_dump_raftdb_worker( // Assume that we always scan entry first and raft state at the // end. 
batch - .append(region_id, std::mem::take(&mut entries)) + .append(region_id, None, std::mem::take(&mut entries)) .unwrap(); } _ => unreachable!("There is only 2 types of keys in raft"), @@ -170,7 +170,7 @@ fn run_dump_raftdb_worker( if local_size >= BATCH_THRESHOLD { local_size = 0; batch - .append(region_id, std::mem::take(&mut entries)) + .append(region_id, None, std::mem::take(&mut entries)) .unwrap(); let size = new_engine.consume(&mut batch, false).unwrap(); @@ -205,7 +205,7 @@ fn run_dump_raft_engine_worker( begin += old_engine .fetch_entries_to(id, begin, end, Some(BATCH_THRESHOLD), &mut entries) .unwrap() as u64; - batch.append(id, entries).unwrap(); + batch.append(id, None, entries).unwrap(); let size = new_engine.consume(&mut batch, false).unwrap(); count_size.fetch_add(size, Ordering::Relaxed); } @@ -303,7 +303,7 @@ mod tests { e.set_index(i); entries.push(e); } - batch.append(num, entries).unwrap(); + batch.append(num, None, entries).unwrap(); } // Get data from raft engine and assert. diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 496c587a7b9..8709373b766 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -966,7 +966,7 @@ fn test_debug_raft_log() { entry.set_entry_type(eraftpb::EntryType::EntryNormal); entry.set_data(vec![42].into()); let mut lb = engine.log_batch(0); - lb.append(region_id, vec![entry.clone()]).unwrap(); + lb.append(region_id, None, vec![entry.clone()]).unwrap(); engine.consume(&mut lb, false).unwrap(); assert_eq!( engine.get_entry(region_id, log_index).unwrap().unwrap(), From bce01cfbc82b58a38b066892a3c679daf91dd33f Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 3 Jan 2023 16:42:19 +0800 Subject: [PATCH 060/115] raftstore-v2: publish tablet in raftstore thread only (#14009) ref tikv/tikv#12842 Publish tablet in apply thread is unsafe. This PR moves the operation to raftstore. 
It also fixes the issues that applying two splits at a time can cause panic. It also makes sure cache will be cleared after tablet is published. Signed-off-by: Jay Lee --- components/engine_traits/src/tablet.rs | 34 +- components/raftstore-v2/src/batch/store.rs | 65 ++- components/raftstore-v2/src/fsm/store.rs | 17 +- .../operation/command/admin/compact_log.rs | 125 ++++- .../src/operation/command/admin/mod.rs | 1 + .../src/operation/command/admin/split.rs | 31 +- .../raftstore-v2/src/operation/command/mod.rs | 3 +- components/raftstore-v2/src/operation/life.rs | 10 +- components/raftstore-v2/src/operation/mod.rs | 11 +- .../raftstore-v2/src/operation/query/lease.rs | 6 +- .../raftstore-v2/src/operation/query/local.rs | 430 ++++++++++++------ .../raftstore-v2/src/operation/query/mod.rs | 4 +- .../raftstore-v2/src/operation/ready/mod.rs | 16 +- .../src/operation/ready/snapshot.rs | 52 ++- components/raftstore-v2/src/raft/apply.rs | 20 +- components/raftstore-v2/src/raft/peer.rs | 88 +--- components/raftstore-v2/src/raft/storage.rs | 76 +++- components/raftstore-v2/src/router/imp.rs | 8 +- .../raftstore-v2/src/worker/tablet_gc.rs | 15 +- .../tests/failpoints/test_split.rs | 3 + .../tests/integrations/cluster.rs | 2 +- .../raftstore/src/store/async_io/write.rs | 6 +- components/raftstore/src/store/mod.rs | 5 +- components/raftstore/src/store/worker/read.rs | 13 +- components/server/src/server2.rs | 4 +- src/config/mod.rs | 27 +- tests/integrations/config/mod.rs | 10 +- 27 files changed, 707 insertions(+), 375 deletions(-) diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs index f552fbc01aa..6bdfa97a6e6 100644 --- a/components/engine_traits/src/tablet.rs +++ b/components/engine_traits/src/tablet.rs @@ -31,6 +31,13 @@ pub struct CachedTablet { version: u64, } +impl CachedTablet { + fn release(&mut self) { + self.cache = None; + self.version = 0; + } +} + impl CachedTablet { #[inline] fn new(data: Option) -> Self { @@ -44,13 
+51,11 @@ impl CachedTablet { } } - pub fn set(&mut self, data: EK) { - self.version = { - let mut latest_data = self.latest.data.lock().unwrap(); - *latest_data = Some(data.clone()); - self.latest.version.fetch_add(1, Ordering::Relaxed) + 1 - }; - self.cache = Some(data); + pub fn set(&mut self, data: EK) -> Option { + self.cache = Some(data.clone()); + let mut latest_data = self.latest.data.lock().unwrap(); + self.version = self.latest.version.fetch_add(1, Ordering::Relaxed) + 1; + latest_data.replace(data) } /// Get the tablet from cache without checking if it's up to date. @@ -69,19 +74,6 @@ impl CachedTablet { } self.cache() } - - /// Returns how many versions has passed. - #[inline] - pub fn refresh(&mut self) -> u64 { - let old_version = self.version; - if self.latest.version.load(Ordering::Relaxed) > old_version { - let latest_data = self.latest.data.lock().unwrap(); - self.version = self.latest.version.load(Ordering::Relaxed); - self.cache = latest_data.clone(); - return self.version - old_version; - } - 0 - } } /// Context to be passed to `TabletFactory`. 
@@ -317,8 +309,10 @@ impl TabletRegistry { let mut tablets = self.tablets.tablets.lock().unwrap(); for (id, tablet) in tablets.iter_mut() { if !f(*id, tablet) { + tablet.release(); return; } + tablet.release(); } } } diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 72f05801a0e..9ba7a63139c 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -45,7 +45,7 @@ use time::Timespec; use crate::{ fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, StoreMeta}, - operation::SPLIT_PREFIX, + operation::{SharedReadTablet, SPLIT_PREFIX}, raft::Storage, router::{PeerMsg, PeerTick, StoreMsg}, worker::{pd, tablet_gc}, @@ -72,7 +72,7 @@ pub struct StoreContext { pub timer: SteadyTimer, pub schedulers: Schedulers, /// store meta - pub store_meta: Arc>, + pub store_meta: Arc>>, pub engine: ER, pub tablet_registry: TabletRegistry, pub apply_pool: FuturePool, @@ -259,7 +259,7 @@ struct StorePollerBuilder { schedulers: Schedulers, apply_pool: FuturePool, logger: Logger, - store_meta: Arc>, + store_meta: Arc>>, snap_mgr: TabletSnapManager, } @@ -273,7 +273,7 @@ impl StorePollerBuilder { router: StoreRouter, schedulers: Schedulers, logger: Logger, - store_meta: Arc>, + store_meta: Arc>>, snap_mgr: TabletSnapManager, coprocessor_host: CoprocessorHost, ) -> Self { @@ -429,13 +429,22 @@ pub struct Schedulers { pub split_check: Scheduler, } +impl Schedulers { + fn stop(&self) { + self.read.stop(); + self.pd.stop(); + self.tablet_gc.stop(); + self.split_check.stop(); + } +} + /// A set of background threads that will processing offloaded work from /// raftstore. 
struct Workers { /// Worker for fetching raft logs asynchronously async_read: Worker, pd: LazyWorker, - tablet_gc_worker: Worker, + tablet_gc: Worker, async_write: StoreWriters, purge: Option, @@ -448,18 +457,29 @@ impl Workers { Self { async_read: Worker::new("async-read-worker"), pd, - tablet_gc_worker: Worker::new("tablet-gc-worker"), + tablet_gc: Worker::new("tablet-gc-worker"), async_write: StoreWriters::default(), purge, background, } } + + fn stop(mut self) { + self.async_write.shutdown(); + self.async_read.stop(); + self.pd.stop(); + self.tablet_gc.stop(); + if let Some(w) = self.purge { + w.stop(); + } + } } /// The system used for polling Raft activities. pub struct StoreSystem { system: BatchSystem, StoreFsm>, workers: Option>, + schedulers: Option>, logger: Logger, shutdown: Arc, } @@ -474,7 +494,7 @@ impl StoreSystem { trans: T, pd_client: Arc, router: &StoreRouter, - store_meta: Arc>, + store_meta: Arc>>, snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 @@ -548,7 +568,7 @@ impl StoreSystem { ), ); - let tablet_gc_scheduler = workers.tablet_gc_worker.start( + let tablet_gc_scheduler = workers.tablet_gc.start_with_timer( "tablet-gc-worker", tablet_gc::Runner::new(tablet_registry.clone(), self.logger.clone()), ); @@ -568,13 +588,14 @@ impl StoreSystem { tablet_registry, trans, router.clone(), - schedulers, + schedulers.clone(), self.logger.clone(), store_meta.clone(), snap_mgr, coprocessor_host, ); self.workers = Some(workers); + self.schedulers = Some(schedulers); let peers = builder.init()?; // Choose a different name so we know what version is actually used. rs stands // for raft store. 
@@ -585,9 +606,14 @@ impl StoreSystem { let mut address = Vec::with_capacity(peers.len()); { let mut meta = store_meta.as_ref().lock().unwrap(); - for (region_id, (tx, fsm)) in peers { - meta.readers - .insert(region_id, fsm.peer().generate_read_delegate()); + for (region_id, (tx, mut fsm)) in peers { + if let Some(tablet) = fsm.peer_mut().tablet() { + let read_tablet = SharedReadTablet::new(tablet.clone()); + meta.readers.insert( + region_id, + (fsm.peer().generate_read_delegate(), read_tablet), + ); + } address.push(region_id); mailboxes.push(( @@ -612,18 +638,16 @@ impl StoreSystem { if self.workers.is_none() { return; } - let mut workers = self.workers.take().unwrap(); + let workers = self.workers.take().unwrap(); - // TODO: gracefully shutdown future pool + // TODO: gracefully shutdown future apply pool + // Stop schedulers first, so all background future worker pool will be stopped + // gracefully. + self.schedulers.take().unwrap().stop(); self.system.shutdown(); - workers.async_write.shutdown(); - workers.async_read.stop(); - workers.pd.stop(); - if let Some(w) = workers.purge { - w.stop(); - } + workers.stop(); } } @@ -707,6 +731,7 @@ where let system = StoreSystem { system, workers: None, + schedulers: None, logger: logger.clone(), shutdown: Arc::new(AtomicBool::new(false)), }; diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index f107715a535..a5f22d7e1a8 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -12,9 +12,7 @@ use engine_traits::{KvEngine, RaftEngine}; use futures::{compat::Future01CompatExt, FutureExt}; use keys::{data_end_key, data_key}; use kvproto::metapb::Region; -use raftstore::store::{ - fsm::store::StoreRegionMeta, Config, ReadDelegate, RegionReadProgressRegistry, -}; +use raftstore::store::{fsm::store::StoreRegionMeta, Config, RegionReadProgressRegistry}; use slog::{info, o, Logger}; use tikv_util::{ future::poll_future_notify, @@ -24,13 
+22,14 @@ use tikv_util::{ use crate::{ batch::StoreContext, + operation::ReadDelegatePair, router::{StoreMsg, StoreTick}, }; -pub struct StoreMeta { +pub struct StoreMeta { pub store_id: u64, /// region_id -> reader - pub readers: HashMap, + pub readers: HashMap>, /// region_id -> `RegionReadProgress` pub region_read_progress: RegionReadProgressRegistry, /// (region_end_key, epoch.version) -> region_id @@ -42,9 +41,9 @@ pub struct StoreMeta { pub(crate) regions: HashMap, } -impl StoreMeta { - pub fn new(store_id: u64) -> StoreMeta { - StoreMeta { +impl StoreMeta { + pub fn new(store_id: u64) -> Self { + Self { store_id, readers: HashMap::default(), region_read_progress: RegionReadProgressRegistry::default(), @@ -96,7 +95,7 @@ impl StoreMeta { } } -impl StoreRegionMeta for StoreMeta { +impl StoreRegionMeta for StoreMeta { #[inline] fn store_id(&self) -> u64 { self.store_id diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index c36c7353871..7127cd45306 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -17,7 +17,9 @@ use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest}; use protobuf::Message; use raftstore::{ - store::{fsm::new_admin_request, needs_evict_entry_cache, Transport, WriteTask}, + store::{ + fsm::new_admin_request, needs_evict_entry_cache, Transport, WriteTask, RAFT_INIT_LOG_INDEX, + }, Result, }; use slog::{debug, error, info}; @@ -32,6 +34,47 @@ use crate::{ worker::tablet_gc, }; +#[derive(Debug)] +pub struct CompactLogContext { + skipped_ticks: usize, + approximate_log_size: u64, + last_applying_index: u64, + /// Tombstone tablets can only be destroyed when the tablet that replaces it + /// is persisted. 
This is a list of tablet index that awaits to be + /// persisted. When persisted_apply is advanced, we need to notify tablet_gc + /// worker to destroy them. + tombstone_tablets_wait_index: Vec, +} + +impl CompactLogContext { + pub fn new(last_applying_index: u64) -> CompactLogContext { + CompactLogContext { + skipped_ticks: 0, + approximate_log_size: 0, + last_applying_index, + tombstone_tablets_wait_index: vec![], + } + } + + #[inline] + pub fn maybe_skip_compact_log(&mut self, max_skip_ticks: usize) -> bool { + if self.skipped_ticks < max_skip_ticks { + self.skipped_ticks += 1; + true + } else { + false + } + } + + pub fn add_log_size(&mut self, size: u64) { + self.approximate_log_size += size; + } + + pub fn set_last_applying_index(&mut self, index: u64) { + self.last_applying_index = index; + } +} + impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { pub fn on_compact_log_tick(&mut self, force: bool) { if !self.fsm.peer().is_leader() { @@ -130,13 +173,16 @@ impl Peer { replicated_idx } else if applied_idx > first_idx && applied_idx - first_idx >= store_ctx.cfg.raft_log_gc_count_limit() - || self.approximate_raft_log_size() >= store_ctx.cfg.raft_log_gc_size_limit().0 + || self.compact_log_context().approximate_log_size + >= store_ctx.cfg.raft_log_gc_size_limit().0 { std::cmp::max(first_idx + (last_idx - first_idx) / 2, replicated_idx) } else if replicated_idx < first_idx || last_idx - first_idx < 3 || replicated_idx - first_idx < store_ctx.cfg.raft_log_gc_threshold - && self.maybe_skip_compact_log(store_ctx.cfg.raft_log_reserve_max_ticks) + && self + .compact_log_context_mut() + .maybe_skip_compact_log(store_ctx.cfg.raft_log_reserve_max_ticks) { return; } else { @@ -163,7 +209,7 @@ impl Peer { let (ch, _) = CmdResChannel::pair(); self.on_admin_command(store_ctx, req, ch); - self.reset_skip_compact_log_ticks(); + self.compact_log_context_mut().skipped_ticks = 0; } } @@ -217,6 +263,46 @@ impl Apply { } impl Peer { + #[inline] + 
pub fn record_tombstone_tablet( + &mut self, + ctx: &StoreContext, + old_tablet: EK, + new_tablet_index: u64, + ) { + let compact_log_context = self.compact_log_context_mut(); + compact_log_context + .tombstone_tablets_wait_index + .push(new_tablet_index); + let _ = ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::prepare_destroy( + old_tablet, + self.region_id(), + new_tablet_index, + )); + } + + /// Returns if there's any tombstone being removed. + #[inline] + fn remove_tombstone_tablets(&mut self, persisted: u64) -> bool { + let compact_log_context = self.compact_log_context_mut(); + let removed = compact_log_context + .tombstone_tablets_wait_index + .iter() + .take_while(|i| **i <= persisted) + .count(); + if removed > 0 { + compact_log_context + .tombstone_tablets_wait_index + .drain(..removed); + true + } else { + false + } + } + pub fn on_apply_res_compact_log( &mut self, store_ctx: &mut StoreContext, @@ -260,18 +346,25 @@ impl Peer { self.compact_log_from_engine(store_ctx); } - let applied = *self.last_applying_index_mut(); + let context = self.compact_log_context_mut(); + let applied = context.last_applying_index; let total_cnt = applied - old_truncated; let remain_cnt = applied - res.compact_index; - self.update_approximate_raft_log_size(|s| s * remain_cnt / total_cnt); + context.approximate_log_size = + (context.approximate_log_size as f64 * (remain_cnt as f64 / total_cnt as f64)) as u64; } + /// Called when apply index is persisted. There are two different situation: + /// + /// Generally, additional writes are triggered to persist apply index. In + /// this case task is `Some`. But after applying snapshot, the apply + /// index is persisted ahead of time. In this case task is `None`. 
#[inline] pub fn on_advance_persisted_apply_index( &mut self, store_ctx: &mut StoreContext, old_persisted: u64, - task: &mut WriteTask, + task: Option<&mut WriteTask>, ) { let new_persisted = self.storage().apply_trace().persisted_apply_index(); if old_persisted < new_persisted { @@ -286,14 +379,20 @@ impl Peer { } else { self.set_has_extra_write(); } - if old_persisted < self.entry_storage().truncated_index() + 1 { + // If it's snapshot, logs are gc already. + if task.is_some() && old_persisted < self.entry_storage().truncated_index() + 1 { self.compact_log_from_engine(store_ctx); } - if self.remove_tombstone_tablets_before(new_persisted) { + if self.remove_tombstone_tablets(new_persisted) { let sched = store_ctx.schedulers.tablet_gc.clone(); - task.persisted_cbs.push(Box::new(move || { + if let Some(task) = task { + task.persisted_cbs.push(Box::new(move || { + let _ = sched.schedule(tablet_gc::Task::destroy(region_id, new_persisted)); + })); + } else { + // In snapshot, the index is persisted, tablet can be destroyed directly. let _ = sched.schedule(tablet_gc::Task::destroy(region_id, new_persisted)); - })) + } } } } @@ -302,6 +401,10 @@ impl Peer { let truncated = self.entry_storage().truncated_index() + 1; let persisted_applied = self.storage().apply_trace().persisted_apply_index(); let compact_index = std::cmp::min(truncated, persisted_applied); + if compact_index == RAFT_INIT_LOG_INDEX + 1 { + // There is no logs at RAFT_INIT_LOG_INDEX, nothing to delete. + return; + } // Raft Engine doesn't care about first index. 
if let Err(e) = store_ctx diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 4f2abb9c65e..9ceaa76c03b 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -5,6 +5,7 @@ mod conf_change; mod split; mod transfer_leader; +pub use compact_log::CompactLogContext; use compact_log::CompactLogResult; use conf_change::ConfChangeResult; use engine_traits::{KvEngine, RaftEngine}; diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index faf059b3871..add5af1ce52 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -25,7 +25,7 @@ //! created by the store, and here init it using the data sent from the parent //! peer. -use std::{borrow::Cow, cmp, path::PathBuf}; +use std::{any::Any, borrow::Cow, cmp, path::PathBuf}; use collections::HashSet; use crossbeam::channel::SendError; @@ -58,7 +58,7 @@ use slog::info; use crate::{ batch::StoreContext, fsm::{ApplyResReporter, PeerFsmDelegate}, - operation::AdminCmdResult, + operation::{AdminCmdResult, SharedReadTablet}, raft::{Apply, Peer}, router::{CmdResChannel, PeerMsg, PeerTick, StoreMsg}, worker::tablet_gc, @@ -73,6 +73,10 @@ pub struct SplitResult { // The index of the derived region in `regions` pub derived_index: usize, pub tablet_index: u64, + // Hack: in common case we should use generic, but split is an unfrequent + // event that performance is not critical. And using `Any` can avoid polluting + // all existing code. + tablet: Box, } #[derive(Debug)] @@ -370,8 +374,6 @@ impl Apply { ) }); } - // Remove the old write batch. 
- self.write_batch.take(); let reg = self.tablet_registry(); let path = reg.tablet_path(region_id, log_index); let mut ctx = TabletContext::new(®ions[derived_index], Some(log_index)); @@ -380,7 +382,7 @@ impl Apply { // TODO: Should we avoid flushing for the old tablet? ctx.flush_state = Some(self.flush_state().clone()); let tablet = reg.tablet_factory().open_tablet(ctx, &path).unwrap(); - self.publish_tablet(tablet); + self.set_tablet(tablet.clone()); self.region_state_mut() .set_region(regions[derived_index].clone()); @@ -396,6 +398,7 @@ impl Apply { regions, derived_index, tablet_index: log_index, + tablet: Box::new(tablet), }), )) } @@ -427,10 +430,14 @@ impl Peer { }; fail_point!("on_split_invalidate_locks"); + let tablet: EK = match res.tablet.downcast() { + Ok(t) => *t, + Err(t) => unreachable!("tablet type should be the same: {:?}", t), + }; { let mut meta = store_ctx.store_meta.lock().unwrap(); meta.set_region(derived, true, &self.logger); - let reader = meta.readers.get_mut(&derived.get_id()).unwrap(); + let (reader, read_tablet) = meta.readers.get_mut(&derived.get_id()).unwrap(); self.set_region( &store_ctx.coprocessor_host, reader, @@ -438,6 +445,12 @@ impl Peer { RegionChangeReason::Split, res.tablet_index, ); + + // Tablet should be updated in lock to match the epoch. 
+ *read_tablet = SharedReadTablet::new(tablet.clone()); + } + if let Some(tablet) = self.set_tablet(tablet) { + self.record_tombstone_tablet(store_ctx, tablet, res.tablet_index); } self.post_split(); @@ -457,8 +470,6 @@ impl Peer { self.split_flow_control_mut().may_skip_split_check = false; self.add_pending_tick(PeerTick::SplitRegionCheck); } - - self.record_tablet_as_tombstone_and_refresh(res.tablet_index, store_ctx); let _ = store_ctx .schedulers .tablet_gc @@ -632,7 +643,7 @@ mod test { use engine_test::{ ctor::{CfOptions, DbOptions}, - kv::TestTabletFactory, + kv::{KvTestEngine, TestTabletFactory}, }; use engine_traits::{ FlushState, Peekable, TabletContext, TabletRegistry, WriteBatch, CF_DEFAULT, DATA_CFS, @@ -679,7 +690,7 @@ mod test { } fn assert_split( - apply: &mut Apply, + apply: &mut Apply, parent_id: u64, right_derived: bool, new_region_ids: Vec, diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 4831c4abf9f..8b0d3d7d461 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -57,7 +57,8 @@ mod control; mod write; pub use admin::{ - temp_split_path, AdminCmdResult, RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX, + temp_split_path, AdminCmdResult, CompactLogContext, RequestSplit, SplitFlowControl, SplitInit, + SPLIT_PREFIX, }; pub use control::ProposalControl; pub use write::{ diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 954c6992cf9..f312162d1e5 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -320,12 +320,12 @@ impl Peer { /// memory states. 
pub fn finish_destroy(&mut self, ctx: &mut StoreContext) { info!(self.logger, "peer destroyed"); - ctx.router.close(self.region_id()); + let region_id = self.region_id(); + ctx.router.close(region_id); { - ctx.store_meta - .lock() - .unwrap() - .remove_region(self.region_id()); + let mut meta = ctx.store_meta.lock().unwrap(); + meta.remove_region(region_id); + meta.readers.remove(®ion_id); } if let Some(msg) = self.destroy_progress_mut().finish() { // The message will be dispatched to store fsm, which will create a diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index c49a14142ce..dc245c24384 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -7,9 +7,9 @@ mod query; mod ready; pub use command::{ - AdminCmdResult, CommittedEntries, ProposalControl, RequestSplit, SimpleWriteBinary, - SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, SplitFlowControl, - SPLIT_PREFIX, + AdminCmdResult, CommittedEntries, CompactLogContext, ProposalControl, RequestSplit, + SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, + SplitFlowControl, SPLIT_PREFIX, }; pub use life::DestroyProgress; pub use ready::{ @@ -17,4 +17,7 @@ pub use ready::{ StateStorage, }; -pub(crate) use self::{command::SplitInit, query::LocalReader}; +pub(crate) use self::{ + command::SplitInit, + query::{LocalReader, ReadDelegatePair, SharedReadTablet}, +}; diff --git a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs index ca92729ee6f..0abd0cccd72 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ b/components/raftstore-v2/src/operation/query/lease.rs @@ -150,7 +150,7 @@ impl Peer { pub(crate) fn maybe_renew_leader_lease( &mut self, ts: Timespec, - store_meta: &Mutex, + store_meta: &Mutex>, progress: Option, ) { // A nonleader peer should never has leader lease. 
@@ -170,12 +170,12 @@ impl Peer { }; if let Some(progress) = progress { let mut meta = store_meta.lock().unwrap(); - let reader = meta.readers.get_mut(&self.region_id()).unwrap(); + let reader = &mut meta.readers.get_mut(&self.region_id()).unwrap().0; self.maybe_update_read_progress(reader, progress); } if let Some(progress) = read_progress { let mut meta = store_meta.lock().unwrap(); - let reader = meta.readers.get_mut(&self.region_id()).unwrap(); + let reader = &mut meta.readers.get_mut(&self.region_id()).unwrap().0; self.maybe_update_read_progress(reader, progress); } } diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 2cb5497d789..e4c0aa6d0b9 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -8,7 +8,7 @@ use std::{ use batch_system::Router; use crossbeam::channel::TrySendError; -use engine_traits::{CachedTablet, KvEngine, RaftEngine, TabletRegistry}; +use engine_traits::{KvEngine, RaftEngine}; use futures::Future; use kvproto::{ errorpb, @@ -20,10 +20,9 @@ use raftstore::{ cmd_resp, util::LeaseState, worker_metrics::{self, TLS_LOCAL_READ_METRICS}, - LocalReadContext, LocalReaderCore, ReadDelegate, ReadExecutor, ReadExecutorProvider, - RegionSnapshot, RequestPolicy, + LocalReaderCore, ReadDelegate, ReadExecutorProvider, RegionSnapshot, }, - Error, Result, + Result, }; use slog::{debug, Logger}; use tikv_util::{box_err, codec::number::decode_u64, time::monotonic_raw_now, Either}; @@ -50,6 +49,87 @@ where } } +pub type ReadDelegatePair = (ReadDelegate, SharedReadTablet); + +/// A share struct for local reader. +/// +/// Though it looks like `CachedTablet`, but there are subtle differences. +/// 1. `CachedTablet` always hold the latest version of the tablet. But +/// `SharedReadTablet` should only hold the tablet that matches epoch. So it +/// will be updated only when the epoch is updated. +/// 2. 
`SharedReadTablet` should always hold a tablet and the same tablet. If +/// tablet is taken, then it should be considered as stale and should check +/// again epoch to load the new `SharedReadTablet`. +/// 3. `SharedReadTablet` may be cloned into thread local. So its cache should +/// be released as soon as possible, so there should be no strong reference +/// that prevents tablet from being dropped after it's marked as stale by other +/// threads. +pub struct SharedReadTablet { + tablet: Arc>>, + cache: Option, + source: bool, +} + +impl SharedReadTablet { + pub fn new(tablet: EK) -> Self { + Self { + tablet: Arc::new(Mutex::new(Some(tablet))), + cache: None, + source: true, + } + } + + /// Should call `fill_cache` first. + pub fn cache(&self) -> &EK { + self.cache.as_ref().unwrap() + } + + pub fn fill_cache(&mut self) -> bool + where + EK: Clone, + { + self.cache = self.tablet.lock().unwrap().clone(); + self.cache.is_some() + } + + pub fn release(&mut self) { + self.cache = None; + } +} + +impl Clone for SharedReadTablet { + fn clone(&self) -> Self { + Self { + tablet: Arc::clone(&self.tablet), + cache: None, + source: false, + } + } +} + +impl Drop for SharedReadTablet { + fn drop(&mut self) { + if self.source { + self.tablet.lock().unwrap().take(); + } + } +} + +enum ReadResult { + Ok(T), + Redirect, + RetryForStaleDelegate, + Err(E), +} + +fn fail_resp(msg: String) -> RaftCmdResponse { + let mut err = errorpb::Error::default(); + err.set_message(msg); + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(err); + resp +} + #[derive(Clone)] pub struct LocalReader where @@ -67,63 +147,69 @@ where E: KvEngine, C: MsgRouter, { - pub fn new( - store_meta: Arc>, - reg: TabletRegistry, - router: C, - logger: Logger, - ) -> Self { + pub fn new(store_meta: Arc>>, router: C, logger: Logger) -> Self { Self { - local_reader: LocalReaderCore::new(StoreMetaDelegate::new(store_meta, reg)), + local_reader: 
LocalReaderCore::new(StoreMetaDelegate::new(store_meta)), router, logger, } } - pub fn store_meta(&self) -> &Arc> { + pub fn store_meta(&self) -> &Arc>> { &self.local_reader.store_meta().store_meta } - pub fn pre_propose_raft_command( + fn pre_propose_raft_command( &mut self, req: &RaftCmdRequest, - ) -> Result, RequestPolicy)>> { - if let Some(delegate) = self.local_reader.validate_request(req)? { - let mut inspector = SnapRequestInspector { - delegate: &delegate, - logger: &self.logger, - }; - match inspector.inspect(req) { - Ok(RequestPolicy::ReadLocal) => Ok(Some((delegate, RequestPolicy::ReadLocal))), - Ok(RequestPolicy::StaleRead) => Ok(Some((delegate, RequestPolicy::StaleRead))), - // It can not handle other policies. - // TODO: we should only abort when lease expires. For other cases we should retry - // infinitely. - Ok(_) => Ok(None), - Err(e) => Err(e), + ) -> ReadResult<(CachedReadDelegate, ReadRequestPolicy)> { + let mut delegate = match self.local_reader.validate_request(req) { + Ok(Some(delegate)) => delegate, + Ok(None) => return ReadResult::Redirect, + Err(e) => return ReadResult::Err(e), + }; + + if !delegate.cached_tablet.fill_cache() { + return ReadResult::RetryForStaleDelegate; + } + let mut inspector = SnapRequestInspector { + delegate: &delegate, + logger: &self.logger, + }; + match inspector.inspect(req) { + Ok(ReadRequestPolicy::ReadLocal) => { + ReadResult::Ok((delegate, ReadRequestPolicy::ReadLocal)) } - } else { - Err(Error::RegionNotFound(req.get_header().get_region_id())) + Ok(ReadRequestPolicy::StaleRead) => { + ReadResult::Ok((delegate, ReadRequestPolicy::StaleRead)) + } + // It can not handle other policies. + // TODO: we should only abort when lease expires. For other cases we should retry + // infinitely. 
+ Ok(ReadRequestPolicy::ReadIndex) => ReadResult::Redirect, + Err(e) => ReadResult::Err(e), } } fn try_get_snapshot( &mut self, req: &RaftCmdRequest, - ) -> std::result::Result>, RaftCmdResponse> { + ) -> ReadResult, RaftCmdResponse> { match self.pre_propose_raft_command(req) { - Ok(Some((mut delegate, policy))) => { + ReadResult::Ok((mut delegate, policy)) => { let mut snap = match policy { - RequestPolicy::ReadLocal => { + ReadRequestPolicy::ReadLocal => { let region = Arc::clone(&delegate.region); - let snap = - RegionSnapshot::from_snapshot(delegate.get_snapshot(&None), region); + let snap = RegionSnapshot::from_snapshot( + Arc::new(delegate.cached_tablet.cache().snapshot()), + region, + ); // Ensures the snapshot is acquired before getting the time atomic::fence(atomic::Ordering::Release); let snapshot_ts = monotonic_raw_now(); if !delegate.is_in_leader_lease(snapshot_ts) { - return Ok(None); + return ReadResult::Redirect; } TLS_LOCAL_READ_METRICS @@ -133,18 +219,24 @@ where self.maybe_renew_lease_in_advance(&delegate, req, snapshot_ts); snap } - RequestPolicy::StaleRead => { + ReadRequestPolicy::StaleRead => { let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); - delegate.check_stale_read_safe(read_ts)?; + if let Err(e) = delegate.check_stale_read_safe(read_ts) { + return ReadResult::Err(e); + } let region = Arc::clone(&delegate.region); - let snap = - RegionSnapshot::from_snapshot(delegate.get_snapshot(&None), region); + let snap = RegionSnapshot::from_snapshot( + Arc::new(delegate.cached_tablet.cache().snapshot()), + region, + ); TLS_LOCAL_READ_METRICS .with(|m| m.borrow_mut().local_executed_requests.inc()); - delegate.check_stale_read_safe(read_ts)?; + if let Err(e) = delegate.check_stale_read_safe(read_ts) { + return ReadResult::Err(e); + } TLS_LOCAL_READ_METRICS .with(|m| m.borrow_mut().local_executed_stale_read_requests.inc()); @@ -156,10 +248,11 @@ where snap.txn_ext = Some(delegate.txn_ext.clone()); snap.bucket_meta = 
delegate.bucket_meta.clone(); - Ok(Some(snap)) + delegate.cached_tablet.release(); + + ReadResult::Ok(snap) } - Ok(None) => Ok(None), - Err(e) => { + ReadResult::Err(e) => { let mut response = cmd_resp::new_error(e); if let Some(delegate) = self .local_reader @@ -168,8 +261,10 @@ where { cmd_resp::bind_term(&mut response, delegate.term); } - Err(response) + ReadResult::Err(response) } + ReadResult::Redirect => ReadResult::Redirect, + ReadResult::RetryForStaleDelegate => ReadResult::RetryForStaleDelegate, } } @@ -179,50 +274,85 @@ where ) -> impl Future, RaftCmdResponse>> + Send { let region_id = req.header.get_ref().region_id; - let res = match self.try_get_snapshot(&req) { - res @ (Ok(Some(_)) | Err(_)) => Either::Left(res), - Ok(None) => Either::Right((self.try_to_renew_lease(region_id, &req), self.clone())), + let mut tried_cnt = 0; + let res = loop { + let res = self.try_get_snapshot(&req); + match res { + ReadResult::Ok(snap) => break Either::Left(Ok(snap)), + ReadResult::Err(e) => break Either::Left(Err(e)), + ReadResult::Redirect => { + break Either::Right((self.try_to_renew_lease(region_id, &req), self.clone())); + } + ReadResult::RetryForStaleDelegate => { + tried_cnt += 1; + if tried_cnt < 10 { + continue; + } + break Either::Left(Err(fail_resp(format!( + "internal error: failed to get valid dalegate for {}", + region_id + )))); + } + } }; worker_metrics::maybe_tls_local_read_metrics_flush(); async move { - match res { - Either::Left(Ok(Some(snap))) => Ok(snap), - Either::Left(Err(e)) => Err(e), - Either::Right((fut, mut reader)) => { - let err = match fut.await? { - Some(query_res) => { - if query_res.read().is_some() { - // If query successful, try again. - req.mut_header().set_read_quorum(false); - if let Some(snap) = reader.try_get_snapshot(&req)? 
{ - return Ok(snap); - } else { - let mut err = errorpb::Error::default(); - err.set_message(format!("no delegate found for {}", region_id)); - err - } - } else { - let QueryResult::Response(res) = query_res else { unreachable!() }; - assert!(res.get_header().has_error(), "{:?}", res); - return Err(res); + let (mut fut, mut reader) = match res { + Either::Left(Ok(snap)) => return Ok(snap), + Either::Left(Err(e)) => return Err(e), + Either::Right((fut, reader)) => (fut, reader), + }; + + let mut tried_cnt = 0; + loop { + match fut.await? { + Some(query_res) => { + if query_res.read().is_none() { + let QueryResult::Response(res) = query_res else { unreachable!() }; + assert!(res.get_header().has_error(), "{:?}", res); + return Err(res); + } + } + None => { + return Err(fail_resp(format!( + "internal error: failed to extend lease: canceled: {}", + region_id + ))); + } + } + + // If query successful, try again. + req.mut_header().set_read_quorum(false); + loop { + let r = reader.try_get_snapshot(&req); + match r { + ReadResult::Ok(snap) => return Ok(snap), + ReadResult::Err(e) => return Err(e), + ReadResult::Redirect => { + tried_cnt += 1; + if tried_cnt < 10 { + fut = reader.try_to_renew_lease(region_id, &req); + break; } + return Err(fail_resp(format!( + "internal error: can't handle msg in local reader for {}", + region_id + ))); } - None => { - let mut err = errorpb::Error::default(); - err.set_message(format!( - "failed to extend lease: canceled: {}", + ReadResult::RetryForStaleDelegate => { + tried_cnt += 1; + if tried_cnt < 10 { + continue; + } + return Err(fail_resp(format!( + "internal error: failed to get valid dalegate for {}", region_id - )); - err + ))); } - }; - let mut resp = RaftCmdResponse::default(); - resp.mut_header().set_error(err); - Err(resp) + } } - Either::Left(Ok(None)) => unreachable!(), } } } @@ -309,7 +439,7 @@ where // The reason for this to be Arc, see the comment on get_delegate in // raftstore/src/store/worker/read.rs delegate: Arc, - 
cached_tablet: CachedTablet, + cached_tablet: SharedReadTablet, } impl Deref for CachedReadDelegate @@ -335,36 +465,20 @@ where } } -impl ReadExecutor for CachedReadDelegate -where - E: KvEngine, -{ - type Tablet = E; - - fn get_tablet(&mut self) -> &E { - self.cached_tablet.latest().unwrap() - } - - fn get_snapshot(&mut self, _: &Option>) -> Arc { - Arc::new(self.cached_tablet.latest().unwrap().snapshot()) - } -} - #[derive(Clone)] struct StoreMetaDelegate where E: KvEngine, { - store_meta: Arc>, - reg: TabletRegistry, + store_meta: Arc>>, } impl StoreMetaDelegate where E: KvEngine, { - pub fn new(store_meta: Arc>, reg: TabletRegistry) -> StoreMetaDelegate { - StoreMetaDelegate { store_meta, reg } + pub fn new(store_meta: Arc>>) -> StoreMetaDelegate { + StoreMetaDelegate { store_meta } } } @@ -373,7 +487,7 @@ where E: KvEngine, { type Executor = CachedReadDelegate; - type StoreMeta = Arc>; + type StoreMeta = Arc>>; fn store_id(&self) -> Option { Some(self.store_meta.as_ref().lock().unwrap().store_id) @@ -384,14 +498,13 @@ where fn get_executor_and_len(&self, region_id: u64) -> (usize, Option) { let meta = self.store_meta.as_ref().lock().unwrap(); let reader = meta.readers.get(®ion_id).cloned(); - if let Some(reader) = reader { + if let Some((reader, read_tablet)) = reader { // If reader is not None, cache must not be None. 
- let cached_tablet = self.reg.get(region_id).unwrap(); return ( meta.readers.len(), Some(CachedReadDelegate { delegate: Arc::new(reader), - cached_tablet, + cached_tablet: read_tablet, }), ); } @@ -399,13 +512,19 @@ where } } +enum ReadRequestPolicy { + StaleRead, + ReadLocal, + ReadIndex, +} + struct SnapRequestInspector<'r> { delegate: &'r ReadDelegate, logger: &'r Logger, } impl<'r> SnapRequestInspector<'r> { - fn inspect(&mut self, req: &RaftCmdRequest) -> Result { + fn inspect(&mut self, req: &RaftCmdRequest) -> Result { assert!(!req.has_admin_request()); if req.get_requests().len() != 1 || req.get_requests().first().unwrap().get_cmd_type() != CmdType::Snap @@ -417,26 +536,26 @@ impl<'r> SnapRequestInspector<'r> { let flags = WriteBatchFlags::from_bits_check(req.get_header().get_flags()); if flags.contains(WriteBatchFlags::STALE_READ) { - return Ok(RequestPolicy::StaleRead); + return Ok(ReadRequestPolicy::StaleRead); } if req.get_header().get_read_quorum() { - return Ok(RequestPolicy::ReadIndex); + return Ok(ReadRequestPolicy::ReadIndex); } // If applied index's term differs from current raft's term, leader transfer // must happened, if read locally, we may read old value. if !self.has_applied_to_current_term() { - return Ok(RequestPolicy::ReadIndex); + return Ok(ReadRequestPolicy::ReadIndex); } // Local read should be performed, if and only if leader is in lease. // None for now. match self.inspect_lease() { - LeaseState::Valid => Ok(RequestPolicy::ReadLocal), + LeaseState::Valid => Ok(ReadRequestPolicy::ReadLocal), LeaseState::Expired | LeaseState::Suspect => { // Perform a consistent read to Raft quorum and try to renew the leader lease. 
- Ok(RequestPolicy::ReadIndex) + Ok(ReadRequestPolicy::ReadIndex) } } } @@ -480,12 +599,13 @@ mod tests { thread::{self, JoinHandle}, }; + use collections::HashSet; use crossbeam::{atomic::AtomicCell, channel::TrySendError}; use engine_test::{ ctor::{CfOptions, DbOptions}, kv::{KvTestEngine, TestTabletFactory}, }; - use engine_traits::{MiscExt, Peekable, SyncMutable, TabletContext, DATA_CFS}; + use engine_traits::{MiscExt, SyncMutable, TabletContext, TabletRegistry, DATA_CFS}; use futures::executor::block_on; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, raft_cmdpb::*}; use pd_client::BucketMeta; @@ -505,17 +625,27 @@ mod tests { #[derive(Clone)] struct MockRouter { p_router: SyncSender<(u64, PeerMsg)>, + addresses: Arc>>, } impl MockRouter { - fn new() -> (MockRouter, Receiver<(u64, PeerMsg)>) { + fn new(addresses: Arc>>) -> (MockRouter, Receiver<(u64, PeerMsg)>) { let (p_ch, p_rx) = sync_channel(1); - (MockRouter { p_router: p_ch }, p_rx) + ( + MockRouter { + p_router: p_ch, + addresses, + }, + p_rx, + ) } } impl MsgRouter for MockRouter { fn send(&self, addr: u64, cmd: PeerMsg) -> std::result::Result<(), TrySendError> { + if !self.addresses.lock().unwrap().contains(&addr) { + return Err(TrySendError::Disconnected(cmd)); + } self.p_router.send((addr, cmd)).unwrap(); Ok(()) } @@ -524,16 +654,15 @@ mod tests { #[allow(clippy::type_complexity)] fn new_reader( store_id: u64, - store_meta: Arc>, - reg: TabletRegistry, + store_meta: Arc>>, + addresses: Arc>>, ) -> ( LocalReader, Receiver<(u64, PeerMsg)>, ) { - let (ch, rx) = MockRouter::new(); + let (ch, rx) = MockRouter::new(addresses); let mut reader = LocalReader::new( store_meta, - reg, ch, Logger::root(slog::Discard, o!("key1" => "value1")), ); @@ -607,7 +736,8 @@ mod tests { let reg = TabletRegistry::new(factory, path.path()).unwrap(); let store_meta = Arc::new(Mutex::new(StoreMeta::new(store_id))); - let (mut reader, mut rx) = new_reader(store_id, store_meta.clone(), reg.clone()); + let addresses: Arc>> 
= Arc::default(); + let (mut reader, mut rx) = new_reader(store_id, store_meta.clone(), addresses.clone()); let (mix_tx, mix_rx) = sync_channel(1); let handler = mock_raftstore(mix_rx); @@ -649,9 +779,11 @@ mod tests { ); // No msg will ben sent rx.try_recv().unwrap_err(); + // It will be rejected first when processing local, and then rejected when + // trying to forward to raftstore. assert_eq!( TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.no_region.get()), - 1 + 2 ); assert_eq!( TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), @@ -683,10 +815,11 @@ mod tests { track_ver: TrackVer::new(), bucket_meta: Some(bucket_meta.clone()), }; - meta.readers.insert(1, read_delegate); // create tablet with region_id 1 and prepare some data let ctx = TabletContext::new(®ion1, Some(10)); - reg.load(ctx, true).unwrap(); + let mut tablet = reg.load(ctx, true).unwrap(); + let shared = SharedReadTablet::new(tablet.latest().unwrap().clone()); + meta.readers.insert(1, (read_delegate, shared)); } let (ch_tx, ch_rx) = sync_channel(1); @@ -701,6 +834,7 @@ mod tests { meta.readers .get_mut(&1) .unwrap() + .0 .update(ReadProgress::applied_term(term6)); }), rx, @@ -710,6 +844,7 @@ mod tests { // The first try will be rejected due to unmatched applied term but after update // the applied term by the above thread, the snapshot will be acquired by // retrying. + addresses.lock().unwrap().insert(1); let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); assert!(Arc::ptr_eq(snap.txn_ext.as_ref().unwrap(), &txn_ext)); assert!(Arc::ptr_eq( @@ -730,14 +865,16 @@ mod tests { // Case: Expire lease to make the local reader lease check fail. 
lease.expire_remote_lease(); let remote = lease.maybe_new_remote_lease(term6).unwrap(); + let meta = store_meta.clone(); // Send what we want to do to mock raftstore mix_tx .send(( Box::new(move || { - let mut meta = store_meta.lock().unwrap(); + let mut meta = meta.lock().unwrap(); meta.readers .get_mut(&1) .unwrap() + .0 .update(ReadProgress::leader_lease(remote)); }), rx, @@ -757,6 +894,25 @@ ); rx = ch_rx.recv().unwrap(); + // Case: Tablet miss should trigger retry. + { + let ctx = TabletContext::new(&region1, Some(15)); + let mut tablet = reg.load(ctx, true).unwrap(); + let shared = SharedReadTablet::new(tablet.latest().unwrap().clone()); + let mut meta = store_meta.lock().unwrap(); + meta.readers.get_mut(&1).unwrap().1 = shared; + } + block_on(reader.snapshot(cmd.clone())).unwrap(); + // Tablet miss should trigger reload tablet, so cache miss should increase. + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), + 6 + ); + assert_eq!( + TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.lease_expire.get()), + 1 + ); + // Case: Read quorum. 
let mut cmd_read_quorum = cmd.clone(); cmd_read_quorum.mut_header().set_read_quorum(true); @@ -806,8 +962,7 @@ mod tests { let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); let reg = TabletRegistry::new(factory, path.path()).unwrap(); - let store_meta = - StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::new(1))), reg.clone()); + let store_meta = StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::new(1)))); let tablet1; let tablet2; @@ -816,43 +971,46 @@ mod tests { // Create read_delegate with region id 1 let read_delegate = ReadDelegate::mock(1); - meta.readers.insert(1, read_delegate); // create tablet with region_id 1 and prepare some data let mut ctx = TabletContext::with_infinite_region(1, Some(10)); reg.load(ctx, true).unwrap(); tablet1 = reg.get(1).unwrap().latest().unwrap().clone(); tablet1.put(b"a1", b"val1").unwrap(); + let shared1 = SharedReadTablet::new(tablet1.clone()); + meta.readers.insert(1, (read_delegate, shared1)); // Create read_delegate with region id 2 let read_delegate = ReadDelegate::mock(2); - meta.readers.insert(2, read_delegate); // create tablet with region_id 1 and prepare some data ctx = TabletContext::with_infinite_region(2, Some(10)); reg.load(ctx, true).unwrap(); tablet2 = reg.get(2).unwrap().latest().unwrap().clone(); tablet2.put(b"a2", b"val2").unwrap(); + let shared2 = SharedReadTablet::new(tablet2.clone()); + meta.readers.insert(2, (read_delegate, shared2)); } let (_, delegate) = store_meta.get_executor_and_len(1); let mut delegate = delegate.unwrap(); - let tablet = delegate.get_tablet(); + assert!(delegate.cached_tablet.fill_cache()); + let tablet = delegate.cached_tablet.cache(); assert_eq!(tablet1.path(), tablet.path()); - let snapshot = delegate.get_snapshot(&None); - assert_eq!( - b"val1".to_vec(), - *snapshot.get_value(b"a1").unwrap().unwrap() - ); + let path1 = tablet.path().to_owned(); + delegate.cached_tablet.release(); let (_, delegate) = store_meta.get_executor_and_len(2); let mut delegate = 
delegate.unwrap(); - let tablet = delegate.get_tablet(); + assert!(delegate.cached_tablet.fill_cache()); + let tablet = delegate.cached_tablet.cache(); assert_eq!(tablet2.path(), tablet.path()); - let snapshot = delegate.get_snapshot(&None); - assert_eq!( - b"val2".to_vec(), - *snapshot.get_value(b"a2").unwrap().unwrap() - ); + + assert!(KvTestEngine::locked(&path1).unwrap()); + drop(tablet1); + drop(reg); + assert!(KvTestEngine::locked(&path1).unwrap()); + store_meta.store_meta.lock().unwrap().readers.remove(&1); + assert!(!KvTestEngine::locked(&path1).unwrap()); } } diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 59c6f2d0f7c..f26659c7b89 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -46,7 +46,7 @@ mod lease; mod local; mod replica; -pub(crate) use self::local::LocalReader; +pub(crate) use self::local::{LocalReader, ReadDelegatePair, SharedReadTablet}; impl<'a, EK: KvEngine, ER: RaftEngine, T: raftstore::store::Transport> PeerFsmDelegate<'a, EK, ER, T> @@ -436,7 +436,7 @@ impl Peer { } let progress = ReadProgress::applied_term(applied_term); let mut meta = ctx.store_meta.lock().unwrap(); - let reader = meta.readers.get_mut(&self.region_id()).unwrap(); + let reader = &mut meta.readers.get_mut(&self.region_id()).unwrap().0; self.maybe_update_read_progress(reader, progress); } } diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 8a0e0770b1f..fe4208db549 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -306,7 +306,8 @@ impl Peer { let mut update_lease = self.is_leader(); if update_lease { for entry in committed_entries.iter().rev() { - self.update_approximate_raft_log_size(|s| s + entry.get_data().len() as u64); + self.compact_log_context_mut() + 
.add_log_size(entry.get_data().len() as u64); if update_lease { let propose_time = self .proposals() @@ -329,7 +330,8 @@ } let applying_index = committed_entries.last().unwrap().index; let commit_to_current_term = committed_entries.last().unwrap().term == self.term(); - *self.last_applying_index_mut() = applying_index; + self.compact_log_context_mut() + .set_last_applying_index(applying_index); if needs_evict_entry_cache(ctx.cfg.evict_cache_on_memory_ratio) { // Compact all cached entries instead of half evict. self.entry_storage_mut().evict_entry_cache(false); @@ -426,7 +428,7 @@ self.merge_state_changes_to(&mut write_task); self.storage_mut() .handle_raft_ready(ctx, &mut ready, &mut write_task); - self.on_advance_persisted_apply_index(ctx, prev_persisted, &mut write_task); + self.on_advance_persisted_apply_index(ctx, prev_persisted, Some(&mut write_task)); if !ready.persisted_messages().is_empty() { write_task.messages = ready @@ -612,9 +614,11 @@ // leader apply the split command or an election timeout is passed since split // is committed. We already forbid renewing lease after committing split, and // original leader will update the reader delegate with latest epoch after - // applying split before the split peer starts campaign, so here the only thing - // we need to do is marking split is committed (which is done by `commit_to` - // above). It's correct to allow local read during split. + // applying split before the split peer starts campaign, so what needs to be + // done are 1. mark split is committed, which is done by `commit_to` above, + // 2. make sure split result is invisible until epoch is updated or reader may + // miss data from the new tablet. This is done by always publishing the tablet + // in `on_apply_res_split`. So it's correct to allow local read during split. 
// // - For merge, after the prepare merge command is committed, the target peers // may apply commit merge at any time, so we need to forbid any type of read diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 76a5b4297b3..2e1b9362a69 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -45,7 +45,7 @@ use tikv_util::box_err; use crate::{ fsm::ApplyResReporter, - operation::command::temp_split_path, + operation::{command::temp_split_path, SharedReadTablet}, raft::{Apply, Peer, Storage}, Result, StoreContext, }; @@ -197,7 +197,8 @@ impl Peer { StateRole::Follower, ); let persisted_index = self.persisted_index(); - *self.last_applying_index_mut() = persisted_index; + self.compact_log_context_mut() + .set_last_applying_index(persisted_index); let snapshot_index = self.entry_storage().truncated_index(); assert!(snapshot_index >= RAFT_INIT_LOG_INDEX, "{:?}", self.logger); // If leader sends a message append to the follower while it's applying @@ -211,18 +212,41 @@ impl Peer { let mut tablet_ctx = TabletContext::new(self.region(), Some(snapshot_index)); // Use a new FlushState to avoid conflicts with the old one. 
tablet_ctx.flush_state = Some(flush_state); - ctx.tablet_registry.load(tablet_ctx, false).unwrap(); - self.record_tablet_as_tombstone_and_refresh(snapshot_index, ctx); + let path = ctx.tablet_registry.tablet_path(region_id, snapshot_index); + assert!( + path.exists(), + "{:?} {} not exists", + self.logger.list(), + path.display() + ); + let tablet = ctx + .tablet_registry + .tablet_factory() + .open_tablet(tablet_ctx, &path) + .unwrap_or_else(|e| { + panic!( + "{:?} failed to load tablet at {}: {:?}", + self.logger.list(), + path.display(), + e + ); + }); + + let prev_persisted_applied = self.storage().apply_trace().persisted_apply_index(); self.storage_mut().on_applied_snapshot(); self.raft_group_mut().advance_apply_to(snapshot_index); + let read_tablet = SharedReadTablet::new(tablet.clone()); { let mut meta = ctx.store_meta.lock().unwrap(); meta.set_region(self.region(), true, &self.logger); meta.readers - .insert(region_id, self.generate_read_delegate()); + .insert(region_id, (self.generate_read_delegate(), read_tablet)); meta.region_read_progress .insert(region_id, self.read_progress().clone()); } + if let Some(tablet) = self.set_tablet(tablet) { + self.record_tombstone_tablet(ctx, tablet, snapshot_index); + } self.read_progress_mut().update_applied_core(snapshot_index); let split = self.storage_mut().split_init_mut().take(); if split.as_ref().map_or(true, |s| { @@ -234,6 +258,7 @@ impl Peer { info!(self.logger, "init split with snapshot finished"); self.post_split_init(ctx, init); } + self.on_advance_persisted_apply_index(ctx, prev_persisted_applied, None); self.schedule_apply_fsm(ctx); } } @@ -506,7 +531,22 @@ impl Storage { let old_last_index = self.entry_storage().last_index(); if self.entry_storage().first_index() <= old_last_index { // All states are rewritten in the following blocks. Stale states will be - // cleaned up by compact worker. + // cleaned up by compact worker. 
Have to use raft write batch here because + raft log engine expects deletes before writes. + let raft_engine = self.entry_storage().raft_engine(); + if task.raft_wb.is_none() { + task.raft_wb = Some(raft_engine.log_batch(64)); + } + let wb = task.raft_wb.as_mut().unwrap(); + raft_engine + .clean(region.get_id(), 0, self.entry_storage().raft_state(), wb) + .unwrap_or_else(|e| { + panic!( + "{:?} failed to clean up region: {:?}", + self.logger().list(), + e + ) + }); self.entry_storage_mut().clear(); } diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 6818d7ae0d9..8660e4795d0 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -2,7 +2,7 @@ use std::{mem, sync::Arc}; -use engine_traits::{CachedTablet, FlushState, KvEngine, TabletRegistry, WriteBatch, DATA_CFS_LEN}; +use engine_traits::{FlushState, KvEngine, TabletRegistry, WriteBatch, DATA_CFS_LEN}; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; use raftstore::store::{ fsm::{apply::DEFAULT_APPLY_WB_SIZE, ApplyMetrics}, @@ -19,8 +19,6 @@ use crate::{ /// Apply applies all the committed commands to kv db. pub struct Apply { peer: metapb::Peer, - /// publish the update of the tablet - remote_tablet: CachedTablet, tablet: EK, pub write_batch: Option, /// A buffer for encoding key. @@ -79,7 +77,6 @@ impl Apply { Apply { peer, tablet: remote_tablet.latest().unwrap().clone(), - remote_tablet, write_batch: None, callbacks: vec![], tombstone: false, @@ -155,13 +152,16 @@ impl Apply { &mut self.region_state } - /// Publish the tablet so that it can be used by read worker. - /// - /// Note, during split/merge, lease is expired explicitly and read is - /// forbidden. So publishing it immediately is OK. + /// The tablet can't be public yet, otherwise content of latest tablet + /// doesn't match its epoch in both readers and peer fsm. 
#[inline] - pub fn publish_tablet(&mut self, tablet: EK) { - self.remote_tablet.set(tablet.clone()); + pub fn set_tablet(&mut self, tablet: EK) { + assert!( + self.write_batch.as_ref().map_or(true, |wb| wb.is_empty()), + "{:?}", + self.logger.list() + ); + self.write_batch.take(); self.tablet = tablet; } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index f3734b6821d..bc3d8a5af8e 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -30,10 +30,10 @@ use crate::{ batch::StoreContext, fsm::ApplyScheduler, operation::{ - AsyncWriter, DestroyProgress, ProposalControl, SimpleWriteReqEncoder, SplitFlowControl, + AsyncWriter, CompactLogContext, DestroyProgress, ProposalControl, SimpleWriteReqEncoder, + SplitFlowControl, }, router::{CmdResChannel, PeerTick, QueryResChannel}, - worker::tablet_gc, Result, }; @@ -43,11 +43,6 @@ const REGION_READ_PROGRESS_CAP: usize = 128; pub struct Peer { raft_group: RawNode>, tablet: CachedTablet, - /// Tombstone tablets can only be destroyed when the tablet that replaces it - /// is persisted. This is a list of tablet index that awaits to be - /// persisted. When persisted_apply is advanced, we need to notify tablet_gc - /// worker to destroy them. - pending_tombstone_tablets: Vec, /// Statistics for self. self_stat: PeerStat, @@ -60,9 +55,7 @@ pub struct Peer { peer_heartbeats: HashMap, /// For raft log compaction. - skip_compact_log_ticks: usize, - approximate_raft_log_size: u64, - last_applying_index: u64, + compact_log_context: CompactLogContext, /// Encoder for batching proposals and encoding them in a more efficient way /// than protobuf. 
@@ -151,13 +144,10 @@ impl Peer { let tag = format!("[region {}] {}", region.get_id(), peer_id); let mut peer = Peer { tablet: cached_tablet, - pending_tombstone_tablets: Vec::new(), self_stat: PeerStat::default(), peer_cache: vec![], peer_heartbeats: HashMap::default(), - skip_compact_log_ticks: 0, - approximate_raft_log_size: 0, - last_applying_index: raft_group.store().apply_state().get_applied_index(), + compact_log_context: CompactLogContext::new(applied_index), raw_write_encoder: None, proposals: ProposalQueue::new(region_id, raft_group.raft.id), async_writer: AsyncWriter::new(region_id, peer_id), @@ -346,41 +336,18 @@ impl Peer { } #[inline] - pub fn record_tablet_as_tombstone_and_refresh( - &mut self, - new_tablet_index: u64, - ctx: &StoreContext, - ) { - if let Some(old_tablet) = self.tablet.cache() { - self.pending_tombstone_tablets.push(new_tablet_index); - let _ = ctx - .schedulers - .tablet_gc - .schedule(tablet_gc::Task::prepare_destroy( - old_tablet.clone(), - self.region_id(), - new_tablet_index, - )); - } - // TODO: Handle race between split and snapshot. So that we can assert - // `self.tablet.refresh() == 1` - assert!(self.tablet.refresh() > 0); + pub fn set_tablet(&mut self, tablet: EK) -> Option { + self.tablet.set(tablet) } - /// Returns if there's any tombstone being removed. 
#[inline] - pub fn remove_tombstone_tablets_before(&mut self, persisted: u64) -> bool { - let removed = self - .pending_tombstone_tablets - .iter() - .take_while(|i| **i <= persisted) - .count(); - if removed > 0 { - self.pending_tombstone_tablets.drain(..removed); - true - } else { - false - } + pub fn compact_log_context_mut(&mut self) -> &mut CompactLogContext { + &mut self.compact_log_context + } + + #[inline] + pub fn compact_log_context(&self) -> &CompactLogContext { + &self.compact_log_context } #[inline] @@ -543,35 +510,6 @@ impl Peer { down_peers } - #[inline] - pub fn reset_skip_compact_log_ticks(&mut self) { - self.skip_compact_log_ticks = 0; - } - - #[inline] - pub fn maybe_skip_compact_log(&mut self, max_skip_ticks: usize) -> bool { - if self.skip_compact_log_ticks < max_skip_ticks { - self.skip_compact_log_ticks += 1; - true - } else { - false - } - } - - #[inline] - pub fn approximate_raft_log_size(&self) -> u64 { - self.approximate_raft_log_size - } - - #[inline] - pub fn update_approximate_raft_log_size(&mut self, f: impl Fn(u64) -> u64) { - self.approximate_raft_log_size = f(self.approximate_raft_log_size); - } - - pub fn last_applying_index_mut(&mut self) -> &mut u64 { - &mut self.last_applying_index - } - #[inline] pub fn state_role(&self) -> StateRole { self.raft_group.raft.state diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 51bd41ba253..b0eec5a196c 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -307,8 +307,9 @@ mod tests { }; use raft::{Error as RaftError, StorageError}; use raftstore::store::{ - util::new_empty_snapshot, AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, - TabletSnapKey, TabletSnapManager, WriteTask, + util::new_empty_snapshot, write_to_db_for_test, AsyncReadNotifier, FetchedLogs, GenSnapRes, + ReadRunner, TabletSnapKey, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, + RAFT_INIT_LOG_TERM, }; 
use slog::o; use tempfile::TempDir; @@ -357,14 +358,20 @@ mod tests { region } + fn new_entry(index: u64, term: u64) -> Entry { + let mut e = Entry::default(); + e.set_index(index); + e.set_term(term); + e + } + #[test] fn test_apply_snapshot() { let region = new_region(); let path = TempDir::new().unwrap(); let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()).unwrap(); - let raft_engine = - engine_test::raft::new_engine(&format!("{}", path.path().join("raft").display()), None) - .unwrap(); + let engines = engine_test::new_temp_engine(&path); + let raft_engine = engines.raft.clone(); let mut wb = raft_engine.log_batch(10); write_initial_states(&mut wb, region.clone()).unwrap(); assert!(!wb.is_empty()); @@ -381,26 +388,57 @@ mod tests { .unwrap() .unwrap(); - let snapshot = new_empty_snapshot(region.clone(), 10, 9, false); - let mut task = WriteTask::new(region.get_id(), 5, 0); - s.apply_snapshot(&snapshot, &mut task, mgr, reg).unwrap(); + let mut task = WriteTask::new(region.get_id(), 5, 1); + let entries = (RAFT_INIT_LOG_INDEX + 1..RAFT_INIT_LOG_INDEX + 10) + .map(|i| new_entry(i, RAFT_INIT_LOG_TERM)) + .collect(); + s.entry_storage_mut().append(entries, &mut task); + write_to_db_for_test(&engines, task); + + let snap_index = RAFT_INIT_LOG_INDEX + 20; + let snap_term = 9; + let path = mgr.final_recv_path(&TabletSnapKey::new( + region.get_id(), + 5, + snap_term, + snap_index, + )); + reg.tablet_factory() + .open_tablet(TabletContext::new(®ion, Some(snap_index)), &path) + .unwrap(); + let snapshot = new_empty_snapshot(region.clone(), snap_index, snap_term, false); + let mut task = WriteTask::new(region.get_id(), 5, 1); + s.apply_snapshot(&snapshot, &mut task, mgr, reg.clone()) + .unwrap(); + // Add more entries to check if old entries are cleared. If not, it should panic + // with memtable hole when using raft engine. 
+ let entries = (snap_index + 1..=snap_index + 10) + .map(|i| new_entry(i, snap_term)) + .collect(); + s.entry_storage_mut().append(entries, &mut task); + + assert!(!reg.tablet_path(region.get_id(), snap_index).exists()); + assert!(!task.persisted_cbs.is_empty()); + + write_to_db_for_test(&engines, task); + + assert!(reg.tablet_path(region.get_id(), snap_index).exists()); // It can be set before load tablet. assert_eq!(PeerState::Normal, s.region_state().get_state()); - assert_eq!(10, s.entry_storage().truncated_index()); - assert_eq!(9, s.entry_storage().truncated_term()); - assert_eq!(9, s.entry_storage().last_term()); - assert_eq!(10, s.entry_storage().raft_state().last_index); + assert_eq!(snap_index, s.entry_storage().truncated_index()); + assert_eq!(snap_term, s.entry_storage().truncated_term()); + assert_eq!(snap_term, s.entry_storage().last_term()); + assert_eq!(snap_index + 10, s.entry_storage().raft_state().last_index); // This index can't be set before load tablet. - assert_ne!(10, s.entry_storage().applied_index()); - assert_ne!(9, s.entry_storage().applied_term()); - assert_eq!(10, s.region_state().get_tablet_index()); - assert!(!task.persisted_cbs.is_empty()); + assert_ne!(snap_index, s.entry_storage().applied_index()); + assert_ne!(snap_term, s.entry_storage().applied_term()); + assert_eq!(snap_index, s.region_state().get_tablet_index()); s.on_applied_snapshot(); - assert_eq!(10, s.entry_storage().applied_index()); - assert_eq!(9, s.entry_storage().applied_term()); - assert_eq!(10, s.region_state().get_tablet_index()); + assert_eq!(snap_index, s.entry_storage().applied_index()); + assert_eq!(snap_term, s.entry_storage().applied_term()); + assert_eq!(snap_index, s.region_state().get_tablet_index()); } #[test] diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 668d7591a40..7a10c6c6b16 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -6,7 +6,7 
@@ use std::{ }; use crossbeam::channel::TrySendError; -use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; +use engine_traits::{KvEngine, RaftEngine}; use futures::Future; use kvproto::{ raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, @@ -115,13 +115,13 @@ where } impl RaftRouter { - pub fn new(store_id: u64, reg: TabletRegistry, router: StoreRouter) -> Self { + pub fn new(store_id: u64, router: StoreRouter) -> Self { let store_meta = Arc::new(Mutex::new(StoreMeta::new(store_id))); let logger = router.logger().clone(); RaftRouter { router: router.clone(), - local_reader: LocalReader::new(store_meta, reg, router, logger), + local_reader: LocalReader::new(store_meta, router, logger), } } @@ -138,7 +138,7 @@ impl RaftRouter { self.router.check_send(addr, msg) } - pub fn store_meta(&self) -> &Arc> { + pub fn store_meta(&self) -> &Arc>> { self.local_reader.store_meta() } diff --git a/components/raftstore-v2/src/worker/tablet_gc.rs b/components/raftstore-v2/src/worker/tablet_gc.rs index cc1fcd971e9..aba477f883f 100644 --- a/components/raftstore-v2/src/worker/tablet_gc.rs +++ b/components/raftstore-v2/src/worker/tablet_gc.rs @@ -9,7 +9,7 @@ use std::{ use collections::HashMap; use engine_traits::{DeleteStrategy, KvEngine, Range, TabletContext, TabletRegistry}; use kvproto::metapb::Region; -use slog::{error, warn, Logger}; +use slog::{debug, error, warn, Logger}; use tikv_util::worker::{Runnable, RunnableWithTimer}; pub enum Task { @@ -156,10 +156,15 @@ impl Runner { "path" => path.display(), ), Ok(false) => { + let (_, region_id, tablet_index) = + registry.parse_tablet_name(path).unwrap_or(("", 0, 0)); // TODO: use a meaningful table context. 
let _ = registry .tablet_factory() - .destroy_tablet(TabletContext::with_infinite_region(0, None), path) + .destroy_tablet( + TabletContext::with_infinite_region(region_id, Some(tablet_index)), + path, + ) .map_err(|e| { warn!( logger, @@ -170,7 +175,9 @@ impl Runner { }); return true; } - _ => {} + Ok(true) => { + debug!(logger, "ignore locked tablet"; "path" => path.display()); + } } false } @@ -222,6 +229,6 @@ where } fn get_interval(&self) -> Duration { - Duration::from_secs(2) + Duration::from_secs(10) } } diff --git a/components/raftstore-v2/tests/failpoints/test_split.rs b/components/raftstore-v2/tests/failpoints/test_split.rs index 79356ae5805..e67041ab181 100644 --- a/components/raftstore-v2/tests/failpoints/test_split.rs +++ b/components/raftstore-v2/tests/failpoints/test_split.rs @@ -82,6 +82,9 @@ fn test_restart_resume() { .new_request_for(split_region_id) .take_header() .take_region_epoch(); + // Split will be resumed for region 2, not removing the fp will make write block + // forever. 
+ fail::remove(fp); let timer = Instant::now(); for (region_id, key, val) in cases { let mut put = SimpleWriteEncoder::with_capacity(64); diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 4c025a0fc85..ce0248130fb 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -276,7 +276,7 @@ impl RunningState { factory.open_tablet(ctx, &path).unwrap(); } - let router = RaftRouter::new(store_id, registry.clone(), router); + let router = RaftRouter::new(store_id, router); let store_meta = router.store_meta().clone(); let snap_mgr = TabletSnapManager::new(path.join("tablets_snap").to_str().unwrap()).unwrap(); diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 56d0f93a11d..817ff576f67 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -912,7 +912,6 @@ where } /// Used for test to write task to kv db and raft db. 
-#[cfg(test)] pub fn write_to_db_for_test( engines: &engine_traits::Engines, task: WriteTask, @@ -922,7 +921,8 @@ pub fn write_to_db_for_test( { let mut batch = WriteTaskBatch::new(engines.raft.log_batch(RAFT_WB_DEFAULT_SIZE)); batch.add_write_task(&engines.raft, task); - batch.before_write_to_db(&StoreWriteMetrics::new(false)); + let metrics = StoreWriteMetrics::new(false); + batch.before_write_to_db(&metrics); if let ExtraBatchWrite::V1(kv_wb) = &mut batch.extra_batch_write { if !kv_wb.is_empty() { let mut write_opts = WriteOptions::new(); @@ -939,6 +939,8 @@ pub fn write_to_db_for_test( }); } } + batch.after_write_to_raft_db(&metrics); + batch.after_write_all(); } #[cfg(test)] diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index 62561c63cbc..42fb320035b 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -31,7 +31,10 @@ pub use self::msg::PeerInternalStat; pub use self::{ async_io::{ read::{AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, ReadTask}, - write::{PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, WriteTask}, + write::{ + write_to_db_for_test, PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, + WriteTask, + }, write_router::{WriteRouter, WriteRouterContext, WriteSenders}, }, bootstrap::{ diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index a8fc2e6e3df..a7849f5e1dd 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -286,7 +286,7 @@ impl Drop for ReadDelegate { /// #[RaftstoreCommon] pub trait ReadExecutorProvider: Send + Clone + 'static { - type Executor: ReadExecutor; + type Executor; type StoreMeta; fn store_id(&self) -> Option; @@ -687,11 +687,7 @@ where /// #[RaftstoreCommon]: LocalReader is an entry point where local read requests are dipatch to the /// relevant regions by LocalReader so that these 
requests can be handled by the /// relevant ReadDelegate respectively. -pub struct LocalReaderCore -where - D: ReadExecutor + Deref, - S: ReadExecutorProvider, -{ +pub struct LocalReaderCore { pub store_id: Cell>, store_meta: S, pub delegates: LruCache, @@ -699,7 +695,7 @@ where impl LocalReaderCore where - D: ReadExecutor + Deref + Clone, + D: Deref + Clone, S: ReadExecutorProvider, { pub fn new(store_meta: S) -> Self { @@ -827,8 +823,7 @@ where impl Clone for LocalReaderCore where - D: ReadExecutor + Deref, - S: ReadExecutorProvider, + S: Clone, { fn clone(&self) -> Self { LocalReaderCore { diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 4d4e283ea7e..5d037fa3412 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -1415,12 +1415,12 @@ impl TikvServer { raft_engine.register_config(cfg_controller); let engines_info = Arc::new(EnginesResourceInfo::new( - registry.clone(), + registry, raft_engine.as_rocks_engine().cloned(), 180, // max_samples_to_preserve )); - let router = RaftRouter::new(node.id(), registry, router); + let router = RaftRouter::new(node.id(), router); let mut coprocessor_host: CoprocessorHost = CoprocessorHost::new( router.store_router().clone(), self.config.coprocessor.clone(), diff --git a/src/config/mod.rs b/src/config/mod.rs index c78ec02182f..d2c5941c5ec 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -344,7 +344,7 @@ macro_rules! cf_config { #[online_config(skip)] pub enable_doubly_skiplist: bool, #[online_config(skip)] - pub enable_compaction_guard: bool, + pub enable_compaction_guard: Option, #[online_config(skip)] pub compaction_guard_min_output_file_size: ReadableSize, #[online_config(skip)] @@ -596,7 +596,7 @@ macro_rules! 
build_cf_opt { if $opt.enable_doubly_skiplist { cf_opts.set_doubly_skiplist(); } - if $opt.enable_compaction_guard { + if $opt.enable_compaction_guard.unwrap_or(false) { if let Some(provider) = $region_info_provider { let factory = CompactionGuardGeneratorFactory::new( $cf_name, @@ -671,7 +671,7 @@ impl Default for DefaultCfConfig { prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, enable_doubly_skiplist: true, - enable_compaction_guard: true, + enable_compaction_guard: None, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), bottommost_level_compression: DBCompressionType::Zstd, @@ -796,7 +796,7 @@ impl Default for WriteCfConfig { prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, enable_doubly_skiplist: true, - enable_compaction_guard: true, + enable_compaction_guard: None, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), bottommost_level_compression: DBCompressionType::Zstd, @@ -902,7 +902,7 @@ impl Default for LockCfConfig { prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, enable_doubly_skiplist: true, - enable_compaction_guard: false, + enable_compaction_guard: None, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), bottommost_level_compression: DBCompressionType::Disable, @@ -985,7 +985,7 @@ impl Default for RaftCfConfig { prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, enable_doubly_skiplist: true, - enable_compaction_guard: false, + enable_compaction_guard: None, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: 
ReadableSize::mb(128), bottommost_level_compression: DBCompressionType::Disable, @@ -1218,6 +1218,8 @@ impl DbConfig { match engine { EngineType::RaftKv => { self.allow_concurrent_memtable_write.get_or_insert(true); + self.defaultcf.enable_compaction_guard.get_or_insert(true); + self.writecf.enable_compaction_guard.get_or_insert(true); } EngineType::RaftKv2 => { self.enable_multi_batch_write.get_or_insert(false); @@ -1475,7 +1477,7 @@ impl Default for RaftDefaultCfConfig { prop_size_index_distance: DEFAULT_PROP_SIZE_INDEX_DISTANCE, prop_keys_index_distance: DEFAULT_PROP_KEYS_INDEX_DISTANCE, enable_doubly_skiplist: true, - enable_compaction_guard: false, + enable_compaction_guard: None, compaction_guard_min_output_file_size: ReadableSize::mb(8), compaction_guard_max_output_file_size: ReadableSize::mb(128), bottommost_level_compression: DBCompressionType::Disable, @@ -5203,7 +5205,7 @@ mod tests { // Test comopaction guard disabled. let config = DefaultCfConfig { target_file_size_base: ReadableSize::mb(16), - enable_compaction_guard: false, + enable_compaction_guard: Some(false), ..Default::default() }; let provider = Some(MockRegionInfoProvider::new(vec![])); @@ -5216,7 +5218,7 @@ mod tests { // Test compaction guard enabled but region info provider is missing. let config = DefaultCfConfig { target_file_size_base: ReadableSize::mb(16), - enable_compaction_guard: true, + enable_compaction_guard: Some(true), ..Default::default() }; let provider: Option = None; @@ -5229,7 +5231,7 @@ mod tests { // Test compaction guard enabled. 
let config = DefaultCfConfig { target_file_size_base: ReadableSize::mb(16), - enable_compaction_guard: true, + enable_compaction_guard: Some(true), compaction_guard_min_output_file_size: ReadableSize::mb(4), compaction_guard_max_output_file_size: ReadableSize::mb(64), ..Default::default() @@ -5541,22 +5543,27 @@ mod tests { cfg.raft_engine.mut_config().memory_limit = None; cfg.coprocessor_v2.coprocessor_plugin_directory = None; // Default is `None`, which is represented by not setting the key. cfg.rocksdb.write_buffer_limit = None; + cfg.rocksdb.defaultcf.enable_compaction_guard = None; cfg.rocksdb.defaultcf.level0_slowdown_writes_trigger = None; cfg.rocksdb.defaultcf.level0_stop_writes_trigger = None; cfg.rocksdb.defaultcf.soft_pending_compaction_bytes_limit = None; cfg.rocksdb.defaultcf.hard_pending_compaction_bytes_limit = None; + cfg.rocksdb.writecf.enable_compaction_guard = None; cfg.rocksdb.writecf.level0_slowdown_writes_trigger = None; cfg.rocksdb.writecf.level0_stop_writes_trigger = None; cfg.rocksdb.writecf.soft_pending_compaction_bytes_limit = None; cfg.rocksdb.writecf.hard_pending_compaction_bytes_limit = None; + cfg.rocksdb.lockcf.enable_compaction_guard = None; cfg.rocksdb.lockcf.level0_slowdown_writes_trigger = None; cfg.rocksdb.lockcf.level0_stop_writes_trigger = None; cfg.rocksdb.lockcf.soft_pending_compaction_bytes_limit = None; cfg.rocksdb.lockcf.hard_pending_compaction_bytes_limit = None; + cfg.rocksdb.raftcf.enable_compaction_guard = None; cfg.rocksdb.raftcf.level0_slowdown_writes_trigger = None; cfg.rocksdb.raftcf.level0_stop_writes_trigger = None; cfg.rocksdb.raftcf.soft_pending_compaction_bytes_limit = None; cfg.rocksdb.raftcf.hard_pending_compaction_bytes_limit = None; + cfg.raftdb.defaultcf.enable_compaction_guard = None; cfg.raftdb.defaultcf.level0_slowdown_writes_trigger = None; cfg.raftdb.defaultcf.level0_stop_writes_trigger = None; cfg.raftdb.defaultcf.soft_pending_compaction_bytes_limit = None; diff --git 
a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index c6f8e565218..0c6cf7cdd9c 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -359,7 +359,7 @@ fn test_serde_custom_tikv_config() { prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, enable_doubly_skiplist: false, - enable_compaction_guard: false, + enable_compaction_guard: Some(false), compaction_guard_min_output_file_size: ReadableSize::mb(12), compaction_guard_max_output_file_size: ReadableSize::mb(34), bottommost_level_compression: DBCompressionType::Disable, @@ -428,7 +428,7 @@ fn test_serde_custom_tikv_config() { prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, enable_doubly_skiplist: true, - enable_compaction_guard: false, + enable_compaction_guard: Some(false), compaction_guard_min_output_file_size: ReadableSize::mb(12), compaction_guard_max_output_file_size: ReadableSize::mb(34), bottommost_level_compression: DBCompressionType::Zstd, @@ -497,7 +497,7 @@ fn test_serde_custom_tikv_config() { prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, enable_doubly_skiplist: true, - enable_compaction_guard: true, + enable_compaction_guard: Some(true), compaction_guard_min_output_file_size: ReadableSize::mb(12), compaction_guard_max_output_file_size: ReadableSize::mb(34), bottommost_level_compression: DBCompressionType::Disable, @@ -566,7 +566,7 @@ fn test_serde_custom_tikv_config() { prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, enable_doubly_skiplist: true, - enable_compaction_guard: true, + enable_compaction_guard: Some(true), compaction_guard_min_output_file_size: ReadableSize::mb(12), compaction_guard_max_output_file_size: ReadableSize::mb(34), bottommost_level_compression: DBCompressionType::Disable, @@ -650,7 +650,7 @@ fn test_serde_custom_tikv_config() { prop_size_index_distance: 4000000, prop_keys_index_distance: 40000, enable_doubly_skiplist: true, - enable_compaction_guard: 
true, + enable_compaction_guard: Some(true), compaction_guard_min_output_file_size: ReadableSize::mb(12), compaction_guard_max_output_file_size: ReadableSize::mb(34), bottommost_level_compression: DBCompressionType::Disable, From 8aef20c019c969d5f7984d0ea953c0678f98cd95 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 3 Jan 2023 17:36:19 +0800 Subject: [PATCH 061/115] *: introduce slog_panic and SlogFormat (#14014) ref tikv/tikv#12842 These two are helpers to utilize the static KV pairs in logger. In the past, we use `logger.list()` to try to format the configured KV pairs, but it will not work as values are omitted. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/batch/store.rs | 7 +- components/raftstore-v2/src/fsm/store.rs | 14 +- .../operation/command/admin/conf_change.rs | 20 +-- .../src/operation/command/admin/mod.rs | 11 +- .../src/operation/command/admin/split.rs | 49 +++--- .../raftstore-v2/src/operation/command/mod.rs | 16 +- .../src/operation/command/write/mod.rs | 27 ++-- .../operation/command/write/simple_write.rs | 13 +- components/raftstore-v2/src/operation/pd.rs | 9 +- .../raftstore-v2/src/operation/query/mod.rs | 7 +- .../src/operation/ready/apply_trace.rs | 11 +- .../src/operation/ready/async_writer.rs | 23 +-- .../raftstore-v2/src/operation/ready/mod.rs | 16 +- .../src/operation/ready/snapshot.rs | 17 +-- components/raftstore-v2/src/raft/apply.rs | 6 +- components/tikv_util/src/log.rs | 142 ++++++++++++++++++ 16 files changed, 272 insertions(+), 116 deletions(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 9ba7a63139c..e25ad53df8b 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -34,6 +34,7 @@ use slog::{warn, Logger}; use tikv_util::{ box_err, config::{Tracker, VersionTrack}, + log::SlogFormat, sys::SysQuota, time::Instant as TiInstant, timer::SteadyTimer, @@ -339,9 +340,9 @@ impl 
StorePollerBuilder { let prev = regions.insert(region_id, (sender, peer_fsm)); if let Some((_, p)) = prev { return Err(box_err!( - "duplicate region {:?} vs {:?}", - p.logger().list(), - regions[®ion_id].1.logger().list() + "duplicate region {} vs {}", + SlogFormat(p.logger()), + SlogFormat(regions[®ion_id].1.logger()) )); } Ok(()) diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index a5f22d7e1a8..86e3540d23c 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -17,7 +17,9 @@ use slog::{info, o, Logger}; use tikv_util::{ future::poll_future_notify, is_zero_duration, + log::SlogFormat, mpsc::{self, LooseBoundedSender, Receiver}, + slog_panic, }; use crate::{ @@ -60,12 +62,12 @@ impl StoreMeta { .insert(region_id, (region.clone(), initialized)); // `prev` only makes sense when it's initialized. if let Some((prev, prev_init)) = prev && prev_init { - assert!(initialized, "{:?} region corrupted", logger.list()); + assert!(initialized, "{} region corrupted", SlogFormat(logger)); if prev.get_region_epoch().get_version() != version { let prev_id = self.region_ranges.remove(&(data_end_key(prev.get_end_key()), prev.get_region_epoch().get_version())); - assert_eq!(prev_id, Some(region_id), "{:?} region corrupted", logger.list()); + assert_eq!(prev_id, Some(region_id), "{} region corrupted", SlogFormat(logger)); } else { - assert!(self.region_ranges.get(&(data_end_key(prev.get_end_key()), version)).is_some(), "{:?} region corrupted", logger.list()); + assert!(self.region_ranges.get(&(data_end_key(prev.get_end_key()), version)).is_some(), "{} region corrupted", SlogFormat(logger)); return; } } @@ -74,8 +76,8 @@ impl StoreMeta { self.region_ranges .insert((data_end_key(region.get_end_key()), version), region_id) .is_none(), - "{:?} region corrupted", - logger.list() + "{} region corrupted", + SlogFormat(logger) ); } } @@ -216,7 +218,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> 
StoreFsmDelegate<'a, EK, ER, T> { fn on_start(&mut self) { if self.fsm.store.start_time.is_some() { - panic!("{:?} unable to start again", self.fsm.store.logger.list(),); + slog_panic!(self.fsm.store.logger, "store is already started"); } self.fsm.store.start_time = Some( diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 72b582d775d..6c041a551fe 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -27,7 +27,7 @@ use raftstore::{ Error, Result, }; use slog::{error, info, warn}; -use tikv_util::box_err; +use tikv_util::{box_err, slog_panic}; use super::AdminCmdResult; use crate::{ @@ -312,10 +312,10 @@ impl Apply { change_num += 1; } if change_num == 0 { - panic!( - "{:?} can't leave a non-joint config, region: {:?}", - self.logger.list(), - self.region_state() + slog_panic!( + self.logger, + "can't leave a non-joint config"; + "region" => ?self.region_state() ); } let conf_ver = region.get_region_epoch().get_conf_ver() + change_num; @@ -433,11 +433,11 @@ impl Apply { if let Some(exist_peer) = tikv_util::store::find_peer(region, store_id) { let r = exist_peer.get_role(); if r == PeerRole::IncomingVoter || r == PeerRole::DemotingVoter { - panic!( - "{:?} can't apply confchange because configuration is still in joint state, confchange: {:?}, region: {:?}", - self.logger.list(), - cp, - self.region_state() + slog_panic!( + self.logger, + "can't apply confchange because configuration is still in joint state"; + "confchange" => ?cp, + "region_state" => ?self.region_state() ); } } diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 9ceaa76c03b..52bc5329dd4 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ 
b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -15,7 +15,7 @@ use raftstore::store::{cmd_resp, fsm::apply, msg::ErrorCallback}; use slog::info; use split::SplitResult; pub use split::{temp_split_path, RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX}; -use tikv_util::box_err; +use tikv_util::{box_err, log::SlogFormat}; use txn_types::WriteBatchFlags; use crate::{batch::StoreContext, raft::Peer, router::CmdResChannel}; @@ -43,7 +43,10 @@ impl Peer { return; } if !req.has_admin_request() { - let e = box_err!("{:?} expect only execute admin command", self.logger.list()); + let e = box_err!( + "{} expect only execute admin command", + SlogFormat(&self.logger) + ); let resp = cmd_resp::new_error(e); ch.report_error(resp); return; @@ -67,8 +70,8 @@ impl Peer { // checker. if !self.applied_to_current_term() { let e = box_err!( - "{:?} peer has not applied to current term, applied_term {}, current_term {}", - self.logger.list(), + "{} peer has not applied to current term, applied_term {}, current_term {}", + SlogFormat(&self.logger), self.storage().entry_storage().applied_term(), self.term() ); diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index add5af1ce52..23fc6e3a8d9 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -54,6 +54,7 @@ use raftstore::{ Result, }; use slog::info; +use tikv_util::{log::SlogFormat, slog_panic}; use crate::{ batch::StoreContext, @@ -330,10 +331,10 @@ impl Apply { // We will freeze the memtable rather than flush it in the following PR. 
let tablet = self.tablet().clone(); let mut checkpointer = tablet.new_checkpointer().unwrap_or_else(|e| { - panic!( - "{:?} fails to create checkpoint object: {:?}", - self.logger.list(), - e + slog_panic!( + self.logger, + "fails to create checkpoint object"; + "error" => ?e ) }); @@ -348,11 +349,11 @@ impl Apply { checkpointer .create_at(&split_temp_path, None, 0) .unwrap_or_else(|e| { - panic!( - "{:?} fails to create checkpoint with path {:?}: {:?}", - self.logger.list(), - split_temp_path, - e + slog_panic!( + self.logger, + "fails to create checkpoint"; + "path" => %split_temp_path.display(), + "error" => ?e ) }); } @@ -366,11 +367,11 @@ impl Apply { checkpointer .create_at(&derived_path, None, 0) .unwrap_or_else(|e| { - panic!( - "{:?} fails to create checkpoint with path {:?}: {:?}", - self.logger.list(), - derived_path, - e + slog_panic!( + self.logger, + "fails to create checkpoint"; + "path" => %derived_path.display(), + "error" => ?e ) }); } @@ -505,10 +506,10 @@ impl Peer { .router .force_send_control(StoreMsg::SplitInit(msg)) .unwrap_or_else(|e| { - panic!( - "{:?} fails to send split peer intialization msg to store : {:?}", - self.logger.list(), - e + slog_panic!( + self.logger, + "fails to send split peer intialization msg to store"; + "error" => ?e, ) }); } @@ -556,11 +557,11 @@ impl Peer { let res = self.raft_group_mut().step(msg); let accept_snap = self.raft_group().snap().is_some(); if res.is_err() || !accept_snap { - panic!( - "{:?} failed to accept snapshot {:?} with error {}", - self.logger.list(), - res, - accept_snap + slog_panic!( + self.logger, + "failed to accept snapshot"; + "accept_snapshot" => accept_snap, + "res" => ?res, ); } let prev = self.storage_mut().split_init_mut().replace(split_init); @@ -610,7 +611,7 @@ impl Peer { break; } } - assert!(found, "{:?} {}", self.logger.list(), region_id); + assert!(found, "{} {}", SlogFormat(&self.logger), region_id); let split_trace = self.split_trace_mut(); let mut off = 0; let mut 
admin_flushed = 0; diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 8b0d3d7d461..439d2136d76 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -41,7 +41,7 @@ use raftstore::{ }; use slog::{info, warn}; use tikv_util::{ - box_err, + box_err, slog_panic, time::{duration_to_sec, monotonic_raw_now, Instant}, }; @@ -71,12 +71,12 @@ fn parse_at(logger: &slog::Logger, buf: &[u8], index: u64, let mut m = M::default(); match m.merge_from_bytes(buf) { Ok(()) => m, - Err(e) => panic!( - "{:?} data is corrupted at [{}] {}: {:?}", - logger.list(), - term, - index, - e + Err(e) => slog_panic!( + logger, + "data is corrupted"; + "term" => term, + "index" => index, + "error" => ?e, ), } } @@ -555,7 +555,7 @@ impl Apply { if let Err(e) = wb.write_callback_opt(&write_opt, || { flush_state.set_applied_index(index); }) { - panic!("failed to write data: {:?}: {:?}", self.logger.list(), e); + slog_panic!(self.logger, "failed to write data"; "error" => ?e); } self.metrics.written_bytes += wb.data_size() as u64; self.metrics.written_keys += wb.count() as u64; diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index af806e3024e..14011d6fc1b 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -11,6 +11,7 @@ use raftstore::{ }, Result, }; +use tikv_util::slog_panic; use crate::{ batch::StoreContext, @@ -150,13 +151,13 @@ impl Apply { .put_cf(cf, &self.key_buffer, value) }; res.unwrap_or_else(|e| { - panic!( - "{:?} failed to write ({}, {}) {}: {:?}", - self.logger.list(), - log_wrappers::Value::key(key), - log_wrappers::Value::value(value), - cf, - e + slog_panic!( + self.logger, + "failed to write"; + "key" => %log_wrappers::Value::key(key), + "value" => 
%log_wrappers::Value::value(value), + "cf" => cf, + "error" => ?e ); }); fail::fail_point!("APPLY_PUT", |_| Err(raftstore::Error::Other( @@ -188,12 +189,12 @@ impl Apply { .delete_cf(cf, &self.key_buffer) }; res.unwrap_or_else(|e| { - panic!( - "{:?} failed to delete {} {}: {:?}", - self.logger.list(), - log_wrappers::Value::key(key), - cf, - e + slog_panic!( + self.logger, + "failed to delete"; + "key" => %log_wrappers::Value::key(key), + "cf" => cf, + "error" => ?e ); }); self.metrics.size_diff_hint -= self.key_buffer.len() as i64; diff --git a/components/raftstore-v2/src/operation/command/write/simple_write.rs b/components/raftstore-v2/src/operation/command/write/simple_write.rs index 57c01fca9d8..e6f81b20af1 100644 --- a/components/raftstore-v2/src/operation/command/write/simple_write.rs +++ b/components/raftstore-v2/src/operation/command/write/simple_write.rs @@ -5,6 +5,7 @@ use kvproto::raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}; use protobuf::{CodedInputStream, Message}; use raftstore::store::WriteCallback; use slog::Logger; +use tikv_util::slog_panic; use crate::{operation::command::parse_at, router::CmdResChannel}; @@ -191,12 +192,12 @@ impl<'a> SimpleWriteReqDecoder<'a> { let mut is = CodedInputStream::from_bytes(&buf[1..]); let header = match is.read_message() { Ok(h) => h, - Err(e) => panic!( - "{:?} data corrupted at [{}] {}: {:?}", - logger.list(), - term, - index, - e + Err(e) => slog_panic!( + logger, + "data corrupted"; + "term" => term, + "index" => index, + "error" => ?e ), }; let read = is.pos(); diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 894f39f278b..50b612f207d 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -7,6 +7,7 @@ use fail::fail_point; use kvproto::{metapb, pdpb}; use raftstore::store::Transport; use slog::error; +use tikv_util::slog_panic; use crate::{ batch::StoreContext, @@ -137,10 +138,10 @@ impl Peer 
{ pending_peers.push(p); } else { if ctx.cfg.dev_assert { - panic!( - "{:?} failed to get peer {} from cache", - self.logger.list(), - id + slog_panic!( + self.logger, + "failed to get peer from cache"; + "get_peer_id" => id ); } error!( diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index f26659c7b89..305cdb666cc 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -30,7 +30,7 @@ use raftstore::{ Error, Result, }; use slog::{debug, info}; -use tikv_util::box_err; +use tikv_util::{box_err, log::SlogFormat}; use txn_types::WriteBatchFlags; use crate::{ @@ -363,7 +363,10 @@ impl Peer { } } StatusCmdType::InvalidStatus => { - return Err(box_err!("{:?} invalid status command!", self.logger.list())); + return Err(box_err!( + "{} invalid status command!", + SlogFormat(&self.logger) + )); } } diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index 1e9d1ef4221..5ff9a27dee0 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -41,7 +41,7 @@ use raftstore::store::{ ReadTask, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; use slog::{trace, Logger}; -use tikv_util::{box_err, worker::Scheduler}; +use tikv_util::{box_err, slog_panic, worker::Scheduler}; use crate::{ operation::{ @@ -444,11 +444,10 @@ impl Storage { return; } } - panic!( - "{:?} data loss detected: {}_{} not found", - self.logger().list(), - region_id, - tablet_index + slog_panic!( + self.logger(), + "tablet loss detected"; + "tablet_index" => tablet_index ); } diff --git a/components/raftstore-v2/src/operation/ready/async_writer.rs b/components/raftstore-v2/src/operation/ready/async_writer.rs index a2707b6d411..96f1611d9f1 100644 --- 
a/components/raftstore-v2/src/operation/ready/async_writer.rs +++ b/components/raftstore-v2/src/operation/ready/async_writer.rs @@ -9,6 +9,7 @@ use raftstore::store::{ WriteSenders, WriteTask, }; use slog::{warn, Logger}; +use tikv_util::slog_panic; use crate::{ batch::{StoreContext, StoreRouter}, @@ -117,11 +118,11 @@ impl AsyncWriter { let last_unpersisted = self.unpersisted_readies.back(); if last_unpersisted.map_or(true, |u| u.number < ready_number) { - panic!( - "{:?} ready number is too large {:?} vs {}", - logger.list(), - last_unpersisted, - ready_number + slog_panic!( + logger, + "ready number is too large"; + "last_unpersisted" => ?last_unpersisted, + "ready_number" => ready_number ); } @@ -130,15 +131,15 @@ impl AsyncWriter { // There must be a match in `self.unpersisted_readies`. loop { let Some(v) = self.unpersisted_readies.pop_front() else { - panic!("{:?} ready number not found {}", logger.list(), ready_number); + slog_panic!(logger, "ready number not found"; "ready_number" => ready_number); }; has_snapshot |= v.has_snapshot; if v.number > ready_number { - panic!( - "{:?} ready number not matched {:?} vs {}", - logger.list(), - v, - ready_number + slog_panic!( + logger, + "ready number not matched"; + "ready" => ?v, + "ready_number" => ready_number ); } if raft_messages.is_empty() { diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index fe4208db549..29452533632 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -34,6 +34,8 @@ use raftstore::{ }; use slog::{debug, error, info, trace, warn}; use tikv_util::{ + log::SlogFormat, + slog_panic, store::find_peer, time::{duration_to_sec, monotonic_raw_now}, }; @@ -388,8 +390,8 @@ impl Peer { let prev_commit_index = self.entry_storage().commit_index(); assert!( hs.get_commit() >= prev_commit_index, - "{:?} {:?} {}", - self.logger.list(), + "{} {:?} {}", + 
SlogFormat(&self.logger), hs, prev_commit_index ); @@ -456,11 +458,11 @@ impl Peer { } } if !light_rd.messages().is_empty() || light_rd.commit_index().is_some() { - panic!( - "{:?} unexpected messages [{}] commit index [{:?}]", - self.logger.list(), - light_rd.messages().len(), - light_rd.commit_index() + slog_panic!( + self.logger, + "unexpected messages"; + "messages_count" => ?light_rd.messages().len(), + "commit_index" => ?light_rd.commit_index() ); } if !light_rd.committed_entries().is_empty() { diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 2e1b9362a69..8716f0c75ea 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -41,7 +41,7 @@ use raftstore::{ }, }; use slog::{error, info, warn}; -use tikv_util::box_err; +use tikv_util::{box_err, log::SlogFormat, slog_panic}; use crate::{ fsm::ApplyResReporter, @@ -554,8 +554,8 @@ impl Storage { let last_term = snap.get_metadata().get_term(); assert!( last_index >= RAFT_INIT_LOG_INDEX && last_term >= RAFT_INIT_LOG_TERM, - "{:?}", - self.logger().list() + "{}", + SlogFormat(self.logger()) ); let region_state = self.region_state_mut(); region_state.set_state(PeerState::Normal); @@ -599,12 +599,11 @@ impl Storage { // it should load it into the factory after it persisted. 
let hook = move || { if !install_tablet(®, &path, region_id, last_index) { - panic!( - "{:?} failed to install tablet, path: {}, region_id: {}, tablet_index: {}", - logger.list(), - path.display(), - region_id, - last_index + slog_panic!( + logger, + "failed to install tablet"; + "path" => %path.display(), + "tablet_index" => last_index ); } if clean_split { diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 8660e4795d0..2407d1ab3fe 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -9,7 +9,7 @@ use raftstore::store::{ ReadTask, }; use slog::Logger; -use tikv_util::worker::Scheduler; +use tikv_util::{log::SlogFormat, worker::Scheduler}; use crate::{ operation::{AdminCmdResult, DataTrace}, @@ -71,9 +71,9 @@ impl Apply { let mut remote_tablet = tablet_registry .get(region_state.get_region().get_id()) .unwrap(); - assert_ne!(applied_term, 0, "{:?}", logger.list()); + assert_ne!(applied_term, 0, "{}", SlogFormat(&logger)); let applied_index = flush_state.applied_index(); - assert_ne!(applied_index, 0, "{:?}", logger.list()); + assert_ne!(applied_index, 0, "{}", SlogFormat(&logger)); Apply { peer, tablet: remote_tablet.latest().unwrap().clone(), diff --git a/components/tikv_util/src/log.rs b/components/tikv_util/src/log.rs index 10facfa2287..fd351eecbd4 100644 --- a/components/tikv_util/src/log.rs +++ b/components/tikv_util/src/log.rs @@ -82,3 +82,145 @@ macro_rules! debug(($($args:tt)+) => { macro_rules! 
trace(($($args:tt)+) => { ::slog_global::trace!($($args)+) };); + +use std::fmt::{self, Display, Write}; + +use slog::{BorrowedKV, OwnedKVList, Record, KV}; + +struct FormatKeyValueList<'a, W> { + buffer: &'a mut W, + written: bool, +} + +impl<'a, W: Write> slog::Serializer for FormatKeyValueList<'a, W> { + fn emit_arguments(&mut self, key: slog::Key, val: &fmt::Arguments<'_>) -> slog::Result { + if !self.written { + write!(&mut self.buffer, "[{}={}]", key, val).unwrap(); + self.written = true; + } else { + write!(&mut self.buffer, " [{}={}]", key, val).unwrap() + } + Ok(()) + } +} + +/// A helper struct to format the key-value list of a slog logger. It's not +/// exact the same format as `TiKVFormat` and etc. It's just a simple +/// implementation for panic, return errors that doesn't show in normal logs +/// processing. +pub struct SlogFormat<'a>(pub &'a slog::Logger); + +impl<'a> Display for SlogFormat<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut formatter = FormatKeyValueList { + buffer: f, + written: false, + }; + let record = slog::record_static!(slog::Level::Trace, ""); + self.0 + .list() + .serialize( + &Record::new(&record, &format_args!(""), slog::b!()), + &mut formatter, + ) + .unwrap(); + Ok(()) + } +} + +#[doc(hidden)] +pub fn format_kv_list(buffer: &mut String, kv_list: &OwnedKVList, borrow_list: BorrowedKV<'_>) { + let mut formatter = FormatKeyValueList { + buffer, + written: false, + }; + let record = slog::record_static!(slog::Level::Trace, ""); + let args = format_args!(""); + let record = Record::new(&record, &args, slog::b!()); + // Serialize borrow list first to make region_id, peer_id at the end. + borrow_list.serialize(&record, &mut formatter).unwrap(); + kv_list.serialize(&record, &mut formatter).unwrap(); +} + +/// A helper macro to panic with the key-value list of a slog logger. +/// +/// Similar to `SlogFormat`, but just panic. +#[macro_export] +macro_rules! 
slog_panic { + ($logger:expr, $msg:expr, $borrowed_kv:expr) => {{ + let owned_kv = ($logger).list(); + let mut s = String::new(); + $crate::log::format_kv_list(&mut s, &owned_kv, $borrowed_kv); + if s.is_empty() { + panic!("{}", $msg) + } else { + panic!("{} {}", $msg, s) + } + }}; + ($logger:expr, $msg:expr) => {{ + $crate::slog_panic!($logger, $msg, slog::b!()) + }}; + ($logger:expr, $msg:expr; $($arg:tt)+) => {{ + $crate::slog_panic!($logger, $msg, slog::b!($($arg)+)) + }}; +} + +#[cfg(test)] +mod tests { + #[test] + fn test_format_kv() { + let logger = slog::Logger::root(slog::Discard, slog::o!()); + let s = format!("{}", super::SlogFormat(&logger)); + assert_eq!(s, String::new()); + + let logger = logger.new(slog::o!("a" => 1)); + let s = format!("{}", super::SlogFormat(&logger)); + assert_eq!(s, "[a=1]"); + + let logger = logger.new(slog::o!("b" => 2)); + let s = format!("{}", super::SlogFormat(&logger)); + assert_eq!(s, "[b=2] [a=1]"); + } + + #[test] + fn test_slog_panic() { + let logger = slog::Logger::root(slog::Discard, slog::o!()); + let err = panic_hook::recover_safe(|| { + crate::slog_panic!(logger, "test"); + }) + .unwrap_err(); + assert_eq!(err.downcast::().unwrap().as_str(), "test"); + + let err = panic_hook::recover_safe(|| { + crate::slog_panic!(logger, "test"; "k" => "v"); + }) + .unwrap_err(); + assert_eq!(err.downcast::().unwrap().as_str(), "test [k=v]"); + + let logger = logger.new(slog::o!("a" => 1)); + let err = panic_hook::recover_safe(|| { + crate::slog_panic!(logger, "test"); + }) + .unwrap_err(); + assert_eq!(err.downcast::().unwrap().as_str(), "test [a=1]"); + + let logger = logger.new(slog::o!("b" => 2)); + let err = panic_hook::recover_safe(|| { + crate::slog_panic!(logger, "test"); + }) + .unwrap_err(); + assert_eq!( + err.downcast::().unwrap().as_str(), + "test [b=2] [a=1]" + ); + + let err = panic_hook::recover_safe(|| { + crate::slog_panic!(logger, "test"; "k" => "v"); + }) + .unwrap_err(); + assert_eq!( + 
err.downcast::().unwrap().as_str(), + "test [k=v] [b=2] [a=1]" + ); + } +} From 4619f32f07207343692dc641656822c65157c616 Mon Sep 17 00:00:00 2001 From: Connor Date: Tue, 3 Jan 2023 18:08:19 -0800 Subject: [PATCH 062/115] Introduce priority queue for priority scheduling (#14002) ref tikv/tikv#13730 Introduce priority-based channel Signed-off-by: Connor1996 Co-authored-by: Ti Chi Robot --- Cargo.lock | 59 ++-- components/concurrency_manager/Cargo.toml | 7 +- components/tikv_util/Cargo.toml | 2 + components/tikv_util/src/mpsc/mod.rs | 2 + .../tikv_util/src/mpsc/priority_queue.rs | 289 ++++++++++++++++++ 5 files changed, 314 insertions(+), 45 deletions(-) create mode 100644 components/tikv_util/src/mpsc/priority_queue.rs diff --git a/Cargo.lock b/Cargo.lock index 8433f54c512..7a3c9ced013 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -800,7 +800,7 @@ dependencies = [ "kvproto", "lazy_static", "log_wrappers", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pd_client", "prometheus", "prometheus-static-metric", @@ -1041,7 +1041,7 @@ dependencies = [ "fail", "futures 0.3.15", "kvproto", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "rand 0.8.5", "tikv_alloc", "tikv_util", @@ -1204,18 +1204,6 @@ dependencies = [ "crossbeam-utils 0.8.11", ] -[[package]] -name = "crossbeam-epoch" -version = "0.9.3" -source = "git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-utils 0.8.3", - "lazy_static", - "memoffset", - "scopeguard", -] - [[package]] name = "crossbeam-epoch" version = "0.9.8" @@ -1255,12 +1243,13 @@ dependencies = [ [[package]] name = "crossbeam-skiplist" -version = "0.0.0" -source = "git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "883a5821d7d079fcf34ac55f27a833ee61678110f6b97637cc74513c0d0b42fc" dependencies = [ "cfg-if 1.0.0", 
- "crossbeam-epoch 0.9.3", - "crossbeam-utils 0.8.3", + "crossbeam-epoch 0.9.8", + "crossbeam-utils 0.8.8", "scopeguard", ] @@ -1275,16 +1264,6 @@ dependencies = [ "lazy_static", ] -[[package]] -name = "crossbeam-utils" -version = "0.8.3" -source = "git+https://github.com/tikv/crossbeam.git?branch=tikv-5.0#e0e083d062649484188b7337fe388fd12f2c8d94" -dependencies = [ - "autocfg", - "cfg-if 1.0.0", - "lazy_static", -] - [[package]] name = "crossbeam-utils" version = "0.8.8" @@ -1379,7 +1358,7 @@ checksum = "c0834a35a3fce649144119e18da2a4d8ed12ef3862f47183fd46f625d072d96c" dependencies = [ "cfg-if 1.0.0", "num_cpus", - "parking_lot 0.12.0", + "parking_lot 0.12.1", ] [[package]] @@ -1867,7 +1846,7 @@ dependencies = [ "maligned", "online_config", "openssl", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "prometheus", "prometheus-static-metric", "rand 0.8.5", @@ -3650,9 +3629,9 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87f5ec2493a61ac0506c0f4199f99070cbe83857b0337006a30f3e6719b8ef58" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", "parking_lot_core 0.9.1", @@ -3929,7 +3908,7 @@ dependencies = [ "log", "nix 0.24.1", "once_cell", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "protobuf", "protobuf-codegen-pure", "smallvec", @@ -4214,7 +4193,7 @@ dependencies = [ "nix 0.25.0", "num-derive", "num-traits", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "prometheus", "prometheus-static-metric", "protobuf", @@ -4312,7 +4291,7 @@ dependencies = [ "openssl", "ordered-float", "panic_hook", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pd_client", "prometheus", "prometheus-static-metric", @@ -4362,7 +4341,7 @@ dependencies = [ "keys", "kvproto", "log_wrappers", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pd_client", "prometheus", "protobuf", @@ -6294,7 +6273,7 @@ dependencies = [ 
"online_config", "openssl", "panic_hook", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "paste", "pd_client", "pin-project", @@ -6515,6 +6494,7 @@ dependencies = [ "cpu-time", "crc32fast", "crossbeam", + "crossbeam-skiplist", "derive_more", "error_code", "fail", @@ -6536,6 +6516,7 @@ dependencies = [ "openssl", "page_size", "panic_hook", + "parking_lot 0.12.1", "pin-project", "procfs", "procinfo", @@ -6620,7 +6601,7 @@ dependencies = [ "memchr", "mio 0.8.5", "num_cpus", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", "socket2", @@ -6874,7 +6855,7 @@ dependencies = [ "crossbeam-utils 0.8.8", "kvproto", "lazy_static", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pin-project", "prometheus", "slab", diff --git a/components/concurrency_manager/Cargo.toml b/components/concurrency_manager/Cargo.toml index e225cbe0519..b391c1d239a 100644 --- a/components/concurrency_manager/Cargo.toml +++ b/components/concurrency_manager/Cargo.toml @@ -5,6 +5,7 @@ publish = false version = "0.0.1" [dependencies] +crossbeam-skiplist = "0.1" fail = "0.5" kvproto = { workspace = true } parking_lot = "0.12" @@ -12,12 +13,6 @@ tikv_util = { workspace = true } tokio = { version = "1.5", features = ["macros", "sync", "time"] } txn_types = { workspace = true } -# FIXME: switch to the crates.io version after crossbeam-skiplist is released -[dependencies.crossbeam-skiplist] -git = "https://github.com/tikv/crossbeam.git" -branch = "tikv-5.0" -package = "crossbeam-skiplist" - [dev-dependencies] criterion = "0.3" futures = "0.3" diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 663eb2b681f..92f3bac3d5b 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -19,6 +19,7 @@ collections = { workspace = true } cpu-time = "1.0.0" crc32fast = "1.2" crossbeam = "0.8" +crossbeam-skiplist = "0.1" derive_more = "0.99.3" error_code = { workspace = true } fail = "0.5" @@ -37,6 +38,7 @@ num-traits = 
"0.2" num_cpus = "1" online_config = { workspace = true } openssl = "0.10" +parking_lot = "0.12.1" pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" diff --git a/components/tikv_util/src/mpsc/mod.rs b/components/tikv_util/src/mpsc/mod.rs index 45249fed9bc..700691f1189 100644 --- a/components/tikv_util/src/mpsc/mod.rs +++ b/components/tikv_util/src/mpsc/mod.rs @@ -3,7 +3,9 @@ //! This module provides an implementation of mpsc channel based on //! crossbeam_channel. Comparing to the crossbeam_channel, this implementation //! supports closed detection and try operations. + pub mod future; +pub mod priority_queue; use std::{ cell::Cell, diff --git a/components/tikv_util/src/mpsc/priority_queue.rs b/components/tikv_util/src/mpsc/priority_queue.rs new file mode 100644 index 00000000000..3389d6154c3 --- /dev/null +++ b/components/tikv_util/src/mpsc/priority_queue.rs @@ -0,0 +1,289 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::{ + atomic::{AtomicPtr, AtomicU64, AtomicUsize, Ordering}, + Arc, +}; + +use crossbeam::channel::{RecvError, SendError, TryRecvError, TrySendError}; +use crossbeam_skiplist::SkipMap; +use parking_lot::{Condvar, Mutex}; + +// Create a priority based channel. Sender can send message with priority of +// u64, and receiver will receive messages in ascending order of priority. For +// two messages of same priority, the receiving order follows FIFO. 
+pub fn unbounded() -> (Sender, Receiver) { + let queue = Arc::new(PriorityQueue::new()); + let sender = Sender { + inner: queue.clone(), + }; + let receiver = Receiver { inner: queue }; + (sender, receiver) +} + +struct Cell { + ptr: AtomicPtr, +} + +unsafe impl Send for Cell {} +unsafe impl Sync for Cell {} + +impl Cell { + fn new(value: T) -> Self { + Self { + ptr: AtomicPtr::new(Box::into_raw(Box::new(value))), + } + } + + fn take(&self) -> Option { + let p = self.ptr.swap(std::ptr::null_mut(), Ordering::SeqCst); + if !p.is_null() { + unsafe { Some(*Box::from_raw(p)) } + } else { + None + } + } +} + +impl Drop for Cell { + fn drop(&mut self) { + self.take(); + } +} + +#[derive(Default)] +struct PriorityQueue { + queue: SkipMap>, + disconnected: Mutex, + available: Condvar, + + sequencer: AtomicU64, + + senders: AtomicUsize, + receivers: AtomicUsize, +} + +impl PriorityQueue { + pub fn new() -> Self { + Self { + queue: SkipMap::new(), + disconnected: Mutex::new(false), + available: Condvar::new(), + sequencer: AtomicU64::new(0), + senders: AtomicUsize::new(1), + receivers: AtomicUsize::new(1), + } + } + + pub fn get_map_key(&self, pri: u64) -> MapKey { + MapKey { + priority: pri, + sequence: self.sequencer.fetch_add(1, Ordering::Relaxed), + } + } +} + +// When derived `PartialOrd` on structs, it will produce a lexicographic +// ordering based on the top-to-bottom declaration order of the struct’s +// members. 
+#[derive(Eq, PartialEq, Ord, PartialOrd)] +struct MapKey { + priority: u64, + sequence: u64, +} + +pub struct Sender { + inner: Arc>, +} + +impl Sender { + pub fn try_send(&self, msg: T, pri: u64) -> Result<(), TrySendError> { + self.send(msg, pri) + .map_err(|SendError(msg)| TrySendError::Disconnected(msg)) + } + + pub fn send(&self, msg: T, pri: u64) -> Result<(), SendError> { + if self.inner.receivers.load(Ordering::Acquire) == 0 { + return Err(SendError(msg)); + } + self.inner + .queue + .insert(self.inner.get_map_key(pri), Cell::new(msg)); + self.inner.available.notify_one(); + Ok(()) + } + + #[cfg(test)] + fn len(&self) -> usize { + self.inner.queue.len() + } +} + +impl Clone for Sender { + fn clone(&self) -> Self { + self.inner.senders.fetch_add(1, Ordering::AcqRel); + Self { + inner: Arc::clone(&self.inner), + } + } +} + +impl Drop for Sender { + fn drop(&mut self) { + let old = self.inner.senders.fetch_sub(1, Ordering::AcqRel); + if old <= 1 { + *self.inner.disconnected.lock() = true; + self.inner.available.notify_all(); + } + } +} + +pub struct Receiver { + inner: Arc>, +} + +impl Receiver { + pub fn try_recv(&self) -> Result { + match self.inner.queue.pop_front() { + Some(entry) => Ok(entry.value().take().unwrap()), + None if self.inner.senders.load(Ordering::SeqCst) == 0 => { + Err(TryRecvError::Disconnected) + } + None => Err(TryRecvError::Empty), + } + } + + pub fn recv(&self) -> Result { + loop { + match self.try_recv() { + Ok(msg) => return Ok(msg), + Err(TryRecvError::Disconnected) => { + return Err(RecvError); + } + Err(TryRecvError::Empty) => { + let mut disconnected = self.inner.disconnected.lock(); + if *disconnected { + return Err(RecvError); + } + self.inner.available.wait(&mut disconnected); + } + } + } + } + + #[cfg(test)] + fn len(&self) -> usize { + self.inner.queue.len() + } +} + +impl Clone for Receiver { + fn clone(&self) -> Self { + self.inner.receivers.fetch_add(1, Ordering::AcqRel); + Self { + inner: Arc::clone(&self.inner), + } + 
} +} + +impl Drop for Receiver { + fn drop(&mut self) { + self.inner.receivers.fetch_sub(1, Ordering::AcqRel); + } +} + +#[cfg(test)] +mod tests { + use std::{sync::atomic::AtomicU64, thread, time::Duration}; + + use crossbeam::channel::TrySendError; + use rand::Rng; + + use super::*; + + #[test] + fn test_priority() { + let (tx, rx) = super::unbounded::(); + tx.try_send(1, 2).unwrap(); + tx.send(2, 1).unwrap(); + tx.send(3, 3).unwrap(); + + assert_eq!(rx.try_recv(), Ok(2)); + assert_eq!(rx.recv(), Ok(1)); + assert_eq!(rx.recv(), Ok(3)); + assert_eq!(rx.try_recv(), Err(TryRecvError::Empty)); + + drop(rx); + assert_eq!(tx.send(2, 1), Err(SendError(2))); + assert_eq!(tx.try_send(2, 1), Err(TrySendError::Disconnected(2))); + + let (tx, rx) = super::unbounded::(); + drop(tx); + assert_eq!(rx.recv(), Err(RecvError)); + assert_eq!(rx.try_recv(), Err(TryRecvError::Disconnected)); + + let (tx, rx) = super::unbounded::(); + thread::spawn(move || { + thread::sleep(Duration::from_millis(100)); + tx.send(10, 1).unwrap(); + }); + assert_eq!(rx.recv(), Ok(10)); + + let (tx, rx) = super::unbounded::(); + assert_eq!(tx.len(), 0); + assert_eq!(rx.len(), 0); + tx.send(2, 1).unwrap(); + tx.send(3, 2).unwrap(); + assert_eq!(tx.len(), 2); + assert_eq!(rx.len(), 2); + drop(tx); + assert_eq!(rx.try_recv(), Ok(2)); + assert_eq!(rx.recv(), Ok(3)); + assert_eq!(rx.try_recv(), Err(TryRecvError::Disconnected)); + assert_eq!(rx.recv(), Err(RecvError)); + } + + #[test] + fn test_priority_multi_thread() { + let (tx, rx) = super::unbounded::(); + + let mut handlers = Vec::with_capacity(10); + let expected_count = Arc::new(AtomicU64::new(0)); + let real_counter = Arc::new(AtomicU64::new(0)); + for _ in 0..10 { + let sender = tx.clone(); + let expected_count = expected_count.clone(); + let handle = thread::spawn(move || { + let mut rng = rand::thread_rng(); + let pri = rng.gen_range(0..1000); + let mut cnt = 0; + for i in 0..1000 { + sender.send(i, pri).unwrap(); + cnt += i; + } + 
expected_count.fetch_add(cnt, Ordering::Relaxed); + }); + handlers.push(handle); + } + for _i in 0..10 { + let recv = rx.clone(); + let real_counter = real_counter.clone(); + let handle = thread::spawn(move || { + let mut cnt = 0; + while let Ok(v) = recv.recv() { + cnt += v; + } + real_counter.fetch_add(cnt, Ordering::Relaxed); + }); + handlers.push(handle); + } + drop(tx); + for h in handlers { + h.join().unwrap(); + } + assert_eq!( + expected_count.load(Ordering::Relaxed), + real_counter.load(Ordering::Relaxed) + ); + } +} From ad250ba6a92a7b89d637286664607c88e89f7ce8 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Wed, 4 Jan 2023 12:03:11 +0800 Subject: [PATCH 063/115] Fix some tests and pre-handle and transport (#246) --- components/raftstore/src/store/snap.rs | 7 ++-- engine_store_ffi/src/observer.rs | 50 ++++++++++++++++++++++++-- new-mock-engine-store/src/node.rs | 16 ++++++++- proxy_server/src/proxy.rs | 5 +++ proxy_tests/proxy/fast_add_peer.rs | 14 ++++++-- src/server/raft_client.rs | 1 + 6 files changed, 84 insertions(+), 9 deletions(-) diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index d564bcb17e0..b360d2d20c3 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -420,7 +420,7 @@ impl CfFile { } #[derive(Default)] -struct MetaFile { +pub struct MetaFile { pub meta: Option, pub path: PathBuf, pub file: Option, @@ -436,8 +436,8 @@ pub struct Snapshot { cf_files: Vec, cf_index: usize, cf_file_index: usize, - meta_file: MetaFile, - hold_tmp_files: bool, + pub meta_file: MetaFile, + pub hold_tmp_files: bool, mgr: SnapManagerCore, } @@ -458,6 +458,7 @@ impl Snapshot { mgr: &SnapManagerCore, ) -> RaftStoreResult { let dir_path = dir.into(); + if !dir_path.exists() { file_system::create_dir_all(dir_path.as_path())?; } diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index c7677230699..f9a06f4808d 100644 --- 
a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -651,7 +651,7 @@ impl TiFlashObserver { ) -> RaftStoreResult { let inner_msg = msg.get_message(); // Build snapshot by get_snapshot_for_building - let (snap, key) = { + let (mut snap, key) = { // Find term of entry at applied_index. let applied_index = apply_state.get_applied_index(); let applied_term = @@ -668,6 +668,9 @@ impl TiFlashObserver { self.snap_mgr.register(key.clone(), SnapEntry::Generating); defer!(self.snap_mgr.deregister(&key, &SnapEntry::Generating)); let snapshot = self.snap_mgr.get_snapshot_for_building(&key)?; + for cf in snapshot.cf_files().iter() { + info!("!!!! snapshot cf_file of {} size {:?}", cf.cf, cf.size); + } (snapshot, key.clone()) }; @@ -686,10 +689,17 @@ impl TiFlashObserver { .ok_or(box_err!("can't find index for cf {}", cf)); let cf_index = cf_index?; let cf_file = &snap.cf_files()[cf_index]; + // Create fake file. let mut path = cf_file.path.clone(); path.push(cf_file.file_prefix.clone()); path.set_extension("sst"); - let mut _file = std::fs::File::create(path.as_path())?; + info!( + "!!!!! create snapshot data file {:?} {}", + path, snap.hold_tmp_files + ); + let mut f = std::fs::File::create(path.as_path())?; + f.flush()?; + f.sync_all()?; } snap_data.set_region(new_region.clone()); snap_data.set_file_size(0); @@ -704,11 +714,13 @@ impl TiFlashObserver { { let v = snapshot_meta.write_to_bytes()?; let mut f = std::fs::File::create(snap.meta_path())?; + info!("!!!!! create snapshot meta file {:?}", snap.meta_path()); f.write_all(&v[..])?; f.flush()?; f.sync_all()?; } snap_data.set_meta(snapshot_meta); + snap.hold_tmp_files = false; } pb_snapshot_metadata @@ -749,6 +761,8 @@ impl TiFlashObserver { .unwrap(); self.set_snapshot_inflight(region_id, current.as_millis()) .unwrap(); + // If we don't flush here, packet will lost. 
+ trans.flush(); } Err(RaftStoreError::RegionNotFound(_)) => (), _ => return Ok(crate::FastAddPeerStatus::OtherError), @@ -1422,9 +1436,10 @@ impl ApplySnapshotObserver for TiFlashOb snap_key: &store::SnapKey, snap: Option<&store::Snapshot>, ) { + let region_id = ob_ctx.region().get_id(); info!("pre apply snapshot"; "peer_id" => peer_id, - "region_id" => ob_ctx.region().get_id(), + "region_id" => region_id, "snap_key" => ?snap_key, "pending" => self.engine.pending_applies_count.load(Ordering::SeqCst), ); @@ -1444,6 +1459,31 @@ impl ApplySnapshotObserver for TiFlashOb return; }); + let mut should_skip = false; + #[allow(clippy::collapsible_if)] + if self.engine_store_cfg.enable_fast_add_peer { + if self.access_cached_region_info_mut( + region_id, + |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + let is_first_snapsot = !o.get().inited_or_fallback.load(Ordering::SeqCst); + if is_first_snapsot { + info!("fast path: prehandle first snapshot {}:{} {}, recover MsgAppend", self.store_id, region_id, peer_id; + "snap_key" => ?snap_key, + ); + should_skip = true; + } + } + MapEntry::Vacant(_) => { + // Compat no fast add peer logic + // panic!("unknown snapshot!"); + } + }, + ).is_err() { + fatal!("post_apply_snapshot poisoned") + }; + } + let (sender, receiver) = mpsc::channel(); let task = Arc::new(PrehandleTask::new(receiver, peer_id)); { @@ -1455,6 +1495,10 @@ impl ApplySnapshotObserver for TiFlashOb ctx.tracer.insert(snap_key.clone(), task.clone()); } + if should_skip { + return; + } + let engine_store_server_helper = self.engine_store_server_helper; let region = ob_ctx.region().clone(); let snap_key = snap_key.clone(); diff --git a/new-mock-engine-store/src/node.rs b/new-mock-engine-store/src/node.rs index e88b5a8acac..0a9a284a27a 100644 --- a/new-mock-engine-store/src/node.rs +++ b/new-mock-engine-store/src/node.rs @@ -83,15 +83,25 @@ impl Default for ChannelTransport { impl Transport for ChannelTransport { 
#[allow(clippy::significant_drop_in_scrutinee)] fn send(&mut self, msg: RaftMessage) -> Result<()> { - let from_store = msg.get_from_peer().get_store_id(); + let mut from_store = msg.get_from_peer().get_store_id(); let to_store = msg.get_to_peer().get_store_id(); let to_peer_id = msg.get_to_peer().get_id(); let region_id = msg.get_region_id(); let is_snapshot = msg.get_message().get_msg_type() == MessageType::MsgSnapshot; if is_snapshot { + let fake_self_snapshot = (|| { + fail::fail_point!("fast_add_peer_fake_snapshot", |t| { + let t = t.unwrap().parse::().unwrap(); + t + }); + 0 + })(); let snap = msg.get_message().get_snapshot(); let key = SnapKey::from_snap(snap).unwrap(); + if fake_self_snapshot == 1 { + from_store = to_store; + } let from = match self.core.lock().unwrap().snap_paths.get(&from_store) { Some(p) => { p.0.register(key.clone(), SnapEntry::Sending); @@ -99,6 +109,9 @@ impl Transport for ChannelTransport { } None => return Err(box_err!("missing temp dir for store {}", from_store)), }; + if fake_self_snapshot == 1 && !from.exists() { + panic!("non-exist snapshot"); + } let to = match self.core.lock().unwrap().snap_paths.get(&to_store) { Some(p) => { p.0.register(key.clone(), SnapEntry::Receiving); @@ -302,6 +315,7 @@ impl Simulator for NodeCluster { (snap_mgr.clone(), None) }; + debug!("snapshot_mgr path of {} is {:?}", node_id, snap_mgr_path); self.snap_mgrs.insert(node_id, snap_mgr.clone()); let importer = { diff --git a/proxy_server/src/proxy.rs b/proxy_server/src/proxy.rs index 731c16d92f3..c34c9ef54f9 100644 --- a/proxy_server/src/proxy.rs +++ b/proxy_server/src/proxy.rs @@ -59,6 +59,11 @@ pub fn gen_tikv_config( }, ) .unwrap_or_else(|e| { + eprintln!( + "invalid default auto generated configuration file {}, err {}", + path.display(), + e + ); error!( "invalid default auto generated configuration file {}, err {}", path.display(), diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 6c417916ac4..5beaa900582 
100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -21,6 +21,7 @@ fn basic_fast_add_peer() { let (mut cluster, pd_client) = new_mock_cluster(0, 2); cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; // fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + fail::cfg("fast_add_peer_fake_snapshot", "return(1)").unwrap(); fail::cfg("before_tiflash_check_double_write", "return").unwrap(); disable_auto_gen_compact_log(&mut cluster); // Disable auto generate peer. @@ -33,6 +34,9 @@ fn basic_fast_add_peer() { check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1, 2])); cluster.shutdown(); + fail::remove("fallback_to_slow_path_not_allow"); + fail::remove("fast_add_peer_fake_snapshot"); + fail::remove("before_tiflash_check_double_write"); } fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: PauseType) { @@ -167,12 +171,13 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: PauseT _ => (), }; - // Destroy peer + // Destroy peer, and then try re-add a new peer of the same region. pd_client.must_remove_peer(1, new_learner_peer(3, 3)); must_wait_until_cond_node(&cluster, 1, Some(vec![1]), &|states: &States| -> bool { find_peer_by_id(states.in_disk_region_state.get_region(), 3).is_none() }); std::thread::sleep(std::time::Duration::from_millis(1000)); + // Assert the peer removing succeeed. iter_ffi_helpers( &cluster, Some(vec![3]), @@ -186,7 +191,12 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: PauseT ); cluster.must_put(b"k5", b"v5"); // These failpoints make sure we will cause again a fast path. - fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + if source_type == SourceType::InvalidSource { + // If we still use InvalidSource, we still need to goto slow path. + } else { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + } + // Re-add peer in store. 
pd_client.must_add_peer(1, new_learner_peer(3, 4)); // Wait until Learner has applied ConfChange std::thread::sleep(std::time::Duration::from_millis(1000)); diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index fa12600bb98..b6b08cf4a7d 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -456,6 +456,7 @@ where Some(msg) => msg, None => return, }; + if msg.get_message().has_snapshot() { let mut snapshot = RaftSnapshotData::default(); snapshot From 075303bcd9b83a0d6a1f31971b434c680efa225e Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Thu, 5 Jan 2023 12:43:47 +0800 Subject: [PATCH 064/115] Introduce gc_raw_cpp_ptr_carr FFI, and fix early_skip (#251) --- engine_store_ffi/src/interfaces.rs | 9 +++- engine_store_ffi/src/lib.rs | 17 +++++++ engine_store_ffi/src/observer.rs | 49 +++++++++++-------- new-mock-engine-store/src/mock_store.rs | 27 ++++++++++ proxy_tests/proxy/ffi.rs | 30 ++++++++++-- .../ffi/src/RaftStoreProxyFFI/@version | 2 +- .../ffi/src/RaftStoreProxyFFI/ProxyFFI.h | 1 + 7 files changed, 109 insertions(+), 26 deletions(-) diff --git a/engine_store_ffi/src/interfaces.rs b/engine_store_ffi/src/interfaces.rs index 5a3acd21ad9..6b9d062c4fd 100644 --- a/engine_store_ffi/src/interfaces.rs +++ b/engine_store_ffi/src/interfaces.rs @@ -536,6 +536,13 @@ pub mod root { pub fn_gc_raw_cpp_ptr: ::std::option::Option< unsafe extern "C" fn(arg1: root::DB::RawVoidPtr, arg2: root::DB::RawCppPtrType), >, + pub fn_gc_raw_cpp_ptr_carr: ::std::option::Option< + unsafe extern "C" fn( + arg1: root::DB::RawVoidPtr, + arg2: root::DB::RawCppPtrType, + arg3: u64, + ), + >, pub fn_gc_special_raw_cpp_ptr: ::std::option::Option< unsafe extern "C" fn( arg1: root::DB::RawVoidPtr, @@ -578,7 +585,7 @@ pub mod root { ) -> root::DB::FastAddPeerRes, >, } - pub const RAFT_STORE_PROXY_VERSION: u64 = 4326611643816778519; + pub const RAFT_STORE_PROXY_VERSION: u64 = 14213283800760119223; pub const RAFT_STORE_PROXY_MAGIC_NUMBER: u32 = 324508639; } } diff 
--git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index 4cd8c58932c..eb393e7cf09 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -388,6 +388,11 @@ unsafe impl Send for RawCppPtrTuple {} impl Drop for RawCppPtrTuple { fn drop(&mut self) { + /// Note the layout is: + /// [0] RawCppPtr to T + /// [1] RawCppPtr to R + /// ... + /// [len-1] RawCppPtr to S unsafe { if !self.is_null() { let helper = get_engine_store_server_helper(); @@ -427,6 +432,11 @@ unsafe impl Send for RawCppPtrArr {} impl Drop for RawCppPtrArr { fn drop(&mut self) { + /// Note the layout is: + /// [0] RawVoidPtr to T + /// [1] RawVoidPtr + /// ... + /// [len-1] RawVoidPtr unsafe { if !self.is_null() { let helper = get_engine_store_server_helper(); @@ -505,6 +515,13 @@ impl EngineStoreServerHelper { } } + fn gc_raw_cpp_ptr_carr(&self, ptr: *mut ::std::os::raw::c_void, tp: RawCppPtrType, len: u64) { + debug_assert!(self.fn_gc_raw_cpp_ptr_carr.is_some()); + unsafe { + (self.fn_gc_raw_cpp_ptr_carr.into_inner())(ptr, tp, len); + } + } + fn gc_special_raw_cpp_ptr( &self, ptr: *mut ::std::os::raw::c_void, diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index f9a06f4808d..72830812db5 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -410,6 +410,26 @@ impl TiFlashObserver { let f = |info: MapEntry>| { match info { MapEntry::Occupied(mut o) => { + let last = o.get().snapshot_inflight.load(Ordering::SeqCst); + if last != 0 { + let current = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap(); + info!("fast path: ongoing {}:{} {}, MsgAppend duplicated", + self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "inner_msg" => ?inner_msg, + "is_replicated" => is_replicated, + "has_already_inited" => has_already_inited, + "is_first" => is_first, + "elapsed" => current.as_millis() - last, + ); + 
early_skip = true; + // We must return here to avoid changing `inited_or_fallback`. + // Otherwise will cause different value in pre/post_apply_snapshot. + return; + } (is_first, has_already_inited) = if !o.get().inited_or_fallback.load(Ordering::SeqCst) { // If `has_already_inited` is true: @@ -432,23 +452,6 @@ impl TiFlashObserver { }; // TODO include create is_replicated = o.get().replicated_or_created.load(Ordering::SeqCst); - let last = o.get().snapshot_inflight.load(Ordering::SeqCst); - if last != 0 { - let current = SystemTime::now() - .duration_since(SystemTime::UNIX_EPOCH) - .unwrap(); - info!("fast path: ongoing {}:{} {}, MsgAppend duplicated", - self.store_id, region_id, new_peer_id; - "to_peer_id" => msg.get_to_peer().get_id(), - "from_peer_id" => msg.get_from_peer().get_id(), - "inner_msg" => ?inner_msg, - "is_replicated" => is_replicated, - "has_already_inited" => has_already_inited, - "is_first" => is_first, - "elapsed" => current.as_millis() - last, - ); - early_skip = true; - } } MapEntry::Vacant(v) => { info!("fast path: ongoing {}:{} {}, first message", self.store_id, region_id, new_peer_id; @@ -462,6 +465,7 @@ impl TiFlashObserver { } }; + // Try not acquire write lock firstly. match self.get_inited_or_fallback(region_id) { Some(true) => { is_first = false; @@ -494,14 +498,17 @@ impl TiFlashObserver { } } - if !is_first { - return false; - } - + // If early_skip is true, we don't read the value of `is_first`. if early_skip { return true; } + if !is_first { + // Most cases, the region is already inited or fallback. + // Skip fast add peer. + return false; + } + { // Peer is not created by Peer::replicate, will cause RegionNotRegistered error, // see `check_msg`. 
diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index 790eafc1074..908e0ea7450 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -717,6 +717,7 @@ pub fn gen_engine_store_server_helper( fn_handle_http_request: None, fn_check_http_uri_available: None, fn_gc_raw_cpp_ptr: Some(ffi_gc_raw_cpp_ptr), + fn_gc_raw_cpp_ptr_carr: Some(ffi_gc_raw_cpp_ptr_carr), fn_gc_special_raw_cpp_ptr: Some(ffi_gc_special_raw_cpp_ptr), fn_get_config: None, fn_set_store: None, @@ -778,6 +779,7 @@ pub enum RawCppPtrTypeImpl { WakerNotifier = 12, PSWriteBatch = 13, PSUniversalPage = 14, + PSPageAndCppStr = 15, } impl From for ffi_interfaces::RawCppPtrType { @@ -993,6 +995,31 @@ extern "C" fn ffi_gc_raw_cpp_ptr( RawCppPtrTypeImpl::PSUniversalPage => unsafe { drop(Box::from_raw(ptr as *mut MockPSUniversalPage)); }, + _ => todo!(), + } +} + +extern "C" fn ffi_gc_raw_cpp_ptr_carr( + ptr: ffi_interfaces::RawVoidPtr, + tp: ffi_interfaces::RawCppPtrType, + len: u64, +) { + match tp.into() { + RawCppPtrTypeImpl::String => unsafe { + let p = Box::from_raw(std::slice::from_raw_parts_mut( + ptr as *mut RawVoidPtr, + len as usize, + )); + for i in 0..len { + let i = i as usize; + if !p[i].is_null() { + ffi_gc_raw_cpp_ptr(p[i], RawCppPtrTypeImpl::String.into()); + } + } + drop(p); + }, + RawCppPtrTypeImpl::PSPageAndCppStr => unsafe { todo!() }, + _ => todo!(), } } diff --git a/proxy_tests/proxy/ffi.rs b/proxy_tests/proxy/ffi.rs index 93f35037366..c9edca2a02f 100644 --- a/proxy_tests/proxy/ffi.rs +++ b/proxy_tests/proxy/ffi.rs @@ -30,9 +30,6 @@ fn test_tuple_of_raw_cpp_ptr() { inner: ptr_v, len: cap as u64, }; - for i in 0..cap { - let inner_i = cpp_ptr_tp.inner.add(i); - } drop(cpp_ptr_tp); } } @@ -64,3 +61,30 @@ fn test_array_of_raw_cpp_ptr() { drop(cpp_ptr_arr); } } + +#[test] +fn test_carray_of_raw_cpp_ptr() { + tikv_util::set_panic_hook(true, "./"); + unsafe { + init_global_ffi_helper_set(); + let helper = 
get_engine_store_server_helper(); + + const len: usize = 10; + let mut v: Vec = vec![]; + + for i in 0..len { + let s = format!("s{}", i); + let raw_cpp_ptr = (helper.fn_gen_cpp_string.into_inner())(s.as_bytes().into()); + let raw_void_ptr = raw_cpp_ptr.into_raw(); + v.push(raw_void_ptr); + } + + let (pv1, l, cap) = v.into_raw_parts(); + let pv1 = pv1 as RawVoidPtr; + (helper.fn_gc_raw_cpp_ptr_carr.into_inner())( + pv1, + RawCppPtrTypeImpl::String.into(), + cap as u64, + ); + } +} diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version index 10338141dea..5aacd28d46f 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version @@ -1,3 +1,3 @@ #pragma once #include -namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 4326611643816778519ull; } \ No newline at end of file +namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 14213283800760119223ull; } \ No newline at end of file diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h index e55bb786a4f..034683554ac 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h @@ -279,6 +279,7 @@ struct EngineStoreServerHelper { BaseBuffView body); uint8_t (*fn_check_http_uri_available)(BaseBuffView); void (*fn_gc_raw_cpp_ptr)(RawVoidPtr, RawCppPtrType); + void (*fn_gc_raw_cpp_ptr_carr)(RawVoidPtr, RawCppPtrType, uint64_t); void (*fn_gc_special_raw_cpp_ptr)(RawVoidPtr, uint64_t, SpecialCppPtrType); CppStrWithView (*fn_get_config)(EngineStoreServerWrap *, uint8_t full); void (*fn_set_store)(EngineStoreServerWrap *, BaseBuffView); From 4ef9ea07a8d93176afea97574fdcbab5d75d4081 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Thu, 5 Jan 2023 14:30:48 +0800 Subject: [PATCH 065/115] Use fn_gc_raw_cpp_ptr_carr for PageAndCppStrWithViewVec (#252) --- 
engine_store_ffi/src/interfaces.rs | 6 ++--- engine_store_ffi/src/lib.rs | 27 ++++++++----------- engine_store_ffi/src/observer.rs | 4 +-- .../src/mock_page_storage.rs | 7 ----- new-mock-engine-store/src/mock_store.rs | 1 - .../ffi/src/RaftStoreProxyFFI/@version | 2 +- .../ffi/src/RaftStoreProxyFFI/ProxyFFI.h | 3 +-- 7 files changed, 17 insertions(+), 33 deletions(-) diff --git a/engine_store_ffi/src/interfaces.rs b/engine_store_ffi/src/interfaces.rs index 6b9d062c4fd..97a3161b2e5 100644 --- a/engine_store_ffi/src/interfaces.rs +++ b/engine_store_ffi/src/interfaces.rs @@ -162,6 +162,7 @@ pub mod root { pub struct PageAndCppStrWithViewVec { pub inner: *mut root::DB::PageAndCppStrWithView, pub len: u64, + pub type_: root::DB::RawCppPtrType, } #[repr(C)] #[derive(Debug)] @@ -465,9 +466,6 @@ pub mod root { arg3: root::DB::BaseBuffView, ) -> root::DB::PageAndCppStrWithViewVec, >, - pub fn_gc_page_and_cpp_str_with_view_vec: ::std::option::Option< - unsafe extern "C" fn(arg1: *mut root::DB::PageAndCppStrWithView, arg2: u64), - >, pub fn_handle_purge_pagestorage: ::std::option::Option< unsafe extern "C" fn(arg1: *const root::DB::EngineStoreServerWrap), >, @@ -585,7 +583,7 @@ pub mod root { ) -> root::DB::FastAddPeerRes, >, } - pub const RAFT_STORE_PROXY_VERSION: u64 = 14213283800760119223; + pub const RAFT_STORE_PROXY_VERSION: u64 = 10253455389063462714; pub const RAFT_STORE_PROXY_MAGIC_NUMBER: u32 = 324508639; } } diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index eb393e7cf09..d908c52dce1 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -388,11 +388,11 @@ unsafe impl Send for RawCppPtrTuple {} impl Drop for RawCppPtrTuple { fn drop(&mut self) { - /// Note the layout is: - /// [0] RawCppPtr to T - /// [1] RawCppPtr to R - /// ... - /// [len-1] RawCppPtr to S + // Note the layout is: + // [0] RawCppPtr to T + // [1] RawCppPtr to R + // ... 
+ // [len-1] RawCppPtr to S unsafe { if !self.is_null() { let helper = get_engine_store_server_helper(); @@ -432,11 +432,11 @@ unsafe impl Send for RawCppPtrArr {} impl Drop for RawCppPtrArr { fn drop(&mut self) { - /// Note the layout is: - /// [0] RawVoidPtr to T - /// [1] RawVoidPtr - /// ... - /// [len-1] RawVoidPtr + // Note the layout is: + // [0] RawVoidPtr to T + // [1] RawVoidPtr + // ... + // [len-1] RawVoidPtr unsafe { if !self.is_null() { let helper = get_engine_store_server_helper(); @@ -469,7 +469,7 @@ impl Drop for PageAndCppStrWithViewVec { fn drop(&mut self) { if self.inner != std::ptr::null_mut() { let helper = get_engine_store_server_helper(); - helper.gc_page_and_cpp_str_with_view_vec(self.inner, self.len); + helper.gc_raw_cpp_ptr_carr(self.inner as RawVoidPtr, self.type_, self.len); self.inner = std::ptr::null_mut(); self.len = 0; } @@ -682,11 +682,6 @@ impl EngineStoreServerHelper { unsafe { (self.fn_handle_scan_page.into_inner())(self.inner, start_page_id, end_page_id) } } - pub fn gc_page_and_cpp_str_with_view_vec(&self, arg1: *mut PageAndCppStrWithView, arg2: u64) { - debug_assert!(self.fn_gc_page_and_cpp_str_with_view_vec.is_some()); - unsafe { (self.fn_gc_page_and_cpp_str_with_view_vec.into_inner())(arg1, arg2) } - } - pub fn purge_pagestorage(&self) { debug_assert!(self.fn_handle_purge_pagestorage.is_some()); unsafe { (self.fn_handle_purge_pagestorage.into_inner())(self.inner) } diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 72830812db5..5ed8e5de8f6 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -555,14 +555,14 @@ impl TiFlashObserver { let region_str = res.region.view.to_slice(); let mut apply_state = RaftApplyState::default(); let mut new_region = kvproto::metapb::Region::default(); - if let Err(e) = apply_state.merge_from_bytes(apply_state_str) { + if let Err(_e) = apply_state.merge_from_bytes(apply_state_str) { error!( "fast path: ongoing {}:{} {} 
failed. parse apply_state {:?}, fallback to normal", self.store_id, region_id, new_peer_id, res ); self.fallback_to_slow_path(region_id); } - if let Err(e) = new_region.merge_from_bytes(region_str) { + if let Err(_e) = new_region.merge_from_bytes(region_str) { error!( "fast path: ongoing {}:{} {} failed. parse region {:?}, fallback to normal", self.store_id, region_id, new_peer_id, res diff --git a/new-mock-engine-store/src/mock_page_storage.rs b/new-mock-engine-store/src/mock_page_storage.rs index e13ddd2c801..1aad90660b7 100644 --- a/new-mock-engine-store/src/mock_page_storage.rs +++ b/new-mock-engine-store/src/mock_page_storage.rs @@ -111,13 +111,6 @@ pub unsafe extern "C" fn ffi_mockps_handle_scan_page( todo!() } -pub unsafe extern "C" fn ffi_mockps_gc_page_and_cpp_str_with_view_vec( - arg1: *mut PageAndCppStrWithView, - arg2: u64, -) { - todo!() -} - pub unsafe extern "C" fn ffi_mockps_handle_purge_pagestorage( wrap: *const ffi_interfaces::EngineStoreServerWrap, ) { diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index 908e0ea7450..a3edbb5dfcb 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -733,7 +733,6 @@ pub fn gen_engine_store_server_helper( fn_write_batch_clear: Some(ffi_mockps_write_batch_clear), fn_consume_write_batch: Some(ffi_mockps_consume_write_batch), fn_handle_read_page: None, - fn_gc_page_and_cpp_str_with_view_vec: None, fn_handle_purge_pagestorage: None, fn_handle_scan_page: None, fn_handle_seek_ps_key: None, diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version index 5aacd28d46f..ae998a90559 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version @@ -1,3 +1,3 @@ #pragma once #include -namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 14213283800760119223ull; } \ No newline at end of file +namespace DB { 
constexpr uint64_t RAFT_STORE_PROXY_VERSION = 10253455389063462714ull; } \ No newline at end of file diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h index 034683554ac..c7c8fa353bb 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h @@ -107,6 +107,7 @@ struct PageAndCppStrWithView { struct PageAndCppStrWithViewVec { PageAndCppStrWithView *inner; const uint64_t len; + RawCppPtrType type; }; // An tuple of pointers, like `void **`, @@ -255,8 +256,6 @@ struct EngineStoreServerHelper { BaseBuffView); PageAndCppStrWithViewVec (*fn_handle_scan_page)(const EngineStoreServerWrap *, BaseBuffView, BaseBuffView); - void (*fn_gc_page_and_cpp_str_with_view_vec)(PageAndCppStrWithView *, - uint64_t); void (*fn_handle_purge_pagestorage)(const EngineStoreServerWrap *); CppStrWithView (*fn_handle_seek_ps_key)(const EngineStoreServerWrap *, BaseBuffView); From cdc2e486277d775b70f5db28a7b643ed2c3edbe1 Mon Sep 17 00:00:00 2001 From: Jay Date: Thu, 5 Jan 2023 14:38:20 +0800 Subject: [PATCH 066/115] raftstore-v2: only send clean snapshot (#14015) ref tikv/tikv#12842 When the tablet contains dirty data right after split, generating snapshot may just a waste. On the other hand, split usually happens on all peers, so delay it a bit actually makes all peers more likely to be initialized by split. So this PR rejects generating snapshot when it detects it still has dirty data. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/engine_panic/src/raft_engine.rs | 8 ++++ components/engine_rocks/src/raft_engine.rs | 13 ++++++ components/engine_traits/src/raft_engine.rs | 4 ++ components/raft_log_engine/src/engine.rs | 16 +++++++ components/raftstore-v2/src/fsm/peer.rs | 5 ++- .../src/operation/command/admin/split.rs | 45 +++++++++++++++---- .../raftstore-v2/src/operation/ready/mod.rs | 21 ++++++++- .../src/operation/ready/snapshot.rs | 25 ++++++++--- components/raftstore-v2/src/raft/storage.rs | 25 +++++++++++ components/raftstore-v2/src/router/message.rs | 3 ++ .../raftstore-v2/src/worker/tablet_gc.rs | 21 ++++++--- 11 files changed, 163 insertions(+), 23 deletions(-) diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index 854b75fe30d..c0539c1edd5 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -67,6 +67,10 @@ impl RaftEngineReadOnly for PanicEngine { panic!() } + fn get_dirty_mark(&self, raft_group_id: u64, tablet_index: u64) -> Result { + panic!() + } + fn get_recover_state(&self) -> Result> { panic!() } @@ -232,6 +236,10 @@ impl RaftLogBatch for PanicWriteBatch { panic!() } + fn put_dirty_mark(&mut self, raft_group_id: u64, tablet_index: u64, dirty: bool) -> Result<()> { + panic!() + } + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { panic!() } diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index d566ac3821b..a0a5acd5dd8 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -166,6 +166,10 @@ impl RaftEngineReadOnly for RocksEngine { panic!() } + fn get_dirty_mark(&self, _raft_group_id: u64, _tablet_index: u64) -> Result { + panic!() + } + fn get_recover_state(&self) -> Result> { self.get_msg_cf(CF_DEFAULT, keys::RECOVER_STATE_KEY) } @@ -439,6 +443,15 @@ impl RaftLogBatch for 
RocksWriteBatchVec { panic!() } + fn put_dirty_mark( + &mut self, + _raft_group_id: u64, + _tablet_index: u64, + _dirty: bool, + ) -> Result<()> { + panic!() + } + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { self.put_msg(keys::RECOVER_STATE_KEY, state) } diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index 68036eae1eb..671fed8b3cf 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -33,6 +33,7 @@ pub trait RaftEngineReadOnly: Sync + Send + 'static { ) -> Result>; /// Get the flushed index of the given CF. fn get_flushed_index(&self, raft_group_id: u64, cf: &str) -> Result>; + fn get_dirty_mark(&self, raft_group_id: u64, tablet_index: u64) -> Result; fn get_recover_state(&self) -> Result>; fn get_entry(&self, raft_group_id: u64, index: u64) -> Result>; @@ -201,6 +202,9 @@ pub trait RaftLogBatch: Send { apply_index: u64, ) -> Result<()>; + /// Mark a tablet may contain data that is not supposed to be in its range. + fn put_dirty_mark(&mut self, raft_group_id: u64, tablet_index: u64, dirty: bool) -> Result<()>; + /// Indicate whether region states should be recovered from raftdb and /// replay raft logs. /// When kvdb's write-ahead-log is disabled, the sequence number of the last diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 1ae148ba41c..3db865ed8ad 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -381,6 +381,7 @@ const REGION_STATE_KEY: &[u8] = &[0x03]; const APPLY_STATE_KEY: &[u8] = &[0x04]; const RECOVER_STATE_KEY: &[u8] = &[0x05]; const FLUSH_STATE_KEY: &[u8] = &[0x06]; +const DIRTY_MARK_KEY: &[u8] = &[0x07]; // All keys are of the same length. 
const KEY_PREFIX_LEN: usize = RAFT_LOG_STATE_KEY.len(); @@ -475,6 +476,16 @@ impl RaftLogBatchTrait for RaftLogBatch { Ok(()) } + fn put_dirty_mark(&mut self, raft_group_id: u64, tablet_index: u64, dirty: bool) -> Result<()> { + let key = encode_key(DIRTY_MARK_KEY, tablet_index); + if dirty { + self.0.put(raft_group_id, key.to_vec(), vec![]); + } else { + self.0.delete(raft_group_id, key.to_vec()); + } + Ok(()) + } + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { self.0 .put_message(STORE_STATE_ID, RECOVER_STATE_KEY.to_vec(), state) @@ -601,6 +612,11 @@ impl RaftEngineReadOnly for RaftLogEngine { Ok(index) } + fn get_dirty_mark(&self, raft_group_id: u64, tablet_index: u64) -> Result { + let key = encode_key(DIRTY_MARK_KEY, tablet_index); + Ok(self.0.get(raft_group_id, &key).is_some()) + } + fn get_recover_state(&self) -> Result> { self.0 .get_message(STORE_STATE_ID, RECOVER_STATE_KEY) diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 8b05435246b..c05b58d0839 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -191,7 +191,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } fn on_start(&mut self) { - if !self.fsm.peer.maybe_pause_for_recovery() { + if !self.fsm.peer.maybe_pause_for_recovery(self.store_ctx) { self.schedule_tick(PeerTick::Raft); } self.schedule_tick(PeerTick::SplitRegionCheck); @@ -308,6 +308,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, .on_request_split(self.store_ctx, request, ch) } PeerMsg::ForceCompactLog => self.on_compact_log_tick(true), + PeerMsg::TabletTrimmed { tablet_index } => { + self.fsm.peer_mut().on_tablet_trimmed(tablet_index) + } #[cfg(feature = "testexport")] PeerMsg::WaitFlush(ch) => self.fsm.peer_mut().on_wait_flush(ch), } diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs 
b/components/raftstore-v2/src/operation/command/admin/split.rs index 23fc6e3a8d9..71c1e095d8c 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -471,12 +471,18 @@ impl Peer { self.split_flow_control_mut().may_skip_split_check = false; self.add_pending_tick(PeerTick::SplitRegionCheck); } + self.storage_mut().set_has_dirty_data(true); + let mailbox = store_ctx.router.mailbox(self.region_id()).unwrap(); + let tablet_index = res.tablet_index; let _ = store_ctx .schedulers .tablet_gc .schedule(tablet_gc::Task::trim( self.tablet().unwrap().clone(), derived, + move || { + let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); + }, )); let last_region_id = res.regions.last().unwrap().get_id(); @@ -521,6 +527,9 @@ impl Peer { self.state_changes_mut() .put_region_state(region_id, res.tablet_index, ®ion_state) .unwrap(); + self.state_changes_mut() + .put_dirty_mark(region_id, res.tablet_index, true) + .unwrap(); self.set_has_extra_write(); } @@ -574,13 +583,21 @@ impl Peer { store_ctx: &mut StoreContext, split_init: Box, ) { - let _ = store_ctx - .schedulers - .tablet_gc - .schedule(tablet_gc::Task::trim( - self.tablet().unwrap().clone(), - self.region(), - )); + let region_id = self.region_id(); + if self.storage().has_dirty_data() { + let tablet_index = self.storage().tablet_index(); + let mailbox = store_ctx.router.mailbox(region_id).unwrap(); + let _ = store_ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::trim( + self.tablet().unwrap().clone(), + self.region(), + move || { + let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); + }, + )); + } if split_init.source_leader && self.leader_id() == INVALID_ID && self.term() == RAFT_INIT_LOG_TERM @@ -593,7 +610,6 @@ impl Peer { // reduce client query miss. 
self.region_heartbeat_pd(store_ctx); } - let region_id = self.region_id(); if split_init.check_split { self.add_pending_tick(PeerTick::SplitRegionCheck); @@ -633,6 +649,19 @@ impl Peer { self.set_has_extra_write(); } } + + pub fn on_tablet_trimmed(&mut self, tablet_index: u64) { + info!(self.logger, "tablet is trimmed"; "tablet_index" => tablet_index); + let region_id = self.region_id(); + let changes = self.state_changes_mut(); + changes + .put_dirty_mark(region_id, tablet_index, false) + .unwrap(); + self.set_has_extra_write(); + if self.storage().tablet_index() == tablet_index { + self.storage_mut().set_has_dirty_data(false); + } + } } #[cfg(test)] diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 29452533632..3ac500b7f49 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -50,6 +50,7 @@ use crate::{ fsm::{PeerFsmDelegate, Store}, raft::{Peer, Storage}, router::{ApplyTask, PeerMsg, PeerTick}, + worker::tablet_gc, }; const PAUSE_FOR_RECOVERY_GAP: u64 = 128; @@ -80,7 +81,25 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } impl Peer { - pub fn maybe_pause_for_recovery(&mut self) -> bool { + pub fn maybe_pause_for_recovery(&mut self, store_ctx: &mut StoreContext) -> bool { + // The task needs to be scheduled even if the tablet may be replaced during + // recovery. Otherwise if there are merges during recovery, the FSM may + // be paused forever. 
+ if self.storage().has_dirty_data() { + let region_id = self.region_id(); + let mailbox = store_ctx.router.mailbox(region_id).unwrap(); + let tablet_index = self.storage().tablet_index(); + let _ = store_ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::trim( + self.tablet().unwrap().clone(), + self.region(), + move || { + let _ = mailbox.force_send(PeerMsg::TabletTrimmed { tablet_index }); + }, + )); + } let entry_storage = self.storage().entry_storage(); let committed_index = entry_storage.commit_index(); let applied_index = entry_storage.applied_index(); diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 8716f0c75ea..1919ce269a6 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -345,12 +345,23 @@ impl Storage { }; } - info!( - self.logger(), - "requesting snapshot"; - "request_index" => request_index, - "request_peer" => to, - ); + if self.has_dirty_data() { + info!(self.logger(), "delay generating snapshot as there are still dirty data"; "request_index" => request_index, "request_peer" => to); + // It's OK to delay. If there are still dirty data, it means the tablet is just + // split. In normal cases, all peers will apply split, so reject generates + // snapshot may actually good for all peers as they are more likely + // to be initialized by split. + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); + } else { + info!( + self.logger(), + "requesting snapshot"; + "request_index" => request_index, + "request_peer" => to, + ); + } let canceled = Arc::new(AtomicBool::new(false)); let index = Arc::new(AtomicU64::new(0)); let mut gen_snap_task = self.gen_snap_task_mut(); @@ -586,6 +597,8 @@ impl Storage { let (path, clean_split) = match self.split_init_mut() { // If index not match, the peer may accept a newer snapshot after split. 
Some(init) if init.scheduled && last_index == RAFT_INIT_LOG_INDEX => { + lb.put_dirty_mark(region_id, last_index, true).unwrap(); + self.set_has_dirty_data(true); (temp_split_path(®, region_id), false) } si => ( diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index b0eec5a196c..aca8f0fafce 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -35,6 +35,9 @@ pub struct Storage { /// by messages, it has not persisted any states, we need to persist them /// at least once dispite whether the state changes since create. ever_persisted: bool, + /// It may have dirty data after split. Use a flag to indicate whether it + /// has finished clean up. + has_dirty_data: bool, logger: Logger, /// Snapshot part. @@ -116,6 +119,16 @@ impl Storage { pub fn apply_trace(&self) -> &ApplyTrace { &self.apply_trace } + + #[inline] + pub fn set_has_dirty_data(&mut self, has_dirty_data: bool) { + self.has_dirty_data = has_dirty_data; + } + + #[inline] + pub fn has_dirty_data(&self) -> bool { + self.has_dirty_data + } } impl Storage { @@ -139,6 +152,17 @@ impl Storage { }; let region = region_state.get_region(); let logger = logger.new(o!("region_id" => region.id, "peer_id" => peer.get_id())); + let has_dirty_data = + match engine.get_dirty_mark(region.get_id(), region_state.get_tablet_index()) { + Ok(b) => b, + Err(e) => { + return Err(box_err!( + "failed to get dirty mark for {}: {:?}", + region.get_id(), + e + )); + } + }; let entry_storage = EntryStorage::new( peer.get_id(), engine, @@ -153,6 +177,7 @@ impl Storage { peer: peer.clone(), region_state, ever_persisted: persisted, + has_dirty_data, logger, snap_states: RefCell::new(HashMap::default()), gen_snap_task: RefCell::new(Box::new(None)), diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 930de5ff036..353e17b0cb0 100644 --- 
a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -182,6 +182,9 @@ pub enum PeerMsg { ch: CmdResChannel, }, ForceCompactLog, + TabletTrimmed { + tablet_index: u64, + }, /// A message that used to check if a flush is happened. #[cfg(feature = "testexport")] WaitFlush(super::FlushChannel), diff --git a/components/raftstore-v2/src/worker/tablet_gc.rs b/components/raftstore-v2/src/worker/tablet_gc.rs index aba477f883f..d4593223db3 100644 --- a/components/raftstore-v2/src/worker/tablet_gc.rs +++ b/components/raftstore-v2/src/worker/tablet_gc.rs @@ -17,6 +17,7 @@ pub enum Task { tablet: EK, start_key: Box<[u8]>, end_key: Box<[u8]>, + cb: Box, }, PrepareDestroy { tablet: EK, @@ -31,11 +32,9 @@ pub enum Task { impl Display for Task { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - match *self { + match self { Task::Trim { - ref start_key, - ref end_key, - .. + start_key, end_key, .. } => write!( f, "trim tablet for start_key {}, end_key {}", @@ -65,11 +64,12 @@ impl Display for Task { impl Task { #[inline] - pub fn trim(tablet: EK, region: &Region) -> Self { + pub fn trim(tablet: EK, region: &Region, cb: impl FnOnce() + Send + 'static) -> Self { Task::Trim { tablet, start_key: region.get_start_key().into(), end_key: region.get_end_key().into(), + cb: Box::new(cb), } } @@ -110,7 +110,12 @@ impl Runner { } } - fn trim(tablet: &EK, start_key: &[u8], end_key: &[u8]) -> engine_traits::Result<()> { + fn trim( + tablet: &EK, + start_key: &[u8], + end_key: &[u8], + cb: Box, + ) -> engine_traits::Result<()> { let start_key = keys::data_key(start_key); let end_key = keys::data_end_key(end_key); let range1 = Range::new(&[], &start_key); @@ -121,6 +126,7 @@ impl Runner { for r in [range1, range2] { tablet.compact_range(Some(r.start_key), Some(r.end_key), false, 1)?; } + cb(); Ok(()) } @@ -195,8 +201,9 @@ where tablet, start_key, end_key, + cb, } => { - if let Err(e) = Self::trim(&tablet, &start_key, &end_key) { + if let 
Err(e) = Self::trim(&tablet, &start_key, &end_key, cb) { error!( self.logger, "failed to trim tablet"; From df3ee59d3d134e2ef5d8e5ec90d36d218b86e4a4 Mon Sep 17 00:00:00 2001 From: Jay Date: Thu, 5 Jan 2023 17:24:20 +0800 Subject: [PATCH 067/115] raftstore-v2: update region size after split check (#14019) ref tikv/tikv#12842 Signed-off-by: Jay Lee --- components/raftstore-v2/src/fsm/peer.rs | 7 +++ .../src/operation/command/admin/split.rs | 52 ++++++++++++++++++- components/raftstore-v2/src/operation/pd.rs | 6 +-- components/raftstore-v2/src/router/imp.rs | 8 +-- components/raftstore-v2/src/router/message.rs | 7 +++ 5 files changed, 71 insertions(+), 9 deletions(-) diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index c05b58d0839..fee1a00993b 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -307,6 +307,13 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, .peer_mut() .on_request_split(self.store_ctx, request, ch) } + PeerMsg::UpdateRegionSize { size } => { + self.fsm.peer_mut().on_update_region_size(size) + } + PeerMsg::UpdateRegionKeys { keys } => { + self.fsm.peer_mut().on_update_region_keys(keys) + } + PeerMsg::ClearRegionSize => self.fsm.peer_mut().on_clear_region_size(), PeerMsg::ForceCompactLog => self.on_compact_log_tick(true), PeerMsg::TabletTrimmed { tablet_index } => { self.fsm.peer_mut().on_tablet_trimmed(tablet_index) diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 71c1e095d8c..f63f1f2ae17 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -74,7 +74,7 @@ pub struct SplitResult { // The index of the derived region in `regions` pub derived_index: usize, pub tablet_index: u64, - // Hack: in common case we should use generic, but split is an 
unfrequent + // Hack: in common case we should use generic, but split is an infrequent // event that performance is not critical. And using `Any` can avoid polluting // all existing code. tablet: Box, @@ -91,6 +91,8 @@ pub struct SplitInit { /// In-memory pessimistic locks that should be inherited from parent region pub locks: PeerPessimisticLocks, + approximate_size: Option, + approximate_keys: Option, } impl SplitInit { @@ -123,6 +125,20 @@ pub struct SplitFlowControl { size_diff_hint: i64, skip_split_count: u64, may_skip_split_check: bool, + approximate_size: Option, + approximate_keys: Option, +} + +impl SplitFlowControl { + #[inline] + pub fn approximate_size(&self) -> Option { + self.approximate_size + } + + #[inline] + pub fn approximate_keys(&self) -> Option { + self.approximate_keys + } } pub fn temp_split_path(registry: &TabletRegistry, region_id: u64) -> PathBuf { @@ -173,6 +189,25 @@ impl Peer { false } + pub fn on_update_region_size(&mut self, size: u64) { + self.split_flow_control_mut().approximate_size = Some(size); + self.add_pending_tick(PeerTick::SplitRegionCheck); + self.add_pending_tick(PeerTick::PdHeartbeat); + } + + pub fn on_update_region_keys(&mut self, keys: u64) { + self.split_flow_control_mut().approximate_keys = Some(keys); + self.add_pending_tick(PeerTick::SplitRegionCheck); + self.add_pending_tick(PeerTick::PdHeartbeat); + } + + pub fn on_clear_region_size(&mut self) { + let control = self.split_flow_control_mut(); + control.approximate_size.take(); + control.approximate_keys.take(); + self.add_pending_tick(PeerTick::SplitRegionCheck); + } + pub fn update_split_flow_control(&mut self, metrics: &ApplyMetrics) { let control = self.split_flow_control_mut(); control.size_diff_hint += metrics.size_diff_hint; @@ -454,6 +489,11 @@ impl Peer { self.record_tombstone_tablet(store_ctx, tablet, res.tablet_index); } + let new_region_count = res.regions.len() as u64; + let control = self.split_flow_control_mut(); + let estimated_size = 
control.approximate_size.map(|v| v / new_region_count); + let estimated_keys = control.approximate_keys.map(|v| v / new_region_count); + self.post_split(); if self.is_leader() { @@ -468,7 +508,10 @@ impl Peer { // so we send it independently here. self.report_batch_split_pd(store_ctx, res.regions.to_vec()); // After split, the peer may need to update its metrics. - self.split_flow_control_mut().may_skip_split_check = false; + let control = self.split_flow_control_mut(); + control.may_skip_split_check = false; + control.approximate_size = estimated_size; + control.approximate_keys = estimated_keys; self.add_pending_tick(PeerTick::SplitRegionCheck); } self.storage_mut().set_has_dirty_data(true); @@ -500,6 +543,8 @@ impl Peer { source_id: region_id, check_split: last_region_id == new_region_id, scheduled: false, + approximate_size: estimated_size, + approximate_keys: estimated_keys, locks, })); @@ -606,6 +651,9 @@ impl Peer { self.set_has_ready(); *self.txn_ext().pessimistic_locks.write() = split_init.locks; + let control = self.split_flow_control_mut(); + control.approximate_size = split_init.approximate_size; + control.approximate_keys = split_init.approximate_keys; // The new peer is likely to become leader, send a heartbeat immediately to // reduce client query miss. 
self.region_heartbeat_pd(store_ctx); diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 50b612f207d..d80258f14b1 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -77,7 +77,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, impl Peer { #[inline] - pub fn region_heartbeat_pd(&self, ctx: &StoreContext) { + pub fn region_heartbeat_pd(&mut self, ctx: &StoreContext) { let task = pd::Task::RegionHeartbeat(pd::RegionHeartbeatTask { term: self.term(), region: self.region().clone(), @@ -86,8 +86,8 @@ impl Peer { pending_peers: self.collect_pending_peers(ctx), written_bytes: self.self_stat().written_bytes, written_keys: self.self_stat().written_keys, - approximate_size: None, - approximate_keys: None, + approximate_size: self.split_flow_control_mut().approximate_size(), + approximate_keys: self.split_flow_control_mut().approximate_keys(), wait_data_peers: Vec::new(), }); if let Err(e) = ctx.schedulers.pd.schedule(task) { diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 7a10c6c6b16..315f8a0d8eb 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -33,12 +33,12 @@ impl AsyncReadNotifier for StoreRouter { } impl raftstore::coprocessor::StoreHandle for StoreRouter { - fn update_approximate_size(&self, _region_id: u64, _size: u64) { - // TODO + fn update_approximate_size(&self, region_id: u64, size: u64) { + let _ = self.send(region_id, PeerMsg::UpdateRegionSize { size }); } - fn update_approximate_keys(&self, _region_id: u64, _keys: u64) { - // TODO + fn update_approximate_keys(&self, region_id: u64, keys: u64) { + let _ = self.send(region_id, PeerMsg::UpdateRegionKeys { keys }); } fn ask_split( diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 
353e17b0cb0..c1e5f0d37dc 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -181,6 +181,13 @@ pub enum PeerMsg { request: RequestSplit, ch: CmdResChannel, }, + UpdateRegionSize { + size: u64, + }, + UpdateRegionKeys { + keys: u64, + }, + ClearRegionSize, ForceCompactLog, TabletTrimmed { tablet_index: u64, From cc9e69b925020e58b786bb811f1bcdba05a7c09f Mon Sep 17 00:00:00 2001 From: buffer <1045931706@qq.com> Date: Fri, 6 Jan 2023 14:20:21 +0800 Subject: [PATCH 068/115] raftstore-v2: store heartbeat add kv size and snap size (#14016) ref tikv/tikv#12842 1. store heartbeat should add snapshot and kv engine used size Signed-off-by: bufferflies <1045931706@qq.com> Co-authored-by: Xinye Tao --- components/raftstore-v2/src/batch/store.rs | 1 + components/raftstore-v2/src/operation/pd.rs | 4 +--- components/raftstore-v2/src/worker/pd/mod.rs | 6 +++++- .../raftstore-v2/src/worker/pd/store_heartbeat.rs | 11 ++++++++--- .../tests/integrations/test_pd_heartbeat.rs | 1 + 5 files changed, 16 insertions(+), 7 deletions(-) diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index e25ad53df8b..621f826619b 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -551,6 +551,7 @@ impl StoreSystem { pd_client, raft_engine.clone(), tablet_registry.clone(), + snap_mgr.clone(), router.clone(), workers.pd.remote(), concurrency_manager, diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index d80258f14b1..26945a3e176 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -50,9 +50,7 @@ impl Store { stats.set_bytes_written(0); stats.set_keys_written(0); stats.set_is_busy(false); - - // stats.set_query_stats(query_stats); - + // TODO: add query stats let task = pd::Task::StoreHeartbeat { stats }; if let Err(e) = 
ctx.schedulers.pd.schedule(task) { error!(self.logger(), "notify pd failed"; diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index bfcf3389754..b54d088db66 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -12,7 +12,8 @@ use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{metapb, pdpb}; use pd_client::PdClient; use raftstore::store::{ - util::KeysInfoFormatter, Config, FlowStatsReporter, ReadStats, TxnExt, WriteStats, + util::KeysInfoFormatter, Config, FlowStatsReporter, ReadStats, TabletSnapManager, TxnExt, + WriteStats, }; use slog::{error, info, Logger}; use tikv_util::{ @@ -105,6 +106,7 @@ where pd_client: Arc, raft_engine: ER, tablet_registry: TabletRegistry, + snap_mgr: TabletSnapManager, router: StoreRouter, remote: Remote, @@ -139,6 +141,7 @@ where pd_client: Arc, raft_engine: ER, tablet_registry: TabletRegistry, + snap_mgr: TabletSnapManager, router: StoreRouter, remote: Remote, concurrency_manager: ConcurrencyManager, @@ -152,6 +155,7 @@ where pd_client, raft_engine, tablet_registry, + snap_mgr, router, remote, region_peers: HashMap::default(), diff --git a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs index 22bee3cbf26..ba75354c753 100644 --- a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs +++ b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs @@ -277,9 +277,14 @@ where } else { std::cmp::min(disk_cap, self.cfg.value().capacity.0) }; - // TODO: accurate snapshot size and kv engines size. 
- let snap_size = 0; - let kv_size = 0; + let mut kv_size = 0; + self.tablet_registry.for_each_opened_tablet(|_, cached| { + if let Some(tablet) = cached.latest() { + kv_size += tablet.get_engine_used_size().unwrap_or(0); + } + true + }); + let snap_size = self.snap_mgr.total_snap_size().unwrap(); let used_size = snap_size + kv_size + self diff --git a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs index 96bcbbccf7a..09ead81c0c2 100644 --- a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs +++ b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs @@ -52,6 +52,7 @@ fn test_store_heartbeat() { let stats = block_on(cluster.node(0).pd_client().get_store_stats_async(store_id)).unwrap(); if stats.get_start_time() > 0 { assert_ne!(stats.get_capacity(), 0); + assert_ne!(stats.get_used_size(), 0); return; } std::thread::sleep(std::time::Duration::from_millis(50)); From c71fdfc49414005c4630e357e1ab6418ddf104f7 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Fri, 6 Jan 2023 17:52:22 +0800 Subject: [PATCH 069/115] log-backup: limit inflight raft msg from pitr (#13976) close tikv/tikv#13977 Signed-off-by: tabokie --- .../src/worker/pd/update_max_timestamp.rs | 3 - components/sst_importer/src/sst_importer.rs | 2 +- src/import/mod.rs | 2 +- src/import/sst_service.rs | 626 ++++++++---------- 4 files changed, 295 insertions(+), 338 deletions(-) diff --git a/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs b/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs index 0de3fb9a87c..178d00ebd15 100644 --- a/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs +++ b/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs @@ -93,13 +93,10 @@ where } }; - #[cfg(feature = "failpoints")] let delay = (|| { fail::fail_point!("delay_update_max_ts", |_| true); false })(); - #[cfg(not(feature = "failpoints"))] - let delay = false; if delay { 
info!(self.logger, "[failpoint] delay update max ts for 1s"; "region_id" => region_id); diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 3e06eb76899..8b6d64f483f 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -763,7 +763,7 @@ impl SstImporter { start_ts: u64, restore_ts: u64, file_buff: Arc>, - build_fn: &mut dyn FnMut(Vec, Vec), + mut build_fn: impl FnMut(Vec, Vec), ) -> Result> { let mut event_iter = EventIterator::new(file_buff.as_slice()); let mut smallest_key = None; diff --git a/src/import/mod.rs b/src/import/mod.rs index d3a522ede5e..e2fa3729e52 100644 --- a/src/import/mod.rs +++ b/src/import/mod.rs @@ -29,7 +29,7 @@ pub fn make_rpc_error(err: E) -> RpcStatus { #[macro_export] macro_rules! send_rpc_response { - ($res:ident, $sink:ident, $label:ident, $timer:ident) => {{ + ($res:expr, $sink:ident, $label:ident, $timer:ident) => {{ let res = match $res { Ok(resp) => { IMPORT_RPC_DURATION diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 8ce6f9961fb..ea52cad0095 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -1,7 +1,7 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - collections::HashMap, + collections::{HashMap, VecDeque}, future::Future, path::PathBuf, sync::{Arc, Mutex}, @@ -11,7 +11,7 @@ use std::{ use collections::HashSet; use engine_traits::{KvEngine, CF_DEFAULT, CF_WRITE}; use file_system::{set_io_type, IoType}; -use futures::{future::join_all, sink::SinkExt, stream::TryStreamExt, TryFutureExt}; +use futures::{sink::SinkExt, stream::TryStreamExt, TryFutureExt}; use futures_executor::{ThreadPool, ThreadPoolBuilder}; use grpcio::{ ClientStreamingSink, RequestStream, RpcContext, ServerStreamingSink, UnarySink, WriteFlags, @@ -19,9 +19,12 @@ use grpcio::{ use kvproto::{ encryptionpb::EncryptionMethod, errorpb, - import_sstpb::{RawWriteRequest_oneof_chunk as RawChunk, WriteRequest_oneof_chunk as Chunk, *}, + import_sstpb::{ + Error as ImportPbError, ImportSst, Range, RawWriteRequest_oneof_chunk as RawChunk, SstMeta, + SwitchMode, WriteRequest_oneof_chunk as Chunk, *, + }, kvrpcpb::Context, - raft_cmdpb::*, + raft_cmdpb::{CmdType, DeleteRequest, PutRequest, RaftCmdRequest, RaftRequestHeader, Request}, }; use protobuf::Message; use raftstore::{ @@ -44,6 +47,8 @@ use txn_types::{Key, WriteRef, WriteType}; use super::make_rpc_error; use crate::{import::duplicate_detect::DuplicateDetector, server::CONFIG_ROCKSDB_GAUGE}; +const MAX_INFLIGHT_RAFT_MSGS: usize = 64; + /// ImportSstService provides tikv-server with the ability to ingest SST files. /// /// It saves the SST sent from client to a file and then sends a command to @@ -74,6 +79,161 @@ pub struct SnapshotResult { term: u64, } +struct RequestCollector { + context: Context, + max_raft_req_size: usize, + /// Retain the last ts of each key in each request. + /// This is used for write CF because resolved ts observer hates duplicated + /// key in the same request. + write_reqs: HashMap, (Request, u64)>, + /// Collector favor that simple collect all items, and it do not contains + /// duplicated key-value. This is used for default CF. 
+ default_reqs: HashMap, Request>, + /// Size of all `Request`s. + unpacked_size: usize, + + pending_raft_reqs: Vec, +} + +impl RequestCollector { + fn new(context: Context, max_raft_req_size: usize) -> Self { + Self { + context, + max_raft_req_size, + write_reqs: HashMap::default(), + default_reqs: HashMap::default(), + unpacked_size: 0, + pending_raft_reqs: Vec::new(), + } + } + + fn accept_kv(&mut self, cf: &str, is_delete: bool, k: Vec, v: Vec) { + // Need to skip the empty key/value that could break the transaction or cause + // data corruption. see details at https://github.com/pingcap/tiflow/issues/5468. + if k.is_empty() || (!is_delete && v.is_empty()) { + return; + } + let mut req = Request::default(); + if is_delete { + let mut del = DeleteRequest::default(); + del.set_key(k); + del.set_cf(cf.to_string()); + req.set_cmd_type(CmdType::Delete); + req.set_delete(del); + } else { + if cf == CF_WRITE && !write_needs_restore(&v) { + return; + } + + let mut put = PutRequest::default(); + put.set_key(k); + put.set_value(v); + put.set_cf(cf.to_string()); + req.set_cmd_type(CmdType::Put); + req.set_put(put); + } + self.accept(cf, req); + } + + // we need to remove duplicate keys in here, since + // in https://github.com/tikv/tikv/blob/a401f78bc86f7e6ea6a55ad9f453ae31be835b55/components/resolved_ts/src/cmd.rs#L204 + // will panic if found duplicated entry during Vec. 
+ fn accept(&mut self, cf: &str, req: Request) { + let k = key_from_request(&req); + match cf { + CF_WRITE => { + let (encoded_key, ts) = match Key::split_on_ts_for(k) { + Ok(k) => k, + Err(err) => { + warn!( + "key without ts, skipping"; + "key" => %log_wrappers::Value::key(k), + "err" => %err + ); + return; + } + }; + if self + .write_reqs + .get(encoded_key) + .map(|(_, old_ts)| *old_ts < ts.into_inner()) + .unwrap_or(true) + { + self.unpacked_size += req.compute_size() as usize; + if let Some((v, _)) = self + .write_reqs + .insert(encoded_key.to_owned(), (req, ts.into_inner())) + { + self.unpacked_size -= v.get_cached_size() as usize; + } + } + } + CF_DEFAULT => { + self.unpacked_size += req.compute_size() as usize; + if let Some(v) = self.default_reqs.insert(k.to_owned(), req) { + self.unpacked_size -= v.get_cached_size() as usize; + } + } + _ => unreachable!(), + } + + if self.unpacked_size >= self.max_raft_req_size { + self.pack_all(); + } + } + + #[cfg(test)] + fn drain_unpacked_reqs(&mut self, cf: &str) -> Vec { + let res: Vec = if cf == CF_DEFAULT { + self.default_reqs.drain().map(|(_, req)| req).collect() + } else { + self.write_reqs.drain().map(|(_, (req, _))| req).collect() + }; + for r in &res { + self.unpacked_size -= r.get_cached_size() as usize; + } + res + } + + #[inline] + fn drain_raft_reqs(&mut self, take_unpacked: bool) -> std::vec::Drain<'_, RaftCmdRequest> { + if take_unpacked { + self.pack_all(); + } + self.pending_raft_reqs.drain(..) + } + + fn pack_all(&mut self) { + if self.unpacked_size == 0 { + return; + } + let mut cmd = RaftCmdRequest::default(); + let mut header = make_request_header(self.context.clone()); + // Set the UUID of header to prevent raftstore batching our requests. + // The current `resolved_ts` observer assumes that each batch of request doesn't + // has two writes to the same key. (Even with 2 different TS). That was true + // for normal cases because the latches reject concurrency write to keys. 
+ // However we have bypassed the latch layer :( + header.set_uuid(uuid::Uuid::new_v4().as_bytes().to_vec()); + cmd.set_header(header); + let mut reqs: Vec<_> = self.write_reqs.drain().map(|(_, (req, _))| req).collect(); + reqs.append(&mut self.default_reqs.drain().map(|(_, req)| req).collect()); + if reqs.is_empty() { + debug_assert!(false, "attempt to pack an empty request"); + return; + } + cmd.set_requests(reqs.into()); + + self.pending_raft_reqs.push(cmd); + self.unpacked_size = 0; + } + + #[inline] + fn is_empty(&self) -> bool { + self.pending_raft_reqs.is_empty() && self.unpacked_size == 0 + } +} + impl ImportSstService where E: KvEngine, @@ -281,6 +441,101 @@ where Ok(resp) } } + + async fn apply_imp( + mut req: ApplyRequest, + importer: Arc, + router: Router, + limiter: Limiter, + max_raft_size: usize, + ) -> std::result::Result, ImportPbError> { + type RaftWriteFuture = futures::channel::oneshot::Receiver; + async fn handle_raft_write(fut: RaftWriteFuture) -> std::result::Result<(), ImportPbError> { + match fut.await { + Err(e) => { + let msg = format!("failed to complete raft command: {}", e); + let mut e = ImportPbError::default(); + e.set_message(msg); + return Err(e); + } + Ok(mut r) if r.response.get_header().has_error() => { + let mut e = ImportPbError::default(); + e.set_message("failed to complete raft command".to_string()); + e.set_store_error(r.response.take_header().take_error()); + return Err(e); + } + _ => {} + } + Ok(()) + } + + let mut range: Option = None; + + let mut collector = RequestCollector::new(req.take_context(), max_raft_size * 7 / 8); + let mut metas = req.take_metas(); + let mut rules = req.take_rewrite_rules(); + // For compatibility with old requests. 
+ if req.has_meta() { + metas.push(req.take_meta()); + rules.push(req.take_rewrite_rule()); + } + let ext_storage = importer.wrap_kms( + importer + .external_storage_or_cache(req.get_storage_backend(), req.get_storage_cache_id())?, + false, + ); + + let mut inflight_futures: VecDeque = VecDeque::new(); + + let mut tasks = metas.iter().zip(rules.iter()).peekable(); + while let Some((meta, rule)) = tasks.next() { + let buff = importer.read_from_kv_file( + meta, + rule, + ext_storage.clone(), + req.get_storage_backend(), + &limiter, + )?; + if let Some(mut r) = importer.do_apply_kv_file( + meta.get_start_key(), + meta.get_end_key(), + meta.get_start_ts(), + meta.get_restore_ts(), + buff, + |k, v| collector.accept_kv(meta.get_cf(), meta.get_is_delete(), k, v), + )? { + if let Some(range) = range.as_mut() { + range.start = range.take_start().min(r.take_start()); + range.end = range.take_end().max(r.take_end()); + } else { + range = Some(r); + } + } + + let is_last_task = tasks.peek().is_none(); + for req in collector.drain_raft_reqs(is_last_task) { + while inflight_futures.len() >= MAX_INFLIGHT_RAFT_MSGS { + handle_raft_write(inflight_futures.pop_front().unwrap()).await?; + } + let (cb, future) = paired_future_callback(); + match router.send_command(req, Callback::write(cb), RaftCmdExtraOpts::default()) { + Ok(_) => inflight_futures.push_back(future), + Err(e) => { + let msg = format!("failed to send raft command: {}", e); + let mut e = ImportPbError::default(); + e.set_message(msg); + return Err(e); + } + } + } + } + assert!(collector.is_empty()); + for fut in inflight_futures { + handle_raft_write(fut).await?; + } + + Ok(range) + } } #[macro_export] @@ -375,8 +630,7 @@ where } let task = async move { - let res = Ok(SwitchModeResponse::default()); - crate::send_rpc_response!(res, sink, label, timer); + crate::send_rpc_response!(Ok(SwitchModeResponse::default()), sink, label, timer); }; ctx.spawn(task); } @@ -448,7 +702,7 @@ where 
.observe(start.saturating_elapsed().as_secs_f64()); if let Err(e) = importer.remove_dir(req.get_prefix()) { - let mut import_err = kvproto::import_sstpb::Error::default(); + let mut import_err = ImportPbError::default(); import_err.set_message(format!("failed to remove directory: {}", e)); resp.set_error(import_err); } @@ -456,176 +710,37 @@ where .with_label_values(&[label]) .observe(start.saturating_elapsed().as_secs_f64()); - let resp = Ok(resp); - crate::send_rpc_response!(resp, sink, label, timer); + crate::send_rpc_response!(Ok(resp), sink, label, timer); }; self.threads.spawn(handle_task); } // Downloads KV file and performs key-rewrite then apply kv into this tikv // store. - fn apply( - &mut self, - _ctx: RpcContext<'_>, - mut req: ApplyRequest, - sink: UnarySink, - ) { + fn apply(&mut self, _ctx: RpcContext<'_>, req: ApplyRequest, sink: UnarySink) { let label = "apply"; - let timer = Instant::now_coarse(); - let importer = Arc::clone(&self.importer); + let start = Instant::now(); + let importer = self.importer.clone(); let router = self.router.clone(); let limiter = self.limiter.clone(); - let start = Instant::now(); - let raft_size = self.raft_entry_max_size; + let max_raft_size = self.raft_entry_max_size.0 as usize; let handle_task = async move { // Records how long the apply task waits to be scheduled. sst_importer::metrics::IMPORTER_APPLY_DURATION .with_label_values(&["queue"]) .observe(start.saturating_elapsed().as_secs_f64()); - let mut start_apply = Instant::now(); - let mut futs = vec![]; - let mut apply_resp = ApplyResponse::default(); - let context = req.take_context(); - let mut rules = req.take_rewrite_rules(); - let mut metas = req.take_metas(); - // For compatibility with old requests. 
- if req.has_meta() { - metas.push(req.take_meta()); - rules.push(req.take_rewrite_rule()); - } - let result = (|| -> Result<()> { - let mut cmd_reqs = vec![]; - let mut reqs_default = RequestCollector::from_cf(CF_DEFAULT); - let mut reqs_write = RequestCollector::from_cf(CF_WRITE); - let mut req_default_size = 0_u64; - let mut req_write_size = 0_u64; - let mut range: Option = None; - let ext_storage = { - let inner = importer.wrap_kms( - importer.external_storage_or_cache( - req.get_storage_backend(), - req.get_storage_cache_id(), - )?, - false, - ); - inner - }; - - for (i, meta) in metas.iter().enumerate() { - let (reqs, req_size) = if meta.get_cf() == CF_DEFAULT { - (&mut reqs_default, &mut req_default_size) - } else { - (&mut reqs_write, &mut req_write_size) - }; - - let mut build_req_fn = build_apply_request( - req_size, - raft_size.0, - reqs, - cmd_reqs.as_mut(), - meta.get_is_delete(), - meta.get_cf(), - context.clone(), - ); - - let buff = importer.read_from_kv_file( - meta, - &rules[i], - Arc::clone(&ext_storage), - req.get_storage_backend(), - &limiter, - )?; - let r: Option = importer.do_apply_kv_file( - meta.get_start_key(), - meta.get_end_key(), - meta.get_start_ts(), - meta.get_restore_ts(), - buff, - &mut build_req_fn, - )?; - - if let Some(mut r) = r { - range = match range { - Some(mut v) => { - let s = v.take_start().min(r.take_start()); - let e = v.take_end().max(r.take_end()); - Some(Range { - start: s, - end: e, - ..Default::default() - }) - } - None => Some(r), - }; - } - } + let mut resp = ApplyResponse::default(); - if !reqs_default.is_empty() { - let cmd = make_request(&mut reqs_default, context.clone()); - cmd_reqs.push(cmd); - IMPORTER_APPLY_BYTES.observe(req_default_size as _); - } - if !reqs_write.is_empty() { - let cmd = make_request(&mut reqs_write, context); - cmd_reqs.push(cmd); - IMPORTER_APPLY_BYTES.observe(req_write_size as _); - } - - start_apply = Instant::now(); - for cmd in cmd_reqs { - let (cb, future) = 
paired_future_callback(); - match router.send_command(cmd, Callback::write(cb), RaftCmdExtraOpts::default()) - { - Ok(_) => futs.push(future), - Err(e) => { - let mut import_err = kvproto::import_sstpb::Error::default(); - import_err.set_message(format!("failed to send raft command: {}", e)); - apply_resp.set_error(import_err); - } - } - } - if let Some(r) = range { - apply_resp.set_range(r); - } - Ok(()) - })(); - if let Err(e) = result { - apply_resp.set_error(e.into()); + match Self::apply_imp(req, importer, router, limiter, max_raft_size).await { + Ok(Some(r)) => resp.set_range(r), + Err(e) => resp.set_error(e), + _ => {} } - let resp = Ok(join_all(futs).await.iter().fold(apply_resp, |mut resp, x| { - match x { - Err(e) => { - let mut import_err = kvproto::import_sstpb::Error::default(); - import_err.set_message(format!("failed to complete raft command: {}", e)); - resp.set_error(import_err); - } - Ok(r) => { - if r.response.get_header().has_error() { - let mut import_err = kvproto::import_sstpb::Error::default(); - let err = r.response.get_header().get_error(); - import_err.set_message("failed to complete raft command".to_string()); - // FIXME: if there are many errors, we may lose some of them here. - import_err.set_store_error(err.clone()); - warn!("failed to apply the file to the store"; "error" => ?err); - resp.set_error(import_err); - } - } - } - resp - })); - - // Records how long the apply task waits to be scheduled. 
- sst_importer::metrics::IMPORTER_APPLY_DURATION - .with_label_values(&["apply"]) - .observe(start_apply.saturating_elapsed().as_secs_f64()); - sst_importer::metrics::IMPORTER_APPLY_DURATION - .with_label_values(&["finish"]) - .observe(start.saturating_elapsed().as_secs_f64()); debug!("finished apply kv file with {:?}", resp); - crate::send_rpc_response!(resp, sink, label, timer); + crate::send_rpc_response!(Ok(resp), sink, label, start); }; self.block_threads.spawn_ok(handle_task); } @@ -678,8 +793,7 @@ where }, Err(e) => resp.set_error(e.into()), } - let resp = Ok(resp); - crate::send_rpc_response!(resp, sink, label, timer); + crate::send_rpc_response!(Ok(resp), sink, label, timer); }; self.threads.spawn(handle_task); @@ -848,8 +962,12 @@ where }); let ctx_task = async move { - let res = Ok(SetDownloadSpeedLimitResponse::default()); - crate::send_rpc_response!(res, sink, label, timer); + crate::send_rpc_response!( + Ok(SetDownloadSpeedLimitResponse::default()), + sink, + label, + timer + ); }; ctx.spawn(ctx_task); @@ -958,70 +1076,6 @@ fn pb_error_inc(type_: &str, e: &errorpb::Error) { IMPORTER_ERROR_VEC.with_label_values(&[type_, label]).inc(); } -enum RequestCollector { - /// Retain the last ts of each key in each request. - /// This is used for write CF because resolved ts observer hates duplicated - /// key in the same request. - RetainLastTs(HashMap, (Request, u64)>), - /// Collector favor that simple collect all items, and it do not contains - /// duplicated key-value. This is used for default CF. 
- KeepAll(HashMap, Request>), -} - -impl RequestCollector { - fn from_cf(cf: &str) -> Self { - match cf { - CF_DEFAULT | "" => Self::KeepAll(Default::default()), - CF_WRITE => Self::RetainLastTs(Default::default()), - _ => { - warn!("unknown cf name, using default request collector"; "cf" => %cf); - Self::RetainLastTs(Default::default()) - } - } - } - - fn accept(&mut self, req: Request) { - let k = key_from_request(&req); - match self { - RequestCollector::RetainLastTs(ref mut reqs) => { - let (encoded_key, ts) = match Key::split_on_ts_for(k) { - Ok(k) => k, - Err(err) => { - warn!("key without ts, skipping"; "key" => %log_wrappers::Value::key(k), "err" => %err); - return; - } - }; - if reqs - .get(encoded_key) - .map(|(_, old_ts)| *old_ts < ts.into_inner()) - .unwrap_or(true) - { - reqs.insert(encoded_key.to_owned(), (req, ts.into_inner())); - } - } - RequestCollector::KeepAll(ref mut reqs) => { - reqs.insert(k.to_owned(), req); - } - } - } - - fn drain(&mut self) -> Vec { - match self { - RequestCollector::RetainLastTs(ref mut reqs) => { - reqs.drain().map(|(_, (req, _))| req).collect() - } - RequestCollector::KeepAll(ref mut reqs) => reqs.drain().map(|(_, req)| req).collect(), - } - } - - fn is_empty(&self) -> bool { - match self { - RequestCollector::RetainLastTs(reqs) => reqs.is_empty(), - RequestCollector::KeepAll(reqs) => reqs.is_empty(), - } - } -} - fn key_from_request(req: &Request) -> &[u8] { if req.has_put() { return req.get_put().get_key(); @@ -1029,8 +1083,7 @@ fn key_from_request(req: &Request) -> &[u8] { if req.has_delete() { return req.get_delete().get_key(); } - warn!("trying to extract key from request is neither put nor delete."); - b"" + panic!("trying to extract key from request is neither put nor delete.") } fn make_request_header(mut context: Context) -> RaftRequestHeader { @@ -1042,77 +1095,6 @@ fn make_request_header(mut context: Context) -> RaftRequestHeader { header } -fn make_request(reqs: &mut RequestCollector, context: Context) -> 
RaftCmdRequest { - let mut cmd = RaftCmdRequest::default(); - let mut header = make_request_header(context); - // Set the UUID of header to prevent raftstore batching our requests. - // The current `resolved_ts` observer assumes that each batch of request doesn't - // has two writes to the same key. (Even with 2 different TS). That was true - // for normal cases because the latches reject concurrency write to keys. - // However we have bypassed the latch layer :( - header.set_uuid(uuid::Uuid::new_v4().as_bytes().to_vec()); - cmd.set_header(header); - cmd.set_requests(reqs.drain().into()); - cmd -} - -// we need to remove duplicate keys in here, since -// in https://github.com/tikv/tikv/blob/a401f78bc86f7e6ea6a55ad9f453ae31be835b55/components/resolved_ts/src/cmd.rs#L204 -// will panic if found duplicated entry during Vec. -fn build_apply_request<'a, 'b>( - req_size: &'a mut u64, - raft_size: u64, - reqs: &'a mut RequestCollector, - cmd_reqs: &'a mut Vec, - is_delete: bool, - cf: &'b str, - context: Context, -) -> Box, Vec) + 'b> -where - 'a: 'b, -{ - // use callback to collect kv data. - Box::new(move |k: Vec, v: Vec| { - // Need to skip the empty key/value that could break the transaction or cause - // data corruption. see details at https://github.com/pingcap/tiflow/issues/5468. - if k.is_empty() || (!is_delete && v.is_empty()) { - return; - } - - let mut req = Request::default(); - if is_delete { - let mut del = DeleteRequest::default(); - del.set_key(k); - del.set_cf(cf.to_string()); - req.set_cmd_type(CmdType::Delete); - req.set_delete(del); - } else { - if cf == CF_WRITE && !write_needs_restore(&v) { - return; - } - - let mut put = PutRequest::default(); - put.set_key(k); - put.set_value(v); - put.set_cf(cf.to_string()); - req.set_cmd_type(CmdType::Put); - req.set_put(put); - } - - // When the request size get grow to max request size, - // build the request and add it to a batch. 
- if *req_size + req.compute_size() as u64 > raft_size * 7 / 8 { - IMPORTER_APPLY_BYTES.observe(*req_size as _); - *req_size = 0; - let cmd = make_request(reqs, context.clone()); - cmd_reqs.push(cmd); - } - - *req_size += req.compute_size() as u64; - reqs.accept(req); - }) -} - fn write_needs_restore(write: &[u8]) -> bool { let w = WriteRef::parse(write); match w { @@ -1146,9 +1128,7 @@ mod test { use kvproto::{kvrpcpb::Context, raft_cmdpb::*}; use txn_types::{Key, TimeStamp, Write, WriteType}; - use crate::import::sst_service::{ - build_apply_request, key_from_request, make_request, RequestCollector, - }; + use crate::import::sst_service::{key_from_request, RequestCollector}; fn write(key: &[u8], ty: WriteType, commit_ts: u64, start_ts: u64) -> (Vec, Vec) { let k = Key::from_raw(key).append_ts(TimeStamp::new(commit_ts)); @@ -1213,30 +1193,14 @@ mod test { } fn run_case(c: &Case) { - let mut cmds = vec![]; - let mut reqs = RequestCollector::from_cf(c.cf); - let mut req_size = 0_u64; - - let mut builder = build_apply_request( - &mut req_size, - 1024, - &mut reqs, - &mut cmds, - c.is_delete, - c.cf, - Context::new(), - ); + let mut collector = RequestCollector::new(Context::new(), 1024); for (k, v) in c.mutations.clone() { - builder(k, v); - } - drop(builder); - if !reqs.is_empty() { - let cmd = make_request(&mut reqs, Context::new()); - cmds.push(cmd); + collector.accept_kv(c.cf, c.is_delete, k, v); } + let reqs = collector.drain_raft_reqs(true); - let mut req1: HashMap<_, _> = cmds + let mut req1: HashMap<_, _> = reqs .into_iter() .flat_map(|mut x| x.take_requests().into_iter()) .map(|req| { @@ -1318,8 +1282,7 @@ mod test { #[test] fn test_request_collector_with_write_cf() { - let mut request_collector = RequestCollector::from_cf(CF_WRITE); - assert_eq!(request_collector.is_empty(), true); + let mut request_collector = RequestCollector::new(Context::new(), 102400); let reqs = vec![ write_req(b"foo", WriteType::Put, 40, 39), write_req(b"aar", WriteType::Put, 38, 
37), @@ -1333,23 +1296,21 @@ mod test { ]; for req in reqs { - request_collector.accept(req); + request_collector.accept(CF_WRITE, req); } - assert_eq!(request_collector.is_empty(), false); - let mut reqs = request_collector.drain(); + let mut reqs: Vec<_> = request_collector.drain_unpacked_reqs(CF_WRITE); reqs.sort_by(|r1, r2| { let k1 = key_from_request(r1); let k2 = key_from_request(r2); k1.cmp(k2) }); assert_eq!(reqs, reqs_result); - assert_eq!(request_collector.is_empty(), true); + assert!(request_collector.is_empty()); } #[test] fn test_request_collector_with_default_cf() { - let mut request_collector = RequestCollector::from_cf(CF_DEFAULT); - assert_eq!(request_collector.is_empty(), true); + let mut request_collector = RequestCollector::new(Context::new(), 102400); let reqs = vec![ default_req(b"foo", b"", 39), default_req(b"zzz", b"", 40), @@ -1363,10 +1324,9 @@ mod test { ]; for req in reqs { - request_collector.accept(req); + request_collector.accept(CF_DEFAULT, req); } - assert_eq!(request_collector.is_empty(), false); - let mut reqs = request_collector.drain(); + let mut reqs: Vec<_> = request_collector.drain_unpacked_reqs(CF_DEFAULT); reqs.sort_by(|r1, r2| { let k1 = key_from_request(r1); let (k1, ts1) = Key::split_on_ts_for(k1).unwrap(); @@ -1376,6 +1336,6 @@ mod test { k1.cmp(k2).then(ts1.cmp(&ts2)) }); assert_eq!(reqs, reqs_result); - assert_eq!(request_collector.is_empty(), true); + assert!(request_collector.is_empty()); } } From 71efe9e6af802761bec9fcc0e468035cf3adb3b7 Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 6 Jan 2023 18:16:21 +0800 Subject: [PATCH 070/115] raftstore-v2: adaptive apply (#14020) ref tikv/tikv#12842 Make apply adaptive to reduce high tail latency. 
Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/fsm/apply.rs | 12 ++- .../operation/command/admin/compact_log.rs | 99 +++++++++++++------ .../operation/command/admin/conf_change.rs | 11 ++- .../src/operation/command/admin/split.rs | 4 +- .../raftstore-v2/src/operation/command/mod.rs | 88 ++++++++++++++++- components/raftstore-v2/src/operation/life.rs | 8 +- components/raftstore-v2/src/operation/mod.rs | 6 +- .../src/operation/ready/apply_trace.rs | 17 ++-- .../raftstore-v2/src/operation/ready/mod.rs | 4 +- .../src/operation/ready/snapshot.rs | 28 +++--- components/raftstore-v2/src/raft/apply.rs | 21 +++- components/raftstore-v2/src/raft/storage.rs | 5 +- 12 files changed, 225 insertions(+), 78 deletions(-) diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index b81d31329cb..1544a703c6d 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -10,7 +10,7 @@ use crossbeam::channel::TryRecvError; use engine_traits::{FlushState, KvEngine, TabletRegistry}; use futures::{compat::Future01CompatExt, FutureExt, StreamExt}; use kvproto::{metapb, raft_serverpb::RegionLocalState}; -use raftstore::store::ReadTask; +use raftstore::store::{Config, ReadTask}; use slog::Logger; use tikv_util::{ mpsc::future::{self, Receiver, Sender, WakePolicy}, @@ -58,6 +58,7 @@ pub struct ApplyFsm { impl ApplyFsm { pub fn new( + cfg: &Config, peer: metapb::Peer, region_state: RegionLocalState, res_reporter: R, @@ -70,6 +71,7 @@ impl ApplyFsm { ) -> (ApplyScheduler, Self) { let (tx, rx) = future::unbounded(WakePolicy::Immediately); let apply = Apply::new( + cfg, peer, region_state, res_reporter, @@ -100,6 +102,7 @@ impl ApplyFsm { res = self.receiver.next().fuse() => res, _ = timeout.fuse() => None, }; + self.apply.on_start_apply(); let mut task = match res { Some(r) => r, None => { @@ -116,10 +119,10 @@ impl ApplyFsm { ApplyTask::CommittedEntries(ce) => 
self.apply.apply_committed_entries(ce).await, ApplyTask::Snapshot(snap_task) => self.apply.schedule_gen_snapshot(snap_task), ApplyTask::UnsafeWrite(raw_write) => self.apply.apply_unsafe_write(raw_write), - ApplyTask::ManualFlush => self.apply.on_manual_flush(), + ApplyTask::ManualFlush => self.apply.on_manual_flush().await, } - // TODO: yield after some time. + self.apply.maybe_flush().await; // Perhaps spin sometime? match self.receiver.try_recv() { @@ -128,7 +131,8 @@ impl ApplyFsm { Err(TryRecvError::Disconnected) => return, } } - self.apply.flush(); + let written_bytes = self.apply.flush(); + self.apply.maybe_reschedule(written_bytes).await; } } } diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index 7127cd45306..39cf02de775 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -23,7 +23,7 @@ use raftstore::{ Result, }; use slog::{debug, error, info}; -use tikv_util::box_err; +use tikv_util::{box_err, log::SlogFormat}; use crate::{ batch::StoreContext, @@ -303,6 +303,35 @@ impl Peer { } } + #[inline] + pub fn record_tombstone_tablet_for_destroy( + &mut self, + ctx: &StoreContext, + task: &mut WriteTask, + ) { + let compact_log_context = self.compact_log_context_mut(); + assert!( + compact_log_context.tombstone_tablets_wait_index.is_empty(), + "{} all tombstone should be cleared before being destroyed.", + SlogFormat(&self.logger) + ); + let tablet = match self.tablet() { + Some(tablet) => tablet.clone(), + None => return, + }; + let region_id = self.region_id(); + let applied_index = self.entry_storage().applied_index(); + let sched = ctx.schedulers.tablet_gc.clone(); + let _ = sched.schedule(tablet_gc::Task::prepare_destroy( + tablet, + self.region_id(), + applied_index, + )); + task.persisted_cbs.push(Box::new(move || { + let _ = 
sched.schedule(tablet_gc::Task::destroy(region_id, applied_index)); + })); + } + pub fn on_apply_res_compact_log( &mut self, store_ctx: &mut StoreContext, @@ -342,8 +371,17 @@ impl Peer { self.set_has_extra_write(); // All logs < perssited_apply will be deleted, so should check with +1. - if old_truncated + 1 < self.storage().apply_trace().persisted_apply_index() { - self.compact_log_from_engine(store_ctx); + if old_truncated + 1 < self.storage().apply_trace().persisted_apply_index() + && let Some(index) = self.compact_log_index() { + // Raft Engine doesn't care about first index. + if let Err(e) = + store_ctx + .engine + .gc(self.region_id(), 0, index, self.state_changes_mut()) + { + error!(self.logger, "failed to compact raft logs"; "err" => ?e); + } + // Extra write set right above. } let context = self.compact_log_context_mut(); @@ -354,38 +392,44 @@ impl Peer { (context.approximate_log_size as f64 * (remain_cnt as f64 / total_cnt as f64)) as u64; } - /// Called when apply index is persisted. There are two different situation: - /// - /// Generally, additional writes are triggered to persist apply index. In - /// this case task is `Some`. But after applying snapshot, the apply - /// index is persisted ahead of time. In this case task is `None`. + /// Called when apply index is persisted. #[inline] pub fn on_advance_persisted_apply_index( &mut self, store_ctx: &mut StoreContext, old_persisted: u64, - task: Option<&mut WriteTask>, + task: &mut WriteTask, ) { let new_persisted = self.storage().apply_trace().persisted_apply_index(); if old_persisted < new_persisted { let region_id = self.region_id(); // TODO: batch it. + // TODO: avoid allocation if there is nothing to delete. 
if let Err(e) = store_ctx.engine.delete_all_but_one_states_before( region_id, new_persisted, - self.state_changes_mut(), + task.extra_write + .ensure_v2(|| self.entry_storage().raft_engine().log_batch(0)), ) { error!(self.logger, "failed to delete raft states"; "err" => ?e); - } else { - self.set_has_extra_write(); } // If it's snapshot, logs are gc already. - if task.is_some() && old_persisted < self.entry_storage().truncated_index() + 1 { - self.compact_log_from_engine(store_ctx); + if !task.has_snapshot + && old_persisted < self.entry_storage().truncated_index() + 1 + && let Some(index) = self.compact_log_index() { + let batch = task.extra_write.ensure_v2(|| self.entry_storage().raft_engine().log_batch(0)); + // Raft Engine doesn't care about first index. + if let Err(e) = + store_ctx + .engine + .gc(self.region_id(), 0, index, batch) + { + error!(self.logger, "failed to compact raft logs"; "err" => ?e); + } } if self.remove_tombstone_tablets(new_persisted) { let sched = store_ctx.schedulers.tablet_gc.clone(); - if let Some(task) = task { + if !task.has_snapshot { task.persisted_cbs.push(Box::new(move || { let _ = sched.schedule(tablet_gc::Task::destroy(region_id, new_persisted)); })); @@ -397,28 +441,19 @@ impl Peer { } } - fn compact_log_from_engine(&mut self, store_ctx: &mut StoreContext) { + fn compact_log_index(&mut self) -> Option { let truncated = self.entry_storage().truncated_index() + 1; let persisted_applied = self.storage().apply_trace().persisted_apply_index(); let compact_index = std::cmp::min(truncated, persisted_applied); if compact_index == RAFT_INIT_LOG_INDEX + 1 { // There is no logs at RAFT_INIT_LOG_INDEX, nothing to delete. - return; - } - // Raft Engine doesn't care about first index. - if let Err(e) = - store_ctx - .engine - .gc(self.region_id(), 0, compact_index, self.state_changes_mut()) - { - error!(self.logger, "failed to compact raft logs"; "err" => ?e); - } else { - // TODO: make this debug when stable. 
- info!(self.logger, "compact log"; - "index" => compact_index, - "apply_trace" => ?self.storage().apply_trace(), - "truncated" => ?self.entry_storage().apply_state()); - self.set_has_extra_write(); + return None; } + // TODO: make this debug when stable. + info!(self.logger, "compact log"; + "index" => compact_index, + "apply_trace" => ?self.storage().apply_trace(), + "truncated" => ?self.entry_storage().apply_state()); + Some(compact_index) } } diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 6c041a551fe..8c9771b0201 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -9,7 +9,7 @@ use std::time::Instant; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::{ metapb::{self, PeerRole}, raft_cmdpb::{AdminRequest, AdminResponse, ChangePeerRequest, RaftCmdRequest}, @@ -146,7 +146,7 @@ impl Peer { let remove_self = conf_change.region_state.get_state() == PeerState::Tombstone; self.storage_mut() - .set_region_state(conf_change.region_state); + .set_region_state(conf_change.region_state.clone()); if self.is_leader() { info!( self.logger, @@ -189,7 +189,14 @@ impl Peer { self.raft_group().raft.state, ); if remove_self { + // When self is destroyed, all metas will be cleaned in `start_destroy`. 
self.mark_for_destroy(None); + } else { + let region_id = self.region_id(); + self.state_changes_mut() + .put_region_state(region_id, conf_change.index, &conf_change.region_state) + .unwrap(); + self.set_has_extra_write(); } } } diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index f63f1f2ae17..d01b1371338 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -731,7 +731,7 @@ mod test { raft_cmdpb::{BatchSplitRequest, SplitRequest}, raft_serverpb::{PeerState, RegionLocalState}, }; - use raftstore::store::cmd_resp::new_error; + use raftstore::store::{cmd_resp::new_error, Config}; use slog::o; use tempfile::TempDir; use tikv_util::{ @@ -872,6 +872,7 @@ mod test { let (read_scheduler, _rx) = dummy_scheduler(); let (reporter, _) = MockReporter::new(); let mut apply = Apply::new( + &Config::default(), region .get_peers() .iter() @@ -1059,6 +1060,7 @@ mod test { // Split will create checkpoint tablet, so if there are some writes before // split, they should be flushed immediately. apply.apply_put(CF_DEFAULT, 50, b"k04", b"v4").unwrap(); + apply.apply_flow_control_mut().set_need_flush(true); assert!(!WriteBatch::is_empty(apply.write_batch.as_ref().unwrap())); splits.mut_requests().clear(); splits diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 439d2136d76..a6ab227d402 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -16,7 +16,7 @@ //! - Applied result are sent back to peer fsm, and update memory state in //! `on_apply_res`. 
-use std::mem; +use std::{mem, time::Duration}; use engine_traits::{KvEngine, RaftEngine, WriteBatch, WriteOptions}; use kvproto::raft_cmdpb::{ @@ -35,7 +35,7 @@ use raftstore::{ local_metrics::RaftMetrics, metrics::APPLY_TASK_WAIT_TIME_HISTOGRAM, msg::ErrorCallback, - util, WriteCallback, + util, Config, WriteCallback, }, Error, Result, }; @@ -111,6 +111,7 @@ impl Peer { let logger = self.logger.clone(); let read_scheduler = self.storage().read_scheduler(); let (apply_scheduler, mut apply_fsm) = ApplyFsm::new( + &store_ctx.cfg, self.peer().clone(), region_state, mailbox, @@ -268,6 +269,8 @@ impl Peer { if !self.serving() { return; } + // TODO: remove following log once stable. + info!(self.logger, "on_apply_res"; "apply_res" => ?apply_res); // It must just applied a snapshot. if apply_res.applied_index < self.entry_storage().first_index() { // Ignore admin command side effects, otherwise it may split incomplete @@ -334,7 +337,38 @@ impl Peer { } } +#[derive(Debug)] +pub struct ApplyFlowControl { + timer: Instant, + last_check_keys: u64, + need_flush: bool, + yield_time: Duration, + yield_written_bytes: u64, +} + +impl ApplyFlowControl { + pub fn new(cfg: &Config) -> Self { + ApplyFlowControl { + timer: Instant::now_coarse(), + last_check_keys: 0, + need_flush: false, + yield_time: cfg.apply_yield_duration.0, + yield_written_bytes: cfg.apply_yield_write_size.0, + } + } + + #[cfg(test)] + pub fn set_need_flush(&mut self, need_flush: bool) { + self.need_flush = need_flush; + } +} + impl Apply { + #[inline] + pub fn on_start_apply(&mut self) { + self.apply_flow_control_mut().timer = Instant::now_coarse(); + } + #[inline] fn should_skip(&self, off: usize, index: u64) -> bool { let log_recovery = self.log_recovery(); @@ -370,13 +404,15 @@ impl Apply { } } } + self.apply_flow_control_mut().need_flush = true; } - pub fn on_manual_flush(&mut self) { - self.flush(); + pub async fn on_manual_flush(&mut self) { + let written_bytes = self.flush(); if let Err(e) = 
self.tablet().flush_cfs(&[], false) { warn!(self.logger, "failed to flush: {:?}", e); } + self.maybe_reschedule(written_bytes).await } #[inline] @@ -414,6 +450,7 @@ impl Apply { } // Flush may be triggerred in the middle, so always update the index and term. self.set_apply_progress(e.index, e.term); + self.apply_flow_control_mut().need_flush = true; } } @@ -544,10 +581,49 @@ impl Apply { } } + fn should_reschedule(&self, written_bytes: u64) -> bool { + let control = self.apply_flow_control(); + written_bytes >= control.yield_written_bytes + || control.timer.saturating_elapsed() >= control.yield_time + } + + pub async fn maybe_reschedule(&mut self, written_bytes: u64) { + if self.should_reschedule(written_bytes) { + yatp::task::future::reschedule().await; + self.apply_flow_control_mut().timer = Instant::now_coarse(); + } + } + + /// Check whether it needs to flush. + /// + /// We always batch as much inputs as possible, flush will only be triggered + /// when it has been processing too long. + pub async fn maybe_flush(&mut self) { + let buffer_keys = self.metrics.written_keys; + let control = self.apply_flow_control_mut(); + if buffer_keys >= control.last_check_keys + 128 { + // Reschedule by write size was designed to avoid too many deletes impacts + // performance so it doesn't need pricise control. If checking bytes here may + // make the batch too small and hurt performance. + if self.should_reschedule(0) { + let written_bytes = self.flush(); + self.maybe_reschedule(written_bytes).await; + } else { + self.apply_flow_control_mut().last_check_keys = self.metrics.written_keys; + } + } + } + #[inline] - pub fn flush(&mut self) { + pub fn flush(&mut self) -> u64 { // TODO: maybe we should check whether there is anything to flush. 
let (index, term) = self.apply_progress(); + let control = self.apply_flow_control_mut(); + control.last_check_keys = 0; + if !control.need_flush { + return 0; + } + control.need_flush = false; let flush_state = self.flush_state().clone(); if let Some(wb) = &mut self.write_batch && !wb.is_empty() { let mut write_opt = WriteOptions::default(); @@ -578,6 +654,8 @@ impl Apply { apply_res.admin_result = self.take_admin_result().into_boxed_slice(); apply_res.modifications = *self.modifications_mut(); apply_res.metrics = mem::take(&mut self.metrics); + let written_bytes = apply_res.metrics.written_bytes; self.res_reporter().report(apply_res); + written_bytes } } diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index f312162d1e5..f82fb1e8386 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -293,7 +293,11 @@ impl Peer { /// /// After destroy is finished, `finish_destroy` should be called to clean up /// memory states. 
- pub fn start_destroy(&mut self, write_task: &mut WriteTask) { + pub fn start_destroy( + &mut self, + ctx: &mut StoreContext, + write_task: &mut WriteTask, + ) { if self.postponed_destroy() { return; } @@ -311,7 +315,7 @@ impl Peer { let applied_index = self.entry_storage().applied_index(); lb.put_region_state(region_id, applied_index, ®ion_state) .unwrap(); - self.set_has_extra_write(); + self.record_tombstone_tablet_for_destroy(ctx, write_task); self.destroy_progress_mut().start(); } diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index dc245c24384..807f425e998 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -7,9 +7,9 @@ mod query; mod ready; pub use command::{ - AdminCmdResult, CommittedEntries, CompactLogContext, ProposalControl, RequestSplit, - SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, - SplitFlowControl, SPLIT_PREFIX, + AdminCmdResult, ApplyFlowControl, CommittedEntries, CompactLogContext, ProposalControl, + RequestSplit, SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, + SimpleWriteReqEncoder, SplitFlowControl, SPLIT_PREFIX, }; pub use life::DestroyProgress; pub use ready::{ diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index 5ff9a27dee0..e5b1c169c5b 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -301,19 +301,24 @@ impl ApplyTrace { None } - pub fn reset_snapshot(&mut self, index: u64) { + pub fn restore_snapshot(&mut self, index: u64) { for pr in self.data_cfs.iter_mut() { - pr.flushed = index; pr.last_modified = index; } - self.admin.flushed = index; + self.admin.last_modified = index; + // Snapshot is a special case that KVs are not flushed yet, so all flushed + // state should not be changed. 
But persisted_applied is updated whenever an + // asynchronous write is triggered. So it can lead to a special case that + // persisted_applied < admin.flushed. It seems no harm ATM though. self.persisted_applied = index; self.try_persist = false; } - #[inline] - pub fn reset_should_persist(&mut self) { - self.try_persist = false; + pub fn on_applied_snapshot(&mut self, index: u64) { + for pr in self.data_cfs.iter_mut() { + pr.flushed = index; + } + self.admin.flushed = index; } #[inline] diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 3ac500b7f49..2fdc228ea2f 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -449,7 +449,7 @@ impl Peer { self.merge_state_changes_to(&mut write_task); self.storage_mut() .handle_raft_ready(ctx, &mut ready, &mut write_task); - self.on_advance_persisted_apply_index(ctx, prev_persisted, Some(&mut write_task)); + self.on_advance_persisted_apply_index(ctx, prev_persisted, &mut write_task); if !ready.persisted_messages().is_empty() { write_task.messages = ready @@ -459,7 +459,7 @@ impl Peer { .collect(); } if !self.serving() { - self.start_destroy(&mut write_task); + self.start_destroy(ctx, &mut write_task); ctx.coprocessor_host.on_region_changed( self.region(), RegionChangeEvent::Destroy, diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 1919ce269a6..04b6ed7e12b 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -215,8 +215,8 @@ impl Peer { let path = ctx.tablet_registry.tablet_path(region_id, snapshot_index); assert!( path.exists(), - "{:?} {} not exists", - self.logger.list(), + "{} {} not exists", + SlogFormat(&self.logger), path.display() ); let tablet = ctx @@ -224,15 +224,14 @@ impl Peer { .tablet_factory() 
.open_tablet(tablet_ctx, &path) .unwrap_or_else(|e| { - panic!( - "{:?} failed to load tablet at {}: {:?}", - self.logger.list(), - path.display(), - e + slog_panic!( + self.logger, + "failed to load tablet"; + "path" => path.display(), + "error" => ?e ); }); - let prev_persisted_applied = self.storage().apply_trace().persisted_apply_index(); self.storage_mut().on_applied_snapshot(); self.raft_group_mut().advance_apply_to(snapshot_index); let read_tablet = SharedReadTablet::new(tablet.clone()); @@ -258,7 +257,6 @@ impl Peer { info!(self.logger, "init split with snapshot finished"); self.post_split_init(ctx, init); } - self.on_advance_persisted_apply_index(ctx, prev_persisted_applied, None); self.schedule_apply_fsm(ctx); } } @@ -511,7 +509,7 @@ impl Storage { let index = entry.truncated_index(); entry.set_applied_term(term); entry.apply_state_mut().set_applied_index(index); - self.apply_trace_mut().reset_snapshot(index); + self.apply_trace_mut().on_applied_snapshot(index); } pub fn apply_snapshot( @@ -552,10 +550,10 @@ impl Storage { raft_engine .clean(region.get_id(), 0, self.entry_storage().raft_state(), wb) .unwrap_or_else(|e| { - panic!( - "{:?} failed to clean up region: {:?}", - self.logger().list(), - e + slog_panic!( + self.logger(), + "failed to clean up region"; + "error" => ?e ) }); self.entry_storage_mut().clear(); @@ -578,7 +576,7 @@ impl Storage { entry_storage.set_truncated_term(last_term); entry_storage.set_last_term(last_term); - self.apply_trace_mut().reset_should_persist(); + self.apply_trace_mut().restore_snapshot(last_index); self.set_ever_persisted(); let lb = task .extra_write diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 2407d1ab3fe..7a5b03120b1 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -6,13 +6,13 @@ use engine_traits::{FlushState, KvEngine, TabletRegistry, WriteBatch, DATA_CFS_L use kvproto::{metapb, 
raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; use raftstore::store::{ fsm::{apply::DEFAULT_APPLY_WB_SIZE, ApplyMetrics}, - ReadTask, + Config, ReadTask, }; use slog::Logger; use tikv_util::{log::SlogFormat, worker::Scheduler}; use crate::{ - operation::{AdminCmdResult, DataTrace}, + operation::{AdminCmdResult, ApplyFlowControl, DataTrace}, router::CmdResChannel, }; @@ -28,6 +28,8 @@ pub struct Apply { callbacks: Vec<(Vec, RaftCmdResponse)>, + flow_control: ApplyFlowControl, + /// A flag indicates whether the peer is destroyed by applying admin /// command. tombstone: bool, @@ -58,6 +60,7 @@ pub struct Apply { impl Apply { #[inline] pub fn new( + cfg: &Config, peer: metapb::Peer, region_state: RegionLocalState, res_reporter: R, @@ -79,6 +82,7 @@ impl Apply { tablet: remote_tablet.latest().unwrap().clone(), write_batch: None, callbacks: vec![], + flow_control: ApplyFlowControl::new(cfg), tombstone: false, applied_term, applied_index: flush_state.applied_index(), @@ -158,8 +162,8 @@ impl Apply { pub fn set_tablet(&mut self, tablet: EK) { assert!( self.write_batch.as_ref().map_or(true, |wb| wb.is_empty()), - "{:?}", - self.logger.list() + "{} setting tablet while still have dirty write batch", + SlogFormat(&self.logger) ); self.write_batch.take(); self.tablet = tablet; @@ -222,4 +226,13 @@ impl Apply { pub fn log_recovery(&self) -> &Option> { &self.log_recovery } + + #[inline] + pub fn apply_flow_control_mut(&mut self) -> &mut ApplyFlowControl { + &mut self.flow_control + } + + pub fn apply_flow_control(&self) -> &ApplyFlowControl { + &self.flow_control + } } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index aca8f0fafce..1d1f53f9c53 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -332,8 +332,8 @@ mod tests { }; use raft::{Error as RaftError, StorageError}; use raftstore::store::{ - util::new_empty_snapshot, write_to_db_for_test, 
AsyncReadNotifier, FetchedLogs, GenSnapRes, - ReadRunner, TabletSnapKey, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, + util::new_empty_snapshot, write_to_db_for_test, AsyncReadNotifier, Config, FetchedLogs, + GenSnapRes, ReadRunner, TabletSnapKey, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; use slog::o; @@ -500,6 +500,7 @@ mod tests { state.set_region(region.clone()); // setup peer applyer let mut apply = Apply::new( + &Config::default(), region.get_peers()[0].clone(), state, router, From f0a0bf73fca282cfd43b9f1875a8e2cfd01ffaf0 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Sat, 7 Jan 2023 19:55:22 +0800 Subject: [PATCH 071/115] [Cloud] Fix observer and FFI and add more log (#253) --- .../raftstore/src/coprocessor/dispatcher.rs | 4 +- components/raftstore/src/coprocessor/mod.rs | 7 +- components/raftstore/src/store/fsm/peer.rs | 11 ++- components/raftstore/src/store/fsm/store.rs | 10 ++- .../raftstore/src/store/worker/region.rs | 3 +- engine_store_ffi/src/observer.rs | 84 ++++++++++++------- proxy_tests/proxy/ffi.rs | 25 ++++-- 7 files changed, 99 insertions(+), 45 deletions(-) diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 69ebfa7b385..64b49a227c9 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -680,10 +680,10 @@ impl CoprocessorHost { false } - pub fn on_peer_created(&self, region_id: u64) { + pub fn on_peer_created(&self, region_id: u64, peer_id: u64, event: PeerCreateEvent) { for observer in &self.registry.region_change_observers { let observer = observer.observer.inner(); - observer.on_peer_created(region_id) + observer.on_peer_created(region_id, peer_id, event) } } diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 70427df9922..dc7cb7471f0 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ 
b/components/raftstore/src/coprocessor/mod.rs @@ -307,6 +307,11 @@ pub enum RegionChangeEvent { UpdateBuckets(usize), } +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum PeerCreateEvent { + Replicate, + Create, +} pub trait RegionChangeObserver: Coprocessor { /// Hook to call when a region changed on this TiKV fn on_region_changed(&self, _: &mut ObserverContext<'_>, _: RegionChangeEvent, _: StateRole) {} @@ -333,7 +338,7 @@ pub trait RegionChangeObserver: Coprocessor { false } - fn on_peer_created(&self, _: u64) {} + fn on_peer_created(&self, _: u64, _: u64, _: PeerCreateEvent) {} } #[derive(Clone, Debug, Default)] diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 75979a4afd5..b31ca7c1afa 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -67,7 +67,7 @@ use self::memtrace::*; #[cfg(any(test, feature = "testexport"))] use crate::store::PeerInternalStat; use crate::{ - coprocessor::{RegionChangeEvent, RegionChangeReason}, + coprocessor::{CoprocessorHost, PeerCreateEvent, RegionChangeEvent, RegionChangeReason}, store::{ cmd_resp::{bind_term, new_error}, entry_storage::MAX_WARMED_UP_CACHE_KEEP_TIME, @@ -247,6 +247,7 @@ where raftlog_fetch_scheduler: Scheduler>, engines: Engines, region: &metapb::Region, + coprocessor_host: &CoprocessorHost, ) -> Result> { let meta_peer = match find_peer(region, store_id) { None => { @@ -266,6 +267,11 @@ where ); HIBERNATED_PEER_STATE_GAUGE.awaken.inc(); let (tx, rx) = mpsc::loose_bounded(cfg.notify_capacity); + coprocessor_host.on_peer_created( + region.get_id(), + meta_peer.get_id(), + PeerCreateEvent::Create, + ); Ok(( tx, Box::new(PeerFsm { @@ -307,6 +313,7 @@ where engines: Engines, region_id: u64, peer: metapb::Peer, + coprocessor_host: &CoprocessorHost, ) -> Result> { // We will remove tombstone key when apply snapshot info!( @@ -320,6 +327,7 @@ where HIBERNATED_PEER_STATE_GAUGE.awaken.inc(); let (tx, rx) = 
mpsc::loose_bounded(cfg.notify_capacity); + coprocessor_host.on_peer_created(region_id, peer.get_id(), PeerCreateEvent::Replicate); Ok(( tx, Box::new(PeerFsm { @@ -4013,6 +4021,7 @@ where self.ctx.raftlog_fetch_scheduler.clone(), self.ctx.engines.clone(), &new_region, + &self.ctx.coprocessor_host, ) { Ok((sender, new_peer)) => (sender, new_peer), Err(e) => { diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index fafc839dce2..18bc623fa44 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -67,8 +67,8 @@ use time::{self, Timespec}; use crate::{ bytes_capacity, coprocessor::{ - split_observer::SplitObserver, BoxAdminObserver, CoprocessorHost, RegionChangeEvent, - RegionChangeReason, + split_observer::SplitObserver, BoxAdminObserver, CoprocessorHost, PeerCreateEvent, + RegionChangeEvent, RegionChangeReason, }, store::{ async_io::{ @@ -1173,6 +1173,7 @@ impl RaftPollerBuilder { self.raftlog_fetch_scheduler.clone(), self.engines.clone(), region, + &self.coprocessor_host, )); peer.peer.init_replication_mode(&mut replication_state); if local_state.get_state() == PeerState::Merging { @@ -1213,6 +1214,7 @@ impl RaftPollerBuilder { self.raftlog_fetch_scheduler.clone(), self.engines.clone(), ®ion, + &self.coprocessor_host, )?; peer.peer.init_replication_mode(&mut replication_state); peer.schedule_applying_snapshot(); @@ -2228,10 +2230,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.engines.clone(), region_id, target.clone(), + &self.ctx.coprocessor_host, )?; - self.ctx.coprocessor_host.on_peer_created(region_id); - // WARNING: The checking code must be above this line. 
// Now all checking passed @@ -2885,6 +2886,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.raftlog_fetch_scheduler.clone(), self.ctx.engines.clone(), ®ion, + &self.ctx.coprocessor_host, ) { Ok((sender, peer)) => (sender, peer), Err(e) => { diff --git a/components/raftstore/src/store/worker/region.rs b/components/raftstore/src/store/worker/region.rs index 694be1a2b8c..951083bbc6a 100644 --- a/components/raftstore/src/store/worker/region.rs +++ b/components/raftstore/src/store/worker/region.rs @@ -840,7 +840,8 @@ where task @ Task::Apply { .. } => { fail_point!("on_region_worker_apply", true, |_| {}); if self.coprocessor_host.should_pre_apply_snapshot() { - let _ = self.pre_apply_snapshot(&task); + let e = self.pre_apply_snapshot(&task); + tikv_util::debug!("!!!! pre handle error {:?}", e); } SNAP_COUNTER.apply.all.inc(); // to makes sure applying snapshots in order. diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 5ed8e5de8f6..03adae1becb 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -27,8 +27,8 @@ use raftstore::{ AdminObserver, ApplyCtxInfo, ApplySnapshotObserver, BoxAdminObserver, BoxApplySnapshotObserver, BoxPdTaskObserver, BoxQueryObserver, BoxRegionChangeObserver, BoxUpdateSafeTsObserver, Cmd, Coprocessor, CoprocessorHost, ObserverContext, - PdTaskObserver, QueryObserver, RegionChangeEvent, RegionChangeObserver, RegionState, - StoreSizeInfo, UpdateSafeTsObserver, + PdTaskObserver, PeerCreateEvent, QueryObserver, RegionChangeEvent, RegionChangeObserver, + RegionState, StoreSizeInfo, UpdateSafeTsObserver, }, store::{ self, check_sst_for_ingestion, @@ -419,6 +419,7 @@ impl TiFlashObserver { self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, "inner_msg" => ?inner_msg, "is_replicated" => is_replicated, "has_already_inited" => 
has_already_inited, @@ -457,6 +458,7 @@ impl TiFlashObserver { info!("fast path: ongoing {}:{} {}, first message", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, "inner_msg" => ?inner_msg, ); v.insert(Arc::new(CachedRegionInfo::default())); @@ -480,6 +482,7 @@ impl TiFlashObserver { self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, "inner_msg" => ?inner_msg, "is_replicated" => is_replicated, "has_already_inited" => has_already_inited, @@ -490,6 +493,7 @@ impl TiFlashObserver { self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, "inner_msg" => ?inner_msg, "is_replicated" => is_replicated, "has_already_inited" => has_already_inited, @@ -516,6 +520,7 @@ impl TiFlashObserver { info!("fast path: ongoing {}:{} {}, wait replicating peer", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, "inner_msg" => ?inner_msg, ); return true; @@ -525,6 +530,7 @@ impl TiFlashObserver { info!("fast path: ongoing {}:{} {}, fetch data from remote peer", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, ); fail::fail_point!("go_fast_path_not_allow", |_| { return false }); fail::fail_point!("ffi_fast_add_peer_pause", |_| { return false }); @@ -537,14 +543,16 @@ impl TiFlashObserver { crate::FastAddPeerStatus::WaitForData => { info!( "fast path: ongoing {}:{} {}. 
remote peer preparing data, wait", - self.store_id, region_id, new_peer_id + self.store_id, region_id, new_peer_id; + "region_id" => region_id, ); return true; } _ => { error!( "fast path: ongoing {}:{} {} failed. fetch and replace error {:?}, fallback to normal", - self.store_id, region_id, new_peer_id, res + self.store_id, region_id, new_peer_id, res; + "region_id" => region_id, ); self.fallback_to_slow_path(region_id); return false; @@ -558,14 +566,16 @@ impl TiFlashObserver { if let Err(_e) = apply_state.merge_from_bytes(apply_state_str) { error!( "fast path: ongoing {}:{} {} failed. parse apply_state {:?}, fallback to normal", - self.store_id, region_id, new_peer_id, res + self.store_id, region_id, new_peer_id, res; + "region_id" => region_id, ); self.fallback_to_slow_path(region_id); } if let Err(_e) = new_region.merge_from_bytes(region_str) { error!( "fast path: ongoing {}:{} {} failed. parse region {:?}, fallback to normal", - self.store_id, region_id, new_peer_id, res + self.store_id, region_id, new_peer_id, res; + "region_id" => region_id, ); self.fallback_to_slow_path(region_id); } @@ -576,6 +586,7 @@ impl TiFlashObserver { info!( "fast path: ongoing {}:{} {}. 
failed remote peer has not applied conf change", self.store_id, region_id, new_peer_id; + "region_id" => region_id, "region" => ?new_region, ); self.fallback_to_slow_path(region_id); @@ -585,6 +596,7 @@ impl TiFlashObserver { info!("fast path: ongoing {}:{} {}, start build and send", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, "new_region" => ?new_region, "apply_state" => ?apply_state, ); @@ -596,19 +608,22 @@ impl TiFlashObserver { info!("fast path: ongoing {}:{} {}, finish build and send", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, ); } crate::FastAddPeerStatus::WaitForData => { info!( "fast path: ongoing {}:{} {}. remote peer preparing data, wait", - new_peer_id, self.store_id, region_id + new_peer_id, self.store_id, region_id; + "region_id" => region_id, ); return true; } _ => { error!( "fast path: ongoing {}:{} {} failed. build and sent snapshot code {:?}", - self.store_id, region_id, new_peer_id, s + self.store_id, region_id, new_peer_id, s; + "region_id" => region_id, ); self.fallback_to_slow_path(region_id); return false; @@ -618,7 +633,8 @@ impl TiFlashObserver { Err(e) => { error!( "fast path: ongoing {}:{} {} failed. 
build and sent snapshot error {:?}", - self.store_id, region_id, new_peer_id, e + self.store_id, region_id, new_peer_id, e; + "region_id" => region_id, ); self.fallback_to_slow_path(region_id); return false; @@ -1342,24 +1358,27 @@ impl RegionChangeObserver for TiFlashObs false } - fn on_peer_created(&self, region_id: u64) { - let f = |info: MapEntry>| match info { - MapEntry::Occupied(mut o) => { - o.get_mut() - .replicated_or_created - .store(true, Ordering::SeqCst); - } - MapEntry::Vacant(v) => { - let c = CachedRegionInfo::default(); - c.replicated_or_created.store(true, Ordering::SeqCst); - v.insert(Arc::new(c)); - } - }; - info!("fast path: ongoing {}:{} {}, peer created", - self.store_id, region_id, "NA"; - ); - // TODO remove unwrap - self.access_cached_region_info_mut(region_id, f).unwrap(); + fn on_peer_created(&self, region_id: u64, peer_id: u64, event: PeerCreateEvent) { + if event == PeerCreateEvent::Replicate { + let f = |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + o.get_mut() + .replicated_or_created + .store(true, Ordering::SeqCst); + } + MapEntry::Vacant(v) => { + let c = CachedRegionInfo::default(); + c.replicated_or_created.store(true, Ordering::SeqCst); + v.insert(Arc::new(c)); + } + }; + info!("fast path: ongoing {}:{} {}, peer created", + self.store_id, region_id, peer_id; + "region_id" => region_id, + ); + // TODO remove unwrap + self.access_cached_region_info_mut(region_id, f).unwrap(); + } } } @@ -1477,6 +1496,7 @@ impl ApplySnapshotObserver for TiFlashOb if is_first_snapsot { info!("fast path: prehandle first snapshot {}:{} {}, recover MsgAppend", self.store_id, region_id, peer_id; "snap_key" => ?snap_key, + "region_id" => region_id, ); should_skip = true; } @@ -1552,12 +1572,14 @@ impl ApplySnapshotObserver for TiFlashOb fail::fail_point!("on_ob_post_apply_snapshot", |_| { return; }); + let region_id = ob_ctx.region().get_id(); info!("post apply snapshot"; "peer_id" => ?peer_id, "snap_key" => ?snap_key, + "region_id" 
=> region_id, "region" => ?ob_ctx.region(), + "pending" => self.engine.pending_applies_count.load(Ordering::SeqCst), ); - let region_id = ob_ctx.region().get_id(); let mut should_skip = false; #[allow(clippy::collapsible_if)] if self.engine_store_cfg.enable_fast_add_peer { @@ -1567,8 +1589,14 @@ impl ApplySnapshotObserver for TiFlashOb MapEntry::Occupied(mut o) => { let is_first_snapsot = !o.get().inited_or_fallback.load(Ordering::SeqCst); if is_first_snapsot { + let last = o.get().snapshot_inflight.load(Ordering::SeqCst); + let current = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap(); info!("fast path: applied first snapshot {}:{} {}, recover MsgAppend", self.store_id, region_id, peer_id; "snap_key" => ?snap_key, + "region_id" => region_id, + "cost" => current.as_millis() - last, ); should_skip = true; o.get_mut().snapshot_inflight.store(0, Ordering::SeqCst); diff --git a/proxy_tests/proxy/ffi.rs b/proxy_tests/proxy/ffi.rs index c9edca2a02f..ea1c12c57c6 100644 --- a/proxy_tests/proxy/ffi.rs +++ b/proxy_tests/proxy/ffi.rs @@ -25,6 +25,11 @@ fn test_tuple_of_raw_cpp_ptr() { } let (ptr_v, l, cap) = v.into_raw_parts(); + for i in l..cap { + let v = ptr_v.add(i); + (*v).ptr = std::ptr::null_mut(); + (*v).type_ = RawCppPtrTypeImpl::None.into(); + } assert_ne!(l, cap); let cpp_ptr_tp = RawCppPtrTuple { inner: ptr_v, @@ -52,6 +57,10 @@ fn test_array_of_raw_cpp_ptr() { } let (ptr_v, l, cap) = v.into_raw_parts(); + for i in l..cap { + let v = ptr_v.add(i); + *v = std::ptr::null_mut(); + } assert_ne!(l, cap); let cpp_ptr_arr = RawCppPtrArr { inner: ptr_v, @@ -69,22 +78,22 @@ fn test_carray_of_raw_cpp_ptr() { init_global_ffi_helper_set(); let helper = get_engine_store_server_helper(); - const len: usize = 10; - let mut v: Vec = vec![]; + const LEN: usize = 10; + let mut v: [RawVoidPtr; LEN] = [std::ptr::null_mut(); LEN]; - for i in 0..len { + for i in 0..LEN { + let i = i as usize; let s = format!("s{}", i); let raw_cpp_ptr = 
(helper.fn_gen_cpp_string.into_inner())(s.as_bytes().into()); let raw_void_ptr = raw_cpp_ptr.into_raw(); - v.push(raw_void_ptr); + v[i] = raw_void_ptr; } - let (pv1, l, cap) = v.into_raw_parts(); - let pv1 = pv1 as RawVoidPtr; + let pv1 = Box::into_raw(Box::new(v)); (helper.fn_gc_raw_cpp_ptr_carr.into_inner())( - pv1, + pv1 as RawVoidPtr, RawCppPtrTypeImpl::String.into(), - cap as u64, + LEN as u64, ); } } From 8e6e348505e7f1f7b5e023c00b30f90e8d1b4084 Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 10 Jan 2023 14:28:23 +0800 Subject: [PATCH 072/115] raftstore-v2: add waterfall metrics (#14029) ref tikv/tikv#12842 - add water metrics - fix potential panic when destroying a peer - fix incorrect store size Signed-off-by: Jay Lee --- components/engine_rocks/src/misc.rs | 4 +- components/raftstore-v2/src/batch/store.rs | 14 +- .../operation/command/admin/compact_log.rs | 10 +- .../raftstore-v2/src/operation/command/mod.rs | 71 ++++++++- components/raftstore-v2/src/operation/life.rs | 2 + .../src/operation/ready/apply_trace.rs | 5 + .../raftstore-v2/src/operation/ready/mod.rs | 144 +++++++++++++++++- components/raftstore-v2/src/raft/apply.rs | 15 +- .../src/router/response_channel.rs | 36 +++-- components/raftstore/src/lib.rs | 1 + .../raftstore/src/store/async_io/write.rs | 6 +- components/raftstore/src/store/fsm/apply.rs | 28 ++-- components/raftstore/src/store/fsm/peer.rs | 19 +-- .../raftstore/src/store/local_metrics.rs | 4 +- components/raftstore/src/store/msg.rs | 44 ++++-- components/raftstore/src/store/peer.rs | 24 +-- 16 files changed, 350 insertions(+), 77 deletions(-) diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index 55546869272..e339facaac4 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -2,7 +2,7 @@ use engine_traits::{ CfNamesExt, DeleteStrategy, ImportExt, IterOptions, Iterable, Iterator, MiscExt, Mutable, - Range, Result, SstWriter, SstWriterBuilder, WriteBatch, 
WriteBatchExt, ALL_CFS, + Range, Result, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, }; use rocksdb::Range as RocksRange; use tikv_util::{box_try, keybuilder::KeyBuilder}; @@ -258,7 +258,7 @@ impl MiscExt for RocksEngine { fn get_engine_used_size(&self) -> Result { let mut used_size: u64 = 0; - for cf in ALL_CFS { + for cf in self.cf_names() { let handle = util::get_cf_handle(self.as_inner(), cf)?; used_size += util::get_engine_cf_used_size(self.as_inner(), handle); } diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 621f826619b..6183778c369 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -36,7 +36,7 @@ use tikv_util::{ config::{Tracker, VersionTrack}, log::SlogFormat, sys::SysQuota, - time::Instant as TiInstant, + time::{duration_to_sec, Instant as TiInstant}, timer::SteadyTimer, worker::{LazyWorker, Scheduler, Worker}, yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}, @@ -122,6 +122,7 @@ struct StorePoller { /// Buffers to hold in-coming messages. store_msg_buf: Vec, peer_msg_buf: Vec, + timer: tikv_util::time::Instant, /// These fields controls the timing of flushing messages generated by /// FSMs. 
last_flush_time: TiInstant, @@ -135,6 +136,7 @@ impl StorePoller { cfg_tracker, store_msg_buf: Vec::new(), peer_msg_buf: Vec::new(), + timer: tikv_util::time::Instant::now(), last_flush_time: TiInstant::now(), need_flush_events: false, } @@ -185,6 +187,8 @@ impl PollHandler Option { @@ -234,7 +238,13 @@ impl PollHandler>>]) {} + fn end(&mut self, _batch: &mut [Option>>]) { + let dur = self.timer.saturating_elapsed(); + self.poll_ctx + .raft_metrics + .process_ready + .observe(duration_to_sec(dur)); + } fn pause(&mut self) { if self.poll_ctx.trans.need_flush() { diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index 39cf02de775..a4983b28a47 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -303,15 +303,21 @@ impl Peer { } } + pub fn has_pending_tombstone_tablets(&self) -> bool { + !self + .compact_log_context() + .tombstone_tablets_wait_index + .is_empty() + } + #[inline] pub fn record_tombstone_tablet_for_destroy( &mut self, ctx: &StoreContext, task: &mut WriteTask, ) { - let compact_log_context = self.compact_log_context_mut(); assert!( - compact_log_context.tombstone_tablets_wait_index.is_empty(), + !self.has_pending_tombstone_tablets(), "{} all tombstone should be cleared before being destroyed.", SlogFormat(&self.logger) ); diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index a6ab227d402..047fe026ffe 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -18,7 +18,7 @@ use std::{mem, time::Duration}; -use engine_traits::{KvEngine, RaftEngine, WriteBatch, WriteOptions}; +use engine_traits::{KvEngine, PerfContext, RaftEngine, WriteBatch, WriteOptions}; use kvproto::raft_cmdpb::{ AdminCmdType, CmdType, RaftCmdRequest, 
RaftCmdResponse, RaftRequestHeader, }; @@ -32,8 +32,8 @@ use raftstore::{ apply::{self, APPLY_WB_SHRINK_SIZE, SHRINK_PENDING_CMD_QUEUE_CAP}, Proposal, }, - local_metrics::RaftMetrics, - metrics::APPLY_TASK_WAIT_TIME_HISTOGRAM, + local_metrics::{RaftMetrics, TimeTracker}, + metrics::{APPLY_TASK_WAIT_TIME_HISTOGRAM, APPLY_TIME_HISTOGRAM}, msg::ErrorCallback, util, Config, WriteCallback, }, @@ -221,12 +221,35 @@ impl Peer { } proposal.must_pass_epoch_check = self.applied_to_current_term(); proposal.propose_time = Some(*ctx.current_time.get_or_insert_with(monotonic_raw_now)); + self.report_batch_wait_duration(ctx, &proposal.cb); self.proposals_mut().push(proposal); self.set_has_ready(); } + fn report_batch_wait_duration( + &self, + ctx: &mut StoreContext, + ch: &Vec, + ) { + if !ctx.raft_metrics.waterfall_metrics || ch.is_empty() { + return; + } + let now = std::time::Instant::now(); + for c in ch { + for tracker in c.write_trackers() { + tracker.observe(now, &ctx.raft_metrics.wf_batch_wait, |t| { + &mut t.metrics.wf_batch_wait_nanos + }); + } + } + } + #[inline] - pub fn schedule_apply_committed_entries(&mut self, committed_entries: Vec) { + pub fn schedule_apply_committed_entries( + &mut self, + ctx: &mut StoreContext, + committed_entries: Vec, + ) { if committed_entries.is_empty() { return; } @@ -246,6 +269,7 @@ impl Peer { } else { entry_and_proposals = committed_entries.into_iter().map(|e| (e, vec![])).collect(); } + self.report_store_time_duration(ctx, &mut entry_and_proposals); // Unlike v1, v2 doesn't need to persist commit index and commit term. 
The // point of persist commit index/term of raft apply state is to recover commit // index when the writes to raft engine is lost but writes to kv engine is @@ -265,6 +289,26 @@ impl Peer { .send(ApplyTask::CommittedEntries(apply)); } + #[inline] + fn report_store_time_duration( + &mut self, + ctx: &mut StoreContext, + entry_and_proposals: &mut [(Entry, Vec)], + ) { + let now = std::time::Instant::now(); + for (_, chs) in entry_and_proposals { + for tracker in chs.write_trackers_mut() { + tracker.observe(now, &ctx.raft_metrics.store_time, |t| { + t.metrics.write_instant = Some(now); + &mut t.metrics.store_time_nanos + }); + if let TimeTracker::Instant(t) = tracker { + *t = now; + } + } + } + } + pub fn on_apply_res(&mut self, ctx: &mut StoreContext, apply_res: ApplyRes) { if !self.serving() { return; @@ -625,9 +669,11 @@ impl Apply { } control.need_flush = false; let flush_state = self.flush_state().clone(); - if let Some(wb) = &mut self.write_batch && !wb.is_empty() { + if let Some(wb) = &self.write_batch && !wb.is_empty() { + self.perf_context().start_observe(); let mut write_opt = WriteOptions::default(); write_opt.set_disable_wal(true); + let wb = self.write_batch.as_mut().unwrap(); if let Err(e) = wb.write_callback_opt(&write_opt, || { flush_state.set_applied_index(index); }) { @@ -640,11 +686,26 @@ impl Apply { } else { self.write_batch.take(); } + let tokens: Vec<_> = self + .callbacks_mut() + .iter() + .flat_map(|(v, _)| { + v.write_trackers() + .flat_map(|t| t.as_tracker_token().cloned()) + }) + .collect(); + self.perf_context().report_metrics(&tokens); } let callbacks = self.callbacks_mut(); + let now = std::time::Instant::now(); + let apply_time = APPLY_TIME_HISTOGRAM.local(); for (ch, resp) in callbacks.drain(..) 
{ + for tracker in ch.write_trackers() { + tracker.observe(now, &apply_time, |t| &mut t.metrics.apply_time_nanos); + } ch.set_result(resp); } + apply_time.flush(); if callbacks.capacity() > SHRINK_PENDING_CMD_QUEUE_CAP { callbacks.shrink_to(SHRINK_PENDING_CMD_QUEUE_CAP); } diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index f82fb1e8386..88646f06b59 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -286,6 +286,8 @@ impl Peer { let entry_storage = self.storage().entry_storage(); // TODO: check actual split index instead of commit index. entry_storage.applied_index() != entry_storage.commit_index() + // Wait for critical commands like split. + || self.has_pending_tombstone_tablets() } /// Start the destroy progress. It will write `Tombstone` state diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index e5b1c169c5b..5b88a6ba94d 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -473,6 +473,11 @@ impl Storage { } pub fn record_apply_trace(&mut self, write_task: &mut WriteTask) { + let trace = self.apply_trace(); + // Maybe tablet index can be different? 
+ if trace.persisted_applied > trace.admin.flushed { + return; + } let region_id = self.region().get_id(); let raft_engine = self.entry_storage().raft_engine(); let tablet_index = self.tablet_index(); diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 2fdc228ea2f..3f559feff8b 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -30,7 +30,10 @@ use protobuf::Message as _; use raft::{eraftpb, prelude::MessageType, Ready, StateRole, INVALID_ID}; use raftstore::{ coprocessor::{RegionChangeEvent, RoleChange}, - store::{needs_evict_entry_cache, util, FetchedLogs, ReadProgress, Transport, WriteTask}, + store::{ + needs_evict_entry_cache, util, FetchedLogs, ReadProgress, Transport, WriteCallback, + WriteTask, + }, }; use slog::{debug, error, info, trace, warn}; use tikv_util::{ @@ -205,10 +208,14 @@ impl Peer { self.add_peer_heartbeat(from_peer.get_id(), Instant::now()); } self.insert_peer_cache(msg.take_from_peer()); + let pre_committed_index = self.raft_group().raft.raft_log.committed; if msg.get_message().get_msg_type() == MessageType::MsgTransferLeader { self.on_transfer_leader_msg(ctx, msg.get_message(), msg.disk_usage) } else if let Err(e) = self.raft_group_mut().step(msg.take_message()) { error!(self.logger, "raft step error"; "err" => ?e); + } else { + let committed_index = self.raft_group().raft.raft_log.committed; + self.report_commit_log_duration(ctx, pre_committed_index, committed_index); } self.set_has_ready(); @@ -317,6 +324,56 @@ impl Peer { } } + /// Send a message. + /// + /// The message is pushed into the send buffer, it may not be sent out until + /// transport is flushed explicitly. 
+ fn send_raft_message_on_leader( + &mut self, + ctx: &mut StoreContext, + msg: RaftMessage, + ) { + let message = msg.get_message(); + if message.get_msg_type() == MessageType::MsgAppend + && let Some(fe) = message.get_entries().first() + && let Some(le) = message.get_entries().last() + { + let last = (le.get_term(), le.get_index()); + let first = (fe.get_term(), fe.get_index()); + let now = Instant::now(); + let queue = self.proposals_mut().queue_mut(); + // Proposals are batched up, so it will liely hit after one or two steps. + for p in queue.iter_mut().rev() { + if p.sent { + break; + } + let cur = (p.term, p.index); + if cur > last { + continue; + } + if cur < first { + break; + } + for tracker in p.cb.write_trackers() { + tracker.observe(now, &ctx.raft_metrics.wf_send_proposal, |t| { + &mut t.metrics.wf_send_proposal_nanos + }); + } + p.sent = true; + } + } + if message.get_msg_type() == MessageType::MsgTimeoutNow { + // After a leader transfer procedure is triggered, the lease for + // the old leader may be expired earlier than usual, since a new leader + // may be elected and the old leader doesn't step down due to + // network partition from the new leader. + // For lease safety during leader transfer, transit `leader_lease` + // to suspect. + self.leader_lease_mut().suspect(monotonic_raw_now()); + } + self.send_raft_message(ctx, msg) + } + fn handle_raft_committed_entries( &mut self, ctx: &mut crate::batch::StoreContext, @@ -357,7 +414,7 @@ impl Peer { // Compact all cached entries instead of half evict. 
self.entry_storage_mut().evict_entry_cache(false); } - self.schedule_apply_committed_entries(committed_entries); + self.schedule_apply_committed_entries(ctx, committed_entries); if self.is_leader() && commit_to_current_term && !self.proposal_control().has_uncommitted_admin() @@ -423,7 +480,7 @@ impl Peer { debug_assert!(self.is_leader()); for msg in ready.take_messages() { if let Some(msg) = self.build_raft_message(msg) { - self.send_raft_message(ctx, msg); + self.send_raft_message_on_leader(ctx, msg); } } } @@ -445,6 +502,7 @@ impl Peer { let ready_number = ready.number(); let mut write_task = WriteTask::new(self.region_id(), self.peer_id(), ready_number); + self.report_send_to_queue_duration(ctx, &mut write_task, ready.entries()); let prev_persisted = self.storage().apply_trace().persisted_apply_index(); self.merge_state_changes_to(&mut write_task); self.storage_mut() @@ -519,8 +577,13 @@ impl Peer { } let persisted_number = self.async_writer.persisted_number(); + let pre_persisted_index = self.persisted_index(); + let pre_committed_index = self.raft_group().raft.raft_log.committed; self.raft_group_mut().on_persist_ready(persisted_number); let persisted_index = self.persisted_index(); + let committed_index = self.raft_group().raft.raft_log.committed; + self.report_persist_log_duration(ctx, pre_persisted_index, persisted_index); + self.report_commit_log_duration(ctx, pre_committed_index, committed_index); // The apply snapshot process order would be: // - Get the snapshot from the ready // - Wait for async writer to load this tablet @@ -543,6 +606,81 @@ impl Peer { } } + #[inline] + fn report_persist_log_duration( + &self, + ctx: &mut StoreContext, + from: u64, + to: u64, + ) { + if !ctx.cfg.waterfall_metrics || self.proposals().is_empty() || from >= to { + return; + } + let now = Instant::now(); + for i in from + 1..to { + if let Some((term, trackers)) = self.proposals().find_trackers(i) { + if self.entry_storage().term(i).map_or(false, |t| t == term) { + for 
tracker in trackers { + tracker.observe(now, &ctx.raft_metrics.wf_persist_log, |t| { + &mut t.metrics.wf_persist_log_nanos + }); + } + } + } + } + } + + #[inline] + fn report_commit_log_duration(&self, ctx: &mut StoreContext, from: u64, to: u64) { + if !ctx.cfg.waterfall_metrics || self.proposals().is_empty() || from >= to { + return; + } + let now = Instant::now(); + for i in from + 1..to { + if let Some((term, trackers)) = self.proposals().find_trackers(i) { + if self.entry_storage().term(i).map_or(false, |t| t == term) { + let commit_persisted = i <= self.persisted_index(); + let hist = if commit_persisted { + &ctx.raft_metrics.wf_commit_log + } else { + &ctx.raft_metrics.wf_commit_not_persist_log + }; + for tracker in trackers { + tracker.observe(now, hist, |t| { + t.metrics.commit_not_persisted = !commit_persisted; + &mut t.metrics.wf_commit_log_nanos + }); + } + } + } + } + } + + #[inline] + fn report_send_to_queue_duration( + &mut self, + ctx: &mut StoreContext, + write_task: &mut WriteTask, + entries: &[raft::eraftpb::Entry], + ) { + if !ctx.cfg.waterfall_metrics || self.proposals().is_empty() { + return; + } + let now = Instant::now(); + for entry in entries { + if let Some((term, trackers)) = self.proposals().find_trackers(entry.index) { + if entry.term == term { + for tracker in trackers { + write_task.trackers.push(*tracker); + tracker.observe(now, &ctx.raft_metrics.wf_send_to_queue, |t| { + &mut t.metrics.wf_send_to_queue_nanos + }); + } + } + } + } + } + #[cfg(feature = "testexport")] pub fn on_wait_flush(&mut self, ch: crate::router::FlushChannel) { self.async_writer.subscirbe_flush(ch); diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 7a5b03120b1..6d1faa98cbf 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -2,7 +2,9 @@ use std::{mem, sync::Arc}; -use engine_traits::{FlushState, KvEngine, TabletRegistry, WriteBatch, DATA_CFS_LEN}; +use 
engine_traits::{ + FlushState, KvEngine, PerfContextKind, TabletRegistry, WriteBatch, DATA_CFS_LEN, +}; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; use raftstore::store::{ fsm::{apply::DEFAULT_APPLY_WB_SIZE, ApplyMetrics}, @@ -20,6 +22,7 @@ use crate::{ pub struct Apply { peer: metapb::Peer, tablet: EK, + perf_context: EK::PerfContext, pub write_batch: Option, /// A buffer for encoding key. pub key_buffer: Vec, @@ -77,9 +80,12 @@ impl Apply { assert_ne!(applied_term, 0, "{}", SlogFormat(&logger)); let applied_index = flush_state.applied_index(); assert_ne!(applied_index, 0, "{}", SlogFormat(&logger)); + let tablet = remote_tablet.latest().unwrap().clone(); + let perf_context = tablet.get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply); Apply { peer, - tablet: remote_tablet.latest().unwrap().clone(), + tablet, + perf_context, write_batch: None, callbacks: vec![], flow_control: ApplyFlowControl::new(cfg), @@ -174,6 +180,11 @@ impl Apply { &self.tablet } + #[inline] + pub fn perf_context(&mut self) -> &mut EK::PerfContext { + &mut self.perf_context + } + #[inline] pub fn peer(&self) -> &metapb::Peer { &self.peer diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index 2cb75acccfc..eeeb13f6555 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -30,8 +30,7 @@ use raftstore::store::{ local_metrics::TimeTracker, msg::ErrorCallback, region_meta::RegionMeta, ReadCallback, WriteCallback, }; -use smallvec::SmallVec; -use tracker::TrackerToken; +use tracker::{TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; /// A struct allows to watch and notify specific events. /// @@ -54,6 +53,7 @@ struct EventCore { before_set: UnsafeCell>>, // Waker can be changed, need to use `AtomicWaker` to guarantee no data race. 
waker: AtomicWaker, + tracker: UnsafeCell, } unsafe impl Send for EventCore {} @@ -244,16 +244,19 @@ impl BaseChannel { /// Creates a pair of channel and subscriber. #[inline] pub fn pair() -> (Self, BaseSubscriber) { - Self::with_mask(u32::MAX) + let tracker_token = tracker::get_tls_tracker_token(); + Self::with_mask(u32::MAX, TimeTracker::Tracker(tracker_token)) } - fn with_mask(mask: u32) -> (Self, BaseSubscriber) { + #[inline] + fn with_mask(mask: u32, tracker: TimeTracker) -> (Self, BaseSubscriber) { let core: Arc> = Arc::new(EventCore { event: AtomicU64::new(0), res: UnsafeCell::new(None), event_mask: mask, before_set: UnsafeCell::new(None), waker: AtomicWaker::new(), + tracker: UnsafeCell::new(tracker), }); (Self { core: core.clone() }, BaseSubscriber { core }) } @@ -449,7 +452,17 @@ impl CmdResChannelBuilder { #[inline] pub fn build(self) -> (CmdResChannel, CmdResSubscriber) { - let (c, s) = CmdResChannel::with_mask(self.event_mask); + let tracker_token = tracker::get_tls_tracker_token(); + let now = std::time::Instant::now(); + let tracker = if tracker_token == INVALID_TRACKER_TOKEN { + TimeTracker::Instant(now) + } else { + GLOBAL_TRACKERS.with_tracker(tracker_token, |tracker| { + tracker.metrics.write_instant = Some(now); + }); + TimeTracker::Tracker(tracker_token) + }; + let (c, s) = CmdResChannel::with_mask(self.event_mask, tracker); if let Some(f) = self.before_set { unsafe { *c.core.before_set.get() = Some(f); @@ -493,12 +506,15 @@ impl WriteCallback for CmdResChannel { self.core.notify_event(Self::COMMITTED_EVENT); } - fn write_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { - None + type TimeTrackerListRef<'a> = &'a [TimeTracker]; + #[inline] + fn write_trackers(&self) -> Self::TimeTrackerListRef<'_> { + std::slice::from_ref(unsafe { &*self.core.tracker.get() }) } - fn write_trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>> { - None + type TimeTrackerListMut<'a> = &'a mut [TimeTracker]; + fn write_trackers_mut(&mut self) 
-> Self::TimeTrackerListMut<'_> { + std::slice::from_mut(unsafe { &mut *self.core.tracker.get() }) } // TODO: support executing hooks inside setting result. @@ -577,7 +593,7 @@ impl ReadCallback for QueryResChannel { } fn read_tracker(&self) -> Option<&TrackerToken> { - None + unsafe { (*self.core.tracker.get()).as_tracker_token() } } } diff --git a/components/raftstore/src/lib.rs b/components/raftstore/src/lib.rs index 6104ae7b7cf..1db5f79d226 100644 --- a/components/raftstore/src/lib.rs +++ b/components/raftstore/src/lib.rs @@ -8,6 +8,7 @@ #![feature(hash_drain_filter)] #![feature(let_chains)] #![feature(assert_matches)] +#![feature(type_alias_impl_trait)] #![recursion_limit = "256"] #[cfg(test)] diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 817ff576f67..7016d0ab606 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -718,7 +718,11 @@ where .batch .tasks .iter() - .flat_map(|task| task.trackers.iter().flat_map(|t| t.as_tracker_token())) + .flat_map(|task| { + task.trackers + .iter() + .flat_map(|t| t.as_tracker_token().cloned()) + }) .collect(); self.perf_context.report_metrics(&trackers); write_raft_time = duration_to_sec(now.saturating_elapsed()); diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index ec2d7bf72a8..cab6ae0ffe8 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -582,8 +582,7 @@ where .cb_batch .iter() .flat_map(|(cb, _)| cb.write_trackers()) - .flat_map(|trackers| trackers.iter().map(|t| t.as_tracker_token())) - .flatten() + .flat_map(|trackers| trackers.as_tracker_token().cloned()) .collect(); self.perf_context.report_metrics(&trackers); self.sync_log_hint = false; @@ -620,7 +619,7 @@ where // Invoke callbacks let now = std::time::Instant::now(); for (cb, resp) in cb_batch.drain(..) 
{ - for tracker in cb.write_trackers().iter().flat_map(|v| *v) { + for tracker in cb.write_trackers() { tracker.observe(now, &self.apply_time, |t| &mut t.metrics.apply_time_nanos); } cb.invoke_with_response(resp); @@ -3333,15 +3332,13 @@ impl Apply { pub fn on_schedule(&mut self, metrics: &RaftMetrics) { let now = std::time::Instant::now(); for cb in &mut self.cbs { - if let Some(trackers) = cb.cb.write_trackers_mut() { - for tracker in trackers { - tracker.observe(now, &metrics.store_time, |t| { - t.metrics.write_instant = Some(now); - &mut t.metrics.store_time_nanos - }); - if let TimeTracker::Instant(t) = tracker { - *t = now; - } + for tracker in cb.cb.write_trackers_mut() { + tracker.observe(now, &metrics.store_time, |t| { + t.metrics.write_instant = Some(now); + &mut t.metrics.store_time_nanos + }); + if let TimeTracker::Instant(t) = tracker { + *t = now; } } } @@ -3410,6 +3407,7 @@ pub struct Proposal { /// lease. pub propose_time: Option, pub must_pass_epoch_check: bool, + pub sent: bool, } impl Proposal { @@ -3421,6 +3419,7 @@ impl Proposal { propose_time: None, must_pass_epoch_check: false, is_conf_change: false, + sent: false, } } } @@ -4170,9 +4169,9 @@ where .cbs .iter() .flat_map(|p| p.cb.write_trackers()) - .flat_map(|ts| ts.iter().flat_map(|t| t.as_tracker_token())) + .flat_map(|ts| ts.as_tracker_token()) { - GLOBAL_TRACKERS.with_tracker(tracker, |t| { + GLOBAL_TRACKERS.with_tracker(*tracker, |t| { t.metrics.apply_wait_nanos = apply_wait.as_nanos() as u64; }); } @@ -5082,6 +5081,7 @@ mod tests { cb, propose_time: None, must_pass_epoch_check: false, + sent: true, } } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index abd8fd84771..e302ea6588a 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -524,13 +524,14 @@ where })) }; - let tokens: SmallVec<[TimeTracker; 4]> = cbs + let trackers: SmallVec<[TimeTracker; 4]> = cbs .iter_mut() - 
.filter_map(|cb| cb.write_trackers().map(|t| t[0])) + .flat_map(|cb| cb.write_trackers()) + .cloned() .collect(); - let mut cb = Callback::write_ext( - Box::new(move |resp| { + let cb = Callback::Write { + cb: Box::new(move |resp| { for cb in cbs { let mut cmd_resp = RaftCmdResponse::default(); cmd_resp.set_header(resp.response.get_header().clone()); @@ -539,12 +540,8 @@ where }), proposed_cb, committed_cb, - ); - - if let Some(trackers) = cb.write_trackers_mut() { - *trackers = tokens; - } - + trackers, + }; return Some((req, cb)); } None @@ -5245,7 +5242,7 @@ where if self.ctx.raft_metrics.waterfall_metrics { let now = Instant::now(); - for tracker in cb.write_trackers().iter().flat_map(|v| *v) { + for tracker in cb.write_trackers() { tracker.observe(now, &self.ctx.raft_metrics.wf_batch_wait, |t| { &mut t.metrics.wf_batch_wait_nanos }); diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index 5cfbb645612..c1db17f8cae 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -214,9 +214,9 @@ pub enum TimeTracker { } impl TimeTracker { - pub fn as_tracker_token(&self) -> Option { + pub fn as_tracker_token(&self) -> Option<&TrackerToken> { match self { - TimeTracker::Tracker(tt) => Some(*tt), + TimeTracker::Tracker(tt) => Some(tt), TimeTracker::Instant(_) => None, } } diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 08b0e9367dc..e3fc8530d76 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -225,8 +225,16 @@ pub trait WriteCallback: ErrorCallback { fn notify_proposed(&mut self); fn notify_committed(&mut self); - fn write_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>>; - fn write_trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>>; + + type TimeTrackerListRef<'a>: IntoIterator + where + Self: 'a; + fn write_trackers(&self) -> 
Self::TimeTrackerListRef<'_>; + + type TimeTrackerListMut<'a>: IntoIterator + where + Self: 'a; + fn write_trackers_mut(&mut self) -> Self::TimeTrackerListMut<'_>; fn set_result(self, result: Self::Response); } @@ -276,16 +284,24 @@ impl WriteCallback for Callback { self.invoke_committed(); } + type TimeTrackerListRef<'a> = impl IntoIterator; #[inline] - fn write_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { - let Callback::Write { trackers, .. } = self else { return None; }; - Some(trackers) + fn write_trackers(&self) -> Self::TimeTrackerListRef<'_> { + let trackers = match self { + Callback::Write { trackers, .. } => Some(trackers), + _ => None, + }; + trackers.into_iter().flatten() } + type TimeTrackerListMut<'a> = impl IntoIterator; #[inline] - fn write_trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>> { - let Callback::Write { trackers, .. } = self else { return None; }; - Some(trackers) + fn write_trackers_mut(&mut self) -> Self::TimeTrackerListMut<'_> { + let trackers = match self { + Callback::Write { trackers, .. 
} => Some(trackers), + _ => None, + }; + trackers.into_iter().flatten() } #[inline] @@ -296,7 +312,7 @@ impl WriteCallback for Callback { impl WriteCallback for Vec where - C: WriteCallback, + C: WriteCallback + 'static, C::Response: Clone, { type Response = C::Response; @@ -315,14 +331,16 @@ where } } + type TimeTrackerListRef<'a> = impl Iterator + 'a; #[inline] - fn write_trackers(&self) -> Option<&SmallVec<[TimeTracker; 4]>> { - None + fn write_trackers(&self) -> Self::TimeTrackerListRef<'_> { + self.iter().flat_map(|c| c.write_trackers()) } + type TimeTrackerListMut<'a> = impl Iterator + 'a; #[inline] - fn write_trackers_mut(&mut self) -> Option<&mut SmallVec<[TimeTracker; 4]>> { - None + fn write_trackers_mut(&mut self) -> Self::TimeTrackerListMut<'_> { + self.iter_mut().flat_map(|c| c.write_trackers_mut()) } #[inline] diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 9384a4940c7..347f62dd945 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -70,7 +70,7 @@ use uuid::Uuid; use super::{ cmd_resp, - local_metrics::{RaftMetrics, TimeTracker}, + local_metrics::RaftMetrics, metrics::*, peer_storage::{write_peer_state, CheckApplyingSnapStatus, HandleReadyResult, PeerStorage}, read_queue::{ReadIndexQueue, ReadIndexRequest}, @@ -141,16 +141,16 @@ impl ProposalQueue { /// Find the trackers of given index. /// Caller should check if term is matched before using trackers. 
- fn find_trackers(&self, index: u64) -> Option<(u64, &SmallVec<[TimeTracker; 4]>)> { + pub fn find_trackers(&self, index: u64) -> Option<(u64, C::TimeTrackerListRef<'_>)> { self.queue .binary_search_by_key(&index, |p: &Proposal<_>| p.index) .ok() - .and_then(|i| { - self.queue[i] - .cb - .write_trackers() - .map(|ts| (self.queue[i].term, ts)) - }) + .map(|i| (self.queue[i].term, self.queue[i].cb.write_trackers())) + } + + #[inline] + pub fn queue_mut(&mut self) -> &mut VecDeque> { + &mut self.queue } pub fn find_propose_time(&self, term: u64, index: u64) -> Option { @@ -1825,7 +1825,7 @@ where { let proposal = &self.proposals.queue[idx]; if term == proposal.term { - for tracker in proposal.cb.write_trackers().iter().flat_map(|v| v.iter()) { + for tracker in proposal.cb.write_trackers() { tracker.observe(std_now, &ctx.raft_metrics.wf_send_proposal, |t| { &mut t.metrics.wf_send_proposal_nanos }); @@ -2767,8 +2767,8 @@ where for entry in ready.entries() { if let Some((term, times)) = self.proposals.find_trackers(entry.get_index()) { if entry.term == term { - trackers.extend_from_slice(times); for tracker in times { + trackers.push(*tracker); tracker.observe(now, &ctx.raft_metrics.wf_send_to_queue, |t| { &mut t.metrics.wf_send_to_queue_nanos }); @@ -3687,6 +3687,7 @@ where cb, propose_time: None, must_pass_epoch_check: has_applied_to_current_term, + sent: false, }; if let Some(cmd_type) = req_admin_cmd_type { self.cmd_epoch_checker @@ -4018,6 +4019,7 @@ where cb: Callback::None, propose_time: Some(now), must_pass_epoch_check: false, + sent: false, }; self.post_propose(poll_ctx, p); } @@ -5941,6 +5943,7 @@ mod tests { cb: Callback::write(Box::new(|_| {})), propose_time: Some(u64_to_timespec(index)), must_pass_epoch_check: false, + sent: false, }); }; for index in 1..=100 { @@ -6014,6 +6017,7 @@ mod tests { is_conf_change: false, propose_time: None, must_pass_epoch_check: false, + sent: false, }); } for (index, term) in entries { From 
1f91b6e3de8b3f1602a9c1493c07cbf567513307 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Wed, 11 Jan 2023 10:53:25 +0800 Subject: [PATCH 073/115] [Cloud]Refine FFI interface and mock PageStorage (#255) --- Cargo.lock | 1 + engine_store_ffi/Cargo.toml | 1 + engine_store_ffi/src/interfaces.rs | 23 ++- engine_store_ffi/src/lib.rs | 14 +- engine_store_ffi/src/observer.rs | 7 +- engine_store_ffi/src/ps_engine.rs | 5 +- engine_tiflash/Cargo.toml | 1 + engine_tiflash/src/engine.rs | 34 ++-- engine_tiflash/src/lib.rs | 8 +- new-mock-engine-store/src/lib.rs | 2 +- new-mock-engine-store/src/mock_cluster.rs | 18 ++- .../src/mock_page_storage.rs | 149 +++++++++++++++--- new-mock-engine-store/src/mock_store.rs | 43 +++-- proxy_scripts/ci_check.sh | 1 + proxy_server/Cargo.toml | 2 + proxy_server/src/run.rs | 1 + proxy_tests/Cargo.toml | 2 + proxy_tests/proxy/config.rs | 22 +++ proxy_tests/proxy/fast_add_peer.rs | 2 +- proxy_tests/proxy/proxy.rs | 10 +- proxy_tests/proxy/region.rs | 6 +- raftstore-proxy/Cargo.toml | 2 +- .../ffi/src/RaftStoreProxyFFI/@version | 2 +- .../ffi/src/RaftStoreProxyFFI/ProxyFFI.h | 19 +-- 24 files changed, 263 insertions(+), 112 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f886d100ff6..443f74b80e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4437,6 +4437,7 @@ dependencies = [ "engine_rocks", "engine_rocks_helper", "engine_store_ffi", + "engine_tiflash", "engine_traits", "error_code", "fail", diff --git a/engine_store_ffi/Cargo.toml b/engine_store_ffi/Cargo.toml index dafec6ab7b2..f24a69afae2 100644 --- a/engine_store_ffi/Cargo.toml +++ b/engine_store_ffi/Cargo.toml @@ -22,6 +22,7 @@ test-engines-rocksdb = [ test-engines-panic = [ "engine_test/test-engines-panic", ] +enable-pagestorage = [] cloud-aws = ["sst_importer/cloud-aws"] cloud-gcp = ["sst_importer/cloud-gcp"] diff --git a/engine_store_ffi/src/interfaces.rs b/engine_store_ffi/src/interfaces.rs index 97a3161b2e5..24e7db30543 100644 --- a/engine_store_ffi/src/interfaces.rs +++ 
b/engine_store_ffi/src/interfaces.rs @@ -145,12 +145,6 @@ pub mod root { } #[repr(C)] #[derive(Debug)] - pub struct PageWithView { - pub inner: root::DB::RawCppPtr, - pub view: root::DB::BaseBuffView, - } - #[repr(C)] - #[derive(Debug)] pub struct PageAndCppStrWithView { pub page: root::DB::RawCppPtr, pub key: root::DB::RawCppPtr, @@ -159,8 +153,8 @@ pub mod root { } #[repr(C)] #[derive(Debug)] - pub struct PageAndCppStrWithViewVec { - pub inner: *mut root::DB::PageAndCppStrWithView, + pub struct RawCppPtrCarr { + pub inner: root::DB::RawVoidPtr, pub len: u64, pub type_: root::DB::RawCppPtrType, } @@ -426,8 +420,11 @@ pub mod root { arg5: u64, ) -> u8, >, - pub fn_create_write_batch: - ::std::option::Option root::DB::RawCppPtr>, + pub fn_create_write_batch: ::std::option::Option< + unsafe extern "C" fn( + arg1: *const root::DB::EngineStoreServerWrap, + ) -> root::DB::RawCppPtr, + >, pub fn_write_batch_put_page: ::std::option::Option< unsafe extern "C" fn( arg1: root::DB::RawVoidPtr, @@ -457,14 +454,14 @@ pub mod root { unsafe extern "C" fn( arg1: *const root::DB::EngineStoreServerWrap, arg2: root::DB::BaseBuffView, - ) -> root::DB::PageWithView, + ) -> root::DB::CppStrWithView, >, pub fn_handle_scan_page: ::std::option::Option< unsafe extern "C" fn( arg1: *const root::DB::EngineStoreServerWrap, arg2: root::DB::BaseBuffView, arg3: root::DB::BaseBuffView, - ) -> root::DB::PageAndCppStrWithViewVec, + ) -> root::DB::RawCppPtrCarr, >, pub fn_handle_purge_pagestorage: ::std::option::Option< unsafe extern "C" fn(arg1: *const root::DB::EngineStoreServerWrap), @@ -583,7 +580,7 @@ pub mod root { ) -> root::DB::FastAddPeerRes, >, } - pub const RAFT_STORE_PROXY_VERSION: u64 = 10253455389063462714; + pub const RAFT_STORE_PROXY_VERSION: u64 = 17394545035928865111; pub const RAFT_STORE_PROXY_MAGIC_NUMBER: u32 = 324508639; } } diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index d908c52dce1..a1af1bf3fe0 100644 --- a/engine_store_ffi/src/lib.rs +++ 
b/engine_store_ffi/src/lib.rs @@ -38,9 +38,9 @@ pub use self::interfaces::root::DB::{ BaseBuffView, ColumnFamilyType, CppStrVecView, CppStrWithView, EngineStoreApplyRes, EngineStoreServerHelper, EngineStoreServerStatus, FastAddPeerRes, FastAddPeerStatus, FileEncryptionRes, FsStats, HttpRequestRes, HttpRequestStatus, KVGetStatus, - PageAndCppStrWithView, PageAndCppStrWithViewVec, PageWithView, RaftCmdHeader, RaftProxyStatus, - RaftStoreProxyFFIHelper, RawCppPtr, RawCppPtrArr, RawCppPtrTuple, RawCppStringPtr, RawVoidPtr, - SSTReaderPtr, SpecialCppPtrType, StoreStats, WriteCmdType, WriteCmdsView, + PageAndCppStrWithView, RaftCmdHeader, RaftProxyStatus, RaftStoreProxyFFIHelper, RawCppPtr, + RawCppPtrArr, RawCppPtrCarr, RawCppPtrTuple, RawCppStringPtr, RawVoidPtr, SSTReaderPtr, + SpecialCppPtrType, StoreStats, WriteCmdType, WriteCmdsView, }; use self::interfaces::root::DB::{ ConstRawVoidPtr, RaftStoreProxyPtr, RawCppPtrType, RawRustPtr, SSTReaderInterfaces, SSTView, @@ -465,7 +465,7 @@ impl Drop for RawCppPtrArr { } } -impl Drop for PageAndCppStrWithViewVec { +impl Drop for RawCppPtrCarr { fn drop(&mut self) { if self.inner != std::ptr::null_mut() { let helper = get_engine_store_server_helper(); @@ -630,7 +630,7 @@ impl EngineStoreServerHelper { pub fn create_write_batch(&self) -> RawCppPtr { debug_assert!(self.fn_create_write_batch.is_some()); - unsafe { (self.fn_create_write_batch.into_inner())() } + unsafe { (self.fn_create_write_batch.into_inner())(self.inner) } } pub fn write_batch_put_page(&self, wb: RawVoidPtr, page_id: BaseBuffView, page: BaseBuffView) { @@ -668,7 +668,7 @@ impl EngineStoreServerHelper { unsafe { (self.fn_consume_write_batch.into_inner())(self.inner, wb) } } - pub fn read_page(&self, page_id: BaseBuffView) -> PageWithView { + pub fn read_page(&self, page_id: BaseBuffView) -> CppStrWithView { debug_assert!(self.fn_handle_read_page.is_some()); unsafe { (self.fn_handle_read_page.into_inner())(self.inner, page_id) } } @@ -677,7 +677,7 @@ impl 
EngineStoreServerHelper { &self, start_page_id: BaseBuffView, end_page_id: BaseBuffView, - ) -> PageAndCppStrWithViewVec { + ) -> RawCppPtrCarr { debug_assert!(self.fn_handle_scan_page.is_some()); unsafe { (self.fn_handle_scan_page.into_inner())(self.inner, start_page_id, end_page_id) } } diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 03adae1becb..6da35fb7887 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -47,8 +47,8 @@ use yatp::{ use crate::{ gen_engine_store_server_helper, interfaces::root::{DB as ffi_interfaces, DB::EngineStoreApplyRes}, - name_to_cf, ColumnFamilyType, EngineStoreServerHelper, RaftCmdHeader, RawCppPtr, TiFlashEngine, - WriteCmdType, WriteCmds, CF_LOCK, + name_to_cf, ColumnFamilyType, EngineStoreServerHelper, PageAndCppStrWithView, RaftCmdHeader, + RawCppPtr, TiFlashEngine, WriteCmdType, WriteCmds, CF_LOCK, }; macro_rules! fatal { @@ -151,8 +151,9 @@ impl engine_tiflash::FFIHubInner for TiFlashFFIHub { let values = self .engine_store_server_helper .scan_page(start_page_id.into(), end_page_id.into()); + let arr = values.inner as *mut PageAndCppStrWithView; for i in 0..values.len { - let value = unsafe { &*values.inner.offset(i as isize) }; + let value = unsafe { &*arr.offset(i as isize) }; if value.page_view.len != 0 { f( &value.key_view.to_slice().to_vec(), diff --git a/engine_store_ffi/src/ps_engine.rs b/engine_store_ffi/src/ps_engine.rs index 386d5e88d6a..bd727c013d5 100644 --- a/engine_store_ffi/src/ps_engine.rs +++ b/engine_store_ffi/src/ps_engine.rs @@ -23,7 +23,7 @@ use raft::eraftpb::Entry; use tikv_util::{box_try, info}; use tracker::TrackerToken; -use crate::{gen_engine_store_server_helper, RawCppPtr}; +use crate::{gen_engine_store_server_helper, PageAndCppStrWithView, RawCppPtr}; // 1. STORE_IDENT 0 // 2. 
PREPARE_BOOTSTRAP 1 @@ -264,8 +264,9 @@ impl PSEngine { { let helper = gen_engine_store_server_helper(self.engine_store_server_helper); let values = helper.scan_page(start_key.into(), end_key.into()); + let arr = values.inner as *mut PageAndCppStrWithView; for i in 0..values.len { - let value = unsafe { &*values.inner.offset(i as isize) }; + let value = unsafe { &*arr.offset(i as isize) }; if value.page_view.len != 0 { if !f( &value.key_view.to_slice().to_vec(), diff --git a/engine_tiflash/Cargo.toml b/engine_tiflash/Cargo.toml index 09e55f8d60c..d06c6ba75f3 100644 --- a/engine_tiflash/Cargo.toml +++ b/engine_tiflash/Cargo.toml @@ -10,6 +10,7 @@ portable = ["rocksdb/portable"] sse = ["rocksdb/sse"] failpoints = ["fail/failpoints"] testexport = [] +enable-pagestorage = [] # Disables runtime checks of invariants required by RocksDB that are redundant # with assertions inside RocksDB itself. This makes it possible to test those diff --git a/engine_tiflash/src/engine.rs b/engine_tiflash/src/engine.rs index dfe590af323..f65ca8200cd 100644 --- a/engine_tiflash/src/engine.rs +++ b/engine_tiflash/src/engine.rs @@ -108,6 +108,10 @@ impl RocksEngine { snap_handle_pool_size: usize, ffi_hub: Option>, ) { + #[cfg(feature = "enable-pagestorage")] + tikv_util::info!("enabled pagestorage"); + #[cfg(not(feature = "enable-pagestorage"))] + tikv_util::info!("disabled pagestorage"); self.engine_store_server_helper = engine_store_server_helper; self.pool_capacity = snap_handle_pool_size; self.pending_applies_count.store(0, Ordering::SeqCst); @@ -216,7 +220,7 @@ impl KvEngine for RocksEngine { impl Iterable for RocksEngine { type Iterator = RocksEngineIterator; - #[cfg(not(any(test, feature = "testexport")))] + #[cfg(feature = "enable-pagestorage")] fn scan( &self, cf: &str, @@ -272,15 +276,15 @@ impl<'a> PartialEq<&'a [u8]> for PsDbVector { } impl Peekable for RocksEngine { - #[cfg(any(test, feature = "testexport"))] + #[cfg(not(feature = "enable-pagestorage"))] type DbVector = 
RocksDbVector; - #[cfg(any(test, feature = "testexport"))] + #[cfg(not(feature = "enable-pagestorage"))] fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { self.rocks.get_value_opt(opts, key) } - #[cfg(any(test, feature = "testexport"))] + #[cfg(not(feature = "enable-pagestorage"))] fn get_value_cf_opt( &self, opts: &ReadOptions, @@ -290,10 +294,10 @@ impl Peekable for RocksEngine { self.rocks.get_value_cf_opt(opts, cf, key) } - #[cfg(not(any(test, feature = "testexport")))] + #[cfg(feature = "enable-pagestorage")] type DbVector = PsDbVector; - #[cfg(not(any(test, feature = "testexport")))] + #[cfg(feature = "enable-pagestorage")] fn get_value_opt(&self, opts: &ReadOptions, key: &[u8]) -> Result> { let result = self.ffi_hub.as_ref().unwrap().read_page(key); return match result { @@ -302,7 +306,7 @@ impl Peekable for RocksEngine { }; } - #[cfg(not(any(test, feature = "testexport")))] + #[cfg(feature = "enable-pagestorage")] fn get_value_cf_opt( &self, opts: &ReadOptions, @@ -320,7 +324,7 @@ impl RocksEngine { } impl SyncMutable for RocksEngine { - #[cfg(any(test, feature = "testexport"))] + #[cfg(not(feature = "enable-pagestorage"))] fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { if self.do_write(engine_traits::CF_DEFAULT, key) { return self.rocks.get_sync_db().put(key, value).map_err(r2e); @@ -328,7 +332,7 @@ impl SyncMutable for RocksEngine { Ok(()) } - #[cfg(any(test, feature = "testexport"))] + #[cfg(not(feature = "enable-pagestorage"))] fn put_cf(&self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { if self.do_write(cf, key) { let db = self.rocks.get_sync_db(); @@ -342,7 +346,7 @@ impl SyncMutable for RocksEngine { Ok(()) } - #[cfg(any(test, feature = "testexport"))] + #[cfg(not(feature = "enable-pagestorage"))] fn delete(&self, key: &[u8]) -> Result<()> { if self.do_write(engine_traits::CF_DEFAULT, key) { return self.rocks.get_sync_db().delete(key).map_err(r2e); @@ -350,7 +354,7 @@ impl SyncMutable for RocksEngine { Ok(()) } - 
#[cfg(any(test, feature = "testexport"))] + #[cfg(not(feature = "enable-pagestorage"))] fn delete_cf(&self, cf: &str, key: &[u8]) -> Result<()> { if self.do_write(cf, key) { let db = self.rocks.get_sync_db(); @@ -360,7 +364,7 @@ impl SyncMutable for RocksEngine { Ok(()) } - #[cfg(not(any(test, feature = "testexport")))] + #[cfg(feature = "enable-pagestorage")] fn put(&self, key: &[u8], value: &[u8]) -> Result<()> { if self.do_write(engine_traits::CF_DEFAULT, key) { let ps_wb = self.ffi_hub.as_ref().unwrap().create_write_batch(); @@ -376,7 +380,7 @@ impl SyncMutable for RocksEngine { Ok(()) } - #[cfg(not(any(test, feature = "testexport")))] + #[cfg(feature = "enable-pagestorage")] fn put_cf(&self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { if self.do_write(cf, key) { let ps_wb = self.ffi_hub.as_ref().unwrap().create_write_batch(); @@ -392,7 +396,7 @@ impl SyncMutable for RocksEngine { Ok(()) } - #[cfg(not(any(test, feature = "testexport")))] + #[cfg(feature = "enable-pagestorage")] fn delete(&self, key: &[u8]) -> Result<()> { if self.do_write(engine_traits::CF_DEFAULT, key) { let ps_wb = self.ffi_hub.as_ref().unwrap().create_write_batch(); @@ -408,7 +412,7 @@ impl SyncMutable for RocksEngine { Ok(()) } - #[cfg(not(any(test, feature = "testexport")))] + #[cfg(feature = "enable-pagestorage")] fn delete_cf(&self, cf: &str, key: &[u8]) -> Result<()> { if self.do_write(cf, key) { let ps_wb = self.ffi_hub.as_ref().unwrap().create_write_batch(); diff --git a/engine_tiflash/src/lib.rs b/engine_tiflash/src/lib.rs index 18e73b1ee13..1d733e724be 100644 --- a/engine_tiflash/src/lib.rs +++ b/engine_tiflash/src/lib.rs @@ -53,14 +53,14 @@ pub use crate::status::*; mod table_properties; pub use crate::table_properties::*; -#[cfg(any(test, feature = "testexport"))] +#[cfg(not(feature = "enable-pagestorage"))] mod write_batch; -#[cfg(any(test, feature = "testexport"))] +#[cfg(not(feature = "enable-pagestorage"))] pub use crate::write_batch::*; -#[cfg(not(any(test, feature = 
"testexport")))] +#[cfg(feature = "enable-pagestorage")] mod ps_write_batch; -#[cfg(not(any(test, feature = "testexport")))] +#[cfg(feature = "enable-pagestorage")] pub use crate::ps_write_batch::*; pub mod mvcc_properties; diff --git a/new-mock-engine-store/src/lib.rs b/new-mock-engine-store/src/lib.rs index e20fc61e261..8de0101555a 100644 --- a/new-mock-engine-store/src/lib.rs +++ b/new-mock-engine-store/src/lib.rs @@ -1,5 +1,5 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. - +#![feature(vec_into_raw_parts)] #![feature(slice_take)] pub mod config; pub mod mock_cluster; diff --git a/new-mock-engine-store/src/mock_cluster.rs b/new-mock-engine-store/src/mock_cluster.rs index dc5e33d1276..146b4e87a7f 100644 --- a/new-mock-engine-store/src/mock_cluster.rs +++ b/new-mock-engine-store/src/mock_cluster.rs @@ -235,14 +235,14 @@ impl> Cluster { pub fn iter_ffi_helpers( &self, store_ids: Option>, - f: &mut dyn FnMut(u64, &engine_rocks::RocksEngine, &mut FFIHelperSet), + f: &mut dyn FnMut(u64, &engine_store_ffi::TiFlashEngine, &mut FFIHelperSet), ) { let ids = match store_ids { Some(ids) => ids, None => self.engines.keys().copied().collect::>(), }; for id in ids { - let engine = self.get_engine(id); + let engine = self.get_tiflash_engine(id); let lock = self.ffi_helper_set.lock(); match lock { Ok(mut l) => { @@ -279,14 +279,12 @@ impl> Cluster { pub fn run(&mut self) { self.create_engines(); self.bootstrap_region().unwrap(); - self.bootstrap_ffi_helper_set(); self.start().unwrap(); } pub fn run_conf_change(&mut self) -> u64 { self.create_engines(); let region_id = self.bootstrap_conf_change(); - self.bootstrap_ffi_helper_set(); // Will not start new nodes in `start` self.start().unwrap(); region_id @@ -295,7 +293,6 @@ impl> Cluster { pub fn run_conf_change_no_start(&mut self) -> u64 { self.create_engines(); let region_id = self.bootstrap_conf_change(); - self.bootstrap_ffi_helper_set(); region_id } @@ -307,6 +304,7 @@ impl> Cluster { key_manager: 
&Option>, router: &Option>, ) { + init_global_ffi_helper_set(); let (mut ffi_helper_set, _node_cfg) = self.make_ffi_helper_set(0, engines, key_manager, router); @@ -348,6 +346,7 @@ impl> Cluster { } else { self.ffi_helper_lst.pop().unwrap() }; + debug!("set up ffi helper set for {}", node_id); ffi_helper_set.engine_store_server.id = node_id; self.ffi_helper_set .lock() @@ -355,6 +354,7 @@ impl> Cluster { .insert(node_id, ffi_helper_set); } + // Need self.engines be filled. pub fn bootstrap_ffi_helper_set(&mut self) { let mut node_ids: Vec = self.engines.iter().map(|(&id, _)| id).collect(); // We force iterate engines in sorted order. @@ -388,8 +388,6 @@ impl> Cluster { } pub fn start_with(&mut self, skip_set: HashSet) -> ServerResult<()> { - init_global_ffi_helper_set(); - // Try recover from last shutdown. // `self.engines` is inited in bootstrap_region or bootstrap_conf_change. let mut node_ids: Vec = self.engines.iter().map(|(&id, _)| id).collect(); @@ -496,6 +494,7 @@ pub fn make_global_ffi_helper_set_no_bind() -> (EngineHelperSet, *const u8) { pub fn init_global_ffi_helper_set() { unsafe { START.call_once(|| { + debug!("init_global_ffi_helper_set"); assert_eq!(engine_store_ffi::get_engine_store_server_helper_ptr(), 0); let (set, ptr) = make_global_ffi_helper_set_no_bind(); engine_store_ffi::init_engine_store_server_helper(ptr); @@ -888,6 +887,7 @@ impl> Cluster { .insert(id, self.key_managers[i].clone()); } + self.bootstrap_ffi_helper_set(); let mut region = metapb::Region::default(); region.set_id(1); region.set_start_key(keys::EMPTY_KEY.to_vec()); @@ -906,6 +906,9 @@ impl> Cluster { "node_id" => id, ); prepare_bootstrap_cluster(engines, ®ion)?; + tikv_util::debug!("prepare_bootstrap_cluster finish"; + "node_id" => id, + ); } self.bootstrap_cluster(region); @@ -924,6 +927,7 @@ impl> Cluster { .insert(id, self.key_managers[i].clone()); } + self.bootstrap_ffi_helper_set(); for (&id, engines) in &self.engines { bootstrap_store(engines, self.id(), id).unwrap(); } 
diff --git a/new-mock-engine-store/src/mock_page_storage.rs b/new-mock-engine-store/src/mock_page_storage.rs index 1aad90660b7..5782ee4f987 100644 --- a/new-mock-engine-store/src/mock_page_storage.rs +++ b/new-mock-engine-store/src/mock_page_storage.rs @@ -1,19 +1,38 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{collections::btree_map::OccupiedEntry, sync::RwLock}; +use core::ops::Bound::{Excluded, Included, Unbounded}; +use std::{ + collections::BTreeMap, + sync::{atomic::AtomicU64, Arc, RwLock}, +}; -use collections::HashMap; pub use engine_store_ffi::{ interfaces::root::DB as ffi_interfaces, BaseBuffView, CppStrWithView, EngineStoreServerHelper, - PageAndCppStrWithView, PageAndCppStrWithViewVec, PageWithView, RaftStoreProxyFFIHelper, - RawCppPtr, RawVoidPtr, + PageAndCppStrWithView, RaftStoreProxyFFIHelper, RawCppPtr, RawCppPtrCarr, RawVoidPtr, }; -use crate::mock_store::{into_engine_store_server_wrap, EngineStoreServerWrap, RawCppPtrTypeImpl}; +use crate::{ + create_cpp_str, create_cpp_str_parts, + mock_store::{into_engine_store_server_wrap, RawCppPtrTypeImpl}, +}; + +pub enum MockPSSingleWrite { + Put((Vec, MockPSUniversalPage)), + Delete(Vec), +} -#[derive(Default)] pub struct MockPSWriteBatch { - pub data: HashMap, MockPSUniversalPage>, + pub data: Vec<(u64, MockPSSingleWrite)>, + core: Arc>, +} + +impl MockPSWriteBatch { + fn new(core: Arc>) -> Self { + Self { + data: Default::default(), + core, + } + } } pub struct MockPSUniversalPage { @@ -28,13 +47,37 @@ impl Into for BaseBuffView { } } +pub struct MockPageStorageCore { + current_id: AtomicU64, +} + +impl MockPageStorageCore { + pub fn alloc_id(&mut self) -> u64 { + self.current_id + .fetch_add(1, std::sync::atomic::Ordering::SeqCst) + } +} + +impl Default for MockPageStorageCore { + fn default() -> Self { + Self { + current_id: AtomicU64::new(1), + } + } +} + #[derive(Default)] pub struct MockPageStorage { - pub data: RwLock, MockPSUniversalPage>>, + pub data: RwLock, 
MockPSUniversalPage>>, + pub core: Arc>, } -pub unsafe extern "C" fn ffi_mockps_create_write_batch() -> RawCppPtr { - let ptr = Box::into_raw(Box::new(MockPSWriteBatch::default())); +pub unsafe extern "C" fn ffi_mockps_create_write_batch( + wrap: *const ffi_interfaces::EngineStoreServerWrap, +) -> RawCppPtr { + let store = into_engine_store_server_wrap(wrap); + let core = (*store.engine_store_server).page_storage.core.clone(); + let ptr = Box::into_raw(Box::new(MockPSWriteBatch::new(core))); RawCppPtr { ptr: ptr as RawVoidPtr, type_: RawCppPtrTypeImpl::PSWriteBatch.into(), @@ -52,13 +95,17 @@ pub unsafe extern "C" fn ffi_mockps_write_batch_put_page( page_id: BaseBuffView, page: BaseBuffView, ) { - let wb: _ = <&mut MockPSWriteBatch as From>::from(wb); - wb.data.insert(page_id.to_slice().to_owned(), page.into()); + let wb: &mut MockPSWriteBatch = <&mut MockPSWriteBatch as From>::from(wb); + let wid = wb.core.write().unwrap().alloc_id(); + let write = MockPSSingleWrite::Put((page_id.to_slice().to_vec(), page.into())); + wb.data.push((wid, write)); } pub unsafe extern "C" fn ffi_mockps_write_batch_del_page(wb: RawVoidPtr, page_id: BaseBuffView) { - let wb: _ = <&mut MockPSWriteBatch as From>::from(wb); - wb.data.remove(page_id.to_slice()); + let wb: &mut MockPSWriteBatch = <&mut MockPSWriteBatch as From>::from(wb); + let wid = wb.core.write().unwrap().alloc_id(); + let write = MockPSSingleWrite::Delete(page_id.to_slice().to_vec()); + wb.data.push((wid, write)); } pub unsafe extern "C" fn ffi_mockps_write_batch_size(wb: RawVoidPtr) -> u64 { @@ -74,7 +121,7 @@ pub unsafe extern "C" fn ffi_mockps_write_batch_is_empty(wb: RawVoidPtr) -> u8 { pub unsafe extern "C" fn ffi_mockps_write_batch_merge(lwb: RawVoidPtr, rwb: RawVoidPtr) { let lwb: _ = <&mut MockPSWriteBatch as From>::from(lwb); let rwb: _ = <&mut MockPSWriteBatch as From>::from(rwb); - lwb.data.extend(rwb.data.drain()); + lwb.data.extend(rwb.data.drain(..)); } pub unsafe extern "C" fn 
ffi_mockps_write_batch_clear(wb: RawVoidPtr) { @@ -93,35 +140,93 @@ pub unsafe extern "C" fn ffi_mockps_consume_write_batch( .data .write() .unwrap(); - guard.extend(wb.data.drain()); + wb.data.sort_by_key(|k| k.0); + for (_, write) in wb.data.drain(..) { + match write { + MockPSSingleWrite::Put(w) => { + guard.insert(w.0, w.1); + } + MockPSSingleWrite::Delete(w) => { + guard.remove(&w); + } + } + } } pub unsafe extern "C" fn ffi_mockps_handle_read_page( wrap: *const ffi_interfaces::EngineStoreServerWrap, page_id: BaseBuffView, -) -> PageWithView { - todo!() +) -> CppStrWithView { + let store = into_engine_store_server_wrap(wrap); + let guard = (*store.engine_store_server) + .page_storage + .data + .read() + .unwrap(); + let key = page_id.to_slice().to_vec(); + match guard.get(&key) { + Some(p) => create_cpp_str(Some(p.data.clone())), + None => create_cpp_str(None), + } } pub unsafe extern "C" fn ffi_mockps_handle_scan_page( wrap: *const ffi_interfaces::EngineStoreServerWrap, start_page_id: BaseBuffView, end_page_id: BaseBuffView, -) -> PageAndCppStrWithViewVec { - todo!() +) -> RawCppPtrCarr { + let store = into_engine_store_server_wrap(wrap); + let guard = (*store.engine_store_server) + .page_storage + .data + .read() + .unwrap(); + let range = guard.range(( + Included(start_page_id.to_slice().to_vec()), + Excluded(end_page_id.to_slice().to_vec()), + )); + let range = range.collect::>(); + let mut result: Vec = Vec::with_capacity(range.len()); + for (k, v) in range.into_iter() { + let (page, page_view) = create_cpp_str_parts(Some(v.data.clone())); + let (key, key_view) = create_cpp_str_parts(Some(k.clone())); + let pacwv = PageAndCppStrWithView { + page, + key, + page_view, + key_view, + }; + result.push(pacwv) + } + let (result_ptr, l, c) = result.into_raw_parts(); + assert_eq!(l, c); + RawCppPtrCarr { + inner: result_ptr as RawVoidPtr, + len: c as u64, + type_: RawCppPtrTypeImpl::PSPageAndCppStr.into(), + } } pub unsafe extern "C" fn 
ffi_mockps_handle_purge_pagestorage( wrap: *const ffi_interfaces::EngineStoreServerWrap, ) { - todo!() + // TODO } pub unsafe extern "C" fn ffi_mockps_handle_seek_ps_key( wrap: *const ffi_interfaces::EngineStoreServerWrap, page_id: BaseBuffView, ) -> CppStrWithView { - todo!() + // Find the first great or equal than + let store = into_engine_store_server_wrap(wrap); + let guard = (*store.engine_store_server) + .page_storage + .data + .read() + .unwrap(); + let mut range = guard.range((Included(page_id.to_slice().to_vec()), Unbounded)); + let kv = range.next().unwrap(); + create_cpp_str(Some(kv.0.clone())) } pub unsafe extern "C" fn ffi_mockps_ps_is_empty( diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index a3edbb5dfcb..0f9effcaf1c 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -732,10 +732,10 @@ pub fn gen_engine_store_server_helper( fn_write_batch_merge: Some(ffi_mockps_write_batch_merge), fn_write_batch_clear: Some(ffi_mockps_write_batch_clear), fn_consume_write_batch: Some(ffi_mockps_consume_write_batch), - fn_handle_read_page: None, - fn_handle_purge_pagestorage: None, - fn_handle_scan_page: None, - fn_handle_seek_ps_key: None, + fn_handle_read_page: Some(ffi_mockps_handle_read_page), + fn_handle_purge_pagestorage: Some(ffi_mockps_handle_purge_pagestorage), + fn_handle_scan_page: Some(ffi_mockps_handle_scan_page), + fn_handle_seek_ps_key: Some(ffi_mockps_handle_seek_ps_key), fn_ps_is_empty: Some(ffi_mockps_ps_is_empty), } } @@ -1017,7 +1017,13 @@ extern "C" fn ffi_gc_raw_cpp_ptr_carr( } drop(p); }, - RawCppPtrTypeImpl::PSPageAndCppStr => unsafe { todo!() }, + RawCppPtrTypeImpl::PSPageAndCppStr => unsafe { + let p = Box::from_raw(std::slice::from_raw_parts_mut( + ptr as *mut PageAndCppStrWithView, + len as usize, + )); + drop(p) + }, _ => todo!(), } } @@ -1320,35 +1326,42 @@ unsafe extern "C" fn ffi_handle_compute_store_stats( } } -unsafe fn create_cpp_str(s: 
Option>) -> ffi_interfaces::CppStrWithView { +pub unsafe fn create_cpp_str_parts( + s: Option>, +) -> (ffi_interfaces::RawCppPtr, ffi_interfaces::BaseBuffView) { match s { Some(s) => { let len = s.len() as u64; - let ptr = Box::into_raw(Box::new(s.clone())); // leak - ffi_interfaces::CppStrWithView { - inner: ffi_interfaces::RawCppPtr { + let ptr = Box::into_raw(Box::new(s)); // leak + ( + ffi_interfaces::RawCppPtr { ptr: ptr as RawVoidPtr, type_: RawCppPtrTypeImpl::String.into(), }, - view: ffi_interfaces::BaseBuffView { + ffi_interfaces::BaseBuffView { data: (*ptr).as_ptr() as *const _, len, }, - } + ) } - None => ffi_interfaces::CppStrWithView { - inner: ffi_interfaces::RawCppPtr { + None => ( + ffi_interfaces::RawCppPtr { ptr: std::ptr::null_mut(), type_: RawCppPtrTypeImpl::None.into(), }, - view: ffi_interfaces::BaseBuffView { + ffi_interfaces::BaseBuffView { data: std::ptr::null(), len: 0, }, - }, + ), } } +pub unsafe fn create_cpp_str(s: Option>) -> ffi_interfaces::CppStrWithView { + let (p, v) = create_cpp_str_parts(s); + ffi_interfaces::CppStrWithView { inner: p, view: v } +} + #[allow(clippy::redundant_closure_call)] unsafe extern "C" fn ffi_fast_add_peer( arg1: *mut ffi_interfaces::EngineStoreServerWrap, diff --git a/proxy_scripts/ci_check.sh b/proxy_scripts/ci_check.sh index 1a95bfc69ab..1aa71509fc7 100755 --- a/proxy_scripts/ci_check.sh +++ b/proxy_scripts/ci_check.sh @@ -45,6 +45,7 @@ elif [[ $M == "testnew" ]]; then cargo test --package proxy_tests --test proxy flashback cargo test --package proxy_tests --test proxy server_cluster_test cargo test --package proxy_tests --test proxy ffi -- --test-threads 1 + cargo test --package proxy_tests --test proxy write --features="proxy_tests/enable-pagestorage" elif [[ $M == "debug" ]]; then # export RUSTC_WRAPPER=~/.cargo/bin/sccache export ENGINE_LABEL_VALUE=tiflash diff --git a/proxy_server/Cargo.toml b/proxy_server/Cargo.toml index b4c42af2cd1..5ce3068e2e7 100644 --- a/proxy_server/Cargo.toml +++ 
b/proxy_server/Cargo.toml @@ -34,6 +34,7 @@ nortcheck = ["engine_rocks/nortcheck"] backup-stream-debug = ["backup-stream/backup-stream-debug"] pprof-fp = ["tikv/pprof-fp"] +enable-pagestorage = ["engine_tiflash/enable-pagestorage", "engine_store_ffi/enable-pagestorage"] [dependencies] api_version = { workspace = true } @@ -51,6 +52,7 @@ encryption_export = { workspace = true, default-features = false } engine_rocks = { workspace = true, default-features = false } engine_rocks_helper = { workspace = true } engine_store_ffi = { workspace = true, default-features = false } +engine_tiflash = { workspace = true, default-features = false } engine_traits = { workspace = true, default-features = false } error_code = { workspace = true, default-features = false } fail = "0.5" diff --git a/proxy_server/src/run.rs b/proxy_server/src/run.rs index 9bfdbc83a61..b819713c9cd 100644 --- a/proxy_server/src/run.rs +++ b/proxy_server/src/run.rs @@ -535,6 +535,7 @@ impl TiKvServer { // Initialize and check config info!("using proxy config"; "config" => ?proxy_config); + let cfg_controller = Self::init_config(config, &proxy_config); let config = cfg_controller.get_current(); diff --git a/proxy_tests/Cargo.toml b/proxy_tests/Cargo.toml index e9730c960c5..31e8d93a498 100644 --- a/proxy_tests/Cargo.toml +++ b/proxy_tests/Cargo.toml @@ -37,6 +37,8 @@ mem-profiling = ["tikv/mem-profiling"] sse = ["tikv/sse"] portable = ["tikv/portable"] +enable-pagestorage = ["engine_tiflash/enable-pagestorage", "engine_store_ffi/enable-pagestorage"] + [dependencies] api_version = { workspace = true } async-trait = "0.1" diff --git a/proxy_tests/proxy/config.rs b/proxy_tests/proxy/config.rs index 8e791735405..af5421c40b1 100644 --- a/proxy_tests/proxy/config.rs +++ b/proxy_tests/proxy/config.rs @@ -197,3 +197,25 @@ apply-low-priority-pool-size = 41 config.raft_store.apply_batch_system.low_priority_pool_size ); } + +#[test] +fn test_config_proxy_owned_config() { + test_util::init_log_for_test(); + let mut 
file = tempfile::NamedTempFile::new().unwrap(); + write!( + file, + " +[engine-store] +enable-fast-add-peer = true + " + ) + .unwrap(); + let path = file.path(); + + let mut v: Vec = vec![]; + let cpath = Some(path.as_os_str()); + let proxy_config = gen_proxy_config(&cpath, false, &mut v); + + info!("using proxy config"; "config" => ?proxy_config); + assert_eq!(true, proxy_config.engine_store.enable_fast_add_peer); +} diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 5beaa900582..1bd26d4a124 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -207,7 +207,7 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: PauseT iter_ffi_helpers( &cluster, Some(vec![3]), - &mut |id: u64, engine: &engine_rocks::RocksEngine, ffi: &mut FFIHelperSet| { + &mut |id: u64, _, ffi: &mut FFIHelperSet| { (*ffi.engine_store_server).mutate_region_states(1, |e: &mut RegionStats| { assert!(e.fast_add_peer_count.load(Ordering::SeqCst) > 0); }); diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index 087d8c16dc6..ecfabe6d364 100644 --- a/proxy_tests/proxy/proxy.rs +++ b/proxy_tests/proxy/proxy.rs @@ -26,7 +26,7 @@ pub use kvproto::{ }; pub use new_mock_engine_store::{ config::Config, - get_apply_state, get_raft_local_state, get_region_local_state, make_new_region, + general_get_apply_state, general_get_region_local_state, get_raft_local_state, make_new_region, mock_cluster::{new_put_cmd, new_request, FFIHelperSet}, must_get_equal, must_get_none, node::NodeCluster, @@ -76,7 +76,7 @@ pub struct States { pub fn iter_ffi_helpers>( cluster: &Cluster, store_ids: Option>, - f: &mut dyn FnMut(u64, &engine_rocks::RocksEngine, &mut FFIHelperSet) -> (), + f: &mut dyn FnMut(u64, &engine_store_ffi::TiFlashEngine, &mut FFIHelperSet) -> (), ) { cluster.iter_ffi_helpers(store_ids, f); } @@ -90,7 +90,7 @@ pub fn maybe_collect_states( iter_ffi_helpers( cluster, store_ids, - &mut |id: u64, 
engine: &engine_rocks::RocksEngine, ffi: &mut FFIHelperSet| { + &mut |id: u64, engine: &engine_store_ffi::TiFlashEngine, ffi: &mut FFIHelperSet| { let server = &ffi.engine_store_server; let raft_engine = &cluster.get_engines(id).raft; if let Some(region) = server.kvstore.get(®ion_id) { @@ -98,8 +98,8 @@ pub fn maybe_collect_states( Ok(Some(i)) => i, _ => unreachable!(), }; - let apply_state = get_apply_state(&engine, region_id); - let region_state = get_region_local_state(&engine, region_id); + let apply_state = general_get_apply_state(engine, region_id); + let region_state = general_get_region_local_state(engine, region_id); let raft_state = get_raft_local_state(raft_engine, region_id); if apply_state.is_none() { return; diff --git a/proxy_tests/proxy/region.rs b/proxy_tests/proxy/region.rs index 51362ef3917..270953e0cd4 100644 --- a/proxy_tests/proxy/region.rs +++ b/proxy_tests/proxy/region.rs @@ -373,7 +373,7 @@ fn recover_from_peer(cluster: &Cluster, from: u64, to: u64, region_ iter_ffi_helpers( cluster, Some(vec![from]), - &mut |id: u64, engine: &engine_rocks::RocksEngine, ffi: &mut FFIHelperSet| { + &mut |id: u64, _, ffi: &mut FFIHelperSet| { let server = &mut ffi.engine_store_server; maybe_source_region = server.kvstore.get(®ion_id).cloned(); }, @@ -386,7 +386,7 @@ fn recover_from_peer(cluster: &Cluster, from: u64, to: u64, region_ iter_ffi_helpers( cluster, Some(vec![to]), - &mut |id: u64, engine: &engine_rocks::RocksEngine, ffi: &mut FFIHelperSet| { + &mut |id: u64, _, ffi: &mut FFIHelperSet| { let server = &mut ffi.engine_store_server; assert!(server.kvstore.get(®ion_id).is_none()); @@ -603,7 +603,7 @@ fn test_add_delayed_started_learner_snapshot() { iter_ffi_helpers( &cluster, Some(vec![5]), - &mut |id: u64, engine: &engine_rocks::RocksEngine, ffi: &mut FFIHelperSet| { + &mut |id: u64, _, ffi: &mut FFIHelperSet| { (*ffi.engine_store_server).mutate_region_states(1, |e: &mut RegionStats| { assert_eq!(e.pre_handle_count.load(Ordering::SeqCst), 1); }); diff 
--git a/raftstore-proxy/Cargo.toml b/raftstore-proxy/Cargo.toml index 5ab8af974a7..e6e254e87a8 100644 --- a/raftstore-proxy/Cargo.toml +++ b/raftstore-proxy/Cargo.toml @@ -40,4 +40,4 @@ name = "raftstore_proxy" crate-type = ["cdylib"] [dependencies] -proxy_server = { workspace = true } +proxy_server = { workspace = true, features = ["enable-pagestorage"] } diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version index ae998a90559..9705433b49c 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/@version @@ -1,3 +1,3 @@ #pragma once #include -namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 10253455389063462714ull; } \ No newline at end of file +namespace DB { constexpr uint64_t RAFT_STORE_PROXY_VERSION = 17394545035928865111ull; } \ No newline at end of file diff --git a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h index c7c8fa353bb..967508ce0ed 100644 --- a/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h +++ b/raftstore-proxy/ffi/src/RaftStoreProxyFFI/ProxyFFI.h @@ -92,11 +92,6 @@ struct CppStrWithView { BaseBuffView view; }; -struct PageWithView { - RawCppPtr inner; - BaseBuffView view; -}; - struct PageAndCppStrWithView { RawCppPtr page; RawCppPtr key; @@ -104,8 +99,8 @@ struct PageAndCppStrWithView { BaseBuffView key_view; }; -struct PageAndCppStrWithViewVec { - PageAndCppStrWithView *inner; +struct RawCppPtrCarr { + RawVoidPtr inner; const uint64_t len; RawCppPtrType type; }; @@ -244,7 +239,7 @@ struct EngineStoreServerHelper { uint8_t (*fn_need_flush_data)(EngineStoreServerWrap *, uint64_t); uint8_t (*fn_try_flush_data)(EngineStoreServerWrap *, uint64_t, uint8_t, uint64_t, uint64_t); - RawCppPtr (*fn_create_write_batch)(); + RawCppPtr (*fn_create_write_batch)(const EngineStoreServerWrap *); void (*fn_write_batch_put_page)(RawVoidPtr, BaseBuffView, BaseBuffView); 
void (*fn_write_batch_del_page)(RawVoidPtr, BaseBuffView); uint64_t (*fn_write_batch_size)(RawVoidPtr); @@ -252,10 +247,10 @@ struct EngineStoreServerHelper { void (*fn_write_batch_merge)(RawVoidPtr, RawVoidPtr); void (*fn_write_batch_clear)(RawVoidPtr); void (*fn_consume_write_batch)(const EngineStoreServerWrap *, RawVoidPtr); - PageWithView (*fn_handle_read_page)(const EngineStoreServerWrap *, - BaseBuffView); - PageAndCppStrWithViewVec (*fn_handle_scan_page)(const EngineStoreServerWrap *, - BaseBuffView, BaseBuffView); + CppStrWithView (*fn_handle_read_page)(const EngineStoreServerWrap *, + BaseBuffView); + RawCppPtrCarr (*fn_handle_scan_page)(const EngineStoreServerWrap *, + BaseBuffView, BaseBuffView); void (*fn_handle_purge_pagestorage)(const EngineStoreServerWrap *); CppStrWithView (*fn_handle_seek_ps_key)(const EngineStoreServerWrap *, BaseBuffView); From 528e06dcc4ffa5d099b60fbe93972732d141e014 Mon Sep 17 00:00:00 2001 From: Wenxuan Date: Thu, 12 Jan 2023 16:22:34 +0800 Subject: [PATCH 074/115] util: Fix incorrect memory capacity (#14034) * util: Fix incorrect memory capacity Signed-off-by: Wish * Fix lints Signed-off-by: Wish * Check capacity with /proc/meminfo Signed-off-by: Wish Signed-off-by: Wish --- components/tikv_util/src/sys/mod.rs | 4 +- src/server/service/diagnostics/sys.rs | 61 ++++++++++++++++++++++----- 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/components/tikv_util/src/sys/mod.rs b/components/tikv_util/src/sys/mod.rs index 49e6812b81f..797da2aea54 100644 --- a/components/tikv_util/src/sys/mod.rs +++ b/components/tikv_util/src/sys/mod.rs @@ -22,7 +22,7 @@ use mnt::get_mount; use sysinfo::RefreshKind; pub use sysinfo::{CpuExt, DiskExt, NetworkExt, ProcessExt, SystemExt}; -use crate::config::{ReadableSize, KIB}; +use crate::config::ReadableSize; pub const HIGH_PRI: i32 = -1; const CPU_CORES_QUOTA_ENV_VAR_KEY: &str = "TIKV_CPU_CORES_QUOTA"; @@ -93,7 +93,7 @@ impl SysQuota { fn sysinfo_memory_limit_in_bytes() -> u64 { let 
system = sysinfo::System::new_with_specifics(RefreshKind::new().with_memory()); - system.total_memory() * KIB + system.total_memory() } } diff --git a/src/server/service/diagnostics/sys.rs b/src/server/service/diagnostics/sys.rs index 6e9585ab2c9..8a84eaf6293 100644 --- a/src/server/service/diagnostics/sys.rs +++ b/src/server/service/diagnostics/sys.rs @@ -3,10 +3,7 @@ use std::{collections::HashMap, string::ToString}; use kvproto::diagnosticspb::{ServerInfoItem, ServerInfoPair}; -use tikv_util::{ - config::KIB, - sys::{cpu_time::LinuxStyleCpuTime, ioload, SysQuota, *}, -}; +use tikv_util::sys::{cpu_time::LinuxStyleCpuTime, ioload, SysQuota, *}; use walkdir::WalkDir; use crate::server::service::diagnostics::SYS_INFO; @@ -129,12 +126,12 @@ fn cpu_load_info(prev_cpu: CpuTimeSnapshot, collector: &mut Vec) fn mem_load_info(collector: &mut Vec) { let mut system = SYS_INFO.lock().unwrap(); system.refresh_memory(); - let total_memory = system.total_memory() * KIB; - let used_memory = system.used_memory() * KIB; - let free_memory = system.free_memory() * KIB; - let total_swap = system.total_swap() * KIB; - let used_swap = system.used_swap() * KIB; - let free_swap = system.free_swap() * KIB; + let total_memory = system.total_memory(); + let used_memory = system.used_memory(); + let free_memory = system.free_memory(); + let total_swap = system.total_swap(); + let used_swap = system.used_swap(); + let free_swap = system.free_swap(); drop(system); let used_memory_pct = (used_memory as f64) / (total_memory as f64); let free_memory_pct = (free_memory as f64) / (total_memory as f64); @@ -683,6 +680,50 @@ mod tests { assert_ne!(processes.get_pairs().len(), 0); } + #[test] + #[cfg(target_os = "linux")] + fn test_memory() { + let mut mem_total_kb: u64 = 0; + { + use std::io::BufRead; + + let f = std::fs::File::open("/proc/meminfo").unwrap(); + let reader = std::io::BufReader::new(f); + for line in reader.lines() { + let l = line.unwrap(); + let mut parts = l.split_whitespace(); + if 
parts.next().unwrap() != "MemTotal:" { + continue; + } + mem_total_kb = parts.next().unwrap().parse().unwrap(); + let unit = parts.next().unwrap(); + assert_eq!(unit, "kB"); + } + } + assert!(mem_total_kb > 0); + + let mut collector = vec![]; + hardware_info(&mut collector); + + let mut memory_checked = false; + + 'outer: for item in &collector { + if item.get_tp() != "memory" { + continue; + } + for pair in item.get_pairs() { + if pair.get_key() != "capacity" { + continue; + } + assert_eq!(pair.get_value(), (mem_total_kb * 1024).to_string()); + memory_checked = true; + break 'outer; + } + } + + assert!(memory_checked); + } + #[test] fn test_hardware_info() { let mut collector = vec![]; From e1467c56a445d36a8fd8642f9467a0b18fbb8203 Mon Sep 17 00:00:00 2001 From: Hu# Date: Thu, 12 Jan 2023 17:35:52 +0800 Subject: [PATCH 075/115] pd_client: fix the kvproto compatibility for global config (#14041) * hotfix kvproto for global config Signed-off-by: husharp * make format happy Signed-off-by: husharp Signed-off-by: husharp Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/pd_client/src/client.rs | 4 ++-- components/pd_client/src/client_v2.rs | 6 +++--- components/pd_client/src/lib.rs | 2 +- components/test_pd/src/mocker/mod.rs | 8 ++++---- tests/failpoints/cases/test_pd_client.rs | 15 +++++---------- tests/failpoints/cases/test_pd_client_legacy.rs | 16 +++++----------- 7 files changed, 21 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7a3c9ced013..c98cd025fad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2726,7 +2726,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#ae3b086b09afbb26cebcd4c1fe14b82bbe1f0796" +source = "git+https://github.com/pingcap/kvproto.git#a14c44ef44b378d15adb5baad8402b838f031b51" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index 9f466a6a351..5bccdcfacea 
100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -286,10 +286,10 @@ impl fmt::Debug for RpcClient { const LEADER_CHANGE_RETRY: usize = 10; impl PdClient for RpcClient { - fn load_global_config(&self, list: Vec) -> PdFuture> { + fn load_global_config(&self, config_path: String) -> PdFuture> { use kvproto::pdpb::LoadGlobalConfigRequest; let mut req = LoadGlobalConfigRequest::new(); - req.set_names(list.into()); + req.set_config_path(config_path); let executor = |client: &Client, req| match client .inner .rl() diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs index 3d17a94a494..b42d8fb3ddb 100644 --- a/components/pd_client/src/client_v2.rs +++ b/components/pd_client/src/client_v2.rs @@ -542,7 +542,7 @@ pub trait PdClient { fn fetch_cluster_id(&mut self) -> Result; - fn load_global_config(&mut self, list: Vec) -> PdFuture>; + fn load_global_config(&mut self, config_path: String) -> PdFuture>; fn watch_global_config( &mut self, @@ -791,10 +791,10 @@ impl PdClient for RpcClient { Ok((tx, resp_rx)) } - fn load_global_config(&mut self, list: Vec) -> PdFuture> { + fn load_global_config(&mut self, config_path: String) -> PdFuture> { use kvproto::pdpb::LoadGlobalConfigRequest; let mut req = LoadGlobalConfigRequest::new(); - req.set_names(list.into()); + req.set_config_path(config_path); let mut raw_client = self.raw_client.clone(); Box::pin(async move { raw_client.wait_for_ready().await?; diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index 8674130c799..46a3e6924db 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -209,7 +209,7 @@ pub const INVALID_ID: u64 = 0; /// all the time. 
pub trait PdClient: Send + Sync { /// Load a list of GlobalConfig - fn load_global_config(&self, _list: Vec) -> PdFuture> { + fn load_global_config(&self, _config_path: String) -> PdFuture> { unimplemented!(); } diff --git a/components/test_pd/src/mocker/mod.rs b/components/test_pd/src/mocker/mod.rs index d904c95d4a8..84c2508d4ea 100644 --- a/components/test_pd/src/mocker/mod.rs +++ b/components/test_pd/src/mocker/mod.rs @@ -27,13 +27,13 @@ pub type Result = result::Result; pub trait PdMocker { fn load_global_config( &self, - req: &LoadGlobalConfigRequest, + _req: &LoadGlobalConfigRequest, ) -> Option> { let mut send = vec![]; - for r in req.get_names() { + for r in 0..10 { let mut i = GlobalConfigItem::default(); - i.set_name(format!("/global/config/{}", r.clone())); - i.set_value(r.clone()); + i.set_name(format!("/global/config/{}", r)); + i.set_value(r.to_string()); send.push(i); } let mut res = LoadGlobalConfigResponse::default(); diff --git a/tests/failpoints/cases/test_pd_client.rs b/tests/failpoints/cases/test_pd_client.rs index ca0a473a8b7..7dd767d19c9 100644 --- a/tests/failpoints/cases/test_pd_client.rs +++ b/tests/failpoints/cases/test_pd_client.rs @@ -69,7 +69,7 @@ fn test_pd_client_deadlock() { request!(client => block_on(get_gc_safe_point())), request!(client => block_on(get_store_and_stats(0))), request!(client => get_operator(0)), - request!(client => load_global_config(vec![])), + request!(client => load_global_config(String::default())), ]; for (name, func) in test_funcs { @@ -101,14 +101,7 @@ fn test_pd_client_deadlock() { fn test_load_global_config() { let (mut _server, mut client) = new_test_server_and_client(ReadableDuration::millis(100)); let res = futures::executor::block_on(async move { - client - .load_global_config( - ["abc", "123", "xyz"] - .iter() - .map(|x| x.to_string()) - .collect::>(), - ) - .await + client.load_global_config("global".to_string()).await }); for (k, v) in res.unwrap() { assert_eq!(k, format!("/global/config/{}", v)) 
@@ -293,7 +286,9 @@ fn test_retry() { }); test_retry_success(&mut client, |c| block_on(c.get_gc_safe_point())); test_retry_success(&mut client, |c| c.get_operator(0)); - test_retry_success(&mut client, |c| block_on(c.load_global_config(vec![]))); + test_retry_success(&mut client, |c| { + block_on(c.load_global_config(String::default())) + }); fail::remove(pd_client_v2_timeout_fp); fail::remove(pd_client_v2_backoff_fp); diff --git a/tests/failpoints/cases/test_pd_client_legacy.rs b/tests/failpoints/cases/test_pd_client_legacy.rs index eb22ac29e45..172db8ac09e 100644 --- a/tests/failpoints/cases/test_pd_client_legacy.rs +++ b/tests/failpoints/cases/test_pd_client_legacy.rs @@ -73,7 +73,7 @@ fn test_pd_client_deadlock() { request!(client => block_on(get_store_stats_async(0))), request!(client => get_operator(0)), request!(client => block_on(get_tso())), - request!(client => load_global_config(vec![])), + request!(client => load_global_config(String::default())), ]; for (name, func) in test_funcs { @@ -108,16 +108,10 @@ fn test_pd_client_deadlock() { #[test] fn test_load_global_config() { let (mut _server, client) = new_test_server_and_client(ReadableDuration::millis(100)); - let res = futures::executor::block_on(async move { - client - .load_global_config( - ["abc", "123", "xyz"] - .iter() - .map(|x| x.to_string()) - .collect::>(), - ) - .await - }); + let res = + futures::executor::block_on( + async move { client.load_global_config("global".into()).await }, + ); for (k, v) in res.unwrap() { assert_eq!(k, format!("/global/config/{}", v)) } From 2daa168f13831ab9cfd653ad2971eccbb3f38a22 Mon Sep 17 00:00:00 2001 From: glorv Date: Fri, 13 Jan 2023 09:03:46 +0800 Subject: [PATCH 076/115] *: add resource group for the read path (#14001) ref tikv/tikv#13730 Signed-off-by: glorv --- Cargo.lock | 25 +- Cargo.toml | 3 + components/resource_control/Cargo.toml | 20 + components/resource_control/src/future.rs | 46 ++ components/resource_control/src/lib.rs | 18 + 
.../resource_control/src/resource_group.rs | 482 ++++++++++++++++++ components/server/Cargo.toml | 1 + components/server/src/server.rs | 21 + components/server/src/server2.rs | 21 + .../tikv_util/src/yatp_pool/future_pool.rs | 23 +- components/tikv_util/src/yatp_pool/mod.rs | 16 +- src/config/mod.rs | 5 + src/coprocessor/endpoint.rs | 12 + src/read_pool.rs | 95 ++-- src/storage/mod.rs | 32 ++ 15 files changed, 783 insertions(+), 37 deletions(-) create mode 100644 components/resource_control/Cargo.toml create mode 100644 components/resource_control/src/future.rs create mode 100644 components/resource_control/src/lib.rs create mode 100644 components/resource_control/src/resource_group.rs diff --git a/Cargo.lock b/Cargo.lock index c98cd025fad..0b7ca52725c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4656,6 +4656,25 @@ dependencies = [ "txn_types", ] +[[package]] +name = "resource_control" +version = "0.0.1" +dependencies = [ + "byteorder", + "crossbeam-skiplist", + "dashmap", + "kvproto", + "lazy_static", + "online_config", + "pin-project", + "prometheus", + "serde", + "slog", + "slog-global", + "tikv_util", + "yatp", +] + [[package]] name = "resource_metering" version = "0.0.1" @@ -5209,6 +5228,7 @@ dependencies = [ "raftstore-v2", "rand 0.8.5", "resolved_ts", + "resource_control", "resource_metering", "security", "serde_json", @@ -6290,6 +6310,7 @@ dependencies = [ "rand 0.7.3", "regex", "reqwest", + "resource_control", "resource_metering", "rev_lines", "seahash", @@ -7363,9 +7384,11 @@ checksum = "541b12c998c5b56aa2b4e6f18f03664eef9a4fd0a246a55594efae6cc2d964b5" [[package]] name = "yatp" version = "0.0.1" -source = "git+https://github.com/tikv/yatp.git?branch=master#39cb495953d40a7e846363c06090755c2eac65fa" +source = "git+https://github.com/tikv/yatp.git?branch=master#bcf431a2619c06ab7fa0c72073a0c775646c484f" dependencies = [ "crossbeam-deque", + "crossbeam-skiplist", + "crossbeam-utils 0.8.8", "dashmap", "fail", "lazy_static", diff --git a/Cargo.toml b/Cargo.toml 
index 4c8af61e554..d76dce26a18 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -139,6 +139,7 @@ raftstore = { workspace = true, features = ["engine_rocks"] } raftstore-v2 = { workspace = true } rand = "0.7.3" regex = "1.3" +resource_control = { workspace = true } resource_metering = { workspace = true } rev_lines = "0.2.1" seahash = "4.1.0" @@ -267,6 +268,7 @@ members = [ "components/raftstore", "components/raftstore-v2", "components/resolved_ts", + "components/resource_control", "components/resource_metering", "components/security", "components/server", @@ -341,6 +343,7 @@ raft_log_engine = { path = "components/raft_log_engine" } raftstore = { path = "components/raftstore", default-features = false } raftstore-v2 = { path = "components/raftstore-v2", default-features = false } resolved_ts = { path = "components/resolved_ts" } +resource_control = { path = "components/resource_control" } resource_metering = { path = "components/resource_metering" } security = { path = "components/security" } server = { path = "components/server" } diff --git a/components/resource_control/Cargo.toml b/components/resource_control/Cargo.toml new file mode 100644 index 00000000000..822aed2cd2d --- /dev/null +++ b/components/resource_control/Cargo.toml @@ -0,0 +1,20 @@ +[package] +name = "resource_control" +version = "0.0.1" +edition = "2021" +publish = false + +[dependencies] +byteorder = "1.2" +crossbeam-skiplist = "0.1" +dashmap = "5.1" +kvproto = { git = "https://github.com/pingcap/kvproto.git" } +lazy_static = "1.0" +online_config = { workspace = true } +pin-project = "1.0" +prometheus = { version = "0.13", features = ["nightly"] } +serde = { version = "1.0", features = ["derive"] } +slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } +slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +tikv_util = { workspace = true } +yatp = { git = 
"https://github.com/tikv/yatp.git", branch = "master" } diff --git a/components/resource_control/src/future.rs b/components/resource_control/src/future.rs new file mode 100644 index 00000000000..8027a27b394 --- /dev/null +++ b/components/resource_control/src/future.rs @@ -0,0 +1,46 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + future::Future, + pin::Pin, + sync::Arc, + task::{Context, Poll}, +}; + +use pin_project::pin_project; +use tikv_util::time::Instant; + +use crate::resource_group::{ResourceConsumeType, ResourceController}; + +#[pin_project] +pub struct ControlledFuture { + #[pin] + future: F, + controller: Arc, + group_name: Vec, +} + +impl ControlledFuture { + pub fn new(future: F, controller: Arc, group_name: Vec) -> Self { + Self { + future, + controller, + group_name, + } + } +} + +impl Future for ControlledFuture { + type Output = F::Output; + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.project(); + let now = Instant::now(); + let res = this.future.poll(cx); + this.controller.consume( + this.group_name, + ResourceConsumeType::CpuTime(now.saturating_elapsed()), + ); + res + } +} diff --git a/components/resource_control/src/lib.rs b/components/resource_control/src/lib.rs new file mode 100644 index 00000000000..516e5dd6c8d --- /dev/null +++ b/components/resource_control/src/lib.rs @@ -0,0 +1,18 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use online_config::OnlineConfig; +use serde::{Deserialize, Serialize}; + +mod resource_group; +pub use resource_group::{ResourceController, ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL}; + +mod future; +pub use future::ControlledFuture; + +#[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig, Default)] +#[serde(default)] +#[serde(rename_all = "kebab-case")] +pub struct Config { + #[online_config(skip)] + pub enabled: bool, +} diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs new file mode 100644 index 00000000000..d9fa3ccf14c --- /dev/null +++ b/components/resource_control/src/resource_group.rs @@ -0,0 +1,482 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, + time::Duration, +}; + +use dashmap::{mapref::one::Ref, DashMap}; +use kvproto::resource_manager::{GroupMode, ResourceGroup}; +use yatp::queue::priority::TaskPriorityProvider; + +// a read task cost at least 50us. +const DEFAULT_PRIORITY_PER_READ_TASK: u64 = 50; +// extra task schedule factor +const TASK_EXTRA_FACTOR_BY_LEVEL: [u64; 3] = [0, 20, 100]; +/// duration to update the minimal priority value of each resource group. +pub const MIN_PRIORITY_UPDATE_INTERVAL: Duration = Duration::from_secs(1); +/// default resource group name +const DEFAULT_RESOURCE_GROUP_NAME: &str = "default"; +/// default value of max RU quota. +const DEFAULT_MAX_RU_QUOTA: u64 = 10_000; + +pub enum ResourceConsumeType { + CpuTime(Duration), + IoBytes(u64), +} + +/// ResourceGroupManager manages the metadata of each resource group. 
+#[derive(Default)] +pub struct ResourceGroupManager { + resource_groups: DashMap, + registry: Mutex>>, +} + +impl ResourceGroupManager { + fn get_ru_setting(rg: &ResourceGroup, is_read: bool) -> u64 { + match (rg.get_mode(), is_read) { + (GroupMode::RuMode, true) => rg + .get_r_u_settings() + .get_r_r_u() + .get_settings() + .get_fill_rate(), + (GroupMode::RuMode, false) => rg + .get_r_u_settings() + .get_w_r_u() + .get_settings() + .get_fill_rate(), + // TODO: currently we only consider the cpu usage in the read path, we may also take + // io read bytes into account later. + (GroupMode::RawMode, true) => rg + .get_resource_settings() + .get_cpu() + .get_settings() + .get_fill_rate(), + (GroupMode::RawMode, false) => rg + .get_resource_settings() + .get_io_write() + .get_settings() + .get_fill_rate(), + // return a default value for unsupported config. + (GroupMode::Unknown, _) => 1, + } + } + + pub fn add_resource_group(&self, rg: ResourceGroup) { + let group_name = rg.get_name().to_ascii_lowercase(); + self.registry.lock().unwrap().iter().for_each(|controller| { + let ru_quota = Self::get_ru_setting(&rg, controller.is_read); + controller.add_resource_group(group_name.clone().into_bytes(), ru_quota); + }); + self.resource_groups.insert(group_name, rg); + } + + pub fn remove_resource_group(&self, name: &str) { + let group_name = name.to_ascii_lowercase(); + self.registry.lock().unwrap().iter().for_each(|controller| { + controller.remove_resource_group(group_name.as_bytes()); + }); + self.resource_groups.remove(&group_name); + } + + pub fn get_resource_group(&self, name: &str) -> Option> { + self.resource_groups.get(&name.to_ascii_lowercase()) + } + + pub fn get_all_resource_groups(&self) -> Vec { + self.resource_groups.iter().map(|g| g.clone()).collect() + } + + pub fn derive_controller(&self, name: String, is_read: bool) -> Arc { + let controller = Arc::new(ResourceController::new(name, is_read)); + self.registry.lock().unwrap().push(controller.clone()); + for g 
in &self.resource_groups { + let ru_quota = Self::get_ru_setting(g.value(), controller.is_read); + controller.add_resource_group(g.key().clone().into_bytes(), ru_quota); + } + + controller + } + + pub fn advance_min_virtual_time(&self) { + for controller in self.registry.lock().unwrap().iter() { + controller.update_min_virtual_time(); + } + } +} + +pub struct ResourceController { + // resource controller name is not used currently. + #[allow(dead_code)] + name: String, + // We handle the priority differently between read and write request: + // 1. the priority factor is calculate based on read/write RU settings. + // 2. for read request, we increase a constant virtual time delta at each `get_priority` call + // because the cost can't be calculated at start, so we only increase a constant delta and + // increase the real cost after task is executed; but don't increase it at write because + // the cost is known so we just pre-consume it. + is_read: bool, + // Track the maximum ru quota used to calculate the factor of each resource group. + // factor = max_ru_quota / group_ru_quota * 10.0 + // We use mutex here to ensure when we need to change this value and do adjust all resource + // groups' factors, it can't be changed concurrently. 
+ max_ru_quota: Mutex, + // record consumption of each resource group, name --> resource_group + resource_consumptions: DashMap, GroupPriorityTracker>, + + last_min_vt: AtomicU64, +} + +impl ResourceController { + pub fn new(name: String, is_read: bool) -> Self { + let controller = Self { + name, + is_read, + max_ru_quota: Mutex::new(DEFAULT_MAX_RU_QUOTA), + resource_consumptions: DashMap::new(), + last_min_vt: AtomicU64::new(0), + }; + // add the "default" resource group + controller.add_resource_group(DEFAULT_RESOURCE_GROUP_NAME.as_bytes().to_owned(), 0); + controller + } + + fn calculate_factor(max_quota: u64, quota: u64) -> u64 { + if quota > 0 { + // we use max_quota / quota as the resource group factor, but because we need to + // cast the value to integer, so we times it by 10 to ensure the accuracy is + // enough. + (max_quota as f64 / quota as f64 * 10.0).round() as u64 + } else { + 1 + } + } + + fn add_resource_group(&self, name: Vec, ru_quota: u64) { + let mut max_ru_quota = self.max_ru_quota.lock().unwrap(); + if ru_quota > *max_ru_quota { + *max_ru_quota = ru_quota; + // adjust all group weight because the current value is too small. + self.adjust_all_resource_group_factors(ru_quota); + } + let weight = Self::calculate_factor(*max_ru_quota, ru_quota); + + let vt_delta_for_get = if self.is_read { + DEFAULT_PRIORITY_PER_READ_TASK * weight + } else { + 0 + }; + let group = GroupPriorityTracker { + ru_quota, + weight, + virtual_time: AtomicU64::new(self.last_min_vt.load(Ordering::Acquire)), + vt_delta_for_get, + }; + // maybe update existed group + self.resource_consumptions.insert(name, group); + } + + // we calculate the weight of each resource group based on the currently maximum + // ru quota, if a incoming resource group has a bigger quota, we need to + // adjust all the existing groups. As we expect this won't happen very + // often, and iterate 10k entry cost less than 5ms, so the performance is + // acceptable. 
+ fn adjust_all_resource_group_factors(&self, max_ru_quota: u64) { + self.resource_consumptions.iter_mut().for_each(|mut g| { + g.value_mut().weight = Self::calculate_factor(max_ru_quota, g.ru_quota); + }); + } + + fn remove_resource_group(&self, name: &[u8]) { + // do not remove the default resource group, reset to default setting instead. + if DEFAULT_RESOURCE_GROUP_NAME.as_bytes() == name { + self.add_resource_group(DEFAULT_RESOURCE_GROUP_NAME.as_bytes().to_owned(), 0); + } + self.resource_consumptions.remove(name); + } + + #[inline] + fn resource_group(&self, name: &[u8]) -> Ref<'_, Vec, GroupPriorityTracker> { + if let Some(g) = self.resource_consumptions.get(name) { + g + } else { + self.resource_consumptions + .get(DEFAULT_RESOURCE_GROUP_NAME.as_bytes()) + .unwrap() + } + } + + pub fn consume(&self, name: &[u8], delta: ResourceConsumeType) { + self.resource_group(name).consume(delta) + } + + pub fn update_min_virtual_time(&self) { + let mut min_vt = u64::MAX; + let mut max_vt = 0; + self.resource_consumptions.iter().for_each(|g| { + let vt = g.current_vt(); + if min_vt > vt { + min_vt = vt; + } + if max_vt < vt { + max_vt = vt; + } + }); + + // TODO: use different threshold for different resource type + // needn't do update if the virtual different is less than 100ms/100KB. + if min_vt + 100_000 >= max_vt { + return; + } + + self.resource_consumptions.iter().for_each(|g| { + let vt = g.current_vt(); + if vt < max_vt { + // TODO: is increase by half is a good choice. + g.increase_vt((max_vt - vt) / 2); + } + }); + // max_vt is actually a little bigger than the current min vt, but we don't + // need totally accurate here. + self.last_min_vt.store(max_vt, Ordering::Relaxed); + } +} + +impl TaskPriorityProvider for ResourceController { + fn priority_of(&self, extras: &yatp::queue::Extras) -> u64 { + self.resource_group(extras.metadata()) + .get_priority(extras.current_level() as usize) + } +} + +struct GroupPriorityTracker { + // the ru setting of this group. 
+ ru_quota: u64, + weight: u64, + virtual_time: AtomicU64, + // the constant delta value for each `get_priority` call, + vt_delta_for_get: u64, +} + +impl GroupPriorityTracker { + fn get_priority(&self, level: usize) -> u64 { + let task_extra_priority = TASK_EXTRA_FACTOR_BY_LEVEL[level] * 1000 * self.weight; + (if self.vt_delta_for_get > 0 { + self.virtual_time + .fetch_add(self.vt_delta_for_get, Ordering::Relaxed) + + self.vt_delta_for_get + } else { + self.virtual_time.load(Ordering::Relaxed) + }) + task_extra_priority + } + + #[inline] + fn current_vt(&self) -> u64 { + self.virtual_time.load(Ordering::Relaxed) + } + + #[inline] + fn increase_vt(&self, vt_delta: u64) { + self.virtual_time.fetch_add(vt_delta, Ordering::Relaxed); + } + + // TODO: make it delta type as generic to avoid mixed consume different types. + #[inline] + fn consume(&self, delta: ResourceConsumeType) { + let vt_delta = match delta { + ResourceConsumeType::CpuTime(dur) => dur.as_micros() as u64, + ResourceConsumeType::IoBytes(bytes) => bytes, + } * self.weight; + self.increase_vt(vt_delta); + } +} + +#[cfg(test)] +mod tests { + use kvproto::resource_manager::*; + use yatp::queue::Extras; + + use super::*; + + fn new_resource_group( + name: String, + is_ru_mode: bool, + read_tokens: u64, + write_tokens: u64, + ) -> ResourceGroup { + let mut group = ResourceGroup::new(); + group.set_name(name); + let mode = if is_ru_mode { + GroupMode::RuMode + } else { + GroupMode::RawMode + }; + group.set_mode(mode); + if is_ru_mode { + let mut ru_setting = GroupRequestUnitSettings::new(); + ru_setting + .mut_r_r_u() + .mut_settings() + .set_fill_rate(read_tokens); + ru_setting + .mut_w_r_u() + .mut_settings() + .set_fill_rate(write_tokens); + group.set_r_u_settings(ru_setting); + } else { + let mut resource_setting = GroupResourceSettings::new(); + resource_setting + .mut_cpu() + .mut_settings() + .set_fill_rate(read_tokens); + resource_setting + .mut_io_write() + .mut_settings() + 
.set_fill_rate(write_tokens); + group.set_resource_settings(resource_setting); + } + group + } + + #[test] + fn test_resource_group() { + let resource_manager = ResourceGroupManager::default(); + + let group1 = new_resource_group("TEST".into(), true, 100, 100); + resource_manager.add_resource_group(group1); + + assert!(resource_manager.get_resource_group("test1").is_none()); + + let group = resource_manager.get_resource_group("test").unwrap(); + assert_eq!( + group + .value() + .get_r_u_settings() + .get_r_r_u() + .get_settings() + .get_fill_rate(), + 100 + ); + drop(group); + assert_eq!(resource_manager.resource_groups.len(), 1); + + let group1 = new_resource_group("Test".into(), true, 200, 100); + resource_manager.add_resource_group(group1); + let group = resource_manager.get_resource_group("test").unwrap(); + assert_eq!( + group + .value() + .get_r_u_settings() + .get_r_r_u() + .get_settings() + .get_fill_rate(), + 200 + ); + drop(group); + assert_eq!(resource_manager.resource_groups.len(), 1); + + let group2 = new_resource_group("test2".into(), true, 400, 200); + resource_manager.add_resource_group(group2); + assert_eq!(resource_manager.resource_groups.len(), 2); + + let resouce_ctl = resource_manager.derive_controller("test_read".into(), true); + assert_eq!(resouce_ctl.resource_consumptions.len(), 3); + + let group1 = resouce_ctl.resource_group("test".as_bytes()); + assert_eq!(group1.weight, 500); + let group2 = resouce_ctl.resource_group("test2".as_bytes()); + assert_eq!(group2.weight, 250); + assert_eq!(group1.current_vt(), 0); + + let mut extras1 = Extras::single_level(); + extras1.set_metadata("test".as_bytes().to_owned()); + assert_eq!(resouce_ctl.priority_of(&extras1), 25_000); + assert_eq!(group1.current_vt(), 25_000); + + let mut extras2 = Extras::single_level(); + extras2.set_metadata("test2".as_bytes().to_owned()); + assert_eq!(resouce_ctl.priority_of(&extras2), 12_500); + assert_eq!(group2.current_vt(), 12_500); + + let mut extras3 = 
Extras::single_level(); + extras3.set_metadata("unknown_group".as_bytes().to_owned()); + assert_eq!(resouce_ctl.priority_of(&extras3), 50); + assert_eq!( + resouce_ctl + .resource_group("default".as_bytes()) + .current_vt(), + 50 + ); + + resouce_ctl.consume( + "test".as_bytes(), + ResourceConsumeType::CpuTime(Duration::from_micros(10000)), + ); + resouce_ctl.consume( + "test2".as_bytes(), + ResourceConsumeType::CpuTime(Duration::from_micros(10000)), + ); + + assert_eq!(group1.current_vt(), 5_025_000); + assert_eq!(group1.current_vt(), group2.current_vt() * 2); + + // test update all group vts + resource_manager.advance_min_virtual_time(); + let group1_vt = group1.current_vt(); + assert_eq!(group1_vt, 5_025_000); + assert!(group2.current_vt() >= group1.current_vt() * 3 / 4); + assert!( + resouce_ctl + .resource_group("default".as_bytes()) + .current_vt() + >= group1.current_vt() / 2 + ); + + drop(group1); + drop(group2); + + // test add 1 new resource group + let new_group = new_resource_group("new_group".into(), true, 500, 500); + resource_manager.add_resource_group(new_group); + + assert_eq!(resouce_ctl.resource_consumptions.len(), 4); + let group3 = resouce_ctl.resource_group("new_group".as_bytes()); + assert_eq!(group3.weight, 200); + assert!(group3.current_vt() >= group1_vt / 2); + } + + #[test] + fn test_adjust_resource_group_weight() { + let resource_manager = ResourceGroupManager::default(); + let resource_ctl = resource_manager.derive_controller("test_read".into(), true); + let resource_ctl_write = resource_manager.derive_controller("test_write".into(), false); + + let group1 = new_resource_group("test1".into(), true, 5000, 1000); + resource_manager.add_resource_group(group1); + assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 20); + assert_eq!( + resource_ctl_write.resource_group("test1".as_bytes()).weight, + 100 + ); + + // add a resource group with big ru + let group1 = new_resource_group("test2".into(), true, 50000, 2000); + 
resource_manager.add_resource_group(group1); + assert_eq!(*resource_ctl.max_ru_quota.lock().unwrap(), 50000); + assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 100); + assert_eq!(resource_ctl.resource_group("test2".as_bytes()).weight, 10); + // resource_ctl_write should be unchanged. + assert_eq!(*resource_ctl_write.max_ru_quota.lock().unwrap(), 10000); + assert_eq!( + resource_ctl_write.resource_group("test1".as_bytes()).weight, + 100 + ); + assert_eq!( + resource_ctl_write.resource_group("test2".as_bytes()).weight, + 50 + ); + } +} diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index acdca09b29c..d5e2f177b5e 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -69,6 +69,7 @@ raftstore = { workspace = true, features = ["engine_rocks"] } raftstore-v2 = { workspace = true } rand = "0.8" resolved_ts = { workspace = true } +resource_control = { workspace = true } resource_metering = { workspace = true } security = { workspace = true } serde_json = "1.0" diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 3c926969ce2..52b9fbf1d1a 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -82,6 +82,7 @@ use raftstore::{ }, RaftRouterCompactedEventSender, }; +use resource_control::{ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL}; use security::SecurityManager; use snap_recovery::RecoveryService; use tikv::{ @@ -244,6 +245,7 @@ struct TikvServer { check_leader_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, + resource_manager: Arc, causal_ts_provider: Option>, // used for rawkv apiv2 tablet_registry: Option>, br_snap_recovery_mode: bool, // use for br snapshot recovery @@ -320,6 +322,7 @@ where let config = cfg_controller.get_current(); let store_path = Path::new(&config.storage.data_dir).to_owned(); + let resource_manager = Arc::new(ResourceGroupManager::default()); // Initialize raftstore channels. 
let (router, system) = fsm::create_raft_batch_system(&config.raft_store); @@ -328,6 +331,14 @@ where let background_worker = WorkerBuilder::new("background") .thread_count(thread_count) .create(); + // spawn a task to periodically update the minimal virtual time of all resource + // group. + if config.resource_control.enabled { + let resource_mgr1 = resource_manager.clone(); + background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { + resource_mgr1.advance_min_virtual_time(); + }); + } let mut coprocessor_host = Some(CoprocessorHost::new( router.clone(), @@ -398,6 +409,7 @@ where flow_info_receiver: None, sst_worker: None, quota_limiter, + resource_manager, causal_ts_provider, tablet_registry: None, br_snap_recovery_mode: is_recovering_marked, @@ -733,10 +745,19 @@ where } let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { + let priority_mgr = if self.config.resource_control.enabled { + Some( + self.resource_manager + .derive_controller("unified-read-pool".into(), true), + ) + } else { + None + }; Some(build_yatp_read_pool( &self.config.readpool.unified, pd_sender.clone(), engines.engine.clone(), + priority_mgr, )) } else { None diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 5d037fa3412..12e6af61613 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -62,6 +62,7 @@ use raftstore::{ RegionInfoAccessor, }; use raftstore_v2::{router::RaftRouter, StateStorage}; +use resource_control::{ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL}; use security::SecurityManager; use tikv::{ config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, @@ -221,6 +222,7 @@ struct TikvServer { check_leader_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, + resource_manager: Arc, causal_ts_provider: Option>, // used for rawkv apiv2 tablet_registry: Option>, } @@ -285,6 +287,15 @@ where config.quota.max_delay_duration, 
config.quota.enable_auto_tune, )); + let resource_manager = Arc::new(ResourceGroupManager::default()); + // spawn a task to periodically update the minimal virtual time of all resource + // group. + if config.resource_control.enabled { + let resource_mgr1 = resource_manager.clone(); + background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { + resource_mgr1.advance_min_virtual_time(); + }); + } let mut causal_ts_provider = None; if let ApiVersion::V2 = F::TAG { @@ -333,6 +344,7 @@ where flow_info_receiver: None, sst_worker: None, quota_limiter, + resource_manager, causal_ts_provider, tablet_registry: None, } @@ -622,10 +634,19 @@ where let pd_sender = raftstore_v2::FlowReporter::new(pd_worker.scheduler()); let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { + let priority_mgr = if self.config.resource_control.enabled { + Some( + self.resource_manager + .derive_controller("unified-read-pool".into(), true), + ) + } else { + None + }; Some(build_yatp_read_pool( &self.config.readpool.unified, pd_sender.clone(), engines.engine.clone(), + priority_mgr, )) } else { None diff --git a/components/tikv_util/src/yatp_pool/future_pool.rs b/components/tikv_util/src/yatp_pool/future_pool.rs index 9de2d49cb07..e74ced848c0 100644 --- a/components/tikv_util/src/yatp_pool/future_pool.rs +++ b/components/tikv_util/src/yatp_pool/future_pool.rs @@ -15,7 +15,7 @@ use fail::fail_point; use futures::channel::oneshot::{self, Canceled}; use prometheus::{IntCounter, IntGauge}; use tracker::TrackedFuture; -use yatp::task::future; +use yatp::{queue::Extras, task::future}; pub type ThreadPool = yatp::ThreadPool; @@ -82,7 +82,14 @@ impl FuturePool { where F: Future + Send + 'static, { - self.inner.spawn(TrackedFuture::new(future)) + self.inner.spawn(TrackedFuture::new(future), None) + } + + pub fn spawn_with_extras(&self, future: F, extras: Extras) -> Result<(), Full> + where + F: Future + Send + 'static, + { + 
self.inner.spawn(TrackedFuture::new(future), Some(extras)) } /// Spawns a future in the pool and returns a handle to the result of the @@ -143,7 +150,7 @@ impl PoolInner { } } - fn spawn(&self, future: F) -> Result<(), Full> + fn spawn(&self, future: F, extras: Option) -> Result<(), Full> where F: Future + Send + 'static, { @@ -154,11 +161,17 @@ impl PoolInner { metrics_running_task_count.inc(); - self.pool.spawn(async move { + let f = async move { let _ = future.await; metrics_handled_task_count.inc(); metrics_running_task_count.dec(); - }); + }; + + if let Some(extras) = extras { + self.pool.spawn(future::TaskCell::new(f, extras)); + } else { + self.pool.spawn(f); + } Ok(()) } diff --git a/components/tikv_util/src/yatp_pool/mod.rs b/components/tikv_util/src/yatp_pool/mod.rs index 6e246d6cddf..29376b904a5 100644 --- a/components/tikv_util/src/yatp_pool/mod.rs +++ b/components/tikv_util/src/yatp_pool/mod.rs @@ -10,7 +10,7 @@ pub use future_pool::{Full, FuturePool}; use prometheus::{local::LocalHistogram, Histogram}; use yatp::{ pool::{CloneRunnerBuilder, Local, Runner}, - queue::{multilevel, QueueType, TaskCell as _}, + queue::{multilevel, priority, QueueType, TaskCell as _}, task::future::{Runner as FutureRunner, TaskCell}, ThreadPool, }; @@ -282,6 +282,20 @@ impl YatpPoolBuilder { .build_with_queue_and_runner(QueueType::Multilevel(multilevel_builder), runner_builder) } + pub fn build_priority_pool( + &mut self, + priority_provider: Arc, + ) -> ThreadPool { + let (builder, read_pool_runner) = self.create_builder(); + let name = self.name_prefix.as_deref().unwrap_or("yatp_pool"); + let priority_builder = priority::Builder::new( + priority::Config::default().name(Some(name)), + priority_provider, + ); + let runner_builder = priority_builder.runner_builder(CloneRunnerBuilder(read_pool_runner)); + builder.build_with_queue_and_runner(QueueType::Priority(priority_builder), runner_builder) + } + fn create_builder(&mut self) -> (yatp::Builder, YatpPoolRunner) { let name = 
self.name_prefix.as_deref().unwrap_or("yatp_pool"); let mut builder = yatp::Builder::new(thd_name!(name)); diff --git a/src/config/mod.rs b/src/config/mod.rs index d2c5941c5ec..8d3e5477f26 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -55,6 +55,7 @@ use raftstore::{ coprocessor::{Config as CopConfig, RegionInfoAccessor}, store::{CompactionGuardGeneratorFactory, Config as RaftstoreConfig, SplitConfig}, }; +use resource_control::Config as ResourceControlConfig; use resource_metering::Config as ResourceMeteringConfig; use security::SecurityConfig; use serde::{ @@ -3039,6 +3040,9 @@ pub struct TikvConfig { #[online_config(skip)] pub causal_ts: CausalTsConfig, + + #[online_config(submodule)] + pub resource_control: ResourceControlConfig, } impl Default for TikvConfig { @@ -3081,6 +3085,7 @@ impl Default for TikvConfig { resource_metering: ResourceMeteringConfig::default(), backup_stream: BackupStreamConfig::default(), causal_ts: CausalTsConfig::default(), + resource_control: ResourceControlConfig::default(), } } } diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 54fcaeb0489..711cd83e607 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -470,6 +470,11 @@ impl Endpoint { let resource_tag = self .resource_tag_factory .new_tag_with_key_ranges(&req_ctx.context, key_ranges); + let group_name = req_ctx + .context + .get_resource_group_name() + .as_bytes() + .to_owned(); // box the tracker so that moving it is cheap. let tracker = Box::new(Tracker::new(req_ctx, self.slow_log_threshold)); @@ -480,6 +485,7 @@ impl Endpoint { .in_resource_metering_tag(resource_tag), priority, task_id, + group_name, ) .map_err(|_| Error::MaxPendingTasksExceeded); async move { res.await? 
} @@ -690,6 +696,11 @@ impl Endpoint { ) -> Result>> { let (tx, rx) = mpsc::channel::>(self.stream_channel_size); let priority = req_ctx.context.get_priority(); + let group_name = req_ctx + .context + .get_resource_group_name() + .as_bytes() + .to_owned(); let key_ranges = req_ctx .ranges .iter() @@ -712,6 +723,7 @@ impl Endpoint { }), priority, task_id, + group_name, ) .map_err(|_| Error::MaxPendingTasksExceeded)?; Ok(rx) diff --git a/src/read_pool.rs b/src/read_pool.rs index 5212c4ae594..1a590679584 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -11,6 +11,7 @@ use futures::{channel::oneshot, future::TryFutureExt}; use kvproto::kvrpcpb::CommandPri; use online_config::{ConfigChange, ConfigManager, ConfigValue, Result as CfgResult}; use prometheus::{IntCounter, IntGauge}; +use resource_control::{ControlledFuture, ResourceController}; use thiserror::Error; use tikv_util::{ sys::{cpu_time::ProcessStat, SysQuota}, @@ -52,6 +53,7 @@ pub enum ReadPool { running_threads: IntGauge, max_tasks: usize, pool_size: usize, + resource_ctl: Option>, }, } @@ -73,12 +75,14 @@ impl ReadPool { running_threads, max_tasks, pool_size, + resource_ctl, } => ReadPoolHandle::Yatp { remote: pool.remote().clone(), running_tasks: running_tasks.clone(), running_threads: running_threads.clone(), max_tasks: *max_tasks, pool_size: *pool_size, + resource_ctl: resource_ctl.clone(), }, } } @@ -97,11 +101,18 @@ pub enum ReadPoolHandle { running_threads: IntGauge, max_tasks: usize, pool_size: usize, + resource_ctl: Option>, }, } impl ReadPoolHandle { - pub fn spawn(&self, f: F, priority: CommandPri, task_id: u64) -> Result<(), ReadPoolError> + pub fn spawn( + &self, + f: F, + priority: CommandPri, + task_id: u64, + group_meta: Vec, + ) -> Result<(), ReadPoolError> where F: Future + Send + 'static, { @@ -123,6 +134,7 @@ impl ReadPoolHandle { remote, running_tasks, max_tasks, + resource_ctl, .. 
} => { let running_tasks = running_tasks.clone(); @@ -140,14 +152,29 @@ impl ReadPoolHandle { CommandPri::Normal => None, CommandPri::Low => Some(2), }; - let extras = Extras::new_multilevel(task_id, fixed_level); - let task_cell = TaskCell::new( - TrackedFuture::new(async move { - f.await; - running_tasks.dec(); - }), - extras, - ); + let mut extras = Extras::new_multilevel(task_id, fixed_level); + extras.set_metadata(group_meta.clone()); + let task_cell = if let Some(resource_ctl) = resource_ctl { + TaskCell::new( + TrackedFuture::new(ControlledFuture::new( + async move { + f.await; + running_tasks.dec(); + }, + resource_ctl.clone(), + group_meta, + )), + extras, + ) + } else { + TaskCell::new( + TrackedFuture::new(async move { + f.await; + running_tasks.dec(); + }), + extras, + ) + }; remote.spawn(task_cell); } } @@ -159,6 +186,7 @@ impl ReadPoolHandle { f: F, priority: CommandPri, task_id: u64, + group_meta: Vec, ) -> impl Future> where F: Future + Send + 'static, @@ -172,6 +200,7 @@ impl ReadPoolHandle { }, priority, task_id, + group_meta, ); async move { res?; @@ -262,11 +291,12 @@ pub fn build_yatp_read_pool( config: &UnifiedReadPoolConfig, reporter: R, engine: E, + resource_ctl: Option>, ) -> ReadPool { let unified_read_pool_name = get_unified_read_pool_name(); - let mut builder = YatpPoolBuilder::new(ReporterTicker { reporter }); let raftkv = Arc::new(Mutex::new(engine)); - let pool = builder + let mut builder = YatpPoolBuilder::new(ReporterTicker { reporter }); + builder .name_prefix(&unified_read_pool_name) .stack_size(config.stack_size.0 as usize) .thread_count( @@ -284,8 +314,12 @@ pub fn build_yatp_read_pool( }) .before_stop(|| unsafe { destroy_tls_engine::(); - }) - .build_multi_level_pool(); + }); + let pool = if let Some(ref r) = resource_ctl { + builder.build_priority_pool(r.clone()) + } else { + builder.build_multi_level_pool() + }; ReadPool::Yatp { pool, running_tasks: UNIFIED_READ_POOL_RUNNING_TASKS @@ -296,6 +330,7 @@ pub fn 
build_yatp_read_pool( .max_tasks_per_worker .saturating_mul(config.max_thread_count), pool_size: config.max_thread_count, + resource_ctl, } } @@ -600,7 +635,7 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = build_yatp_read_pool(&config, DummyReporter, engine); + let pool = build_yatp_read_pool(&config, DummyReporter, engine, None); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -616,18 +651,18 @@ mod tests { let (task3, _tx3) = gen_task(); let (task4, _tx4) = gen_task(); - handle.spawn(task1, CommandPri::Normal, 1).unwrap(); - handle.spawn(task2, CommandPri::Normal, 2).unwrap(); + handle.spawn(task1, CommandPri::Normal, 1, vec![]).unwrap(); + handle.spawn(task2, CommandPri::Normal, 2, vec![]).unwrap(); thread::sleep(Duration::from_millis(300)); - match handle.spawn(task3, CommandPri::Normal, 3) { + match handle.spawn(task3, CommandPri::Normal, 3, vec![]) { Err(ReadPoolError::UnifiedReadPoolFull) => {} _ => panic!("should return full error"), } tx1.send(()).unwrap(); thread::sleep(Duration::from_millis(300)); - handle.spawn(task4, CommandPri::Normal, 4).unwrap(); + handle.spawn(task4, CommandPri::Normal, 4, vec![]).unwrap(); } #[test] @@ -641,7 +676,7 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = build_yatp_read_pool(&config, DummyReporter, engine); + let pool = build_yatp_read_pool(&config, DummyReporter, engine, None); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -658,11 +693,11 @@ mod tests { let (task4, _tx4) = gen_task(); let (task5, _tx5) = gen_task(); - handle.spawn(task1, CommandPri::Normal, 1).unwrap(); - handle.spawn(task2, CommandPri::Normal, 2).unwrap(); + handle.spawn(task1, CommandPri::Normal, 1, vec![]).unwrap(); + handle.spawn(task2, CommandPri::Normal, 2, vec![]).unwrap(); thread::sleep(Duration::from_millis(300)); - match handle.spawn(task3, 
CommandPri::Normal, 3) { + match handle.spawn(task3, CommandPri::Normal, 3, vec![]) { Err(ReadPoolError::UnifiedReadPoolFull) => {} _ => panic!("should return full error"), } @@ -670,10 +705,10 @@ mod tests { handle.scale_pool_size(3); assert_eq!(handle.get_normal_pool_size(), 3); - handle.spawn(task4, CommandPri::Normal, 4).unwrap(); + handle.spawn(task4, CommandPri::Normal, 4, vec![]).unwrap(); thread::sleep(Duration::from_millis(300)); - match handle.spawn(task5, CommandPri::Normal, 5) { + match handle.spawn(task5, CommandPri::Normal, 5, vec![]) { Err(ReadPoolError::UnifiedReadPoolFull) => {} _ => panic!("should return full error"), } @@ -690,7 +725,7 @@ mod tests { // max running tasks number should be 2*1 = 2 let engine = TestEngineBuilder::new().build().unwrap(); - let pool = build_yatp_read_pool(&config, DummyReporter, engine); + let pool = build_yatp_read_pool(&config, DummyReporter, engine, None); let gen_task = || { let (tx, rx) = oneshot::channel::<()>(); @@ -707,11 +742,11 @@ mod tests { let (task4, _tx4) = gen_task(); let (task5, _tx5) = gen_task(); - handle.spawn(task1, CommandPri::Normal, 1).unwrap(); - handle.spawn(task2, CommandPri::Normal, 2).unwrap(); + handle.spawn(task1, CommandPri::Normal, 1, vec![]).unwrap(); + handle.spawn(task2, CommandPri::Normal, 2, vec![]).unwrap(); thread::sleep(Duration::from_millis(300)); - match handle.spawn(task3, CommandPri::Normal, 3) { + match handle.spawn(task3, CommandPri::Normal, 3, vec![]) { Err(ReadPoolError::UnifiedReadPoolFull) => {} _ => panic!("should return full error"), } @@ -723,10 +758,10 @@ mod tests { handle.scale_pool_size(1); assert_eq!(handle.get_normal_pool_size(), 1); - handle.spawn(task4, CommandPri::Normal, 4).unwrap(); + handle.spawn(task4, CommandPri::Normal, 4, vec![]).unwrap(); thread::sleep(Duration::from_millis(300)); - match handle.spawn(task5, CommandPri::Normal, 5) { + match handle.spawn(task5, CommandPri::Normal, 5, vec![]) { Err(ReadPoolError::UnifiedReadPoolFull) => {} _ => 
panic!("should return full error"), } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 802b0507849..0819c2599b9 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -594,6 +594,7 @@ impl Storage { let stage_begin_ts = Instant::now(); const CMD: CommandKind = CommandKind::get; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag_with_key_ranges( &ctx, @@ -727,6 +728,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) @@ -750,6 +752,11 @@ impl Storage { const CMD: CommandKind = CommandKind::batch_get_command; // all requests in a batch have the same region, epoch, term, replica_read let priority = requests[0].get_context().get_priority(); + let group_name = requests[0] + .get_context() + .get_resource_group_name() + .as_bytes() + .to_owned(); let concurrency_manager = self.concurrency_manager.clone(); let api_version = self.api_version; @@ -910,6 +917,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) @@ -929,6 +937,7 @@ impl Storage { let stage_begin_ts = Instant::now(); const CMD: CommandKind = CommandKind::batch_get; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let key_ranges = keys .iter() @@ -1082,6 +1091,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -1109,6 +1119,7 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::scan; let priority = ctx.get_priority(); + let group_name = 
ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag_with_key_ranges( &ctx, @@ -1258,6 +1269,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -1276,6 +1288,7 @@ impl Storage { ) -> impl Future>> { const CMD: CommandKind = CommandKind::scan_lock; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag_with_key_ranges( &ctx, @@ -1405,6 +1418,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) @@ -1577,6 +1591,7 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::raw_get; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self .resource_tag_factory @@ -1639,6 +1654,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -1657,6 +1673,11 @@ impl Storage { const CMD: CommandKind = CommandKind::raw_batch_get_command; // all requests in a batch have the same region, epoch, term, replica_read let priority = gets[0].get_context().get_priority(); + let group_name = gets[0] + .get_context() + .get_resource_group_name() + .as_bytes() + .to_owned(); let priority_tag = get_priority_tag(priority); let api_version = self.api_version; @@ -1770,6 +1791,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { res.map_err(|_| Error::from(ErrorInner::SchedTooBusy)) @@ -1786,6 +1808,7 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = 
CommandKind::raw_batch_get; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let key_ranges = keys.iter().map(|k| (k.clone(), k.clone())).collect(); let resource_tag = self @@ -1866,6 +1889,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -2272,6 +2296,7 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::raw_scan; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self.resource_tag_factory.new_tag(&ctx); let api_version = self.api_version; @@ -2380,6 +2405,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -2400,6 +2426,7 @@ impl Storage { ) -> impl Future>>> { const CMD: CommandKind = CommandKind::raw_batch_scan; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let key_ranges = ranges .iter() @@ -2536,6 +2563,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -2553,6 +2581,7 @@ impl Storage { ) -> impl Future>> { const CMD: CommandKind = CommandKind::raw_get_key_ttl; let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let resource_tag = self .resource_tag_factory @@ -2615,6 +2644,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { @@ -2719,6 +2749,7 @@ impl Storage { ) -> impl Future> { const CMD: CommandKind = CommandKind::raw_checksum; let priority = ctx.get_priority(); + let group_name = 
ctx.get_resource_group_name().as_bytes().to_owned(); let priority_tag = get_priority_tag(priority); let key_ranges = ranges .iter() @@ -2793,6 +2824,7 @@ impl Storage { .in_resource_metering_tag(resource_tag), priority, thread_rng().next_u64(), + group_name, ); async move { From 321aa833ca5ec0fd5dcec7fa8c01f65116d72ba6 Mon Sep 17 00:00:00 2001 From: MyonKeminta <9948422+MyonKeminta@users.noreply.github.com> Date: Fri, 13 Jan 2023 11:49:46 +0800 Subject: [PATCH 077/115] txn: Do constraint check when handling repeated acquire_pessimistic_lock request (#14037) close tikv/tikv#14038, close pingcap/tidb#40114 Fixes the problem that when a repeated acquire_pessimistic_lock request is received, should_not_exist is ignored. TiKV provides idempotency for these RPC requests, but for acquire_pessimistic_lock, it ignored the possibility that the client may expect a pessimistic_rollback between two acquire_pessimistic_lock requests on the same key. In this case the second request may come from another statement and carry `should_not_exist` that wasn't set in the previously finished pessimistic lock request. If the first request successfully acquired the lock and the pessimistic_rollback failed, TiKV may return a successful response, making the client believe that the key doesn't exist before. In some rare cases, this risks causing data inconsistency. 
Signed-off-by: MyonKeminta Co-authored-by: Ti Chi Robot --- .../txn/actions/acquire_pessimistic_lock.rs | 150 +++++++++++++++++- 1 file changed, 146 insertions(+), 4 deletions(-) diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index fcffd500c8e..86b9ddeab41 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -142,10 +142,22 @@ pub fn acquire_pessimistic_lock( None }; - if need_load_value { - val = reader.get(&key, for_update_ts)?; - } else if need_check_existence { - val = reader.get_write(&key, for_update_ts)?.map(|_| vec![]); + if need_load_value || need_check_existence || should_not_exist { + let write = reader.get_write_with_commit_ts(&key, for_update_ts)?; + if let Some((write, commit_ts)) = write { + // Here `get_write_with_commit_ts` returns only the latest PUT if it exists and + // is not deleted. It's still ok to pass it into `check_data_constraint`. + // In case we are going to lock it with write conflict, we do not check it since + // the statement will then retry. + if locked_with_conflict_ts.is_none() { + check_data_constraint(reader, should_not_exist, &write, commit_ts, &key)?; + } + if need_load_value { + val = Some(reader.load_data(&key, write)?); + } else if need_check_existence { + val = Some(vec![]); + } + } } // Pervious write is not loaded. let (prev_write_loaded, prev_write) = (false, None); @@ -1832,4 +1844,134 @@ pub mod tests { must_pessimistic_rollback(&mut engine, b"k1", 10, 50); must_unlocked(&mut engine, b"k1"); } + + #[test] + fn test_repeated_request_check_should_not_exist() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + for &(return_values, check_existence) in + &[(false, false), (false, true), (true, false), (true, true)] + { + let key = &[b'k', (return_values as u8 * 2) + check_existence as u8] as &[u8]; + + // An empty key. 
+ must_succeed(&mut engine, key, key, 10, 10); + let res = must_succeed_impl( + &mut engine, + key, + key, + 10, + true, + 1000, + 10, + return_values, + check_existence, + 15, + false, + ); + assert!(res.is_none()); + must_pessimistic_prewrite_lock(&mut engine, key, key, 10, 10, DoPessimisticCheck); + must_commit(&mut engine, key, 10, 19); + + // The key has one record: Lock(10, 19) + must_succeed(&mut engine, key, key, 20, 20); + let res = must_succeed_impl( + &mut engine, + key, + key, + 20, + true, + 1000, + 20, + return_values, + check_existence, + 25, + false, + ); + assert!(res.is_none()); + must_pessimistic_prewrite_put(&mut engine, key, b"v1", key, 20, 20, DoPessimisticCheck); + must_commit(&mut engine, key, 20, 29); + + // The key has records: + // Lock(10, 19), Put(20, 29) + must_succeed(&mut engine, key, key, 30, 30); + let error = must_err_impl( + &mut engine, + key, + key, + 30, + true, + 30, + return_values, + check_existence, + 35, + false, + ); + assert!(matches!( + error, + MvccError(box ErrorInner::AlreadyExist { .. }) + )); + must_pessimistic_prewrite_lock(&mut engine, key, key, 30, 30, DoPessimisticCheck); + must_commit(&mut engine, key, 30, 39); + + // Lock(10, 19), Put(20, 29), Lock(30, 39) + must_succeed(&mut engine, key, key, 40, 40); + let error = must_err_impl( + &mut engine, + key, + key, + 40, + true, + 40, + return_values, + check_existence, + 45, + false, + ); + assert!(matches!( + error, + MvccError(box ErrorInner::AlreadyExist { .. 
}) + )); + must_pessimistic_prewrite_delete(&mut engine, key, key, 40, 40, DoPessimisticCheck); + must_commit(&mut engine, key, 40, 49); + + // Lock(10, 19), Put(20, 29), Lock(30, 39), Delete(40, 49) + must_succeed(&mut engine, key, key, 50, 50); + let res = must_succeed_impl( + &mut engine, + key, + key, + 50, + true, + 1000, + 50, + return_values, + check_existence, + 55, + false, + ); + assert!(res.is_none()); + must_pessimistic_prewrite_lock(&mut engine, key, key, 50, 50, DoPessimisticCheck); + must_commit(&mut engine, key, 50, 59); + + // Lock(10, 19), Put(20, 29), Lock(30, 39), Delete(40, 49), Lock(50, 59) + must_succeed(&mut engine, key, key, 60, 60); + let res = must_succeed_impl( + &mut engine, + key, + key, + 60, + true, + 1000, + 60, + return_values, + check_existence, + 65, + false, + ); + assert!(res.is_none()); + must_pessimistic_prewrite_lock(&mut engine, key, key, 60, 60, DoPessimisticCheck); + must_commit(&mut engine, key, 60, 69); + } + } } From 65a99a89b9f03de1ca24cee8c33584d13370becc Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 13 Jan 2023 14:27:46 +0800 Subject: [PATCH 078/115] raftstore-v2: fix metrics and perf context (#14035) ref tikv/tikv#12842 This PR fixes several bugs and metrics: - Now waterfall timer will be reset in before_write, the goal is to solve the confusion that stall writes can pollute the whole waterfall metrics. - Perf context is changed not to be associated with engine instance. Perf context is thread local and instance independent under the hook. - Fix flushed index advance failure due to suspicious flush. 
- Support print long uncommitted logs and fix incorrect commit time Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/engine_panic/src/perf_context.rs | 2 +- components/engine_rocks/src/perf_context.rs | 2 +- components/engine_tirocks/src/perf_context.rs | 1 - components/engine_traits/src/flush.rs | 5 +- components/engine_traits/src/perf_context.rs | 2 +- components/raft_log_engine/src/engine.rs | 2 +- components/raftstore-v2/src/batch/store.rs | 1 + components/raftstore-v2/src/fsm/peer.rs | 2 +- .../operation/command/admin/compact_log.rs | 5 + .../raftstore-v2/src/operation/command/mod.rs | 36 ++++--- .../raftstore-v2/src/operation/query/lease.rs | 2 +- .../src/operation/query/replica.rs | 2 +- .../src/operation/ready/apply_trace.rs | 33 +++++-- .../raftstore-v2/src/operation/ready/mod.rs | 66 ++++++++++++- components/raftstore-v2/src/raft/apply.rs | 2 +- components/raftstore-v2/src/raft/peer.rs | 18 +++- .../src/router/response_channel.rs | 94 +++++++++++++------ .../raftstore-v2/src/worker/tablet_gc.rs | 2 + .../raftstore/src/store/async_io/write.rs | 13 +-- components/raftstore/src/store/fsm/apply.rs | 14 ++- components/raftstore/src/store/fsm/peer.rs | 2 +- components/raftstore/src/store/fsm/store.rs | 16 ++-- .../raftstore/src/store/local_metrics.rs | 73 ++++++++------ components/raftstore/src/store/msg.rs | 19 +--- components/raftstore/src/store/peer.rs | 4 +- src/coprocessor/tracker.rs | 32 +++---- src/storage/metrics.rs | 26 ++--- 27 files changed, 303 insertions(+), 173 deletions(-) diff --git a/components/engine_panic/src/perf_context.rs b/components/engine_panic/src/perf_context.rs index 46d18c00e77..27bdd1ac066 100644 --- a/components/engine_panic/src/perf_context.rs +++ b/components/engine_panic/src/perf_context.rs @@ -8,7 +8,7 @@ use crate::engine::PanicEngine; impl PerfContextExt for PanicEngine { type PerfContext = PanicPerfContext; - fn get_perf_context(&self, level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { + fn 
get_perf_context(level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { panic!() } } diff --git a/components/engine_rocks/src/perf_context.rs b/components/engine_rocks/src/perf_context.rs index a731a9461dc..f8cfdbcc667 100644 --- a/components/engine_rocks/src/perf_context.rs +++ b/components/engine_rocks/src/perf_context.rs @@ -8,7 +8,7 @@ use crate::{engine::RocksEngine, perf_context_impl::PerfContextStatistics}; impl PerfContextExt for RocksEngine { type PerfContext = RocksPerfContext; - fn get_perf_context(&self, level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { + fn get_perf_context(level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { RocksPerfContext::new(level, kind) } } diff --git a/components/engine_tirocks/src/perf_context.rs b/components/engine_tirocks/src/perf_context.rs index d1d975c65c3..643967230df 100644 --- a/components/engine_tirocks/src/perf_context.rs +++ b/components/engine_tirocks/src/perf_context.rs @@ -136,7 +136,6 @@ impl engine_traits::PerfContextExt for RocksEngine { type PerfContext = RocksPerfContext; fn get_perf_context( - &self, level: engine_traits::PerfLevel, kind: engine_traits::PerfContextKind, ) -> Self::PerfContext { diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index b3a827c234e..8300348da8c 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -157,7 +157,10 @@ impl PersistenceListener { } match flushed_pr { Some(pr) => pr, - None => panic!("{} not found in {:?}", cf, prs), + None => panic!( + "[region_id={}] [tablet_index={}] {} not found in {:?}", + self.region_id, self.tablet_index, cf, prs + ), } }; self.storage diff --git a/components/engine_traits/src/perf_context.rs b/components/engine_traits/src/perf_context.rs index ba48974a460..44462e3fe3c 100644 --- a/components/engine_traits/src/perf_context.rs +++ b/components/engine_traits/src/perf_context.rs @@ -37,7 +37,7 @@ numeric_enum_serializing_mod! 
{perf_level_serde PerfLevel { pub trait PerfContextExt { type PerfContext: PerfContext; - fn get_perf_context(&self, level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext; + fn get_perf_context(level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext; } /// The subsystem the PerfContext is being created for. diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 3db865ed8ad..838fe461f4b 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -366,7 +366,7 @@ impl RaftLogEngine { impl PerfContextExt for RaftLogEngine { type PerfContext = RaftEnginePerfContext; - fn get_perf_context(&self, _level: PerfLevel, _kind: PerfContextKind) -> Self::PerfContext { + fn get_perf_context(_level: PerfLevel, _kind: PerfContextKind) -> Self::PerfContext { RaftEnginePerfContext } } diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 6183778c369..ccf3f19f3ea 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -188,6 +188,7 @@ impl PollHandler PeerFsmDelegate<'a, EK, ER, PeerTick::CheckLeaderLease => unimplemented!(), PeerTick::ReactivateMemoryLock => self.on_reactivate_memory_lock_tick(), PeerTick::ReportBuckets => unimplemented!(), - PeerTick::CheckLongUncommitted => unimplemented!(), + PeerTick::CheckLongUncommitted => self.on_check_long_uncommitted(), } } diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs index a4983b28a47..0f5fd9b392f 100644 --- a/components/raftstore-v2/src/operation/command/admin/compact_log.rs +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -73,6 +73,11 @@ impl CompactLogContext { pub fn set_last_applying_index(&mut self, index: u64) { self.last_applying_index = index; } + + #[inline] + pub fn last_applying_index(&self) 
-> u64 { + self.last_applying_index + } } impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 047fe026ffe..cf29d9ee25a 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -32,7 +32,7 @@ use raftstore::{ apply::{self, APPLY_WB_SHRINK_SIZE, SHRINK_PENDING_CMD_QUEUE_CAP}, Proposal, }, - local_metrics::{RaftMetrics, TimeTracker}, + local_metrics::RaftMetrics, metrics::{APPLY_TASK_WAIT_TIME_HISTOGRAM, APPLY_TIME_HISTOGRAM}, msg::ErrorCallback, util, Config, WriteCallback, @@ -302,9 +302,7 @@ impl Peer { t.metrics.write_instant = Some(now); &mut t.metrics.store_time_nanos }); - if let TimeTracker::Instant(t) = tracker { - *t = now; - } + tracker.reset(now); } } } @@ -314,7 +312,7 @@ impl Peer { return; } // TODO: remove following log once stable. - info!(self.logger, "on_apply_res"; "apply_res" => ?apply_res); + info!(self.logger, "on_apply_res"; "apply_res" => ?apply_res, "apply_trace" => ?self.storage().apply_trace()); // It must just applied a snapshot. if apply_res.applied_index < self.entry_storage().first_index() { // Ignore admin command side effects, otherwise it may split incomplete @@ -378,6 +376,12 @@ impl Peer { scheduler.send(ApplyTask::ManualFlush); } } + let last_applying_index = self.compact_log_context().last_applying_index(); + let committed_index = self.entry_storage().commit_index(); + if last_applying_index < committed_index { + // We need to continue to apply after previous page is finished. 
+ self.set_has_ready(); + } } } @@ -691,11 +695,23 @@ impl Apply { .iter() .flat_map(|(v, _)| { v.write_trackers() - .flat_map(|t| t.as_tracker_token().cloned()) + .flat_map(|t| t.as_tracker_token()) }) .collect(); self.perf_context().report_metrics(&tokens); } + let mut apply_res = ApplyRes::default(); + apply_res.applied_index = index; + apply_res.applied_term = term; + apply_res.admin_result = self.take_admin_result().into_boxed_slice(); + apply_res.modifications = *self.modifications_mut(); + apply_res.metrics = mem::take(&mut self.metrics); + let written_bytes = apply_res.metrics.written_bytes; + self.res_reporter().report(apply_res); + + // Report result first and then invoking callbacks. This may delays callback a + // little bit, but can make sure all following messages must see the side + // effect of admin commands. let callbacks = self.callbacks_mut(); let now = std::time::Instant::now(); let apply_time = APPLY_TIME_HISTOGRAM.local(); @@ -709,14 +725,6 @@ impl Apply { if callbacks.capacity() > SHRINK_PENDING_CMD_QUEUE_CAP { callbacks.shrink_to(SHRINK_PENDING_CMD_QUEUE_CAP); } - let mut apply_res = ApplyRes::default(); - apply_res.applied_index = index; - apply_res.applied_term = term; - apply_res.admin_result = self.take_admin_result().into_boxed_slice(); - apply_res.modifications = *self.modifications_mut(); - apply_res.metrics = mem::take(&mut self.metrics); - let written_bytes = apply_res.metrics.written_bytes; - self.res_reporter().report(apply_res); written_bytes } } diff --git a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs index 0abd0cccd72..3185f1bd24b 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ b/components/raftstore-v2/src/operation/query/lease.rs @@ -112,7 +112,7 @@ impl Peer { let time = monotonic_raw_now(); for (_, ch, mut read_index) in read_index_req.take_cmds().drain(..) 
{ ch.read_tracker().map(|tracker| { - GLOBAL_TRACKERS.with_tracker(*tracker, |t| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { t.metrics.read_index_confirm_wait_nanos = (time - read_index_req.propose_time) .to_std() .unwrap() diff --git a/components/raftstore-v2/src/operation/query/replica.rs b/components/raftstore-v2/src/operation/query/replica.rs index fb00adbbc5a..901fd9726f6 100644 --- a/components/raftstore-v2/src/operation/query/replica.rs +++ b/components/raftstore-v2/src/operation/query/replica.rs @@ -75,7 +75,7 @@ impl Peer { let time = monotonic_raw_now(); for (req, ch, _) in read_index_req.take_cmds().drain(..) { ch.read_tracker().map(|tracker| { - GLOBAL_TRACKERS.with_tracker(*tracker, |t| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { t.metrics.read_index_confirm_wait_nanos = (time - read_index_req.propose_time) .to_std() .unwrap() diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index 5b88a6ba94d..67bbed5aa4b 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -40,7 +40,7 @@ use kvproto::{ use raftstore::store::{ ReadTask, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }; -use slog::{trace, Logger}; +use slog::{info, trace, Logger}; use tikv_util::{box_err, slog_panic, worker::Scheduler}; use crate::{ @@ -133,7 +133,8 @@ pub type DataTrace = [u64; DATA_CFS_LEN]; #[derive(Clone, Copy, Default, Debug)] struct Progress { flushed: u64, - /// The index of last entry that has modification to the CF. + /// The index of last entry that has modification to the CF. The value + /// can be larger than the index that actually modifies the CF in apply. /// /// If `flushed` == `last_modified`, then all data in the CF is persisted. 
last_modified: u64, @@ -192,9 +193,13 @@ impl ApplyTrace { trace.admin.last_modified = i; trace.persisted_applied = i; trace.last_flush_trigger = i; - let applied_region_state = engine - .get_region_state(region_id, trace.admin.flushed)? - .unwrap(); + let applied_region_state = match engine.get_region_state(region_id, trace.admin.flushed)? { + Some(s) => s, + None => panic!( + "failed to get region state [region_id={}] [apply_trace={:?}]", + region_id, trace + ), + }; Ok((trace, applied_region_state)) } @@ -242,7 +247,7 @@ impl ApplyTrace { } }) .max(); - if let Some(m) = last_modified && m >= self.admin.flushed + 4096 && m >= self.last_flush_trigger + 4096 { + if let Some(m) = last_modified && m >= self.admin.flushed + 4096000 && m >= self.last_flush_trigger + 4096000 { self.last_flush_trigger = m; true } else { @@ -257,10 +262,17 @@ impl ApplyTrace { } let min_flushed = self .data_cfs - .iter() + .iter_mut() // Only unflushed CFs are considered. Flushed CF always have uptodate changes // persisted. .filter_map(|pr| { + // All modifications before mem_index must be seen. If following condition is + // true, it means the modification comes beyond general apply process (like + // transaction GC unsafe write). Align `last_modified` to `flushed` to avoid + // blocking raft log GC. 
+ if mem_index >= pr.flushed && pr.flushed > pr.last_modified { + pr.last_modified = pr.flushed; + } if pr.last_modified != pr.flushed { Some(pr.flushed) } else { @@ -484,6 +496,7 @@ impl Storage { let lb = write_task .extra_write .ensure_v2(|| raft_engine.log_batch(1)); + info!(self.logger(), "persisting admin flushed"; "tablet_index" => tablet_index, "flushed" => trace.admin.flushed); let trace = self.apply_trace_mut(); lb.put_flushed_index(region_id, CF_RAFT, tablet_index, trace.admin.flushed) .unwrap(); @@ -660,6 +673,12 @@ mod tests { ([(8, 2), (9, 3), (7, 5)], (4, 4), 5, 5), ([(8, 2), (9, 3), (7, 5)], (5, 5), 5, 5), ([(2, 3), (9, 3), (7, 5)], (2, 2), 5, 2), + // In special case, some CF may be flushed without any modification recorded, + // we should still be able to advance the apply index forward. + ([(5, 2), (9, 3), (7, 3)], (2, 2), 3, 3), + ([(5, 2), (9, 3), (7, 3)], (2, 2), 6, 6), + ([(5, 2), (9, 3), (7, 3)], (2, 2), 10, 10), + ([(5, 2), (9, 3), (7, 3)], (2, 3), 10, 2), ]; for (case, (data_cfs, admin, mem_index, exp)) in cases.iter().enumerate() { let mut trace = ApplyTrace::default(); diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 3f559feff8b..d1348cf014b 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -81,6 +81,16 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } self.schedule_tick(PeerTick::Raft); } + + pub fn on_check_long_uncommitted(&mut self) { + if !self.fsm.peer().is_leader() { + return; + } + self.fsm + .peer_mut() + .check_long_uncommitted_proposals(self.store_ctx); + self.schedule_tick(PeerTick::CheckLongUncommitted); + } } impl Peer { @@ -396,9 +406,10 @@ impl Peer { // smaller than propose_time of a command, which was // proposed in another thread while this thread receives its // AppendEntriesResponse and is ready to calculate its commit-log-duration. 
- ctx.current_time.replace(monotonic_raw_now()); + let current_time = monotonic_raw_now(); + ctx.current_time.replace(current_time); ctx.raft_metrics.commit_log.observe(duration_to_sec( - (ctx.current_time.unwrap() - propose_time).to_std().unwrap(), + (current_time - propose_time).to_std().unwrap(), )); self.maybe_renew_leader_lease(propose_time, &ctx.store_meta, None); update_lease = false; @@ -730,6 +741,7 @@ impl Peer { self.region_heartbeat_pd(ctx); self.add_pending_tick(PeerTick::CompactLog); self.add_pending_tick(PeerTick::SplitRegionCheck); + self.add_pending_tick(PeerTick::CheckLongUncommitted); } StateRole::Follower => { self.leader_lease_mut().expire(); @@ -793,6 +805,56 @@ impl Peer { self.read_progress_mut().discard(); } } + + /// Check if there is long uncommitted proposal. + /// + /// This will increase the threshold when a long uncommitted proposal is + /// detected, and reset the threshold when there is no long uncommitted + /// proposal. + fn has_long_uncommitted_proposals(&mut self, ctx: &mut StoreContext) -> bool { + let mut has_long_uncommitted = false; + let base_threshold = ctx.cfg.long_uncommitted_base_threshold.0; + if let Some(propose_time) = self.proposals().oldest().and_then(|p| p.propose_time) { + // When a proposal was proposed with this ctx before, the current_time can be + // some. + let current_time = *ctx.current_time.get_or_insert_with(monotonic_raw_now); + let elapsed = match (current_time - propose_time).to_std() { + Ok(elapsed) => elapsed, + Err(_) => return false, + }; + // Increase the threshold for next turn when a long uncommitted proposal is + // detected. 
+ let threshold = self.long_uncommitted_threshold(); + if elapsed >= threshold { + has_long_uncommitted = true; + self.set_long_uncommitted_threshold(threshold + base_threshold); + } else if elapsed < base_threshold { + self.set_long_uncommitted_threshold(base_threshold); + } + } else { + self.set_long_uncommitted_threshold(base_threshold); + } + has_long_uncommitted + } + + fn check_long_uncommitted_proposals(&mut self, ctx: &mut StoreContext) { + if self.has_long_uncommitted_proposals(ctx) { + let status = self.raft_group().status(); + let mut buffer: Vec<(u64, u64, u64)> = Vec::new(); + if let Some(prs) = status.progress { + for (id, p) in prs.iter() { + buffer.push((*id, p.commit_group_id, p.matched)); + } + } + warn!( + self.logger, + "found long uncommitted proposals"; + "progress" => ?buffer, + "cache_first_index" => ?self.entry_storage().entry_cache_first_index(), + "next_turn_threshold" => ?self.long_uncommitted_threshold(), + ); + } + } } impl Storage { diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 6d1faa98cbf..7a1a22a5a95 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -81,7 +81,7 @@ impl Apply { let applied_index = flush_state.applied_index(); assert_ne!(applied_index, 0, "{}", SlogFormat(&logger)); let tablet = remote_tablet.latest().unwrap().clone(); - let perf_context = tablet.get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply); + let perf_context = EK::get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply); Apply { peer, tablet, diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index bc3d8a5af8e..8051066d4f9 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -1,7 +1,7 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - mem, + cmp, mem, sync::{atomic::Ordering, Arc}, time::{Duration, Instant}, }; @@ -104,6 +104,8 @@ pub struct Peer { /// lead_transferee if this peer(leader) is in a leadership transferring. leader_transferee: u64, + + long_uncommitted_threshold: u64, } impl Peer { @@ -180,6 +182,10 @@ impl Peer { flush_state, split_flow_control: SplitFlowControl::default(), leader_transferee: raft::INVALID_ID, + long_uncommitted_threshold: cmp::max( + cfg.long_uncommitted_base_threshold.0.as_secs(), + 1, + ), }; // If this region has only one peer and I am the one, campaign directly. @@ -769,4 +775,14 @@ impl Peer { .unwrap_or(raft::INVALID_ID), ) } + + #[inline] + pub fn long_uncommitted_threshold(&self) -> Duration { + Duration::from_secs(self.long_uncommitted_threshold) + } + + #[inline] + pub fn set_long_uncommitted_threshold(&mut self, dur: Duration) { + self.long_uncommitted_threshold = cmp::max(dur.as_secs(), 1); + } } diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index eeeb13f6555..f70b6635982 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -30,7 +30,12 @@ use raftstore::store::{ local_metrics::TimeTracker, msg::ErrorCallback, region_meta::RegionMeta, ReadCallback, WriteCallback, }; -use tracker::{TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; +use tracker::{get_tls_tracker_token, TrackerToken}; + +union Tracker { + read: TrackerToken, + write: TimeTracker, +} /// A struct allows to watch and notify specific events. /// @@ -53,7 +58,7 @@ struct EventCore { before_set: UnsafeCell>>, // Waker can be changed, need to use `AtomicWaker` to guarantee no data race. waker: AtomicWaker, - tracker: UnsafeCell, + tracker: UnsafeCell, } unsafe impl Send for EventCore {} @@ -240,16 +245,17 @@ pub struct BaseChannel { core: Arc>, } -impl BaseChannel { - /// Creates a pair of channel and subscriber. 
- #[inline] - pub fn pair() -> (Self, BaseSubscriber) { - let tracker_token = tracker::get_tls_tracker_token(); - Self::with_mask(u32::MAX, TimeTracker::Tracker(tracker_token)) - } +#[inline] +fn pair() -> (BaseChannel, BaseSubscriber) { + let tracker = Tracker { + read: get_tls_tracker_token(), + }; + BaseChannel::::with_mask(u32::MAX, tracker) +} +impl BaseChannel { #[inline] - fn with_mask(mask: u32, tracker: TimeTracker) -> (Self, BaseSubscriber) { + fn with_mask(mask: u32, tracker: Tracker) -> (Self, BaseSubscriber) { let core: Arc> = Arc::new(EventCore { event: AtomicU64::new(0), res: UnsafeCell::new(None), @@ -452,15 +458,8 @@ impl CmdResChannelBuilder { #[inline] pub fn build(self) -> (CmdResChannel, CmdResSubscriber) { - let tracker_token = tracker::get_tls_tracker_token(); - let now = std::time::Instant::now(); - let tracker = if tracker_token == INVALID_TRACKER_TOKEN { - TimeTracker::Instant(now) - } else { - GLOBAL_TRACKERS.with_tracker(tracker_token, |tracker| { - tracker.metrics.write_instant = Some(now); - }); - TimeTracker::Tracker(tracker_token) + let tracker = Tracker { + write: TimeTracker::default(), }; let (c, s) = CmdResChannel::with_mask(self.event_mask, tracker); if let Some(f) = self.before_set { @@ -476,6 +475,15 @@ impl CmdResChannel { // Valid range is [1, 30] const PROPOSED_EVENT: u64 = 1; const COMMITTED_EVENT: u64 = 2; + + /// Creates a pair of channel and subscriber. 
+ #[inline] + pub fn pair() -> (Self, CmdResSubscriber) { + let tracker = Tracker { + write: TimeTracker::default(), + }; + Self::with_mask(u32::MAX, tracker) + } } impl ErrorCallback for CmdResChannel { @@ -509,12 +517,12 @@ impl WriteCallback for CmdResChannel { type TimeTrackerListRef<'a> = &'a [TimeTracker]; #[inline] fn write_trackers(&self) -> Self::TimeTrackerListRef<'_> { - std::slice::from_ref(unsafe { &*self.core.tracker.get() }) + std::slice::from_ref(unsafe { &(*self.core.tracker.get()).write }) } type TimeTrackerListMut<'a> = &'a mut [TimeTracker]; fn write_trackers_mut(&mut self) -> Self::TimeTrackerListMut<'_> { - std::slice::from_mut(unsafe { &mut *self.core.tracker.get() }) + std::slice::from_mut(unsafe { &mut (*self.core.tracker.get()).write }) } // TODO: support executing hooks inside setting result. @@ -572,6 +580,13 @@ impl QueryResult { pub type QueryResChannel = BaseChannel; +impl QueryResChannel { + #[inline] + pub fn pair() -> (Self, QueryResSubscriber) { + pair() + } +} + impl ErrorCallback for QueryResChannel { #[inline] fn report_error(self, err: RaftCmdResponse) { @@ -592,8 +607,8 @@ impl ReadCallback for QueryResChannel { self.set_result(res); } - fn read_tracker(&self) -> Option<&TrackerToken> { - unsafe { (*self.core.tracker.get()).as_tracker_token() } + fn read_tracker(&self) -> Option { + Some(unsafe { (*self.core.tracker.get()).read }) } } @@ -608,6 +623,13 @@ impl fmt::Debug for QueryResChannel { pub type DebugInfoChannel = BaseChannel; pub type DebugInfoSubscriber = BaseSubscriber; +impl DebugInfoChannel { + #[inline] + pub fn pair() -> (Self, DebugInfoSubscriber) { + pair() + } +} + impl Debug for DebugInfoChannel { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "DebugInfoChannel") @@ -615,17 +637,29 @@ impl Debug for DebugInfoChannel { } #[cfg(feature = "testexport")] -pub type FlushChannel = BaseChannel<()>; -#[cfg(feature = "testexport")] -pub type FlushSubscriber = BaseSubscriber<()>; +mod flush_channel { + 
use super::*; -#[cfg(feature = "testexport")] -impl Debug for FlushChannel { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - write!(f, "FlushChannel") + pub type FlushChannel = BaseChannel<()>; + pub type FlushSubscriber = BaseSubscriber<()>; + + impl FlushChannel { + #[inline] + pub fn pair() -> (Self, FlushSubscriber) { + pair() + } + } + + impl Debug for FlushChannel { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "FlushChannel") + } } } +#[cfg(feature = "testexport")] +pub use flush_channel::{FlushChannel, FlushSubscriber}; + #[cfg(test)] mod tests { use std::assert_matches::assert_matches; diff --git a/components/raftstore-v2/src/worker/tablet_gc.rs b/components/raftstore-v2/src/worker/tablet_gc.rs index d4593223db3..d6d19743b1e 100644 --- a/components/raftstore-v2/src/worker/tablet_gc.rs +++ b/components/raftstore-v2/src/worker/tablet_gc.rs @@ -131,6 +131,8 @@ impl Runner { } fn prepare_destroy(&mut self, region_id: u64, tablet: EK, wait_for_persisted: u64) { + // The tablet is about to be deleted, flush is a waste and will block destroy. 
+ let _ = tablet.set_db_options(&[("avoid_flush_during_shutdown", "true")]); let _ = tablet.pause_background_work(); self.waiting_destroy_tasks .entry(region_id) diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 7016d0ab606..98c76ddd6d1 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -465,11 +465,12 @@ where self.flush_states_to_raft_wb(); if metrics.waterfall_metrics { let now = std::time::Instant::now(); - for task in &self.tasks { - for tracker in &task.trackers { + for task in &mut self.tasks { + for tracker in &mut task.trackers { tracker.observe(now, &metrics.wf_before_write, |t| { &mut t.metrics.wf_before_write_nanos }); + tracker.reset(now); } } } @@ -549,7 +550,7 @@ where ) -> Self { let batch = WriteTaskBatch::new(raft_engine.log_batch(RAFT_WB_DEFAULT_SIZE)); let perf_context = - raft_engine.get_perf_context(cfg.value().perf_level, PerfContextKind::RaftstoreStore); + ER::get_perf_context(cfg.value().perf_level, PerfContextKind::RaftstoreStore); let cfg_tracker = cfg.clone().tracker(tag.clone()); Self { store_id, @@ -718,11 +719,7 @@ where .batch .tasks .iter() - .flat_map(|task| { - task.trackers - .iter() - .flat_map(|t| t.as_tracker_token().cloned()) - }) + .flat_map(|task| task.trackers.iter().flat_map(|t| t.as_tracker_token())) .collect(); self.perf_context.report_metrics(&trackers); write_raft_time = duration_to_sec(now.saturating_elapsed()); diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index cab6ae0ffe8..58df32fd404 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -83,7 +83,7 @@ use crate::{ cmd_resp, entry_storage::{self, CachedEntries}, fsm::RaftPollerBuilder, - local_metrics::{RaftMetrics, TimeTracker}, + local_metrics::RaftMetrics, memory::*, metrics::*, msg::{Callback, ErrorCallback, 
PeerMsg, ReadResponse, SignificantMsg}, @@ -475,7 +475,7 @@ where host, importer, region_scheduler, - engine: engine.clone(), + engine, router, notifier, kv_wb, @@ -488,7 +488,7 @@ where committed_count: 0, sync_log_hint: false, use_delete_range: cfg.use_delete_range, - perf_context: engine.get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply), + perf_context: EK::get_perf_context(cfg.perf_level, PerfContextKind::RaftstoreApply), yield_duration: cfg.apply_yield_duration.0, yield_msg_size: cfg.apply_yield_write_size.0, delete_ssts: vec![], @@ -582,7 +582,7 @@ where .cb_batch .iter() .flat_map(|(cb, _)| cb.write_trackers()) - .flat_map(|trackers| trackers.as_tracker_token().cloned()) + .flat_map(|trackers| trackers.as_tracker_token()) .collect(); self.perf_context.report_metrics(&trackers); self.sync_log_hint = false; @@ -3337,9 +3337,7 @@ impl Apply { t.metrics.write_instant = Some(now); &mut t.metrics.store_time_nanos }); - if let TimeTracker::Instant(t) = tracker { - *t = now; - } + tracker.reset(now); } } } @@ -4171,7 +4169,7 @@ where .flat_map(|p| p.cb.write_trackers()) .flat_map(|ts| ts.as_tracker_token()) { - GLOBAL_TRACKERS.with_tracker(*tracker, |t| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { t.metrics.apply_wait_nanos = apply_wait.as_nanos() as u64; }); } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index e302ea6588a..7e00798b6df 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -622,7 +622,7 @@ where .propose_wait_time .observe(propose_time.as_secs_f64()); cmd.callback.read_tracker().map(|tracker| { - GLOBAL_TRACKERS.with_tracker(*tracker, |t| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { t.metrics.read_index_propose_wait_nanos = propose_time.as_nanos() as u64; }) diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index ceb8858046d..3724eba13e2 100644 --- 
a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -1374,14 +1374,14 @@ where ready_count: 0, has_ready: false, current_time: None, - raft_perf_context: self - .engines - .raft - .get_perf_context(self.cfg.value().perf_level, PerfContextKind::RaftstoreStore), - kv_perf_context: self - .engines - .kv - .get_perf_context(self.cfg.value().perf_level, PerfContextKind::RaftstoreStore), + raft_perf_context: ER::get_perf_context( + self.cfg.value().perf_level, + PerfContextKind::RaftstoreStore, + ), + kv_perf_context: EK::get_perf_context( + self.cfg.value().perf_level, + PerfContextKind::RaftstoreStore, + ), tick_batch: vec![PeerTickBatch::default(); PeerTick::VARIANT_COUNT], node_start_time: Some(TiInstant::now_coarse()), feature_gate: self.feature_gate.clone(), diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index c1db17f8cae..0e6a09cbf0b 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -7,7 +7,7 @@ use collections::HashSet; use prometheus::local::LocalHistogram; use raft::eraftpb::MessageType; use tikv_util::time::{Duration, Instant}; -use tracker::{Tracker, TrackerToken, GLOBAL_TRACKERS}; +use tracker::{Tracker, TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; use super::metrics::*; @@ -208,47 +208,60 @@ impl StoreWriteMetrics { /// Tracker for the durations of a raftstore request. /// If a global tracker is not available, it will fallback to an Instant. 
#[derive(Debug, Clone, Copy)] -pub enum TimeTracker { - Tracker(TrackerToken), - Instant(std::time::Instant), +pub struct TimeTracker { + token: TrackerToken, + start: std::time::Instant, +} + +impl Default for TimeTracker { + #[inline] + fn default() -> Self { + let token = tracker::get_tls_tracker_token(); + let start = std::time::Instant::now(); + let tracker = TimeTracker { token, start }; + if token == INVALID_TRACKER_TOKEN { + return tracker; + } + + GLOBAL_TRACKERS.with_tracker(token, |tracker| { + tracker.metrics.write_instant = Some(start); + }); + tracker + } } impl TimeTracker { - pub fn as_tracker_token(&self) -> Option<&TrackerToken> { - match self { - TimeTracker::Tracker(tt) => Some(tt), - TimeTracker::Instant(_) => None, + #[inline] + pub fn as_tracker_token(&self) -> Option { + if self.token == INVALID_TRACKER_TOKEN { + None + } else { + Some(self.token) } } + #[inline] pub fn observe( &self, now: std::time::Instant, local_metric: &LocalHistogram, tracker_metric: impl FnOnce(&mut Tracker) -> &mut u64, ) { - match self { - TimeTracker::Tracker(t) => { - if let Some(dur) = GLOBAL_TRACKERS - .with_tracker(*t, |tracker| { - tracker.metrics.write_instant.map(|write_instant| { - let dur = now.saturating_duration_since(write_instant); - let metric = tracker_metric(tracker); - if *metric == 0 { - *metric = dur.as_nanos() as u64; - } - dur - }) - }) - .flatten() - { - local_metric.observe(dur.as_secs_f64()); - } - } - TimeTracker::Instant(t) => { - let dur = now.saturating_duration_since(*t); - local_metric.observe(dur.as_secs_f64()); - } + let dur = now.saturating_duration_since(self.start); + local_metric.observe(dur.as_secs_f64()); + if self.token == INVALID_TRACKER_TOKEN { + return; } + GLOBAL_TRACKERS.with_tracker(self.token, |tracker| { + let metric = tracker_metric(tracker); + if *metric == 0 { + *metric = dur.as_nanos() as u64; + } + }); + } + + #[inline] + pub fn reset(&mut self, start: std::time::Instant) { + self.start = start; } } diff --git 
a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index e3fc8530d76..b2a2a7aa1d1 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -24,7 +24,7 @@ use pd_client::BucketMeta; use raft::SnapshotStatus; use smallvec::{smallvec, SmallVec}; use tikv_util::{deadline::Deadline, escape, memory::HeapSize, time::Instant}; -use tracker::{get_tls_tracker_token, TrackerToken, GLOBAL_TRACKERS, INVALID_TRACKER_TOKEN}; +use tracker::{get_tls_tracker_token, TrackerToken}; use super::{local_metrics::TimeTracker, region_meta::RegionMeta, FetchedLogs, RegionSnapshot}; use crate::store::{ @@ -137,16 +137,7 @@ where proposed_cb: Option, committed_cb: Option, ) -> Self { - let tracker_token = get_tls_tracker_token(); - let now = std::time::Instant::now(); - let tracker = if tracker_token == INVALID_TRACKER_TOKEN { - TimeTracker::Instant(now) - } else { - GLOBAL_TRACKERS.with_tracker(tracker_token, |tracker| { - tracker.metrics.write_instant = Some(now); - }); - TimeTracker::Tracker(tracker_token) - }; + let tracker = TimeTracker::default(); Callback::Write { cb, @@ -217,7 +208,7 @@ pub trait ReadCallback: ErrorCallback { type Response; fn set_result(self, result: Self::Response); - fn read_tracker(&self) -> Option<&TrackerToken>; + fn read_tracker(&self) -> Option; } pub trait WriteCallback: ErrorCallback { @@ -265,9 +256,9 @@ impl ReadCallback for Callback { self.invoke_read(result); } - fn read_tracker(&self) -> Option<&TrackerToken> { + fn read_tracker(&self) -> Option { let Callback::Read { tracker, .. 
} = self else { return None; }; - Some(tracker) + Some(*tracker) } } diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 347f62dd945..586ab7ba133 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -200,7 +200,7 @@ impl ProposalQueue { } #[inline] - fn oldest(&self) -> Option<&Proposal> { + pub fn oldest(&self) -> Option<&Proposal> { self.queue.front() } @@ -3292,7 +3292,7 @@ where let time = monotonic_raw_now(); for (req, cb, mut read_index) in read.take_cmds().drain(..) { cb.read_tracker().map(|tracker| { - GLOBAL_TRACKERS.with_tracker(*tracker, |t| { + GLOBAL_TRACKERS.with_tracker(tracker, |t| { t.metrics.read_index_confirm_wait_nanos = (time - read.propose_time).to_std().unwrap().as_nanos() as u64; }) diff --git a/src/coprocessor/tracker.rs b/src/coprocessor/tracker.rs index d6e146adf11..9c0b79ff8b8 100644 --- a/src/coprocessor/tracker.rs +++ b/src/coprocessor/tracker.rs @@ -6,7 +6,7 @@ use ::tracker::{get_tls_tracker_token, with_tls_tracker}; use engine_traits::{PerfContext, PerfContextExt, PerfContextKind}; use kvproto::{kvrpcpb, kvrpcpb::ScanDetailV2}; use pd_client::BucketMeta; -use tikv_kv::{with_tls_engine, Engine}; +use tikv_kv::Engine; use tikv_util::time::{self, Duration, Instant}; use txn_types::Key; @@ -148,9 +148,7 @@ impl Tracker { } self.with_perf_context(|perf_context| { - if let Some(c) = perf_context { - c.start_observe(); - } + perf_context.start_observe(); }); self.current_stage = TrackerState::ItemBegan(now); } @@ -164,9 +162,7 @@ impl Tracker { self.total_storage_stats.add(&storage_stats); } self.with_perf_context(|perf_context| { - if let Some(c) = perf_context { - c.report_metrics(&[get_tls_tracker_token()]); - } + perf_context.report_metrics(&[get_tls_tracker_token()]); }); self.current_stage = TrackerState::ItemFinished(now); } else { @@ -361,7 +357,7 @@ impl Tracker { fn with_perf_context(&self, f: F) -> T where - F: FnOnce(&mut Option>) -> 
T, + F: FnOnce(&mut Box) -> T, { thread_local! { static SELECT: RefCell>> = RefCell::new(None); @@ -385,19 +381,13 @@ impl Tracker { }; tls_cell.with(|c| { let mut c = c.borrow_mut(); - if c.is_none() { - *c = unsafe { - with_tls_engine::(|engine| { - engine.kv_engine().map(|engine| { - Box::new(engine.get_perf_context( - PerfLevel::Uninitialized, - PerfContextKind::Coprocessor(self.req_ctx.tag.get_str()), - )) as Box - }) - }) - }; - } - f(&mut c) + let perf_context = c.get_or_insert_with(|| { + Box::new(E::Local::get_perf_context( + PerfLevel::Uninitialized, + PerfContextKind::Coprocessor(self.req_ctx.tag.get_str()), + )) as Box + }); + f(perf_context) }) } } diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index 080ff2c5951..4837567ee43 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -11,7 +11,7 @@ use pd_client::BucketMeta; use prometheus::*; use prometheus_static_metric::*; use raftstore::store::{util::build_key_range, ReadStats}; -use tikv_kv::{with_tls_engine, Engine}; +use tikv_kv::Engine; use tracker::get_tls_tracker_token; use crate::{ @@ -347,23 +347,15 @@ where }; tls_cell.with(|c| { let mut c = c.borrow_mut(); - if c.is_none() { - *c = with_tls_engine(|engine: &mut E| { - engine.kv_engine().map(|c| { - Box::new(c.get_perf_context( - PerfLevel::Uninitialized, - PerfContextKind::Storage(cmd.get_str()), - )) as Box - }) - }); - }; - if let Some(c) = &mut *c { - c.start_observe(); - } + let perf_context = c.get_or_insert_with(|| { + Box::new(E::Local::get_perf_context( + PerfLevel::Uninitialized, + PerfContextKind::Storage(cmd.get_str()), + )) as Box + }); + perf_context.start_observe(); let res = f(); - if let Some(c) = &mut *c { - c.report_metrics(&[get_tls_tracker_token()]); - } + perf_context.report_metrics(&[get_tls_tracker_token()]); res }) } From 8d054dd1370fba970f5829cee5b9ffeabf44f9c8 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Fri, 13 Jan 2023 16:01:07 +0800 Subject: [PATCH 079/115] [Cloud]Fast add peer: Fix 
pre handle limit, and some refactors (#256) --- Cargo.lock | 1 + components/raftstore/src/store/fsm/store.rs | 4 +- components/raftstore/src/store/peer.rs | 4 - components/raftstore/src/store/snap.rs | 15 +- engine_store_ffi/src/ffihub_impl.rs | 114 +++++ engine_store_ffi/src/lib.rs | 20 +- engine_store_ffi/src/observer.rs | 420 +++++------------- engine_tiflash/Cargo.toml | 1 + .../src/cached_region_info_manager.rs | 155 +++++++ engine_tiflash/src/engine.rs | 56 ++- engine_tiflash/src/lib.rs | 2 + engine_tiflash/src/proxy_utils.rs | 23 + new-mock-engine-store/src/mock_cluster.rs | 7 +- .../src/mock_page_storage.rs | 2 +- proxy_server/src/run.rs | 14 +- proxy_tests/proxy/fast_add_peer.rs | 22 +- 16 files changed, 495 insertions(+), 365 deletions(-) create mode 100644 engine_store_ffi/src/ffihub_impl.rs create mode 100644 engine_tiflash/src/cached_region_info_manager.rs diff --git a/Cargo.lock b/Cargo.lock index 443f74b80e6..bec4948a2af 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1702,6 +1702,7 @@ dependencies = [ "log_wrappers", "num_cpus", "online_config", + "portable-atomic", "prometheus", "prometheus-static-metric", "protobuf", diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 18bc623fa44..1e0d845f4fa 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -67,8 +67,8 @@ use time::{self, Timespec}; use crate::{ bytes_capacity, coprocessor::{ - split_observer::SplitObserver, BoxAdminObserver, CoprocessorHost, PeerCreateEvent, - RegionChangeEvent, RegionChangeReason, + split_observer::SplitObserver, BoxAdminObserver, CoprocessorHost, RegionChangeEvent, + RegionChangeReason, }, store::{ async_io::{ diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index cffb7e40a9a..9614161739a 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -1769,10 +1769,6 @@ where for msg 
in msgs { let msg_type = msg.get_message().get_msg_type(); if msg_type == MessageType::MsgSnapshot { - let mut snap_data = kvproto::raft_serverpb::RaftSnapshotData::default(); - snap_data - .merge_from_bytes(msg.get_message().get_snapshot().get_data()) - .unwrap(); let snap_index = msg.get_message().get_snapshot().get_metadata().get_index(); if snap_index > self.last_sent_snapshot_idx { self.last_sent_snapshot_idx = snap_index; diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index b360d2d20c3..a9c98dcd69f 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -436,8 +436,8 @@ pub struct Snapshot { cf_files: Vec, cf_index: usize, cf_file_index: usize, - pub meta_file: MetaFile, - pub hold_tmp_files: bool, + meta_file: MetaFile, + hold_tmp_files: bool, mgr: SnapManagerCore, } @@ -458,7 +458,6 @@ impl Snapshot { mgr: &SnapManagerCore, ) -> RaftStoreResult { let dir_path = dir.into(); - if !dir_path.exists() { file_system::create_dir_all(dir_path.as_path())?; } @@ -814,7 +813,7 @@ impl Snapshot { } // Only called in `do_build`. 
- pub fn save_meta_file(&mut self) -> RaftStoreResult<()> { + fn save_meta_file(&mut self) -> RaftStoreResult<()> { let v = box_try!(self.meta_file.meta.as_ref().unwrap().write_to_bytes()); if let Some(mut f) = self.meta_file.file.take() { // `meta_file` could be None for this case: in `init_for_building` the snapshot @@ -1141,6 +1140,14 @@ impl Snapshot { self.cf_files.iter().map(|cf| cf.kv_count).sum() } + pub fn set_hold_tmp_files(&mut self, v: bool) { + self.hold_tmp_files = v; + } + + pub fn hold_tmp_files(&self) -> bool { + self.hold_tmp_files + } + pub fn save(&mut self) -> io::Result<()> { debug!( "saving to snapshot file"; diff --git a/engine_store_ffi/src/ffihub_impl.rs b/engine_store_ffi/src/ffihub_impl.rs new file mode 100644 index 00000000000..0a08050b691 --- /dev/null +++ b/engine_store_ffi/src/ffihub_impl.rs @@ -0,0 +1,114 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +use engine_tiflash::{FsStatsExt, RawPSWriteBatchPtr, RawPSWriteBatchWrapper}; + +use crate::{ + interfaces::root::DB as ffi_interfaces, EngineStoreServerHelper, PageAndCppStrWithView, + RawCppPtr, +}; + +pub struct TiFlashFFIHub { + pub engine_store_server_helper: &'static EngineStoreServerHelper, +} +unsafe impl Send for TiFlashFFIHub {} +unsafe impl Sync for TiFlashFFIHub {} +impl engine_tiflash::FFIHubInner for TiFlashFFIHub { + fn get_store_stats(&self) -> engine_tiflash::FsStatsExt { + self.engine_store_server_helper + .handle_compute_store_stats() + .into() + } + + fn create_write_batch(&self) -> RawPSWriteBatchWrapper { + self.engine_store_server_helper.create_write_batch().into() + } + + fn destroy_write_batch(&self, wb_wrapper: &RawPSWriteBatchWrapper) { + self.engine_store_server_helper + .gc_raw_cpp_ptr(wb_wrapper.ptr, wb_wrapper.type_); + } + + fn consume_write_batch(&self, wb: RawPSWriteBatchPtr) { + self.engine_store_server_helper.consume_write_batch(wb) + } + + fn write_batch_size(&self, wb: RawPSWriteBatchPtr) -> usize { + 
self.engine_store_server_helper.write_batch_size(wb) as usize + } + + fn write_batch_is_empty(&self, wb: RawPSWriteBatchPtr) -> bool { + self.engine_store_server_helper.write_batch_is_empty(wb) != 0 + } + + fn write_batch_merge(&self, lwb: RawPSWriteBatchPtr, rwb: RawPSWriteBatchPtr) { + self.engine_store_server_helper.write_batch_merge(lwb, rwb) + } + + fn write_batch_clear(&self, wb: RawPSWriteBatchPtr) { + self.engine_store_server_helper.write_batch_clear(wb) + } + + fn write_batch_put_page(&self, wb: RawPSWriteBatchPtr, page_id: &[u8], page: &[u8]) { + self.engine_store_server_helper + .write_batch_put_page(wb, page_id.into(), page.into()) + } + + fn write_batch_del_page(&self, wb: RawPSWriteBatchPtr, page_id: &[u8]) { + self.engine_store_server_helper + .write_batch_del_page(wb, page_id.into()) + } + + fn read_page(&self, page_id: &[u8]) -> Option> { + // TODO maybe we can steal memory from C++ here to reduce redundant copy? + let value = self.engine_store_server_helper.read_page(page_id.into()); + return if value.view.len == 0 { + None + } else { + Some(value.view.to_slice().to_vec()) + }; + } + + fn scan_page( + &self, + start_page_id: &[u8], + end_page_id: &[u8], + f: &mut dyn FnMut(&[u8], &[u8]) -> engine_traits::Result, + ) { + let values = self + .engine_store_server_helper + .scan_page(start_page_id.into(), end_page_id.into()); + let arr = values.inner as *mut PageAndCppStrWithView; + for i in 0..values.len { + let value = unsafe { &*arr.offset(i as isize) }; + if value.page_view.len != 0 { + f( + &value.key_view.to_slice().to_vec(), + &value.page_view.to_slice().to_vec(), + ) + .unwrap(); + } + } + } +} + +impl From for RawPSWriteBatchWrapper { + fn from(src: RawCppPtr) -> Self { + let result = RawPSWriteBatchWrapper { + ptr: src.ptr, + type_: src.type_, + }; + let mut src = src; + src.ptr = std::ptr::null_mut(); + result + } +} + +#[allow(clippy::from_over_into)] +impl Into for ffi_interfaces::StoreStats { + fn into(self) -> FsStatsExt { + FsStatsExt 
{ + available: self.fs_stats.avail_size, + capacity: self.fs_stats.capacity_size, + used: self.fs_stats.used_size, + } + } +} diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index a1af1bf3fe0..2d346638bb3 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -7,6 +7,7 @@ pub mod interfaces; pub mod basic_ffi_impls; pub mod domain_impls; pub mod encryption_impls; +pub mod ffihub_impl; mod lock_cf_reader; pub mod observer; pub mod ps_engine; @@ -27,7 +28,9 @@ pub use basic_ffi_impls::*; pub use domain_impls::*; use encryption::DataKeyManager; pub use encryption_impls::*; +pub use engine_tiflash::EngineStoreConfig; use engine_traits::{Peekable, CF_LOCK}; +pub use ffihub_impl::TiFlashFFIHub; use kvproto::{kvrpcpb, metapb, raft_cmdpb}; use lazy_static::lazy_static; use protobuf::Message; @@ -899,20 +902,3 @@ pub unsafe extern "C" fn ffi_poll_timer_task(task_ptr: RawVoidPtr, waker: RawVoi 0 } } - -use serde_derive::{Deserialize, Serialize}; -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] -#[serde(default)] -#[serde(rename_all = "kebab-case")] -pub struct EngineStoreConfig { - pub enable_fast_add_peer: bool, -} - -#[allow(clippy::derivable_impls)] -impl Default for EngineStoreConfig { - fn default() -> Self { - Self { - enable_fast_add_peer: false, - } - } -} diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 6da35fb7887..6b6dd65d846 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -5,15 +5,12 @@ use std::{ ops::DerefMut, path::PathBuf, str::FromStr, - sync::{ - atomic::{AtomicBool, Ordering}, - mpsc, Arc, Mutex, RwLock, - }, + sync::{atomic::Ordering, mpsc, Arc, Mutex, RwLock}, time::SystemTime, }; use collections::HashMap; -use engine_tiflash::{FsStatsExt, RawPSWriteBatchPtr, RawPSWriteBatchWrapper}; +use engine_tiflash::{CachedRegionInfo, CachedRegionInfoManager}; use engine_traits::{RaftEngine, SstMetaInfo, CF_RAFT}; use kvproto::{ 
metapb::Region, @@ -45,10 +42,9 @@ use yatp::{ }; use crate::{ - gen_engine_store_server_helper, - interfaces::root::{DB as ffi_interfaces, DB::EngineStoreApplyRes}, - name_to_cf, ColumnFamilyType, EngineStoreServerHelper, PageAndCppStrWithView, RaftCmdHeader, - RawCppPtr, TiFlashEngine, WriteCmdType, WriteCmds, CF_LOCK, + gen_engine_store_server_helper, interfaces::root::DB::EngineStoreApplyRes, name_to_cf, + ColumnFamilyType, EngineStoreServerHelper, RaftCmdHeader, RawCppPtr, TiFlashEngine, + WriteCmdType, WriteCmds, CF_LOCK, }; macro_rules! fatal { @@ -57,114 +53,6 @@ macro_rules! fatal { ::std::process::exit(1) }) } - -#[allow(clippy::from_over_into)] -impl Into for ffi_interfaces::StoreStats { - fn into(self) -> FsStatsExt { - FsStatsExt { - available: self.fs_stats.avail_size, - capacity: self.fs_stats.capacity_size, - used: self.fs_stats.used_size, - } - } -} - -impl From for RawPSWriteBatchWrapper { - fn from(src: RawCppPtr) -> Self { - let result = RawPSWriteBatchWrapper { - ptr: src.ptr, - type_: src.type_, - }; - let mut src = src; - src.ptr = std::ptr::null_mut(); - result - } -} - -pub struct TiFlashFFIHub { - pub engine_store_server_helper: &'static EngineStoreServerHelper, -} -unsafe impl Send for TiFlashFFIHub {} -unsafe impl Sync for TiFlashFFIHub {} -impl engine_tiflash::FFIHubInner for TiFlashFFIHub { - fn get_store_stats(&self) -> engine_tiflash::FsStatsExt { - self.engine_store_server_helper - .handle_compute_store_stats() - .into() - } - - fn create_write_batch(&self) -> RawPSWriteBatchWrapper { - self.engine_store_server_helper.create_write_batch().into() - } - - fn destroy_write_batch(&self, wb_wrapper: &RawPSWriteBatchWrapper) { - self.engine_store_server_helper - .gc_raw_cpp_ptr(wb_wrapper.ptr, wb_wrapper.type_); - } - - fn consume_write_batch(&self, wb: RawPSWriteBatchPtr) { - self.engine_store_server_helper.consume_write_batch(wb) - } - - fn write_batch_size(&self, wb: RawPSWriteBatchPtr) -> usize { - 
self.engine_store_server_helper.write_batch_size(wb) as usize - } - - fn write_batch_is_empty(&self, wb: RawPSWriteBatchPtr) -> bool { - self.engine_store_server_helper.write_batch_is_empty(wb) != 0 - } - - fn write_batch_merge(&self, lwb: RawPSWriteBatchPtr, rwb: RawPSWriteBatchPtr) { - self.engine_store_server_helper.write_batch_merge(lwb, rwb) - } - - fn write_batch_clear(&self, wb: RawPSWriteBatchPtr) { - self.engine_store_server_helper.write_batch_clear(wb) - } - - fn write_batch_put_page(&self, wb: RawPSWriteBatchPtr, page_id: &[u8], page: &[u8]) { - self.engine_store_server_helper - .write_batch_put_page(wb, page_id.into(), page.into()) - } - - fn write_batch_del_page(&self, wb: RawPSWriteBatchPtr, page_id: &[u8]) { - self.engine_store_server_helper - .write_batch_del_page(wb, page_id.into()) - } - - fn read_page(&self, page_id: &[u8]) -> Option> { - // TODO maybe we can steal memory from C++ here to reduce redundant copy? - let value = self.engine_store_server_helper.read_page(page_id.into()); - return if value.view.len == 0 { - None - } else { - Some(value.view.to_slice().to_vec()) - }; - } - - fn scan_page( - &self, - start_page_id: &[u8], - end_page_id: &[u8], - f: &mut dyn FnMut(&[u8], &[u8]) -> engine_traits::Result, - ) { - let values = self - .engine_store_server_helper - .scan_page(start_page_id.into(), end_page_id.into()); - let arr = values.inner as *mut PageAndCppStrWithView; - for i in 0..values.len { - let value = unsafe { &*arr.offset(i as isize) }; - if value.page_view.len != 0 { - f( - &value.key_view.to_slice().to_vec(), - &value.page_view.to_slice().to_vec(), - ) - .unwrap(); - } - } - } -} - pub struct PtrWrapper(RawCppPtr); unsafe impl Send for PtrWrapper {} @@ -190,21 +78,9 @@ impl PrehandleTask { unsafe impl Send for PrehandleTask {} unsafe impl Sync for PrehandleTask {} -const CACHED_REGION_INFO_SLOT_COUNT: usize = 256; - -#[derive(Debug, Default)] -pub struct CachedRegionInfo { - pub replicated_or_created: AtomicBool, - // TiKV 
assumes a region's learner peer is added through snapshot. - // If this field is false, will try fast path when meet MsgAppend. - // If this field is true, it means this peer is inited or will be inited by a TiKV snapshot. - // NOTE If we want a fallback, then we must set inited_or_fallback to true, - // Otherwise, a normal snapshot will be neglect in `post_apply_snapshot` and cause data loss. - pub inited_or_fallback: AtomicBool, - pub snapshot_inflight: portable_atomic::AtomicU128, -} - -pub type CachedRegionInfoMap = HashMap>; +// TiFlash observer's priority should be higher than all other observers, to +// avoid being bypassed. +const TIFLASH_OBSERVER_PRIORITY: u32 = 0; pub struct TiFlashObserver { pub store_id: u64, @@ -216,13 +92,33 @@ pub struct TiFlashObserver { pub snap_handle_pool_size: usize, pub apply_snap_pool: Option>>, pub pending_delete_ssts: Arc>>, - pub cached_region_info: Arc>>, // TODO should we use a Mutex here? pub trans: Arc>, pub snap_mgr: Arc, pub engine_store_cfg: crate::EngineStoreConfig, } +pub fn get_region_local_state( + engine: &EK, + region_id: u64, +) -> Option { + let region_state_key = keys::region_state_key(region_id); + engine + .get_msg_cf::(CF_RAFT, ®ion_state_key) + .unwrap_or(None) +} + +pub fn validate_remote_peer_region( + new_region: &kvproto::metapb::Region, + store_id: u64, + new_peer_id: u64, +) -> bool { + match find_peer(new_region, store_id) { + Some(peer) => peer.get_id() == new_peer_id, + None => false, + } +} + impl Clone for TiFlashObserver { fn clone(&self) -> Self { TiFlashObserver { @@ -235,7 +131,6 @@ impl Clone for TiFlashObserver { snap_handle_pool_size: self.snap_handle_pool_size, apply_snap_pool: self.apply_snap_pool.clone(), pending_delete_ssts: self.pending_delete_ssts.clone(), - cached_region_info: self.cached_region_info.clone(), trans: self.trans.clone(), snap_mgr: self.snap_mgr.clone(), engine_store_cfg: self.engine_store_cfg.clone(), @@ -243,142 +138,7 @@ impl Clone for TiFlashObserver { } } -// 
TiFlash observer's priority should be higher than all other observers, to -// avoid being bypassed. -const TIFLASH_OBSERVER_PRIORITY: u32 = 0; - -// Credit: [splitmix64 algorithm](https://xorshift.di.unimi.it/splitmix64.c) -#[inline] -fn hash_u64(mut i: u64) -> u64 { - i = (i ^ (i >> 30)).wrapping_mul(0xbf58476d1ce4e5b9); - i = (i ^ (i >> 27)).wrapping_mul(0x94d049bb133111eb); - i ^ (i >> 31) -} - -#[allow(dead_code)] -#[inline] -fn unhash_u64(mut i: u64) -> u64 { - i = (i ^ (i >> 31) ^ (i >> 62)).wrapping_mul(0x319642b2d24d8ec3); - i = (i ^ (i >> 27) ^ (i >> 54)).wrapping_mul(0x96de1b173f119089); - i ^ (i >> 30) ^ (i >> 60) -} - -pub fn validate_remote_peer_region( - new_region: &kvproto::metapb::Region, - store_id: u64, - new_peer_id: u64, -) -> bool { - match find_peer(new_region, store_id) { - Some(peer) => peer.get_id() == new_peer_id, - None => false, - } -} - -pub fn get_region_local_state( - engine: &EK, - region_id: u64, -) -> Option { - let region_state_key = keys::region_state_key(region_id); - engine - .get_msg_cf::(CF_RAFT, ®ion_state_key) - .unwrap_or(None) -} - impl TiFlashObserver { - #[inline] - fn slot_index(id: u64) -> usize { - debug_assert!(CACHED_REGION_INFO_SLOT_COUNT.is_power_of_two()); - hash_u64(id) as usize & (CACHED_REGION_INFO_SLOT_COUNT - 1) - } - - pub fn access_cached_region_info_mut>)>( - &self, - region_id: u64, - mut f: F, - ) -> RaftStoreResult<()> { - let slot_id = Self::slot_index(region_id); - let mut guard = match self.cached_region_info.get(slot_id).unwrap().write() { - Ok(g) => g, - Err(_) => return Err(box_err!("access_cached_region_info_mut poisoned")), - }; - f(guard.entry(region_id)); - Ok(()) - } - - pub fn access_cached_region_info)>( - &self, - region_id: u64, - mut f: F, - ) { - let slot_id = Self::slot_index(region_id); - let guard = match self.cached_region_info.get(slot_id).unwrap().read() { - Ok(g) => g, - Err(_) => panic!("access_cached_region_info poisoned!"), - }; - match guard.get(®ion_id) { - Some(g) => 
f(g.clone()), - None => (), - } - } - - pub fn get_inited_or_fallback(&self, region_id: u64) -> Option { - let mut is_first: Option = None; - let f = |info: Arc| { - is_first = Some(info.inited_or_fallback.load(Ordering::SeqCst)); - }; - self.access_cached_region_info(region_id, f); - is_first - } - - pub fn remove_cached_region_info(&self, region_id: u64) { - let slot_id = Self::slot_index(region_id); - if let Ok(mut g) = self.cached_region_info.get(slot_id).unwrap().write() { - info!( - "remove_cached_region_info"; - "region_id" => region_id, - "store_id" => self.store_id, - ); - let _ = g.remove(®ion_id); - } - } - - pub fn set_inited_or_fallback(&self, region_id: u64, v: bool) -> RaftStoreResult<()> { - self.access_cached_region_info_mut( - region_id, - |info: MapEntry>| match info { - MapEntry::Occupied(mut o) => { - o.get_mut().inited_or_fallback.store(v, Ordering::SeqCst); - } - MapEntry::Vacant(_) => { - tikv_util::safe_panic!("not inited!"); - } - }, - ) - } - - pub fn set_snapshot_inflight(&self, region_id: u64, v: u128) -> RaftStoreResult<()> { - self.access_cached_region_info_mut( - region_id, - |info: MapEntry>| match info { - MapEntry::Occupied(mut o) => { - o.get_mut().snapshot_inflight.store(v, Ordering::SeqCst); - } - MapEntry::Vacant(_) => { - tikv_util::safe_panic!("not inited!"); - } - }, - ) - } - - fn fallback_to_slow_path(&self, region_id: u64) { - // TODO clean local, and prepare to request snapshot from TiKV as a trivial - // procedure. 
- fail::fail_point!("fallback_to_slow_path_not_allow", |_| {}); - if self.set_inited_or_fallback(region_id, true).is_err() { - tikv_util::safe_panic!("set_inited_or_fallback"); - } - } - pub fn is_initialized(&self, region_id: u64) -> bool { match get_region_local_state(&self.engine, region_id) { None => false, @@ -389,6 +149,14 @@ impl TiFlashObserver { } } + pub fn get_cached_manager(&self) -> Arc { + self.engine + .cached_region_info_manager + .as_ref() + .unwrap() + .clone() + } + // Returns whether we need to ignore this message and run fast path instead. pub fn maybe_fast_path(&self, msg: &RaftMessage) -> bool { if !self.engine_store_cfg.enable_fast_add_peer { @@ -404,6 +172,7 @@ impl TiFlashObserver { } let region_id = msg.get_region_id(); let new_peer_id = msg.get_to_peer().get_id(); + let cached_manager = self.get_cached_manager(); let mut is_first = false; let mut is_replicated = false; let mut has_already_inited = None; @@ -469,11 +238,14 @@ impl TiFlashObserver { }; // Try not acquire write lock firstly. 
- match self.get_inited_or_fallback(region_id) { + match cached_manager.get_inited_or_fallback(region_id) { Some(true) => { is_first = false; } - None | Some(false) => self.access_cached_region_info_mut(region_id, f).unwrap(), + None | Some(false) => self + .get_cached_manager() + .access_cached_region_info_mut(region_id, f) + .unwrap(), }; #[cfg(any(test, feature = "testexport"))] @@ -555,7 +327,7 @@ impl TiFlashObserver { self.store_id, region_id, new_peer_id, res; "region_id" => region_id, ); - self.fallback_to_slow_path(region_id); + cached_manager.fallback_to_slow_path(region_id); return false; } }; @@ -570,7 +342,7 @@ impl TiFlashObserver { self.store_id, region_id, new_peer_id, res; "region_id" => region_id, ); - self.fallback_to_slow_path(region_id); + cached_manager.fallback_to_slow_path(region_id); } if let Err(_e) = new_region.merge_from_bytes(region_str) { error!( @@ -578,7 +350,7 @@ impl TiFlashObserver { self.store_id, region_id, new_peer_id, res; "region_id" => region_id, ); - self.fallback_to_slow_path(region_id); + cached_manager.fallback_to_slow_path(region_id); } // Validate @@ -590,7 +362,7 @@ impl TiFlashObserver { "region_id" => region_id, "region" => ?new_region, ); - self.fallback_to_slow_path(region_id); + cached_manager.fallback_to_slow_path(region_id); return false; } @@ -626,7 +398,7 @@ impl TiFlashObserver { self.store_id, region_id, new_peer_id, s; "region_id" => region_id, ); - self.fallback_to_slow_path(region_id); + cached_manager.fallback_to_slow_path(region_id); return false; } }; @@ -637,7 +409,7 @@ impl TiFlashObserver { self.store_id, region_id, new_peer_id, e; "region_id" => region_id, ); - self.fallback_to_slow_path(region_id); + cached_manager.fallback_to_slow_path(region_id); return false; } }; @@ -673,6 +445,7 @@ impl TiFlashObserver { apply_state: RaftApplyState, new_region: kvproto::metapb::Region, ) -> RaftStoreResult { + let cached_manager = self.get_cached_manager(); let inner_msg = msg.get_message(); // Build 
snapshot by get_snapshot_for_building let (mut snap, key) = { @@ -717,10 +490,6 @@ impl TiFlashObserver { let mut path = cf_file.path.clone(); path.push(cf_file.file_prefix.clone()); path.set_extension("sst"); - info!( - "!!!!! create snapshot data file {:?} {}", - path, snap.hold_tmp_files - ); let mut f = std::fs::File::create(path.as_path())?; f.flush()?; f.sync_all()?; @@ -744,7 +513,7 @@ impl TiFlashObserver { f.sync_all()?; } snap_data.set_meta(snapshot_meta); - snap.hold_tmp_files = false; + snap.set_hold_tmp_files(false); } pb_snapshot_metadata @@ -783,7 +552,8 @@ impl TiFlashObserver { let current = SystemTime::now() .duration_since(SystemTime::UNIX_EPOCH) .unwrap(); - self.set_snapshot_inflight(region_id, current.as_millis()) + cached_manager + .set_snapshot_inflight(region_id, current.as_millis()) .unwrap(); // If we don't flush here, packet will lost. trans.flush(); @@ -816,10 +586,7 @@ impl TiFlashObserver { let snap_pool = Builder::new(tikv_util::thd_name!("region-task")) .max_thread_count(snap_handle_pool_size) .build_future_pool(); - let mut cached_region_info = Vec::with_capacity(CACHED_REGION_INFO_SLOT_COUNT); - for _ in 0..CACHED_REGION_INFO_SLOT_COUNT { - cached_region_info.push(RwLock::new(HashMap::default())); - } + TiFlashObserver { store_id, engine_store_server_helper, @@ -830,7 +597,6 @@ impl TiFlashObserver { snap_handle_pool_size, apply_snap_pool: Some(Arc::new(snap_pool)), pending_delete_ssts: Arc::new(RwLock::new(vec![])), - cached_region_info: Arc::new(cached_region_info), trans: Arc::new(Mutex::new(trans)), snap_mgr: Arc::new(snap_mgr), engine_store_cfg, @@ -1302,7 +1068,8 @@ impl RegionChangeObserver for TiFlashObs self.engine_store_server_helper .handle_destroy(ob_ctx.region().get_id()); if self.engine_store_cfg.enable_fast_add_peer { - self.remove_cached_region_info(region_id); + self.get_cached_manager() + .remove_cached_region_info(region_id); } } } @@ -1378,7 +1145,9 @@ impl RegionChangeObserver for TiFlashObs "region_id" => 
region_id, ); // TODO remove unwrap - self.access_cached_region_info_mut(region_id, f).unwrap(); + self.get_cached_manager() + .access_cached_region_info_mut(region_id, f) + .unwrap(); } } } @@ -1468,6 +1237,7 @@ impl ApplySnapshotObserver for TiFlashOb "peer_id" => peer_id, "region_id" => region_id, "snap_key" => ?snap_key, + "has_snap" => snap.is_some(), "pending" => self.engine.pending_applies_count.load(Ordering::SeqCst), ); fail::fail_point!("on_ob_pre_handle_snapshot", |_| {}); @@ -1489,10 +1259,10 @@ impl ApplySnapshotObserver for TiFlashOb let mut should_skip = false; #[allow(clippy::collapsible_if)] if self.engine_store_cfg.enable_fast_add_peer { - if self.access_cached_region_info_mut( + if self.get_cached_manager().access_cached_region_info_mut( region_id, |info: MapEntry>| match info { - MapEntry::Occupied(mut o) => { + MapEntry::Occupied(o) => { let is_first_snapsot = !o.get().inited_or_fallback.load(Ordering::SeqCst); if is_first_snapsot { info!("fast path: prehandle first snapshot {}:{} {}, recover MsgAppend", self.store_id, region_id, peer_id; @@ -1512,27 +1282,29 @@ impl ApplySnapshotObserver for TiFlashOb }; } - let (sender, receiver) = mpsc::channel(); - let task = Arc::new(PrehandleTask::new(receiver, peer_id)); - { - let mut lock = match self.pre_handle_snapshot_ctx.lock() { - Ok(l) => l, - Err(_) => fatal!("pre_apply_snapshot poisoned"), - }; - let ctx = lock.deref_mut(); - ctx.tracer.insert(snap_key.clone(), task.clone()); - } - if should_skip { return; } - let engine_store_server_helper = self.engine_store_server_helper; - let region = ob_ctx.region().clone(); - let snap_key = snap_key.clone(); - let ssts = retrieve_sst_files(snap); match self.apply_snap_pool.as_ref() { Some(p) => { + let (sender, receiver) = mpsc::channel(); + let task = Arc::new(PrehandleTask::new(receiver, peer_id)); + { + let mut lock = match self.pre_handle_snapshot_ctx.lock() { + Ok(l) => l, + Err(_) => fatal!("pre_apply_snapshot poisoned"), + }; + let ctx = 
lock.deref_mut(); + ctx.tracer.insert(snap_key.clone(), task.clone()); + } + + let engine_store_server_helper = self.engine_store_server_helper; + let region = ob_ctx.region().clone(); + let snap_key = snap_key.clone(); + let ssts = retrieve_sst_files(snap); + + // We use thread pool to do pre handling. self.engine .pending_applies_count .fetch_add(1, Ordering::SeqCst); @@ -1548,7 +1320,13 @@ impl ApplySnapshotObserver for TiFlashOb &snap_key, ); match sender.send(res) { - Err(_e) => error!("pre apply snapshot err when send to receiver"), + Err(_e) => { + error!("pre apply snapshot err when send to receiver"; + "region_id" => region.get_id(), + "peer_id" => task.peer_id, + "snap_key" => ?snap_key, + ) + } Ok(_) => (), } }); @@ -1584,7 +1362,7 @@ impl ApplySnapshotObserver for TiFlashOb let mut should_skip = false; #[allow(clippy::collapsible_if)] if self.engine_store_cfg.enable_fast_add_peer { - if self.access_cached_region_info_mut( + if self.get_cached_manager().access_cached_region_info_mut( region_id, |info: MapEntry>| match info { MapEntry::Occupied(mut o) => { @@ -1613,11 +1391,16 @@ impl ApplySnapshotObserver for TiFlashOb fatal!("post_apply_snapshot poisoned") }; } + + if should_skip { + return; + } + let snap = match snap { None => return, Some(s) => s, }; - let maybe_snapshot = { + let maybe_prehandle_task = { let mut lock = match self.pre_handle_snapshot_ctx.lock() { Ok(l) => l, Err(_) => fatal!("post_apply_snapshot poisoned"), @@ -1625,10 +1408,8 @@ impl ApplySnapshotObserver for TiFlashOb let ctx = lock.deref_mut(); ctx.tracer.remove(snap_key) }; - if should_skip { - return; - } - let need_retry = match maybe_snapshot { + + let need_retry = match maybe_prehandle_task { Some(t) => { let neer_retry = match t.recv.recv() { Ok(snap_ptr) => { @@ -1654,9 +1435,16 @@ impl ApplySnapshotObserver for TiFlashOb true } }; - self.engine + // According to pre_apply_snapshot, if registered tracer, + // then we must have put it into thread pool. 
+ let prev = self + .engine .pending_applies_count .fetch_sub(1, Ordering::SeqCst); + + #[cfg(any(test, feature = "testexport"))] + assert!(prev > 0); + info!("apply snapshot finished"; "peer_id" => peer_id, "snap_key" => ?snap_key, @@ -1678,7 +1466,9 @@ impl ApplySnapshotObserver for TiFlashOb true } }; + if need_retry && !should_skip { + // Blocking pre handle. let ssts = retrieve_sst_files(snap); let ptr = pre_handle_snapshot_impl( self.engine_store_server_helper, diff --git a/engine_tiflash/Cargo.toml b/engine_tiflash/Cargo.toml index d06c6ba75f3..6969c39fc1a 100644 --- a/engine_tiflash/Cargo.toml +++ b/engine_tiflash/Cargo.toml @@ -42,6 +42,7 @@ libc = "0.2" log_wrappers = { workspace = true } num_cpus = "1" online_config = { workspace = true } +portable-atomic = "0.3" prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" protobuf = "2" diff --git a/engine_tiflash/src/cached_region_info_manager.rs b/engine_tiflash/src/cached_region_info_manager.rs new file mode 100644 index 00000000000..fed3526b62a --- /dev/null +++ b/engine_tiflash/src/cached_region_info_manager.rs @@ -0,0 +1,155 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + collections::hash_map::Entry as MapEntry, + sync::{ + atomic::{AtomicBool, Ordering}, + Arc, RwLock, + }, +}; + +use collections::HashMap; +use tikv_util::{error, info}; + +const CACHED_REGION_INFO_SLOT_COUNT: usize = 256; + +pub type Result = std::result::Result>; + +#[derive(Debug, Default)] +pub struct CachedRegionInfo { + pub replicated_or_created: AtomicBool, + // TiKV assumes a region's learner peer is added through snapshot. + // If this field is false, will try fast path when meet MsgAppend. + // If this field is true, it means this peer is inited or will be inited by a TiKV snapshot. 
+ // NOTE If we want a fallback, then we must set inited_or_fallback to true, + // Otherwise, a normal snapshot will be neglect in `post_apply_snapshot` and cause data loss. + pub inited_or_fallback: AtomicBool, + pub snapshot_inflight: portable_atomic::AtomicU128, +} + +pub type CachedRegionInfoMap = HashMap>; + +pub struct CachedRegionInfoManager { + pub cached_region_info: Arc>>, +} + +impl CachedRegionInfoManager { + // Credit: [splitmix64 algorithm](https://xorshift.di.unimi.it/splitmix64.c) + #[inline] + fn hash_u64(mut i: u64) -> u64 { + i = (i ^ (i >> 30)).wrapping_mul(0xbf58476d1ce4e5b9); + i = (i ^ (i >> 27)).wrapping_mul(0x94d049bb133111eb); + i ^ (i >> 31) + } + + #[allow(dead_code)] + #[inline] + fn unhash_u64(mut i: u64) -> u64 { + i = (i ^ (i >> 31) ^ (i >> 62)).wrapping_mul(0x319642b2d24d8ec3); + i = (i ^ (i >> 27) ^ (i >> 54)).wrapping_mul(0x96de1b173f119089); + i ^ (i >> 30) ^ (i >> 60) + } + + pub fn new() -> Self { + let mut cached_region_info = Vec::with_capacity(CACHED_REGION_INFO_SLOT_COUNT); + for _ in 0..CACHED_REGION_INFO_SLOT_COUNT { + cached_region_info.push(RwLock::new(HashMap::default())); + } + Self { + cached_region_info: Arc::new(cached_region_info), + } + } + + #[inline] + fn slot_index(id: u64) -> usize { + debug_assert!(CACHED_REGION_INFO_SLOT_COUNT.is_power_of_two()); + Self::hash_u64(id) as usize & (CACHED_REGION_INFO_SLOT_COUNT - 1) + } + + pub fn access_cached_region_info_mut>)>( + &self, + region_id: u64, + mut f: F, + ) -> Result<()> { + let slot_id = Self::slot_index(region_id); + let mut guard = match self.cached_region_info.get(slot_id).unwrap().write() { + Ok(g) => g, + Err(_) => return Err("access_cached_region_info_mut poisoned".into()), + }; + f(guard.entry(region_id)); + Ok(()) + } + + pub fn access_cached_region_info)>( + &self, + region_id: u64, + mut f: F, + ) { + let slot_id = Self::slot_index(region_id); + let guard = match self.cached_region_info.get(slot_id).unwrap().read() { + Ok(g) => g, + Err(_) => 
panic!("access_cached_region_info poisoned!"), + }; + match guard.get(®ion_id) { + Some(g) => f(g.clone()), + None => (), + } + } + + pub fn get_inited_or_fallback(&self, region_id: u64) -> Option { + let mut result: Option = None; + let f = |info: Arc| { + result = Some(info.inited_or_fallback.load(Ordering::SeqCst)); + }; + self.access_cached_region_info(region_id, f); + result + } + + pub fn remove_cached_region_info(&self, region_id: u64) { + let slot_id = Self::slot_index(region_id); + if let Ok(mut g) = self.cached_region_info.get(slot_id).unwrap().write() { + info!( + "remove_cached_region_info"; + "region_id" => region_id, + ); + let _ = g.remove(®ion_id); + } + } + + pub fn set_inited_or_fallback(&self, region_id: u64, v: bool) -> Result<()> { + self.access_cached_region_info_mut( + region_id, + |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + o.get_mut().inited_or_fallback.store(v, Ordering::SeqCst); + } + MapEntry::Vacant(_) => { + tikv_util::safe_panic!("not inited!"); + } + }, + ) + } + + pub fn set_snapshot_inflight(&self, region_id: u64, v: u128) -> Result<()> { + self.access_cached_region_info_mut( + region_id, + |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + o.get_mut().snapshot_inflight.store(v, Ordering::SeqCst); + } + MapEntry::Vacant(_) => { + tikv_util::safe_panic!("not inited!"); + } + }, + ) + } + + pub fn fallback_to_slow_path(&self, region_id: u64) { + // TODO clean local, and prepare to request snapshot from TiKV as a trivial + // procedure. 
+ fail::fail_point!("fallback_to_slow_path_not_allow", |_| {}); + if self.set_inited_or_fallback(region_id, true).is_err() { + tikv_util::safe_panic!("set_inited_or_fallback"); + } + } +} diff --git a/engine_tiflash/src/engine.rs b/engine_tiflash/src/engine.rs index f65ca8200cd..5dba0cbfc30 100644 --- a/engine_tiflash/src/engine.rs +++ b/engine_tiflash/src/engine.rs @@ -8,7 +8,7 @@ use std::{ ops::Deref, path::Path, sync::{ - atomic::{AtomicUsize, Ordering}, + atomic::{AtomicIsize, Ordering}, Arc, }, }; @@ -81,8 +81,10 @@ pub struct RocksEngine { pub rocks: engine_rocks::RocksEngine, pub engine_store_server_helper: isize, pub pool_capacity: usize, - pub pending_applies_count: Arc, + pub pending_applies_count: Arc, pub ffi_hub: Option>, + pub config_set: Option>, + pub cached_region_info_manager: Option>, } impl std::fmt::Debug for RocksEngine { @@ -107,6 +109,7 @@ impl RocksEngine { engine_store_server_helper: isize, snap_handle_pool_size: usize, ffi_hub: Option>, + config_set: Option>, ) { #[cfg(feature = "enable-pagestorage")] tikv_util::info!("enabled pagestorage"); @@ -116,6 +119,8 @@ impl RocksEngine { self.pool_capacity = snap_handle_pool_size; self.pending_applies_count.store(0, Ordering::SeqCst); self.ffi_hub = ffi_hub; + self.config_set = config_set; + self.cached_region_info_manager = Some(Arc::new(crate::CachedRegionInfoManager::new())) } pub fn from_rocks(rocks: engine_rocks::RocksEngine) -> Self { @@ -123,8 +128,10 @@ impl RocksEngine { rocks, engine_store_server_helper: 0, pool_capacity: 0, - pending_applies_count: Arc::new(AtomicUsize::new(0)), + pending_applies_count: Arc::new(AtomicIsize::new(0)), ffi_hub: None, + config_set: None, + cached_region_info_manager: None, } } @@ -133,8 +140,10 @@ impl RocksEngine { rocks: engine_rocks::RocksEngine::from_db(db), engine_store_server_helper: 0, pool_capacity: 0, - pending_applies_count: Arc::new(AtomicUsize::new(0)), + pending_applies_count: Arc::new(AtomicIsize::new(0)), ffi_hub: None, + config_set: None, 
+ cached_region_info_manager: None, } } @@ -200,19 +209,44 @@ impl KvEngine for RocksEngine { // new task, or when `handle_pending_applies` need to handle multiple // snapshots. We need to compare to what's in queue. - fn can_apply_snapshot(&self, is_timeout: bool, new_batch: bool, _region_id: u64) -> bool { + fn can_apply_snapshot(&self, is_timeout: bool, new_batch: bool, region_id: u64) -> bool { + fail::fail_point!("on_can_apply_snapshot", |e| e + .unwrap() + .parse::() + .unwrap()); + if let Some(s) = self.config_set.as_ref() { + if s.engine_store.enable_fast_add_peer { + // TODO Return true if this is an empty snapshot. + // We need to test if the region is still in fast add peer mode. + let result = self + .cached_region_info_manager + .as_ref() + .expect("expect cached_region_info_manager") + .get_inited_or_fallback(region_id); + match result { + Some(true) => { + // Do nothing. + tikv_util::debug!("can_apply_snapshot no fast path. do normal checking"; + "region_id" => region_id, + ); + } + None | Some(false) => { + // Otherwise, try fast path. + return true; + } + }; + } + } // is called after calling observer's pre_handle_snapshot let in_queue = self.pending_applies_count.load(Ordering::SeqCst); - // if queue is full, we should begin to handle let can = if is_timeout && new_batch { + // If queue is full, we should begin to handle true } else { - in_queue > self.pool_capacity + // Otherwise, we wait until the queue is full. + // In order to batch more tasks. 
+ in_queue > (self.pool_capacity as isize) }; - fail::fail_point!("on_can_apply_snapshot", |e| e - .unwrap() - .parse::() - .unwrap()); can } } diff --git a/engine_tiflash/src/lib.rs b/engine_tiflash/src/lib.rs index 1d733e724be..022418a58ae 100644 --- a/engine_tiflash/src/lib.rs +++ b/engine_tiflash/src/lib.rs @@ -120,6 +120,8 @@ pub mod raw; mod proxy_utils; pub use proxy_utils::*; +mod cached_region_info_manager; +pub use cached_region_info_manager::*; pub use rocksdb::DB; pub fn get_env( diff --git a/engine_tiflash/src/proxy_utils.rs b/engine_tiflash/src/proxy_utils.rs index c44e355ae59..be7eb7cc2e9 100644 --- a/engine_tiflash/src/proxy_utils.rs +++ b/engine_tiflash/src/proxy_utils.rs @@ -71,3 +71,26 @@ pub fn log_check_double_write(batch: &crate::RocksWriteBatchVec) -> bool { } false } + +use serde_derive::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(default)] +#[serde(rename_all = "kebab-case")] +pub struct EngineStoreConfig { + pub enable_fast_add_peer: bool, +} + +#[allow(clippy::derivable_impls)] +impl Default for EngineStoreConfig { + fn default() -> Self { + Self { + enable_fast_add_peer: false, + } + } +} + +#[derive(Default, Debug)] +pub struct ProxyConfigSet { + pub engine_store: EngineStoreConfig, +} diff --git a/new-mock-engine-store/src/mock_cluster.rs b/new-mock-engine-store/src/mock_cluster.rs index 146b4e87a7f..65278790b13 100644 --- a/new-mock-engine-store/src/mock_cluster.rs +++ b/new-mock-engine-store/src/mock_cluster.rs @@ -320,17 +320,20 @@ impl> Cluster { .engine_store_server_helper; let helper = engine_store_ffi::gen_engine_store_server_helper(helper_ptr); - let ffi_hub = Arc::new(engine_store_ffi::observer::TiFlashFFIHub { + let ffi_hub = Arc::new(engine_store_ffi::TiFlashFFIHub { engine_store_server_helper: helper, }); (helper_ptr, ffi_hub) }; let engines = ffi_helper_set.engine_store_server.engines.as_mut().unwrap(); - + let proxy_config_set = Arc::new(engine_tiflash::ProxyConfigSet { 
+ engine_store: self.cfg.proxy_cfg.engine_store.clone(), + }); engines.kv.init( helper_ptr, self.cfg.proxy_cfg.raft_store.snap_handle_pool_size, Some(ffi_hub), + Some(proxy_config_set), ); assert_ne!(engines.kv.engine_store_server_helper, 0); diff --git a/new-mock-engine-store/src/mock_page_storage.rs b/new-mock-engine-store/src/mock_page_storage.rs index 5782ee4f987..68773cece7f 100644 --- a/new-mock-engine-store/src/mock_page_storage.rs +++ b/new-mock-engine-store/src/mock_page_storage.rs @@ -208,7 +208,7 @@ pub unsafe extern "C" fn ffi_mockps_handle_scan_page( } pub unsafe extern "C" fn ffi_mockps_handle_purge_pagestorage( - wrap: *const ffi_interfaces::EngineStoreServerWrap, + _wrap: *const ffi_interfaces::EngineStoreServerWrap, ) { // TODO } diff --git a/proxy_server/src/run.rs b/proxy_server/src/run.rs index b819713c9cd..5221ab7b5b2 100644 --- a/proxy_server/src/run.rs +++ b/proxy_server/src/run.rs @@ -422,15 +422,19 @@ impl TiKvServer { .unwrap_or_else(|s| fatal!("failed to create kv engine: {}", s)); let helper = engine_store_ffi::gen_engine_store_server_helper(engine_store_server_helper); - let ffi_hub = Arc::new(engine_store_ffi::observer::TiFlashFFIHub { + let ffi_hub = Arc::new(engine_store_ffi::TiFlashFFIHub { engine_store_server_helper: helper, }); // engine_tiflash::RocksEngine has engine_rocks::RocksEngine inside let mut kv_engine = TiFlashEngine::from_rocks(kv_engine); + let proxy_config_set = Arc::new(engine_tiflash::ProxyConfigSet { + engine_store: self.proxy_config.engine_store.clone(), + }); kv_engine.init( engine_store_server_helper, self.proxy_config.raft_store.snap_handle_pool_size, Some(ffi_hub), + Some(proxy_config_set), ); let engines = Engines::new(kv_engine, raft_engine); @@ -1721,10 +1725,10 @@ impl ConfiguredRaftEngine for RaftLogEngine { impl ConfiguredRaftEngine for PSEngine { fn build( - config: &TikvConfig, - env: &Arc, - key_manager: &Option>, - block_cache: &Option, + _config: &TikvConfig, + _env: &Arc, + _key_manager: &Option>, 
+ _block_cache: &Option, ) -> Self { PSEngine::new() } diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 1bd26d4a124..3f2f70c591b 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -354,7 +354,7 @@ fn test_apply_snapshot() { pd_client.must_add_peer(1, new_learner_peer(2, 2)); must_put_and_check_key(&mut cluster, 1, 2, Some(true), None, Some(vec![1])); - // We add peer 3, it will be paused before fetching peer 2's data. + // We add peer 3 from peer 2, it will be paused before fetching peer 2's data. // However, peer 2 will apply conf change. fail::cfg("ffi_fast_add_peer_from_id", "return(2)").unwrap(); fail::cfg("ffi_fast_add_peer_pause", "pause").unwrap(); @@ -369,19 +369,33 @@ fn test_apply_snapshot() { cluster.add_send_filter(CloneFilterFactory( RegionPacketFilter::new(1, 2) .msg_type(MessageType::MsgAppend) - .direction(Direction::Recv), + .direction(Direction::Both), + )); + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(1, 2) + .msg_type(MessageType::MsgSnapshot) + .direction(Direction::Both), )); cluster.must_put(b"k3", b"v3"); cluster.must_put(b"k4", b"v4"); - force_compact_log(&mut cluster, b"k2", Some(vec![1])); + cluster.must_put(b"k5", b"v5"); // Log compacted, peer 2 will get snapshot, however, we pause when applying // snapshot. + force_compact_log(&mut cluster, b"k2", Some(vec![1])); + // Wait log compacted. + std::thread::sleep(std::time::Duration::from_millis(1000)); fail::cfg("on_ob_post_apply_snapshot", "pause").unwrap(); // Trigger a snapshot to 2. cluster.clear_send_filters(); - std::thread::sleep(std::time::Duration::from_millis(300)); + debug!("wait applying snapshot of peer 2"); + // Wait until peer 2 in Applying state. 
+ must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: &States| -> bool { + states.in_disk_region_state.get_state() == PeerState::Applying + }); + // Now if we continue fast path, peer 2 will be in Applying state. + // Peer 3 can't use peer 2's data. // We will end up going slow path. fail::remove("ffi_fast_add_peer_pause"); fail::cfg("go_fast_path_succeed", "panic").unwrap(); From a3c15ce27d582dc695848bffb363631f4cae2db5 Mon Sep 17 00:00:00 2001 From: Jay Date: Mon, 16 Jan 2023 12:27:48 +0800 Subject: [PATCH 080/115] raftstore-v2: cleanup txn_ext (#14051) ref tikv/tikv#12842 Move transaction related code to txn_ext.rs. Fix the bug that snapshot doesn't set term and extra_op. Signed-off-by: Jay Lee --- components/raftstore-v2/src/fsm/peer.rs | 39 +-- .../operation/command/admin/conf_change.rs | 1 + .../src/operation/command/admin/split.rs | 16 +- .../command/admin/transfer_leader.rs | 97 +------ components/raftstore-v2/src/operation/mod.rs | 2 + components/raftstore-v2/src/operation/pd.rs | 16 -- .../raftstore-v2/src/operation/query/local.rs | 4 + .../raftstore-v2/src/operation/ready/mod.rs | 15 +- .../raftstore-v2/src/operation/txn_ext.rs | 260 ++++++++++++++++++ components/raftstore-v2/src/raft/peer.rs | 66 ++--- 10 files changed, 303 insertions(+), 213 deletions(-) create mode 100644 components/raftstore-v2/src/operation/txn_ext.rs diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index 0a6a66e8df1..26d5c2a1458 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -7,7 +7,7 @@ use std::borrow::Cow; use batch_system::{BasicMailbox, Fsm}; use crossbeam::channel::TryRecvError; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; -use raftstore::store::{Config, LocksStatus, TabletSnapManager, Transport}; +use raftstore::store::{Config, TabletSnapManager, Transport}; use slog::{debug, error, info, trace, Logger}; use tikv_util::{ is_zero_duration, @@ -32,7 
+32,6 @@ pub struct PeerFsm { /// twice accidentally. tick_registry: [bool; PeerTick::VARIANT_COUNT], is_stopped: bool, - reactivate_memory_lock_ticks: usize, } impl PeerFsm { @@ -55,7 +54,6 @@ impl PeerFsm { receiver: rx, tick_registry: [false; PeerTick::VARIANT_COUNT], is_stopped: false, - reactivate_memory_lock_ticks: 0, }); Ok((tx, fsm)) } @@ -136,9 +134,6 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, fn schedule_pending_ticks(&mut self) { let pending_ticks = self.fsm.peer.take_pending_ticks(); for tick in pending_ticks { - if tick == PeerTick::ReactivateMemoryLock { - self.fsm.reactivate_memory_lock_ticks = 0; - } self.schedule_tick(tick); } } @@ -225,7 +220,9 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, PeerTick::CheckPeerStaleState => unimplemented!(), PeerTick::EntryCacheEvict => self.on_entry_cache_evict(), PeerTick::CheckLeaderLease => unimplemented!(), - PeerTick::ReactivateMemoryLock => self.on_reactivate_memory_lock_tick(), + PeerTick::ReactivateMemoryLock => { + self.fsm.peer.on_reactivate_memory_lock_tick(self.store_ctx) + } PeerTick::ReportBuckets => unimplemented!(), PeerTick::CheckLongUncommitted => self.on_check_long_uncommitted(), } @@ -326,32 +323,4 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.fsm.peer.propose_pending_writes(self.store_ctx); self.schedule_pending_ticks(); } - - pub fn on_reactivate_memory_lock_tick(&mut self) { - let mut pessimistic_locks = self.fsm.peer.txn_ext().pessimistic_locks.write(); - - // If it is not leader, we needn't reactivate by tick. In-memory pessimistic - // lock will be enabled when this region becomes leader again. - // And this tick is currently only used for the leader transfer failure case. 
- if !self.fsm.peer().is_leader() - || pessimistic_locks.status != LocksStatus::TransferringLeader - { - return; - } - - self.fsm.reactivate_memory_lock_ticks += 1; - let transferring_leader = self.fsm.peer.raft_group().raft.lead_transferee.is_some(); - // `lead_transferee` is not set immediately after the lock status changes. So, - // we need the tick count condition to avoid reactivating too early. - if !transferring_leader - && self.fsm.reactivate_memory_lock_ticks - >= self.store_ctx.cfg.reactive_memory_lock_timeout_tick - { - pessimistic_locks.status = LocksStatus::Normal; - self.fsm.reactivate_memory_lock_ticks = 0; - } else { - drop(pessimistic_locks); - self.schedule_tick(PeerTick::ReactivateMemoryLock); - } - } } diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 8c9771b0201..42c433584fe 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -261,6 +261,7 @@ impl Apply { "changes" => ?changes, "legacy" => legacy, "original region" => ?region, "err" => ?e); + return Err(e); } } let conf_ver = region.get_region_epoch().get_conf_ver() + changes.len() as u64; diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index d01b1371338..f9e44286490 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -449,21 +449,9 @@ impl Peer { fail_point!("on_split", self.peer().get_store_id() == 3, |_| {}); let derived = &res.regions[res.derived_index]; - let derived_epoch = derived.get_region_epoch().clone(); let region_id = derived.get_id(); - // Group in-memory pessimistic locks in the original region into new regions. 
- // The locks of new regions will be put into the corresponding new regions - // later. And the locks belonging to the old region will stay in the original - // map. - let region_locks = { - let mut pessimistic_locks = self.txn_ext().pessimistic_locks.write(); - info!(self.logger, "moving {} locks to new regions", pessimistic_locks.len();); - // Update the version so the concurrent reader will fail due to EpochNotMatch - // instead of PessimisticLockNotFound. - pessimistic_locks.version = derived_epoch.get_version(); - pessimistic_locks.group_by_regions(&res.regions, derived) - }; + let region_locks = self.txn_context().split(&res.regions, derived); fail_point!("on_split_invalidate_locks"); let tablet: EK = match res.tablet.downcast() { @@ -650,7 +638,7 @@ impl Peer { let _ = self.raft_group_mut().campaign(); self.set_has_ready(); - *self.txn_ext().pessimistic_locks.write() = split_init.locks; + self.txn_context().init_with_lock(split_init.locks); let control = self.split_flow_control_mut(); control.approximate_size = split_init.approximate_size; control.approximate_keys = split_init.approximate_keys; diff --git a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs index 12bd7bbf491..54aa9845e17 100644 --- a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs +++ b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs @@ -3,22 +3,19 @@ use std::cmp::Ordering; use bytes::Bytes; -use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; -use fail::fail_point; +use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ disk_usage::DiskUsage, metapb, raft_cmdpb::{ - AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest, RaftRequestHeader, - TransferLeaderRequest, + AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest, TransferLeaderRequest, }, }; -use parking_lot::RwLockWriteGuard; use raft::{eraftpb, ProgressState, Storage}; use 
raftstore::{ store::{ fsm::new_admin_request, make_transfer_leader_response, metrics::PEER_ADMIN_CMD_COUNTER, - LocksStatus, TRANSFER_LEADER_COMMAND_REPLY_CTX, + TRANSFER_LEADER_COMMAND_REPLY_CTX, }, Result, }; @@ -30,9 +27,8 @@ use super::AdminCmdResult; use crate::{ batch::StoreContext, fsm::ApplyResReporter, - operation::command::write::SimpleWriteEncoder, raft::{Apply, Peer}, - router::{CmdResChannel, PeerMsg, PeerTick}, + router::{CmdResChannel, PeerMsg}, }; fn transfer_leader_cmd(msg: &RaftCmdRequest) -> Option<&TransferLeaderRequest> { @@ -296,91 +292,6 @@ impl Peer { } None } - - // Returns whether we should propose another TransferLeader command. This is - // for: - // - Considering the amount of pessimistic locks can be big, it can reduce - // unavailable time caused by waiting for the transferee catching up logs. - // - Make transferring leader strictly after write commands that executes before - // proposing the locks, preventing unexpected lock loss. - fn propose_locks_before_transfer_leader( - &mut self, - ctx: &mut StoreContext, - msg: &eraftpb::Message, - ) -> bool { - // 1. Disable in-memory pessimistic locks. - - // Clone to make borrow checker happy when registering ticks. - let txn_ext = self.txn_ext().clone(); - let mut pessimistic_locks = txn_ext.pessimistic_locks.write(); - - // If the message context == TRANSFER_LEADER_COMMAND_REPLY_CTX, the message - // is a reply to a transfer leader command before. If the locks status remain - // in the TransferringLeader status, we can safely initiate transferring leader - // now. - // If it's not in TransferringLeader status now, it is probably because several - // ticks have passed after proposing the locks in the last time and we - // reactivate the memory locks. Then, we should propose the locks again. 
- if msg.get_context() == TRANSFER_LEADER_COMMAND_REPLY_CTX - && pessimistic_locks.status == LocksStatus::TransferringLeader - { - return false; - } - - // If it is not writable, it's probably because it's a retried TransferLeader - // and the locks have been proposed. But we still need to return true to - // propose another TransferLeader command. Otherwise, some write requests that - // have marked some locks as deleted will fail because raft rejects more - // proposals. - // It is OK to return true here if it's in other states like MergingRegion or - // NotLeader. In those cases, the locks will fail to propose and nothing will - // happen. - if !pessimistic_locks.is_writable() { - return true; - } - pessimistic_locks.status = LocksStatus::TransferringLeader; - self.add_pending_tick(PeerTick::ReactivateMemoryLock); - - // 2. Propose pessimistic locks - if pessimistic_locks.is_empty() { - return false; - } - // FIXME: Raft command has size limit. Either limit the total size of - // pessimistic locks in a region, or split commands here. - let mut encoder = SimpleWriteEncoder::with_capacity(512); - let mut lock_count = 0; - { - // Downgrade to a read guard, do not block readers in the scheduler as far as - // possible. - let pessimistic_locks = RwLockWriteGuard::downgrade(pessimistic_locks); - fail_point!("invalidate_locks_before_transfer_leader"); - for (key, (lock, deleted)) in &*pessimistic_locks { - if *deleted { - continue; - } - lock_count += 1; - encoder.put(CF_LOCK, key.as_encoded(), &lock.to_lock().to_bytes()); - } - } - if lock_count == 0 { - // If the map is not empty but all locks are deleted, it is possible that a - // write command has just marked locks deleted but not proposed yet. - // It might cause that command to fail if we skip proposing the - // extra TransferLeader command here. 
- return true; - } - let mut header = Box::::default(); - header.set_region_id(self.region_id()); - header.set_region_epoch(self.region().get_region_epoch().clone()); - header.set_peer(self.peer().clone()); - info!( - self.logger, - "propose {} locks before transferring leader", lock_count; - ); - let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write(header, encoder.encode()).0 else {unreachable!()}; - self.on_simple_write(ctx, write.header, write.data, write.ch); - true - } } impl Apply { diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 807f425e998..76baf31f9c8 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -5,6 +5,7 @@ mod life; mod pd; mod query; mod ready; +mod txn_ext; pub use command::{ AdminCmdResult, ApplyFlowControl, CommittedEntries, CompactLogContext, ProposalControl, @@ -20,4 +21,5 @@ pub use ready::{ pub(crate) use self::{ command::SplitInit, query::{LocalReader, ReadDelegatePair, SharedReadTablet}, + txn_ext::TxnContext, }; diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 26945a3e176..17abdd85cf0 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -205,20 +205,4 @@ impl Peer { ); } } - - #[inline] - pub fn update_max_timestamp_pd(&self, ctx: &StoreContext, initial_status: u64) { - let task = pd::Task::UpdateMaxTimestamp { - region_id: self.region_id(), - initial_status, - txn_ext: self.txn_ext().clone(), - }; - if let Err(e) = ctx.schedulers.pd.schedule(task) { - error!( - self.logger, - "failed to notify pd with UpdateMaxTimestamp"; - "err" => %e, - ); - } - } } diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index e4c0aa6d0b9..13b815d1ebc 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ 
b/components/raftstore-v2/src/operation/query/local.rs @@ -2,6 +2,7 @@ // #[PerformanceCriticalPath] use std::{ + num::NonZeroU64, ops::Deref, sync::{atomic, Arc, Mutex}, }; @@ -246,6 +247,8 @@ where }; snap.txn_ext = Some(delegate.txn_ext.clone()); + snap.term = NonZeroU64::new(delegate.term); + snap.txn_extra_op = delegate.txn_extra_op.load(); snap.bucket_meta = delegate.bucket_meta.clone(); delegate.cached_tablet.release(); @@ -945,6 +948,7 @@ mod tests { assert_eq!(read_progress.safe_ts(), 2); let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); assert_eq!(*snap.get_region(), region1); + assert_eq!(snap.term, NonZeroU64::new(term6)); drop(mix_tx); handler.join().unwrap(); diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index d1348cf014b..87e1c100a87 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -728,12 +728,12 @@ impl Peer { // latency. self.raft_group_mut().skip_bcast_commit(false); - // Init the in-memory pessimistic lock table when the peer becomes leader. - self.activate_in_memory_pessimistic_locks(); - - // A more recent read may happen on the old leader. So max ts should - // be updated after a peer becomes leader. - self.require_updating_max_ts(ctx); + self.txn_context().on_became_leader( + ctx, + self.term(), + self.region(), + &self.logger, + ); // Exit entry cache warmup state when the peer becomes leader. 
self.entry_storage_mut().clear_entry_cache_warmup_state(); @@ -746,7 +746,8 @@ impl Peer { StateRole::Follower => { self.leader_lease_mut().expire(); self.storage_mut().cancel_generating_snap(None); - self.clear_in_memory_pessimistic_locks(); + self.txn_context() + .on_became_follower(self.term(), self.region()); } _ => {} } diff --git a/components/raftstore-v2/src/operation/txn_ext.rs b/components/raftstore-v2/src/operation/txn_ext.rs new file mode 100644 index 00000000000..911c1eaab78 --- /dev/null +++ b/components/raftstore-v2/src/operation/txn_ext.rs @@ -0,0 +1,260 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains everything related to transaction hook. +//! +//! This is the temporary (efficient) solution, it should be implemented as one +//! type of coprocessor. + +use std::sync::{atomic::Ordering, Arc}; + +use crossbeam::atomic::AtomicCell; +use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; +use kvproto::{kvrpcpb::ExtraOp, metapb::Region, raft_cmdpb::RaftRequestHeader}; +use parking_lot::RwLockWriteGuard; +use raft::eraftpb; +use raftstore::store::{ + LocksStatus, PeerPessimisticLocks, TxnExt, TRANSFER_LEADER_COMMAND_REPLY_CTX, +}; +use slog::{error, info, Logger}; + +use crate::{ + batch::StoreContext, + raft::Peer, + router::{PeerMsg, PeerTick}, + worker::pd, + SimpleWriteEncoder, +}; + +pub struct TxnContext { + ext: Arc, + extra_op: Arc>, + reactivate_memory_lock_ticks: usize, +} + +impl Default for TxnContext { + #[inline] + fn default() -> Self { + Self { + ext: Arc::default(), + extra_op: Arc::new(AtomicCell::new(ExtraOp::Noop)), + reactivate_memory_lock_ticks: 0, + } + } +} + +impl TxnContext { + #[inline] + pub fn on_region_changed(&self, term: u64, region: &Region) { + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + pessimistic_locks.term = term; + pessimistic_locks.version = region.get_region_epoch().get_version(); + } + + #[inline] + pub fn on_became_leader( + &self, + ctx: &mut 
StoreContext, + term: u64, + region: &Region, + logger: &Logger, + ) { + // A more recent read may happen on the old leader. So max ts should + // be updated after a peer becomes leader. + self.require_updating_max_ts(ctx, term, region, logger); + + // Init the in-memory pessimistic lock table when the peer becomes leader. + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + pessimistic_locks.status = LocksStatus::Normal; + pessimistic_locks.term = term; + pessimistic_locks.version = region.get_region_epoch().get_version(); + } + + #[inline] + pub fn on_became_follower(&self, term: u64, region: &Region) { + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + pessimistic_locks.status = LocksStatus::NotLeader; + pessimistic_locks.clear(); + pessimistic_locks.term = term; + pessimistic_locks.version = region.get_region_epoch().get_version(); + } + + #[inline] + pub fn ext(&self) -> &Arc { + &self.ext + } + + #[inline] + pub fn extra_op(&self) -> &Arc> { + &self.extra_op + } + + // TODO: find a better place to put all txn related stuff. 
+ fn require_updating_max_ts( + &self, + ctx: &StoreContext, + term: u64, + region: &Region, + logger: &Logger, + ) where + EK: KvEngine, + ER: RaftEngine, + { + let epoch = region.get_region_epoch(); + let term_low_bits = term & ((1 << 32) - 1); // 32 bits + let version_lot_bits = epoch.get_version() & ((1 << 31) - 1); // 31 bits + let initial_status = (term_low_bits << 32) | (version_lot_bits << 1); + self.ext + .max_ts_sync_status + .store(initial_status, Ordering::SeqCst); + info!( + logger, + "require updating max ts"; + "initial_status" => initial_status, + ); + let task = pd::Task::UpdateMaxTimestamp { + region_id: region.get_id(), + initial_status, + txn_ext: self.ext.clone(), + }; + if let Err(e) = ctx.schedulers.pd.schedule(task) { + error!(logger, "failed to notify pd with UpdateMaxTimestamp"; "err" => ?e); + } + } + + pub fn split(&self, regions: &[Region], derived: &Region) -> Vec { + // Group in-memory pessimistic locks in the original region into new regions. + // The locks of new regions will be put into the corresponding new regions + // later. And the locks belonging to the old region will stay in the original + // map. + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + // Update the version so the concurrent reader will fail due to EpochNotMatch + // instead of PessimisticLockNotFound. + pessimistic_locks.version = derived.get_region_epoch().get_version(); + pessimistic_locks.group_by_regions(regions, derived) + } + + pub fn init_with_lock(&self, locks: PeerPessimisticLocks) { + let mut pessimistic_locks = self.ext.pessimistic_locks.write(); + *pessimistic_locks = locks; + } +} + +impl Peer { + /// Returns True means the tick is consumed, otherwise the tick should be + /// rescheduled. + pub fn on_reactivate_memory_lock_tick(&mut self, ctx: &mut StoreContext) { + // If it is not leader, we needn't reactivate by tick. In-memory pessimistic + // lock will be enabled when this region becomes leader again. 
+ if !self.is_leader() { + return; + } + + let transferring_leader = self.raft_group().raft.lead_transferee.is_some(); + let txn_context = self.txn_context_mut(); + let mut pessimistic_locks = txn_context.ext.pessimistic_locks.write(); + + // And this tick is currently only used for the leader transfer failure case. + if pessimistic_locks.status != LocksStatus::TransferringLeader { + return; + } + + txn_context.reactivate_memory_lock_ticks += 1; + // `lead_transferee` is not set immediately after the lock status changes. So, + // we need the tick count condition to avoid reactivating too early. + if !transferring_leader + && txn_context.reactivate_memory_lock_ticks >= ctx.cfg.reactive_memory_lock_timeout_tick + { + pessimistic_locks.status = LocksStatus::Normal; + txn_context.reactivate_memory_lock_ticks = 0; + } else { + drop(pessimistic_locks); + self.add_pending_tick(PeerTick::ReactivateMemoryLock); + } + } + + // Returns whether we should propose another TransferLeader command. This is + // for: + // - Considering the amount of pessimistic locks can be big, it can reduce + // unavailable time caused by waiting for the transferee catching up logs. + // - Make transferring leader strictly after write commands that executes before + // proposing the locks, preventing unexpected lock loss. + pub fn propose_locks_before_transfer_leader( + &mut self, + ctx: &mut StoreContext, + msg: &eraftpb::Message, + ) -> bool { + // 1. Disable in-memory pessimistic locks. + + // Clone to make borrow checker happy when registering ticks. + let txn_ext = self.txn_context().ext.clone(); + let mut pessimistic_locks = txn_ext.pessimistic_locks.write(); + + // If the message context == TRANSFER_LEADER_COMMAND_REPLY_CTX, the message + // is a reply to a transfer leader command before. If the locks status remain + // in the TransferringLeader status, we can safely initiate transferring leader + // now. 
+ // If it's not in TransferringLeader status now, it is probably because several + // ticks have passed after proposing the locks in the last time and we + // reactivate the memory locks. Then, we should propose the locks again. + if msg.get_context() == TRANSFER_LEADER_COMMAND_REPLY_CTX + && pessimistic_locks.status == LocksStatus::TransferringLeader + { + return false; + } + + // If it is not writable, it's probably because it's a retried TransferLeader + // and the locks have been proposed. But we still need to return true to + // propose another TransferLeader command. Otherwise, some write requests that + // have marked some locks as deleted will fail because raft rejects more + // proposals. + // It is OK to return true here if it's in other states like MergingRegion or + // NotLeader. In those cases, the locks will fail to propose and nothing will + // happen. + if !pessimistic_locks.is_writable() { + return true; + } + pessimistic_locks.status = LocksStatus::TransferringLeader; + self.txn_context_mut().reactivate_memory_lock_ticks = 0; + self.add_pending_tick(PeerTick::ReactivateMemoryLock); + + // 2. Propose pessimistic locks + if pessimistic_locks.is_empty() { + return false; + } + // FIXME: Raft command has size limit. Either limit the total size of + // pessimistic locks in a region, or split commands here. + let mut encoder = SimpleWriteEncoder::with_capacity(512); + let mut lock_count = 0; + { + // Downgrade to a read guard, do not block readers in the scheduler as far as + // possible. 
+ let pessimistic_locks = RwLockWriteGuard::downgrade(pessimistic_locks); + fail::fail_point!("invalidate_locks_before_transfer_leader"); + for (key, (lock, deleted)) in &*pessimistic_locks { + if *deleted { + continue; + } + lock_count += 1; + encoder.put(CF_LOCK, key.as_encoded(), &lock.to_lock().to_bytes()); + } + } + if lock_count == 0 { + // If the map is not empty but all locks are deleted, it is possible that a + // write command has just marked locks deleted but not proposed yet. + // It might cause that command to fail if we skip proposing the + // extra TransferLeader command here. + return true; + } + let mut header = Box::::default(); + header.set_region_id(self.region_id()); + header.set_region_epoch(self.region().get_region_epoch().clone()); + header.set_peer(self.peer().clone()); + info!( + self.logger, + "propose {} locks before transferring leader", lock_count; + ); + let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write(header, encoder.encode()).0 else {unreachable!()}; + self.on_simple_write(ctx, write.header, write.data, write.ch); + true + } +} diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index 8051066d4f9..6cfcda4da25 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -2,16 +2,15 @@ use std::{ cmp, mem, - sync::{atomic::Ordering, Arc}, + sync::Arc, time::{Duration, Instant}, }; use collections::{HashMap, HashSet}; -use crossbeam::atomic::AtomicCell; use engine_traits::{ CachedTablet, FlushState, KvEngine, RaftEngine, TabletContext, TabletRegistry, }; -use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, pdpb, raft_serverpb::RegionLocalState}; +use kvproto::{metapb, pdpb, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; use raft::{RawNode, StateRole}; use raftstore::{ @@ -19,19 +18,18 @@ use raftstore::{ store::{ fsm::ApplyMetrics, util::{Lease, RegionReadProgress}, - Config, EntryStorage, LocksStatus, PeerStat, ProposalQueue, 
ReadDelegate, ReadIndexQueue, - ReadProgress, TabletSnapManager, TxnExt, WriteTask, + Config, EntryStorage, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, ReadProgress, + TabletSnapManager, WriteTask, }, }; use slog::Logger; use super::storage::Storage; use crate::{ - batch::StoreContext, fsm::ApplyScheduler, operation::{ AsyncWriter, CompactLogContext, DestroyProgress, ProposalControl, SimpleWriteReqEncoder, - SplitFlowControl, + SplitFlowControl, TxnContext, }, router::{CmdResChannel, PeerTick, QueryResChannel}, Result, @@ -83,8 +81,7 @@ pub struct Peer { last_region_buckets: Option, /// Transaction extensions related to this peer. - txn_ext: Arc, - txn_extra_op: Arc>, + txn_context: TxnContext, pending_ticks: Vec, @@ -173,8 +170,7 @@ impl Peer { ), region_buckets: None, last_region_buckets: None, - txn_ext: Arc::default(), - txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::Noop)), + txn_context: TxnContext::default(), proposal_control: ProposalControl::new(0), pending_ticks: Vec::new(), split_trace: vec![], @@ -261,11 +257,8 @@ impl Peer { self.read_progress .update_leader_info(self.leader_id(), self.term(), self.region()); - { - let mut pessimistic_locks = self.txn_ext.pessimistic_locks.write(); - pessimistic_locks.term = self.term(); - pessimistic_locks.version = self.region().get_region_epoch().get_version(); - } + self.txn_context + .on_region_changed(self.term(), self.region()); if self.serving() { host.on_region_changed( @@ -639,21 +632,6 @@ impl Peer { mem::take(&mut self.pending_ticks) } - pub fn activate_in_memory_pessimistic_locks(&mut self) { - let mut pessimistic_locks = self.txn_ext.pessimistic_locks.write(); - pessimistic_locks.status = LocksStatus::Normal; - pessimistic_locks.term = self.term(); - pessimistic_locks.version = self.region().get_region_epoch().get_version(); - } - - pub fn clear_in_memory_pessimistic_locks(&mut self) { - let mut pessimistic_locks = self.txn_ext.pessimistic_locks.write(); - pessimistic_locks.status = 
LocksStatus::NotLeader; - pessimistic_locks.clear(); - pessimistic_locks.term = self.term(); - pessimistic_locks.version = self.region().get_region_epoch().get_version(); - } - #[inline] pub fn post_split(&mut self) { self.reset_region_buckets(); @@ -678,8 +656,13 @@ impl Peer { } #[inline] - pub fn txn_ext(&self) -> &Arc { - &self.txn_ext + pub fn txn_context(&self) -> &TxnContext { + &self.txn_context + } + + #[inline] + pub fn txn_context_mut(&mut self) -> &mut TxnContext { + &mut self.txn_context } pub fn generate_read_delegate(&self) -> ReadDelegate { @@ -690,8 +673,8 @@ impl Peer { self.term(), self.region().clone(), self.storage().entry_storage().applied_term(), - self.txn_extra_op.clone(), - self.txn_ext.clone(), + self.txn_context.extra_op().clone(), + self.txn_context.ext().clone(), self.read_progress().clone(), self.region_buckets.as_ref().map(|b| b.meta.clone()), ) @@ -715,19 +698,6 @@ impl Peer { .advance_apply(apply_index, term, region); } - // TODO: find a better place to put all txn related stuff. 
- pub fn require_updating_max_ts(&self, ctx: &StoreContext) { - let epoch = self.region().get_region_epoch(); - let term_low_bits = self.term() & ((1 << 32) - 1); // 32 bits - let version_lot_bits = epoch.get_version() & ((1 << 31) - 1); // 31 bits - let initial_status = (term_low_bits << 32) | (version_lot_bits << 1); - self.txn_ext - .max_ts_sync_status - .store(initial_status, Ordering::SeqCst); - - self.update_max_timestamp_pd(ctx, initial_status); - } - #[inline] pub fn split_trace_mut(&mut self) -> &mut Vec<(u64, HashSet)> { &mut self.split_trace From dd230992869f1e0bd26f8243b0e03214e2c756fb Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Mon, 16 Jan 2023 20:48:11 +0800 Subject: [PATCH 081/115] Refactor send snapshot (#258) --- components/raftstore/src/store/snap.rs | 7 ++-- engine_store_ffi/src/observer.rs | 44 ++++++++++++++------------ proxy_scripts/ci_check.sh | 1 + proxy_server/src/run.rs | 1 + proxy_tests/proxy/config.rs | 1 + 5 files changed, 31 insertions(+), 23 deletions(-) diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index a9c98dcd69f..40ccac997fa 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -663,7 +663,7 @@ impl Snapshot { Ok(snapshot_meta) } - fn set_snapshot_meta(&mut self, snapshot_meta: SnapshotMeta) -> RaftStoreResult<()> { + pub fn set_snapshot_meta(&mut self, snapshot_meta: SnapshotMeta) -> RaftStoreResult<()> { let mut cf_file_count_from_meta: Vec = vec![]; let mut file_count = 0; let mut current_cf = ""; @@ -812,8 +812,9 @@ impl Snapshot { } } - // Only called in `do_build`. - fn save_meta_file(&mut self) -> RaftStoreResult<()> { + // Save `SnapshotMeta` to file. + // Used in `do_build` and by external crates. 
+ pub fn save_meta_file(&mut self) -> RaftStoreResult<()> { let v = box_try!(self.meta_file.meta.as_ref().unwrap().write_to_bytes()); if let Some(mut f) = self.meta_file.file.take() { // `meta_file` could be None for this case: in `init_for_building` the snapshot diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 6b6dd65d846..1486958c3d2 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -447,8 +447,8 @@ impl TiFlashObserver { ) -> RaftStoreResult { let cached_manager = self.get_cached_manager(); let inner_msg = msg.get_message(); - // Build snapshot by get_snapshot_for_building - let (mut snap, key) = { + // Get a snapshot object. + let (mut snapshot, key) = { // Find term of entry at applied_index. let applied_index = apply_state.get_applied_index(); let applied_term = @@ -475,18 +475,18 @@ impl TiFlashObserver { // Build snapshot by do_snapshot let mut pb_snapshot: eraftpb::Snapshot = Default::default(); let pb_snapshot_metadata: &mut eraftpb::SnapshotMetadata = pb_snapshot.mut_metadata(); - let mut snap_data = kvproto::raft_serverpb::RaftSnapshotData::default(); + let mut pb_snapshot_data = kvproto::raft_serverpb::RaftSnapshotData::default(); { // eraftpb::SnapshotMetadata for (_, cf) in raftstore::store::snap::SNAPSHOT_CFS_ENUM_PAIR { - let cf_index: RaftStoreResult = snap + let cf_index: RaftStoreResult = snapshot .cf_files() .iter() .position(|x| &x.cf == cf) .ok_or(box_err!("can't find index for cf {}", cf)); let cf_index = cf_index?; - let cf_file = &snap.cf_files()[cf_index]; - // Create fake file. + let cf_file = &snapshot.cf_files()[cf_index]; + // Create fake cf file. 
let mut path = cf_file.path.clone(); path.push(cf_file.file_prefix.clone()); path.set_extension("sst"); @@ -494,26 +494,30 @@ impl TiFlashObserver { f.flush()?; f.sync_all()?; } - snap_data.set_region(new_region.clone()); - snap_data.set_file_size(0); + pb_snapshot_data.set_region(new_region.clone()); + pb_snapshot_data.set_file_size(0); const SNAPSHOT_VERSION: u64 = 2; - snap_data.set_version(SNAPSHOT_VERSION); + pb_snapshot_data.set_version(SNAPSHOT_VERSION); // SnapshotMeta // Which is snap.meta_file.meta - let snapshot_meta = raftstore::store::snap::gen_snapshot_meta(snap.cf_files(), true)?; + let snapshot_meta = + raftstore::store::snap::gen_snapshot_meta(snapshot.cf_files(), true)?; // Write MetaFile { - let v = snapshot_meta.write_to_bytes()?; - let mut f = std::fs::File::create(snap.meta_path())?; - info!("!!!!! create snapshot meta file {:?}", snap.meta_path()); - f.write_all(&v[..])?; - f.flush()?; - f.sync_all()?; + // let v = snapshot_meta.write_to_bytes()?; + // let mut f = std::fs::File::create(snapshot.meta_path())?; + // info!("!!!!! 
create snapshot meta file {:?}", snapshot.meta_path()); + // f.write_all(&v[..])?; + // f.flush()?; + // f.sync_all()?; + // snapshot.mut_meta_file().meta.insert(snapshot_meta.clone()); + snapshot.set_snapshot_meta(snapshot_meta.clone())?; + // snapshot.set_hold_tmp_files(false); + snapshot.save_meta_file()?; } - snap_data.set_meta(snapshot_meta); - snap.set_hold_tmp_files(false); + pb_snapshot_data.set_meta(snapshot_meta); } pb_snapshot_metadata @@ -521,7 +525,7 @@ impl TiFlashObserver { pb_snapshot_metadata.set_index(key.idx); pb_snapshot_metadata.set_term(key.term); - pb_snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); + pb_snapshot.set_data(pb_snapshot_data.write_to_bytes().unwrap().into()); // Send reponse let mut response = RaftMessage::default(); @@ -543,7 +547,7 @@ impl TiFlashObserver { msg.get_to_peer().get_id(), key, response, - snap_data, + pb_snapshot_data, apply_state ); match self.trans.lock() { diff --git a/proxy_scripts/ci_check.sh b/proxy_scripts/ci_check.sh index 1aa71509fc7..a6d432a1fd7 100755 --- a/proxy_scripts/ci_check.sh +++ b/proxy_scripts/ci_check.sh @@ -44,6 +44,7 @@ elif [[ $M == "testnew" ]]; then cargo test --package proxy_tests --test proxy region cargo test --package proxy_tests --test proxy flashback cargo test --package proxy_tests --test proxy server_cluster_test + cargo test --package proxy_tests --test proxy fast_add_peer cargo test --package proxy_tests --test proxy ffi -- --test-threads 1 cargo test --package proxy_tests --test proxy write --features="proxy_tests/enable-pagestorage" elif [[ $M == "debug" ]]; then diff --git a/proxy_server/src/run.rs b/proxy_server/src/run.rs index 5221ab7b5b2..c96941a6f67 100644 --- a/proxy_server/src/run.rs +++ b/proxy_server/src/run.rs @@ -539,6 +539,7 @@ impl TiKvServer { // Initialize and check config info!("using proxy config"; "config" => ?proxy_config); + info!("!!!!! 
using proxy config 2"; "engine_store" => ?proxy_config.engine_store); let cfg_controller = Self::init_config(config, &proxy_config); let config = cfg_controller.get_current(); diff --git a/proxy_tests/proxy/config.rs b/proxy_tests/proxy/config.rs index af5421c40b1..95b8e00d3cb 100644 --- a/proxy_tests/proxy/config.rs +++ b/proxy_tests/proxy/config.rs @@ -217,5 +217,6 @@ enable-fast-add-peer = true let proxy_config = gen_proxy_config(&cpath, false, &mut v); info!("using proxy config"; "config" => ?proxy_config); + info!("!!!!! using proxy config 2"; "engine_store" => ?proxy_config.engine_store); assert_eq!(true, proxy_config.engine_store.enable_fast_add_peer); } From 6d163b846327a0f61c1049b97cb4b315639ce9a6 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Mon, 16 Jan 2023 19:01:49 -0800 Subject: [PATCH 082/115] raftstore-v2: a few small fixes (#14039) ref tikv/tikv#12842 1) add snapshot apply metrics 2) disable bloomfilter for raftkv-v2 for now until a proper ratio is found 3) disable rocksdb write stall for raftkv-v2 until the tablet flow control is fully verified. 
Signed-off-by: Qi Xu Co-authored-by: Qi Xu --- components/raftstore-v2/src/operation/ready/mod.rs | 5 +++-- components/raftstore-v2/src/operation/ready/snapshot.rs | 6 ++++-- src/config/mod.rs | 7 ++++++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 87e1c100a87..38d126ac87a 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -31,8 +31,8 @@ use raft::{eraftpb, prelude::MessageType, Ready, StateRole, INVALID_ID}; use raftstore::{ coprocessor::{RegionChangeEvent, RoleChange}, store::{ - needs_evict_entry_cache, util, FetchedLogs, ReadProgress, Transport, WriteCallback, - WriteTask, + needs_evict_entry_cache, util, worker_metrics::SNAP_COUNTER, FetchedLogs, ReadProgress, + Transport, WriteCallback, WriteTask, }, }; use slog::{debug, error, info, trace, warn}; @@ -877,6 +877,7 @@ impl Storage { ctx.snap_mgr.clone(), ctx.tablet_registry.clone(), ) { + SNAP_COUNTER.apply.fail.inc(); error!(self.logger(),"failed to apply snapshot";"error" => ?e) } } diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 04b6ed7e12b..bcbe220252b 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -36,8 +36,9 @@ use raft::{eraftpb::Snapshot, StateRole}; use raftstore::{ coprocessor::RegionChangeEvent, store::{ - metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, GenSnapRes, ReadTask, TabletSnapKey, - TabletSnapManager, Transport, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, + metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, worker_metrics::SNAP_COUNTER, + GenSnapRes, ReadTask, TabletSnapKey, TabletSnapManager, Transport, WriteTask, + RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }, }; use slog::{error, info, warn}; @@ -252,6 +253,7 
@@ impl Peer { !s.scheduled || snapshot_index != RAFT_INIT_LOG_INDEX }) { info!(self.logger, "apply tablet snapshot completely"); + SNAP_COUNTER.apply.success.inc(); } if let Some(init) = split { info!(self.logger, "init split with snapshot finished"); diff --git a/src/config/mod.rs b/src/config/mod.rs index 8d3e5477f26..9caa68d8e6b 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -116,7 +116,8 @@ fn bloom_filter_ratio(et: EngineType) -> f64 { EngineType::RaftKv => 0.1, // In v2, every peer has its own tablet. The data scale is about tens of // GiBs. We only need a small portion for those key. - EngineType::RaftKv2 => 0.005, + // TODO: disable it for now until find out the proper ratio + EngineType::RaftKv2 => 0.0, } } @@ -1229,6 +1230,10 @@ impl DbConfig { self.write_buffer_limit.get_or_insert(ReadableSize( (total_mem * WRITE_BUFFER_MEMORY_LIMIT_RATE) as u64, )); + self.defaultcf.disable_write_stall = true; + self.writecf.disable_write_stall = true; + self.lockcf.disable_write_stall = true; + self.raftcf.disable_write_stall = true; } } } From a463db0911b4a2f2f47a29b567c54338a7ff3876 Mon Sep 17 00:00:00 2001 From: Zwb Date: Tue, 17 Jan 2023 14:51:48 +0800 Subject: [PATCH 083/115] apply: fix witness raft log gc panic and refactor (#14054) ref tikv/tikv#12876 fix witness raft log gc panic and refactor Signed-off-by: Wenbo Zhang Co-authored-by: Xinye Tao --- components/raftstore/src/store/config.rs | 7 ++ components/raftstore/src/store/fsm/apply.rs | 77 ++++++++++++--------- components/raftstore/src/store/fsm/peer.rs | 35 ++++------ tests/failpoints/cases/test_witness.rs | 8 +++ tests/integrations/config/mod.rs | 1 + 5 files changed, 74 insertions(+), 54 deletions(-) diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 454cf61a4c8..4d3210318a6 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -68,6 +68,9 @@ pub struct Config { pub 
raft_log_compact_sync_interval: ReadableDuration, // Interval to gc unnecessary raft log. pub raft_log_gc_tick_interval: ReadableDuration, + // Interval to request voter_replicated_index for gc unnecessary raft log, + // if the leader has not initiated gc for a long time. + pub request_voter_replicated_index_interval: ReadableDuration, // A threshold to gc stale raft log, must >= 1. pub raft_log_gc_threshold: u64, // When entry count exceed this value, gc will be forced trigger. @@ -339,6 +342,7 @@ impl Default for Config { raft_entry_max_size: ReadableSize::mb(8), raft_log_compact_sync_interval: ReadableDuration::secs(2), raft_log_gc_tick_interval: ReadableDuration::secs(3), + request_voter_replicated_index_interval: ReadableDuration::minutes(5), raft_log_gc_threshold: 50, raft_log_gc_count_limit: None, raft_log_gc_size_limit: None, @@ -813,6 +817,9 @@ impl Config { CONFIG_RAFTSTORE_GAUGE .with_label_values(&["raft_log_gc_tick_interval"]) .set(self.raft_log_gc_tick_interval.as_secs_f64()); + CONFIG_RAFTSTORE_GAUGE + .with_label_values(&["request_voter_replicated_index_interval"]) + .set(self.request_voter_replicated_index_interval.as_secs_f64()); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["raft_log_gc_threshold"]) .set(self.raft_log_gc_threshold as f64); diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 58df32fd404..60ed35e6892 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -258,6 +258,7 @@ pub enum ExecResult { CompactLog { state: RaftTruncatedState, first_index: u64, + has_pending: bool, }, SplitRegion { regions: Vec, @@ -300,7 +301,12 @@ pub enum ExecResult { SetFlashbackState { region: Region, }, - PendingCompactCmd, + // The raftstore thread will use it to update the internal state of `PeerFsm`. 
If it is + // `true`, when the raftstore detects that the raft log has not been gc for a long time, + // the raftstore thread will actively pull the `voter_replicated_index` from the leader + // and try to compact pending gc. If false, raftstore does not do any additional + // processing. + HasPendingCompactCmd(bool), } /// The possible returned value when applying logs. @@ -1508,7 +1514,7 @@ where | ExecResult::DeleteRange { .. } | ExecResult::IngestSst { .. } | ExecResult::TransferLeader { .. } - | ExecResult::PendingCompactCmd => {} + | ExecResult::HasPendingCompactCmd(..) => {} ExecResult::SplitRegion { ref derived, .. } => { self.region = derived.clone(); self.metrics.size_diff_hint = 0; @@ -2966,11 +2972,13 @@ where )) } + // When the first return value is true, it means that we have updated + // `RaftApplyState`, and the caller needs to do persistence. fn try_compact_log( &mut self, voter_replicated_index: u64, voter_replicated_term: u64, - ) -> Result>> { + ) -> Result<(bool, Option>)> { PEER_ADMIN_CMD_COUNTER.compact.all.inc(); let first_index = entry_storage::first_index(&self.apply_state); @@ -2981,7 +2989,7 @@ where "peer_id" => self.id(), "voter_replicated_index" => voter_replicated_index, ); - return Ok(None); + return Ok((false, None)); } // When the witness restarted, the pending compact cmd has been lost, so use @@ -2995,11 +3003,7 @@ where "compact_index" => voter_replicated_index, "first_index" => first_index, ); - return Ok(Some(TaskRes::Compact { - state: self.apply_state.get_truncated_state().clone(), - first_index: 0, - has_pending: false, - })); + return Ok((false, Some(ExecResult::HasPendingCompactCmd(false)))); } // compact failure is safe to be omitted, no need to assert. 
compact_raft_log( @@ -3009,11 +3013,7 @@ where voter_replicated_term, )?; PEER_ADMIN_CMD_COUNTER.compact.success.inc(); - return Ok(Some(TaskRes::Compact { - state: self.apply_state.get_truncated_state().clone(), - first_index, - has_pending: false, - })); + return Ok((true, Some(ExecResult::HasPendingCompactCmd(false)))); } match self.pending_cmds.pop_compact(voter_replicated_index) { @@ -3021,11 +3021,14 @@ where // compact failure is safe to be omitted, no need to assert. compact_raft_log(&self.tag, &mut self.apply_state, cmd.index, cmd.term)?; PEER_ADMIN_CMD_COUNTER.compact.success.inc(); - Ok(Some(TaskRes::Compact { - state: self.apply_state.get_truncated_state().clone(), - first_index, - has_pending: self.pending_cmds.has_compact(), - })) + Ok(( + true, + Some(ExecResult::CompactLog { + state: self.apply_state.get_truncated_state().clone(), + first_index, + has_pending: self.pending_cmds.has_compact(), + }), + )) } None => { info!( @@ -3034,7 +3037,7 @@ where "peer_id" => self.id(), "voter_replicated_index" => voter_replicated_index, ); - Ok(None) + Ok((false, None)) } } } @@ -3109,7 +3112,10 @@ where "peer_id" => self.id(), "command" => ?req.get_compact_log() ); - return Ok((resp, ApplyResult::Res(ExecResult::PendingCompactCmd))); + return Ok(( + resp, + ApplyResult::Res(ExecResult::HasPendingCompactCmd(true)), + )); } } } else { @@ -3133,6 +3139,7 @@ where ApplyResult::Res(ExecResult::CompactLog { state: self.apply_state.get_truncated_state().clone(), first_index, + has_pending: self.pending_cmds.has_compact(), }), )) } @@ -3693,11 +3700,6 @@ where // Whether destroy request is from its target region's snapshot merge_from_snapshot: bool, }, - Compact { - state: RaftTruncatedState, - first_index: u64, - has_pending: bool, - }, } pub struct ApplyFsm @@ -4109,18 +4111,29 @@ where voter_replicated_index: u64, voter_replicated_term: u64, ) { + if self.delegate.pending_remove || self.delegate.stopped { + return; + } + let res = self .delegate 
.try_compact_log(voter_replicated_index, voter_replicated_term); match res { - Ok(res) => { + Ok((should_write, res)) => { if let Some(res) = res { + if ctx.timer.is_none() { + ctx.timer = Some(Instant::now_coarse()); + } ctx.prepare_for(&mut self.delegate); - self.delegate.write_apply_state(ctx.kv_wb_mut()); - ctx.commit_opt(&mut self.delegate, true); - ctx.finish_for(&mut self.delegate, VecDeque::new()); - ctx.notifier - .notify_one(self.delegate.region_id(), PeerMsg::ApplyRes { res }); + let mut result = VecDeque::new(); + // If modified `truncated_state` in `try_compact_log`, the apply state should be + // persisted. + if should_write { + self.delegate.write_apply_state(ctx.kv_wb_mut()); + ctx.commit_opt(&mut self.delegate, true); + } + result.push_back(res); + ctx.finish_for(&mut self.delegate, result); } } Err(e) => error!(?e; diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 7e00798b6df..ccde4b031ef 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2311,21 +2311,6 @@ where *is_ready = true; } } - ApplyTaskRes::Compact { - state, - first_index, - has_pending, - } => { - self.fsm.peer.has_pending_compact_cmd = has_pending; - // When the witness restarts, the pending compact cmds will be lost. We will try - // to use `voter_replicated_index` as the `compact index` to avoid log - // accumulation, but if `voter_replicated_index` is less than `first_index`, - // then gc is not needed. In this case, the `first_index` we pass back will be - // 0, and `has_pending` set to false. 
- if first_index != 0 { - self.on_ready_compact_log(first_index, state); - } - } } if self.fsm.peer.unsafe_recovery_state.is_some() { self.check_unsafe_recovery_state(); @@ -4933,8 +4918,13 @@ where while let Some(result) = exec_results.pop_front() { match result { ExecResult::ChangePeer(cp) => self.on_ready_change_peer(cp), - ExecResult::CompactLog { first_index, state } => { - self.on_ready_compact_log(first_index, state) + ExecResult::CompactLog { + state, + first_index, + has_pending, + } => { + self.fsm.peer.has_pending_compact_cmd = has_pending; + self.on_ready_compact_log(first_index, state); } ExecResult::SplitRegion { derived, @@ -4969,9 +4959,11 @@ where ExecResult::IngestSst { ssts } => self.on_ingest_sst_result(ssts), ExecResult::TransferLeader { term } => self.on_transfer_leader(term), ExecResult::SetFlashbackState { region } => self.on_set_flashback_state(region), - ExecResult::PendingCompactCmd => { - self.fsm.peer.has_pending_compact_cmd = true; - self.register_pull_voter_replicated_index_tick(); + ExecResult::HasPendingCompactCmd(has_pending) => { + self.fsm.peer.has_pending_compact_cmd = has_pending; + if has_pending { + self.register_pull_voter_replicated_index_tick(); + } } } } @@ -5530,9 +5522,8 @@ where if !self.fsm.peer.is_witness() || !self.fsm.peer.has_pending_compact_cmd { return; } - // TODO: make it configurable if self.fsm.peer.last_compacted_time.elapsed() - > self.ctx.cfg.raft_log_gc_tick_interval.0 * 2 + > self.ctx.cfg.request_voter_replicated_index_interval.0 { let mut msg = ExtraMessage::default(); msg.set_type(ExtraMessageType::MsgVoterReplicatedIndexRequest); diff --git a/tests/failpoints/cases/test_witness.rs b/tests/failpoints/cases/test_witness.rs index 98a845b7016..552434d1fed 100644 --- a/tests/failpoints/cases/test_witness.rs +++ b/tests/failpoints/cases/test_witness.rs @@ -78,6 +78,10 @@ fn test_witness_raftlog_gc_pull_voter_replicated_index() { let mut cluster = new_server_cluster(0, 3); 
cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(50); + cluster + .cfg + .raft_store + .request_voter_replicated_index_interval = ReadableDuration::millis(100); cluster.run(); let nodes = Vec::from_iter(cluster.get_node_ids()); assert_eq!(nodes.len(), 3); @@ -155,6 +159,10 @@ fn test_witness_raftlog_gc_after_reboot() { let mut cluster = new_server_cluster(0, 3); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(100); cluster.cfg.raft_store.raft_log_gc_tick_interval = ReadableDuration::millis(50); + cluster + .cfg + .raft_store + .request_voter_replicated_index_interval = ReadableDuration::millis(100); cluster.run(); let nodes = Vec::from_iter(cluster.get_node_ids()); assert_eq!(nodes.len(), 3); diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 0c6cf7cdd9c..a4e15b8fa6e 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -180,6 +180,7 @@ fn test_serde_custom_tikv_config() { raft_entry_max_size: ReadableSize::mb(12), raft_log_compact_sync_interval: ReadableDuration::secs(12), raft_log_gc_tick_interval: ReadableDuration::secs(12), + request_voter_replicated_index_interval: ReadableDuration::minutes(5), raft_log_gc_threshold: 12, raft_log_gc_count_limit: Some(12), raft_log_gc_size_limit: Some(ReadableSize::kb(1)), From 5235542066f3cd41d02581c6ee064159938f545e Mon Sep 17 00:00:00 2001 From: iosmanthus Date: Tue, 17 Jan 2023 21:05:50 +0800 Subject: [PATCH 084/115] copr: support handling keyspace request (#14027) ref tikv/tikv#12999 copr: support handling keyspace request Signed-off-by: iosmanthus --- Cargo.lock | 4 + components/api_version/Cargo.toml | 1 + components/api_version/src/keyspace.rs | 163 +++++++++++++++ components/api_version/src/lib.rs | 6 +- components/test_backup/src/lib.rs | 9 +- components/tidb_query_common/Cargo.toml | 1 + .../tidb_query_common/src/storage/scanner.rs | 187 +++++++++--------- 
components/tidb_query_datatype/Cargo.toml | 1 + .../tidb_query_datatype/src/codec/table.rs | 17 +- components/tidb_query_executors/Cargo.toml | 1 + .../src/index_scan_executor.rs | 39 ++-- components/tidb_query_executors/src/runner.rs | 11 +- .../src/table_scan_executor.rs | 35 ++-- .../src/util/scan_executor.rs | 16 +- src/coprocessor/checksum.rs | 8 +- src/coprocessor/dag/mod.rs | 15 +- src/coprocessor/endpoint.rs | 20 +- src/coprocessor/statistics/analyze.rs | 46 +++-- .../coprocessor_executors/index_scan/util.rs | 3 +- .../coprocessor_executors/integrated/util.rs | 3 +- .../coprocessor_executors/table_scan/util.rs | 3 +- .../benches/coprocessor_executors/util/mod.rs | 3 +- .../integrations/coprocessor/test_checksum.rs | 10 +- 23 files changed, 414 insertions(+), 188 deletions(-) create mode 100644 components/api_version/src/keyspace.rs diff --git a/Cargo.lock b/Cargo.lock index 0b7ca52725c..069dbc4950e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -84,6 +84,7 @@ dependencies = [ "codec", "engine_traits", "kvproto", + "log_wrappers", "match-template", "panic_hook", "thiserror", @@ -6103,6 +6104,7 @@ name = "tidb_query_common" version = "0.0.1" dependencies = [ "anyhow", + "api_version", "async-trait", "byteorder", "derive_more", @@ -6124,6 +6126,7 @@ dependencies = [ name = "tidb_query_datatype" version = "0.0.1" dependencies = [ + "api_version", "base64", "bitfield", "bitflags", @@ -6164,6 +6167,7 @@ name = "tidb_query_executors" version = "0.0.1" dependencies = [ "anyhow", + "api_version", "async-trait", "codec", "collections", diff --git a/components/api_version/Cargo.toml b/components/api_version/Cargo.toml index 7362ca25ccc..c80607145bd 100644 --- a/components/api_version/Cargo.toml +++ b/components/api_version/Cargo.toml @@ -12,6 +12,7 @@ bitflags = "1.0.1" codec = { workspace = true } engine_traits = { workspace = true } kvproto = { workspace = true } +log_wrappers = { workspace = true } match-template = "0.0.1" thiserror = "1.0" tikv_alloc = { workspace = 
true } diff --git a/components/api_version/src/keyspace.rs b/components/api_version/src/keyspace.rs new file mode 100644 index 00000000000..4b263822a1b --- /dev/null +++ b/components/api_version/src/keyspace.rs @@ -0,0 +1,163 @@ +use std::fmt::Debug; + +use engine_traits::{Error, Result}; +use tikv_util::box_err; + +use super::*; + +const KEYSPACE_PREFIX_LEN: usize = 4; + +pub trait KvPair { + fn key(&self) -> &[u8]; + fn value(&self) -> &[u8]; + fn kv(&self) -> (&[u8], &[u8]) { + (self.key(), self.value()) + } +} + +impl KvPair for (Vec, Vec) { + fn key(&self) -> &[u8] { + &self.0 + } + fn value(&self) -> &[u8] { + &self.1 + } +} + +pub trait Keyspace { + type KvPair: KvPair = (Vec, Vec); + fn make_kv_pair(p: (Vec, Vec)) -> Result; + fn parse_keyspace(key: &[u8]) -> Result<(Option, &[u8])> { + Ok((None, key)) + } +} + +#[derive(PartialEq, Clone, Copy, Debug)] +pub struct KeyspaceId(u32); + +impl From for KeyspaceId { + fn from(id: u32) -> Self { + Self(id) + } +} + +impl Keyspace for ApiV1 { + fn make_kv_pair(p: (Vec, Vec)) -> Result { + Ok(p) + } +} + +impl Keyspace for ApiV1Ttl { + fn make_kv_pair(p: (Vec, Vec)) -> Result { + Ok(p) + } +} + +impl Keyspace for ApiV2 { + type KvPair = KeyspaceKv; + + fn make_kv_pair(p: (Vec, Vec)) -> Result { + let (k, v) = p; + let (keyspace, _) = Self::parse_keyspace(&k)?; + Ok(KeyspaceKv { + k, + v, + keyspace: keyspace.unwrap(), + }) + } + + fn parse_keyspace(key: &[u8]) -> Result<(Option, &[u8])> { + let mode = ApiV2::parse_key_mode(key); + if key.len() < KEYSPACE_PREFIX_LEN || (mode != KeyMode::Raw && mode != KeyMode::Txn) { + return Err(Error::Other(box_err!( + "invalid API V2 key: {}", + log_wrappers::Value(key) + ))); + } + let id = u32::from_be_bytes([0, key[1], key[2], key[3]]); + Ok((Some(KeyspaceId::from(id)), &key[KEYSPACE_PREFIX_LEN..])) + } +} + +pub struct KeyspaceKv { + k: Vec, + v: Vec, + keyspace: KeyspaceId, +} + +impl KvPair for KeyspaceKv { + fn key(&self) -> &[u8] { + &self.k[KEYSPACE_PREFIX_LEN..] 
+ } + + fn value(&self) -> &[u8] { + &self.v + } +} + +impl KeyspaceKv { + pub fn keyspace(&self) -> KeyspaceId { + self.keyspace + } +} + +impl PartialEq<(Vec, Vec)> for KeyspaceKv { + fn eq(&self, other: &(Vec, Vec)) -> bool { + self.kv() == (&other.0, &other.1) + } +} + +impl PartialEq for KeyspaceKv { + fn eq(&self, other: &Self) -> bool { + self.k == other.k && self.v == other.v + } +} + +impl Debug for KeyspaceKv { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("KeyspaceKv") + .field("key", &log_wrappers::Value(self.key())) + .field("value", &log_wrappers::Value(self.value())) + .field("keyspace", &self.keyspace()) + .finish() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_v1_parse_keyspace() { + let k = b"t123_111"; + let (keyspace, key) = ApiV1::parse_keyspace(k).unwrap(); + assert_eq!(None, keyspace); + assert_eq!(k, key); + + let (keyspace, key) = ApiV1Ttl::parse_keyspace(k).unwrap(); + assert_eq!(None, keyspace); + assert_eq!(k, key); + } + + #[test] + fn test_v2_parse_keyspace() { + let ok = vec![ + (b"x\x00\x00\x01t123_114", 1, b"t123_114"), + (b"r\x00\x00\x01t123_112", 1, b"t123_112"), + (b"x\x01\x00\x00t213_112", 0x010000, b"t213_112"), + (b"r\x01\x00\x00t123_113", 0x010000, b"t123_113"), + ]; + + for (key, id, user_key) in ok { + let (keyspace, key) = ApiV2::parse_keyspace(key).unwrap(); + assert_eq!(Some(KeyspaceId::from(id)), keyspace); + assert_eq!(user_key, key); + } + + let err: Vec<&[u8]> = vec![b"t123_111", b"s\x00\x00", b"r\x00\x00"]; + + for key in err { + ApiV2::parse_keyspace(key).unwrap_err(); + } + } +} diff --git a/components/api_version/src/lib.rs b/components/api_version/src/lib.rs index 0c9ae388917..879751e7b62 100644 --- a/components/api_version/src/lib.rs +++ b/components/api_version/src/lib.rs @@ -1,17 +1,21 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
#![feature(min_specialization)] +#![feature(associated_type_defaults)] mod api_v1; mod api_v1ttl; pub mod api_v2; +pub mod keyspace; use engine_traits::Result; use kvproto::kvrpcpb::ApiVersion; pub use match_template::match_template; use txn_types::{Key, TimeStamp}; -pub trait KvFormat: Clone + Copy + 'static + Send + Sync { +use crate::keyspace::Keyspace; + +pub trait KvFormat: Keyspace + Clone + Copy + 'static + Send + Sync { const TAG: ApiVersion; /// Corresponding TAG of client requests. For test only. #[cfg(any(test, feature = "testexport"))] diff --git a/components/test_backup/src/lib.rs b/components/test_backup/src/lib.rs index e990924c638..3409a6ef366 100644 --- a/components/test_backup/src/lib.rs +++ b/components/test_backup/src/lib.rs @@ -8,7 +8,7 @@ use std::{ time::Duration, }; -use api_version::{dispatch_api_version, KvFormat, RawValue}; +use api_version::{dispatch_api_version, keyspace::KvPair, ApiV1, KvFormat, RawValue}; use backup::Task; use collections::HashMap; use engine_traits::{CfName, IterOptions, CF_DEFAULT, CF_WRITE, DATA_KEY_PREFIX_LEN}; @@ -354,7 +354,7 @@ impl TestSuite { Default::default(), false, ); - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: TikvStorage::new(snap_store, false), ranges: vec![Range::Interval(IntervalRange::from((start, end)))], scan_backward_in_range: false, @@ -362,8 +362,9 @@ impl TestSuite { is_scanned_range_aware: false, }); let digest = crc64fast::Digest::new(); - while let Some((k, v)) = block_on(scanner.next()).unwrap() { - checksum = checksum_crc64_xor(checksum, digest.clone(), &k, &v); + while let Some(row) = block_on(scanner.next()).unwrap() { + let (k, v) = row.kv(); + checksum = checksum_crc64_xor(checksum, digest.clone(), k, v); total_kvs += 1; total_bytes += (k.len() + v.len()) as u64; } diff --git a/components/tidb_query_common/Cargo.toml b/components/tidb_query_common/Cargo.toml index 
3dd1693ba0d..f192b22a5f6 100644 --- a/components/tidb_query_common/Cargo.toml +++ b/components/tidb_query_common/Cargo.toml @@ -7,6 +7,7 @@ description = "Common utility of a query engine to run TiDB pushed down executor [dependencies] anyhow = "1.0" +api_version = { workspace = true } async-trait = "0.1" derive_more = "0.99.3" error_code = { workspace = true } diff --git a/components/tidb_query_common/src/storage/scanner.rs b/components/tidb_query_common/src/storage/scanner.rs index e12659f329b..d0d2345a09e 100644 --- a/components/tidb_query_common/src/storage/scanner.rs +++ b/components/tidb_query_common/src/storage/scanner.rs @@ -1,7 +1,8 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::time::Duration; +use std::{marker::PhantomData, time::Duration}; +use api_version::KvFormat; use tikv_util::time::Instant; use yatp::task::future::reschedule; @@ -17,7 +18,7 @@ const CHECK_KEYS: usize = 32; /// A scanner that scans over multiple ranges. Each range can be a point range /// containing only one row, or an interval range containing multiple rows. 
-pub struct RangesScanner { +pub struct RangesScanner { storage: T, ranges_iter: RangesIterator, @@ -34,6 +35,8 @@ pub struct RangesScanner { working_range_begin_key: Vec, working_range_end_key: Vec, rescheduler: RescheduleChecker, + + _phantom: PhantomData, } // TODO: maybe it's better to make it generic to avoid directly depending @@ -72,7 +75,7 @@ pub struct RangesScannerOptions { pub is_scanned_range_aware: bool, // TODO: This can be const generics } -impl RangesScanner { +impl RangesScanner { pub fn new( RangesScannerOptions { storage, @@ -81,7 +84,7 @@ impl RangesScanner { is_key_only, is_scanned_range_aware, }: RangesScannerOptions, - ) -> RangesScanner { + ) -> RangesScanner { let ranges_len = ranges.len(); let ranges_iter = RangesIterator::new(ranges); RangesScanner { @@ -98,13 +101,14 @@ impl RangesScanner { working_range_begin_key: Vec::with_capacity(KEY_BUFFER_CAPACITY), working_range_end_key: Vec::with_capacity(KEY_BUFFER_CAPACITY), rescheduler: RescheduleChecker::new(), + _phantom: PhantomData, } } /// Fetches next row. // Note: This is not implemented over `Iterator` since it can fail. // TODO: Change to use reference to avoid allocation and copy. - pub async fn next(&mut self) -> Result, StorageError> { + pub async fn next(&mut self) -> Result, StorageError> { self.next_opt(true).await } @@ -114,7 +118,7 @@ impl RangesScanner { pub async fn next_opt( &mut self, update_scanned_range: bool, - ) -> Result, StorageError> { + ) -> Result, StorageError> { loop { let mut force_check = true; let range = self.ranges_iter.next(); @@ -150,14 +154,14 @@ impl RangesScanner { if self.is_scanned_range_aware && update_scanned_range { self.update_scanned_range_from_scanned_row(&some_row); } - if some_row.is_some() { + if let Some(row) = some_row { // Retrieved one row from point range or interval range. 
if let Some(r) = self.scanned_rows_per_range.last_mut() { *r += 1; } self.rescheduler.check_reschedule(force_check).await; - - return Ok(some_row); + let kv = F::make_kv_pair(row).map_err(|e| StorageError(anyhow::Error::from(e)))?; + return Ok(Some(kv)); } else { // No more row in the range. self.ranges_iter.notify_drained(); @@ -288,6 +292,7 @@ impl RangesScanner { #[cfg(test)] mod tests { + use api_version::{keyspace::KvPair, ApiV1}; use futures::executor::block_on; use super::*; @@ -315,7 +320,7 @@ mod tests { PointRange::from("foo_3").into(), IntervalRange::from(("a", "c")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -323,24 +328,24 @@ mod tests { is_scanned_range_aware: false, }); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo".to_vec(), b"1".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo".to_vec(), b"1".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo_2".to_vec(), b"3".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_2".to_vec(), b"3".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo_3".to_vec(), b"5".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_3".to_vec(), b"5".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"bar".to_vec(), b"2".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar".to_vec(), b"2".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"bar_2".to_vec(), b"4".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar_2".to_vec(), b"4".to_vec()) ); assert_eq!(block_on(scanner.next()).unwrap(), None); @@ -351,7 +356,7 @@ mod tests { PointRange::from("foo_3").into(), IntervalRange::from(("a", "bar_2")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = 
RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -359,20 +364,20 @@ mod tests { is_scanned_range_aware: false, }); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo_2".to_vec(), b"3".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_2".to_vec(), b"3".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo".to_vec(), b"1".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo".to_vec(), b"1".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo_3".to_vec(), b"5".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_3".to_vec(), b"5".to_vec()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"bar".to_vec(), b"2".to_vec())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar".to_vec(), b"2".to_vec()) ); assert_eq!(block_on(scanner.next()).unwrap(), None); @@ -382,7 +387,7 @@ mod tests { PointRange::from("foo_3").into(), PointRange::from("bar_3").into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: false, @@ -390,24 +395,24 @@ mod tests { is_scanned_range_aware: false, }); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"bar".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar".to_vec(), Vec::new()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"bar_2".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"bar_2".to_vec(), Vec::new()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo".to_vec(), Vec::new()) ); assert_eq!( - block_on(scanner.next()).unwrap(), - Some((b"foo_2".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_2".to_vec(), Vec::new()) ); assert_eq!( - 
block_on(scanner.next()).unwrap(), - Some((b"foo_3".to_vec(), Vec::new())) + block_on(scanner.next()).unwrap().unwrap(), + (b"foo_3".to_vec(), Vec::new()) ); assert_eq!(block_on(scanner.next()).unwrap(), None); } @@ -422,7 +427,7 @@ mod tests { PointRange::from("foo_3").into(), IntervalRange::from(("a", "z")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: false, @@ -431,9 +436,9 @@ mod tests { }); let mut scanned_rows_per_range = Vec::new(); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_3"); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![2, 0, 1]); @@ -443,21 +448,21 @@ mod tests { assert_eq!(scanned_rows_per_range, vec![0]); scanned_rows_per_range.clear(); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar_2"); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![0, 2]); scanned_rows_per_range.clear(); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); assert_eq!(scanned_rows_per_range, vec![1]); scanned_rows_per_range.clear(); - 
assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_3"); assert_eq!(block_on(scanner.next()).unwrap(), None); scanner.collect_scanned_rows_per_range(&mut scanned_rows_per_range); @@ -477,7 +482,7 @@ mod tests { // No range let ranges = vec![]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -497,7 +502,7 @@ mod tests { // Empty interval range let ranges = vec![IntervalRange::from(("x", "xb")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -513,7 +518,7 @@ mod tests { // Empty point range let ranges = vec![PointRange::from("x").into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -529,7 +534,7 @@ mod tests { // Filled interval range let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -537,14 +542,14 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); let r = 
scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); assert_eq!(&r.upper_exclusive, b"foo_2\0"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_3"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_2\0"); @@ -567,7 +572,7 @@ mod tests { PointRange::from("bar_3").into(), IntervalRange::from(("bar_4", "box")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: false, @@ -575,25 +580,25 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); assert_eq!(&r.upper_exclusive, b"foo\0"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo\0"); assert_eq!(&r.upper_exclusive, b"foo_2\0"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_2\0"); assert_eq!(&r.upper_exclusive, b"bar\0"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar\0"); @@ -612,7 +617,7 @@ mod tests { // No range let ranges = vec![]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, 
@@ -632,7 +637,7 @@ mod tests { // Empty interval range let ranges = vec![IntervalRange::from(("x", "xb")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -648,7 +653,7 @@ mod tests { // Empty point range let ranges = vec![PointRange::from("x").into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -664,7 +669,7 @@ mod tests { // Filled interval range let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -672,14 +677,14 @@ mod tests { is_scanned_range_aware: true, }); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_3"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_3"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo_2"); assert_eq!(&r.upper_exclusive, b"foo_8"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); @@ -700,7 +705,7 @@ mod tests { IntervalRange::from(("foo_5", "foo_50")).into(), IntervalRange::from(("foo", "foo_3")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: true, @@ -708,20 +713,20 @@ mod tests { 
is_scanned_range_aware: true, }); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar_2"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar_2"); assert_eq!(&r.upper_exclusive, b"box"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"bar"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"bar"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"bar"); assert_eq!(&r.upper_exclusive, b"bar_2"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo_2"); - assert_eq!(&block_on(scanner.next()).unwrap().unwrap().0, b"foo"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo_2"); + assert_eq!(&block_on(scanner.next()).unwrap().unwrap().key(), b"foo"); let r = scanner.take_scanned_range(); assert_eq!(&r.lower_inclusive, b"foo"); @@ -739,7 +744,7 @@ mod tests { let storage = create_storage(); // Filled interval range let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: false, @@ -749,7 +754,7 @@ mod tests { // Only lower_inclusive is updated. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -757,7 +762,7 @@ mod tests { // Upper_exclusive is updated. assert_eq!( - &block_on(scanner.next_opt(true)).unwrap().unwrap().0, + &block_on(scanner.next_opt(true)).unwrap().unwrap().key(), b"foo_2" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -765,7 +770,7 @@ mod tests { // Upper_exclusive is not updated. 
assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo_3" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -791,7 +796,7 @@ mod tests { PointRange::from("bar_3").into(), IntervalRange::from(("bar_4", "box")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: false, @@ -801,7 +806,7 @@ mod tests { // Only lower_inclusive is updated. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -809,7 +814,7 @@ mod tests { // Upper_exclusive is updated. Updated by scanned row. assert_eq!( - &block_on(scanner.next_opt(true)).unwrap().unwrap().0, + &block_on(scanner.next_opt(true)).unwrap().unwrap().key(), b"foo_2" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -817,7 +822,7 @@ mod tests { // Upper_exclusive is not updated. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"bar" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -825,7 +830,7 @@ mod tests { // Upper_exclusive is not updated. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"bar_2" ); assert_eq!(&scanner.working_range_begin_key, b"foo"); @@ -846,7 +851,7 @@ mod tests { let storage = create_storage(); // Filled interval range let ranges = vec![IntervalRange::from(("foo", "foo_8")).into()]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: storage.clone(), ranges, scan_backward_in_range: true, @@ -856,7 +861,7 @@ mod tests { // Only lower_inclusive is updated. 
assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo_3" ); assert_eq!(&scanner.working_range_begin_key, b"foo_8"); @@ -864,7 +869,7 @@ mod tests { // Upper_exclusive is updated. assert_eq!( - &block_on(scanner.next_opt(true)).unwrap().unwrap().0, + &block_on(scanner.next_opt(true)).unwrap().unwrap().key(), b"foo_2" ); assert_eq!(&scanner.working_range_begin_key, b"foo_8"); @@ -872,7 +877,7 @@ mod tests { // Upper_exclusive is not updated. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo" ); assert_eq!(&scanner.working_range_begin_key, b"foo_8"); @@ -896,7 +901,7 @@ mod tests { IntervalRange::from(("foo_5", "foo_50")).into(), IntervalRange::from(("foo", "foo_3")).into(), ]; - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage, ranges, scan_backward_in_range: true, @@ -906,7 +911,7 @@ mod tests { // Lower_inclusive is updated. Upper_exclusive is not update. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"bar_2" ); assert_eq!(&scanner.working_range_begin_key, b"box"); @@ -914,7 +919,7 @@ mod tests { // Upper_exclusive is updated. Updated by scanned row. assert_eq!( - &block_on(scanner.next_opt(true)).unwrap().unwrap().0, + &block_on(scanner.next_opt(true)).unwrap().unwrap().key(), b"bar" ); assert_eq!(&scanner.working_range_begin_key, b"box"); @@ -922,7 +927,7 @@ mod tests { // Upper_exclusive is not update. assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo_2" ); assert_eq!(&scanner.working_range_begin_key, b"box"); @@ -930,7 +935,7 @@ mod tests { // Upper_exclusive is not update. 
assert_eq!( - &block_on(scanner.next_opt(false)).unwrap().unwrap().0, + &block_on(scanner.next_opt(false)).unwrap().unwrap().key(), b"foo" ); assert_eq!(&scanner.working_range_begin_key, b"box"); diff --git a/components/tidb_query_datatype/Cargo.toml b/components/tidb_query_datatype/Cargo.toml index e9d96e16284..e670674cdc6 100644 --- a/components/tidb_query_datatype/Cargo.toml +++ b/components/tidb_query_datatype/Cargo.toml @@ -6,6 +6,7 @@ publish = false description = "Data type of a query engine to run TiDB pushed down executors" [dependencies] +api_version = { workspace = true } base64 = "0.13" bitfield = "0.13.2" bitflags = "1.0.1" diff --git a/components/tidb_query_datatype/src/codec/table.rs b/components/tidb_query_datatype/src/codec/table.rs index 00f6c22347b..37becbfb801 100644 --- a/components/tidb_query_datatype/src/codec/table.rs +++ b/components/tidb_query_datatype/src/codec/table.rs @@ -2,6 +2,7 @@ use std::{cmp, convert::TryInto, io::Write, sync::Arc, u8}; +use api_version::KvFormat; use codec::prelude::*; use collections::{HashMap, HashSet}; use kvproto::coprocessor::KeyRange; @@ -75,10 +76,13 @@ pub fn extract_table_prefix(key: &[u8]) -> Result<&[u8]> { } /// Checks if the range is for table record or index. 
-pub fn check_table_ranges(ranges: &[KeyRange]) -> Result<()> { +pub fn check_table_ranges(ranges: &[KeyRange]) -> Result<()> { for range in ranges { - extract_table_prefix(range.get_start())?; - extract_table_prefix(range.get_end())?; + let (_, start) = + F::parse_keyspace(range.get_start()).map_err(|e| Error::Other(Box::new(e)))?; + let (_, end) = F::parse_keyspace(range.get_end()).map_err(|e| Error::Other(Box::new(e)))?; + extract_table_prefix(start)?; + extract_table_prefix(end)?; if range.get_start() >= range.get_end() { return Err(invalid_type!( "invalid range,range.start should be smaller than range.end, but got [{:?},{:?})", @@ -544,6 +548,7 @@ pub fn generate_index_data_for_test( mod tests { use std::{i64, iter::FromIterator}; + use api_version::ApiV1; use collections::{HashMap, HashSet}; use tipb::ColumnInfo; @@ -790,18 +795,18 @@ mod tests { let mut range = KeyRange::default(); range.set_start(small_key.clone()); range.set_end(large_key.clone()); - check_table_ranges(&[range]).unwrap(); + check_table_ranges::(&[range]).unwrap(); // test range.start > range.end let mut range = KeyRange::default(); range.set_end(small_key.clone()); range.set_start(large_key); - check_table_ranges(&[range]).unwrap_err(); + check_table_ranges::(&[range]).unwrap_err(); // test invalid end let mut range = KeyRange::default(); range.set_start(small_key); range.set_end(b"xx".to_vec()); - check_table_ranges(&[range]).unwrap_err(); + check_table_ranges::(&[range]).unwrap_err(); } #[test] diff --git a/components/tidb_query_executors/Cargo.toml b/components/tidb_query_executors/Cargo.toml index 123c306c125..331634dbd04 100644 --- a/components/tidb_query_executors/Cargo.toml +++ b/components/tidb_query_executors/Cargo.toml @@ -6,6 +6,7 @@ publish = false description = "A vector query engine to run TiDB pushed down executors" [dependencies] +api_version = { workspace = true } async-trait = "0.1" codec = { workspace = true } collections = { workspace = true } diff --git 
a/components/tidb_query_executors/src/index_scan_executor.rs b/components/tidb_query_executors/src/index_scan_executor.rs index ae04ffe03e6..9e415918541 100644 --- a/components/tidb_query_executors/src/index_scan_executor.rs +++ b/components/tidb_query_executors/src/index_scan_executor.rs @@ -2,6 +2,7 @@ use std::sync::Arc; +use api_version::{ApiV1, KvFormat}; use async_trait::async_trait; use codec::{number::NumberCodec, prelude::NumberDecoder}; use itertools::izip; @@ -30,11 +31,13 @@ use DecodeHandleStrategy::*; use super::util::scan_executor::*; use crate::interface::*; -pub struct BatchIndexScanExecutor(ScanExecutor); +pub struct BatchIndexScanExecutor( + ScanExecutor, +); // We assign a dummy type `Box>` so that we can // omit the type when calling `check_supported`. -impl BatchIndexScanExecutor>> { +impl BatchIndexScanExecutor>, ApiV1> { /// Checks whether this executor can be used. #[inline] pub fn check_supported(descriptor: &IndexScan) -> Result<()> { @@ -42,7 +45,7 @@ impl BatchIndexScanExecutor>> { } } -impl BatchIndexScanExecutor { +impl BatchIndexScanExecutor { pub fn new( storage: S, config: Arc, @@ -154,7 +157,7 @@ impl BatchIndexScanExecutor { } #[async_trait] -impl BatchExecutor for BatchIndexScanExecutor { +impl BatchExecutor for BatchIndexScanExecutor { type StorageStats = S::Statistics; #[inline] @@ -975,7 +978,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), vec![columns_info[0].clone(), columns_info[1].clone()], @@ -1028,7 +1031,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), vec![ @@ -1092,7 +1095,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store.clone(), 
Arc::new(EvalConfig::default()), vec![columns_info[1].clone(), columns_info[0].clone()], @@ -1133,7 +1136,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), vec![ @@ -1185,7 +1188,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), vec![ @@ -1262,7 +1265,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), vec![ @@ -1319,7 +1322,7 @@ mod tests { range }]; - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), vec![ @@ -1433,7 +1436,7 @@ mod tests { let mut value = value_prefix.clone(); value.extend(restore_data); let store = FixtureStorage::from(vec![(key.clone(), value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1476,7 +1479,7 @@ mod tests { let value = value_prefix; let store = FixtureStorage::from(vec![(key, value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1572,7 +1575,7 @@ mod tests { }]; let store = FixtureStorage::from(vec![(key, vec![])]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1672,7 +1675,7 @@ mod tests { }]; let store = FixtureStorage::from(vec![(key, value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, 
Arc::new(EvalConfig::default()), columns_info, @@ -1766,7 +1769,7 @@ mod tests { }]; let store = FixtureStorage::from(vec![(key, value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1859,7 +1862,7 @@ mod tests { }]; let store = FixtureStorage::from(vec![(key, value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1985,7 +1988,7 @@ mod tests { let mut value = value_prefix; value.extend(restore_data); let store = FixtureStorage::from(vec![(key, value)]); - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index 551c3da8a7e..d04be41507e 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -2,6 +2,7 @@ use std::{convert::TryFrom, sync::Arc}; +use api_version::KvFormat; use fail::fail_point; use kvproto::coprocessor::KeyRange; use protobuf::Message; @@ -164,7 +165,7 @@ fn is_arrow_encodable(schema: &[FieldType]) -> bool { } #[allow(clippy::explicit_counter_loop)] -pub fn build_executors( +pub fn build_executors( executor_descriptors: Vec, storage: S, ranges: Vec, @@ -192,7 +193,7 @@ pub fn build_executors( let primary_prefix_column_ids = descriptor.take_primary_prefix_column_ids(); Box::new( - BatchTableScanExecutor::new( + BatchTableScanExecutor::<_, F>::new( storage, config.clone(), columns_info, @@ -212,7 +213,7 @@ pub fn build_executors( let columns_info = descriptor.take_columns().into(); let primary_column_ids_len = descriptor.take_primary_column_ids().len(); Box::new( - BatchIndexScanExecutor::new( + BatchIndexScanExecutor::<_, F>::new( 
storage, config.clone(), columns_info, @@ -364,7 +365,7 @@ pub fn build_executors( } impl BatchExecutorsRunner { - pub fn from_request + 'static>( + pub fn from_request + 'static, F: KvFormat>( mut req: DagRequest, ranges: Vec, storage: S, @@ -380,7 +381,7 @@ impl BatchExecutorsRunner { config.paging_size = paging_size; let config = Arc::new(config); - let out_most_executor = build_executors( + let out_most_executor = build_executors::<_, F>( req.take_executors().into(), storage, ranges, diff --git a/components/tidb_query_executors/src/table_scan_executor.rs b/components/tidb_query_executors/src/table_scan_executor.rs index 957a23ba8c0..4397869fcaa 100644 --- a/components/tidb_query_executors/src/table_scan_executor.rs +++ b/components/tidb_query_executors/src/table_scan_executor.rs @@ -2,6 +2,7 @@ use std::{collections::HashSet, sync::Arc}; +use api_version::{ApiV1, KvFormat}; use async_trait::async_trait; use collections::HashMap; use kvproto::coprocessor::KeyRange; @@ -23,13 +24,15 @@ use tipb::{ColumnInfo, FieldType, TableScan}; use super::util::scan_executor::*; use crate::interface::*; -pub struct BatchTableScanExecutor(ScanExecutor); +pub struct BatchTableScanExecutor( + ScanExecutor, +); type HandleIndicesVec = SmallVec<[usize; 2]>; // We assign a dummy type `Box>` so that we can // omit the type when calling `check_supported`. -impl BatchTableScanExecutor>> { +impl BatchTableScanExecutor>, ApiV1> { /// Checks whether this executor can be used. 
#[inline] pub fn check_supported(descriptor: &TableScan) -> Result<()> { @@ -37,7 +40,7 @@ impl BatchTableScanExecutor>> { } } -impl BatchTableScanExecutor { +impl BatchTableScanExecutor { #[allow(clippy::too_many_arguments)] pub fn new( storage: S, @@ -110,7 +113,7 @@ impl BatchTableScanExecutor { } #[async_trait] -impl BatchExecutor for BatchTableScanExecutor { +impl BatchExecutor for BatchTableScanExecutor { type StorageStats = S::Statistics; #[inline] @@ -702,7 +705,7 @@ mod tests { batch_expect_rows: &[usize], ) { let columns_info = helper.columns_info_by_idx(col_idxs); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( helper.store(), Arc::new(EvalConfig::default()), columns_info, @@ -786,7 +789,7 @@ mod tests { fn test_execution_summary() { let helper = TableScanTestHelper::new(); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( helper.store(), Arc::new(EvalConfig::default()), helper.columns_info_by_idx(&[0]), @@ -925,7 +928,7 @@ mod tests { // For row 0 + row 1 + (row 2 ~ row 4), we should only get row 0, row 1 and an // error. for corrupted_row_index in 2..=4 { - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1032,7 +1035,7 @@ mod tests { // We should get row 0 and error because no further rows should be scanned when // there is an error. 
{ - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1080,7 +1083,7 @@ mod tests { }); let mut schema = schema.clone(); schema.push(FieldTypeTp::LongLong.into()); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info, @@ -1122,7 +1125,7 @@ mod tests { // Let's also repeat case 1 for smaller batch size { - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1165,7 +1168,7 @@ mod tests { // Case 2: row 1 + row 2 // We should get error and no row, for the same reason as above. { - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1186,7 +1189,7 @@ mod tests { // Case 3: row 2 + row 0 // We should get row 2 and row 0. There is no error. { - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store.clone(), Arc::new(EvalConfig::default()), columns_info.clone(), @@ -1220,7 +1223,7 @@ mod tests { // Case 4: row 1 // We should get error. 
{ - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1270,7 +1273,7 @@ mod tests { let store = FixtureStorage::new(iter::once((key, (Ok(value)))).collect()); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1378,7 +1381,7 @@ mod tests { let store = FixtureStorage::new(iter::once((key, (Ok(value)))).collect()); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info, @@ -1559,7 +1562,7 @@ mod tests { let store = FixtureStorage::new(iter::once((key, (Ok(value)))).collect()); - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( store, Arc::new(EvalConfig::default()), columns_info.clone(), diff --git a/components/tidb_query_executors/src/util/scan_executor.rs b/components/tidb_query_executors/src/util/scan_executor.rs index 935db5dd392..75c7cdc9fe3 100644 --- a/components/tidb_query_executors/src/util/scan_executor.rs +++ b/components/tidb_query_executors/src/util/scan_executor.rs @@ -1,5 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. +use api_version::{keyspace::KvPair, KvFormat}; use async_trait::async_trait; use kvproto::coprocessor::KeyRange; use tidb_query_common::{ @@ -40,12 +41,12 @@ pub trait ScanExecutorImpl: Send { /// A shared executor implementation for both table scan and index scan. /// Implementation differences between table scan and index scan are further /// given via `ScanExecutorImpl`. -pub struct ScanExecutor { +pub struct ScanExecutor { /// The internal scanning implementation. imp: I, /// The scanner that scans over ranges. 
- scanner: RangesScanner, + scanner: RangesScanner, /// A flag indicating whether this executor is ended. When table is drained /// or there was an error scanning the table, this flag will be set to @@ -63,7 +64,7 @@ pub struct ScanExecutorOptions { pub is_scanned_range_aware: bool, } -impl ScanExecutor { +impl ScanExecutor { pub fn new( ScanExecutorOptions { imp, @@ -75,7 +76,7 @@ impl ScanExecutor { is_scanned_range_aware, }: ScanExecutorOptions, ) -> Result { - tidb_query_datatype::codec::table::check_table_ranges(&key_ranges)?; + tidb_query_datatype::codec::table::check_table_ranges::(&key_ranges)?; if is_backward { key_ranges.reverse(); } @@ -108,10 +109,11 @@ impl ScanExecutor { for i in 0..scan_rows { let some_row = self.scanner.next_opt(i == scan_rows - 1).await?; - if let Some((key, value)) = some_row { + if let Some(row) = some_row { // Retrieved one row from point range or non-point range. - if let Err(e) = self.imp.process_kv_pair(&key, &value, columns) { + let (key, value) = row.kv(); + if let Err(e) = self.imp.process_kv_pair(key, value, columns) { // When there are errors in `process_kv_pair`, columns' length may not be // identical. For example, the filling process may be partially done so that // first several columns have N rows while the rest have N-1 rows. Since we do @@ -162,7 +164,7 @@ pub fn check_columns_info_supported(columns_info: &[ColumnInfo]) -> Result<()> { } #[async_trait] -impl BatchExecutor for ScanExecutor { +impl BatchExecutor for ScanExecutor { type StorageStats = S::Statistics; #[inline] diff --git a/src/coprocessor/checksum.rs b/src/coprocessor/checksum.rs index 52bd0a60184..3778f549427 100644 --- a/src/coprocessor/checksum.rs +++ b/src/coprocessor/checksum.rs @@ -1,5 +1,6 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. 
+use api_version::{keyspace::KvPair, ApiV1}; use async_trait::async_trait; use kvproto::coprocessor::{KeyRange, Response}; use protobuf::Message; @@ -18,7 +19,7 @@ use crate::{ // `ChecksumContext` is used to handle `ChecksumRequest` pub struct ChecksumContext { req: ChecksumRequest, - scanner: RangesScanner>>, + scanner: RangesScanner>, ApiV1>, } impl ChecksumContext { @@ -73,12 +74,13 @@ impl RequestHandler for ChecksumContext { let mut prefix_digest = crc64fast::Digest::new(); prefix_digest.write(&old_prefix); - while let Some((k, v)) = self.scanner.next().await? { + while let Some(row) = self.scanner.next().await? { + let (k, v) = row.kv(); if !k.starts_with(&new_prefix) { return Err(box_err!("Wrong prefix expect: {:?}", new_prefix)); } checksum = - checksum_crc64_xor(checksum, prefix_digest.clone(), &k[new_prefix.len()..], &v); + checksum_crc64_xor(checksum, prefix_digest.clone(), &k[new_prefix.len()..], v); total_kvs += 1; total_bytes += k.len() + v.len() + old_prefix.len() - new_prefix.len(); } diff --git a/src/coprocessor/dag/mod.rs b/src/coprocessor/dag/mod.rs index ce575859e59..31a6df181d5 100644 --- a/src/coprocessor/dag/mod.rs +++ b/src/coprocessor/dag/mod.rs @@ -2,8 +2,9 @@ mod storage_impl; -use std::sync::Arc; +use std::{marker::PhantomData, sync::Arc}; +use api_version::KvFormat; use async_trait::async_trait; use kvproto::coprocessor::{KeyRange, Response}; use protobuf::Message; @@ -18,7 +19,7 @@ use crate::{ tikv_util::quota_limiter::QuotaLimiter, }; -pub struct DagHandlerBuilder { +pub struct DagHandlerBuilder { req: DagRequest, ranges: Vec, store: S, @@ -29,9 +30,10 @@ pub struct DagHandlerBuilder { is_cache_enabled: bool, paging_size: Option, quota_limiter: Arc, + _phantom: PhantomData, } -impl DagHandlerBuilder { +impl DagHandlerBuilder { pub fn new( req: DagRequest, ranges: Vec, @@ -54,6 +56,7 @@ impl DagHandlerBuilder { is_cache_enabled, paging_size, quota_limiter, + _phantom: PhantomData, } } @@ -65,7 +68,7 @@ impl DagHandlerBuilder { pub fn 
build(self) -> Result> { COPR_DAG_REQ_COUNT.with_label_values(&["batch"]).inc(); - Ok(BatchDagHandler::new( + Ok(BatchDagHandler::new::<_, F>( self.req, self.ranges, self.store, @@ -87,7 +90,7 @@ pub struct BatchDagHandler { } impl BatchDagHandler { - pub fn new( + pub fn new( req: DagRequest, ranges: Vec, store: S, @@ -100,7 +103,7 @@ impl BatchDagHandler { quota_limiter: Arc, ) -> Result { Ok(Self { - runner: tidb_query_executors::runner::BatchExecutorsRunner::from_request( + runner: tidb_query_executors::runner::BatchExecutorsRunner::from_request::<_, F>( req, ranges, TikvStorage::new(store, is_cache_enabled), diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 711cd83e607..b9d01419a49 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -7,6 +7,7 @@ use std::{ use ::tracker::{ set_tls_tracker_token, with_tls_tracker, RequestInfo, RequestType, GLOBAL_TRACKERS, }; +use api_version::{dispatch_api_version, KvFormat}; use async_stream::try_stream; use concurrency_manager::ConcurrencyManager; use engine_traits::PerfLevel; @@ -147,6 +148,21 @@ impl Endpoint { /// /// It also checks if there are locks in memory blocking this read request. fn parse_request_and_check_memory_locks( + &self, + req: coppb::Request, + peer: Option, + is_streaming: bool, + ) -> Result<(RequestHandlerBuilder, ReqContext)> { + dispatch_api_version!(req.get_context().get_api_version(), { + self.parse_request_and_check_memory_locks_impl::(req, peer, is_streaming) + }) + } + + /// Parse the raw `Request` to create `RequestHandlerBuilder` and + /// `ReqContext`. Returns `Err` if fails. + /// + /// It also checks if there are locks in memory blocking this read request. 
+ fn parse_request_and_check_memory_locks_impl( &self, mut req: coppb::Request, peer: Option, @@ -232,7 +248,7 @@ impl Endpoint { 0 => None, i => Some(i), }; - dag::DagHandlerBuilder::new( + dag::DagHandlerBuilder::<_, F>::new( dag, req_ctx.ranges.clone(), store, @@ -281,7 +297,7 @@ impl Endpoint { let quota_limiter = self.quota_limiter.clone(); builder = Box::new(move |snap, req_ctx| { - statistics::analyze::AnalyzeContext::new( + statistics::analyze::AnalyzeContext::<_, F>::new( analyze, req_ctx.ranges.clone(), start_ts, diff --git a/src/coprocessor/statistics/analyze.rs b/src/coprocessor/statistics/analyze.rs index 383f6161a1b..25ecf95653d 100644 --- a/src/coprocessor/statistics/analyze.rs +++ b/src/coprocessor/statistics/analyze.rs @@ -1,7 +1,8 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. -use std::{cmp::Reverse, collections::BinaryHeap, mem, sync::Arc}; +use std::{cmp::Reverse, collections::BinaryHeap, marker::PhantomData, mem, sync::Arc}; +use api_version::{keyspace::KvPair, KvFormat}; use async_trait::async_trait; use kvproto::coprocessor::{KeyRange, Response}; use protobuf::Message; @@ -41,16 +42,17 @@ const ANALYZE_VERSION_V1: i32 = 1; const ANALYZE_VERSION_V2: i32 = 2; // `AnalyzeContext` is used to handle `AnalyzeReq` -pub struct AnalyzeContext { +pub struct AnalyzeContext { req: AnalyzeReq, storage: Option>>, ranges: Vec, storage_stats: Statistics, quota_limiter: Arc, is_auto_analyze: bool, + _phantom: PhantomData, } -impl AnalyzeContext { +impl AnalyzeContext { pub fn new( req: AnalyzeReq, ranges: Vec, @@ -77,13 +79,14 @@ impl AnalyzeContext { storage_stats: Statistics::default(), quota_limiter, is_auto_analyze, + _phantom: PhantomData, }) } // handle_column is used to process `AnalyzeColumnsReq` // it would build a histogram for the primary key(if needed) and // collectors for each column value. 
- async fn handle_column(builder: &mut SampleBuilder) -> Result> { + async fn handle_column(builder: &mut SampleBuilder) -> Result> { let (col_res, _) = builder.collect_columns_stats().await?; let res_data = { @@ -93,7 +96,7 @@ impl AnalyzeContext { Ok(res_data) } - async fn handle_mixed(builder: &mut SampleBuilder) -> Result> { + async fn handle_mixed(builder: &mut SampleBuilder) -> Result> { let (col_res, idx_res) = builder.collect_columns_stats().await?; let res_data = { @@ -109,7 +112,7 @@ impl AnalyzeContext { Ok(res_data) } - async fn handle_full_sampling(builder: &mut RowSampleBuilder) -> Result> { + async fn handle_full_sampling(builder: &mut RowSampleBuilder) -> Result> { let sample_res = builder.collect_column_stats().await?; let res_data = { let res = sample_res.into_proto(); @@ -122,7 +125,7 @@ impl AnalyzeContext { // it would build a histogram and count-min sketch of index values. async fn handle_index( req: AnalyzeIndexReq, - scanner: &mut RangesScanner>>, + scanner: &mut RangesScanner>, F>, is_common_handle: bool, ) -> Result> { let mut hist = Histogram::new(req.get_bucket_size() as usize); @@ -142,8 +145,8 @@ impl AnalyzeContext { } else { ANALYZE_VERSION_V1 }; - while let Some((key, _)) = scanner.next().await? { - let mut key = &key[..]; + while let Some(row) = scanner.next().await? 
{ + let mut key = row.key(); if is_common_handle { table::check_record_key(key)?; key = &key[table::PREFIX_LEN..]; @@ -209,14 +212,14 @@ impl AnalyzeContext { } #[async_trait] -impl RequestHandler for AnalyzeContext { +impl RequestHandler for AnalyzeContext { async fn handle_request(&mut self) -> Result> { let ret = match self.req.get_tp() { AnalyzeType::TypeIndex | AnalyzeType::TypeCommonHandle => { let req = self.req.take_idx_req(); let ranges = std::mem::take(&mut self.ranges); - table::check_table_ranges(&ranges)?; - let mut scanner = RangesScanner::new(RangesScannerOptions { + table::check_table_ranges::(&ranges)?; + let mut scanner = RangesScanner::<_, F>::new(RangesScannerOptions { storage: self.storage.take().unwrap(), ranges: ranges .into_iter() @@ -240,7 +243,7 @@ impl RequestHandler for AnalyzeContext { let col_req = self.req.take_col_req(); let storage = self.storage.take().unwrap(); let ranges = std::mem::take(&mut self.ranges); - let mut builder = SampleBuilder::new(col_req, None, storage, ranges)?; + let mut builder = SampleBuilder::<_, F>::new(col_req, None, storage, ranges)?; let res = AnalyzeContext::handle_column(&mut builder).await; builder.data.collect_storage_stats(&mut self.storage_stats); res @@ -252,7 +255,8 @@ impl RequestHandler for AnalyzeContext { let idx_req = self.req.take_idx_req(); let storage = self.storage.take().unwrap(); let ranges = std::mem::take(&mut self.ranges); - let mut builder = SampleBuilder::new(col_req, Some(idx_req), storage, ranges)?; + let mut builder = + SampleBuilder::<_, F>::new(col_req, Some(idx_req), storage, ranges)?; let res = AnalyzeContext::handle_mixed(&mut builder).await; builder.data.collect_storage_stats(&mut self.storage_stats); res @@ -263,7 +267,7 @@ impl RequestHandler for AnalyzeContext { let storage = self.storage.take().unwrap(); let ranges = std::mem::take(&mut self.ranges); - let mut builder = RowSampleBuilder::new( + let mut builder = RowSampleBuilder::<_, F>::new( col_req, storage, ranges, 
@@ -302,8 +306,8 @@ impl RequestHandler for AnalyzeContext { } } -struct RowSampleBuilder { - data: BatchTableScanExecutor>>, +struct RowSampleBuilder { + data: BatchTableScanExecutor>, F>, max_sample_size: usize, max_fm_sketch_size: usize, @@ -314,7 +318,7 @@ struct RowSampleBuilder { is_auto_analyze: bool, } -impl RowSampleBuilder { +impl RowSampleBuilder { fn new( mut req: AnalyzeColumnsReq, storage: TikvStorage>, @@ -784,8 +788,8 @@ impl Drop for BaseRowSampleCollector { } } -struct SampleBuilder { - data: BatchTableScanExecutor>>, +struct SampleBuilder { + data: BatchTableScanExecutor>, F>, max_bucket_size: usize, max_sample_size: usize, @@ -802,7 +806,7 @@ struct SampleBuilder { /// `SampleBuilder` is used to analyze columns. It collects sample from /// the result set using Reservoir Sampling algorithm, estimates NDVs /// using FM Sketch during the collecting process, and builds count-min sketch. -impl SampleBuilder { +impl SampleBuilder { fn new( mut req: AnalyzeColumnsReq, common_handle_req: Option, diff --git a/tests/benches/coprocessor_executors/index_scan/util.rs b/tests/benches/coprocessor_executors/index_scan/util.rs index 7531fb68944..8d579c98a4f 100644 --- a/tests/benches/coprocessor_executors/index_scan/util.rs +++ b/tests/benches/coprocessor_executors/index_scan/util.rs @@ -2,6 +2,7 @@ use std::{marker::PhantomData, sync::Arc}; +use api_version::ApiV1; use criterion::black_box; use futures::executor::block_on; use kvproto::coprocessor::KeyRange; @@ -33,7 +34,7 @@ impl scan_bencher::ScanExecutorBuilder for BatchIndexScan store: &Store, unique: bool, ) -> Self::E { - let mut executor = BatchIndexScanExecutor::new( + let mut executor = BatchIndexScanExecutor::<_, ApiV1>::new( black_box(TikvStorage::new( ToTxnStore::::to_store(store), false, diff --git a/tests/benches/coprocessor_executors/integrated/util.rs b/tests/benches/coprocessor_executors/integrated/util.rs index d9cb5fd2138..4b747307049 100644 --- 
a/tests/benches/coprocessor_executors/integrated/util.rs +++ b/tests/benches/coprocessor_executors/integrated/util.rs @@ -2,6 +2,7 @@ use std::{marker::PhantomData, sync::Arc}; +use api_version::ApiV1; use criterion::{black_box, measurement::Measurement}; use kvproto::coprocessor::KeyRange; use test_coprocessor::*; @@ -71,7 +72,7 @@ where store: &Store, ) { crate::util::bencher::BatchNextAllBencher::new(|| { - tidb_query_executors::runner::build_executors( + tidb_query_executors::runner::build_executors::<_, ApiV1>( black_box(executors.to_vec()), black_box(TikvStorage::new(ToTxnStore::::to_store(store), false)), black_box(ranges.to_vec()), diff --git a/tests/benches/coprocessor_executors/table_scan/util.rs b/tests/benches/coprocessor_executors/table_scan/util.rs index 2fe7c4fc4c0..0b2185074c8 100644 --- a/tests/benches/coprocessor_executors/table_scan/util.rs +++ b/tests/benches/coprocessor_executors/table_scan/util.rs @@ -2,6 +2,7 @@ use std::{marker::PhantomData, sync::Arc}; +use api_version::ApiV1; use criterion::black_box; use futures::executor::block_on; use kvproto::coprocessor::KeyRange; @@ -33,7 +34,7 @@ impl scan_bencher::ScanExecutorBuilder for BatchTableScan store: &Store, _: (), ) -> Self::E { - let mut executor = BatchTableScanExecutor::new( + let mut executor = BatchTableScanExecutor::<_, ApiV1>::new( black_box(TikvStorage::new( ToTxnStore::::to_store(store), false, diff --git a/tests/benches/coprocessor_executors/util/mod.rs b/tests/benches/coprocessor_executors/util/mod.rs index 5ef442a25cd..0a5708c74ce 100644 --- a/tests/benches/coprocessor_executors/util/mod.rs +++ b/tests/benches/coprocessor_executors/util/mod.rs @@ -8,6 +8,7 @@ pub mod store; use std::{marker::PhantomData, sync::Arc}; +use api_version::ApiV1; use criterion::{black_box, measurement::Measurement}; use kvproto::coprocessor::KeyRange; use test_coprocessor::*; @@ -41,7 +42,7 @@ pub fn build_dag_handler( let mut dag = DagRequest::default(); 
dag.set_executors(executors.to_vec().into()); - tikv::coprocessor::dag::DagHandlerBuilder::new( + tikv::coprocessor::dag::DagHandlerBuilder::<_, ApiV1>::new( black_box(dag), black_box(ranges.to_vec()), black_box(ToTxnStore::::to_store(store)), diff --git a/tests/integrations/coprocessor/test_checksum.rs b/tests/integrations/coprocessor/test_checksum.rs index 66df6b2832c..405070842b4 100644 --- a/tests/integrations/coprocessor/test_checksum.rs +++ b/tests/integrations/coprocessor/test_checksum.rs @@ -2,6 +2,7 @@ use std::u64; +use api_version::{keyspace::KvPair, ApiV1}; use futures::executor::block_on; use kvproto::{ coprocessor::{KeyRange, Request}, @@ -79,7 +80,7 @@ fn reversed_checksum_crc64_xor(store: &Store, range: KeyRange) -> Default::default(), false, ); - let mut scanner = RangesScanner::new(RangesScannerOptions { + let mut scanner = RangesScanner::<_, ApiV1>::new(RangesScannerOptions { storage: TikvStorage::new(store, false), ranges: vec![Range::from_pb_range(range, false)], scan_backward_in_range: true, @@ -89,10 +90,11 @@ fn reversed_checksum_crc64_xor(store: &Store, range: KeyRange) -> let mut checksum = 0; let digest = crc64fast::Digest::new(); - while let Some((k, v)) = block_on(scanner.next()).unwrap() { + while let Some(row) = block_on(scanner.next()).unwrap() { + let (k, v) = row.kv(); let mut digest = digest.clone(); - digest.write(&k); - digest.write(&v); + digest.write(k); + digest.write(v); checksum ^= digest.sum64(); } checksum From e2e9f9c2a62051dc21cdb28767e41e65fc79acee Mon Sep 17 00:00:00 2001 From: Connor Date: Tue, 17 Jan 2023 23:21:49 +0800 Subject: [PATCH 085/115] storage: add priority scheduling for scheduler worker (#14057) ref tikv/tikv#13730 Support priority-based scheduling for the scheduler worker pool. 
Signed-off-by: Connor1996 Co-authored-by: Xinye Tao --- Cargo.lock | 2 + components/resource_control/src/lib.rs | 4 +- .../resource_control/src/resource_group.rs | 26 +- components/server/src/server.rs | 43 +-- components/server/src/server2.rs | 37 +-- components/test_raftstore/Cargo.toml | 1 + components/test_raftstore/src/cluster.rs | 18 +- components/test_raftstore/src/node.rs | 2 + components/test_raftstore/src/server.rs | 5 + .../tikv_util/src/yatp_pool/future_pool.rs | 2 + components/tikv_util/src/yatp_pool/mod.rs | 65 ++-- src/config/mod.rs | 20 +- src/read_pool.rs | 3 +- src/server/metrics.rs | 6 + src/server/service/kv.rs | 20 ++ src/storage/mod.rs | 55 +++- src/storage/txn/commands/mod.rs | 7 + src/storage/txn/mod.rs | 2 +- src/storage/txn/sched_pool.rs | 165 ++++++++-- src/storage/txn/scheduler.rs | 289 ++++++------------ tests/Cargo.toml | 1 + tests/failpoints/cases/test_storage.rs | 5 +- 22 files changed, 441 insertions(+), 337 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 069dbc4950e..ab1d164a1e0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5884,6 +5884,7 @@ dependencies = [ "raftstore", "rand 0.8.5", "resolved_ts", + "resource_control", "resource_metering", "security", "server", @@ -5997,6 +5998,7 @@ dependencies = [ "raftstore", "rand 0.8.5", "rand_xorshift", + "resource_control", "resource_metering", "security", "serde_json", diff --git a/components/resource_control/src/lib.rs b/components/resource_control/src/lib.rs index 516e5dd6c8d..eb6679f71e8 100644 --- a/components/resource_control/src/lib.rs +++ b/components/resource_control/src/lib.rs @@ -4,7 +4,9 @@ use online_config::OnlineConfig; use serde::{Deserialize, Serialize}; mod resource_group; -pub use resource_group::{ResourceController, ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL}; +pub use resource_group::{ + ResourceConsumeType, ResourceController, ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL, +}; mod future; pub use future::ControlledFuture; diff --git 
a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index d9fa3ccf14c..70f89fd1a9d 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -382,40 +382,40 @@ mod tests { resource_manager.add_resource_group(group2); assert_eq!(resource_manager.resource_groups.len(), 2); - let resouce_ctl = resource_manager.derive_controller("test_read".into(), true); - assert_eq!(resouce_ctl.resource_consumptions.len(), 3); + let resource_ctl = resource_manager.derive_controller("test_read".into(), true); + assert_eq!(resource_ctl.resource_consumptions.len(), 3); - let group1 = resouce_ctl.resource_group("test".as_bytes()); + let group1 = resource_ctl.resource_group("test".as_bytes()); assert_eq!(group1.weight, 500); - let group2 = resouce_ctl.resource_group("test2".as_bytes()); + let group2 = resource_ctl.resource_group("test2".as_bytes()); assert_eq!(group2.weight, 250); assert_eq!(group1.current_vt(), 0); let mut extras1 = Extras::single_level(); extras1.set_metadata("test".as_bytes().to_owned()); - assert_eq!(resouce_ctl.priority_of(&extras1), 25_000); + assert_eq!(resource_ctl.priority_of(&extras1), 25_000); assert_eq!(group1.current_vt(), 25_000); let mut extras2 = Extras::single_level(); extras2.set_metadata("test2".as_bytes().to_owned()); - assert_eq!(resouce_ctl.priority_of(&extras2), 12_500); + assert_eq!(resource_ctl.priority_of(&extras2), 12_500); assert_eq!(group2.current_vt(), 12_500); let mut extras3 = Extras::single_level(); extras3.set_metadata("unknown_group".as_bytes().to_owned()); - assert_eq!(resouce_ctl.priority_of(&extras3), 50); + assert_eq!(resource_ctl.priority_of(&extras3), 50); assert_eq!( - resouce_ctl + resource_ctl .resource_group("default".as_bytes()) .current_vt(), 50 ); - resouce_ctl.consume( + resource_ctl.consume( "test".as_bytes(), ResourceConsumeType::CpuTime(Duration::from_micros(10000)), ); - resouce_ctl.consume( + 
resource_ctl.consume( "test2".as_bytes(), ResourceConsumeType::CpuTime(Duration::from_micros(10000)), ); @@ -429,7 +429,7 @@ mod tests { assert_eq!(group1_vt, 5_025_000); assert!(group2.current_vt() >= group1.current_vt() * 3 / 4); assert!( - resouce_ctl + resource_ctl .resource_group("default".as_bytes()) .current_vt() >= group1.current_vt() / 2 @@ -442,8 +442,8 @@ mod tests { let new_group = new_resource_group("new_group".into(), true, 500, 500); resource_manager.add_resource_group(new_group); - assert_eq!(resouce_ctl.resource_consumptions.len(), 4); - let group3 = resouce_ctl.resource_group("new_group".as_bytes()); + assert_eq!(resource_ctl.resource_consumptions.len(), 4); + let group3 = resource_ctl.resource_group("new_group".as_bytes()); assert_eq!(group3.weight, 200); assert!(group3.current_vt() >= group1_vt / 2); } diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 52b9fbf1d1a..cfc7e59e243 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -245,7 +245,7 @@ struct TikvServer { check_leader_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, - resource_manager: Arc, + resource_manager: Option>, causal_ts_provider: Option>, // used for rawkv apiv2 tablet_registry: Option>, br_snap_recovery_mode: bool, // use for br snapshot recovery @@ -322,23 +322,27 @@ where let config = cfg_controller.get_current(); let store_path = Path::new(&config.storage.data_dir).to_owned(); - let resource_manager = Arc::new(ResourceGroupManager::default()); - - // Initialize raftstore channels. - let (router, system) = fsm::create_raft_batch_system(&config.raft_store); let thread_count = config.server.background_thread_count; let background_worker = WorkerBuilder::new("background") .thread_count(thread_count) .create(); - // spawn a task to periodically update the minimal virtual time of all resource - // group. 
- if config.resource_control.enabled { - let resource_mgr1 = resource_manager.clone(); + + let resource_manager = if config.resource_control.enabled { + let mgr = Arc::new(ResourceGroupManager::default()); + let mgr1 = mgr.clone(); + // spawn a task to periodically update the minimal virtual time of all resource + // group. background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { - resource_mgr1.advance_min_virtual_time(); + mgr1.advance_min_virtual_time(); }); - } + Some(mgr) + } else { + None + }; + + // Initialize raftstore channels. + let (router, system) = fsm::create_raft_batch_system(&config.raft_store); let mut coprocessor_host = Some(CoprocessorHost::new( router.clone(), @@ -745,19 +749,15 @@ where } let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { - let priority_mgr = if self.config.resource_control.enabled { - Some( - self.resource_manager - .derive_controller("unified-read-pool".into(), true), - ) - } else { - None - }; + let resource_ctl = self + .resource_manager + .as_ref() + .map(|m| m.derive_controller("unified-read-pool".into(), true)); Some(build_yatp_read_pool( &self.config.readpool.unified, pd_sender.clone(), engines.engine.clone(), - priority_mgr, + resource_ctl, )) } else { None @@ -831,6 +831,9 @@ where Arc::clone(&self.quota_limiter), self.pd_client.feature_gate().clone(), self.causal_ts_provider.clone(), + self.resource_manager + .as_ref() + .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), ) .unwrap_or_else(|e| fatal!("failed to create raft storage: {}", e)); cfg_controller.register( diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 12e6af61613..03b02e5f81e 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -222,7 +222,7 @@ struct TikvServer { check_leader_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, - resource_manager: Arc, + resource_manager: Option>, causal_ts_provider: Option>, 
// used for rawkv apiv2 tablet_registry: Option>, } @@ -287,15 +287,19 @@ where config.quota.max_delay_duration, config.quota.enable_auto_tune, )); - let resource_manager = Arc::new(ResourceGroupManager::default()); - // spawn a task to periodically update the minimal virtual time of all resource - // group. - if config.resource_control.enabled { - let resource_mgr1 = resource_manager.clone(); + + let resource_manager = if config.resource_control.enabled { + let mgr = Arc::new(ResourceGroupManager::default()); + let mgr1 = mgr.clone(); + // spawn a task to periodically update the minimal virtual time of all resource + // group. background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { - resource_mgr1.advance_min_virtual_time(); + mgr1.advance_min_virtual_time(); }); - } + Some(mgr) + } else { + None + }; let mut causal_ts_provider = None; if let ApiVersion::V2 = F::TAG { @@ -634,19 +638,15 @@ where let pd_sender = raftstore_v2::FlowReporter::new(pd_worker.scheduler()); let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { - let priority_mgr = if self.config.resource_control.enabled { - Some( - self.resource_manager - .derive_controller("unified-read-pool".into(), true), - ) - } else { - None - }; + let resource_ctl = self + .resource_manager + .as_ref() + .map(|m| m.derive_controller("unified-read-pool".into(), true)); Some(build_yatp_read_pool( &self.config.readpool.unified, pd_sender.clone(), engines.engine.clone(), - priority_mgr, + resource_ctl, )) } else { None @@ -719,6 +719,9 @@ where Arc::clone(&self.quota_limiter), self.pd_client.feature_gate().clone(), self.causal_ts_provider.clone(), + self.resource_manager + .as_ref() + .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), ) .unwrap_or_else(|e| fatal!("failed to create raft storage: {}", e)); cfg_controller.register( diff --git a/components/test_raftstore/Cargo.toml b/components/test_raftstore/Cargo.toml index 71c214ae21d..25a1224e261 100644 
--- a/components/test_raftstore/Cargo.toml +++ b/components/test_raftstore/Cargo.toml @@ -49,6 +49,7 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code raftstore = { workspace = true, features = ["testexport"] } rand = "0.8" resolved_ts = { workspace = true } +resource_control = { workspace = true } resource_metering = { workspace = true } security = { workspace = true } server = { workspace = true } diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index b2330e26f93..2121b7e021f 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -46,6 +46,7 @@ use raftstore::{ }, Error, Result, }; +use resource_control::ResourceGroupManager; use tempfile::TempDir; use test_pd_client::TestPdClient; use tikv::server::Result as ServerResult; @@ -80,6 +81,7 @@ pub trait Simulator { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, + resource_manager: &Arc, ) -> ServerResult; fn stop_node(&mut self, node_id: u64); fn get_node_ids(&self) -> HashSet; @@ -174,6 +176,7 @@ pub struct Cluster { pub raft_statistics: Vec>>, pub sim: Arc>, pub pd_client: Arc, + resource_manager: Arc, } impl Cluster { @@ -207,6 +210,7 @@ impl Cluster { pd_client, sst_workers: vec![], sst_workers_map: HashMap::default(), + resource_manager: Arc::new(ResourceGroupManager::default()), kv_statistics: vec![], raft_statistics: vec![], } @@ -294,6 +298,7 @@ impl Cluster { key_mgr.clone(), router, system, + &self.resource_manager, )?; self.group_props.insert(node_id, props); self.engines.insert(node_id, engines); @@ -365,9 +370,16 @@ impl Cluster { tikv_util::thread_group::set_properties(Some(props)); debug!("calling run node"; "node_id" => node_id); // FIXME: rocksdb event listeners may not work, because we change the router. 
- self.sim - .wl() - .run_node(node_id, cfg, engines, store_meta, key_mgr, router, system)?; + self.sim.wl().run_node( + node_id, + cfg, + engines, + store_meta, + key_mgr, + router, + system, + &self.resource_manager, + )?; debug!("node {} started", node_id); Ok(()) } diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index 78d98e5a5d3..9ae76dba9f8 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -30,6 +30,7 @@ use raftstore::{ }, Result, }; +use resource_control::ResourceGroupManager; use resource_metering::CollectorRegHandle; use tempfile::TempDir; use test_pd_client::TestPdClient; @@ -229,6 +230,7 @@ impl Simulator for NodeCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, + _resource_manager: &Arc, ) -> ServerResult { assert!(node_id == 0 || !self.nodes.contains_key(&node_id)); let pd_worker = LazyWorker::new("test-pd-worker"); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index 0ec60e468ee..ccf4df43497 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -42,6 +42,7 @@ use raftstore::{ }, Result, }; +use resource_control::ResourceGroupManager; use resource_metering::{CollectorRegHandle, ResourceTagFactory}; use security::SecurityManager; use tempfile::TempDir; @@ -264,6 +265,7 @@ impl ServerCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, + resource_manager: &Arc, ) -> ServerResult { let (tmp_str, tmp) = if node_id == 0 || !self.snap_paths.contains_key(&node_id) { let p = test_util::temp_dir("test_cluster", cfg.prefer_mem); @@ -414,6 +416,7 @@ impl ServerCluster { quota_limiter.clone(), self.pd_client.feature_gate().clone(), self.get_causal_ts_provider(node_id), + Some(resource_manager.derive_controller("scheduler-worker-pool".to_owned(), true)), )?; self.storages.insert(node_id, raft_engine); @@ -649,6 +652,7 @@ impl 
Simulator for ServerCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, + resource_manager: &Arc, ) -> ServerResult { dispatch_api_version!( cfg.storage.api_version(), @@ -660,6 +664,7 @@ impl Simulator for ServerCluster { key_manager, router, system, + resource_manager, ) ) } diff --git a/components/tikv_util/src/yatp_pool/future_pool.rs b/components/tikv_util/src/yatp_pool/future_pool.rs index e74ced848c0..f010b508aaa 100644 --- a/components/tikv_util/src/yatp_pool/future_pool.rs +++ b/components/tikv_util/src/yatp_pool/future_pool.rs @@ -28,6 +28,8 @@ struct Env { } #[derive(Clone)] +// FuturePool wraps a yatp thread pool providing task count metrics and gate +// maximum running tasks. pub struct FuturePool { inner: Arc, } diff --git a/components/tikv_util/src/yatp_pool/mod.rs b/components/tikv_util/src/yatp_pool/mod.rs index 29376b904a5..305d2162482 100644 --- a/components/tikv_util/src/yatp_pool/mod.rs +++ b/components/tikv_util/src/yatp_pool/mod.rs @@ -198,42 +198,42 @@ impl YatpPoolBuilder { } } - pub fn config(&mut self, config: Config) -> &mut Self { + pub fn config(self, config: Config) -> Self { // TODO: maybe we should use (1, num_cpu) for min and max thread count. 
self.thread_count(config.workers, config.workers, config.workers) .stack_size(config.stack_size) .max_tasks(config.workers.saturating_mul(config.max_tasks_per_worker)) } - pub fn stack_size(&mut self, val: usize) -> &mut Self { + pub fn stack_size(mut self, val: usize) -> Self { self.stack_size = val; self } - pub fn name_prefix(&mut self, val: impl Into) -> &mut Self { + pub fn name_prefix(mut self, val: impl Into) -> Self { let name = val.into(); self.name_prefix = Some(name); self } pub fn thread_count( - &mut self, + mut self, min_thread_count: usize, core_thread_count: usize, max_thread_count: usize, - ) -> &mut Self { + ) -> Self { self.min_thread_count = min_thread_count; self.core_thread_count = core_thread_count; self.max_thread_count = max_thread_count; self } - pub fn max_tasks(&mut self, tasks: usize) -> &mut Self { + pub fn max_tasks(mut self, tasks: usize) -> Self { self.max_tasks = tasks; self } - pub fn before_stop(&mut self, f: F) -> &mut Self + pub fn before_stop(mut self, f: F) -> Self where F: Fn() + Send + Sync + 'static, { @@ -241,7 +241,7 @@ impl YatpPoolBuilder { self } - pub fn after_start(&mut self, f: F) -> &mut Self + pub fn after_start(mut self, f: F) -> Self where F: Fn() + Send + Sync + 'static, { @@ -249,7 +249,7 @@ impl YatpPoolBuilder { self } - pub fn before_pause(&mut self, f: F) -> &mut Self + pub fn before_pause(mut self, f: F) -> Self where F: Fn() + Send + Sync + 'static, { @@ -257,13 +257,32 @@ impl YatpPoolBuilder { self } - pub fn build_future_pool(&mut self) -> FuturePool { + pub fn build_future_pool(self) -> FuturePool { + let name = self + .name_prefix + .clone() + .unwrap_or_else(|| "yatp_pool".to_string()); + let size = self.core_thread_count; + let task = self.max_tasks; let pool = self.build_single_level_pool(); - let name = self.name_prefix.as_deref().unwrap_or("yatp_pool"); - FuturePool::from_pool(pool, name, self.core_thread_count, self.max_tasks) + FuturePool::from_pool(pool, &name, size, task) + } + + pub fn 
build_priority_future_pool( + self, + priority_provider: Arc, + ) -> FuturePool { + let name = self + .name_prefix + .clone() + .unwrap_or_else(|| "yatp_pool".to_string()); + let size = self.core_thread_count; + let task = self.max_tasks; + let pool = self.build_priority_pool(priority_provider); + FuturePool::from_pool(pool, &name, size, task) } - pub fn build_single_level_pool(&mut self) -> ThreadPool { + pub fn build_single_level_pool(self) -> ThreadPool { let (builder, runner) = self.create_builder(); builder.build_with_queue_and_runner( yatp::queue::QueueType::SingleLevel, @@ -271,9 +290,12 @@ impl YatpPoolBuilder { ) } - pub fn build_multi_level_pool(&mut self) -> ThreadPool { + pub fn build_multi_level_pool(self) -> ThreadPool { + let name = self + .name_prefix + .clone() + .unwrap_or_else(|| "yatp_pool".to_string()); let (builder, read_pool_runner) = self.create_builder(); - let name = self.name_prefix.as_deref().unwrap_or("yatp_pool"); let multilevel_builder = multilevel::Builder::new(multilevel::Config::default().name(Some(name))); let runner_builder = @@ -283,11 +305,14 @@ impl YatpPoolBuilder { } pub fn build_priority_pool( - &mut self, + self, priority_provider: Arc, ) -> ThreadPool { + let name = self + .name_prefix + .clone() + .unwrap_or_else(|| "yatp_pool".to_string()); let (builder, read_pool_runner) = self.create_builder(); - let name = self.name_prefix.as_deref().unwrap_or("yatp_pool"); let priority_builder = priority::Builder::new( priority::Config::default().name(Some(name)), priority_provider, @@ -296,8 +321,8 @@ impl YatpPoolBuilder { builder.build_with_queue_and_runner(QueueType::Priority(priority_builder), runner_builder) } - fn create_builder(&mut self) -> (yatp::Builder, YatpPoolRunner) { - let name = self.name_prefix.as_deref().unwrap_or("yatp_pool"); + fn create_builder(mut self) -> (yatp::Builder, YatpPoolRunner) { + let name = self.name_prefix.unwrap_or_else(|| "yatp_pool".to_string()); let mut builder = 
yatp::Builder::new(thd_name!(name)); builder .stack_size(self.stack_size) @@ -309,7 +334,7 @@ impl YatpPoolBuilder { let before_stop = self.before_stop.take(); let before_pause = self.before_pause.take(); let schedule_wait_duration = - metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[name]); + metrics::YATP_POOL_SCHEDULE_WAIT_DURATION_VEC.with_label_values(&[&name]); let read_pool_runner = YatpPoolRunner::new( Default::default(), self.ticker.clone(), diff --git a/src/config/mod.rs b/src/config/mod.rs index 9caa68d8e6b..7878696faa5 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -4930,14 +4930,8 @@ mod tests { let max_pool_size = std::cmp::max(4, SysQuota::cpu_cores_quota() as usize); let check_scale_pool_size = |size: usize, ok: bool| { - let origin_pool_size = scheduler - .get_sched_pool(CommandPri::Normal) - .pool - .get_pool_size(); - let origin_pool_size_high = scheduler - .get_sched_pool(CommandPri::High) - .pool - .get_pool_size(); + let origin_pool_size = scheduler.get_sched_pool().get_pool_size(CommandPri::Normal); + let origin_pool_size_high = scheduler.get_sched_pool().get_pool_size(CommandPri::High); let res = cfg_controller .update_config("storage.scheduler-worker-pool-size", &format!("{}", size)); let (expected_size, expected_size_high) = if ok { @@ -4948,17 +4942,11 @@ mod tests { (origin_pool_size, origin_pool_size_high) }; assert_eq!( - scheduler - .get_sched_pool(CommandPri::Normal) - .pool - .get_pool_size(), + scheduler.get_sched_pool().get_pool_size(CommandPri::Normal), expected_size ); assert_eq!( - scheduler - .get_sched_pool(CommandPri::High) - .pool - .get_pool_size(), + scheduler.get_sched_pool().get_pool_size(CommandPri::High), expected_size_high ); }; diff --git a/src/read_pool.rs b/src/read_pool.rs index 1a590679584..ea20b149a3d 100644 --- a/src/read_pool.rs +++ b/src/read_pool.rs @@ -295,8 +295,7 @@ pub fn build_yatp_read_pool( ) -> ReadPool { let unified_read_pool_name = get_unified_read_pool_name(); let 
raftkv = Arc::new(Mutex::new(engine)); - let mut builder = YatpPoolBuilder::new(ReporterTicker { reporter }); - builder + let builder = YatpPoolBuilder::new(ReporterTicker { reporter }) .name_prefix(&unified_read_pool_name) .stack_size(config.stack_size.0 as usize) .thread_count( diff --git a/src/server/metrics.rs b/src/server/metrics.rs index 23f8256835b..d35c58cbf34 100644 --- a/src/server/metrics.rs +++ b/src/server/metrics.rs @@ -207,6 +207,12 @@ lazy_static! { &["type"] ) .unwrap(); + pub static ref GRPC_RESOURCE_GROUP_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( + "tikv_grpc_resource_group_total", + "Total number of handle grpc message for each resource group", + &["name"] + ) + .unwrap(); pub static ref GRPC_PROXY_MSG_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( "tikv_grpc_proxy_msg_total", "Total number of handle grpc proxy message", diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 6c85741f64a..d42eb510891 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -171,6 +171,10 @@ macro_rules! 
handle_request { let begin_instant = Instant::now(); let source = req.mut_context().take_request_source(); + let resource_group_name = req.get_context().get_resource_group_name(); + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_group_name]) + .inc(); let resp = $future_name(&self.storage, req); let task = async move { let resp = resp.await?; @@ -1043,6 +1047,10 @@ fn handle_batch_commands_request( response_batch_commands_request(id, resp, tx.clone(), begin_instant, GrpcTypeKind::invalid, String::default()); }, Some(batch_commands_request::request::Cmd::Get(mut req)) => { + let resource_group_name = req.get_context().get_resource_group_name(); + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_group_name]) + .inc(); if batcher.as_mut().map_or(false, |req_batch| { req_batch.can_batch_get(&req) }) { @@ -1057,6 +1065,10 @@ fn handle_batch_commands_request( } }, Some(batch_commands_request::request::Cmd::RawGet(mut req)) => { + let resource_group_name = req.get_context().get_resource_group_name(); + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_group_name]) + .inc(); if batcher.as_mut().map_or(false, |req_batch| { req_batch.can_batch_raw_get(&req) }) { @@ -1071,6 +1083,10 @@ fn handle_batch_commands_request( } }, Some(batch_commands_request::request::Cmd::Coprocessor(mut req)) => { + let resource_group_name = req.get_context().get_resource_group_name(); + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_group_name]) + .inc(); let begin_instant = Instant::now(); let source = req.mut_context().take_request_source(); let resp = future_copr(copr, Some(peer.to_string()), req) @@ -1098,6 +1114,10 @@ fn handle_batch_commands_request( ); } $(Some(batch_commands_request::request::Cmd::$cmd(mut req)) => { + let resource_group_name = req.get_context().get_resource_group_name(); + GRPC_RESOURCE_GROUP_COUNTER_VEC + .with_label_values(&[resource_group_name]) + .inc(); let begin_instant = Instant::now(); let source = 
req.mut_context().take_request_source(); let resp = $future_fn($($arg,)* req) diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 0819c2599b9..7429ed8900b 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -89,6 +89,7 @@ use kvproto::{ use pd_client::FeatureGate; use raftstore::store::{util::build_key_range, ReadStats, TxnExt, WriteStats}; use rand::prelude::*; +use resource_control::ResourceController; use resource_metering::{FutureExt, ResourceTagFactory}; use tikv_kv::{OnAppliedCb, SnapshotExt}; use tikv_util::{ @@ -129,7 +130,7 @@ use crate::{ txn::{ commands::{RawAtomicStore, RawCompareAndSwap, TypedCommand}, flow_controller::{EngineFlowController, FlowController}, - scheduler::Scheduler as TxnScheduler, + scheduler::TxnScheduler, Command, ErrorInner as TxnError, }, types::StorageCallbackType, @@ -270,6 +271,7 @@ impl Storage { quota_limiter: Arc, feature_gate: FeatureGate, causal_ts_provider: Option>, + resource_ctl: Option>, ) -> Result { assert_eq!(config.api_version(), F::TAG, "Api version not match"); @@ -285,6 +287,7 @@ impl Storage { resource_tag_factory.clone(), Arc::clone("a_limiter), feature_gate, + resource_ctl, ); info!("Storage started."); @@ -1509,15 +1512,20 @@ impl Storage { // Schedule raw modify commands, which reuse the scheduler worker pool. // TODO: separate the txn and raw commands if needed in the future. 
- fn sched_raw_command(&self, tag: CommandKind, future: T) -> Result<()> + fn sched_raw_command( + &self, + group_name: &str, + pri: CommandPri, + tag: CommandKind, + future: T, + ) -> Result<()> where - T: Future + Send + 'static, + T: Future + Send + 'static, { SCHED_STAGE_COUNTER_VEC.get(tag).new.inc(); self.sched - .get_sched_pool(CommandPri::Normal) - .pool - .spawn(future) + .get_sched_pool() + .spawn(group_name, pri, future) .map_err(|_| Error::from(ErrorInner::SchedTooBusy)) } @@ -1955,7 +1963,10 @@ impl Storage { let provider = self.causal_ts_provider.clone(); let engine = self.engine.clone(); let concurrency_manager = self.concurrency_manager.clone(); - self.sched_raw_command(CMD, async move { + + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } @@ -2065,7 +2076,9 @@ impl Storage { let engine = self.engine.clone(); let concurrency_manager = self.concurrency_manager.clone(); let deadline = Self::get_deadline(&ctx); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } @@ -2128,7 +2141,9 @@ impl Storage { let engine = self.engine.clone(); let concurrency_manager = self.concurrency_manager.clone(); let deadline = Self::get_deadline(&ctx); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } @@ -2187,7 +2202,9 @@ impl Storage { let cf = Self::rawkv_cf(&cf, self.api_version)?; let engine = self.engine.clone(); let deadline = 
Self::get_deadline(&ctx); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } @@ -2233,7 +2250,9 @@ impl Storage { let engine = self.engine.clone(); let concurrency_manager = self.concurrency_manager.clone(); let deadline = Self::get_deadline(&ctx); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { if let Err(e) = deadline.check() { return callback(Err(Error::from(e))); } @@ -2672,7 +2691,9 @@ impl Storage { return Err(Error::from(ErrorInner::TtlNotEnabled)); } let sched = self.get_scheduler(); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { let key = F::encode_raw_key_owned(key, None); let cmd = RawCompareAndSwap::new(cf, key, previous_value, value, ttl, api_version, ctx); Self::sched_raw_atomic_command( @@ -2703,7 +2724,9 @@ impl Storage { Self::check_ttl_valid(pairs.len(), &ttls)?; let sched = self.get_scheduler(); - self.sched_raw_command(CMD, async move { + let priority = ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { let modifies = Self::raw_batch_put_requests_to_modifies(cf, pairs, ttls, None); let cmd = RawAtomicStore::new(cf, modifies, ctx); Self::sched_raw_atomic_command( @@ -2726,7 +2749,9 @@ impl Storage { Self::check_api_version(self.api_version, ctx.api_version, CMD, &keys)?; let cf = Self::rawkv_cf(&cf, self.api_version)?; let sched = self.get_scheduler(); - self.sched_raw_command(CMD, async move { + let priority = 
ctx.get_priority(); + let group_name = ctx.get_resource_group_name().to_owned(); + self.sched_raw_command(&group_name, priority, CMD, async move { // Do NOT encode ts here as RawAtomicStore use key to gen lock let modifies = keys .into_iter() @@ -3183,6 +3208,7 @@ impl TestStorageBuilder { Arc::new(QuotaLimiter::default()), latest_feature_gate(), ts_provider, + None, ) } @@ -3213,6 +3239,7 @@ impl TestStorageBuilder { Arc::new(QuotaLimiter::default()), latest_feature_gate(), None, + Some(Arc::new(ResourceController::new("test".to_owned(), false))), ) } } diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 2d79ebc97cc..5b94ea5bd85 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -715,6 +715,13 @@ impl Command { self.command_ext().get_ctx().get_priority() } + pub fn group_name(&self) -> String { + self.command_ext() + .get_ctx() + .get_resource_group_name() + .to_owned() + } + pub fn need_flow_control(&self) -> bool { !self.readonly() && self.priority() != CommandPri::High } diff --git a/src/storage/txn/mod.rs b/src/storage/txn/mod.rs index f6884b0efb8..d3b199208cb 100644 --- a/src/storage/txn/mod.rs +++ b/src/storage/txn/mod.rs @@ -32,7 +32,7 @@ pub use self::{ }, commands::{Command, RESOLVE_LOCK_BATCH_SIZE}, latch::{Latches, Lock}, - scheduler::Scheduler, + scheduler::TxnScheduler, store::{ EntryBatch, FixtureStore, FixtureStoreScanner, Scanner, SnapshotStore, Store, TxnEntry, TxnEntryScanner, TxnEntryStore, diff --git a/src/storage/txn/sched_pool.rs b/src/storage/txn/sched_pool.rs index c7c69b5bbf4..0cff9d51d41 100644 --- a/src/storage/txn/sched_pool.rs +++ b/src/storage/txn/sched_pool.rs @@ -8,14 +8,16 @@ use std::{ use collections::HashMap; use file_system::{set_io_type, IoType}; -use kvproto::pdpb::QueryKind; +use kvproto::{kvrpcpb::CommandPri, pdpb::QueryKind}; use pd_client::{Feature, FeatureGate}; use prometheus::local::*; use raftstore::store::WriteStats; +use 
resource_control::{ControlledFuture, ResourceController}; use tikv_util::{ sys::SysQuota, - yatp_pool::{FuturePool, PoolTicker, YatpPoolBuilder}, + yatp_pool::{Full, FuturePool, PoolTicker, YatpPoolBuilder}, }; +use yatp::queue::Extras; use crate::storage::{ kv::{destroy_tls_engine, set_tls_engine, Engine, FlowStatsReporter, Statistics}, @@ -41,11 +43,6 @@ thread_local! { static TLS_FEATURE_GATE: RefCell = RefCell::new(latest_feature_gate()); } -#[derive(Clone)] -pub struct SchedPool { - pub pool: FuturePool, -} - #[derive(Clone)] pub struct SchedTicker { reporter: R, @@ -57,38 +54,142 @@ impl PoolTicker for SchedTicker { } } +#[derive(Clone)] +pub enum SchedPool { + // separated thread pools for different priority commands + Vanilla { + high_worker_pool: FuturePool, + worker_pool: FuturePool, + }, + // one priority based thread pool to handle all commands + Priority { + worker_pool: FuturePool, + resource_ctl: Arc, + }, +} + impl SchedPool { pub fn new( engine: E, pool_size: usize, reporter: R, feature_gate: FeatureGate, - name_prefix: &str, + resource_ctl: Option>, ) -> Self { - let engine = Arc::new(Mutex::new(engine)); - // for low cpu quota env, set the max-thread-count as 4 to allow potential cases - // that we need more thread than cpu num. - let max_pool_size = std::cmp::max( - pool_size, - std::cmp::max(4, SysQuota::cpu_cores_quota() as usize), - ); - let pool = YatpPoolBuilder::new(SchedTicker {reporter:reporter.clone()}) - .thread_count(1, pool_size, max_pool_size) - .name_prefix(name_prefix) - // Safety: by setting `after_start` and `before_stop`, `FuturePool` ensures - // the tls_engine invariants. - .after_start(move || { - set_tls_engine(engine.lock().unwrap().clone()); - set_io_type(IoType::ForegroundWrite); - TLS_FEATURE_GATE.with(|c| *c.borrow_mut() = feature_gate.clone()); - }) - .before_stop(move || unsafe { - // Safety: we ensure the `set_` and `destroy_` calls use the same engine type. 
- destroy_tls_engine::(); - tls_flush(&reporter); - }) - .build_future_pool(); - SchedPool { pool } + let builder = |pool_size: usize, name_prefix: &str| { + let engine = Arc::new(Mutex::new(engine.clone())); + let feature_gate = feature_gate.clone(); + let reporter = reporter.clone(); + // for low cpu quota env, set the max-thread-count as 4 to allow potential cases + // that we need more thread than cpu num. + let max_pool_size = std::cmp::max( + pool_size, + std::cmp::max(4, SysQuota::cpu_cores_quota() as usize), + ); + YatpPoolBuilder::new(SchedTicker {reporter:reporter.clone()}) + .thread_count(1, pool_size, max_pool_size) + .name_prefix(name_prefix) + // Safety: by setting `after_start` and `before_stop`, `FuturePool` ensures + // the tls_engine invariants. + .after_start(move || { + set_tls_engine(engine.lock().unwrap().clone()); + set_io_type(IoType::ForegroundWrite); + TLS_FEATURE_GATE.with(|c| *c.borrow_mut() = feature_gate.clone()); + }) + .before_stop(move || unsafe { + // Safety: we ensure the `set_` and `destroy_` calls use the same engine type. 
+ destroy_tls_engine::(); + tls_flush(&reporter); + }) + }; + if let Some(ref r) = resource_ctl { + SchedPool::Priority { + worker_pool: builder(pool_size, "sched-worker-pool") + .build_priority_future_pool(r.clone()), + resource_ctl: r.clone(), + } + } else { + SchedPool::Vanilla { + worker_pool: builder(pool_size, "sched-worker-pool").build_future_pool(), + high_worker_pool: builder(std::cmp::max(1, pool_size / 2), "sched-high-pri-pool") + .build_future_pool(), + } + } + } + + pub fn spawn( + &self, + group_name: &str, + priority: CommandPri, + f: impl futures::Future + Send + 'static, + ) -> Result<(), Full> { + match self { + SchedPool::Vanilla { + high_worker_pool, + worker_pool, + } => { + if priority == CommandPri::High { + high_worker_pool.spawn(f) + } else { + worker_pool.spawn(f) + } + } + SchedPool::Priority { + worker_pool, + resource_ctl, + } => { + let fixed_level = match priority { + CommandPri::High => Some(0), + CommandPri::Normal => None, + CommandPri::Low => Some(2), + }; + // TODO: maybe use a better way to generate task_id + let task_id = rand::random::(); + let mut extras = Extras::new_multilevel(task_id, fixed_level); + extras.set_metadata(group_name.as_bytes().to_owned()); + worker_pool.spawn_with_extras( + ControlledFuture::new( + async move { + f.await; + }, + resource_ctl.clone(), + group_name.as_bytes().to_owned(), + ), + extras, + ) + } + } + } + + pub fn scale_pool_size(&self, pool_size: usize) { + match self { + SchedPool::Vanilla { + high_worker_pool, + worker_pool, + } => { + high_worker_pool.scale_pool_size(std::cmp::max(1, pool_size / 2)); + worker_pool.scale_pool_size(pool_size); + } + SchedPool::Priority { worker_pool, .. 
} => { + worker_pool.scale_pool_size(pool_size); + } + } + } + + pub fn get_pool_size(&self, priority: CommandPri) -> usize { + match self { + SchedPool::Vanilla { + high_worker_pool, + worker_pool, + } => { + if priority == CommandPri::High { + high_worker_pool.get_pool_size() + } else { + worker_pool.get_pool_size() + } + } + SchedPool::Priority { worker_pool, .. } => worker_pool.get_pool_size(), + } } } diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index d96e3e7c97f..17110a07e7b 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -1,7 +1,7 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. // #[PerformanceCriticalPath -//! Scheduler which schedules the execution of `storage::Command`s. +//! TxnScheduler which schedules the execution of `storage::Command`s. //! //! There is one scheduler for each store. It receives commands from clients, //! executes them against the MVCC layer storage engine. @@ -12,16 +12,16 @@ //! leader. When the client read or write a row, the command is sent to the //! scheduler which is on the region leader's store. //! -//! Scheduler runs in a single-thread event loop, but command executions are +//! TxnScheduler runs in a single-thread event loop, but command executions are //! delegated to a pool of worker thread. //! -//! Scheduler keeps track of all the running commands and uses latches to ensure -//! serialized access to the overlapping rows involved in concurrent commands. -//! But note that scheduler only ensures serialized access to the overlapping -//! rows at command level, but a transaction may consist of multiple commands, -//! therefore conflicts may happen at transaction level. Transaction semantics -//! is ensured by the transaction protocol implemented in the client library, -//! which is transparent to the scheduler. +//! TxnScheduler keeps track of all the running commands and uses latches to +//! 
ensure serialized access to the overlapping rows involved in concurrent +//! commands. But note that scheduler only ensures serialized access to the +//! overlapping rows at command level, but a transaction may consist of multiple +//! commands, therefore conflicts may happen at transaction level. Transaction +//! semantics is ensured by the transaction protocol implemented in the client +//! library, which is transparent to the scheduler. use std::{ marker::PhantomData, @@ -47,12 +47,11 @@ use kvproto::{ use parking_lot::{Mutex, MutexGuard, RwLockWriteGuard}; use pd_client::{Feature, FeatureGate}; use raftstore::store::TxnExt; +use resource_control::ResourceController; use resource_metering::{FutureExt, ResourceTagFactory}; use smallvec::{smallvec, SmallVec}; use tikv_kv::{Modify, Snapshot, SnapshotExt, WriteData, WriteEvent}; -use tikv_util::{ - deadline::Deadline, quota_limiter::QuotaLimiter, time::Instant, timer::GLOBAL_TIMER_HANDLE, -}; +use tikv_util::{quota_limiter::QuotaLimiter, time::Instant, timer::GLOBAL_TIMER_HANDLE}; use tracker::{get_tls_tracker_token, set_tls_tracker_token, TrackerToken}; use txn_types::TimeStamp; @@ -239,7 +238,7 @@ impl SchedulerTaskCallback { } } -struct SchedulerInner { +struct TxnSchedulerInner { // slot_id -> { cid -> `TaskContext` } in the slot. task_slots: Vec>>>, @@ -251,11 +250,8 @@ struct SchedulerInner { sched_pending_write_threshold: usize, - // worker pool - worker_pool: SchedPool, - - // high priority commands and system commands will be delivered to this pool - high_priority_pool: SchedPool, + // all tasks are executed in this pool + sched_worker_pool: SchedPool, // used to control write flow running_write_bytes: CachePadded, @@ -292,7 +288,7 @@ fn id_index(cid: u64) -> usize { cid as usize % TASKS_SLOTS_NUM } -impl SchedulerInner { +impl TxnSchedulerInner { /// Generates the next command ID. 
#[inline] fn gen_id(&self) -> u64 { @@ -375,19 +371,23 @@ impl SchedulerInner { /// /// Returns a deadline error if the deadline is exceeded. Returns the `Task` /// if all latches are acquired, returns `None` otherwise. - fn acquire_lock_on_wakeup(&self, cid: u64) -> Result, StorageError> { + fn acquire_lock_on_wakeup( + &self, + cid: u64, + ) -> Result, (String, CommandPri, StorageError)> { let mut task_slot = self.get_task_slot(cid); let tctx = task_slot.get_mut(&cid).unwrap(); // Check deadline early during acquiring latches to avoid expired requests // blocking other requests. - if let Err(e) = tctx.task.as_ref().unwrap().cmd.deadline().check() { + let cmd = &tctx.task.as_ref().unwrap().cmd; + if let Err(e) = cmd.deadline().check() { // `acquire_lock_on_wakeup` is called when another command releases its locks // and wakes up command `cid`. This command inserted its lock before // and now the lock is at the front of the queue. The actual // acquired count is one more than the `owned_count` recorded in the // lock, so we increase one to make `release` work. tctx.lock.owned_count += 1; - return Err(e.into()); + return Err((cmd.group_name(), cmd.priority(), e.into())); } if self.latches.acquire(&mut tctx.lock, cid) { tctx.on_schedule(); @@ -401,25 +401,22 @@ impl SchedulerInner { } fn scale_pool_size(&self, pool_size: usize) { - self.worker_pool.pool.scale_pool_size(pool_size); - self.high_priority_pool - .pool - .scale_pool_size(std::cmp::max(1, pool_size / 2)); + self.sched_worker_pool.scale_pool_size(pool_size); } } -/// Scheduler which schedules the execution of `storage::Command`s. +/// TxnScheduler which schedules the execution of `storage::Command`s. #[derive(Clone)] -pub struct Scheduler { - inner: Arc>, +pub struct TxnScheduler { + inner: Arc>, // The engine can be fetched from the thread local storage of scheduler threads. // So, we don't store the engine here. 
_engine: PhantomData, } -unsafe impl Send for Scheduler {} +unsafe impl Send for TxnScheduler {} -impl Scheduler { +impl TxnScheduler { /// Creates a scheduler. pub(in crate::storage) fn new( engine: E, @@ -433,6 +430,7 @@ impl Scheduler { resource_tag_factory: ResourceTagFactory, quota_limiter: Arc, feature_gate: FeatureGate, + resource_ctl: Option>, ) -> Self { let t = Instant::now_coarse(); let mut task_slots = Vec::with_capacity(TASKS_SLOTS_NUM); @@ -442,25 +440,18 @@ impl Scheduler { let lock_wait_queues = LockWaitQueues::new(lock_mgr.clone()); - let inner = Arc::new(SchedulerInner { + let inner = Arc::new(TxnSchedulerInner { task_slots, id_alloc: AtomicU64::new(0).into(), latches: Latches::new(config.scheduler_concurrency), running_write_bytes: AtomicUsize::new(0).into(), sched_pending_write_threshold: config.scheduler_pending_write_threshold.0 as usize, - worker_pool: SchedPool::new( - engine.clone(), - config.scheduler_worker_pool_size, - reporter.clone(), - feature_gate.clone(), - "sched-worker-pool", - ), - high_priority_pool: SchedPool::new( + sched_worker_pool: SchedPool::new( engine, - std::cmp::max(1, config.scheduler_worker_pool_size / 2), + config.scheduler_worker_pool_size, reporter, feature_gate.clone(), - "sched-high-pri-pool", + resource_ctl, ), control_mutex: Arc::new(tokio::sync::Mutex::new(false)), lock_mgr, @@ -481,7 +472,7 @@ impl Scheduler { t.saturating_elapsed(), "initialized the transaction scheduler" ); - Scheduler { + TxnScheduler { inner, _engine: PhantomData, } @@ -561,26 +552,19 @@ impl Scheduler { return; } let task = tctx.task.as_ref().unwrap(); - let deadline = task.cmd.deadline(); - let cmd_ctx = task.cmd.ctx().clone(); - self.fail_fast_or_check_deadline(cid, tag, cmd_ctx, deadline); + self.fail_fast_or_check_deadline(cid, &task.cmd); fail_point!("txn_scheduler_acquire_fail"); } - fn fail_fast_or_check_deadline( - &self, - cid: u64, - tag: CommandKind, - cmd_ctx: Context, - deadline: Deadline, - ) { + fn 
fail_fast_or_check_deadline(&self, cid: u64, cmd: &Command) { + let tag = cmd.tag(); + let ctx = cmd.ctx().clone(); + let deadline = cmd.deadline(); let sched = self.clone(); - self.inner - .high_priority_pool - .pool - .spawn(async move { + self.get_sched_pool() + .spawn(&cmd.group_name(), cmd.priority(), async move { match unsafe { - with_tls_engine(|engine: &mut E| engine.precheck_write_with_ctx(&cmd_ctx)) + with_tls_engine(|engine: &mut E| engine.precheck_write_with_ctx(&ctx)) } { // Precheck failed, try to return err early. Err(e) => { @@ -632,14 +616,12 @@ impl Scheduler { self.execute(task); } Ok(None) => {} - Err(err) => { + Err((group_name, pri, err)) => { // Spawn the finish task to the pool to avoid stack overflow // when many queuing tasks fail successively. let this = self.clone(); - self.inner - .worker_pool - .pool - .spawn(async move { + self.get_sched_pool() + .spawn(&group_name, pri, async move { this.finish_with_err(cid, err); }) .unwrap(); @@ -670,21 +652,17 @@ impl Scheduler { } // pub for test - pub fn get_sched_pool(&self, priority: CommandPri) -> &SchedPool { - if priority == CommandPri::High { - &self.inner.high_priority_pool - } else { - &self.inner.worker_pool - } + pub fn get_sched_pool(&self) -> &SchedPool { + &self.inner.sched_worker_pool } /// Executes the task in the sched pool. fn execute(&self, mut task: Task) { set_tls_tracker_token(task.tracker); let sched = self.clone(); - self.get_sched_pool(task.cmd.priority()) - .pool - .spawn(async move { + + self.get_sched_pool() + .spawn(&task.cmd.group_name(), task.cmd.priority(), async move { fail_point!("scheduler_start_execute"); if sched.check_task_deadline_exceeded(&task) { return; @@ -800,6 +778,7 @@ impl Scheduler { async_apply_prewrite: bool, new_acquired_locks: Vec, tag: CommandKind, + group_name: &str, ) { // TODO: Does async apply prewrite worth a special metric here? 
if pipelined { @@ -847,7 +826,7 @@ impl Scheduler { assert!(pipelined || async_apply_prewrite); } - self.on_acquired_locks_finished(new_acquired_locks); + self.on_acquired_locks_finished(group_name, new_acquired_locks); if do_wake_up { let woken_up_resumable_lock_requests = tctx.woken_up_resumable_lock_requests; @@ -932,7 +911,11 @@ impl Scheduler { ); } - fn on_release_locks(&self, released_locks: ReleasedLocks) -> SVec> { + fn on_release_locks( + &self, + group_name: &str, + released_locks: ReleasedLocks, + ) -> SVec> { // This function is always called when holding the latch of the involved keys. // So if we found the lock waiting queues are empty, there's no chance // that other threads/commands adds new lock-wait entries to the keys @@ -973,13 +956,21 @@ impl Scheduler { }); if !legacy_wake_up_list.is_empty() || !delay_wake_up_futures.is_empty() { - self.wake_up_legacy_pessimistic_locks(legacy_wake_up_list, delay_wake_up_futures); + self.wake_up_legacy_pessimistic_locks( + group_name, + legacy_wake_up_list, + delay_wake_up_futures, + ); } resumable_wake_up_list } - fn on_acquired_locks_finished(&self, new_acquired_locks: Vec) { + fn on_acquired_locks_finished( + &self, + group_name: &str, + new_acquired_locks: Vec, + ) { if new_acquired_locks.is_empty() || self.inner.lock_wait_queues.is_empty() { return; } @@ -992,9 +983,8 @@ impl Scheduler { .update_lock_wait(new_acquired_locks); } else { let lock_wait_queues = self.inner.lock_wait_queues.clone(); - self.get_sched_pool(CommandPri::High) - .pool - .spawn(async move { + self.get_sched_pool() + .spawn(group_name, CommandPri::High, async move { lock_wait_queues.update_lock_wait(new_acquired_locks); }) .unwrap(); @@ -1003,15 +993,16 @@ impl Scheduler { fn wake_up_legacy_pessimistic_locks( &self, + group_name: &str, legacy_wake_up_list: impl IntoIterator, ReleasedLock)> + Send + 'static, delayed_wake_up_futures: impl IntoIterator + Send + 'static, ) { let self1 = self.clone(); - 
self.get_sched_pool(CommandPri::High) - .pool - .spawn(async move { + let group_name1 = group_name.to_owned(); + self.get_sched_pool() + .spawn(group_name, CommandPri::High, async move { for (lock_info, released_lock) in legacy_wake_up_list { let cb = lock_info.key_cb.unwrap().into_inner(); let e = StorageError::from(Error::from(MvccError::from( @@ -1030,9 +1021,8 @@ impl Scheduler { for f in delayed_wake_up_futures { let self2 = self1.clone(); self1 - .get_sched_pool(CommandPri::High) - .pool - .spawn(async move { + .get_sched_pool() + .spawn(&group_name1, CommandPri::High, async move { let res = f.await; if let Some(resumable_lock_wait_entry) = res { self2.schedule_awakened_pessimistic_locks( @@ -1121,7 +1111,7 @@ impl Scheduler { } /// Processes a read command within a worker thread, then posts - /// `ReadFinished` message back to the `Scheduler`. + /// `ReadFinished` message back to the `TxnScheduler`. fn process_read(self, snapshot: E::Snap, task: Task, statistics: &mut Statistics) { fail_point!("txn_before_process_read"); debug!("process read cmd in worker pool"; "cid" => task.cid); @@ -1144,12 +1134,13 @@ impl Scheduler { /// Processes a write command within a worker thread, then posts either a /// `WriteFinished` message if successful or a `FinishedWithErr` message - /// back to the `Scheduler`. + /// back to the `TxnScheduler`. 
async fn process_write(self, snapshot: E::Snap, task: Task, statistics: &mut Statistics) { fail_point!("txn_before_process_write"); let write_bytes = task.cmd.write_bytes(); let tag = task.cmd.tag(); let cid = task.cid; + let group_name = task.cmd.group_name(); let tracker = task.tracker; let scheduler = self.clone(); let quota_limiter = self.inner.quota_limiter.clone(); @@ -1285,7 +1276,7 @@ impl Scheduler { } let woken_up_resumable_entries = if !released_locks.is_empty() { - scheduler.on_release_locks(released_locks) + scheduler.on_release_locks(&group_name, released_locks) } else { smallvec![] }; @@ -1306,6 +1297,7 @@ impl Scheduler { false, new_acquired_locks, tag, + &group_name, ); return; } @@ -1336,6 +1328,7 @@ impl Scheduler { false, new_acquired_locks, tag, + &group_name, ); return; } @@ -1522,6 +1515,7 @@ impl Scheduler { is_async_apply_prewrite, new_acquired_locks, tag, + &group_name, ); KV_COMMAND_KEYWRITE_HISTOGRAM_VEC .get(tag) @@ -1828,7 +1822,7 @@ mod tests { } // TODO(cosven): use this in the following test cases to reduce duplicate code. 
- fn new_test_scheduler() -> (Scheduler, RocksEngine) { + fn new_test_scheduler() -> (TxnScheduler, RocksEngine) { let engine = TestEngineBuilder::new().build().unwrap(); let config = Config { scheduler_concurrency: 1024, @@ -1838,7 +1832,7 @@ mod tests { ..Default::default() }; ( - Scheduler::new( + TxnScheduler::new( engine.clone(), MockLockManager::new(), ConcurrencyManager::new(1.into()), @@ -1854,6 +1848,7 @@ mod tests { ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), latest_feature_gate(), + Some(Arc::new(ResourceController::new("test".to_owned(), true))), ), engine, ) @@ -1978,31 +1973,7 @@ mod tests { #[test] fn test_acquire_latch_deadline() { - let engine = TestEngineBuilder::new().build().unwrap(); - let config = Config { - scheduler_concurrency: 1024, - scheduler_worker_pool_size: 1, - scheduler_pending_write_threshold: ReadableSize(100 * 1024 * 1024), - enable_async_apply_prewrite: false, - ..Default::default() - }; - let scheduler = Scheduler::new( - engine, - MockLockManager::new(), - ConcurrencyManager::new(1.into()), - &config, - DynamicConfigs { - pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), - in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), - wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), - }, - Arc::new(FlowController::Singleton(EngineFlowController::empty())), - None, - DummyReporter, - ResourceTagFactory::new_for_test(), - Arc::new(QuotaLimiter::default()), - latest_feature_gate(), - ); + let (scheduler, _) = new_test_scheduler(); let mut lock = Lock::new(&[Key::from_raw(b"b")]); let cid = scheduler.inner.gen_id(); @@ -2084,38 +2055,15 @@ mod tests { #[test] fn test_pool_available_deadline() { - let engine = TestEngineBuilder::new().build().unwrap(); - let config = Config { - scheduler_concurrency: 1024, - scheduler_worker_pool_size: 1, - scheduler_pending_write_threshold: ReadableSize(100 * 1024 * 1024), - enable_async_apply_prewrite: false, - ..Default::default() - }; - let 
scheduler = Scheduler::new( - engine, - MockLockManager::new(), - ConcurrencyManager::new(1.into()), - &config, - DynamicConfigs { - pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), - in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), - wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), - }, - Arc::new(FlowController::Singleton(EngineFlowController::empty())), - None, - DummyReporter, - ResourceTagFactory::new_for_test(), - Arc::new(QuotaLimiter::default()), - latest_feature_gate(), - ); + let (scheduler, _) = new_test_scheduler(); // Spawn a task that sleeps for 500ms to occupy the pool. The next request // cannot run within 500ms. scheduler - .get_sched_pool(CommandPri::Normal) - .pool - .spawn(async { thread::sleep(Duration::from_millis(500)) }) + .get_sched_pool() + .spawn("", CommandPri::Normal, async { + thread::sleep(Duration::from_millis(500)) + }) .unwrap(); let mut req = BatchRollbackRequest::default(); @@ -2144,31 +2092,7 @@ mod tests { #[test] fn test_flow_control_trottle_deadline() { - let engine = TestEngineBuilder::new().build().unwrap(); - let config = Config { - scheduler_concurrency: 1024, - scheduler_worker_pool_size: 1, - scheduler_pending_write_threshold: ReadableSize(100 * 1024 * 1024), - enable_async_apply_prewrite: false, - ..Default::default() - }; - let scheduler = Scheduler::new( - engine, - MockLockManager::new(), - ConcurrencyManager::new(1.into()), - &config, - DynamicConfigs { - pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), - in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), - wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), - }, - Arc::new(FlowController::Singleton(EngineFlowController::empty())), - None, - DummyReporter, - ResourceTagFactory::new_for_test(), - Arc::new(QuotaLimiter::default()), - latest_feature_gate(), - ); + let (scheduler, _) = new_test_scheduler(); let mut req = CheckTxnStatusRequest::default(); req.mut_context().max_execution_duration_ms = 100; @@ 
-2212,31 +2136,7 @@ mod tests { #[test] fn test_accumulate_many_expired_commands() { - let engine = TestEngineBuilder::new().build().unwrap(); - let config = Config { - scheduler_concurrency: 1024, - scheduler_worker_pool_size: 1, - scheduler_pending_write_threshold: ReadableSize(100 * 1024 * 1024), - enable_async_apply_prewrite: false, - ..Default::default() - }; - let scheduler = Scheduler::new( - engine, - MockLockManager::new(), - ConcurrencyManager::new(1.into()), - &config, - DynamicConfigs { - pipelined_pessimistic_lock: Arc::new(AtomicBool::new(true)), - in_memory_pessimistic_lock: Arc::new(AtomicBool::new(false)), - wake_up_delay_duration_ms: Arc::new(AtomicU64::new(0)), - }, - Arc::new(FlowController::Singleton(EngineFlowController::empty())), - None, - DummyReporter, - ResourceTagFactory::new_for_test(), - Arc::new(QuotaLimiter::default()), - latest_feature_gate(), - ); + let (scheduler, _) = new_test_scheduler(); let mut lock = Lock::new(&[Key::from_raw(b"b")]); let cid = scheduler.inner.gen_id(); @@ -2283,7 +2183,7 @@ mod tests { let feature_gate = FeatureGate::default(); feature_gate.set_version("6.0.0").unwrap(); - let scheduler = Scheduler::new( + let scheduler = TxnScheduler::new( engine, MockLockManager::new(), ConcurrencyManager::new(1.into()), @@ -2299,6 +2199,7 @@ mod tests { ResourceTagFactory::new_for_test(), Arc::new(QuotaLimiter::default()), feature_gate.clone(), + Some(Arc::new(ResourceController::new("test".to_owned(), true))), ); // Use sync mode if pipelined_pessimistic_lock is false. 
assert_eq!(scheduler.pessimistic_lock_mode(), PessimisticLockMode::Sync); diff --git a/tests/Cargo.toml b/tests/Cargo.toml index ae6c6984487..1cc0e6bce87 100644 --- a/tests/Cargo.toml +++ b/tests/Cargo.toml @@ -95,6 +95,7 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code raft_log_engine = { workspace = true } raftstore = { workspace = true } rand = "0.8.3" +resource_control = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } tempfile = "3.0" diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 2508b544285..1a7d44db972 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -312,10 +312,7 @@ fn test_scale_scheduler_pool() { .update_config("storage.scheduler-worker-pool-size", &format!("{}", size)) .unwrap(); assert_eq!( - scheduler - .get_sched_pool(CommandPri::Normal) - .pool - .get_pool_size(), + scheduler.get_sched_pool().get_pool_size(CommandPri::Normal), size ); }; From 7240e5778ef3c379b0f898c103dc675fad7af099 Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Wed, 18 Jan 2023 11:47:50 +0800 Subject: [PATCH 086/115] fix docker build (#13937) ref tikv/tikv#11312 Fix `make docker`. 
Signed-off-by: tabokie --- Dockerfile | 11 ++++++++--- cmd/build.rs | 4 +++- components/profiler/Cargo.toml | 1 + scripts/check-docker-build | 2 +- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index c4ad36dc6e7..aefa51b2222 100644 --- a/Dockerfile +++ b/Dockerfile @@ -50,6 +50,11 @@ RUN ln -s /usr/bin/cmake3 /usr/bin/cmake ENV LIBRARY_PATH /usr/local/lib:$LIBRARY_PATH ENV LD_LIBRARY_PATH /usr/local/lib:$LD_LIBRARY_PATH +# Install protoc +RUN curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.15.8/protoc-3.15.8-linux-x86_64.zip" +RUN unzip protoc-3.15.8-linux-x86_64.zip -d /usr/local/ +ENV PATH /usr/local/bin/:$PATH + # Install Rustup RUN curl https://sh.rustup.rs -sSf | sh -s -- --no-modify-path --default-toolchain none -y ENV PATH /root/.cargo/bin/:$PATH @@ -72,8 +77,7 @@ RUN mkdir -p ./cmd/tikv-ctl/src ./cmd/tikv-server/src && \ echo 'fn main() {}' > ./cmd/tikv-ctl/src/main.rs && \ echo 'fn main() {}' > ./cmd/tikv-server/src/main.rs && \ for cargotoml in $(find . -type f -name "Cargo.toml"); do \ - sed -i '/fuzz/d' ${cargotoml} && \ - sed -i '/profiler/d' ${cargotoml} ; \ + sed -i '/fuzz/d' ${cargotoml} ; \ done COPY Makefile ./ @@ -105,8 +109,9 @@ FROM pingcap/alpine-glibc COPY --from=builder /tikv/target/release/tikv-server /tikv-server COPY --from=builder /tikv/target/release/tikv-ctl /tikv-ctl +# FIXME: Figure out why libstdc++ is not staticly linked. RUN apk add --no-cache \ - curl + curl libstdc++ EXPOSE 20160 20180 diff --git a/cmd/build.rs b/cmd/build.rs index 6d11a38f705..c19797d9227 100644 --- a/cmd/build.rs +++ b/cmd/build.rs @@ -32,7 +32,9 @@ fn link_sys_lib(lib: &str, tool: &cc::Tool) { } // remove lib prefix and .a postfix. 
let libname = &lib[3..lib.len() - 2]; - println!("cargo:rustc-link-lib=static:+whole-archive={}", &libname); + // Get around the issue "the linking modifiers `+bundle` and `+whole-archive` + // are not compatible with each other when generating rlibs" + println!("cargo:rustc-link-lib=static:-bundle,+whole-archive={}", &libname); println!( "cargo:rustc-link-search=native={}", path.parent().unwrap().display() diff --git a/components/profiler/Cargo.toml b/components/profiler/Cargo.toml index b0c456b209f..e5583a631d5 100644 --- a/components/profiler/Cargo.toml +++ b/components/profiler/Cargo.toml @@ -18,4 +18,5 @@ valgrind_request = { version = "1.1.0", optional = true } [[example]] name = "prime" +path = "examples/prime.rs" required-features = ["profiling"] diff --git a/scripts/check-docker-build b/scripts/check-docker-build index 6a505f31a89..0eee0c5cf1f 100755 --- a/scripts/check-docker-build +++ b/scripts/check-docker-build @@ -2,7 +2,7 @@ # This script checks if all cargo targets have path specifications. 
set -euo pipefail -for i in $(git ls-files | grep 'Cargo.toml' | grep -v 'fuzz/\|./profiler/'); do +for i in $(git ls-files | grep 'Cargo.toml' | grep -v 'fuzz/'); do for target in "test" "bench" "bin" "example"; do # from "[[test]]" to the first trailing empty line matches=$(sed -n "/\[\[$target\]\]/,/^$/ p" $i) From b35d4fb33a18c5be9136c790e01ca449075e6acb Mon Sep 17 00:00:00 2001 From: Hu# Date: Wed, 18 Jan 2023 14:57:51 +0800 Subject: [PATCH 087/115] pd_client: fix the kvproto compatibility (#14064) close tikv/tikv#14063 make sure kvproto compatibility Signed-off-by: husharp --- Cargo.lock | 2 +- components/error_code/src/pd.rs | 1 + components/pd_client/src/client.rs | 6 +----- components/pd_client/src/client_v2.rs | 6 +----- components/pd_client/src/errors.rs | 4 ++++ components/pd_client/src/util.rs | 1 + components/resource_control/src/resource_group.rs | 8 ++++---- etc/error_code.toml | 5 +++++ 8 files changed, 18 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ab1d164a1e0..a2924314f8a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2727,7 +2727,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#a14c44ef44b378d15adb5baad8402b838f031b51" +source = "git+https://github.com/pingcap/kvproto.git#adcf4c414bfd0ccf18436b377430aa2450fd4c81" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/error_code/src/pd.rs b/components/error_code/src/pd.rs index 3ca2ac0b29f..782c4f3923b 100644 --- a/components/error_code/src/pd.rs +++ b/components/error_code/src/pd.rs @@ -12,5 +12,6 @@ define_error_codes!( REGION_NOT_FOUND => ("RegionNotFound", "", ""), STORE_TOMBSTONE => ("StoreTombstone", "", ""), GLOBAL_CONFIG_NOT_FOUND => ("GlobalConfigNotFound","",""), + DATA_COMPACTED => ("DataCompacted","",""), UNKNOWN => ("Unknown", "", "") ); diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index 5bccdcfacea..1e1e5980908 100644 --- 
a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -302,11 +302,7 @@ impl PdClient for RpcClient { Ok(grpc_response) => { let mut res = HashMap::with_capacity(grpc_response.get_items().len()); for c in grpc_response.get_items() { - if c.has_error() { - error!("failed to load global config with key {:?}", c.get_error()); - } else { - res.insert(c.get_name().to_owned(), c.get_value().to_owned()); - } + res.insert(c.get_name().to_owned(), c.get_value().to_owned()); } Ok(res) } diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs index b42d8fb3ddb..35e5c3b4785 100644 --- a/components/pd_client/src/client_v2.rs +++ b/components/pd_client/src/client_v2.rs @@ -803,11 +803,7 @@ impl PdClient for RpcClient { Ok(grpc_response) => { let mut res = HashMap::with_capacity(grpc_response.get_items().len()); for c in grpc_response.get_items() { - if c.has_error() { - error!("failed to load global config with key {:?}", c.get_error()); - } else { - res.insert(c.get_name().to_owned(), c.get_value().to_owned()); - } + res.insert(c.get_name().to_owned(), c.get_value().to_owned()); } Ok(res) } diff --git a/components/pd_client/src/errors.rs b/components/pd_client/src/errors.rs index 61adceec391..689cb276064 100644 --- a/components/pd_client/src/errors.rs +++ b/components/pd_client/src/errors.rs @@ -26,6 +26,8 @@ pub enum Error { StoreTombstone(String), #[error("global config item {0} not found")] GlobalConfigNotFound(String), + #[error("required watch revision is smaller than current compact/min revision. 
{0:?}")] + DataCompacted(String), } pub type Result = result::Result; @@ -38,6 +40,7 @@ impl Error { | Error::RegionNotFound(_) | Error::StoreTombstone(_) | Error::GlobalConfigNotFound(_) + | Error::DataCompacted(_) | Error::ClusterBootstrapped(_) | Error::Incompatible => false, } @@ -55,6 +58,7 @@ impl ErrorCodeExt for Error { Error::RegionNotFound(_) => error_code::pd::REGION_NOT_FOUND, Error::StoreTombstone(_) => error_code::pd::STORE_TOMBSTONE, Error::GlobalConfigNotFound(_) => error_code::pd::GLOBAL_CONFIG_NOT_FOUND, + Error::DataCompacted(_) => error_code::pd::DATA_COMPACTED, Error::Other(_) => error_code::pd::UNKNOWN, } } diff --git a/components/pd_client/src/util.rs b/components/pd_client/src/util.rs index 72c8cc16b04..fd58cd921d8 100644 --- a/components/pd_client/src/util.rs +++ b/components/pd_client/src/util.rs @@ -873,6 +873,7 @@ pub fn check_resp_header(header: &ResponseHeader) -> Result<()> { ErrorType::GlobalConfigNotFound => { Err(Error::GlobalConfigNotFound(err.get_message().to_owned())) } + ErrorType::DataCompacted => Err(Error::DataCompacted(err.get_message().to_owned())), ErrorType::Ok => Ok(()), ErrorType::DuplicatedEntry | ErrorType::EntryNotFound => Err(box_err!(err.get_message())), ErrorType::Unknown => Err(box_err!(err.get_message())), diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index 70f89fd1a9d..bfe9d92d0f3 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -51,12 +51,12 @@ impl ResourceGroupManager { // TODO: currently we only consider the cpu usage in the read path, we may also take // io read bytes into account later. 
(GroupMode::RawMode, true) => rg - .get_resource_settings() + .get_raw_resource_settings() .get_cpu() .get_settings() .get_fill_rate(), (GroupMode::RawMode, false) => rg - .get_resource_settings() + .get_raw_resource_settings() .get_io_write() .get_settings() .get_fill_rate(), @@ -327,7 +327,7 @@ mod tests { .set_fill_rate(write_tokens); group.set_r_u_settings(ru_setting); } else { - let mut resource_setting = GroupResourceSettings::new(); + let mut resource_setting = GroupRawResourceSettings::new(); resource_setting .mut_cpu() .mut_settings() @@ -336,7 +336,7 @@ mod tests { .mut_io_write() .mut_settings() .set_fill_rate(write_tokens); - group.set_resource_settings(resource_setting); + group.set_raw_resource_settings(resource_setting); } group } diff --git a/etc/error_code.toml b/etc/error_code.toml index 5cdd770f8d2..6b361e29e37 100644 --- a/etc/error_code.toml +++ b/etc/error_code.toml @@ -263,6 +263,11 @@ error = ''' KV:Pd:GlobalConfigNotFound ''' +["KV:Pd:DataCompacted"] +error = ''' +KV:Pd:DataCompacted +''' + ["KV:Pd:Unknown"] error = ''' KV:Pd:Unknown From 15445fd8a9c6832afeaf335a84c334fa13f6ecfe Mon Sep 17 00:00:00 2001 From: Xinye Tao Date: Thu, 19 Jan 2023 11:23:49 +0800 Subject: [PATCH 088/115] raftstore-v2: add more features to pd worker v2 (#14003) ref tikv/tikv#12842 Signed-off-by: tabokie --- components/engine_panic/src/misc.rs | 4 + components/engine_panic/src/snapshot.rs | 10 +- components/engine_rocks/src/misc.rs | 12 + components/engine_rocks/src/snapshot.rs | 10 +- components/engine_traits/src/misc.rs | 2 + components/engine_traits/src/snapshot.rs | 4 +- components/raftstore-v2/src/batch/store.rs | 17 +- components/raftstore-v2/src/lib.rs | 2 +- .../pd/{update_max_timestamp.rs => misc.rs} | 13 + components/raftstore-v2/src/worker/pd/mod.rs | 223 ++++++++-- .../pd/{region_heartbeat.rs => region.rs} | 180 +++++++- .../raftstore-v2/src/worker/pd/split.rs | 85 +++- .../pd/{store_heartbeat.rs => store.rs} | 11 + .../tests/integrations/cluster.rs | 6 
+- .../src/coprocessor/consistency_check.rs | 4 +- components/raftstore/src/store/fsm/store.rs | 5 +- components/raftstore/src/store/mod.rs | 9 +- components/raftstore/src/store/worker/mod.rs | 5 +- components/raftstore/src/store/worker/pd.rs | 408 ++++++++---------- components/server/src/server2.rs | 68 ++- components/test_raftstore/src/util.rs | 5 +- src/server/raftkv2/node.rs | 14 +- 22 files changed, 781 insertions(+), 316 deletions(-) rename components/raftstore-v2/src/worker/pd/{update_max_timestamp.rs => misc.rs} (89%) rename components/raftstore-v2/src/worker/pd/{region_heartbeat.rs => region.rs} (58%) rename components/raftstore-v2/src/worker/pd/{store_heartbeat.rs => store.rs} (96%) diff --git a/components/engine_panic/src/misc.rs b/components/engine_panic/src/misc.rs index 5e6fbe87267..93218767ec0 100644 --- a/components/engine_panic/src/misc.rs +++ b/components/engine_panic/src/misc.rs @@ -92,6 +92,10 @@ impl MiscExt for PanicEngine { panic!() } + fn get_num_keys(&self) -> Result { + panic!() + } + fn get_range_entries_and_versions( &self, cf: &str, diff --git a/components/engine_panic/src/snapshot.rs b/components/engine_panic/src/snapshot.rs index 296d7ce617a..f6cda5312cb 100644 --- a/components/engine_panic/src/snapshot.rs +++ b/components/engine_panic/src/snapshot.rs @@ -2,7 +2,9 @@ use std::ops::Deref; -use engine_traits::{IterOptions, Iterable, Iterator, Peekable, ReadOptions, Result, Snapshot}; +use engine_traits::{ + CfNamesExt, IterOptions, Iterable, Iterator, Peekable, ReadOptions, Result, Snapshot, +}; use crate::{db_vector::PanicDbVector, engine::PanicEngine}; @@ -36,6 +38,12 @@ impl Iterable for PanicSnapshot { } } +impl CfNamesExt for PanicSnapshot { + fn cf_names(&self) -> Vec<&str> { + panic!() + } +} + pub struct PanicSnapshotIterator; impl Iterator for PanicSnapshotIterator { diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index e339facaac4..3477226ae76 100644 --- 
a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -332,6 +332,18 @@ impl MiscExt for RocksEngine { .get_property_int_cf(handle, ROCKSDB_TOTAL_SST_FILES_SIZE)) } + fn get_num_keys(&self) -> Result { + let mut total = 0; + for cf in self.cf_names() { + let handle = util::get_cf_handle(self.as_inner(), cf).unwrap(); + total += self + .as_inner() + .get_property_int_cf(handle, ROCKSDB_ESTIMATE_NUM_KEYS) + .unwrap_or_default(); + } + Ok(total) + } + fn get_range_entries_and_versions( &self, cf: &str, diff --git a/components/engine_rocks/src/snapshot.rs b/components/engine_rocks/src/snapshot.rs index b19a32fd739..60a12c4ac6d 100644 --- a/components/engine_rocks/src/snapshot.rs +++ b/components/engine_rocks/src/snapshot.rs @@ -5,7 +5,9 @@ use std::{ sync::Arc, }; -use engine_traits::{self, IterOptions, Iterable, Peekable, ReadOptions, Result, Snapshot}; +use engine_traits::{ + self, CfNamesExt, IterOptions, Iterable, Peekable, ReadOptions, Result, Snapshot, +}; use rocksdb::{rocksdb_options::UnsafeSnap, DBIterator, DB}; use crate::{ @@ -95,3 +97,9 @@ impl Peekable for RocksSnapshot { Ok(v.map(RocksDbVector::from_raw)) } } + +impl CfNamesExt for RocksSnapshot { + fn cf_names(&self) -> Vec<&str> { + self.db.cf_names() + } +} diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index d9a07a1a915..5bbcbb2de79 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -115,6 +115,8 @@ pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { fn get_total_sst_files_size_cf(&self, cf: &str) -> Result>; + fn get_num_keys(&self) -> Result; + fn get_range_entries_and_versions( &self, cf: &str, diff --git a/components/engine_traits/src/snapshot.rs b/components/engine_traits/src/snapshot.rs index 7907abd1445..a5829161e25 100644 --- a/components/engine_traits/src/snapshot.rs +++ b/components/engine_traits/src/snapshot.rs @@ -2,7 +2,7 @@ use std::fmt::Debug; -use 
crate::{iterable::Iterable, peekable::Peekable}; +use crate::{iterable::Iterable, peekable::Peekable, CfNamesExt}; /// A consistent read-only view of the database. /// @@ -10,6 +10,6 @@ use crate::{iterable::Iterable, peekable::Peekable}; /// clonable, call `into_sync` to create a `SyncSnapshot`. pub trait Snapshot where - Self: 'static + Peekable + Iterable + Send + Sync + Sized + Debug, + Self: 'static + Peekable + Iterable + CfNamesExt + Send + Sync + Sized + Debug, { } diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index ccf3f19f3ea..280e8dcc396 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -26,10 +26,11 @@ use raftstore::{ store::{ fsm::store::{PeerTickBatch, ENTRY_CACHE_EVICT_TICK_DURATION}, local_metrics::RaftMetrics, - Config, ReadRunner, ReadTask, SplitCheckRunner, SplitCheckTask, StoreWriters, - TabletSnapManager, Transport, WriteSenders, + AutoSplitController, Config, ReadRunner, ReadTask, SplitCheckRunner, SplitCheckTask, + StoreWriters, TabletSnapManager, Transport, WriteSenders, }, }; +use resource_metering::CollectorRegHandle; use slog::{warn, Logger}; use tikv_util::{ box_err, @@ -511,6 +512,8 @@ impl StoreSystem { concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 coprocessor_host: CoprocessorHost, + auto_split_controller: AutoSplitController, + collector_reg_handle: CollectorRegHandle, background: Worker, pd_worker: LazyWorker, ) -> Result<()> @@ -526,7 +529,9 @@ impl StoreSystem { .broadcast_normal(|| PeerMsg::Tick(PeerTick::PdHeartbeat)); }); - let purge_worker = if raft_engine.need_manual_purge() { + let purge_worker = if raft_engine.need_manual_purge() + && !cfg.value().raft_engine_purge_interval.0.is_zero() + { let worker = Worker::new("purge-worker"); let raft_clone = raft_engine.clone(); let logger = self.logger.clone(); @@ -567,10 +572,14 @@ impl StoreSystem { workers.pd.remote(), 
concurrency_manager, causal_ts_provider, + workers.pd.scheduler(), + auto_split_controller, + store_meta.lock().unwrap().region_read_progress.clone(), + collector_reg_handle, self.logger.clone(), self.shutdown.clone(), cfg.clone(), - )); + )?); let split_check_scheduler = workers.background.start( "split-check", diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 7ddb1687d91..b82b6de3931 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -41,4 +41,4 @@ pub use bootstrap::Bootstrap; pub use fsm::StoreMeta; pub use operation::{SimpleWriteBinary, SimpleWriteEncoder, StateStorage}; pub use raftstore::{store::Config, Error, Result}; -pub use worker::pd::{FlowReporter, Task as PdTask}; +pub use worker::pd::{PdReporter, Task as PdTask}; diff --git a/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs b/components/raftstore-v2/src/worker/pd/misc.rs similarity index 89% rename from components/raftstore-v2/src/worker/pd/update_max_timestamp.rs rename to components/raftstore-v2/src/worker/pd/misc.rs index 178d00ebd15..68c624b089a 100644 --- a/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs +++ b/components/raftstore-v2/src/worker/pd/misc.rs @@ -107,4 +107,17 @@ where self.remote.spawn(f); } } + + pub fn handle_report_min_resolved_ts(&mut self, store_id: u64, min_resolved_ts: u64) { + let resp = self + .pd_client + .report_min_resolved_ts(store_id, min_resolved_ts); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + warn!(logger, "report min resolved_ts failed"; "err" => ?e); + } + }; + self.remote.spawn(f); + } } diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index b54d088db66..b23d1500914 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -10,12 +10,14 @@ use collections::HashMap; use 
concurrency_manager::ConcurrencyManager; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{metapb, pdpb}; -use pd_client::PdClient; +use pd_client::{BucketStat, PdClient}; use raftstore::store::{ - util::KeysInfoFormatter, Config, FlowStatsReporter, ReadStats, TabletSnapManager, TxnExt, - WriteStats, + util::KeysInfoFormatter, AutoSplitController, Config, FlowStatsReporter, PdStatsMonitor, + ReadStats, RegionReadProgressRegistry, SplitInfo, StoreStatsReporter, TabletSnapManager, + TxnExt, WriteStats, NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, }; -use slog::{error, info, Logger}; +use resource_metering::{Collector, CollectorRegHandle, RawRecords}; +use slog::{error, Logger}; use tikv_util::{ config::VersionTrack, time::UnixSecs, @@ -28,22 +30,36 @@ use crate::{ router::{CmdResChannel, PeerMsg}, }; -mod region_heartbeat; +mod misc; +mod region; mod split; -mod store_heartbeat; -mod update_max_timestamp; +mod store; -pub use region_heartbeat::RegionHeartbeatTask; +pub use region::RegionHeartbeatTask; + +type RecordPairVec = Vec; pub enum Task { - RegionHeartbeat(RegionHeartbeatTask), + // In store.rs. StoreHeartbeat { stats: pdpb::StoreStats, // TODO: StoreReport, StoreDrAutoSyncStatus }, + UpdateStoreInfos { + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + }, + // In region.rs. + RegionHeartbeat(RegionHeartbeatTask), + ReportRegionBuckets(BucketStat), + UpdateReadStats(ReadStats), + UpdateWriteStats(WriteStats), + UpdateRegionCpuRecords(Arc), DestroyPeer { region_id: u64, }, + // In split.rs. AskBatchSplit { region: metapb::Region, split_keys: Vec>, @@ -54,24 +70,51 @@ pub enum Task { ReportBatchSplit { regions: Vec, }, + AutoSplit { + split_infos: Vec, + }, + // In misc.rs. 
UpdateMaxTimestamp { region_id: u64, initial_status: u64, txn_ext: Arc, }, + ReportMinResolvedTs { + store_id: u64, + min_resolved_ts: u64, + }, } impl Display for Task { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match *self { + Task::StoreHeartbeat { ref stats, .. } => { + write!(f, "store heartbeat stats: {stats:?}") + } + Task::UpdateStoreInfos { + ref cpu_usages, + ref read_io_rates, + ref write_io_rates, + } => write!( + f, + "get store's information: cpu_usages {:?}, read_io_rates {:?}, write_io_rates {:?}", + cpu_usages, read_io_rates, write_io_rates, + ), Task::RegionHeartbeat(ref hb_task) => write!( f, "region heartbeat for region {:?}, leader {}", hb_task.region, hb_task.peer.get_id(), ), - Task::StoreHeartbeat { ref stats, .. } => { - write!(f, "store heartbeat stats: {:?}", stats) + Task::ReportRegionBuckets(ref buckets) => write!(f, "report buckets: {:?}", buckets), + Task::UpdateReadStats(ref stats) => { + write!(f, "update read stats: {stats:?}") + } + Task::UpdateWriteStats(ref stats) => { + write!(f, "update write stats: {stats:?}") + } + Task::UpdateRegionCpuRecords(ref cpu_records) => { + write!(f, "get region cpu records: {:?}", cpu_records) } Task::DestroyPeer { ref region_id } => { write!(f, "destroy peer of region {}", region_id) @@ -87,11 +130,22 @@ impl Display for Task { KeysInfoFormatter(split_keys.iter()) ), Task::ReportBatchSplit { ref regions } => write!(f, "report split {:?}", regions), + Task::AutoSplit { ref split_infos } => { + write!(f, "auto split split regions, num is {}", split_infos.len()) + } Task::UpdateMaxTimestamp { region_id, .. 
} => write!( f, "update the max timestamp for region {} in the concurrency manager", region_id ), + Task::ReportMinResolvedTs { + store_id, + min_resolved_ts, + } => write!( + f, + "report min resolved ts: store {}, resolved ts {}", + store_id, min_resolved_ts, + ), } } } @@ -108,16 +162,18 @@ where tablet_registry: TabletRegistry, snap_mgr: TabletSnapManager, router: StoreRouter, + stats_monitor: PdStatsMonitor, remote: Remote, - region_peers: HashMap, - - // For store_heartbeat. + // For store. start_ts: UnixSecs, - store_stat: store_heartbeat::StoreStat, + store_stat: store::StoreStat, - // For region_heartbeat. + // For region. + region_peers: HashMap, + region_buckets: HashMap, + // region_id -> total_cpu_time_ms (since last region heartbeat) region_cpu_records: HashMap, is_hb_receiver_scheduled: bool, @@ -146,21 +202,38 @@ where remote: Remote, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 + pd_scheduler: Scheduler, + auto_split_controller: AutoSplitController, + region_read_progress: RegionReadProgressRegistry, + collector_reg_handle: CollectorRegHandle, logger: Logger, shutdown: Arc, cfg: Arc>, - ) -> Self { - Self { + ) -> Result { + let mut stats_monitor = PdStatsMonitor::new( + cfg.value().pd_store_heartbeat_tick_interval.0 / NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, + cfg.value().report_min_resolved_ts_interval.0, + PdReporter::new(pd_scheduler, logger.clone()), + ); + stats_monitor.start( + auto_split_controller, + region_read_progress, + collector_reg_handle, + store_id, + )?; + Ok(Self { store_id, pd_client, raft_engine, tablet_registry, snap_mgr, router, + stats_monitor, remote, - region_peers: HashMap::default(), start_ts: UnixSecs::zero(), - store_stat: store_heartbeat::StoreStat::default(), + store_stat: store::StoreStat::default(), + region_peers: HashMap::default(), + region_buckets: HashMap::default(), region_cpu_records: HashMap::default(), is_hb_receiver_scheduled: false, concurrency_manager, @@ 
-168,7 +241,7 @@ where logger, shutdown, cfg, - } + }) } } @@ -183,8 +256,17 @@ where fn run(&mut self, task: Task) { self.maybe_schedule_heartbeat_receiver(); match task { - Task::RegionHeartbeat(task) => self.handle_region_heartbeat(task), Task::StoreHeartbeat { stats } => self.handle_store_heartbeat(stats), + Task::UpdateStoreInfos { + cpu_usages, + read_io_rates, + write_io_rates, + } => self.handle_update_store_infos(cpu_usages, read_io_rates, write_io_rates), + Task::RegionHeartbeat(task) => self.handle_region_heartbeat(task), + Task::ReportRegionBuckets(buckets) => self.handle_report_region_buckets(buckets), + Task::UpdateReadStats(stats) => self.handle_update_read_stats(stats), + Task::UpdateWriteStats(stats) => self.handle_update_write_stats(stats), + Task::UpdateRegionCpuRecords(records) => self.handle_update_region_cpu_records(records), Task::DestroyPeer { region_id } => self.handle_destroy_peer(region_id), Task::AskBatchSplit { region, @@ -194,51 +276,98 @@ where ch, } => self.handle_ask_batch_split(region, split_keys, peer, right_derive, ch), Task::ReportBatchSplit { regions } => self.handle_report_batch_split(regions), + Task::AutoSplit { split_infos } => self.handle_auto_split(split_infos), Task::UpdateMaxTimestamp { region_id, initial_status, txn_ext, } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), + Task::ReportMinResolvedTs { + store_id, + min_resolved_ts, + } => self.handle_report_min_resolved_ts(store_id, min_resolved_ts), } } } -impl Runner -where - EK: KvEngine, - ER: RaftEngine, - T: PdClient + 'static, -{ - fn handle_destroy_peer(&mut self, region_id: u64) { - match self.region_peers.remove(®ion_id) { - None => {} - Some(_) => { - info!(self.logger, "remove peer statistic record in pd"; "region_id" => region_id) - } +#[derive(Clone)] +pub struct PdReporter { + scheduler: Scheduler, + logger: Logger, +} + +impl PdReporter { + pub fn new(scheduler: Scheduler, logger: Logger) -> Self { + PdReporter { scheduler, logger 
} + } +} + +impl FlowStatsReporter for PdReporter { + fn report_read_stats(&self, stats: ReadStats) { + if let Err(e) = self.scheduler.schedule(Task::UpdateReadStats(stats)) { + error!(self.logger, "Failed to send read flow statistics"; "err" => ?e); + } + } + + fn report_write_stats(&self, stats: WriteStats) { + if let Err(e) = self.scheduler.schedule(Task::UpdateWriteStats(stats)) { + error!(self.logger, "Failed to send write flow statistics"; "err" => ?e); } } } -#[derive(Clone)] -pub struct FlowReporter { - _scheduler: Scheduler, +impl Collector for PdReporter { + fn collect(&self, records: Arc) { + self.scheduler + .schedule(Task::UpdateRegionCpuRecords(records)) + .ok(); + } } -impl FlowReporter { - pub fn new(scheduler: Scheduler) -> Self { - FlowReporter { - _scheduler: scheduler, +impl StoreStatsReporter for PdReporter { + fn report_store_infos( + &self, + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + ) { + let task = Task::UpdateStoreInfos { + cpu_usages, + read_io_rates, + write_io_rates, + }; + if let Err(e) = self.scheduler.schedule(task) { + error!( + self.logger, + "failed to send store infos to pd worker"; + "err" => ?e, + ); } } -} -impl FlowStatsReporter for FlowReporter { - fn report_read_stats(&self, _read_stats: ReadStats) { - // TODO + fn report_min_resolved_ts(&self, store_id: u64, min_resolved_ts: u64) { + let task = Task::ReportMinResolvedTs { + store_id, + min_resolved_ts, + }; + if let Err(e) = self.scheduler.schedule(task) { + error!( + self.logger, + "failed to send min resolved ts to pd worker"; + "err" => ?e, + ); + } } - fn report_write_stats(&self, _write_stats: WriteStats) { - // TODO + fn auto_split(&self, split_infos: Vec) { + let task = Task::AutoSplit { split_infos }; + if let Err(e) = self.scheduler.schedule(task) { + error!( + self.logger, + "failed to send split infos to pd worker"; + "err" => ?e, + ); + } } } diff --git 
a/components/raftstore-v2/src/worker/pd/region_heartbeat.rs b/components/raftstore-v2/src/worker/pd/region.rs similarity index 58% rename from components/raftstore-v2/src/worker/pd/region_heartbeat.rs rename to components/raftstore-v2/src/worker/pd/region.rs index 31f84801ed2..d282534329b 100644 --- a/components/raftstore-v2/src/worker/pd/region_heartbeat.rs +++ b/components/raftstore-v2/src/worker/pd/region.rs @@ -1,10 +1,15 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::time::Duration; +use std::{sync::Arc, time::Duration}; +use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{metapb, pdpb}; -use pd_client::{metrics::PD_HEARTBEAT_COUNTER_VEC, PdClient, RegionStat}; +use pd_client::{ + merge_bucket_stats, metrics::PD_HEARTBEAT_COUNTER_VEC, BucketStat, PdClient, RegionStat, +}; +use raftstore::store::{ReadStats, WriteStats}; +use resource_metering::RawRecords; use slog::{debug, info}; use tikv_util::{store::QueryStats, time::UnixSecs}; @@ -44,6 +49,58 @@ pub struct PeerStat { pub approximate_size: u64, } +#[derive(Default)] +pub struct ReportBucket { + current_stat: BucketStat, + last_report_stat: Option, + last_report_ts: UnixSecs, +} + +impl ReportBucket { + fn new(current_stat: BucketStat) -> Self { + Self { + current_stat, + ..Default::default() + } + } + + fn report(&mut self, report_ts: UnixSecs) -> BucketStat { + self.last_report_ts = report_ts; + match self.last_report_stat.replace(self.current_stat.clone()) { + Some(last) => { + let mut delta = BucketStat::new( + self.current_stat.meta.clone(), + pd_client::new_bucket_stats(&self.current_stat.meta), + ); + // Buckets may be changed, recalculate last stats according to current meta. 
+ merge_bucket_stats( + &delta.meta.keys, + &mut delta.stats, + &last.meta.keys, + &last.stats, + ); + for i in 0..delta.meta.keys.len() - 1 { + delta.stats.write_bytes[i] = + self.current_stat.stats.write_bytes[i] - delta.stats.write_bytes[i]; + delta.stats.write_keys[i] = + self.current_stat.stats.write_keys[i] - delta.stats.write_keys[i]; + delta.stats.write_qps[i] = + self.current_stat.stats.write_qps[i] - delta.stats.write_qps[i]; + + delta.stats.read_bytes[i] = + self.current_stat.stats.read_bytes[i] - delta.stats.read_bytes[i]; + delta.stats.read_keys[i] = + self.current_stat.stats.read_keys[i] - delta.stats.read_keys[i]; + delta.stats.read_qps[i] = + self.current_stat.stats.read_qps[i] - delta.stats.read_qps[i]; + } + delta + } + None => self.current_stat.clone(), + } + } +} + impl Runner where EK: KvEngine, @@ -244,4 +301,123 @@ where self.remote.spawn(f); self.is_hb_receiver_scheduled = true; } + + pub fn handle_report_region_buckets(&mut self, region_buckets: BucketStat) { + let region_id = region_buckets.meta.region_id; + self.merge_buckets(region_buckets); + let report_buckets = self.region_buckets.get_mut(®ion_id).unwrap(); + let last_report_ts = if report_buckets.last_report_ts.is_zero() { + self.start_ts + } else { + report_buckets.last_report_ts + }; + let now = UnixSecs::now(); + let interval_second = now.into_inner() - last_report_ts.into_inner(); + let delta = report_buckets.report(now); + let resp = self + .pd_client + .report_region_buckets(&delta, Duration::from_secs(interval_second)); + let logger = self.logger.clone(); + let f = async move { + if let Err(e) = resp.await { + debug!( + logger, + "failed to send buckets"; + "region_id" => region_id, + "version" => delta.meta.version, + "region_epoch" => ?delta.meta.region_epoch, + "err" => ?e + ); + } + }; + self.remote.spawn(f); + } + + pub fn handle_update_read_stats(&mut self, mut stats: ReadStats) { + for (region_id, region_info) in stats.region_infos.iter_mut() { + let peer_stat = self + 
.region_peers + .entry(*region_id) + .or_insert_with(PeerStat::default); + peer_stat.read_bytes += region_info.flow.read_bytes as u64; + peer_stat.read_keys += region_info.flow.read_keys as u64; + self.store_stat.engine_total_bytes_read += region_info.flow.read_bytes as u64; + self.store_stat.engine_total_keys_read += region_info.flow.read_keys as u64; + peer_stat + .query_stats + .add_query_stats(®ion_info.query_stats.0); + self.store_stat + .engine_total_query_num + .add_query_stats(®ion_info.query_stats.0); + } + for (_, region_buckets) in std::mem::take(&mut stats.region_buckets) { + self.merge_buckets(region_buckets); + } + if !stats.region_infos.is_empty() { + self.stats_monitor.maybe_send_read_stats(stats); + } + } + + pub fn handle_update_write_stats(&mut self, mut stats: WriteStats) { + for (region_id, region_info) in stats.region_infos.iter_mut() { + let peer_stat = self + .region_peers + .entry(*region_id) + .or_insert_with(PeerStat::default); + peer_stat.query_stats.add_query_stats(®ion_info.0); + self.store_stat + .engine_total_query_num + .add_query_stats(®ion_info.0); + } + } + + pub fn handle_update_region_cpu_records(&mut self, records: Arc) { + // Send Region CPU info to AutoSplitController inside the stats_monitor. 
+ self.stats_monitor.maybe_send_cpu_stats(&records); + Self::calculate_region_cpu_records(self.store_id, records, &mut self.region_cpu_records); + } + + pub fn handle_destroy_peer(&mut self, region_id: u64) { + match self.region_peers.remove(®ion_id) { + None => {} + Some(_) => { + info!(self.logger, "remove peer statistic record in pd"; "region_id" => region_id) + } + } + } + + fn merge_buckets(&mut self, mut buckets: BucketStat) { + let region_id = buckets.meta.region_id; + self.region_buckets + .entry(region_id) + .and_modify(|report_bucket| { + let current = &mut report_bucket.current_stat; + if current.meta < buckets.meta { + std::mem::swap(current, &mut buckets); + } + + merge_bucket_stats( + ¤t.meta.keys, + &mut current.stats, + &buckets.meta.keys, + &buckets.stats, + ); + }) + .or_insert_with(|| ReportBucket::new(buckets)); + } + + fn calculate_region_cpu_records( + store_id: u64, + records: Arc, + region_cpu_records: &mut HashMap, + ) { + for (tag, record) in &records.records { + let record_store_id = tag.store_id; + if record_store_id != store_id { + continue; + } + // Reporting a region heartbeat later will clear the corresponding record. 
+ *region_cpu_records.entry(tag.region_id).or_insert(0) += record.cpu_time; + } + } } diff --git a/components/raftstore-v2/src/worker/pd/split.rs b/components/raftstore-v2/src/worker/pd/split.rs index cb7c3ad9308..bf13e01120a 100644 --- a/components/raftstore-v2/src/worker/pd/split.rs +++ b/components/raftstore-v2/src/worker/pd/split.rs @@ -6,10 +6,12 @@ use kvproto::{ raft_cmdpb::{AdminCmdType, AdminRequest, SplitRequest}, }; use pd_client::PdClient; -use slog::{info, warn}; +use raftstore::store::SplitInfo; +use slog::{info, warn, Logger}; +use yatp::{task::future::TaskCell, Remote}; use super::{requests::*, Runner}; -use crate::router::CmdResChannel; +use crate::{batch::StoreRouter, router::CmdResChannel}; fn new_batch_split_region_request( split_keys: Vec>, @@ -37,24 +39,50 @@ where ER: RaftEngine, T: PdClient + 'static, { + #[inline] pub fn handle_ask_batch_split( &mut self, - mut region: metapb::Region, + region: metapb::Region, split_keys: Vec>, peer: metapb::Peer, right_derive: bool, ch: CmdResChannel, + ) { + Self::ask_batch_split_imp( + &self.pd_client, + &self.logger, + &self.router, + &self.remote, + region, + split_keys, + peer, + right_derive, + Some(ch), + ); + } + + fn ask_batch_split_imp( + pd_client: &T, + logger: &Logger, + router: &StoreRouter, + remote: &Remote, + mut region: metapb::Region, + split_keys: Vec>, + peer: metapb::Peer, + right_derive: bool, + ch: Option, ) { if split_keys.is_empty() { - info!(self.logger, "empty split key, skip ask batch split"; - "region_id" => region.get_id()); + info!( + logger, + "empty split key, skip ask batch split"; + "region_id" => region.get_id() + ); return; } - let resp = self - .pd_client - .ask_batch_split(region.clone(), split_keys.len()); - let router = self.router.clone(); - let logger = self.logger.clone(); + let resp = pd_client.ask_batch_split(region.clone(), split_keys.len()); + let router = router.clone(); + let logger = logger.clone(); let f = async move { match resp.await { Ok(mut resp) => 
{ @@ -73,7 +101,7 @@ where ); let region_id = region.get_id(); let epoch = region.take_region_epoch(); - send_admin_request(&logger, &router, region_id, epoch, peer, req, Some(ch)); + send_admin_request(&logger, &router, region_id, epoch, peer, req, ch); } Err(e) => { warn!( @@ -85,7 +113,7 @@ where } } }; - self.remote.spawn(f); + remote.spawn(f); } pub fn handle_report_batch_split(&mut self, regions: Vec) { @@ -98,4 +126,37 @@ where }; self.remote.spawn(f); } + + pub fn handle_auto_split(&mut self, split_infos: Vec) { + let pd_client = self.pd_client.clone(); + let logger = self.logger.clone(); + let router = self.router.clone(); + let remote = self.remote.clone(); + + let f = async move { + for split_info in split_infos { + let Ok(Some(region)) = + pd_client.get_region_by_id(split_info.region_id).await else { continue }; + // Try to split the region with the given split key. + if let Some(split_key) = split_info.split_key { + Self::ask_batch_split_imp( + &pd_client, + &logger, + &router, + &remote, + region, + vec![split_key], + split_info.peer, + true, + None, + ); + // Try to split the region on half within the given key + // range if there is no `split_key` been given. 
+ } else if split_info.start_key.is_some() && split_info.end_key.is_some() { + // TODO: implement half split + } + } + }; + self.remote.spawn(f); + } } diff --git a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs b/components/raftstore-v2/src/worker/pd/store.rs similarity index 96% rename from components/raftstore-v2/src/worker/pd/store_heartbeat.rs rename to components/raftstore-v2/src/worker/pd/store.rs index ba75354c753..8f30b85d6f3 100644 --- a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs +++ b/components/raftstore-v2/src/worker/pd/store.rs @@ -257,6 +257,17 @@ where self.remote.spawn(f); } + pub fn handle_update_store_infos( + &mut self, + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + ) { + self.store_stat.store_cpu_usages = cpu_usages; + self.store_stat.store_read_io_rates = read_io_rates; + self.store_stat.store_write_io_rates = write_io_rates; + } + /// Returns (capacity, used, available). fn collect_engine_size(&self) -> Option<(u64, u64, u64)> { let disk_stats = match fs2::statvfs(self.tablet_registry.tablet_root()) { diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index ce0248130fb..90f7c500903 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -33,7 +33,8 @@ use raftstore::{ coprocessor::CoprocessorHost, store::{ region_meta::{RegionLocalState, RegionMeta}, - Config, RegionSnapshot, TabletSnapKey, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX, + AutoSplitController, Config, RegionSnapshot, TabletSnapKey, TabletSnapManager, Transport, + RAFT_INIT_LOG_INDEX, }, }; use raftstore_v2::{ @@ -41,6 +42,7 @@ use raftstore_v2::{ router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter}, Bootstrap, SimpleWriteEncoder, StateStorage, StoreSystem, }; +use resource_metering::CollectorRegHandle; use slog::{debug, o, Logger}; use 
tempfile::TempDir; use test_pd::mocker::Service; @@ -300,6 +302,8 @@ impl RunningState { concurrency_manager, causal_ts_provider, coprocessor_host, + AutoSplitController::default(), + CollectorRegHandle::new_for_test(), background.clone(), pd_worker, ) diff --git a/components/raftstore/src/coprocessor/consistency_check.rs b/components/raftstore/src/coprocessor/consistency_check.rs index 5ba97089f85..2ebf27c963f 100644 --- a/components/raftstore/src/coprocessor/consistency_check.rs +++ b/components/raftstore/src/coprocessor/consistency_check.rs @@ -2,7 +2,7 @@ use std::marker::PhantomData; -use engine_traits::{KvEngine, Snapshot, ALL_CFS, CF_RAFT}; +use engine_traits::{KvEngine, Snapshot, CF_RAFT}; use kvproto::metapb::Region; use crate::{ @@ -63,7 +63,7 @@ fn compute_hash_on_raw(region: &Region, snap: &S) -> Result { let start_key = keys::enc_start_key(region); let end_key = keys::enc_end_key(region); - for cf in ALL_CFS { + for cf in snap.cf_names() { snap.scan(cf, &start_key, &end_key, false, |k, v| { digest.update(k); digest.update(v); diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 3724eba13e2..b75aee3b4bb 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -1516,7 +1516,9 @@ impl RaftBatchSystem { ) -> Result<()> { assert!(self.workers.is_none()); // TODO: we can get cluster meta regularly too later. 
- let purge_worker = if engines.raft.need_manual_purge() { + let purge_worker = if engines.raft.need_manual_purge() + && !cfg.value().raft_engine_purge_interval.0.is_zero() + { let worker = Worker::new("purge-worker"); let raft_clone = engines.raft.clone(); let router_clone = self.router(); @@ -1735,7 +1737,6 @@ impl RaftBatchSystem { Arc::clone(&pd_client), self.router.clone(), workers.pd_worker.scheduler(), - cfg.pd_store_heartbeat_tick_interval.0, auto_split_controller, concurrency_manager, snap_mgr, diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index 42fb320035b..fe3c12427bd 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -79,9 +79,10 @@ pub use self::{ worker::{ metrics as worker_metrics, AutoSplitController, Bucket, BucketRange, CachedReadDelegate, CheckLeaderRunner, CheckLeaderTask, FlowStatistics, FlowStatsReporter, KeyEntry, - LocalReadContext, LocalReader, LocalReaderCore, PdTask, ReadDelegate, ReadExecutor, - ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, - SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, StoreMetaDelegate, - TrackVer, WriteStats, + LocalReadContext, LocalReader, LocalReaderCore, PdStatsMonitor, PdTask, ReadDelegate, + ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, + SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, SplitInfo, + StoreMetaDelegate, StoreStatsReporter, TrackVer, WriteStats, + NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, }, }; diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index e021651ba3d..ac23f4e58d5 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -27,7 +27,8 @@ pub use self::{ consistency_check::{Runner as ConsistencyCheckRunner, Task as ConsistencyCheckTask}, pd::{ new_change_peer_v2_request, 
FlowStatistics, FlowStatsReporter, HeartbeatTask, - Runner as PdRunner, Task as PdTask, + Runner as PdRunner, StatsMonitor as PdStatsMonitor, StoreStatsReporter, Task as PdTask, + NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, }, raftlog_gc::{Runner as RaftlogGcRunner, Task as RaftlogGcTask}, read::{ @@ -44,5 +45,5 @@ pub use self::{ Bucket, BucketRange, KeyEntry, Runner as SplitCheckRunner, Task as SplitCheckTask, }, split_config::{SplitConfig, SplitConfigManager}, - split_controller::{AutoSplitController, ReadStats, SplitConfigChange, WriteStats}, + split_controller::{AutoSplitController, ReadStats, SplitConfigChange, SplitInfo, WriteStats}, }; diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index fdfa1b44c85..79b58d75c83 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -69,6 +69,8 @@ use crate::{ }, }; +pub const NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT: u32 = 2; + type RecordPairVec = Vec; #[derive(Default, Debug, Clone)] @@ -189,7 +191,6 @@ where id: u64, duration: RaftstoreDuration, }, - UpdateRegionCpuCollector(bool), RegionCpuRecords(Arc), ReportMinResolvedTs { store_id: u64, @@ -267,7 +268,7 @@ pub struct PeerStat { } #[derive(Default)] -pub struct ReportBucket { +struct ReportBucket { current_stat: BucketStat, last_report_stat: Option, last_report_ts: UnixSecs, @@ -418,12 +419,6 @@ where Task::UpdateSlowScore { id, ref duration } => { write!(f, "compute slow score: id {}, duration {:?}", id, duration) } - Task::UpdateRegionCpuCollector(is_register) => { - if is_register { - return write!(f, "register region cpu collector"); - } - write!(f, "deregister region cpu collector") - } Task::RegionCpuRecords(ref cpu_records) => { write!(f, "get region cpu records: {:?}", cpu_records) } @@ -476,12 +471,83 @@ fn convert_record_pairs(m: HashMap) -> RecordPairVec { .collect() } -struct StatsMonitor +#[derive(Clone)] +pub struct WrappedScheduler(Scheduler>); + 
+impl Collector for WrappedScheduler where EK: KvEngine, ER: RaftEngine, { - scheduler: Scheduler>, + fn collect(&self, records: Arc) { + self.0.schedule(Task::RegionCpuRecords(records)).ok(); + } +} + +pub trait StoreStatsReporter: Send + Clone + Sync + 'static + Collector { + fn report_store_infos( + &self, + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + ); + fn report_min_resolved_ts(&self, store_id: u64, min_resolved_ts: u64); + fn auto_split(&self, split_infos: Vec); +} + +impl StoreStatsReporter for WrappedScheduler +where + EK: KvEngine, + ER: RaftEngine, +{ + fn report_store_infos( + &self, + cpu_usages: RecordPairVec, + read_io_rates: RecordPairVec, + write_io_rates: RecordPairVec, + ) { + let task = Task::StoreInfos { + cpu_usages, + read_io_rates, + write_io_rates, + }; + if let Err(e) = self.0.schedule(task) { + error!( + "failed to send store infos to pd worker"; + "err" => ?e, + ); + } + } + + fn report_min_resolved_ts(&self, store_id: u64, min_resolved_ts: u64) { + let task = Task::ReportMinResolvedTs { + store_id, + min_resolved_ts, + }; + if let Err(e) = self.0.schedule(task) { + error!( + "failed to send min resolved ts to pd worker"; + "err" => ?e, + ); + } + } + + fn auto_split(&self, split_infos: Vec) { + let task = Task::AutoSplit { split_infos }; + if let Err(e) = self.0.schedule(task) { + error!( + "failed to send split infos to pd worker"; + "err" => ?e, + ); + } + } +} + +pub struct StatsMonitor +where + T: StoreStatsReporter, +{ + reporter: T, handle: Option>, timer: Option>, read_stats_sender: Option>, @@ -492,18 +558,13 @@ where report_min_resolved_ts_interval: Duration, } -impl StatsMonitor +impl StatsMonitor where - EK: KvEngine, - ER: RaftEngine, + T: StoreStatsReporter, { - pub fn new( - interval: Duration, - report_min_resolved_ts_interval: Duration, - scheduler: Scheduler>, - ) -> Self { + pub fn new(interval: Duration, report_min_resolved_ts_interval: Duration, reporter: T) -> Self 
{ StatsMonitor { - scheduler, + reporter, handle: None, timer: None, read_stats_sender: None, @@ -524,11 +585,10 @@ where &mut self, mut auto_split_controller: AutoSplitController, region_read_progress: RegionReadProgressRegistry, + collector_reg_handle: CollectorRegHandle, store_id: u64, ) -> Result<(), io::Error> { - if self.collect_tick_interval < default_collect_tick_interval() - || self.collect_store_infos_interval < self.collect_tick_interval - { + if self.collect_tick_interval < default_collect_tick_interval() { info!( "interval is too small, skip stats monitoring. If we are running tests, it is normal, otherwise a check is needed." ); @@ -555,7 +615,7 @@ where let (cpu_stats_sender, cpu_stats_receiver) = mpsc::channel(); self.cpu_stats_sender = Some(cpu_stats_sender); - let scheduler = self.scheduler.clone(); + let reporter = self.reporter.clone(); let props = tikv_util::thread_group::current_properties(); fn is_enable_tick(timer_cnt: u64, interval: u64) -> bool { @@ -570,13 +630,23 @@ where // make sure the record won't be disturbed. let mut collect_store_infos_thread_stats = ThreadInfoStatistics::new(); let mut load_base_split_thread_stats = ThreadInfoStatistics::new(); + let mut region_cpu_records_collector = None; + // Register the region CPU records collector. 
+ if auto_split_controller + .cfg + .region_cpu_overload_threshold_ratio + > 0.0 + { + region_cpu_records_collector = + Some(collector_reg_handle.register(Box::new(reporter.clone()), false)); + } while let Err(mpsc::RecvTimeoutError::Timeout) = timer_rx.recv_timeout(tick_interval) { if is_enable_tick(timer_cnt, collect_store_infos_interval) { StatsMonitor::collect_store_infos( &mut collect_store_infos_thread_stats, - &scheduler, + &reporter, ); } if is_enable_tick(timer_cnt, load_base_split_check_interval) { @@ -585,14 +655,15 @@ where &read_stats_receiver, &cpu_stats_receiver, &mut load_base_split_thread_stats, - &scheduler, + &reporter, + &collector_reg_handle, + &mut region_cpu_records_collector, ); } if is_enable_tick(timer_cnt, report_min_resolved_ts_interval) { - StatsMonitor::report_min_resolved_ts( - ®ion_read_progress, + reporter.report_min_resolved_ts( store_id, - &scheduler, + region_read_progress.get_min_resolved_ts(), ); } timer_cnt += 1; @@ -604,26 +675,13 @@ where Ok(()) } - pub fn collect_store_infos( - thread_stats: &mut ThreadInfoStatistics, - scheduler: &Scheduler>, - ) { + pub fn collect_store_infos(thread_stats: &mut ThreadInfoStatistics, reporter: &T) { thread_stats.record(); let cpu_usages = convert_record_pairs(thread_stats.get_cpu_usages()); let read_io_rates = convert_record_pairs(thread_stats.get_read_io_rates()); let write_io_rates = convert_record_pairs(thread_stats.get_write_io_rates()); - let task = Task::StoreInfos { - cpu_usages, - read_io_rates, - write_io_rates, - }; - if let Err(e) = scheduler.schedule(task) { - error!( - "failed to send store infos to pd worker"; - "err" => ?e, - ); - } + reporter.report_store_infos(cpu_usages, read_io_rates, write_io_rates); } pub fn load_base_split( @@ -631,16 +689,19 @@ where read_stats_receiver: &Receiver, cpu_stats_receiver: &Receiver>, thread_stats: &mut ThreadInfoStatistics, - scheduler: &Scheduler>, + reporter: &T, + collector_reg_handle: &CollectorRegHandle, + 
region_cpu_records_collector: &mut Option, ) { let start_time = TiInstant::now(); match auto_split_controller.refresh_and_check_cfg() { SplitConfigChange::UpdateRegionCpuCollector(is_register) => { - if let Err(e) = scheduler.schedule(Task::UpdateRegionCpuCollector(is_register)) { - error!( - "failed to register or deregister the region cpu collector"; - "is_register" => is_register, - "err" => ?e, + // If it's a deregister task, just take and drop the original collector. + if !is_register { + region_cpu_records_collector.take(); + } else { + region_cpu_records_collector.get_or_insert( + collector_reg_handle.register(Box::new(reporter.clone()), false), ); } } @@ -658,13 +719,7 @@ where let (top_qps, split_infos) = auto_split_controller.flush(read_stats_vec, cpu_stats_vec, thread_stats); auto_split_controller.clear(); - let task = Task::AutoSplit { split_infos }; - if let Err(e) = scheduler.schedule(task) { - error!( - "failed to send split infos to pd worker"; - "err" => ?e, - ); - } + reporter.auto_split(split_infos); for i in 0..TOP_N { if i < top_qps.len() { READ_QPS_TOPN @@ -677,23 +732,6 @@ where LOAD_BASE_SPLIT_DURATION_HISTOGRAM.observe(start_time.saturating_elapsed_secs()); } - pub fn report_min_resolved_ts( - region_read_progress: &RegionReadProgressRegistry, - store_id: u64, - scheduler: &Scheduler>, - ) { - let task = Task::ReportMinResolvedTs { - store_id, - min_resolved_ts: region_read_progress.get_min_resolved_ts(), - }; - if let Err(e) = scheduler.schedule(task) { - error!( - "failed to send min resolved ts to pd worker"; - "err" => ?e, - ); - } - } - pub fn stop(&mut self) { if let Some(h) = self.handle.take() { drop(self.timer.take()); @@ -705,14 +743,22 @@ where } } - #[inline(always)] - fn get_read_stats_sender(&self) -> &Option> { - &self.read_stats_sender + #[inline] + pub fn maybe_send_read_stats(&self, read_stats: ReadStats) { + if let Some(sender) = &self.read_stats_sender { + if sender.send(read_stats).is_err() { + warn!("send read_stats 
failed, are we shutting down?") + } + } } - #[inline(always)] - fn get_cpu_stats_sender(&self) -> &Option>> { - &self.cpu_stats_sender + #[inline] + pub fn maybe_send_cpu_stats(&self, cpu_stats: &Arc) { + if let Some(sender) = &self.cpu_stats_sender { + if sender.send(cpu_stats.clone()).is_err() { + warn!("send region cpu info failed, are we shutting down?") + } + } } } @@ -845,37 +891,6 @@ impl SlowScore { } } -// RegionCpuMeteringCollector is used to collect the region-related CPU info. -struct RegionCpuMeteringCollector -where - EK: KvEngine, - ER: RaftEngine, -{ - scheduler: Scheduler>, -} - -impl RegionCpuMeteringCollector -where - EK: KvEngine, - ER: RaftEngine, -{ - fn new(scheduler: Scheduler>) -> RegionCpuMeteringCollector { - RegionCpuMeteringCollector { scheduler } - } -} - -impl Collector for RegionCpuMeteringCollector -where - EK: KvEngine, - ER: RaftEngine, -{ - fn collect(&self, records: Arc) { - self.scheduler - .schedule(Task::RegionCpuRecords(records)) - .ok(); - } -} - pub struct Runner where EK: KvEngine, @@ -896,11 +911,9 @@ where // actually it is the sender connected to Runner's Worker which // calls Runner's run() on Task received. 
scheduler: Scheduler>, - stats_monitor: StatsMonitor, + stats_monitor: StatsMonitor>, store_heartbeat_interval: Duration, - collector_reg_handle: CollectorRegHandle, - region_cpu_records_collector: Option, // region_id -> total_cpu_time_ms (since last region heartbeat) region_cpu_records: HashMap, @@ -922,15 +935,12 @@ where ER: RaftEngine, T: PdClient + 'static, { - const INTERVAL_DIVISOR: u32 = 2; - pub fn new( cfg: &Config, store_id: u64, pd_client: Arc, router: RaftRouter, scheduler: Scheduler>, - store_heartbeat_interval: Duration, auto_split_controller: AutoSplitController, concurrency_manager: ConcurrencyManager, snap_mgr: SnapManager, @@ -941,25 +951,19 @@ where coprocessor_host: CoprocessorHost, causal_ts_provider: Option>, // used for rawkv apiv2 ) -> Runner { - // Register the region CPU records collector. - let mut region_cpu_records_collector = None; - if auto_split_controller - .cfg - .region_cpu_overload_threshold_ratio - > 0.0 - { - region_cpu_records_collector = Some(collector_reg_handle.register( - Box::new(RegionCpuMeteringCollector::new(scheduler.clone())), - false, - )); - } - let interval = store_heartbeat_interval / Self::INTERVAL_DIVISOR; + let store_heartbeat_interval = cfg.pd_store_heartbeat_tick_interval.0; + let interval = store_heartbeat_interval / NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT; let mut stats_monitor = StatsMonitor::new( interval, cfg.report_min_resolved_ts_interval.0, - scheduler.clone(), + WrappedScheduler(scheduler.clone()), ); - if let Err(e) = stats_monitor.start(auto_split_controller, region_read_progress, store_id) { + if let Err(e) = stats_monitor.start( + auto_split_controller, + region_read_progress, + collector_reg_handle, + store_id, + ) { error!("failed to start stats collector, error = {:?}", e); } @@ -975,8 +979,6 @@ where scheduler, store_heartbeat_interval, stats_monitor, - collector_reg_handle, - region_cpu_records_collector, region_cpu_records: HashMap::default(), concurrency_manager, snap_mgr, @@ -1041,21 
+1043,6 @@ where self.remote.spawn(f); } - fn handle_update_region_cpu_collector(&mut self, is_register: bool) { - // If it's a deregister task, just take and drop the original collector. - if !is_register { - self.region_cpu_records_collector.take(); - return; - } - if self.region_cpu_records_collector.is_some() { - return; - } - self.region_cpu_records_collector = Some(self.collector_reg_handle.register( - Box::new(RegionCpuMeteringCollector::new(self.scheduler.clone())), - false, - )); - } - // Note: The parameter doesn't contain `self` because this function may // be called in an asynchronous context. fn handle_ask_batch_split( @@ -1604,11 +1591,7 @@ where self.merge_buckets(region_buckets); } if !read_stats.region_infos.is_empty() { - if let Some(sender) = self.stats_monitor.get_read_stats_sender() { - if sender.send(read_stats).is_err() { - warn!("send read_stats failed, are we shutting down?") - } - } + self.stats_monitor.maybe_send_read_stats(read_stats); } } @@ -1756,11 +1739,7 @@ where // TODO: more accurate CPU consumption of a specified region. fn handle_region_cpu_records(&mut self, records: Arc) { // Send Region CPU info to AutoSplitController inside the stats_monitor. - if let Some(cpu_stats_sender) = self.stats_monitor.get_cpu_stats_sender() { - if cpu_stats_sender.send(records.clone()).is_err() { - warn!("send region cpu info failed, are we shutting down?") - } - } + self.stats_monitor.maybe_send_cpu_stats(&records); calculate_region_cpu_records(self.store_id, records, &mut self.region_cpu_records); } @@ -1856,22 +1835,10 @@ where stats.set_is_busy(true); // We do not need to report store_info, so we just set `None` here. 
- let task = Task::StoreHeartbeat { - stats, - store_info: None, - report: None, - dr_autosync_status: None, - }; - if let Err(e) = self.scheduler.schedule(task) { - error!("force report store heartbeat failed"; - "store_id" => self.store_id, - "err" => ?e - ); - } else { - warn!("scheduling store_heartbeat timeout, force report store slow score to pd."; - "store_id" => self.store_id, - ); - } + self.handle_store_heartbeat(stats, None, None, None); + warn!("scheduling store_heartbeat timeout, force report store slow score to pd."; + "store_id" => self.store_id, + ); } fn is_store_heartbeat_delayed(&self) -> bool { @@ -1954,48 +1921,43 @@ where let f = async move { for split_info in split_infos { - if let Ok(Some(region)) = - pd_client.get_region_by_id(split_info.region_id).await - { - // Try to split the region with the given split key. - if let Some(split_key) = split_info.split_key { - Self::handle_ask_batch_split( - router.clone(), - scheduler.clone(), - pd_client.clone(), - region, - vec![split_key], - split_info.peer, - true, - Callback::None, - String::from("auto_split"), - remote.clone(), + let Ok(Some(region)) = + pd_client.get_region_by_id(split_info.region_id).await else { continue }; + // Try to split the region with the given split key. + if let Some(split_key) = split_info.split_key { + Self::handle_ask_batch_split( + router.clone(), + scheduler.clone(), + pd_client.clone(), + region, + vec![split_key], + split_info.peer, + true, + Callback::None, + String::from("auto_split"), + remote.clone(), + ); + // Try to split the region on half within the given key + // range if there is no `split_key` been given. 
+ } else if split_info.start_key.is_some() && split_info.end_key.is_some() { + let start_key = split_info.start_key.unwrap(); + let end_key = split_info.end_key.unwrap(); + let region_id = region.get_id(); + let msg = CasualMessage::HalfSplitRegion { + region_epoch: region.get_region_epoch().clone(), + start_key: Some(start_key.clone()), + end_key: Some(end_key.clone()), + policy: pdpb::CheckPolicy::Scan, + source: "auto_split", + cb: Callback::None, + }; + if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) { + error!("send auto half split request failed"; + "region_id" => region_id, + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + "err" => ?e, ); - return; - } - // Try to split the region on half within the given key range - // if there is no `split_key` been given. - if split_info.start_key.is_some() && split_info.end_key.is_some() { - let start_key = split_info.start_key.unwrap(); - let end_key = split_info.end_key.unwrap(); - let region_id = region.get_id(); - let msg = CasualMessage::HalfSplitRegion { - region_epoch: region.get_region_epoch().clone(), - start_key: Some(start_key.clone()), - end_key: Some(end_key.clone()), - policy: pdpb::CheckPolicy::Scan, - source: "auto_split", - cb: Callback::None, - }; - if let Err(e) = router.send(region_id, PeerMsg::CasualMessage(msg)) - { - error!("send auto half split request failed"; - "region_id" => region_id, - "start_key" => log_wrappers::Value::key(&start_key), - "end_key" => log_wrappers::Value::key(&end_key), - "err" => ?e, - ); - } } } } @@ -2124,9 +2086,6 @@ where } => self.handle_update_max_timestamp(region_id, initial_status, txn_ext), Task::QueryRegionLeader { region_id } => self.handle_query_region_leader(region_id), Task::UpdateSlowScore { id, duration } => self.slow_score.record(id, duration.sum()), - Task::UpdateRegionCpuCollector(is_register) => { - self.handle_update_region_cpu_collector(is_register) - } 
Task::RegionCpuRecords(records) => self.handle_region_cpu_records(records), Task::ReportMinResolvedTs { store_id, @@ -2469,7 +2428,7 @@ mod tests { struct RunnerTest { store_stat: Arc>, - stats_monitor: StatsMonitor, + stats_monitor: StatsMonitor>, } impl RunnerTest { @@ -2481,13 +2440,16 @@ mod tests { let mut stats_monitor = StatsMonitor::new( Duration::from_secs(interval), Duration::from_secs(0), - scheduler, + WrappedScheduler(scheduler), ); let store_meta = Arc::new(Mutex::new(StoreMeta::new(0))); let region_read_progress = store_meta.lock().unwrap().region_read_progress.clone(); - if let Err(e) = - stats_monitor.start(AutoSplitController::default(), region_read_progress, 1) - { + if let Err(e) = stats_monitor.start( + AutoSplitController::default(), + region_read_progress, + CollectorRegHandle::new_for_test(), + 1, + ) { error!("failed to start stats collector, error = {:?}", e); } diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 03b02e5f81e..36a02130fdb 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -47,7 +47,10 @@ use file_system::{ use futures::executor::block_on; use grpcio::{EnvBuilder, Environment}; use grpcio_health::HealthService; -use kvproto::{deadlock::create_deadlock, diagnosticspb::create_diagnostics, kvrpcpb::ApiVersion}; +use kvproto::{ + deadlock::create_deadlock, diagnosticspb::create_diagnostics, kvrpcpb::ApiVersion, + resource_usage_agent::create_resource_metering_pub_sub, +}; use pd_client::{PdClient, RpcClient}; use raft_log_engine::RaftLogEngine; use raftstore::{ @@ -56,8 +59,8 @@ use raftstore::{ RawConsistencyCheckObserver, }, store::{ - memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, CheckLeaderRunner, SplitConfigManager, - TabletSnapManager, + memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, AutoSplitController, CheckLeaderRunner, + SplitConfigManager, TabletSnapManager, }, RegionInfoAccessor, }; @@ -68,7 +71,7 @@ use tikv::{ config::{ConfigController, 
DbConfigManger, DbType, LogConfigManager, TikvConfig}, coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, coprocessor_v2, - read_pool::{build_yatp_read_pool, ReadPool}, + read_pool::{build_yatp_read_pool, ReadPool, ReadPoolConfigManager}, server::{ config::{Config as ServerConfig, ServerConfigManager}, gc_worker::{AutoGcConfig, GcWorker}, @@ -235,6 +238,7 @@ struct TikvEngines { struct Servers { lock_mgr: LockManager, server: LocalServer, + rsmeter_pubsub_service: resource_metering::PubSubService, } type LocalServer = Server>; @@ -635,7 +639,10 @@ where let engines = self.engines.as_ref().unwrap(); let pd_worker = LazyWorker::new("pd-worker"); - let pd_sender = raftstore_v2::FlowReporter::new(pd_worker.scheduler()); + let pd_sender = raftstore_v2::PdReporter::new( + pd_worker.scheduler(), + slog_global::borrow_global().new(slog::o!()), + ); let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { let resource_ctl = self @@ -674,15 +681,16 @@ where let (reporter_notifier, data_sink_reg_handle, reporter_worker) = resource_metering::init_reporter( self.config.resource_metering.clone(), - collector_reg_handle, + collector_reg_handle.clone(), ); self.to_stop.push(reporter_worker); let (address_change_notifier, single_target_worker) = resource_metering::init_single_target( self.config.resource_metering.receiver_address.clone(), self.env.clone(), - data_sink_reg_handle, + data_sink_reg_handle.clone(), ); self.to_stop.push(single_target_worker); + let rsmeter_pubsub_service = resource_metering::PubSubService::new(data_sink_reg_handle); let cfg_manager = resource_metering::ConfigManager::new( self.config.resource_metering.clone(), @@ -769,6 +777,22 @@ where cop_read_pools.handle() }; + let mut unified_read_pool_scale_receiver = None; + if self.config.readpool.is_unified_pool_enabled() { + let (unified_read_pool_scale_notifier, rx) = mpsc::sync_channel(10); + cfg_controller.register( + tikv::config::Module::Readpool, + 
Box::new(ReadPoolConfigManager::new( + unified_read_pool.as_ref().unwrap().handle(), + unified_read_pool_scale_notifier, + &self.background_worker, + self.config.readpool.unified.max_thread_count, + self.config.readpool.unified.auto_adjust_pool_size, + )), + ); + unified_read_pool_scale_receiver = Some(rx); + } + let check_leader_runner = CheckLeaderRunner::new( self.router.as_ref().unwrap().store_meta().clone(), self.coprocessor_host.clone().unwrap(), @@ -828,7 +852,17 @@ where let split_config_manager = SplitConfigManager::new(Arc::new(VersionTrack::new(self.config.split.clone()))); - cfg_controller.register(tikv::config::Module::Split, Box::new(split_config_manager)); + cfg_controller.register( + tikv::config::Module::Split, + Box::new(split_config_manager.clone()), + ); + + let auto_split_controller = AutoSplitController::new( + split_config_manager, + self.config.server.grpc_concurrency, + self.config.readpool.unified.max_thread_count, + unified_read_pool_scale_receiver, + ); // `ConsistencyCheckObserver` must be registered before `Node::start`. 
let safe_point = Arc::new(AtomicU64::new(0)); @@ -858,6 +892,8 @@ where self.concurrency_manager.clone(), self.causal_ts_provider.clone(), self.coprocessor_host.clone().unwrap(), + auto_split_controller, + collector_reg_handle, self.background_worker.clone(), pd_worker, raft_store, @@ -882,7 +918,11 @@ where initial_metric(&self.config.metric); - self.servers = Some(Servers { lock_mgr, server }); + self.servers = Some(Servers { + lock_mgr, + server, + rsmeter_pubsub_service, + }); server_config } @@ -923,6 +963,16 @@ where &self.config.pessimistic_txn, ) .unwrap_or_else(|e| fatal!("failed to start lock manager: {}", e)); + + if servers + .server + .register_service(create_resource_metering_pub_sub( + servers.rsmeter_pubsub_service.clone(), + )) + .is_some() + { + warn!("failed to register resource metering pubsub service"); + } } fn init_io_utility(&mut self) -> BytesFetcher { diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index d5c2eefa6d6..8b3745120d5 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -16,7 +16,8 @@ use encryption_export::{ use engine_rocks::{config::BlobRunMode, RocksEngine, RocksSnapshot, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ - Engines, Iterable, Peekable, RaftEngineDebug, RaftEngineReadOnly, ALL_CFS, CF_DEFAULT, CF_RAFT, + CfNamesExt, Engines, Iterable, Peekable, RaftEngineDebug, RaftEngineReadOnly, CF_DEFAULT, + CF_RAFT, }; use file_system::IoRateLimiter; use futures::executor::block_on; @@ -101,7 +102,7 @@ pub fn must_region_cleared(engine: &Engines, region assert_eq!(state.get_state(), PeerState::Tombstone, "{:?}", state); let start_key = keys::data_key(region.get_start_key()); let end_key = keys::data_key(region.get_end_key()); - for cf in ALL_CFS { + for cf in engine.kv.cf_names() { engine .kv .scan(cf, &start_key, &end_key, false, |k, v| { diff --git a/src/server/raftkv2/node.rs b/src/server/raftkv2/node.rs 
index b876951894c..588e8ae9e9b 100644 --- a/src/server/raftkv2/node.rs +++ b/src/server/raftkv2/node.rs @@ -9,9 +9,13 @@ use kvproto::{metapb, replication_modepb::ReplicationStatus}; use pd_client::PdClient; use raftstore::{ coprocessor::CoprocessorHost, - store::{GlobalReplicationState, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX}, + store::{ + AutoSplitController, GlobalReplicationState, TabletSnapManager, Transport, + RAFT_INIT_LOG_INDEX, + }, }; use raftstore_v2::{router::RaftRouter, Bootstrap, PdTask, StoreRouter, StoreSystem}; +use resource_metering::CollectorRegHandle; use slog::{info, o, Logger}; use tikv_util::{ config::VersionTrack, @@ -92,6 +96,8 @@ where concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 coprocessor_host: CoprocessorHost, + auto_split_controller: AutoSplitController, + collector_reg_handle: CollectorRegHandle, background: Worker, pd_worker: LazyWorker, store_cfg: Arc>, @@ -129,6 +135,8 @@ where concurrency_manager, causal_ts_provider, coprocessor_host, + auto_split_controller, + collector_reg_handle, background, pd_worker, store_cfg, @@ -188,6 +196,8 @@ where concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 coprocessor_host: CoprocessorHost, + auto_split_controller: AutoSplitController, + collector_reg_handle: CollectorRegHandle, background: Worker, pd_worker: LazyWorker, store_cfg: Arc>, @@ -218,6 +228,8 @@ where concurrency_manager, causal_ts_provider, coprocessor_host, + auto_split_controller, + collector_reg_handle, background, pd_worker, )?; From b2c9f5b1fe06e0b5a7bbec1d2d5cd8b10af771fd Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Thu, 19 Jan 2023 13:33:25 +0800 Subject: [PATCH 089/115] [Cloud] Support fallback after timeout (#260) --- engine_store_ffi/src/observer.rs | 132 +++++++++++++----- engine_store_ffi/src/ps_engine.rs | 23 ++- .../src/cached_region_info_manager.rs | 1 + engine_tiflash/src/write_batch.rs | 4 +- 
new-mock-engine-store/src/mock_store.rs | 11 +- proxy_tests/proxy/fast_add_peer.rs | 92 ++++++++++-- proxy_tests/proxy/ffi.rs | 4 +- proxy_tests/proxy/proxy.rs | 4 +- proxy_tests/proxy/region.rs | 2 +- 9 files changed, 204 insertions(+), 69 deletions(-) diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 1486958c3d2..964837e0586 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -178,24 +178,54 @@ impl TiFlashObserver { let mut has_already_inited = None; let mut early_skip = false; let f = |info: MapEntry>| { + let current = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap(); match info { MapEntry::Occupied(mut o) => { + // Test if a fast path is timeout + let fast_path_start = o.get().fast_add_peer_start.load(Ordering::SeqCst); + if fast_path_start != 0 { + let elapsed = current.as_millis() - fast_path_start; + #[cfg(any(test, feature = "testexport"))] + const TRACE_SLOW_MILLIS: u128 = 0; + #[cfg(any(test, feature = "testexport"))] + const FALLBACK_MILLIS: u128 = 1000 * 2; + #[cfg(not(any(test, feature = "testexport")))] + const TRACE_SLOW_MILLIS: u128 = 1000 * 60 * 3; + #[cfg(not(any(test, feature = "testexport")))] + const FALLBACK_MILLIS: u128 = 1000 * 60 * 5; + if elapsed >= TRACE_SLOW_MILLIS { + let need_fallback = elapsed > FALLBACK_MILLIS; + let do_fallback = if need_fallback { + // TODO If snapshot is sent, we can't fallback? 
+ true + } else { + false + }; + info!("fast path: ongoing {}:{} {}, MsgAppend duplicated", + self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, + "inner_msg" => ?inner_msg, + "is_replicated" => is_replicated, + "has_already_inited" => has_already_inited, + "is_first" => is_first, + "elapsed" => elapsed, + "do_fallback" => do_fallback, + ); + if do_fallback { + o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); + is_first = false; + early_skip = false; + return; + } + } + } + // If a snapshot is sent, we must skip further handling. let last = o.get().snapshot_inflight.load(Ordering::SeqCst); if last != 0 { - let current = SystemTime::now() - .duration_since(SystemTime::UNIX_EPOCH) - .unwrap(); - info!("fast path: ongoing {}:{} {}, MsgAppend duplicated", - self.store_id, region_id, new_peer_id; - "to_peer_id" => msg.get_to_peer().get_id(), - "from_peer_id" => msg.get_from_peer().get_id(), - "region_id" => region_id, - "inner_msg" => ?inner_msg, - "is_replicated" => is_replicated, - "has_already_inited" => has_already_inited, - "is_first" => is_first, - "elapsed" => current.as_millis() - last, - ); early_skip = true; // We must return here to avoid changing `inited_or_fallback`. // Otherwise will cause different value in pre/post_apply_snapshot. @@ -221,6 +251,15 @@ impl TiFlashObserver { } else { (false, None) }; + if is_first { + // Don't care if the exchange succeeds. 
+ let _ = o.get_mut().fast_add_peer_start.compare_exchange( + 0, + current.as_millis(), + Ordering::SeqCst, + Ordering::SeqCst, + ); + } // TODO include create is_replicated = o.get().replicated_or_created.load(Ordering::SeqCst); } @@ -231,7 +270,10 @@ impl TiFlashObserver { "region_id" => region_id, "inner_msg" => ?inner_msg, ); - v.insert(Arc::new(CachedRegionInfo::default())); + let c = CachedRegionInfo::default(); + c.fast_add_peer_start + .store(current.as_millis(), Ordering::SeqCst); + v.insert(Arc::new(c)); is_first = true; } } @@ -240,6 +282,7 @@ impl TiFlashObserver { // Try not acquire write lock firstly. match cached_manager.get_inited_or_fallback(region_id) { Some(true) => { + // Most cases, when the peer is already inited. is_first = false; } None | Some(false) => self @@ -286,18 +329,16 @@ impl TiFlashObserver { return false; } - { - // Peer is not created by Peer::replicate, will cause RegionNotRegistered error, - // see `check_msg`. - if !is_replicated { - info!("fast path: ongoing {}:{} {}, wait replicating peer", self.store_id, region_id, new_peer_id; - "to_peer_id" => msg.get_to_peer().get_id(), - "from_peer_id" => msg.get_from_peer().get_id(), - "region_id" => region_id, - "inner_msg" => ?inner_msg, - ); - return true; - } + // Peer is not created by Peer::replicate, will cause RegionNotRegistered error, + // see `check_msg`. 
+ if !is_replicated { + info!("fast path: ongoing {}:{} {}, wait replicating peer", self.store_id, region_id, new_peer_id; + "to_peer_id" => msg.get_to_peer().get_id(), + "from_peer_id" => msg.get_from_peer().get_id(), + "region_id" => region_id, + "inner_msg" => ?inner_msg, + ); + return true; } info!("fast path: ongoing {}:{} {}, fetch data from remote peer", self.store_id, region_id, new_peer_id; @@ -305,7 +346,6 @@ impl TiFlashObserver { "from_peer_id" => msg.get_from_peer().get_id(), "region_id" => region_id, ); - fail::fail_point!("go_fast_path_not_allow", |_| { return false }); fail::fail_point!("ffi_fast_add_peer_pause", |_| { return false }); // Feed data let res = self @@ -377,7 +417,7 @@ impl TiFlashObserver { Ok(s) => { match s { crate::FastAddPeerStatus::Ok => { - fail::fail_point!("go_fast_path_succeed", |_| { return false }); + fail::fail_point!("go_fast_path_not_allow", |_| { return false }); info!("fast path: ongoing {}:{} {}, finish build and send", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), "from_peer_id" => msg.get_from_peer().get_id(), @@ -447,6 +487,29 @@ impl TiFlashObserver { ) -> RaftStoreResult { let cached_manager = self.get_cached_manager(); let inner_msg = msg.get_message(); + + let current = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap(); + #[cfg(any(test, feature = "testexport"))] + { + let fake_send: bool = (|| { + fail::fail_point!("fast_add_peer_fake_send", |t| { + let t = t.unwrap().parse::().unwrap(); + t + }); + 0 + })() != 0; + if fake_send { + // A handling snapshot may block handling later MsgAppend. + // So we fake send. + cached_manager + .set_snapshot_inflight(region_id, current.as_millis()) + .unwrap(); + return Ok(crate::FastAddPeerStatus::Ok); + } + } + // Get a snapshot object. let (mut snapshot, key) = { // Find term of entry at applied_index. 
@@ -550,12 +613,10 @@ impl TiFlashObserver { pb_snapshot_data, apply_state ); + match self.trans.lock() { Ok(mut trans) => match trans.send(response) { Ok(_) => { - let current = SystemTime::now() - .duration_since(SystemTime::UNIX_EPOCH) - .unwrap(); cached_manager .set_snapshot_inflight(region_id, current.as_millis()) .unwrap(); @@ -1269,7 +1330,7 @@ impl ApplySnapshotObserver for TiFlashOb MapEntry::Occupied(o) => { let is_first_snapsot = !o.get().inited_or_fallback.load(Ordering::SeqCst); if is_first_snapsot { - info!("fast path: prehandle first snapshot {}:{} {}, recover MsgAppend", self.store_id, region_id, peer_id; + info!("fast path: prehandle first snapshot {}:{} {}", self.store_id, region_id, peer_id; "snap_key" => ?snap_key, "region_id" => region_id, ); @@ -1373,16 +1434,19 @@ impl ApplySnapshotObserver for TiFlashOb let is_first_snapsot = !o.get().inited_or_fallback.load(Ordering::SeqCst); if is_first_snapsot { let last = o.get().snapshot_inflight.load(Ordering::SeqCst); + let total = o.get().fast_add_peer_start.load(Ordering::SeqCst); let current = SystemTime::now() .duration_since(SystemTime::UNIX_EPOCH) .unwrap(); info!("fast path: applied first snapshot {}:{} {}, recover MsgAppend", self.store_id, region_id, peer_id; "snap_key" => ?snap_key, "region_id" => region_id, - "cost" => current.as_millis() - last, + "cost_snapshot" => current.as_millis() - last, + "cost_total" => current.as_millis() - total, ); should_skip = true; o.get_mut().snapshot_inflight.store(0, Ordering::SeqCst); + o.get_mut().fast_add_peer_start.store(0, Ordering::SeqCst); o.get_mut().inited_or_fallback.store(true, Ordering::SeqCst); } } diff --git a/engine_store_ffi/src/ps_engine.rs b/engine_store_ffi/src/ps_engine.rs index bd727c013d5..6be0ab41657 100644 --- a/engine_store_ffi/src/ps_engine.rs +++ b/engine_store_ffi/src/ps_engine.rs @@ -296,14 +296,13 @@ impl PSEngine { if from >= to { return Ok(0); } - // info!("gc_impl raft_group_id {} from {} to {}", raft_group_id, from ,to); 
let mut raft_wb = self.log_batch(0); for idx in from..to { - raft_wb.del_page(&keys::raft_log_key(raft_group_id, idx)); + raft_wb.del_page(&keys::raft_log_key(raft_group_id, idx))?; } // TODO: keep the max size of raft_wb under some threshold - self.consume(&mut raft_wb, false); + self.consume(&mut raft_wb, false)?; Ok((to - from) as usize) } @@ -399,7 +398,7 @@ impl RaftEngineDebug for PSEngine { let mut entry = Entry::default(); entry.merge_from_bytes(value)?; f(&entry) - }); + })?; Ok(()) } } @@ -468,10 +467,10 @@ impl RaftEngine for PSEngine { // TODO: find the first raft log index of this raft group if first_index <= state.last_index { for index in first_index..=state.last_index { - batch.del_page(&keys::raft_log_key(raft_group_id, index)); + batch.del_page(&keys::raft_log_key(raft_group_id, index))?; } } - self.consume(batch, true); + self.consume(batch, true)?; Ok(()) } @@ -487,8 +486,8 @@ impl RaftEngine for PSEngine { fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { let mut wb = self.log_batch(0); - wb.put_msg(&keys::raft_state_key(raft_group_id), state); - self.consume(&mut wb, false); + wb.put_msg(&keys::raft_state_key(raft_group_id), state)?; + self.consume(&mut wb, false)?; Ok(()) } @@ -522,8 +521,8 @@ impl RaftEngine for PSEngine { fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { let mut wb = self.log_batch(0); - wb.put_msg(keys::STORE_IDENT_KEY, ident); - self.consume(&mut wb, false); + wb.put_msg(keys::STORE_IDENT_KEY, ident)?; + self.consume(&mut wb, false)?; Ok(()) } @@ -557,8 +556,8 @@ impl RaftEngine for PSEngine { fn put_recover_state(&self, state: &StoreRecoverState) -> Result<()> { let mut wb = self.log_batch(0); - wb.put_msg(keys::RECOVER_STATE_KEY, state); - self.consume(&mut wb, false); + wb.put_msg(keys::RECOVER_STATE_KEY, state)?; + self.consume(&mut wb, false)?; Ok(()) } } diff --git a/engine_tiflash/src/cached_region_info_manager.rs b/engine_tiflash/src/cached_region_info_manager.rs 
index fed3526b62a..6484764b2ab 100644 --- a/engine_tiflash/src/cached_region_info_manager.rs +++ b/engine_tiflash/src/cached_region_info_manager.rs @@ -25,6 +25,7 @@ pub struct CachedRegionInfo { // Otherwise, a normal snapshot will be neglect in `post_apply_snapshot` and cause data loss. pub inited_or_fallback: AtomicBool, pub snapshot_inflight: portable_atomic::AtomicU128, + pub fast_add_peer_start: portable_atomic::AtomicU128, } pub type CachedRegionInfoMap = HashMap>; diff --git a/engine_tiflash/src/write_batch.rs b/engine_tiflash/src/write_batch.rs index ea4143fb74a..c8b68dd781c 100644 --- a/engine_tiflash/src/write_batch.rs +++ b/engine_tiflash/src/write_batch.rs @@ -58,8 +58,8 @@ pub struct RocksWriteBatchVec { impl RocksWriteBatchVec { pub fn new( db: Arc, - ffi_hub: Option>, - ps_wb: RawPSWriteBatchWrapper, + _ffi_hub: Option>, + _ps_wb: RawPSWriteBatchWrapper, batch_size_limit: usize, cap: usize, support_write_batch_vec: bool, diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index 0f9effcaf1c..6814ff84d6f 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -163,6 +163,12 @@ impl EngineStoreServer { } } } + + pub unsafe fn write_to_db_by_region_id(&mut self, region_id: u64, reason: String) { + let kv = &mut self.engines.as_mut().unwrap().kv; + let region = self.kvstore.get_mut(®ion_id).unwrap(); + write_to_db_data_by_engine(self.id, kv, region, reason) + } } pub struct EngineStoreServerWrap { @@ -1507,7 +1513,10 @@ unsafe extern "C" fn ffi_fast_add_peer( return; } }; - debug!("recover from remote peer: data from {} to {}", from_store, store_id; "region_id" => region_id); + debug!("recover from remote peer: begin data from {} to {}", from_store, store_id; + "region_id" => region_id, + "apply_state" => ?apply_state, + ); // TODO In TiFlash we should take care of write batch size if let Err(e) = copy_data_from( &source_engines, diff --git 
a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 3f2f70c591b..47f3ee211d8 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -9,10 +9,12 @@ enum SourceType { InvalidSource, } +#[derive(PartialEq, Eq, Debug)] enum PauseType { None, Build, ApplySnapshot, + SendFakeSnapshot, } #[test] @@ -40,6 +42,7 @@ fn basic_fast_add_peer() { } fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: PauseType) { + // The case in TiFlash is (DelayedPeer, false, Build) tikv_util::set_panic_hook(true, "./"); let (mut cluster, pd_client) = new_mock_cluster(0, 3); cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; @@ -82,6 +85,11 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: PauseT match pause { PauseType::Build => fail::cfg("ffi_fast_add_peer_pause", "pause").unwrap(), PauseType::ApplySnapshot => fail::cfg("on_can_apply_snapshot", "return(false)").unwrap(), + PauseType::SendFakeSnapshot => { + fail::cfg("fast_add_peer_fake_send", "return(1)").unwrap(); + // If we fake send snapshot, then fast path will certainly fail. + // Then we will timeout in FALLBACK_MILLIS and go to slow path. + } _ => (), } @@ -89,6 +97,28 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: PauseT pd_client.must_add_peer(1, new_learner_peer(3, 3)); cluster.must_put(b"k2", b"v2"); + let need_fallback = if pause == PauseType::SendFakeSnapshot { + true + } else { + false + }; + + // If we need to fallback to slow path, + // we must make sure the data is persisted before Leader generated snapshot. + // This is necessary, since we haven't adapt `handle_snapshot`, + // which is a leader logic. 
+ if need_fallback { + check_key(&cluster, b"k2", b"v2", Some(true), None, Some(vec![1])); + iter_ffi_helpers( + &cluster, + Some(vec![1]), + &mut |_, _, ffi: &mut FFIHelperSet| unsafe { + let server = ffi.engine_store_server.as_mut(); + server.write_to_db_by_region_id(1, "persist for up-to-date snapshot".to_string()); + }, + ); + } + match source_type { SourceType::DelayedLearner => { // Make sure conf change is applied in peer 2. @@ -104,6 +134,7 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: PauseT _ => (), }; + // Wait some time and then recover. match pause { PauseType::Build => { std::thread::sleep(std::time::Duration::from_millis(3000)); @@ -115,9 +146,16 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: PauseT fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); std::thread::sleep(std::time::Duration::from_millis(5000)); } + PauseType::SendFakeSnapshot => { + // Wait FALLBACK_MILLIS + std::thread::sleep(std::time::Duration::from_millis(5000)); + fail::remove("fast_add_peer_fake_send"); + std::thread::sleep(std::time::Duration::from_millis(2000)); + } _ => (), } + // Check stage 1. match source_type { SourceType::DelayedLearner => { check_key(&cluster, b"k3", b"v3", Some(true), None, Some(vec![1, 3])); @@ -153,11 +191,17 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: PauseT iter_ffi_helpers( &cluster, Some(vec![3]), - &mut |_, _, ffi: &mut FFIHelperSet| { - let server = &ffi.engine_store_server; - (*ffi.engine_store_server).mutate_region_states(1, |e: &mut RegionStats| { - assert_eq!(1, e.fast_add_peer_count.load(Ordering::SeqCst)); - }); + &mut |_, _, _ffi: &mut FFIHelperSet| { + // Not actually the case, since we allow handling + // MsgAppend multiple times. 
+ // So the following fires when: + // (DelayedLearner, false, ApplySnapshot) + + // let server = &ffi.engine_store_server; + // (*ffi.engine_store_server).mutate_region_states(1, |e: + // &mut RegionStats| { assert_eq!(1, + // e.fast_add_peer_count.load(Ordering::SeqCst)); + // }); }, ); } @@ -207,7 +251,7 @@ fn simple_fast_add_peer(source_type: SourceType, block_wait: bool, pause: PauseT iter_ffi_helpers( &cluster, Some(vec![3]), - &mut |id: u64, _, ffi: &mut FFIHelperSet| { + &mut |_, _, ffi: &mut FFIHelperSet| { (*ffi.engine_store_server).mutate_region_states(1, |e: &mut RegionStats| { assert!(e.fast_add_peer_count.load(Ordering::SeqCst) > 0); }); @@ -308,6 +352,22 @@ fn test_fast_add_peer_from_delayed_learner_blocked_paused_apply() { fail::remove("fallback_to_slow_path_not_allow"); } +#[test] +fn test_fast_add_peer_from_delayed_learner_apply() { + fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + simple_fast_add_peer(SourceType::DelayedLearner, false, PauseType::ApplySnapshot); + fail::remove("fallback_to_slow_path_not_allow"); +} + +#[test] +fn test_timeout_fallback() { + fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + fail::cfg("apply_on_handle_snapshot_sync", "return(true)").unwrap(); + simple_fast_add_peer(SourceType::Learner, false, PauseType::SendFakeSnapshot); + fail::remove("on_pre_persist_with_finish"); + fail::remove("apply_on_handle_snapshot_sync"); +} + #[test] fn test_existing_peer() { fail::cfg("before_tiflash_check_double_write", "return").unwrap(); @@ -398,13 +458,13 @@ fn test_apply_snapshot() { // Peer 3 can't use peer 2's data. // We will end up going slow path. 
fail::remove("ffi_fast_add_peer_pause"); - fail::cfg("go_fast_path_succeed", "panic").unwrap(); + fail::cfg("go_fast_path_not_allow", "panic").unwrap(); std::thread::sleep(std::time::Duration::from_millis(300)); // Resume applying snapshot fail::remove("on_ob_post_apply_snapshot"); check_key(&cluster, b"k4", b"v4", Some(true), None, Some(vec![1, 3])); cluster.shutdown(); - fail::remove("go_fast_path_succeed"); + fail::remove("go_fast_path_not_allow"); fail::remove("ffi_fast_add_peer_from_id"); fail::remove("before_tiflash_check_double_write"); } @@ -431,17 +491,17 @@ fn test_split_no_fast_add() { let r3 = cluster.get_region(b"k3"); assert_eq!(r1.get_id(), r3.get_id()); - fail::cfg("go_fast_path_succeed", "panic").unwrap(); + fail::cfg("go_fast_path_not_allow", "panic").unwrap(); cluster.must_split(&r1, b"k2"); must_wait_until_cond_node(&cluster, 1000, None, &|states: &States| -> bool { states.in_disk_region_state.get_region().get_peers().len() == 3 }); - let r1_new = cluster.get_region(b"k1"); // 1000 - let r3_new = cluster.get_region(b"k3"); // 1 + let _r1_new = cluster.get_region(b"k1"); // 1000 + let _r3_new = cluster.get_region(b"k3"); // 1 cluster.must_put(b"k0", b"v0"); check_key(&cluster, b"k0", b"v0", Some(true), None, None); - fail::remove("go_fast_path_succeed"); + fail::remove("go_fast_path_not_allow"); fail::remove("on_can_apply_snapshot"); cluster.shutdown(); } @@ -508,14 +568,16 @@ fn test_fall_back_to_slow_path() { // Can always apply snapshot immediately fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); fail::cfg("on_pre_persist_with_finish", "return").unwrap(); - fail::cfg("ffi_fast_add_peer_fail_after_write", "return(1)").unwrap(); - fail::cfg("go_fast_path_succeed", "panic").unwrap(); let _ = cluster.run_conf_change(); cluster.must_put(b"k1", b"v1"); check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1])); cluster.must_put(b"k2", b"v2"); + + fail::cfg("ffi_fast_add_peer_fail_after_write", "return(1)").unwrap(); + 
fail::cfg("go_fast_path_not_allow", "panic").unwrap(); + pd_client.must_add_peer(1, new_learner_peer(2, 2)); check_key(&cluster, b"k2", b"v2", Some(true), None, Some(vec![1, 2])); must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: &States| -> bool { @@ -525,6 +587,6 @@ fn test_fall_back_to_slow_path() { fail::remove("ffi_fast_add_peer_fail_after_write"); fail::remove("on_can_apply_snapshot"); fail::remove("on_pre_persist_with_finish"); - fail::remove("go_fast_path_succeed"); + fail::remove("go_fast_path_not_allow"); cluster.shutdown(); } diff --git a/proxy_tests/proxy/ffi.rs b/proxy_tests/proxy/ffi.rs index ea1c12c57c6..9694d76529e 100644 --- a/proxy_tests/proxy/ffi.rs +++ b/proxy_tests/proxy/ffi.rs @@ -1,8 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use engine_store_ffi::{ - get_engine_store_server_helper, EngineStoreServerHelper, RawCppPtr, RawCppPtrArr, - RawCppPtrTuple, RawVoidPtr, UnwrapExternCFunc, + get_engine_store_server_helper, RawCppPtr, RawCppPtrArr, RawCppPtrTuple, RawVoidPtr, + UnwrapExternCFunc, }; use new_mock_engine_store::{ mock_cluster::init_global_ffi_helper_set, mock_store::RawCppPtrTypeImpl, diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index ecfabe6d364..11dc8944df9 100644 --- a/proxy_tests/proxy/proxy.rs +++ b/proxy_tests/proxy/proxy.rs @@ -676,13 +676,13 @@ pub fn must_not_merged(pd_client: Arc, from: u64, duration: Durati let timer = tikv_util::time::Instant::now(); loop { let region = futures::executor::block_on(pd_client.get_region_by_id(from)).unwrap(); - if let Some(r) = region { + if let Some(_) = region { if timer.saturating_elapsed() > duration { return; } } else { panic!("region {} is merged.", from); } - std::thread::sleep_ms(10); + std::thread::sleep(std::time::Duration::from_millis(10)); } } diff --git a/proxy_tests/proxy/region.rs b/proxy_tests/proxy/region.rs index 270953e0cd4..d285496bdda 100644 --- a/proxy_tests/proxy/region.rs +++ 
b/proxy_tests/proxy/region.rs @@ -373,7 +373,7 @@ fn recover_from_peer(cluster: &Cluster, from: u64, to: u64, region_ iter_ffi_helpers( cluster, Some(vec![from]), - &mut |id: u64, _, ffi: &mut FFIHelperSet| { + &mut |_, _, ffi: &mut FFIHelperSet| { let server = &mut ffi.engine_store_server; maybe_source_region = server.kvstore.get(®ion_id).cloned(); }, From 860fc839a988a6c975fbea18fc22f1d840bdfdc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Thu, 19 Jan 2023 17:03:49 +0800 Subject: [PATCH 090/115] log-backup: an ad-hoc way for hot reloading TLS certs (#14072) close tikv/tikv#14071 Log backup would aware TLS certifications changing. Signed-off-by: hillium --- .../src/metadata/store/lazy_etcd.rs | 91 ++++++++++++------- components/security/src/lib.rs | 4 + components/server/src/server.rs | 8 +- 3 files changed, 65 insertions(+), 38 deletions(-) diff --git a/components/backup-stream/src/metadata/store/lazy_etcd.rs b/components/backup-stream/src/metadata/store/lazy_etcd.rs index b712a23973d..37ffbad37c4 100644 --- a/components/backup-stream/src/metadata/store/lazy_etcd.rs +++ b/components/backup-stream/src/metadata/store/lazy_etcd.rs @@ -1,15 +1,20 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{sync::Arc, time::Duration}; +use std::{ + sync::Arc, + time::{Duration, SystemTime}, +}; use etcd_client::{ConnectOptions, Error as EtcdError, OpenSslClientConfig}; use futures::Future; use openssl::x509::verify::X509VerifyFlags; +use security::SecurityManager; use tikv_util::{ info, stream::{RetryError, RetryExt}, + warn, }; -use tokio::sync::OnceCell; +use tokio::sync::Mutex as AsyncMutex; use super::{etcd::EtcdSnapshot, EtcdStore, MetaStore}; use crate::errors::{ContextualResultExt, Result}; @@ -17,20 +22,34 @@ use crate::errors::{ContextualResultExt, Result}; const RPC_TIMEOUT: Duration = Duration::from_secs(30); #[derive(Clone)] -pub struct LazyEtcdClient(Arc); +pub struct LazyEtcdClient(Arc>); -#[derive(Debug)] +#[derive(Clone)] pub struct ConnectionConfig { - pub tls: Option, + pub tls: Arc, pub keep_alive_interval: Duration, pub keep_alive_timeout: Duration, } +impl std::fmt::Debug for ConnectionConfig { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ConnectionConfig") + .field("keep_alive_interval", &self.keep_alive_interval) + .field("keep_alive_timeout", &self.keep_alive_timeout) + .finish() + } +} + impl ConnectionConfig { /// Convert the config to the connection option. 
fn to_connection_options(&self) -> ConnectOptions { let mut opts = ConnectOptions::new(); - if let Some(tls) = &self.tls { + if let Some(tls) = &self + .tls + .client_suite() + .map_err(|err| warn!("failed to load client suite!"; "err" => %err)) + .ok() + { opts = opts.with_openssl_tls( OpenSslClientConfig::default() .ca_cert_pem(&tls.ca) @@ -54,28 +73,27 @@ impl ConnectionConfig { impl LazyEtcdClient { pub fn new(endpoints: &[String], conf: ConnectionConfig) -> Self { - Self(Arc::new(LazyEtcdClientInner { - opt: conf.to_connection_options(), + Self(Arc::new(AsyncMutex::new(LazyEtcdClientInner { + conf, endpoints: endpoints.iter().map(ToString::to_string).collect(), - cli: OnceCell::new(), - })) + last_modified: None, + cli: None, + }))) } -} - -impl std::ops::Deref for LazyEtcdClient { - type Target = LazyEtcdClientInner; - fn deref(&self) -> &Self::Target { - Arc::deref(&self.0) + async fn get_cli(&self) -> Result { + let mut l = self.0.lock().await; + l.get_cli().await.cloned() } } #[derive(Clone)] pub struct LazyEtcdClientInner { - opt: ConnectOptions, + conf: ConnectionConfig, endpoints: Vec, - cli: OnceCell, + last_modified: Option, + cli: Option, } fn etcd_error_is_retryable(etcd_err: &EtcdError) -> bool { @@ -130,23 +148,34 @@ where } impl LazyEtcdClientInner { - async fn connect(&self) -> Result { + async fn connect(&mut self) -> Result<&EtcdStore> { let store = retry(|| { // For now, the interface of the `etcd_client` doesn't us to control // how to create channels when connecting, hence we cannot update the tls config - // at runtime. - // TODO: maybe add some method like `with_channel` for `etcd_client`, and adapt - // the `SecurityManager` API, instead of doing everything by own. - etcd_client::Client::connect(self.endpoints.clone(), Some(self.opt.clone())) + // at runtime, now what we did is manually check that each time we are getting + // the clients. 
+ etcd_client::Client::connect( + self.endpoints.clone(), + Some(self.conf.to_connection_options()), + ) }) .await .context("during connecting to the etcd")?; - Ok(EtcdStore::from(store)) + let store = EtcdStore::from(store); + self.cli = Some(store); + Ok(self.cli.as_ref().unwrap()) } - pub async fn get_cli(&self) -> Result<&EtcdStore> { - let store = self.cli.get_or_try_init(|| self.connect()).await?; - Ok(store) + pub async fn get_cli(&mut self) -> Result<&EtcdStore> { + let modified = self.conf.tls.get_config().is_modified(&mut self.last_modified) + // Don't reload once we cannot check whether it is modified. + // Because when TLS disabled, this would always fail. + .unwrap_or(false); + if !modified && self.cli.is_some() { + return Ok(self.cli.as_ref().unwrap()); + } + info!("log backup reconnecting to the etcd service."; "tls_modified" => %modified, "connected_before" => %self.cli.is_some()); + self.connect().await } } @@ -155,7 +184,7 @@ impl MetaStore for LazyEtcdClient { type Snap = EtcdSnapshot; async fn snapshot(&self) -> Result { - self.0.get_cli().await?.snapshot().await + self.get_cli().await?.snapshot().await } async fn watch( @@ -163,14 +192,14 @@ impl MetaStore for LazyEtcdClient { keys: super::Keys, start_rev: i64, ) -> Result { - self.0.get_cli().await?.watch(keys, start_rev).await + self.get_cli().await?.watch(keys, start_rev).await } async fn txn(&self, txn: super::Transaction) -> Result<()> { - self.0.get_cli().await?.txn(txn).await + self.get_cli().await?.txn(txn).await } async fn txn_cond(&self, txn: super::CondTransaction) -> Result<()> { - self.0.get_cli().await?.txn_cond(txn).await + self.get_cli().await?.txn_cond(txn).await } } diff --git a/components/security/src/lib.rs b/components/security/src/lib.rs index 52f438236fd..68328c01ebe 100644 --- a/components/security/src/lib.rs +++ b/components/security/src/lib.rs @@ -190,6 +190,10 @@ impl SecurityManager { ) } } + + pub fn get_config(&self) -> &SecurityConfig { + &self.cfg + } } 
#[derive(Clone)] diff --git a/components/server/src/server.rs b/components/server/src/server.rs index cfc7e59e243..97fd1f77eef 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -1022,13 +1022,7 @@ where ConnectionConfig { keep_alive_interval: self.config.server.grpc_keepalive_time.0, keep_alive_timeout: self.config.server.grpc_keepalive_timeout.0, - tls: self - .security_mgr - .client_suite() - .map_err(|err| { - warn!("Failed to load client TLS suite, ignoring TLS config."; "err" => %err); - }) - .ok(), + tls: Arc::clone(&self.security_mgr), }, ); let backup_stream_endpoint = backup_stream::Endpoint::new( From 42c3814f2a11c50d6a496c8aaca8e314b26f7ead Mon Sep 17 00:00:00 2001 From: YangKeao Date: Thu, 19 Jan 2023 04:25:49 -0500 Subject: [PATCH 091/115] json, copr: implement unary not for json (#14070) close tikv/tikv#14069 Signed-off-by: YangKeao Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/tidb_query_executors/src/runner.rs | 9 ++++++ components/tidb_query_expr/src/impl_op.rs | 32 +++++++++++++++++++ components/tidb_query_expr/src/lib.rs | 1 + 4 files changed, 43 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index a2924314f8a..cc89037bffa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6599,7 +6599,7 @@ dependencies = [ [[package]] name = "tipb" version = "0.0.1" -source = "git+https://github.com/pingcap/tipb.git#f3286471a05a4454a1071dd5f66ac7dbf6c79ba3" +source = "git+https://github.com/pingcap/tipb.git#c6b7a5a1623bb2766a502301ecc3ac8f98cc7c79" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/tidb_query_executors/src/runner.rs b/components/tidb_query_executors/src/runner.rs index d04be41507e..392b41ff165 100644 --- a/components/tidb_query_executors/src/runner.rs +++ b/components/tidb_query_executors/src/runner.rs @@ -150,6 +150,15 @@ impl BatchExecutorsRunner<()> { ExecType::TypePartitionTableScan => { other_err!("PartitionTableScan executor not implemented"); } + 
ExecType::TypeSort => { + other_err!("Sort executor not implemented"); + } + ExecType::TypeWindow => { + other_err!("Window executor not implemented"); + } + ExecType::TypeExpand => { + other_err!("Expand executor not implemented"); + } } } diff --git a/components/tidb_query_expr/src/impl_op.rs b/components/tidb_query_expr/src/impl_op.rs index 5289f427e93..665448279fb 100644 --- a/components/tidb_query_expr/src/impl_op.rs +++ b/components/tidb_query_expr/src/impl_op.rs @@ -55,6 +55,18 @@ pub fn unary_not_decimal(arg: Option<&Decimal>) -> Result> { Ok(arg.as_ref().map(|v| v.is_zero() as i64)) } +#[rpn_fn(nullable)] +#[inline] +pub fn unary_not_json(arg: Option) -> Result> { + let json_zero = Json::from_i64(0).unwrap(); + Ok(arg.as_ref().map(|v| { + if v == &json_zero.as_ref() { + return 1; + } + 0 + })) +} + #[rpn_fn(nullable)] #[inline] pub fn unary_minus_uint(arg: Option<&Int>) -> Result> { @@ -383,6 +395,26 @@ mod tests { } } + #[test] + fn test_unary_not_json() { + let test_cases = vec![ + (None, None), + (Some(Json::from_i64(0).unwrap()), Some(1)), + (Some(Json::from_i64(1).unwrap()), Some(0)), + ( + Some(Json::from_array(vec![Json::from_i64(0).unwrap()]).unwrap()), + Some(0), + ), + ]; + for (arg, expect_output) in test_cases { + let output = RpnFnScalarEvaluator::new() + .push_param(arg.clone()) + .evaluate(ScalarFuncSig::UnaryNotJson) + .unwrap(); + assert_eq!(output, expect_output, "{:?}", arg.as_ref()); + } + } + #[test] fn test_unary_minus_int() { let unsigned_test_cases = vec![ diff --git a/components/tidb_query_expr/src/lib.rs b/components/tidb_query_expr/src/lib.rs index 43b0602ebbb..649a7cfa1c8 100644 --- a/components/tidb_query_expr/src/lib.rs +++ b/components/tidb_query_expr/src/lib.rs @@ -732,6 +732,7 @@ fn map_expr_node_to_rpn_func(expr: &Expr) -> Result { ScalarFuncSig::UnaryNotInt => unary_not_int_fn_meta(), ScalarFuncSig::UnaryNotReal => unary_not_real_fn_meta(), ScalarFuncSig::UnaryNotDecimal => unary_not_decimal_fn_meta(), + 
ScalarFuncSig::UnaryNotJson => unary_not_json_fn_meta(), ScalarFuncSig::UnaryMinusInt => map_unary_minus_int_func(value, children)?, ScalarFuncSig::UnaryMinusReal => unary_minus_real_fn_meta(), ScalarFuncSig::UnaryMinusDecimal => unary_minus_decimal_fn_meta(), From a5c6fe53416fd9f424f9fa5efeb7c6902822820c Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Thu, 19 Jan 2023 17:37:06 +0800 Subject: [PATCH 092/115] [Cloud] Fast add peer: for enable-pagestorage (#261) --- engine_store_ffi/src/observer.rs | 4 ++-- proxy_server/src/lib.rs | 4 +++- raftstore-proxy/Cargo.toml | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 964837e0586..e0593948329 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -1505,13 +1505,13 @@ impl ApplySnapshotObserver for TiFlashOb }; // According to pre_apply_snapshot, if registered tracer, // then we must have put it into thread pool. - let prev = self + let _prev = self .engine .pending_applies_count .fetch_sub(1, Ordering::SeqCst); #[cfg(any(test, feature = "testexport"))] - assert!(prev > 0); + assert!(_prev > 0); info!("apply snapshot finished"; "peer_id" => peer_id, diff --git a/proxy_server/src/lib.rs b/proxy_server/src/lib.rs index 07032edaed9..72a677dabd5 100644 --- a/proxy_server/src/lib.rs +++ b/proxy_server/src/lib.rs @@ -34,7 +34,8 @@ fn proxy_version_info() -> String { \nRust Version: {}\ \nStorage Engine: {}\ \nPrometheus Prefix: {}\ - \nProfile: {}", + \nProfile: {}\ + \nEnable Features: {}", option_env!("PROXY_BUILD_GIT_HASH").unwrap_or(fallback), option_env!("PROXY_BUILD_GIT_BRANCH").unwrap_or(fallback), option_env!("PROXY_BUILD_TIME").unwrap_or(fallback), @@ -42,6 +43,7 @@ fn proxy_version_info() -> String { option_env!("ENGINE_LABEL_VALUE").unwrap_or(fallback), option_env!("PROMETHEUS_METRIC_NAME_PREFIX").unwrap_or(fallback), option_env!("PROXY_PROFILE").unwrap_or(fallback), + 
option_env!("ENABLE_FEATURES").unwrap_or(fallback), ) } diff --git a/raftstore-proxy/Cargo.toml b/raftstore-proxy/Cargo.toml index e6e254e87a8..074c9c49c80 100644 --- a/raftstore-proxy/Cargo.toml +++ b/raftstore-proxy/Cargo.toml @@ -14,6 +14,7 @@ portable = ["proxy_server/portable"] sse = ["proxy_server/sse"] mem-profiling = ["proxy_server/mem-profiling"] failpoints = ["proxy_server/failpoints"] +enable-pagestorage = ["proxy_server/enable-pagestorage"] cloud-aws = ["proxy_server/cloud-aws"] cloud-gcp = ["proxy_server/cloud-gcp"] @@ -40,4 +41,4 @@ name = "raftstore_proxy" crate-type = ["cdylib"] [dependencies] -proxy_server = { workspace = true, features = ["enable-pagestorage"] } +proxy_server = { workspace = true } From cf622538b2ab118f51bf64a23ba41507b7e67f3f Mon Sep 17 00:00:00 2001 From: Zwb Date: Thu, 19 Jan 2023 22:15:00 +0800 Subject: [PATCH 093/115] raftstore: support switch witness (#13491) * support switch witness ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * add switch witness api for test_pd_client ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * pd heartbeat resp support switch witness ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * update region epoch ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * fix write apply state race ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * remove unnecessary code ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * add back test_witness_conf_change ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * add some tests ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * avoid test failures ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * address comments ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * address comments ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * address comments ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * address comments ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * a few refactor ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * add witness election priority and address comments ref 
tikv/tikv#12876 Signed-off-by: Wenbo Zhang * clean code ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * address comments ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * address comments ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * fix tests failed caused by cfg ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * fix test failed caused by mistake modify ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * adjust priority after snapshot persisted ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * address comments ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * notify pd after switch witness as region changed ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * define a new backoff error for witness ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * fix panic caused by applygap ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * forbid transfer leader to non-witness waiting data ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * update kvproto ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * fix two panics ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * retry request snapshot ref tikv/tikv#12876 Signed-off-by: Wenbo Zhang * retry to request snaphost after term change ref #12876 Signed-off-by: Wenbo Zhang * update kvproto comment ref #12876 Signed-off-by: Wenbo Zhang Signed-off-by: Wenbo Zhang Signed-off-by: Zwb Co-authored-by: Xinye Tao --- Cargo.lock | 22 +- components/error_code/src/raftstore.rs | 3 + .../raftstore-v2/src/operation/query/local.rs | 1 + components/raftstore/src/coprocessor/mod.rs | 1 + components/raftstore/src/errors.rs | 9 + components/raftstore/src/store/config.rs | 8 + components/raftstore/src/store/fsm/apply.rs | 136 +++++++- components/raftstore/src/store/fsm/mod.rs | 2 +- components/raftstore/src/store/fsm/peer.rs | 170 +++++++-- components/raftstore/src/store/fsm/store.rs | 5 + components/raftstore/src/store/metrics.rs | 7 +- components/raftstore/src/store/msg.rs | 5 +- components/raftstore/src/store/peer.rs | 108 ++++-- 
.../raftstore/src/store/peer_storage.rs | 18 + components/raftstore/src/store/util.rs | 2 +- .../raftstore/src/store/worker/metrics.rs | 1 + components/raftstore/src/store/worker/pd.rs | 34 +- components/raftstore/src/store/worker/read.rs | 24 +- components/test_pd_client/src/pd.rs | 119 ++++++- etc/error_code.toml | 5 + tests/failpoints/cases/test_witness.rs | 273 +++++++++++++-- tests/integrations/config/mod.rs | 1 + tests/integrations/raftstore/test_witness.rs | 328 +++++++++--------- 23 files changed, 1029 insertions(+), 253 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index cc89037bffa..e9f55d1923d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2732,7 +2732,7 @@ dependencies = [ "futures 0.3.15", "grpcio", "protobuf", - "protobuf-build", + "protobuf-build 0.13.0", "raft-proto", ] @@ -4121,6 +4121,18 @@ dependencies = [ "regex", ] +[[package]] +name = "protobuf-build" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fb3c02f54ecaf12572c1a60dbdb36b1f8f713a16105881143f2be84cca5bbe3" +dependencies = [ + "bitflags", + "protobuf", + "protobuf-codegen", + "regex", +] + [[package]] name = "protobuf-codegen" version = "2.8.0" @@ -4161,7 +4173,7 @@ dependencies = [ [[package]] name = "raft" version = "0.7.0" -source = "git+https://github.com/tikv/raft-rs?branch=master#2357cb22760719bcd107a90d1e64ef505bdb1e15" +source = "git+https://github.com/tikv/raft-rs?branch=master#f73766712a538c2f6eb135b455297ad6c03fc58d" dependencies = [ "bytes", "fxhash", @@ -4220,11 +4232,11 @@ dependencies = [ [[package]] name = "raft-proto" version = "0.7.0" -source = "git+https://github.com/tikv/raft-rs?branch=master#2357cb22760719bcd107a90d1e64ef505bdb1e15" +source = "git+https://github.com/tikv/raft-rs?branch=master#f73766712a538c2f6eb135b455297ad6c03fc58d" dependencies = [ "bytes", "protobuf", - "protobuf-build", + "protobuf-build 0.14.0", ] [[package]] @@ -6604,7 +6616,7 @@ dependencies = [ "futures 0.3.15", "grpcio", "protobuf", - 
"protobuf-build", + "protobuf-build 0.13.0", ] [[package]] diff --git a/components/error_code/src/raftstore.rs b/components/error_code/src/raftstore.rs index 1b6a85493cf..35dfe564ef0 100644 --- a/components/error_code/src/raftstore.rs +++ b/components/error_code/src/raftstore.rs @@ -32,6 +32,7 @@ define_error_codes!( RECOVERY_IN_PROGRESS => ("RecoveryInProgress", "", ""), FLASHBACK_IN_PROGRESS => ("FlashbackInProgress", "", ""), FLASHBACK_NOT_PREPARED => ("FlashbackNotPrepared", "", ""), + IS_WITNESS => ("IsWitness", "", ""), SNAP_ABORT => ("SnapAbort", "", ""), SNAP_TOO_MANY => ("SnapTooMany", "", ""), @@ -70,6 +71,8 @@ impl ErrorCodeExt for errorpb::Error { FLASHBACK_IN_PROGRESS } else if self.has_flashback_not_prepared() { FLASHBACK_NOT_PREPARED + } else if self.has_is_witness() { + IS_WITNESS } else { UNKNOWN } diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 13b815d1ebc..f574571f790 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -815,6 +815,7 @@ mod tests { txn_ext: txn_ext.clone(), read_progress: read_progress.clone(), pending_remove: false, + wait_data: false, track_ver: TrackVer::new(), bucket_meta: Some(bucket_meta.clone()), }; diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 5100e9d4632..73110660856 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -300,6 +300,7 @@ pub enum RegionChangeReason { PrepareMerge, CommitMerge, RollbackMerge, + SwitchWitness, } #[derive(Clone, Copy, Debug, PartialEq)] diff --git a/components/raftstore/src/errors.rs b/components/raftstore/src/errors.rs index 3c415c65af6..36fcec7f1f3 100644 --- a/components/raftstore/src/errors.rs +++ b/components/raftstore/src/errors.rs @@ -140,6 +140,9 @@ pub enum Error { region_id: u64, local_state: 
raft_serverpb::RegionLocalState, }, + + #[error("peer is a witness of region {0}")] + IsWitness(u64), } pub type Result = result::Result; @@ -263,6 +266,11 @@ impl From for errorpb::Error { e.set_region_id(region_id); errorpb.set_flashback_not_prepared(e); } + Error::IsWitness(region_id) => { + let mut e = errorpb::IsWitness::default(); + e.set_region_id(region_id); + errorpb.set_is_witness(e); + } _ => {} }; @@ -319,6 +327,7 @@ impl ErrorCodeExt for Error { Error::DataIsNotReady { .. } => error_code::raftstore::DATA_IS_NOT_READY, Error::DeadlineExceeded => error_code::raftstore::DEADLINE_EXCEEDED, Error::PendingPrepareMerge => error_code::raftstore::PENDING_PREPARE_MERGE, + Error::IsWitness(..) => error_code::raftstore::IS_WITNESS, Error::Other(_) | Error::RegionNotRegistered { .. } => error_code::raftstore::UNKNOWN, } diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 4d3210318a6..34f4e159dee 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -324,6 +324,12 @@ pub struct Config { #[online_config(hidden)] // Interval to check peers availability info. pub check_peers_availability_interval: ReadableDuration, + + #[doc(hidden)] + #[serde(skip_serializing)] + #[online_config(hidden)] + // Interval to check if need to request snapshot. 
+ pub check_request_snapshot_interval: ReadableDuration, } impl Default for Config { @@ -433,6 +439,8 @@ impl Default for Config { unreachable_backoff: ReadableDuration::secs(10), // TODO: make its value reasonable check_peers_availability_interval: ReadableDuration::secs(30), + // TODO: make its value reasonable + check_request_snapshot_interval: ReadableDuration::minutes(1), } } } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 60ed35e6892..9f2d234010f 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -40,7 +40,7 @@ use kvproto::{ metapb::{self, PeerRole, Region, RegionEpoch}, raft_cmdpb::{ AdminCmdType, AdminRequest, AdminResponse, ChangePeerRequest, CmdType, CommitMergeRequest, - RaftCmdRequest, RaftCmdResponse, Request, SplitRequest, + RaftCmdRequest, RaftCmdResponse, Request, SplitRequest, SwitchWitnessRequest, }, raft_serverpb::{MergeState, PeerState, RaftApplyState, RaftTruncatedState, RegionLocalState}, }; @@ -252,6 +252,13 @@ impl Range { } } +#[derive(Default, Debug)] +pub struct SwitchWitness { + pub index: u64, + pub switches: Vec, + pub region: Region, +} + #[derive(Debug)] pub enum ExecResult { ChangePeer(ChangePeer), @@ -301,6 +308,7 @@ pub enum ExecResult { SetFlashbackState { region: Region, }, + BatchSwitchWitness(SwitchWitness), // The raftstore thread will use it to update the internal state of `PeerFsm`. If it is // `true`, when the raftstore detects that the raft log has not been gc for a long time, // the raftstore thread will actively pull the `voter_replicated_index` from the leader @@ -979,6 +987,9 @@ where /// in same Ready should be applied failed. pending_remove: bool, + /// Indicates whether the peer is waiting data. See more in `Peer`. + wait_data: bool, + /// The commands waiting to be committed and applied pending_cmds: PendingCmdQueue>, /// The counter of pending request snapshots. See more in `Peer`. 
@@ -1041,6 +1052,7 @@ where peer: find_peer_by_id(®.region, reg.id).unwrap().clone(), region: reg.region, pending_remove: false, + wait_data: false, last_flush_applied_index: reg.apply_state.get_applied_index(), apply_state: reg.apply_state, applied_term: reg.applied_term, @@ -1119,7 +1131,13 @@ where match res { ApplyResult::None => {} - ApplyResult::Res(res) => results.push_back(res), + ApplyResult::Res(res) => { + results.push_back(res); + if self.wait_data { + apply_ctx.committed_count -= committed_entries_drainer.len(); + break; + } + } ApplyResult::Yield | ApplyResult::WaitMergeSource(_) => { // Both cancel and merge will yield current processing. apply_ctx.committed_count -= committed_entries_drainer.len() + 1; @@ -1535,6 +1553,12 @@ where ExecResult::SetFlashbackState { ref region } => { self.region = region.clone(); } + ExecResult::BatchSwitchWitness(ref switches) => { + self.region = switches.region.clone(); + if let Some(p) = find_peer_by_id(&self.region, self.id()) { + self.peer = p.clone(); + } + } } } if let Some(epoch) = origin_epoch { @@ -1669,7 +1693,7 @@ where AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { self.exec_flashback(ctx, request) } - AdminCmdType::BatchSwitchWitness => Err(box_err!("unsupported admin command type")), + AdminCmdType::BatchSwitchWitness => self.exec_batch_switch_witness(ctx, request), AdminCmdType::InvalidAdmin => Err(box_err!("unsupported admin command type")), }?; response.set_cmd_type(cmd_type); @@ -3202,6 +3226,90 @@ where )) } + fn exec_batch_switch_witness( + &mut self, + ctx: &mut ApplyContext, + request: &AdminRequest, + ) -> Result<(AdminResponse, ApplyResult)> { + assert!(request.has_switch_witnesses()); + let switches = request + .get_switch_witnesses() + .get_switch_witnesses() + .to_vec(); + + info!( + "exec BatchSwitchWitness"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "epoch" => ?self.region.get_region_epoch(), + ); + + let mut region = self.region.clone(); + for s 
in switches.as_slice() { + PEER_ADMIN_CMD_COUNTER.batch_switch_witness.all.inc(); + let (peer_id, is_witness) = (s.get_peer_id(), s.get_is_witness()); + let mut peer_is_exist = false; + for p in region.mut_peers().iter_mut() { + if p.id == peer_id { + if p.is_witness == is_witness { + return Err(box_err!( + "switch peer {:?} on region {:?} is no-op", + p, + self.region + )); + } + p.is_witness = is_witness; + peer_is_exist = true; + break; + } + } + if !peer_is_exist { + return Err(box_err!( + "switch peer {} on region {:?} failed: peer does not exist", + peer_id, + self.region + )); + } + PEER_ADMIN_CMD_COUNTER.batch_switch_witness.success.inc(); + if self.id() == peer_id && !is_witness { + self.wait_data = true; + self.peer.is_witness = false; + } + } + let conf_ver = region.get_region_epoch().get_conf_ver() + switches.len() as u64; + region.mut_region_epoch().set_conf_ver(conf_ver); + info!( + "switch witness successfully"; + "region_id" => self.region_id(), + "peer_id" => self.id(), + "switches" => ?switches, + "original region" => ?&self.region, + "current region" => ?®ion, + ); + + let state = if self.pending_remove { + PeerState::Tombstone + } else if self.wait_data { + PeerState::Unavailable + } else { + PeerState::Normal + }; + + if let Err(e) = write_peer_state(ctx.kv_wb_mut(), ®ion, state, None) { + panic!("{} failed to update region state: {:?}", self.tag, e); + } + + let resp = AdminResponse::default(); + Ok(( + resp, + ApplyResult::Res(ExecResult::BatchSwitchWitness(SwitchWitness { + index: ctx.exec_log_index, + switches, + region, + })), + )) + } + fn update_memory_trace(&mut self, event: &mut TraceEvent) { let pending_cmds = self.pending_cmds.heap_size(); let merge_yield = if let Some(ref mut state) = self.yield_state { @@ -3593,6 +3701,7 @@ where #[cfg(any(test, feature = "testexport"))] #[allow(clippy::type_complexity)] Validate(u64, Box), + Recover(u64), CheckCompact { region_id: u64, voter_replicated_index: u64, @@ -3645,6 +3754,7 @@ where } => 
write!(f, "[region {}] change cmd", region_id), #[cfg(any(test, feature = "testexport"))] Msg::Validate(region_id, _) => write!(f, "[region {}] validate", region_id), + Msg::Recover(region_id) => write!(f, "recover [region {}] apply", region_id), Msg::CheckCompact { region_id, voter_replicated_index, @@ -3770,6 +3880,10 @@ where return; } + if self.delegate.wait_data { + return; + } + let mut entries = Vec::new(); let mut dangle_size = 0; @@ -3972,8 +4086,9 @@ where if self.delegate.pending_remove || self.delegate.stopped { return; } - if self.delegate.peer.is_witness { - // witness shouldn't generate snapshot. + if self.delegate.peer.is_witness || self.delegate.wait_data { + // witness or non-witness hasn't finish applying snapshot shouldn't generate + // snapshot. return; } let applied_index = self.delegate.apply_state.get_applied_index(); @@ -4199,8 +4314,11 @@ where } } } - batch_apply = Some(apply); + if !self.delegate.wait_data { + batch_apply = Some(apply); + } } + Msg::Recover(..) => self.delegate.wait_data = false, Msg::Registration(reg) => self.handle_registration(reg), Msg::Destroy(d) => self.handle_destroy(apply_ctx, d), Msg::LogsUpToDate(cul) => self.logs_up_to_date_for_merge(apply_ctx, cul), @@ -4637,6 +4755,11 @@ where } #[cfg(any(test, feature = "testexport"))] Msg::Validate(..) => return, + Msg::Recover(region_id) => { + info!("recover apply"; + "region_id" => region_id); + return; + } Msg::CheckCompact { region_id, .. } => { info!("target region is not found"; "region_id" => region_id); @@ -4774,6 +4897,7 @@ mod memtrace { | Msg::Change { .. } => 0, #[cfg(any(test, feature = "testexport"))] Msg::Validate(..) => 0, + Msg::Recover(..) => 0, Msg::CheckCompact { .. 
} => 0, } } diff --git a/components/raftstore/src/store/fsm/mod.rs b/components/raftstore/src/store/fsm/mod.rs index 2f700eec9bf..b481caf4f74 100644 --- a/components/raftstore/src/store/fsm/mod.rs +++ b/components/raftstore/src/store/fsm/mod.rs @@ -14,7 +14,7 @@ pub use self::{ check_sst_for_ingestion, create_apply_batch_system, Apply, ApplyBatchSystem, ApplyMetrics, ApplyRes, ApplyRouter, Builder as ApplyPollerBuilder, CatchUpLogs, ChangeObserver, ChangePeer, ExecResult, GenSnapTask, Msg as ApplyTask, Notifier as ApplyNotifier, Proposal, - Registration, TaskRes as ApplyTaskRes, + Registration, SwitchWitness, TaskRes as ApplyTaskRes, }, peer::{new_admin_request, DestroyPeerJob, PeerFsm, MAX_PROPOSAL_SIZE_RATIO}, store::{ diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index ccde4b031ef..d405c3471af 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -75,7 +75,7 @@ use crate::{ apply, store::{PollContext, StoreMeta}, ApplyMetrics, ApplyTask, ApplyTaskRes, CatchUpLogs, ChangeObserver, ChangePeer, - ExecResult, + ExecResult, SwitchWitness, }, hibernate_state::{GroupState, HibernateState}, local_metrics::{RaftMetrics, TimeTracker}, @@ -247,6 +247,7 @@ where raftlog_fetch_scheduler: Scheduler>, engines: Engines, region: &metapb::Region, + wait_data: bool, ) -> Result> { let meta_peer = match find_peer(region, store_id) { None => { @@ -277,6 +278,7 @@ where engines, region, meta_peer, + wait_data, )?, tick_registry: [false; PeerTick::VARIANT_COUNT], missing_ticks: 0, @@ -331,6 +333,7 @@ where engines, ®ion, peer, + false, )?, tick_registry: [false; PeerTick::VARIANT_COUNT], missing_ticks: 0, @@ -1192,6 +1195,7 @@ where PeerTick::ReportBuckets => self.on_report_region_buckets_tick(), PeerTick::CheckLongUncommitted => self.on_check_long_uncommitted_tick(), PeerTick::CheckPeersAvailability => self.on_check_peers_availability(), + PeerTick::RequestSnapshot => 
self.on_request_snapshot_tick(), PeerTick::RequestVoterReplicatedIndex => self.on_request_voter_replicated_index(), } } @@ -1203,6 +1207,9 @@ where self.register_split_region_check_tick(); self.register_check_peer_stale_state_tick(); self.on_check_merge(); + if self.fsm.peer.wait_data { + self.on_request_snapshot_tick(); + } // Apply committed entries more quickly. // Or if it's a leader. This implicitly means it's a singleton // because it becomes leader in `Peer::new` when it's a @@ -1951,6 +1958,7 @@ where self.register_raft_gc_log_tick(); self.register_check_leader_lease_tick(); self.register_report_region_buckets_tick(); + self.register_check_peers_availability_tick(); } if let Some(ForceLeaderState::ForceLeader { .. }) = self.fsm.peer.force_leader { @@ -2161,12 +2169,6 @@ where return; } - // Keep ticking if there are disk full peers for the Region. - if !self.fsm.peer.disk_full_peers.is_empty() { - self.register_raft_base_tick(); - return; - } - debug!("stop ticking"; "res" => ?res, "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id(), @@ -2258,6 +2260,9 @@ where "peer_id" => self.fsm.peer_id(), "res" => ?res, ); + if self.fsm.peer.wait_data { + return; + } self.on_ready_result(&mut res.exec_res, &res.metrics); if self.fsm.stopped { return; @@ -2467,6 +2472,17 @@ where return Ok(()); } + if MessageType::MsgAppend == msg_type + && self.fsm.peer.wait_data + && self.fsm.peer.should_reject_msgappend + { + debug!("skip {:?} because of non-witness waiting data", msg_type; + "region_id" => self.region_id(), "peer_id" => self.fsm.peer_id() + ); + self.ctx.raft_metrics.message_dropped.non_witness.inc(); + return Ok(()); + } + if !self.validate_raft_msg(&msg) { return Ok(()); } @@ -2603,6 +2619,7 @@ where fn on_hibernate_request(&mut self, from: &metapb::Peer) { if !self.ctx.cfg.hibernate_regions || self.fsm.peer.has_uncommitted_log() + || self.fsm.peer.wait_data || from.get_id() != self.fsm.peer.leader_id() { // Ignore the message means rejecting 
implicitly. @@ -3053,7 +3070,7 @@ where if snap.get_metadata().get_index() < self.fsm.peer.get_store().applied_index() && snap_data.get_meta().get_for_witness() != self.fsm.peer.is_witness() { - info!( + error!( "mismatch witness snapshot"; "region_id" => region_id, "peer_id" => self.fsm.peer_id(), @@ -3355,7 +3372,6 @@ where ); } else { self.fsm.peer.transfer_leader(&from); - self.fsm.peer.wait_data_peers.clear(); } } } @@ -4069,6 +4085,7 @@ where self.ctx.raftlog_fetch_scheduler.clone(), self.ctx.engines.clone(), &new_region, + false, ) { Ok((sender, new_peer)) => (sender, new_peer), Err(e) => { @@ -4959,6 +4976,9 @@ where ExecResult::IngestSst { ssts } => self.on_ingest_sst_result(ssts), ExecResult::TransferLeader { term } => self.on_transfer_leader(term), ExecResult::SetFlashbackState { region } => self.on_set_flashback_state(region), + ExecResult::BatchSwitchWitness(switches) => { + self.on_ready_batch_switch_witness(switches) + } ExecResult::HasPendingCompactCmd(has_pending) => { self.fsm.peer.has_pending_compact_cmd = has_pending; if has_pending { @@ -5126,8 +5146,29 @@ where && msg.get_admin_request().get_cmd_type() == AdminCmdType::TransferLeader) { self.ctx.raft_metrics.invalid_proposal.witness.inc(); - // TODO: use a dedicated error type - return Err(Error::RecoveryInProgress(self.region_id())); + return Err(Error::IsWitness(self.region_id())); + } + + // Forbid requests to switch it into a witness when it's a leader + if self.fsm.peer.is_leader() + && msg.has_admin_request() + && msg.get_admin_request().get_cmd_type() == AdminCmdType::BatchSwitchWitness + && msg + .get_admin_request() + .get_switch_witnesses() + .get_switch_witnesses() + .iter() + .any(|s| s.get_peer_id() == self.fsm.peer.peer.get_id() && s.get_is_witness()) + { + self.ctx.raft_metrics.invalid_proposal.witness.inc(); + return Err(Error::IsWitness(self.region_id())); + } + + // Forbid requests when it becomes to non-witness but not finish applying + // snapshot. 
+ if self.fsm.peer.wait_data { + self.ctx.raft_metrics.invalid_proposal.non_witness.inc(); + return Err(Error::IsWitness(self.region_id())); } // check whether the peer is initialized. @@ -5518,6 +5559,36 @@ where self.register_check_long_uncommitted_tick(); } + fn on_request_snapshot_tick(&mut self) { + fail_point!("ignore request snapshot", |_| { + self.schedule_tick(PeerTick::RequestSnapshot); + }); + if !self.fsm.peer.wait_data || self.fsm.peer.is_leader() { + return; + } + self.fsm.peer.request_index = self.fsm.peer.raft_group.raft.raft_log.last_index(); + let last_term = self.fsm.peer.get_index_term(self.fsm.peer.request_index); + if last_term == self.fsm.peer.term() { + self.fsm.peer.should_reject_msgappend = true; + if let Err(e) = self.fsm.peer.raft_group.request_snapshot() { + error!( + "failed to request snapshot"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "err" => %e, + ); + } + } else { + // If a leader change occurs after switch to non-witness, it should be + // continue processing `MsgAppend` until `last_term == term`, then retry + // to request snapshot. + self.fsm.peer.should_reject_msgappend = false; + } + // Requesting a snapshot may fail, so register a periodic event as a defense + // until succeeded. 
+ self.schedule_tick(PeerTick::RequestSnapshot); + } + fn on_request_voter_replicated_index(&mut self) { if !self.fsm.peer.is_witness() || !self.fsm.peer.has_pending_compact_cmd { return; @@ -6059,18 +6130,31 @@ where } fn on_check_peers_availability(&mut self) { + let mut invalid_peers: Vec = Vec::new(); for peer_id in self.fsm.peer.wait_data_peers.iter() { - let peer = self.fsm.peer.get_peer_from_cache(*peer_id).unwrap(); - let mut msg = ExtraMessage::default(); - msg.set_type(ExtraMessageType::MsgAvailabilityRequest); - self.fsm - .peer - .send_extra_message(msg, &mut self.ctx.trans, &peer); - debug!( - "check peer availability"; - "target peer id" => *peer_id, - ); + match self.fsm.peer.get_peer_from_cache(*peer_id) { + Some(peer) => { + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgAvailabilityRequest); + self.fsm + .peer + .send_extra_message(msg, &mut self.ctx.trans, &peer); + debug!( + "check peer availability"; + "target peer id" => *peer_id, + ); + } + None => invalid_peers.push(*peer_id), + } } + // For some reasons, the peer corresponding to the previously saved peer_id + // no longer exists. In order to avoid passing invalid information to pd when + // reporting pending peers and affecting pd scheduling, remove it from the + // `wait_data_peers`. 
+ self.fsm + .peer + .wait_data_peers + .retain(|peer_id| !invalid_peers.contains(peer_id)); } fn register_pull_voter_replicated_index_tick(&mut self) { @@ -6355,6 +6439,50 @@ where self.fsm.peer.leader_lease_mut().expire_remote_lease(); } + fn on_ready_batch_switch_witness(&mut self, sw: SwitchWitness) { + { + let mut meta = self.ctx.store_meta.lock().unwrap(); + meta.set_region( + &self.ctx.coprocessor_host, + sw.region, + &mut self.fsm.peer, + RegionChangeReason::SwitchWitness, + ); + } + for s in sw.switches { + let (peer_id, is_witness) = (s.get_peer_id(), s.get_is_witness()); + if self.fsm.peer_id() == peer_id { + if is_witness && !self.fsm.peer.is_leader() { + let _ = self.fsm.peer.get_store().clear_data(); + self.fsm.peer.raft_group.set_priority(-1); + } else { + self.fsm + .peer + .update_read_progress(self.ctx, ReadProgress::WaitData(true)); + self.fsm.peer.wait_data = true; + self.on_request_snapshot_tick(); + } + self.fsm.peer.peer.is_witness = is_witness; + continue; + } + if !is_witness && !self.fsm.peer.wait_data_peers.contains(&peer_id) { + self.fsm.peer.wait_data_peers.push(peer_id); + } + } + if self.fsm.peer.is_leader() { + info!( + "notify pd with change peer region"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "region" => ?self.fsm.peer.region(), + ); + self.fsm.peer.heartbeat_pd(self.ctx); + if !self.fsm.peer.wait_data_peers.is_empty() { + self.register_check_peers_availability_tick(); + } + } + } + /// Verify and store the hash to state. return true means the hash has been /// stored successfully. // TODO: Consider context in the function. 
diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index b75aee3b4bb..2ca573824f9 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -594,6 +594,8 @@ where self.cfg.check_long_uncommitted_interval.0; self.tick_batch[PeerTick::CheckPeersAvailability as usize].wait_duration = self.cfg.check_peers_availability_interval.0; + self.tick_batch[PeerTick::RequestSnapshot as usize].wait_duration = + self.cfg.check_request_snapshot_interval.0; // TODO: make it reasonable self.tick_batch[PeerTick::RequestVoterReplicatedIndex as usize].wait_duration = self.cfg.raft_log_gc_tick_interval.0 * 2; @@ -1206,6 +1208,7 @@ impl RaftPollerBuilder { self.raftlog_fetch_scheduler.clone(), self.engines.clone(), region, + local_state.get_state() == PeerState::Unavailable, )); peer.peer.init_replication_mode(&mut replication_state); if local_state.get_state() == PeerState::Merging { @@ -1246,6 +1249,7 @@ impl RaftPollerBuilder { self.raftlog_fetch_scheduler.clone(), self.engines.clone(), ®ion, + false, )?; peer.peer.init_replication_mode(&mut replication_state); peer.schedule_applying_snapshot(); @@ -2911,6 +2915,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.raftlog_fetch_scheduler.clone(), self.ctx.engines.clone(), ®ion, + false, ) { Ok((sender, peer)) => (sender, peer), Err(e) => { diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index ce4f099610e..6c6357d286c 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -35,7 +35,8 @@ make_auto_flush_static_metric! { compact, transfer_leader, prepare_flashback, - finish_flashback + finish_flashback, + batch_switch_witness : "batch-switch-witness", } pub label_enum AdminCmdStatus { @@ -177,6 +178,7 @@ make_static_metric! 
{ region_nonexistent, applying_snap, disk_full, + non_witness, recovery, } @@ -205,7 +207,8 @@ make_static_metric! { force_leader, witness, flashback_in_progress, - flashback_not_prepared + flashback_not_prepared, + non_witness, } pub label_enum RaftEventDurationType { diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index b2a2a7aa1d1..3c555689cb9 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -384,7 +384,8 @@ pub enum PeerTick { ReportBuckets = 9, CheckLongUncommitted = 10, CheckPeersAvailability = 11, - RequestVoterReplicatedIndex = 12, + RequestSnapshot = 12, + RequestVoterReplicatedIndex = 13, } impl PeerTick { @@ -405,6 +406,7 @@ impl PeerTick { PeerTick::ReportBuckets => "report_buckets", PeerTick::CheckLongUncommitted => "check_long_uncommitted", PeerTick::CheckPeersAvailability => "check_peers_availability", + PeerTick::RequestSnapshot => "request_snapshot", PeerTick::RequestVoterReplicatedIndex => "request_voter_replicated_index", } } @@ -423,6 +425,7 @@ impl PeerTick { PeerTick::ReportBuckets, PeerTick::CheckLongUncommitted, PeerTick::CheckPeersAvailability, + PeerTick::RequestSnapshot, PeerTick::RequestVoterReplicatedIndex, ]; TICKS diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 586ab7ba133..e2a914fded6 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -894,6 +894,17 @@ where /// the same time period. pub wait_data: bool, + /// When the witness becomes non-witness, it need to actively request a + /// snapshot from the leader, but the request may fail, so we need to save + /// the request index for retrying. 
+ pub request_index: u64, + + /// When the witness becomes non-witness, it need to actively request a + /// snapshot from the leader, In order to avoid log lag, we need to reject + /// the leader's `MsgAppend` request unless the `term` of the `last index` + /// is less than the peer's current `term`. + pub should_reject_msgappend: bool, + /// Force leader state is only used in online recovery when the majority of /// peers are missing. In this state, it forces one peer to become leader /// out of accordance with Raft election rule, and forbids any @@ -1055,6 +1066,7 @@ where engines: Engines, region: &metapb::Region, peer: metapb::Peer, + wait_data: bool, ) -> Result> { let peer_id = peer.get_id(); if peer_id == raft::INVALID_ID { @@ -1086,12 +1098,13 @@ where skip_bcast_commit: true, pre_vote: cfg.prevote, max_committed_size_per_ready: MAX_COMMITTED_SIZE_PER_READY, - // TODO: if peer.is_witness { 0 } else { 1 }, + priority: if peer.is_witness { -1 } else { 0 }, ..Default::default() }; let logger = slog_global::get_global().new(slog::o!("region_id" => region.get_id())); let raft_group = RawNode::new(&raft_cfg, ps, &logger)?; + let last_index = raft_group.store().last_index(); // In order to avoid excessive log accumulation due to the loss of pending // compaction cmds after the witness is restarted, it will actively pull // voter_request_index once at start. 
@@ -1118,7 +1131,9 @@ where compaction_declined_bytes: 0, leader_unreachable: false, pending_remove: false, - wait_data: false, + wait_data, + request_index: last_index, + should_reject_msgappend: false, should_wake_up: false, force_leader: None, pending_merge_state: None, @@ -1592,6 +1607,14 @@ where res.reason = "replication mode"; return res; } + if !self.disk_full_peers.is_empty() { + res.reason = "has disk full peers"; + return res; + } + if !self.wait_data_peers.is_empty() { + res.reason = "has wait data peers"; + return res; + } res.up_to_date = true; res } @@ -1617,6 +1640,8 @@ where && !self.has_unresolved_reads() // If it becomes leader, the stats is not valid anymore. && !self.is_leader() + // Keep ticking if it's waiting for snapshot. + && !self.wait_data } } @@ -2061,6 +2086,12 @@ where let status = self.raft_group.status(); let truncated_idx = self.get_store().truncated_index(); + for peer_id in &self.wait_data_peers { + if let Some(p) = self.get_peer_from_cache(*peer_id) { + pending_peers.push(p); + } + } + if status.progress.is_none() { return pending_peers; } @@ -2137,6 +2168,9 @@ where if self.peers_start_pending_time[i].0 != peer_id { continue; } + if self.wait_data_peers.contains(&peer_id) { + continue; + } let truncated_idx = self.raft_group.store().truncated_index(); if let Some(progress) = self.raft_group.raft.prs().get(peer_id) { if progress.matched >= truncated_idx { @@ -2394,8 +2428,12 @@ where // a stale heartbeat can make the leader think follower has already applied // the snapshot, and send remaining log entries, which may increase // commit_index. + // + // If it's witness before, but a command changes it to non-witness, it will stop + // applying all following command, therefore, add the judgment of `wait_data` to + // avoid applying snapshot is also blocked. 
// TODO: add more test - self.last_applying_idx == self.get_store().applied_index() + (self.last_applying_idx == self.get_store().applied_index() || self.wait_data) // Requesting snapshots also triggers apply workers to write // apply states even if there is no pending committed entry. // TODO: Instead of sharing the counter, we should apply snapshots @@ -2565,11 +2603,18 @@ where // i.e. call `RawNode::advance_apply_to`. self.post_pending_read_index_on_replica(ctx); // Resume `read_progress` + self.update_read_progress(ctx, ReadProgress::WaitData(false)); self.read_progress.resume(); // Update apply index to `last_applying_idx` self.read_progress .update_applied(self.last_applying_idx, &ctx.coprocessor_host); - self.notify_leader_the_peer_is_available(ctx); + if self.wait_data { + self.notify_leader_the_peer_is_available(ctx); + ctx.apply_router + .schedule_task(self.region_id, ApplyTask::Recover(self.region_id)); + self.wait_data = false; + return false; + } } CheckApplyingSnapStatus::Idle => { // FIXME: It's possible that the snapshot applying task is canceled. 
@@ -2590,22 +2635,19 @@ where &mut self, ctx: &mut PollContext, ) { - if self.wait_data { - self.wait_data = false; - fail_point!("ignore notify leader the peer is available", |_| {}); - let leader_id = self.leader_id(); - let leader = self.get_peer_from_cache(leader_id); - if let Some(leader) = leader { - let mut msg = ExtraMessage::default(); - msg.set_type(ExtraMessageType::MsgAvailabilityResponse); - msg.wait_data = false; - self.send_extra_message(msg, &mut ctx.trans, &leader); - info!( - "notify leader the leader is available"; - "region id" => self.region().get_id(), - "peer id" => self.peer.id - ); - } + fail_point!("ignore notify leader the peer is available", |_| {}); + let leader_id = self.leader_id(); + let leader = self.get_peer_from_cache(leader_id); + if let Some(leader) = leader { + let mut msg = ExtraMessage::default(); + msg.set_type(ExtraMessageType::MsgAvailabilityResponse); + msg.wait_data = false; + self.send_extra_message(msg, &mut ctx.trans, &leader); + info!( + "notify leader the peer is available"; + "region id" => self.region().get_id(), + "peer id" => self.peer.id + ); } } @@ -3128,9 +3170,8 @@ where "after" => ?peer, ); self.peer = peer; - // TODO: set priority for witness - // self.raft_group - // .set_priority(if self.peer.is_witness { 0 } else { 1 }); + self.raft_group + .set_priority(if self.peer.is_witness { -1 } else { 0 }); }; self.activate(ctx); @@ -3586,6 +3627,16 @@ where reader.update(progress); } + pub fn update_read_progress( + &self, + ctx: &mut PollContext, + progress: ReadProgress, + ) { + let mut meta = ctx.store_meta.lock().unwrap(); + let reader = meta.readers.get_mut(&self.region_id).unwrap(); + self.maybe_update_read_progress(reader, progress); + } + pub fn maybe_campaign(&mut self, parent_is_leader: bool) -> bool { if self.region().get_peers().len() <= 1 { // The peer campaigned when it was created, no need to do it again. 
@@ -4434,13 +4485,10 @@ where msg: &eraftpb::Message, peer_disk_usage: DiskUsage, ) -> bool { - if self.is_witness() { - // shouldn't transfer leader to witness peer - return true; - } - let pending_snapshot = self.is_handling_snapshot() || self.has_pending_snapshot(); - if pending_snapshot + // shouldn't transfer leader to witness peer or non-witness waiting data + if self.is_witness() || self.wait_data + || pending_snapshot || msg.get_from() != self.leader_id() // Transfer leader to node with disk full will lead to write availablity downback. // But if the current leader is disk full, and send such request, we should allow it, @@ -4455,6 +4503,8 @@ where "from" => msg.get_from(), "pending_snapshot" => pending_snapshot, "disk_usage" => ?ctx.self_disk_usage, + "is_witness" => self.is_witness(), + "wait_data" => self.wait_data, ); return true; } diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index b060a866d71..8dc8a18906c 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -449,6 +449,11 @@ where /// Gets a snapshot. Returns `SnapshotTemporarilyUnavailable` if there is no /// available snapshot. 
pub fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { + fail_point!("ignore generate snapshot", self.peer_id == 1, |_| { + Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )) + }); if self.peer.as_ref().unwrap().is_witness { // witness could be the leader for a while, do not generate snapshot now return Err(raft::Error::Store( @@ -457,6 +462,18 @@ where } if find_peer_by_id(&self.region, to).map_or(false, |p| p.is_witness) { + // Although we always sending snapshot task behind apply task to get latest + // snapshot, we can't use `last_applying_idx` here, as below the judgment + // condition will generate an witness snapshot directly, the new non-witness + // will ingore this mismatch snapshot and can't request snapshot successfully + // again. + if self.applied_index() < request_index { + // It may be a request from non-witness. In order to avoid generating mismatch + // snapshots, wait for apply non-witness to complete + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); + } // generate an empty snapshot for witness directly return Ok(util::new_empty_snapshot( self.region.clone(), @@ -666,6 +683,7 @@ where "peer_id" => self.peer_id, "region" => ?region, "state" => ?self.apply_state(), + "for_witness" => for_witness, ); Ok((region, for_witness)) diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 2d27b56fda5..2f61534d159 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -228,7 +228,7 @@ pub fn admin_cmd_epoch_lookup(admin_cmp_type: AdminCmdType) -> AdminCmdEpochStat AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { AdminCmdEpochState::new(true, true, false, false) } - AdminCmdType::BatchSwitchWitness => unimplemented!(), + AdminCmdType::BatchSwitchWitness => AdminCmdEpochState::new(false, true, false, true), } } diff --git 
a/components/raftstore/src/store/worker/metrics.rs b/components/raftstore/src/store/worker/metrics.rs index 5861e27a508..e6c3c505cdf 100644 --- a/components/raftstore/src/store/worker/metrics.rs +++ b/components/raftstore/src/store/worker/metrics.rs @@ -59,6 +59,7 @@ make_static_metric! { witness, flashback_not_prepared, flashback_in_progress, + wait_data, } pub struct LocalReadRejectCounter : LocalIntCounter { diff --git a/components/raftstore/src/store/worker/pd.rs b/components/raftstore/src/store/worker/pd.rs index 79b58d75c83..18ecc77f599 100644 --- a/components/raftstore/src/store/worker/pd.rs +++ b/components/raftstore/src/store/worker/pd.rs @@ -25,8 +25,8 @@ use kvproto::{ kvrpcpb::DiskFullOpt, metapb, pdpb, raft_cmdpb::{ - AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, - SplitRequest, + AdminCmdType, AdminRequest, BatchSwitchWitnessRequest, ChangePeerRequest, + ChangePeerV2Request, RaftCmdRequest, SplitRequest, SwitchWitnessRequest, }, raft_serverpb::RaftMessage, replication_modepb::{RegionReplicationStatus, StoreDrAutoSyncStatus}, @@ -1551,6 +1551,18 @@ where deadline:None, disk_full_opt:DiskFullOpt::AllowedOnAlmostFull, }); + } else if resp.has_switch_witnesses() { + PD_HEARTBEAT_COUNTER_VEC + .with_label_values(&["switch witness"]) + .inc(); + + let mut switches = resp.take_switch_witnesses(); + info!("try to switch witness"; + "region_id" => region_id, + "switch witness" => ?switches + ); + let req = new_batch_switch_witness(switches.take_switch_witnesses().into()); + send_admin_request(&router, region_id, epoch, peer, req, Callback::None, Default::default()); } else { PD_HEARTBEAT_COUNTER_VEC.with_label_values(&["noop"]).inc(); } @@ -2257,6 +2269,24 @@ fn new_merge_request(merge: pdpb::Merge) -> AdminRequest { req } +fn new_batch_switch_witness(switches: Vec) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSwitchWitness); + let switch_reqs = switches + .into_iter() + 
.map(|s| { + let mut sw = SwitchWitnessRequest::default(); + sw.set_peer_id(s.get_peer_id()); + sw.set_is_witness(s.get_is_witness()); + sw + }) + .collect(); + let mut sw = BatchSwitchWitnessRequest::default(); + sw.set_switch_witnesses(switch_reqs); + req.set_switch_witnesses(sw); + req +} + fn send_admin_request( router: &RaftRouter, region_id: u64, diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index a7849f5e1dd..6b20e375786 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -412,6 +412,8 @@ pub struct ReadDelegate { pub txn_ext: Arc, pub read_progress: Arc, pub pending_remove: bool, + /// Indicates whether the peer is waiting data. See more in `Peer`. + pub wait_data: bool, // `track_ver` used to keep the local `ReadDelegate` in `LocalReader` // up-to-date with the global `ReadDelegate` stored at `StoreMeta` @@ -435,6 +437,7 @@ impl ReadDelegate { txn_ext: peer.txn_ext.clone(), read_progress: peer.read_progress.clone(), pending_remove: false, + wait_data: false, bucket_meta: peer.region_buckets.as_ref().map(|b| b.meta.clone()), track_ver: TrackVer::new(), } @@ -463,6 +466,7 @@ impl ReadDelegate { txn_ext, read_progress, pending_remove: false, + wait_data: false, bucket_meta, track_ver: TrackVer::new(), } @@ -496,6 +500,9 @@ impl ReadDelegate { Progress::RegionBuckets(bucket_meta) => { self.bucket_meta = Some(bucket_meta); } + Progress::WaitData(wait_data) => { + self.wait_data = wait_data; + } } } @@ -591,6 +598,7 @@ impl ReadDelegate { txn_ext: Default::default(), read_progress, pending_remove: false, + wait_data: false, track_ver: TrackVer::new(), bucket_meta: None, } @@ -620,6 +628,7 @@ pub enum Progress { AppliedTerm(u64), LeaderLease(RemoteLease), RegionBuckets(Arc), + WaitData(bool), } impl Progress { @@ -642,6 +651,10 @@ impl Progress { pub fn region_buckets(bucket_meta: Arc) -> Progress { Progress::RegionBuckets(bucket_meta) } + + 
pub fn wait_data(wait_data: bool) -> Progress { + Progress::WaitData(wait_data) + } } struct SnapCache @@ -797,7 +810,13 @@ where // Check witness if find_peer_by_id(&delegate.region, delegate.peer_id).map_or(true, |p| p.is_witness) { TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.witness.inc()); - return Err(Error::RecoveryInProgress(region_id)); + return Err(Error::IsWitness(region_id)); + } + + // Check non-witness hasn't finish applying snapshot yet. + if delegate.wait_data { + TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.wait_data.inc()); + return Err(Error::IsWitness(region_id)); } // Check whether the region is in the flashback state and the local read could @@ -1299,6 +1318,7 @@ mod tests { txn_ext: Arc::new(TxnExt::default()), read_progress: read_progress.clone(), pending_remove: false, + wait_data: false, track_ver: TrackVer::new(), bucket_meta: None, }; @@ -1590,6 +1610,7 @@ mod tests { track_ver: TrackVer::new(), read_progress: Arc::new(RegionReadProgress::new(®ion, 0, 0, 1)), pending_remove: false, + wait_data: false, bucket_meta: None, }; meta.readers.insert(1, read_delegate); @@ -1715,6 +1736,7 @@ mod tests { txn_ext: Arc::new(TxnExt::default()), read_progress, pending_remove: false, + wait_data: false, track_ver: TrackVer::new(), bucket_meta: None, }; diff --git a/components/test_pd_client/src/pd.rs b/components/test_pd_client/src/pd.rs index 513d08643a7..a76692c4a67 100644 --- a/components/test_pd_client/src/pd.rs +++ b/components/test_pd_client/src/pd.rs @@ -27,8 +27,8 @@ use keys::{self, data_key, enc_end_key, enc_start_key}; use kvproto::{ metapb::{self, PeerRole}, pdpb::{ - self, ChangePeer, ChangePeerV2, CheckPolicy, Merge, RegionHeartbeatResponse, SplitRegion, - TransferLeader, + self, BatchSwitchWitness, ChangePeer, ChangePeerV2, CheckPolicy, Merge, + RegionHeartbeatResponse, SplitRegion, SwitchWitness, TransferLeader, }, replication_modepb::{ DrAutoSyncState, RegionReplicationStatus, ReplicationMode, 
ReplicationStatus, @@ -40,7 +40,7 @@ use pd_client::{ }; use raft::eraftpb::ConfChangeType; use tikv_util::{ - store::{check_key_in_region, find_peer, is_learner, new_peer, QueryStats}, + store::{check_key_in_region, find_peer, find_peer_by_id, is_learner, new_peer, QueryStats}, time::{Instant, UnixSecs}, timer::GLOBAL_TIMER_HANDLE, Either, HandyRwLock, @@ -135,6 +135,11 @@ enum Operator { remove_peers: Vec, policy: SchedulePolicy, }, + BatchSwitchWitness { + peer_ids: Vec, + is_witnesses: Vec, + policy: SchedulePolicy, + }, } pub fn sleep_ms(ms: u64) { @@ -201,6 +206,22 @@ pub fn new_pd_merge_region(target_region: metapb::Region) -> RegionHeartbeatResp resp } +fn switch_witness(peer_id: u64, is_witness: bool) -> SwitchWitness { + let mut sw = SwitchWitness::default(); + sw.set_peer_id(peer_id); + sw.set_is_witness(is_witness); + sw +} + +pub fn new_pd_batch_switch_witnesses(switches: Vec) -> RegionHeartbeatResponse { + let mut switch_witnesses = BatchSwitchWitness::default(); + switch_witnesses.set_switch_witnesses(switches.into()); + + let mut resp = RegionHeartbeatResponse::default(); + resp.set_switch_witnesses(switch_witnesses); + resp +} + impl Operator { fn make_region_heartbeat_response( &self, @@ -276,6 +297,17 @@ impl Operator { } new_pd_change_peer_v2(cps) } + Operator::BatchSwitchWitness { + ref peer_ids, + ref is_witnesses, + .. 
+ } => { + let mut switches = Vec::with_capacity(peer_ids.len()); + for (peer_id, is_witness) in peer_ids.iter().zip(is_witnesses.iter()) { + switches.push(switch_witness(*peer_id, *is_witness)); + } + new_pd_batch_switch_witnesses(switches) + } } } @@ -360,6 +392,26 @@ impl Operator { add && remove || !policy.schedule() } + Operator::BatchSwitchWitness { + ref peer_ids, + ref is_witnesses, + ref mut policy, + } => { + if !policy.schedule() { + return true; + } + for (peer_id, is_witness) in peer_ids.iter().zip(is_witnesses.iter()) { + if region + .get_peers() + .iter() + .any(|p| (p.get_id() == *peer_id) && (p.get_is_witness() != *is_witness)) + || cluster.pending_peers.contains_key(peer_id) + { + return false; + } + } + true + } } } } @@ -1043,6 +1095,48 @@ impl TestPdClient { panic!("region {:?} failed to leave joint", region); } + pub fn must_finish_switch_witnesses( + &self, + region_id: u64, + peer_ids: Vec, + is_witnesses: Vec, + ) { + for _ in 1..500 { + sleep_ms(10); + let region = match block_on(self.get_region_by_id(region_id)).unwrap() { + Some(region) => region, + None => continue, + }; + + for p in region.get_peers().iter() { + error!("in must_finish_switch_witnesses, p: {:?}", p); + } + + let mut need_retry = false; + for (peer_id, is_witness) in peer_ids.iter().zip(is_witnesses.iter()) { + match find_peer_by_id(®ion, *peer_id) { + Some(p) => { + if p.get_is_witness() != *is_witness + || self.cluster.rl().pending_peers.contains_key(&p.get_id()) + { + need_retry = true; + break; + } + } + None => { + need_retry = true; + break; + } + } + } + if !need_retry { + return; + } + } + let region = block_on(self.get_region_by_id(region_id)).unwrap(); + panic!("region {:?} failed to finish switch witnesses", region); + } + pub fn add_region(&self, region: &metapb::Region) { self.cluster.wl().add_region(region) } @@ -1072,6 +1166,15 @@ impl TestPdClient { self.schedule_operator(region_id, op); } + pub fn switch_witnesses(&self, region_id: u64, peer_ids: Vec, 
is_witnesses: Vec) { + let op = Operator::BatchSwitchWitness { + peer_ids, + is_witnesses, + policy: SchedulePolicy::TillSuccess, + }; + self.schedule_operator(region_id, op); + } + pub fn joint_confchange( &self, region_id: u64, @@ -1189,6 +1292,16 @@ impl TestPdClient { self.must_none_peer(region_id, peer); } + pub fn must_switch_witnesses( + &self, + region_id: u64, + peer_ids: Vec, + is_witnesses: Vec, + ) { + self.switch_witnesses(region_id, peer_ids.clone(), is_witnesses.clone()); + self.must_finish_switch_witnesses(region_id, peer_ids, is_witnesses); + } + pub fn must_joint_confchange( &self, region_id: u64, diff --git a/etc/error_code.toml b/etc/error_code.toml index 6b361e29e37..bb23c9b5e26 100644 --- a/etc/error_code.toml +++ b/etc/error_code.toml @@ -448,6 +448,11 @@ error = ''' KV:Raftstore:FlashbackNotPrepared ''' +["KV:Raftstore:IsWitness"] +error = ''' +KV:Raftstore:IsWitness +''' + ["KV:Raftstore:SnapAbort"] error = ''' KV:Raftstore:SnapAbort diff --git a/tests/failpoints/cases/test_witness.rs b/tests/failpoints/cases/test_witness.rs index 552434d1fed..ef178ee8aa0 100644 --- a/tests/failpoints/cases/test_witness.rs +++ b/tests/failpoints/cases/test_witness.rs @@ -4,22 +4,11 @@ use std::{iter::FromIterator, sync::Arc, time::Duration}; use collections::HashMap; use futures::executor::block_on; -use kvproto::{metapb, raft_serverpb::RaftApplyState}; +use kvproto::raft_serverpb::RaftApplyState; use pd_client::PdClient; use test_raftstore::*; use tikv_util::{config::ReadableDuration, store::find_peer}; -fn become_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { - peer.set_role(metapb::PeerRole::Learner); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - cluster.pd_client.must_remove_peer(region_id, peer.clone()); - peer.set_is_witness(true); - peer.set_id(peer.get_id() + 10); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - peer.set_role(metapb::PeerRole::Voter); - cluster.pd_client.must_add_peer(region_id, 
peer.clone()); -} - // Test the case local reader works well with witness peer. #[test] fn test_witness_update_region_in_local_reader() { @@ -35,8 +24,12 @@ fn test_witness_update_region_in_local_reader() { let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); // nonwitness -> witness - let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); cluster.must_put(b"k0", b"v0"); @@ -61,8 +54,8 @@ fn test_witness_update_region_in_local_reader() { .read(None, request.clone(), Duration::from_millis(100)) .unwrap(); assert_eq!( - resp.get_header().get_error().get_recovery_in_progress(), - &kvproto::errorpb::RecoveryInProgress { + resp.get_header().get_error().get_is_witness(), + &kvproto::errorpb::IsWitness { region_id: region.get_id(), ..Default::default() } @@ -95,8 +88,12 @@ fn test_witness_raftlog_gc_pull_voter_replicated_index() { let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); // nonwitness -> witness - let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); // make sure raft log gc is triggered std::thread::sleep(Duration::from_millis(200)); @@ -176,8 +173,12 @@ fn test_witness_raftlog_gc_after_reboot() { let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); // nonwitness -> witness - let mut peer_on_store3 = find_peer(®ion, 
nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); // make sure raft log gc is triggered std::thread::sleep(Duration::from_millis(200)); @@ -240,3 +241,235 @@ fn test_witness_raftlog_gc_after_reboot() { } fail::remove("on_raft_gc_log_tick"); } + +// Test the case request snapshot and apply successfully after non-witness +// restart. +#[test] +fn test_request_snapshot_after_reboot() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(20); + cluster.cfg.raft_store.check_request_snapshot_interval = ReadableDuration::millis(20); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + cluster.must_put(b"k1", b"v1"); + + std::thread::sleep(Duration::from_millis(100)); + must_get_none(&cluster.get_engine(3), b"k1"); + + // witness -> nonwitness + let fp = "ignore request snapshot"; + fail::cfg(fp, "return").unwrap(); + cluster + .pd_client + .switch_witnesses(region.get_id(), vec![peer_on_store3.get_id()], vec![false]); + std::thread::sleep(Duration::from_millis(500)); + // as we ignore request snapshot, so snapshot should still not applied yet + assert_eq!(cluster.pd_client.get_pending_peers().len(), 1); + 
must_get_none(&cluster.get_engine(3), b"k1"); + + cluster.stop_node(nodes[2]); + fail::remove(fp); + std::thread::sleep(Duration::from_millis(100)); + // the PeerState is Unavailable, so it will request snapshot immediately after + // start. + cluster.run_node(nodes[2]).unwrap(); + must_get_none(&cluster.get_engine(3), b"k1"); + std::thread::sleep(Duration::from_millis(500)); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + assert_eq!(cluster.pd_client.get_pending_peers().len(), 0); +} + +// Test the case request snapshot and apply successfully after term change. +#[test] +fn test_request_snapshot_after_term_change() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(20); + cluster.cfg.raft_store.check_request_snapshot_interval = ReadableDuration::millis(20); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + cluster.must_put(b"k1", b"v1"); + + std::thread::sleep(Duration::from_millis(100)); + must_get_none(&cluster.get_engine(3), b"k1"); + + // witness -> nonwitness + let fp1 = "ignore generate snapshot"; + fail::cfg(fp1, "return").unwrap(); + cluster + .pd_client + .switch_witnesses(region.get_id(), vec![peer_on_store3.get_id()], vec![false]); + std::thread::sleep(Duration::from_millis(500)); + // as we ignore generate snapshot, so snapshot should still not applied yet + assert_eq!(cluster.pd_client.get_pending_peers().len(), 
1); + must_get_none(&cluster.get_engine(3), b"k1"); + + let peer_on_store2 = find_peer(®ion, nodes[1]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store2.clone()); + // After leader changes, the `term` and `last term` no longer match, so + // continue to receive `MsgAppend` until the two get equal, then retry to + // request snapshot and complete the application. + std::thread::sleep(Duration::from_millis(500)); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + assert_eq!(cluster.pd_client.get_pending_peers().len(), 0); + fail::remove(fp1); +} + +fn test_non_witness_availability(fp: &str) { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.pd_heartbeat_tick_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.check_peers_availability_interval = ReadableDuration::millis(20); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + // non-witness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + cluster.must_put(b"k1", b"v1"); + + std::thread::sleep(Duration::from_millis(100)); + must_get_none(&cluster.get_engine(3), b"k1"); + + fail::cfg(fp, "return").unwrap(); + + // witness -> non-witness + cluster + .pd_client + .switch_witnesses(region.get_id(), vec![peer_on_store3.get_id()], vec![false]); + std::thread::sleep(Duration::from_millis(500)); + // snapshot applied + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + assert_eq!(cluster.pd_client.get_pending_peers().len(), 0); + fail::remove(fp); +} + +// Test the 
case leader pulls non-witness availability when non-witness failed +// to push the info. +#[test] +fn test_pull_non_witness_availability() { + test_non_witness_availability("ignore notify leader the peer is available"); +} + +// Test the case non-witness pushes its availability without leader pulling. +#[test] +fn test_push_non_witness_availability() { + test_non_witness_availability("ignore schedule check non-witness availability tick"); +} + +// Test the case non-witness hasn't finish applying snapshot when receives read +// request. +#[test] +fn test_non_witness_replica_read() { + let mut cluster = new_server_cluster(0, 3); + cluster.cfg.raft_store.check_request_snapshot_interval = ReadableDuration::millis(20); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k0", b"v0"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1); + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + // witness -> nonwitness + fail::cfg("ignore request snapshot", "return").unwrap(); + cluster + .pd_client + .switch_witnesses(region.get_id(), vec![peer_on_store3.get_id()], vec![false]); + std::thread::sleep(Duration::from_millis(100)); + // as we ignore request snapshot, so snapshot should still not applied yet + + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_get_cmd(b"k0")], + false, + ); + request.mut_header().set_peer(peer_on_store3.clone()); + request.mut_header().set_replica_read(true); + + let resp = cluster + .read(None, request, Duration::from_millis(100)) 
+ .unwrap(); + assert_eq!( + resp.get_header().get_error().get_is_witness(), + &kvproto::errorpb::IsWitness { + region_id: region.get_id(), + ..Default::default() + } + ); + + // start requesting snapshot and give enough time for applying snapshot to + // complete + fail::remove("ignore request snapshot"); + std::thread::sleep(Duration::from_millis(500)); + + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![new_get_cmd(b"k0")], + false, + ); + request.mut_header().set_peer(peer_on_store3); + request.mut_header().set_replica_read(true); + + let resp = cluster + .read(None, request, Duration::from_millis(100)) + .unwrap(); + assert_eq!(resp.get_header().has_error(), false); +} diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index a4e15b8fa6e..bb35b069a41 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -255,6 +255,7 @@ fn test_serde_custom_tikv_config() { max_snapshot_file_raw_size: ReadableSize::gb(10), unreachable_backoff: ReadableDuration::secs(111), check_peers_availability_interval: ReadableDuration::secs(30), + check_request_snapshot_interval: ReadableDuration::minutes(1), }; value.pd = PdConfig::new(vec!["example.com:443".to_owned()]); let titan_cf_config = TitanCfConfig { diff --git a/tests/integrations/raftstore/test_witness.rs b/tests/integrations/raftstore/test_witness.rs index 301a743588e..f35b21b08a1 100644 --- a/tests/integrations/raftstore/test_witness.rs +++ b/tests/integrations/raftstore/test_witness.rs @@ -14,28 +14,6 @@ use raft::eraftpb::ConfChangeType; use test_raftstore::*; use tikv_util::store::find_peer; -fn become_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { - peer.set_role(metapb::PeerRole::Learner); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - cluster.pd_client.must_remove_peer(region_id, peer.clone()); - peer.set_is_witness(true); - peer.set_id(peer.get_id() + 10); - 
cluster.pd_client.must_add_peer(region_id, peer.clone()); - peer.set_role(metapb::PeerRole::Voter); - cluster.pd_client.must_add_peer(region_id, peer.clone()); -} - -fn become_non_witness(cluster: &Cluster, region_id: u64, peer: &mut metapb::Peer) { - peer.set_role(metapb::PeerRole::Learner); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - cluster.pd_client.must_remove_peer(region_id, peer.clone()); - peer.set_is_witness(false); - peer.set_id(peer.get_id() + 10); - cluster.pd_client.must_add_peer(region_id, peer.clone()); - peer.set_role(metapb::PeerRole::Voter); - cluster.pd_client.must_add_peer(region_id, peer.clone()); -} - // Test the case that region split or merge with witness peer #[test] fn test_witness_split_merge() { @@ -49,9 +27,12 @@ fn test_witness_split_merge() { let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); // nonwitness -> witness - let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); - + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); let before = cluster .apply_state(region.get_id(), nodes[2]) .get_applied_index(); @@ -96,8 +77,12 @@ fn test_witness_split_merge() { assert!(find_peer(&right, nodes[2]).unwrap().is_witness); // can't merge with different witness location - let mut peer_on_store3 = find_peer(&left, nodes[2]).unwrap().clone(); - become_non_witness(&cluster, left.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(&left, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + left.get_id(), + vec![peer_on_store3.get_id()], + vec![false], + ); let left = cluster.get_region(b"k1"); let req = new_admin_request( left.get_id(), @@ -174,6 +159,8 @@ fn test_witness_conf_change() { .pd_client .must_remove_peer(region.get_id(), peer_on_store3); + 
std::thread::sleep(Duration::from_millis(10)); + assert_eq!( cluster .region_local_state(region.get_id(), nodes[2]) @@ -182,124 +169,127 @@ fn test_witness_conf_change() { ); } -// #[test] -// // Test flow of switch witness -// fn test_witness_switch_witness() { -// let mut cluster = new_server_cluster(0, 3); -// cluster.run(); -// let nodes = Vec::from_iter(cluster.get_node_ids()); -// assert_eq!(nodes.len(), 3); - -// let pd_client = Arc::clone(&cluster.pd_client); -// pd_client.disable_default_operator(); - -// cluster.must_put(b"k1", b"v1"); - -// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); -// let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); -// cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); - -// // nonwitness -> witness -// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); -// become_witness(&cluster, region.get_id(), &mut peer_on_store3); - -// std::thread::sleep(Duration::from_millis(100)); -// must_get_none(&cluster.get_engine(3), b"k1"); - -// // witness -> nonwitness -// peer_on_store3.set_role(metapb::PeerRole::Learner); -// cluster -// .pd_client -// .must_add_peer(region.get_id(), peer_on_store3.clone()); -// cluster -// .pd_client -// .must_remove_peer(region.get_id(), peer_on_store3.clone()); -// peer_on_store3.set_is_witness(false); -// cluster -// .pd_client -// .must_add_peer(region.get_id(), peer_on_store3.clone()); -// std::thread::sleep(Duration::from_millis(100)); -// must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); -// } - -// TODO: add back when switch witness is supported -// // Test the case that leader is forbidden to become witness -// #[test] -// fn test_witness_leader() { -// let mut cluster = new_server_cluster(0, 3); -// cluster.run(); -// let nodes = Vec::from_iter(cluster.get_node_ids()); -// assert_eq!(nodes.len(), 3); - -// let pd_client = Arc::clone(&cluster.pd_client); -// pd_client.disable_default_operator(); - -// cluster.must_put(b"k1", 
b"v1"); - -// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); -// let mut peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); -// cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); - -// // can't make leader to witness -// peer_on_store1.set_is_witness(true); -// cluster -// .pd_client -// .add_peer(region.get_id(), peer_on_store1.clone()); - -// std::thread::sleep(Duration::from_millis(100)); -// assert_eq!( -// cluster.leader_of_region(region.get_id()).unwrap().store_id, -// 1 -// ); -// // leader changes to witness failed, so still can get the value -// must_get_equal(&cluster.get_engine(nodes[0]), b"k1", b"v1"); - -// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); -// // can't transfer leader to witness -// cluster.transfer_leader(region.get_id(), &mut peer_on_store3); -// assert_eq!( -// cluster.leader_of_region(region.get_id()).unwrap().store_id, -// nodes[0], -// ); -// } - -// TODO: add back when election priority is supported -// // Test the case that witness can't be elected as leader based on election -// // priority when there is no log gap -// #[test] -// fn test_witness_election_priority() { -// let mut cluster = new_server_cluster(0, 3); -// cluster.run(); -// let nodes = Vec::from_iter(cluster.get_node_ids()); -// assert_eq!(nodes.len(), 3); - -// let pd_client = Arc::clone(&cluster.pd_client); -// pd_client.disable_default_operator(); - -// let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); -// // nonwitness -> witness -// let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); -// become_witness(&cluster, region.get_id(), &mut peer_on_store3); -// cluster.must_put(b"k0", b"v0"); - -// // make sure logs are replicated to the witness -// std::thread::sleep(Duration::from_millis(100)); - -// for i in 1..10 { -// let node = -// cluster.leader_of_region(region.get_id()).unwrap().store_id; cluster. 
-// stop_node(node); let (k, v) = (format!("k{}", i), format!("v{}", i)); -// let key = k.as_bytes(); -// let value = v.as_bytes(); -// cluster.must_put(key, value); -// // the witness can't be elected as the leader when there is no log -// gap assert_ne!( -// cluster.leader_of_region(region.get_id()).unwrap().store_id, -// nodes[2], -// ); -// cluster.run_node(node).unwrap(); -// } -// } +// Test flow of switch witness +#[test] +fn test_witness_switch_witness() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k1", b"v1"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + + std::thread::sleep(Duration::from_millis(100)); + must_get_none(&cluster.get_engine(3), b"k1"); + + // witness -> non-witness + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![false], + ); + + std::thread::sleep(Duration::from_millis(100)); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); +} + +// Test the case that leader is forbidden to become witness +#[test] +fn test_witness_leader() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + cluster.must_put(b"k1", b"v1"); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + let peer_on_store1 = 
find_peer(®ion, nodes[0]).unwrap().clone(); + cluster.must_transfer_leader(region.get_id(), peer_on_store1.clone()); + + // can't make leader to witness + cluster + .pd_client + .switch_witnesses(region.get_id(), vec![peer_on_store1.get_id()], vec![true]); + + std::thread::sleep(Duration::from_millis(100)); + assert_eq!( + cluster.leader_of_region(region.get_id()).unwrap().store_id, + 1 + ); + // leader changes to witness failed, so still can get the value + must_get_equal(&cluster.get_engine(nodes[0]), b"k1", b"v1"); + + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + // can't transfer leader to witness + cluster.transfer_leader(region.get_id(), peer_on_store3); + assert_eq!( + cluster.leader_of_region(region.get_id()).unwrap().store_id, + nodes[0], + ); +} + +// Test the case that witness can't be elected as leader based on election +// priority when there is no log gap +#[test] +fn test_witness_election_priority() { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + let nodes = Vec::from_iter(cluster.get_node_ids()); + assert_eq!(nodes.len(), 3); + + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let region = block_on(pd_client.get_region_by_id(1)).unwrap().unwrap(); + + // nonwitness -> witness + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); + cluster.must_put(b"k0", b"v0"); + + // make sure logs are replicated to the witness + std::thread::sleep(Duration::from_millis(100)); + + for i in 1..10 { + let node = cluster.leader_of_region(region.get_id()).unwrap().store_id; + cluster.stop_node(node); + let (k, v) = (format!("k{}", i), format!("v{}", i)); + let key = k.as_bytes(); + let value = v.as_bytes(); + cluster.must_put(key, value); + // the witness can't be elected as the leader when there is no log gap + assert_ne!( + 
cluster.leader_of_region(region.get_id()).unwrap().store_id, + nodes[2], + ); + cluster.run_node(node).unwrap(); + // make sure logs are replicated to the restarted node + std::thread::sleep(Duration::from_millis(100)); + } +} // Test the case that truncated index won't advance when there is a witness even // if the gap gap exceeds the gc count limit @@ -320,8 +310,12 @@ fn test_witness_raftlog_gc_lagged_follower() { let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); // nonwitness -> witness - let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); // make sure raft log gc is triggered std::thread::sleep(Duration::from_millis(200)); @@ -391,8 +385,12 @@ fn test_witness_raftlog_gc_lagged_witness() { let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); // nonwitness -> witness - let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); cluster.must_put(b"k0", b"v0"); // make sure raft log gc is triggered @@ -447,8 +445,12 @@ fn test_witness_replica_read() { let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); // nonwitness -> witness - let mut peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); - become_witness(&cluster, region.get_id(), &mut peer_on_store3); + let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); + 
cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store3.get_id()], + vec![true], + ); let mut request = new_request( region.get_id(), @@ -463,15 +465,15 @@ fn test_witness_replica_read() { .read(None, request, Duration::from_millis(100)) .unwrap(); assert_eq!( - resp.get_header().get_error().get_recovery_in_progress(), - &kvproto::errorpb::RecoveryInProgress { + resp.get_header().get_error().get_is_witness(), + &kvproto::errorpb::IsWitness { region_id: region.get_id(), ..Default::default() } ); } -fn must_get_error_recovery_in_progress( +fn must_get_error_is_witness( cluster: &mut Cluster, region: &metapb::Region, cmd: kvproto::raft_cmdpb::Request, @@ -486,8 +488,8 @@ fn must_get_error_recovery_in_progress( .call_command_on_leader(req, Duration::from_millis(100)) .unwrap(); assert_eq!( - resp.get_header().get_error().get_recovery_in_progress(), - &kvproto::errorpb::RecoveryInProgress { + resp.get_header().get_error().get_is_witness(), + &kvproto::errorpb::IsWitness { region_id: region.get_id(), ..Default::default() }, @@ -513,9 +515,13 @@ fn test_witness_leader_down() { let peer_on_store1 = find_peer(®ion, nodes[0]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store1); - let mut peer_on_store2 = find_peer(®ion, nodes[1]).unwrap().clone(); + let peer_on_store2 = find_peer(®ion, nodes[1]).unwrap().clone(); // nonwitness -> witness - become_witness(&cluster, region.get_id(), &mut peer_on_store2); + cluster.pd_client.must_switch_witnesses( + region.get_id(), + vec![peer_on_store2.get_id()], + vec![true], + ); // the other follower is isolated cluster.add_send_filter(IsolationFilterFactory::new(3)); @@ -530,13 +536,13 @@ fn test_witness_leader_down() { // forbid writes let put = new_put_cmd(b"k3", b"v3"); - must_get_error_recovery_in_progress(&mut cluster, ®ion, put); + must_get_error_is_witness(&mut cluster, ®ion, put); // forbid reads let get = new_get_cmd(b"k1"); - must_get_error_recovery_in_progress(&mut cluster, 
®ion, get); + must_get_error_is_witness(&mut cluster, ®ion, get); // forbid read index let read_index = new_read_index_cmd(); - must_get_error_recovery_in_progress(&mut cluster, ®ion, read_index); + must_get_error_is_witness(&mut cluster, ®ion, read_index); let peer_on_store3 = find_peer(®ion, nodes[2]).unwrap().clone(); cluster.must_transfer_leader(region.get_id(), peer_on_store3); From f178f781048bef4930a8e82fd08c3e194e9f8ae4 Mon Sep 17 00:00:00 2001 From: Hu# Date: Fri, 20 Jan 2023 14:57:49 +0800 Subject: [PATCH 094/115] resource_manager: add watch for resource group (#14022) close tikv/tikv#13983 - add etcd mock for pd - add service for resource group Signed-off-by: husharp Co-authored-by: Ti Chi Robot --- Cargo.lock | 10 + components/pd_client/src/client.rs | 67 +++- components/pd_client/src/errors.rs | 6 +- components/pd_client/src/lib.rs | 24 +- components/resource_control/Cargo.toml | 10 + components/resource_control/src/lib.rs | 3 + .../resource_control/src/resource_group.rs | 7 +- components/resource_control/src/service.rs | 267 ++++++++++++++++ components/server/src/server.rs | 16 +- components/server/src/server2.rs | 16 +- components/test_pd/Cargo.toml | 3 + components/test_pd/src/lib.rs | 1 + components/test_pd/src/mocker/etcd.rs | 288 ++++++++++++++++++ components/test_pd/src/mocker/mod.rs | 61 +++- components/test_pd/src/server.rs | 67 +++- components/tikv_util/src/worker/pool.rs | 7 + tests/failpoints/cases/test_pd_client.rs | 54 ---- .../failpoints/cases/test_pd_client_legacy.rs | 111 ++++--- 18 files changed, 866 insertions(+), 152 deletions(-) create mode 100644 components/resource_control/src/service.rs create mode 100644 components/test_pd/src/mocker/etcd.rs diff --git a/Cargo.lock b/Cargo.lock index e9f55d1923d..ee047aaae6d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4676,15 +4676,22 @@ dependencies = [ "byteorder", "crossbeam-skiplist", "dashmap", + "fail", + "futures 0.3.15", "kvproto", "lazy_static", "online_config", + "pd_client", 
"pin-project", "prometheus", + "protobuf", "serde", "slog", "slog-global", + "test_pd", + "test_pd_client", "tikv_util", + "tokio", "yatp", ] @@ -5838,11 +5845,14 @@ dependencies = [ "futures 0.3.15", "grpcio", "kvproto", + "log_wrappers", "pd_client", "security", "slog", "slog-global", "tikv_util", + "tokio", + "tokio-stream", ] [[package]] diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index 1e1e5980908..6686c4e8a04 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -1,7 +1,6 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. use std::{ - collections::HashMap, fmt, sync::{ atomic::{AtomicU64, Ordering}, @@ -286,9 +285,46 @@ impl fmt::Debug for RpcClient { const LEADER_CHANGE_RETRY: usize = 10; impl PdClient for RpcClient { - fn load_global_config(&self, config_path: String) -> PdFuture> { - use kvproto::pdpb::LoadGlobalConfigRequest; - let mut req = LoadGlobalConfigRequest::new(); + fn store_global_config( + &self, + config_path: String, + items: Vec, + ) -> PdFuture<()> { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["store_global_config"]) + .start_coarse_timer(); + + let mut req = pdpb::StoreGlobalConfigRequest::new(); + req.set_config_path(config_path); + req.set_changes(items.into()); + let executor = move |client: &Client, req| match client + .inner + .rl() + .client_stub + .store_global_config_async(&req) + { + Ok(grpc_response) => Box::pin(async move { + if let Err(err) = grpc_response.await { + return Err(box_err!("{:?}", err)); + } + Ok(()) + }) as PdFuture<_>, + Err(err) => Box::pin(async move { Err(box_err!("{:?}", err)) }) as PdFuture<_>, + }; + self.pd_client + .request(req, executor, LEADER_CHANGE_RETRY) + .execute() + } + + fn load_global_config( + &self, + config_path: String, + ) -> PdFuture<(Vec, i64)> { + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["load_global_config"]) + .start_coarse_timer(); + + let mut req = 
pdpb::LoadGlobalConfigRequest::new(); req.set_config_path(config_path); let executor = |client: &Client, req| match client .inner @@ -299,13 +335,10 @@ impl PdClient for RpcClient { { Ok(grpc_response) => Box::pin(async move { match grpc_response.await { - Ok(grpc_response) => { - let mut res = HashMap::with_capacity(grpc_response.get_items().len()); - for c in grpc_response.get_items() { - res.insert(c.get_name().to_owned(), c.get_value().to_owned()); - } - Ok(res) - } + Ok(grpc_response) => Ok(( + Vec::from(grpc_response.get_items()), + grpc_response.get_revision(), + )), Err(err) => Err(box_err!("{:?}", err)), } }) as PdFuture<_>, @@ -318,9 +351,17 @@ impl PdClient for RpcClient { fn watch_global_config( &self, + config_path: String, + revision: i64, ) -> Result> { - use kvproto::pdpb::WatchGlobalConfigRequest; - let req = WatchGlobalConfigRequest::default(); + let _timer = PD_REQUEST_HISTOGRAM_VEC + .with_label_values(&["watch_global_config"]) + .start_coarse_timer(); + + let mut req = pdpb::WatchGlobalConfigRequest::default(); + info!("[global_config] start watch global config"; "path" => &config_path, "revision" => revision); + req.set_config_path(config_path); + req.set_revision(revision); sync_request(&self.pd_client, LEADER_CHANGE_RETRY, |client, _| { client.watch_global_config(&req) }) diff --git a/components/pd_client/src/errors.rs b/components/pd_client/src/errors.rs index 689cb276064..5bacca03354 100644 --- a/components/pd_client/src/errors.rs +++ b/components/pd_client/src/errors.rs @@ -35,12 +35,14 @@ pub type Result = result::Result; impl Error { pub fn retryable(&self) -> bool { match self { - Error::Grpc(_) | Error::ClusterNotBootstrapped(_) | Error::StreamDisconnect(_) => true, + Error::Grpc(_) + | Error::ClusterNotBootstrapped(_) + | Error::StreamDisconnect(_) + | Error::DataCompacted(_) => true, Error::Other(_) | Error::RegionNotFound(_) | Error::StoreTombstone(_) | Error::GlobalConfigNotFound(_) - | Error::DataCompacted(_) | 
Error::ClusterBootstrapped(_) | Error::Incompatible => false, } diff --git a/components/pd_client/src/lib.rs b/components/pd_client/src/lib.rs index 46a3e6924db..b877750770d 100644 --- a/components/pd_client/src/lib.rs +++ b/components/pd_client/src/lib.rs @@ -14,15 +14,14 @@ mod util; mod config; pub mod errors; -use std::{cmp::Ordering, collections::HashMap, ops::Deref, sync::Arc, time::Duration}; +use std::{cmp::Ordering, ops::Deref, sync::Arc, time::Duration}; use futures::future::BoxFuture; -use grpcio::ClientSStreamReceiver; use kvproto::{ metapb, pdpb, replication_modepb::{RegionReplicationStatus, ReplicationStatus, StoreDrAutoSyncStatus}, }; -use pdpb::{QueryStats, WatchGlobalConfigResponse}; +use pdpb::QueryStats; use tikv_util::time::{Instant, UnixSecs}; use txn_types::TimeStamp; @@ -201,6 +200,8 @@ impl BucketStat { } pub const INVALID_ID: u64 = 0; +// TODO: Implementation of config registration for each module +pub const RESOURCE_CONTROL_CONFIG_PATH: &str = "resource_group/settings"; /// PdClient communicates with Placement Driver (PD). /// Because now one PD only supports one cluster, so it is no need to pass @@ -209,17 +210,28 @@ pub const INVALID_ID: u64 = 0; /// all the time. 
pub trait PdClient: Send + Sync { /// Load a list of GlobalConfig - fn load_global_config(&self, _config_path: String) -> PdFuture> { + fn load_global_config( + &self, + _config_path: String, + ) -> PdFuture<(Vec, i64)> { unimplemented!(); } /// Store a list of GlobalConfig - fn store_global_config(&self, _list: HashMap) -> PdFuture<()> { + fn store_global_config( + &self, + _config_path: String, + _items: Vec, + ) -> PdFuture<()> { unimplemented!(); } /// Watching change of GlobalConfig - fn watch_global_config(&self) -> Result> { + fn watch_global_config( + &self, + _config_path: String, + _revision: i64, + ) -> Result> { unimplemented!(); } diff --git a/components/resource_control/Cargo.toml b/components/resource_control/Cargo.toml index 822aed2cd2d..3f796627040 100644 --- a/components/resource_control/Cargo.toml +++ b/components/resource_control/Cargo.toml @@ -4,17 +4,27 @@ version = "0.0.1" edition = "2021" publish = false +[features] +failpoints = ["fail/failpoints"] + [dependencies] byteorder = "1.2" crossbeam-skiplist = "0.1" dashmap = "5.1" +fail = "0.5" +futures = { version = "0.3" } kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.0" online_config = { workspace = true } +pd_client = { workspace = true } pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } +protobuf = { version = "2.8", features = ["bytes"] } serde = { version = "1.0", features = ["derive"] } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } +test_pd = { workspace = true } +test_pd_client = { workspace = true } tikv_util = { workspace = true } +tokio = { version = "1.5", features = ["time"] } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } diff --git a/components/resource_control/src/lib.rs b/components/resource_control/src/lib.rs index 
eb6679f71e8..5534ed2153d 100644 --- a/components/resource_control/src/lib.rs +++ b/components/resource_control/src/lib.rs @@ -11,6 +11,9 @@ pub use resource_group::{ mod future; pub use future::ControlledFuture; +mod service; +pub use service::ResourceManagerService; + #[derive(Clone, Serialize, Deserialize, PartialEq, Debug, OnlineConfig, Default)] #[serde(default)] #[serde(rename_all = "kebab-case")] diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index bfe9d92d0f3..23a50b42560 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -295,18 +295,19 @@ impl GroupPriorityTracker { } #[cfg(test)] -mod tests { - use kvproto::resource_manager::*; +pub(crate) mod tests { use yatp::queue::Extras; use super::*; - fn new_resource_group( + pub fn new_resource_group( name: String, is_ru_mode: bool, read_tokens: u64, write_tokens: u64, ) -> ResourceGroup { + use kvproto::resource_manager::{GroupRawResourceSettings, GroupRequestUnitSettings}; + let mut group = ResourceGroup::new(); group.set_name(name); let mode = if is_ru_mode { diff --git a/components/resource_control/src/service.rs b/components/resource_control/src/service.rs new file mode 100644 index 00000000000..ea9a9d724b9 --- /dev/null +++ b/components/resource_control/src/service.rs @@ -0,0 +1,267 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{sync::Arc, time::Duration}; + +use futures::StreamExt; +use kvproto::{pdpb::EventType, resource_manager::ResourceGroup}; +use pd_client::{Error as PdError, PdClient, RpcClient, RESOURCE_CONTROL_CONFIG_PATH}; +use tikv_util::error; + +use crate::ResourceGroupManager; + +#[derive(Clone)] +pub struct ResourceManagerService { + manager: Arc, + pd_client: Arc, + // record watch revision + revision: i64, +} + +impl ResourceManagerService { + /// Constructs a new `Service` with `ResourceGroupManager` and a `RpcClient` + pub fn new( + manager: Arc, + pd_client: Arc, + ) -> ResourceManagerService { + ResourceManagerService { + pd_client, + manager, + revision: 0, + } + } +} + +impl ResourceManagerService { + pub async fn watch_resource_groups(&mut self) { + // Firstly, load all resource groups as of now. + let (groups, revision) = self.list_resource_groups().await; + self.revision = revision; + groups + .into_iter() + .for_each(|rg| self.manager.add_resource_group(rg)); + // Secondly, start watcher at loading revision. + loop { + match self + .pd_client + .watch_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string(), self.revision) + { + Ok(mut stream) => { + while let Some(grpc_response) = stream.next().await { + match grpc_response { + Ok(r) => { + self.revision = r.get_revision(); + r.get_changes() + .iter() + .for_each(|item| match item.get_kind() { + EventType::Put => { + if let Ok(group) = + protobuf::parse_from_bytes::( + item.get_value().as_bytes(), + ) + { + self.manager.add_resource_group(group); + } + } + EventType::Delete => { + self.manager.remove_resource_group(item.get_name()); + } + }); + } + Err(err) => { + error!("failed to get stream"; "err" => ?err); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + } + Err(PdError::DataCompacted(msg)) => { + error!("required revision has been compacted"; "err" => ?msg); + // If the etcd revision is compacted, we need to reload all resouce groups. 
+ let (groups, revision) = self.list_resource_groups().await; + self.revision = revision; + groups + .into_iter() + .for_each(|rg| self.manager.add_resource_group(rg)); + } + Err(err) => { + error!("failed to watch resource groups"; "err" => ?err); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + } + + async fn list_resource_groups(&mut self) -> (Vec, i64) { + loop { + match self + .pd_client + .load_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string()) + .await + { + Ok((items, revision)) => { + let groups = items + .into_iter() + .filter_map(|g| protobuf::parse_from_bytes(g.get_value().as_bytes()).ok()) + .collect(); + return (groups, revision); + } + Err(err) => { + error!("failed to load global config"; "err" => ?err); + tokio::time::sleep(Duration::from_secs(1)).await; + } + } + } + } +} + +#[cfg(test)] +pub mod tests { + use std::time::Duration; + + use futures::executor::block_on; + use kvproto::pdpb::GlobalConfigItem; + use pd_client::RpcClient; + use protobuf::Message; + use test_pd::{mocker::Service, util::*, Server as MockServer}; + use tikv_util::{config::ReadableDuration, worker::Builder}; + + use crate::resource_group::tests::new_resource_group; + + fn new_test_server_and_client( + update_interval: ReadableDuration, + ) -> (MockServer, RpcClient) { + let server = MockServer::new(1); + let eps = server.bind_addrs(); + let client = new_client_with_update_interval(eps, None, update_interval); + (server, client) + } + + fn add_resource_group(pd_client: Arc, group: ResourceGroup) { + let mut item = GlobalConfigItem::default(); + item.set_kind(EventType::Put); + item.set_name(group.get_name().to_string()); + let mut buf = Vec::new(); + group.write_to_vec(&mut buf).unwrap(); + item.set_value(String::from_utf8(buf).unwrap()); + + futures::executor::block_on(async move { + pd_client + .store_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string(), vec![item]) + .await + }) + .unwrap(); + } + + fn delete_resource_group(pd_client: Arc, name: 
&str) { + let mut item = GlobalConfigItem::default(); + item.set_kind(EventType::Delete); + item.set_name(name.to_string()); + + futures::executor::block_on(async move { + pd_client + .store_global_config(RESOURCE_CONTROL_CONFIG_PATH.to_string(), vec![item]) + .await + }) + .unwrap(); + } + + use super::*; + #[test] + fn crud_config_test() { + let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let resource_manager = ResourceGroupManager::default(); + + let mut s = ResourceManagerService::new(Arc::new(resource_manager), Arc::new(client)); + let group = new_resource_group("TEST".into(), true, 100, 100); + add_resource_group(s.pd_client.clone(), group); + let (res, revision) = block_on(s.list_resource_groups()); + assert_eq!(res.len(), 1); + assert_eq!(revision, 1); + + delete_resource_group(s.pd_client.clone(), "TEST"); + let (res, revision) = block_on(s.list_resource_groups()); + assert_eq!(res.len(), 0); + assert_eq!(revision, 2); + + server.stop(); + } + + #[test] + fn watch_config_test() { + let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let resource_manager = ResourceGroupManager::default(); + + let mut s = ResourceManagerService::new(Arc::new(resource_manager), Arc::new(client)); + let (res, revision) = block_on(s.list_resource_groups()); + assert_eq!(res.len(), 0); + assert_eq!(revision, 0); + + let background_worker = Builder::new("background").thread_count(1).create(); + let mut s_clone = s.clone(); + background_worker.spawn_async_task(async move { + s_clone.watch_resource_groups().await; + }); + // Mock add + let group1 = new_resource_group("TEST1".into(), true, 100, 100); + add_resource_group(s.pd_client.clone(), group1); + let group2 = new_resource_group("TEST2".into(), true, 100, 100); + add_resource_group(s.pd_client.clone(), group2); + // Mock modify + let group2 = new_resource_group("TEST2".into(), true, 50, 50); + add_resource_group(s.pd_client.clone(), group2); + let (res, 
revision) = block_on(s.list_resource_groups()); + assert_eq!(res.len(), 2); + assert_eq!(revision, 3); + // Mock delete + delete_resource_group(s.pd_client.clone(), "TEST1"); + let (res, revision) = block_on(s.list_resource_groups()); + assert_eq!(res.len(), 1); + assert_eq!(revision, 4); + // Wait for watcher + std::thread::sleep(Duration::from_millis(100)); + let groups = s.manager.get_all_resource_groups(); + assert_eq!(groups.len(), 1); + assert!(s.manager.get_resource_group("TEST1").is_none()); + let group = s.manager.get_resource_group("TEST2").unwrap(); + assert_eq!( + group + .value() + .get_r_u_settings() + .get_r_r_u() + .get_settings() + .get_fill_rate(), + 50 + ); + server.stop(); + } + + #[test] + fn reboot_watch_server_test() { + let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let resource_manager = ResourceGroupManager::default(); + + let s = ResourceManagerService::new(Arc::new(resource_manager), Arc::new(client)); + let background_worker = Builder::new("background").thread_count(1).create(); + let mut s_clone = s.clone(); + background_worker.spawn_async_task(async move { + s_clone.watch_resource_groups().await; + }); + // Mock add + let group1 = new_resource_group("TEST1".into(), true, 100, 100); + add_resource_group(s.pd_client.clone(), group1); + // Mock reboot watch server + let watch_global_config_fp = "watch_global_config_return"; + fail::cfg(watch_global_config_fp, "return").unwrap(); + std::thread::sleep(Duration::from_millis(100)); + fail::remove(watch_global_config_fp); + // Mock add after rebooting will success + let group1 = new_resource_group("TEST2".into(), true, 100, 100); + add_resource_group(s.pd_client.clone(), group1); + // Wait watcher update + std::thread::sleep(Duration::from_secs(1)); + let groups = s.manager.get_all_resource_groups(); + assert_eq!(groups.len(), 2); + + server.stop(); + } +} diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 
97fd1f77eef..207373313a4 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -82,7 +82,9 @@ use raftstore::{ }, RaftRouterCompactedEventSender, }; -use resource_control::{ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL}; +use resource_control::{ + ResourceGroupManager, ResourceManagerService, MIN_PRIORITY_UPDATE_INTERVAL, +}; use security::SecurityManager; use snap_recovery::RecoveryService; use tikv::{ @@ -330,11 +332,17 @@ where let resource_manager = if config.resource_control.enabled { let mgr = Arc::new(ResourceGroupManager::default()); - let mgr1 = mgr.clone(); + let mut resource_mgr_service = + ResourceManagerService::new(mgr.clone(), pd_client.clone()); // spawn a task to periodically update the minimal virtual time of all resource - // group. + // groups. + let resource_mgr = mgr.clone(); background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { - mgr1.advance_min_virtual_time(); + resource_mgr.advance_min_virtual_time(); + }); + // spawn a task to watch all resource groups update. 
+ background_worker.spawn_async_task(async move { + resource_mgr_service.watch_resource_groups().await; }); Some(mgr) } else { diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 36a02130fdb..f193e1c7445 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -65,7 +65,9 @@ use raftstore::{ RegionInfoAccessor, }; use raftstore_v2::{router::RaftRouter, StateStorage}; -use resource_control::{ResourceGroupManager, MIN_PRIORITY_UPDATE_INTERVAL}; +use resource_control::{ + ResourceGroupManager, ResourceManagerService, MIN_PRIORITY_UPDATE_INTERVAL, +}; use security::SecurityManager; use tikv::{ config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, @@ -294,11 +296,17 @@ where let resource_manager = if config.resource_control.enabled { let mgr = Arc::new(ResourceGroupManager::default()); - let mgr1 = mgr.clone(); + let mut resource_mgr_service = + ResourceManagerService::new(mgr.clone(), pd_client.clone()); // spawn a task to periodically update the minimal virtual time of all resource - // group. + // groups. + let resource_mgr = mgr.clone(); background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { - mgr1.advance_min_virtual_time(); + resource_mgr.advance_min_virtual_time(); + }); + // spawn a task to watch all resource groups update. 
+ background_worker.spawn_async_task(async move { + resource_mgr_service.watch_resource_groups().await; }); Some(mgr) } else { diff --git a/components/test_pd/Cargo.toml b/components/test_pd/Cargo.toml index a478e6ee325..6277789b194 100644 --- a/components/test_pd/Cargo.toml +++ b/components/test_pd/Cargo.toml @@ -10,8 +10,11 @@ fail = "0.5" futures = "0.3" grpcio = { workspace = true } kvproto = { workspace = true } +log_wrappers = { workspace = true } pd_client = { workspace = true } security = { workspace = true } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } tikv_util = { workspace = true } +tokio = { version = "1.0", features = ["full"] } +tokio-stream = "0.1" diff --git a/components/test_pd/src/lib.rs b/components/test_pd/src/lib.rs index 187a899d7fb..bd768e58318 100644 --- a/components/test_pd/src/lib.rs +++ b/components/test_pd/src/lib.rs @@ -1,4 +1,5 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. +#![feature(slice_group_by)] #[macro_use] extern crate tikv_util; diff --git a/components/test_pd/src/mocker/etcd.rs b/components/test_pd/src/mocker/etcd.rs new file mode 100644 index 00000000000..3939dfc9a72 --- /dev/null +++ b/components/test_pd/src/mocker/etcd.rs @@ -0,0 +1,288 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + cell::Cell, + collections::{BTreeMap, HashMap}, + ops::Bound, + sync::Arc, +}; + +use futures::lock::Mutex; +use tokio::sync::mpsc::{self, Sender}; +use tokio_stream::wrappers::ReceiverStream; + +use super::Result; + +/// An in-memory, single versioned storage. +/// Emulating some interfaces of etcd for testing. 
+#[derive(Default, Debug)] +pub struct Etcd { + items: BTreeMap, + subs: HashMap, + revision: i64, + sub_id_alloc: Cell, +} + +pub type EtcdClient = Arc>; + +impl Etcd { + fn alloc_rev(&mut self) -> i64 { + self.revision += 1; + self.revision + } + + pub fn get_revision(&self) -> i64 { + self.revision + } + + pub fn get_key(&self, keys: Keys) -> (Vec, i64) { + let (start_key, end_key) = keys.into_bound(); + let kvs = self + .items + .range(( + Bound::Included(&Key(start_key, 0)), + Bound::Excluded(&Key(end_key, self.revision)), + )) + .collect::>() + .as_slice() + .group_by(|item1, item2| item1.0.0 == item2.0.0) + .filter_map(|group| { + let (k, v) = group.last()?; + match v { + Value::Val(val) => Some(KeyValue(MetaKey(k.0.clone()), val.clone())), + Value::Del => None, + } + }) + .fold(Vec::new(), |mut items, item| { + items.push(item); + items + }); + + (kvs, self.get_revision()) + } + + pub async fn set(&mut self, mut pair: KeyValue) -> Result<()> { + let rev = self.alloc_rev(); + for sub in self.subs.values() { + if pair.key() < sub.end_key.as_slice() && pair.key() >= sub.start_key.as_slice() { + sub.tx + .send(KvEvent { + kind: KvEventType::Put, + pair: pair.clone(), + }) + .await + .unwrap(); + } + } + self.items + .insert(Key(pair.take_key(), rev), Value::Val(pair.take_value())); + Ok(()) + } + + pub async fn delete(&mut self, keys: Keys) -> Result<()> { + let (start_key, end_key) = keys.into_bound(); + let rev = self.alloc_rev(); + let mut v = self + .items + .range(( + Bound::Included(Key(start_key, 0)), + Bound::Excluded(Key(end_key, self.revision)), + )) + .map(|(k, _)| Key::clone(k)) + .collect::>(); + v.dedup_by(|k1, k2| k1.0 == k2.0); + + for mut victim in v { + let k = Key(victim.0.clone(), rev); + self.items.insert(k, Value::Del); + + for sub in self.subs.values() { + if victim.0.as_slice() < sub.end_key.as_slice() + && victim.0.as_slice() >= sub.start_key.as_slice() + { + sub.tx + .send(KvEvent { + kind: KvEventType::Delete, + pair: 
KeyValue(MetaKey(std::mem::take(&mut victim.0)), vec![]), + }) + .await + .unwrap(); + } + } + } + Ok(()) + } + + pub async fn watch(&mut self, keys: Keys, start_rev: i64) -> Result> { + let id = self.sub_id_alloc.get(); + self.sub_id_alloc.set(id + 1); + let (tx, rx) = mpsc::channel(1024); + let (start_key, end_key) = keys.into_bound(); + + // Sending events from [start_rev, now) to the client. + let mut pending = self + .items + .range(( + Bound::Included(Key(start_key.clone(), 0)), + Bound::Excluded(Key(end_key.clone(), self.revision)), + )) + .filter(|(k, _)| k.1 >= start_rev) + .collect::>(); + pending.sort_by_key(|(k, _)| k.1); + for (k, v) in pending { + let event = match v { + Value::Val(val) => KvEvent { + kind: KvEventType::Put, + pair: KeyValue(MetaKey(k.0.clone()), val.clone()), + }, + Value::Del => KvEvent { + kind: KvEventType::Delete, + pair: KeyValue(MetaKey(k.0.clone()), vec![]), + }, + }; + tx.send(event).await.expect("too many pending events"); + } + + self.subs.insert( + id, + Subscriber { + start_key, + end_key, + tx, + }, + ); + Ok(ReceiverStream::new(rx)) + } + + pub fn clear_subs(&mut self) { + self.subs.clear(); + self.sub_id_alloc.set(0); + } + + /// A tool for dumpling the whole storage when test failed. + /// Add this to test code temporarily for debugging. + #[allow(dead_code)] + pub fn dump(&self) { + println!(">>>>>>> /etc (revision = {}) <<<<<<<", self.revision); + for (k, v) in self.items.iter() { + println!("{:?} => {:?}", k, v); + } + } +} + +#[derive(Clone, Debug)] +pub struct MetaKey(pub Vec); + +impl MetaKey { + /// return the key that keeps the range [self, self.next()) contains only + /// `self`. + pub fn next(&self) -> Self { + let mut next = self.clone(); + next.0.push(0); + next + } + + /// return the key that keeps the range [self, self.next_prefix()) contains + /// all keys with the prefix `self`. 
+ pub fn next_prefix(&self) -> Self { + let mut next_prefix = self.clone(); + for i in (0..next_prefix.0.len()).rev() { + if next_prefix.0[i] == u8::MAX { + next_prefix.0.pop(); + } else { + next_prefix.0[i] += 1; + break; + } + } + next_prefix + } +} + +/// A simple key value pair of metadata. +#[derive(Clone, Debug)] +pub struct KeyValue(pub MetaKey, pub Vec); + +impl KeyValue { + pub fn key(&self) -> &[u8] { + self.0.0.as_slice() + } + + pub fn value(&self) -> &[u8] { + self.1.as_slice() + } + + pub fn take_key(&mut self) -> Vec { + std::mem::take(&mut self.0.0) + } + + pub fn take_value(&mut self) -> Vec { + std::mem::take(&mut self.1) + } +} + +#[derive(Debug)] +pub enum KvEventType { + Put, + Delete, +} + +#[derive(Debug)] +pub struct KvEvent { + pub kind: KvEventType, + pub pair: KeyValue, +} + +#[derive(Debug)] +struct Subscriber { + start_key: Vec, + end_key: Vec, + tx: Sender, +} + +/// A key with revision. +#[derive(Default, Eq, PartialEq, Ord, PartialOrd, Clone)] +struct Key(Vec, i64); + +impl std::fmt::Debug for Key { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("Key") + .field(&format_args!( + "{}@{}", + log_wrappers::Value::key(&self.0), + self.1 + )) + .finish() + } +} + +/// A value (maybe tombstone.) +#[derive(Debug, PartialEq, Clone)] +enum Value { + Val(Vec), + Del, +} + +/// The key set for getting. +#[derive(Debug)] +pub enum Keys { + Prefix(MetaKey), + Range(MetaKey, MetaKey), + Key(MetaKey), +} + +impl Keys { + /// convert the key set for corresponding key range. 
+ pub fn into_bound(self) -> (Vec, Vec) { + match self { + Keys::Prefix(x) => { + let next = x.next_prefix().0; + ((x.0), (next)) + } + Keys::Range(start, end) => ((start.0), (end.0)), + Keys::Key(k) => { + let next = k.next().0; + ((k.0), (next)) + } + } + } +} diff --git a/components/test_pd/src/mocker/mod.rs b/components/test_pd/src/mocker/mod.rs index 84c2508d4ea..b9ae839b06e 100644 --- a/components/test_pd/src/mocker/mod.rs +++ b/components/test_pd/src/mocker/mod.rs @@ -2,15 +2,18 @@ use std::result; +use futures::executor::block_on; use kvproto::pdpb::*; mod bootstrap; +pub mod etcd; mod incompatible; mod leader_change; mod retry; mod service; mod split; +use self::etcd::{EtcdClient, KeyValue, Keys, MetaKey}; pub use self::{ bootstrap::AlreadyBootstrapped, incompatible::Incompatible, @@ -28,28 +31,62 @@ pub trait PdMocker { fn load_global_config( &self, _req: &LoadGlobalConfigRequest, + etcd_client: EtcdClient, ) -> Option> { - let mut send = vec![]; - for r in 0..10 { - let mut i = GlobalConfigItem::default(); - i.set_name(format!("/global/config/{}", r)); - i.set_value(r.to_string()); - send.push(i); - } let mut res = LoadGlobalConfigResponse::default(); - res.set_items(send.into()); + let mut items = Vec::new(); + let (resp, revision) = block_on(async move { + etcd_client.lock().await.get_key(Keys::Range( + MetaKey(b"".to_vec()), + MetaKey(b"\xff".to_vec()), + )) + }); + + let values: Vec = resp + .iter() + .map(|kv| { + let mut item = GlobalConfigItem::default(); + item.set_name(String::from_utf8(kv.key().to_vec()).unwrap()); + item.set_value(String::from_utf8(kv.value().to_vec()).unwrap()); + item + }) + .collect(); + + items.extend(values); + res.set_revision(revision); + res.set_items(items.into()); Some(Ok(res)) } fn store_global_config( &self, - _: &StoreGlobalConfigRequest, + req: &StoreGlobalConfigRequest, + etcd_client: EtcdClient, ) -> Option> { - unimplemented!() + for item in req.get_changes() { + let cli = etcd_client.clone(); + block_on(async 
move { + match item.get_kind() { + EventType::Put => { + let kv = KeyValue(MetaKey(item.get_name().into()), item.get_value().into()); + cli.lock().await.set(kv).await + } + EventType::Delete => { + let key = Keys::Key(MetaKey(item.get_name().into())); + cli.lock().await.delete(key).await + } + } + }) + .unwrap(); + } + Some(Ok(StoreGlobalConfigResponse::default())) } - fn watch_global_config(&self) -> Option> { - panic!("could not mock this function due to it should return a stream") + fn watch_global_config( + &self, + _req: &WatchGlobalConfigRequest, + ) -> Option> { + unimplemented!() } fn get_members(&self, _: &GetMembersRequest) -> Option> { diff --git a/components/test_pd/src/server.rs b/components/test_pd/src/server.rs index 9e1a2b3bb0f..cb495307a1f 100644 --- a/components/test_pd/src/server.rs +++ b/components/test_pd/src/server.rs @@ -1,6 +1,7 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + str::from_utf8, sync::{ atomic::{AtomicI64, Ordering}, Arc, @@ -20,6 +21,7 @@ use pd_client::Error as PdError; use security::*; use super::mocker::*; +use crate::mocker::etcd::{EtcdClient, Keys, KvEventType, MetaKey}; pub struct Server { server: Option, @@ -57,6 +59,7 @@ impl Server { default_handler, case, tso_logical: Arc::new(AtomicI64::default()), + etcd_client: EtcdClient::default(), }; let mut server = Server { server: None, @@ -170,6 +173,7 @@ struct PdMock { default_handler: Arc, case: Option>, tso_logical: Arc, + etcd_client: EtcdClient, } impl Clone for PdMock { @@ -178,6 +182,7 @@ impl Clone for PdMock { default_handler: Arc::clone(&self.default_handler), case: self.case.clone(), tso_logical: self.tso_logical.clone(), + etcd_client: self.etcd_client.clone(), } } } @@ -189,39 +194,71 @@ impl Pd for PdMock { req: LoadGlobalConfigRequest, sink: UnarySink, ) { - hijack_unary(self, ctx, sink, |c| c.load_global_config(&req)) + let cli = self.etcd_client.clone(); + hijack_unary(self, ctx, sink, |c| c.load_global_config(&req, 
cli.clone())) } fn store_global_config( &mut self, - _ctx: RpcContext<'_>, - _req: StoreGlobalConfigRequest, - _sink: UnarySink, + ctx: RpcContext<'_>, + req: StoreGlobalConfigRequest, + sink: UnarySink, ) { - unimplemented!() + let cli = self.etcd_client.clone(); + hijack_unary(self, ctx, sink, |c| { + c.store_global_config(&req, cli.clone()) + }) } fn watch_global_config( &mut self, ctx: RpcContext<'_>, - _req: WatchGlobalConfigRequest, + req: WatchGlobalConfigRequest, mut sink: ServerStreamingSink, ) { - ctx.spawn(async move { - let mut name: usize = 0; - loop { + let cli = self.etcd_client.clone(); + let future = async move { + let mut watcher = match cli + .lock() + .await + .watch( + Keys::Range(MetaKey(b"".to_vec()), MetaKey(b"\xff".to_vec())), + req.revision, + ) + .await + { + Ok(w) => w, + Err(err) => { + error!("failed to watch: {:?}", err); + return; + } + }; + + while let Some(event) = watcher.as_mut().recv().await { + info!("watch event from etcd"; "event" => ?event); let mut change = GlobalConfigItem::new(); - change.set_name(format!("/global/config/{:?}", name).to_owned()); - change.set_value(format!("{:?}", name)); + change.set_kind(match event.kind { + KvEventType::Put => EventType::Put, + KvEventType::Delete => EventType::Delete, + }); + change.set_name(from_utf8(event.pair.key()).unwrap().to_string()); + change.set_value(from_utf8(event.pair.value()).unwrap().to_string()); let mut wc = WatchGlobalConfigResponse::default(); wc.set_changes(vec![change].into()); - // simulate network delay - std::thread::sleep(Duration::from_millis(10)); - name += 1; let _ = sink.send((wc, WriteFlags::default())).await; let _ = sink.flush().await; + #[cfg(feature = "failpoints")] + { + use futures::executor::block_on; + let cli_clone = cli.clone(); + fail_point!("watch_global_config_return", |_| { + block_on(async move { cli_clone.lock().await.clear_subs() }); + watcher.close(); + }); + } } - }) + }; + ctx.spawn(future); } fn get_members( diff --git 
a/components/tikv_util/src/worker/pool.rs b/components/tikv_util/src/worker/pool.rs index e761fac8bb5..26dbf495f54 100644 --- a/components/tikv_util/src/worker/pool.rs +++ b/components/tikv_util/src/worker/pool.rs @@ -405,6 +405,13 @@ impl Worker { }); } + pub fn spawn_async_task(&self, f: F) + where + F: Future + Send + 'static, + { + self.remote.spawn(f); + } + fn delay_notify(tx: UnboundedSender>, timeout: Duration) { let now = Instant::now(); let f = GLOBAL_TIMER_HANDLE diff --git a/tests/failpoints/cases/test_pd_client.rs b/tests/failpoints/cases/test_pd_client.rs index 7dd767d19c9..92942fa90f9 100644 --- a/tests/failpoints/cases/test_pd_client.rs +++ b/tests/failpoints/cases/test_pd_client.rs @@ -97,60 +97,6 @@ fn test_pd_client_deadlock() { fail::remove(pd_client_reconnect_fp); } -#[test] -fn test_load_global_config() { - let (mut _server, mut client) = new_test_server_and_client(ReadableDuration::millis(100)); - let res = futures::executor::block_on(async move { - client.load_global_config("global".to_string()).await - }); - for (k, v) in res.unwrap() { - assert_eq!(k, format!("/global/config/{}", v)) - } -} - -#[test] -fn test_watch_global_config_on_closed_server() { - let (mut server, mut client) = new_test_server_and_client(ReadableDuration::millis(100)); - use futures::StreamExt; - let j = std::thread::spawn(move || { - let mut r = client.watch_global_config().unwrap(); - block_on(async move { - let mut i: usize = 0; - while let Some(r) = r.next().await { - match r { - Ok(res) => { - let change = &res.get_changes()[0]; - assert_eq!( - change - .get_name() - .split('/') - .collect::>() - .last() - .unwrap() - .to_owned(), - format!("{:?}", i) - ); - assert_eq!(change.get_value().to_owned(), format!("{:?}", i)); - i += 1; - } - Err(e) => { - if let grpcio::Error::RpcFailure(e) = e { - // 14-UNAVAILABLE - assert_eq!(e.code(), grpcio::RpcStatusCode::from(14)); - break; - } else { - panic!("other error occur {:?}", e) - } - } - } - } - }); - }); - 
thread::sleep(Duration::from_millis(200)); - server.stop(); - j.join().unwrap(); -} - // Updating pd leader may be slow, we need to make sure it does not block other // RPC in the same gRPC Environment. #[test] diff --git a/tests/failpoints/cases/test_pd_client_legacy.rs b/tests/failpoints/cases/test_pd_client_legacy.rs index 172db8ac09e..3638e448bd9 100644 --- a/tests/failpoints/cases/test_pd_client_legacy.rs +++ b/tests/failpoints/cases/test_pd_client_legacy.rs @@ -7,11 +7,11 @@ use std::{ }; use grpcio::EnvBuilder; -use kvproto::metapb::*; +use kvproto::{metapb::*, pdpb::GlobalConfigItem}; use pd_client::{PdClient, RegionInfo, RegionStat, RpcClient}; use security::{SecurityConfig, SecurityManager}; use test_pd::{mocker::*, util::*, Server as MockServer}; -use tikv_util::config::ReadableDuration; +use tikv_util::{config::ReadableDuration, worker::Builder}; fn new_test_server_and_client( update_interval: ReadableDuration, @@ -108,57 +108,90 @@ fn test_pd_client_deadlock() { #[test] fn test_load_global_config() { let (mut _server, client) = new_test_server_and_client(ReadableDuration::millis(100)); - let res = - futures::executor::block_on( - async move { client.load_global_config("global".into()).await }, - ); - for (k, v) in res.unwrap() { - assert_eq!(k, format!("/global/config/{}", v)) + let global_items = vec![("test1", "val1"), ("test2", "val2"), ("test3", "val3")]; + let check_items = global_items.clone(); + if let Err(err) = futures::executor::block_on( + client.store_global_config( + String::from("global"), + global_items + .iter() + .map(|(name, value)| { + let mut item = GlobalConfigItem::default(); + item.set_name(name.to_string()); + item.set_value(value.to_string()); + item + }) + .collect::>(), + ), + ) { + panic!("error occur {:?}", err); } + + let (res, revision) = + futures::executor::block_on(client.load_global_config(String::from("global"))).unwrap(); + assert!( + res.iter() + .zip(check_items) + .all(|(item1, item2)| item1.name == item2.0 && 
item1.value == item2.1) + ); + assert_eq!(revision, 3); } #[test] fn test_watch_global_config_on_closed_server() { let (mut server, client) = new_test_server_and_client(ReadableDuration::millis(100)); + let global_items = vec![("test1", "val1"), ("test2", "val2"), ("test3", "val3")]; + let items_clone = global_items.clone(); + let client = Arc::new(client); + let cli_clone = client.clone(); use futures::StreamExt; - let j = std::thread::spawn(move || { - futures::executor::block_on(async move { - let mut r = client.watch_global_config().unwrap(); - let mut i: usize = 0; - while let Some(r) = r.next().await { - match r { - Ok(res) => { - let change = &res.get_changes()[0]; - assert_eq!( - change - .get_name() - .split('/') - .collect::>() - .last() - .unwrap() - .to_owned(), - format!("{:?}", i) - ); - assert_eq!(change.get_value().to_owned(), format!("{:?}", i)); - i += 1; - } - Err(e) => { - if let grpcio::Error::RpcFailure(e) = e { - // 14-UNAVAILABLE - assert_eq!(e.code(), grpcio::RpcStatusCode::from(14)); - break; - } else { - panic!("other error occur {:?}", e) + let background_worker = Builder::new("background").thread_count(1).create(); + background_worker.spawn_async_task(async move { + match cli_clone.watch_global_config("global".into(), 0) { + Ok(mut stream) => { + let mut i: usize = 0; + while let Some(grpc_response) = stream.next().await { + match grpc_response { + Ok(r) => { + for item in r.get_changes() { + assert_eq!(item.get_name(), items_clone[i].0); + assert_eq!(item.get_value(), items_clone[i].1); + i += 1; + } } + Err(err) => panic!("failed to get stream, err: {:?}", err), } } } - }); + Err(err) => { + if !err.to_string().contains("UNAVAILABLE") { + // Not 14-UNAVAILABLE + panic!("other error occur {:?}", err) + } + } + } }); - thread::sleep(Duration::from_millis(200)); + + if let Err(err) = futures::executor::block_on( + client.store_global_config( + "global".into(), + global_items + .iter() + .map(|(name, value)| { + let mut item = 
GlobalConfigItem::default(); + item.set_name(name.to_string()); + item.set_value(value.to_string()); + item + }) + .collect::>(), + ), + ) { + panic!("error occur {:?}", err); + } + + thread::sleep(Duration::from_millis(100)); server.stop(); - j.join().unwrap(); } // Updating pd leader may be slow, we need to make sure it does not block other From 9726e56e5b667649504e3ec636f12843bc94ff8d Mon Sep 17 00:00:00 2001 From: Connor Date: Fri, 20 Jan 2023 22:43:49 +0800 Subject: [PATCH 095/115] batch-system: add priority scheduling for batch system (#14065) ref tikv/tikv#13730 Support priority-based scheduling for the apply batch system. Signed-off-by: Connor1996 --- Cargo.lock | 8 +- components/batch-system/Cargo.toml | 2 + .../batch-system/benches/batch-system.rs | 6 +- components/batch-system/benches/router.rs | 2 +- components/batch-system/src/batch.rs | 101 ++----- components/batch-system/src/channel.rs | 252 ++++++++++++++++++ components/batch-system/src/fsm.rs | 18 +- components/batch-system/src/lib.rs | 3 +- components/batch-system/src/mailbox.rs | 2 + components/batch-system/src/test_runner.rs | 19 +- components/batch-system/tests/cases/batch.rs | 105 +++++++- components/batch-system/tests/cases/router.rs | 4 +- components/raftstore-v2/Cargo.toml | 1 + components/raftstore-v2/src/batch/store.rs | 2 +- .../raftstore-v2/src/operation/command/mod.rs | 1 + components/raftstore-v2/src/router/message.rs | 5 + components/raftstore/Cargo.toml | 1 + .../raftstore/src/store/entry_storage.rs | 7 + components/raftstore/src/store/fsm/apply.rs | 55 +++- components/raftstore/src/store/fsm/peer.rs | 6 +- components/raftstore/src/store/fsm/store.rs | 17 +- components/raftstore/src/store/msg.rs | 5 + components/raftstore/src/store/peer.rs | 5 +- components/raftstore/src/store/util.rs | 38 ++- .../src/store/worker/refresh_config.rs | 2 +- .../resource_control/src/resource_group.rs | 15 +- components/server/src/server.rs | 2 +- components/test_raftstore/src/cluster.rs | 12 +- 
components/test_raftstore/src/node.rs | 2 +- components/test_raftstore/src/server.rs | 8 +- components/tikv_util/Cargo.toml | 2 +- .../tikv_util/src/mpsc/priority_queue.rs | 46 ++-- .../integrations/config/dynamic/raftstore.rs | 2 +- tests/integrations/config/dynamic/snap.rs | 3 +- .../integrations/raftstore/test_bootstrap.rs | 2 +- 35 files changed, 614 insertions(+), 147 deletions(-) create mode 100644 components/batch-system/src/channel.rs diff --git a/Cargo.lock b/Cargo.lock index ee047aaae6d..d288af846a6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -581,9 +581,11 @@ dependencies = [ "derive_more", "fail", "file_system", + "kvproto", "lazy_static", "online_config", "prometheus", + "resource_control", "serde", "serde_derive", "slog", @@ -2727,7 +2729,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#adcf4c414bfd0ccf18436b377430aa2450fd4c81" +source = "git+https://github.com/pingcap/kvproto.git#009f31598ac3200dc8b32e18f96fc4deb7b32e48" dependencies = [ "futures 0.3.15", "grpcio", @@ -4312,6 +4314,7 @@ dependencies = [ "raft", "raft-proto", "rand 0.8.5", + "resource_control", "resource_metering", "serde", "serde_derive", @@ -4362,6 +4365,7 @@ dependencies = [ "raft-proto", "raftstore", "rand 0.8.5", + "resource_control", "resource_metering", "slog", "slog-global", @@ -6565,7 +6569,7 @@ dependencies = [ "openssl", "page_size", "panic_hook", - "parking_lot 0.12.1", + "parking_lot_core 0.9.1", "pin-project", "procfs", "procinfo", diff --git a/components/batch-system/Cargo.toml b/components/batch-system/Cargo.toml index 7fe5798f833..75a0230c188 100644 --- a/components/batch-system/Cargo.toml +++ b/components/batch-system/Cargo.toml @@ -13,9 +13,11 @@ crossbeam = "0.8" derive_more = { version = "0.99", optional = true } fail = "0.5" file_system = { workspace = true } +kvproto = { git = "https://github.com/pingcap/kvproto.git" } lazy_static = "1.3" online_config = { workspace = true } prometheus = { 
version = "0.13", default-features = false, features = ["nightly"] } +resource_control = { workspace = true } serde = { version = "1.0", features = ["derive"] } serde_derive = "1.0" slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } diff --git a/components/batch-system/benches/batch-system.rs b/components/batch-system/benches/batch-system.rs index c248eabaf04..9edf72f0ff9 100644 --- a/components/batch-system/benches/batch-system.rs +++ b/components/batch-system/benches/batch-system.rs @@ -20,7 +20,7 @@ fn end_hook(tx: &std::sync::mpsc::Sender<()>) -> Message { fn bench_spawn_many(c: &mut Criterion) { let (control_tx, control_fsm) = Runner::new(100000); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); system.spawn("test".to_owned(), Builder::new()); const ID_LIMIT: u64 = 32; const MESSAGE_LIMIT: usize = 256; @@ -55,7 +55,7 @@ fn bench_spawn_many(c: &mut Criterion) { fn bench_imbalance(c: &mut Criterion) { let (control_tx, control_fsm) = Runner::new(100000); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); system.spawn("test".to_owned(), Builder::new()); const ID_LIMIT: u64 = 10; const MESSAGE_LIMIT: usize = 512; @@ -92,7 +92,7 @@ fn bench_imbalance(c: &mut Criterion) { fn bench_fairness(c: &mut Criterion) { let (control_tx, control_fsm) = Runner::new(100000); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); system.spawn("test".to_owned(), Builder::new()); let state_cnt = Arc::new(AtomicUsize::new(0)); for id in 0..10 { diff --git a/components/batch-system/benches/router.rs b/components/batch-system/benches/router.rs index 
3dd7e282e15..e25ee58b94d 100644 --- a/components/batch-system/benches/router.rs +++ b/components/batch-system/benches/router.rs @@ -8,7 +8,7 @@ use criterion::*; fn bench_send(c: &mut Criterion) { let (control_tx, control_fsm) = Runner::new(100000); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); system.spawn("test".to_owned(), Builder::new()); let (normal_tx, normal_fsm) = Runner::new(100000); let normal_box = BasicMailbox::new(normal_tx, normal_fsm, Arc::default()); diff --git a/components/batch-system/src/batch.rs b/components/batch-system/src/batch.rs index 4d935ad4819..48ef809d421 100644 --- a/components/batch-system/src/batch.rs +++ b/components/batch-system/src/batch.rs @@ -15,15 +15,16 @@ use std::{ time::Duration, }; -use crossbeam::channel::{self, SendError}; use fail::fail_point; use file_system::{set_io_type, IoType}; +use resource_control::ResourceController; use tikv_util::{ debug, error, info, mpsc, safe_panic, sys::thread::StdThreadBuildWrapper, thd_name, - time::Instant, warn, + time::Instant, }; use crate::{ + channel::{fsm_channel, ControlScheduler, FsmReceiver, FsmSender, NormalScheduler}, config::Config, fsm::{Fsm, FsmScheduler, Priority}, mailbox::BasicMailbox, @@ -37,60 +38,6 @@ pub enum FsmTypes { // Used as a signal that scheduler should be shutdown. Empty, } - -// A macro to introduce common definition of scheduler. -macro_rules! 
impl_sched { - ($name:ident, $ty:path,Fsm = $fsm:tt) => { - pub struct $name { - sender: channel::Sender>, - low_sender: channel::Sender>, - } - - impl Clone for $name { - #[inline] - fn clone(&self) -> $name { - $name { - sender: self.sender.clone(), - low_sender: self.low_sender.clone(), - } - } - } - - impl FsmScheduler for $name - where - $fsm: Fsm, - { - type Fsm = $fsm; - - #[inline] - fn schedule(&self, fsm: Box) { - let sender = match fsm.get_priority() { - Priority::Normal => &self.sender, - Priority::Low => &self.low_sender, - }; - match sender.send($ty(fsm)) { - Ok(()) => {} - // TODO: use debug instead. - Err(SendError($ty(fsm))) => warn!("failed to schedule fsm {:p}", fsm), - _ => unreachable!(), - } - } - - fn shutdown(&self) { - // TODO: close it explicitly once it's supported. - // Magic number, actually any number greater than poll pool size works. - for _ in 0..256 { - let _ = self.sender.send(FsmTypes::Empty); - let _ = self.low_sender.send(FsmTypes::Empty); - } - } - } - }; -} - -impl_sched!(NormalScheduler, FsmTypes::Normal, Fsm = N); -impl_sched!(ControlScheduler, FsmTypes::Control, Fsm = C); - pub struct NormalFsm { fsm: Box, timer: Instant, @@ -168,7 +115,7 @@ impl Batch { /// /// When pending messages of the FSM is different than `expected_len`, /// attempts to schedule it in this poller again. Returns the `fsm` if the - /// re-scheduling suceeds. + /// re-scheduling succeeds. fn release(&mut self, mut fsm: NormalFsm, expected_len: usize) -> Option> { let mailbox = fsm.take_mailbox().unwrap(); mailbox.release(fsm.fsm); @@ -341,7 +288,7 @@ pub trait PollHandler: Send + 'static { /// Internal poller that fetches batch and call handler hooks for readiness. 
pub struct Poller { pub router: Router, ControlScheduler>, - pub fsm_receiver: channel::Receiver>, + pub fsm_receiver: FsmReceiver, pub handler: Handler, pub max_batch_size: usize, pub reschedule_duration: Duration, @@ -534,8 +481,8 @@ pub trait HandlerBuilder { pub struct BatchSystem { name_prefix: Option, router: BatchRouter, - receiver: channel::Receiver>, - low_receiver: channel::Receiver>, + receiver: FsmReceiver, + low_receiver: FsmReceiver, pool_size: usize, max_batch_size: usize, workers: Arc>>>, @@ -649,15 +596,15 @@ where } } -struct PoolStateBuilder { +struct PoolStateBuilder { max_batch_size: usize, reschedule_duration: Duration, - fsm_receiver: channel::Receiver>, - fsm_sender: channel::Sender>, + fsm_receiver: FsmReceiver, + fsm_sender: FsmSender, pool_size: usize, } -impl PoolStateBuilder { +impl PoolStateBuilder { fn build>( self, name_prefix: String, @@ -683,11 +630,11 @@ impl PoolStateBuilder { } } -pub struct PoolState> { +pub struct PoolState> { pub name_prefix: String, pub handler_builder: H, - pub fsm_receiver: channel::Receiver>, - pub fsm_sender: channel::Sender>, + pub fsm_receiver: FsmReceiver, + pub fsm_sender: FsmSender, pub low_priority_pool_size: usize, pub expected_pool_size: usize, pub workers: Arc>>>, @@ -707,32 +654,32 @@ pub fn create_system( cfg: &Config, sender: mpsc::LooseBoundedSender, controller: Box, + resource_ctl: Option>, ) -> (BatchRouter, BatchSystem) { let state_cnt = Arc::new(AtomicUsize::new(0)); let control_box = BasicMailbox::new(sender, controller, state_cnt.clone()); - let (tx, rx) = channel::unbounded(); - let (tx2, rx2) = channel::unbounded(); + let (sender, receiver) = fsm_channel(resource_ctl); + let (low_sender, low_receiver) = fsm_channel(None); // no resource control for low fsm let normal_scheduler = NormalScheduler { - sender: tx.clone(), - low_sender: tx2.clone(), + sender: sender.clone(), + low_sender, }; let control_scheduler = ControlScheduler { - sender: tx.clone(), - low_sender: tx2, + sender: 
sender.clone(), }; let pool_state_builder = PoolStateBuilder { max_batch_size: cfg.max_batch_size(), reschedule_duration: cfg.reschedule_duration.0, - fsm_receiver: rx.clone(), - fsm_sender: tx, + fsm_receiver: receiver.clone(), + fsm_sender: sender, pool_size: cfg.pool_size, }; let router = Router::new(control_box, normal_scheduler, control_scheduler, state_cnt); let system = BatchSystem { name_prefix: None, router: router.clone(), - receiver: rx, - low_receiver: rx2, + receiver, + low_receiver, pool_size: cfg.pool_size, max_batch_size: cfg.max_batch_size(), workers: Arc::new(Mutex::new(Vec::new())), diff --git a/components/batch-system/src/channel.rs b/components/batch-system/src/channel.rs new file mode 100644 index 00000000000..094b6a7a2ae --- /dev/null +++ b/components/batch-system/src/channel.rs @@ -0,0 +1,252 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{cell::RefCell, sync::Arc}; + +use crossbeam::channel::{self, RecvError, SendError, TryRecvError, TrySendError}; +use kvproto::kvrpcpb::CommandPri; +use resource_control::{ResourceConsumeType, ResourceController}; +use tikv_util::{mpsc::priority_queue, warn}; + +use crate::{ + fsm::{Fsm, FsmScheduler, Priority, ResourceMetered}, + FsmTypes, +}; + +pub fn fsm_channel( + resource_ctl: Option>, +) -> (FsmSender, FsmReceiver) { + if let Some(ctl) = resource_ctl { + let (tx, rx) = priority_queue::unbounded(); + ( + FsmSender::Priority { + resource_ctl: ctl, + sender: tx, + last_msg_group: RefCell::new(String::new()), + }, + FsmReceiver::Priority(rx), + ) + } else { + let (tx, rx) = channel::unbounded(); + (FsmSender::Vanilla(tx), FsmReceiver::Vanilla(rx)) + } +} + +pub struct NormalScheduler { + pub(crate) sender: FsmSender, + pub(crate) low_sender: FsmSender, +} + +impl Clone for NormalScheduler +where + N: Fsm, + C: Fsm, +{ + fn clone(&self) -> Self { + NormalScheduler { + sender: self.sender.clone(), + low_sender: self.low_sender.clone(), + } + } +} + +impl FsmScheduler for 
NormalScheduler +where + N: Fsm, + C: Fsm, +{ + type Fsm = N; + + fn consume_msg_resource(&self, msg: &::Message) { + self.sender.consume_msg_resource(msg); + } + + #[inline] + fn schedule(&self, fsm: Box) { + let sender = match fsm.get_priority() { + Priority::Normal => &self.sender, + Priority::Low => &self.low_sender, + }; + + match sender.send(FsmTypes::Normal(fsm)) { + Ok(()) => {} + Err(SendError(FsmTypes::Normal(fsm))) => warn!("failed to schedule fsm {:p}", fsm), + _ => unreachable!(), + } + } + + fn shutdown(&self) { + // TODO: close it explicitly once it's supported. + // Magic number, actually any number greater than poll pool size works. + for _ in 0..256 { + let _ = self.sender.send(FsmTypes::Empty); + let _ = self.low_sender.send(FsmTypes::Empty); + } + } +} + +pub struct ControlScheduler { + pub(crate) sender: FsmSender, +} + +impl Clone for ControlScheduler +where + N: Fsm, + C: Fsm, +{ + fn clone(&self) -> Self { + ControlScheduler { + sender: self.sender.clone(), + } + } +} + +impl FsmScheduler for ControlScheduler +where + N: Fsm, + C: Fsm, +{ + type Fsm = C; + + fn consume_msg_resource(&self, _msg: &::Message) {} + + #[inline] + fn schedule(&self, fsm: Box) { + match self.sender.send(FsmTypes::Control(fsm)) { + Ok(()) => {} + Err(SendError(FsmTypes::Control(fsm))) => warn!("failed to schedule fsm {:p}", fsm), + _ => unreachable!(), + } + } + + fn shutdown(&self) { + // TODO: close it explicitly once it's supported. + // Magic number, actually any number greater than poll pool size works. + for _ in 0..256 { + let _ = self.sender.send(FsmTypes::Empty); + } + } +} + +pub enum FsmSender { + Vanilla(channel::Sender>), + Priority { + resource_ctl: Arc, + sender: priority_queue::Sender>, + last_msg_group: RefCell, + }, +} + +impl Clone for FsmSender +where + N: Fsm, + C: Fsm, +{ + fn clone(&self) -> Self { + match self { + FsmSender::Vanilla(sender) => FsmSender::Vanilla(sender.clone()), + FsmSender::Priority { + resource_ctl, + sender, + .. 
+ } => FsmSender::Priority { + resource_ctl: resource_ctl.clone(), + sender: sender.clone(), + last_msg_group: RefCell::new(String::new()), + }, + } + } +} + +impl FsmSender { + pub fn send(&self, fsm: FsmTypes) -> Result<(), SendError>> { + match self { + FsmSender::Vanilla(sender) => sender.send(fsm), + FsmSender::Priority { + resource_ctl, + sender, + last_msg_group, + } => { + // TODO: pass different priority + let pri = resource_ctl + .get_priority(last_msg_group.borrow().as_bytes(), CommandPri::Normal); + sender.send(fsm, pri) + } + } + } + + pub fn try_send(&self, fsm: FsmTypes) -> Result<(), TrySendError>> { + match self { + FsmSender::Vanilla(sender) => sender.try_send(fsm), + FsmSender::Priority { + resource_ctl, + sender, + last_msg_group, + } => { + let priority = resource_ctl + .get_priority(last_msg_group.borrow().as_bytes(), CommandPri::Normal); + sender.try_send(fsm, priority) + } + } + } + + fn consume_msg_resource(&self, msg: &N::Message) { + match self { + FsmSender::Vanilla(_) => {} + FsmSender::Priority { + resource_ctl, + last_msg_group, + .. 
+ } => { + if let Some(mut groups) = msg.get_resource_consumptions() { + let mut dominant_group = "".to_owned(); + let mut max_write_bytes = 0; + for (group_name, write_bytes) in groups.drain() { + resource_ctl.consume( + group_name.as_bytes(), + ResourceConsumeType::IoBytes(write_bytes), + ); + if write_bytes > max_write_bytes { + dominant_group = group_name; + max_write_bytes = write_bytes; + } + } + *last_msg_group.borrow_mut() = dominant_group; + } + } + } + } +} + +pub enum FsmReceiver { + Vanilla(channel::Receiver>), + Priority(priority_queue::Receiver>), +} + +impl Clone for FsmReceiver +where + N: Fsm, + C: Fsm, +{ + fn clone(&self) -> Self { + match self { + FsmReceiver::Vanilla(receiver) => FsmReceiver::Vanilla(receiver.clone()), + FsmReceiver::Priority(receiver) => FsmReceiver::Priority(receiver.clone()), + } + } +} + +impl FsmReceiver { + pub fn recv(&self) -> Result, RecvError> { + match self { + FsmReceiver::Vanilla(receiver) => receiver.recv(), + FsmReceiver::Priority(receiver) => receiver.recv(), + } + } + + pub fn try_recv(&self) -> Result, TryRecvError> { + match self { + FsmReceiver::Vanilla(receiver) => receiver.try_recv(), + FsmReceiver::Priority(receiver) => receiver.try_recv(), + } + } +} diff --git a/components/batch-system/src/fsm.rs b/components/batch-system/src/fsm.rs index 09e32333c96..5d9e009fa01 100644 --- a/components/batch-system/src/fsm.rs +++ b/components/batch-system/src/fsm.rs @@ -10,6 +10,8 @@ use std::{ usize, }; +use collections::HashMap; + use crate::mailbox::BasicMailbox; #[derive(Clone, Copy, Debug, PartialEq)] @@ -24,15 +26,26 @@ pub trait FsmScheduler { /// Schedule a Fsm for later handling. fn schedule(&self, fsm: Box); + /// Shutdown the scheduler, which indicates that resources like /// background thread pool should be released. fn shutdown(&self); + + /// Consume the resources of msg in resource controller if enabled, + /// otherwise do nothing. 
+ fn consume_msg_resource(&self, msg: &::Message); +} + +pub trait ResourceMetered { + fn get_resource_consumptions(&self) -> Option> { + None + } } /// A `Fsm` is a finite state machine. It should be able to be notified for /// updating internal state according to incoming messages. -pub trait Fsm { - type Message: Send; +pub trait Fsm: Send + 'static { + type Message: Send + ResourceMetered; fn is_stopped(&self) -> bool; @@ -42,6 +55,7 @@ pub trait Fsm { Self: Sized, { } + /// Take the mailbox from FSM. Implementation should ensure there will be /// no reference to mailbox after calling this method. fn take_mailbox(&mut self) -> Option> diff --git a/components/batch-system/src/lib.rs b/components/batch-system/src/lib.rs index 9a307a534ac..f4f799dcc9a 100644 --- a/components/batch-system/src/lib.rs +++ b/components/batch-system/src/lib.rs @@ -1,6 +1,7 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. mod batch; +mod channel; mod config; mod fsm; mod mailbox; @@ -16,7 +17,7 @@ pub use self::{ PollHandler, Poller, PoolState, }, config::Config, - fsm::{Fsm, FsmScheduler, Priority}, + fsm::{Fsm, FsmScheduler, Priority, ResourceMetered}, mailbox::{BasicMailbox, Mailbox}, router::Router, }; diff --git a/components/batch-system/src/mailbox.rs b/components/batch-system/src/mailbox.rs index 5afddf73c14..869031392af 100644 --- a/components/batch-system/src/mailbox.rs +++ b/components/batch-system/src/mailbox.rs @@ -75,6 +75,7 @@ impl BasicMailbox { msg: Owner::Message, scheduler: &S, ) -> Result<(), SendError> { + scheduler.consume_msg_resource(&msg); self.sender.force_send(msg)?; self.state.notify(scheduler, Cow::Borrowed(self)); Ok(()) @@ -89,6 +90,7 @@ impl BasicMailbox { msg: Owner::Message, scheduler: &S, ) -> Result<(), TrySendError> { + scheduler.consume_msg_resource(&msg); self.sender.try_send(msg)?; self.state.notify(scheduler, Cow::Borrowed(self)); Ok(()) diff --git a/components/batch-system/src/test_runner.rs 
b/components/batch-system/src/test_runner.rs index 6be64d5d695..a3ae80dc55a 100644 --- a/components/batch-system/src/test_runner.rs +++ b/components/batch-system/src/test_runner.rs @@ -11,10 +11,11 @@ use std::{ }, }; +use collections::HashMap; use derive_more::{Add, AddAssign}; use tikv_util::mpsc; -use crate::*; +use crate::{fsm::ResourceMetered, *}; /// Message `Runner` can accepts. pub enum Message { @@ -22,6 +23,21 @@ pub enum Message { Loop(usize), /// `Runner` will call the callback directly. Callback(Box), + /// group name, write bytes + Resource(String, u64), +} + +impl ResourceMetered for Message { + fn get_resource_consumptions(&self) -> Option> { + match self { + Message::Resource(group_name, bytes) => { + let mut map = HashMap::default(); + map.insert(group_name.to_owned(), *bytes); + Some(map) + } + _ => None, + } + } } /// A simple runner used for benchmarking only. @@ -102,6 +118,7 @@ impl Handler { } } Ok(Message::Callback(cb)) => cb(self, r), + Ok(Message::Resource(..)) => {} Err(_) => break, } } diff --git a/components/batch-system/tests/cases/batch.rs b/components/batch-system/tests/cases/batch.rs index f950df68b8d..dc13affc363 100644 --- a/components/batch-system/tests/cases/batch.rs +++ b/components/batch-system/tests/cases/batch.rs @@ -7,13 +7,15 @@ use std::{ }; use batch_system::{test_runner::*, *}; +use kvproto::resource_manager::{GroupMode, GroupRawResourceSettings, ResourceGroup}; +use resource_control::ResourceGroupManager; use tikv_util::mpsc; #[test] fn test_batch() { let (control_tx, control_fsm) = Runner::new(10); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); let builder = Builder::new(); let metrics = builder.metrics.clone(); system.spawn("test".to_owned(), builder); @@ -55,7 +57,7 @@ fn test_batch() { fn test_priority() { let (control_tx, control_fsm) = Runner::new(10); let (router, mut system) 
= - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); let builder = Builder::new(); system.spawn("test".to_owned(), builder); let (tx, rx) = mpsc::unbounded(); @@ -101,3 +103,102 @@ fn test_priority() { .unwrap(); assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(3)); } + +#[test] +fn test_resource_group() { + let (control_tx, control_fsm) = Runner::new(10); + let resource_manager = ResourceGroupManager::default(); + + let get_group = |name: &str, read_tokens: u64, write_tokens: u64| -> ResourceGroup { + let mut group = ResourceGroup::new(); + group.set_name(name.to_string()); + group.set_mode(GroupMode::RawMode); + let mut resource_setting = GroupRawResourceSettings::new(); + resource_setting + .mut_cpu() + .mut_settings() + .set_fill_rate(read_tokens); + resource_setting + .mut_io_write() + .mut_settings() + .set_fill_rate(write_tokens); + group.set_raw_resource_settings(resource_setting); + group + }; + + resource_manager.add_resource_group(get_group("group1", 10, 10)); + resource_manager.add_resource_group(get_group("group2", 100, 100)); + + let mut cfg = Config::default(); + cfg.pool_size = 1; + let (router, mut system) = batch_system::create_system( + &cfg, + control_tx, + control_fsm, + Some(resource_manager.derive_controller("test".to_string(), false)), + ); + let builder = Builder::new(); + system.spawn("test".to_owned(), builder); + let (tx, rx) = mpsc::unbounded(); + let tx_ = tx.clone(); + let r = router.clone(); + let state_cnt = Arc::new(AtomicUsize::new(0)); + router + .send_control(Message::Callback(Box::new( + move |_: &Handler, _: &mut Runner| { + let (tx, runner) = Runner::new(10); + r.register(1, BasicMailbox::new(tx, runner, state_cnt.clone())); + let (tx2, runner2) = Runner::new(10); + r.register(2, BasicMailbox::new(tx2, runner2, state_cnt)); + tx_.send(0).unwrap(); + }, + ))) + .unwrap(); + 
assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(0)); + + let tx_ = tx.clone(); + let (tx1, rx1) = std::sync::mpsc::sync_channel(0); + // block the thread + router + .send_control(Message::Callback(Box::new( + move |_: &Handler, _: &mut Runner| { + tx_.send(0).unwrap(); + tx1.send(0).unwrap(); + }, + ))) + .unwrap(); + assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(0)); + + router + .send(1, Message::Resource("group1".to_string(), 1)) + .unwrap(); + let tx_ = tx.clone(); + router + .send( + 1, + Message::Callback(Box::new(move |_: &Handler, _: &mut Runner| { + tx_.send(1).unwrap(); + })), + ) + .unwrap(); + + router + .send(2, Message::Resource("group2".to_string(), 1)) + .unwrap(); + router + .send( + 2, + Message::Callback(Box::new(move |_: &Handler, _: &mut Runner| { + tx.send(2).unwrap(); + })), + ) + .unwrap(); + + // pause the blocking thread + assert_eq!(rx1.recv_timeout(Duration::from_secs(3)), Ok(0)); + + // should recv from group2 first, because group2 has more tokens and it would be + // handled with higher priority. 
+ assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(2)); + assert_eq!(rx.recv_timeout(Duration::from_secs(3)), Ok(1)); +} diff --git a/components/batch-system/tests/cases/router.rs b/components/batch-system/tests/cases/router.rs index 543937fa8ef..d746dfad5cb 100644 --- a/components/batch-system/tests/cases/router.rs +++ b/components/batch-system/tests/cases/router.rs @@ -30,7 +30,7 @@ fn test_basic() { let (control_drop_tx, control_drop_rx) = mpsc::unbounded(); control_fsm.sender = Some(control_drop_tx); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); let builder = Builder::new(); system.spawn("test".to_owned(), builder); @@ -130,7 +130,7 @@ fn test_basic() { fn test_router_trace() { let (control_tx, control_fsm) = Runner::new(10); let (router, mut system) = - batch_system::create_system(&Config::default(), control_tx, control_fsm); + batch_system::create_system(&Config::default(), control_tx, control_fsm, None); let builder = Builder::new(); system.spawn("test".to_owned(), builder); diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 6726c5ed742..5b917b9ddf7 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -52,6 +52,7 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code raft-proto = { version = "0.7.0" } raftstore = { workspace = true } rand = "0.8.3" +resource_control = { workspace = true } resource_metering = { workspace = true } slog = "2.3" smallvec = "1.4" diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 280e8dcc396..1c7360a86bc 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -749,7 +749,7 @@ where { let (store_tx, store_fsm) = StoreFsm::new(cfg, store_id, logger.clone()); let (router, 
system) = - batch_system::create_system(&cfg.store_batch_system, store_tx, store_fsm); + batch_system::create_system(&cfg.store_batch_system, store_tx, store_fsm, None); let system = StoreSystem { system, workers: None, diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index cf29d9ee25a..edca9510c27 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -590,6 +590,7 @@ impl Apply { AdminCmdType::InvalidAdmin => { return Err(box_err!("invalid admin command type")); } + AdminCmdType::UpdateGcPeer => unimplemented!(), }; match admin_result { diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index c1e5f0d37dc..a9353e171d9 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -2,6 +2,7 @@ // #[PerformanceCriticalPath] +use batch_system::ResourceMetered; use kvproto::{ metapb, raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}, @@ -197,6 +198,8 @@ pub enum PeerMsg { WaitFlush(super::FlushChannel), } +impl ResourceMetered for PeerMsg {} + impl PeerMsg { pub fn raft_query(req: RaftCmdRequest) -> (Self, QueryResSubscriber) { let (ch, sub) = QueryResChannel::pair(); @@ -259,3 +262,5 @@ pub enum StoreMsg { Start, StoreUnreachable { to_store_id: u64 }, } + +impl ResourceMetered for StoreMsg {} diff --git a/components/raftstore/Cargo.toml b/components/raftstore/Cargo.toml index 548693b71ac..8df501f279d 100644 --- a/components/raftstore/Cargo.toml +++ b/components/raftstore/Cargo.toml @@ -72,6 +72,7 @@ protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft-proto = { version = "0.7.0", default-features = false } rand = "0.8.3" +resource_control = { workspace = true } resource_metering = { workspace = true } serde = "1.0" serde_derive = 
"1.0" diff --git a/components/raftstore/src/store/entry_storage.rs b/components/raftstore/src/store/entry_storage.rs index bc85ecedc34..afa13730ccf 100644 --- a/components/raftstore/src/store/entry_storage.rs +++ b/components/raftstore/src/store/entry_storage.rs @@ -69,6 +69,13 @@ impl CachedEntries { } } + pub fn iter_entries(&self, mut f: impl FnMut(&Entry)) { + let entries = self.entries.lock().unwrap(); + for entry in &entries.0 { + f(entry); + } + } + /// Take cached entries and dangle size for them. `dangle` means not in /// entry cache. pub fn take_entries(&self) -> (Vec, usize) { diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 9f2d234010f..22a42393173 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -24,7 +24,7 @@ use std::{ use batch_system::{ BasicMailbox, BatchRouter, BatchSystem, Config as BatchSystemConfig, Fsm, HandleResult, - HandlerBuilder, PollHandler, Priority, + HandlerBuilder, PollHandler, Priority, ResourceMetered, }; use collections::{HashMap, HashMapEntry, HashSet}; use crossbeam::channel::{TryRecvError, TrySendError}; @@ -46,11 +46,12 @@ use kvproto::{ }; use pd_client::{new_bucket_stats, BucketMeta, BucketStat}; use prometheus::local::LocalHistogram; -use protobuf::{wire_format::WireType, CodedInputStream}; +use protobuf::{wire_format::WireType, CodedInputStream, Message}; use raft::eraftpb::{ ConfChange, ConfChangeType, ConfChangeV2, Entry, EntryType, Snapshot as RaftSnapshot, }; use raft_proto::ConfChangeI; +use resource_control::ResourceController; use smallvec::{smallvec, SmallVec}; use sst_importer::SstImporter; use tikv_alloc::trace::TraceEvent; @@ -1695,6 +1696,7 @@ where } AdminCmdType::BatchSwitchWitness => self.exec_batch_switch_witness(ctx, request), AdminCmdType::InvalidAdmin => Err(box_err!("unsupported admin command type")), + AdminCmdType::UpdateGcPeer => unimplemented!(), }?; 
response.set_cmd_type(cmd_type); @@ -3709,6 +3711,26 @@ where }, } +impl ResourceMetered for Msg { + fn get_resource_consumptions(&self) -> Option> { + match self { + Msg::Apply { apply, .. } => { + let mut map = HashMap::default(); + for cached_entries in &apply.entries { + cached_entries.iter_entries(|entry| { + // TODO: maybe use a more efficient way to get the resource group name. + let header = util::get_entry_header(entry); + let group_name = header.get_resource_group_name().to_owned(); + *map.entry(group_name).or_default() += entry.compute_size() as u64; + }); + } + Some(map) + } + _ => None, + } + } +} + impl Msg where EK: KvEngine, @@ -4406,6 +4428,7 @@ pub enum ControlMsg { }, } +impl ResourceMetered for ControlMsg {} pub struct ControlFsm { receiver: Receiver, stopped: bool, @@ -4834,10 +4857,15 @@ impl ApplyBatchSystem { pub fn create_apply_batch_system( cfg: &Config, + resource_ctl: Option>, ) -> (ApplyRouter, ApplyBatchSystem) { let (control_tx, control_fsm) = ControlFsm::new(); - let (router, system) = - batch_system::create_system(&cfg.apply_batch_system, control_tx, control_fsm); + let (router, system) = batch_system::create_system( + &cfg.apply_batch_system, + control_tx, + control_fsm, + resource_ctl, + ); (ApplyRouter { router }, ApplyBatchSystem { system }) } @@ -4984,6 +5012,7 @@ mod tests { cmd.mut_put().set_key(b"key".to_vec()); cmd.mut_put().set_value(b"value".to_vec()); let mut req = RaftCmdRequest::default(); + req.set_header(RaftRequestHeader::default()); req.mut_requests().push(cmd); e.set_data(req.write_to_bytes().unwrap().into()) } @@ -5251,7 +5280,7 @@ mod tests { let (_dir, importer) = create_tmp_importer("apply-basic"); let (region_scheduler, mut snapshot_rx) = dummy_scheduler(); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = 
Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -5715,7 +5744,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = Box::new(TestNotifier { tx }); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -6054,7 +6083,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = Box::new(TestNotifier { tx }); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -6145,7 +6174,7 @@ mod tests { cfg.apply_batch_system.low_priority_pool_size = 0; Arc::new(VersionTrack::new(cfg)) }; - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -6325,7 +6354,7 @@ mod tests { cfg.apply_batch_system.low_priority_pool_size = 0; Arc::new(VersionTrack::new(cfg)) }; - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -6418,7 +6447,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = Box::new(TestNotifier { tx }); let cfg = Config::default(); - let (router, mut 
system) = create_apply_batch_system(&cfg); + let (router, mut system) = create_apply_batch_system(&cfg, None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-exec-observer".to_owned(), @@ -6642,7 +6671,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = Box::new(TestNotifier { tx }); let cfg = Config::default(); - let (router, mut system) = create_apply_batch_system(&cfg); + let (router, mut system) = create_apply_batch_system(&cfg, None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -6922,7 +6951,7 @@ mod tests { .register_cmd_observer(1, BoxCmdObserver::new(obs)); let (region_scheduler, _) = dummy_scheduler(); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "test-store".to_owned(), @@ -7148,7 +7177,7 @@ mod tests { let (region_scheduler, _) = dummy_scheduler(); let sender = Box::new(TestNotifier { tx }); let cfg = Arc::new(VersionTrack::new(Config::default())); - let (router, mut system) = create_apply_batch_system(&cfg.value()); + let (router, mut system) = create_apply_batch_system(&cfg.value(), None); let pending_create_peers = Arc::new(Mutex::new(HashMap::default())); let builder = super::Builder:: { tag: "flashback_need_to_be_applied".to_owned(), diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index d405c3471af..4266e400cd3 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -2702,7 +2702,7 @@ where } let mut resp = ExtraMessage::default(); resp.set_type(ExtraMessageType::MsgVoterReplicatedIndexResponse); - 
resp.voter_replicated_index = voter_replicated_idx; + resp.index = voter_replicated_idx; self.fsm .peer .send_extra_message(resp, &mut self.ctx.trans, from); @@ -2719,7 +2719,7 @@ where if self.fsm.peer.is_leader() || !self.fsm.peer.is_witness() { return; } - let voter_replicated_index = msg.voter_replicated_index; + let voter_replicated_index = msg.index; if let Ok(voter_replicated_term) = self.fsm.peer.get_store().term(voter_replicated_index) { self.ctx.apply_router.schedule_task( self.region_id(), @@ -2787,6 +2787,8 @@ where ExtraMessageType::MsgVoterReplicatedIndexResponse => { self.on_voter_replicated_index_response(msg.get_extra_msg()); } + ExtraMessageType::MsgGcPeerRequest => unimplemented!(), + ExtraMessageType::MsgGcPeerResponse => unimplemented!(), } } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 2ca573824f9..e68873cadf1 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -42,6 +42,7 @@ use kvproto::{ use pd_client::{Feature, FeatureGate, PdClient}; use protobuf::Message; use raft::StateRole; +use resource_control::ResourceGroupManager; use resource_metering::CollectorRegHandle; use sst_importer::SstImporter; use tikv_alloc::trace::TraceEvent; @@ -1795,11 +1796,21 @@ impl RaftBatchSystem { pub fn create_raft_batch_system( cfg: &Config, + resource_manager: &Option>, ) -> (RaftRouter, RaftBatchSystem) { let (store_tx, store_fsm) = StoreFsm::new(cfg); - let (apply_router, apply_system) = create_apply_batch_system(cfg); - let (router, system) = - batch_system::create_system(&cfg.store_batch_system, store_tx, store_fsm); + let (apply_router, apply_system) = create_apply_batch_system( + cfg, + resource_manager + .as_ref() + .map(|m| m.derive_controller("apply".to_owned(), false)), + ); + let (router, system) = batch_system::create_system( + &cfg.store_batch_system, + store_tx, + store_fsm, + None, // Do not do priority scheduling for store 
batch system + ); let raft_router = RaftRouter { router }; let system = RaftBatchSystem { system, diff --git a/components/raftstore/src/store/msg.rs b/components/raftstore/src/store/msg.rs index 3c555689cb9..195a94478dc 100644 --- a/components/raftstore/src/store/msg.rs +++ b/components/raftstore/src/store/msg.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use std::{borrow::Cow, fmt}; +use batch_system::ResourceMetered; use collections::HashSet; use engine_traits::{CompactedEvent, KvEngine, Snapshot}; use futures::channel::mpsc::UnboundedSender; @@ -772,6 +773,8 @@ pub enum PeerMsg { Destroy(u64), } +impl ResourceMetered for PeerMsg {} + impl fmt::Debug for PeerMsg { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { match self { @@ -867,6 +870,8 @@ where }, } +impl ResourceMetered for StoreMsg {} + impl fmt::Debug for StoreMsg where EK: KvEngine, diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index e2a914fded6..44701fbf705 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -4843,7 +4843,7 @@ where return; } if let Some(ref state) = self.pending_merge_state { - if state.get_commit() == extra_msg.get_premerge_commit() { + if state.get_commit() == extra_msg.get_index() { self.add_want_rollback_merge_peer(peer_id); } } @@ -5438,7 +5438,7 @@ where }; let mut extra_msg = ExtraMessage::default(); extra_msg.set_type(ExtraMessageType::MsgWantRollbackMerge); - extra_msg.set_premerge_commit(premerge_commit); + extra_msg.set_index(premerge_commit); self.send_extra_message(extra_msg, &mut ctx.trans, &to_peer); } @@ -5795,6 +5795,7 @@ mod tests { AdminCmdType::ComputeHash, AdminCmdType::VerifyHash, AdminCmdType::BatchSwitchWitness, + AdminCmdType::UpdateGcPeer, ]; for tp in AdminCmdType::values() { let mut msg = RaftCmdRequest::default(); diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 2f61534d159..4d8128822c7 100644 --- 
a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -24,9 +24,9 @@ use kvproto::{ }, raft_serverpb::{RaftMessage, RaftSnapshotData}, }; -use protobuf::{self, Message}; +use protobuf::{self, CodedInputStream, Message}; use raft::{ - eraftpb::{self, ConfChangeType, ConfState, MessageType, Snapshot}, + eraftpb::{self, ConfChangeType, ConfState, Entry, EntryType, MessageType, Snapshot}, Changer, RawNode, INVALID_INDEX, }; use raft_proto::ConfChangeI; @@ -229,6 +229,7 @@ pub fn admin_cmd_epoch_lookup(admin_cmp_type: AdminCmdType) -> AdminCmdEpochStat AdminCmdEpochState::new(true, true, false, false) } AdminCmdType::BatchSwitchWitness => AdminCmdEpochState::new(false, true, false, true), + AdminCmdType::UpdateGcPeer => AdminCmdEpochState::new(false, false, false, false), } } @@ -725,6 +726,24 @@ pub(crate) fn u64_to_timespec(u: u64) -> Timespec { Timespec::new(sec as i64, nsec as i32) } +pub fn get_entry_header(entry: &Entry) -> RaftRequestHeader { + if entry.get_entry_type() != EntryType::EntryNormal { + return RaftRequestHeader::default(); + } + // request header is encoded into data + let mut is = CodedInputStream::from_bytes(entry.get_data()); + if is.eof().unwrap() { + return RaftRequestHeader::default(); + } + let (field_number, _) = is.read_tag_unpack().unwrap(); + let t = is.read_message().unwrap(); + // Header field is of number 1 + if field_number != 1 { + panic!("unexpected field number: {} {:?}", field_number, t); + } + t +} + /// Parse data of entry `index`. 
/// /// # Panics @@ -1671,6 +1690,7 @@ mod tests { metapb::{self, RegionEpoch}, raft_cmdpb::AdminRequest, }; + use protobuf::Message as _; use raft::eraftpb::{ConfChangeType, Entry, Message, MessageType}; use tikv_util::store::new_peer; use time::Duration as TimeDuration; @@ -1749,6 +1769,20 @@ mod tests { assert_eq!(m1.inspect(Some(monotonic_raw_now())), LeaseState::Valid); } + #[test] + fn test_get_entry_header() { + let mut req = RaftCmdRequest::default(); + let mut header = RaftRequestHeader::default(); + header.set_resource_group_name("test".to_owned()); + req.set_header(header); + let mut entry = Entry::new(); + entry.set_term(1); + entry.set_index(2); + entry.set_data(req.write_to_bytes().unwrap().into()); + let header = get_entry_header(&entry); + assert_eq!(header.get_resource_group_name(), "test"); + } + #[test] fn test_timespec_u64() { let cases = vec![ diff --git a/components/raftstore/src/store/worker/refresh_config.rs b/components/raftstore/src/store/worker/refresh_config.rs index 6555e96f102..d09a6dd9f53 100644 --- a/components/raftstore/src/store/worker/refresh_config.rs +++ b/components/raftstore/src/store/worker/refresh_config.rs @@ -43,7 +43,7 @@ where for _ in 0..size { if let Err(e) = self.state.fsm_sender.send(FsmTypes::Empty) { error!( - "failed to decrese thread pool"; + "failed to decrease thread pool"; "decrease to" => size, "err" => %e, ); diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index 23a50b42560..1524ebcba5d 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -9,7 +9,10 @@ use std::{ }; use dashmap::{mapref::one::Ref, DashMap}; -use kvproto::resource_manager::{GroupMode, ResourceGroup}; +use kvproto::{ + kvrpcpb::CommandPri, + resource_manager::{GroupMode, ResourceGroup}, +}; use yatp::queue::priority::TaskPriorityProvider; // a read task cost at least 50us. 
@@ -97,7 +100,6 @@ impl ResourceGroupManager { let ru_quota = Self::get_ru_setting(g.value(), controller.is_read); controller.add_resource_group(g.key().clone().into_bytes(), ru_quota); } - controller } @@ -243,6 +245,15 @@ impl ResourceController { // need totally accurate here. self.last_min_vt.store(max_vt, Ordering::Relaxed); } + + pub fn get_priority(&self, name: &[u8], pri: CommandPri) -> u64 { + let level = match pri { + CommandPri::Low => 2, + CommandPri::Normal => 1, + CommandPri::High => 0, + }; + self.resource_group(name).get_priority(level) + } } impl TaskPriorityProvider for ResourceController { diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 207373313a4..2a479964ced 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -350,7 +350,7 @@ where }; // Initialize raftstore channels. - let (router, system) = fsm::create_raft_batch_system(&config.raft_store); + let (router, system) = fsm::create_raft_batch_system(&config.raft_store, &resource_manager); let mut coprocessor_host = Some(CoprocessorHost::new( router.clone(), diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 2121b7e021f..81e7129407e 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -81,7 +81,7 @@ pub trait Simulator { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, - resource_manager: &Arc, + resource_manager: &Option>, ) -> ServerResult; fn stop_node(&mut self, node_id: u64); fn get_node_ids(&self) -> HashSet; @@ -176,7 +176,7 @@ pub struct Cluster { pub raft_statistics: Vec>>, pub sim: Arc>, pub pd_client: Arc, - resource_manager: Arc, + resource_manager: Option>, } impl Cluster { @@ -210,7 +210,7 @@ impl Cluster { pd_client, sst_workers: vec![], sst_workers_map: HashMap::default(), - resource_manager: Arc::new(ResourceGroupManager::default()), + resource_manager: 
Some(Arc::new(ResourceGroupManager::default())), kv_statistics: vec![], raft_statistics: vec![], } @@ -279,7 +279,8 @@ impl Cluster { // Try start new nodes. for _ in 0..self.count - self.engines.len() { - let (router, system) = create_raft_batch_system(&self.cfg.raft_store); + let (router, system) = + create_raft_batch_system(&self.cfg.raft_store, &self.resource_manager); self.create_engine(Some(router.clone())); let engines = self.dbs.last().unwrap().clone(); @@ -350,7 +351,8 @@ impl Cluster { debug!("starting node {}", node_id); let engines = self.engines[&node_id].clone(); let key_mgr = self.key_managers_map[&node_id].clone(); - let (router, system) = create_raft_batch_system(&self.cfg.raft_store); + let (router, system) = + create_raft_batch_system(&self.cfg.raft_store, &self.resource_manager); let mut cfg = self.cfg.clone(); if let Some(labels) = self.labels.get(&node_id) { cfg.server.labels = labels.to_owned(); diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index 9ae76dba9f8..05ed8ece83d 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -230,7 +230,7 @@ impl Simulator for NodeCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, - _resource_manager: &Arc, + _resource_manager: &Option>, ) -> ServerResult { assert!(node_id == 0 || !self.nodes.contains_key(&node_id)); let pd_worker = LazyWorker::new("test-pd-worker"); diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index ccf4df43497..63a0b4e4804 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -265,7 +265,7 @@ impl ServerCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, - resource_manager: &Arc, + resource_manager: &Option>, ) -> ServerResult { let (tmp_str, tmp) = if node_id == 0 || !self.snap_paths.contains_key(&node_id) { let p = test_util::temp_dir("test_cluster", 
cfg.prefer_mem); @@ -416,7 +416,9 @@ impl ServerCluster { quota_limiter.clone(), self.pd_client.feature_gate().clone(), self.get_causal_ts_provider(node_id), - Some(resource_manager.derive_controller("scheduler-worker-pool".to_owned(), true)), + resource_manager + .as_ref() + .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), )?; self.storages.insert(node_id, raft_engine); @@ -652,7 +654,7 @@ impl Simulator for ServerCluster { key_manager: Option>, router: RaftRouter, system: RaftBatchSystem, - resource_manager: &Arc, + resource_manager: &Option>, ) -> ServerResult { dispatch_api_version!( cfg.storage.api_version(), diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 92f3bac3d5b..1193751b228 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -38,7 +38,7 @@ num-traits = "0.2" num_cpus = "1" online_config = { workspace = true } openssl = "0.10" -parking_lot = "0.12.1" +parking_lot_core = "0.9.1" pin-project = "1.0" prometheus = { version = "0.13", features = ["nightly"] } prometheus-static-metric = "0.5" diff --git a/components/tikv_util/src/mpsc/priority_queue.rs b/components/tikv_util/src/mpsc/priority_queue.rs index 3389d6154c3..fac741361db 100644 --- a/components/tikv_util/src/mpsc/priority_queue.rs +++ b/components/tikv_util/src/mpsc/priority_queue.rs @@ -7,7 +7,9 @@ use std::sync::{ use crossbeam::channel::{RecvError, SendError, TryRecvError, TrySendError}; use crossbeam_skiplist::SkipMap; -use parking_lot::{Condvar, Mutex}; +use parking_lot_core::{ + park, unpark_all, unpark_one, SpinWait, DEFAULT_PARK_TOKEN, DEFAULT_UNPARK_TOKEN, +}; // Create a priority based channel. Sender can send message with priority of // u64, and receiver will receive messages in ascending order of priority. 
For @@ -54,8 +56,6 @@ impl Drop for Cell { #[derive(Default)] struct PriorityQueue { queue: SkipMap>, - disconnected: Mutex, - available: Condvar, sequencer: AtomicU64, @@ -67,8 +67,6 @@ impl PriorityQueue { pub fn new() -> Self { Self { queue: SkipMap::new(), - disconnected: Mutex::new(false), - available: Condvar::new(), sequencer: AtomicU64::new(0), senders: AtomicUsize::new(1), receivers: AtomicUsize::new(1), @@ -81,6 +79,10 @@ impl PriorityQueue { sequence: self.sequencer.fetch_add(1, Ordering::Relaxed), } } + + fn is_disconnected(&self) -> bool { + self.senders.load(Ordering::SeqCst) == 0 + } } // When derived `PartialOrd` on structs, it will produce a lexicographic @@ -109,7 +111,10 @@ impl Sender { self.inner .queue .insert(self.inner.get_map_key(pri), Cell::new(msg)); - self.inner.available.notify_one(); + let addr = &*self.inner as *const PriorityQueue as usize; + unsafe { + unpark_one(addr, |_| DEFAULT_UNPARK_TOKEN); + } Ok(()) } @@ -132,8 +137,10 @@ impl Drop for Sender { fn drop(&mut self) { let old = self.inner.senders.fetch_sub(1, Ordering::AcqRel); if old <= 1 { - *self.inner.disconnected.lock() = true; - self.inner.available.notify_all(); + let addr = &*self.inner as *const PriorityQueue as usize; + unsafe { + unpark_all(addr, DEFAULT_UNPARK_TOKEN); + } } } } @@ -146,14 +153,13 @@ impl Receiver { pub fn try_recv(&self) -> Result { match self.inner.queue.pop_front() { Some(entry) => Ok(entry.value().take().unwrap()), - None if self.inner.senders.load(Ordering::SeqCst) == 0 => { - Err(TryRecvError::Disconnected) - } + None if self.inner.is_disconnected() => Err(TryRecvError::Disconnected), None => Err(TryRecvError::Empty), } } pub fn recv(&self) -> Result { + let mut spin = SpinWait::new(); loop { match self.try_recv() { Ok(msg) => return Ok(msg), @@ -161,17 +167,25 @@ impl Receiver { return Err(RecvError); } Err(TryRecvError::Empty) => { - let mut disconnected = self.inner.disconnected.lock(); - if *disconnected { - return Err(RecvError); + if 
spin.spin() { + continue; + } + let addr = &*self.inner as *const PriorityQueue as usize; + unsafe { + park( + addr, + || self.len() == 0 && !self.inner.is_disconnected(), + || {}, + |_, _| {}, + DEFAULT_PARK_TOKEN, + None, + ); } - self.inner.available.wait(&mut disconnected); } } } } - #[cfg(test)] fn len(&self) -> usize { self.inner.queue.len() } diff --git a/tests/integrations/config/dynamic/raftstore.rs b/tests/integrations/config/dynamic/raftstore.rs index 70e70b3cbe6..ff1babb7e1f 100644 --- a/tests/integrations/config/dynamic/raftstore.rs +++ b/tests/integrations/config/dynamic/raftstore.rs @@ -66,7 +66,7 @@ fn start_raftstore( ApplyRouter, RaftBatchSystem, ) { - let (raft_router, mut system) = create_raft_batch_system(&cfg.raft_store); + let (raft_router, mut system) = create_raft_batch_system(&cfg.raft_store, &None); let engines = create_tmp_engine(dir); let host = CoprocessorHost::default(); let importer = { diff --git a/tests/integrations/config/dynamic/snap.rs b/tests/integrations/config/dynamic/snap.rs index 1a82ec8005e..af03246acf4 100644 --- a/tests/integrations/config/dynamic/snap.rs +++ b/tests/integrations/config/dynamic/snap.rs @@ -45,7 +45,8 @@ fn start_server( .name_prefix(thd_name!("test-server")) .build(), ); - let (raft_router, _) = create_raft_batch_system::(&cfg.raft_store); + let (raft_router, _) = + create_raft_batch_system::(&cfg.raft_store, &None); let mut snap_worker = Worker::new("snap-handler").lazy_build("snap-handler"); let snap_worker_scheduler = snap_worker.scheduler(); let server_config = Arc::new(VersionTrack::new(cfg.server.clone())); diff --git a/tests/integrations/raftstore/test_bootstrap.rs b/tests/integrations/raftstore/test_bootstrap.rs index 8ede13bd0f4..ee063e0f1e7 100644 --- a/tests/integrations/raftstore/test_bootstrap.rs +++ b/tests/integrations/raftstore/test_bootstrap.rs @@ -42,7 +42,7 @@ fn test_node_bootstrap_with_prepared_data() { let pd_client = Arc::new(TestPdClient::new(0, false)); let cfg = 
new_tikv_config(0); - let (_, system) = fsm::create_raft_batch_system(&cfg.raft_store); + let (_, system) = fsm::create_raft_batch_system(&cfg.raft_store, &None); let simulate_trans = SimulateTransport::new(ChannelTransport::new()); let tmp_path = Builder::new().prefix("test_cluster").tempdir().unwrap(); let engine = From fcaa14a9d58814d733cfc41176e038b12cf9b9ab Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Sat, 28 Jan 2023 15:21:33 +0800 Subject: [PATCH 096/115] [Cloud] Merge raftstore-proxy master(PR #249) (#259) --- Cargo.lock | 45 +- Cargo.toml | 9 +- Makefile | 2 +- security/SECURITY.md => SECURITY.md | 4 +- cmd/tikv-ctl/src/executor.rs | 35 +- cmd/tikv-ctl/src/main.rs | 23 +- cmd/tikv-server/src/main.rs | 10 +- .../backup-stream/src/checkpoint_manager.rs | 29 +- components/backup-stream/src/endpoint.rs | 74 +- components/backup-stream/src/event_loader.rs | 3 +- .../src/metadata/checkpoint_cache.rs | 71 + .../backup-stream/src/metadata/client.rs | 39 +- components/backup-stream/src/metadata/mod.rs | 3 +- .../src/metadata/store/lazy_etcd.rs | 14 +- components/backup-stream/src/metadata/test.rs | 4 +- .../backup-stream/src/subscription_manager.rs | 19 +- .../backup-stream/src/subscription_track.rs | 109 +- components/backup-stream/src/utils.rs | 130 ++ components/backup/src/endpoint.rs | 1 + components/cdc/src/observer.rs | 2 + components/engine_panic/src/compact.rs | 11 +- components/engine_panic/src/engine.rs | 12 +- components/engine_panic/src/misc.rs | 34 +- components/engine_panic/src/raft_engine.rs | 69 +- components/engine_rocks/src/compact.rs | 14 +- components/engine_rocks/src/db_options.rs | 22 +- components/engine_rocks/src/engine.rs | 65 +- components/engine_rocks/src/event_listener.rs | 171 +- components/engine_rocks/src/file_system.rs | 6 +- components/engine_rocks/src/lib.rs | 7 +- components/engine_rocks/src/misc.rs | 117 +- components/engine_rocks/src/options.rs | 1 + components/engine_rocks/src/raft_engine.rs | 114 +- 
components/engine_rocks/src/raw.rs | 12 +- components/engine_rocks/src/rocks_metrics.rs | 511 +++-- .../engine_rocks/src/rocks_metrics_defs.rs | 5 +- components/engine_rocks/src/util.rs | 4 +- components/engine_rocks/src/write_batch.rs | 32 +- .../engine_rocks_helper/src/sst_recovery.rs | 3 +- components/engine_test/src/lib.rs | 378 +--- components/engine_traits/Cargo.toml | 2 + components/engine_traits/src/cf_defs.rs | 1 + components/engine_traits/src/compact.rs | 26 +- components/engine_traits/src/engine.rs | 352 +--- components/engine_traits/src/flush.rs | 179 ++ components/engine_traits/src/lib.rs | 9 +- components/engine_traits/src/misc.rs | 30 +- components/engine_traits/src/raft_engine.rs | 96 +- components/engine_traits/src/tablet.rs | 472 +++++ components/engine_traits/src/write_batch.rs | 7 + components/pd_client/src/client_v2.rs | 8 +- components/raft_log_engine/Cargo.toml | 4 + components/raft_log_engine/src/engine.rs | 338 +++- components/raft_log_engine/src/lib.rs | 1 + components/raftstore-v2/Cargo.toml | 8 +- components/raftstore-v2/src/batch/store.rs | 299 ++- components/raftstore-v2/src/bootstrap.rs | 7 +- components/raftstore-v2/src/fsm/apply.rs | 47 +- components/raftstore-v2/src/fsm/peer.rs | 135 +- components/raftstore-v2/src/fsm/store.rs | 115 +- components/raftstore-v2/src/lib.rs | 8 +- .../operation/command/admin/compact_log.rs | 319 +++ .../operation/command/admin/conf_change.rs | 15 +- .../src/operation/command/admin/mod.rs | 65 +- .../src/operation/command/admin/split.rs | 509 +++-- .../command/admin/transfer_leader.rs | 418 ++++ .../src/operation/command/control.rs | 13 +- .../raftstore-v2/src/operation/command/mod.rs | 263 +-- .../src/operation/command/write/mod.rs | 129 +- .../operation/command/write/simple_write.rs | 296 ++- components/raftstore-v2/src/operation/life.rs | 35 +- components/raftstore-v2/src/operation/mod.rs | 9 +- components/raftstore-v2/src/operation/pd.rs | 49 +- .../raftstore-v2/src/operation/query/lease.rs | 44 +- 
.../raftstore-v2/src/operation/query/local.rs | 318 +-- .../raftstore-v2/src/operation/query/mod.rs | 61 +- .../src/operation/query/replica.rs | 5 +- .../src/operation/ready/apply_trace.rs | 636 ++++++ .../src/operation/ready/async_writer.rs | 3 +- .../raftstore-v2/src/operation/ready/mod.rs | 140 +- .../src/operation/ready/snapshot.rs | 374 ++-- components/raftstore-v2/src/raft/apply.rs | 99 +- components/raftstore-v2/src/raft/mod.rs | 2 +- components/raftstore-v2/src/raft/peer.rs | 331 ++- components/raftstore-v2/src/raft/storage.rs | 322 +-- components/raftstore-v2/src/router/imp.rs | 100 +- .../src/router/internal_message.rs | 10 +- components/raftstore-v2/src/router/message.rs | 147 +- components/raftstore-v2/src/router/mod.rs | 4 +- .../src/router/response_channel.rs | 273 ++- components/raftstore-v2/src/worker/mod.rs | 5 +- components/raftstore-v2/src/worker/pd/mod.rs | 105 +- .../src/worker/pd/region_heartbeat.rs | 19 +- .../raftstore-v2/src/worker/pd/split.rs | 4 +- .../src/worker/pd/store_heartbeat.rs | 5 +- .../src/worker/pd/update_max_timestamp.rs | 3 +- .../raftstore-v2/src/worker/tablet_gc.rs | 227 +++ .../raftstore-v2/tests/failpoints/mod.rs | 2 + .../tests/failpoints/test_basic_write.rs | 51 +- .../tests/failpoints/test_split.rs | 106 + .../tests/failpoints/test_trace_apply.rs | 7 + .../tests/integrations/cluster.rs | 302 ++- .../raftstore-v2/tests/integrations/mod.rs | 2 + .../tests/integrations/test_basic_write.rs | 110 +- .../tests/integrations/test_conf_change.rs | 55 +- .../tests/integrations/test_life.rs | 29 +- .../tests/integrations/test_pd_heartbeat.rs | 2 +- .../tests/integrations/test_read.rs | 28 +- .../tests/integrations/test_split.rs | 257 ++- .../tests/integrations/test_status.rs | 2 +- .../tests/integrations/test_trace_apply.rs | 217 ++ .../integrations/test_transfer_leader.rs | 151 ++ .../raftstore/src/coprocessor/dispatcher.rs | 134 +- components/raftstore/src/coprocessor/mod.rs | 12 +- .../src/coprocessor/region_info_accessor.rs | 
78 +- .../src/coprocessor/split_check/half.rs | 18 +- .../src/coprocessor/split_check/keys.rs | 57 +- .../src/coprocessor/split_check/size.rs | 127 +- .../src/coprocessor/split_check/table.rs | 14 +- components/raftstore/src/lib.rs | 1 + components/raftstore/src/router.rs | 110 +- .../raftstore/src/store/async_io/read.rs | 6 +- .../raftstore/src/store/async_io/write.rs | 293 ++- .../src/store/async_io/write_tests.rs | 148 +- .../raftstore/src/store/compaction_guard.rs | 6 +- components/raftstore/src/store/fsm/apply.rs | 11 +- components/raftstore/src/store/fsm/mod.rs | 2 +- components/raftstore/src/store/fsm/peer.rs | 169 +- components/raftstore/src/store/fsm/store.rs | 55 +- components/raftstore/src/store/mod.rs | 20 +- components/raftstore/src/store/peer.rs | 16 +- .../raftstore/src/store/peer_storage.rs | 48 +- components/raftstore/src/store/region_meta.rs | 11 +- .../raftstore/src/store/region_snapshot.rs | 2 +- components/raftstore/src/store/snap.rs | 53 +- components/raftstore/src/store/util.rs | 163 +- .../src/store/worker/check_leader.rs | 52 +- .../raftstore/src/store/worker/compact.rs | 2 +- .../src/store/worker/consistency_check.rs | 48 +- .../raftstore/src/store/worker/raftlog_gc.rs | 70 +- components/raftstore/src/store/worker/read.rs | 20 +- .../raftstore/src/store/worker/split_check.rs | 164 +- components/server/Cargo.toml | 1 + components/server/src/lib.rs | 2 + components/server/src/raft_engine_switch.rs | 12 +- components/server/src/server.rs | 243 ++- components/server/src/server2.rs | 1776 +++++++++++++++++ components/server/src/signal_handler.rs | 30 +- components/snap_recovery/src/init_cluster.rs | 36 +- components/test_raftstore/src/cluster.rs | 13 +- .../test_raftstore/src/common-test.toml | 1 - components/test_raftstore/src/server.rs | 15 +- components/test_raftstore/src/util.rs | 25 +- components/tikv_kv/src/lib.rs | 6 +- components/tikv_kv/src/mock_engine.rs | 2 +- components/tikv_kv/src/rocksdb_engine.rs | 11 +- 
components/tikv_util/Cargo.toml | 2 +- components/tikv_util/src/lib.rs | 1 + components/tikv_util/src/sys/mod.rs | 23 +- components/tikv_util/src/worker/pool.rs | 10 +- components/txn_types/src/types.rs | 4 +- engine_store_ffi/src/observer.rs | 88 +- engine_store_ffi/src/ps_engine.rs | 106 +- engine_tiflash/src/compact.rs | 14 +- engine_tiflash/src/db_options.rs | 22 +- engine_tiflash/src/engine.rs | 8 - engine_tiflash/src/lib.rs | 7 +- engine_tiflash/src/misc.rs | 48 +- engine_tiflash/src/ps_write_batch.rs | 5 +- engine_tiflash/src/raft_engine.rs | 114 +- engine_tiflash/src/rocks_metrics.rs | 337 +++- engine_tiflash/src/rocks_metrics_defs.rs | 1 - engine_tiflash/src/util.rs | 4 +- engine_tiflash/src/write_batch.rs | 34 +- etc/config-template.toml | 25 +- metrics/alertmanager/tikv.rules.yml | 8 +- metrics/grafana/performance_write.json | 2 +- metrics/grafana/tikv_details.json | 6 +- metrics/grafana/tikv_summary.json | 2 +- metrics/grafana/tikv_trouble_shooting.json | 2 +- new-mock-engine-store/src/mock_cluster.rs | 29 +- new-mock-engine-store/src/server.rs | 7 +- proxy_server/src/config.rs | 16 + proxy_server/src/engine.rs | 55 + proxy_server/src/lib.rs | 1 + proxy_server/src/run.rs | 180 +- proxy_server/src/util.rs | 2 +- proxy_tests/proxy/config.rs | 4 + proxy_tests/proxy/normal.rs | 2 +- proxy_tests/proxy/region.rs | 16 +- proxy_tests/proxy/server_cluster_test.rs | 2 +- src/config/configurable.rs | 141 ++ src/{config.rs => config/mod.rs} | 662 +++--- src/coprocessor/endpoint.rs | 6 +- src/coprocessor/mod.rs | 4 + src/coprocessor/tracker.rs | 34 +- src/import/sst_service.rs | 35 +- src/server/debug.rs | 45 +- src/server/engine_factory.rs | 337 ++-- src/server/gc_worker/compaction_filter.rs | 144 +- src/server/gc_worker/gc_worker.rs | 82 +- src/server/gc_worker/mod.rs | 16 + .../gc_worker/rawkv_compaction_filter.rs | 13 +- src/server/lock_manager/waiter_manager.rs | 12 +- src/server/mod.rs | 5 +- src/server/node.rs | 165 +- src/server/raftkv/mod.rs | 10 +- 
src/server/raftkv2/mod.rs | 296 +++ src/server/raftkv2/node.rs | 238 +++ src/server/raftkv2/raft_extension.rs | 109 + src/server/server.rs | 42 +- src/server/service/debug.rs | 17 +- src/server/service/diagnostics/mod.rs | 2 +- src/server/service/diagnostics/sys.rs | 55 +- src/server/service/kv.rs | 5 +- src/server/status_server/mod.rs | 61 +- src/server/tablet_snap.rs | 4 +- src/storage/config.rs | 22 +- src/storage/config_manager.rs | 44 +- src/storage/kv/test_engine_builder.rs | 25 +- .../lock_manager/lock_waiting_queue.rs | 72 +- src/storage/lock_manager/mod.rs | 1 - src/storage/metrics.rs | 26 +- src/storage/mod.rs | 117 +- src/storage/mvcc/reader/reader.rs | 2 +- src/storage/mvcc/txn.rs | 31 +- .../txn/actions/acquire_pessimistic_lock.rs | 4 +- src/storage/txn/actions/check_txn_status.rs | 2 +- .../txn/actions/flashback_to_version.rs | 219 +- src/storage/txn/actions/prewrite.rs | 13 +- .../txn/commands/acquire_pessimistic_lock.rs | 4 +- .../acquire_pessimistic_lock_resumed.rs | 8 +- src/storage/txn/commands/atomic_store.rs | 1 + .../txn/commands/check_secondary_locks.rs | 2 + src/storage/txn/commands/check_txn_status.rs | 2 + src/storage/txn/commands/cleanup.rs | 2 + src/storage/txn/commands/commit.rs | 2 + src/storage/txn/commands/compare_and_swap.rs | 1 + .../txn/commands/flashback_to_version.rs | 5 +- .../flashback_to_version_read_phase.rs | 64 +- src/storage/txn/commands/mod.rs | 5 +- src/storage/txn/commands/pause.rs | 1 + .../txn/commands/pessimistic_rollback.rs | 2 + src/storage/txn/commands/prewrite.rs | 10 +- src/storage/txn/commands/resolve_lock.rs | 2 + src/storage/txn/commands/resolve_lock_lite.rs | 2 + src/storage/txn/commands/rollback.rs | 2 + src/storage/txn/commands/txn_heart_beat.rs | 4 +- .../flow_controller/tablet_flow_controller.rs | 125 +- src/storage/txn/scheduler.rs | 51 +- tests/failpoints/cases/test_gc_worker.rs | 2 +- tests/failpoints/cases/test_pd_client.rs | 15 +- tests/failpoints/cases/test_snap.rs | 4 +- 
tests/failpoints/cases/test_stale_peer.rs | 4 +- tests/failpoints/cases/test_storage.rs | 6 +- .../failpoints/cases/test_table_properties.rs | 12 +- tests/integrations/backup/mod.rs | 30 + tests/integrations/config/mod.rs | 32 +- tests/integrations/config/test-custom.toml | 21 +- tests/integrations/pd/test_rpc_client.rs | 2 +- .../raftstore/test_compact_lock_cf.rs | 8 +- .../integrations/raftstore/test_flashback.rs | 114 +- tests/integrations/raftstore/test_stats.rs | 6 +- .../raftstore/test_update_region_size.rs | 2 +- tests/integrations/server/kv_service.rs | 81 +- tests/integrations/server/lock_manager.rs | 113 +- tests/integrations/server/status_server.rs | 8 +- tests/integrations/storage/test_titan.rs | 17 +- 268 files changed, 14608 insertions(+), 5588 deletions(-) rename security/SECURITY.md => SECURITY.md (98%) create mode 100644 components/backup-stream/src/metadata/checkpoint_cache.rs create mode 100644 components/engine_traits/src/flush.rs create mode 100644 components/engine_traits/src/tablet.rs create mode 100644 components/raftstore-v2/src/operation/command/admin/compact_log.rs create mode 100644 components/raftstore-v2/src/operation/command/admin/transfer_leader.rs create mode 100644 components/raftstore-v2/src/operation/ready/apply_trace.rs create mode 100644 components/raftstore-v2/src/worker/tablet_gc.rs create mode 100644 components/raftstore-v2/tests/failpoints/test_split.rs create mode 100644 components/raftstore-v2/tests/failpoints/test_trace_apply.rs create mode 100644 components/raftstore-v2/tests/integrations/test_trace_apply.rs create mode 100644 components/raftstore-v2/tests/integrations/test_transfer_leader.rs create mode 100644 components/server/src/server2.rs create mode 100644 proxy_server/src/engine.rs create mode 100644 src/config/configurable.rs rename src/{config.rs => config/mod.rs} (93%) create mode 100644 src/server/raftkv2/mod.rs create mode 100644 src/server/raftkv2/node.rs create mode 100644 
src/server/raftkv2/raft_extension.rs diff --git a/Cargo.lock b/Cargo.lock index bec4948a2af..ec1adad40ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -446,8 +446,7 @@ dependencies = [ [[package]] name = "backtrace" version = "0.3.61" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7a905d892734eea339e896738c14b9afce22b5318f64b951e70bf3844419b01" +source = "git+https://github.com/hehechen/backtrace-rs?branch=v0.3.61#d0aeebbea2298174e4c6edd3d1e54bda0e6624e4" dependencies = [ "addr2line", "cc", @@ -1432,12 +1431,6 @@ dependencies = [ "winapi 0.3.9", ] -[[package]] -name = "doc-comment" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "923dea538cea0aa3025e8685b20d6ee21ef99c4f77e954a30febbaac5ec73a97" - [[package]] name = "dyn-clone" version = "1.0.4" @@ -1730,9 +1723,11 @@ name = "engine_traits" version = "0.0.1" dependencies = [ "case_macros", + "collections", "error_code", "fail", "file_system", + "keys", "kvproto", "log_wrappers", "protobuf", @@ -3030,7 +3025,7 @@ dependencies = [ [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#bd07e9e598db63574cf06edaeea3c4687eadff59" +source = "git+https://github.com/tikv/rust-rocksdb.git#14e4fe7f47054408cf3d2905beeca798c6656191" dependencies = [ "bindgen 0.57.0", "bzip2-sys", @@ -3049,7 +3044,7 @@ dependencies = [ [[package]] name = "libtitan_sys" version = "0.0.1" -source = "git+https://github.com/tikv/rust-rocksdb.git#bd07e9e598db63574cf06edaeea3c4687eadff59" +source = "git+https://github.com/tikv/rust-rocksdb.git#14e4fe7f47054408cf3d2905beeca798c6656191" dependencies = [ "bzip2-sys", "cc", @@ -3566,9 +3561,9 @@ dependencies = [ [[package]] name = "ntapi" -version = "0.3.3" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26e041cd983acbc087e30fcba770380cfa352d0e392e175b2344ebaf7ea0602" +checksum = 
"bc51db7b362b205941f71232e56c625156eb9a929f8cf74a428fd5bc094a4afc" dependencies = [ "winapi 0.3.9", ] @@ -4659,6 +4654,7 @@ dependencies = [ name = "raft_log_engine" version = "0.0.1" dependencies = [ + "codec", "encryption", "engine_traits", "file_system", @@ -4673,6 +4669,7 @@ dependencies = [ "serde_derive", "slog", "slog-global", + "tempfile", "tikv_util", "time", "tracker", @@ -4760,6 +4757,7 @@ name = "raftstore-v2" version = "0.1.0" dependencies = [ "batch-system", + "bytes", "causal_ts", "collections", "concurrency_manager", @@ -4774,12 +4772,14 @@ dependencies = [ "keys", "kvproto", "log_wrappers", + "parking_lot 0.12.0", "pd_client", "prometheus", "protobuf", "raft", "raft-proto", "raftstore", + "rand 0.8.5", "resource_metering", "slog", "slog-global", @@ -4787,6 +4787,7 @@ dependencies = [ "tempfile", "test_pd", "test_util", + "thiserror", "tikv_util", "time", "tracker", @@ -4922,9 +4923,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.5.0" +version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b0d8e0819fadc20c74ea8373106ead0600e3a67ef1fe8da56e39b9ae7275674" +checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" dependencies = [ "autocfg", "crossbeam-deque", @@ -4934,14 +4935,13 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.9.0" +version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ab346ac5921dc62ffa9f89b7a773907511cdfa5490c572ae9be1be33e8afa4a" +checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" dependencies = [ "crossbeam-channel", "crossbeam-deque", "crossbeam-utils 0.8.8", - "lazy_static", "num_cpus", ] @@ -5145,7 +5145,7 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/tikv/rust-rocksdb.git#bd07e9e598db63574cf06edaeea3c4687eadff59" +source = "git+https://github.com/tikv/rust-rocksdb.git#14e4fe7f47054408cf3d2905beeca798c6656191" 
dependencies = [ "libc 0.2.132", "librocksdb_sys", @@ -5611,6 +5611,7 @@ dependencies = [ "raft", "raft_log_engine", "raftstore", + "raftstore-v2", "rand 0.8.5", "resolved_ts", "resource_metering", @@ -6030,13 +6031,12 @@ checksum = "20518fe4a4c9acf048008599e464deb21beeae3d3578418951a189c235a7a9a8" [[package]] name = "sysinfo" -version = "0.16.4" +version = "0.26.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c280c91abd1aed2e36be1bc8f56fbc7a2acbb2b58fbcac9641510179cc72dd9" +checksum = "ade661fa5e048ada64ad7901713301c21d2dbc5b65ee7967de8826c111452960" dependencies = [ "cfg-if 1.0.0", "core-foundation-sys", - "doc-comment", "libc 0.2.132", "ntapi", "once_cell", @@ -6600,7 +6600,7 @@ dependencies = [ [[package]] name = "tikv" -version = "6.5.0-alpha" +version = "6.6.0-alpha" dependencies = [ "anyhow", "api_version", @@ -6680,6 +6680,7 @@ dependencies = [ "raft", "raft_log_engine", "raftstore", + "raftstore-v2", "rand 0.7.3", "regex", "reqwest", diff --git a/Cargo.toml b/Cargo.toml index 55d6b086d42..fe73c10b584 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tikv" -version = "6.5.0-alpha" +version = "6.6.0-alpha" authors = ["The TiKV Authors"] description = "A distributed transactional key-value database powered by Rust and Raft" license = "Apache-2.0" @@ -137,6 +137,7 @@ protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft_log_engine = { workspace = true } raftstore = { workspace = true, features = ["engine_rocks"] } +raftstore-v2 = { workspace = true } rand = "0.7.3" regex = "1.3" resource_metering = { workspace = true } @@ -154,7 +155,7 @@ smallvec = "1.4" sst_importer = { workspace = true } strum = { version = "0.20", features = ["derive"] } sync_wrapper = "0.1.1" -sysinfo = "0.16" +sysinfo = "0.26" tempfile = "3.0" thiserror = "1.0" tidb_query_aggr = { workspace = true } @@ -219,6 +220,8 @@ cmake = { git = 
"https://github.com/rust-lang/cmake-rs" } # This is a workaround for cargo can't resolving the this patch in yatp. crossbeam-deque = { git = "https://github.com/crossbeam-rs/crossbeam", rev = "41ed3d948720f26149b2ebeaf58fe8a193134056" } +# remove this when https://github.com/rust-lang/backtrace-rs/pull/503 is merged. +backtrace = { git = 'https://github.com/hehechen/backtrace-rs', branch = "v0.3.61" } [target.'cfg(target_os = "linux")'.dependencies] procinfo = { git = "https://github.com/tikv/procinfo-rs", rev = "6599eb9dca74229b2c1fcc44118bef7eff127128" } # When you modify TiKV cooperatively with kvproto, this will be useful to submit the PR to TiKV and the PR to @@ -340,7 +343,7 @@ pd_client = { path = "components/pd_client" } profiler = { path = "components/profiler" } raft_log_engine = { path = "components/raft_log_engine" } raftstore = { path = "components/raftstore", default-features = false } -raftstore_v2 = { path = "components/raftstore-v2", default-features = false } +raftstore-v2 = { path = "components/raftstore-v2", default-features = false } resolved_ts = { path = "components/resolved_ts" } resource_metering = { path = "components/resource_metering" } security = { path = "components/security" } diff --git a/Makefile b/Makefile index 8c595643828..2e55ed72176 100644 --- a/Makefile +++ b/Makefile @@ -223,7 +223,7 @@ ci_fmt_check: ci_test: wget https://github.com/protocolbuffers/protobuf/releases/download/v3.8.0/protoc-3.8.0-linux-x86_64.zip unzip protoc-3.8.0-linux-x86_64.zip - PROTOC="`pwd`/bin/protoc" M="testold" ./proxy_scripts/ci_check.sh + # PROTOC="`pwd`/bin/protoc" M="testold" ./proxy_scripts/ci_check.sh PROTOC="`pwd`/bin/protoc" M="testnew" ./proxy_scripts/ci_check.sh make debug diff --git a/security/SECURITY.md b/SECURITY.md similarity index 98% rename from security/SECURITY.md rename to SECURITY.md index 353a70f039f..30be9e0daf0 100644 --- a/security/SECURITY.md +++ b/SECURITY.md @@ -18,6 +18,8 @@ The following are the versions that we support 
for security updates | Version | Supported | | ------- | ------------------ | +| 6.x | :white_check_mark: | +| 5.x | :white_check_mark: | | 4.x | :white_check_mark: | | 3.x | :white_check_mark: | | 2.x | :white_check_mark: | @@ -94,4 +96,4 @@ IvCICV7zG1cyuM/Z2Y7/TJ+upvahP46nM3s3G15b8FYuTSmRN1Kp9+mBt2BHqOy1 ulx+VF4Lf9n3ydf593Nha9bMJ/rnSp01 =XbYK -----END PGP PUBLIC KEY BLOCK----- -``` \ No newline at end of file +``` diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index b2d25a32d5b..94610face44 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -1,8 +1,7 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. use std::{ - borrow::ToOwned, cmp::Ordering, path::PathBuf, pin::Pin, str, string::ToString, sync::Arc, - time::Duration, u64, + borrow::ToOwned, cmp::Ordering, pin::Pin, str, string::ToString, sync::Arc, time::Duration, u64, }; use encryption_export::data_key_manager_from_config; @@ -28,7 +27,10 @@ use security::SecurityManager; use serde_json::json; use tikv::{ config::{ConfigController, TikvConfig}, - server::debug::{BottommostLevelCompaction, Debugger, RegionInfo}, + server::{ + debug::{BottommostLevelCompaction, Debugger, RegionInfo}, + KvEngineFactoryBuilder, + }, }; use tikv_util::escape; @@ -45,7 +47,6 @@ type MvccInfoStream = Pin, MvccInfo), Stri pub fn new_debug_executor( cfg: &TikvConfig, data_dir: Option<&str>, - skip_paranoid_checks: bool, host: Option<&str>, mgr: Arc, ) -> Box { @@ -55,47 +56,37 @@ pub fn new_debug_executor( // TODO: perhaps we should allow user skip specifying data path. 
let data_dir = data_dir.unwrap(); - let kv_path = cfg.infer_kv_engine_path(Some(data_dir)).unwrap(); let key_manager = data_key_manager_from_config(&cfg.security.encryption, &cfg.storage.data_dir) .unwrap() .map(Arc::new); let cache = cfg.storage.block_cache.build_shared_cache(); - let shared_block_cache = cache.is_some(); let env = cfg .build_shared_rocks_env(key_manager.clone(), None /* io_rate_limiter */) .unwrap(); - let mut kv_db_opts = cfg.rocksdb.build_opt(); - kv_db_opts.set_env(env.clone()); - kv_db_opts.set_paranoid_checks(!skip_paranoid_checks); - let kv_cfs_opts = cfg - .rocksdb - .build_cf_opts(&cache, None, cfg.storage.api_version()); - let kv_path = PathBuf::from(kv_path).canonicalize().unwrap(); - let kv_path = kv_path.to_str().unwrap(); - let mut kv_db = match new_engine_opt(kv_path, kv_db_opts, kv_cfs_opts) { + let factory = KvEngineFactoryBuilder::new(env.clone(), cfg, cache) + .lite(true) + .build(); + let kv_db = match factory.create_shared_db(data_dir) { Ok(db) => db, Err(e) => handle_engine_error(e), }; - kv_db.set_shared_block_cache(shared_block_cache); let cfg_controller = ConfigController::default(); if !cfg.raft_engine.enable { - let mut raft_db_opts = cfg.raftdb.build_opt(); - raft_db_opts.set_env(env); - let raft_db_cf_opts = cfg.raftdb.build_cf_opts(&cache); + let raft_db_opts = cfg.raftdb.build_opt(env, None); + let raft_db_cf_opts = cfg.raftdb.build_cf_opts(factory.block_cache()); let raft_path = cfg.infer_raft_db_path(Some(data_dir)).unwrap(); if !db_exist(&raft_path) { error!("raft db not exists: {}", raft_path); tikv_util::logger::exit_process_gracefully(-1); } - let mut raft_db = match new_engine_opt(&raft_path, raft_db_opts, raft_db_cf_opts) { + let raft_db = match new_engine_opt(&raft_path, raft_db_opts, raft_db_cf_opts) { Ok(db) => db, Err(e) => handle_engine_error(e), }; - raft_db.set_shared_block_cache(shared_block_cache); let debugger = Debugger::new(Engines::new(kv_db, raft_db), cfg_controller); Box::new(debugger) as Box } 
else { @@ -383,7 +374,7 @@ pub trait DebugExecutor { to_config: &TikvConfig, mgr: Arc, ) { - let rhs_debug_executor = new_debug_executor(to_config, to_data_dir, false, to_host, mgr); + let rhs_debug_executor = new_debug_executor(to_config, to_data_dir, to_host, mgr); let r1 = self.get_region_info(region); let r2 = rhs_debug_executor.get_region_info(region); diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 72078d07f62..30cd7035bef 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -59,7 +59,7 @@ fn main() { // Initialize configuration and security manager. let cfg_path = opt.config.as_ref(); - let cfg = cfg_path.map_or_else( + let mut cfg = cfg_path.map_or_else( || { let mut cfg = TikvConfig::default(); cfg.log.level = tikv_util::logger::get_level_by_string("warn") @@ -249,9 +249,8 @@ fn main() { .exit(); } - let skip_paranoid_checks = opt.skip_paranoid_checks; - let debug_executor = - new_debug_executor(&cfg, data_dir, skip_paranoid_checks, host, Arc::clone(&mgr)); + cfg.rocksdb.paranoid_checks = Some(!opt.skip_paranoid_checks); + let debug_executor = new_debug_executor(&cfg, data_dir, host, Arc::clone(&mgr)); match cmd { Cmd::Print { cf, key } => { @@ -643,7 +642,7 @@ fn compact_whole_cluster( .name(format!("compact-{}", addr)) .spawn_wrapper(move || { tikv_alloc::add_thread_memory_accessor(); - let debug_executor = new_debug_executor(&cfg, None, false, Some(&addr), mgr); + let debug_executor = new_debug_executor(&cfg, None, Some(&addr), mgr); for cf in cfs { debug_executor.compact( Some(&addr), @@ -682,20 +681,20 @@ fn read_fail_file(path: &str) -> Vec<(String, String)> { list } -fn run_ldb_command(args: Vec, cfg: &TikvConfig) { +fn build_rocks_opts(cfg: &TikvConfig) -> engine_rocks::RocksDbOptions { let key_manager = data_key_manager_from_config(&cfg.security.encryption, &cfg.storage.data_dir) .unwrap() .map(Arc::new); let env = get_env(key_manager, None /* io_rate_limiter */).unwrap(); - let mut opts = 
cfg.rocksdb.build_opt(); - opts.set_env(env); + cfg.rocksdb.build_opt(&cfg.rocksdb.build_resources(env)) +} - engine_rocks::raw::run_ldb_tool(&args, &opts); +fn run_ldb_command(args: Vec, cfg: &TikvConfig) { + engine_rocks::raw::run_ldb_tool(&args, &build_rocks_opts(cfg)); } fn run_sst_dump_command(args: Vec, cfg: &TikvConfig) { - let opts = cfg.rocksdb.build_opt(); - engine_rocks::raw::run_sst_dump_tool(&args, &opts); + engine_rocks::raw::run_sst_dump_tool(&args, &build_rocks_opts(cfg)); } fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, cfg: &TikvConfig) { @@ -714,7 +713,7 @@ fn print_bad_ssts(data_dir: &str, manifest: Option<&str>, pd_client: RpcClient, let stderr = BufferRedirect::stderr().unwrap(); let stdout = BufferRedirect::stdout().unwrap(); - let opts = cfg.rocksdb.build_opt(); + let opts = build_rocks_opts(cfg); match run_and_wait_child_process(|| engine_rocks::raw::run_sst_dump_tool(&args, &opts)) { Ok(code) => { diff --git a/cmd/tikv-server/src/main.rs b/cmd/tikv-server/src/main.rs index b366cd7849f..1d846d72bdb 100644 --- a/cmd/tikv-server/src/main.rs +++ b/cmd/tikv-server/src/main.rs @@ -7,7 +7,10 @@ use std::{path::Path, process}; use clap::{crate_authors, App, Arg}; use serde_json::{Map, Value}; use server::setup::{ensure_no_unrecognized_config, validate_and_persist_config}; -use tikv::config::{to_flatten_config_info, TikvConfig}; +use tikv::{ + config::{to_flatten_config_info, TikvConfig}, + storage::config::EngineType, +}; fn main() { let build_timestamp = option_env!("TIKV_BUILD_TIME"); @@ -207,5 +210,8 @@ fn main() { process::exit(0); } - server::server::run_tikv(config); + match config.storage.engine { + EngineType::RaftKv => server::server::run_tikv(config), + EngineType::RaftKv2 => server::server2::run_tikv(config), + } } diff --git a/components/backup-stream/src/checkpoint_manager.rs b/components/backup-stream/src/checkpoint_manager.rs index f34211ef7a5..5cf4292faa3 100644 --- 
a/components/backup-stream/src/checkpoint_manager.rs +++ b/components/backup-stream/src/checkpoint_manager.rs @@ -186,6 +186,16 @@ impl CheckpointManager { pub fn add_subscriber(&mut self, sub: Subscription) -> future![Result<()>] { let mgr = self.manager_handle.as_ref().cloned(); + let initial_data = self + .items + .values() + .map(|v| FlushEvent { + start_key: v.region.start_key.clone(), + end_key: v.region.end_key.clone(), + checkpoint: v.checkpoint.into_inner(), + ..Default::default() + }) + .collect::>(); // NOTE: we cannot send the real error into the client directly because once // we send the subscription into the sink, we cannot fetch it again :( @@ -208,6 +218,11 @@ impl CheckpointManager { mgr.send(SubscriptionOp::Add(sub)) .await .map_err(|err| annotate!(err, "failed to send request to subscriber manager"))?; + mgr.send(SubscriptionOp::Emit(initial_data)) + .await + .map_err(|err| { + annotate!(err, "failed to send initial data to subscriber manager") + })?; Ok(()) } } @@ -356,9 +371,13 @@ impl FlushObserver for BasicFlushObserver { .update_service_safe_point( format!("backup-stream-{}-{}", task, self.store_id), TimeStamp::new(rts.saturating_sub(1)), - // Add a service safe point for 30 mins (6x the default flush interval). - // It would probably be safe. - Duration::from_secs(1800), + // Add a service safe point for 24 hours. (the same as fatal error.) + // We make it the same duration as we meet fatal errors because TiKV may be + // SIGKILL'ed after it meets fatal error and before it successfully updated the + // fatal error safepoint. + // TODO: We'd better make the coordinator, who really + // calculates the checkpoint to register service safepoint. 
+ Duration::from_secs(60 * 60 * 24), ) .await { @@ -454,7 +473,7 @@ where } #[cfg(test)] -mod tests { +pub mod tests { use std::{ assert_matches, collections::HashMap, @@ -506,7 +525,7 @@ mod tests { assert_matches::assert_matches!(r, GetCheckpointResult::Ok{checkpoint, ..} if checkpoint.into_inner() == 24); } - struct MockPdClient { + pub struct MockPdClient { safepoint: RwLock>, } diff --git a/components/backup-stream/src/endpoint.rs b/components/backup-stream/src/endpoint.rs index 2ebeee2ea66..c50c70a2eec 100644 --- a/components/backup-stream/src/endpoint.rs +++ b/components/backup-stream/src/endpoint.rs @@ -273,7 +273,22 @@ where meta_client: MetadataClient, scheduler: Scheduler, ) -> Result<()> { - let tasks = meta_client.get_tasks().await?; + let tasks; + loop { + let r = meta_client.get_tasks().await; + match r { + Ok(t) => { + tasks = t; + break; + } + Err(e) => { + e.report("failed to get backup stream task"); + tokio::time::sleep(Duration::from_secs(5)).await; + continue; + } + } + } + for task in tasks.inner { info!("backup stream watch task"; "task" => ?task); if task.is_paused { @@ -1055,12 +1070,21 @@ pub enum ObserveOp { impl std::fmt::Debug for ObserveOp { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Self::Start { region } => f.debug_struct("Start").field("region", region).finish(), - Self::Stop { region } => f.debug_struct("Stop").field("region", region).finish(), - Self::Destroy { region } => f.debug_struct("Destroy").field("region", region).finish(), + Self::Start { region } => f + .debug_struct("Start") + .field("region", &utils::debug_region(region)) + .finish(), + Self::Stop { region } => f + .debug_struct("Stop") + .field("region", &utils::debug_region(region)) + .finish(), + Self::Destroy { region } => f + .debug_struct("Destroy") + .field("region", &utils::debug_region(region)) + .finish(), Self::RefreshResolver { region } => f .debug_struct("RefreshResolver") - .field("region", region) + .field("region", 
&utils::debug_region(region)) .finish(), Self::NotifyFailToStartObserve { region, @@ -1068,7 +1092,7 @@ impl std::fmt::Debug for ObserveOp { err, } => f .debug_struct("NotifyFailToStartObserve") - .field("region", region) + .field("region", &utils::debug_region(region)) .field("handle", handle) .field("err", err) .finish(), @@ -1165,3 +1189,41 @@ where self.run_task(task) } } + +#[cfg(test)] +mod test { + use engine_rocks::RocksEngine; + use raftstore::coprocessor::region_info_accessor::MockRegionInfoProvider; + use test_raftstore::MockRaftStoreRouter; + use tikv_util::worker::dummy_scheduler; + + use crate::{ + checkpoint_manager::tests::MockPdClient, endpoint, endpoint::Endpoint, metadata::test, Task, + }; + + #[tokio::test] + async fn test_start() { + let cli = test::test_meta_cli(); + let (sched, mut rx) = dummy_scheduler(); + let task = test::simple_task("simple_3"); + cli.insert_task_with_range(&task, &[]).await.unwrap(); + + fail::cfg("failed_to_get_tasks", "1*return").unwrap(); + Endpoint::<_, MockRegionInfoProvider, RocksEngine, MockRaftStoreRouter, MockPdClient>::start_and_watch_tasks(cli, sched).await.unwrap(); + fail::remove("failed_to_get_tasks"); + + let _t1 = rx.recv().unwrap(); + let t2 = rx.recv().unwrap(); + + match t2 { + Task::WatchTask(t) => match t { + endpoint::TaskOp::AddTask(t) => { + assert_eq!(t.info, task.info); + assert!(!t.is_paused); + } + _ => panic!("not match TaskOp type"), + }, + _ => panic!("not match Task type {:?}", t2), + } + } +} diff --git a/components/backup-stream/src/event_loader.rs b/components/backup-stream/src/event_loader.rs index 27c05b5b875..6222f058cd4 100644 --- a/components/backup-stream/src/event_loader.rs +++ b/components/backup-stream/src/event_loader.rs @@ -236,7 +236,8 @@ where ) -> Result { let mut last_err = None; for _ in 0..MAX_GET_SNAPSHOT_RETRY { - let r = self.observe_over(region, cmd()); + let c = cmd(); + let r = self.observe_over(region, c); match r { Ok(s) => { return Ok(s); diff --git 
a/components/backup-stream/src/metadata/checkpoint_cache.rs b/components/backup-stream/src/metadata/checkpoint_cache.rs new file mode 100644 index 00000000000..50573d003d8 --- /dev/null +++ b/components/backup-stream/src/metadata/checkpoint_cache.rs @@ -0,0 +1,71 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::time::Duration; + +use tikv_util::time::Instant; +use txn_types::TimeStamp; + +/// The lease time of a checkpoint. +/// 12s is the default interval of the coordinator tick. +const CACHE_LEASE_TIME: Duration = Duration::from_secs(12); + +pub struct CheckpointCache { + last_access: Instant, + checkpoint: TimeStamp, + + cache_lease_time: Duration, +} + +impl Default for CheckpointCache { + fn default() -> Self { + Self { + last_access: Instant::now_coarse(), + checkpoint: TimeStamp::zero(), + + cache_lease_time: CACHE_LEASE_TIME, + } + } +} + +impl CheckpointCache { + #[cfg(test)] + pub fn with_cache_lease(lease: Duration) -> Self { + Self { + cache_lease_time: lease, + ..Self::default() + } + } + + pub fn update(&mut self, checkpoint: impl Into) { + self.last_access = Instant::now_coarse(); + self.checkpoint = self.checkpoint.max(checkpoint.into()) + } + + pub fn get(&self) -> Option { + if self.checkpoint.is_zero() + || self.last_access.saturating_elapsed() > self.cache_lease_time + { + return None; + } + Some(self.checkpoint) + } +} + +#[cfg(test)] +mod test { + use std::time::Duration; + + use super::CheckpointCache; + + #[test] + fn test_basic() { + let mut c = CheckpointCache::with_cache_lease(Duration::from_millis(100)); + assert_eq!(c.get(), None); + c.update(42); + assert_eq!(c.get(), Some(42.into())); + c.update(41); + assert_eq!(c.get(), Some(42.into())); + std::thread::sleep(Duration::from_millis(200)); + assert_eq!(c.get(), None); + } +} diff --git a/components/backup-stream/src/metadata/client.rs b/components/backup-stream/src/metadata/client.rs index b7f1fcb2025..97e8d2140b5 --- 
a/components/backup-stream/src/metadata/client.rs +++ b/components/backup-stream/src/metadata/client.rs @@ -1,7 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{cmp::Ordering, collections::HashMap, fmt::Debug, path::Path}; +use std::{cmp::Ordering, collections::HashMap, fmt::Debug, path::Path, sync::Arc}; +use dashmap::DashMap; use kvproto::{ brpb::{StreamBackupError, StreamBackupTaskInfo}, metapb::Region, @@ -11,6 +12,7 @@ use tokio_stream::StreamExt; use txn_types::TimeStamp; use super::{ + checkpoint_cache::CheckpointCache, keys::{self, KeyValue, MetaKey}, store::{ CondTransaction, Condition, GetExtra, Keys, KvEvent, KvEventType, MetaStore, Snapshot, @@ -26,6 +28,7 @@ use crate::{ #[derive(Clone)] pub struct MetadataClient { store_id: u64, + caches: Arc>, pub(crate) meta_store: Store, } @@ -239,6 +242,7 @@ impl MetadataClient { pub fn new(store: Store, store_id: u64) -> Self { Self { meta_store: store, + caches: Arc::default(), store_id, } } @@ -353,6 +357,11 @@ impl MetadataClient { defer! 
{ super::metrics::METADATA_OPERATION_LATENCY.with_label_values(&["task_fetch"]).observe(now.saturating_elapsed().as_secs_f64()) } + fail::fail_point!("failed_to_get_tasks", |_| { + Err(Error::MalformedMetadata( + "faild to connect etcd client".to_string(), + )) + }); let snap = self.meta_store.snapshot().await?; let kvs = snap.get(Keys::Prefix(MetaKey::tasks())).await?; @@ -693,21 +702,41 @@ impl MetadataClient { Ok(min_checkpoint) } + fn cached_checkpoint(&self, task: &str) -> Option { + self.caches + .get(task) + .and_then(|x| x.value().get()) + .map(|x| Checkpoint { + provider: CheckpointProvider::Global, + ts: x, + }) + } + + fn update_cache(&self, task: &str, checkpoint: TimeStamp) { + let mut c = self.caches.entry(task.to_owned()).or_default(); + c.value_mut().update(checkpoint); + } + pub async fn get_region_checkpoint(&self, task: &str, region: &Region) -> Result { + if let Some(c) = self.cached_checkpoint(task) { + return Ok(c); + } let key = MetaKey::next_bakcup_ts_of_region(task, region); let s = self.meta_store.snapshot().await?; let r = s.get(Keys::Key(key.clone())).await?; - match r.len() { + let cp = match r.len() { 0 => { let global_cp = self.global_checkpoint_of(task).await?; let cp = match global_cp { None => self.get_task_start_ts_checkpoint(task).await?, Some(cp) => cp, }; - Ok(cp) + cp } - _ => Ok(Checkpoint::from_kv(&r[0])?), - } + _ => Checkpoint::from_kv(&r[0])?, + }; + self.update_cache(task, cp.ts); + Ok(cp) } } diff --git a/components/backup-stream/src/metadata/mod.rs b/components/backup-stream/src/metadata/mod.rs index 4c387533e49..20887a24b02 100644 --- a/components/backup-stream/src/metadata/mod.rs +++ b/components/backup-stream/src/metadata/mod.rs @@ -1,10 +1,11 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+mod checkpoint_cache; mod client; pub mod keys; mod metrics; pub mod store; -mod test; +pub mod test; pub use client::{Checkpoint, CheckpointProvider, MetadataClient, MetadataEvent, StreamTask}; pub use store::lazy_etcd::{ConnectionConfig, LazyEtcdClient}; diff --git a/components/backup-stream/src/metadata/store/lazy_etcd.rs b/components/backup-stream/src/metadata/store/lazy_etcd.rs index 6fc3a5332ea..b712a23973d 100644 --- a/components/backup-stream/src/metadata/store/lazy_etcd.rs +++ b/components/backup-stream/src/metadata/store/lazy_etcd.rs @@ -4,7 +4,11 @@ use std::{sync::Arc, time::Duration}; use etcd_client::{ConnectOptions, Error as EtcdError, OpenSslClientConfig}; use futures::Future; -use tikv_util::stream::{RetryError, RetryExt}; +use openssl::x509::verify::X509VerifyFlags; +use tikv_util::{ + info, + stream::{RetryError, RetryExt}, +}; use tokio::sync::OnceCell; use super::{etcd::EtcdSnapshot, EtcdStore, MetaStore}; @@ -30,6 +34,12 @@ impl ConnectionConfig { opts = opts.with_openssl_tls( OpenSslClientConfig::default() .ca_cert_pem(&tls.ca) + // Some users may prefer using multi-level self-signed certs. + // In this scenario, we must set this flag or openssl would probably complain that it cannot find the root CA. + // (Because the flags we provide allow users to provide exactly one CA cert.) + // We haven't made it configurable because it is enabled in gRPC by default too. + // TODO: Perhaps implement grpc-io based etcd client, fully remove the difference between gRPC TLS and our custom TLS? 
+ .manually(|c| c.cert_store_mut().set_flags(X509VerifyFlags::PARTIAL_CHAIN)) .client_cert_pem_and_key(&tls.client_cert, &tls.client_key.0), ) } @@ -113,7 +123,7 @@ where use futures::TryFutureExt; let r = tikv_util::stream::retry_ext( move || action().err_into::(), - RetryExt::default().with_fail_hook(|err| println!("meet error {:?}", err)), + RetryExt::default().with_fail_hook(|err| info!("retry it"; "err" => ?err)), ) .await; r.map_err(|err| err.0.into()) diff --git a/components/backup-stream/src/metadata/test.rs b/components/backup-stream/src/metadata/test.rs index ec2a30efbf3..a57722089bf 100644 --- a/components/backup-stream/src/metadata/test.rs +++ b/components/backup-stream/src/metadata/test.rs @@ -16,11 +16,11 @@ use crate::{ metadata::{store::SlashEtcStore, MetadataEvent}, }; -fn test_meta_cli() -> MetadataClient { +pub fn test_meta_cli() -> MetadataClient { MetadataClient::new(SlashEtcStore::default(), 42) } -fn simple_task(name: &str) -> StreamTask { +pub fn simple_task(name: &str) -> StreamTask { let mut task = StreamTask::default(); task.info.set_name(name.to_owned()); task.info.set_start_ts(1); diff --git a/components/backup-stream/src/subscription_manager.rs b/components/backup-stream/src/subscription_manager.rs index 83181829b43..91b4c096e7d 100644 --- a/components/backup-stream/src/subscription_manager.rs +++ b/components/backup-stream/src/subscription_manager.rs @@ -165,7 +165,7 @@ impl ScanCmd { } = self; let begin = Instant::now_coarse(); let stat = initial_scan.do_initial_scan(region, *last_checkpoint, handle.clone())?; - info!("initial scanning of leader transforming finished!"; "takes" => ?begin.saturating_elapsed(), "region" => %region.get_id(), "from_ts" => %last_checkpoint); + info!("initial scanning finished!"; "takes" => ?begin.saturating_elapsed(), "from_ts" => %last_checkpoint, utils::slog_region(region)); utils::record_cf_stat("lock", &stat.lock); utils::record_cf_stat("write", &stat.write); utils::record_cf_stat("default", 
&stat.data); @@ -281,7 +281,7 @@ impl ScanPoolHandle { } /// The default channel size. -const MESSAGE_BUFFER_SIZE: usize = 4096; +const MESSAGE_BUFFER_SIZE: usize = 32768; /// The operator for region subscription. /// It make a queue for operations over the `SubscriptionTracer`, generally, @@ -414,7 +414,7 @@ where true, false, ) - .map_err(|err| warn!("check epoch and stop failed."; "err" => %err)) + .map_err(|err| warn!("check epoch and stop failed."; utils::slog_region(region), "err" => %err)) .is_ok() }); } @@ -455,13 +455,16 @@ where "take" => ?now.saturating_elapsed(), "timedout" => %timedout); } let cps = self.subs.resolve_with(min_ts); - let min_region = cps.iter().min_by_key(|(_, rts)| rts); + let min_region = cps.iter().min_by_key(|rs| rs.checkpoint); // If there isn't any region observed, the `min_ts` can be used as resolved ts // safely. - let rts = min_region.map(|(_, rts)| *rts).unwrap_or(min_ts); - info!("getting checkpoint"; "defined_by_region" => ?min_region.map(|r| r.0.get_id()), "checkpoint" => %rts); + let rts = min_region.map(|rs| rs.checkpoint).unwrap_or(min_ts); + info!("getting checkpoint"; "defined_by_region" => ?min_region); self.subs.warn_if_gap_too_huge(rts); - callback(ResolvedRegions::new(rts, cps)); + callback(ResolvedRegions::new( + rts, + cps.into_iter().map(|r| (r.region, r.checkpoint)).collect(), + )); } } } @@ -583,7 +586,7 @@ where exists = true; let should_remove = old.handle().id == handle.id; if !should_remove { - warn!("stale retry command"; "region" => ?region, "handle" => ?handle, "old_handle" => ?old.handle()); + warn!("stale retry command"; utils::slog_region(®ion), "handle" => ?handle, "old_handle" => ?old.handle()); } should_remove }); diff --git a/components/backup-stream/src/subscription_track.rs b/components/backup-stream/src/subscription_track.rs index 50c3c6c1143..a24076661bb 100644 --- a/components/backup-stream/src/subscription_track.rs +++ b/components/backup-stream/src/subscription_track.rs @@ -2,7 +2,10 @@ 
use std::{sync::Arc, time::Duration}; -use dashmap::{mapref::one::RefMut, DashMap}; +use dashmap::{ + mapref::{entry::Entry, one::RefMut}, + DashMap, +}; use kvproto::metapb::Region; use raftstore::coprocessor::*; use resolved_ts::Resolver; @@ -57,6 +60,63 @@ impl RegionSubscription { } } +#[derive(PartialEq, Eq)] +pub enum CheckpointType { + MinTs, + StartTsOfInitialScan, + StartTsOfTxn(Option>), +} + +impl std::fmt::Debug for CheckpointType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::MinTs => write!(f, "MinTs"), + Self::StartTsOfInitialScan => write!(f, "StartTsOfInitialScan"), + Self::StartTsOfTxn(arg0) => f + .debug_tuple("StartTsOfTxn") + .field(&format_args!( + "{}", + utils::redact(&arg0.as_ref().map(|x| x.as_ref()).unwrap_or(&[])) + )) + .finish(), + } + } +} + +pub struct ResolveResult { + pub region: Region, + pub checkpoint: TimeStamp, + pub checkpoint_type: CheckpointType, +} + +impl std::fmt::Debug for ResolveResult { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ResolveResult") + .field("region", &self.region.get_id()) + .field("checkpoint", &self.checkpoint) + .field("checkpoint_type", &self.checkpoint_type) + .finish() + } +} + +impl ResolveResult { + fn resolve(sub: &mut RegionSubscription, min_ts: TimeStamp) -> Self { + let ts = sub.resolver.resolve(min_ts); + let ty = if ts == min_ts { + CheckpointType::MinTs + } else if sub.resolver.in_phase_one() { + CheckpointType::StartTsOfInitialScan + } else { + CheckpointType::StartTsOfTxn(sub.resolver.sample_far_lock()) + }; + Self { + region: sub.meta.clone(), + checkpoint: ts, + checkpoint_type: ty, + } + } +} + impl SubscriptionTracer { /// clear the current `SubscriptionTracer`. pub fn clear(&self) { @@ -91,11 +151,11 @@ impl SubscriptionTracer { /// try advance the resolved ts with the min ts of in-memory locks. /// returns the regions and theirs resolved ts. 
- pub fn resolve_with(&self, min_ts: TimeStamp) -> Vec<(Region, TimeStamp)> { + pub fn resolve_with(&self, min_ts: TimeStamp) -> Vec { self.0 .iter_mut() // Don't advance the checkpoint ts of removed region. - .map(|mut s| (s.meta.clone(), s.resolver.resolve(min_ts))) + .map(|mut s| ResolveResult::resolve(s.value_mut(), min_ts)) .collect() } @@ -128,21 +188,19 @@ impl SubscriptionTracer { if_cond: impl FnOnce(&RegionSubscription, &Region) -> bool, ) -> bool { let region_id = region.get_id(); - let remove_result = self.0.remove(®ion_id); + let remove_result = self.0.entry(region_id); match remove_result { - Some((_, mut v)) => { - if if_cond(&v, region) { + Entry::Occupied(mut x) => { + if if_cond(x.get(), region) { TRACK_REGION.dec(); - v.stop(); + x.get_mut().stop(); + let v = x.remove(); info!("stop listen stream from store"; "observer" => ?v, "region_id"=> %region_id); return true; } false } - None => { - warn!("trying to deregister region not registered"; "region_id" => %region_id); - false - } + Entry::Vacant(_) => false, } } @@ -156,7 +214,7 @@ impl SubscriptionTracer { let mut sub = match self.get_subscription_of(new_region.get_id()) { Some(sub) => sub, None => { - warn!("backup stream observer refreshing void subscription."; "new_region" => ?new_region); + warn!("backup stream observer refreshing void subscription."; utils::slog_region(new_region)); return true; } }; @@ -258,6 +316,12 @@ impl std::fmt::Debug for FutureLock { } impl TwoPhaseResolver { + /// try to get one of the key of the oldest lock in the resolver. 
+ pub fn sample_far_lock(&self) -> Option> { + let (_, keys) = self.resolver.locks().first_key_value()?; + keys.iter().next().cloned() + } + pub fn in_phase_one(&self) -> bool { self.stable_ts.is_some() } @@ -348,6 +412,8 @@ impl std::fmt::Debug for TwoPhaseResolver { #[cfg(test)] mod test { + use std::sync::Arc; + use kvproto::metapb::{Region, RegionEpoch}; use raftstore::coprocessor::ObserveHandle; use txn_types::TimeStamp; @@ -433,15 +499,24 @@ mod test { subs.deregister_region_if(®ion(5, 8, 1), |_, _| true); drop(region4_sub); - let mut rs = subs.resolve_with(TimeStamp::new(1000)); + let mut rs = subs + .resolve_with(TimeStamp::new(1000)) + .into_iter() + .map(|r| (r.region, r.checkpoint, r.checkpoint_type)) + .collect::>(); rs.sort_by_key(|k| k.0.get_id()); + use crate::subscription_track::CheckpointType::*; assert_eq!( rs, vec![ - (region(1, 1, 1), TimeStamp::new(42)), - (region(2, 2, 1), TimeStamp::new(1000)), - (region(3, 4, 1), TimeStamp::new(1000)), - (region(4, 8, 1), TimeStamp::new(128)), + (region(1, 1, 1), 42.into(), StartTsOfInitialScan), + (region(2, 2, 1), 1000.into(), MinTs), + (region(3, 4, 1), 1000.into(), MinTs), + ( + region(4, 8, 1), + 128.into(), + StartTsOfTxn(Some(Arc::from(b"Alpi".as_slice()))) + ), ] ); } diff --git a/components/backup-stream/src/utils.rs b/components/backup-stream/src/utils.rs index 6ecea21f2f5..1746882690f 100644 --- a/components/backup-stream/src/utils.rs +++ b/components/backup-stream/src/utils.rs @@ -3,6 +3,7 @@ use core::pin::Pin; use std::{ borrow::Borrow, + cell::RefCell, collections::{hash_map::RandomState, BTreeMap, HashMap}, ops::{Bound, RangeBounds}, path::Path, @@ -20,6 +21,7 @@ use engine_traits::{CfName, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE}; use futures::{channel::mpsc, executor::block_on, ready, task::Poll, FutureExt, StreamExt}; use kvproto::{ brpb::CompressionType, + metapb::Region, raft_cmdpb::{CmdType, Request}, }; use raft::StateRole; @@ -743,6 +745,109 @@ impl CompressionWriter for 
ZstdCompressionWriter { } } +/// make a pair of key range to impl Debug which prints [start_key,$end_key). +pub fn debug_key_range<'ret, 'a: 'ret, 'b: 'ret>( + start: &'a [u8], + end: &'b [u8], +) -> impl std::fmt::Debug + 'ret { + DebugKeyRange::<'a, 'b>(start, end) +} + +struct DebugKeyRange<'start, 'end>(&'start [u8], &'end [u8]); + +impl<'start, 'end> std::fmt::Debug for DebugKeyRange<'start, 'end> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let end_key = if self.1.is_empty() { + Either::Left("inf") + } else { + Either::Right(redact(&self.1)) + }; + let end_key: &dyn std::fmt::Display = match &end_key { + Either::Left(x) => x, + Either::Right(y) => y, + }; + write!(f, "[{},{})", redact(&self.0), end_key) + } +} + +/// make a [`Region`](kvproto::metapb::Region) implements [`slog::KV`], which +/// prints its fields like `[r.id=xxx] [r.ver=xxx] ...` +pub fn slog_region(r: &Region) -> impl slog::KV + '_ { + SlogRegion(r) +} + +/// make a [`Region`](kvproto::metapb::Region) implements +/// [`Debug`](std::fmt::Debug), which prints its essential fields. 
+pub fn debug_region(r: &Region) -> impl std::fmt::Debug + '_ { + DebugRegion(r) +} + +struct DebugRegion<'a>(&'a Region); + +impl<'a> std::fmt::Debug for DebugRegion<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let r = self.0; + f.debug_struct("Region") + .field("id", &r.get_id()) + .field("ver", &r.get_region_epoch().get_version()) + .field("conf_ver", &r.get_region_epoch().get_conf_ver()) + .field( + "range", + &debug_key_range(r.get_start_key(), r.get_end_key()), + ) + .field( + "peers", + &debug_iter(r.get_peers().iter().map(|p| p.store_id)), + ) + .finish() + } +} + +struct SlogRegion<'a>(&'a Region); + +impl<'a> slog::KV for SlogRegion<'a> { + fn serialize( + &self, + _record: &slog::Record<'_>, + serializer: &mut dyn slog::Serializer, + ) -> slog::Result { + let r = self.0; + serializer.emit_u64("r.id", r.get_id())?; + serializer.emit_u64("r.ver", r.get_region_epoch().get_version())?; + serializer.emit_u64("r.conf_ver", r.get_region_epoch().get_conf_ver())?; + serializer.emit_arguments( + "r.range", + &format_args!("{:?}", debug_key_range(r.get_start_key(), r.get_end_key())), + )?; + serializer.emit_arguments( + "r.peers", + &format_args!("{:?}", debug_iter(r.get_peers().iter().map(|p| p.store_id))), + )?; + Ok(()) + } +} + +pub fn debug_iter(t: impl Iterator) -> impl std::fmt::Debug { + DebugIter(RefCell::new(t)) +} + +struct DebugIter>(RefCell); + +impl> std::fmt::Debug for DebugIter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let mut is_first = true; + while let Some(x) = self.0.borrow_mut().next() { + if !is_first { + write!(f, ",{:?}", x)?; + } else { + write!(f, "{:?}", x)?; + is_first = false; + } + } + Ok(()) + } +} + #[cfg(test)] mod test { use std::{ @@ -755,10 +860,35 @@ mod test { use engine_traits::WriteOptions; use futures::executor::block_on; + use kvproto::metapb::{Region, RegionEpoch}; use tokio::io::{AsyncWriteExt, BufReader}; use crate::utils::{is_in_range, 
CallbackWaitGroup, SegmentMap}; + #[test] + fn test_redact() { + log_wrappers::set_redact_info_log(true); + let mut region = Region::default(); + region.set_id(42); + region.set_start_key(b"TiDB".to_vec()); + region.set_end_key(b"TiDC".to_vec()); + region.set_region_epoch({ + let mut r = RegionEpoch::default(); + r.set_version(108); + r.set_conf_ver(352); + r + }); + + // Can we make a better way to test this? + assert_eq!( + "Region { id: 42, ver: 108, conf_ver: 352, range: [?,?), peers: }", + format!("{:?}", super::debug_region(®ion)) + ); + + let range = super::debug_key_range(b"alpha", b"omega"); + assert_eq!("[?,?)", format!("{:?}", range)); + } + #[test] fn test_range_functions() { #[derive(Debug)] diff --git a/components/backup/src/endpoint.rs b/components/backup/src/endpoint.rs index b880da7a3dc..0469ffa30a7 100644 --- a/components/backup/src/endpoint.rs +++ b/components/backup/src/endpoint.rs @@ -330,6 +330,7 @@ impl BackupRange { assert!(!ctx.get_replica_read()); let snap_ctx = SnapContext { pb_ctx: &ctx, + allowed_in_flashback: self.region.is_in_flashback, ..Default::default() }; diff --git a/components/cdc/src/observer.rs b/components/cdc/src/observer.rs index 7c33d21aadd..696bc6341ee 100644 --- a/components/cdc/src/observer.rs +++ b/components/cdc/src/observer.rs @@ -272,6 +272,7 @@ mod tests { leader_id: 2, prev_lead_transferee: raft::INVALID_ID, vote: raft::INVALID_ID, + initialized: true, }, ); match rx.recv_timeout(Duration::from_millis(10)).unwrap().unwrap() { @@ -299,6 +300,7 @@ mod tests { leader_id: raft::INVALID_ID, prev_lead_transferee: 3, vote: 3, + initialized: true, }, ); match rx.recv_timeout(Duration::from_millis(10)).unwrap().unwrap() { diff --git a/components/engine_panic/src/compact.rs b/components/engine_panic/src/compact.rs index f1e78d57010..988bec790de 100644 --- a/components/engine_panic/src/compact.rs +++ b/components/engine_panic/src/compact.rs @@ -13,7 +13,7 @@ impl CompactExt for PanicEngine { panic!() } - fn compact_range( + 
fn compact_range_cf( &self, cf: &str, start_key: Option<&[u8]>, @@ -24,15 +24,6 @@ impl CompactExt for PanicEngine { panic!() } - fn compact_files_in_range( - &self, - start: Option<&[u8]>, - end: Option<&[u8]>, - output_level: Option, - ) -> Result<()> { - panic!() - } - fn compact_files_in_range_cf( &self, cf: &str, diff --git a/components/engine_panic/src/engine.rs b/components/engine_panic/src/engine.rs index a296c3df9d8..6bca7d46485 100644 --- a/components/engine_panic/src/engine.rs +++ b/components/engine_panic/src/engine.rs @@ -2,7 +2,7 @@ use engine_traits::{ IterOptions, Iterable, Iterator, KvEngine, Peekable, ReadOptions, Result, SyncMutable, - TabletAccessor, WriteOptions, + WriteOptions, }; use crate::{db_vector::PanicDbVector, snapshot::PanicSnapshot, write_batch::PanicWriteBatch}; @@ -24,16 +24,6 @@ impl KvEngine for PanicEngine { } } -impl TabletAccessor for PanicEngine { - fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &PanicEngine)) { - panic!() - } - - fn is_single_engine(&self) -> bool { - panic!() - } -} - impl Peekable for PanicEngine { type DbVector = PanicDbVector; diff --git a/components/engine_panic/src/misc.rs b/components/engine_panic/src/misc.rs index 82012b84ed6..5e6fbe87267 100644 --- a/components/engine_panic/src/misc.rs +++ b/components/engine_panic/src/misc.rs @@ -1,11 +1,29 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::{DeleteStrategy, MiscExt, Range, Result}; +use engine_traits::{DeleteStrategy, MiscExt, Range, Result, StatisticsReporter}; use crate::engine::PanicEngine; +pub struct PanicReporter; + +impl StatisticsReporter for PanicReporter { + fn new(name: &str) -> Self { + panic!() + } + + fn collect(&mut self, engine: &PanicEngine) { + panic!() + } + + fn flush(&mut self) { + panic!() + } +} + impl MiscExt for PanicEngine { - fn flush_cfs(&self, wait: bool) -> Result<()> { + type StatisticsReporter = PanicReporter; + + fn flush_cfs(&self, cfs: &[&str], wait: bool) -> Result<()> { panic!() } @@ -30,6 +48,10 @@ impl MiscExt for PanicEngine { panic!() } + fn get_sst_key_ranges(&self, cf: &str, level: usize) -> Result, Vec)>> { + panic!() + } + fn get_engine_used_size(&self) -> Result { panic!() } @@ -42,10 +64,18 @@ impl MiscExt for PanicEngine { panic!() } + fn pause_background_work(&self) -> Result<()> { + panic!() + } + fn exists(path: &str) -> bool { panic!() } + fn locked(path: &str) -> Result { + panic!() + } + fn dump_stats(&self) -> Result { panic!() } diff --git a/components/engine_panic/src/raft_engine.rs b/components/engine_panic/src/raft_engine.rs index ad05e66c6fa..c3de53b4932 100644 --- a/components/engine_panic/src/raft_engine.rs +++ b/components/engine_panic/src/raft_engine.rs @@ -47,11 +47,23 @@ impl RaftEngineReadOnly for PanicEngine { panic!() } - fn get_region_state(&self, raft_group_id: u64) -> Result> { + fn get_region_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result> { panic!() } - fn get_apply_state(&self, raft_group_id: u64) -> Result> { + fn get_apply_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result> { + panic!() + } + + fn get_flushed_index(&self, raft_group_id: u64, cf: &str) -> Result> { panic!() } @@ -108,15 +120,16 @@ impl RaftEngine for PanicEngine { panic!() } - fn append(&self, raft_group_id: u64, entries: Vec) -> Result { - panic!() - } - - fn put_raft_state(&self, 
raft_group_id: u64, state: &RaftLocalState) -> Result<()> { + fn gc(&self, raft_group_id: u64, from: u64, to: u64, batch: &mut Self::LogBatch) -> Result<()> { panic!() } - fn gc(&self, raft_group_id: u64, mut from: u64, to: u64) -> Result { + fn delete_all_but_one_states_before( + &self, + raft_group_id: u64, + apply_index: u64, + batch: &mut Self::LogBatch, + ) -> Result<()> { panic!() } @@ -132,10 +145,6 @@ impl RaftEngine for PanicEngine { panic!() } - fn reset_statistics(&self) { - panic!() - } - fn dump_stats(&self) -> Result { panic!() } @@ -148,10 +157,6 @@ impl RaftEngine for PanicEngine { panic!() } - fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { - panic!() - } - fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> where F: FnMut(u64) -> std::result::Result<(), E>, @@ -159,10 +164,6 @@ impl RaftEngine for PanicEngine { { panic!() } - - fn put_recover_state(&self, state: &StoreRecoverState) -> Result<()> { - panic!() - } } impl RaftLogBatch for PanicWriteBatch { @@ -202,11 +203,35 @@ impl RaftLogBatch for PanicWriteBatch { panic!() } - fn put_region_state(&mut self, raft_group_id: u64, state: &RegionLocalState) -> Result<()> { + fn put_region_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RegionLocalState, + ) -> Result<()> { + panic!() + } + + fn put_apply_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RaftApplyState, + ) -> Result<()> { + panic!() + } + + fn put_flushed_index( + &mut self, + raft_group_id: u64, + cf: &str, + tablet_index: u64, + apply_index: u64, + ) -> Result<()> { panic!() } - fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()> { + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { panic!() } } diff --git a/components/engine_rocks/src/compact.rs b/components/engine_rocks/src/compact.rs index b9e3e5fe558..199b7d9f3be 100644 --- a/components/engine_rocks/src/compact.rs +++ 
b/components/engine_rocks/src/compact.rs @@ -24,7 +24,7 @@ impl CompactExt for RocksEngine { Ok(false) } - fn compact_range( + fn compact_range_cf( &self, cf: &str, start_key: Option<&[u8]>, @@ -43,18 +43,6 @@ impl CompactExt for RocksEngine { Ok(()) } - fn compact_files_in_range( - &self, - start: Option<&[u8]>, - end: Option<&[u8]>, - output_level: Option, - ) -> Result<()> { - for cf_name in self.cf_names() { - self.compact_files_in_range_cf(cf_name, start, end, output_level)?; - } - Ok(()) - } - fn compact_files_in_range_cf( &self, cf: &str, diff --git a/components/engine_rocks/src/db_options.rs b/components/engine_rocks/src/db_options.rs index f4044c44449..f437cc7b433 100644 --- a/components/engine_rocks/src/db_options.rs +++ b/components/engine_rocks/src/db_options.rs @@ -66,23 +66,29 @@ impl DbOptions for RocksDbOptions { } fn get_rate_bytes_per_sec(&self) -> Option { - self.0.get_rate_bytes_per_sec() + self.0.get_rate_limiter().map(|r| r.get_bytes_per_second()) } fn set_rate_bytes_per_sec(&mut self, rate_bytes_per_sec: i64) -> Result<()> { - self.0 - .set_rate_bytes_per_sec(rate_bytes_per_sec) - .map_err(|e| box_err!(e)) + if let Some(r) = self.0.get_rate_limiter() { + r.set_bytes_per_second(rate_bytes_per_sec); + } else { + return Err(box_err!("rate limiter not found")); + } + Ok(()) } fn get_rate_limiter_auto_tuned(&self) -> Option { - self.0.get_auto_tuned() + self.0.get_rate_limiter().map(|r| r.get_auto_tuned()) } fn set_rate_limiter_auto_tuned(&mut self, rate_limiter_auto_tuned: bool) -> Result<()> { - self.0 - .set_auto_tuned(rate_limiter_auto_tuned) - .map_err(|e| box_err!(e)) + if let Some(r) = self.0.get_rate_limiter() { + r.set_auto_tuned(rate_limiter_auto_tuned); + } else { + return Err(box_err!("rate limiter not found")); + } + Ok(()) } fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 41066c85756..0e73de357e5 100644 --- 
a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -2,30 +2,17 @@ use std::{any::Any, sync::Arc}; -use engine_traits::{ - IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SyncMutable, TabletAccessor, -}; +use engine_traits::{IterOptions, Iterable, KvEngine, Peekable, ReadOptions, Result, SyncMutable}; use rocksdb::{DBIterator, Writable, DB}; use crate::{ - db_vector::RocksDbVector, - options::RocksReadOptions, - r2e, - rocks_metrics::{ - flush_engine_histogram_metrics, flush_engine_iostall_properties, flush_engine_properties, - flush_engine_ticker_metrics, - }, - rocks_metrics_defs::{ - ENGINE_HIST_TYPES, ENGINE_TICKER_TYPES, TITAN_ENGINE_HIST_TYPES, TITAN_ENGINE_TICKER_TYPES, - }, - util::get_cf_handle, + db_vector::RocksDbVector, options::RocksReadOptions, r2e, util::get_cf_handle, RocksEngineIterator, RocksSnapshot, }; #[derive(Clone, Debug)] pub struct RocksEngine { db: Arc, - shared_block_cache: bool, support_multi_batch_write: bool, } @@ -37,7 +24,6 @@ impl RocksEngine { pub fn from_db(db: Arc) -> Self { RocksEngine { db: db.clone(), - shared_block_cache: false, support_multi_batch_write: db.get_db_options().is_enable_multi_batch_write(), } } @@ -50,14 +36,6 @@ impl RocksEngine { self.db.clone() } - pub fn set_shared_block_cache(&mut self, enable: bool) { - self.shared_block_cache = enable; - } - - pub fn shared_block_cache(&self) -> bool { - self.shared_block_cache - } - pub fn support_multi_batch_write(&self) -> bool { self.support_multi_batch_write } @@ -74,51 +52,12 @@ impl KvEngine for RocksEngine { self.db.sync_wal().map_err(r2e) } - fn flush_metrics(&self, instance: &str) { - for t in ENGINE_TICKER_TYPES { - let v = self.db.get_and_reset_statistics_ticker_count(*t); - flush_engine_ticker_metrics(*t, v, instance); - } - for t in ENGINE_HIST_TYPES { - if let Some(v) = self.db.get_statistics_histogram(*t) { - flush_engine_histogram_metrics(*t, v, instance); - } - } - if self.db.is_titan() { - for t in 
TITAN_ENGINE_TICKER_TYPES { - let v = self.db.get_and_reset_statistics_ticker_count(*t); - flush_engine_ticker_metrics(*t, v, instance); - } - for t in TITAN_ENGINE_HIST_TYPES { - if let Some(v) = self.db.get_statistics_histogram(*t) { - flush_engine_histogram_metrics(*t, v, instance); - } - } - } - flush_engine_properties(&self.db, instance, self.shared_block_cache); - flush_engine_iostall_properties(&self.db, instance); - } - - fn reset_statistics(&self) { - self.db.reset_statistics(); - } - fn bad_downcast(&self) -> &T { let e: &dyn Any = &self.db; e.downcast_ref().expect("bad engine downcast") } } -impl TabletAccessor for RocksEngine { - fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &RocksEngine)) { - f(0, 0, self); - } - - fn is_single_engine(&self) -> bool { - true - } -} - impl Iterable for RocksEngine { type Iterator = RocksEngineIterator; diff --git a/components/engine_rocks/src/event_listener.rs b/components/engine_rocks/src/event_listener.rs index ad7a9de455f..b940fcb39f3 100644 --- a/components/engine_rocks/src/event_listener.rs +++ b/components/engine_rocks/src/event_listener.rs @@ -1,10 +1,11 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
+use engine_traits::PersistenceListener; use file_system::{get_io_type, set_io_type, IoType}; use regex::Regex; use rocksdb::{ - CompactionJobInfo, DBBackgroundErrorReason, FlushJobInfo, IngestionInfo, MutableStatus, - SubcompactionJobInfo, WriteStallInfo, + CompactionJobInfo, DBBackgroundErrorReason, FlushJobInfo, IngestionInfo, MemTableInfo, + MutableStatus, SubcompactionJobInfo, WriteStallInfo, }; use tikv_util::{error, metrics::CRITICAL_ERROR, set_panic_mark, warn, worker::Scheduler}; @@ -178,9 +179,40 @@ fn resolve_sst_filename_from_err(err: &str) -> Option { Some(filename) } +pub struct RocksPersistenceListener(PersistenceListener); + +impl RocksPersistenceListener { + pub fn new(listener: PersistenceListener) -> RocksPersistenceListener { + RocksPersistenceListener(listener) + } +} + +impl rocksdb::EventListener for RocksPersistenceListener { + fn on_memtable_sealed(&self, info: &MemTableInfo) { + self.0 + .on_memtable_sealed(info.cf_name().to_string(), info.earliest_seqno()); + } + + fn on_flush_completed(&self, job: &FlushJobInfo) { + self.0 + .on_flush_completed(job.cf_name(), job.largest_seqno()); + } +} + #[cfg(test)] mod tests { + use std::sync::{ + mpsc::{self, Sender}, + Arc, Mutex, + }; + + use engine_traits::{ + FlushProgress, FlushState, MiscExt, StateStorage, SyncMutable, CF_DEFAULT, DATA_CFS, + }; + use tempfile::Builder; + use super::*; + use crate::{util, RocksCfOptions, RocksDbOptions}; #[test] fn test_resolve_sst_filename() { @@ -188,4 +220,139 @@ mod tests { let filename = resolve_sst_filename_from_err(err).unwrap(); assert_eq!(filename, "/000398.sst"); } + + type Record = (u64, u64, FlushProgress); + + #[derive(Default)] + struct MemStorage { + records: Mutex>, + } + + impl StateStorage for MemStorage { + fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: FlushProgress) { + self.records + .lock() + .unwrap() + .push((region_id, tablet_index, pr)); + } + } + + struct FlushTrack { + sealed: Mutex>, + block_flush: Arc>, + } + + 
impl rocksdb::EventListener for FlushTrack { + fn on_memtable_sealed(&self, _: &MemTableInfo) { + let _ = self.sealed.lock().unwrap().send(()); + } + + fn on_flush_begin(&self, _: &FlushJobInfo) { + drop(self.block_flush.lock().unwrap()) + } + } + + #[test] + fn test_persistence_listener() { + let temp_dir = Builder::new() + .prefix("test_persistence_listener") + .tempdir() + .unwrap(); + let (region_id, tablet_index) = (2, 3); + + let storage = Arc::new(MemStorage::default()); + let state = Arc::new(FlushState::default()); + let listener = + PersistenceListener::new(region_id, tablet_index, state.clone(), storage.clone()); + let mut db_opt = RocksDbOptions::default(); + db_opt.add_event_listener(RocksPersistenceListener::new(listener)); + let (tx, rx) = mpsc::channel(); + let block_flush = Arc::new(Mutex::new(())); + db_opt.add_event_listener(FlushTrack { + sealed: Mutex::new(tx), + block_flush: block_flush.clone(), + }); + + let mut cf_opts: Vec<_> = DATA_CFS + .iter() + .map(|cf| (*cf, RocksCfOptions::default())) + .collect(); + cf_opts[0].1.set_max_write_buffer_number(4); + cf_opts[0].1.set_min_write_buffer_number_to_merge(2); + cf_opts[0].1.set_write_buffer_size(1024); + cf_opts[0].1.set_disable_auto_compactions(true); + let db = util::new_engine_opt(temp_dir.path().to_str().unwrap(), db_opt, cf_opts).unwrap(); + db.flush_cf(CF_DEFAULT, true).unwrap(); + let sst_count = || { + std::fs::read_dir(temp_dir.path()) + .unwrap() + .filter(|p| { + let p = match p { + Ok(p) => p, + Err(_) => return false, + }; + p.path().extension().map_or(false, |ext| ext == "sst") + }) + .count() + }; + // Although flush is triggered, but there is nothing to flush. + assert_eq!(sst_count(), 0); + assert_eq!(storage.records.lock().unwrap().len(), 0); + + // Flush one key should work. 
+ state.set_applied_index(2); + db.put_cf(CF_DEFAULT, b"k0", b"v0").unwrap(); + db.flush_cf(CF_DEFAULT, true).unwrap(); + assert_eq!(sst_count(), 1); + let record = storage.records.lock().unwrap().pop().unwrap(); + assert_eq!(storage.records.lock().unwrap().len(), 0); + assert_eq!(record.0, region_id); + assert_eq!(record.1, tablet_index); + assert_eq!(record.2.applied_index(), 2); + + // When puts and deletes are mixed, the puts may be deleted during flush. + state.set_applied_index(3); + db.put_cf(CF_DEFAULT, b"k0", b"v0").unwrap(); + db.delete_cf(CF_DEFAULT, b"k0").unwrap(); + db.delete_cf(CF_DEFAULT, b"k1").unwrap(); + db.put_cf(CF_DEFAULT, b"k1", b"v1").unwrap(); + db.flush_cf(CF_DEFAULT, true).unwrap(); + assert_eq!(sst_count(), 2); + let record = storage.records.lock().unwrap().pop().unwrap(); + assert_eq!(storage.records.lock().unwrap().len(), 0); + assert_eq!(record.0, region_id); + assert_eq!(record.1, tablet_index); + assert_eq!(record.2.applied_index(), 3); + // Detail check of `FlushProgress` will be done in raftstore-v2 tests. + + // Drain all the events. + while rx.try_recv().is_ok() {} + state.set_applied_index(4); + let block = block_flush.lock(); + // Seal twice to trigger flush. Seal third to make a seqno conflict, in + // which case flush largest seqno will be equal to seal earliest seqno. + let mut key_count = 2; + for i in 0..3 { + while rx.try_recv().is_err() { + db.put(format!("k{key_count}").as_bytes(), &[0; 512]) + .unwrap(); + key_count += 1; + } + state.set_applied_index(5 + i); + } + drop(block); + // Memtable is seal before put, so there must be still one KV in memtable. + db.flush_cf(CF_DEFAULT, true).unwrap(); + rx.try_recv().unwrap(); + // There is 2 sst before this round, and then 4 are merged into 2, so there + // should be 4 ssts. + assert_eq!(sst_count(), 4); + let records = storage.records.lock().unwrap(); + // Although it seals 4 times, but only create 2 SSTs, so only 2 records. 
+ assert_eq!(records.len(), 2); + // The indexes of two merged flush state are 4 and 5, so merged value is 5. + assert_eq!(records[0].2.applied_index(), 5); + // The last two flush state is 6 and 7. + assert_eq!(records[1].2.applied_index(), 7); + } } diff --git a/components/engine_rocks/src/file_system.rs b/components/engine_rocks/src/file_system.rs index f3211d52d68..b470237f313 100644 --- a/components/engine_rocks/src/file_system.rs +++ b/components/engine_rocks/src/file_system.rs @@ -82,17 +82,17 @@ mod tests { db.put(&data_key(b"a1"), &value).unwrap(); db.put(&data_key(b"a2"), &value).unwrap(); assert_eq!(stats.fetch(IoType::Flush, IoOp::Write), 0); - db.flush_cfs(true /* wait */).unwrap(); + db.flush_cfs(&[], true /* wait */).unwrap(); assert!(stats.fetch(IoType::Flush, IoOp::Write) > value_size * 2); assert!(stats.fetch(IoType::Flush, IoOp::Write) < value_size * 2 + amplification_bytes); stats.reset(); db.put(&data_key(b"a2"), &value).unwrap(); db.put(&data_key(b"a3"), &value).unwrap(); - db.flush_cfs(true /* wait */).unwrap(); + db.flush_cfs(&[], true /* wait */).unwrap(); assert!(stats.fetch(IoType::Flush, IoOp::Write) > value_size * 2); assert!(stats.fetch(IoType::Flush, IoOp::Write) < value_size * 2 + amplification_bytes); stats.reset(); - db.compact_range( + db.compact_range_cf( CF_DEFAULT, None, // start_key None, // end_key false, // exclusive_manual diff --git a/components/engine_rocks/src/lib.rs b/components/engine_rocks/src/lib.rs index b6f3e36146c..94a4c23a3c4 100644 --- a/components/engine_rocks/src/lib.rs +++ b/components/engine_rocks/src/lib.rs @@ -16,6 +16,8 @@ //! Please read the engine_trait crate docs before hacking. 
#![cfg_attr(test, feature(test))] +#![feature(let_chains)] +#![feature(option_get_or_insert_default)] #[allow(unused_extern_crates)] extern crate tikv_alloc; @@ -104,7 +106,10 @@ pub mod file_system; mod raft_engine; -pub use rocksdb::{set_perf_flags, set_perf_level, PerfContext, PerfFlag, PerfFlags, PerfLevel}; +pub use rocksdb::{ + set_perf_flags, set_perf_level, PerfContext, PerfFlag, PerfFlags, PerfLevel, + Statistics as RocksStatistics, +}; pub mod flow_control_factors; pub use flow_control_factors::*; diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index 482686ffd1a..55546869272 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -8,8 +8,8 @@ use rocksdb::Range as RocksRange; use tikv_util::{box_try, keybuilder::KeyBuilder}; use crate::{ - engine::RocksEngine, r2e, rocks_metrics_defs::*, sst::RocksSstWriterBuilder, util, - RocksSstWriter, + engine::RocksEngine, r2e, rocks_metrics::RocksStatisticsReporter, rocks_metrics_defs::*, + sst::RocksSstWriterBuilder, util, RocksSstWriter, }; pub const MAX_DELETE_COUNT_BY_KEY: usize = 2048; @@ -126,11 +126,18 @@ impl RocksEngine { } impl MiscExt for RocksEngine { - fn flush_cfs(&self, wait: bool) -> Result<()> { + type StatisticsReporter = RocksStatisticsReporter; + + fn flush_cfs(&self, cfs: &[&str], wait: bool) -> Result<()> { let mut handles = vec![]; - for cf in self.cf_names() { + for cf in cfs { handles.push(util::get_cf_handle(self.as_inner(), cf)?); } + if handles.is_empty() { + for cf in self.cf_names() { + handles.push(util::get_cf_handle(self.as_inner(), cf)?); + } + } self.as_inner().flush_cfs(&handles, wait).map_err(r2e) } @@ -231,6 +238,24 @@ impl MiscExt for RocksEngine { Ok(false) } + fn get_sst_key_ranges(&self, cf: &str, level: usize) -> Result, Vec)>> { + let handle = util::get_cf_handle(self.as_inner(), cf)?; + let ret = self + .as_inner() + .get_column_family_meta_data(handle) + .get_level(level) + .get_files() + 
.iter() + .map(|sst_meta| { + ( + sst_meta.get_smallestkey().to_vec(), + sst_meta.get_largestkey().to_vec(), + ) + }) + .collect(); + Ok(ret) + } + fn get_engine_used_size(&self) -> Result { let mut used_size: u64 = 0; for cf in ALL_CFS { @@ -248,10 +273,20 @@ impl MiscExt for RocksEngine { self.as_inner().sync_wal().map_err(r2e) } + fn pause_background_work(&self) -> Result<()> { + self.as_inner().pause_bg_work(); + Ok(()) + } + fn exists(path: &str) -> bool { crate::util::db_exist(path) } + fn locked(path: &str) -> Result { + let env = rocksdb::Env::default(); + env.is_db_locked(path).map_err(r2e) + } + fn dump_stats(&self) -> Result { const ROCKSDB_DB_STATS_KEY: &str = "rocksdb.dbstats"; const ROCKSDB_CF_STATS_KEY: &str = "rocksdb.cfstats"; @@ -272,11 +307,6 @@ impl MiscExt for RocksEngine { s.extend_from_slice(v.as_bytes()); } - // more stats if enable_statistics is true. - if let Some(v) = self.as_inner().get_statistics() { - s.extend_from_slice(v.as_bytes()); - } - Ok(box_try!(String::from_utf8(s))) } @@ -331,7 +361,8 @@ impl MiscExt for RocksEngine { #[cfg(test)] mod tests { use engine_traits::{ - DeleteStrategy, Iterable, Iterator, Mutable, SyncMutable, WriteBatchExt, ALL_CFS, + CompactExt, DeleteStrategy, Iterable, Iterator, Mutable, SyncMutable, WriteBatchExt, + ALL_CFS, }; use tempfile::Builder; @@ -579,4 +610,70 @@ mod tests { .unwrap(); check_data(&db, &[cf], kvs_left.as_slice()); } + + #[test] + fn test_get_sst_key_ranges() { + let path = Builder::new() + .prefix("test_get_sst_key_ranges") + .tempdir() + .unwrap(); + let path_str = path.path().to_str().unwrap(); + + let mut opts = RocksDbOptions::default(); + opts.create_if_missing(true); + opts.enable_multi_batch_write(true); + + let mut cf_opts = RocksCfOptions::default(); + // Prefix extractor(trim the timestamp at tail) for write cf. 
+ cf_opts + .set_prefix_extractor( + "FixedSuffixSliceTransform", + crate::util::FixedSuffixSliceTransform::new(8), + ) + .unwrap_or_else(|err| panic!("{:?}", err)); + // Create prefix bloom filter for memtable. + cf_opts.set_memtable_prefix_bloom_size_ratio(0.1_f64); + let cf = "default"; + let db = new_engine_opt(path_str, opts, vec![(cf, cf_opts)]).unwrap(); + let mut wb = db.write_batch(); + let kvs: Vec<(&[u8], &[u8])> = vec![ + (b"k1", b"v1"), + (b"k2", b"v2"), + (b"k6", b"v3"), + (b"k7", b"v4"), + ]; + + for &(k, v) in kvs.as_slice() { + wb.put_cf(cf, k, v).unwrap(); + } + wb.write().unwrap(); + + db.flush_cf(cf, true).unwrap(); + let sst_range = db.get_sst_key_ranges(cf, 0).unwrap(); + let expected = vec![(b"k1".to_vec(), b"k7".to_vec())]; + assert_eq!(sst_range, expected); + + let mut wb = db.write_batch(); + let kvs: Vec<(&[u8], &[u8])> = vec![(b"k3", b"v1"), (b"k4", b"v2"), (b"k8", b"v3")]; + + for &(k, v) in kvs.as_slice() { + wb.put_cf(cf, k, v).unwrap(); + } + wb.write().unwrap(); + + db.flush_cf(cf, true).unwrap(); + let sst_range = db.get_sst_key_ranges(cf, 0).unwrap(); + let expected = vec![ + (b"k3".to_vec(), b"k8".to_vec()), + (b"k1".to_vec(), b"k7".to_vec()), + ]; + assert_eq!(sst_range, expected); + + db.compact_range_cf(cf, None, None, false, 1).unwrap(); + let sst_range = db.get_sst_key_ranges(cf, 0).unwrap(); + assert_eq!(sst_range.len(), 0); + let sst_range = db.get_sst_key_ranges(cf, 1).unwrap(); + let expected = vec![(b"k1".to_vec(), b"k8".to_vec())]; + assert_eq!(sst_range, expected); + } } diff --git a/components/engine_rocks/src/options.rs b/components/engine_rocks/src/options.rs index c50c7734f79..7579c92ba79 100644 --- a/components/engine_rocks/src/options.rs +++ b/components/engine_rocks/src/options.rs @@ -40,6 +40,7 @@ impl From for RocksWriteOptions { let mut r = RawWriteOptions::default(); r.set_sync(opts.sync()); r.set_no_slowdown(opts.no_slowdown()); + r.disable_wal(opts.disable_wal()); // TODO: enable it. 
r.set_memtable_insert_hint_per_batch(false); RocksWriteOptions(r) diff --git a/components/engine_rocks/src/raft_engine.rs b/components/engine_rocks/src/raft_engine.rs index da15b1708b8..d5331a2ce29 100644 --- a/components/engine_rocks/src/raft_engine.rs +++ b/components/engine_rocks/src/raft_engine.rs @@ -3,8 +3,8 @@ // #[PerformanceCriticalPath] use engine_traits::{ Error, Iterable, KvEngine, MiscExt, Mutable, Peekable, RaftEngine, RaftEngineDebug, - RaftEngineReadOnly, RaftLogBatch, RaftLogGcTask, Result, SyncMutable, WriteBatch, - WriteBatchExt, WriteOptions, CF_DEFAULT, RAFT_LOG_MULTI_GET_CNT, + RaftEngineReadOnly, RaftLogBatch, Result, WriteBatch, WriteBatchExt, WriteOptions, CF_DEFAULT, + RAFT_LOG_MULTI_GET_CNT, }; use kvproto::{ metapb::Region, @@ -144,14 +144,26 @@ impl RaftEngineReadOnly for RocksEngine { self.get_msg_cf(CF_DEFAULT, keys::PREPARE_BOOTSTRAP_KEY) } - fn get_region_state(&self, raft_group_id: u64) -> Result> { - let key = keys::region_state_key(raft_group_id); - self.get_msg_cf(CF_DEFAULT, &key) + // Following methods are used by raftstore v2 only, which always use raft log + // engine. 
+ fn get_region_state( + &self, + _raft_group_id: u64, + _apply_index: u64, + ) -> Result> { + panic!() } - fn get_apply_state(&self, raft_group_id: u64) -> Result> { - let key = keys::apply_state_key(raft_group_id); - self.get_msg_cf(CF_DEFAULT, &key) + fn get_apply_state( + &self, + _raft_group_id: u64, + _apply_index: u64, + ) -> Result> { + panic!() + } + + fn get_flushed_index(&self, _raft_group_id: u64, _cf: &str) -> Result> { + panic!() } fn get_recover_state(&self) -> Result> { @@ -286,48 +298,24 @@ impl RaftEngine for RocksEngine { Ok(()) } - fn append(&self, raft_group_id: u64, entries: Vec) -> Result { - let mut wb = self.write_batch(); - let buf = Vec::with_capacity(1024); - wb.append_impl(raft_group_id, &entries, buf)?; - self.consume(&mut wb, false) - } - - fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { - self.put_msg(&keys::raft_state_key(raft_group_id), state) - } - - fn batch_gc(&self, groups: Vec) -> Result { - let mut total = 0; - let mut raft_wb = self.write_batch_with_cap(4 * 1024); - for task in groups { - total += self.gc_impl(task.raft_group_id, task.from, task.to, &mut raft_wb)?; - } - // TODO: disable WAL here. - if !WriteBatch::is_empty(&raft_wb) { - raft_wb.write()?; - } - Ok(total) + fn gc(&self, raft_group_id: u64, from: u64, to: u64, batch: &mut Self::LogBatch) -> Result<()> { + self.gc_impl(raft_group_id, from, to, batch)?; + Ok(()) } - fn gc(&self, raft_group_id: u64, from: u64, to: u64) -> Result { - let mut raft_wb = self.write_batch_with_cap(1024); - let total = self.gc_impl(raft_group_id, from, to, &mut raft_wb)?; - // TODO: disable WAL here. 
- if !WriteBatch::is_empty(&raft_wb) { - raft_wb.write()?; - } - Ok(total) + fn delete_all_but_one_states_before( + &self, + _raft_group_id: u64, + _apply_index: u64, + _batch: &mut Self::LogBatch, + ) -> Result<()> { + panic!() } fn flush_metrics(&self, instance: &str) { KvEngine::flush_metrics(self, instance) } - fn reset_statistics(&self) { - KvEngine::reset_statistics(self) - } - fn dump_stats(&self) -> Result { MiscExt::dump_stats(self) } @@ -343,10 +331,6 @@ impl RaftEngine for RocksEngine { self.as_inner().path() } - fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { - self.put_msg(keys::STORE_IDENT_KEY, ident) - } - fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> where F: FnMut(u64) -> std::result::Result<(), E>, @@ -374,10 +358,6 @@ impl RaftEngine for RocksEngine { Some(e) => Err(e), } } - - fn put_recover_state(&self, state: &StoreRecoverState) -> Result<()> { - self.put_msg(keys::RECOVER_STATE_KEY, state) - } } impl RaftLogBatch for RocksWriteBatchVec { @@ -424,12 +404,38 @@ impl RaftLogBatch for RocksWriteBatchVec { self.delete(keys::PREPARE_BOOTSTRAP_KEY) } - fn put_region_state(&mut self, raft_group_id: u64, state: &RegionLocalState) -> Result<()> { - self.put_msg(&keys::region_state_key(raft_group_id), state) + // Following methods are used by raftstore v2 only, which always use raft log + // engine. 
+ fn put_region_state( + &mut self, + _raft_group_id: u64, + _apply_index: u64, + _state: &RegionLocalState, + ) -> Result<()> { + panic!() + } + + fn put_apply_state( + &mut self, + _raft_group_id: u64, + _apply_index: u64, + _state: &RaftApplyState, + ) -> Result<()> { + panic!() } - fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()> { - self.put_msg(&keys::apply_state_key(raft_group_id), state) + fn put_flushed_index( + &mut self, + _raft_group_id: u64, + _cf: &str, + _tablet_index: u64, + _apply_index: u64, + ) -> Result<()> { + panic!() + } + + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { + self.put_msg(keys::RECOVER_STATE_KEY, state) } } diff --git a/components/engine_rocks/src/raw.rs b/components/engine_rocks/src/raw.rs index 1a8718588b2..e940fdd2cd7 100644 --- a/components/engine_rocks/src/raw.rs +++ b/components/engine_rocks/src/raw.rs @@ -10,10 +10,10 @@ pub use rocksdb::{ new_compaction_filter_raw, run_ldb_tool, run_sst_dump_tool, BlockBasedOptions, Cache, ChecksumType, CompactOptions, CompactionFilter, CompactionFilterContext, CompactionFilterDecision, CompactionFilterFactory, CompactionFilterValueType, - CompactionJobInfo, CompactionOptions, CompactionPriority, DBBottommostLevelCompaction, - DBCompactionFilter, DBCompactionStyle, DBCompressionType, DBEntryType, DBRateLimiterMode, - DBRecoveryMode, DBStatisticsTickerType, DBTitanDBBlobRunMode, Env, EventListener, - IngestExternalFileOptions, LRUCacheOptions, MemoryAllocator, PerfContext, - PrepopulateBlockCache, Range, SliceTransform, TablePropertiesCollector, - TablePropertiesCollectorFactory, + CompactionJobInfo, CompactionOptions, CompactionPriority, ConcurrentTaskLimiter, + DBBottommostLevelCompaction, DBCompactionFilter, DBCompactionStyle, DBCompressionType, + DBEntryType, DBRateLimiterMode, DBRecoveryMode, DBStatisticsTickerType, DBTitanDBBlobRunMode, + Env, EventListener, IngestExternalFileOptions, LRUCacheOptions, 
MemoryAllocator, PerfContext, + PrepopulateBlockCache, Range, RateLimiter, SliceTransform, Statistics, + TablePropertiesCollector, TablePropertiesCollectorFactory, WriteBufferManager, }; diff --git a/components/engine_rocks/src/rocks_metrics.rs b/components/engine_rocks/src/rocks_metrics.rs index 4a88c6675ed..522696cb150 100644 --- a/components/engine_rocks/src/rocks_metrics.rs +++ b/components/engine_rocks/src/rocks_metrics.rs @@ -1,14 +1,15 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::CF_DEFAULT; +use collections::HashMap; +use engine_traits::{StatisticsReporter, CF_DEFAULT}; use lazy_static::lazy_static; use prometheus::*; use prometheus_static_metric::*; use rocksdb::{ - DBStatisticsHistogramType as HistType, DBStatisticsTickerType as TickerType, HistogramData, DB, + DBStatisticsHistogramType as HistType, DBStatisticsTickerType as TickerType, HistogramData, }; -use crate::rocks_metrics_defs::*; +use crate::{engine::RocksEngine, rocks_metrics_defs::*, RocksStatistics}; make_auto_flush_static_metric! { pub label_enum TickerName { @@ -581,12 +582,6 @@ pub fn flush_engine_ticker_metrics(t: TickerType, value: u64, name: &str) { .discardable .inc_by(value); } - TickerType::TitanGcSample => { - STORE_ENGINE_BLOB_GC_ACTION - .get(name_enum) - .sample - .inc_by(value); - } TickerType::TitanGcSmallFile => { STORE_ENGINE_BLOB_GC_ACTION .get(name_enum) @@ -611,6 +606,7 @@ pub fn flush_engine_ticker_metrics(t: TickerType, value: u64, name: &str) { .trigger_next .inc_by(value); } + // TODO: Some tickers are ignored. 
_ => {} } } @@ -910,214 +906,351 @@ pub fn flush_engine_histogram_metrics(t: HistType, value: HistogramData, name: & } } -pub fn flush_engine_iostall_properties(engine: &DB, name: &str) { - let stall_num = ROCKSDB_IOSTALL_KEY.len(); - let mut counter = vec![0; stall_num]; - for cf in engine.cf_names() { - let handle = crate::util::get_cf_handle(engine, cf).unwrap(); - if let Some(info) = engine.get_map_property_cf(handle, ROCKSDB_CFSTATS) { - for i in 0..stall_num { - let value = info.get_property_int_value(ROCKSDB_IOSTALL_KEY[i]); - counter[i] += value as i64; - } - } else { - return; - } - } - for i in 0..stall_num { - STORE_ENGINE_WRITE_STALL_REASON_GAUGE_VEC - .with_label_values(&[name, ROCKSDB_IOSTALL_TYPE[i]]) - .set(counter[i]); - } +#[derive(Default, Clone)] +struct CfLevelStats { + num_files: Option, + // sum(compression_ratio_i * num_files_i) + weighted_compression_ratio: Option, + num_blob_files: Option, } -pub fn flush_engine_properties(engine: &DB, name: &str, shared_block_cache: bool) { - for cf in engine.cf_names() { - let handle = crate::util::get_cf_handle(engine, cf).unwrap(); - // It is important to monitor each cf's size, especially the "raft" and "lock" - // column families. - let cf_used_size = crate::util::get_engine_cf_used_size(engine, handle); - STORE_ENGINE_SIZE_GAUGE_VEC - .with_label_values(&[name, cf]) - .set(cf_used_size as i64); - - if !shared_block_cache { - let block_cache_usage = engine.get_block_cache_usage_cf(handle); - STORE_ENGINE_BLOCK_CACHE_USAGE_GAUGE_VEC - .with_label_values(&[name, cf]) - .set(block_cache_usage as i64); - } - - let blob_cache_usage = engine.get_blob_cache_usage_cf(handle); - STORE_ENGINE_BLOB_CACHE_USAGE_GAUGE_VEC - .with_label_values(&[name, cf]) - .set(blob_cache_usage as i64); - - // TODO: find a better place to record these metrics. 
- // Refer: https://github.com/facebook/rocksdb/wiki/Memory-usage-in-RocksDB - // For index and filter blocks memory - if let Some(readers_mem) = engine.get_property_int_cf(handle, ROCKSDB_TABLE_READERS_MEM) { - STORE_ENGINE_MEMORY_GAUGE_VEC - .with_label_values(&[name, cf, "readers-mem"]) - .set(readers_mem as i64); - } - - // For memtable - if let Some(mem_table) = engine.get_property_int_cf(handle, ROCKSDB_CUR_SIZE_ALL_MEM_TABLES) - { - STORE_ENGINE_MEMORY_GAUGE_VEC - .with_label_values(&[name, cf, "mem-tables"]) - .set(mem_table as i64); - } +#[derive(Default)] +struct CfStats { + used_size: Option, + blob_cache_size: Option, + readers_mem: Option, + mem_tables: Option, + num_keys: Option, + pending_compaction_bytes: Option, + num_immutable_mem_table: Option, + live_blob_size: Option, + num_live_blob_file: Option, + num_obsolete_blob_file: Option, + live_blob_file_size: Option, + obsolete_blob_file_size: Option, + blob_file_discardable_ratio_le0: Option, + blob_file_discardable_ratio_le20: Option, + blob_file_discardable_ratio_le50: Option, + blob_file_discardable_ratio_le80: Option, + blob_file_discardable_ratio_le100: Option, + levels: Vec, +} - // TODO: add cache usage and pinned usage. 
+#[derive(Default)] +struct DbStats { + num_snapshots: Option, + oldest_snapshot_time: Option, + block_cache_size: Option, + stall_num: Option<[u64; ROCKSDB_IOSTALL_KEY.len()]>, +} - if let Some(num_keys) = engine.get_property_int_cf(handle, ROCKSDB_ESTIMATE_NUM_KEYS) { - STORE_ENGINE_ESTIMATE_NUM_KEYS_VEC - .with_label_values(&[name, cf]) - .set(num_keys as i64); - } +pub struct RocksStatisticsReporter { + name: String, + db_stats: DbStats, + cf_stats: HashMap, +} - // Pending compaction bytes - if let Some(pending_compaction_bytes) = - crate::util::get_cf_pending_compaction_bytes(engine, handle) - { - STORE_ENGINE_PENDING_COMPACTION_BYTES_VEC - .with_label_values(&[name, cf]) - .set(pending_compaction_bytes as i64); +impl StatisticsReporter for RocksStatisticsReporter { + fn new(name: &str) -> Self { + Self { + name: name.to_owned(), + db_stats: DbStats::default(), + cf_stats: HashMap::default(), } + } - let opts = engine.get_options_cf(handle); - for level in 0..opts.get_num_levels() { - // Compression ratio at levels + fn collect(&mut self, engine: &RocksEngine) { + let db = engine.as_inner(); + for cf in db.cf_names() { + let cf_stats = self.cf_stats.entry(cf.to_owned()).or_default(); + let handle = crate::util::get_cf_handle(db, cf).unwrap(); + // It is important to monitor each cf's size, especially the "raft" and "lock" + // column families. + *cf_stats.used_size.get_or_insert_default() += + crate::util::get_engine_cf_used_size(db, handle); + *cf_stats.blob_cache_size.get_or_insert_default() += db.get_blob_cache_usage_cf(handle); + // TODO: find a better place to record these metrics. 
+ // Refer: https://github.com/facebook/rocksdb/wiki/Memory-usage-in-RocksDB + // For index and filter blocks memory + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TABLE_READERS_MEM) { + *cf_stats.readers_mem.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_CUR_SIZE_ALL_MEM_TABLES) { + *cf_stats.mem_tables.get_or_insert_default() += v; + } + // TODO: add cache usage and pinned usage. + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_ESTIMATE_NUM_KEYS) { + *cf_stats.num_keys.get_or_insert_default() += v; + } + if let Some(v) = crate::util::get_cf_pending_compaction_bytes(db, handle) { + *cf_stats.pending_compaction_bytes.get_or_insert_default() += v; + } + if let Some(v) = crate::util::get_cf_num_immutable_mem_table(db, handle) { + *cf_stats.num_immutable_mem_table.get_or_insert_default() += v; + } + // Titan. + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_LIVE_BLOB_SIZE) { + *cf_stats.live_blob_size.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_NUM_LIVE_BLOB_FILE) { + *cf_stats.num_live_blob_file.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_NUM_OBSOLETE_BLOB_FILE) + { + *cf_stats.num_obsolete_blob_file.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_LIVE_BLOB_FILE_SIZE) { + *cf_stats.live_blob_file_size.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_OBSOLETE_BLOB_FILE_SIZE) + { + *cf_stats.obsolete_blob_file_size.get_or_insert_default() += v; + } if let Some(v) = - crate::util::get_engine_compression_ratio_at_level(engine, handle, level) + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE0_FILE) { - STORE_ENGINE_COMPRESSION_RATIO_VEC - .with_label_values(&[name, cf, &level.to_string()]) - .set(v); + *cf_stats + .blob_file_discardable_ratio_le0 + 
.get_or_insert_default() += v; } - - // Num files at levels - if let Some(v) = crate::util::get_cf_num_files_at_level(engine, handle, level) { - STORE_ENGINE_NUM_FILES_AT_LEVEL_VEC - .with_label_values(&[name, cf, &level.to_string()]) - .set(v as i64); + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE20_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le20 + .get_or_insert_default() += v; } - - // Titan Num blob files at levels - if let Some(v) = crate::util::get_cf_num_blob_files_at_level(engine, handle, level) { - STORE_ENGINE_TITANDB_NUM_BLOB_FILES_AT_LEVEL_VEC - .with_label_values(&[name, cf, &level.to_string()]) - .set(v as i64); + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE50_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le50 + .get_or_insert_default() += v; + } + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE80_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le80 + .get_or_insert_default() += v; + } + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE100_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le100 + .get_or_insert_default() += v; + } + // Level stats. 
+ let opts = db.get_options_cf(handle); + if cf_stats.levels.len() < opts.get_num_levels() { + cf_stats + .levels + .resize(opts.get_num_levels(), CfLevelStats::default()); + } + for level in 0..opts.get_num_levels() { + if let Some(num_files) = crate::util::get_cf_num_files_at_level(db, handle, level) { + *cf_stats.levels[level].num_files.get_or_insert_default() += num_files; + if let Some(ratio) = + crate::util::get_engine_compression_ratio_at_level(db, handle, level) + { + *cf_stats.levels[level] + .weighted_compression_ratio + .get_or_insert_default() += num_files as f64 * ratio; + } + } + if let Some(v) = crate::util::get_cf_num_blob_files_at_level(db, handle, level) { + *cf_stats.levels[level] + .num_blob_files + .get_or_insert_default() += v; + } } - } - - // Num immutable mem-table - if let Some(v) = crate::util::get_cf_num_immutable_mem_table(engine, handle) { - STORE_ENGINE_NUM_IMMUTABLE_MEM_TABLE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); - } - // Titan live blob size - if let Some(v) = engine.get_property_int_cf(handle, ROCKSDB_TITANDB_LIVE_BLOB_SIZE) { - STORE_ENGINE_TITANDB_LIVE_BLOB_SIZE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); + if let Some(info) = db.get_map_property_cf(handle, ROCKSDB_CFSTATS) { + let stall_num = self.db_stats.stall_num.get_or_insert_default(); + for (key, val) in ROCKSDB_IOSTALL_KEY.iter().zip(stall_num) { + *val += info.get_property_int_value(key); + } + } } - // Titan num live blob file - if let Some(v) = engine.get_property_int_cf(handle, ROCKSDB_TITANDB_NUM_LIVE_BLOB_FILE) { - STORE_ENGINE_TITANDB_NUM_LIVE_BLOB_FILE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); + // For snapshot + *self.db_stats.num_snapshots.get_or_insert_default() += + db.get_property_int(ROCKSDB_NUM_SNAPSHOTS).unwrap_or(0); + let oldest_snapshot_time = + db.get_property_int(ROCKSDB_OLDEST_SNAPSHOT_TIME) + .map_or(0, |t| { + let now = time::get_time().sec as u64; + // RocksDB returns 0 if no snapshots. 
+ if t > 0 && now > t { now - t } else { 0 } + }); + if oldest_snapshot_time > self.db_stats.oldest_snapshot_time.unwrap_or(0) { + *self.db_stats.oldest_snapshot_time.get_or_insert_default() = oldest_snapshot_time; } - // Titan num obsolete blob file - if let Some(v) = engine.get_property_int_cf(handle, ROCKSDB_TITANDB_NUM_OBSOLETE_BLOB_FILE) - { - STORE_ENGINE_TITANDB_NUM_OBSOLETE_BLOB_FILE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); + // Since block cache is shared, getting cache size from any CF/DB is fine. Here + // we get from default CF. + if self.db_stats.block_cache_size.is_none() { + let handle = crate::util::get_cf_handle(db, CF_DEFAULT).unwrap(); + *self.db_stats.block_cache_size.get_or_insert_default() = + db.get_block_cache_usage_cf(handle); } + } - // Titan live blob file size - if let Some(v) = engine.get_property_int_cf(handle, ROCKSDB_TITANDB_LIVE_BLOB_FILE_SIZE) { - STORE_ENGINE_TITANDB_LIVE_BLOB_FILE_SIZE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); - } + fn flush(&mut self) { + for (cf, cf_stats) in &self.cf_stats { + if let Some(v) = cf_stats.used_size { + STORE_ENGINE_SIZE_GAUGE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_cache_size { + STORE_ENGINE_BLOB_CACHE_USAGE_GAUGE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.readers_mem { + STORE_ENGINE_MEMORY_GAUGE_VEC + .with_label_values(&[&self.name, cf, "readers-mem"]) + .set(v as i64); + } + if let Some(v) = cf_stats.mem_tables { + STORE_ENGINE_MEMORY_GAUGE_VEC + .with_label_values(&[&self.name, cf, "mem-tables"]) + .set(v as i64); + } + if let Some(v) = cf_stats.num_keys { + STORE_ENGINE_ESTIMATE_NUM_KEYS_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.pending_compaction_bytes { + STORE_ENGINE_PENDING_COMPACTION_BYTES_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + for (level, level_stats) in 
cf_stats.levels.iter().enumerate() { + if let Some(num_files) = level_stats.num_files { + STORE_ENGINE_NUM_FILES_AT_LEVEL_VEC + .with_label_values(&[&self.name, cf, &level.to_string()]) + .set(num_files as i64); + if num_files > 0 && let Some(ratio) = level_stats.weighted_compression_ratio { + let normalized_compression_ratio = + ratio / num_files as f64; + STORE_ENGINE_COMPRESSION_RATIO_VEC + .with_label_values(&[&self.name, cf, &level.to_string()]) + .set(normalized_compression_ratio); + } + } + if let Some(v) = level_stats.num_blob_files { + STORE_ENGINE_TITANDB_NUM_BLOB_FILES_AT_LEVEL_VEC + .with_label_values(&[&self.name, cf, &level.to_string()]) + .set(v as i64); + } + } - // Titan obsolete blob file size - if let Some(v) = engine.get_property_int_cf(handle, ROCKSDB_TITANDB_OBSOLETE_BLOB_FILE_SIZE) - { - STORE_ENGINE_TITANDB_OBSOLETE_BLOB_FILE_SIZE_VEC - .with_label_values(&[name, cf]) - .set(v as i64); + if let Some(v) = cf_stats.num_immutable_mem_table { + STORE_ENGINE_NUM_IMMUTABLE_MEM_TABLE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.live_blob_size { + STORE_ENGINE_TITANDB_LIVE_BLOB_SIZE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.num_live_blob_file { + STORE_ENGINE_TITANDB_NUM_LIVE_BLOB_FILE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.num_obsolete_blob_file { + STORE_ENGINE_TITANDB_NUM_OBSOLETE_BLOB_FILE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.live_blob_file_size { + STORE_ENGINE_TITANDB_LIVE_BLOB_FILE_SIZE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.obsolete_blob_file_size { + STORE_ENGINE_TITANDB_OBSOLETE_BLOB_FILE_SIZE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le0 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + 
.with_label_values(&[&self.name, cf, "le0"]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le20 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le20"]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le50 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le50"]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le80 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le80"]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le100 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le100"]) + .set(v as i64); + } } - // Titan blob file discardable ratio - if let Some(v) = - engine.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE0_FILE) - { - STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC - .with_label_values(&[name, cf, "le0"]) + if let Some(v) = self.db_stats.num_snapshots { + STORE_ENGINE_NUM_SNAPSHOTS_GAUGE_VEC + .with_label_values(&[&self.name]) .set(v as i64); } - if let Some(v) = - engine.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE20_FILE) - { - STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC - .with_label_values(&[name, cf, "le20"]) + if let Some(v) = self.db_stats.oldest_snapshot_time { + STORE_ENGINE_OLDEST_SNAPSHOT_DURATION_GAUGE_VEC + .with_label_values(&[&self.name]) .set(v as i64); } - if let Some(v) = - engine.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE50_FILE) - { - STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC - .with_label_values(&[name, cf, "le50"]) - .set(v as i64); - } - if let Some(v) = - engine.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE80_FILE) - { - STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC - .with_label_values(&[name, cf, "le80"]) + if 
let Some(v) = self.db_stats.block_cache_size { + STORE_ENGINE_BLOCK_CACHE_USAGE_GAUGE_VEC + .with_label_values(&[&self.name, "all"]) .set(v as i64); } - if let Some(v) = - engine.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE100_FILE) - { - STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC - .with_label_values(&[name, cf, "le100"]) - .set(v as i64); + if let Some(stall_num) = &self.db_stats.stall_num { + for (ty, val) in ROCKSDB_IOSTALL_TYPE.iter().zip(stall_num) { + STORE_ENGINE_WRITE_STALL_REASON_GAUGE_VEC + .with_label_values(&[&self.name, ty]) + .set(*val as i64); + } } } +} - // For snapshot - if let Some(n) = engine.get_property_int(ROCKSDB_NUM_SNAPSHOTS) { - STORE_ENGINE_NUM_SNAPSHOTS_GAUGE_VEC - .with_label_values(&[name]) - .set(n as i64); +pub fn flush_engine_statistics(statistics: &RocksStatistics, name: &str, is_titan: bool) { + for t in ENGINE_TICKER_TYPES { + let v = statistics.get_and_reset_ticker_count(*t); + flush_engine_ticker_metrics(*t, v, name); } - if let Some(t) = engine.get_property_int(ROCKSDB_OLDEST_SNAPSHOT_TIME) { - // RocksDB returns 0 if no snapshots. - let now = time::get_time().sec as u64; - let d = if t > 0 && now > t { now - t } else { 0 }; - STORE_ENGINE_OLDEST_SNAPSHOT_DURATION_GAUGE_VEC - .with_label_values(&[name]) - .set(d as i64); + for t in ENGINE_HIST_TYPES { + if let Some(v) = statistics.get_histogram(*t) { + flush_engine_histogram_metrics(*t, v, name); + } } - - if shared_block_cache { - // Since block cache is shared, getting cache size from any CF is fine. Here we - // get from default CF. 
- let handle = crate::util::get_cf_handle(engine, CF_DEFAULT).unwrap(); - let block_cache_usage = engine.get_block_cache_usage_cf(handle); - STORE_ENGINE_BLOCK_CACHE_USAGE_GAUGE_VEC - .with_label_values(&[name, "all"]) - .set(block_cache_usage as i64); + if is_titan { + for t in TITAN_ENGINE_TICKER_TYPES { + let v = statistics.get_and_reset_ticker_count(*t); + flush_engine_ticker_metrics(*t, v, name); + } + for t in TITAN_ENGINE_HIST_TYPES { + if let Some(v) = statistics.get_histogram(*t) { + flush_engine_histogram_metrics(*t, v, name); + } + } } } @@ -1627,12 +1760,8 @@ mod tests { flush_engine_histogram_metrics(*tp, HistogramData::default(), "kv"); } - let shared_block_cache = false; - flush_engine_properties(engine.as_inner(), "kv", shared_block_cache); - let handle = engine.as_inner().cf_handle("default").unwrap(); - let info = engine - .as_inner() - .get_map_property_cf(handle, ROCKSDB_CFSTATS); - assert!(info.is_some()); + let mut reporter = RocksStatisticsReporter::new("kv"); + reporter.collect(&engine); + reporter.flush(); } } diff --git a/components/engine_rocks/src/rocks_metrics_defs.rs b/components/engine_rocks/src/rocks_metrics_defs.rs index fc23871b90f..042949f1c09 100644 --- a/components/engine_rocks/src/rocks_metrics_defs.rs +++ b/components/engine_rocks/src/rocks_metrics_defs.rs @@ -138,8 +138,11 @@ pub const TITAN_ENGINE_TICKER_TYPES: &[TickerType] = &[ TickerType::TitanGcNoNeed, TickerType::TitanGcRemain, TickerType::TitanGcDiscardable, - TickerType::TitanGcSample, TickerType::TitanGcSmallFile, + TickerType::TitanGcLevelMergeMark, + TickerType::TitanGcLevelMergeDelete, + TickerType::TitanGcNoNeed, + TickerType::TitanGcRemain, TickerType::TitanGcFailure, TickerType::TitanGcSuccess, TickerType::TitanGcTriggerNext, diff --git a/components/engine_rocks/src/util.rs b/components/engine_rocks/src/util.rs index 778e16c1a67..407cf8ee611 100644 --- a/components/engine_rocks/src/util.rs +++ b/components/engine_rocks/src/util.rs @@ -11,7 +11,7 @@ use 
slog_global::warn; use crate::{ cf_options::RocksCfOptions, db_options::RocksDbOptions, engine::RocksEngine, r2e, - rocks_metrics_defs::*, + rocks_metrics_defs::*, RocksStatistics, }; pub fn new_temp_engine(path: &tempfile::TempDir) -> Engines { @@ -28,7 +28,7 @@ pub fn new_default_engine(path: &str) -> Result { pub fn new_engine(path: &str, cfs: &[&str]) -> Result { let mut db_opts = RocksDbOptions::default(); - db_opts.enable_statistics(true); + db_opts.set_statistics(&RocksStatistics::new_titan()); let cf_opts = cfs.iter().map(|name| (*name, Default::default())).collect(); new_engine_opt(path, db_opts, cf_opts) } diff --git a/components/engine_rocks/src/write_batch.rs b/components/engine_rocks/src/write_batch.rs index 6171ca7ee38..3659a7628d6 100644 --- a/components/engine_rocks/src/write_batch.rs +++ b/components/engine_rocks/src/write_batch.rs @@ -96,20 +96,38 @@ impl RocksWriteBatchVec { } } } -} -impl engine_traits::WriteBatch for RocksWriteBatchVec { - fn write_opt(&mut self, opts: &WriteOptions) -> Result { + #[inline] + fn write_impl(&mut self, opts: &WriteOptions, mut cb: impl FnMut()) -> Result { let opt: RocksWriteOptions = opts.into(); + let mut seq = 0; if self.support_write_batch_vec { + // FIXME(tabokie): Callback for empty write batch won't be called. 
self.get_db() - .multi_batch_write(self.as_inner(), &opt.into_raw()) - .map_err(r2e) + .multi_batch_write_callback(self.as_inner(), &opt.into_raw(), |s| { + seq = s; + cb(); + }) + .map_err(r2e)?; } else { self.get_db() - .write_seq_opt(&self.wbs[0], &opt.into_raw()) - .map_err(r2e) + .write_callback(&self.wbs[0], &opt.into_raw(), |s| { + seq = s; + cb(); + }) + .map_err(r2e)?; } + Ok(seq) + } +} + +impl engine_traits::WriteBatch for RocksWriteBatchVec { + fn write_opt(&mut self, opts: &WriteOptions) -> Result { + self.write_impl(opts, || {}) + } + + fn write_callback_opt(&mut self, opts: &WriteOptions, cb: impl FnMut()) -> Result { + self.write_impl(opts, cb) } fn data_size(&self) -> usize { diff --git a/components/engine_rocks_helper/src/sst_recovery.rs b/components/engine_rocks_helper/src/sst_recovery.rs index 7a820e6a79b..85fb8d74bee 100644 --- a/components/engine_rocks_helper/src/sst_recovery.rs +++ b/components/engine_rocks_helper/src/sst_recovery.rs @@ -227,7 +227,8 @@ mod tests { db.put(b"z2", b"val").unwrap(); db.put(b"z7", b"val").unwrap(); // generate SST file. 
- db.compact_range(CF_DEFAULT, None, None, false, 1).unwrap(); + db.compact_range_cf(CF_DEFAULT, None, None, false, 1) + .unwrap(); let files = db.as_inner().get_live_files(); assert_eq!(files.get_smallestkey(0), b"z2"); diff --git a/components/engine_test/src/lib.rs b/components/engine_test/src/lib.rs index 77bd2d3be7c..2d89929a4b2 100644 --- a/components/engine_test/src/lib.rs +++ b/components/engine_test/src/lib.rs @@ -76,12 +76,8 @@ pub mod raft { /// Types and constructors for the "kv" engine pub mod kv { - use std::{ - path::{Path, PathBuf}, - sync::{Arc, Mutex}, - }; + use std::path::Path; - use collections::HashMap; #[cfg(feature = "test-engine-kv-panic")] pub use engine_panic::{ PanicEngine as KvTestEngine, PanicEngineIterator as KvTestEngineIterator, @@ -92,11 +88,7 @@ pub mod kv { RocksEngine as KvTestEngine, RocksEngineIterator as KvTestEngineIterator, RocksSnapshot as KvTestSnapshot, RocksWriteBatchVec as KvTestWriteBatch, }; - use engine_traits::{ - CfOptions, CfOptionsExt, MiscExt, OpenOptions, Result, TabletAccessor, TabletFactory, - CF_DEFAULT, - }; - use tikv_util::box_err; + use engine_traits::{MiscExt, Result, TabletContext, TabletFactory}; use crate::ctor::{CfOptions as KvTestCfOptions, DbOptions, KvEngineConstructorExt}; @@ -112,317 +104,41 @@ pub mod kv { KvTestEngine::new_kv_engine_opt(path, db_opt, cfs_opts) } - const TOMBSTONE_MARK: &str = "TOMBSTONE_TABLET"; + const TOMBSTONE_SUFFIX: &str = ".tombstone"; #[derive(Clone)] pub struct TestTabletFactory { - root_path: PathBuf, db_opt: DbOptions, cf_opts: Vec<(&'static str, KvTestCfOptions)>, - root_db: Arc>>, } impl TestTabletFactory { - pub fn new( - root_path: &Path, - db_opt: DbOptions, - cf_opts: Vec<(&'static str, KvTestCfOptions)>, - ) -> Self { - let factory = Self { - root_path: root_path.to_path_buf(), - db_opt, - cf_opts, - root_db: Arc::new(Mutex::default()), - }; - let tablet_path = factory.tablets_path(); - if !tablet_path.exists() { - 
std::fs::create_dir_all(tablet_path).unwrap(); - } - factory + pub fn new(db_opt: DbOptions, cf_opts: Vec<(&'static str, KvTestCfOptions)>) -> Self { + Self { db_opt, cf_opts } } + } - fn create_tablet(&self, tablet_path: &Path) -> Result { - KvTestEngine::new_kv_engine_opt( - tablet_path.to_str().unwrap(), + impl TabletFactory for TestTabletFactory { + fn open_tablet(&self, ctx: TabletContext, path: &Path) -> Result { + KvTestEngine::new_tablet( + path.to_str().unwrap(), + ctx, self.db_opt.clone(), self.cf_opts.clone(), ) } - } - impl TabletFactory for TestTabletFactory { - fn create_shared_db(&self) -> Result { - let tablet_path = self.tablet_path(0, 0); - let tablet = self.create_tablet(&tablet_path)?; - let mut root_db = self.root_db.lock().unwrap(); - root_db.replace(tablet.clone()); - Ok(tablet) - } - - /// See the comment above the same name method in KvEngineFactory - fn open_tablet( - &self, - _id: u64, - _suffix: Option, - options: OpenOptions, - ) -> Result { - if let Some(db) = self.root_db.lock().unwrap().as_ref() { - if options.create_new() { - return Err(box_err!("root tablet {} already exists", db.path())); - } - return Ok(db.clone()); - } - // No need for mutex protection here since root_db creation only occurs at - // tikv bootstrap time when there is no racing issue. 
- if options.create_new() || options.create() { - return self.create_shared_db(); - } - - Err(box_err!("root tablet has not been initialized")) - } - - fn open_tablet_raw( - &self, - _path: &Path, - _id: u64, - _suffix: u64, - _options: OpenOptions, - ) -> Result { - self.create_shared_db() - } - - fn exists_raw(&self, _path: &Path) -> bool { - false - } - - #[inline] - fn tablet_path_with_prefix(&self, _prefix: &str, _id: u64, _suffix: u64) -> PathBuf { - self.root_path.join("db") - } - - #[inline] - fn tablets_path(&self) -> PathBuf { - Path::new(&self.root_path).join("tablets") - } - - #[inline] - fn destroy_tablet(&self, _id: u64, _suffix: u64) -> engine_traits::Result<()> { + fn destroy_tablet(&self, _ctx: TabletContext, path: &Path) -> Result<()> { + let tombstone_path = path.join(TOMBSTONE_SUFFIX); + std::fs::remove_dir_all(&tombstone_path)?; + std::fs::rename(path, &tombstone_path)?; + std::fs::remove_dir_all(tombstone_path)?; Ok(()) } - fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { - let db = self.root_db.lock().unwrap(); - let opt = db.as_ref().unwrap().get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap - opt.set_block_cache_capacity(capacity)?; - Ok(()) - } - } - - impl TabletAccessor for TestTabletFactory { - fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &KvTestEngine)) { - let db = self.root_db.lock().unwrap(); - let db = db.as_ref().unwrap(); - f(0, 0, db); - } - - fn is_single_engine(&self) -> bool { - true - } - } - - #[derive(Clone)] - pub struct TestTabletFactoryV2 { - inner: TestTabletFactory, - // region_id -> (tablet, tablet_suffix) - registry: Arc>>, - } - - impl TestTabletFactoryV2 { - pub fn new( - root_path: &Path, - db_opt: DbOptions, - cf_opts: Vec<(&'static str, KvTestCfOptions)>, - ) -> Self { - Self { - inner: TestTabletFactory::new(root_path, db_opt, cf_opts), - registry: Arc::default(), - } - } - } - - impl TabletFactory for TestTabletFactoryV2 { - /// See the comment above the same name 
method in KvEngineFactoryV2 - fn open_tablet( - &self, - id: u64, - suffix: Option, - mut options: OpenOptions, - ) -> Result { - if options.create_new() && suffix.is_none() { - return Err(box_err!( - "suffix should be provided when creating new tablet" - )); - } - - if options.create_new() || options.create() { - options = options.set_cache_only(false); - } - - let mut reg = self.registry.lock().unwrap(); - if let Some(suffix) = suffix { - if let Some((cached_tablet, cached_suffix)) = reg.get(&id) && *cached_suffix == suffix { - // Target tablet exist in the cache - if options.create_new() { - return Err(box_err!("region {} {} already exists", id, cached_tablet.path())); - } - return Ok(cached_tablet.clone()); - } else if !options.cache_only() { - let tablet_path = self.tablet_path(id, suffix); - let tablet = self.open_tablet_raw(&tablet_path, id, suffix, options.clone())?; - if !options.skip_cache() { - reg.insert(id, (tablet.clone(), suffix)); - } - return Ok(tablet); - } - } else if let Some((tablet, _)) = reg.get(&id) { - return Ok(tablet.clone()); - } - - Err(box_err!( - "tablet with region id {} suffix {:?} does not exist", - id, - suffix - )) - } - - fn open_tablet_raw( - &self, - path: &Path, - id: u64, - _suffix: u64, - options: OpenOptions, - ) -> Result { - let engine_exist = KvTestEngine::exists(path.to_str().unwrap_or_default()); - // Even though neither options.create nor options.create_new are true, if the - // tablet files already exists, we will open it by calling - // inner.create_tablet. In this case, the tablet exists but not in the cache - // (registry). 
- if !options.create() && !options.create_new() && !engine_exist { - return Err(box_err!( - "path {} does not have db", - path.to_str().unwrap_or_default() - )); - }; - - if options.create_new() && engine_exist { - return Err(box_err!( - "region {} {} already exists", - id, - path.to_str().unwrap() - )); - } - - self.inner.create_tablet(path) - } - - #[inline] - fn create_shared_db(&self) -> Result { - self.open_tablet(0, Some(0), OpenOptions::default().set_create_new(true)) - } - - #[inline] - fn exists_raw(&self, path: &Path) -> bool { + fn exists(&self, path: &Path) -> bool { KvTestEngine::exists(path.to_str().unwrap_or_default()) } - - #[inline] - fn tablets_path(&self) -> PathBuf { - self.inner.root_path.join("tablets") - } - - #[inline] - fn tablet_path_with_prefix(&self, prefix: &str, id: u64, suffix: u64) -> PathBuf { - self.inner - .root_path - .join(format!("tablets/{}{}_{}", prefix, id, suffix)) - } - - #[inline] - fn mark_tombstone(&self, region_id: u64, suffix: u64) { - let path = self.tablet_path(region_id, suffix).join(TOMBSTONE_MARK); - // When the full directory path does not exsit, create will return error and in - // this case, we just ignore it. 
- let _ = std::fs::File::create(path); - { - let mut reg = self.registry.lock().unwrap(); - if let Some((cached_tablet, cached_suffix)) = reg.remove(®ion_id) && cached_suffix != suffix { - reg.insert(region_id, (cached_tablet, cached_suffix)); - } - } - } - - #[inline] - fn is_tombstoned(&self, region_id: u64, suffix: u64) -> bool { - self.tablet_path(region_id, suffix) - .join(TOMBSTONE_MARK) - .exists() - } - - #[inline] - fn destroy_tablet(&self, region_id: u64, suffix: u64) -> engine_traits::Result<()> { - let path = self.tablet_path(region_id, suffix); - { - let mut reg = self.registry.lock().unwrap(); - if let Some((cached_tablet, cached_suffix)) = reg.remove(®ion_id) && cached_suffix != suffix { - reg.insert(region_id, (cached_tablet, cached_suffix)); - } - } - let _ = std::fs::remove_dir_all(path); - Ok(()) - } - - #[inline] - fn load_tablet(&self, path: &Path, region_id: u64, suffix: u64) -> Result { - { - let reg = self.registry.lock().unwrap(); - if let Some((db, db_suffix)) = reg.get(®ion_id) && *db_suffix == suffix { - return Err(box_err!("region {} {} already exists", region_id, db.path())); - } - } - - let db_path = self.tablet_path(region_id, suffix); - std::fs::rename(path, db_path)?; - self.open_tablet( - region_id, - Some(suffix), - OpenOptions::default().set_create(true), - ) - } - - fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { - let reg = self.registry.lock().unwrap(); - // pick up any tablet and set the shared block cache capacity - if let Some((_id, (tablet, _suffix))) = (*reg).iter().next() { - let opt = tablet.get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap - opt.set_block_cache_capacity(capacity)?; - } - Ok(()) - } - } - - impl TabletAccessor for TestTabletFactoryV2 { - #[inline] - fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &KvTestEngine)) { - let reg = self.registry.lock().unwrap(); - for (id, (tablet, suffix)) in &*reg { - f(*id, *suffix, tablet) - } - } - - // it have multi tablets. 
- fn is_single_engine(&self) -> bool { - false - } } } @@ -440,7 +156,7 @@ pub mod ctor { use std::sync::Arc; use encryption::DataKeyManager; - use engine_traits::Result; + use engine_traits::{Result, StateStorage, TabletContext}; use file_system::IoRateLimiter; /// Kv engine construction @@ -473,6 +189,14 @@ pub mod ctor { db_opt: DbOptions, cf_opts: Vec<(&str, CfOptions)>, ) -> Result; + + /// Create a new engine specific for multi rocks. + fn new_tablet( + path: &str, + ctx: TabletContext, + db_opt: DbOptions, + cf_opts: Vec<(&str, CfOptions)>, + ) -> Result; } /// Raft engine construction @@ -485,6 +209,7 @@ pub mod ctor { pub struct DbOptions { key_manager: Option>, rate_limiter: Option>, + state_storage: Option>, enable_multi_batch_write: bool, } @@ -497,6 +222,10 @@ pub mod ctor { self.rate_limiter = rate_limiter; } + pub fn set_state_storage(&mut self, state_storage: Arc) { + self.state_storage = Some(state_storage); + } + pub fn set_enable_multi_batch_write(&mut self, enable: bool) { self.enable_multi_batch_write = enable; } @@ -614,6 +343,15 @@ pub mod ctor { ) -> Result { Ok(PanicEngine) } + + fn new_tablet( + _path: &str, + _ctx: engine_traits::TabletContext, + _db_opt: DbOptions, + _cf_opts: Vec<(&str, CfOptions)>, + ) -> Result { + Ok(PanicEngine) + } } impl RaftEngineConstructorExt for engine_panic::PanicEngine { @@ -628,9 +366,11 @@ pub mod ctor { get_env, properties::{MvccPropertiesCollectorFactory, RangePropertiesCollectorFactory}, util::new_engine_opt as rocks_new_engine_opt, - RocksCfOptions, RocksDbOptions, + RocksCfOptions, RocksDbOptions, RocksPersistenceListener, + }; + use engine_traits::{ + CfOptions as _, PersistenceListener, Result, TabletContext, CF_DEFAULT, }; - use engine_traits::{CfOptions as _, Result, CF_DEFAULT}; use super::{ CfOptions, DbOptions, KvEngineConstructorExt, RaftDbOptions, RaftEngineConstructorExt, @@ -661,6 +401,36 @@ pub mod ctor { .collect(); rocks_new_engine_opt(path, rocks_db_opts, rocks_cfs_opts) } + + fn 
new_tablet( + path: &str, + ctx: TabletContext, + db_opt: DbOptions, + cf_opts: Vec<(&str, CfOptions)>, + ) -> Result { + let mut rocks_db_opts = RocksDbOptions::default(); + let env = get_env(db_opt.key_manager.clone(), db_opt.rate_limiter)?; + rocks_db_opts.set_env(env); + rocks_db_opts.enable_unordered_write(false); + rocks_db_opts.enable_pipelined_write(false); + rocks_db_opts.enable_multi_batch_write(false); + rocks_db_opts.allow_concurrent_memtable_write(false); + if let Some(storage) = db_opt.state_storage + && let Some(flush_state) = ctx.flush_state { + let listener = PersistenceListener::new( + ctx.id, + ctx.suffix.unwrap(), + flush_state, + storage, + ); + rocks_db_opts.add_event_listener(RocksPersistenceListener::new(listener)); + } + let rocks_cfs_opts = cf_opts + .iter() + .map(|(name, opt)| (*name, get_rocks_cf_opts(opt))) + .collect(); + rocks_new_engine_opt(path, rocks_db_opts, rocks_cfs_opts) + } } impl RaftEngineConstructorExt for engine_rocks::RocksEngine { diff --git a/components/engine_traits/Cargo.toml b/components/engine_traits/Cargo.toml index d38962e71c9..2370f1c9e7e 100644 --- a/components/engine_traits/Cargo.toml +++ b/components/engine_traits/Cargo.toml @@ -9,9 +9,11 @@ failpoints = ["fail/failpoints"] [dependencies] case_macros = { workspace = true } +collections = { workspace = true } error_code = { workspace = true } fail = "0.5" file_system = { workspace = true } +keys = { workspace = true } kvproto = { workspace = true } log_wrappers = { workspace = true } protobuf = "2" diff --git a/components/engine_traits/src/cf_defs.rs b/components/engine_traits/src/cf_defs.rs index e3fe95ec3b6..1658f49053c 100644 --- a/components/engine_traits/src/cf_defs.rs +++ b/components/engine_traits/src/cf_defs.rs @@ -9,6 +9,7 @@ pub const CF_RAFT: CfName = "raft"; pub const LARGE_CFS: &[CfName] = &[CF_DEFAULT, CF_LOCK, CF_WRITE]; pub const ALL_CFS: &[CfName] = &[CF_DEFAULT, CF_LOCK, CF_WRITE, CF_RAFT]; pub const DATA_CFS: &[CfName] = &[CF_DEFAULT, 
CF_LOCK, CF_WRITE]; +pub const DATA_CFS_LEN: usize = DATA_CFS.len(); pub fn name_to_cf(name: &str) -> Option { if name.is_empty() { diff --git a/components/engine_traits/src/compact.rs b/components/engine_traits/src/compact.rs index 8dd1cc7d9b4..05590a1ff32 100644 --- a/components/engine_traits/src/compact.rs +++ b/components/engine_traits/src/compact.rs @@ -4,17 +4,30 @@ use std::collections::BTreeMap; -use crate::errors::Result; +use crate::{errors::Result, CfNamesExt}; -pub trait CompactExt { +pub trait CompactExt: CfNamesExt { type CompactedEvent: CompactedEvent; /// Checks whether any column family sets `disable_auto_compactions` to /// `True` or not. fn auto_compactions_is_disabled(&self) -> Result; - /// Compacts the column families in the specified range by manual or not. fn compact_range( + &self, + start_key: Option<&[u8]>, + end_key: Option<&[u8]>, + exclusive_manual: bool, + max_subcompactions: u32, + ) -> Result<()> { + for cf in self.cf_names() { + self.compact_range_cf(cf, start_key, end_key, exclusive_manual, max_subcompactions)?; + } + Ok(()) + } + + /// Compacts the column families in the specified range by manual or not. + fn compact_range_cf( &self, cf: &str, start_key: Option<&[u8]>, @@ -32,7 +45,12 @@ pub trait CompactExt { start: Option<&[u8]>, end: Option<&[u8]>, output_level: Option, - ) -> Result<()>; + ) -> Result<()> { + for cf in self.cf_names() { + self.compact_files_in_range_cf(cf, start, end, output_level)?; + } + Ok(()) + } /// Compacts files in the range and above the output level of the given /// column family. Compacts all files to the bottommost level if the diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index 55ab5d63caa..e76765e2ed6 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -1,14 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{ - fmt::Debug, - io::Write, - path::{Path, PathBuf}, - str, - vec::Vec, -}; - -use tikv_util::error; +use std::{fmt::Debug, str}; use crate::*; @@ -55,10 +47,11 @@ pub trait KvEngine: /// Flush metrics to prometheus /// /// `instance` is the label of the metric to flush. - fn flush_metrics(&self, _instance: &str) {} - - /// Reset internal statistics - fn reset_statistics(&self) {} + fn flush_metrics(&self, instance: &str) { + let mut reporter = Self::StatisticsReporter::new(instance); + reporter.collect(self); + reporter.flush(); + } /// Cast to a concrete engine type /// @@ -74,336 +67,3 @@ pub trait KvEngine: true } } - -/// TabletAccessor is the trait to access all the tablets with provided accessor -/// -/// For single rocksdb instance, it essentially accesses the global kvdb with -/// the accessor For multi rocksdb instances, it accesses all the tablets with -/// the accessor -pub trait TabletAccessor { - /// Loop visit all opened tablets by the specified function. - fn for_each_opened_tablet(&self, _f: &mut (dyn FnMut(u64, u64, &EK))); - - /// return true if it's single engine; - /// return false if it's a multi-tablet factory; - fn is_single_engine(&self) -> bool; -} - -/// max error count to log -const MAX_ERROR_COUNT: u32 = 5; - -/// TabletErrorCollector is the facility struct to handle errors when using -/// TabletAccessor::for_each_opened_tablet -/// -/// It will choose the last failed result as the final result, meanwhile logging -/// errors up to MAX_ERROR_COUNT. 
-pub struct TabletErrorCollector { - errors: Vec, - max_error_count: u32, - error_count: u32, - result: std::result::Result<(), Box>, -} - -impl TabletErrorCollector { - pub fn new() -> Self { - Self { - errors: vec![], - max_error_count: MAX_ERROR_COUNT, - error_count: 0, - result: Ok(()), - } - } - - pub fn add_result(&mut self, region_id: u64, suffix: u64, result: Result<()>) { - if result.is_ok() { - return; - } - self.result = Err(Box::from(result.err().unwrap())); - self.error_count += 1; - if self.error_count > self.max_error_count { - return; - } - writeln!( - &mut self.errors, - "Tablet {}_{} encountered error: {:?}.", - region_id, suffix, self.result - ) - .unwrap(); - } - - fn flush_error(&self) { - if self.error_count > 0 { - error!( - "Total count {}. Sample errors: {}", - self.error_count, - str::from_utf8(&self.errors).unwrap() - ); - } - } - - pub fn take_result(&mut self) -> std::result::Result<(), Box> { - std::mem::replace(&mut self.result, Ok(())) - } - - pub fn get_error_count(&self) -> u32 { - self.error_count - } -} - -impl Default for TabletErrorCollector { - fn default() -> Self { - Self::new() - } -} - -impl Drop for TabletErrorCollector { - fn drop(&mut self) { - self.flush_error() - } -} - -/// OpenOptionsn is used for specifiying the way of opening a tablet. -#[derive(Default, Clone)] -pub struct OpenOptions { - // create tablet if non-exist - create: bool, - create_new: bool, - read_only: bool, - cache_only: bool, - skip_cache: bool, -} - -impl OpenOptions { - /// Sets the option to create a tablet, or open it if it already exists. - pub fn set_create(mut self, create: bool) -> Self { - self.create = create; - self - } - - /// Sets the option to create a new tablet, failing if it already exists. 
- pub fn set_create_new(mut self, create_new: bool) -> Self { - self.create_new = create_new; - self - } - - /// Sets the option for read only - pub fn set_read_only(mut self, read_only: bool) -> Self { - self.read_only = read_only; - self - } - - /// Sets the option for only reading from cache. - pub fn set_cache_only(mut self, cache_only: bool) -> Self { - self.cache_only = cache_only; - self - } - - /// Sets the option to open a tablet without updating the cache. - pub fn set_skip_cache(mut self, skip_cache: bool) -> Self { - self.skip_cache = skip_cache; - self - } - - pub fn create(&self) -> bool { - self.create - } - - pub fn create_new(&self) -> bool { - self.create_new - } - - pub fn read_only(&self) -> bool { - self.read_only - } - - pub fn cache_only(&self) -> bool { - self.cache_only - } - - pub fn skip_cache(&self) -> bool { - self.skip_cache - } -} - -pub const SPLIT_PREFIX: &str = "split_"; -pub const MERGE_PREFIX: &str = "merge_"; - -/// A factory trait to create new engine. -// It should be named as `EngineFactory` for consistency, but we are about to -// rename engine to tablet, so always use tablet for new traits/types. -pub trait TabletFactory: TabletAccessor + Send + Sync { - /// Open the tablet with id and suffix according to the OpenOptions. - /// - /// The id is likely the region Id, the suffix could be the current raft log - /// index. They together could specify a unique path for a region's - /// tablet. The reason to have suffix is that we can keep more than one - /// tablet for a region. - fn open_tablet(&self, id: u64, suffix: Option, options: OpenOptions) -> Result; - - /// Open tablet by raw path without updating cache. 
- fn open_tablet_raw( - &self, - path: &Path, - id: u64, - suffix: u64, - options: OpenOptions, - ) -> Result; - - /// Create the shared db for v1 - fn create_shared_db(&self) -> Result; - - /// Destroy the tablet and its data - fn destroy_tablet(&self, id: u64, suffix: u64) -> Result<()>; - - /// Check if the tablet with specified id/suffix exists - #[inline] - fn exists(&self, id: u64, suffix: u64) -> bool { - self.exists_raw(&self.tablet_path(id, suffix)) - } - - /// Check if the tablet with specified path exists - fn exists_raw(&self, path: &Path) -> bool; - - /// Get the tablet path by id and suffix - fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf { - self.tablet_path_with_prefix("", id, suffix) - } - - /// Get the tablet path by id and suffix - /// - /// Used in special situations - /// Ex: split/merge. - fn tablet_path_with_prefix(&self, prefix: &str, id: u64, suffix: u64) -> PathBuf; - - /// Tablets root path - fn tablets_path(&self) -> PathBuf; - - /// Load the tablet from path for id and suffix--for scenarios such as - /// applying snapshot - fn load_tablet(&self, _path: &Path, _id: u64, _suffix: u64) -> Result { - unimplemented!(); - } - - /// Mark the tablet with specified id and suffix tombostone - fn mark_tombstone(&self, _id: u64, _suffix: u64) { - unimplemented!(); - } - - /// Check if the tablet with specified id and suffix tombostone - fn is_tombstoned(&self, _region_id: u64, _suffix: u64) -> bool { - unimplemented!(); - } - - fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()>; -} - -pub struct DummyFactory -where - EK: CfOptionsExt + Clone + Send + 'static, -{ - pub engine: Option, - pub root_path: String, -} - -impl TabletFactory for DummyFactory -where - EK: CfOptionsExt + Clone + Send + Sync + 'static, -{ - fn create_shared_db(&self) -> Result { - Ok(self.engine.as_ref().unwrap().clone()) - } - - fn open_tablet(&self, _id: u64, _suffix: Option, _options: OpenOptions) -> Result { - 
Ok(self.engine.as_ref().unwrap().clone()) - } - - fn open_tablet_raw( - &self, - _path: &Path, - _id: u64, - _suffix: u64, - _options: OpenOptions, - ) -> Result { - Ok(self.engine.as_ref().unwrap().clone()) - } - - fn destroy_tablet(&self, _id: u64, _suffix: u64) -> Result<()> { - Ok(()) - } - - fn exists_raw(&self, _path: &Path) -> bool { - true - } - - fn tablet_path_with_prefix(&self, _prefix: &str, _id: u64, _suffix: u64) -> PathBuf { - PathBuf::from(&self.root_path) - } - - fn tablets_path(&self) -> PathBuf { - PathBuf::from(&self.root_path) - } - - fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { - let opt = self - .engine - .as_ref() - .unwrap() - .get_options_cf(CF_DEFAULT) - .unwrap(); // FIXME unwrap - opt.set_block_cache_capacity(capacity) - } -} - -impl TabletAccessor for DummyFactory -where - EK: CfOptionsExt + Clone + Send + 'static, -{ - fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &EK)) { - if let Some(engine) = &self.engine { - f(0, 0, engine); - } - } - - fn is_single_engine(&self) -> bool { - true - } -} - -impl DummyFactory -where - EK: CfOptionsExt + Clone + Send + 'static, -{ - pub fn new(engine: Option, root_path: String) -> DummyFactory { - DummyFactory { engine, root_path } - } -} - -impl Default for DummyFactory { - fn default() -> Self { - Self::new(None, "/tmp".to_string()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_tablet_error_collector_ok() { - let mut err = TabletErrorCollector::new(); - err.add_result(1, 1, Ok(())); - err.take_result().unwrap(); - assert_eq!(err.get_error_count(), 0); - } - - #[test] - fn test_tablet_error_collector_err() { - let mut err = TabletErrorCollector::new(); - err.add_result(1, 1, Ok(())); - err.add_result(1, 1, Err(Status::with_code(Code::Aborted).into())); - err.add_result(1, 1, Err(Status::with_code(Code::NotFound).into())); - err.add_result(1, 1, Ok(())); - err.take_result().unwrap_err(); - assert_eq!(err.get_error_count(), 2); - } 
-} diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs new file mode 100644 index 00000000000..cfed95f0426 --- /dev/null +++ b/components/engine_traits/src/flush.rs @@ -0,0 +1,179 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! A helper class to detect flush event and trace apply index. +//! +//! The whole idea is when all CFs have flushed to disk, then the apply index +//! should be able to be advanced to the latest. The implementations depends on +//! the assumption that memtable/write buffer is frozen one by one and flushed +//! one by one. +//! +//! Because apply index can be arbitrary value after restart, so apply related +//! states like `RaftApplyState` and `RegionLocalState` are mapped to index. +//! Once apply index is confirmed, the latest states before apply index should +//! be used as the start state. + +use std::{ + collections::LinkedList, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, +}; + +use crate::{RaftEngine, RaftLogBatch}; + +#[derive(Debug)] +pub struct FlushProgress { + cf: String, + apply_index: u64, + earliest_seqno: u64, +} + +impl FlushProgress { + fn merge(&mut self, pr: FlushProgress) { + debug_assert_eq!(self.cf, pr.cf); + debug_assert!(self.apply_index <= pr.apply_index); + self.apply_index = pr.apply_index; + } + + pub fn applied_index(&self) -> u64 { + self.apply_index + } + + pub fn cf(&self) -> &str { + &self.cf + } +} + +/// A share state between raftstore and underlying engine. +/// +/// raftstore will update state changes and corresponding apply index, when +/// flush, `PersistenceListener` will query states related to the memtable +/// and persist the relation to raft engine. +#[derive(Default, Debug)] +pub struct FlushState { + applied_index: AtomicU64, +} + +impl FlushState { + /// Set the latest applied index. 
+ #[inline] + pub fn set_applied_index(&self, index: u64) { + self.applied_index.store(index, Ordering::Release); + } + + /// Query the applied index. + #[inline] + pub fn applied_index(&self) -> u64 { + self.applied_index.load(Ordering::Acquire) + } +} + +/// A helper trait to avoid exposing `RaftEngine` to `TabletFactory`. +pub trait StateStorage: Sync + Send { + fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: FlushProgress); +} + +/// A flush listener that maps memtable to apply index and persist the relation +/// to raft engine. +pub struct PersistenceListener { + region_id: u64, + tablet_index: u64, + state: Arc, + progress: Mutex>, + storage: Arc, +} + +impl PersistenceListener { + pub fn new( + region_id: u64, + tablet_index: u64, + state: Arc, + storage: Arc, + ) -> Self { + Self { + region_id, + tablet_index, + state, + progress: Mutex::new(LinkedList::new()), + storage, + } + } +} + +impl PersistenceListener { + pub fn flush_state(&self) -> &Arc { + &self.state + } + + /// Called when memtable is frozen. + /// + /// `earliest_seqno` should be the smallest seqno of the memtable. + pub fn on_memtable_sealed(&self, cf: String, earliest_seqno: u64) { + // The correctness relies on the assumption that there will be only one + // thread writting to the DB and increasing apply index. + // Apply index will be set within DB lock, so it's correct even with manual + // flush. + let apply_index = self.state.applied_index.load(Ordering::SeqCst); + self.progress.lock().unwrap().push_back(FlushProgress { + cf, + apply_index, + earliest_seqno, + }); + } + + /// Called a memtable finished flushing. + /// + /// `largest_seqno` should be the largest seqno of the generated file. + pub fn on_flush_completed(&self, cf: &str, largest_seqno: u64) { + // Maybe we should hook the compaction to avoid the file is compacted before + // being recorded. 
+ let pr = { + let mut prs = self.progress.lock().unwrap(); + let mut cursor = prs.cursor_front_mut(); + let mut flushed_pr = None; + while let Some(pr) = cursor.current() { + if pr.cf != cf { + cursor.move_next(); + continue; + } + // Note flushed largest_seqno equals to earliest_seqno of next memtable. + if pr.earliest_seqno < largest_seqno { + match &mut flushed_pr { + None => flushed_pr = cursor.remove_current(), + Some(flushed_pr) => { + flushed_pr.merge(cursor.remove_current().unwrap()); + } + } + continue; + } + break; + } + match flushed_pr { + Some(pr) => pr, + None => panic!("{} not found in {:?}", cf, prs), + } + }; + self.storage + .persist_progress(self.region_id, self.tablet_index, pr); + } +} + +impl StateStorage for R { + fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: FlushProgress) { + if pr.apply_index == 0 { + return; + } + let mut batch = self.log_batch(1); + // TODO: It's possible that flush succeeds but fails to call + // `on_flush_completed` before exit. In this case the flushed data will + // be replayed again after restarted. To solve the problem, we need to + // (1) persist flushed file numbers in `on_flush_begin` and (2) check + // the file number in `on_compaction_begin`. After restart, (3) check if the + // file exists. If (1) && ((2) || (3)), then we don't need to replay the data. 
+ batch + .put_flushed_index(region_id, &pr.cf, tablet_index, pr.apply_index) + .unwrap(); + self.consume(&mut batch, true).unwrap(); + } +} diff --git a/components/engine_traits/src/lib.rs b/components/engine_traits/src/lib.rs index b9cf8847751..45a3d18fa7a 100644 --- a/components/engine_traits/src/lib.rs +++ b/components/engine_traits/src/lib.rs @@ -251,6 +251,9 @@ #![cfg_attr(test, feature(test))] #![feature(min_specialization)] #![feature(assert_matches)] +#![feature(linked_list_cursors)] +#![feature(let_chains)] +#![feature(str_split_as_str)] #[macro_use(fail_point)] extern crate fail; @@ -277,6 +280,8 @@ mod engine; pub use crate::engine::*; mod file_system; pub use crate::file_system::*; +mod flush; +pub use flush::*; mod import; pub use import::*; mod misc; @@ -294,6 +299,8 @@ mod sst_partitioner; pub use crate::sst_partitioner::*; mod range_properties; pub use crate::{mvcc_properties::*, range_properties::*}; +mod tablet; +pub use tablet::*; mod ttl_properties; pub use crate::ttl_properties::*; mod perf_context; @@ -333,7 +340,7 @@ pub use crate::range::*; mod raft_engine; pub use raft_engine::{ - CacheStats, RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch, RaftLogGcTask, + CacheStats, RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch, RAFT_LOG_MULTI_GET_CNT, }; diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index 18991038ee8..d9a07a1a915 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -37,8 +37,30 @@ pub enum DeleteStrategy { DeleteByWriter { sst_path: String }, } +/// `StatisticsReporter` can be used to report engine's private statistics to +/// prometheus metrics. For one single engine, using it is equivalent to calling +/// `KvEngine::flush_metrics("name")`. For multiple engines, it can aggregate +/// statistics accordingly. 
+/// Note that it is not responsible for managing the statistics from +/// user-provided collectors that are potentially shared between engines. +pub trait StatisticsReporter { + fn new(name: &str) -> Self; + + /// Collect statistics from one single engine. + fn collect(&mut self, engine: &T); + + /// Aggregate and report statistics to prometheus metrics counters. The + /// statistics are not cleared afterwards. + fn flush(&mut self); +} + pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { - fn flush_cfs(&self, wait: bool) -> Result<()>; + type StatisticsReporter: StatisticsReporter; + + /// Flush all specified column families at once. + /// + /// If `cfs` is empty, it will try to flush all available column families. + fn flush_cfs(&self, cfs: &[&str], wait: bool) -> Result<()>; fn flush_cf(&self, cf: &str, wait: bool) -> Result<()>; @@ -62,6 +84,8 @@ pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { fn ingest_maybe_slowdown_writes(&self, cf: &str) -> Result; + fn get_sst_key_ranges(&self, cf: &str, level: usize) -> Result, Vec)>>; + /// Gets total used size of rocksdb engine, including: /// * total size (bytes) of all SST files. /// * total size (bytes) of active and unflushed immutable memtables. @@ -73,9 +97,13 @@ pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { fn sync_wal(&self) -> Result<()>; + fn pause_background_work(&self) -> Result<()>; + /// Check whether a database exists at a given path fn exists(path: &str) -> bool; + fn locked(path: &str) -> Result; + /// Dump stats about the database into a string. /// /// For debugging. The format and content is unspecified. 
diff --git a/components/engine_traits/src/raft_engine.rs b/components/engine_traits/src/raft_engine.rs index 7df681c96d5..9e95ae95e14 100644 --- a/components/engine_traits/src/raft_engine.rs +++ b/components/engine_traits/src/raft_engine.rs @@ -19,8 +19,20 @@ pub trait RaftEngineReadOnly: Sync + Send + 'static { fn get_prepare_bootstrap_region(&self) -> Result>; fn get_raft_state(&self, raft_group_id: u64) -> Result>; - fn get_region_state(&self, raft_group_id: u64) -> Result>; - fn get_apply_state(&self, raft_group_id: u64) -> Result>; + /// Get the latest region state not after the apply index. + fn get_region_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result>; + /// Get the latest apply state not after the apply index. + fn get_apply_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result>; + /// Get the flushed index of the given CF. + fn get_flushed_index(&self, raft_group_id: u64, cf: &str) -> Result>; fn get_recover_state(&self) -> Result>; fn get_entry(&self, raft_group_id: u64, index: u64) -> Result>; @@ -62,12 +74,6 @@ pub trait RaftEngineDebug: RaftEngine + Sync + Send + 'static { } } -pub struct RaftLogGcTask { - pub raft_group_id: u64, - pub from: u64, - pub to: u64, -} - // TODO: Refactor common methods between Kv and Raft engine into a shared trait. pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send + 'static { type LogBatch: RaftLogBatch; @@ -98,26 +104,17 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send batch: &mut Self::LogBatch, ) -> Result<()>; - /// Append some log entries and return written bytes. - /// - /// Note: `RaftLocalState` won't be updated in this call. - fn append(&self, raft_group_id: u64, entries: Vec) -> Result; - - fn put_store_ident(&self, ident: &StoreIdent) -> Result<()>; + /// Like `cut_logs` but the range could be very large. 
+ fn gc(&self, raft_group_id: u64, from: u64, to: u64, batch: &mut Self::LogBatch) -> Result<()>; - fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()>; - - /// Like `cut_logs` but the range could be very large. Return the deleted - /// count. Generally, `from` can be passed in `0`. - fn gc(&self, raft_group_id: u64, from: u64, to: u64) -> Result; - - fn batch_gc(&self, tasks: Vec) -> Result { - let mut total = 0; - for task in tasks { - total += self.gc(task.raft_group_id, task.from, task.to)?; - } - Ok(total) - } + /// Delete all but the latest one of states that are associated with smaller + /// apply_index. + fn delete_all_but_one_states_before( + &self, + raft_group_id: u64, + apply_index: u64, + batch: &mut Self::LogBatch, + ) -> Result<()>; fn need_manual_purge(&self) -> bool { false @@ -133,7 +130,6 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send fn flush_stats(&self) -> Option { None } - fn reset_statistics(&self) {} fn stop(&self) {} @@ -151,12 +147,6 @@ pub trait RaftEngine: RaftEngineReadOnly + PerfContextExt + Clone + Sync + Send where F: FnMut(u64) -> std::result::Result<(), E>, E: From; - - /// Indicate whether region states should be recovered from raftdb and - /// replay raft logs. - /// When kvdb's write-ahead-log is disabled, the sequence number of the last - /// boot time is saved. 
- fn put_recover_state(&self, state: &StoreRecoverState) -> Result<()>; } pub trait RaftLogBatch: Send { @@ -172,8 +162,42 @@ pub trait RaftLogBatch: Send { fn remove_prepare_bootstrap_region(&mut self) -> Result<()>; fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()>; - fn put_region_state(&mut self, raft_group_id: u64, state: &RegionLocalState) -> Result<()>; - fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()>; + fn put_region_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RegionLocalState, + ) -> Result<()>; + fn put_apply_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RaftApplyState, + ) -> Result<()>; + + /// Record the flushed apply index. + /// + /// There are two types of apply index: + /// 1. Normal apply index that only related to single tablet. These apply + /// indexes are recorded using its own CF. + /// 2. Apply index that can affect other tablets, like split, merge. These + /// apply indexes are recorded using special Raft CF. + /// + /// Because a peer may have multiple tablets (only one is latest), we use + /// `tablet_index` to avoid conflicts. + fn put_flushed_index( + &mut self, + raft_group_id: u64, + cf: &str, + tablet_index: u64, + apply_index: u64, + ) -> Result<()>; + + /// Indicate whether region states should be recovered from raftdb and + /// replay raft logs. + /// When kvdb's write-ahead-log is disabled, the sequence number of the last + /// boot time is saved. + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()>; /// The data size of this RaftLogBatch. fn persist_size(&self) -> usize; diff --git a/components/engine_traits/src/tablet.rs b/components/engine_traits/src/tablet.rs new file mode 100644 index 00000000000..edc0bd99870 --- /dev/null +++ b/components/engine_traits/src/tablet.rs @@ -0,0 +1,472 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::{ + fmt::{self, Debug, Formatter}, + path::{Path, PathBuf}, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, Mutex, + }, +}; + +use collections::HashMap; +use kvproto::metapb::Region; +use tikv_util::box_err; + +use crate::{Error, FlushState, Result}; + +#[derive(Debug)] +struct LatestTablet { + data: Mutex>, + version: AtomicU64, +} + +/// Tablet may change during split, merge and applying snapshot. So we need a +/// shared value to reflect the latest tablet. `CachedTablet` provide cache that +/// can speed up common access. +#[derive(Clone, Debug)] +pub struct CachedTablet { + latest: Arc>, + cache: Option, + version: u64, +} + +impl CachedTablet { + #[inline] + fn new(data: Option) -> Self { + CachedTablet { + latest: Arc::new(LatestTablet { + data: Mutex::new(data.clone()), + version: AtomicU64::new(0), + }), + cache: data, + version: 0, + } + } + + pub fn set(&mut self, data: EK) { + self.version = { + let mut latest_data = self.latest.data.lock().unwrap(); + *latest_data = Some(data.clone()); + self.latest.version.fetch_add(1, Ordering::Relaxed) + 1 + }; + self.cache = Some(data); + } + + /// Get the tablet from cache without checking if it's up to date. + #[inline] + pub fn cache(&self) -> Option<&EK> { + self.cache.as_ref() + } + + /// Get the latest tablet. + #[inline] + pub fn latest(&mut self) -> Option<&EK> { + if self.latest.version.load(Ordering::Relaxed) > self.version { + let latest_data = self.latest.data.lock().unwrap(); + self.version = self.latest.version.load(Ordering::Relaxed); + self.cache = latest_data.clone(); + } + self.cache() + } + + /// Returns how many versions has passed. 
+ #[inline] + pub fn refresh(&mut self) -> u64 { + let old_version = self.version; + if self.latest.version.load(Ordering::Relaxed) > old_version { + let latest_data = self.latest.data.lock().unwrap(); + self.version = self.latest.version.load(Ordering::Relaxed); + self.cache = latest_data.clone(); + return self.version - old_version; + } + 0 + } +} + +/// Context to be passed to `TabletFactory`. +#[derive(Clone)] +pub struct TabletContext { + /// ID of the tablet. It is usually the region ID. + pub id: u64, + /// Suffix the tablet. It is usually the index that the tablet starts accept + /// incremental modification. The reason to have suffix is that we can keep + /// more than one tablet for a region. + pub suffix: Option, + /// The expected start key of the tablet. The key should be in the format + /// tablet is actually stored, for example should have `z` prefix. + /// + /// Any key that is smaller than this key can be considered obsolete. + pub start_key: Box<[u8]>, + /// The expected end key of the tablet. The key should be in the format + /// tablet is actually stored, for example should have `z` prefix. + /// + /// Any key that is larger than or equal to this key can be considered + /// obsolete. + pub end_key: Box<[u8]>, + /// The states to be persisted when flush is triggered. + /// + /// If not set, apply may not be resumed correctly. 
+ pub flush_state: Option>, +} + +impl Debug for TabletContext { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("TabletContext") + .field("id", &self.id) + .field("suffix", &self.suffix) + .field("start_key", &log_wrappers::Value::key(&self.start_key)) + .field("end_key", &log_wrappers::Value::key(&self.end_key)) + .finish() + } +} + +impl TabletContext { + pub fn new(region: &Region, suffix: Option) -> Self { + TabletContext { + id: region.get_id(), + suffix, + start_key: keys::data_key(region.get_start_key()).into_boxed_slice(), + end_key: keys::data_end_key(region.get_end_key()).into_boxed_slice(), + flush_state: None, + } + } + + /// Create a context that assumes there is only one region and it covers the + /// whole key space. Normally you should only use this in tests. + pub fn with_infinite_region(id: u64, suffix: Option) -> Self { + let mut region = Region::default(); + region.set_id(id); + Self::new(®ion, suffix) + } +} + +/// A factory trait to create new tablet for multi-rocksdb architecture. +// It should be named as `EngineFactory` for consistency, but we are about to +// rename engine to tablet, so always use tablet for new traits/types. +pub trait TabletFactory: Send + Sync { + /// Open the tablet in `path`. + fn open_tablet(&self, ctx: TabletContext, path: &Path) -> Result; + + /// Destroy the tablet and its data + fn destroy_tablet(&self, ctx: TabletContext, path: &Path) -> Result<()>; + + /// Check if the tablet with specified path exists + fn exists(&self, path: &Path) -> bool; +} + +pub struct SingletonFactory { + tablet: EK, +} + +impl SingletonFactory { + pub fn new(tablet: EK) -> Self { + SingletonFactory { tablet } + } +} + +impl TabletFactory for SingletonFactory { + /// Open the tablet in `path`. + /// + /// `id` and `suffix` is used to mark the identity of tablet. The id is + /// likely the region Id, the suffix could be the current raft log + /// index. 
The reason to have suffix is that we can keep more than one + /// tablet for a region. + fn open_tablet(&self, _ctx: TabletContext, _path: &Path) -> Result { + Ok(self.tablet.clone()) + } + + /// Destroy the tablet and its data + fn destroy_tablet(&self, _ctx: TabletContext, _path: &Path) -> Result<()> { + Ok(()) + } + + /// Check if the tablet with specified path exists + fn exists(&self, _path: &Path) -> bool { + true + } +} + +/// A global registry for all tablets. +struct TabletRegistryInner { + // region_id, suffix -> tablet + tablets: Mutex>>, + factory: Box>, + root: PathBuf, +} + +pub struct TabletRegistry { + // One may consider to add cache to speed up access. But it also makes it more + // difficult to gc stale cache. + tablets: Arc>, +} + +impl Clone for TabletRegistry { + fn clone(&self) -> Self { + Self { + tablets: self.tablets.clone(), + } + } +} + +impl TabletRegistry { + pub fn new(factory: Box>, path: impl Into) -> Result { + let root = path.into(); + std::fs::create_dir_all(&root)?; + Ok(TabletRegistry { + tablets: Arc::new(TabletRegistryInner { + tablets: Mutex::new(HashMap::default()), + factory, + root, + }), + }) + } + + pub fn tablet_name(&self, prefix: &str, id: u64, suffix: u64) -> String { + format!("{}{}_{}", prefix, id, suffix) + } + + pub fn parse_tablet_name<'a>(&self, path: &'a Path) -> Option<(&'a str, u64, u64)> { + let name = path.file_name().unwrap().to_str().unwrap(); + let mut parts = name.rsplit('_'); + let suffix = parts.next()?.parse().ok()?; + let id = parts.next()?.parse().ok()?; + let prefix = parts.as_str(); + Some((prefix, id, suffix)) + } + + pub fn tablet_root(&self) -> &Path { + &self.tablets.root + } + + pub fn tablet_path(&self, id: u64, suffix: u64) -> PathBuf { + let name = self.tablet_name("", id, suffix); + self.tablets.root.join(name) + } + + /// Gets a tablet. 
+ pub fn get(&self, id: u64) -> Option> + where + EK: Clone, + { + let tablets = self.tablets.tablets.lock().unwrap(); + tablets.get(&id).cloned() + } + + /// Gets a tablet, create a default one if it doesn't exist. + pub fn get_or_default(&self, id: u64) -> CachedTablet + where + EK: Clone, + { + let mut tablets = self.tablets.tablets.lock().unwrap(); + tablets + .entry(id) + .or_insert_with(|| CachedTablet::new(None)) + .clone() + } + + pub fn tablet_factory(&self) -> &dyn TabletFactory { + self.tablets.factory.as_ref() + } + + pub fn remove(&self, id: u64) { + self.tablets.tablets.lock().unwrap().remove(&id); + } + + /// Load the tablet and set it as the latest. + /// + /// If the tablet doesn't exist, it will create an empty one. + pub fn load(&self, ctx: TabletContext, create: bool) -> Result> + where + EK: Clone, + { + assert!(ctx.suffix.is_some()); + let id = ctx.id; + let path = self.tablet_path(id, ctx.suffix.unwrap()); + if !create && !self.tablets.factory.exists(&path) { + return Err(Error::Other(box_err!( + "tablet ({}, {:?}) doesn't exist", + id, + ctx.suffix + ))); + } + // TODO: use compaction filter to trim range. + let tablet = self.tablets.factory.open_tablet(ctx, &path)?; + let mut cached = self.get_or_default(id); + cached.set(tablet); + Ok(cached) + } + + /// Loop over all opened tablets. Note, it's possible that the visited + /// tablet is not the latest one. If latest one is required, you may + /// either: + /// - loop several times to make it likely to visit all tablets. + /// - send commands to fsms instead, which can guarantee latest tablet is + /// visisted. 
+ pub fn for_each_opened_tablet(&self, mut f: impl FnMut(u64, &mut CachedTablet) -> bool) { + let mut tablets = self.tablets.tablets.lock().unwrap(); + for (id, tablet) in tablets.iter_mut() { + if !f(*id, tablet) { + return; + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_cached_tablet() { + let mut cached_tablet = CachedTablet::new(None); + assert_eq!(cached_tablet.cache(), None); + assert_eq!(cached_tablet.latest(), None); + + cached_tablet = CachedTablet::new(Some(1)); + assert_eq!(cached_tablet.cache().cloned(), Some(1)); + assert_eq!(cached_tablet.latest().cloned(), Some(1)); + + // Setting tablet will refresh cache immediately. + cached_tablet.set(2); + assert_eq!(cached_tablet.cache().cloned(), Some(2)); + + // Test `latest()` will use cache. + // Unsafe modify the data. + let old_data = *cached_tablet.latest.data.lock().unwrap(); + *cached_tablet.latest.data.lock().unwrap() = Some(0); + assert_eq!(cached_tablet.latest().cloned(), old_data); + // Restore the data. + *cached_tablet.latest.data.lock().unwrap() = old_data; + + let mut cloned = cached_tablet.clone(); + // Clone should reuse cache. + assert_eq!(cloned.cache().cloned(), Some(2)); + cloned.set(1); + assert_eq!(cloned.cache().cloned(), Some(1)); + assert_eq!(cloned.latest().cloned(), Some(1)); + + // Local cache won't be refreshed until querying latest. 
+ assert_eq!(cached_tablet.cache().cloned(), Some(2)); + assert_eq!(cached_tablet.latest().cloned(), Some(1)); + assert_eq!(cached_tablet.cache().cloned(), Some(1)); + } + + #[test] + fn test_singleton_factory() { + let tablet = Arc::new(1); + let singleton = SingletonFactory::new(tablet.clone()); + let registry = TabletRegistry::new(Box::new(singleton), "").unwrap(); + let mut ctx = TabletContext::with_infinite_region(1, Some(1)); + registry.load(ctx.clone(), true).unwrap(); + let mut cached = registry.get(1).unwrap(); + assert_eq!(cached.latest().cloned(), Some(tablet.clone())); + + ctx.id = 2; + registry.load(ctx.clone(), true).unwrap(); + let mut count = 0; + registry.for_each_opened_tablet(|id, cached| { + assert!(&[1, 2].contains(&id), "{}", id); + assert_eq!(cached.latest().cloned(), Some(tablet.clone())); + count += 1; + true + }); + assert_eq!(count, 2); + + // Destroy should be ignored. + registry + .tablet_factory() + .destroy_tablet(ctx.clone(), ®istry.tablet_path(2, 1)) + .unwrap(); + + // Exist check should always succeed. 
+ ctx.id = 3; + registry.load(ctx, false).unwrap(); + let mut cached = registry.get(3).unwrap(); + assert_eq!(cached.latest().cloned(), Some(tablet)); + } + + type Record = Arc<(u64, u64)>; + + struct MemoryTablet { + tablet: Mutex>, + } + + impl TabletFactory for MemoryTablet { + fn open_tablet(&self, ctx: TabletContext, path: &Path) -> Result { + let mut tablet = self.tablet.lock().unwrap(); + if tablet.contains_key(path) { + return Err(Error::Other(box_err!("tablet is opened"))); + } + tablet.insert(path.to_owned(), Arc::new((ctx.id, ctx.suffix.unwrap_or(0)))); + Ok(tablet[path].clone()) + } + + fn exists(&self, path: &Path) -> bool { + let tablet = self.tablet.lock().unwrap(); + tablet.contains_key(path) + } + + fn destroy_tablet(&self, ctx: TabletContext, path: &Path) -> Result<()> { + let prev = self.tablet.lock().unwrap().remove(path).unwrap(); + assert_eq!((ctx.id, ctx.suffix.unwrap_or(0)), *prev); + Ok(()) + } + } + + #[test] + fn test_tablet_registry() { + let factory = MemoryTablet { + tablet: Mutex::new(HashMap::default()), + }; + let registry = TabletRegistry::new(Box::new(factory), "").unwrap(); + + let mut ctx = TabletContext::with_infinite_region(1, Some(10)); + let mut tablet_1_10 = registry.load(ctx.clone(), true).unwrap(); + // It's open already, load it twice should report lock error. + registry.load(ctx.clone(), true).unwrap_err(); + let mut cached = registry.get(1).unwrap(); + assert_eq!(cached.latest(), tablet_1_10.latest()); + + let tablet_path = registry.tablet_path(1, 10); + assert!(registry.tablet_factory().exists(&tablet_path)); + + let tablet_path = registry.tablet_path(1, 11); + assert!(!registry.tablet_factory().exists(&tablet_path)); + // Not exist tablet should report error. + ctx.suffix = Some(11); + registry.load(ctx.clone(), false).unwrap_err(); + assert!(registry.get(2).is_none()); + // Though path not exist, but we should be able to create an empty one. 
+ assert_eq!(registry.get_or_default(2).latest(), None); + assert!(!registry.tablet_factory().exists(&tablet_path)); + + // Load new suffix should update cache. + registry.load(ctx, true).unwrap(); + assert_ne!(cached.latest(), tablet_1_10.cache()); + let tablet_path = registry.tablet_path(1, 11); + assert!(registry.tablet_factory().exists(&tablet_path)); + + let mut count = 0; + registry.for_each_opened_tablet(|_, _| { + count += 1; + true + }); + assert_eq!(count, 2); + + registry.remove(2); + assert!(registry.get(2).is_none()); + count = 0; + registry.for_each_opened_tablet(|_, _| { + count += 1; + true + }); + assert_eq!(count, 1); + + let name = registry.tablet_name("prefix_", 12, 30); + assert_eq!(name, "prefix_12_30"); + let normal_name = registry.tablet_name("", 20, 15); + let normal_tablet_path = registry.tablet_path(20, 15); + assert_eq!(registry.tablet_root().join(normal_name), normal_tablet_path); + } +} diff --git a/components/engine_traits/src/write_batch.rs b/components/engine_traits/src/write_batch.rs index d8ff8d07796..8a92ac7c382 100644 --- a/components/engine_traits/src/write_batch.rs +++ b/components/engine_traits/src/write_batch.rs @@ -73,6 +73,13 @@ pub trait WriteBatch: Mutable { /// Commit the WriteBatch to disk with the given options fn write_opt(&mut self, opts: &WriteOptions) -> Result; + // TODO: it should be `FnOnce`. 
+ fn write_callback_opt(&mut self, opts: &WriteOptions, mut cb: impl FnMut()) -> Result { + let seq = self.write_opt(opts)?; + cb(); + Ok(seq) + } + /// Commit the WriteBatch to disk atomically fn write(&mut self) -> Result { self.write_opt(&WriteOptions::default()) diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs index 55f0c31b3c5..3d17a94a494 100644 --- a/components/pd_client/src/client_v2.rs +++ b/components/pd_client/src/client_v2.rs @@ -47,9 +47,7 @@ use kvproto::{ }; use security::SecurityManager; use tikv_util::{ - box_err, - config::ReadableDuration, - error, info, + box_err, error, info, mpsc::future as mpsc, slow_log, thd_name, time::{duration_to_sec, Instant}, @@ -71,6 +69,8 @@ use crate::PdFuture; fn request_timeout() -> Duration { fail_point!("pd_client_v2_request_timeout", |s| { use std::str::FromStr; + + use tikv_util::config::ReadableDuration; ReadableDuration::from_str(&s.unwrap()).unwrap().0 }); Duration::from_secs(REQUEST_TIMEOUT_SEC) @@ -412,6 +412,8 @@ async fn reconnect_loop( let backoff = (|| { fail_point!("pd_client_v2_backoff", |s| { use std::str::FromStr; + + use tikv_util::config::ReadableDuration; ReadableDuration::from_str(&s.unwrap()).unwrap().0 }); request_timeout() diff --git a/components/raft_log_engine/Cargo.toml b/components/raft_log_engine/Cargo.toml index 0ee185fd365..faf536e6bfa 100644 --- a/components/raft_log_engine/Cargo.toml +++ b/components/raft_log_engine/Cargo.toml @@ -5,6 +5,7 @@ publish = false edition = "2018" [dependencies] +codec = { workspace = true } encryption = { workspace = true } engine_traits = { workspace = true } file_system = { workspace = true } @@ -22,3 +23,6 @@ slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global tikv_util = { workspace = true } time = "0.1" tracker = { workspace = true } + +[dev-dependencies] +tempfile = "3.0" diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 
a376adc25b7..7c98adf325f 100644 --- a/components/raft_log_engine/src/engine.rs +++ b/components/raft_log_engine/src/engine.rs @@ -7,11 +7,12 @@ use std::{ sync::Arc, }; +use codec::number::NumberCodec; use encryption::{DataKeyManager, DecrypterReader, EncrypterWriter}; use engine_traits::{ CacheStats, EncryptionKeyManager, EncryptionMethod, PerfContextExt, PerfContextKind, PerfLevel, - RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch as RaftLogBatchTrait, - RaftLogGcTask, Result, + RaftEngine, RaftEngineDebug, RaftEngineReadOnly, RaftLogBatch as RaftLogBatchTrait, Result, + CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use file_system::{IoOp, IoRateLimiter, IoType}; use kvproto::{ @@ -290,6 +291,37 @@ impl FileSystem for ManagedFileSystem { } } +/// Convert a cf to id for encoding. +fn cf_to_id(cf: &str) -> u8 { + match cf { + CF_DEFAULT => 0, + CF_LOCK => 1, + CF_WRITE => 2, + CF_RAFT => 3, + _ => panic!("unrecognized cf {}", cf), + } +} +const MAX_CF_ID: u8 = 3; + +/// Encode a key in the format `{prefix}{num}`. +fn encode_key(prefix: &'static [u8], num: u64) -> [u8; 9] { + debug_assert_eq!(prefix.len(), 1); + let mut buf = [0; 9]; + buf[..prefix.len()].copy_from_slice(prefix); + NumberCodec::encode_u64(&mut buf[prefix.len()..], num); + buf +} + +/// Encode a flush key in the format `{flush key prefix}{cf_id}{tablet_index}`. 
+fn encode_flushed_key(cf: &str, tablet_index: u64) -> [u8; 10] { + debug_assert_eq!(FLUSH_STATE_KEY.len(), 1); + let mut buf = [0; 10]; + buf[..FLUSH_STATE_KEY.len()].copy_from_slice(FLUSH_STATE_KEY); + buf[FLUSH_STATE_KEY.len()] = cf_to_id(cf); + NumberCodec::encode_u64(&mut buf[FLUSH_STATE_KEY.len() + 1..], tablet_index); + buf +} + #[derive(Clone)] pub struct RaftLogEngine(Arc>); @@ -348,6 +380,9 @@ const PREPARE_BOOTSTRAP_REGION_KEY: &[u8] = &[0x02]; const REGION_STATE_KEY: &[u8] = &[0x03]; const APPLY_STATE_KEY: &[u8] = &[0x04]; const RECOVER_STATE_KEY: &[u8] = &[0x05]; +const FLUSH_STATE_KEY: &[u8] = &[0x06]; +// All keys are of the same length. +const KEY_PREFIX_LEN: usize = RAFT_LOG_STATE_KEY.len(); impl RaftLogBatchTrait for RaftLogBatch { fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { @@ -401,15 +436,47 @@ impl RaftLogBatchTrait for RaftLogBatch { Ok(()) } - fn put_region_state(&mut self, raft_group_id: u64, state: &RegionLocalState) -> Result<()> { + fn put_region_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RegionLocalState, + ) -> Result<()> { + let key = encode_key(REGION_STATE_KEY, apply_index); + self.0 + .put_message(raft_group_id, key.to_vec(), state) + .map_err(transfer_error) + } + + fn put_apply_state( + &mut self, + raft_group_id: u64, + apply_index: u64, + state: &RaftApplyState, + ) -> Result<()> { + let key = encode_key(APPLY_STATE_KEY, apply_index); self.0 - .put_message(raft_group_id, REGION_STATE_KEY.to_vec(), state) + .put_message(raft_group_id, key.to_vec(), state) .map_err(transfer_error) } - fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()> { + fn put_flushed_index( + &mut self, + raft_group_id: u64, + cf: &str, + tablet_index: u64, + apply_index: u64, + ) -> Result<()> { + let key = encode_flushed_key(cf, tablet_index); + let mut value = vec![0; 8]; + NumberCodec::encode_u64(&mut value, apply_index); + self.0.put(raft_group_id, key.to_vec(), 
value); + Ok(()) + } + + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { self.0 - .put_message(raft_group_id, APPLY_STATE_KEY.to_vec(), state) + .put_message(STORE_STATE_ID, RECOVER_STATE_KEY.to_vec(), state) .map_err(transfer_error) } } @@ -465,16 +532,72 @@ impl RaftEngineReadOnly for RaftLogEngine { .map_err(transfer_error) } - fn get_region_state(&self, raft_group_id: u64) -> Result> { + fn get_region_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result> { + let mut state = None; + self.0 + .scan_messages( + raft_group_id, + Some(REGION_STATE_KEY), + Some(APPLY_STATE_KEY), + true, + |key, value| { + let index = NumberCodec::decode_u64(&key[REGION_STATE_KEY.len()..]); + if index > apply_index { + true + } else { + state = Some(value); + false + } + }, + ) + .map_err(transfer_error)?; + Ok(state) + } + + fn get_apply_state( + &self, + raft_group_id: u64, + apply_index: u64, + ) -> Result> { + let mut state = None; self.0 - .get_message(raft_group_id, REGION_STATE_KEY) - .map_err(transfer_error) + .scan_messages( + raft_group_id, + Some(APPLY_STATE_KEY), + Some(RECOVER_STATE_KEY), + true, + |key, value| { + let index = NumberCodec::decode_u64(&key[REGION_STATE_KEY.len()..]); + if index > apply_index { + true + } else { + state = Some(value); + false + } + }, + ) + .map_err(transfer_error)?; + Ok(state) } - fn get_apply_state(&self, raft_group_id: u64) -> Result> { + fn get_flushed_index(&self, raft_group_id: u64, cf: &str) -> Result> { + let mut start = [0; 2]; + start[..FLUSH_STATE_KEY.len()].copy_from_slice(FLUSH_STATE_KEY); + start[FLUSH_STATE_KEY.len()] = cf_to_id(cf); + let mut end = start; + end[FLUSH_STATE_KEY.len()] += 1; + let mut index = None; self.0 - .get_message(raft_group_id, APPLY_STATE_KEY) - .map_err(transfer_error) + .scan_raw_messages(raft_group_id, Some(&start), Some(&end), true, |_, v| { + index = Some(NumberCodec::decode_u64(v)); + false + }) + .map_err(transfer_error)?; + Ok(index) } fn 
get_recover_state(&self) -> Result> { @@ -538,65 +661,82 @@ impl RaftEngine for RaftLogEngine { Ok(()) } - fn append(&self, raft_group_id: u64, entries: Vec) -> Result { - let mut batch = Self::LogBatch::default(); - batch - .0 - .add_entries::(raft_group_id, &entries) - .map_err(transfer_error)?; - self.0.write(&mut batch.0, false).map_err(transfer_error) - } - - fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { - let mut batch = Self::LogBatch::default(); + fn gc( + &self, + raft_group_id: u64, + _from: u64, + to: u64, + batch: &mut Self::LogBatch, + ) -> Result<()> { batch .0 - .put_message(STORE_STATE_ID, STORE_IDENT_KEY.to_vec(), ident) - .map_err(transfer_error)?; - self.0.write(&mut batch.0, true).map_err(transfer_error)?; + .add_command(raft_group_id, Command::Compact { index: to }); Ok(()) } - fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { - let mut batch = Self::LogBatch::default(); - batch - .0 - .put_message(raft_group_id, RAFT_LOG_STATE_KEY.to_vec(), state) + fn delete_all_but_one_states_before( + &self, + raft_group_id: u64, + apply_index: u64, + batch: &mut Self::LogBatch, + ) -> Result<()> { + // Makes sure REGION_STATE_KEY is the smallest and FLUSH_STATE_KEY is the + // largest. 
+ debug_assert!(REGION_STATE_KEY < APPLY_STATE_KEY); + debug_assert!(APPLY_STATE_KEY < FLUSH_STATE_KEY); + + let mut end = [0; KEY_PREFIX_LEN + 1]; + end[..KEY_PREFIX_LEN].copy_from_slice(FLUSH_STATE_KEY); + end[KEY_PREFIX_LEN] = MAX_CF_ID + 1; + let mut found_region_state = false; + let mut found_apply_state = false; + let mut found_flush_state = [false; MAX_CF_ID as usize + 1]; + self.0 + .scan_raw_messages( + raft_group_id, + Some(REGION_STATE_KEY), + Some(&end), + true, + |key, _| { + match &key[..KEY_PREFIX_LEN] { + REGION_STATE_KEY + if NumberCodec::decode_u64(&key[KEY_PREFIX_LEN..]) <= apply_index => + { + if found_region_state { + batch.0.delete(raft_group_id, key.to_vec()); + } else { + found_region_state = true; + } + } + APPLY_STATE_KEY + if NumberCodec::decode_u64(&key[KEY_PREFIX_LEN..]) <= apply_index => + { + if found_apply_state { + batch.0.delete(raft_group_id, key.to_vec()); + } else { + found_apply_state = true; + } + } + FLUSH_STATE_KEY => { + let cf_id = key[KEY_PREFIX_LEN]; + let tablet_index = NumberCodec::decode_u64(&key[KEY_PREFIX_LEN + 1..]); + if cf_id <= MAX_CF_ID && tablet_index <= apply_index { + if found_flush_state[cf_id as usize] { + batch.0.delete(raft_group_id, key.to_vec()); + } else { + found_flush_state[cf_id as usize] = true; + } + } + } + _ => {} + } + true + }, + ) .map_err(transfer_error)?; - self.0.write(&mut batch.0, false).map_err(transfer_error)?; Ok(()) } - fn gc(&self, raft_group_id: u64, from: u64, to: u64) -> Result { - self.batch_gc(vec![RaftLogGcTask { - raft_group_id, - from, - to, - }]) - } - - fn batch_gc(&self, tasks: Vec) -> Result { - let mut batch = self.log_batch(tasks.len()); - let mut old_first_index = Vec::with_capacity(tasks.len()); - for task in &tasks { - batch - .0 - .add_command(task.raft_group_id, Command::Compact { index: task.to }); - old_first_index.push(self.0.first_index(task.raft_group_id)); - } - - self.0.write(&mut batch.0, false).map_err(transfer_error)?; - - let mut total = 0; - for 
(old_first_index, task) in old_first_index.iter().zip(tasks) { - let new_first_index = self.0.first_index(task.raft_group_id); - if let (Some(old), Some(new)) = (old_first_index, new_first_index) { - total += new.saturating_sub(*old); - } - } - Ok(total as usize) - } - fn need_manual_purge(&self) -> bool { true } @@ -635,16 +775,6 @@ impl RaftEngine for RaftLogEngine { } Ok(()) } - - fn put_recover_state(&self, state: &StoreRecoverState) -> Result<()> { - let mut batch = Self::LogBatch::default(); - batch - .0 - .put_message(STORE_STATE_ID, RECOVER_STATE_KEY.to_vec(), state) - .map_err(transfer_error)?; - self.0.write(&mut batch.0, true).map_err(transfer_error)?; - Ok(()) - } } fn transfer_error(e: RaftEngineError) -> engine_traits::Error { @@ -657,3 +787,67 @@ fn transfer_error(e: RaftEngineError) -> engine_traits::Error { } } } + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + + use engine_traits::ALL_CFS; + + use super::*; + + #[test] + fn test_apply_related_states() { + let dir = tempfile::tempdir().unwrap(); + let cfg = RaftEngineConfig { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let engine = RaftLogEngine::new(cfg, None, None).unwrap(); + assert_matches!(engine.get_region_state(2, u64::MAX), Ok(None)); + assert_matches!(engine.get_apply_state(2, u64::MAX), Ok(None)); + for cf in ALL_CFS { + assert_matches!(engine.get_flushed_index(2, cf), Ok(None)); + } + + let mut wb = engine.log_batch(10); + let mut region_state = RegionLocalState::default(); + region_state.mut_region().set_id(3); + wb.put_region_state(2, 1, ®ion_state).unwrap(); + let mut apply_state = RaftApplyState::default(); + apply_state.set_applied_index(3); + wb.put_apply_state(2, 3, &apply_state).unwrap(); + for cf in ALL_CFS.iter().take(2) { + wb.put_flushed_index(2, cf, 5, 4).unwrap(); + } + engine.consume(&mut wb, false).unwrap(); + + for cf in ALL_CFS.iter().take(2) { + assert_matches!(engine.get_flushed_index(2, cf), Ok(Some(4))); + } 
+ for cf in ALL_CFS.iter().skip(2) { + assert_matches!(engine.get_flushed_index(2, cf), Ok(None)); + } + + let mut region_state2 = region_state.clone(); + region_state2.mut_region().set_id(5); + wb.put_region_state(2, 4, ®ion_state2).unwrap(); + let mut apply_state2 = apply_state.clone(); + apply_state2.set_applied_index(5); + wb.put_apply_state(2, 5, &apply_state2).unwrap(); + for cf in ALL_CFS { + wb.put_flushed_index(2, cf, 6, 5).unwrap(); + } + engine.consume(&mut wb, false).unwrap(); + + assert_matches!(engine.get_region_state(2, 0), Ok(None)); + assert_matches!(engine.get_region_state(2, 1), Ok(Some(s)) if s == region_state); + assert_matches!(engine.get_region_state(2, 4), Ok(Some(s)) if s == region_state2); + assert_matches!(engine.get_apply_state(2, 0), Ok(None)); + assert_matches!(engine.get_apply_state(2, 3), Ok(Some(s)) if s == apply_state); + assert_matches!(engine.get_apply_state(2, 5), Ok(Some(s)) if s == apply_state2); + for cf in ALL_CFS { + assert_matches!(engine.get_flushed_index(2, cf), Ok(Some(5))); + } + } +} diff --git a/components/raft_log_engine/src/lib.rs b/components/raft_log_engine/src/lib.rs index 8eda4e5ae24..25899ddf2bb 100644 --- a/components/raft_log_engine/src/lib.rs +++ b/components/raft_log_engine/src/lib.rs @@ -16,6 +16,7 @@ //! Please read the engine_trait crate docs before hacking. 
#![cfg_attr(test, feature(test))] +#![feature(assert_matches)] #[macro_use] extern crate tikv_util; diff --git a/components/raftstore-v2/Cargo.toml b/components/raftstore-v2/Cargo.toml index 46ed20f8d10..6726c5ed742 100644 --- a/components/raftstore-v2/Cargo.toml +++ b/components/raftstore-v2/Cargo.toml @@ -30,6 +30,7 @@ cloud-azure = ["raftstore/cloud-azure"] [dependencies] batch-system = { workspace = true } +bytes = "1.0" causal_ts = { workspace = true } collections = { workspace = true } concurrency_manager = { workspace = true } @@ -43,15 +44,18 @@ futures = { version = "0.3", features = ["compat"] } keys = { workspace = true } kvproto = { workspace = true } log_wrappers = { workspace = true } +parking_lot = "0.12" pd_client = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft-proto = { version = "0.7.0" } raftstore = { workspace = true } +rand = "0.8.3" resource_metering = { workspace = true } slog = "2.3" smallvec = "1.4" +thiserror = "1.0" tikv_util = { workspace = true } time = "0.1" tracker = { workspace = true } @@ -68,9 +72,9 @@ test_util = { workspace = true } [[test]] name = "raftstore-v2-failpoints" path = "tests/failpoints/mod.rs" -required-features = ["failpoints", "testexport"] +required-features = ["failpoints", "testexport", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] [[test]] name = "raftstore-v2-integrations" path = "tests/integrations/mod.rs" -required-features = ["testexport"] +required-features = ["testexport", "test-engine-kv-rocksdb", "test-engine-raft-raft-engine"] diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 199e8cafbd8..a3800085522 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -2,7 +2,6 @@ use std::{ ops::{Deref, DerefMut}, - 
path::Path, sync::{ atomic::{AtomicBool, Ordering}, Arc, Mutex, @@ -16,30 +15,29 @@ use batch_system::{ use causal_ts::CausalTsProviderImpl; use collections::HashMap; use concurrency_manager::ConcurrencyManager; -use crossbeam::channel::{Sender, TrySendError}; -use engine_traits::{Engines, KvEngine, RaftEngine, TabletFactory}; +use crossbeam::channel::TrySendError; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use file_system::{set_io_type, IoType}; -use futures::{compat::Future01CompatExt, FutureExt}; -use kvproto::{ - metapb::Store, - raft_serverpb::{PeerState, RaftMessage}, -}; +use kvproto::{disk_usage::DiskUsage, raft_serverpb::RaftMessage}; use pd_client::PdClient; -use raft::INVALID_ID; -use raftstore::store::{ - fsm::store::PeerTickBatch, local_metrics::RaftMetrics, Config, ReadRunner, ReadTask, - StoreWriters, TabletSnapManager, Transport, WriteSenders, +use raft::{StateRole, INVALID_ID}; +use raftstore::{ + coprocessor::{CoprocessorHost, RegionChangeEvent}, + store::{ + fsm::store::{PeerTickBatch, ENTRY_CACHE_EVICT_TICK_DURATION}, + local_metrics::RaftMetrics, + Config, ReadRunner, ReadTask, SplitCheckRunner, SplitCheckTask, StoreWriters, + TabletSnapManager, Transport, WriteSenders, + }, }; -use slog::Logger; +use slog::{warn, Logger}; use tikv_util::{ box_err, config::{Tracker, VersionTrack}, - defer, - future::poll_future_notify, sys::SysQuota, time::Instant as TiInstant, timer::SteadyTimer, - worker::{Scheduler, Worker}, + worker::{LazyWorker, Scheduler, Worker}, yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}, Either, }; @@ -47,9 +45,10 @@ use time::Timespec; use crate::{ fsm::{PeerFsm, PeerFsmDelegate, SenderFsmPair, StoreFsm, StoreFsmDelegate, StoreMeta}, + operation::SPLIT_PREFIX, raft::Storage, router::{PeerMsg, PeerTick, StoreMsg}, - worker::{PdRunner, PdTask}, + worker::{pd, tablet_gc}, Error, Result, }; @@ -57,6 +56,7 @@ use crate::{ pub struct StoreContext { /// A logger without any KV. 
It's clean for creating new PeerFSM. pub logger: Logger, + pub coprocessor_host: CoprocessorHost, /// The transport for sending messages to peers on other stores. pub trans: T, pub current_time: Option, @@ -70,15 +70,43 @@ pub struct StoreContext { pub tick_batch: Vec, /// The precise timer for scheduling tick. pub timer: SteadyTimer, - pub write_senders: WriteSenders, + pub schedulers: Schedulers, /// store meta - pub store_meta: Arc>>, + pub store_meta: Arc>, pub engine: ER, - pub tablet_factory: Arc>, + pub tablet_registry: TabletRegistry, pub apply_pool: FuturePool, - pub read_scheduler: Scheduler>, + + /// Disk usage for the store itself. + pub self_disk_usage: DiskUsage, + pub snap_mgr: TabletSnapManager, - pub pd_scheduler: Scheduler, +} + +impl StoreContext { + pub fn update_ticks_timeout(&mut self) { + self.tick_batch[PeerTick::Raft as usize].wait_duration = self.cfg.raft_base_tick_interval.0; + self.tick_batch[PeerTick::CompactLog as usize].wait_duration = + self.cfg.raft_log_gc_tick_interval.0; + self.tick_batch[PeerTick::EntryCacheEvict as usize].wait_duration = + ENTRY_CACHE_EVICT_TICK_DURATION; + self.tick_batch[PeerTick::PdHeartbeat as usize].wait_duration = + self.cfg.pd_heartbeat_tick_interval.0; + self.tick_batch[PeerTick::SplitRegionCheck as usize].wait_duration = + self.cfg.split_region_check_tick_interval.0; + self.tick_batch[PeerTick::CheckPeerStaleState as usize].wait_duration = + self.cfg.peer_stale_state_check_interval.0; + self.tick_batch[PeerTick::CheckMerge as usize].wait_duration = + self.cfg.merge_check_tick_interval.0; + self.tick_batch[PeerTick::CheckLeaderLease as usize].wait_duration = + self.cfg.check_leader_lease_interval.0; + self.tick_batch[PeerTick::ReactivateMemoryLock as usize].wait_duration = + self.cfg.reactive_memory_lock_tick_interval.0; + self.tick_batch[PeerTick::ReportBuckets as usize].wait_duration = + self.cfg.report_region_buckets_tick_interval.0; + self.tick_batch[PeerTick::CheckLongUncommitted as 
usize].wait_duration = + self.cfg.check_long_uncommitted_interval.0; + } } /// A [`PollHandler`] that handles updates of [`StoreFsm`]s and [`PeerFsm`]s. @@ -153,6 +181,7 @@ impl PollHandler PollHandler>>]) {} + fn end(&mut self, _batch: &mut [Option>>]) {} fn pause(&mut self) { if self.poll_ctx.trans.need_flush() { @@ -220,17 +249,16 @@ impl PollHandler { cfg: Arc>, + coprocessor_host: CoprocessorHost, store_id: u64, engine: ER, - tablet_factory: Arc>, + tablet_registry: TabletRegistry, trans: T, router: StoreRouter, - read_scheduler: Scheduler>, - pd_scheduler: Scheduler, - write_senders: WriteSenders, + schedulers: Schedulers, apply_pool: FuturePool, logger: Logger, - store_meta: Arc>>, + store_meta: Arc>, snap_mgr: TabletSnapManager, } @@ -239,15 +267,14 @@ impl StorePollerBuilder { cfg: Arc>, store_id: u64, engine: ER, - tablet_factory: Arc>, + tablet_registry: TabletRegistry, trans: T, router: StoreRouter, - read_scheduler: Scheduler>, - pd_scheduler: Scheduler, - store_writers: &mut StoreWriters, + schedulers: Schedulers, logger: Logger, - store_meta: Arc>>, + store_meta: Arc>, snap_mgr: TabletSnapManager, + coprocessor_host: CoprocessorHost, ) -> Self { let pool_size = cfg.value().apply_batch_system.pool_size; let max_pool_size = std::cmp::max( @@ -263,16 +290,15 @@ impl StorePollerBuilder { cfg, store_id, engine, - tablet_factory, + tablet_registry, trans, router, - read_scheduler, - pd_scheduler, apply_pool, logger, - write_senders: store_writers.senders(), + schedulers, store_meta, snap_mgr, + coprocessor_host, } } @@ -288,13 +314,24 @@ impl StorePollerBuilder { region_id, self.store_id, self.engine.clone(), - self.read_scheduler.clone(), + self.schedulers.read.clone(), &self.logger, )? 
{ Some(p) => p, None => return Ok(()), }; - let (sender, peer_fsm) = PeerFsm::new(&cfg, &*self.tablet_factory, storage)?; + + if storage.is_initialized() { + self.coprocessor_host.on_region_changed( + storage.region(), + RegionChangeEvent::Create, + StateRole::Follower, + ); + } + meta.set_region(storage.region(), storage.is_initialized(), &self.logger); + + let (sender, peer_fsm) = + PeerFsm::new(&cfg, &self.tablet_registry, &self.snap_mgr, storage)?; meta.region_read_progress .insert(region_id, peer_fsm.as_ref().peer().read_progress().clone()); @@ -313,6 +350,32 @@ impl StorePollerBuilder { } fn clean_up_tablets(&self, peers: &HashMap>) -> Result<()> { + for entry in file_system::read_dir(self.tablet_registry.tablet_root())? { + let entry = entry?; + let path = entry.path(); + let Some((prefix, region_id, tablet_index)) = self.tablet_registry.parse_tablet_name(&path) else { continue }; + let fsm = match peers.get(®ion_id) { + Some((_, fsm)) => fsm, + None => { + // The peer is either destroyed or not created yet. It will be + // recovered by leader heartbeats. + file_system::remove_dir_all(&path)?; + continue; + } + }; + // Valid split tablet should be installed during recovery. + if prefix == SPLIT_PREFIX { + file_system::remove_dir_all(&path)?; + continue; + } + if prefix.is_empty() { + // Stale split data can be deleted. + if fsm.peer().storage().tablet_index() > tablet_index { + file_system::remove_dir_all(&path)?; + } + } + // TODO: handle other prefix + } // TODO: list all available tablets and destroy those which are not in the // peers. 
Ok(()) @@ -327,9 +390,9 @@ where { type Handler = StorePoller; - fn build(&mut self, priority: batch_system::Priority) -> Self::Handler { + fn build(&mut self, _priority: batch_system::Priority) -> Self::Handler { let cfg = self.cfg.value().clone(); - let poll_ctx = StoreContext { + let mut poll_ctx = StoreContext { logger: self.logger.clone(), trans: self.trans.clone(), current_time: None, @@ -339,35 +402,55 @@ where router: self.router.clone(), tick_batch: vec![PeerTickBatch::default(); PeerTick::VARIANT_COUNT], timer: SteadyTimer::default(), - write_senders: self.write_senders.clone(), + schedulers: self.schedulers.clone(), store_meta: self.store_meta.clone(), engine: self.engine.clone(), - tablet_factory: self.tablet_factory.clone(), + tablet_registry: self.tablet_registry.clone(), apply_pool: self.apply_pool.clone(), - read_scheduler: self.read_scheduler.clone(), + self_disk_usage: DiskUsage::Normal, snap_mgr: self.snap_mgr.clone(), - pd_scheduler: self.pd_scheduler.clone(), + coprocessor_host: self.coprocessor_host.clone(), }; + poll_ctx.update_ticks_timeout(); let cfg_tracker = self.cfg.clone().tracker("raftstore".to_string()); StorePoller::new(poll_ctx, cfg_tracker) } } +#[derive(Clone)] +pub struct Schedulers { + pub read: Scheduler>, + pub pd: Scheduler, + pub tablet_gc: Scheduler>, + pub write: WriteSenders, + + // Following is not maintained by raftstore itself. + pub split_check: Scheduler, +} + /// A set of background threads that will processing offloaded work from /// raftstore. struct Workers { /// Worker for fetching raft logs asynchronously - async_read_worker: Worker, - pd_worker: Worker, - store_writers: StoreWriters, + async_read: Worker, + pd: LazyWorker, + tablet_gc_worker: Worker, + async_write: StoreWriters, + purge: Option, + + // Following is not maintained by raftstore itself. 
+ background: Worker, } -impl Default for Workers { - fn default() -> Self { +impl Workers { + fn new(background: Worker, pd: LazyWorker, purge: Option) -> Self { Self { - async_read_worker: Worker::new("async-read-worker"), - pd_worker: Worker::new("pd-worker"), - store_writers: StoreWriters::default(), + async_read: Worker::new("async-read-worker"), + pd, + tablet_gc_worker: Worker::new("tablet-gc-worker"), + async_write: StoreWriters::default(), + purge, + background, } } } @@ -386,14 +469,17 @@ impl StoreSystem { store_id: u64, cfg: Arc>, raft_engine: ER, - tablet_factory: Arc>, + tablet_registry: TabletRegistry, trans: T, pd_client: Arc, router: &StoreRouter, - store_meta: Arc>>, + store_meta: Arc>, snap_mgr: TabletSnapManager, concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, // used for rawkv apiv2 + coprocessor_host: CoprocessorHost, + background: Worker, + pd_worker: LazyWorker, ) -> Result<()> where T: Transport + 'static, @@ -407,46 +493,84 @@ impl StoreSystem { .broadcast_normal(|| PeerMsg::Tick(PeerTick::PdHeartbeat)); }); - let mut workers = Workers::default(); + let purge_worker = if raft_engine.need_manual_purge() { + let worker = Worker::new("purge-worker"); + let raft_clone = raft_engine.clone(); + let logger = self.logger.clone(); + let router = router.clone(); + worker.spawn_interval_task(cfg.value().raft_engine_purge_interval.0, move || { + match raft_clone.manual_purge() { + Ok(regions) => { + for r in regions { + let _ = router.send(r, PeerMsg::ForceCompactLog); + } + } + Err(e) => { + warn!(logger, "purge expired files"; "err" => %e); + } + }; + }); + Some(worker) + } else { + None + }; + + let mut workers = Workers::new(background, pd_worker, purge_worker); workers - .store_writers + .async_write .spawn(store_id, raft_engine.clone(), None, router, &trans, &cfg)?; let mut read_runner = ReadRunner::new(router.clone(), raft_engine.clone()); read_runner.set_snap_mgr(snap_mgr.clone()); - let read_scheduler = workers - 
.async_read_worker - .start("async-read-worker", read_runner); - - let pd_scheduler = workers.pd_worker.start( - "pd-worker", - PdRunner::new( - store_id, - pd_client, - raft_engine.clone(), - tablet_factory.clone(), + let read_scheduler = workers.async_read.start("async-read-worker", read_runner); + + workers.pd.start(pd::Runner::new( + store_id, + pd_client, + raft_engine.clone(), + tablet_registry.clone(), + router.clone(), + workers.pd.remote(), + concurrency_manager, + causal_ts_provider, + self.logger.clone(), + self.shutdown.clone(), + )); + + let split_check_scheduler = workers.background.start( + "split-check", + SplitCheckRunner::with_registry( + tablet_registry.clone(), router.clone(), - workers.pd_worker.remote(), - concurrency_manager, - causal_ts_provider, - self.logger.clone(), - self.shutdown.clone(), + coprocessor_host.clone(), ), ); - let mut builder = StorePollerBuilder::new( + let tablet_gc_scheduler = workers.tablet_gc_worker.start( + "tablet-gc-worker", + tablet_gc::Runner::new(tablet_registry.clone(), self.logger.clone()), + ); + + let schedulers = Schedulers { + read: read_scheduler, + pd: workers.pd.scheduler(), + tablet_gc: tablet_gc_scheduler, + write: workers.async_write.senders(), + split_check: split_check_scheduler, + }; + + let builder = StorePollerBuilder::new( cfg.clone(), store_id, raft_engine, - tablet_factory, + tablet_registry, trans, router.clone(), - read_scheduler, - pd_scheduler, - &mut workers.store_writers, + schedulers, self.logger.clone(), store_meta.clone(), snap_mgr, + coprocessor_host, ); self.workers = Some(workers); let peers = builder.init()?; @@ -462,8 +586,6 @@ impl StoreSystem { for (region_id, (tx, fsm)) in peers { meta.readers .insert(region_id, fsm.peer().generate_read_delegate()); - meta.tablet_caches - .insert(region_id, fsm.peer().tablet().clone()); address.push(region_id); mailboxes.push(( @@ -494,9 +616,12 @@ impl StoreSystem { self.system.shutdown(); - workers.store_writers.shutdown(); - 
workers.async_read_worker.stop(); - workers.pd_worker.stop(); + workers.async_write.shutdown(); + workers.async_read.stop(); + workers.pd.stop(); + if let Some(w) = workers.purge { + w.stop(); + } } } @@ -512,6 +637,14 @@ impl StoreRouter { &self.logger } + #[inline] + pub fn check_send(&self, addr: u64, msg: PeerMsg) -> crate::Result<()> { + match self.router.send(addr, msg) { + Ok(()) => Ok(()), + Err(e) => Err(raftstore::router::handle_send_error(addr, e)), + } + } + pub fn send_raft_message( &self, msg: Box, diff --git a/components/raftstore-v2/src/bootstrap.rs b/components/raftstore-v2/src/bootstrap.rs index 6700db4d45f..62bc9e4b8c5 100644 --- a/components/raftstore-v2/src/bootstrap.rs +++ b/components/raftstore-v2/src/bootstrap.rs @@ -15,7 +15,7 @@ use raftstore::store::initial_region; use slog::{debug, error, info, warn, Logger}; use tikv_util::{box_err, box_try}; -use crate::{raft::write_initial_states, Result}; +use crate::{operation::write_initial_states, Result}; const MAX_CHECK_CLUSTER_BOOTSTRAPPED_RETRY_COUNT: u64 = 60; const CHECK_CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL: Duration = Duration::from_secs(3); @@ -97,8 +97,9 @@ impl<'a, ER: RaftEngine> Bootstrap<'a, ER> { let mut ident = StoreIdent::default(); ident.set_cluster_id(self.cluster_id); ident.set_store_id(id); - self.engine.put_store_ident(&ident)?; - self.engine.sync()?; + let mut lb = self.engine.log_batch(1); + lb.put_store_ident(&ident)?; + self.engine.consume(&mut lb, true)?; fail_point!("node_after_bootstrap_store", |_| Err(box_err!( "injected error: node_after_bootstrap_store" ))); diff --git a/components/raftstore-v2/src/fsm/apply.rs b/components/raftstore-v2/src/fsm/apply.rs index b8faf589760..c0eabd2120e 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -1,30 +1,27 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - pin::Pin, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, - task::{Context, Poll}, + sync::Arc, + time::{Duration, Instant}, }; use batch_system::{Fsm, FsmScheduler, Mailbox}; use crossbeam::channel::TryRecvError; -use engine_traits::{KvEngine, TabletFactory}; -use futures::{Future, StreamExt}; +use engine_traits::{FlushState, KvEngine, TabletRegistry}; +use futures::{compat::Future01CompatExt, FutureExt, StreamExt}; use kvproto::{metapb, raft_serverpb::RegionLocalState}; use raftstore::store::ReadTask; use slog::Logger; use tikv_util::{ mpsc::future::{self, Receiver, Sender, WakePolicy}, + timer::GLOBAL_TIMER_HANDLE, worker::Scheduler, }; use crate::{ + operation::DataTrace, raft::Apply, router::{ApplyRes, ApplyTask, PeerMsg}, - tablet::CachedTablet, }; /// A trait for reporting apply result. @@ -37,7 +34,7 @@ pub trait ApplyResReporter { impl, S: FsmScheduler> ApplyResReporter for Mailbox { fn report(&self, apply_res: ApplyRes) { // TODO: check shutdown. - self.force_send(PeerMsg::ApplyRes(apply_res)).unwrap(); + let _ = self.force_send(PeerMsg::ApplyRes(apply_res)); } } @@ -64,9 +61,10 @@ impl ApplyFsm { peer: metapb::Peer, region_state: RegionLocalState, res_reporter: R, - remote_tablet: CachedTablet, - tablet_factory: Arc>, + tablet_registry: TabletRegistry, read_scheduler: Scheduler>, + flush_state: Arc, + log_recovery: Option>, logger: Logger, ) -> (ApplyScheduler, Self) { let (tx, rx) = future::unbounded(WakePolicy::Immediately); @@ -74,9 +72,10 @@ impl ApplyFsm { peer, region_state, res_reporter, - remote_tablet, - tablet_factory, + tablet_registry, read_scheduler, + flush_state, + log_recovery, logger, ); ( @@ -92,15 +91,29 @@ impl ApplyFsm { impl ApplyFsm { pub async fn handle_all_tasks(&mut self) { loop { - let mut task = match self.receiver.next().await { - Some(t) => t, - None => return, + let timeout = GLOBAL_TIMER_HANDLE + .delay(Instant::now() + Duration::from_secs(10)) + .compat(); + let res = futures::select! 
{ + res = self.receiver.next().fuse() => res, + _ = timeout.fuse() => None, + }; + let mut task = match res { + Some(r) => r, + None => { + self.apply.release_memory(); + match self.receiver.next().await { + Some(t) => t, + None => return, + } + } }; loop { match task { // TODO: flush by buffer size. ApplyTask::CommittedEntries(ce) => self.apply.apply_committed_entries(ce).await, ApplyTask::Snapshot(snap_task) => self.apply.schedule_gen_snapshot(snap_task), + ApplyTask::UnsafeWrite(raw_write) => self.apply.apply_unsafe_write(raw_write), } // TODO: yield after some time. diff --git a/components/raftstore-v2/src/fsm/peer.rs b/components/raftstore-v2/src/fsm/peer.rs index cd93463a524..49f1efcb760 100644 --- a/components/raftstore-v2/src/fsm/peer.rs +++ b/components/raftstore-v2/src/fsm/peer.rs @@ -6,17 +6,15 @@ use std::borrow::Cow; use batch_system::{BasicMailbox, Fsm}; use crossbeam::channel::TryRecvError; -use engine_traits::{KvEngine, RaftEngine, TabletFactory}; -use raftstore::store::{Config, Transport}; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; +use raftstore::store::{Config, LocksStatus, TabletSnapManager, Transport}; use slog::{debug, error, info, trace, Logger}; use tikv_util::{ is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, time::{duration_to_sec, Instant}, - yatp_pool::FuturePool, }; -use super::ApplyFsm; use crate::{ batch::StoreContext, raft::{Peer, Storage}, @@ -32,25 +30,28 @@ pub struct PeerFsm { receiver: Receiver, /// A registry for all scheduled ticks. This can avoid scheduling ticks /// twice accidentally. 
- tick_registry: u16, + tick_registry: [bool; PeerTick::VARIANT_COUNT], is_stopped: bool, + reactivate_memory_lock_ticks: usize, } impl PeerFsm { pub fn new( cfg: &Config, - tablet_factory: &dyn TabletFactory, + tablet_registry: &TabletRegistry, + snap_mgr: &TabletSnapManager, storage: Storage, ) -> Result> { - let peer = Peer::new(cfg, tablet_factory, storage)?; + let peer = Peer::new(cfg, tablet_registry, snap_mgr, storage)?; info!(peer.logger, "create peer"); let (tx, rx) = mpsc::loose_bounded(cfg.notify_capacity); let fsm = Box::new(PeerFsm { peer, mailbox: None, receiver: rx, - tick_registry: 0, + tick_registry: [false; PeerTick::VARIANT_COUNT], is_stopped: false, + reactivate_memory_lock_ticks: 0, }); Ok((tx, fsm)) } @@ -127,11 +128,21 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, Self { fsm, store_ctx } } + #[inline] + fn schedule_pending_ticks(&mut self) { + let pending_ticks = self.fsm.peer.take_pending_ticks(); + for tick in pending_ticks { + if tick == PeerTick::ReactivateMemoryLock { + self.fsm.reactivate_memory_lock_ticks = 0; + } + self.schedule_tick(tick); + } + } + pub fn schedule_tick(&mut self, tick: PeerTick) { assert!(PeerTick::VARIANT_COUNT <= u16::BITS as usize); let idx = tick as usize; - let key = 1u16 << (idx as u16); - if self.fsm.tick_registry & key != 0 { + if self.fsm.tick_registry[idx] { return; } if is_zero_duration(&self.store_ctx.tick_batch[idx].wait_duration) { @@ -156,7 +167,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, return; } }; - self.fsm.tick_registry |= key; + self.fsm.tick_registry[idx] = true; let logger = self.fsm.logger().clone(); // TODO: perhaps following allocation can be removed. 
let cb = Box::new(move || { @@ -177,9 +188,21 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, fn on_start(&mut self) { self.schedule_tick(PeerTick::Raft); + self.schedule_tick(PeerTick::SplitRegionCheck); + self.schedule_tick(PeerTick::PdHeartbeat); + self.schedule_tick(PeerTick::CompactLog); if self.fsm.peer.storage().is_initialized() { self.fsm.peer.schedule_apply_fsm(self.store_ctx); } + // Unlike v1, it's a must to set ready when there are pending entries. Otherwise + // it may block for ever when there is unapplied conf change. + let entry_storage = self.fsm.peer.storage().entry_storage(); + if entry_storage.commit_index() > entry_storage.applied_index() + // Speed up setup if there is only one peer. + || self.fsm.peer.is_leader() + { + self.fsm.peer.set_has_ready(); + } } #[inline] @@ -191,16 +214,17 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, } fn on_tick(&mut self, tick: PeerTick) { + self.fsm.tick_registry[tick as usize] = false; match tick { PeerTick::Raft => self.on_raft_tick(), PeerTick::PdHeartbeat => self.on_pd_heartbeat(), - PeerTick::RaftLogGc => unimplemented!(), - PeerTick::SplitRegionCheck => unimplemented!(), + PeerTick::CompactLog => self.on_compact_log_tick(false), + PeerTick::SplitRegionCheck => self.on_split_region_check(), PeerTick::CheckMerge => unimplemented!(), PeerTick::CheckPeerStaleState => unimplemented!(), - PeerTick::EntryCacheEvict => unimplemented!(), + PeerTick::EntryCacheEvict => self.on_entry_cache_evict(), PeerTick::CheckLeaderLease => unimplemented!(), - PeerTick::ReactivateMemoryLock => unimplemented!(), + PeerTick::ReactivateMemoryLock => self.on_reactivate_memory_lock_tick(), PeerTick::ReportBuckets => unimplemented!(), PeerTick::CheckLongUncommitted => unimplemented!(), } @@ -209,18 +233,40 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, pub fn on_msgs(&mut self, peer_msgs_buf: &mut Vec) { for msg in 
peer_msgs_buf.drain(..) { match msg { - PeerMsg::RaftMessage(msg) => self.fsm.peer.on_raft_message(self.store_ctx, msg), + PeerMsg::RaftMessage(msg) => { + self.fsm.peer.on_raft_message(self.store_ctx, msg); + } PeerMsg::RaftQuery(cmd) => { self.on_receive_command(cmd.send_time); self.on_query(cmd.request, cmd.ch) } - PeerMsg::RaftCommand(cmd) => { + PeerMsg::AdminCommand(cmd) => { self.on_receive_command(cmd.send_time); - self.on_command(cmd.request, cmd.ch) + self.fsm + .peer_mut() + .on_admin_command(self.store_ctx, cmd.request, cmd.ch) + } + PeerMsg::SimpleWrite(write) => { + self.on_receive_command(write.send_time); + self.fsm.peer_mut().on_simple_write( + self.store_ctx, + write.header, + write.data, + write.ch, + ); + } + PeerMsg::UnsafeWrite(write) => { + self.on_receive_command(write.send_time); + self.fsm + .peer_mut() + .on_unsafe_write(self.store_ctx, write.data); } PeerMsg::Tick(tick) => self.on_tick(tick), PeerMsg::ApplyRes(res) => self.fsm.peer.on_apply_res(self.store_ctx, res), PeerMsg::SplitInit(msg) => self.fsm.peer.on_split_init(self.store_ctx, msg), + PeerMsg::SplitInitFinish(region_id) => { + self.fsm.peer.on_split_init_finish(region_id) + } PeerMsg::Start => self.on_start(), PeerMsg::Noop => unimplemented!(), PeerMsg::Persisted { @@ -237,11 +283,64 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, self.fsm.peer_mut().on_snapshot_generated(snap_res) } PeerMsg::QueryDebugInfo(ch) => self.fsm.peer_mut().on_query_debug_info(ch), + PeerMsg::DataFlushed { + cf, + tablet_index, + flushed_index, + } => { + self.fsm + .peer_mut() + .on_data_flushed(cf, tablet_index, flushed_index); + } + PeerMsg::PeerUnreachable { to_peer_id } => { + self.fsm.peer_mut().on_peer_unreachable(to_peer_id) + } + PeerMsg::StoreUnreachable { to_store_id } => { + self.fsm.peer_mut().on_store_unreachable(to_store_id) + } + PeerMsg::SnapshotSent { to_peer_id, status } => { + self.fsm.peer_mut().on_snapshot_sent(to_peer_id, status) + } + 
PeerMsg::RequestSplit { request, ch } => { + self.fsm + .peer_mut() + .on_request_split(self.store_ctx, request, ch) + } + PeerMsg::ForceCompactLog => self.on_compact_log_tick(true), #[cfg(feature = "testexport")] PeerMsg::WaitFlush(ch) => self.fsm.peer_mut().on_wait_flush(ch), } } // TODO: instead of propose pending commands immediately, we should use timeout. self.fsm.peer.propose_pending_writes(self.store_ctx); + self.schedule_pending_ticks(); + } + + pub fn on_reactivate_memory_lock_tick(&mut self) { + let mut pessimistic_locks = self.fsm.peer.txn_ext().pessimistic_locks.write(); + + // If it is not leader, we needn't reactivate by tick. In-memory pessimistic + // lock will be enabled when this region becomes leader again. + // And this tick is currently only used for the leader transfer failure case. + if !self.fsm.peer().is_leader() + || pessimistic_locks.status != LocksStatus::TransferringLeader + { + return; + } + + self.fsm.reactivate_memory_lock_ticks += 1; + let transferring_leader = self.fsm.peer.raft_group().raft.lead_transferee.is_some(); + // `lead_transferee` is not set immediately after the lock status changes. So, + // we need the tick count condition to avoid reactivating too early. + if !transferring_leader + && self.fsm.reactivate_memory_lock_ticks + >= self.store_ctx.cfg.reactive_memory_lock_timeout_tick + { + pessimistic_locks.status = LocksStatus::Normal; + self.fsm.reactivate_memory_lock_ticks = 0; + } else { + drop(pessimistic_locks); + self.schedule_tick(PeerTick::ReactivateMemoryLock); + } } } diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index 546ec95a604..cb7aa99b179 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -1,15 +1,19 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::time::{Duration, SystemTime}; +use std::{ + collections::BTreeMap, + ops::Bound::{Excluded, Unbounded}, + time::{Duration, SystemTime}, +}; use batch_system::Fsm; use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; use futures::{compat::Future01CompatExt, FutureExt}; -use kvproto::{metapb::Region, raft_serverpb::RaftMessage}; -use raftstore::{ - coprocessor::RegionChangeReason, - store::{Config, ReadDelegate, RegionReadProgressRegistry}, +use keys::{data_end_key, data_key}; +use kvproto::metapb::Region; +use raftstore::store::{ + fsm::store::StoreRegionMeta, Config, ReadDelegate, RegionReadProgressRegistry, }; use slog::{info, o, Logger}; use tikv_util::{ @@ -20,43 +24,100 @@ use tikv_util::{ use crate::{ batch::StoreContext, - raft::Peer, router::{StoreMsg, StoreTick}, - tablet::CachedTablet, }; -pub struct StoreMeta -where - E: KvEngine, -{ - pub store_id: Option, +pub struct StoreMeta { + pub store_id: u64, /// region_id -> reader pub readers: HashMap, - /// region_id -> tablet cache - pub tablet_caches: HashMap>, /// region_id -> `RegionReadProgress` pub region_read_progress: RegionReadProgressRegistry, + /// (region_end_key, epoch.version) -> region_id + /// + /// Unlinke v1, ranges in v2 may be overlapped. So we use version + /// to avoid end key conflict. 
+ pub(crate) region_ranges: BTreeMap<(Vec, u64), u64>, + /// region_id -> (region, initialized) + pub(crate) regions: HashMap, } -impl StoreMeta -where - E: KvEngine, -{ - pub fn new() -> StoreMeta { +impl StoreMeta { + pub fn new(store_id: u64) -> StoreMeta { StoreMeta { - store_id: None, + store_id, readers: HashMap::default(), - tablet_caches: HashMap::default(), - region_read_progress: RegionReadProgressRegistry::new(), + region_read_progress: RegionReadProgressRegistry::default(), + region_ranges: BTreeMap::default(), + regions: HashMap::default(), + } + } + + pub fn set_region(&mut self, region: &Region, initialized: bool, logger: &Logger) { + let region_id = region.get_id(); + let version = region.get_region_epoch().get_version(); + let prev = self + .regions + .insert(region_id, (region.clone(), initialized)); + // `prev` only makes sense when it's initialized. + if let Some((prev, prev_init)) = prev && prev_init { + assert!(initialized, "{:?} region corrupted", logger.list()); + if prev.get_region_epoch().get_version() != version { + let prev_id = self.region_ranges.remove(&(data_end_key(prev.get_end_key()), prev.get_region_epoch().get_version())); + assert_eq!(prev_id, Some(region_id), "{:?} region corrupted", logger.list()); + } else { + assert!(self.region_ranges.get(&(data_end_key(prev.get_end_key()), version)).is_some(), "{:?} region corrupted", logger.list()); + return; + } + } + if initialized { + assert!( + self.region_ranges + .insert((data_end_key(region.get_end_key()), version), region_id) + .is_none(), + "{:?} region corrupted", + logger.list() + ); } } } -impl Default for StoreMeta { - fn default() -> Self { - Self::new() +impl StoreRegionMeta for StoreMeta { + #[inline] + fn store_id(&self) -> u64 { + self.store_id + } + + #[inline] + fn region_read_progress(&self) -> &RegionReadProgressRegistry { + &self.region_read_progress + } + + #[inline] + fn search_region( + &self, + start_key: &[u8], + end_key: &[u8], + mut visitor: impl 
FnMut(&kvproto::metapb::Region), + ) { + let start_key = data_key(start_key); + for (_, id) in self + .region_ranges + .range((Excluded((start_key, 0)), Unbounded::<(Vec, u64)>)) + { + let (region, initialized) = &self.regions[id]; + if !initialized { + continue; + } + if end_key.is_empty() || end_key > region.get_start_key() { + visitor(region); + } else { + break; + } + } } } + pub struct Store { id: u64, // Unix time when it's started. @@ -186,6 +247,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { StoreMsg::Tick(tick) => self.on_tick(tick), StoreMsg::RaftMessage(msg) => self.fsm.store.on_raft_message(self.store_ctx, msg), StoreMsg::SplitInit(msg) => self.fsm.store.on_split_init(self.store_ctx, msg), + StoreMsg::StoreUnreachable { to_store_id } => self + .fsm + .store + .on_store_unreachable(self.store_ctx, to_store_id), } } } diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index 7dea9d55901..7ddb1687d91 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -21,10 +21,11 @@ // Functionalities like read, write, etc should be implemented in [`operation`] // using a standalone modules. 
-#![allow(unused)] #![feature(let_chains)] #![feature(array_windows)] #![feature(div_duration)] +#![feature(box_into_inner)] +#![feature(assert_matches)] mod batch; mod bootstrap; @@ -32,11 +33,12 @@ mod fsm; mod operation; mod raft; pub mod router; -mod tablet; mod worker; pub(crate) use batch::StoreContext; pub use batch::{create_store_batch_system, StoreRouter, StoreSystem}; pub use bootstrap::Bootstrap; pub use fsm::StoreMeta; -pub use raftstore::{Error, Result}; +pub use operation::{SimpleWriteBinary, SimpleWriteEncoder, StateStorage}; +pub use raftstore::{store::Config, Error, Result}; +pub use worker::pd::{FlowReporter, Task as PdTask}; diff --git a/components/raftstore-v2/src/operation/command/admin/compact_log.rs b/components/raftstore-v2/src/operation/command/admin/compact_log.rs new file mode 100644 index 00000000000..d1d10d366bf --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/compact_log.rs @@ -0,0 +1,319 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module contains processing logic of the following: +//! +//! # `CompactLog` and `EntryCacheEvict` ticks +//! +//! On region leader, periodically compacts useless Raft logs from the +//! underlying log engine, and evicts logs from entry cache if it reaches memory +//! limit. +//! +//! # `CompactLog` command +//! +//! Updates truncated index, and compacts logs if the corresponding changes have +//! been persisted in kvdb. 
+ +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; +use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest}; +use protobuf::Message; +use raftstore::{ + store::{fsm::new_admin_request, needs_evict_entry_cache, Transport, WriteTask}, + Result, +}; +use slog::{debug, error, info}; +use tikv_util::{box_err, Either}; + +use crate::{ + batch::StoreContext, + fsm::{ApplyResReporter, PeerFsmDelegate}, + operation::AdminCmdResult, + raft::{Apply, Peer}, + router::{CmdResChannel, PeerTick}, + worker::tablet_gc, +}; + +impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { + pub fn on_compact_log_tick(&mut self, force: bool) { + if !self.fsm.peer().is_leader() { + // `compact_cache_to` is called when apply, there is no need to call + // `compact_to` here, snapshot generating has already been cancelled + // when the role becomes follower. + return; + } + self.schedule_tick(PeerTick::CompactLog); + + self.fsm + .peer_mut() + .maybe_propose_compact_log(self.store_ctx, force); + + self.on_entry_cache_evict(); + } + + pub fn on_entry_cache_evict(&mut self) { + if needs_evict_entry_cache(self.store_ctx.cfg.evict_cache_on_memory_ratio) { + self.fsm + .peer_mut() + .entry_storage_mut() + .evict_entry_cache(true); + if !self.fsm.peer().entry_storage().is_entry_cache_empty() { + self.schedule_tick(PeerTick::EntryCacheEvict); + } + } + } +} + +impl Peer { + // Mirrors v1::on_raft_gc_log_tick. + fn maybe_propose_compact_log( + &mut self, + store_ctx: &mut StoreContext, + force: bool, + ) { + // As leader, we would not keep caches for the peers that didn't response + // heartbeat in the last few seconds. That happens probably because + // another TiKV is down. In this case if we do not clean up the cache, + // it may keep growing. 
+ let drop_cache_duration = + store_ctx.cfg.raft_heartbeat_interval() + store_ctx.cfg.raft_entry_cache_life_time.0; + let cache_alive_limit = std::time::Instant::now() - drop_cache_duration; + + // Leader will replicate the compact log command to followers, + // If we use current replicated_index (like 10) as the compact index, + // when we replicate this log, the newest replicated_index will be 11, + // but we only compact the log to 10, not 11, at that time, + // the first index is 10, and replicated_index is 11, with an extra log, + // and we will do compact again with compact index 11, in cycles... + // So we introduce a threshold, if replicated index - first index > threshold, + // we will try to compact log. + // raft log entries[..............................................] + // ^ ^ + // |-----------------threshold------------ | + // first_index replicated_index + // `alive_cache_idx` is the smallest `replicated_index` of healthy up nodes. + // `alive_cache_idx` is only used to gc cache. + let applied_idx = self.entry_storage().applied_index(); + let truncated_idx = self.entry_storage().truncated_index(); + let first_idx = self.entry_storage().first_index(); + let last_idx = self.entry_storage().last_index(); + + let (mut replicated_idx, mut alive_cache_idx) = (last_idx, last_idx); + for (peer_id, p) in self.raft_group().raft.prs().iter() { + if replicated_idx > p.matched { + replicated_idx = p.matched; + } + if self.peer_heartbeat_is_fresh(*peer_id, &cache_alive_limit) { + if alive_cache_idx > p.matched && p.matched >= truncated_idx { + alive_cache_idx = p.matched; + } else if p.matched == 0 { + // the new peer is still applying snapshot, do not compact cache now + alive_cache_idx = 0; + } + } + } + + // When an election happened or a new peer is added, replicated_idx can be 0. 
+ if replicated_idx > 0 { + assert!( + last_idx >= replicated_idx, + "expect last index {} >= replicated index {}", + last_idx, + replicated_idx + ); + } + + // leader may call `get_term()` on the latest replicated index, so compact + // entries before `alive_cache_idx` instead of `alive_cache_idx + 1`. + self.entry_storage_mut() + .compact_entry_cache(std::cmp::min(alive_cache_idx, applied_idx + 1)); + + let mut compact_idx = if force && replicated_idx > first_idx { + replicated_idx + } else if applied_idx > first_idx + && applied_idx - first_idx >= store_ctx.cfg.raft_log_gc_count_limit() + || self.approximate_raft_log_size() >= store_ctx.cfg.raft_log_gc_size_limit().0 + { + std::cmp::max(first_idx + (last_idx - first_idx) / 2, replicated_idx) + } else if replicated_idx < first_idx + || last_idx - first_idx < 3 + || replicated_idx - first_idx < store_ctx.cfg.raft_log_gc_threshold + && self.maybe_skip_compact_log(store_ctx.cfg.raft_log_reserve_max_ticks) + { + return; + } else { + replicated_idx + }; + assert!(compact_idx >= first_idx); + // Have no idea why subtract 1 here, but original code did this by magic. + compact_idx -= 1; + if compact_idx < first_idx { + return; + } + + // Create a compact log request and notify directly. 
+ // TODO: move this into a function + let term = self.raft_group().raft.raft_log.term(compact_idx).unwrap(); + + let mut req = new_admin_request(self.region_id(), self.peer().clone()); + let mut admin = AdminRequest::default(); + admin.set_cmd_type(AdminCmdType::CompactLog); + admin.mut_compact_log().set_compact_index(compact_idx); + admin.mut_compact_log().set_compact_term(term); + req.set_admin_request(admin); + + let (ch, _) = CmdResChannel::pair(); + self.on_admin_command(store_ctx, req, ch); + + self.reset_skip_compact_log_ticks(); + } +} + +#[derive(Debug)] +pub struct CompactLogResult { + index: u64, + compact_index: u64, + compact_term: u64, +} + +impl Peer { + pub fn propose_compact_log( + &mut self, + store_ctx: &mut StoreContext, + req: RaftCmdRequest, + ) -> Result { + let compact_log = req.get_admin_request().get_compact_log(); + // TODO: add unit tests to cover all the message integrity checks. + if compact_log.get_compact_term() == 0 { + info!( + self.logger, + "compact term missing, skip"; + "command" => ?compact_log + ); + // old format compact log command, safe to ignore. 
+ return Err(box_err!( + "command format is outdated, please upgrade leader" + )); + } + + let data = req.write_to_bytes().unwrap(); + self.propose(store_ctx, data) + } +} + +impl Apply { + pub fn apply_compact_log( + &mut self, + req: &AdminRequest, + index: u64, + ) -> Result<(AdminResponse, AdminCmdResult)> { + Ok(( + AdminResponse::default(), + AdminCmdResult::CompactLog(CompactLogResult { + index, + compact_index: req.get_compact_log().get_compact_index(), + compact_term: req.get_compact_log().get_compact_term(), + }), + )) + } +} + +impl Peer { + pub fn on_apply_res_compact_log( + &mut self, + store_ctx: &mut StoreContext, + res: CompactLogResult, + ) { + let first_index = self.entry_storage().first_index(); + if res.compact_index <= first_index { + debug!( + self.logger, + "compact index <= first index, no need to compact"; + "compact_index" => res.compact_index, + "first_index" => first_index, + ); + return; + } + // TODO: check is_merging + // TODO: check entry_cache_warmup_state + self.entry_storage_mut() + .compact_entry_cache(res.compact_index); + self.storage_mut() + .cancel_generating_snap_due_to_compacted(res.compact_index); + + let truncated_state = self + .entry_storage_mut() + .apply_state_mut() + .mut_truncated_state(); + let old_truncated = truncated_state.get_index(); + truncated_state.set_index(res.compact_index); + truncated_state.set_term(res.compact_term); + + let region_id = self.region_id(); + // TODO: get around this clone. 
+ let apply_state = self.entry_storage().apply_state().clone(); + self.state_changes_mut() + .put_apply_state(region_id, res.index, &apply_state) + .unwrap(); + self.set_has_extra_write(); + + self.maybe_compact_log_from_engine(store_ctx, Either::Right(old_truncated)); + } + + #[inline] + pub fn on_advance_persisted_apply_index( + &mut self, + store_ctx: &mut StoreContext, + old_persisted: u64, + task: &mut WriteTask, + ) { + let new_persisted = self.storage().apply_trace().persisted_apply_index(); + if old_persisted < new_persisted { + let region_id = self.region_id(); + // TODO: batch it. + if let Err(e) = store_ctx.engine.delete_all_but_one_states_before( + region_id, + new_persisted, + self.state_changes_mut(), + ) { + error!(self.logger, "failed to delete raft states"; "err" => ?e); + } else { + self.set_has_extra_write(); + } + self.maybe_compact_log_from_engine(store_ctx, Either::Left(old_persisted)); + if self.remove_tombstone_tablets_before(new_persisted) { + let sched = store_ctx.schedulers.tablet_gc.clone(); + task.persisted_cbs.push(Box::new(move || { + let _ = sched.schedule(tablet_gc::Task::destroy(region_id, new_persisted)); + })) + } + } + } + + pub fn maybe_compact_log_from_engine( + &mut self, + store_ctx: &mut StoreContext, + old_index: Either, + ) { + let truncated = self.entry_storage().truncated_index(); + let persisted = self.storage().apply_trace().persisted_apply_index(); + match old_index { + Either::Left(old_persisted) if old_persisted >= truncated => return, + Either::Right(old_truncated) if old_truncated >= persisted => return, + _ => {} + } + let compact_index = std::cmp::min(truncated, persisted); + // Raft Engine doesn't care about first index. 
+ if let Err(e) = + store_ctx + .engine + .gc(self.region_id(), 0, compact_index, self.state_changes_mut()) + { + error!(self.logger, "failed to compact raft logs"; "err" => ?e); + } else { + self.set_has_extra_write(); + let applied = self.storage().apply_state().get_applied_index(); + let total_cnt = applied - self.storage().entry_storage().first_index() + 1; + let remain_cnt = applied - compact_index; + self.update_approximate_raft_log_size(|s| s * remain_cnt / total_cnt); + } + } +} diff --git a/components/raftstore-v2/src/operation/command/admin/conf_change.rs b/components/raftstore-v2/src/operation/command/admin/conf_change.rs index 69e318c3a2e..5a6c91d3567 100644 --- a/components/raftstore-v2/src/operation/command/admin/conf_change.rs +++ b/components/raftstore-v2/src/operation/command/admin/conf_change.rs @@ -9,7 +9,6 @@ use std::time::Instant; -use collections::HashSet; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ metapb::{self, PeerRole}, @@ -18,8 +17,8 @@ use kvproto::{ }; use protobuf::Message; use raft::prelude::*; -use raft_proto::ConfChangeI; use raftstore::{ + coprocessor::{RegionChangeEvent, RegionChangeReason}, store::{ metrics::{PEER_ADMIN_CMD_COUNTER_VEC, PEER_PROPOSE_LOG_SIZE_HISTOGRAM}, util::{self, ChangePeerI, ConfChangeKind}, @@ -34,7 +33,6 @@ use super::AdminCmdResult; use crate::{ batch::StoreContext, raft::{Apply, Peer}, - router::ApplyRes, }; /// The apply result of conf change. 
@@ -56,7 +54,7 @@ impl Peer { pub fn propose_conf_change( &mut self, ctx: &mut StoreContext, - mut req: RaftCmdRequest, + req: RaftCmdRequest, ) -> Result { if self.raft_group().raft.has_pending_conf() { info!( @@ -67,7 +65,6 @@ impl Peer { } let data = req.write_to_bytes()?; let admin = req.get_admin_request(); - let leader_role = self.peer().get_role(); if admin.has_change_peer() { self.propose_conf_change_imp(ctx, admin.get_change_peer(), data) } else if admin.has_change_peer_v2() { @@ -186,6 +183,11 @@ impl Peer { self.set_has_ready(); } } + ctx.coprocessor_host.on_region_changed( + self.region(), + RegionChangeEvent::Update(RegionChangeReason::ChangePeer), + self.raft_group().raft.state, + ); if remove_self { self.mark_for_destroy(None); } @@ -229,7 +231,6 @@ impl Apply { legacy: bool, ) -> Result<(AdminResponse, AdminCmdResult)> { let region = self.region_state().get_region(); - let peer_id = self.peer().get_id(); let change_kind = ConfChangeKind::confchange_kind(changes.len()); info!(self.logger, "exec ConfChangeV2"; "kind" => ?change_kind, "legacy" => legacy, "epoch" => ?region.get_region_epoch()); let mut new_region = region.clone(); @@ -284,7 +285,7 @@ impl Apply { } let mut resp = AdminResponse::default(); resp.mut_change_peer().set_region(new_region); - let mut conf_change = ConfChangeResult { + let conf_change = ConfChangeResult { index, conf_change: cc, changes: changes.to_vec(), diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index eb6560d239e..977e26e0675 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -1,36 +1,32 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+mod compact_log; mod conf_change; mod split; +mod transfer_leader; +use compact_log::CompactLogResult; +use conf_change::ConfChangeResult; use engine_traits::{KvEngine, RaftEngine}; -use kvproto::raft_cmdpb::{AdminCmdType, AdminRequest, RaftCmdRequest}; +use kvproto::raft_cmdpb::{AdminCmdType, RaftCmdRequest}; use protobuf::Message; -use raft::prelude::ConfChangeV2; -use raftstore::{ - store::{ - self, cmd_resp, - fsm::apply, - msg::ErrorCallback, - util::{ChangePeerI, ConfChangeKind}, - }, - Result, -}; +use raftstore::store::{cmd_resp, fsm::apply, msg::ErrorCallback}; use slog::info; -pub use split::{SplitInit, SplitResult}; +use split::SplitResult; +pub use split::{temp_split_path, RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX}; use tikv_util::box_err; +use txn_types::WriteBatchFlags; -use self::conf_change::ConfChangeResult; -use crate::{ - batch::StoreContext, - raft::{Apply, Peer}, - router::CmdResChannel, -}; +use crate::{batch::StoreContext, raft::Peer, router::CmdResChannel}; #[derive(Debug)] pub enum AdminCmdResult { + // No side effect produced by the command + None, SplitRegion(SplitResult), ConfChange(ConfChangeResult), + TransferLeader(u64), + CompactLog(CompactLogResult), } impl Peer { @@ -45,7 +41,21 @@ impl Peer { apply::notify_req_region_removed(self.region_id(), ch); return; } - if let Err(e) = self.validate_command(&req, &mut ctx.raft_metrics) { + if !req.has_admin_request() { + let e = box_err!("{:?} expect only execute admin command", self.logger.list()); + let resp = cmd_resp::new_error(e); + ch.report_error(resp); + return; + } + if let Err(e) = ctx.coprocessor_host.pre_propose(self.region(), &mut req) { + let resp = cmd_resp::new_error(e.into()); + ch.report_error(resp); + return; + } + let cmd_type = req.get_admin_request().get_cmd_type(); + if let Err(e) = + self.validate_command(req.get_header(), Some(cmd_type), &mut ctx.raft_metrics) + { let resp = cmd_resp::new_error(e); ch.report_error(resp); return; @@ -65,7 +75,6 @@ impl 
Peer { ch.report_error(resp); return; } - let cmd_type = req.get_admin_request().get_cmd_type(); if let Some(conflict) = self.proposal_control_mut().check_conflict(Some(cmd_type)) { conflict.delay_channel(ch); return; @@ -81,6 +90,22 @@ impl Peer { "Split is deprecated. Please use BatchSplit instead." )), AdminCmdType::BatchSplit => self.propose_split(ctx, req), + AdminCmdType::TransferLeader => { + // Containing TRANSFER_LEADER_PROPOSAL flag means the this transfer leader + // request should be proposed to the raft group + if WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) + .contains(WriteBatchFlags::TRANSFER_LEADER_PROPOSAL) + { + let data = req.write_to_bytes().unwrap(); + self.propose(ctx, data) + } else { + if self.propose_transfer_leader(ctx, req, ch) { + self.set_has_ready(); + } + return; + } + } + AdminCmdType::CompactLog => self.propose_compact_log(ctx, req), _ => unimplemented!(), } }; diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 2782b436439..e1f4ae552f6 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -25,43 +25,48 @@ //! created by the store, and here init it using the data sent from the parent //! peer. 
-use std::collections::VecDeque; +use std::{borrow::Cow, cmp, path::PathBuf}; -use crossbeam::channel::{SendError, TrySendError}; +use collections::HashSet; +use crossbeam::channel::SendError; use engine_traits::{ - Checkpointer, DeleteStrategy, KvEngine, OpenOptions, RaftEngine, RaftLogBatch, Range, - CF_DEFAULT, SPLIT_PREFIX, + Checkpointer, KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, }; use fail::fail_point; -use keys::enc_end_key; use kvproto::{ metapb::{self, Region, RegionEpoch}, + pdpb::CheckPolicy, raft_cmdpb::{AdminRequest, AdminResponse, RaftCmdRequest, SplitRequest}, - raft_serverpb::RegionLocalState, + raft_serverpb::RaftSnapshotData, }; use protobuf::Message; -use raft::RawNode; +use raft::{prelude::Snapshot, INVALID_ID}; use raftstore::{ coprocessor::RegionChangeReason, store::{ - fsm::apply::validate_batch_split, + cmd_resp, + fsm::{apply::validate_batch_split, ApplyMetrics}, metrics::PEER_ADMIN_CMD_COUNTER, + snap::TABLET_SNAPSHOT_VERSION, util::{self, KeysInfoFormatter}, - PeerPessimisticLocks, PeerStat, ProposalContext, RAFT_INIT_LOG_INDEX, + PeerPessimisticLocks, SplitCheckTask, Transport, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }, Result, }; -use slog::{error, info, warn, Logger}; -use tikv_util::box_err; +use slog::info; use crate::{ batch::StoreContext, fsm::{ApplyResReporter, PeerFsmDelegate}, operation::AdminCmdResult, - raft::{write_initial_states, Apply, Peer, Storage}, - router::{ApplyRes, PeerMsg, StoreMsg}, + raft::{Apply, Peer}, + router::{CmdResChannel, PeerMsg, PeerTick, StoreMsg}, + worker::tablet_gc, + Error, }; +pub const SPLIT_PREFIX: &str = "split_"; + #[derive(Debug)] pub struct SplitResult { pub regions: Vec, @@ -69,17 +74,140 @@ pub struct SplitResult { pub derived_index: usize, pub tablet_index: u64, } + +#[derive(Debug)] pub struct SplitInit { /// Split region pub region: metapb::Region, pub check_split: bool, - pub parent_is_leader: bool, + pub scheduled: bool, + pub source_leader: bool, + pub 
source_id: u64, /// In-memory pessimistic locks that should be inherited from parent region pub locks: PeerPessimisticLocks, } +impl SplitInit { + fn to_snapshot(&self) -> Snapshot { + let mut snapshot = Snapshot::default(); + // Set snapshot metadata. + snapshot.mut_metadata().set_term(RAFT_INIT_LOG_TERM); + snapshot.mut_metadata().set_index(RAFT_INIT_LOG_INDEX); + let conf_state = util::conf_state_from_region(&self.region); + snapshot.mut_metadata().set_conf_state(conf_state); + // Set snapshot data. + let mut snap_data = RaftSnapshotData::default(); + snap_data.set_region(self.region.clone()); + snap_data.set_version(TABLET_SNAPSHOT_VERSION); + snap_data.mut_meta().set_for_balance(false); + snapshot.set_data(snap_data.write_to_bytes().unwrap().into()); + snapshot + } +} + +#[derive(Debug)] +pub struct RequestSplit { + pub epoch: RegionEpoch, + pub split_keys: Vec>, + pub source: Cow<'static, str>, +} + +#[derive(Default, Debug)] +pub struct SplitFlowControl { + size_diff_hint: i64, + skip_split_count: u64, + may_skip_split_check: bool, +} + +pub fn temp_split_path(registry: &TabletRegistry, region_id: u64) -> PathBuf { + let tablet_name = registry.tablet_name(SPLIT_PREFIX, region_id, RAFT_INIT_LOG_INDEX); + registry.tablet_root().join(tablet_name) +} + +impl PeerFsmDelegate<'_, EK, ER, T> { + pub fn on_split_region_check(&mut self) { + if !self.fsm.peer_mut().on_split_region_check(self.store_ctx) { + self.schedule_tick(PeerTick::SplitRegionCheck) + } + } +} + impl Peer { + /// Handle split check. + /// + /// Returns true means the check tick is consumed, no need to schedule + /// another tick. 
+ pub fn on_split_region_check(&mut self, ctx: &mut StoreContext) -> bool { + if !self.is_leader() { + return true; + } + let is_generating_snapshot = self.storage().is_generating_snapshot(); + let control = self.split_flow_control_mut(); + if control.may_skip_split_check + && control.size_diff_hint < ctx.cfg.region_split_check_diff().0 as i64 + { + return true; + } + if ctx.schedulers.split_check.is_busy() { + return false; + } + if is_generating_snapshot && control.skip_split_count < 3 { + control.skip_split_count += 1; + return false; + } + let task = + SplitCheckTask::split_check(self.region().clone(), true, CheckPolicy::Scan, None); + if let Err(e) = ctx.schedulers.split_check.schedule(task) { + info!(self.logger, "failed to schedule split check"; "err" => ?e); + } + let control = self.split_flow_control_mut(); + control.may_skip_split_check = true; + control.size_diff_hint = 0; + control.skip_split_count = 0; + false + } + + pub fn update_split_flow_control(&mut self, metrics: &ApplyMetrics) { + let control = self.split_flow_control_mut(); + control.size_diff_hint += metrics.size_diff_hint; + } + + pub fn on_request_split( + &mut self, + ctx: &mut StoreContext, + rs: RequestSplit, + ch: CmdResChannel, + ) { + info!( + self.logger, + "on split"; + "split_keys" => %KeysInfoFormatter(rs.split_keys.iter()), + "source" => %&rs.source, + ); + if !self.is_leader() { + // region on this store is no longer leader, skipped. 
+ info!(self.logger, "not leader, skip."); + ch.set_result(cmd_resp::new_error(Error::NotLeader( + self.region_id(), + self.leader(), + ))); + return; + } + if let Err(e) = util::validate_split_region( + self.region_id(), + self.peer_id(), + self.region(), + &rs.epoch, + &rs.split_keys, + ) { + info!(self.logger, "invalid split request"; "err" => ?e, "source" => %&rs.source); + ch.set_result(cmd_resp::new_error(e)); + return; + } + self.ask_batch_split_pd(ctx, rs.split_keys, ch); + } + pub fn propose_split( &mut self, store_ctx: &mut StoreContext, @@ -201,17 +329,14 @@ impl Apply { ) }); + let reg = self.tablet_registry(); for new_region in ®ions { let new_region_id = new_region.id; if new_region_id == region_id { continue; } - let split_temp_path = self.tablet_factory().tablet_path_with_prefix( - SPLIT_PREFIX, - new_region_id, - RAFT_INIT_LOG_INDEX, - ); + let split_temp_path = temp_split_path(reg, new_region_id); checkpointer .create_at(&split_temp_path, None, 0) .unwrap_or_else(|e| { @@ -224,23 +349,33 @@ impl Apply { }); } - let derived_path = self.tablet_factory().tablet_path(region_id, log_index); - checkpointer - .create_at(&derived_path, None, 0) - .unwrap_or_else(|e| { - panic!( - "{:?} fails to create checkpoint with path {:?}: {:?}", - self.logger.list(), - derived_path, - e - ) - }); - let tablet = self - .tablet_factory() - .open_tablet(region_id, Some(log_index), OpenOptions::default()) - .unwrap(); + let derived_path = self.tablet_registry().tablet_path(region_id, log_index); + // If it's recovered from restart, it's possible the target path exists already. + // And because checkpoint is atomic, so we don't need to worry about corruption. + // And it's also wrong to delete it and remake as it may has applied and flushed + // some data to the new checkpoint before being restarted. 
+ if !derived_path.exists() { + checkpointer + .create_at(&derived_path, None, 0) + .unwrap_or_else(|e| { + panic!( + "{:?} fails to create checkpoint with path {:?}: {:?}", + self.logger.list(), + derived_path, + e + ) + }); + } // Remove the old write batch. - self.write_batch_mut().take(); + self.write_batch.take(); + let reg = self.tablet_registry(); + let path = reg.tablet_path(region_id, log_index); + let mut ctx = TabletContext::new(®ions[derived_index], Some(log_index)); + // Now the tablet is flushed, so all previous states should be persisted. + // Reusing the tablet should not be a problem. + // TODO: Should we avoid flushing for the old tablet? + ctx.flush_state = Some(self.flush_state().clone()); + let tablet = reg.tablet_factory().open_tablet(ctx, &path).unwrap(); self.publish_tablet(tablet); self.region_state_mut() @@ -263,16 +398,14 @@ impl Apply { } impl Peer { - pub fn on_ready_split_region( + pub fn on_apply_res_split( &mut self, store_ctx: &mut StoreContext, - derived_index: usize, - tablet_index: u64, - regions: Vec, + res: SplitResult, ) { fail_point!("on_split", self.peer().get_store_id() == 3, |_| {}); - let derived = ®ions[derived_index]; + let derived = &res.regions[res.derived_index]; let derived_epoch = derived.get_region_epoch().clone(); let region_id = derived.get_id(); @@ -286,20 +419,20 @@ impl Peer { // Update the version so the concurrent reader will fail due to EpochNotMatch // instead of PessimisticLockNotFound. pessimistic_locks.version = derived_epoch.get_version(); - pessimistic_locks.group_by_regions(®ions, derived) + pessimistic_locks.group_by_regions(&res.regions, derived) }; fail_point!("on_split_invalidate_locks"); - // Roughly estimate the size and keys for new regions. 
- let new_region_count = regions.len() as u64; { let mut meta = store_ctx.store_meta.lock().unwrap(); + meta.set_region(derived, true, &self.logger); let reader = meta.readers.get_mut(&derived.get_id()).unwrap(); self.set_region( + &store_ctx.coprocessor_host, reader, derived.clone(), RegionChangeReason::Split, - tablet_index, + res.tablet_index, ); } @@ -311,26 +444,38 @@ impl Peer { info!( self.logger, "notify pd with split"; - "region_id" => self.region_id(), - "peer_id" => self.peer_id(), - "split_count" => regions.len(), + "split_count" => res.regions.len(), ); // Now pd only uses ReportBatchSplit for history operation show, // so we send it independently here. - self.report_batch_split_pd(store_ctx, regions.to_vec()); + self.report_batch_split_pd(store_ctx, res.regions.to_vec()); + self.add_pending_tick(PeerTick::SplitRegionCheck); } - let last_region_id = regions.last().unwrap().get_id(); - for (new_region, locks) in regions.into_iter().zip(region_locks) { + self.record_tablet_as_tombstone_and_refresh(res.tablet_index, store_ctx); + let _ = store_ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::trim( + self.tablet().unwrap().clone(), + derived, + )); + + let last_region_id = res.regions.last().unwrap().get_id(); + let mut new_ids = HashSet::default(); + for (new_region, locks) in res.regions.into_iter().zip(region_locks) { let new_region_id = new_region.get_id(); if new_region_id == region_id { continue; } + new_ids.insert(new_region_id); let split_init = PeerMsg::SplitInit(Box::new(SplitInit { region: new_region, - parent_is_leader: self.is_leader(), + source_leader: self.is_leader(), + source_id: region_id, check_split: last_region_id == new_region_id, + scheduled: false, locks, })); @@ -353,108 +498,122 @@ impl Peer { _ => unreachable!(), } } + self.split_trace_mut().push((res.tablet_index, new_ids)); + let region_state = self.storage().region_state().clone(); + self.state_changes_mut() + .put_region_state(region_id, res.tablet_index, 
®ion_state) + .unwrap(); + self.set_has_extra_write(); } pub fn on_split_init( &mut self, store_ctx: &mut StoreContext, - split_init: Box, + mut split_init: Box, ) { let region_id = split_init.region.id; - let replace = split_init.region.get_region_epoch().get_version() - > self - .storage() - .region_state() - .get_region() - .get_region_epoch() - .get_version(); - - if !self.storage().is_initialized() || replace { - let split_temp_path = store_ctx.tablet_factory.tablet_path_with_prefix( - SPLIT_PREFIX, - region_id, - RAFT_INIT_LOG_INDEX, - ); - - let tablet = store_ctx - .tablet_factory - .load_tablet(&split_temp_path, region_id, RAFT_INIT_LOG_INDEX) - .unwrap_or_else(|e| { - panic!( - "{:?} fails to load tablet {:?} :{:?}", - self.logger.list(), - split_temp_path, - e - ) - }); - - self.tablet_mut().set(tablet); - - let storage = Storage::with_split( - self.peer().get_store_id(), - &split_init.region, - store_ctx.engine.clone(), - store_ctx.read_scheduler.clone(), - &store_ctx.logger, - ) - .unwrap_or_else(|e| panic!("fail to create storage: {:?}", e)) - .unwrap(); - - let applied_index = storage.apply_state().get_applied_index(); - let peer_id = storage.peer().get_id(); - let raft_cfg = store_ctx.cfg.new_raft_config(peer_id, applied_index); - - let mut raft_group = RawNode::new(&raft_cfg, storage, &self.logger).unwrap(); - // If this region has only one peer and I am the one, campaign directly. - if split_init.region.get_peers().len() == 1 { - raft_group.campaign().unwrap(); - self.set_has_ready(); - } - self.set_raft_group(raft_group); - } else { - // TODO: when reaching here (peer is initalized before and cannot be replaced), - // it is much complexer. + if self.storage().is_initialized() && self.persisted_index() >= RAFT_INIT_LOG_INDEX { + // Race with split operation. The tablet created by split will eventually be + // deleted (TODO). We don't trim it. 
+ let _ = store_ctx + .router + .force_send(split_init.source_id, PeerMsg::SplitInitFinish(region_id)); return; } - { - let mut meta = store_ctx.store_meta.lock().unwrap(); + if self.storage().is_initialized() || self.raft_group().snap().is_some() { + // It accepts a snapshot already but not finish applied yet. + let prev = self.storage_mut().split_init_mut().replace(split_init); + assert!(prev.is_none(), "{:?}", prev); + return; + } - info!( - self.logger, - "init split region"; - "region" => ?split_init.region, + split_init.scheduled = true; + let snap = split_init.to_snapshot(); + let mut msg = raft::eraftpb::Message::default(); + msg.set_to(self.peer_id()); + msg.set_from(self.leader_id()); + msg.set_msg_type(raft::eraftpb::MessageType::MsgSnapshot); + msg.set_snapshot(snap); + msg.set_term(cmp::max(self.term(), RAFT_INIT_LOG_TERM)); + let res = self.raft_group_mut().step(msg); + let accept_snap = self.raft_group().snap().is_some(); + if res.is_err() || !accept_snap { + panic!( + "{:?} failed to accept snapshot {:?} with error {}", + self.logger.list(), + res, + accept_snap ); + } + let prev = self.storage_mut().split_init_mut().replace(split_init); + assert!(prev.is_none(), "{:?}", prev); + self.set_has_ready(); + } - // TODO: GlobalReplicationState - - for p in split_init.region.get_peers() { - self.insert_peer_cache(p.clone()); - } - - if split_init.parent_is_leader { - if self.maybe_campaign() { - self.set_has_ready(); - } - - *self.txn_ext().pessimistic_locks.write() = split_init.locks; - // The new peer is likely to become leader, send a heartbeat immediately to - // reduce client query miss. 
- self.region_heartbeat_pd(store_ctx); - } + pub fn post_split_init( + &mut self, + store_ctx: &mut StoreContext, + split_init: Box, + ) { + let _ = store_ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::trim( + self.tablet().unwrap().clone(), + self.region(), + )); + if split_init.source_leader + && self.leader_id() == INVALID_ID + && self.term() == RAFT_INIT_LOG_TERM + { + let _ = self.raft_group_mut().campaign(); + self.set_has_ready(); - meta.tablet_caches.insert(region_id, self.tablet().clone()); - meta.readers - .insert(region_id, self.generate_read_delegate()); - meta.region_read_progress - .insert(region_id, self.read_progress().clone()); + *self.txn_ext().pessimistic_locks.write() = split_init.locks; + // The new peer is likely to become leader, send a heartbeat immediately to + // reduce client query miss. + self.region_heartbeat_pd(store_ctx); } + let region_id = self.region_id(); if split_init.check_split { - // TODO: check if the last region needs to split again + self.add_pending_tick(PeerTick::SplitRegionCheck); } + let _ = store_ctx + .router + .force_send(split_init.source_id, PeerMsg::SplitInitFinish(region_id)); + } - self.schedule_apply_fsm(store_ctx); + pub fn on_split_init_finish(&mut self, region_id: u64) { + let mut found = false; + for (_, ids) in self.split_trace_mut() { + if ids.remove(®ion_id) { + found = true; + break; + } + } + assert!(found, "{:?} {}", self.logger.list(), region_id); + let split_trace = self.split_trace_mut(); + let mut off = 0; + let mut admin_flushed = 0; + for (tablet_index, ids) in split_trace.iter() { + if !ids.is_empty() { + break; + } + admin_flushed = *tablet_index; + off += 1; + } + if off > 0 { + // There should be very few elements in the vector. + split_trace.drain(..off); + assert_ne!(admin_flushed, 0); + self.storage_mut() + .apply_trace_mut() + .on_admin_flush(admin_flushed); + // Persist admin flushed. 
+ self.set_has_extra_write(); + } } } @@ -465,35 +624,28 @@ mod test { Arc, }; - use collections::HashMap; use engine_test::{ ctor::{CfOptions, DbOptions}, - kv::TestTabletFactoryV2, - raft, + kv::TestTabletFactory, + }; + use engine_traits::{ + Peekable, TabletContext, TabletRegistry, WriteBatch, CF_DEFAULT, DATA_CFS, }; - use engine_traits::{CfOptionsExt, Peekable, TabletFactory, WriteBatch, ALL_CFS}; - use futures::channel::mpsc::unbounded; use kvproto::{ metapb::RegionEpoch, - raft_cmdpb::{AdminCmdType, BatchSplitRequest, PutRequest, RaftCmdResponse, SplitRequest}, - raft_serverpb::{PeerState, RaftApplyState, RegionLocalState}, + raft_cmdpb::{BatchSplitRequest, SplitRequest}, + raft_serverpb::{PeerState, RegionLocalState}, }; - use raftstore::store::{cmd_resp::new_error, Config, ReadRunner}; + use raftstore::store::cmd_resp::new_error; use slog::o; use tempfile::TempDir; use tikv_util::{ - codec::bytes::encode_bytes, - config::VersionTrack, store::{new_learner_peer, new_peer}, - worker::{dummy_future_scheduler, dummy_scheduler, FutureScheduler, Scheduler, Worker}, + worker::dummy_scheduler, }; use super::*; - use crate::{ - fsm::{ApplyFsm, ApplyResReporter}, - raft::Apply, - tablet::CachedTablet, - }; + use crate::{fsm::ApplyResReporter, raft::Apply, router::ApplyRes}; struct MockReporter { sender: Sender, @@ -522,7 +674,6 @@ mod test { fn assert_split( apply: &mut Apply, - factory: &Arc, parent_id: u64, right_derived: bool, new_region_ids: Vec, @@ -565,8 +716,9 @@ mod test { let state = apply.region_state(); assert_eq!(state.tablet_index, log_index); assert_eq!(state.get_region(), region); - let tablet_path = factory.tablet_path(region.id, log_index); - assert!(factory.exists_raw(&tablet_path)); + let reg = apply.tablet_registry(); + let tablet_path = reg.tablet_path(region.id, log_index); + assert!(reg.tablet_factory().exists(&tablet_path)); match apply_res { AdminCmdResult::SplitRegion(SplitResult { @@ -586,9 +738,10 @@ mod test { } child_idx += 1; - let 
tablet_path = - factory.tablet_path_with_prefix(SPLIT_PREFIX, region.id, RAFT_INIT_LOG_INDEX); - assert!(factory.exists_raw(&tablet_path)); + let reg = apply.tablet_registry(); + let tablet_name = reg.tablet_name(SPLIT_PREFIX, region.id, RAFT_INIT_LOG_INDEX); + let path = reg.tablet_root().join(tablet_name); + assert!(reg.tablet_factory().exists(&path)); } } } @@ -606,24 +759,15 @@ mod test { let logger = slog_global::borrow_global().new(o!()); let path = TempDir::new().unwrap(); - let cf_opts = ALL_CFS + let cf_opts = DATA_CFS .iter() .copied() .map(|cf| (cf, CfOptions::default())) .collect(); - let factory = Arc::new(TestTabletFactoryV2::new( - path.path(), - DbOptions::default(), - cf_opts, - )); - - let tablet = factory - .open_tablet( - region.id, - Some(5), - OpenOptions::default().set_create_new(true), - ) - .unwrap(); + let factory = Box::new(TestTabletFactory::new(DbOptions::default(), cf_opts)); + let reg = TabletRegistry::new(factory, path.path()).unwrap(); + let ctx = TabletContext::new(®ion, Some(5)); + reg.load(ctx, true).unwrap(); let mut region_state = RegionLocalState::default(); region_state.set_state(PeerState::Normal); @@ -641,9 +785,10 @@ mod test { .clone(), region_state, reporter, - CachedTablet::new(Some(tablet)), - factory.clone(), + reg, read_scheduler, + Arc::default(), + None, logger.clone(), ); @@ -803,7 +948,6 @@ mod test { assert_split( &mut apply, - &factory, parent_id, right_derive, new_region_ids, @@ -818,17 +962,22 @@ mod test { // Split will create checkpoint tablet, so if there are some writes before // split, they should be flushed immediately. 
- apply.apply_put(CF_DEFAULT, b"k04", b"v4").unwrap(); - assert!(!WriteBatch::is_empty( - apply.write_batch_mut().as_ref().unwrap() - )); + apply.apply_put(CF_DEFAULT, 50, b"k04", b"v4").unwrap(); + assert!(!WriteBatch::is_empty(apply.write_batch.as_ref().unwrap())); splits.mut_requests().clear(); splits .mut_requests() .push(new_split_req(b"k05", 70, vec![71, 72, 73])); req.set_splits(splits); - apply.apply_batch_split(&req, 50).unwrap(); - assert!(apply.write_batch_mut().is_none()); - assert_eq!(apply.tablet().get_value(b"k04").unwrap().unwrap(), b"v4"); + apply.apply_batch_split(&req, 51).unwrap(); + assert!(apply.write_batch.is_none()); + assert_eq!( + apply + .tablet() + .get_value(&keys::data_key(b"k04")) + .unwrap() + .unwrap(), + b"v4" + ); } } diff --git a/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs new file mode 100644 index 00000000000..12bd7bbf491 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/transfer_leader.rs @@ -0,0 +1,418 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::cmp::Ordering; + +use bytes::Bytes; +use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; +use fail::fail_point; +use kvproto::{ + disk_usage::DiskUsage, + metapb, + raft_cmdpb::{ + AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest, RaftRequestHeader, + TransferLeaderRequest, + }, +}; +use parking_lot::RwLockWriteGuard; +use raft::{eraftpb, ProgressState, Storage}; +use raftstore::{ + store::{ + fsm::new_admin_request, make_transfer_leader_response, metrics::PEER_ADMIN_CMD_COUNTER, + LocksStatus, TRANSFER_LEADER_COMMAND_REPLY_CTX, + }, + Result, +}; +use rand::prelude::SliceRandom; +use slog::info; +use txn_types::WriteBatchFlags; + +use super::AdminCmdResult; +use crate::{ + batch::StoreContext, + fsm::ApplyResReporter, + operation::command::write::SimpleWriteEncoder, + raft::{Apply, Peer}, + router::{CmdResChannel, PeerMsg, PeerTick}, +}; + +fn transfer_leader_cmd(msg: &RaftCmdRequest) -> Option<&TransferLeaderRequest> { + if !msg.has_admin_request() { + return None; + } + let req = msg.get_admin_request(); + if !req.has_transfer_leader() { + return None; + } + + Some(req.get_transfer_leader()) +} + +impl Peer { + /// Return true if the transfer leader request is accepted. + /// + /// When transferring leadership begins, leader sends a pre-transfer + /// to target follower first to ensures it's ready to become leader. + /// After that the real transfer leader process begin. + /// + /// 1. pre_transfer_leader on leader: + /// Leader will send a MsgTransferLeader to follower. + /// 2. execute_transfer_leader on follower + /// If follower passes all necessary checks, it will reply an + /// ACK with type MsgTransferLeader and its promised applied index. + /// 3. ready_to_transfer_leader on leader: + /// Leader checks if it's appropriate to transfer leadership. If it + /// does, it calls raft transfer_leader API to do the remaining work. 
+ /// + /// Additional steps when there are remaining pessimistic + /// locks to propose (detected in function on_transfer_leader_msg). + /// 1. Leader firstly proposes pessimistic locks and then proposes a + /// TransferLeader command. + /// 2. The follower applies the TransferLeader command and replies an + /// ACK with special context TRANSFER_LEADER_COMMAND_REPLY_CTX. + /// + /// See also: tikv/rfcs#37. + pub fn propose_transfer_leader( + &mut self, + ctx: &mut StoreContext, + req: RaftCmdRequest, + ch: CmdResChannel, + ) -> bool { + ctx.raft_metrics.propose.transfer_leader.inc(); + + let transfer_leader = transfer_leader_cmd(&req).unwrap(); + let prs = self.raft_group().raft.prs(); + + // Find the target with the largest matched index among the candidate + // transferee peers + let (_, peers) = transfer_leader + .get_peers() + .iter() + .filter(|peer| peer.id != self.peer().id) + .fold((0, vec![]), |(max_matched, mut chosen), p| { + if let Some(pr) = prs.get(p.id) { + match pr.matched.cmp(&max_matched) { + Ordering::Greater => (pr.matched, vec![p]), + Ordering::Equal => { + chosen.push(p); + (max_matched, chosen) + } + Ordering::Less => (max_matched, chosen), + } + } else { + (max_matched, chosen) + } + }); + let peer = match peers.len() { + 0 => transfer_leader.get_peer(), + 1 => peers.get(0).unwrap(), + _ => peers.choose(&mut rand::thread_rng()).unwrap(), + }; + + let transferee = if peer.id == self.peer_id() { + false + } else { + self.pre_transfer_leader(peer) + }; + + // transfer leader command doesn't need to replicate log and apply, so we + // return immediately. 
Note that this command may fail, we can view it just as + // an advice + ch.set_result(make_transfer_leader_response()); + + transferee + } + + fn pre_transfer_leader(&mut self, peer: &metapb::Peer) -> bool { + if self.raft_group().raft.has_pending_conf() { + info!( + self.logger, + "reject transfer leader due to pending conf change"; + "peer" => ?peer, + ); + return false; + } + + // Broadcast heartbeat to make sure followers commit the entries immediately. + // It's only necessary to ping the target peer, but ping all for simplicity. + self.raft_group_mut().ping(); + + // todo: entry cache warmup + + let mut msg = eraftpb::Message::new(); + msg.set_to(peer.get_id()); + msg.set_msg_type(eraftpb::MessageType::MsgTransferLeader); + msg.set_from(self.peer_id()); + // log term here represents the term of last log. For leader, the term of last + // log is always its current term. Not just set term because raft library + // forbids setting it for MsgTransferLeader messages. + msg.set_log_term(self.term()); + self.raft_group_mut().raft.msgs.push(msg); + true + } + + pub fn on_transfer_leader_msg( + &mut self, + ctx: &mut StoreContext, + msg: &eraftpb::Message, + peer_disk_usage: DiskUsage, + ) { + // log_term is set by original leader, represents the term last log is written + // in, which should be equal to the original leader's term. 
+ if msg.get_log_term() != self.term() { + return; + } + + if !self.is_leader() { + self.execute_transfer_leader(ctx, msg.get_from(), peer_disk_usage, false); + } else { + let from = match self.peer_from_cache(msg.get_from()) { + Some(p) => p, + None => return, + }; + match self.ready_to_transfer_leader(ctx, msg.get_index(), &from) { + Some(reason) => { + info!( + self.logger, + "reject to transfer leader"; + "to" => ?from, + "reason" => reason, + "index" => msg.get_index(), + "last_index" => self.storage().last_index().unwrap_or_default(), + ); + } + None => { + self.propose_pending_writes(ctx); + if self.propose_locks_before_transfer_leader(ctx, msg) { + // If some pessimistic locks are just proposed, we propose another + // TransferLeader command instead of transferring leader immediately. + info!( + self.logger, + "propose transfer leader command"; + "to" => ?from, + ); + let mut cmd = + new_admin_request(self.region().get_id(), self.peer().clone()); + cmd.mut_header() + .set_region_epoch(self.region().get_region_epoch().clone()); + // Set this flag to propose this command like a normal proposal. 
+ cmd.mut_header() + .set_flags(WriteBatchFlags::TRANSFER_LEADER_PROPOSAL.bits()); + cmd.mut_admin_request() + .set_cmd_type(AdminCmdType::TransferLeader); + cmd.mut_admin_request().mut_transfer_leader().set_peer(from); + if let PeerMsg::AdminCommand(req) = PeerMsg::admin_command(cmd).0 { + self.on_admin_command(ctx, req.request, req.ch); + } else { + unreachable!(); + } + } else { + info!( + self.logger, + "transfer leader"; + "peer" => ?from, + ); + self.raft_group_mut().transfer_leader(from.get_id()); + self.refresh_leader_transferee(); + } + } + } + } + } + + pub fn execute_transfer_leader( + &mut self, + ctx: &mut StoreContext, + from: u64, + peer_disk_usage: DiskUsage, + reply_cmd: bool, // whether it is a reply to a TransferLeader command + ) { + let pending_snapshot = self.is_handling_snapshot() || self.has_pending_snapshot(); + if pending_snapshot + || from != self.leader_id() + // Transfer leader to node with disk full will lead to write availablity downback. + // But if the current leader is disk full, and send such request, we should allow it, + // because it may be a read leader balance request. 
+ || (!matches!(ctx.self_disk_usage, DiskUsage::Normal) && + matches!(peer_disk_usage,DiskUsage::Normal)) + { + info!( + self.logger, + "reject transferring leader"; + "from" => from, + "pending_snapshot" => pending_snapshot, + "disk_usage" => ?ctx.self_disk_usage, + ); + return; + } + + let mut msg = eraftpb::Message::new(); + msg.set_from(self.peer_id()); + msg.set_to(self.leader_id()); + msg.set_msg_type(eraftpb::MessageType::MsgTransferLeader); + msg.set_index(self.storage().apply_state().applied_index); + msg.set_log_term(self.term()); + if reply_cmd { + msg.set_context(Bytes::from_static(TRANSFER_LEADER_COMMAND_REPLY_CTX)); + } + self.raft_group_mut().raft.msgs.push(msg); + } + + fn ready_to_transfer_leader( + &self, + ctx: &mut StoreContext, + mut index: u64, + peer: &metapb::Peer, + ) -> Option<&'static str> { + let status = self.raft_group().status(); + let progress = status.progress.unwrap(); + + if !progress.conf().voters().contains(peer.id) { + return Some("non voter"); + } + + for (id, pr) in progress.iter() { + if pr.state == ProgressState::Snapshot { + return Some("pending snapshot"); + } + if *id == peer.id && index == 0 { + // index will be zero if it's sent from an instance without + // pre-transfer-leader feature. Set it to matched to make it + // possible to transfer leader to an older version. It may be + // useful during rolling restart. + index = pr.matched; + } + } + + if self.raft_group().raft.has_pending_conf() + || self.raft_group().raft.pending_conf_index > index + { + return Some("pending conf change"); + } + + if self.storage().last_index().unwrap_or_default() + >= index + ctx.cfg.leader_transfer_max_log_lag + { + return Some("log gap"); + } + None + } + + // Returns whether we should propose another TransferLeader command. This is + // for: + // - Considering the amount of pessimistic locks can be big, it can reduce + // unavailable time caused by waiting for the transferee catching up logs. 
+ // - Make transferring leader strictly after write commands that executes before + // proposing the locks, preventing unexpected lock loss. + fn propose_locks_before_transfer_leader( + &mut self, + ctx: &mut StoreContext, + msg: &eraftpb::Message, + ) -> bool { + // 1. Disable in-memory pessimistic locks. + + // Clone to make borrow checker happy when registering ticks. + let txn_ext = self.txn_ext().clone(); + let mut pessimistic_locks = txn_ext.pessimistic_locks.write(); + + // If the message context == TRANSFER_LEADER_COMMAND_REPLY_CTX, the message + // is a reply to a transfer leader command before. If the locks status remain + // in the TransferringLeader status, we can safely initiate transferring leader + // now. + // If it's not in TransferringLeader status now, it is probably because several + // ticks have passed after proposing the locks in the last time and we + // reactivate the memory locks. Then, we should propose the locks again. + if msg.get_context() == TRANSFER_LEADER_COMMAND_REPLY_CTX + && pessimistic_locks.status == LocksStatus::TransferringLeader + { + return false; + } + + // If it is not writable, it's probably because it's a retried TransferLeader + // and the locks have been proposed. But we still need to return true to + // propose another TransferLeader command. Otherwise, some write requests that + // have marked some locks as deleted will fail because raft rejects more + // proposals. + // It is OK to return true here if it's in other states like MergingRegion or + // NotLeader. In those cases, the locks will fail to propose and nothing will + // happen. + if !pessimistic_locks.is_writable() { + return true; + } + pessimistic_locks.status = LocksStatus::TransferringLeader; + self.add_pending_tick(PeerTick::ReactivateMemoryLock); + + // 2. Propose pessimistic locks + if pessimistic_locks.is_empty() { + return false; + } + // FIXME: Raft command has size limit. 
Either limit the total size of + // pessimistic locks in a region, or split commands here. + let mut encoder = SimpleWriteEncoder::with_capacity(512); + let mut lock_count = 0; + { + // Downgrade to a read guard, do not block readers in the scheduler as far as + // possible. + let pessimistic_locks = RwLockWriteGuard::downgrade(pessimistic_locks); + fail_point!("invalidate_locks_before_transfer_leader"); + for (key, (lock, deleted)) in &*pessimistic_locks { + if *deleted { + continue; + } + lock_count += 1; + encoder.put(CF_LOCK, key.as_encoded(), &lock.to_lock().to_bytes()); + } + } + if lock_count == 0 { + // If the map is not empty but all locks are deleted, it is possible that a + // write command has just marked locks deleted but not proposed yet. + // It might cause that command to fail if we skip proposing the + // extra TransferLeader command here. + return true; + } + let mut header = Box::::default(); + header.set_region_id(self.region_id()); + header.set_region_epoch(self.region().get_region_epoch().clone()); + header.set_peer(self.peer().clone()); + info!( + self.logger, + "propose {} locks before transferring leader", lock_count; + ); + let PeerMsg::SimpleWrite(write) = PeerMsg::simple_write(header, encoder.encode()).0 else {unreachable!()}; + self.on_simple_write(ctx, write.header, write.data, write.ch); + true + } +} + +impl Apply { + pub fn apply_transfer_leader( + &mut self, + req: &AdminRequest, + term: u64, + ) -> Result<(AdminResponse, AdminCmdResult)> { + PEER_ADMIN_CMD_COUNTER.transfer_leader.all.inc(); + let resp = AdminResponse::default(); + + let peer = req.get_transfer_leader().get_peer(); + // Only execute TransferLeader if the expected new leader is self. 
+ if peer.get_id() == self.peer().get_id() { + Ok((resp, AdminCmdResult::TransferLeader(term))) + } else { + Ok((resp, AdminCmdResult::None)) + } + } +} + +impl Peer { + pub fn on_transfer_leader(&mut self, ctx: &mut StoreContext, term: u64) { + // If the term has changed between proposing and executing the TransferLeader + // request, ignore it because this request may be stale. + if term != self.term() { + return; + } + + // Reply to leader that it is ready to transfer leader now. + self.execute_transfer_leader(ctx, self.leader_id(), DiskUsage::Normal, true); + + self.set_has_ready(); + } +} diff --git a/components/raftstore-v2/src/operation/command/control.rs b/components/raftstore-v2/src/operation/command/control.rs index 5fb25b4e20d..b330d0093fe 100644 --- a/components/raftstore-v2/src/operation/command/control.rs +++ b/components/raftstore-v2/src/operation/command/control.rs @@ -1,11 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::{collections::LinkedList, mem, num::NonZeroU64}; +use std::{collections::LinkedList, mem}; -use kvproto::{ - metapb, - raft_cmdpb::{AdminCmdType, RaftCmdRequest}, -}; +use kvproto::{metapb, raft_cmdpb::AdminCmdType}; use raftstore::{ store::{ cmd_resp, @@ -263,12 +260,6 @@ impl Drop for ProposalControl { mod tests { use super::*; - fn new_admin_request(cmd_type: AdminCmdType) -> RaftCmdRequest { - let mut request = RaftCmdRequest::default(); - request.mut_admin_request().set_cmd_type(cmd_type); - request - } - #[test] fn test_proposal_control() { let region = metapb::Region::default(); diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 7e69a3f1c7c..35b4ec1918e 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -16,13 +16,11 @@ //! - Applied result are sent back to peer fsm, and update memory state in //! `on_apply_res`. 
-use std::cmp; +use std::mem; -use batch_system::{Fsm, FsmScheduler, Mailbox}; use engine_traits::{KvEngine, RaftEngine, WriteBatch, WriteOptions}; -use kvproto::{ - raft_cmdpb::{AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader}, - raft_serverpb::RegionLocalState, +use kvproto::raft_cmdpb::{ + AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader, }; use protobuf::Message; use raft::eraftpb::{ConfChange, ConfChangeV2, Entry, EntryType}; @@ -31,37 +29,35 @@ use raftstore::{ store::{ cmd_resp, fsm::{ - apply::{ - self, APPLY_WB_SHRINK_SIZE, DEFAULT_APPLY_WB_SIZE, SHRINK_PENDING_CMD_QUEUE_CAP, - }, + apply::{self, APPLY_WB_SHRINK_SIZE, SHRINK_PENDING_CMD_QUEUE_CAP}, Proposal, }, local_metrics::RaftMetrics, - metrics::*, msg::ErrorCallback, - util::{self, admin_cmd_epoch_lookup}, - WriteCallback, + util, WriteCallback, }, Error, Result, }; -use slog::error; use tikv_util::{box_err, time::monotonic_raw_now}; use crate::{ batch::StoreContext, - fsm::{ApplyFsm, ApplyResReporter, PeerFsmDelegate}, - operation::GenSnapTask, + fsm::{ApplyFsm, ApplyResReporter}, raft::{Apply, Peer}, - router::{ApplyRes, ApplyTask, CmdResChannel, PeerMsg}, + router::{ApplyRes, ApplyTask, CmdResChannel}, }; mod admin; mod control; mod write; -pub use admin::{AdminCmdResult, SplitInit, SplitResult}; +pub use admin::{ + temp_split_path, AdminCmdResult, RequestSplit, SplitFlowControl, SplitInit, SPLIT_PREFIX, +}; pub use control::ProposalControl; -pub use write::{SimpleWriteDecoder, SimpleWriteEncoder}; +pub use write::{ + SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, +}; use self::write::SimpleWrite; @@ -95,23 +91,6 @@ fn new_response(header: &RaftRequestHeader) -> RaftCmdResponse { resp } -impl<'a, EK: KvEngine, ER: RaftEngine, T> PeerFsmDelegate<'a, EK, ER, T> { - #[inline] - pub fn on_command(&mut self, req: RaftCmdRequest, ch: CmdResChannel) { - if !req.get_requests().is_empty() { - self.fsm - .peer_mut() - 
.on_write_command(self.store_ctx, req, ch) - } else if req.has_admin_request() { - self.fsm - .peer_mut() - .on_admin_command(self.store_ctx, req, ch) - } else if req.has_status_request() { - error!(self.fsm.logger(), "status command should be sent by Query"); - } - } -} - impl Peer { /// Schedule an apply fsm to apply logs in the background. /// @@ -122,16 +101,16 @@ impl Peer { pub fn schedule_apply_fsm(&mut self, store_ctx: &mut StoreContext) { let region_state = self.storage().region_state().clone(); let mailbox = store_ctx.router.mailbox(self.region_id()).unwrap(); - let tablet = self.tablet().clone(); let logger = self.logger.clone(); let read_scheduler = self.storage().read_scheduler(); let (apply_scheduler, mut apply_fsm) = ApplyFsm::new( self.peer().clone(), region_state, mailbox, - tablet, - store_ctx.tablet_factory.clone(), + store_ctx.tablet_registry.clone(), read_scheduler, + self.flush_state().clone(), + self.storage().apply_trace().log_recovery(), logger, ); @@ -143,17 +122,17 @@ impl Peer { } #[inline] - fn validate_command(&self, req: &RaftCmdRequest, metrics: &mut RaftMetrics) -> Result<()> { - if let Err(e) = util::check_store_id(req, self.peer().get_store_id()) { + fn validate_command( + &self, + header: &RaftRequestHeader, + admin_type: Option, + metrics: &mut RaftMetrics, + ) -> Result<()> { + if let Err(e) = util::check_store_id(header, self.peer().get_store_id()) { metrics.invalid_proposal.mismatch_store_id.inc(); return Err(e); } - for r in req.get_requests() { - if let CmdType::Get | CmdType::Snap | CmdType::ReadIndex = r.get_cmd_type() { - return Err(box_err!("internal error: query can't be sent as command")); - } - } - if let Err(e) = util::check_peer_id(req, self.peer().get_id()) { + if let Err(e) = util::check_peer_id(header, self.peer().get_id()) { metrics.invalid_proposal.mismatch_peer_id.inc(); return Err(e); } @@ -161,12 +140,12 @@ impl Peer { metrics.invalid_proposal.not_leader.inc(); return Err(Error::NotLeader(self.region_id(), 
self.leader())); } - if let Err(e) = util::check_term(req, self.term()) { + if let Err(e) = util::check_term(header, self.term()) { metrics.invalid_proposal.stale_command.inc(); return Err(e); } - if let Err(mut e) = util::check_region_epoch(req, self.region(), true) { - if let Error::EpochNotMatch(_, new_regions) = &mut e { + if let Err(mut e) = util::check_region_epoch(header, admin_type, self.region(), true) { + if let Error::EpochNotMatch(_, _new_regions) = &mut e { // TODO: query sibling regions. metrics.invalid_proposal.epoch_not_match.inc(); } @@ -175,16 +154,6 @@ impl Peer { Ok(()) } - #[inline] - fn propose_command( - &mut self, - ctx: &mut StoreContext, - req: RaftCmdRequest, - ) -> Result { - let data = req.write_to_bytes().unwrap(); - self.propose(ctx, data) - } - #[inline] fn propose( &mut self, @@ -248,15 +217,10 @@ impl Peer { } #[inline] - pub fn schedule_apply_committed_entries( - &mut self, - ctx: &mut StoreContext, - committed_entries: Vec, - ) { - let last_entry = match committed_entries.last() { - Some(e) => e, - None => return, - }; + pub fn schedule_apply_committed_entries(&mut self, committed_entries: Vec) { + if committed_entries.is_empty() { + return; + } let current_term = self.term(); let mut entry_and_proposals = vec![]; let queue = self.proposals_mut(); @@ -282,6 +246,7 @@ impl Peer { entry_and_proposals, }; self.apply_scheduler() + .unwrap() .send(ApplyTask::CommittedEntries(apply)); } @@ -296,19 +261,26 @@ impl Peer { return; } - for admin_res in apply_res.admin_result { + for admin_res in Vec::from(apply_res.admin_result) { match admin_res { + AdminCmdResult::None => unreachable!(), AdminCmdResult::ConfChange(conf_change) => { self.on_apply_res_conf_change(ctx, conf_change) } - AdminCmdResult::SplitRegion(SplitResult { - regions, - derived_index, - tablet_index, - }) => self.on_ready_split_region(ctx, derived_index, tablet_index, regions), + AdminCmdResult::SplitRegion(res) => { + self.storage_mut() + .apply_trace_mut() + 
.on_admin_modify(res.tablet_index); + self.on_apply_res_split(ctx, res) + } + AdminCmdResult::TransferLeader(term) => self.on_transfer_leader(ctx, term), + AdminCmdResult::CompactLog(res) => self.on_apply_res_compact_log(ctx, res), } } + self.update_split_flow_control(&apply_res.metrics); + self.update_stat(&apply_res.metrics); + self.raft_group_mut() .advance_apply_to(apply_res.applied_index); self.proposal_control_advance_apply(apply_res.applied_index); @@ -322,6 +294,7 @@ impl Peer { if !is_leader { entry_storage.compact_entry_cache(apply_res.applied_index + 1); } + self.on_data_modified(apply_res.modifications); self.handle_read_on_apply( ctx, apply_res.applied_term, @@ -331,7 +304,44 @@ impl Peer { } } +impl Apply { + #[inline] + fn should_skip(&self, off: usize, index: u64) -> bool { + let log_recovery = self.log_recovery(); + if log_recovery.is_none() { + return false; + } + log_recovery.as_ref().unwrap()[off] >= index + } +} + impl Apply { + pub fn apply_unsafe_write(&mut self, data: Box<[u8]>) { + let decoder = match SimpleWriteReqDecoder::new(&self.logger, &data, u64::MAX, u64::MAX) { + Ok(decoder) => decoder, + Err(req) => unreachable!("unexpected request: {:?}", req), + }; + for req in decoder { + match req { + SimpleWrite::Put(put) => { + let _ = self.apply_put(put.cf, u64::MAX, put.key, put.value); + } + SimpleWrite::Delete(delete) => { + let _ = self.apply_delete(delete.cf, u64::MAX, delete.key); + } + SimpleWrite::DeleteRange(dr) => { + let _ = self.apply_delete_range( + dr.cf, + u64::MAX, + dr.start_key, + dr.end_key, + dr.notify_only, + ); + } + } + } + } + #[inline] pub async fn apply_committed_entries(&mut self, ce: CommittedEntries) { fail::fail_point!("APPLY_COMMITTED_ENTRIES"); @@ -342,14 +352,14 @@ impl Apply { } if !e.get_data().is_empty() { let mut set_save_point = false; - if let Some(wb) = self.write_batch_mut() { + if let Some(wb) = &mut self.write_batch { wb.set_save_point(); set_save_point = true; } let resp = match 
self.apply_entry(&e).await { Ok(resp) => resp, Err(e) => { - if let Some(wb) = self.write_batch_mut() { + if let Some(wb) = &mut self.write_batch { if set_save_point { wb.rollback_to_save_point().unwrap(); } else { @@ -371,11 +381,12 @@ impl Apply { #[inline] async fn apply_entry(&mut self, entry: &Entry) -> Result { let mut conf_change = None; + let log_index = entry.get_index(); let req = match entry.get_entry_type() { - EntryType::EntryNormal => match SimpleWriteDecoder::new( + EntryType::EntryNormal => match SimpleWriteReqDecoder::new( &self.logger, entry.get_data(), - entry.get_index(), + log_index, entry.get_term(), ) { Ok(decoder) => { @@ -389,16 +400,21 @@ impl Apply { let res = Ok(new_response(decoder.header())); for req in decoder { match req { - SimpleWrite::Put(put) => self.apply_put(put.cf, put.key, put.value)?, + SimpleWrite::Put(put) => { + self.apply_put(put.cf, log_index, put.key, put.value)?; + } SimpleWrite::Delete(delete) => { - self.apply_delete(delete.cf, delete.key)? 
+ self.apply_delete(delete.cf, log_index, delete.key)?; + } + SimpleWrite::DeleteRange(dr) => { + self.apply_delete_range( + dr.cf, + log_index, + dr.start_key, + dr.end_key, + dr.notify_only, + )?; } - SimpleWrite::DeleteRange(dr) => self.apply_delete_range( - dr.cf, - dr.start_key, - dr.end_key, - dr.notify_only, - )?, } } return res; @@ -406,55 +422,41 @@ impl Apply { Err(req) => req, }, EntryType::EntryConfChange => { - let cc: ConfChange = parse_at( - &self.logger, - entry.get_data(), - entry.get_index(), - entry.get_term(), - ); - let req: RaftCmdRequest = parse_at( - &self.logger, - cc.get_context(), - entry.get_index(), - entry.get_term(), - ); + let cc: ConfChange = + parse_at(&self.logger, entry.get_data(), log_index, entry.get_term()); + let req: RaftCmdRequest = + parse_at(&self.logger, cc.get_context(), log_index, entry.get_term()); conf_change = Some(cc.into_v2()); req } EntryType::EntryConfChangeV2 => { - let cc: ConfChangeV2 = parse_at( - &self.logger, - entry.get_data(), - entry.get_index(), - entry.get_term(), - ); - let req: RaftCmdRequest = parse_at( - &self.logger, - cc.get_context(), - entry.get_index(), - entry.get_term(), - ); + let cc: ConfChangeV2 = + parse_at(&self.logger, entry.get_data(), log_index, entry.get_term()); + let req: RaftCmdRequest = + parse_at(&self.logger, cc.get_context(), log_index, entry.get_term()); conf_change = Some(cc); req } }; - util::check_region_epoch(&req, self.region_state().get_region(), true)?; + util::check_req_region_epoch(&req, self.region_state().get_region(), true)?; if req.has_admin_request() { let admin_req = req.get_admin_request(); let (admin_resp, admin_result) = match req.get_admin_request().get_cmd_type() { - AdminCmdType::CompactLog => unimplemented!(), - AdminCmdType::Split => self.apply_split(admin_req, entry.index)?, - AdminCmdType::BatchSplit => self.apply_batch_split(admin_req, entry.index)?, + AdminCmdType::CompactLog => self.apply_compact_log(admin_req, entry.index)?, + 
AdminCmdType::Split => self.apply_split(admin_req, log_index)?, + AdminCmdType::BatchSplit => self.apply_batch_split(admin_req, log_index)?, AdminCmdType::PrepareMerge => unimplemented!(), AdminCmdType::CommitMerge => unimplemented!(), AdminCmdType::RollbackMerge => unimplemented!(), - AdminCmdType::TransferLeader => unreachable!(), + AdminCmdType::TransferLeader => { + self.apply_transfer_leader(admin_req, entry.term)? + } AdminCmdType::ChangePeer => { - self.apply_conf_change(entry.get_index(), admin_req, conf_change.unwrap())? + self.apply_conf_change(log_index, admin_req, conf_change.unwrap())? } AdminCmdType::ChangePeerV2 => { - self.apply_conf_change_v2(entry.get_index(), admin_req, conf_change.unwrap())? + self.apply_conf_change_v2(log_index, admin_req, conf_change.unwrap())? } AdminCmdType::ComputeHash => unimplemented!(), AdminCmdType::VerifyHash => unimplemented!(), @@ -466,7 +468,10 @@ impl Apply { } }; - self.push_admin_result(admin_result); + match admin_result { + AdminCmdResult::None => (), + _ => self.push_admin_result(admin_result), + } let mut resp = new_response(req.get_header()); resp.set_admin_response(admin_resp); Ok(resp) @@ -477,16 +482,17 @@ impl Apply { // backward compatibility. 
CmdType::Put => { let put = r.get_put(); - self.apply_put(put.get_cf(), put.get_key(), put.get_value())?; + self.apply_put(put.get_cf(), log_index, put.get_key(), put.get_value())?; } CmdType::Delete => { let delete = r.get_delete(); - self.apply_delete(delete.get_cf(), delete.get_key())?; + self.apply_delete(delete.get_cf(), log_index, delete.get_key())?; } CmdType::DeleteRange => { let dr = r.get_delete_range(); self.apply_delete_range( dr.get_cf(), + log_index, dr.get_start_key(), dr.get_end_key(), dr.get_notify_only(), @@ -501,16 +507,22 @@ impl Apply { #[inline] pub fn flush(&mut self) { - if let Some(wb) = self.write_batch_mut() && !wb.is_empty() { + let (index, term) = self.apply_progress(); + let flush_state = self.flush_state().clone(); + if let Some(wb) = &mut self.write_batch && !wb.is_empty() { let mut write_opt = WriteOptions::default(); write_opt.set_disable_wal(true); - if let Err(e) = wb.write_opt(&write_opt) { - panic!("failed to write data: {:?}", self.logger.list()); + if let Err(e) = wb.write_callback_opt(&write_opt, || { + flush_state.set_applied_index(index); + }) { + panic!("failed to write data: {:?}: {:?}", self.logger.list(), e); } + self.metrics.written_bytes += wb.data_size() as u64; + self.metrics.written_keys += wb.count() as u64; if wb.data_size() <= APPLY_WB_SHRINK_SIZE { wb.clear(); } else { - self.write_batch_mut().take(); + self.write_batch.take(); } } let callbacks = self.callbacks_mut(); @@ -521,10 +533,11 @@ impl Apply { callbacks.shrink_to(SHRINK_PENDING_CMD_QUEUE_CAP); } let mut apply_res = ApplyRes::default(); - let (index, term) = self.apply_progress(); apply_res.applied_index = index; apply_res.applied_term = term; - apply_res.admin_result = self.take_admin_result(); + apply_res.admin_result = self.take_admin_result().into_boxed_slice(); + apply_res.modifications = *self.modifications_mut(); + apply_res.metrics = mem::take(&mut self.metrics); self.res_reporter().report(apply_res); } } diff --git 
a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index 59c5679f95f..ad6e537b956 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -1,37 +1,39 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use engine_traits::{KvEngine, Mutable, RaftEngine, CF_DEFAULT}; -use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, Request}; +use kvproto::raft_cmdpb::RaftRequestHeader; use raftstore::{ store::{ cmd_resp, - fsm::{apply, Proposal, MAX_PROPOSAL_SIZE_RATIO}, + fsm::{apply, MAX_PROPOSAL_SIZE_RATIO}, msg::ErrorCallback, util::{self, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER}, - WriteCallback, }, - Error, Result, + Result, }; use crate::{ batch::StoreContext, + operation::cf_offset, raft::{Apply, Peer}, - router::CmdResChannel, + router::{ApplyTask, CmdResChannel}, }; mod simple_write; -pub use simple_write::{SimpleWriteDecoder, SimpleWriteEncoder}; +pub use simple_write::{ + SimpleWriteBinary, SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, +}; pub use self::simple_write::SimpleWrite; -use super::CommittedEntries; impl Peer { #[inline] - pub fn on_write_command( + pub fn on_simple_write( &mut self, ctx: &mut StoreContext, - mut req: RaftCmdRequest, + header: Box, + data: SimpleWriteBinary, ch: CmdResChannel, ) { if !self.serving() { @@ -39,16 +41,13 @@ impl Peer { return; } if let Some(encoder) = self.simple_write_encoder_mut() { - match encoder.amend(req) { - Ok(()) => { - encoder.add_response_channel(ch); - self.set_has_ready(); - return; - } - Err(r) => req = r, + if encoder.amend(&header, &data) { + encoder.add_response_channel(ch); + self.set_has_ready(); + return; } } - if let Err(e) = self.validate_command(&req, &mut ctx.raft_metrics) { + if let Err(e) = self.validate_command(&header, None, &mut ctx.raft_metrics) { let resp = cmd_resp::new_error(e); 
ch.report_error(resp); return; @@ -61,20 +60,37 @@ impl Peer { } // ProposalControl is reliable only when applied to current term. let call_proposed_on_success = self.applied_to_current_term(); - match SimpleWriteEncoder::new( - req, + let mut encoder = SimpleWriteReqEncoder::new( + header, + data, (ctx.cfg.raft_entry_max_size.0 as f64 * MAX_PROPOSAL_SIZE_RATIO) as usize, call_proposed_on_success, - ) { - Ok(mut encoder) => { - encoder.add_response_channel(ch); - self.set_has_ready(); - self.simple_write_encoder_mut().replace(encoder); - } - Err(req) => { - let res = self.propose_command(ctx, req); - self.post_propose_command(ctx, res, vec![ch], call_proposed_on_success); - } + ); + encoder.add_response_channel(ch); + self.set_has_ready(); + self.simple_write_encoder_mut().replace(encoder); + } + + #[inline] + pub fn on_unsafe_write( + &mut self, + ctx: &mut StoreContext, + data: SimpleWriteBinary, + ) { + if !self.serving() { + return; + } + let bin = SimpleWriteReqEncoder::new( + Box::::default(), + data, + ctx.cfg.raft_entry_max_size.0 as usize, + false, + ) + .encode() + .0 + .into_boxed_slice(); + if let Some(scheduler) = self.apply_scheduler() { + scheduler.send(ApplyTask::UnsafeWrite(bin)); } } @@ -93,7 +109,7 @@ impl Peer { NORMAL_REQ_CHECK_VER, true, ); - if let Err(mut e) = res { + if let Err(e) = res { // TODO: query sibling regions. ctx.raft_metrics.invalid_proposal.epoch_not_match.inc(); encoder.encode().1.report_error(cmd_resp::new_error(e)); @@ -111,13 +127,27 @@ impl Peer { impl Apply { #[inline] - pub fn apply_put(&mut self, cf: &str, key: &[u8], value: &[u8]) -> Result<()> { + pub fn apply_put(&mut self, cf: &str, index: u64, key: &[u8], value: &[u8]) -> Result<()> { + let off = cf_offset(cf); + if self.should_skip(off, index) { + return Ok(()); + } util::check_key_in_region(key, self.region_state().get_region())?; + // Technically it's OK to remove prefix for raftstore v2. 
But rocksdb doesn't + // support specifying infinite upper bound in various APIs. + keys::data_key_with_buffer(key, &mut self.key_buffer); + self.ensure_write_buffer(); let res = if cf.is_empty() || cf == CF_DEFAULT { // TODO: use write_vector - self.write_batch_or_default().put(key, value) + self.write_batch + .as_mut() + .unwrap() + .put(&self.key_buffer, value) } else { - self.write_batch_or_default().put_cf(cf, key, value) + self.write_batch + .as_mut() + .unwrap() + .put_cf(cf, &self.key_buffer, value) }; res.unwrap_or_else(|e| { panic!( @@ -132,17 +162,29 @@ impl Apply { fail::fail_point!("APPLY_PUT", |_| Err(raftstore::Error::Other( "aborted by failpoint".into() ))); + self.metrics.size_diff_hint += (self.key_buffer.len() + value.len()) as i64; + if index != u64::MAX { + self.modifications_mut()[off] = index; + } Ok(()) } #[inline] - pub fn apply_delete(&mut self, cf: &str, key: &[u8]) -> Result<()> { + pub fn apply_delete(&mut self, cf: &str, index: u64, key: &[u8]) -> Result<()> { + let off = cf_offset(cf); + if self.should_skip(off, index) { + return Ok(()); + } util::check_key_in_region(key, self.region_state().get_region())?; + keys::data_key_with_buffer(key, &mut self.key_buffer); let res = if cf.is_empty() || cf == CF_DEFAULT { // TODO: use write_vector - self.write_batch_or_default().delete(key) + self.write_batch.as_mut().unwrap().delete(&self.key_buffer) } else { - self.write_batch_or_default().delete_cf(cf, key) + self.write_batch + .as_mut() + .unwrap() + .delete_cf(cf, &self.key_buffer) }; res.unwrap_or_else(|e| { panic!( @@ -153,18 +195,23 @@ impl Apply { e ); }); + self.metrics.size_diff_hint -= self.key_buffer.len() as i64; + if index != u64::MAX { + self.modifications_mut()[off] = index; + } Ok(()) } #[inline] pub fn apply_delete_range( &mut self, - cf: &str, - start_key: &[u8], - end_key: &[u8], - notify_only: bool, + _cf: &str, + _index: u64, + _start_key: &[u8], + _end_key: &[u8], + _notify_only: bool, ) -> Result<()> { - /// TODO: reuse 
the same delete as split/merge. + // TODO: reuse the same delete as split/merge. Ok(()) } } diff --git a/components/raftstore-v2/src/operation/command/write/simple_write.rs b/components/raftstore-v2/src/operation/command/write/simple_write.rs index ca9e7d39366..57c01fca9d8 100644 --- a/components/raftstore-v2/src/operation/command/write/simple_write.rs +++ b/components/raftstore-v2/src/operation/command/write/simple_write.rs @@ -1,8 +1,8 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; -use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, RaftRequestHeader, Request}; -use protobuf::{CodedInputStream, Message, SingularPtrField}; +use kvproto::raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}; +use protobuf::{CodedInputStream, Message}; use raftstore::store::WriteCallback; use slog::Logger; @@ -13,83 +13,62 @@ use crate::{operation::command::parse_at, router::CmdResChannel}; // TODO: use protobuf blob request seems better. const MAGIC_PREFIX: u8 = 0x00; +#[derive(Clone, Debug)] +#[repr(transparent)] +pub struct SimpleWriteBinary { + buf: Box<[u8]>, +} + /// We usually use `RaftCmdRequest` for read write request. But the codec is /// not efficient enough for simple request. `SimpleWrite` is introduce to make /// codec alloc less and fast. #[derive(Debug)] -pub struct SimpleWriteEncoder { - header: SingularPtrField, +pub struct SimpleWriteReqEncoder { + header: Box, buf: Vec, channels: Vec, size_limit: usize, notify_proposed: bool, } -impl SimpleWriteEncoder { - /// Create an encoder. +impl SimpleWriteReqEncoder { + /// Create a request encoder. /// /// If `notify_proposed` is true, channels will be called `notify_proposed` /// when it's appended. 
pub fn new(
-        mut req: RaftCmdRequest,
+        header: Box<RaftRequestHeader>,
+        bin: SimpleWriteBinary,
         size_limit: usize,
         notify_proposed: bool,
-    ) -> Result<SimpleWriteEncoder, RaftCmdRequest> {
-        if !Self::allow_request(&req) {
-            return Err(req);
-        }
-
+    ) -> SimpleWriteReqEncoder {
         let mut buf = Vec::with_capacity(256);
         buf.push(MAGIC_PREFIX);
-        req.get_header()
-            .write_length_delimited_to_vec(&mut buf)
-            .unwrap();
+        header.write_length_delimited_to_vec(&mut buf).unwrap();
+        buf.extend_from_slice(&bin.buf);
 
-        for r in req.get_requests() {
-            encode(r, &mut buf);
-        }
-        Ok(SimpleWriteEncoder {
-            header: req.header,
+        SimpleWriteReqEncoder {
+            header,
             buf,
             channels: vec![],
             size_limit,
             notify_proposed,
-        })
-    }
-
-    fn allow_request(req: &RaftCmdRequest) -> bool {
-        if !req.has_status_request() && !req.has_admin_request() {
-            // TODO: skip the check and make caller use `SimpleWrite` directly.
-            for r in req.get_requests() {
-                if r.get_cmd_type() != CmdType::Put
-                    && r.get_cmd_type() != CmdType::Delete
-                    && r.get_cmd_type() != CmdType::DeleteRange
-                {
-                    return false;
-                }
-            }
-        } else {
-            return false;
-        };
-        true
+        }
     }
 
+    /// Encode the simple write into the buffer despite header check.
+    ///
+    /// Return false if the buffer limit is reached or the write cannot be amended.
     #[inline]
-    pub fn amend(&mut self, req: RaftCmdRequest) -> Result<(), RaftCmdRequest> {
-        if Self::allow_request(&req) && req.header == self.header {
-            let last_length = self.buf.len();
-            for r in req.get_requests() {
-                encode(r, &mut self.buf);
-            }
-            // The default size limit is 8 * 0.4 = 3.2MiB.
- if self.buf.len() < self.size_limit { - Ok(()) - } else { - self.buf.truncate(last_length); - Err(req) - } + pub fn amend(&mut self, header: &RaftRequestHeader, bin: &SimpleWriteBinary) -> bool { + if *self.header != *header { + return false; + } + if self.buf.len() + bin.buf.len() < self.size_limit { + self.buf.extend_from_slice(&bin.buf); + true } else { - Err(req) + false } } @@ -118,9 +97,7 @@ impl SimpleWriteEncoder { #[inline] pub fn header(&self) -> &RaftRequestHeader { - self.header - .as_ref() - .unwrap_or_else(|| RaftRequestHeader::default_instance()) + &self.header } } @@ -152,19 +129,63 @@ pub enum SimpleWrite<'a> { DeleteRange(DeleteRange<'a>), } +#[derive(Clone)] +pub struct SimpleWriteEncoder { + buf: Vec, +} + +impl SimpleWriteEncoder { + #[inline] + pub fn with_capacity(cap: usize) -> SimpleWriteEncoder { + SimpleWriteEncoder { + buf: Vec::with_capacity(cap), + } + } + + #[inline] + pub fn put(&mut self, cf: &str, key: &[u8], value: &[u8]) { + encode(SimpleWrite::Put(Put { cf, key, value }), &mut self.buf); + } + + #[inline] + pub fn delete(&mut self, cf: &str, key: &[u8]) { + encode(SimpleWrite::Delete(Delete { cf, key }), &mut self.buf); + } + + #[inline] + pub fn delete_range(&mut self, cf: &str, start_key: &[u8], end_key: &[u8], notify_only: bool) { + encode( + SimpleWrite::DeleteRange(DeleteRange { + cf, + start_key, + end_key, + notify_only, + }), + &mut self.buf, + ); + } + + #[inline] + pub fn encode(self) -> SimpleWriteBinary { + SimpleWriteBinary { + buf: self.buf.into_boxed_slice(), + } + } +} + #[derive(Debug)] -pub struct SimpleWriteDecoder<'a> { +pub struct SimpleWriteReqDecoder<'a> { header: RaftRequestHeader, buf: &'a [u8], } -impl<'a> SimpleWriteDecoder<'a> { +impl<'a> SimpleWriteReqDecoder<'a> { pub fn new( logger: &Logger, buf: &'a [u8], index: u64, term: u64, - ) -> Result, RaftCmdRequest> { + ) -> Result, RaftCmdRequest> { match buf.first().cloned() { Some(MAGIC_PREFIX) => { let mut is = 
CodedInputStream::from_bytes(&buf[1..]); @@ -179,7 +200,7 @@ impl<'a> SimpleWriteDecoder<'a> { ), }; let read = is.pos(); - Ok(SimpleWriteDecoder { + Ok(SimpleWriteReqDecoder { header, buf: &buf[1 + read as usize..], }) @@ -194,7 +215,7 @@ impl<'a> SimpleWriteDecoder<'a> { } } -impl<'a> Iterator for SimpleWriteDecoder<'a> { +impl<'a> Iterator for SimpleWriteReqDecoder<'a> { type Item = SimpleWrite<'a>; #[inline] @@ -310,43 +331,33 @@ fn decode_cf(buf: &[u8]) -> (&str, &[u8]) { } } -// TODO: we need a way to verify every field is encoded. -#[inline] -fn encode(req: &Request, buf: &mut Vec) { - match req.get_cmd_type() { - CmdType::Put => { +#[inline(always)] +fn encode(simple_write: SimpleWrite<'_>, buf: &mut Vec) { + match simple_write { + SimpleWrite::Put(put) => { buf.push(PUT_TAG); - let put_req = req.get_put(); - encode_cf(put_req.get_cf(), buf); - encode_bytes(put_req.get_key(), buf); - encode_bytes(put_req.get_value(), buf); + encode_cf(put.cf, buf); + encode_bytes(put.key, buf); + encode_bytes(put.value, buf); } - CmdType::Delete => { + SimpleWrite::Delete(delete) => { buf.push(DELETE_TAG); - let delete_req = req.get_delete(); - encode_cf(delete_req.get_cf(), buf); - encode_bytes(delete_req.get_key(), buf); + encode_cf(delete.cf, buf); + encode_bytes(delete.key, buf); } - CmdType::DeleteRange => { + SimpleWrite::DeleteRange(dr) => { buf.push(DELETE_RANGE_TAG); - let delete_range_req = req.get_delete_range(); - encode_cf(delete_range_req.get_cf(), buf); - encode_bytes(delete_range_req.get_start_key(), buf); - encode_bytes(delete_range_req.get_end_key(), buf); - buf.push(delete_range_req.get_notify_only() as u8); + encode_cf(dr.cf, buf); + encode_bytes(dr.start_key, buf); + encode_bytes(dr.end_key, buf); + buf.push(dr.notify_only as u8); } - CmdType::Invalid - | CmdType::Get - | CmdType::Snap - | CmdType::Prewrite - | CmdType::IngestSst - | CmdType::ReadIndex => unreachable!("not supported type should be filtered already"), } } #[inline] fn decode<'a>(buf: 
&mut &'a [u8]) -> Option> { - let (tag, mut left) = buf.split_first()?; + let (tag, left) = buf.split_first()?; match *tag { PUT_TAG => { let (cf, left) = decode_cf(left); @@ -380,57 +391,32 @@ fn decode<'a>(buf: &mut &'a [u8]) -> Option> { #[cfg(test)] mod tests { + use kvproto::raft_cmdpb::{CmdType, Request}; use slog::o; use super::*; #[test] fn test_codec() { - let mut cmd = RaftCmdRequest::default(); - cmd.mut_header().set_term(2); - - let mut req = Request::default(); - req.set_cmd_type(CmdType::Put); - let put_req = req.mut_put(); - put_req.set_cf(CF_DEFAULT.to_string()); - put_req.set_key(b"key".to_vec()); - put_req.set_value(b"".to_vec()); - cmd.mut_requests().push(req); - - req = Request::default(); - req.set_cmd_type(CmdType::Delete); - let delete_req = req.mut_delete(); + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.put(CF_DEFAULT, b"key", b""); let delete_key = vec![0; 1024]; - delete_req.set_cf(CF_WRITE.to_string()); - delete_req.set_key(delete_key.clone()); - cmd.mut_requests().push(req); - - let mut encoder = SimpleWriteEncoder::new(cmd.clone(), usize::MAX, false).unwrap(); - cmd.clear_requests(); - - req = Request::default(); - req.set_cmd_type(CmdType::DeleteRange); - let delete_range_req = req.mut_delete_range(); - delete_range_req.set_cf(CF_LOCK.to_string()); - delete_range_req.set_start_key(b"key".to_vec()); - delete_range_req.set_end_key(b"key".to_vec()); - delete_range_req.set_notify_only(true); - cmd.mut_requests().push(req); - - req = Request::default(); - req.set_cmd_type(CmdType::DeleteRange); - let delete_range_req = req.mut_delete_range(); - delete_range_req.set_cf("cf".to_string()); - delete_range_req.set_start_key(b"key".to_vec()); - delete_range_req.set_end_key(b"key".to_vec()); - delete_range_req.set_notify_only(false); - cmd.mut_requests().push(req); - - encoder.amend(cmd.clone()).unwrap(); - let (bytes, _) = encoder.encode(); + encoder.delete(CF_WRITE, &delete_key); + let bin = encoder.encode(); + + let mut 
header = Box::::default(); + header.set_term(2); + let mut req_encoder = SimpleWriteReqEncoder::new(header.clone(), bin, usize::MAX, false); + + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.delete_range(CF_LOCK, b"key", b"key", true); + encoder.delete_range("cf", b"key", b"key", false); + req_encoder.amend(&header, &encoder.encode()); + + let (bytes, _) = req_encoder.encode(); let logger = slog_global::borrow_global().new(o!()); - let mut decoder = SimpleWriteDecoder::new(&logger, &bytes, 0, 0).unwrap(); - assert_eq!(decoder.header(), cmd.get_header()); + let mut decoder = SimpleWriteReqDecoder::new(&logger, &bytes, 0, 0).unwrap(); + assert_eq!(*decoder.header(), *header); let write = decoder.next().unwrap(); let SimpleWrite::Put(put) = write else { panic!("should be put") }; assert_eq!(put.cf, CF_DEFAULT); @@ -488,38 +474,40 @@ mod tests { #[test] fn test_invalid() { - let mut invalid_cmd = RaftCmdRequest::default(); - invalid_cmd.mut_header().set_term(2); + let mut raft_cmd = RaftCmdRequest::default(); + raft_cmd.mut_header().set_term(2); let mut req = Request::default(); req.set_cmd_type(CmdType::Invalid); - invalid_cmd.mut_requests().push(req); - let fallback = SimpleWriteEncoder::new(invalid_cmd.clone(), usize::MAX, false).unwrap_err(); - let bytes = fallback.write_to_bytes().unwrap(); + raft_cmd.mut_requests().push(req); + let bytes = raft_cmd.write_to_bytes().unwrap(); let logger = slog_global::borrow_global().new(o!()); - let decoded = SimpleWriteDecoder::new(&logger, &bytes, 0, 0).unwrap_err(); - assert_eq!(decoded, invalid_cmd); + let decoded = SimpleWriteReqDecoder::new(&logger, &bytes, 0, 0).unwrap_err(); + // SimpleWriteReqDecoder should be able to decode naive RaftCmdRequest. 
+        assert_eq!(decoded, raft_cmd);
 
-        let mut valid_cmd = RaftCmdRequest::default();
-        valid_cmd.mut_header().set_term(3);
-        let mut req = Request::default();
-        req.set_cmd_type(CmdType::Put);
-        let put_req = req.mut_put();
-        put_req.set_cf(CF_DEFAULT.to_string());
-        put_req.set_key(b"key".to_vec());
-        put_req.set_value(b"".to_vec());
-        valid_cmd.mut_requests().push(req);
-        let mut encoder = SimpleWriteEncoder::new(valid_cmd.clone(), usize::MAX, false).unwrap();
-        // Only simple write command can be batched.
-        encoder.amend(invalid_cmd.clone()).unwrap_err();
-        let mut valid_cmd2 = valid_cmd.clone();
-        valid_cmd2.mut_header().set_term(4);
+        let mut encoder = SimpleWriteEncoder::with_capacity(512);
+        encoder.put(CF_DEFAULT, b"key", b"");
+        let bin = encoder.encode();
+
+        let mut header = Box::<RaftRequestHeader>::default();
+        header.set_term(2);
+        let mut req_encoder = SimpleWriteReqEncoder::new(header.clone(), bin.clone(), 512, false);
+
+        let mut header2 = Box::<RaftRequestHeader>::default();
+        header2.set_term(4);
         // Only simple write command with same header can be batched.
-        encoder.amend(valid_cmd2).unwrap_err();
+        assert!(!req_encoder.amend(&header2, &bin));
+
+        // Batch should not exceed max size limit.
+ let large_value = vec![0; 512]; + let mut encoder = SimpleWriteEncoder::with_capacity(512); + encoder.put(CF_DEFAULT, b"key", &large_value); + assert!(!req_encoder.amend(&header, &encoder.encode())); - let (bytes, _) = encoder.encode(); - let mut decoder = SimpleWriteDecoder::new(&logger, &bytes, 0, 0).unwrap(); - assert_eq!(decoder.header(), valid_cmd.get_header()); + let (bytes, _) = req_encoder.encode(); + let mut decoder = SimpleWriteReqDecoder::new(&logger, &bytes, 0, 0).unwrap(); + assert_eq!(*decoder.header(), *header); let req = decoder.next().unwrap(); let SimpleWrite::Put(put) = req else { panic!("should be put") }; assert_eq!(put.cf, CF_DEFAULT); diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 60884f63b03..ea42832eaea 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -14,12 +14,12 @@ use std::cmp; use batch_system::BasicMailbox; use crossbeam::channel::{SendError, TrySendError}; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; use kvproto::{ metapb::Region, raft_serverpb::{PeerState, RaftMessage}, }; -use raftstore::store::{util, ExtraStates, WriteTask}; +use raftstore::store::{util, WriteTask}; use slog::{debug, error, info, warn}; use tikv_util::store::find_peer; @@ -149,7 +149,6 @@ impl Store { } else { return; }; - let msg_type = msg.get_message().get_msg_type(); let from_peer = msg.get_from_peer(); let to_peer = msg.get_to_peer(); // Now the peer should not exist. 
@@ -176,7 +175,7 @@ impl Store { return; } let from_epoch = msg.get_region_epoch(); - let local_state = match ctx.engine.get_region_state(region_id) { + let local_state = match ctx.engine.get_region_state(region_id, u64::MAX) { Ok(s) => s, Err(e) => { error!(self.logger(), "failed to get region state"; "region_id" => region_id, "err" => ?e); @@ -227,10 +226,10 @@ impl Store { self.store_id(), region, ctx.engine.clone(), - ctx.read_scheduler.clone(), + ctx.schedulers.read.clone(), &ctx.logger, ) - .and_then(|s| PeerFsm::new(&ctx.cfg, &*ctx.tablet_factory, s)) + .and_then(|s| PeerFsm::new(&ctx.cfg, &ctx.tablet_registry, &ctx.snap_mgr, s)) { Ok(p) => p, res => { @@ -238,10 +237,15 @@ impl Store { return; } }; + ctx.store_meta + .lock() + .unwrap() + .set_region(fsm.peer().region(), false, fsm.logger()); let mailbox = BasicMailbox::new(tx, fsm, ctx.router.state_cnt().clone()); - if let Err((p, _)) = ctx + if ctx .router .send_and_register(region_id, mailbox, PeerMsg::Start) + .is_err() { panic!( "[region {}] {} failed to register peer", @@ -304,13 +308,20 @@ impl Peer { Some((f, l)) => Some((cmp::min(first_index, f), cmp::max(last_index, l))), }; } - let mut extra_states = ExtraStates::new(entry_storage.apply_state().clone()); + let raft_engine = self.entry_storage().raft_engine(); let mut region_state = self.storage().region_state().clone(); + let region_id = region_state.get_region().get_id(); + let lb = write_task + .extra_write + .ensure_v2(|| raft_engine.log_batch(2)); + // We only use raft-log-engine for v2, first index is not important. + let raft_state = self.entry_storage().raft_state(); + raft_engine.clean(region_id, 0, raft_state, lb).unwrap(); // Write worker will do the clean up when meeting tombstone state. 
region_state.set_state(PeerState::Tombstone); - extra_states.set_region_state(region_state); - extra_states.set_raft_state(entry_storage.raft_state().clone()); - write_task.extra_write.set_v2(extra_states); + let applied_index = self.entry_storage().applied_index(); + lb.put_region_state(region_id, applied_index, ®ion_state) + .unwrap(); self.destroy_progress_mut().start(); } @@ -325,6 +336,6 @@ impl Peer { // new peer. Ignore error as it's just a best effort. let _ = ctx.router.send_raft_message(msg); } - // TODO: close apply mailbox. + self.clear_apply_scheduler(); } } diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index 7df897f2b26..c49a14142ce 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -7,9 +7,14 @@ mod query; mod ready; pub use command::{ - AdminCmdResult, CommittedEntries, ProposalControl, SimpleWriteDecoder, SimpleWriteEncoder, + AdminCmdResult, CommittedEntries, ProposalControl, RequestSplit, SimpleWriteBinary, + SimpleWriteEncoder, SimpleWriteReqDecoder, SimpleWriteReqEncoder, SplitFlowControl, + SPLIT_PREFIX, }; pub use life::DestroyProgress; -pub use ready::{AsyncWriter, GenSnapTask, SnapState}; +pub use ready::{ + cf_offset, write_initial_states, ApplyTrace, AsyncWriter, DataTrace, GenSnapTask, SnapState, + StateStorage, +}; pub(crate) use self::{command::SplitInit, query::LocalReader}; diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index 659fab00754..894f39f278b 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -2,21 +2,18 @@ //! This module implements the interactions with pd. 
-use std::cmp; - use engine_traits::{KvEngine, RaftEngine}; use fail::fail_point; use kvproto::{metapb, pdpb}; use raftstore::store::Transport; use slog::error; -use tikv_util::time::InstantExt; use crate::{ batch::StoreContext, fsm::{PeerFsmDelegate, Store, StoreFsmDelegate}, raft::Peer, - router::{PeerTick, StoreTick}, - worker::{PdRegionHeartbeatTask, PdTask}, + router::{CmdResChannel, PeerTick, StoreTick}, + worker::pd, }; impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { @@ -41,7 +38,7 @@ impl Store { stats.set_store_id(self.store_id()); { let meta = ctx.store_meta.lock().unwrap(); - stats.set_region_count(meta.tablet_caches.len() as u32); + stats.set_region_count(meta.readers.len() as u32); } stats.set_sending_snap_count(0); @@ -55,8 +52,8 @@ impl Store { // stats.set_query_stats(query_stats); - let task = PdTask::StoreHeartbeat { stats }; - if let Err(e) = ctx.pd_scheduler.schedule(task) { + let task = pd::Task::StoreHeartbeat { stats }; + if let Err(e) = ctx.schedulers.pd.schedule(task) { error!(self.logger(), "notify pd failed"; "store_id" => self.store_id(), "err" => ?e @@ -80,7 +77,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, impl Peer { #[inline] pub fn region_heartbeat_pd(&self, ctx: &StoreContext) { - let task = PdTask::RegionHeartbeat(PdRegionHeartbeatTask { + let task = pd::Task::RegionHeartbeat(pd::RegionHeartbeatTask { term: self.term(), region: self.region().clone(), down_peers: self.collect_down_peers(ctx.cfg.max_peer_down_duration.0), @@ -92,12 +89,10 @@ impl Peer { approximate_keys: None, wait_data_peers: Vec::new(), }); - if let Err(e) = ctx.pd_scheduler.schedule(task) { + if let Err(e) = ctx.schedulers.pd.schedule(task) { error!( self.logger, "failed to notify pd"; - "region_id" => self.region_id(), - "peer_id" => self.peer_id(), "err" => ?e, ); return; @@ -151,8 +146,6 @@ impl Peer { error!( self.logger, "failed to get peer from cache"; - "region_id" => self.region_id(), - 
"peer_id" => self.peer_id(), "get_peer_id" => id, ); } @@ -163,34 +156,36 @@ impl Peer { #[inline] pub fn destroy_peer_pd(&self, ctx: &StoreContext) { - let task = PdTask::DestroyPeer { + let task = pd::Task::DestroyPeer { region_id: self.region_id(), }; - if let Err(e) = ctx.pd_scheduler.schedule(task) { + if let Err(e) = ctx.schedulers.pd.schedule(task) { error!( self.logger, "failed to notify pd with DestroyPeer"; - "region_id" => self.region_id(), - "peer_id" => self.peer_id(), "err" => %e, ); } } #[inline] - pub fn ask_batch_split_pd(&self, ctx: &StoreContext, split_keys: Vec>) { - let task = PdTask::AskBatchSplit { + pub fn ask_batch_split_pd( + &self, + ctx: &StoreContext, + split_keys: Vec>, + ch: CmdResChannel, + ) { + let task = pd::Task::AskBatchSplit { region: self.region().clone(), split_keys, peer: self.peer().clone(), right_derive: ctx.cfg.right_derive_when_split, + ch, }; - if let Err(e) = ctx.pd_scheduler.schedule(task) { + if let Err(e) = ctx.schedulers.pd.schedule(task) { error!( self.logger, "failed to notify pd with AskBatchSplit"; - "region_id" => self.region_id(), - "peer_id" => self.peer_id(), "err" => %e, ); } @@ -202,8 +197,8 @@ impl Peer { ctx: &StoreContext, regions: Vec, ) { - let task = PdTask::ReportBatchSplit { regions }; - if let Err(e) = ctx.pd_scheduler.schedule(task) { + let task = pd::Task::ReportBatchSplit { regions }; + if let Err(e) = ctx.schedulers.pd.schedule(task) { error!( self.logger, "failed to notify pd with ReportBatchSplit"; @@ -214,12 +209,12 @@ impl Peer { #[inline] pub fn update_max_timestamp_pd(&self, ctx: &StoreContext, initial_status: u64) { - let task = PdTask::UpdateMaxTimestamp { + let task = pd::Task::UpdateMaxTimestamp { region_id: self.region_id(), initial_status, txn_ext: self.txn_ext().clone(), }; - if let Err(e) = ctx.pd_scheduler.schedule(task) { + if let Err(e) = ctx.schedulers.pd.schedule(task) { error!( self.logger, "failed to notify pd with UpdateMaxTimestamp"; diff --git 
a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs index 114080bcdbb..ca92729ee6f 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ b/components/raftstore-v2/src/operation/query/lease.rs @@ -1,13 +1,13 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::sync::{Arc, Mutex}; +use std::sync::Mutex; use engine_traits::{KvEngine, RaftEngine}; use kvproto::raft_cmdpb::RaftCmdRequest; use raftstore::store::{ can_amend_read, fsm::apply::notify_stale_req, metrics::RAFT_READ_INDEX_PENDING_COUNT, msg::ReadCallback, propose_read_index, should_renew_lease, util::LeaseState, ReadDelegate, - ReadIndexRequest, ReadProgress, TrackVer, Transport, + ReadIndexRequest, ReadProgress, Transport, }; use slog::debug; use tikv_util::time::monotonic_raw_now; @@ -99,10 +99,9 @@ impl Peer { /// /// awake the read tasks waiting in frontend (such as unified thread pool) /// In v1, it's named as response_read. - pub(crate) fn respond_read_index( + pub(crate) fn respond_read_index( &self, read_index_req: &mut ReadIndexRequest, - ctx: &mut StoreContext, ) { debug!( self.logger, @@ -111,7 +110,7 @@ impl Peer { ); RAFT_READ_INDEX_PENDING_COUNT.sub(read_index_req.cmds().len() as i64); let time = monotonic_raw_now(); - for (req, ch, mut read_index) in read_index_req.take_cmds().drain(..) { + for (_, ch, mut read_index) in read_index_req.take_cmds().drain(..) { ch.read_tracker().map(|tracker| { GLOBAL_TRACKERS.with_tracker(*tracker, |t| { t.metrics.read_index_confirm_wait_nanos = (time - read_index_req.propose_time) @@ -151,7 +150,7 @@ impl Peer { pub(crate) fn maybe_renew_leader_lease( &mut self, ts: Timespec, - store_meta: &Mutex>, + store_meta: &Mutex, progress: Option, ) { // A nonleader peer should never has leader lease. 
@@ -175,45 +174,12 @@ impl Peer { self.maybe_update_read_progress(reader, progress); } if let Some(progress) = read_progress { - // TODO: remove it - self.add_reader_if_necessary(store_meta); - let mut meta = store_meta.lock().unwrap(); let reader = meta.readers.get_mut(&self.region_id()).unwrap(); self.maybe_update_read_progress(reader, progress); } } - // TODO: remove this block of code when snapshot is done; add the logic into - // on_persist_snapshot. - pub(crate) fn add_reader_if_necessary(&mut self, store_meta: &Mutex>) { - let mut meta = store_meta.lock().unwrap(); - // TODO: remove this block of code when snapshot is done; add the logic into - // on_persist_snapshot. - let reader = meta.readers.get_mut(&self.region_id()); - if reader.is_none() { - let region = self.region().clone(); - let region_id = region.get_id(); - let peer_id = self.peer_id(); - let delegate = ReadDelegate { - region: Arc::new(region), - peer_id, - term: self.term(), - applied_term: self.entry_storage().applied_term(), - leader_lease: None, - last_valid_ts: Timespec::new(0, 0), - tag: format!("[region {}] {}", region_id, peer_id), - read_progress: self.read_progress().clone(), - pending_remove: false, - bucket_meta: None, - txn_extra_op: Default::default(), - txn_ext: Default::default(), - track_ver: TrackVer::new(), - }; - meta.readers.insert(self.region_id(), delegate); - } - } - pub(crate) fn maybe_update_read_progress( &self, reader: &mut ReadDelegate, diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index 0736bc13fd8..2cb5497d789 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -8,7 +8,8 @@ use std::{ use batch_system::Router; use crossbeam::channel::TrySendError; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{CachedTablet, KvEngine, RaftEngine, TabletRegistry}; +use futures::Future; use kvproto::{ errorpb, 
raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse}, @@ -16,29 +17,26 @@ use kvproto::{ use raftstore::{ errors::RAFTSTORE_IS_BUSY, store::{ - cmd_resp, util::LeaseState, LocalReadContext, LocalReaderCore, ReadDelegate, ReadExecutor, - ReadExecutorProvider, RegionSnapshot, RequestInspector, RequestPolicy, - TLS_LOCAL_READ_METRICS, + cmd_resp, + util::LeaseState, + worker_metrics::{self, TLS_LOCAL_READ_METRICS}, + LocalReadContext, LocalReaderCore, ReadDelegate, ReadExecutor, ReadExecutorProvider, + RegionSnapshot, RequestPolicy, }, Error, Result, }; use slog::{debug, Logger}; -use tikv_util::{ - box_err, - codec::number::decode_u64, - time::{monotonic_raw_now, ThreadReadId}, -}; +use tikv_util::{box_err, codec::number::decode_u64, time::monotonic_raw_now, Either}; use time::Timespec; use txn_types::WriteBatchFlags; use crate::{ fsm::StoreMeta, router::{PeerMsg, QueryResult}, - tablet::CachedTablet, StoreRouter, }; -pub trait MsgRouter: Send { +pub trait MsgRouter: Clone + Send { fn send(&self, addr: u64, msg: PeerMsg) -> std::result::Result<(), TrySendError>; } @@ -69,16 +67,21 @@ where E: KvEngine, C: MsgRouter, { - pub fn new(store_meta: Arc>>, router: C, logger: Logger) -> Self { + pub fn new( + store_meta: Arc>, + reg: TabletRegistry, + router: C, + logger: Logger, + ) -> Self { Self { - local_reader: LocalReaderCore::new(StoreMetaDelegate::new(store_meta)), + local_reader: LocalReaderCore::new(StoreMetaDelegate::new(store_meta, reg)), router, logger, } } - pub fn store_meta(&self) -> &Arc>> { - self.local_reader.store_meta() + pub fn store_meta(&self) -> &Arc> { + &self.local_reader.store_meta().store_meta } pub fn pre_propose_raft_command( @@ -94,6 +97,8 @@ where Ok(RequestPolicy::ReadLocal) => Ok(Some((delegate, RequestPolicy::ReadLocal))), Ok(RequestPolicy::StaleRead) => Ok(Some((delegate, RequestPolicy::StaleRead))), // It can not handle other policies. + // TODO: we should only abort when lease expires. For other cases we should retry + // infinitely. 
Ok(_) => Ok(None), Err(e) => Err(e), } @@ -104,44 +109,55 @@ where fn try_get_snapshot( &mut self, - req: RaftCmdRequest, + req: &RaftCmdRequest, ) -> std::result::Result>, RaftCmdResponse> { - match self.pre_propose_raft_command(&req) { - Ok(Some((mut delegate, policy))) => match policy { - RequestPolicy::ReadLocal => { - let region = Arc::clone(&delegate.region); - let snap = RegionSnapshot::from_snapshot(delegate.get_snapshot(&None), region); - // Ensures the snapshot is acquired before getting the time - atomic::fence(atomic::Ordering::Release); - let snapshot_ts = monotonic_raw_now(); - - if !delegate.is_in_leader_lease(snapshot_ts) { - return Ok(None); + match self.pre_propose_raft_command(req) { + Ok(Some((mut delegate, policy))) => { + let mut snap = match policy { + RequestPolicy::ReadLocal => { + let region = Arc::clone(&delegate.region); + let snap = + RegionSnapshot::from_snapshot(delegate.get_snapshot(&None), region); + // Ensures the snapshot is acquired before getting the time + atomic::fence(atomic::Ordering::Release); + let snapshot_ts = monotonic_raw_now(); + + if !delegate.is_in_leader_lease(snapshot_ts) { + return Ok(None); + } + + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().local_executed_requests.inc()); + + // Try renew lease in advance + self.maybe_renew_lease_in_advance(&delegate, req, snapshot_ts); + snap } + RequestPolicy::StaleRead => { + let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); + delegate.check_stale_read_safe(read_ts)?; - TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_requests.inc()); + let region = Arc::clone(&delegate.region); + let snap = + RegionSnapshot::from_snapshot(delegate.get_snapshot(&None), region); - // Try renew lease in advance - self.maybe_renew_lease_in_advance(&delegate, &req, snapshot_ts); - Ok(Some(snap)) - } - RequestPolicy::StaleRead => { - let read_ts = decode_u64(&mut req.get_header().get_flag_data()).unwrap(); - delegate.check_stale_read_safe(read_ts)?; + 
TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().local_executed_requests.inc()); - let region = Arc::clone(&delegate.region); - let snap = RegionSnapshot::from_snapshot(delegate.get_snapshot(&None), region); + delegate.check_stale_read_safe(read_ts)?; - TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().local_executed_requests.inc()); + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().local_executed_stale_read_requests.inc()); + snap + } + _ => unreachable!(), + }; - delegate.check_stale_read_safe(read_ts)?; + snap.txn_ext = Some(delegate.txn_ext.clone()); + snap.bucket_meta = delegate.bucket_meta.clone(); - TLS_LOCAL_READ_METRICS - .with(|m| m.borrow_mut().local_executed_stale_read_requests.inc()); - Ok(Some(snap)) - } - _ => unreachable!(), - }, + Ok(Some(snap)) + } Ok(None) => Ok(None), Err(e) => { let mut response = cmd_resp::new_error(e); @@ -157,63 +173,102 @@ where } } - pub async fn snapshot( + pub fn snapshot( &mut self, mut req: RaftCmdRequest, - ) -> std::result::Result, RaftCmdResponse> { + ) -> impl Future, RaftCmdResponse>> + Send + { let region_id = req.header.get_ref().region_id; - if let Some(snap) = self.try_get_snapshot(req.clone())? { - return Ok(snap); - } + let res = match self.try_get_snapshot(&req) { + res @ (Ok(Some(_)) | Err(_)) => Either::Left(res), + Ok(None) => Either::Right((self.try_to_renew_lease(region_id, &req), self.clone())), + }; - if let Some(query_res) = self.try_to_renew_lease(region_id, &req).await? { - // If query successful, try again. - if query_res.read().is_some() { - req.mut_header().set_read_quorum(false); - if let Some(snap) = self.try_get_snapshot(req)? { - return Ok(snap); + worker_metrics::maybe_tls_local_read_metrics_flush(); + + async move { + match res { + Either::Left(Ok(Some(snap))) => Ok(snap), + Either::Left(Err(e)) => Err(e), + Either::Right((fut, mut reader)) => { + let err = match fut.await? { + Some(query_res) => { + if query_res.read().is_some() { + // If query successful, try again. 
+ req.mut_header().set_read_quorum(false); + if let Some(snap) = reader.try_get_snapshot(&req)? { + return Ok(snap); + } else { + let mut err = errorpb::Error::default(); + err.set_message(format!("no delegate found for {}", region_id)); + err + } + } else { + let QueryResult::Response(res) = query_res else { unreachable!() }; + assert!(res.get_header().has_error(), "{:?}", res); + return Err(res); + } + } + None => { + let mut err = errorpb::Error::default(); + err.set_message(format!( + "failed to extend lease: canceled: {}", + region_id + )); + err + } + }; + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(err); + Err(resp) } + Either::Left(Ok(None)) => unreachable!(), } } - - let mut err = errorpb::Error::default(); - err.set_message(format!( - "Fail to get snapshot from LocalReader for region {}. \ - Maybe due to `not leader`, `region not found` or `not applied to the current term`", - region_id - )); - let mut resp = RaftCmdResponse::default(); - resp.mut_header().set_error(err); - Err(resp) } // try to renew the lease by sending read query where the reading process may // renew the lease - async fn try_to_renew_lease( + fn try_to_renew_lease( &self, region_id: u64, req: &RaftCmdRequest, - ) -> std::result::Result, RaftCmdResponse> { - let (msg, sub) = PeerMsg::raft_query(req.clone()); - let mut err = errorpb::Error::default(); - match MsgRouter::send(&self.router, region_id, msg) { - Ok(()) => return Ok(sub.result().await), - Err(TrySendError::Full(c)) => { + ) -> impl Future, RaftCmdResponse>> { + let mut req = req.clone(); + // Remote lease is updated step by step. It's possible local reader expires + // while the raftstore doesn't. So we need to trigger an update + // explicitly. TODO: find a way to reduce the triggered heartbeats. 
+ req.mut_header().set_read_quorum(true); + let (msg, sub) = PeerMsg::raft_query(req); + let res = match MsgRouter::send(&self.router, region_id, msg) { + Ok(()) => Ok(sub), + Err(TrySendError::Full(_)) => { TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.channel_full.inc()); + let mut err = errorpb::Error::default(); err.set_message(RAFTSTORE_IS_BUSY.to_owned()); err.mut_server_is_busy() .set_reason(RAFTSTORE_IS_BUSY.to_owned()); + Err(err) } - Err(TrySendError::Disconnected(c)) => { + Err(TrySendError::Disconnected(_)) => { TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.no_region.inc()); + let mut err = errorpb::Error::default(); err.set_message(format!("region {} is missing", region_id)); err.mut_region_not_found().set_region_id(region_id); + Err(err) } - } + }; - let mut resp = RaftCmdResponse::default(); - resp.mut_header().set_error(err); - Err(resp) + async move { + match res { + Ok(sub) => Ok(sub.result().await), + Err(e) => { + let mut resp = RaftCmdResponse::default(); + resp.mut_header().set_error(e); + Err(resp) + } + } + } } // If the remote lease will be expired in near future send message @@ -231,7 +286,7 @@ where let region_id = req.header.get_ref().region_id; TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().renew_lease_advance.inc()); // Send a read query which may renew the lease - let (msg, sub) = PeerMsg::raft_query(req.clone()); + let msg = PeerMsg::raft_query(req.clone()).0; if let Err(e) = MsgRouter::send(&self.router, region_id, msg) { debug!( self.logger, @@ -300,15 +355,16 @@ struct StoreMetaDelegate where E: KvEngine, { - store_meta: Arc>>, + store_meta: Arc>, + reg: TabletRegistry, } impl StoreMetaDelegate where E: KvEngine, { - pub fn new(store_meta: Arc>>) -> StoreMetaDelegate { - StoreMetaDelegate { store_meta } + pub fn new(store_meta: Arc>, reg: TabletRegistry) -> StoreMetaDelegate { + StoreMetaDelegate { store_meta, reg } } } @@ -317,10 +373,10 @@ where E: KvEngine, { type Executor = CachedReadDelegate; - 
type StoreMeta = Arc>>; + type StoreMeta = Arc>; fn store_id(&self) -> Option { - self.store_meta.as_ref().lock().unwrap().store_id + Some(self.store_meta.as_ref().lock().unwrap().store_id) } /// get the ReadDelegate with region_id and the number of delegates in the @@ -330,7 +386,7 @@ where let reader = meta.readers.get(®ion_id).cloned(); if let Some(reader) = reader { // If reader is not None, cache must not be None. - let cached_tablet = meta.tablet_caches.get(®ion_id).cloned().unwrap(); + let cached_tablet = self.reg.get(region_id).unwrap(); return ( meta.readers.len(), Some(CachedReadDelegate { @@ -341,10 +397,6 @@ where } (meta.readers.len(), None) } - - fn store_meta(&self) -> &Self::StoreMeta { - &self.store_meta - } } struct SnapRequestInspector<'r> { @@ -431,14 +483,15 @@ mod tests { use crossbeam::{atomic::AtomicCell, channel::TrySendError}; use engine_test::{ ctor::{CfOptions, DbOptions}, - kv::{KvTestEngine, TestTabletFactoryV2}, + kv::{KvTestEngine, TestTabletFactory}, }; - use engine_traits::{MiscExt, OpenOptions, Peekable, SyncMutable, TabletFactory, ALL_CFS}; + use engine_traits::{MiscExt, Peekable, SyncMutable, TabletContext, DATA_CFS}; use futures::executor::block_on; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, raft_cmdpb::*}; + use pd_client::BucketMeta; use raftstore::store::{ - util::Lease, ReadCallback, ReadProgress, RegionReadProgress, TrackVer, TxnExt, - TLS_LOCAL_READ_METRICS, + util::Lease, worker_metrics::TLS_LOCAL_READ_METRICS, ReadCallback, ReadProgress, + RegionReadProgress, TrackVer, TxnExt, }; use slog::o; use tempfile::Builder; @@ -449,6 +502,7 @@ mod tests { use super::*; use crate::router::{QueryResult, ReadResponse}; + #[derive(Clone)] struct MockRouter { p_router: SyncSender<(u64, PeerMsg)>, } @@ -470,7 +524,8 @@ mod tests { #[allow(clippy::type_complexity)] fn new_reader( store_id: u64, - store_meta: Arc>>, + store_meta: Arc>, + reg: TabletRegistry, ) -> ( LocalReader, Receiver<(u64, PeerMsg)>, @@ -478,6 +533,7 @@ 
mod tests { let (ch, rx) = MockRouter::new(); let mut reader = LocalReader::new( store_meta, + reg, ch, Logger::root(slog::Discard, o!("key1" => "value1")), ); @@ -519,13 +575,16 @@ mod tests { match msg { // send the result back to local reader - PeerMsg::RaftQuery(query) => ReadCallback::set_result( - query.ch, - QueryResult::Read(ReadResponse { - read_index: 0, - txn_extra_op: Default::default(), - }), - ), + PeerMsg::RaftQuery(query) => { + assert!(query.request.get_header().get_read_quorum()); + ReadCallback::set_result( + query.ch, + QueryResult::Read(ReadResponse { + read_index: 0, + txn_extra_op: Default::default(), + }), + ) + } _ => unreachable!(), } ch_tx.send(rx).unwrap(); @@ -539,15 +598,16 @@ mod tests { // Building a tablet factory let ops = DbOptions::default(); - let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); let path = Builder::new() .prefix("test-local-reader") .tempdir() .unwrap(); - let factory = Arc::new(TestTabletFactoryV2::new(path.path(), ops, cf_opts)); + let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); + let reg = TabletRegistry::new(factory, path.path()).unwrap(); - let store_meta = Arc::new(Mutex::new(StoreMeta::new())); - let (mut reader, mut rx) = new_reader(store_id, store_meta.clone()); + let store_meta = Arc::new(Mutex::new(StoreMeta::new(store_id))); + let (mut reader, mut rx) = new_reader(store_id, store_meta.clone(), reg.clone()); let (mix_tx, mix_rx) = sync_channel(1); let handler = mock_raftstore(mix_rx); @@ -602,6 +662,8 @@ mod tests { // Register region 1 lease.renew(monotonic_raw_now()); let remote = lease.maybe_new_remote_lease(term6).unwrap(); + let txn_ext = Arc::new(TxnExt::default()); + let bucket_meta = Arc::new(BucketMeta::default()); { let mut meta = store_meta.as_ref().lock().unwrap(); @@ -615,19 +677,16 @@ mod tests { leader_lease: Some(remote), last_valid_ts: Timespec::new(0, 0), txn_extra_op: 
Arc::new(AtomicCell::new(TxnExtraOp::default())), - txn_ext: Arc::new(TxnExt::default()), + txn_ext: txn_ext.clone(), read_progress: read_progress.clone(), pending_remove: false, track_ver: TrackVer::new(), - bucket_meta: None, + bucket_meta: Some(bucket_meta.clone()), }; meta.readers.insert(1, read_delegate); // create tablet with region_id 1 and prepare some data - let tablet1 = factory - .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); - let cache = CachedTablet::new(Some(tablet1)); - meta.tablet_caches.insert(1, cache); + let ctx = TabletContext::new(®ion1, Some(10)); + reg.load(ctx, true).unwrap(); } let (ch_tx, ch_rx) = sync_channel(1); @@ -652,6 +711,11 @@ mod tests { // the applied term by the above thread, the snapshot will be acquired by // retrying. let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); + assert!(Arc::ptr_eq(snap.txn_ext.as_ref().unwrap(), &txn_ext)); + assert!(Arc::ptr_eq( + snap.bucket_meta.as_ref().unwrap(), + &bucket_meta + )); assert_eq!(*snap.get_region(), region1); assert_eq!( TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), @@ -680,11 +744,12 @@ mod tests { ch_tx.clone(), )) .unwrap(); - let snap = block_on(reader.snapshot(cmd.clone())).unwrap(); - // Updating lease makes cache miss. + block_on(reader.snapshot(cmd.clone())).unwrap(); + // Updating lease makes cache miss. And because the cache is updated on cloned + // copy, so the old cache will still need to be updated again. 
assert_eq!( TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.cache_miss.get()), - 4 + 5 ); assert_eq!( TLS_LOCAL_READ_METRICS.with(|m| m.borrow().reject_reason.lease_expire.get()), @@ -733,15 +798,16 @@ mod tests { fn test_read_delegate() { // Building a tablet factory let ops = DbOptions::default(); - let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); let path = Builder::new() .prefix("test-local-reader") .tempdir() .unwrap(); - let factory = Arc::new(TestTabletFactoryV2::new(path.path(), ops, cf_opts)); + let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); + let reg = TabletRegistry::new(factory, path.path()).unwrap(); let store_meta = - StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::::new()))); + StoreMetaDelegate::new(Arc::new(Mutex::new(StoreMeta::new(1))), reg.clone()); let tablet1; let tablet2; @@ -753,24 +819,20 @@ mod tests { meta.readers.insert(1, read_delegate); // create tablet with region_id 1 and prepare some data - tablet1 = factory - .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); + let mut ctx = TabletContext::with_infinite_region(1, Some(10)); + reg.load(ctx, true).unwrap(); + tablet1 = reg.get(1).unwrap().latest().unwrap().clone(); tablet1.put(b"a1", b"val1").unwrap(); - let cache = CachedTablet::new(Some(tablet1.clone())); - meta.tablet_caches.insert(1, cache); // Create read_delegate with region id 2 let read_delegate = ReadDelegate::mock(2); meta.readers.insert(2, read_delegate); // create tablet with region_id 1 and prepare some data - tablet2 = factory - .open_tablet(2, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); + ctx = TabletContext::with_infinite_region(2, Some(10)); + reg.load(ctx, true).unwrap(); + tablet2 = reg.get(2).unwrap().latest().unwrap().clone(); tablet2.put(b"a2", b"val2").unwrap(); - let cache = CachedTablet::new(Some(tablet2.clone())); - 
meta.tablet_caches.insert(2, cache); } let (_, delegate) = store_meta.get_executor_and_len(1); diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 77ca7b90074..59c6f2d0f7c 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -11,27 +11,25 @@ //! Follower's read index and replica read is implemenented replica module. //! Leader's read index and lease renew is implemented in lease module. -use std::{cmp, sync::Arc}; +use std::cmp; use crossbeam::channel::TrySendError; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ errorpb, raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, StatusCmdType}, - raft_serverpb::RaftApplyState, }; -use raft::Ready; +use raft::{Ready, StateRole}; use raftstore::{ errors::RAFTSTORE_IS_BUSY, store::{ - cmd_resp, fsm::ApplyMetrics, local_metrics::RaftMetrics, - metrics::RAFT_READ_INDEX_PENDING_COUNT, msg::ErrorCallback, region_meta::RegionMeta, util, - util::LeaseState, GroupState, ReadCallback, ReadIndexContext, ReadProgress, RequestPolicy, - Transport, + cmd_resp, local_metrics::RaftMetrics, metrics::RAFT_READ_INDEX_PENDING_COUNT, + msg::ErrorCallback, region_meta::RegionMeta, util, util::LeaseState, GroupState, + ReadIndexContext, ReadProgress, RequestPolicy, Transport, }, Error, Result, }; -use slog::info; +use slog::{debug, info}; use tikv_util::box_err; use txn_types::WriteBatchFlags; @@ -40,8 +38,7 @@ use crate::{ fsm::PeerFsmDelegate, raft::Peer, router::{ - message::RaftRequest, ApplyRes, DebugInfoChannel, PeerMsg, QueryResChannel, QueryResult, - ReadResponse, + message::RaftRequest, DebugInfoChannel, PeerMsg, QueryResChannel, QueryResult, ReadResponse, }, }; @@ -131,7 +128,7 @@ impl Peer { } // Check store_id, make sure that the msg is dispatched to the right place. 
- if let Err(e) = util::check_store_id(msg, self.peer().get_store_id()) { + if let Err(e) = util::check_store_id(msg.get_header(), self.peer().get_store_id()) { raft_metrics.invalid_proposal.mismatch_store_id.inc(); return Err(e); } @@ -146,7 +143,6 @@ impl Peer { // TODO: add flashback_state check // Check whether the store has the right peer to handle the request. - let leader_id = self.leader_id(); let request = msg.get_requests(); // TODO: add force leader @@ -158,11 +154,11 @@ impl Peer { let allow_replica_read = msg.get_header().get_replica_read(); if !self.is_leader() && !is_read_index_request && !allow_replica_read { raft_metrics.invalid_proposal.not_leader.inc(); - return Err(Error::NotLeader(self.region_id(), None)); + return Err(Error::NotLeader(self.region_id(), self.leader())); } // peer_id must be the same as peer's. - if let Err(e) = util::check_peer_id(msg, self.peer_id()) { + if let Err(e) = util::check_peer_id(msg.get_header(), self.peer_id()) { raft_metrics.invalid_proposal.mismatch_peer_id.inc(); return Err(e); } @@ -170,13 +166,13 @@ impl Peer { // TODO: check applying snapshot // Check whether the term is stale. 
- if let Err(e) = util::check_term(msg, self.term()) { + if let Err(e) = util::check_term(msg.get_header(), self.term()) { raft_metrics.invalid_proposal.stale_command.inc(); return Err(e); } // TODO: add check of sibling region for split - util::check_region_epoch(msg, self.region(), true) + util::check_req_region_epoch(msg, self.region(), true) } // For these cases it won't be proposed: @@ -186,7 +182,7 @@ impl Peer { fn read_index( &mut self, ctx: &mut StoreContext, - mut req: RaftCmdRequest, + req: RaftCmdRequest, ch: QueryResChannel, ) { // TODO: add pre_read_index to handle splitting or merging @@ -222,7 +218,7 @@ impl Peer { if self.ready_to_handle_read() { while let Some(mut read) = self.pending_reads_mut().pop_front() { - self.respond_read_index(&mut read, ctx); + self.respond_read_index(&mut read); } } } @@ -264,9 +260,9 @@ impl Peer { && read.cmds()[0].0.get_requests()[0].get_cmd_type() == CmdType::ReadIndex; if is_read_index_request { - self.respond_read_index(&mut read, ctx); + self.respond_read_index(&mut read); } else if self.ready_to_handle_unsafe_replica_read(read.read_index.unwrap()) { - self.respond_replica_read(&mut read, ctx); + self.respond_replica_read(&mut read); } else { // TODO: `ReadIndex` requests could be blocked. self.pending_reads_mut().push_front(read); @@ -344,7 +340,7 @@ impl Peer { } fn query_status(&mut self, req: &RaftCmdRequest, resp: &mut RaftCmdResponse) -> Result<()> { - util::check_store_id(req, self.peer().get_store_id())?; + util::check_store_id(req.get_header(), self.peer().get_store_id())?; let cmd_type = req.get_status_request().get_cmd_type(); let status_resp = resp.mut_status_response(); status_resp.set_cmd_type(cmd_type); @@ -379,16 +375,22 @@ impl Peer { /// Query internal states for debugging purpose. 
pub fn on_query_debug_info(&self, ch: DebugInfoChannel) { let entry_storage = self.storage().entry_storage(); + let mut status = self.raft_group().status(); + status + .progress + .get_or_insert_with(|| self.raft_group().raft.prs()); let mut meta = RegionMeta::new( self.storage().region_state(), entry_storage.apply_state(), GroupState::Ordered, - self.raft_group().status(), + status, + self.raft_group().raft.raft_log.last_index(), + self.raft_group().raft.raft_log.persisted, ); // V2 doesn't persist commit index and term, fill them with in-memory values. meta.raft_apply.commit_index = cmp::min( self.raft_group().raft.raft_log.committed, - self.raft_group().raft.raft_log.persisted, + self.persisted_index(), ); meta.raft_apply.commit_term = self .raft_group() @@ -396,6 +398,10 @@ impl Peer { .raft_log .term(meta.raft_apply.commit_index) .unwrap(); + debug!(self.logger, "on query debug info"; + "tick" => self.raft_group().raft.election_elapsed, + "election_timeout" => self.raft_group().raft.randomized_election_timeout(), + ); ch.set_result(meta); } @@ -416,7 +422,7 @@ impl Peer { self.post_pending_read_index_on_replica(ctx) } else if self.ready_to_handle_read() { while let Some(mut read) = self.pending_reads_mut().pop_front() { - self.respond_read_index(&mut read, ctx); + self.respond_read_index(&mut read); } } self.pending_reads_mut().gc(); @@ -424,10 +430,11 @@ impl Peer { // Only leaders need to update applied_term. 
if progress_to_be_updated && self.is_leader() { - // TODO: add coprocessor_host hook + if applied_term == self.term() { + ctx.coprocessor_host + .on_applied_current_term(StateRole::Leader, self.region()); + } let progress = ReadProgress::applied_term(applied_term); - // TODO: remove it - self.add_reader_if_necessary(&ctx.store_meta); let mut meta = ctx.store_meta.lock().unwrap(); let reader = meta.readers.get_mut(&self.region_id()).unwrap(); self.maybe_update_read_progress(reader, progress); diff --git a/components/raftstore-v2/src/operation/query/replica.rs b/components/raftstore-v2/src/operation/query/replica.rs index 9433cd10c52..fb00adbbc5a 100644 --- a/components/raftstore-v2/src/operation/query/replica.rs +++ b/components/raftstore-v2/src/operation/query/replica.rs @@ -62,10 +62,9 @@ impl Peer { self.set_has_ready(); } - pub(crate) fn respond_replica_read( + pub(crate) fn respond_replica_read( &self, read_index_req: &mut ReadIndexRequest, - ctx: &mut StoreContext, ) { debug!( self.logger, @@ -74,7 +73,7 @@ impl Peer { ); RAFT_READ_INDEX_PENDING_COUNT.sub(read_index_req.cmds().len() as i64); let time = monotonic_raw_now(); - for (req, ch, mut read_index) in read_index_req.take_cmds().drain(..) { + for (req, ch, _) in read_index_req.take_cmds().drain(..) { ch.read_tracker().map(|tracker| { GLOBAL_TRACKERS.with_tracker(*tracker, |t| { t.metrics.read_index_confirm_wait_nanos = (time - read_index_req.propose_time) diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs new file mode 100644 index 00000000000..d6a83b7933b --- /dev/null +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -0,0 +1,636 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! In raftstore v2, WAL is always disabled for tablet. So we need a way to +//! trace what have been persisted what haven't, and recover those missing +//! data when restart. +//! +//! 
In summary, we trace the persist progress by recording flushed events.
+//! Because memtables are flushed one by one, a flushed memtable must contain
+//! all the data within the CF before a certain apply index. So the minimum
+//! flushed apply index + 1 of all data CFs is the recovery start point. In
+//! some cases, a CF may not have any updates at all for a long time. In some
+//! cases, we may still need to recover from a smaller index even if flushed
+//! index of all data CFs have advanced. So a special flushed index is
+//! introduced and stored with raft CF (only using the name, raft CF is
+//! dropped). It's the recommended recovery start point. How these two indexes
+//! interact with each other can be found in the `ApplyTrace::recover` and
+//! `ApplyTrace::maybe_advance_admin_flushed`.
+//!
+//! The correctness of raft cf index relies on the fact that:
+//! - apply is sequential, so if any apply index is updated to apply trace, all
+//! modification events before that must be processed.
+//! - admin commands that are marked by raft cf index must flush all data before
+//! being executed. Note this constraint is not just for recovery, but also
+//! necessary to guarantee safety of operations like split init or log gc.
+//! So data of logs before raft cf index must be applied and flushed to disk.
+//!
+//! All apply related states are associated with an apply index. During
+//! recovery, states corresponding to the start index should be used. 
+ +use std::{cmp, sync::Mutex}; + +use engine_traits::{ + FlushProgress, KvEngine, RaftEngine, RaftLogBatch, TabletRegistry, ALL_CFS, CF_DEFAULT, + CF_LOCK, CF_RAFT, CF_WRITE, DATA_CFS, DATA_CFS_LEN, +}; +use kvproto::{ + metapb::Region, + raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, +}; +use raftstore::store::{ + ReadTask, TabletSnapManager, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, +}; +use slog::Logger; +use tikv_util::{box_err, worker::Scheduler}; + +use crate::{ + operation::{ + command::temp_split_path, + ready::snapshot::{install_tablet, recv_snap_path}, + }, + raft::{Peer, Storage}, + router::PeerMsg, + Result, StoreRouter, +}; + +/// Write states for the given region. The region is supposed to have all its +/// data persisted and not governed by any raft group before. +pub fn write_initial_states(wb: &mut impl RaftLogBatch, region: Region) -> Result<()> { + let region_id = region.get_id(); + + let mut state = RegionLocalState::default(); + state.set_region(region); + state.set_tablet_index(RAFT_INIT_LOG_INDEX); + wb.put_region_state(region_id, RAFT_INIT_LOG_INDEX, &state)?; + + let mut apply_state = RaftApplyState::default(); + apply_state.set_applied_index(RAFT_INIT_LOG_INDEX); + apply_state + .mut_truncated_state() + .set_index(RAFT_INIT_LOG_INDEX); + apply_state + .mut_truncated_state() + .set_term(RAFT_INIT_LOG_TERM); + wb.put_apply_state(region_id, RAFT_INIT_LOG_INDEX, &apply_state)?; + + let mut raft_state = RaftLocalState::default(); + raft_state.set_last_index(RAFT_INIT_LOG_INDEX); + raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); + raft_state.mut_hard_state().set_commit(RAFT_INIT_LOG_INDEX); + wb.put_raft_state(region_id, &raft_state)?; + + for cf in ALL_CFS { + wb.put_flushed_index(region_id, cf, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_INDEX)?; + } + + Ok(()) +} + +fn to_static_cf(cf: &str) -> &'static str { + match cf { + CF_DEFAULT => CF_DEFAULT, + CF_RAFT => CF_RAFT, + CF_WRITE => CF_WRITE, + 
CF_LOCK => CF_LOCK, + _ => unreachable!("unexpected cf: {cf}"), + } +} + +pub struct StateStorage { + raft_engine: ER, + router: Mutex>, +} + +impl StateStorage { + pub fn new(raft_engine: ER, router: StoreRouter) -> Self { + Self { + raft_engine, + router: Mutex::new(router), + } + } +} + +impl engine_traits::StateStorage for StateStorage { + fn persist_progress(&self, region_id: u64, tablet_index: u64, pr: FlushProgress) { + let cf = to_static_cf(pr.cf()); + let flushed_index = pr.applied_index(); + self.raft_engine + .persist_progress(region_id, tablet_index, pr); + let _ = self.router.lock().unwrap().send( + region_id, + PeerMsg::DataFlushed { + cf, + tablet_index, + flushed_index, + }, + ); + } +} + +/// Mapping from data cf to an u64 index. +pub type DataTrace = [u64; DATA_CFS_LEN]; + +#[derive(Clone, Copy, Default)] +struct Progress { + flushed: u64, + /// The index of last entry that has modification to the CF. + /// + /// If `flushed` == `last_modified`, then all data in the CF is persisted. + last_modified: u64, +} + +pub fn cf_offset(cf: &str) -> usize { + let cf = if cf.is_empty() { CF_DEFAULT } else { cf }; + DATA_CFS.iter().position(|c| *c == cf).expect(cf) +} + +/// `ApplyTrace` is used to track the indexes of modifications and flushes. +/// +/// It has 3 core functionalities: +/// - recover from stopped state and figure out the correct log replay start +/// point. +/// - trace the admin flushed index and issue persistence once admin operation +/// is considered finished. Note only those admin commands that needs to +/// interact with other peers will be traced. +/// - support query the flushed progress without actually scanning raft engine, +/// which is useful for cleaning up stale flush records. +#[derive(Default)] +pub struct ApplyTrace { + /// The modified indexes and flushed index of each data CF. + data_cfs: Box<[Progress; DATA_CFS_LEN]>, + /// The modified indexes and flushed index of raft CF. 
+ /// + /// raft CF is a virtual CF that only used for recording apply index of + /// certain admin commands (like split/merge). So there is no flush at all. + /// The `flushed` field is advanced when the admin command doesn't need to + /// be replayed after restart. A write should be triggered to persist the + /// record. + admin: Progress, + /// Index that is issued to be written. It may not be truely persisted. + persisted_applied: u64, + /// `true` means the raft cf record should be persisted in next ready. + try_persist: bool, +} + +impl ApplyTrace { + fn recover(region_id: u64, engine: &impl RaftEngine) -> Result<(Self, RegionLocalState)> { + let mut trace = ApplyTrace::default(); + // Get all the recorded apply index from data CFs. + for (off, cf) in DATA_CFS.iter().enumerate() { + // There should be at least one record. + let i = engine.get_flushed_index(region_id, cf)?.unwrap(); + trace.data_cfs[off].flushed = i; + trace.data_cfs[off].last_modified = i; + } + let i = engine.get_flushed_index(region_id, CF_RAFT)?.unwrap(); + // Index of raft CF means all data before that must be persisted. + trace.admin.flushed = i; + trace.admin.last_modified = i; + trace.persisted_applied = i; + let applied_region_state = engine + .get_region_state(region_id, trace.admin.flushed)? + .unwrap(); + Ok((trace, applied_region_state)) + } + + fn on_flush(&mut self, cf: &str, index: u64) { + let off = cf_offset(cf); + // Technically it should always be true. 
+ if index > self.data_cfs[off].flushed { + self.data_cfs[off].flushed = index; + } + } + + fn on_modify(&mut self, cf: &str, index: u64) { + let off = cf_offset(cf); + self.data_cfs[off].last_modified = index; + } + + pub fn on_admin_flush(&mut self, index: u64) { + if index > self.admin.flushed { + self.admin.flushed = index; + self.try_persist = true; + } + } + + pub fn on_admin_modify(&mut self, index: u64) { + self.admin.last_modified = index; + } + + pub fn persisted_apply_index(&self) -> u64 { + self.admin.flushed + } + + // All events before `mem_index` must be consumed before calling this function. + fn maybe_advance_admin_flushed(&mut self, mem_index: u64) { + if self.admin.flushed < self.admin.last_modified { + return; + } + let min_flushed = self + .data_cfs + .iter() + // Only unflushed CFs are considered. Flushed CF always have uptodate changes + // persisted. + .filter_map(|pr| { + if pr.last_modified != pr.flushed { + Some(pr.flushed) + } else { + None + } + }) + .min(); + // At best effort, we can only advance the index to `mem_index`. + let candidate = cmp::min(mem_index, min_flushed.unwrap_or(u64::MAX)); + if candidate > self.admin.flushed { + self.admin.flushed = candidate; + if self.admin.flushed > self.persisted_applied + 100 { + self.try_persist = true; + } + } + // TODO: persist admin.flushed every 10 minutes. + } + + /// Get the flushed indexes of all data CF that is needed when recoverying + /// logs. + /// + /// Logs may be replayed from the persisted apply index, but those data may + /// have been flushed in the past, so we need the flushed indexes to decide + /// what logs can be skipped for certain CFs. If all CFs are flushed before + /// the persisted apply index, then there is nothing to skipped, so + /// `None` is returned. 
+ #[inline] + pub fn log_recovery(&self) -> Option> { + let mut flushed_indexes = [0; DATA_CFS_LEN]; + for (off, pr) in self.data_cfs.iter().enumerate() { + flushed_indexes[off] = pr.flushed; + } + for i in flushed_indexes { + if i > self.admin.flushed { + return Some(Box::new(flushed_indexes)); + } + } + None + } + + pub fn reset_snapshot(&mut self, index: u64) { + for pr in self.data_cfs.iter_mut() { + pr.flushed = index; + pr.last_modified = index; + } + self.admin.flushed = index; + self.persisted_applied = index; + self.try_persist = false; + } + + #[inline] + pub fn reset_should_persist(&mut self) { + self.try_persist = false; + } + + #[inline] + pub fn should_persist(&self) -> bool { + self.try_persist + } +} + +impl Storage { + /// Creates a new storage with uninit states. + /// + /// This should only be used for creating new peer from raft message. + pub fn uninit( + store_id: u64, + region: Region, + engine: ER, + read_scheduler: Scheduler>, + logger: &Logger, + ) -> Result { + let mut region_state = RegionLocalState::default(); + region_state.set_region(region); + Self::create( + store_id, + region_state, + RaftLocalState::default(), + RaftApplyState::default(), + engine, + read_scheduler, + false, + ApplyTrace::default(), + logger, + ) + } + + /// Creates a new storage. + /// + /// All metadata should be initialized before calling this method. If the + /// region is destroyed, `None` will be returned. + pub fn new( + region_id: u64, + store_id: u64, + engine: ER, + read_scheduler: Scheduler>, + logger: &Logger, + ) -> Result>> { + // Check latest region state to determine whether the peer is destroyed. 
+ let region_state = match engine.get_region_state(region_id, u64::MAX) { + Ok(Some(s)) => s, + res => { + return Err(box_err!( + "failed to get region state for region {}: {:?}", + region_id, + res + )); + } + }; + + if region_state.get_state() == PeerState::Tombstone { + return Ok(None); + } + + let (trace, region_state) = ApplyTrace::recover(region_id, &engine)?; + + let raft_state = match engine.get_raft_state(region_id) { + Ok(Some(s)) => s, + res => { + return Err(box_err!("failed to get raft state: {:?}", res)); + } + }; + + let applied_index = trace.persisted_apply_index(); + let mut apply_state = match engine.get_apply_state(region_id, applied_index) { + Ok(Some(s)) => s, + res => { + return Err(box_err!("failed to get apply state: {:?}", res)); + } + }; + apply_state.set_applied_index(applied_index); + + Self::create( + store_id, + region_state, + raft_state, + apply_state, + engine, + read_scheduler, + true, + trace, + logger, + ) + .map(Some) + } + + /// Region state is written before actually moving data. It's possible that + /// the tablet is missing after restart. We need to move the data again + /// after being restarted. + pub fn recover_tablet(&self, registry: &TabletRegistry, snap_mgr: &TabletSnapManager) { + let tablet_index = self.region_state().get_tablet_index(); + if tablet_index == 0 { + // It's an uninitialized peer, nothing to recover. + return; + } + let region_id = self.region().get_id(); + let target_path = registry.tablet_path(region_id, tablet_index); + if target_path.exists() { + // Move data succeeded before restart, nothing to recover. + return; + } + if tablet_index == RAFT_INIT_LOG_INDEX { + // Its data may come from split or snapshot. Try split first. + let split_path = temp_split_path(registry, region_id); + if install_tablet(registry, &split_path, region_id, tablet_index) { + return; + } + } + let truncated_index = self.entry_storage().truncated_index(); + if truncated_index == tablet_index { + // Try snapshot. 
+ let peer_id = self.peer().get_id(); + let snap_path = recv_snap_path( + snap_mgr, + region_id, + peer_id, + self.entry_storage().truncated_term(), + tablet_index, + ); + if install_tablet(registry, &snap_path, region_id, tablet_index) { + return; + } + } + panic!( + "{:?} data loss detected: {}_{} not found", + self.logger().list(), + region_id, + tablet_index + ); + } + + /// Write initial persist trace for uninit peer. + pub fn init_apply_trace(&self, write_task: &mut WriteTask) { + let region_id = self.region().get_id(); + let raft_engine = self.entry_storage().raft_engine(); + let lb = write_task + .extra_write + .ensure_v2(|| raft_engine.log_batch(3)); + lb.put_apply_state(region_id, 0, self.apply_state()) + .unwrap(); + lb.put_region_state(region_id, 0, self.region_state()) + .unwrap(); + for cf in ALL_CFS { + lb.put_flushed_index(region_id, cf, 0, 0).unwrap(); + } + } + + pub fn record_apply_trace(&mut self, write_task: &mut WriteTask) { + let region_id = self.region().get_id(); + let raft_engine = self.entry_storage().raft_engine(); + let tablet_index = self.tablet_index(); + let lb = write_task + .extra_write + .ensure_v2(|| raft_engine.log_batch(1)); + let trace = self.apply_trace_mut(); + lb.put_flushed_index(region_id, CF_RAFT, tablet_index, trace.admin.flushed) + .unwrap(); + trace.try_persist = false; + trace.persisted_applied = trace.admin.flushed; + } +} + +impl Peer { + pub fn on_data_flushed(&mut self, cf: &str, tablet_index: u64, index: u64) { + if tablet_index < self.storage().tablet_index() { + // Stale tablet. 
+ return; + } + let apply_index = self.storage().entry_storage().applied_index(); + let apply_trace = self.storage_mut().apply_trace_mut(); + apply_trace.on_flush(cf, index); + apply_trace.maybe_advance_admin_flushed(apply_index); + } + + pub fn on_data_modified(&mut self, modification: DataTrace) { + let apply_index = self.storage().entry_storage().applied_index(); + let apply_trace = self.storage_mut().apply_trace_mut(); + for (cf, index) in DATA_CFS.iter().zip(modification) { + if index != 0 { + apply_trace.on_modify(cf, index); + } + } + apply_trace.maybe_advance_admin_flushed(apply_index); + } +} + +#[cfg(test)] +mod tests { + use engine_traits::RaftEngineReadOnly; + use kvproto::metapb::Peer; + use tempfile::TempDir; + + use super::*; + + fn new_region() -> Region { + let mut region = Region::default(); + region.set_id(4); + let mut p = Peer::default(); + p.set_id(5); + p.set_store_id(6); + region.mut_peers().push(p); + region.mut_region_epoch().set_version(2); + region.mut_region_epoch().set_conf_ver(4); + region + } + + #[test] + fn test_write_initial_states() { + let region = new_region(); + let path = TempDir::new().unwrap(); + let engine = engine_test::new_temp_engine(&path); + let raft_engine = &engine.raft; + let mut wb = raft_engine.log_batch(10); + write_initial_states(&mut wb, region.clone()).unwrap(); + assert!(!wb.is_empty()); + raft_engine.consume(&mut wb, true).unwrap(); + + let local_state = raft_engine.get_region_state(4, u64::MAX).unwrap().unwrap(); + assert_eq!(local_state.get_state(), PeerState::Normal); + assert_eq!(*local_state.get_region(), region); + assert_eq!(local_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + assert_eq!( + local_state, + raft_engine + .get_region_state(4, RAFT_INIT_LOG_INDEX) + .unwrap() + .unwrap() + ); + assert_eq!( + None, + raft_engine + .get_region_state(4, RAFT_INIT_LOG_INDEX - 1) + .unwrap() + ); + + let raft_state = raft_engine.get_raft_state(4).unwrap().unwrap(); + assert_eq!(raft_state.get_last_index(), 
RAFT_INIT_LOG_INDEX); + let hs = raft_state.get_hard_state(); + assert_eq!(hs.get_term(), RAFT_INIT_LOG_TERM); + assert_eq!(hs.get_commit(), RAFT_INIT_LOG_INDEX); + + let apply_state = raft_engine.get_apply_state(4, u64::MAX).unwrap().unwrap(); + assert_eq!(apply_state.get_applied_index(), RAFT_INIT_LOG_INDEX); + let ts = apply_state.get_truncated_state(); + assert_eq!(ts.get_index(), RAFT_INIT_LOG_INDEX); + assert_eq!(ts.get_term(), RAFT_INIT_LOG_TERM); + assert_eq!( + apply_state, + raft_engine + .get_apply_state(4, RAFT_INIT_LOG_INDEX) + .unwrap() + .unwrap() + ); + assert_eq!( + None, + raft_engine + .get_apply_state(4, RAFT_INIT_LOG_INDEX - 1) + .unwrap() + ); + } + + #[test] + fn test_apply_trace() { + let mut trace = ApplyTrace::default(); + assert_eq!(0, trace.persisted_apply_index()); + // If there is no modifications, index should be advanced anyway. + trace.maybe_advance_admin_flushed(2); + assert_eq!(2, trace.persisted_apply_index()); + for cf in DATA_CFS { + trace.on_modify(cf, 3); + } + trace.maybe_advance_admin_flushed(3); + // Modification is not flushed. + assert_eq!(2, trace.persisted_apply_index()); + for cf in DATA_CFS { + trace.on_flush(cf, 3); + } + trace.maybe_advance_admin_flushed(3); + // No admin is recorded, index should be advanced. + assert_eq!(3, trace.persisted_apply_index()); + trace.on_admin_modify(4); + for cf in DATA_CFS { + trace.on_flush(cf, 4); + } + for cf in DATA_CFS { + trace.on_modify(cf, 4); + } + trace.maybe_advance_admin_flushed(4); + // Unflushed admin modification should hold index. + assert_eq!(3, trace.persisted_apply_index()); + trace.on_admin_flush(4); + trace.maybe_advance_admin_flushed(4); + // Admin is flushed, index should be advanced. + assert_eq!(4, trace.persisted_apply_index()); + for cf in DATA_CFS { + trace.on_flush(cf, 5); + } + trace.maybe_advance_admin_flushed(4); + // Though all data CFs are flushed, but index should not be + // advanced as we don't know whether there is admin modification. 
+ assert_eq!(4, trace.persisted_apply_index()); + for cf in DATA_CFS { + trace.on_modify(cf, 5); + } + trace.maybe_advance_admin_flushed(5); + // Because modify is recorded, so we know there should be no admin + // modification and index can be advanced. + assert_eq!(5, trace.persisted_apply_index()); + } + + #[test] + fn test_advance_admin_flushed() { + let cases = &[ + // When all are flushed, admin index should be advanced to latest. + ([(2, 2), (3, 3), (5, 5)], (3, 3), 5, 5), + ([(2, 2), (3, 3), (5, 5)], (5, 3), 6, 6), + // Any unflushed result should block advancing. + ([(2, 3), (3, 3), (5, 5)], (2, 2), 5, 2), + ([(2, 4), (3, 4), (5, 6)], (2, 2), 6, 2), + // But it should not make index go back. + ([(2, 4), (3, 4), (5, 6)], (3, 3), 6, 3), + // Unflush admin should not be advanced. + ([(2, 2), (3, 3), (5, 5)], (2, 3), 5, 2), + // Flushed may race with modification. + ([(2, 2), (3, 3), (6, 5)], (2, 2), 5, 5), + ([(8, 2), (9, 3), (7, 5)], (4, 4), 5, 5), + ([(8, 2), (9, 3), (7, 5)], (5, 5), 5, 5), + ([(2, 3), (9, 3), (7, 5)], (2, 2), 5, 2), + ]; + for (case, (data_cfs, admin, mem_index, exp)) in cases.iter().enumerate() { + let mut trace = ApplyTrace::default(); + for (i, (flushed, modified)) in data_cfs.iter().enumerate() { + trace.data_cfs[i].flushed = *flushed; + trace.data_cfs[i].last_modified = *modified; + } + trace.admin.flushed = admin.0; + trace.admin.last_modified = admin.1; + trace.maybe_advance_admin_flushed(*mem_index); + assert_eq!(trace.admin.flushed, *exp, "{case}"); + } + } +} diff --git a/components/raftstore-v2/src/operation/ready/async_writer.rs b/components/raftstore-v2/src/operation/ready/async_writer.rs index a7bce44fe05..a2707b6d411 100644 --- a/components/raftstore-v2/src/operation/ready/async_writer.rs +++ b/components/raftstore-v2/src/operation/ready/async_writer.rs @@ -86,7 +86,6 @@ impl AsyncWriter { } fn merge(&mut self, task: WriteTask) -> Option> { - let ready_number = task.ready_number(); if self.unpersisted_readies.is_empty() { // 
If this ready don't need to be persisted and there is no previous unpersisted // ready, we can safely consider it is persisted so the persisted msgs can be @@ -202,7 +201,7 @@ where ER: RaftEngine, { fn write_senders(&self) -> &WriteSenders { - &self.write_senders + &self.schedulers.write } fn config(&self) -> &Config { diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 1c8c9d80338..8b125844d0e 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -17,35 +17,53 @@ //! //! There two steps can be processed concurrently. +mod apply_trace; mod async_writer; mod snapshot; use std::{cmp, time::Instant}; -use engine_traits::{KvEngine, MiscExt, OpenOptions, RaftEngine, TabletFactory}; +use engine_traits::{KvEngine, RaftEngine}; use error_code::ErrorCodeExt; -use kvproto::{ - raft_cmdpb::AdminCmdType, - raft_serverpb::{PeerState, RaftMessage, RaftSnapshotData}, -}; +use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::RaftMessage}; use protobuf::Message as _; -use raft::{eraftpb, Ready, StateRole, INVALID_ID}; -use raftstore::store::{util, ExtraStates, FetchedLogs, ReadProgress, Transport, WriteTask}; +use raft::{eraftpb, prelude::MessageType, Ready, StateRole, INVALID_ID}; +use raftstore::{ + coprocessor::{RegionChangeEvent, RoleChange}, + store::{needs_evict_entry_cache, util, FetchedLogs, ReadProgress, Transport, WriteTask}, +}; use slog::{debug, error, trace, warn}; -use tikv_util::time::{duration_to_sec, monotonic_raw_now}; +use tikv_util::{ + store::find_peer, + time::{duration_to_sec, monotonic_raw_now}, +}; pub use self::{ + apply_trace::{cf_offset, write_initial_states, ApplyTrace, DataTrace, StateStorage}, async_writer::AsyncWriter, snapshot::{GenSnapTask, SnapState}, }; use crate::{ batch::StoreContext, - fsm::PeerFsmDelegate, + fsm::{PeerFsmDelegate, Store}, raft::{Peer, Storage}, - router::{ApplyTask, PeerTick}, - 
Result, + router::{ApplyTask, PeerMsg, PeerTick}, }; +impl Store { + pub fn on_store_unreachable( + &mut self, + ctx: &mut StoreContext, + to_store_id: u64, + ) where + EK: KvEngine, + ER: RaftEngine, + { + ctx.router + .broadcast_normal(|| PeerMsg::StoreUnreachable { to_store_id }); + } +} + impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { /// Raft relies on periodic ticks to keep the state machine sync with other /// peers. @@ -63,6 +81,20 @@ impl Peer { self.raft_group_mut().tick() } + pub fn on_peer_unreachable(&mut self, to_peer_id: u64) { + if self.is_leader() { + self.raft_group_mut().report_unreachable(to_peer_id); + } + } + + pub fn on_store_unreachable(&mut self, to_store_id: u64) { + if self.is_leader() { + if let Some(peer_id) = find_peer(self.region(), to_store_id).map(|p| p.get_id()) { + self.raft_group_mut().report_unreachable(peer_id); + } + } + } + pub fn on_raft_message( &mut self, ctx: &mut StoreContext, @@ -114,16 +146,20 @@ impl Peer { unimplemented!(); // return; } + // TODO: drop all msg append when the peer is uninitialized and has conflict // ranges with other peers. let from_peer = msg.take_from_peer(); if self.is_leader() && from_peer.get_id() != INVALID_ID { self.add_peer_heartbeat(from_peer.get_id(), Instant::now()); } - self.insert_peer_cache(from_peer); - if let Err(e) = self.raft_group_mut().step(msg.take_message()) { + self.insert_peer_cache(msg.take_from_peer()); + if msg.get_message().get_msg_type() == MessageType::MsgTransferLeader { + self.on_transfer_leader_msg(ctx, msg.get_message(), msg.disk_usage) + } else if let Err(e) = self.raft_group_mut().step(msg.take_message()) { error!(self.logger, "raft step error"; "err" => ?e); } + self.set_has_ready(); } @@ -165,11 +201,7 @@ impl Peer { /// /// If the recipient can't be found, `None` is returned. 
#[inline] - fn build_raft_message( - &mut self, - ctx: &mut StoreContext, - msg: eraftpb::Message, - ) -> Option { + fn build_raft_message(&mut self, msg: eraftpb::Message) -> Option { let to_peer = match self.peer_from_cache(msg.to) { Some(p) => p, None => { @@ -243,7 +275,7 @@ impl Peer { // asynchronously. if self.is_leader() { for entry in committed_entries.iter().rev() { - // TODO: handle raft_log_size_hint + self.update_approximate_raft_log_size(|s| s + entry.get_data().len() as u64); let propose_time = self .proposals() .find_propose_time(entry.get_term(), entry.get_index()); @@ -261,7 +293,11 @@ impl Peer { } } } - self.schedule_apply_committed_entries(ctx, committed_entries); + if needs_evict_entry_cache(ctx.cfg.evict_cache_on_memory_ratio) { + // Compact all cached entries instead of half evict. + self.entry_storage_mut().evict_entry_cache(false); + } + self.schedule_apply_committed_entries(committed_entries); } /// Processing the ready of raft. A detail description of how it's handled @@ -273,6 +309,7 @@ impl Peer { #[inline] pub fn handle_raft_ready(&mut self, ctx: &mut StoreContext) { let has_ready = self.reset_has_ready(); + let has_extra_write = self.reset_has_extra_write(); if !has_ready || self.destroy_progress().started() { #[cfg(feature = "testexport")] self.async_writer.notify_flush(); @@ -280,7 +317,10 @@ impl Peer { } ctx.has_ready = true; - if !self.raft_group().has_ready() && (self.serving() || self.postponed_destroy()) { + if !has_extra_write + && !self.raft_group().has_ready() + && (self.serving() || self.postponed_destroy()) + { #[cfg(feature = "testexport")] self.async_writer.notify_flush(); return; @@ -316,7 +356,7 @@ impl Peer { if !ready.messages().is_empty() { debug_assert!(self.is_leader()); for msg in ready.take_messages() { - if let Some(msg) = self.build_raft_message(ctx, msg) { + if let Some(msg) = self.build_raft_message(msg) { self.send_raft_message(ctx, msg); } } @@ -332,22 +372,33 @@ impl Peer { // Always sending snapshot 
task after apply task, so it gets latest // snapshot. if let Some(gen_task) = self.storage_mut().take_gen_snap_task() { - self.apply_scheduler().send(ApplyTask::Snapshot(gen_task)); + self.apply_scheduler() + .unwrap() + .send(ApplyTask::Snapshot(gen_task)); } let ready_number = ready.number(); let mut write_task = WriteTask::new(self.region_id(), self.peer_id(), ready_number); + let prev_persisted = self.storage().apply_trace().persisted_apply_index(); + self.merge_state_changes_to(&mut write_task); self.storage_mut() .handle_raft_ready(ctx, &mut ready, &mut write_task); + self.on_advance_persisted_apply_index(ctx, prev_persisted, &mut write_task); + if !ready.persisted_messages().is_empty() { write_task.messages = ready .take_persisted_messages() .into_iter() - .flat_map(|m| self.build_raft_message(ctx, m)) + .flat_map(|m| self.build_raft_message(m)) .collect(); } if !self.serving() { self.start_destroy(&mut write_task); + ctx.coprocessor_host.on_region_changed( + self.region(), + RegionChangeEvent::Destroy, + self.raft_group().raft.state, + ); } // Ready number should increase monotonically. assert!(self.async_writer.known_largest_number() < ready.number()); @@ -403,12 +454,12 @@ impl Peer { let persisted_number = self.async_writer.persisted_number(); self.raft_group_mut().on_persist_ready(persisted_number); - let persisted_index = self.raft_group().raft.raft_log.persisted; - /// The apply snapshot process order would be: - /// - Get the snapshot from the ready - /// - Wait for async writer to load this tablet - /// In this step, the snapshot has loaded finish, but some apply state - /// need to update. + let persisted_index = self.persisted_index(); + // The apply snapshot process order would be: + // - Get the snapshot from the ready + // - Wait for async writer to load this tablet + // In this step, the snapshot loading has been finished, but some apply + // state need to update. 
if has_snapshot { self.on_applied_snapshot(ctx); } @@ -462,20 +513,37 @@ impl Peer { // latency. self.raft_group_mut().skip_bcast_commit(false); + // Init the in-memory pessimistic lock table when the peer becomes leader. + self.activate_in_memory_pessimistic_locks(); + // A more recent read may happen on the old leader. So max ts should // be updated after a peer becomes leader. self.require_updating_max_ts(ctx); + // Exit entry cache warmup state when the peer becomes leader. self.entry_storage_mut().clear_entry_cache_warmup_state(); self.region_heartbeat_pd(ctx); + self.add_pending_tick(PeerTick::CompactLog); } StateRole::Follower => { self.leader_lease_mut().expire(); self.storage_mut().cancel_generating_snap(None); + self.clear_in_memory_pessimistic_locks(); } _ => {} } + let target = self.refresh_leader_transferee(); + ctx.coprocessor_host.on_role_change( + self.region(), + RoleChange { + state: ss.raft_state, + leader_id: ss.leader_id, + prev_lead_transferee: target, + vote: self.raft_group().raft.vote, + initialized: self.storage().is_initialized(), + }, + ); self.proposal_control_mut().maybe_update_term(term); } } @@ -537,7 +605,7 @@ impl Storage { ready.snapshot(), write_task, ctx.snap_mgr.clone(), - ctx.tablet_factory.clone(), + ctx.tablet_registry.clone(), ) { error!(self.logger(),"failed to apply snapshot";"error" => ?e) } @@ -553,11 +621,13 @@ impl Storage { if !ever_persisted || prev_raft_state != *entry_storage.raft_state() { write_task.raft_state = Some(entry_storage.raft_state().clone()); } - if !ever_persisted { - let mut extra_states = ExtraStates::new(self.apply_state().clone()); - extra_states.set_region_state(self.region_state().clone()); - write_task.extra_write.set_v2(extra_states); + // If snapshot initializes the peer, we don't need to write apply trace again. 
+ if !self.ever_persisted() { + self.init_apply_trace(write_task); self.set_ever_persisted(); } + if self.apply_trace().should_persist() { + self.record_apply_trace(write_task); + } } } diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 32e8a3f8ff8..41dc0d39429 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -19,30 +19,34 @@ //! peer fsm, then Raft will get the snapshot. use std::{ - borrow::BorrowMut, + assert_matches::assert_matches, fmt::{self, Debug}, - mem, + fs, + path::{Path, PathBuf}, sync::{ atomic::{AtomicBool, AtomicU64, Ordering}, - mpsc, Arc, + Arc, }, }; -use engine_traits::{KvEngine, OpenOptions, RaftEngine, TabletFactory}; -use kvproto::raft_serverpb::{PeerState, RaftSnapshotData, RegionLocalState}; +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, ALL_CFS}; +use kvproto::raft_serverpb::{PeerState, RaftSnapshotData}; use protobuf::Message; -use raft::eraftpb::Snapshot; -use raftstore::store::{ - metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, GenSnapRes, ReadTask, TabletSnapKey, - TabletSnapManager, Transport, WriteTask, +use raft::{eraftpb::Snapshot, StateRole}; +use raftstore::{ + coprocessor::RegionChangeEvent, + store::{ + metrics::STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER, GenSnapRes, ReadTask, TabletSnapKey, + TabletSnapManager, Transport, WriteTask, RAFT_INIT_LOG_INDEX, + }, }; use slog::{error, info, warn}; -use tikv_util::{box_err, box_try, worker::Scheduler}; +use tikv_util::box_err; use crate::{ fsm::ApplyResReporter, + operation::command::temp_split_path, raft::{Apply, Peer, Storage}, - router::{ApplyTask, PeerTick}, Result, StoreContext, }; @@ -59,11 +63,9 @@ pub enum SnapState { impl PartialEq for SnapState { fn eq(&self, other: &SnapState) -> bool { match (self, other) { - (&SnapState::Relax, &SnapState::Relax) - | 
(&SnapState::Generating { .. }, &SnapState::Generating { .. }) => true, - (&SnapState::Generated(ref snap1), &SnapState::Generated(ref snap2)) => { - *snap1 == *snap2 - } + (SnapState::Relax, SnapState::Relax) + | (SnapState::Generating { .. }, SnapState::Generating { .. }) => true, + (SnapState::Generated(snap1), SnapState::Generated(snap2)) => *snap1 == *snap2, _ => false, } } @@ -101,6 +103,10 @@ impl GenSnapTask { pub fn set_for_balance(&mut self) { self.for_balance = true; } + + pub fn to_peer(&self) -> u64 { + self.to_peer + } } impl Debug for GenSnapTask { @@ -111,6 +117,48 @@ impl Debug for GenSnapTask { } } +pub fn recv_snap_path( + snap_mgr: &TabletSnapManager, + region_id: u64, + peer_id: u64, + term: u64, + index: u64, +) -> PathBuf { + let key = TabletSnapKey::new(region_id, peer_id, term, index); + snap_mgr.final_recv_path(&key) +} + +/// Move the tablet from `source` to managed path. +/// +/// Returns false if `source` doesn't exist. +pub fn install_tablet( + registry: &TabletRegistry, + source: &Path, + region_id: u64, + tablet_index: u64, +) -> bool { + if !source.exists() { + return false; + } + let target_path = registry.tablet_path(region_id, tablet_index); + assert_matches!( + EK::locked(source.to_str().unwrap()), + Ok(false), + "source is locked: {} => {}", + source.display(), + target_path.display() + ); + if let Err(e) = fs::rename(source, &target_path) { + panic!( + "failed to rename tablet {} => {}: {:?}", + source.display(), + target_path.display(), + e + ); + } + true +} + impl Peer { pub fn on_snapshot_generated(&mut self, snapshot: GenSnapRes) { if self.storage_mut().on_snapshot_generated(snapshot) { @@ -119,22 +167,69 @@ impl Peer { } } + pub fn on_snapshot_sent(&mut self, to_peer_id: u64, status: raft::SnapshotStatus) { + let to_peer = match self.peer_from_cache(to_peer_id) { + Some(peer) => peer, + None => { + // If to_peer is gone, ignore this snapshot status + warn!( + self.logger, + "peer not found, ignore snapshot status"; + 
"to_peer_id" => to_peer_id, + "status" => ?status, + ); + return; + } + }; + info!( + self.logger, + "report snapshot status"; + "to" => ?to_peer, + "status" => ?status, + ); + self.raft_group_mut().report_snapshot(to_peer_id, status); + } + pub fn on_applied_snapshot(&mut self, ctx: &mut StoreContext) { - let persisted_index = self.raft_group().raft.raft_log.persisted; + ctx.coprocessor_host.on_region_changed( + self.region(), + RegionChangeEvent::Create, + StateRole::Follower, + ); + let persisted_index = self.persisted_index(); let first_index = self.storage().entry_storage().first_index(); if first_index == persisted_index + 1 { let region_id = self.region_id(); - let tablet = ctx - .tablet_factory - .open_tablet(region_id, Some(persisted_index), OpenOptions::default()) - .unwrap(); - self.tablet_mut().set(tablet); + self.reset_flush_state(); + let flush_state = self.flush_state().clone(); + let mut tablet_ctx = TabletContext::new(self.region(), Some(persisted_index)); + // Use a new FlushState to avoid conflicts with the old one. 
+ tablet_ctx.flush_state = Some(flush_state); + ctx.tablet_registry.load(tablet_ctx, false).unwrap(); + self.record_tablet_as_tombstone_and_refresh(persisted_index, ctx); self.schedule_apply_fsm(ctx); self.storage_mut().on_applied_snapshot(); self.raft_group_mut().advance_apply_to(persisted_index); + { + let mut meta = ctx.store_meta.lock().unwrap(); + meta.set_region(self.region(), true, &self.logger); + meta.readers + .insert(region_id, self.generate_read_delegate()); + meta.region_read_progress + .insert(region_id, self.read_progress().clone()); + } self.read_progress_mut() .update_applied_core(persisted_index); - info!(self.logger, "apply tablet snapshot completely"); + let split = self.storage_mut().split_init_mut().take(); + if split.as_ref().map_or(true, |s| { + !s.scheduled || persisted_index != RAFT_INIT_LOG_INDEX + }) { + info!(self.logger, "apply tablet snapshot completely"); + } + if let Some(init) = split { + info!(self.logger, "init with snapshot finished"); + self.post_split_init(ctx, init); + } } } } @@ -185,35 +280,39 @@ impl Apply { } impl Storage { + pub fn is_generating_snapshot(&self) -> bool { + let snap_states = self.snap_states.borrow_mut(); + for (_, state) in snap_states.iter() { + if matches!(*state, SnapState::Generating { .. }) { + return true; + } + } + false + } + /// Gets a snapshot. Returns `SnapshotTemporarilyUnavailable` if there is no /// unavailable snapshot. pub fn snapshot(&self, request_index: u64, to: u64) -> raft::Result { - let mut snap_state = self.snap_state_mut(); - match *snap_state { - SnapState::Generating { ref canceled, .. } => { - if canceled.load(Ordering::SeqCst) { - self.cancel_generating_snap(None); - } else { - return Err(raft::Error::Store( - raft::StorageError::SnapshotTemporarilyUnavailable, - )); + if let Some(state) = self.snap_states.borrow_mut().get_mut(&to) { + match state { + SnapState::Generating { ref canceled, .. 
} => { + if canceled.load(Ordering::SeqCst) { + self.cancel_generating_snap(Some(to)); + } else { + return Err(raft::Error::Store( + raft::StorageError::SnapshotTemporarilyUnavailable, + )); + } } - } - SnapState::Generated(ref s) => { - let SnapState::Generated(snap) = mem::replace(&mut *snap_state, SnapState::Relax) else { unreachable!() }; - if self.validate_snap(&snap, request_index) { - return Ok(*snap); + SnapState::Generated(ref s) => { + let snap = *s.clone(); + *state = SnapState::Relax; + if self.validate_snap(&snap, request_index) { + return Ok(snap); + } } - } - _ => {} - } - - if SnapState::Relax != *snap_state { - panic!( - "{:?} unexpected state: {:?}", - self.logger().list(), - *snap_state - ); + _ => {} + }; } info!( @@ -224,15 +323,18 @@ impl Storage { ); let canceled = Arc::new(AtomicBool::new(false)); let index = Arc::new(AtomicU64::new(0)); - *snap_state = SnapState::Generating { - canceled: canceled.clone(), - index: index.clone(), - }; - - let task = GenSnapTask::new(self.region().get_id(), to, index, canceled); let mut gen_snap_task = self.gen_snap_task_mut(); - assert!(gen_snap_task.is_none()); - *gen_snap_task = Box::new(Some(task)); + if gen_snap_task.is_none() { + self.snap_states.borrow_mut().insert( + to, + SnapState::Generating { + canceled: canceled.clone(), + index: index.clone(), + }, + ); + let task = GenSnapTask::new(self.region().get_id(), to, index, canceled); + *gen_snap_task = Box::new(Some(task)); + } Err(raft::Error::Store( raft::StorageError::SnapshotTemporarilyUnavailable, )) @@ -280,31 +382,50 @@ impl Storage { true } - /// Cancel generating snapshot. 
- pub fn cancel_generating_snap(&self, compact_to: Option) { - let mut snap_state = self.snap_state_mut(); - let SnapState::Generating { - ref canceled, - ref index, - } = *snap_state else { return }; - - if let Some(idx) = compact_to { - let snap_index = index.load(Ordering::SeqCst); - if snap_index == 0 || idx <= snap_index + 1 { - return; + pub fn cancel_generating_snap(&self, to_peer: Option) { + if let Some(id) = to_peer { + let mut states = self.snap_states.borrow_mut(); + if let Some(state) = states.get(&id) + && matches!(*state, SnapState::Generating { .. }) + { + info!( + self.logger(), + "snapshot is canceled"; + "to_peer" => to_peer, + ); + self.cancel_snap_task(to_peer); + states.remove(&id); } + } else { + self.cancel_snap_task(to_peer); + self.snap_states.borrow_mut().clear(); } - canceled.store(true, Ordering::SeqCst); - *snap_state = SnapState::Relax; - self.gen_snap_task_mut().take(); - info!( - self.logger(), - "snapshot is canceled"; - "compact_to" => compact_to, - ); STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.cancel.inc(); } + pub fn cancel_generating_snap_due_to_compacted(&self, compact_to: u64) { + let mut states = self.snap_states.borrow_mut(); + states.retain(|id, state| { + let SnapState::Generating { + ref index, + .. + } = *state else { return true; }; + let snap_index = index.load(Ordering::SeqCst); + if snap_index == 0 || compact_to <= snap_index + 1 { + return true; + } + info!( + self.logger(), + "snapshot is canceled"; + "compact_to" => compact_to, + "to_peer" => id, + ); + self.cancel_snap_task(Some(*id)); + STORE_SNAPSHOT_VALIDATION_FAILURE_COUNTER.cancel.inc(); + false + }); + } + /// Try to switch snap state to generated. only `Generating` can switch to /// `Generated`. /// TODO: make the snap state more clearer, the snapshot must be consumed. 
@@ -313,36 +434,34 @@ impl Storage { self.cancel_generating_snap(None); return false; } - let snap = res.unwrap(); - let mut snap_state = self.snap_state_mut(); - let SnapState::Generating { - ref canceled, - ref index, - } = *snap_state else { return false }; - - if snap.get_metadata().get_index() < index.load(Ordering::SeqCst) { - warn!( - self.logger(), - "snapshot is staled, skip"; - "snap index" => snap.get_metadata().get_index(), - "required index" => index.load(Ordering::SeqCst), - ); - return false; + let (snapshot, to_peer_id) = *res.unwrap(); + if let Some(state) = self.snap_states.borrow_mut().get_mut(&to_peer_id) { + let SnapState::Generating { + ref index, + .. + } = *state else { return false }; + if snapshot.get_metadata().get_index() < index.load(Ordering::SeqCst) { + warn!( + self.logger(), + "snapshot is staled, skip"; + "snap index" => snapshot.get_metadata().get_index(), + "required index" => index.load(Ordering::SeqCst), + "to_peer_id" => to_peer_id, + ); + return false; + } + *state = SnapState::Generated(Box::new(snapshot)); } - // Should changed `SnapState::Generated` to `SnapState::Relax` when the - // snap is consumed or canceled. Such as leader changed, the state of generated - // should be reset. 
- *snap_state = SnapState::Generated(snap); true } pub fn on_applied_snapshot(&mut self) { - let mut entry = self.entry_storage_mut(); + let entry = self.entry_storage_mut(); let term = entry.truncated_term(); let index = entry.truncated_index(); entry.set_applied_term(term); entry.apply_state_mut().set_applied_index(index); - self.region_state_mut().set_tablet_index(index); + self.apply_trace_mut().reset_snapshot(index); } pub fn apply_snapshot( @@ -350,7 +469,7 @@ impl Storage { snap: &Snapshot, task: &mut WriteTask, snap_mgr: TabletSnapManager, - tablet_factory: Arc>, + reg: TabletRegistry, ) -> Result<()> { let region_id = self.region().get_id(); let peer_id = self.peer().get_id(); @@ -372,31 +491,62 @@ impl Storage { let last_index = snap.get_metadata().get_index(); let last_term = snap.get_metadata().get_term(); - self.region_state_mut().set_state(PeerState::Normal); - self.region_state_mut().set_region(region); - self.entry_storage_mut() - .raft_state_mut() - .set_last_index(last_index); - self.entry_storage_mut().set_truncated_index(last_index); - self.entry_storage_mut().set_truncated_term(last_term); - self.entry_storage_mut().set_last_term(last_term); - - let key = TabletSnapKey::new(region_id, peer_id, last_term, last_index); - let mut path = snap_mgr.final_recv_path(&key); + let region_state = self.region_state_mut(); + region_state.set_state(PeerState::Normal); + region_state.set_region(region); + region_state.set_tablet_index(last_index); + let entry_storage = self.entry_storage_mut(); + entry_storage.raft_state_mut().set_last_index(last_index); + entry_storage.set_truncated_index(last_index); + entry_storage.set_truncated_term(last_term); + entry_storage.set_last_term(last_term); + + self.apply_trace_mut().reset_should_persist(); + self.set_ever_persisted(); + let lb = task + .extra_write + .ensure_v2(|| self.entry_storage().raft_engine().log_batch(3)); + lb.put_apply_state(region_id, last_index, self.apply_state()) + .unwrap(); + 
lb.put_region_state(region_id, last_index, self.region_state()) + .unwrap(); + // We assume there should be flush records in all CFs. Skip any CF here may + // break the constraint. + for cf in ALL_CFS { + lb.put_flushed_index(region_id, cf, last_index, last_index) + .unwrap(); + } + + let (path, clean_split) = match self.split_init_mut() { + // If index not match, the peer may accept a newer snapshot after split. + Some(init) if init.scheduled && last_index == RAFT_INIT_LOG_INDEX => { + (temp_split_path(®, region_id), false) + } + si => ( + recv_snap_path(&snap_mgr, region_id, peer_id, last_term, last_index), + si.is_some(), + ), + }; + let logger = self.logger().clone(); // The snapshot require no additional processing such as ingest them to DB, but // it should load it into the factory after it persisted. let hook = move || { - if let Err(e) = tablet_factory.load_tablet(path.as_path(), region_id, last_index) { + if !install_tablet(®, &path, region_id, last_index) { panic!( - "{:?} failed to load tablet, path: {}, {:?}", + "{:?} failed to install tablet, path: {}, region_id: {}, tablet_index: {}", logger.list(), path.display(), - e + region_id, + last_index ); } + if clean_split { + let path = temp_split_path(®, region_id); + let _ = fs::remove_dir_all(path); + } }; - task.persisted_cb = (Some(Box::new(hook))); + task.persisted_cbs.push(Box::new(hook)); task.has_snapshot = true; Ok(()) } diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index 06101da8d83..666f3adb699 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -2,18 +2,18 @@ use std::{mem, sync::Arc}; -use engine_traits::{KvEngine, TabletFactory}; +use engine_traits::{CachedTablet, FlushState, KvEngine, TabletRegistry, WriteBatch, DATA_CFS_LEN}; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; -use raftstore::store::{fsm::apply::DEFAULT_APPLY_WB_SIZE, ReadTask}; +use 
raftstore::store::{ + fsm::{apply::DEFAULT_APPLY_WB_SIZE, ApplyMetrics}, + ReadTask, +}; use slog::Logger; use tikv_util::worker::Scheduler; -use super::Peer; use crate::{ - fsm::ApplyResReporter, - operation::AdminCmdResult, - router::{ApplyRes, CmdResChannel}, - tablet::CachedTablet, + operation::{AdminCmdResult, DataTrace}, + router::CmdResChannel, }; /// Apply applies all the committed commands to kv db. @@ -22,23 +22,35 @@ pub struct Apply { /// publish the update of the tablet remote_tablet: CachedTablet, tablet: EK, - write_batch: Option, + pub write_batch: Option, + /// A buffer for encoding key. + pub key_buffer: Vec, - tablet_factory: Arc>, + tablet_registry: TabletRegistry, callbacks: Vec<(Vec, RaftCmdResponse)>, /// A flag indicates whether the peer is destroyed by applying admin /// command. tombstone: bool, - applied_index: u64, applied_term: u64, + applied_index: u64, + /// The largest index that have modified each column family. + modifications: DataTrace, admin_cmd_result: Vec, + flush_state: Arc, + /// The flushed indexes of each column family before being restarted. + /// + /// If an apply index is less than the flushed index, the log can be + /// skipped. `None` means logs should apply to all required column + /// families. 
+ log_recovery: Option>, region_state: RegionLocalState, res_reporter: R, read_scheduler: Scheduler>, + pub(crate) metrics: ApplyMetrics, pub(crate) logger: Logger, } @@ -48,11 +60,15 @@ impl Apply { peer: metapb::Peer, region_state: RegionLocalState, res_reporter: R, - mut remote_tablet: CachedTablet, - tablet_factory: Arc>, + tablet_registry: TabletRegistry, read_scheduler: Scheduler>, + flush_state: Arc, + log_recovery: Option>, logger: Logger, ) -> Self { + let mut remote_tablet = tablet_registry + .get(region_state.get_region().get_id()) + .unwrap(); Apply { peer, tablet: remote_tablet.latest().unwrap().clone(), @@ -60,20 +76,25 @@ impl Apply { write_batch: None, callbacks: vec![], tombstone: false, - applied_index: 0, applied_term: 0, + applied_index: flush_state.applied_index(), + modifications: [0; DATA_CFS_LEN], admin_cmd_result: vec![], region_state, - tablet_factory, + tablet_registry, read_scheduler, + key_buffer: vec![], res_reporter, + flush_state, + log_recovery, + metrics: ApplyMetrics::default(), logger, } } #[inline] - pub fn tablet_factory(&self) -> &Arc> { - &self.tablet_factory + pub fn tablet_registry(&self) -> &TabletRegistry { + &self.tablet_registry } #[inline] @@ -87,22 +108,27 @@ impl Apply { } #[inline] - pub fn write_batch_mut(&mut self) -> &mut Option { - &mut self.write_batch - } - - #[inline] - pub fn write_batch_or_default(&mut self) -> &mut EK::WriteBatch { - if self.write_batch.is_none() { - self.write_batch = Some(self.tablet.write_batch_with_cap(DEFAULT_APPLY_WB_SIZE)); + pub fn ensure_write_buffer(&mut self) { + if self.write_batch.is_some() { + return; } - self.write_batch.as_mut().unwrap() + self.write_batch = Some(self.tablet.write_batch_with_cap(DEFAULT_APPLY_WB_SIZE)); } #[inline] pub fn set_apply_progress(&mut self, index: u64, term: u64) { self.applied_index = index; self.applied_term = term; + if self.log_recovery.is_none() { + return; + } + let log_recovery = self.log_recovery.as_ref().unwrap(); + if 
log_recovery.iter().all(|v| index >= *v) { + self.log_recovery.take(); + // Now all logs are recovered, flush them to avoid recover again + // and again. + let _ = self.tablet.flush_cfs(&[], false); + } } #[inline] @@ -169,4 +195,27 @@ impl Apply { pub fn take_admin_result(&mut self) -> Vec { mem::take(&mut self.admin_cmd_result) } + + #[inline] + pub fn release_memory(&mut self) { + mem::take(&mut self.key_buffer); + if self.write_batch.as_ref().map_or(false, |wb| wb.is_empty()) { + self.write_batch = None; + } + } + + #[inline] + pub fn modifications_mut(&mut self) -> &mut DataTrace { + &mut self.modifications + } + + #[inline] + pub fn flush_state(&self) -> &Arc { + &self.flush_state + } + + #[inline] + pub fn log_recovery(&self) -> &Option> { + &self.log_recovery + } } diff --git a/components/raftstore-v2/src/raft/mod.rs b/components/raftstore-v2/src/raft/mod.rs index c1d6a522d79..495d7ad87ed 100644 --- a/components/raftstore-v2/src/raft/mod.rs +++ b/components/raftstore-v2/src/raft/mod.rs @@ -6,4 +6,4 @@ mod storage; pub use apply::Apply; pub use peer::Peer; -pub use storage::{write_initial_states, Storage}; +pub use storage::Storage; diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index a9730a036e7..668b0ebf41d 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -6,40 +6,34 @@ use std::{ time::{Duration, Instant}, }; -use collections::HashMap; +use collections::{HashMap, HashSet}; use crossbeam::atomic::AtomicCell; -use engine_traits::{KvEngine, OpenOptions, RaftEngine, TabletFactory}; +use engine_traits::{ + CachedTablet, FlushState, KvEngine, RaftEngine, TabletContext, TabletRegistry, +}; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, metapb, pdpb, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; use raft::{RawNode, StateRole}; use raftstore::{ coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason}, store::{ - fsm::Proposal, + 
fsm::ApplyMetrics, util::{Lease, RegionReadProgress}, - Config, EntryStorage, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, ReadProgress, - TxnExt, + Config, EntryStorage, LocksStatus, PeerStat, ProposalQueue, ReadDelegate, ReadIndexQueue, + ReadProgress, TabletSnapManager, TxnExt, WriteTask, }, - Error, -}; -use slog::{debug, error, info, o, warn, Logger}; -use tikv_util::{ - box_err, - config::ReadableSize, - time::{monotonic_raw_now, Instant as TiInstant}, - worker::Scheduler, - Either, }; -use time::Timespec; +use slog::Logger; -use super::{storage::Storage, Apply}; +use super::storage::Storage; use crate::{ batch::StoreContext, - fsm::{ApplyFsm, ApplyScheduler}, - operation::{AsyncWriter, DestroyProgress, ProposalControl, SimpleWriteEncoder}, - router::{CmdResChannel, QueryResChannel}, - tablet::CachedTablet, - worker::PdTask, + fsm::ApplyScheduler, + operation::{ + AsyncWriter, DestroyProgress, ProposalControl, SimpleWriteReqEncoder, SplitFlowControl, + }, + router::{CmdResChannel, PeerTick, QueryResChannel}, + worker::tablet_gc, Result, }; @@ -49,6 +43,11 @@ const REGION_READ_PROGRESS_CAP: usize = 128; pub struct Peer { raft_group: RawNode>, tablet: CachedTablet, + /// Tombstone tablets can only be destroyed when the tablet that replaces it + /// is persisted. This is a list of tablet index that awaits to be + /// persisted. When persisted_apply is advanced, we need to notify tablet_gc + /// worker to destroy them. + pending_tombstone_tablets: Vec, /// Statistics for self. self_stat: PeerStat, @@ -60,14 +59,20 @@ pub struct Peer { /// Statistics for other peers, only maintained when self is the leader. peer_heartbeats: HashMap, + /// For raft log compaction. + skip_compact_log_ticks: usize, + approximate_raft_log_size: u64, + /// Encoder for batching proposals and encoding them in a more efficient way /// than protobuf. 
- raw_write_encoder: Option, + raw_write_encoder: Option, proposals: ProposalQueue>, apply_scheduler: Option, /// Set to true if any side effect needs to be handled. has_ready: bool, + /// Sometimes there is no ready at all, but we need to trigger async write. + has_extra_write: bool, /// Writer for persisting side effects asynchronously. pub(crate) async_writer: AsyncWriter, @@ -86,8 +91,24 @@ pub struct Peer { txn_ext: Arc, txn_extra_op: Arc>, + pending_ticks: Vec, + /// Check whether this proposal can be proposed based on its epoch. proposal_control: ProposalControl, + + // Trace which peers have not finished split. + split_trace: Vec<(u64, HashSet)>, + split_flow_control: SplitFlowControl, + + /// Apply related State changes that needs to be persisted to raft engine. + /// + /// To make recovery correct, we need to persist all state changes before + /// advancing apply index. + state_changes: Option>, + flush_state: Arc, + + /// lead_transferee if this peer(leader) is in a leadership transferring. + leader_transferee: u64, } impl Peer { @@ -96,7 +117,8 @@ impl Peer { /// If peer is destroyed, `None` is returned. pub fn new( cfg: &Config, - tablet_factory: &dyn TabletFactory, + tablet_registry: &TabletRegistry, + snap_mgr: &TabletSnapManager, storage: Storage, ) -> Result { let logger = storage.logger().clone(); @@ -107,41 +129,38 @@ impl Peer { let region_id = storage.region().get_id(); let tablet_index = storage.region_state().get_tablet_index(); - // Another option is always create tablet even if tablet index is 0. But this - // can introduce race when gc old tablet and create new peer. - let tablet = if tablet_index != 0 { - if !tablet_factory.exists(region_id, tablet_index) { - return Err(box_err!( - "missing tablet {} for region {}", - tablet_index, - region_id - )); - } - // TODO: Perhaps we should stop create the tablet automatically. - Some(tablet_factory.open_tablet( - region_id, - Some(tablet_index), - OpenOptions::default().set_create(true), - )?) 
- } else { - None - }; - - let tablet = CachedTablet::new(tablet); let raft_group = RawNode::new(&raft_cfg, storage, &logger)?; let region = raft_group.store().region_state().get_region().clone(); + + let flush_state: Arc = Arc::default(); + // We can't create tablet if tablet index is 0. It can introduce race when gc + // old tablet and create new peer. We also can't get the correct range of the + // region, which is required for kv data gc. + if tablet_index != 0 { + raft_group.store().recover_tablet(tablet_registry, snap_mgr); + let mut ctx = TabletContext::new(®ion, Some(tablet_index)); + ctx.flush_state = Some(flush_state.clone()); + // TODO: Perhaps we should stop create the tablet automatically. + tablet_registry.load(ctx, false)?; + } + let cached_tablet = tablet_registry.get_or_default(region_id); + let tag = format!("[region {}] {}", region.get_id(), peer_id); let mut peer = Peer { - tablet, + tablet: cached_tablet, + pending_tombstone_tablets: Vec::new(), self_stat: PeerStat::default(), peer_cache: vec![], peer_heartbeats: HashMap::default(), + skip_compact_log_ticks: 0, + approximate_raft_log_size: 0, raw_write_encoder: None, proposals: ProposalQueue::new(region_id, raft_group.raft.id), async_writer: AsyncWriter::new(region_id, peer_id), apply_scheduler: None, has_ready: false, + has_extra_write: false, destroy_progress: DestroyProgress::None, raft_group, logger, @@ -161,6 +180,12 @@ impl Peer { txn_ext: Arc::default(), txn_extra_op: Arc::new(AtomicCell::new(TxnExtraOp::Noop)), proposal_control: ProposalControl::new(0), + pending_ticks: Vec::new(), + split_trace: vec![], + state_changes: None, + flush_state, + split_flow_control: SplitFlowControl::default(), + leader_transferee: raft::INVALID_ID, }; // If this region has only one peer and I am the one, campaign directly. @@ -194,7 +219,7 @@ impl Peer { /// has been preserved in a durable device. 
pub fn set_region( &mut self, - // host: &CoprocessorHost, + host: &CoprocessorHost, reader: &mut ReadDelegate, region: metapb::Region, reason: RegionChangeReason, @@ -242,7 +267,13 @@ impl Peer { pessimistic_locks.version = self.region().get_region_epoch().get_version(); } - // TODO: CoprocessorHost + if self.serving() { + host.on_region_changed( + self.region(), + RegionChangeEvent::Update(reason), + self.state_role(), + ); + } } #[inline] @@ -306,13 +337,43 @@ impl Peer { } #[inline] - pub fn tablet(&self) -> &CachedTablet { - &self.tablet + pub fn tablet(&mut self) -> Option<&EK> { + self.tablet.latest() } #[inline] - pub fn tablet_mut(&mut self) -> &mut CachedTablet { - &mut self.tablet + pub fn record_tablet_as_tombstone_and_refresh( + &mut self, + new_tablet_index: u64, + ctx: &StoreContext, + ) { + if let Some(old_tablet) = self.tablet.cache() { + self.pending_tombstone_tablets.push(new_tablet_index); + let _ = ctx + .schedulers + .tablet_gc + .schedule(tablet_gc::Task::prepare_destroy( + old_tablet.clone(), + self.region_id(), + new_tablet_index, + )); + } + // TODO: Handle race between split and snapshot. So that we can assert + // `self.tablet.refresh() == 1` + assert!(self.tablet.refresh() > 0); + } + + /// Returns if there's any tombstone being removed. 
+ #[inline] + pub fn remove_tombstone_tablets_before(&mut self, persisted: u64) -> bool { + let mut removed = 0; + while let Some(i) = self.pending_tombstone_tablets.first() + && *i <= persisted + { + removed += 1; + } + self.pending_tombstone_tablets.drain(..removed); + removed > 0 } #[inline] @@ -330,11 +391,22 @@ impl Peer { self.raft_group = raft_group; } + #[inline] + pub fn persisted_index(&self) -> u64 { + self.raft_group.raft.raft_log.persisted + } + #[inline] pub fn self_stat(&self) -> &PeerStat { &self.self_stat } + #[inline] + pub fn update_stat(&mut self, metrics: &ApplyMetrics) { + self.self_stat.written_bytes += metrics.written_bytes; + self.self_stat.written_keys += metrics.written_keys; + } + /// Mark the peer has a ready so it will be checked at the end of every /// processing round. #[inline] @@ -348,6 +420,17 @@ impl Peer { mem::take(&mut self.has_ready) } + #[inline] + pub fn set_has_extra_write(&mut self) { + self.set_has_ready(); + self.has_extra_write = true; + } + + #[inline] + pub fn reset_has_extra_write(&mut self) -> bool { + mem::take(&mut self.has_extra_write) + } + #[inline] pub fn insert_peer_cache(&mut self, peer: metapb::Peer) { for p in self.raft_group.store().region().get_peers() { @@ -412,6 +495,16 @@ impl Peer { self.peer_heartbeats.remove(&peer_id); } + /// Returns whether or not the peer sent heartbeat after the provided + /// deadline time. 
+ #[inline] + pub fn peer_heartbeat_is_fresh(&self, peer_id: u64, deadline: &Instant) -> bool { + matches!( + self.peer_heartbeats.get(&peer_id), + Some(last_heartbeat) if *last_heartbeat >= *deadline + ) + } + pub fn collect_down_peers(&self, max_duration: Duration) -> Vec { let mut down_peers = Vec::new(); let now = Instant::now(); @@ -433,6 +526,36 @@ impl Peer { down_peers } + #[inline] + pub fn reset_skip_compact_log_ticks(&mut self) { + self.skip_compact_log_ticks = 0; + } + + #[inline] + pub fn maybe_skip_compact_log(&mut self, max_skip_ticks: usize) -> bool { + if self.skip_compact_log_ticks < max_skip_ticks { + self.skip_compact_log_ticks += 1; + true + } else { + false + } + } + + #[inline] + pub fn approximate_raft_log_size(&self) -> u64 { + self.approximate_raft_log_size + } + + #[inline] + pub fn update_approximate_raft_log_size(&mut self, f: impl Fn(u64) -> u64) { + self.approximate_raft_log_size = f(self.approximate_raft_log_size); + } + + #[inline] + pub fn state_role(&self) -> StateRole { + self.raft_group.raft.state + } + #[inline] pub fn is_leader(&self) -> bool { self.raft_group.raft.state == StateRole::Leader @@ -489,17 +612,12 @@ impl Peer { } #[inline] - pub(crate) fn has_applied_to_current_term(&self) -> bool { - self.entry_storage().applied_term() == self.term() - } - - #[inline] - pub fn simple_write_encoder_mut(&mut self) -> &mut Option { + pub fn simple_write_encoder_mut(&mut self) -> &mut Option { &mut self.raw_write_encoder } #[inline] - pub fn simple_write_encoder(&self) -> &Option { + pub fn simple_write_encoder(&self) -> &Option { &self.raw_write_encoder } @@ -518,8 +636,8 @@ impl Peer { &self.proposals } - pub fn apply_scheduler(&self) -> &ApplyScheduler { - self.apply_scheduler.as_ref().unwrap() + pub fn apply_scheduler(&self) -> Option<&ApplyScheduler> { + self.apply_scheduler.as_ref() } #[inline] @@ -527,6 +645,51 @@ impl Peer { self.apply_scheduler = Some(apply_scheduler); } + #[inline] + pub fn clear_apply_scheduler(&mut self) 
{ + self.apply_scheduler.take(); + } + + /// Whether the snapshot is handling. + /// See the comments of `check_snap_status` for more details. + #[inline] + pub fn is_handling_snapshot(&self) -> bool { + // todo: This method may be unnecessary now? + false + } + + /// Returns `true` if the raft group has replicated a snapshot but not + /// committed it yet. + #[inline] + pub fn has_pending_snapshot(&self) -> bool { + self.raft_group().snap().is_some() + } + + #[inline] + pub fn add_pending_tick(&mut self, tick: PeerTick) { + self.pending_ticks.push(tick); + } + + #[inline] + pub fn take_pending_ticks(&mut self) -> Vec { + mem::take(&mut self.pending_ticks) + } + + pub fn activate_in_memory_pessimistic_locks(&mut self) { + let mut pessimistic_locks = self.txn_ext.pessimistic_locks.write(); + pessimistic_locks.status = LocksStatus::Normal; + pessimistic_locks.term = self.term(); + pessimistic_locks.version = self.region().get_region_epoch().get_version(); + } + + pub fn clear_in_memory_pessimistic_locks(&mut self) { + let mut pessimistic_locks = self.txn_ext.pessimistic_locks.write(); + pessimistic_locks.status = LocksStatus::NotLeader; + pessimistic_locks.clear(); + pessimistic_locks.term = self.term(); + pessimistic_locks.version = self.region().get_region_epoch().get_version(); + } + #[inline] pub fn post_split(&mut self) { self.reset_region_buckets(); @@ -600,4 +763,52 @@ impl Peer { self.update_max_timestamp_pd(ctx, initial_status); } + + #[inline] + pub fn split_trace_mut(&mut self) -> &mut Vec<(u64, HashSet)> { + &mut self.split_trace + } + + #[inline] + pub fn flush_state(&self) -> &Arc { + &self.flush_state + } + + pub fn reset_flush_state(&mut self) { + self.flush_state = Arc::default(); + } + + // Note: Call `set_has_extra_write` after adding new state changes. 
+ #[inline] + pub fn state_changes_mut(&mut self) -> &mut ER::LogBatch { + if self.state_changes.is_none() { + self.state_changes = Some(Box::new(self.entry_storage().raft_engine().log_batch(0))); + } + self.state_changes.as_mut().unwrap() + } + + #[inline] + pub fn merge_state_changes_to(&mut self, task: &mut WriteTask) { + if self.state_changes.is_none() { + return; + } + task.extra_write + .merge_v2(Box::into_inner(self.state_changes.take().unwrap())); + } + + #[inline] + pub fn split_flow_control_mut(&mut self) -> &mut SplitFlowControl { + &mut self.split_flow_control + } + + #[inline] + pub fn refresh_leader_transferee(&mut self) -> u64 { + mem::replace( + &mut self.leader_transferee, + self.raft_group + .raft + .lead_transferee + .unwrap_or(raft::INVALID_ID), + ) + } } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index b3ad56af4fd..636970c0ad1 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -3,56 +3,27 @@ use std::{ cell::{RefCell, RefMut}, fmt::{self, Debug, Formatter}, - sync::{mpsc::Receiver, Arc}, }; -use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; +use collections::HashMap; +use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ - metapb::{self, Region}, + metapb, raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, }; use raft::{ eraftpb::{ConfState, Entry, Snapshot}, GetEntriesContext, RaftState, INVALID_ID, }; -use raftstore::store::{ - util, EntryStorage, ReadTask, WriteTask, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, -}; -use slog::{info, o, Logger}; +use raftstore::store::{util, EntryStorage, ReadTask}; +use slog::{o, Logger}; use tikv_util::{box_err, store::find_peer, worker::Scheduler}; use crate::{ - operation::{GenSnapTask, SnapState}, + operation::{ApplyTrace, GenSnapTask, SnapState, SplitInit}, Result, }; -pub fn write_initial_states(wb: &mut impl RaftLogBatch, region: Region) -> Result<()> 
{ - let region_id = region.get_id(); - - let mut state = RegionLocalState::default(); - state.set_region(region); - state.set_tablet_index(RAFT_INIT_LOG_INDEX); - wb.put_region_state(region_id, &state)?; - - let mut apply_state = RaftApplyState::default(); - apply_state.set_applied_index(RAFT_INIT_LOG_INDEX); - apply_state - .mut_truncated_state() - .set_index(RAFT_INIT_LOG_INDEX); - apply_state - .mut_truncated_state() - .set_term(RAFT_INIT_LOG_TERM); - wb.put_apply_state(region_id, &apply_state)?; - - let mut raft_state = RaftLocalState::default(); - raft_state.set_last_index(RAFT_INIT_LOG_INDEX); - raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); - raft_state.mut_hard_state().set_commit(RAFT_INIT_LOG_INDEX); - wb.put_raft_state(region_id, &raft_state)?; - - Ok(()) -} - /// A storage for raft. /// /// It's similar to `PeerStorage` in v1. @@ -67,8 +38,11 @@ pub struct Storage { logger: Logger, /// Snapshot part. - snap_state: RefCell, - gen_snap_task: RefCell>>, + pub snap_states: RefCell>, + pub gen_snap_task: RefCell>>, + split_init: Option>, + /// The flushed index of all CFs. + apply_trace: ApplyTrace, } impl Debug for Storage { @@ -113,139 +87,39 @@ impl Storage { &self.logger } - #[inline] - pub fn snap_state_mut(&self) -> RefMut<'_, SnapState> { - self.snap_state.borrow_mut() - } - #[inline] pub fn gen_snap_task_mut(&self) -> RefMut<'_, Box>> { self.gen_snap_task.borrow_mut() } -} - -impl Storage { - /// Creates a new storage with uninit states. - /// - /// This should only be used for creating new peer from raft message. - pub fn uninit( - store_id: u64, - region: Region, - engine: ER, - read_scheduler: Scheduler>, - logger: &Logger, - ) -> Result { - let mut region_state = RegionLocalState::default(); - region_state.set_region(region); - Self::create( - store_id, - region_state, - RaftLocalState::default(), - RaftApplyState::default(), - engine, - read_scheduler, - false, - logger, - ) - } - - /// Creates a new storage. 
- /// - /// All metadata should be initialized before calling this method. If the - /// region is destroyed, `None` will be returned. - pub fn new( - region_id: u64, - store_id: u64, - engine: ER, - read_scheduler: Scheduler>, - logger: &Logger, - ) -> Result>> { - let region_state = match engine.get_region_state(region_id) { - Ok(Some(s)) => s, - res => { - return Err(box_err!( - "failed to get region state for region {}: {:?}", - region_id, - res - )); - } - }; - if region_state.get_state() == PeerState::Tombstone { - return Ok(None); + #[inline] + pub fn cancel_snap_task(&self, to_peer_id: Option) { + if to_peer_id.is_none() { + self.gen_snap_task.borrow_mut().take(); + return; } + let to = to_peer_id.unwrap(); + let mut task = self.gen_snap_task.borrow_mut(); + if let Some(t) = &**task { + if to == t.to_peer() { + *task = Box::new(None); + }; + } + } - let raft_state = match engine.get_raft_state(region_id) { - Ok(Some(s)) => s, - res => { - return Err(box_err!("failed to get raft state: {:?}", res)); - } - }; - - let apply_state = match engine.get_apply_state(region_id) { - Ok(Some(s)) => s, - res => { - return Err(box_err!("failed to get apply state: {:?}", res)); - } - }; - - Self::create( - store_id, - region_state, - raft_state, - apply_state, - engine, - read_scheduler, - true, - logger, - ) - .map(Some) + #[inline] + pub fn apply_trace_mut(&mut self) -> &mut ApplyTrace { + &mut self.apply_trace } - /// Creates a new storage for split peer. - /// - /// Except for region local state which uses the `region` provided with the - /// inital tablet index, all uses the inital states. 
- pub fn with_split( - store_id: u64, - region: &metapb::Region, - engine: ER, - read_scheduler: Scheduler>, - logger: &Logger, - ) -> Result>> { - let mut region_state = RegionLocalState::default(); - region_state.set_region(region.clone()); - region_state.set_state(PeerState::Normal); - region_state.set_tablet_index(RAFT_INIT_LOG_INDEX); - - let mut apply_state = RaftApplyState::default(); - apply_state.set_applied_index(RAFT_INIT_LOG_INDEX); - apply_state - .mut_truncated_state() - .set_index(RAFT_INIT_LOG_INDEX); - apply_state - .mut_truncated_state() - .set_term(RAFT_INIT_LOG_TERM); - - let mut raft_state = RaftLocalState::default(); - raft_state.set_last_index(RAFT_INIT_LOG_INDEX); - raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); - raft_state.mut_hard_state().set_commit(RAFT_INIT_LOG_INDEX); - - Self::create( - store_id, - region_state, - raft_state, - apply_state, - engine, - read_scheduler, - true, - logger, - ) - .map(Some) + #[inline] + pub fn apply_trace(&self) -> &ApplyTrace { + &self.apply_trace } +} - fn create( +impl Storage { + pub(crate) fn create( store_id: u64, region_state: RegionLocalState, raft_state: RaftLocalState, @@ -253,6 +127,7 @@ impl Storage { engine: ER, read_scheduler: Scheduler>, persisted: bool, + apply_trace: ApplyTrace, logger: &Logger, ) -> Result { let peer = find_peer(region_state.get_region(), store_id); @@ -279,8 +154,10 @@ impl Storage { region_state, ever_persisted: persisted, logger, - snap_state: RefCell::new(SnapState::Relax), + snap_states: RefCell::new(HashMap::default()), gen_snap_task: RefCell::new(Box::new(None)), + split_init: None, + apply_trace, }) } @@ -289,6 +166,11 @@ impl Storage { &mut self.region_state } + #[inline] + pub fn split_init_mut(&mut self) -> &mut Option> { + &mut self.split_init + } + #[inline] pub fn raft_state(&self) -> &RaftLocalState { self.entry_storage.raft_state() @@ -304,6 +186,9 @@ impl Storage { self.entry_storage.apply_state() } + /// Check if the storage is initialized. 
+ /// + /// The storage is considered initialized when data is applied in memory. #[inline] pub fn is_initialized(&self) -> bool { self.region_state.get_tablet_index() != 0 @@ -402,33 +287,35 @@ impl raft::Storage for Storage { #[cfg(test)] mod tests { use std::{ - sync::mpsc::{sync_channel, SyncSender}, + sync::{ + mpsc::{sync_channel, Receiver, SyncSender}, + Arc, + }, time::Duration, }; use engine_test::{ ctor::{CfOptions, DbOptions}, - kv::{KvTestEngine, TestTabletFactoryV2}, - raft::RaftTestEngine, - }; - use engine_traits::{ - KvEngine, OpenOptions, RaftEngine, RaftEngineReadOnly, RaftLogBatch, TabletFactory, ALL_CFS, + kv::TestTabletFactory, }; + use engine_traits::{RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, DATA_CFS}; use kvproto::{ metapb::{Peer, Region}, raft_serverpb::PeerState, }; - use raft::{eraftpb::Snapshot as RaftSnapshot, Error as RaftError, StorageError}; + use raft::{Error as RaftError, StorageError}; use raftstore::store::{ - util::new_empty_snapshot, AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, ReadTask, - TabletSnapKey, TabletSnapManager, RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, + util::new_empty_snapshot, AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, + TabletSnapKey, TabletSnapManager, WriteTask, }; use slog::o; use tempfile::TempDir; - use tikv_util::worker::{Runnable, Worker}; + use tikv_util::worker::Worker; use super::*; - use crate::{fsm::ApplyResReporter, raft::Apply, router::ApplyRes, tablet::CachedTablet}; + use crate::{ + fsm::ApplyResReporter, operation::write_initial_states, raft::Apply, router::ApplyRes, + }; #[derive(Clone)] pub struct TestRouter { @@ -468,41 +355,11 @@ mod tests { region } - #[test] - fn test_write_initial_states() { - let region = new_region(); - let path = TempDir::new().unwrap(); - let engine = engine_test::new_temp_engine(&path); - let raft_engine = &engine.raft; - let mut wb = raft_engine.log_batch(10); - write_initial_states(&mut wb, region.clone()).unwrap(); - 
assert!(!wb.is_empty()); - raft_engine.consume(&mut wb, true).unwrap(); - - let local_state = raft_engine.get_region_state(4).unwrap().unwrap(); - assert_eq!(local_state.get_state(), PeerState::Normal); - assert_eq!(*local_state.get_region(), region); - assert_eq!(local_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); - - let raft_state = raft_engine.get_raft_state(4).unwrap().unwrap(); - assert_eq!(raft_state.get_last_index(), RAFT_INIT_LOG_INDEX); - let hs = raft_state.get_hard_state(); - assert_eq!(hs.get_term(), RAFT_INIT_LOG_TERM); - assert_eq!(hs.get_commit(), RAFT_INIT_LOG_INDEX); - - let apply_state = raft_engine.get_apply_state(4).unwrap().unwrap(); - assert_eq!(apply_state.get_applied_index(), RAFT_INIT_LOG_INDEX); - let ts = apply_state.get_truncated_state(); - assert_eq!(ts.get_index(), RAFT_INIT_LOG_INDEX); - assert_eq!(ts.get_term(), RAFT_INIT_LOG_TERM); - } - #[test] fn test_apply_snapshot() { let region = new_region(); let path = TempDir::new().unwrap(); - let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()); - mgr.init().unwrap(); + let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()).unwrap(); let raft_engine = engine_test::raft::new_engine(&format!("{}", path.path().join("raft").display()), None) .unwrap(); @@ -512,13 +369,10 @@ mod tests { raft_engine.consume(&mut wb, true).unwrap(); // building a tablet factory let ops = DbOptions::default(); - let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); - let factory = Arc::new(TestTabletFactoryV2::new( - path.path().join("tablet").as_path(), - ops, - cf_opts, - )); - let mut worker = Worker::new("test-read-worker").lazy_build("test-read-worker"); + let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); + let reg = TabletRegistry::new(factory, path.path().join("tablets")).unwrap(); + let worker = 
Worker::new("test-read-worker").lazy_build("test-read-worker"); let sched = worker.scheduler(); let logger = slog_global::borrow_global().new(o!()); let mut s = Storage::new(4, 6, raft_engine.clone(), sched, &logger.clone()) @@ -527,8 +381,7 @@ mod tests { let snapshot = new_empty_snapshot(region.clone(), 10, 1, false); let mut task = WriteTask::new(region.get_id(), 5, 0); - s.apply_snapshot(&snapshot, &mut task, mgr, factory) - .unwrap(); + s.apply_snapshot(&snapshot, &mut task, mgr, reg).unwrap(); // It can be set before load tablet. assert_eq!(PeerState::Normal, s.region_state().get_state()); @@ -539,8 +392,8 @@ mod tests { // This index can't be set before load tablet. assert_ne!(10, s.entry_storage().applied_index()); assert_ne!(1, s.entry_storage().applied_term()); - assert_ne!(10, s.region_state().get_tablet_index()); - assert!(task.persisted_cb.is_some()); + assert_eq!(10, s.region_state().get_tablet_index()); + assert!(!task.persisted_cbs.is_empty()); s.on_applied_snapshot(); assert_eq!(10, s.entry_storage().applied_index()); @@ -559,51 +412,51 @@ mod tests { write_initial_states(&mut wb, region.clone()).unwrap(); assert!(!wb.is_empty()); raft_engine.consume(&mut wb, true).unwrap(); - let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()); - mgr.init().unwrap(); + let mgr = TabletSnapManager::new(path.path().join("snap_dir").to_str().unwrap()).unwrap(); // building a tablet factory let ops = DbOptions::default(); - let cf_opts = ALL_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); - let factory = Arc::new(TestTabletFactoryV2::new( - path.path().join("tablet").as_path(), - ops, - cf_opts, - )); - // create tablet with region_id 1 - let tablet = factory - .open_tablet(1, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); + let cf_opts = DATA_CFS.iter().map(|cf| (*cf, CfOptions::new())).collect(); + let factory = Box::new(TestTabletFactory::new(ops, cf_opts)); + let reg = TabletRegistry::new(factory, 
path.path().join("tablets")).unwrap(); + let tablet_ctx = TabletContext::new(®ion, Some(10)); + reg.load(tablet_ctx, true).unwrap(); // setup read runner worker and peer storage let mut worker = Worker::new("test-read-worker").lazy_build("test-read-worker"); let sched = worker.scheduler(); let logger = slog_global::borrow_global().new(o!()); - let mut s = Storage::new(4, 6, raft_engine.clone(), sched.clone(), &logger.clone()) + let s = Storage::new(4, 6, raft_engine.clone(), sched.clone(), &logger.clone()) .unwrap() .unwrap(); let (router, rx) = TestRouter::new(); let mut read_runner = ReadRunner::new(router.clone(), raft_engine); read_runner.set_snap_mgr(mgr.clone()); worker.start(read_runner); + let mut state = RegionLocalState::default(); + state.set_region(region.clone()); // setup peer applyer let mut apply = Apply::new( region.get_peers()[0].clone(), - RegionLocalState::default(), + state, router, - CachedTablet::new(Some(tablet)), - factory, + reg, sched, + Arc::default(), + None, logger, ); // Test get snapshot - let snap = s.snapshot(0, 7); + let to_peer_id = 7; + let snap = s.snapshot(0, to_peer_id); let unavailable = RaftError::Store(StorageError::SnapshotTemporarilyUnavailable); assert_eq!(snap.unwrap_err(), unavailable); let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); apply.schedule_gen_snapshot(gen_task); let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); s.on_snapshot_generated(res); - let snap = match *s.snap_state.borrow() { + assert_eq!(s.snapshot(0, 8).unwrap_err(), unavailable); + assert!(s.snap_states.borrow().get(&8).is_some()); + let snap = match *s.snap_states.borrow().get(&to_peer_id).unwrap() { SnapState::Generated(ref snap) => *snap.clone(), ref s => panic!("unexpected state: {:?}", s), }; @@ -613,15 +466,16 @@ mod tests { let snap_key = TabletSnapKey::from_region_snap(4, 7, &snap); let checkpointer_path = mgr.tablet_gen_path(&snap_key); assert!(checkpointer_path.exists()); + s.snapshot(0, to_peer_id).unwrap(); // 
Test cancel snapshot - let snap = s.snapshot(0, 0); + let snap = s.snapshot(0, 7); assert_eq!(snap.unwrap_err(), unavailable); let gen_task = s.gen_snap_task.borrow_mut().take().unwrap(); apply.schedule_gen_snapshot(gen_task); - let res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + let _res = rx.recv_timeout(Duration::from_secs(1)).unwrap(); s.cancel_generating_snap(None); - assert_eq!(*s.snap_state.borrow(), SnapState::Relax); + assert!(s.snap_states.borrow().get(&to_peer_id).is_none()); // Test get twice snapshot and cancel once. // get snapshot a diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index 8cb65e40a3c..668d7591a40 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -1,19 +1,26 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::sync::{Arc, Mutex}; +use std::{ + borrow::Cow, + sync::{Arc, Mutex}, +}; use crossbeam::channel::TrySendError; -use engine_traits::{KvEngine, RaftEngine}; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; +use futures::Future; use kvproto::{ raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, raft_serverpb::RaftMessage, }; -use raft::eraftpb::Snapshot as RaftSnapshot; use raftstore::store::{AsyncReadNotifier, FetchedLogs, GenSnapRes, RegionSnapshot}; -use slog::Logger; +use slog::warn; -use super::PeerMsg; -use crate::{batch::StoreRouter, operation::LocalReader, StoreMeta}; +use super::{CmdResChannel, PeerMsg}; +use crate::{ + batch::StoreRouter, + operation::{LocalReader, RequestSplit}, + StoreMeta, +}; impl AsyncReadNotifier for StoreRouter { fn notify_logs_fetched(&self, region_id: u64, fetched_logs: FetchedLogs) { @@ -25,6 +32,65 @@ impl AsyncReadNotifier for StoreRouter { } } +impl raftstore::coprocessor::StoreHandle for StoreRouter { + fn update_approximate_size(&self, _region_id: u64, _size: u64) { + // TODO + } + + fn update_approximate_keys(&self, _region_id: u64, _keys: u64) 
{ + // TODO + } + + fn ask_split( + &self, + region_id: u64, + region_epoch: kvproto::metapb::RegionEpoch, + split_keys: Vec>, + source: Cow<'static, str>, + ) { + let (ch, _) = CmdResChannel::pair(); + let res = self.send( + region_id, + PeerMsg::RequestSplit { + request: RequestSplit { + epoch: region_epoch, + split_keys, + source, + }, + ch, + }, + ); + if let Err(e) = res { + warn!( + self.logger(), + "failed to send ask split"; + "region_id" => region_id, + "err" => %e, + ); + } + } + + fn refresh_region_buckets( + &self, + _region_id: u64, + _region_epoch: kvproto::metapb::RegionEpoch, + _buckets: Vec, + _bucket_ranges: Option>, + ) { + // TODO + } + + fn update_compute_hash_result( + &self, + _region_id: u64, + _index: u64, + _context: Vec, + _hash: Vec, + ) { + // TODO + } +} + /// A router that routes messages to the raftstore pub struct RaftRouter where @@ -49,15 +115,13 @@ where } impl RaftRouter { - pub fn new(store_id: u64, router: StoreRouter) -> Self { - let mut store_meta = StoreMeta::new(); - store_meta.store_id = Some(store_id); - let store_meta = Arc::new(Mutex::new(store_meta)); + pub fn new(store_id: u64, reg: TabletRegistry, router: StoreRouter) -> Self { + let store_meta = Arc::new(Mutex::new(StoreMeta::new(store_id))); let logger = router.logger().clone(); RaftRouter { router: router.clone(), - local_reader: LocalReader::new(store_meta, router, logger), + local_reader: LocalReader::new(store_meta, reg, router, logger), } } @@ -69,7 +133,12 @@ impl RaftRouter { self.router.send(addr, msg) } - pub fn store_meta(&self) -> &Arc>> { + #[inline] + pub fn check_send(&self, addr: u64, msg: PeerMsg) -> crate::Result<()> { + self.router.check_send(addr, msg) + } + + pub fn store_meta(&self) -> &Arc> { self.local_reader.store_meta() } @@ -80,10 +149,11 @@ impl RaftRouter { self.router.send_raft_message(msg) } - pub async fn get_snapshot( + pub fn snapshot( &mut self, req: RaftCmdRequest, - ) -> std::result::Result, RaftCmdResponse> { - 
self.local_reader.snapshot(req).await + ) -> impl Future, RaftCmdResponse>> + Send + { + self.local_reader.snapshot(req) } } diff --git a/components/raftstore-v2/src/router/internal_message.rs b/components/raftstore-v2/src/router/internal_message.rs index 1507d404297..05e1baea1cf 100644 --- a/components/raftstore-v2/src/router/internal_message.rs +++ b/components/raftstore-v2/src/router/internal_message.rs @@ -1,18 +1,22 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use raftstore::store::fsm::ChangePeer; +use raftstore::store::fsm::ApplyMetrics; -use crate::operation::{AdminCmdResult, CommittedEntries, GenSnapTask}; +use crate::operation::{AdminCmdResult, CommittedEntries, DataTrace, GenSnapTask}; #[derive(Debug)] pub enum ApplyTask { CommittedEntries(CommittedEntries), Snapshot(GenSnapTask), + /// Writes that doesn't care consistency. + UnsafeWrite(Box<[u8]>), } #[derive(Debug, Default)] pub struct ApplyRes { pub applied_index: u64, pub applied_term: u64, - pub admin_result: Vec, + pub admin_result: Box<[AdminCmdResult]>, + pub modifications: DataTrace, + pub metrics: ApplyMetrics, } diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index a4681d8a873..930de5ff036 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -1,11 +1,12 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
// #[PerformanceCriticalPath] -use std::fmt; -use engine_traits::Snapshot; -use kvproto::{raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; -use raft::eraftpb::Snapshot as RaftSnapshot; +use kvproto::{ + metapb, + raft_cmdpb::{RaftCmdRequest, RaftRequestHeader}, + raft_serverpb::RaftMessage, +}; use raftstore::store::{metrics::RaftEventDurationType, FetchedLogs, GenSnapRes}; use tikv_util::time::Instant; @@ -15,13 +16,13 @@ use super::{ }, ApplyRes, }; -use crate::operation::SplitInit; +use crate::operation::{RequestSplit, SimpleWriteBinary, SplitInit}; #[derive(Debug, Clone, Copy, PartialEq, Hash)] #[repr(u8)] pub enum PeerTick { Raft = 0, - RaftLogGc = 1, + CompactLog = 1, SplitRegionCheck = 2, PdHeartbeat = 3, CheckMerge = 4, @@ -40,7 +41,7 @@ impl PeerTick { pub fn tag(self) -> &'static str { match self { PeerTick::Raft => "raft", - PeerTick::RaftLogGc => "raft_log_gc", + PeerTick::CompactLog => "compact_log", PeerTick::SplitRegionCheck => "split_region_check", PeerTick::PdHeartbeat => "pd_heartbeat", PeerTick::CheckMerge => "check_merge", @@ -56,7 +57,7 @@ impl PeerTick { pub const fn all_ticks() -> &'static [PeerTick] { const TICKS: &[PeerTick] = &[ PeerTick::Raft, - PeerTick::RaftLogGc, + PeerTick::CompactLog, PeerTick::SplitRegionCheck, PeerTick::PdHeartbeat, PeerTick::CheckMerge, @@ -93,6 +94,7 @@ impl StoreTick { } /// Command that can be handled by raftstore. +#[derive(Debug)] pub struct RaftRequest { pub send_time: Instant, pub request: RaftCmdRequest, @@ -109,7 +111,22 @@ impl RaftRequest { } } +#[derive(Debug)] +pub struct SimpleWrite { + pub send_time: Instant, + pub header: Box, + pub data: SimpleWriteBinary, + pub ch: CmdResChannel, +} + +#[derive(Debug)] +pub struct UnsafeWrite { + pub send_time: Instant, + pub data: SimpleWriteBinary, +} + /// Message that can be sent to a peer. +#[derive(Debug)] pub enum PeerMsg { /// Raft message is the message sent between raft nodes in the same /// raft group. 
Messages need to be redirected to raftstore if target @@ -120,7 +137,10 @@ pub enum PeerMsg { RaftQuery(RaftRequest), /// Command changes the inernal states. It will be transformed into logs and /// applied on all replicas. - RaftCommand(RaftRequest), + SimpleWrite(SimpleWrite), + UnsafeWrite(UnsafeWrite), + /// Command that contains admin requests. + AdminCommand(RaftRequest), /// Tick is periodical task. If target peer doesn't exist there is a /// potential that the raft node will not work anymore. Tick(PeerTick), @@ -132,6 +152,7 @@ pub enum PeerMsg { Start, /// Messages from peer to peer in the same store SplitInit(Box), + SplitInitFinish(u64), /// A message only used to notify a peer. Noop, /// A message that indicates an asynchronous write has finished. @@ -140,6 +161,27 @@ pub enum PeerMsg { ready_number: u64, }, QueryDebugInfo(DebugInfoChannel), + DataFlushed { + cf: &'static str, + tablet_index: u64, + flushed_index: u64, + }, + PeerUnreachable { + to_peer_id: u64, + }, + StoreUnreachable { + to_store_id: u64, + }, + /// Reports whether the snapshot sending is successful or not. + SnapshotSent { + to_peer_id: u64, + status: raft::SnapshotStatus, + }, + RequestSplit { + request: RequestSplit, + ch: CmdResChannel, + }, + ForceCompactLog, /// A message that used to check if a flush is happened. 
#[cfg(feature = "testexport")] WaitFlush(super::FlushChannel), @@ -151,60 +193,59 @@ impl PeerMsg { (PeerMsg::RaftQuery(RaftRequest::new(req, ch)), sub) } - pub fn raft_command(req: RaftCmdRequest) -> (Self, CmdResSubscriber) { + pub fn admin_command(req: RaftCmdRequest) -> (Self, CmdResSubscriber) { let (ch, sub) = CmdResChannel::pair(); - (PeerMsg::RaftCommand(RaftRequest::new(req, ch)), sub) + (PeerMsg::AdminCommand(RaftRequest::new(req, ch)), sub) } -} -impl fmt::Debug for PeerMsg { - fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - PeerMsg::RaftMessage(_) => write!(fmt, "Raft Message"), - PeerMsg::RaftQuery(_) => write!(fmt, "Raft Query"), - PeerMsg::RaftCommand(_) => write!(fmt, "Raft Command"), - PeerMsg::Tick(tick) => write! { - fmt, - "{:?}", - tick + pub fn simple_write( + header: Box, + data: SimpleWriteBinary, + ) -> (Self, CmdResSubscriber) { + let (ch, sub) = CmdResChannel::pair(); + ( + PeerMsg::SimpleWrite(SimpleWrite { + send_time: Instant::now(), + header, + data, + ch, + }), + sub, + ) + } + + pub fn unsafe_write(data: SimpleWriteBinary) -> Self { + PeerMsg::UnsafeWrite(UnsafeWrite { + send_time: Instant::now(), + data, + }) + } + + pub fn request_split( + epoch: metapb::RegionEpoch, + split_keys: Vec>, + source: String, + ) -> (Self, CmdResSubscriber) { + let (ch, sub) = CmdResChannel::pair(); + ( + PeerMsg::RequestSplit { + request: RequestSplit { + epoch, + split_keys, + source: source.into(), + }, + ch, }, - PeerMsg::ApplyRes(res) => write!(fmt, "ApplyRes {:?}", res), - PeerMsg::Start => write!(fmt, "Startup"), - PeerMsg::SplitInit(_) => { - write!(fmt, "Split initialization") - } - PeerMsg::Noop => write!(fmt, "Noop"), - PeerMsg::Persisted { - peer_id, - ready_number, - } => write!( - fmt, - "Persisted peer_id {}, ready_number {}", - peer_id, ready_number - ), - PeerMsg::LogsFetched(fetched) => write!(fmt, "LogsFetched {:?}", fetched), - PeerMsg::SnapshotGenerated(_) => write!(fmt, "SnapshotGenerated"), - 
PeerMsg::QueryDebugInfo(_) => write!(fmt, "QueryDebugInfo"), - #[cfg(feature = "testexport")] - PeerMsg::WaitFlush(_) => write!(fmt, "FlushMessages"), - } + sub, + ) } } +#[derive(Debug)] pub enum StoreMsg { RaftMessage(Box), SplitInit(Box), Tick(StoreTick), Start, -} - -impl fmt::Debug for StoreMsg { - fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - match *self { - StoreMsg::RaftMessage(_) => write!(fmt, "Raft Message"), - StoreMsg::SplitInit(_) => write!(fmt, "Split initialization"), - StoreMsg::Tick(tick) => write!(fmt, "StoreTick {:?}", tick), - StoreMsg::Start => write!(fmt, "Start store"), - } - } + StoreUnreachable { to_store_id: u64 }, } diff --git a/components/raftstore-v2/src/router/mod.rs b/components/raftstore-v2/src/router/mod.rs index a09b0593b80..d6846f61e4b 100644 --- a/components/raftstore-v2/src/router/mod.rs +++ b/components/raftstore-v2/src/router/mod.rs @@ -15,7 +15,7 @@ pub use self::{ internal_message::ApplyRes, message::{PeerMsg, PeerTick, RaftRequest, StoreMsg, StoreTick}, response_channel::{ - CmdResChannel, DebugInfoChannel, DebugInfoSubscriber, QueryResChannel, QueryResult, - ReadResponse, + CmdResChannel, CmdResChannelBuilder, CmdResEvent, CmdResStream, CmdResSubscriber, + DebugInfoChannel, DebugInfoSubscriber, QueryResChannel, QueryResult, ReadResponse, }, }; diff --git a/components/raftstore-v2/src/router/response_channel.rs b/components/raftstore-v2/src/router/response_channel.rs index b6da3c804f0..2cb75acccfc 100644 --- a/components/raftstore-v2/src/router/response_channel.rs +++ b/components/raftstore-v2/src/router/response_channel.rs @@ -24,7 +24,7 @@ use std::{ task::{Context, Poll}, }; -use futures::task::AtomicWaker; +use futures::{task::AtomicWaker, FutureExt, Stream}; use kvproto::{kvrpcpb::ExtraOp as TxnExtraOp, raft_cmdpb::RaftCmdResponse}; use raftstore::store::{ local_metrics::TimeTracker, msg::ErrorCallback, region_meta::RegionMeta, ReadCallback, @@ -47,7 +47,11 @@ struct EventCore { /// Event 0 and Event 
31 is reserved as payload and cancel respectively. /// Other events should be defined within [1, 30]. event: AtomicU64, + /// Even a channel supports multiple events, it's not necessary to trigger + /// all of them. `event_mask` is used to filter unnecessary events. + event_mask: u32, res: UnsafeCell>, + before_set: UnsafeCell>>, // Waker can be changed, need to use `AtomicWaker` to guarantee no data race. waker: AtomicWaker, } @@ -57,6 +61,10 @@ unsafe impl Send for EventCore {} const PAYLOAD_EVENT: u64 = 0; const CANCEL_EVENT: u64 = 31; +const fn event_mask_bit_of(event: u64) -> u32 { + 1 << event +} + #[inline] const fn subscribed_bit_of(event: u64) -> u64 { 1 << (event * 2) @@ -67,23 +75,14 @@ const fn fired_bit_of(event: u64) -> u64 { 1 << (event * 2 + 1) } -impl Default for EventCore { - #[inline] - fn default() -> Self { - Self { - event: AtomicU64::new(0), - res: UnsafeCell::new(None), - waker: AtomicWaker::new(), - } - } -} - impl EventCore { #[inline] fn notify_event(&self, event: u64) { - let previous = self.event.fetch_or(fired_bit_of(event), Ordering::AcqRel); - if previous & subscribed_bit_of(event) != 0 { - self.waker.wake() + if self.event_mask & event_mask_bit_of(event) != 0 { + let previous = self.event.fetch_or(fired_bit_of(event), Ordering::AcqRel); + if previous & subscribed_bit_of(event) != 0 { + self.waker.wake() + } } } @@ -91,8 +90,11 @@ impl EventCore { /// /// After this call, no events should be notified. 
#[inline] - fn set_result(&self, result: Res) { + fn set_result(&self, mut result: Res) { unsafe { + if let Some(cb) = (*self.before_set.get()).take() { + cb(&mut result); + } *self.res.get() = Some(result); } let previous = self.event.fetch_or( @@ -173,7 +175,7 @@ impl<'a, Res> Future for WaitEvent<'a, Res> { } struct WaitResult<'a, Res> { - core: &'a EventCore, + sub: &'a BaseSubscriber, } impl<'a, Res> Future for WaitResult<'a, Res> { @@ -181,16 +183,16 @@ impl<'a, Res> Future for WaitResult<'a, Res> { #[inline] fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { - let event = &self.core.event; + let event = &self.sub.core.event; let fired_bit = fired_bit_of(PAYLOAD_EVENT); let mut e = event.load(Ordering::Relaxed); if check_bit(e, fired_bit).is_some() { unsafe { - return Poll::Ready((*self.core.res.get()).take()); + return Poll::Ready((*self.sub.core.res.get()).take()); } } let subscribed_bit = subscribed_bit_of(PAYLOAD_EVENT); - self.core.waker.register(cx.waker()); + self.sub.core.waker.register(cx.waker()); loop { match event.compare_exchange_weak( e, @@ -203,7 +205,7 @@ impl<'a, Res> Future for WaitResult<'a, Res> { }; if check_bit(e, fired_bit).is_some() { unsafe { - return Poll::Ready((*self.core.res.get()).take()); + return Poll::Ready((*self.sub.core.res.get()).take()); } } } @@ -219,7 +221,7 @@ impl BaseSubscriber { /// Wait for the result. #[inline] pub async fn result(self) -> Option { - WaitResult { core: &self.core }.await + WaitResult { sub: &self }.await } /// Test if the result is ready without any polling. @@ -242,7 +244,17 @@ impl BaseChannel { /// Creates a pair of channel and subscriber. 
#[inline] pub fn pair() -> (Self, BaseSubscriber) { - let core: Arc> = Arc::default(); + Self::with_mask(u32::MAX) + } + + fn with_mask(mask: u32) -> (Self, BaseSubscriber) { + let core: Arc> = Arc::new(EventCore { + event: AtomicU64::new(0), + res: UnsafeCell::new(None), + event_mask: mask, + before_set: UnsafeCell::new(None), + waker: AtomicWaker::new(), + }); (Self { core: core.clone() }, BaseSubscriber { core }) } @@ -283,6 +295,122 @@ impl CmdResSubscriber { } } +#[derive(Clone, Copy, Debug)] +enum CmdResPollStage { + ExpectProposed, + ExpectCommitted, + ExpectResult, + Drained, +} + +impl CmdResPollStage { + #[inline] + fn init(event_mask: u32) -> CmdResPollStage { + if event_mask & event_mask_bit_of(CmdResChannel::PROPOSED_EVENT) != 0 { + CmdResPollStage::ExpectProposed + } else if event_mask & event_mask_bit_of(CmdResChannel::COMMITTED_EVENT) != 0 { + CmdResPollStage::ExpectCommitted + } else { + CmdResPollStage::ExpectResult + } + } + + #[inline] + fn next(&mut self, event_mask: u32) { + *self = match self { + CmdResPollStage::ExpectProposed => { + if event_mask & event_mask_bit_of(CmdResChannel::COMMITTED_EVENT) == 0 { + CmdResPollStage::ExpectResult + } else { + CmdResPollStage::ExpectCommitted + } + } + CmdResPollStage::ExpectCommitted => CmdResPollStage::ExpectResult, + CmdResPollStage::ExpectResult => CmdResPollStage::Drained, + CmdResPollStage::Drained => CmdResPollStage::Drained, + } + } +} + +#[derive(Debug)] +pub enum CmdResEvent { + Proposed, + Committed, + Finished(RaftCmdResponse), +} + +pub struct CmdResStream { + sub: CmdResSubscriber, + stage: CmdResPollStage, +} + +impl CmdResStream { + #[inline] + pub fn new(sub: CmdResSubscriber) -> Self { + Self { + stage: CmdResPollStage::init(sub.core.event_mask), + sub, + } + } +} + +impl Stream for CmdResStream { + type Item = CmdResEvent; + + #[inline] + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let stream = self.get_mut(); + loop { + match stream.stage { + 
CmdResPollStage::ExpectProposed => { + match (WaitEvent { + event: CmdResChannel::PROPOSED_EVENT, + core: &stream.sub.core, + }) + .poll_unpin(cx) + { + Poll::Pending => return Poll::Pending, + Poll::Ready(b) => { + stream.stage.next(stream.sub.core.event_mask); + if b { + return Poll::Ready(Some(CmdResEvent::Proposed)); + } + } + } + } + CmdResPollStage::ExpectCommitted => { + match (WaitEvent { + event: CmdResChannel::COMMITTED_EVENT, + core: &stream.sub.core, + }) + .poll_unpin(cx) + { + Poll::Pending => return Poll::Pending, + Poll::Ready(b) => { + stream.stage.next(stream.sub.core.event_mask); + if b { + return Poll::Ready(Some(CmdResEvent::Committed)); + } + } + } + } + CmdResPollStage::ExpectResult => { + match (WaitResult { sub: &stream.sub }).poll_unpin(cx) { + Poll::Pending => return Poll::Pending, + Poll::Ready(res) => { + stream.stage.next(stream.sub.core.event_mask); + if let Some(res) = res { + return Poll::Ready(Some(CmdResEvent::Finished(res))); + } + } + } + } + CmdResPollStage::Drained => return Poll::Ready(None), + } + } + } +} + pub type CmdResChannel = BaseChannel; impl Debug for CmdResChannel { @@ -291,6 +419,46 @@ impl Debug for CmdResChannel { } } +#[derive(Default)] +pub struct CmdResChannelBuilder { + event_mask: u32, + before_set: Option>, +} + +impl CmdResChannelBuilder { + #[inline] + pub fn subscribe_proposed(&mut self) -> &mut Self { + self.event_mask |= event_mask_bit_of(CmdResChannel::PROPOSED_EVENT); + self + } + + #[inline] + pub fn subscribe_committed(&mut self) -> &mut Self { + self.event_mask |= event_mask_bit_of(CmdResChannel::COMMITTED_EVENT); + self + } + + #[inline] + pub fn before_set( + &mut self, + f: impl FnOnce(&mut RaftCmdResponse) + Send + 'static, + ) -> &mut Self { + self.before_set = Some(Box::new(f)); + self + } + + #[inline] + pub fn build(self) -> (CmdResChannel, CmdResSubscriber) { + let (c, s) = CmdResChannel::with_mask(self.event_mask); + if let Some(f) = self.before_set { + unsafe { + 
*c.core.before_set.get() = Some(f); + } + } + (c, s) + } +} + impl CmdResChannel { // Valid range is [1, 30] const PROPOSED_EVENT: u64 = 1; @@ -404,7 +572,7 @@ impl ReadCallback for QueryResChannel { type Response = QueryResult; #[inline] - fn set_result(mut self, res: QueryResult) { + fn set_result(self, res: QueryResult) { self.set_result(res); } @@ -424,14 +592,29 @@ impl fmt::Debug for QueryResChannel { pub type DebugInfoChannel = BaseChannel; pub type DebugInfoSubscriber = BaseSubscriber; +impl Debug for DebugInfoChannel { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "DebugInfoChannel") + } +} + #[cfg(feature = "testexport")] pub type FlushChannel = BaseChannel<()>; #[cfg(feature = "testexport")] pub type FlushSubscriber = BaseSubscriber<()>; +#[cfg(feature = "testexport")] +impl Debug for FlushChannel { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + write!(f, "FlushChannel") + } +} + #[cfg(test)] mod tests { - use futures::executor::block_on; + use std::assert_matches::assert_matches; + + use futures::{executor::block_on, StreamExt}; use super::*; @@ -482,4 +665,44 @@ mod tests { chan.set_result(read.clone()); assert_eq!(block_on(sub.result()).unwrap(), read); } + + #[test] + fn test_cmd_res_stream() { + let mut builder = CmdResChannelBuilder::default(); + builder.before_set(|res| { + res.mut_header().set_current_term(6); + }); + let (chan, sub) = builder.build(); + let mut stream = CmdResStream::new(sub); + chan.set_result(RaftCmdResponse::default()); + assert_matches!(block_on(stream.next()), Some(CmdResEvent::Finished(res)) if res.get_header().get_current_term() == 6); + + // When using builder, no event is subscribed by default. 
+ let (mut chan, sub) = CmdResChannelBuilder::default().build(); + let mut stream = CmdResStream::new(sub); + chan.notify_proposed(); + chan.notify_committed(); + drop(chan); + assert_matches!(block_on(stream.next()), None); + + let mut builder = CmdResChannelBuilder::default(); + builder.subscribe_proposed(); + let (mut chan, sub) = builder.build(); + let mut stream = CmdResStream::new(sub); + chan.notify_proposed(); + chan.notify_committed(); + assert_matches!(block_on(stream.next()), Some(CmdResEvent::Proposed)); + drop(chan); + assert_matches!(block_on(stream.next()), None); + + let mut builder = CmdResChannelBuilder::default(); + builder.subscribe_committed(); + let (mut chan, sub) = builder.build(); + let mut stream = CmdResStream::new(sub); + chan.notify_proposed(); + chan.notify_committed(); + assert_matches!(block_on(stream.next()), Some(CmdResEvent::Committed)); + drop(chan); + assert_matches!(block_on(stream.next()), None); + } } diff --git a/components/raftstore-v2/src/worker/mod.rs b/components/raftstore-v2/src/worker/mod.rs index ad8249d22a4..6fafd01df85 100644 --- a/components/raftstore-v2/src/worker/mod.rs +++ b/components/raftstore-v2/src/worker/mod.rs @@ -1,5 +1,4 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-mod pd; - -pub use pd::{RegionHeartbeatTask as PdRegionHeartbeatTask, Runner as PdRunner, Task as PdTask}; +pub mod pd; +pub mod tablet_gc; diff --git a/components/raftstore-v2/src/worker/pd/mod.rs b/components/raftstore-v2/src/worker/pd/mod.rs index 132678e21f2..cc977e68236 100644 --- a/components/raftstore-v2/src/worker/pd/mod.rs +++ b/components/raftstore-v2/src/worker/pd/mod.rs @@ -2,24 +2,27 @@ use std::{ fmt::{self, Display, Formatter}, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, + sync::{atomic::AtomicBool, Arc}, }; use causal_ts::CausalTsProviderImpl; use collections::HashMap; use concurrency_manager::ConcurrencyManager; -use engine_traits::{KvEngine, RaftEngine, TabletFactory}; +use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; use kvproto::{metapb, pdpb}; use pd_client::PdClient; -use raftstore::store::{util::KeysInfoFormatter, TxnExt}; +use raftstore::store::{util::KeysInfoFormatter, FlowStatsReporter, ReadStats, TxnExt, WriteStats}; use slog::{error, info, Logger}; -use tikv_util::{time::UnixSecs, worker::Runnable}; +use tikv_util::{ + time::UnixSecs, + worker::{Runnable, Scheduler}, +}; use yatp::{task::future::TaskCell, Remote}; -use crate::{batch::StoreRouter, router::PeerMsg}; +use crate::{ + batch::StoreRouter, + router::{CmdResChannel, PeerMsg}, +}; mod region_heartbeat; mod split; @@ -42,6 +45,7 @@ pub enum Task { split_keys: Vec>, peer: metapb::Peer, right_derive: bool, + ch: CmdResChannel, }, ReportBatchSplit { regions: Vec, @@ -97,7 +101,7 @@ where store_id: u64, pd_client: Arc, raft_engine: ER, - tablet_factory: Arc>, + tablet_registry: TabletRegistry, router: StoreRouter, remote: Remote, @@ -130,7 +134,7 @@ where store_id: u64, pd_client: Arc, raft_engine: ER, - tablet_factory: Arc>, + tablet_registry: TabletRegistry, router: StoreRouter, remote: Remote, concurrency_manager: ConcurrencyManager, @@ -142,7 +146,7 @@ where store_id, pd_client, raft_engine, - tablet_factory, + tablet_registry, router, remote, region_peers: 
HashMap::default(), @@ -177,7 +181,8 @@ where split_keys, peer, right_derive, - } => self.handle_ask_batch_split(region, split_keys, peer, right_derive), + ch, + } => self.handle_ask_batch_split(region, split_keys, peer, right_derive, ch), Task::ReportBatchSplit { regions } => self.handle_report_batch_split(regions), Task::UpdateMaxTimestamp { region_id, @@ -204,14 +209,37 @@ where } } -pub mod requests { +#[derive(Clone)] +pub struct FlowReporter { + _scheduler: Scheduler, +} + +impl FlowReporter { + pub fn new(scheduler: Scheduler) -> Self { + FlowReporter { + _scheduler: scheduler, + } + } +} + +impl FlowStatsReporter for FlowReporter { + fn report_read_stats(&self, _read_stats: ReadStats) { + // TODO + } + + fn report_write_stats(&self, _write_stats: WriteStats) { + // TODO + } +} + +mod requests { use kvproto::raft_cmdpb::{ AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, - SplitRequest, }; use raft::eraftpb::ConfChangeType; use super::*; + use crate::router::RaftRequest; pub fn send_admin_request( logger: &Logger, @@ -220,6 +248,7 @@ pub mod requests { epoch: metapb::RegionEpoch, peer: metapb::Peer, request: AdminRequest, + ch: Option, ) where EK: KvEngine, ER: RaftEngine, @@ -232,7 +261,10 @@ pub mod requests { req.mut_header().set_peer(peer); req.set_admin_request(request); - let (msg, _) = PeerMsg::raft_command(req); + let msg = match ch { + Some(ch) => PeerMsg::AdminCommand(RaftRequest::new(req, ch)), + None => PeerMsg::admin_command(req).0, + }; if let Err(e) = router.send(region_id, msg) { error!( logger, @@ -271,41 +303,6 @@ pub mod requests { req } - pub fn new_split_region_request( - split_key: Vec, - new_region_id: u64, - peer_ids: Vec, - right_derive: bool, - ) -> AdminRequest { - let mut req = AdminRequest::default(); - req.set_cmd_type(AdminCmdType::Split); - req.mut_split().set_split_key(split_key); - req.mut_split().set_new_region_id(new_region_id); - req.mut_split().set_new_peer_ids(peer_ids); - 
req.mut_split().set_right_derive(right_derive); - req - } - - pub fn new_batch_split_region_request( - split_keys: Vec>, - ids: Vec, - right_derive: bool, - ) -> AdminRequest { - let mut req = AdminRequest::default(); - req.set_cmd_type(AdminCmdType::BatchSplit); - req.mut_splits().set_right_derive(right_derive); - let mut requests = Vec::with_capacity(ids.len()); - for (mut id, key) in ids.into_iter().zip(split_keys) { - let mut split = SplitRequest::default(); - split.set_split_key(key); - split.set_new_region_id(id.get_new_region_id()); - split.set_new_peer_ids(id.take_new_peer_ids()); - requests.push(split); - } - req.mut_splits().set_requests(requests.into()); - req - } - pub fn new_transfer_leader_request( peer: metapb::Peer, peers: Vec, @@ -316,12 +313,4 @@ pub mod requests { req.mut_transfer_leader().set_peers(peers.into()); req } - - pub fn new_merge_request(merge: pdpb::Merge) -> AdminRequest { - let mut req = AdminRequest::default(); - req.set_cmd_type(AdminCmdType::PrepareMerge); - req.mut_prepare_merge() - .set_target(merge.get_target().to_owned()); - req - } } diff --git a/components/raftstore-v2/src/worker/pd/region_heartbeat.rs b/components/raftstore-v2/src/worker/pd/region_heartbeat.rs index ad0293d0b6d..31f84801ed2 100644 --- a/components/raftstore-v2/src/worker/pd/region_heartbeat.rs +++ b/components/raftstore-v2/src/worker/pd/region_heartbeat.rs @@ -3,18 +3,9 @@ use std::time::Duration; use engine_traits::{KvEngine, RaftEngine}; -use kvproto::{ - metapb, pdpb, - raft_cmdpb::{ - AdminCmdType, AdminRequest, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, - SplitRequest, - }, - raft_serverpb::RaftMessage, - replication_modepb::{RegionReplicationStatus, StoreDrAutoSyncStatus}, -}; +use kvproto::{metapb, pdpb}; use pd_client::{metrics::PD_HEARTBEAT_COUNTER_VEC, PdClient, RegionStat}; -use raft::eraftpb::ConfChangeType; -use slog::{debug, error, info}; +use slog::{debug, info}; use tikv_util::{store::QueryStats, time::UnixSecs}; use 
super::{requests::*, Runner}; @@ -193,7 +184,7 @@ where change_peer.get_change_type(), change_peer.take_peer(), ); - send_admin_request(&logger, &router, region_id, epoch, peer, req); + send_admin_request(&logger, &router, region_id, epoch, peer, req, None); } else if resp.has_change_peer_v2() { PD_HEARTBEAT_COUNTER_VEC .with_label_values(&["change peer"]) @@ -207,7 +198,7 @@ where "changes" => ?change_peer_v2.get_changes(), ); let req = new_change_peer_v2_request(change_peer_v2.take_changes().into()); - send_admin_request(&logger, &router, region_id, epoch, peer, req); + send_admin_request(&logger, &router, region_id, epoch, peer, req, None); } else if resp.has_transfer_leader() { PD_HEARTBEAT_COUNTER_VEC .with_label_values(&["transfer leader"]) @@ -226,7 +217,7 @@ where transfer_leader.take_peer(), transfer_leader.take_peers().into(), ); - send_admin_request(&logger, &router, region_id, epoch, peer, req); + send_admin_request(&logger, &router, region_id, epoch, peer, req, None); } else if resp.has_split_region() { // TODO info!(logger, "pd asks for split but ignored"); diff --git a/components/raftstore-v2/src/worker/pd/split.rs b/components/raftstore-v2/src/worker/pd/split.rs index 3cb85f6698c..cb7c3ad9308 100644 --- a/components/raftstore-v2/src/worker/pd/split.rs +++ b/components/raftstore-v2/src/worker/pd/split.rs @@ -9,6 +9,7 @@ use pd_client::PdClient; use slog::{info, warn}; use super::{requests::*, Runner}; +use crate::router::CmdResChannel; fn new_batch_split_region_request( split_keys: Vec>, @@ -42,6 +43,7 @@ where split_keys: Vec>, peer: metapb::Peer, right_derive: bool, + ch: CmdResChannel, ) { if split_keys.is_empty() { info!(self.logger, "empty split key, skip ask batch split"; @@ -71,7 +73,7 @@ where ); let region_id = region.get_id(); let epoch = region.take_region_epoch(); - send_admin_request(&logger, &router, region_id, epoch, peer, req); + send_admin_request(&logger, &router, region_id, epoch, peer, req, Some(ch)); } Err(e) => { warn!( diff 
--git a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs index 1caa96a5225..2fbe378cff8 100644 --- a/components/raftstore-v2/src/worker/pd/store_heartbeat.rs +++ b/components/raftstore-v2/src/worker/pd/store_heartbeat.rs @@ -247,7 +247,6 @@ where // TODO: slow score - let router = self.router.clone(); let resp = self.pd_client.store_heartbeat(stats, None, None); let logger = self.logger.clone(); let f = async move { @@ -260,12 +259,12 @@ where /// Returns (capacity, used, available). fn collect_engine_size(&self) -> Option<(u64, u64, u64)> { - let disk_stats = match fs2::statvfs(self.tablet_factory.tablets_path()) { + let disk_stats = match fs2::statvfs(self.tablet_registry.tablet_root()) { Err(e) => { error!( self.logger, "get disk stat for rocksdb failed"; - "engine_path" => self.tablet_factory.tablets_path().display(), + "engine_path" => self.tablet_registry.tablet_root().display(), "err" => ?e ); return None; diff --git a/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs b/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs index cbfecb8171d..0de3fb9a87c 100644 --- a/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs +++ b/components/raftstore-v2/src/worker/pd/update_max_timestamp.rs @@ -7,7 +7,6 @@ use std::{ use causal_ts::CausalTsProvider; use engine_traits::{KvEngine, RaftEngine}; -use fail::fail_point; use futures::{compat::Future01CompatExt, FutureExt}; use pd_client::PdClient; use raftstore::{store::TxnExt, Result}; @@ -96,7 +95,7 @@ where #[cfg(feature = "failpoints")] let delay = (|| { - fail_point!("delay_update_max_ts", |_| true); + fail::fail_point!("delay_update_max_ts", |_| true); false })(); #[cfg(not(feature = "failpoints"))] diff --git a/components/raftstore-v2/src/worker/tablet_gc.rs b/components/raftstore-v2/src/worker/tablet_gc.rs new file mode 100644 index 00000000000..cc1fcd971e9 --- /dev/null +++ 
b/components/raftstore-v2/src/worker/tablet_gc.rs @@ -0,0 +1,227 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + fmt::{self, Display, Formatter}, + path::{Path, PathBuf}, + time::Duration, +}; + +use collections::HashMap; +use engine_traits::{DeleteStrategy, KvEngine, Range, TabletContext, TabletRegistry}; +use kvproto::metapb::Region; +use slog::{error, warn, Logger}; +use tikv_util::worker::{Runnable, RunnableWithTimer}; + +pub enum Task { + Trim { + tablet: EK, + start_key: Box<[u8]>, + end_key: Box<[u8]>, + }, + PrepareDestroy { + tablet: EK, + region_id: u64, + wait_for_persisted: u64, + }, + Destroy { + region_id: u64, + persisted_index: u64, + }, +} + +impl Display for Task { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match *self { + Task::Trim { + ref start_key, + ref end_key, + .. + } => write!( + f, + "trim tablet for start_key {}, end_key {}", + log_wrappers::Value::key(start_key), + log_wrappers::Value::key(end_key), + ), + Task::PrepareDestroy { + region_id, + wait_for_persisted, + .. 
+ } => write!( + f, + "prepare destroy tablet for region_id {}, wait_for_persisted {}", + region_id, wait_for_persisted, + ), + Task::Destroy { + region_id, + persisted_index, + } => write!( + f, + "destroy tablet for region_id {} persisted_index {}", + region_id, persisted_index, + ), + } + } +} + +impl Task { + #[inline] + pub fn trim(tablet: EK, region: &Region) -> Self { + Task::Trim { + tablet, + start_key: region.get_start_key().into(), + end_key: region.get_end_key().into(), + } + } + + #[inline] + pub fn prepare_destroy(tablet: EK, region_id: u64, wait_for_persisted: u64) -> Self { + Task::PrepareDestroy { + tablet, + region_id, + wait_for_persisted, + } + } + + #[inline] + pub fn destroy(region_id: u64, persisted_index: u64) -> Self { + Task::Destroy { + region_id, + persisted_index, + } + } +} + +pub struct Runner { + tablet_registry: TabletRegistry, + logger: Logger, + + // region_id -> [(tablet_path, wait_for_persisted)]. + waiting_destroy_tasks: HashMap>, + pending_destroy_tasks: Vec, +} + +impl Runner { + pub fn new(tablet_registry: TabletRegistry, logger: Logger) -> Self { + Self { + tablet_registry, + logger, + waiting_destroy_tasks: HashMap::default(), + pending_destroy_tasks: Vec::new(), + } + } + + fn trim(tablet: &EK, start_key: &[u8], end_key: &[u8]) -> engine_traits::Result<()> { + let start_key = keys::data_key(start_key); + let end_key = keys::data_end_key(end_key); + let range1 = Range::new(&[], &start_key); + let range2 = Range::new(&end_key, keys::DATA_MAX_KEY); + tablet.delete_ranges_cfs(DeleteStrategy::DeleteFiles, &[range1, range2])?; + // TODO: Avoid this after compaction filter is ready. 
+ tablet.delete_ranges_cfs(DeleteStrategy::DeleteByRange, &[range1, range2])?; + for r in [range1, range2] { + tablet.compact_range(Some(r.start_key), Some(r.end_key), false, 1)?; + } + Ok(()) + } + + fn prepare_destroy(&mut self, region_id: u64, tablet: EK, wait_for_persisted: u64) { + let _ = tablet.pause_background_work(); + self.waiting_destroy_tasks + .entry(region_id) + .or_default() + .push((PathBuf::from(tablet.path()), wait_for_persisted)); + } + + fn destroy(&mut self, region_id: u64, persisted: u64) { + if let Some(v) = self.waiting_destroy_tasks.get_mut(®ion_id) { + v.retain(|(path, wait)| { + if *wait <= persisted { + if !Self::process_destroy_task(&self.logger, &self.tablet_registry, path) { + self.pending_destroy_tasks.push(path.clone()); + } + return false; + } + true + }); + } + } + + /// Returns true if task is consumed. Failure is considered consumed. + fn process_destroy_task(logger: &Logger, registry: &TabletRegistry, path: &Path) -> bool { + match EK::locked(path.to_str().unwrap()) { + Err(e) => warn!( + logger, + "failed to check whether the tablet path is locked"; + "err" => ?e, + "path" => path.display(), + ), + Ok(false) => { + // TODO: use a meaningful table context. 
+ let _ = registry + .tablet_factory() + .destroy_tablet(TabletContext::with_infinite_region(0, None), path) + .map_err(|e| { + warn!( + logger, + "failed to destroy tablet"; + "err" => ?e, + "path" => path.display(), + ) + }); + return true; + } + _ => {} + } + false + } +} + +impl Runnable for Runner +where + EK: KvEngine, +{ + type Task = Task; + + fn run(&mut self, task: Task) { + match task { + Task::Trim { + tablet, + start_key, + end_key, + } => { + if let Err(e) = Self::trim(&tablet, &start_key, &end_key) { + error!( + self.logger, + "failed to trim tablet"; + "start_key" => log_wrappers::Value::key(&start_key), + "end_key" => log_wrappers::Value::key(&end_key), + "err" => %e, + ); + } + } + Task::PrepareDestroy { + region_id, + tablet, + wait_for_persisted, + } => self.prepare_destroy(region_id, tablet, wait_for_persisted), + Task::Destroy { + region_id, + persisted_index, + } => self.destroy(region_id, persisted_index), + } + } +} + +impl RunnableWithTimer for Runner +where + EK: KvEngine, +{ + fn on_timeout(&mut self) { + self.pending_destroy_tasks + .retain(|task| !Self::process_destroy_task(&self.logger, &self.tablet_registry, task)); + } + + fn get_interval(&self) -> Duration { + Duration::from_secs(2) + } +} diff --git a/components/raftstore-v2/tests/failpoints/mod.rs b/components/raftstore-v2/tests/failpoints/mod.rs index 26403f2f0a3..d04ad2cafc2 100644 --- a/components/raftstore-v2/tests/failpoints/mod.rs +++ b/components/raftstore-v2/tests/failpoints/mod.rs @@ -10,3 +10,5 @@ mod cluster; mod test_basic_write; mod test_bootstrap; +mod test_split; +mod test_trace_apply; diff --git a/components/raftstore-v2/tests/failpoints/test_basic_write.rs b/components/raftstore-v2/tests/failpoints/test_basic_write.rs index 4bf4201f67c..55d85b90fa4 100644 --- a/components/raftstore-v2/tests/failpoints/test_basic_write.rs +++ b/components/raftstore-v2/tests/failpoints/test_basic_write.rs @@ -2,44 +2,36 @@ use std::{assert_matches::assert_matches, time::Duration}; 
-use engine_traits::{OpenOptions, Peekable, TabletFactory}; +use engine_traits::{Peekable, CF_DEFAULT}; use futures::executor::block_on; -use kvproto::raft_cmdpb::{CmdType, Request}; -use raftstore_v2::router::PeerMsg; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; use crate::cluster::Cluster; /// Check if write batch is correctly maintained during apply. #[test] fn test_write_batch_rollback() { - let cluster = Cluster::default(); - let router = cluster.router(0); - let mut req = router.new_request_for(2); - let mut put_req = Request::default(); - put_req.set_cmd_type(CmdType::Put); - put_req.mut_put().set_key(b"key".to_vec()); - put_req.mut_put().set_value(b"value".to_vec()); - req.mut_requests().push(put_req.clone()); + let mut cluster = Cluster::default(); + let router = &mut cluster.routers[0]; + let header = Box::new(router.new_request_for(2).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); router.wait_applied_to_current_term(2, Duration::from_secs(3)); // Make several entries to batch in apply thread. fail::cfg("APPLY_COMMITTED_ENTRIES", "pause").unwrap(); - let tablet_factory = cluster.node(0).tablet_factory(); - let tablet = tablet_factory - .open_tablet(2, None, OpenOptions::default().set_cache_only(true)) - .unwrap(); - // Good proposal should be committed. - let (msg, mut sub0) = PeerMsg::raft_command(req.clone()); + let (msg, mut sub0) = PeerMsg::simple_write(header.clone(), put.encode()); router.send(2, msg).unwrap(); assert!(block_on(sub0.wait_proposed())); assert!(block_on(sub0.wait_committed())); // If the write batch is correctly initialized, next write should not contain // last result. 
- req.mut_requests()[0].mut_put().set_key(b"key1".to_vec()); - let (msg, mut sub1) = PeerMsg::raft_command(req.clone()); + put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key1", b"value"); + let (msg, mut sub1) = PeerMsg::simple_write(header.clone(), put.encode()); router.send(2, msg).unwrap(); assert!(block_on(sub1.wait_proposed())); assert!(block_on(sub1.wait_committed())); @@ -60,22 +52,26 @@ fn test_write_batch_rollback() { ); let resp = block_on(sub1.result()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); - assert_matches!(tablet.get_value(b"key"), Ok(None)); - assert_eq!(tablet.get_value(b"key1").unwrap().unwrap(), b"value"); + + let snap = router.stale_snapshot(2); + assert_matches!(snap.get_value(b"key"), Ok(None)); + assert_eq!(snap.get_value(b"key1").unwrap().unwrap(), b"value"); fail::cfg("APPLY_COMMITTED_ENTRIES", "pause").unwrap(); // Trigger error again, so an initialized write batch should be rolled back. - req.mut_requests()[0].mut_put().set_key(b"key2".to_vec()); - let (msg, mut sub0) = PeerMsg::raft_command(req.clone()); + put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key2", b"value"); + let (msg, mut sub0) = PeerMsg::simple_write(header.clone(), put.encode()); router.send(2, msg).unwrap(); assert!(block_on(sub0.wait_proposed())); assert!(block_on(sub0.wait_committed())); // If the write batch is correctly rollbacked, next write should not contain // last result. 
- req.mut_requests()[0].mut_put().set_key(b"key3".to_vec()); - let (msg, mut sub1) = PeerMsg::raft_command(req.clone()); + put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key3", b"value"); + let (msg, mut sub1) = PeerMsg::simple_write(header, put.encode()); router.send(2, msg).unwrap(); assert!(block_on(sub1.wait_proposed())); assert!(block_on(sub1.wait_committed())); @@ -93,6 +89,7 @@ fn test_write_batch_rollback() { ); let resp = block_on(sub1.result()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); - assert_matches!(tablet.get_value(b"key2"), Ok(None)); - assert_eq!(tablet.get_value(b"key3").unwrap().unwrap(), b"value"); + let snap = router.stale_snapshot(2); + assert_matches!(snap.get_value(b"key2"), Ok(None)); + assert_eq!(snap.get_value(b"key3").unwrap().unwrap(), b"value"); } diff --git a/components/raftstore-v2/tests/failpoints/test_split.rs b/components/raftstore-v2/tests/failpoints/test_split.rs new file mode 100644 index 00000000000..79356ae5805 --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/test_split.rs @@ -0,0 +1,106 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + thread, + time::{Duration, Instant}, +}; + +use engine_traits::{RaftEngineReadOnly, CF_DEFAULT}; +use futures::executor::block_on; +use raftstore::store::RAFT_INIT_LOG_INDEX; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; + +use crate::cluster::{split_helper::split_region, Cluster}; + +/// If a node is restarted after metadata is persisted before tablet is not +/// installed, it should resume install the tablet. 
+#[test] +fn test_restart_resume() { + let mut cluster = Cluster::default(); + let raft_engine = cluster.node(0).running_state().unwrap().raft_engine.clone(); + let router = &mut cluster.routers[0]; + + let region_id = 2; + let region = router.region_detail(region_id); + let peer = region.get_peers()[0].clone(); + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + + let fp = "async_write_before_cb"; + fail::cfg(fp, "return").unwrap(); + + let split_region_id = 1000; + let mut new_peer = peer.clone(); + new_peer.set_id(1001); + split_region( + router, + region, + peer, + split_region_id, + new_peer, + None, + None, + b"k11", + b"k11", + true, + ); + + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"k22", b"value"); + let header = Box::new(router.new_request_for(region_id).take_header()); + let (msg, mut sub) = PeerMsg::simple_write(header, put.encode()); + router.send(region_id, msg).unwrap(); + // Send a command to ensure split init is triggered. + block_on(sub.wait_proposed()); + + let region_state = raft_engine + .get_region_state(split_region_id, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + let path = cluster + .node(0) + .tablet_registry() + .tablet_path(split_region_id, RAFT_INIT_LOG_INDEX); + assert!(!path.exists(), "{} should not exist", path.display()); + drop(raft_engine); + + cluster.restart(0); + // If split is resumed, the tablet should be installed. + assert!( + path.exists(), + "{} should exist after restart", + path.display() + ); + + // Both region should be recovered correctly. 
+ let cases = vec![ + (split_region_id, b"k01", b"v01"), + (region_id, b"k21", b"v21"), + ]; + let router = &mut cluster.routers[0]; + let new_epoch = router + .new_request_for(split_region_id) + .take_header() + .take_region_epoch(); + let timer = Instant::now(); + for (region_id, key, val) in cases { + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, val); + let mut header = Box::new(router.new_request_for(region_id).take_header()); + while timer.elapsed() < Duration::from_secs(3) { + // We need to wait till source peer replay split. + if *header.get_region_epoch() != new_epoch { + thread::sleep(Duration::from_millis(100)); + header = Box::new(router.new_request_for(region_id).take_header()); + continue; + } + break; + } + assert_eq!(*header.get_region_epoch(), new_epoch, "{:?}", header); + let (msg, sub) = PeerMsg::simple_write(header, put.encode()); + router.send(region_id, msg).unwrap(); + // Send a command to ensure split init is triggered. + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + } +} diff --git a/components/raftstore-v2/tests/failpoints/test_trace_apply.rs b/components/raftstore-v2/tests/failpoints/test_trace_apply.rs new file mode 100644 index 00000000000..15bf39d17ba --- /dev/null +++ b/components/raftstore-v2/tests/failpoints/test_trace_apply.rs @@ -0,0 +1,7 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +// TODO: check if it can recover from: +// - split not start +// - split not finish +// - two pending split the second one finished before the first one +// - all split finish diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 24184233117..4c025a0fc85 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -5,7 +5,7 @@ use std::{ path::Path, sync::{ atomic::{AtomicUsize, Ordering}, - Arc, Mutex, + Arc, }, thread, time::{Duration, Instant}, @@ -17,26 +17,29 @@ use concurrency_manager::ConcurrencyManager; use crossbeam::channel::{self, Receiver, Sender, TrySendError}; use engine_test::{ ctor::{CfOptions, DbOptions}, - kv::{KvTestEngine, TestTabletFactoryV2}, + kv::{KvTestEngine, KvTestSnapshot, TestTabletFactory}, raft::RaftTestEngine, }; -use engine_traits::{OpenOptions, TabletFactory, ALL_CFS}; +use engine_traits::{TabletContext, TabletRegistry, DATA_CFS}; use futures::executor::block_on; use kvproto::{ metapb::{self, RegionEpoch, Store}, - raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, + raft_cmdpb::{CmdType, RaftCmdRequest, RaftCmdResponse, RaftRequestHeader, Request}, raft_serverpb::RaftMessage, }; use pd_client::RpcClient; use raft::eraftpb::MessageType; -use raftstore::store::{ - region_meta::{RegionLocalState, RegionMeta}, - Config, TabletSnapKey, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX, +use raftstore::{ + coprocessor::CoprocessorHost, + store::{ + region_meta::{RegionLocalState, RegionMeta}, + Config, RegionSnapshot, TabletSnapKey, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX, + }, }; use raftstore_v2::{ create_store_batch_system, router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter}, - Bootstrap, StoreMeta, StoreSystem, + Bootstrap, SimpleWriteEncoder, StateStorage, StoreSystem, }; use slog::{debug, o, Logger}; use tempfile::TempDir; @@ -44,9 +47,22 @@ use 
test_pd::mocker::Service; use tikv_util::{ config::{ReadableDuration, VersionTrack}, store::new_peer, + worker::{LazyWorker, Worker}, }; +use txn_types::WriteBatchFlags; + +pub fn check_skip_wal(path: &str) { + let mut found = false; + for f in std::fs::read_dir(path).unwrap() { + let e = f.unwrap(); + if e.path().extension().map_or(false, |ext| ext == "log") { + found = true; + assert_eq!(e.metadata().unwrap().len(), 0, "{}", e.path().display()); + } + } + assert!(found, "no WAL found in {}", path); +} -#[derive(Clone)] pub struct TestRouter(RaftRouter); impl Deref for TestRouter { @@ -85,8 +101,19 @@ impl TestRouter { None } - pub fn command(&self, region_id: u64, req: RaftCmdRequest) -> Option { - let (msg, sub) = PeerMsg::raft_command(req); + pub fn simple_write( + &self, + region_id: u64, + header: Box, + write: SimpleWriteEncoder, + ) -> Option { + let (msg, sub) = PeerMsg::simple_write(header, write.encode()); + self.send(region_id, msg).unwrap(); + block_on(sub.result()) + } + + pub fn admin_command(&self, region_id: u64, req: RaftCmdRequest) -> Option { + let (msg, sub) = PeerMsg::admin_command(req); self.send(region_id, msg).unwrap(); block_on(sub.result()) } @@ -152,6 +179,17 @@ impl TestRouter { req } + pub fn stale_snapshot(&mut self, region_id: u64) -> RegionSnapshot { + let mut req = self.new_request_for(region_id); + let header = req.mut_header(); + header.set_flags(WriteBatchFlags::STALE_READ.bits()); + header.set_flag_data(vec![0; 8]); + let mut snap_req = Request::default(); + snap_req.set_cmd_type(CmdType::Snap); + req.mut_requests().push(snap_req); + block_on(self.snapshot(req)).unwrap() + } + pub fn region_detail(&self, region_id: u64) -> metapb::Region { let RegionLocalState { id, @@ -182,12 +220,12 @@ impl TestRouter { pub struct RunningState { store_id: u64, pub raft_engine: RaftTestEngine, - pub factory: Arc, + pub registry: TabletRegistry, pub system: StoreSystem, pub cfg: Arc>, pub transport: TestTransport, - // We need this to clear the 
ref counts of CachedTablet when shutdown - store_meta: Arc>>, + snap_mgr: TabletSnapManager, + background: Worker, } impl RunningState { @@ -199,81 +237,92 @@ impl RunningState { concurrency_manager: ConcurrencyManager, causal_ts_provider: Option>, logger: &Logger, - ) -> (TestRouter, TabletSnapManager, Self) { - let cf_opts = ALL_CFS - .iter() - .copied() - .map(|cf| (cf, CfOptions::default())) - .collect(); - let factory = Arc::new(TestTabletFactoryV2::new( - path, - DbOptions::default(), - cf_opts, - )); + ) -> (TestRouter, Self) { let raft_engine = engine_test::raft::new_engine(&format!("{}", path.join("raft").display()), None) .unwrap(); + let mut bootstrap = Bootstrap::new(&raft_engine, 0, pd_client.as_ref(), logger.clone()); let store_id = bootstrap.bootstrap_store().unwrap(); let mut store = Store::default(); store.set_id(store_id); - if let Some(region) = bootstrap.bootstrap_first_region(&store, store_id).unwrap() { - if factory.exists(region.get_id(), RAFT_INIT_LOG_INDEX) { - factory - .destroy_tablet(region.get_id(), RAFT_INIT_LOG_INDEX) - .unwrap(); - } - factory - .open_tablet( - region.get_id(), - Some(RAFT_INIT_LOG_INDEX), - OpenOptions::default().set_create_new(true), - ) - .unwrap(); - } let (router, mut system) = create_store_batch_system::( &cfg.value(), store_id, logger.clone(), ); + let cf_opts = DATA_CFS + .iter() + .copied() + .map(|cf| (cf, CfOptions::default())) + .collect(); + let mut db_opt = DbOptions::default(); + db_opt.set_state_storage(Arc::new(StateStorage::new( + raft_engine.clone(), + router.clone(), + ))); + let factory = Box::new(TestTabletFactory::new(db_opt, cf_opts)); + let registry = TabletRegistry::new(factory, path.join("tablets")).unwrap(); + if let Some(region) = bootstrap.bootstrap_first_region(&store, store_id).unwrap() { + let factory = registry.tablet_factory(); + let path = registry.tablet_path(region.get_id(), RAFT_INIT_LOG_INDEX); + let ctx = TabletContext::new(®ion, Some(RAFT_INIT_LOG_INDEX)); + if 
factory.exists(&path) { + registry.remove(region.get_id()); + factory.destroy_tablet(ctx.clone(), &path).unwrap(); + } + // Create the tablet without loading it in cache. + factory.open_tablet(ctx, &path).unwrap(); + } - let router = RaftRouter::new(store_id, router); + let router = RaftRouter::new(store_id, registry.clone(), router); let store_meta = router.store_meta().clone(); - let snap_mgr = TabletSnapManager::new(path.join("tablets_snap").to_str().unwrap()); - snap_mgr.init().unwrap(); + let snap_mgr = TabletSnapManager::new(path.join("tablets_snap").to_str().unwrap()).unwrap(); + + let coprocessor_host = CoprocessorHost::new( + router.store_router().clone(), + raftstore::coprocessor::Config::default(), + ); + let background = Worker::new("background"); + let pd_worker = LazyWorker::new("pd-worker"); system .start( store_id, cfg.clone(), raft_engine.clone(), - factory.clone(), + registry.clone(), transport.clone(), pd_client.clone(), router.store_router(), - store_meta.clone(), + store_meta, snap_mgr.clone(), concurrency_manager, causal_ts_provider, + coprocessor_host, + background.clone(), + pd_worker, ) .unwrap(); let state = Self { store_id, raft_engine, - factory, + registry, system, cfg, transport, - store_meta, + snap_mgr, + background, }; - (TestRouter(router), snap_mgr, state) + (TestRouter(router), state) } } impl Drop for RunningState { fn drop(&mut self) { self.system.shutdown(); + self.background.stop(); } } @@ -282,7 +331,6 @@ pub struct TestNode { path: TempDir, running_state: Option, logger: Logger, - snap_mgr: Option, } impl TestNode { @@ -294,12 +342,11 @@ impl TestNode { path, running_state: None, logger, - snap_mgr: None, } } fn start(&mut self, cfg: Arc>, trans: TestTransport) -> TestRouter { - let (router, snap_mgr, state) = RunningState::new( + let (router, state) = RunningState::new( &self.pd_client, self.path.path(), cfg, @@ -309,12 +356,12 @@ impl TestNode { &self.logger, ); self.running_state = Some(state); - self.snap_mgr = 
Some(snap_mgr); router } - pub fn tablet_factory(&self) -> &Arc { - &self.running_state().unwrap().factory + #[allow(dead_code)] + pub fn tablet_registry(&self) -> &TabletRegistry { + &self.running_state().unwrap().registry } pub fn pd_client(&self) -> &Arc { @@ -322,10 +369,7 @@ impl TestNode { } fn stop(&mut self) { - if let Some(state) = std::mem::take(&mut self.running_state) { - let mut meta = state.store_meta.lock().unwrap(); - meta.tablet_caches.clear(); - } + self.running_state.take(); } fn restart(&mut self) -> TestRouter { @@ -340,10 +384,6 @@ impl TestNode { self.running_state.as_ref() } - pub fn snap_mgr(&self) -> Option<&TabletSnapManager> { - self.snap_mgr.as_ref() - } - pub fn id(&self) -> u64 { self.running_state().unwrap().store_id } @@ -420,7 +460,7 @@ pub struct Cluster { pd_server: test_pd::Server, nodes: Vec, receivers: Vec>, - routers: Vec, + pub routers: Vec, logger: Logger, } @@ -463,18 +503,15 @@ impl Cluster { } pub fn restart(&mut self, offset: usize) { + self.routers.remove(offset); let router = self.nodes[offset].restart(); - self.routers[offset] = router; + self.routers.insert(offset, router); } pub fn node(&self, offset: usize) -> &TestNode { &self.nodes[offset] } - pub fn router(&self, offset: usize) -> TestRouter { - self.routers[offset].clone() - } - /// Send messages and wait for side effects are all handled. 
#[allow(clippy::vec_box)] pub fn dispatch(&self, region_id: u64, mut msgs: Vec>) { @@ -512,8 +549,8 @@ impl Cluster { msg.get_message().get_snapshot().get_metadata().get_term(), msg.get_message().get_snapshot().get_metadata().get_index(), ); - let from_snap_mgr = self.node(from_offset).snap_mgr().unwrap(); - let to_snap_mgr = self.node(offset).snap_mgr().unwrap(); + let from_snap_mgr = &self.node(from_offset).running_state().unwrap().snap_mgr; + let to_snap_mgr = &self.node(offset).running_state().unwrap().snap_mgr; let gen_path = from_snap_mgr.tablet_gen_path(&key); let recv_path = to_snap_mgr.final_recv_path(&key); assert!(gen_path.exists()); @@ -540,3 +577,130 @@ impl Cluster { } } } + +impl Drop for Cluster { + fn drop(&mut self) { + self.routers.clear(); + for node in &mut self.nodes { + node.stop(); + } + } +} + +pub mod split_helper { + use std::{thread, time::Duration}; + + use engine_traits::CF_DEFAULT; + use futures::executor::block_on; + use kvproto::{ + metapb, pdpb, + raft_cmdpb::{AdminCmdType, AdminRequest, RaftCmdRequest, RaftCmdResponse, SplitRequest}, + }; + use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; + + use super::TestRouter; + + pub fn new_batch_split_region_request( + split_keys: Vec>, + ids: Vec, + right_derive: bool, + ) -> AdminRequest { + let mut req = AdminRequest::default(); + req.set_cmd_type(AdminCmdType::BatchSplit); + req.mut_splits().set_right_derive(right_derive); + let mut requests = Vec::with_capacity(ids.len()); + for (mut id, key) in ids.into_iter().zip(split_keys) { + let mut split = SplitRequest::default(); + split.set_split_key(key); + split.set_new_region_id(id.get_new_region_id()); + split.set_new_peer_ids(id.take_new_peer_ids()); + requests.push(split); + } + req.mut_splits().set_requests(requests.into()); + req + } + + pub fn must_split(region_id: u64, req: RaftCmdRequest, router: &mut TestRouter) { + let (msg, sub) = PeerMsg::admin_command(req); + router.send(region_id, msg).unwrap(); + 
block_on(sub.result()).unwrap(); + + // TODO: when persistent implementation is ready, we can use tablet index of + // the parent to check whether the split is done. Now, just sleep a second. + thread::sleep(Duration::from_secs(1)); + } + + pub fn put(router: &mut TestRouter, region_id: u64, key: &[u8]) -> RaftCmdResponse { + let header = Box::new(router.new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, b"v1"); + router.simple_write(region_id, header, put).unwrap() + } + + // Split the region according to the parameters + // return the updated original region + pub fn split_region<'a>( + router: &'a mut TestRouter, + region: metapb::Region, + peer: metapb::Peer, + split_region_id: u64, + split_peer: metapb::Peer, + left_key: Option<&'a [u8]>, + right_key: Option<&'a [u8]>, + propose_key: &[u8], + split_key: &[u8], + right_derive: bool, + ) -> (metapb::Region, metapb::Region) { + let region_id = region.id; + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header() + .set_region_epoch(region.get_region_epoch().clone()); + req.mut_header().set_peer(peer); + + let mut split_id = pdpb::SplitId::new(); + split_id.new_region_id = split_region_id; + split_id.new_peer_ids = vec![split_peer.id]; + let admin_req = new_batch_split_region_request( + vec![propose_key.to_vec()], + vec![split_id], + right_derive, + ); + req.mut_requests().clear(); + req.set_admin_request(admin_req); + + must_split(region_id, req, router); + + let (left, right) = if !right_derive { + ( + router.region_detail(region_id), + router.region_detail(split_region_id), + ) + } else { + ( + router.region_detail(split_region_id), + router.region_detail(region_id), + ) + }; + + if let Some(right_key) = right_key { + let resp = put(router, left.id, right_key); + assert!(resp.get_header().has_error(), "{:?}", resp); + let resp = put(router, right.id, right_key); + 
assert!(!resp.get_header().has_error(), "{:?}", resp); + } + if let Some(left_key) = left_key { + let resp = put(router, left.id, left_key); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let resp = put(router, right.id, left_key); + assert!(resp.get_header().has_error(), "{:?}", resp); + } + + assert_eq!(left.get_end_key(), split_key); + assert_eq!(right.get_start_key(), split_key); + assert_eq!(region.get_start_key(), left.get_start_key()); + assert_eq!(region.get_end_key(), right.get_end_key()); + + (left, right) + } +} diff --git a/components/raftstore-v2/tests/integrations/mod.rs b/components/raftstore-v2/tests/integrations/mod.rs index 52c8ba5e1f8..fbf54eaa243 100644 --- a/components/raftstore-v2/tests/integrations/mod.rs +++ b/components/raftstore-v2/tests/integrations/mod.rs @@ -15,3 +15,5 @@ mod test_pd_heartbeat; mod test_read; mod test_split; mod test_status; +mod test_trace_apply; +mod test_transfer_leader; diff --git a/components/raftstore-v2/tests/integrations/test_basic_write.rs b/components/raftstore-v2/tests/integrations/test_basic_write.rs index fc23e46e12f..cb8d71840cf 100644 --- a/components/raftstore-v2/tests/integrations/test_basic_write.rs +++ b/components/raftstore-v2/tests/integrations/test_basic_write.rs @@ -2,34 +2,28 @@ use std::{assert_matches::assert_matches, time::Duration}; -use engine_traits::{OpenOptions, Peekable, TabletFactory}; +use engine_traits::{Peekable, CF_DEFAULT}; use futures::executor::block_on; -use kvproto::{ - raft_cmdpb::{CmdType, Request}, - raft_serverpb::RaftMessage, -}; +use kvproto::raft_serverpb::RaftMessage; use raftstore::store::{INIT_EPOCH_CONF_VER, INIT_EPOCH_VER}; -use raftstore_v2::router::PeerMsg; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; use tikv_util::store::new_peer; -use crate::cluster::Cluster; +use crate::cluster::{check_skip_wal, Cluster}; /// Test basic write flow. 
#[test] fn test_basic_write() { let cluster = Cluster::default(); - let router = cluster.router(0); - let mut req = router.new_request_for(2); - let mut put_req = Request::default(); - put_req.set_cmd_type(CmdType::Put); - put_req.mut_put().set_key(b"key".to_vec()); - put_req.mut_put().set_value(b"value".to_vec()); - req.mut_requests().push(put_req); + let router = &cluster.routers[0]; + let header = Box::new(router.new_request_for(2).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); router.wait_applied_to_current_term(2, Duration::from_secs(3)); // Good proposal should be committed. - let (msg, mut sub) = PeerMsg::raft_command(req.clone()); + let (msg, mut sub) = PeerMsg::simple_write(header.clone(), put.clone().encode()); router.send(2, msg).unwrap(); assert!(block_on(sub.wait_proposed())); assert!(block_on(sub.wait_committed())); @@ -37,9 +31,9 @@ fn test_basic_write() { assert!(!resp.get_header().has_error(), "{:?}", resp); // Store id should be checked. - let mut invalid_req = req.clone(); - invalid_req.mut_header().set_peer(new_peer(3, 3)); - let resp = router.command(2, invalid_req.clone()).unwrap(); + let mut invalid_header = header.clone(); + invalid_header.set_peer(new_peer(3, 3)); + let resp = router.simple_write(2, invalid_header, put.clone()).unwrap(); assert!( resp.get_header().get_error().has_store_not_match(), "{:?}", @@ -47,36 +41,27 @@ fn test_basic_write() { ); // Peer id should be checked. - let mut invalid_req = req.clone(); - invalid_req.mut_header().set_peer(new_peer(1, 1)); - let resp = router.command(2, invalid_req.clone()).unwrap(); + invalid_header = header.clone(); + invalid_header.set_peer(new_peer(1, 1)); + let resp = router.simple_write(2, invalid_header, put.clone()).unwrap(); assert!(resp.get_header().has_error(), "{:?}", resp); // Epoch should be checked. 
- let mut invalid_req = req.clone(); - invalid_req - .mut_header() + invalid_header = header.clone(); + invalid_header .mut_region_epoch() .set_version(INIT_EPOCH_VER - 1); - let resp = router.command(2, invalid_req.clone()).unwrap(); + let resp = router.simple_write(2, invalid_header, put.clone()).unwrap(); assert!( resp.get_header().get_error().has_epoch_not_match(), "{:?}", resp ); - // It's wrong to send query to write command. - let mut invalid_req = req.clone(); - let mut snap_req = Request::default(); - snap_req.set_cmd_type(CmdType::Snap); - invalid_req.mut_requests().push(snap_req); - let resp = router.command(2, invalid_req.clone()).unwrap(); - assert!(resp.get_header().has_error(), "{:?}", resp); - // Term should be checked if set. - let mut invalid_req = req.clone(); - invalid_req.mut_header().set_term(1); - let resp = router.command(2, invalid_req).unwrap(); + invalid_header = header.clone(); + invalid_header.set_term(1); + let resp = router.simple_write(2, invalid_header, put.clone()).unwrap(); assert!( resp.get_header().get_error().has_stale_command(), "{:?}", @@ -84,11 +69,9 @@ fn test_basic_write() { ); // Too large message can cause regression and should be rejected. 
- let mut invalid_req = req.clone(); - invalid_req.mut_requests()[0] - .mut_put() - .set_value(vec![0; 8 * 1024 * 1024]); - let resp = router.command(2, invalid_req).unwrap(); + let mut invalid_put = SimpleWriteEncoder::with_capacity(9 * 1024 * 1024); + invalid_put.put(CF_DEFAULT, b"key", &vec![0; 8 * 1024 * 1024]); + let resp = router.simple_write(2, header.clone(), invalid_put).unwrap(); assert!( resp.get_header().get_error().has_raft_entry_too_large(), "{:?}", @@ -106,46 +89,43 @@ fn test_basic_write() { raft_message.set_from(4); raft_message.set_term(8); router.send_raft_message(msg).unwrap(); - let resp = router.command(2, req).unwrap(); + let resp = router.simple_write(2, header, put).unwrap(); assert!(resp.get_header().get_error().has_not_leader(), "{:?}", resp); } #[test] fn test_put_delete() { - let cluster = Cluster::default(); - let router = cluster.router(0); - let mut req = router.new_request_for(2); - let mut put_req = Request::default(); - put_req.set_cmd_type(CmdType::Put); - put_req.mut_put().set_key(b"key".to_vec()); - put_req.mut_put().set_value(b"value".to_vec()); - req.mut_requests().push(put_req); + let mut cluster = Cluster::default(); + let router = &mut cluster.routers[0]; + let header = Box::new(router.new_request_for(2).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); router.wait_applied_to_current_term(2, Duration::from_secs(3)); - let tablet_factory = cluster.node(0).tablet_factory(); - let tablet = tablet_factory - .open_tablet(2, None, OpenOptions::default().set_cache_only(true)) - .unwrap(); - assert!(tablet.get_value(b"key").unwrap().is_none()); - let (msg, mut sub) = PeerMsg::raft_command(req.clone()); + let snap = router.stale_snapshot(2); + assert!(snap.get_value(b"key").unwrap().is_none()); + let (msg, mut sub) = PeerMsg::simple_write(header.clone(), put.encode()); router.send(2, msg).unwrap(); assert!(block_on(sub.wait_proposed())); 
assert!(block_on(sub.wait_committed())); let resp = block_on(sub.result()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); - assert_eq!(tablet.get_value(b"key").unwrap().unwrap(), b"value"); - - let mut delete_req = Request::default(); - delete_req.set_cmd_type(CmdType::Delete); - delete_req.mut_delete().set_key(b"key".to_vec()); - req.clear_requests(); - req.mut_requests().push(delete_req); - let (msg, mut sub) = PeerMsg::raft_command(req.clone()); + let snap = router.stale_snapshot(2); + assert_eq!(snap.get_value(b"key").unwrap().unwrap(), b"value"); + + let mut delete = SimpleWriteEncoder::with_capacity(64); + delete.delete(CF_DEFAULT, b"key"); + let (msg, mut sub) = PeerMsg::simple_write(header, delete.encode()); router.send(2, msg).unwrap(); assert!(block_on(sub.wait_proposed())); assert!(block_on(sub.wait_committed())); let resp = block_on(sub.result()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); - assert_matches!(tablet.get_value(b"key"), Ok(None)); + let snap = router.stale_snapshot(2); + assert_matches!(snap.get_value(b"key"), Ok(None)); + + // Check if WAL is skipped for basic writes. 
+ let mut cached = cluster.node(0).tablet_registry().get(2).unwrap(); + check_skip_wal(cached.latest().unwrap().as_inner().path()); } diff --git a/components/raftstore-v2/tests/integrations/test_conf_change.rs b/components/raftstore-v2/tests/integrations/test_conf_change.rs index 558962f8ef6..8a075bb9a35 100644 --- a/components/raftstore-v2/tests/integrations/test_conf_change.rs +++ b/components/raftstore-v2/tests/integrations/test_conf_change.rs @@ -2,20 +2,22 @@ use std::{self, time::Duration}; -use engine_traits::{OpenOptions, Peekable, TabletFactory}; -use kvproto::raft_cmdpb::{AdminCmdType, CmdType, Request}; +use engine_traits::{Peekable, CF_DEFAULT}; +use kvproto::raft_cmdpb::AdminCmdType; use raft::prelude::ConfChangeType; -use raftstore_v2::router::{PeerMsg, PeerTick}; +use raftstore_v2::{ + router::{PeerMsg, PeerTick}, + SimpleWriteEncoder, +}; use tikv_util::store::new_learner_peer; -use crate::cluster::Cluster; +use crate::cluster::{check_skip_wal, Cluster}; #[test] fn test_simple_change() { - let cluster = Cluster::with_node_count(2, None); + let mut cluster = Cluster::with_node_count(2, None); let region_id = 2; - let router0 = cluster.router(0); - let mut req = router0.new_request_for(2); + let mut req = cluster.routers[0].new_request_for(2); let admin_req = req.mut_admin_request(); admin_req.set_cmd_type(AdminCmdType::ChangePeer); admin_req @@ -24,12 +26,12 @@ fn test_simple_change() { let store_id = cluster.node(1).id(); let new_peer = new_learner_peer(store_id, 10); admin_req.mut_change_peer().set_peer(new_peer.clone()); - let resp = router0.command(2, req.clone()).unwrap(); + let resp = cluster.routers[0].admin_command(2, req.clone()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); let epoch = req.get_header().get_region_epoch(); let new_conf_ver = epoch.get_conf_ver() + 1; let leader_peer = req.get_header().get_peer().clone(); - let meta = router0 + let meta = cluster.routers[0] .must_query_debug_info(2, Duration::from_secs(3)) 
.unwrap(); let match_index = meta.raft_apply.applied_index; @@ -39,8 +41,7 @@ fn test_simple_change() { // So heartbeat will create a learner. cluster.dispatch(2, vec![]); - let router1 = cluster.router(1); - let meta = router1 + let meta = cluster.routers[1] .must_query_debug_info(2, Duration::from_secs(3)) .unwrap(); assert_eq!(meta.raft_status.id, 10, "{:?}", meta); @@ -52,36 +53,30 @@ fn test_simple_change() { ); // Trigger the raft tick to replica the log to the learner and execute the // snapshot task. - router0 + cluster.routers[0] .send(region_id, PeerMsg::Tick(PeerTick::Raft)) .unwrap(); cluster.dispatch(region_id, vec![]); // write one kv after snapshot let (key, val) = (b"key", b"value"); - let mut write_req = router0.new_request_for(region_id); - let mut put_req = Request::default(); - put_req.set_cmd_type(CmdType::Put); - put_req.mut_put().set_key(key.to_vec()); - put_req.mut_put().set_value(val.to_vec()); - write_req.mut_requests().push(put_req); - let (msg, _) = PeerMsg::raft_command(write_req.clone()); - router0.send(region_id, msg).unwrap(); + let header = Box::new(cluster.routers[0].new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, val); + let (msg, _) = PeerMsg::simple_write(header, put.encode()); + cluster.routers[0].send(region_id, msg).unwrap(); std::thread::sleep(Duration::from_millis(1000)); cluster.dispatch(region_id, vec![]); - let meta = router1 + let meta = cluster.routers[1] .must_query_debug_info(region_id, Duration::from_secs(3)) .unwrap(); // the learner truncated index muse be equal the leader applied index and can // read the new written kv. 
assert_eq!(match_index, meta.raft_apply.truncated_state.index); assert!(meta.raft_apply.applied_index >= match_index); - let tablet_factory = cluster.node(1).tablet_factory(); - let tablet = tablet_factory - .open_tablet(region_id, None, OpenOptions::default().set_cache_only(true)) - .unwrap(); - assert_eq!(tablet.get_value(key).unwrap().unwrap(), val); + let snap = cluster.routers[1].stale_snapshot(2); + assert_eq!(snap.get_value(key).unwrap().unwrap(), val); req.mut_header() .mut_region_epoch() @@ -89,12 +84,12 @@ fn test_simple_change() { req.mut_admin_request() .mut_change_peer() .set_change_type(ConfChangeType::RemoveNode); - let resp = router0.command(2, req.clone()).unwrap(); + let resp = cluster.routers[0].admin_command(2, req.clone()).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); let epoch = req.get_header().get_region_epoch(); let new_conf_ver = epoch.get_conf_ver() + 1; let leader_peer = req.get_header().get_peer().clone(); - let meta = router0 + let meta = cluster.routers[0] .must_query_debug_info(2, Duration::from_secs(3)) .unwrap(); assert_eq!(meta.region_state.epoch.version, epoch.get_version()); @@ -102,4 +97,8 @@ fn test_simple_change() { assert_eq!(meta.region_state.peers, vec![leader_peer]); // TODO: check if the peer is removed once life trace is implemented or // snapshot is implemented. + + // Check if WAL is skipped for admin command. 
+ let mut cached = cluster.node(0).tablet_registry().get(2).unwrap(); + check_skip_wal(cached.latest().unwrap().as_inner().path()); } diff --git a/components/raftstore-v2/tests/integrations/test_life.rs b/components/raftstore-v2/tests/integrations/test_life.rs index ed0ebcc9b8a..a2ae0bbb9f8 100644 --- a/components/raftstore-v2/tests/integrations/test_life.rs +++ b/components/raftstore-v2/tests/integrations/test_life.rs @@ -49,8 +49,11 @@ fn assert_tombstone(raft_engine: &impl RaftEngine, region_id: u64, peer: &metapb raft_engine.get_all_entries_to(region_id, &mut buf).unwrap(); assert!(buf.is_empty(), "{:?}", buf); assert_matches!(raft_engine.get_raft_state(region_id), Ok(None)); - assert_matches!(raft_engine.get_apply_state(region_id), Ok(None)); - let region_state = raft_engine.get_region_state(region_id).unwrap().unwrap(); + assert_matches!(raft_engine.get_apply_state(region_id, u64::MAX), Ok(None)); + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); assert_matches!(region_state.get_state(), PeerState::Tombstone); assert!( region_state.get_region().get_peers().contains(peer), @@ -64,11 +67,11 @@ fn assert_tombstone(raft_engine: &impl RaftEngine, region_id: u64, peer: &metapb #[test] fn test_life_by_message() { let mut cluster = Cluster::default(); - let router = cluster.router(0); + let router = &cluster.routers[0]; let test_region_id = 4; let test_peer_id = 5; let test_leader_id = 6; - assert_peer_not_exist(test_region_id, test_peer_id, &router); + assert_peer_not_exist(test_region_id, test_peer_id, router); // Build a correct message. let mut msg = Box::::default(); @@ -85,7 +88,7 @@ fn test_life_by_message() { let mut wrong_msg = msg.clone(); f(&mut wrong_msg); router.send_raft_message(wrong_msg).unwrap(); - assert_peer_not_exist(test_region_id, test_peer_id, &router); + assert_peer_not_exist(test_region_id, test_peer_id, router); }; // Check mismatch store id. 
@@ -113,7 +116,7 @@ fn test_life_by_message() { // The peer should survive restart. cluster.restart(0); - let router = cluster.router(0); + let router = &cluster.routers[0]; let meta = router .must_query_debug_info(test_region_id, timeout) .unwrap(); @@ -121,7 +124,7 @@ fn test_life_by_message() { let raft_engine = &cluster.node(0).running_state().unwrap().raft_engine; raft_engine.get_raft_state(test_region_id).unwrap().unwrap(); raft_engine - .get_apply_state(test_region_id) + .get_apply_state(test_region_id, 0) .unwrap() .unwrap(); @@ -129,13 +132,13 @@ fn test_life_by_message() { let mut tombstone_msg = msg.clone(); tombstone_msg.set_is_tombstone(true); router.send_raft_message(tombstone_msg).unwrap(); - assert_peer_not_exist(test_region_id, test_peer_id, &router); + assert_peer_not_exist(test_region_id, test_peer_id, router); assert_tombstone(raft_engine, test_region_id, &new_peer(1, test_peer_id)); // Restart should not recreate tombstoned peer. cluster.restart(0); - let router = cluster.router(0); - assert_peer_not_exist(test_region_id, test_peer_id, &router); + let router = &cluster.routers[0]; + assert_peer_not_exist(test_region_id, test_peer_id, router); let raft_engine = &cluster.node(0).running_state().unwrap().raft_engine; assert_tombstone(raft_engine, test_region_id, &new_peer(1, test_peer_id)); } @@ -143,7 +146,7 @@ fn test_life_by_message() { #[test] fn test_destroy_by_larger_id() { let mut cluster = Cluster::default(); - let router = cluster.router(0); + let router = &cluster.routers[0]; let test_region_id = 4; let test_peer_id = 6; let init_term = 5; @@ -180,7 +183,7 @@ fn test_destroy_by_larger_id() { let mut larger_id_msg = smaller_id_msg; larger_id_msg.set_to_peer(new_peer(1, test_peer_id + 1)); router.send_raft_message(larger_id_msg).unwrap(); - assert_peer_not_exist(test_region_id, test_peer_id, &router); + assert_peer_not_exist(test_region_id, test_peer_id, router); let meta = router .must_query_debug_info(test_region_id, timeout) .unwrap(); 
@@ -189,7 +192,7 @@ fn test_destroy_by_larger_id() { // New peer should survive restart. cluster.restart(0); - let router = cluster.router(0); + let router = &cluster.routers[0]; let meta = router .must_query_debug_info(test_region_id, timeout) .unwrap(); diff --git a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs index c22ef4908bf..96bcbbccf7a 100644 --- a/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs +++ b/components/raftstore-v2/tests/integrations/test_pd_heartbeat.rs @@ -11,7 +11,7 @@ use crate::cluster::Cluster; fn test_region_heartbeat() { let region_id = 2; let cluster = Cluster::with_node_count(1, None); - let router = cluster.router(0); + let router = &cluster.routers[0]; // When there is only one peer, it should campaign immediately. let mut req = RaftCmdRequest::default(); diff --git a/components/raftstore-v2/tests/integrations/test_read.rs b/components/raftstore-v2/tests/integrations/test_read.rs index 2155a4775c6..f9575ff8da1 100644 --- a/components/raftstore-v2/tests/integrations/test_read.rs +++ b/components/raftstore-v2/tests/integrations/test_read.rs @@ -1,8 +1,9 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+use engine_traits::CF_DEFAULT; use futures::executor::block_on; use kvproto::raft_cmdpb::{CmdType, Request}; -use raftstore_v2::router::PeerMsg; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; use tikv_util::{config::ReadableDuration, store::new_peer}; use txn_types::WriteBatchFlags; @@ -13,7 +14,7 @@ fn test_read_index() { let mut config = v2_default_config(); config.raft_store_max_leader_lease = ReadableDuration::millis(150); let cluster = Cluster::with_config(config); - let router = cluster.router(0); + let router = &cluster.routers[0]; std::thread::sleep(std::time::Duration::from_millis(200)); let region_id = 2; let mut req = router.new_request_for(region_id); @@ -39,14 +40,11 @@ fn test_read_index() { std::thread::sleep(std::time::Duration::from_millis(200)); let read_req = req.clone(); // the read lease should be expired and renewed by write - let mut req = router.new_request_for(region_id); - let mut put_req = Request::default(); - put_req.set_cmd_type(CmdType::Put); - put_req.mut_put().set_key(b"key".to_vec()); - put_req.mut_put().set_value(b"value".to_vec()); - req.mut_requests().push(put_req); + let header = Box::new(router.new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key", b"value"); - let (msg, sub) = PeerMsg::raft_command(req.clone()); + let (msg, sub) = PeerMsg::simple_write(header, put.encode()); router.send(region_id, msg).unwrap(); block_on(sub.result()).unwrap(); @@ -58,7 +56,7 @@ fn test_read_index() { #[test] fn test_snap_without_read_index() { let cluster = Cluster::default(); - let router = cluster.router(0); + let router = &cluster.routers[0]; std::thread::sleep(std::time::Duration::from_millis(200)); let region_id = 2; let mut req = router.new_request_for(region_id); @@ -84,7 +82,7 @@ fn test_snap_without_read_index() { #[test] fn test_query_with_write_cmd() { let cluster = Cluster::default(); - let router = cluster.router(0); + let router = 
&cluster.routers[0]; std::thread::sleep(std::time::Duration::from_millis(200)); let region_id = 2; let mut req = router.new_request_for(2); @@ -111,7 +109,7 @@ fn test_query_with_write_cmd() { #[test] fn test_snap_with_invalid_parameter() { let cluster = Cluster::default(); - let router = cluster.router(0); + let router = &cluster.routers[0]; std::thread::sleep(std::time::Duration::from_millis(200)); let region_id = 2; let mut req = router.new_request_for(region_id); @@ -163,8 +161,8 @@ fn test_snap_with_invalid_parameter() { #[test] fn test_local_read() { - let cluster = Cluster::default(); - let mut router = cluster.router(0); + let mut cluster = Cluster::default(); + let router = &mut cluster.routers[0]; std::thread::sleep(std::time::Duration::from_millis(200)); let region_id = 2; let mut req = router.new_request_for(region_id); @@ -172,7 +170,7 @@ fn test_local_read() { request_inner.set_cmd_type(CmdType::Snap); req.mut_requests().push(request_inner); - block_on(async { router.get_snapshot(req.clone()).await.unwrap() }); + block_on(async { router.snapshot(req.clone()).await.unwrap() }); let res = router.query(region_id, req.clone()).unwrap(); let resp = res.read().unwrap(); // The read index will be 0 as the retry process in the `get_snapshot` will diff --git a/components/raftstore-v2/tests/integrations/test_split.rs b/components/raftstore-v2/tests/integrations/test_split.rs index 336a9c9d038..7cea980beac 100644 --- a/components/raftstore-v2/tests/integrations/test_split.rs +++ b/components/raftstore-v2/tests/integrations/test_split.rs @@ -1,134 +1,20 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{thread, time::Duration}; +use std::time::Duration; -use futures::executor::block_on; -use kvproto::{ - metapb, pdpb, - raft_cmdpb::{ - AdminCmdType, AdminRequest, CmdType, RaftCmdRequest, RaftCmdResponse, Request, SplitRequest, - }, -}; -use raftstore_v2::router::PeerMsg; +use engine_traits::{Peekable, RaftEngineReadOnly, CF_RAFT}; +use raftstore::store::{INIT_EPOCH_VER, RAFT_INIT_LOG_INDEX}; use tikv_util::store::new_peer; +use txn_types::{Key, TimeStamp}; -use crate::cluster::{Cluster, TestRouter}; - -fn new_batch_split_region_request( - split_keys: Vec>, - ids: Vec, - right_derive: bool, -) -> AdminRequest { - let mut req = AdminRequest::default(); - req.set_cmd_type(AdminCmdType::BatchSplit); - req.mut_splits().set_right_derive(right_derive); - let mut requests = Vec::with_capacity(ids.len()); - for (mut id, key) in ids.into_iter().zip(split_keys) { - let mut split = SplitRequest::default(); - split.set_split_key(key); - split.set_new_region_id(id.get_new_region_id()); - split.set_new_peer_ids(id.take_new_peer_ids()); - requests.push(split); - } - req.mut_splits().set_requests(requests.into()); - req -} - -fn must_split(region_id: u64, req: RaftCmdRequest, router: &mut TestRouter) { - let (msg, sub) = PeerMsg::raft_command(req); - router.send(region_id, msg).unwrap(); - block_on(sub.result()).unwrap(); - - // TODO: when persistent implementation is ready, we can use tablet index of - // the parent to check whether the split is done. Now, just sleep a second. 
- thread::sleep(Duration::from_secs(1)); -} - -fn put(router: &mut TestRouter, region_id: u64, key: &[u8]) -> RaftCmdResponse { - let mut req = router.new_request_for(region_id); - - let mut put_req = Request::default(); - put_req.set_cmd_type(CmdType::Put); - put_req.mut_put().set_key(key.to_vec()); - put_req.mut_put().set_value(b"v1".to_vec()); - req.mut_requests().push(put_req); - - let (msg, mut sub) = PeerMsg::raft_command(req.clone()); - router.send(region_id, msg).unwrap(); - assert!(block_on(sub.wait_proposed())); - assert!(block_on(sub.wait_committed())); - block_on(sub.result()).unwrap() -} - -// Split the region according to the parameters -// return the updated original region -fn split_region( - router: &mut TestRouter, - region: metapb::Region, - peer: metapb::Peer, - split_region_id: u64, - split_peer: metapb::Peer, - left_key: &[u8], - right_key: &[u8], - split_key: &[u8], - right_derive: bool, -) -> (metapb::Region, metapb::Region) { - let region_id = region.id; - let mut req = RaftCmdRequest::default(); - req.mut_header().set_region_id(region_id); - req.mut_header() - .set_region_epoch(region.get_region_epoch().clone()); - req.mut_header().set_peer(peer); - - let mut split_id = pdpb::SplitId::new(); - split_id.new_region_id = split_region_id; - split_id.new_peer_ids = vec![split_peer.id]; - let admin_req = - new_batch_split_region_request(vec![split_key.to_vec()], vec![split_id], right_derive); - req.mut_requests().clear(); - req.set_admin_request(admin_req); - - must_split(region_id, req, router); - - let (left, right) = if !right_derive { - ( - router.region_detail(region_id), - router.region_detail(split_region_id), - ) - } else { - ( - router.region_detail(split_region_id), - router.region_detail(region_id), - ) - }; - - // The end key of left region is `split_key` - // So writing `right_key` will fail - let resp = put(router, left.id, right_key); - assert!(resp.get_header().has_error(), "{:?}", resp); - // But `left_key` should succeed - let 
resp = put(router, left.id, left_key); - assert!(!resp.get_header().has_error(), "{:?}", resp); - - // Mirror of above case - let resp = put(router, right.id, left_key); - assert!(resp.get_header().has_error(), "{:?}", resp); - let resp = put(router, right.id, right_key); - assert!(!resp.get_header().has_error(), "{:?}", resp); - - assert_eq!(left.get_end_key(), split_key); - assert_eq!(right.get_start_key(), split_key); - assert_eq!(region.get_start_key(), left.get_start_key()); - assert_eq!(region.get_end_key(), right.get_end_key()); - - (left, right) -} +use crate::cluster::{split_helper::split_region, Cluster}; #[test] fn test_split() { - let cluster = Cluster::default(); + let mut cluster = Cluster::default(); let store_id = cluster.node(0).id(); - let mut router = cluster.router(0); + let raft_engine = cluster.node(0).running_state().unwrap().raft_engine.clone(); + let router = &mut cluster.routers[0]; // let factory = cluster.node(0).tablet_factory(); let region_id = 2; @@ -139,45 +25,154 @@ fn test_split() { // Region 2 ["", ""] peer(1, 3) // -> Region 2 ["", "k22"] peer(1, 3) // Region 1000 ["k22", ""] peer(1, 10) - let (left, right) = split_region( - &mut router, + let region_state = raft_engine.get_region_state(2, u64::MAX).unwrap().unwrap(); + assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + let (left, mut right) = split_region( + router, region, peer.clone(), 1000, new_peer(store_id, 10), - b"k11", - b"k33", + Some(b"k11"), + Some(b"k33"), + b"k22", b"k22", false, ); + let region_state = raft_engine.get_region_state(2, u64::MAX).unwrap().unwrap(); + assert_ne!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + assert_eq!( + region_state.get_region().get_region_epoch().get_version(), + INIT_EPOCH_VER + 1 + ); + let region_state0 = raft_engine + .get_region_state(2, region_state.get_tablet_index()) + .unwrap() + .unwrap(); + assert_eq!(region_state, region_state0); + let flushed_index = raft_engine.get_flushed_index(2, 
CF_RAFT).unwrap().unwrap(); + assert!( + flushed_index >= region_state.get_tablet_index(), + "{flushed_index} >= {}", + region_state.get_tablet_index() + ); // Region 2 ["", "k22"] peer(1, 3) // -> Region 2 ["", "k11"] peer(1, 3) // Region 1001 ["k11", "k22"] peer(1, 11) let _ = split_region( - &mut router, + router, left, peer, 1001, new_peer(store_id, 11), - b"k00", + Some(b"k00"), + Some(b"k11"), b"k11", b"k11", false, ); + let region_state = raft_engine.get_region_state(2, u64::MAX).unwrap().unwrap(); + assert_ne!( + region_state.get_tablet_index(), + region_state0.get_tablet_index() + ); + assert_eq!( + region_state.get_region().get_region_epoch().get_version(), + INIT_EPOCH_VER + 2 + ); + let region_state1 = raft_engine + .get_region_state(2, region_state.get_tablet_index()) + .unwrap() + .unwrap(); + assert_eq!(region_state, region_state1); + let flushed_index = raft_engine.get_flushed_index(2, CF_RAFT).unwrap().unwrap(); + assert!( + flushed_index >= region_state.get_tablet_index(), + "{flushed_index} >= {}", + region_state.get_tablet_index() + ); // Region 1000 ["k22", ""] peer(1, 10) // -> Region 1000 ["k22", "k33"] peer(1, 10) // Region 1002 ["k33", ""] peer(1, 12) - let _ = split_region( - &mut router, + let region_state = raft_engine + .get_region_state(1000, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + right = split_region( + router, right, new_peer(store_id, 10), 1002, new_peer(store_id, 12), - b"k22", + Some(b"k22"), + Some(b"k33"), b"k33", b"k33", false, + ) + .1; + let region_state = raft_engine + .get_region_state(1000, u64::MAX) + .unwrap() + .unwrap(); + assert_ne!(region_state.get_tablet_index(), RAFT_INIT_LOG_INDEX); + assert_eq!( + region_state.get_region().get_region_epoch().get_version(), + INIT_EPOCH_VER + 2 ); + let region_state2 = raft_engine + .get_region_state(1000, region_state.get_tablet_index()) + .unwrap() + .unwrap(); + assert_eq!(region_state, region_state2); + let 
flushed_index = raft_engine.get_flushed_index(2, CF_RAFT).unwrap().unwrap(); + assert!( + flushed_index >= region_state.get_tablet_index(), + "{flushed_index} >= {}", + region_state.get_tablet_index() + ); + + let split_key = Key::from_raw(b"k44").append_ts(TimeStamp::zero()); + let actual_split_key = split_key.clone().truncate_ts().unwrap(); + split_region( + router, + right, + new_peer(store_id, 12), + 1003, + new_peer(store_id, 13), + Some(b"k33"), + Some(b"k55"), + split_key.as_encoded(), + actual_split_key.as_encoded(), + false, + ); + + // Split should survive restart. + drop(raft_engine); + cluster.restart(0); + let region_and_key = vec![ + (2, b"k00"), + (1000, b"k22"), + (1001, b"k11"), + (1002, b"k33"), + (1003, b"k55"), + ]; + for (region_id, key) in region_and_key { + let snapshot = cluster.routers[0].stale_snapshot(region_id); + assert!( + snapshot.get_value(key).unwrap().is_some(), + "{} {:?}", + region_id, + key + ); + } } + +// TODO: test split race with +// - created peer +// - created peer with pending snapshot +// - created peer with persisting snapshot +// - created peer with persisted snapshot diff --git a/components/raftstore-v2/tests/integrations/test_status.rs b/components/raftstore-v2/tests/integrations/test_status.rs index 1f7415d9da3..59c23c4180f 100644 --- a/components/raftstore-v2/tests/integrations/test_status.rs +++ b/components/raftstore-v2/tests/integrations/test_status.rs @@ -8,7 +8,7 @@ use crate::cluster::Cluster; #[test] fn test_status() { let cluster = Cluster::default(); - let router = cluster.router(0); + let router = &cluster.routers[0]; // When there is only one peer, it should campaign immediately. 
let mut req = RaftCmdRequest::default(); req.mut_header().set_peer(new_peer(1, 3)); diff --git a/components/raftstore-v2/tests/integrations/test_trace_apply.rs b/components/raftstore-v2/tests/integrations/test_trace_apply.rs new file mode 100644 index 00000000000..71682ff52a4 --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_trace_apply.rs @@ -0,0 +1,217 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{path::Path, time::Duration}; + +use engine_traits::{DbOptionsExt, MiscExt, Peekable, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS}; +use futures::executor::block_on; +use raftstore::store::RAFT_INIT_LOG_INDEX; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; + +use crate::cluster::Cluster; + +fn count_file(path: &Path, pat: impl Fn(&Path) -> bool) -> usize { + let mut count = 0; + for path in std::fs::read_dir(path).unwrap() { + if pat(&path.unwrap().path()) { + count += 1; + } + } + count +} + +fn count_sst(path: &Path) -> usize { + count_file(path, |path| { + path.extension().map_or(false, |ext| ext == "sst") + }) +} + +fn count_info_log(path: &Path) -> usize { + count_file(path, |path| { + path.file_name() + .unwrap() + .to_string_lossy() + .starts_with("LOG") + }) +} + +/// Test if data will be recovered correctly after being restarted. +#[test] +fn test_data_recovery() { + let mut cluster = Cluster::default(); + let registry = cluster.node(0).tablet_registry(); + let tablet_2_path = registry.tablet_path(2, RAFT_INIT_LOG_INDEX); + // The rocksdb is a bootstrapped tablet, so it will be opened and closed in + // bootstrap, and then open again in fsm initialization. + assert_eq!(count_info_log(&tablet_2_path), 2); + let router = &mut cluster.routers[0]; + router.wait_applied_to_current_term(2, Duration::from_secs(3)); + + // Write 100 keys to default CF and not flush. 
+ let header = Box::new(router.new_request_for(2).take_header()); + for i in 0..100 { + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put( + CF_DEFAULT, + format!("key{}", i).as_bytes(), + format!("value{}", i).as_bytes(), + ); + router + .send(2, PeerMsg::simple_write(header.clone(), put.encode()).0) + .unwrap(); + } + + // Write 100 keys to write CF and flush half. + let mut sub = None; + for i in 0..50 { + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put( + CF_WRITE, + format!("key{}", i).as_bytes(), + format!("value{}", i).as_bytes(), + ); + let (msg, s) = PeerMsg::simple_write(header.clone(), put.encode()); + router.send(2, msg).unwrap(); + sub = Some(s); + } + let resp = block_on(sub.take().unwrap().result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + let mut cached = cluster.node(0).tablet_registry().get(2).unwrap(); + cached.latest().unwrap().flush_cf(CF_WRITE, true).unwrap(); + let router = &mut cluster.routers[0]; + for i in 50..100 { + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put( + CF_WRITE, + format!("key{}", i).as_bytes(), + format!("value{}", i).as_bytes(), + ); + router + .send(2, PeerMsg::simple_write(header.clone(), put.encode()).0) + .unwrap(); + } + + // Write 100 keys to lock CF and flush all. + for i in 0..100 { + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put( + CF_LOCK, + format!("key{}", i).as_bytes(), + format!("value{}", i).as_bytes(), + ); + let (msg, s) = PeerMsg::simple_write(header.clone(), put.encode()); + router.send(2, msg).unwrap(); + sub = Some(s); + } + let resp = block_on(sub.take().unwrap().result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + cached = cluster.node(0).tablet_registry().get(2).unwrap(); + cached.latest().unwrap().flush_cf(CF_LOCK, true).unwrap(); + + // Make sure all keys must be written. 
+ let router = &mut cluster.routers[0]; + let snap = router.stale_snapshot(2); + for cf in DATA_CFS { + for i in 0..100 { + let key = format!("key{}", i); + let value = snap.get_value_cf(cf, key.as_bytes()).unwrap(); + assert_eq!( + value.as_deref(), + Some(format!("value{}", i).as_bytes()), + "{} {}", + cf, + key + ); + } + } + let registry = cluster.node(0).tablet_registry(); + cached = registry.get(2).unwrap(); + cached + .latest() + .unwrap() + .set_db_options(&[("avoid_flush_during_shutdown", "true")]) + .unwrap(); + drop((snap, cached)); + + cluster.restart(0); + + let registry = cluster.node(0).tablet_registry(); + cached = registry.get(2).unwrap(); + cached + .latest() + .unwrap() + .set_db_options(&[("avoid_flush_during_shutdown", "true")]) + .unwrap(); + let router = &mut cluster.routers[0]; + + // Write another key to ensure all data are recovered. + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, b"key101", b"value101"); + let resp = router.simple_write(2, header, put).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + // After being restarted, all unflushed logs should be applied again. So there + // should be no missing data. + let snap = router.stale_snapshot(2); + for cf in DATA_CFS { + for i in 0..100 { + let key = format!("key{}", i); + let value = snap.get_value_cf(cf, key.as_bytes()).unwrap(); + assert_eq!( + value.as_deref(), + Some(format!("value{}", i).as_bytes()), + "{} {}", + cf, + key + ); + } + } + + // There is a restart, so LOG file should be rotate. + assert_eq!(count_info_log(&tablet_2_path), 3); + // We only trigger Flush twice, so there should be only 2 files. And because WAL + // is disabled, so when rocksdb is restarted, there should be no WAL to recover, + // so no additional flush will be triggered. 
+ assert_eq!(count_sst(&tablet_2_path), 2); + + cached = cluster.node(0).tablet_registry().get(2).unwrap(); + cached.latest().unwrap().flush_cfs(DATA_CFS, true).unwrap(); + + // Although all CFs are triggered again, but recovery should only write: + // 1. [0, 101) to CF_DEFAULT + // 2. [50, 100) to CF_WRITE + // + // So there will be only 2 memtables to be flushed. + assert_eq!(count_sst(&tablet_2_path), 4); + + drop((snap, cached)); + + cluster.restart(0); + + let router = &mut cluster.routers[0]; + + assert_eq!(count_info_log(&tablet_2_path), 4); + // Because data is flushed before restarted, so all data can be read + // immediately. + let snap = router.stale_snapshot(2); + for cf in DATA_CFS { + for i in 0..100 { + let key = format!("key{}", i); + let value = snap.get_value_cf(cf, key.as_bytes()).unwrap(); + assert_eq!( + value.as_deref(), + Some(format!("value{}", i).as_bytes()), + "{} {}", + cf, + key + ); + } + } + // Trigger flush again. + cached = cluster.node(0).tablet_registry().get(2).unwrap(); + cached.latest().unwrap().flush_cfs(DATA_CFS, true).unwrap(); + + // There is no recovery, so there should be nothing to flush. + assert_eq!(count_sst(&tablet_2_path), 4); +} diff --git a/components/raftstore-v2/tests/integrations/test_transfer_leader.rs b/components/raftstore-v2/tests/integrations/test_transfer_leader.rs new file mode 100644 index 00000000000..d031d6b1eba --- /dev/null +++ b/components/raftstore-v2/tests/integrations/test_transfer_leader.rs @@ -0,0 +1,151 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::time::Duration; + +use engine_traits::{Peekable, CF_DEFAULT}; +use futures::executor::block_on; +use kvproto::{ + metapb, + raft_cmdpb::{AdminCmdType, TransferLeaderRequest}, +}; +use raft::prelude::ConfChangeType; +use raftstore_v2::{router::PeerMsg, SimpleWriteEncoder}; +use tikv_util::store::new_peer; + +use crate::cluster::Cluster; + +fn put_data( + region_id: u64, + cluster: &Cluster, + node_off: usize, + node_off_for_verify: usize, + key: &[u8], +) { + let router = &cluster.routers[node_off]; + + router.wait_applied_to_current_term(region_id, Duration::from_secs(3)); + + // router.wait_applied_to_current_term(2, Duration::from_secs(3)); + let tablet_registry = cluster.node(node_off).tablet_registry(); + let tablet = tablet_registry + .get(region_id) + .unwrap() + .latest() + .unwrap() + .clone(); + assert!(tablet.get_value(key).unwrap().is_none()); + + let header = Box::new(router.new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, &key[1..], b"value"); + let (msg, mut sub) = PeerMsg::simple_write(header, put.encode()); + router.send(region_id, msg).unwrap(); + std::thread::sleep(std::time::Duration::from_millis(10)); + cluster.dispatch(region_id, vec![]); + assert!(block_on(sub.wait_proposed())); + + std::thread::sleep(std::time::Duration::from_millis(10)); + cluster.dispatch(region_id, vec![]); + // triage send snapshot + std::thread::sleep(std::time::Duration::from_millis(100)); + cluster.dispatch(region_id, vec![]); + assert!(block_on(sub.wait_committed())); + + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + assert_eq!(tablet.get_value(key).unwrap().unwrap(), b"value"); + + // Verify the data is ready in the other node + let tablet_registry = cluster.node(node_off_for_verify).tablet_registry(); + let tablet = tablet_registry + .get(region_id) + .unwrap() + .latest() + .unwrap() + .clone(); + 
assert_eq!(tablet.get_value(key).unwrap().unwrap(), b"value"); +} + +pub fn must_transfer_leader( + cluster: &Cluster, + region_id: u64, + from_off: usize, + to_off: usize, + to_peer: metapb::Peer, +) { + let router = &cluster.routers[from_off]; + let router2 = &cluster.routers[to_off]; + let mut req = router.new_request_for(region_id); + let mut transfer_req = TransferLeaderRequest::default(); + transfer_req.set_peer(to_peer.clone()); + let admin_req = req.mut_admin_request(); + admin_req.set_cmd_type(AdminCmdType::TransferLeader); + admin_req.set_transfer_leader(transfer_req); + let resp = router.admin_command(region_id, req).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + cluster.dispatch(region_id, vec![]); + + let meta = router + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.raft_status.soft_state.leader_id, to_peer.id); + let meta = router2 + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + assert_eq!(meta.raft_status.soft_state.leader_id, to_peer.id); +} + +#[test] +fn test_transfer_leader() { + let cluster = Cluster::with_node_count(3, None); + let region_id = 2; + let router0 = &cluster.routers[0]; + + let mut req = router0.new_request_for(region_id); + let admin_req = req.mut_admin_request(); + admin_req.set_cmd_type(AdminCmdType::ChangePeer); + admin_req + .mut_change_peer() + .set_change_type(ConfChangeType::AddNode); + let store_id = cluster.node(1).id(); + let peer1 = new_peer(store_id, 10); + admin_req.mut_change_peer().set_peer(peer1.clone()); + let req_clone = req.clone(); + let resp = router0.admin_command(region_id, req_clone).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let epoch = req.get_header().get_region_epoch(); + let new_conf_ver = epoch.get_conf_ver() + 1; + let leader_peer = req.get_header().get_peer().clone(); + let meta = router0 + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + 
assert_eq!(meta.region_state.epoch.version, epoch.get_version()); + assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); + assert_eq!(meta.region_state.peers, vec![leader_peer, peer1.clone()]); + let peer0_id = meta.raft_status.id; + + // So heartbeat will create a learner. + cluster.dispatch(region_id, vec![]); + let router1 = &cluster.routers[1]; + let meta = router1 + .must_query_debug_info(region_id, Duration::from_secs(3)) + .unwrap(); + assert_eq!(peer0_id, meta.raft_status.soft_state.leader_id); + assert_eq!(meta.raft_status.id, peer1.id, "{:?}", meta); + assert_eq!(meta.region_state.epoch.version, epoch.get_version()); + assert_eq!(meta.region_state.epoch.conf_ver, new_conf_ver); + cluster.dispatch(region_id, vec![]); + + // Ensure follower has latest entries before transfer leader. + put_data(region_id, &cluster, 0, 1, b"zkey1"); + + // Perform transfer leader + must_transfer_leader(&cluster, region_id, 0, 1, peer1); + + // Before transfer back to peer0, put some data again. + put_data(region_id, &cluster, 1, 0, b"zkey2"); + + // Perform transfer leader + let store_id = cluster.node(0).id(); + must_transfer_leader(&cluster, region_id, 1, 0, new_peer(store_id, peer0_id)); +} diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 64b49a227c9..13b7c5281f5 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -1,11 +1,11 @@ // Copyright 2016 TiKV Project Authors. Licensed under Apache-2.0. 
// #[PerformanceCriticalPath] called by Fsm on_ready_compute_hash -use std::{marker::PhantomData, mem, ops::Deref}; +use std::{borrow::Cow, marker::PhantomData, mem, ops::Deref}; use engine_traits::{CfName, KvEngine}; use kvproto::{ - metapb::Region, + metapb::{Region, RegionEpoch}, pdpb::CheckPolicy, raft_cmdpb::{ComputeHashRequest, RaftCmdRequest}, raft_serverpb::RaftMessage, @@ -14,8 +14,120 @@ use protobuf::Message; use raft::eraftpb; use tikv_util::box_try; -use super::*; -use crate::store::CasualRouter; +use super::{split_observer::SplitObserver, *}; +use crate::store::BucketRange; + +/// A handle for coprocessor to schedule some command back to raftstore. +pub trait StoreHandle: Clone + Send { + fn update_approximate_size(&self, region_id: u64, size: u64); + fn update_approximate_keys(&self, region_id: u64, keys: u64); + fn ask_split( + &self, + region_id: u64, + region_epoch: RegionEpoch, + split_keys: Vec>, + source: Cow<'static, str>, + ); + fn refresh_region_buckets( + &self, + region_id: u64, + region_epoch: RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + ); + fn update_compute_hash_result( + &self, + region_id: u64, + index: u64, + context: Vec, + hash: Vec, + ); +} + +#[derive(Clone, Debug, PartialEq)] +pub enum SchedTask { + UpdateApproximateSize { + region_id: u64, + size: u64, + }, + UpdateApproximateKeys { + region_id: u64, + keys: u64, + }, + AskSplit { + region_id: u64, + region_epoch: RegionEpoch, + split_keys: Vec>, + source: Cow<'static, str>, + }, + RefreshRegionBuckets { + region_id: u64, + region_epoch: RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + }, + UpdateComputeHashResult { + region_id: u64, + index: u64, + hash: Vec, + context: Vec, + }, +} + +impl StoreHandle for std::sync::mpsc::SyncSender { + fn update_approximate_size(&self, region_id: u64, size: u64) { + let _ = self.try_send(SchedTask::UpdateApproximateSize { region_id, size }); + } + + fn update_approximate_keys(&self, region_id: u64, keys: u64) { + let _ = 
self.try_send(SchedTask::UpdateApproximateKeys { region_id, keys }); + } + + fn ask_split( + &self, + region_id: u64, + region_epoch: RegionEpoch, + split_keys: Vec>, + source: Cow<'static, str>, + ) { + let _ = self.try_send(SchedTask::AskSplit { + region_id, + region_epoch, + split_keys, + source, + }); + } + + fn refresh_region_buckets( + &self, + region_id: u64, + region_epoch: RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + ) { + let _ = self.try_send(SchedTask::RefreshRegionBuckets { + region_id, + region_epoch, + buckets, + bucket_ranges, + }); + } + + fn update_compute_hash_result( + &self, + region_id: u64, + index: u64, + context: Vec, + hash: Vec, + ) { + let _ = self.try_send(SchedTask::UpdateComputeHashResult { + region_id, + index, + context, + hash, + }); + } +} struct Entry { priority: u32, @@ -340,10 +452,8 @@ where } impl CoprocessorHost { - pub fn new + Clone + Send + 'static>( - ch: C, - cfg: Config, - ) -> CoprocessorHost { + pub fn new(ch: C, cfg: Config) -> CoprocessorHost { + // TODO load coprocessors from configuration let mut registry = Registry::default(); registry.register_split_check_observer( 200, @@ -358,6 +468,7 @@ impl CoprocessorHost { 400, BoxSplitCheckObserver::new(TableCheckObserver::default()), ); + registry.register_admin_observer(100, BoxAdminObserver::new(SplitObserver)); CoprocessorHost { registry, cfg } } @@ -680,13 +791,6 @@ impl CoprocessorHost { false } - pub fn on_peer_created(&self, region_id: u64, peer_id: u64, event: PeerCreateEvent) { - for observer in &self.registry.region_change_observers { - let observer = observer.observer.inner(); - observer.on_peer_created(region_id, peer_id, event) - } - } - pub fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index dc7cb7471f0..b8e2b07235a 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ 
-33,7 +33,7 @@ pub use self::{ dispatcher::{ BoxAdminObserver, BoxApplySnapshotObserver, BoxCmdObserver, BoxConsistencyCheckObserver, BoxPdTaskObserver, BoxQueryObserver, BoxRegionChangeObserver, BoxRoleObserver, - BoxSplitCheckObserver, BoxUpdateSafeTsObserver, CoprocessorHost, Registry, + BoxSplitCheckObserver, BoxUpdateSafeTsObserver, CoprocessorHost, Registry, StoreHandle, }, error::{Error, Result}, region_info_accessor::{ @@ -268,15 +268,18 @@ pub struct RoleChange { pub prev_lead_transferee: u64, /// Which peer is voted by itself. pub vote: u64, + pub initialized: bool, } impl RoleChange { + #[cfg(feature = "testexport")] pub fn new(state: StateRole) -> Self { RoleChange { state, leader_id: raft::INVALID_ID, prev_lead_transferee: raft::INVALID_ID, vote: raft::INVALID_ID, + initialized: true, } } } @@ -307,11 +310,6 @@ pub enum RegionChangeEvent { UpdateBuckets(usize), } -#[derive(Clone, Copy, Debug, PartialEq)] -pub enum PeerCreateEvent { - Replicate, - Create, -} pub trait RegionChangeObserver: Coprocessor { /// Hook to call when a region changed on this TiKV fn on_region_changed(&self, _: &mut ObserverContext<'_>, _: RegionChangeEvent, _: StateRole) {} @@ -337,8 +335,6 @@ pub trait RegionChangeObserver: Coprocessor { fn should_skip_raft_message(&self, _: &RaftMessage) -> bool { false } - - fn on_peer_created(&self, _: u64, _: u64, _: PeerCreateEvent) {} } #[derive(Clone, Debug, Default)] diff --git a/components/raftstore/src/coprocessor/region_info_accessor.rs b/components/raftstore/src/coprocessor/region_info_accessor.rs index 338cf3962c4..37403310baf 100644 --- a/components/raftstore/src/coprocessor/region_info_accessor.rs +++ b/components/raftstore/src/coprocessor/region_info_accessor.rs @@ -46,11 +46,26 @@ use super::{ /// `RaftStoreEvent` Represents events dispatched from raftstore coprocessor. 
#[derive(Debug)] pub enum RaftStoreEvent { - CreateRegion { region: Region, role: StateRole }, - UpdateRegion { region: Region, role: StateRole }, - DestroyRegion { region: Region }, - RoleChange { region: Region, role: StateRole }, - UpdateRegionBuckets { region: Region, buckets: usize }, + CreateRegion { + region: Region, + role: StateRole, + }, + UpdateRegion { + region: Region, + role: StateRole, + }, + DestroyRegion { + region: Region, + }, + RoleChange { + region: Region, + role: StateRole, + initialized: bool, + }, + UpdateRegionBuckets { + region: Region, + buckets: usize, + }, } impl RaftStoreEvent { @@ -191,7 +206,11 @@ impl RoleObserver for RegionEventListener { fn on_role_change(&self, context: &mut ObserverContext<'_>, role_change: &RoleChange) { let region = context.region().clone(); let role = role_change.state; - let event = RaftStoreEvent::RoleChange { region, role }; + let event = RaftStoreEvent::RoleChange { + region, + role, + initialized: role_change.initialized, + }; self.scheduler .schedule(RegionInfoQuery::RaftStoreEvent(event)) .unwrap(); @@ -426,7 +445,10 @@ impl RegionCollector { // They are impossible to equal, or they cannot overlap. assert_ne!( region.get_region_epoch().get_version(), - current_region.get_region_epoch().get_version() + current_region.get_region_epoch().get_version(), + "{:?} vs {:?}", + region, + current_region, ); // Remove it since it's a out-of-date region info. if clear_regions_in_range { @@ -492,6 +514,10 @@ impl RegionCollector { // epoch is properly set and an Update message was sent. return; } + if let RaftStoreEvent::RoleChange { initialized, .. } = &event && !initialized { + // Ignore uninitialized peers. 
+ return; + } if !self.check_region_range(region, true) { debug!( "Received stale event"; @@ -511,7 +537,7 @@ impl RegionCollector { RaftStoreEvent::DestroyRegion { region } => { self.handle_destroy_region(region); } - RaftStoreEvent::RoleChange { region, role } => { + RaftStoreEvent::RoleChange { region, role, .. } => { self.handle_role_change(region, role); } RaftStoreEvent::UpdateRegionBuckets { region, buckets } => { @@ -988,10 +1014,16 @@ mod tests { } } - fn must_change_role(c: &mut RegionCollector, region: &Region, role: StateRole) { + fn must_change_role( + c: &mut RegionCollector, + region: &Region, + role: StateRole, + initialized: bool, + ) { c.handle_raftstore_event(RaftStoreEvent::RoleChange { region: region.clone(), role, + initialized, }); if let Some(r) = c.regions.get(®ion.get_id()) { @@ -1037,6 +1069,12 @@ mod tests { c.handle_raftstore_event(RaftStoreEvent::RoleChange { region: new_region(1, b"k1", b"k2", 0), role: StateRole::Leader, + initialized: true, + }); + c.handle_raftstore_event(RaftStoreEvent::RoleChange { + region: new_region(1, b"", b"", 3), + role: StateRole::Leader, + initialized: false, }); check_collection(&c, &[]); @@ -1198,9 +1236,15 @@ mod tests { &mut c, &new_region(1, b"k0", b"k1", 2), StateRole::Candidate, + true, ); must_create_region(&mut c, &new_region(5, b"k99", b"", 2), StateRole::Follower); - must_change_role(&mut c, &new_region(2, b"k2", b"k8", 2), StateRole::Leader); + must_change_role( + &mut c, + &new_region(2, b"k2", b"k8", 2), + StateRole::Leader, + true, + ); must_update_region(&mut c, &new_region(2, b"k3", b"k7", 3), StateRole::Leader); // test region buckets update must_update_region_buckets(&mut c, &new_region(2, b"k3", b"k7", 3), 4); @@ -1343,7 +1387,12 @@ mod tests { // which haven't been handled. 
must_create_region(&mut c, &new_region(4, b"k5", b"k9", 2), StateRole::Follower); must_update_region(&mut c, &new_region(2, b"k1", b"k9", 1), StateRole::Follower); - must_change_role(&mut c, &new_region(2, b"k1", b"k9", 1), StateRole::Leader); + must_change_role( + &mut c, + &new_region(2, b"k1", b"k9", 1), + StateRole::Leader, + true, + ); must_update_region(&mut c, &new_region(2, b"k1", b"k5", 2), StateRole::Leader); // TODO: In fact, region 2's role should be follower. However because it's // previous state was removed while creating updating region 4, it can't be @@ -1364,7 +1413,12 @@ mod tests { // handled. must_update_region(&mut c, &new_region(2, b"k1", b"k9", 3), StateRole::Leader); must_update_region(&mut c, &new_region(4, b"k5", b"k9", 2), StateRole::Follower); - must_change_role(&mut c, &new_region(4, b"k5", b"k9", 2), StateRole::Leader); + must_change_role( + &mut c, + &new_region(4, b"k5", b"k9", 2), + StateRole::Leader, + true, + ); must_destroy_region(&mut c, new_region(4, b"k5", b"k9", 2)); check_collection( &c, diff --git a/components/raftstore/src/coprocessor/split_check/half.rs b/components/raftstore/src/coprocessor/split_check/half.rs index 8f572eb1f9f..fafa41e44b5 100644 --- a/components/raftstore/src/coprocessor/split_check/half.rs +++ b/components/raftstore/src/coprocessor/split_check/half.rs @@ -140,8 +140,8 @@ mod tests { *, }; use crate::{ - coprocessor::{Config, CoprocessorHost}, - store::{BucketRange, CasualMessage, SplitCheckRunner, SplitCheckTask}, + coprocessor::{dispatcher::SchedTask, Config, CoprocessorHost}, + store::{BucketRange, SplitCheckRunner, SplitCheckTask}, }; #[test] @@ -451,15 +451,11 @@ mod tests { )); loop { - if let Ok(( - _, - CasualMessage::RefreshRegionBuckets { - region_epoch: _, - buckets, - bucket_ranges, - .. - }, - )) = rx.try_recv() + if let Ok(SchedTask::RefreshRegionBuckets { + buckets, + bucket_ranges, + .. 
+ }) = rx.try_recv() { assert_eq!(buckets.len(), bucket_ranges.unwrap().len()); assert_eq!(buckets.len(), 5); diff --git a/components/raftstore/src/coprocessor/split_check/keys.rs b/components/raftstore/src/coprocessor/split_check/keys.rs index e2e58933e57..92e159d233f 100644 --- a/components/raftstore/src/coprocessor/split_check/keys.rs +++ b/components/raftstore/src/coprocessor/split_check/keys.rs @@ -1,10 +1,5 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. -use std::{ - marker::PhantomData, - sync::{Arc, Mutex}, -}; - use engine_traits::{KvEngine, Range}; use error_code::ErrorCodeExt; use kvproto::{metapb::Region, pdpb::CheckPolicy}; @@ -19,7 +14,7 @@ use super::{ size::get_approximate_split_keys, Host, }; -use crate::store::{CasualMessage, CasualRouter}; +use crate::coprocessor::dispatcher::StoreHandle; pub struct Checker { max_keys_count: u64, @@ -116,29 +111,19 @@ where } #[derive(Clone)] -pub struct KeysCheckObserver { - router: Arc>, - _phantom: PhantomData, +pub struct KeysCheckObserver { + router: C, } -impl, E> KeysCheckObserver -where - E: KvEngine, -{ - pub fn new(router: C) -> KeysCheckObserver { - KeysCheckObserver { - router: Arc::new(Mutex::new(router)), - _phantom: PhantomData, - } +impl KeysCheckObserver { + pub fn new(router: C) -> KeysCheckObserver { + KeysCheckObserver { router } } } -impl Coprocessor for KeysCheckObserver {} +impl Coprocessor for KeysCheckObserver {} -impl + Send, E> SplitCheckObserver for KeysCheckObserver -where - E: KvEngine, -{ +impl SplitCheckObserver for KeysCheckObserver { fn add_checker( &self, ctx: &mut ObserverContext<'_>, @@ -172,15 +157,7 @@ where } }; - let res = CasualMessage::RegionApproximateKeys { keys: region_keys }; - if let Err(e) = self.router.lock().unwrap().send(region_id, res) { - warn!( - "failed to send approximate region keys"; - "region_id" => region_id, - "err" => %e, - "error_code" => %e.error_code(), - ); - } + self.router.update_approximate_keys(region_id, region_keys); 
REGION_KEYS_HISTOGRAM.observe(region_keys as f64); // if bucket checker using scan is added, to utilize the scan, @@ -253,8 +230,8 @@ mod tests { *, }; use crate::{ - coprocessor::{Config, CoprocessorHost}, - store::{CasualMessage, SplitCheckRunner, SplitCheckTask}, + coprocessor::{dispatcher::SchedTask, Config, CoprocessorHost}, + store::{SplitCheckRunner, SplitCheckTask}, }; fn put_data(engine: &impl KvEngine, mut start_idx: u64, end_idx: u64, fill_short_value: bool) { @@ -323,8 +300,8 @@ mod tests { )); // keys has not reached the max_keys 100 yet. match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) - | Ok((region_id, CasualMessage::RegionApproximateKeys { .. })) => { + Ok(SchedTask::UpdateApproximateSize { region_id, .. }) + | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) => { assert_eq!(region_id, region.get_id()); } others => panic!("expect recv empty, but got {:?}", others), @@ -427,8 +404,8 @@ mod tests { )); // keys has not reached the max_keys 100 yet. match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) - | Ok((region_id, CasualMessage::RegionApproximateKeys { .. })) => { + Ok(SchedTask::UpdateApproximateSize { region_id, .. }) + | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) => { assert_eq!(region_id, region.get_id()); } others => panic!("expect recv empty, but got {:?}", others), @@ -599,8 +576,8 @@ mod tests { )); // keys has not reached the max_keys 100 yet. match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) - | Ok((region_id, CasualMessage::RegionApproximateKeys { .. })) => { + Ok(SchedTask::UpdateApproximateSize { region_id, .. }) + | Ok(SchedTask::UpdateApproximateKeys { region_id, .. 
}) => { assert_eq!(region_id, region.get_id()); } others => panic!("expect recv empty, but got {:?}", others), diff --git a/components/raftstore/src/coprocessor/split_check/size.rs b/components/raftstore/src/coprocessor/split_check/size.rs index bdcf817365c..1f4a33d7af7 100644 --- a/components/raftstore/src/coprocessor/split_check/size.rs +++ b/components/raftstore/src/coprocessor/split_check/size.rs @@ -1,10 +1,5 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. -use std::{ - marker::PhantomData, - sync::{Arc, Mutex}, -}; - use engine_traits::{KvEngine, Range}; use error_code::ErrorCodeExt; use kvproto::{metapb::Region, pdpb::CheckPolicy}; @@ -17,7 +12,7 @@ use super::{ }, calc_split_keys_count, Host, }; -use crate::store::{CasualMessage, CasualRouter}; +use crate::coprocessor::dispatcher::StoreHandle; pub struct Checker { max_size: u64, @@ -116,29 +111,19 @@ where } #[derive(Clone)] -pub struct SizeCheckObserver { - router: Arc>, - _phantom: PhantomData, +pub struct SizeCheckObserver { + router: C, } -impl, E> SizeCheckObserver -where - E: KvEngine, -{ - pub fn new(router: C) -> SizeCheckObserver { - SizeCheckObserver { - router: Arc::new(Mutex::new(router)), - _phantom: PhantomData, - } +impl SizeCheckObserver { + pub fn new(router: C) -> SizeCheckObserver { + SizeCheckObserver { router } } } -impl Coprocessor for SizeCheckObserver {} +impl Coprocessor for SizeCheckObserver {} -impl + Send, E> SplitCheckObserver for SizeCheckObserver -where - E: KvEngine, -{ +impl SplitCheckObserver for SizeCheckObserver { fn add_checker( &self, ctx: &mut ObserverContext<'_>, @@ -173,15 +158,7 @@ where }; // send it to raftstore to update region approximate size - let res = CasualMessage::RegionApproximateSize { size: region_size }; - if let Err(e) = self.router.lock().unwrap().send(region_id, res) { - warn!( - "failed to send approximate region size"; - "region_id" => region_id, - "err" => %e, - "error_code" => %e.error_code(), - ); - } + 
self.router.update_approximate_size(region_id, region_size); let need_bucket_checker = host.cfg.enable_region_bucket && region_size >= 2 * host.cfg.region_bucket_size.0; @@ -256,7 +233,7 @@ pub fn get_approximate_split_keys( #[cfg(test)] pub mod tests { - use std::{iter, sync::mpsc, u64}; + use std::{assert_matches::assert_matches, iter, sync::mpsc, u64}; use collections::HashSet; use engine_test::{ @@ -276,30 +253,31 @@ pub mod tests { use super::{Checker, *}; use crate::{ - coprocessor::{Config, CoprocessorHost, ObserverContext, SplitChecker}, - store::{BucketRange, CasualMessage, KeyEntry, SplitCheckRunner, SplitCheckTask}, + coprocessor::{ + dispatcher::SchedTask, Config, CoprocessorHost, ObserverContext, SplitChecker, + }, + store::{BucketRange, KeyEntry, SplitCheckRunner, SplitCheckTask}, }; fn must_split_at_impl( - rx: &mpsc::Receiver<(u64, CasualMessage)>, + rx: &mpsc::Receiver, exp_region: &Region, exp_split_keys: Vec>, ignore_split_keys: bool, ) { loop { match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) - | Ok((region_id, CasualMessage::RegionApproximateKeys { .. })) => { + Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) + | Ok(SchedTask::UpdateApproximateSize { region_id, .. }) + | Ok(SchedTask::RefreshRegionBuckets { region_id, .. }) => { assert_eq!(region_id, exp_region.get_id()); } - Ok(( + Ok(SchedTask::AskSplit { region_id, - CasualMessage::SplitRegion { - region_epoch, - split_keys, - .. - }, - )) => { + region_epoch, + split_keys, + .. + }) => { assert_eq!(region_id, exp_region.get_id()); assert_eq!(®ion_epoch, exp_region.get_region_epoch()); if !ignore_split_keys { @@ -307,14 +285,13 @@ pub mod tests { } break; } - Ok((_region_id, CasualMessage::RefreshRegionBuckets { .. 
})) => {} others => panic!("expect split check result, but got {:?}", others), } } } pub fn must_split_at( - rx: &mpsc::Receiver<(u64, CasualMessage)>, + rx: &mpsc::Receiver, exp_region: &Region, exp_split_keys: Vec>, ) { @@ -322,50 +299,36 @@ pub mod tests { } pub fn must_split_with( - rx: &mpsc::Receiver<(u64, CasualMessage)>, + rx: &mpsc::Receiver, exp_region: &Region, exp_split_keys_count: usize, ) { loop { match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) - | Ok((region_id, CasualMessage::RegionApproximateKeys { .. })) => { + Ok(SchedTask::UpdateApproximateSize { region_id, .. }) + | Ok(SchedTask::UpdateApproximateKeys { region_id, .. }) + | Ok(SchedTask::RefreshRegionBuckets { region_id, .. }) => { assert_eq!(region_id, exp_region.get_id()); } - Ok(( + Ok(SchedTask::AskSplit { region_id, - CasualMessage::SplitRegion { - region_epoch, - split_keys, - .. - }, - )) => { + region_epoch, + split_keys, + .. + }) => { assert_eq!(region_id, exp_region.get_id()); assert_eq!(®ion_epoch, exp_region.get_region_epoch()); assert_eq!(split_keys.len(), exp_split_keys_count); break; } - Ok((_region_id, CasualMessage::RefreshRegionBuckets { .. })) => {} others => panic!("expect split check result, but got {:?}", others), } } } - pub fn must_generate_buckets( - rx: &mpsc::Receiver<(u64, CasualMessage)>, - exp_buckets_keys: &[Vec], - ) { + pub fn must_generate_buckets(rx: &mpsc::Receiver, exp_buckets_keys: &[Vec]) { loop { - if let Ok(( - _, - CasualMessage::RefreshRegionBuckets { - region_epoch: _, - mut buckets, - bucket_ranges: _, - .. - }, - )) = rx.try_recv() - { + if let Ok(SchedTask::RefreshRegionBuckets { mut buckets, .. 
}) = rx.try_recv() { let mut i = 0; if !exp_buckets_keys.is_empty() { let bucket = buckets.pop().unwrap(); @@ -383,23 +346,14 @@ pub mod tests { } pub fn must_generate_buckets_approximate( - rx: &mpsc::Receiver<(u64, CasualMessage)>, + rx: &mpsc::Receiver, bucket_range: Option, min_leap: i32, max_leap: i32, mvcc: bool, ) { loop { - if let Ok(( - _, - CasualMessage::RefreshRegionBuckets { - region_epoch: _, - mut buckets, - bucket_ranges: _, - .. - }, - )) = rx.try_recv() - { + if let Ok(SchedTask::RefreshRegionBuckets { mut buckets, .. }) = rx.try_recv() { let bucket_keys = buckets.pop().unwrap().keys; if let Some(bucket_range) = bucket_range { assert!(!bucket_keys.is_empty()); @@ -489,12 +443,7 @@ pub mod tests { None, )); // size has not reached the max_size 100 yet. - match rx.try_recv() { - Ok((region_id, CasualMessage::RegionApproximateSize { .. })) => { - assert_eq!(region_id, region.get_id()); - } - others => panic!("expect recv empty, but got {:?}", others), - } + assert_matches!(rx.try_recv(), Ok(SchedTask::UpdateApproximateSize { region_id, .. }) if region_id == region.get_id()); for i in 7..11 { let s = keys::data_key(format!("{:04}", i).as_bytes()); diff --git a/components/raftstore/src/coprocessor/split_check/table.rs b/components/raftstore/src/coprocessor/split_check/table.rs index 9b5220938fd..684e87e1693 100644 --- a/components/raftstore/src/coprocessor/split_check/table.rs +++ b/components/raftstore/src/coprocessor/split_check/table.rs @@ -238,8 +238,8 @@ mod tests { use super::*; use crate::{ - coprocessor::{Config, CoprocessorHost}, - store::{CasualMessage, SplitCheckRunner, SplitCheckTask}, + coprocessor::{dispatcher::SchedTask, Config, CoprocessorHost}, + store::{SplitCheckRunner, SplitCheckTask}, }; /// Composes table record and index prefix: `t[table_id]`. @@ -353,9 +353,9 @@ mod tests { let key = Key::from_raw(&gen_table_prefix(id)); loop { match rx.try_recv() { - Ok((_, CasualMessage::RegionApproximateSize { .. 
})) - | Ok((_, CasualMessage::RegionApproximateKeys { .. })) => (), - Ok((_, CasualMessage::SplitRegion { split_keys, .. })) => { + Ok(SchedTask::UpdateApproximateSize { .. }) + | Ok(SchedTask::UpdateApproximateKeys { .. }) => (), + Ok(SchedTask::AskSplit { split_keys, .. }) => { assert_eq!(split_keys, vec![key.into_encoded()]); break; } @@ -365,8 +365,8 @@ mod tests { } else { loop { match rx.try_recv() { - Ok((_, CasualMessage::RegionApproximateSize { .. })) - | Ok((_, CasualMessage::RegionApproximateKeys { .. })) => (), + Ok(SchedTask::UpdateApproximateSize { .. }) + | Ok(SchedTask::UpdateApproximateKeys { .. }) => (), Err(mpsc::TryRecvError::Empty) => { break; } diff --git a/components/raftstore/src/lib.rs b/components/raftstore/src/lib.rs index e56678edec2..6104ae7b7cf 100644 --- a/components/raftstore/src/lib.rs +++ b/components/raftstore/src/lib.rs @@ -7,6 +7,7 @@ #![feature(box_patterns)] #![feature(hash_drain_filter)] #![feature(let_chains)] +#![feature(assert_matches)] #![recursion_limit = "256"] #[cfg(test)] diff --git a/components/raftstore/src/router.rs b/components/raftstore/src/router.rs index 1ded8be3886..0f22eb483a0 100644 --- a/components/raftstore/src/router.rs +++ b/components/raftstore/src/router.rs @@ -1,10 +1,14 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. +use std::borrow::Cow; + // #[PerformanceCriticalPath] use crossbeam::channel::TrySendError; use engine_traits::{KvEngine, RaftEngine, Snapshot}; -use kvproto::{raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; +use error_code::ErrorCodeExt; +use kvproto::{metapb, raft_cmdpb::RaftCmdRequest, raft_serverpb::RaftMessage}; use raft::SnapshotStatus; +use slog_global::warn; use tikv_util::time::ThreadReadId; use crate::{ @@ -276,3 +280,107 @@ impl RaftStoreRouter for RaftRouter { batch_system::Router::broadcast_normal(self, msg_gen) } } + +// Because `CasualRouter` needs an generic while `RaftRotuer` doesn't. We have +// to bridge two by manually implementations. 
Using functions to reduce +// duplicated codes. + +impl crate::coprocessor::StoreHandle for RaftRouter { + fn update_approximate_size(&self, region_id: u64, size: u64) { + if let Err(e) = CasualRouter::send( + self, + region_id, + CasualMessage::RegionApproximateSize { size }, + ) { + warn!( + "failed to send approximate region size"; + "region_id" => region_id, + "err" => %e, + "error_code" => %e.error_code(), + ); + } + } + + fn update_approximate_keys(&self, region_id: u64, keys: u64) { + if let Err(e) = CasualRouter::send( + self, + region_id, + CasualMessage::RegionApproximateKeys { keys }, + ) { + warn!( + "failed to send approximate region keys"; + "region_id" => region_id, + "err" => %e, + "error_code" => %e.error_code(), + ); + } + } + + fn ask_split( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + split_keys: Vec>, + source: Cow<'static, str>, + ) { + if let Err(e) = CasualRouter::send( + self, + region_id, + CasualMessage::SplitRegion { + region_epoch, + split_keys, + callback: Callback::None, + source, + }, + ) { + warn!( + "failed to send ask split"; + "region_id" => region_id, + "err" => %e, + ); + } + } + + fn update_compute_hash_result( + &self, + region_id: u64, + index: u64, + context: Vec, + hash: Vec, + ) { + if let Err(e) = CasualRouter::send( + self, + region_id, + CasualMessage::ComputeHashResult { + index, + context, + hash, + }, + ) { + warn!( + "failed to send hash compute result"; + "region_id" => region_id, + "err" => %e, + ); + } + } + + fn refresh_region_buckets( + &self, + region_id: u64, + region_epoch: metapb::RegionEpoch, + buckets: Vec, + bucket_ranges: Option>, + ) { + let _ = CasualRouter::send( + self, + region_id, + CasualMessage::RefreshRegionBuckets { + region_epoch, + buckets, + bucket_ranges, + cb: Callback::None, + }, + ); + } +} diff --git a/components/raftstore/src/store/async_io/read.rs b/components/raftstore/src/store/async_io/read.rs index 5dc01b40ef3..b298ed3529e 100644 --- 
a/components/raftstore/src/store/async_io/read.rs +++ b/components/raftstore/src/store/async_io/read.rs @@ -79,12 +79,12 @@ pub struct FetchedLogs { pub logs: Box, } -pub type GenSnapRes = Option>; +pub type GenSnapRes = Option>; /// A router for receiving fetched result. pub trait AsyncReadNotifier: Send { fn notify_logs_fetched(&self, region_id: u64, fetched: FetchedLogs); - fn notify_snapshot_generated(&self, region_id: u64, res: Option>); + fn notify_snapshot_generated(&self, region_id: u64, res: GenSnapRes); } pub struct ReadRunner @@ -231,7 +231,7 @@ where SNAP_HISTOGRAM .generate .observe(start.saturating_elapsed_secs()); - res = Some(Box::new(snapshot)) + res = Some(Box::new((snapshot, to_peer))) } self.notifier.notify_snapshot_generated(region_id, res); diff --git a/components/raftstore/src/store/async_io/write.rs b/components/raftstore/src/store/async_io/write.rs index 354a796c99c..b4cceb96a82 100644 --- a/components/raftstore/src/store/async_io/write.rs +++ b/components/raftstore/src/store/async_io/write.rs @@ -20,14 +20,12 @@ use engine_traits::{ }; use error_code::ErrorCodeExt; use fail::fail_point; -use kvproto::raft_serverpb::{ - PeerState, RaftApplyState, RaftLocalState, RaftMessage, RegionLocalState, -}; +use kvproto::raft_serverpb::{RaftLocalState, RaftMessage}; use protobuf::Message; use raft::eraftpb::Entry; use tikv_util::{ box_err, - config::{Tracker, VersionTrack}, + config::{ReadableSize, Tracker, VersionTrack}, debug, info, slow_log, sys::thread::StdThreadBuildWrapper, thd_name, @@ -39,7 +37,6 @@ use super::write_router::WriteSenders; use crate::{ store::{ config::Config, - entry_storage::first_index, fsm::RaftRouter, local_metrics::{RaftSendMessageMetrics, StoreWriteMetrics, TimeTracker}, metrics::*, @@ -54,6 +51,7 @@ const KV_WB_SHRINK_SIZE: usize = 1024 * 1024; const KV_WB_DEFAULT_SIZE: usize = 16 * 1024; const RAFT_WB_SHRINK_SIZE: usize = 10 * 1024 * 1024; const RAFT_WB_DEFAULT_SIZE: usize = 256 * 1024; +const RAFT_WB_SPLIT_SIZE: usize 
= ReadableSize::gb(1).0 as usize; /// Notify the event to the specified region. pub trait PersistedNotifier: Clone + Send + 'static { @@ -88,22 +86,24 @@ where /// /// For now, applying snapshot needs to persist some extra states. For v1, /// these states are written to KvEngine. For v2, they are written to -/// RaftEngine. +/// RaftEngine. Although in v2 these states are also written to raft engine, +/// but we have to use `ExtraState` as they should be written as the last +/// updates. // TODO: perhaps we should always pass states instead of a write batch even // for v1. -pub enum ExtraWrite { +pub enum ExtraWrite { None, V1(W), - V2(ExtraStates), + V2(L), } -impl ExtraWrite { +impl ExtraWrite { #[inline] pub fn is_empty(&self) -> bool { match self { ExtraWrite::None => true, ExtraWrite::V1(w) => w.is_empty(), - _ => false, + ExtraWrite::V2(l) => l.is_empty(), } } @@ -112,7 +112,7 @@ impl ExtraWrite { match self { ExtraWrite::None => 0, ExtraWrite::V1(w) => w.data_size(), - ExtraWrite::V2(m) => mem::size_of_val(m), + ExtraWrite::V2(l) => l.persist_size(), } } @@ -139,18 +139,33 @@ impl ExtraWrite { } #[inline] - pub fn set_v2(&mut self, extra_states: ExtraStates) { - if let ExtraWrite::V1(_) = self { + pub fn ensure_v2(&mut self, log_batch: impl FnOnce() -> L) -> &mut L { + if let ExtraWrite::None = self { + *self = ExtraWrite::V2(log_batch()); + } else if let ExtraWrite::V1(_) = self { unreachable!("v1 and v2 are mixed used"); - } else { - *self = ExtraWrite::V2(extra_states); + } + match self { + ExtraWrite::V2(l) => l, + _ => unreachable!(), } } #[inline] - pub fn v2_mut(&mut self) -> Option<&mut ExtraStates> { - if let ExtraWrite::V2(m) = self { - Some(m) + pub fn merge_v2(&mut self, log_batch: L) { + if let ExtraWrite::None = self { + *self = ExtraWrite::V2(log_batch); + } else if let ExtraWrite::V1(_) = self { + unreachable!("v1 and v2 are mixed used"); + } else if let ExtraWrite::V2(l) = self { + l.merge(log_batch).unwrap(); + } + } + + #[inline] + pub fn 
v2_mut(&mut self) -> Option<&mut L> { + if let ExtraWrite::V2(l) = self { + Some(l) } else { None } @@ -170,11 +185,11 @@ where pub send_time: Instant, pub raft_wb: Option, // called after writing to kvdb and raftdb. - pub persisted_cb: Option>, + pub persisted_cbs: Vec>, pub entries: Vec, pub cut_logs: Option<(u64, u64)>, pub raft_state: Option, - pub extra_write: ExtraWrite, + pub extra_write: ExtraWrite, pub messages: Vec, pub trackers: Vec, pub has_snapshot: bool, @@ -198,7 +213,7 @@ where extra_write: ExtraWrite::None, messages: vec![], trackers: vec![], - persisted_cb: None, + persisted_cbs: Vec::new(), has_snapshot: false, } } @@ -263,57 +278,27 @@ where } } -/// These states are set only in raftstore V2. -#[derive(Default)] -pub struct ExtraStates { - apply_state: RaftApplyState, - region_state: Option, - // Set only want to destroy the raft group in write worker. - raft_state: Option, -} - -impl ExtraStates { - #[inline] - pub fn new(apply_state: RaftApplyState) -> Self { - Self { - apply_state, - region_state: None, - raft_state: None, - } - } - - #[inline] - pub fn set_region_state(&mut self, region_state: RegionLocalState) { - self.region_state = Some(region_state); - } - - #[inline] - pub fn set_raft_state(&mut self, raft_state: RaftLocalState) { - self.raft_state = Some(raft_state); - } -} - -pub enum ExtraBatchWrite { +pub enum ExtraBatchWrite { None, V1(W), - V2(HashMap), + V2(L), } -impl ExtraBatchWrite { +impl ExtraBatchWrite { #[inline] fn clear(&mut self) { match self { ExtraBatchWrite::None => {} ExtraBatchWrite::V1(w) => w.clear(), - ExtraBatchWrite::V2(m) => m.clear(), + // No clear in in `RaftLogBatch`. + ExtraBatchWrite::V2(_) => *self = ExtraBatchWrite::None, } } /// Merge the extra_write with this batch. /// /// If there is any new states inserted, return the size of the state. 
- fn merge(&mut self, region_id: u64, extra_write: &mut ExtraWrite) -> usize { - let mut inserted = false; + fn merge(&mut self, extra_write: &mut ExtraWrite) { match mem::replace(extra_write, ExtraWrite::None) { ExtraWrite::None => (), ExtraWrite::V1(wb) => match self { @@ -321,35 +306,11 @@ impl ExtraBatchWrite { ExtraBatchWrite::V1(kv_wb) => kv_wb.merge(wb).unwrap(), ExtraBatchWrite::V2(_) => unreachable!("v2 and v1 are mixed used"), }, - ExtraWrite::V2(extra_states) => match self { - ExtraBatchWrite::None => { - let mut map = HashMap::default(); - map.insert(region_id, extra_states); - *self = ExtraBatchWrite::V2(map); - inserted = true; - } + ExtraWrite::V2(lb) => match self { + ExtraBatchWrite::None => *self = ExtraBatchWrite::V2(lb), ExtraBatchWrite::V1(_) => unreachable!("v2 and v1 are mixed used"), - ExtraBatchWrite::V2(extra_states_map) => match extra_states_map.entry(region_id) { - collections::HashMapEntry::Occupied(mut slot) => { - slot.get_mut().apply_state = extra_states.apply_state; - if let Some(region_state) = extra_states.region_state { - slot.get_mut().region_state = Some(region_state); - } - if let Some(raft_state) = extra_states.raft_state { - slot.get_mut().raft_state = Some(raft_state); - } - } - collections::HashMapEntry::Vacant(slot) => { - slot.insert(extra_states); - inserted = true; - } - }, + ExtraBatchWrite::V2(raft_wb) => raft_wb.merge(lb).unwrap(), }, - }; - if inserted { - std::mem::size_of::() - } else { - 0 } } } @@ -360,15 +321,20 @@ where EK: KvEngine, ER: RaftEngine, { - pub raft_wb: ER::LogBatch, - // Write raft state once for a region everytime writing to disk + // When a single batch becomes too large, we uses multiple batches each containing atomic + // writes. + pub raft_wbs: Vec, + // Write states once for a region everytime writing to disk. + // These states only corresponds to entries inside `raft_wbs.last()`. States for other write + // batches must be inlined early. 
pub raft_states: HashMap, - pub extra_batch_write: ExtraBatchWrite, + pub extra_batch_write: ExtraBatchWrite, pub state_size: usize, pub tasks: Vec>, pub persisted_cbs: Vec>, // region_id -> (peer_id, ready_number) pub readies: HashMap, + pub(crate) raft_wb_split_size: usize, } impl WriteTaskBatch @@ -378,44 +344,60 @@ where { fn new(raft_wb: ER::LogBatch) -> Self { Self { - raft_wb, + raft_wbs: vec![raft_wb], raft_states: HashMap::default(), extra_batch_write: ExtraBatchWrite::None, state_size: 0, tasks: vec![], persisted_cbs: vec![], readies: HashMap::default(), + raft_wb_split_size: RAFT_WB_SPLIT_SIZE, + } + } + + #[inline] + fn flush_states_to_raft_wb(&mut self) { + let wb = self.raft_wbs.last_mut().unwrap(); + for (region_id, state) in self.raft_states.drain() { + wb.put_raft_state(region_id, &state).unwrap(); + } + self.state_size = 0; + if let ExtraBatchWrite::V2(_) = self.extra_batch_write { + let ExtraBatchWrite::V2(lb) = mem::replace(&mut self.extra_batch_write, ExtraBatchWrite::None) else { unreachable!() }; + wb.merge(lb).unwrap(); } } /// Add write task to this batch - fn add_write_task(&mut self, mut task: WriteTask) { + fn add_write_task(&mut self, raft_engine: &ER, mut task: WriteTask) { if let Err(e) = task.valid() { panic!("task is not valid: {:?}", e); } - if let Some(raft_wb) = task.raft_wb.take() { - self.raft_wb.merge(raft_wb).unwrap(); + + if self.raft_wb_split_size > 0 + && self.raft_wbs.last().unwrap().persist_size() >= self.raft_wb_split_size + { + self.flush_states_to_raft_wb(); + self.raft_wbs + .push(raft_engine.log_batch(RAFT_WB_DEFAULT_SIZE)); } - let entries = std::mem::take(&mut task.entries); - self.raft_wb.append(task.region_id, entries).unwrap(); + let raft_wb = self.raft_wbs.last_mut().unwrap(); + if let Some(wb) = task.raft_wb.take() { + raft_wb.merge(wb).unwrap(); + } + raft_wb + .append(task.region_id, std::mem::take(&mut task.entries)) + .unwrap(); if let Some((from, to)) = task.cut_logs { - 
self.raft_wb.cut_logs(task.region_id, from, to); + raft_wb.cut_logs(task.region_id, from, to); } - if let Some(raft_state) = task.raft_state.take() { - if self - .raft_states - .insert(task.region_id, raft_state) - .is_none() - { - self.state_size += std::mem::size_of::(); - } + if let Some(raft_state) = task.raft_state.take() + && self.raft_states.insert(task.region_id, raft_state).is_none() { + self.state_size += std::mem::size_of::(); } - - self.state_size += self - .extra_batch_write - .merge(task.region_id, &mut task.extra_write); + self.extra_batch_write.merge(&mut task.extra_write); if let Some(prev_readies) = self .readies @@ -437,9 +419,9 @@ where ); } } - if let Some(v) = task.persisted_cb.take() { + for v in task.persisted_cbs.drain(..) { self.persisted_cbs.push(v); - }; + } self.tasks.push(task); } @@ -460,41 +442,16 @@ where #[inline] fn get_raft_size(&self) -> usize { - self.state_size + self.raft_wb.persist_size() + self.state_size + + self + .raft_wbs + .iter() + .map(|wb| wb.persist_size()) + .sum::() } - fn before_write_to_db(&mut self, engine: &ER, metrics: &StoreWriteMetrics) { - // Put raft state to raft writebatch - for (region_id, state) in self.raft_states.drain() { - self.raft_wb.put_raft_state(region_id, &state).unwrap(); - } - if let ExtraBatchWrite::V2(extra_states_map) = &mut self.extra_batch_write { - for (region_id, state) in extra_states_map.drain() { - let mut tombstone = false; - if let Some(region_state) = state.region_state { - if region_state.get_state() == PeerState::Tombstone { - tombstone = true; - engine - .clean( - region_id, - first_index(&state.apply_state), - state.raft_state.as_ref().unwrap(), - &mut self.raft_wb, - ) - .unwrap(); - } - self.raft_wb - .put_region_state(region_id, ®ion_state) - .unwrap(); - } - if !tombstone { - self.raft_wb - .put_apply_state(region_id, &state.apply_state) - .unwrap(); - } - } - } - self.state_size = 0; + fn before_write_to_db(&mut self, metrics: &StoreWriteMetrics) { + 
self.flush_states_to_raft_wb(); if metrics.waterfall_metrics { let now = std::time::Instant::now(); for task in &self.tasks { @@ -677,7 +634,7 @@ where } pub fn handle_write_task(&mut self, task: WriteTask) { - self.batch.add_write_task(task); + self.batch.add_write_task(&self.raft_engine, task); } pub fn write_to_db(&mut self, notify: bool) { @@ -687,8 +644,7 @@ where let timer = Instant::now(); - self.batch - .before_write_to_db(&self.raft_engine, &self.metrics); + self.batch.before_write_to_db(&self.metrics); fail_point!("raft_before_save"); @@ -726,24 +682,27 @@ where fail_point!("raft_between_save"); let mut write_raft_time = 0f64; - if !self.batch.raft_wb.is_empty() { + if !self.batch.raft_wbs[0].is_empty() { fail_point!("raft_before_save_on_store_1", self.store_id == 1, |_| {}); let now = Instant::now(); self.perf_context.start_observe(); - self.raft_engine - .consume_and_shrink( - &mut self.batch.raft_wb, - true, - RAFT_WB_SHRINK_SIZE, - RAFT_WB_DEFAULT_SIZE, - ) - .unwrap_or_else(|e| { - panic!( - "store {}: {} failed to write to raft engine: {:?}", - self.store_id, self.tag, e - ); - }); + for i in 0..self.batch.raft_wbs.len() { + self.raft_engine + .consume_and_shrink( + &mut self.batch.raft_wbs[i], + true, + RAFT_WB_SHRINK_SIZE, + RAFT_WB_DEFAULT_SIZE, + ) + .unwrap_or_else(|e| { + panic!( + "store {}: {} failed to write to raft engine: {:?}", + self.store_id, self.tag, e + ); + }); + } + self.batch.raft_wbs.truncate(1); let trackers: Vec<_> = self .batch .tasks @@ -759,6 +718,11 @@ where self.batch.after_write_to_raft_db(&self.metrics); + fail_point!( + "async_write_before_cb", + !self.batch.persisted_cbs.is_empty(), + |_| () + ); self.batch.after_write_all(); fail_point!("raft_before_follower_send"); @@ -946,8 +910,8 @@ pub fn write_to_db_for_test( ER: RaftEngine, { let mut batch = WriteTaskBatch::new(engines.raft.log_batch(RAFT_WB_DEFAULT_SIZE)); - batch.add_write_task(task); - batch.before_write_to_db(&engines.raft, &StoreWriteMetrics::new(false)); 
+ batch.add_write_task(&engines.raft, task); + batch.before_write_to_db(&StoreWriteMetrics::new(false)); if let ExtraBatchWrite::V1(kv_wb) = &mut batch.extra_batch_write { if !kv_wb.is_empty() { let mut write_opts = WriteOptions::new(); @@ -957,13 +921,12 @@ pub fn write_to_db_for_test( }); } } - if !batch.raft_wb.is_empty() { - engines - .raft - .consume(&mut batch.raft_wb, true) - .unwrap_or_else(|e| { + if !batch.raft_wbs[0].is_empty() { + for wb in &mut batch.raft_wbs { + engines.raft.consume(wb, true).unwrap_or_else(|e| { panic!("test failed to write to raft engine: {:?}", e); }); + } } } diff --git a/components/raftstore/src/store/async_io/write_tests.rs b/components/raftstore/src/store/async_io/write_tests.rs index 1642c90d075..6007b39489e 100644 --- a/components/raftstore/src/store/async_io/write_tests.rs +++ b/components/raftstore/src/store/async_io/write_tests.rs @@ -6,7 +6,7 @@ use collections::HashSet; use crossbeam::channel::unbounded; use engine_test::{kv::KvTestEngine, new_temp_engine, raft::RaftTestEngine}; use engine_traits::{Engines, Mutable, Peekable, RaftEngineReadOnly, WriteBatchExt}; -use kvproto::raft_serverpb::RaftMessage; +use kvproto::raft_serverpb::{RaftApplyState, RaftMessage, RegionLocalState}; use tempfile::Builder; use super::*; @@ -273,7 +273,7 @@ fn test_worker() { task_1.raft_state = Some(new_raft_state(5, 123, 6, 8)); task_1.messages.append(&mut vec![RaftMessage::default()]); - t.worker.batch.add_write_task(task_1); + t.worker.batch.add_write_task(&engines.raft, task_1); let mut task_2 = WriteTask::::new(region_2, 2, 15); init_write_batch(&engines, &mut task_2); @@ -287,7 +287,7 @@ fn test_worker() { .messages .append(&mut vec![RaftMessage::default(), RaftMessage::default()]); - t.worker.batch.add_write_task(task_2); + t.worker.batch.add_write_task(&engines.raft, task_2); let mut task_3 = WriteTask::::new(region_1, 1, 11); init_write_batch(&engines, &mut task_3); @@ -303,7 +303,7 @@ fn test_worker() { .messages .append(&mut 
vec![RaftMessage::default(), RaftMessage::default()]); - t.worker.batch.add_write_task(task_3); + t.worker.batch.add_write_task(&engines.raft, task_3); t.worker.write_to_db(true); @@ -337,6 +337,124 @@ fn test_worker() { must_have_same_count_msg(5, &t.msg_rx); } +#[test] +fn test_worker_split_raft_wb() { + let path = Builder::new().prefix("async-io-worker").tempdir().unwrap(); + let engines = new_temp_engine(&path); + let mut t = TestWorker::new(&Config::default(), &engines); + + let mut run_test = |region_1: u64, region_2: u64, split: (bool, bool)| { + let raft_key_1 = 17 + region_1; + let raft_key_2 = 27 + region_1; + let raft_key_3 = 37 + region_1; + let mut expected_wbs = 1; + + let mut task_1 = WriteTask::::new(region_1, 1, 10); + task_1.raft_wb = Some(engines.raft.log_batch(0)); + let mut apply_state_1 = RaftApplyState::default(); + apply_state_1.set_applied_index(10); + let lb = task_1.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(region_1, 10, &apply_state_1).unwrap(); + put_raft_kv(task_1.raft_wb.as_mut(), raft_key_1); + task_1.entries.append(&mut vec![ + new_entry(5, 5), + new_entry(6, 5), + new_entry(7, 5), + new_entry(8, 5), + ]); + task_1.raft_state = Some(new_raft_state(5, 123, 6, 8)); + t.worker.batch.add_write_task(&engines.raft, task_1); + + let mut task_2 = WriteTask::::new(region_2, 2, 15); + task_2.raft_wb = Some(engines.raft.log_batch(0)); + let mut apply_state_2 = RaftApplyState::default(); + apply_state_2.set_applied_index(16); + let lb = task_2.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(region_2, 16, &apply_state_2).unwrap(); + put_raft_kv(task_2.raft_wb.as_mut(), raft_key_2); + task_2 + .entries + .append(&mut vec![new_entry(20, 15), new_entry(21, 15)]); + task_2.raft_state = Some(new_raft_state(15, 234, 20, 21)); + if split.0 { + expected_wbs += 1; + t.worker.batch.raft_wb_split_size = 1; + } else { + t.worker.batch.raft_wb_split_size = 0; + } + 
t.worker.batch.add_write_task(&engines.raft, task_2); + + let mut task_3 = WriteTask::::new(region_1, 1, 11); + task_3.raft_wb = Some(engines.raft.log_batch(0)); + let mut apply_state_3 = RaftApplyState::default(); + apply_state_3.set_applied_index(25); + let lb = task_3.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(region_1, 25, &apply_state_3).unwrap(); + put_raft_kv(task_3.raft_wb.as_mut(), raft_key_3); + delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), raft_key_1); + task_3 + .entries + .append(&mut vec![new_entry(6, 6), new_entry(7, 7)]); + task_3.cut_logs = Some((8, 9)); + task_3.raft_state = Some(new_raft_state(7, 124, 6, 7)); + if split.1 { + expected_wbs += 1; + t.worker.batch.raft_wb_split_size = 1; + } else { + t.worker.batch.raft_wb_split_size = 0; + } + t.worker.batch.add_write_task(&engines.raft, task_3); + + assert_eq!(t.worker.batch.raft_wbs.len(), expected_wbs); + t.worker.write_to_db(true); + assert_eq!(t.worker.batch.raft_wbs.len(), 1); + + must_have_same_notifies(vec![(region_1, (1, 11)), (region_2, (2, 15))], &t.notify_rx); + + assert_eq!(test_raft_kv(&engines.raft, raft_key_1), false); + assert_eq!(test_raft_kv(&engines.raft, raft_key_2), true); + assert_eq!(test_raft_kv(&engines.raft, raft_key_3), true); + + must_have_entries_and_state( + &engines.raft, + vec![ + ( + region_1, + vec![new_entry(5, 5), new_entry(6, 6), new_entry(7, 7)], + new_raft_state(7, 124, 6, 7), + ), + ( + region_2, + vec![new_entry(20, 15), new_entry(21, 15)], + new_raft_state(15, 234, 20, 21), + ), + ], + ); + assert_eq!( + engines.raft.get_apply_state(region_1, 25).unwrap(), + Some(RaftApplyState { + applied_index: 25, + ..Default::default() + }) + ); + assert_eq!( + engines.raft.get_apply_state(region_2, 16).unwrap(), + Some(RaftApplyState { + applied_index: 16, + ..Default::default() + }) + ); + }; + + let mut first_region = 1; + for a in [true, false] { + for b in [true, false] { + run_test(first_region, first_region + 1, (a, b)); + 
first_region += 10; + } + } +} + #[test] fn test_basic_flow() { let region_1 = 1; @@ -441,14 +559,14 @@ fn test_basic_flow_with_states() { task_1.raft_wb = Some(engines.raft.log_batch(0)); let mut apply_state_1 = RaftApplyState::default(); apply_state_1.applied_index = 2; - let mut extra_state = ExtraStates::new(apply_state_1); let mut region_state_1 = RegionLocalState::default(); region_state_1 .mut_region() .mut_region_epoch() .set_version(3); - extra_state.region_state = Some(region_state_1.clone()); - task_1.extra_write.set_v2(extra_state); + let lb = task_1.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(region_1, 2, &apply_state_1).unwrap(); + lb.put_region_state(region_1, 2, ®ion_state_1).unwrap(); put_raft_kv(task_1.raft_wb.as_mut(), 17); task_1 .entries @@ -464,8 +582,8 @@ fn test_basic_flow_with_states() { task_2.raft_wb = Some(engines.raft.log_batch(0)); let mut apply_state_2 = RaftApplyState::default(); apply_state_2.applied_index = 30; - let extra_state = ExtraStates::new(apply_state_2.clone()); - task_2.extra_write.set_v2(extra_state); + let lb = task_2.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(2, 30, &apply_state_2).unwrap(); put_raft_kv(task_2.raft_wb.as_mut(), 27); task_2 .entries @@ -481,8 +599,8 @@ fn test_basic_flow_with_states() { task_3.raft_wb = Some(engines.raft.log_batch(0)); let mut apply_state_3 = RaftApplyState::default(); apply_state_3.applied_index = 5; - let extra_state = ExtraStates::new(apply_state_3.clone()); - task_3.extra_write.set_v2(extra_state); + let lb = task_3.extra_write.ensure_v2(|| engines.raft.log_batch(0)); + lb.put_apply_state(region_1, 5, &apply_state_3).unwrap(); put_raft_kv(task_3.raft_wb.as_mut(), 37); delete_raft_kv(&engines.raft, task_3.raft_wb.as_mut(), 17); task_3.entries.append(&mut vec![new_entry(6, 6)]); @@ -516,18 +634,18 @@ fn test_basic_flow_with_states() { ], ); assert_eq!( - engines.raft.get_apply_state(region_1).unwrap().unwrap(), + 
engines.raft.get_apply_state(region_1, 5).unwrap().unwrap(), apply_state_3 ); assert_eq!( - engines.raft.get_apply_state(region_2).unwrap().unwrap(), + engines.raft.get_apply_state(region_2, 30).unwrap().unwrap(), apply_state_2 ); assert_eq!( - engines.raft.get_region_state(region_1).unwrap().unwrap(), + engines.raft.get_region_state(region_1, 2).unwrap().unwrap(), region_state_1 ); - assert_eq!(engines.raft.get_region_state(region_2).unwrap(), None); + assert_eq!(engines.raft.get_region_state(region_2, 1).unwrap(), None); must_have_same_count_msg(6, &t.msg_rx); diff --git a/components/raftstore/src/store/compaction_guard.rs b/components/raftstore/src/store/compaction_guard.rs index 78dbccbf585..efee09be906 100644 --- a/components/raftstore/src/store/compaction_guard.rs +++ b/components/raftstore/src/store/compaction_guard.rs @@ -447,15 +447,15 @@ mod tests { db.put(b"za1", b"").unwrap(); db.put(b"zb1", &value).unwrap(); db.put(b"zc1", &value).unwrap(); - db.flush_cfs(true /* wait */).unwrap(); + db.flush_cfs(&[], true /* wait */).unwrap(); db.put(b"zb2", &value).unwrap(); db.put(b"zc2", &value).unwrap(); db.put(b"zc3", &value).unwrap(); db.put(b"zc4", &value).unwrap(); db.put(b"zc5", &value).unwrap(); db.put(b"zc6", &value).unwrap(); - db.flush_cfs(true /* wait */).unwrap(); - db.compact_range( + db.flush_cfs(&[], true /* wait */).unwrap(); + db.compact_range_cf( CF_DEFAULT, None, // start_key None, // end_key false, // exclusive_manual diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index bd582d1c24a..affa0205e8f 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -90,7 +90,7 @@ use crate::{ peer::Peer, peer_storage::{write_initial_apply_state, write_peer_state}, util::{ - self, admin_cmd_epoch_lookup, check_flashback_state, check_region_epoch, + self, admin_cmd_epoch_lookup, check_flashback_state, check_req_region_epoch, compare_region_epoch, 
ChangePeerI, ConfChangeKind, KeysInfoFormatter, LatencyInspector, }, Config, RegionSnapshot, RegionTask, WriteCallback, @@ -1587,8 +1587,13 @@ where // Include region for epoch not match after merge may cause key not in range. let include_region = req.get_header().get_region_epoch().get_version() >= self.last_merge_version; - check_region_epoch(req, &self.region, include_region)?; - check_flashback_state(self.region.get_is_in_flashback(), req, self.region_id())?; + check_req_region_epoch(req, &self.region, include_region)?; + check_flashback_state( + self.region.get_is_in_flashback(), + req, + self.region_id(), + false, + )?; if req.has_admin_request() { self.exec_admin_cmd(ctx, req) } else { diff --git a/components/raftstore/src/store/fsm/mod.rs b/components/raftstore/src/store/fsm/mod.rs index a9b954552d3..2f700eec9bf 100644 --- a/components/raftstore/src/store/fsm/mod.rs +++ b/components/raftstore/src/store/fsm/mod.rs @@ -16,7 +16,7 @@ pub use self::{ ChangePeer, ExecResult, GenSnapTask, Msg as ApplyTask, Notifier as ApplyNotifier, Proposal, Registration, TaskRes as ApplyTaskRes, }, - peer::{DestroyPeerJob, PeerFsm, MAX_PROPOSAL_SIZE_RATIO}, + peer::{new_admin_request, DestroyPeerJob, PeerFsm, MAX_PROPOSAL_SIZE_RATIO}, store::{ create_raft_batch_system, RaftBatchSystem, RaftPollerBuilder, RaftRouter, StoreInfo, StoreMeta, diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index b31ca7c1afa..3f2a14e76c8 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -54,7 +54,7 @@ use tikv_util::{ box_err, debug, defer, error, escape, info, is_zero_duration, mpsc::{self, LooseBoundedSender, Receiver}, store::{find_peer, is_learner, region_on_same_stores}, - sys::{disk::DiskUsage, memory_usage_reaches_high_water}, + sys::disk::DiskUsage, time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant}, trace, warn, worker::{ScheduleError, Scheduler}, @@ -67,7 +67,7 @@ use 
self::memtrace::*; #[cfg(any(test, feature = "testexport"))] use crate::store::PeerInternalStat; use crate::{ - coprocessor::{CoprocessorHost, PeerCreateEvent, RegionChangeEvent, RegionChangeReason}, + coprocessor::{RegionChangeEvent, RegionChangeReason}, store::{ cmd_resp::{bind_term, new_error}, entry_storage::MAX_WARMED_UP_CACHE_KEEP_TIME, @@ -247,7 +247,6 @@ where raftlog_fetch_scheduler: Scheduler>, engines: Engines, region: &metapb::Region, - coprocessor_host: &CoprocessorHost, ) -> Result> { let meta_peer = match find_peer(region, store_id) { None => { @@ -267,11 +266,6 @@ where ); HIBERNATED_PEER_STATE_GAUGE.awaken.inc(); let (tx, rx) = mpsc::loose_bounded(cfg.notify_capacity); - coprocessor_host.on_peer_created( - region.get_id(), - meta_peer.get_id(), - PeerCreateEvent::Create, - ); Ok(( tx, Box::new(PeerFsm { @@ -313,7 +307,6 @@ where engines: Engines, region_id: u64, peer: metapb::Peer, - coprocessor_host: &CoprocessorHost, ) -> Result> { // We will remove tombstone key when apply snapshot info!( @@ -327,7 +320,6 @@ where HIBERNATED_PEER_STATE_GAUGE.awaken.inc(); let (tx, rx) = mpsc::loose_bounded(cfg.notify_capacity); - coprocessor_host.on_peer_created(region_id, peer.get_id(), PeerCreateEvent::Replicate); Ok(( tx, Box::new(PeerFsm { @@ -1133,6 +1125,8 @@ where store.apply_state(), self.fsm.hibernate_state.group_state(), peer.raft_group.status(), + peer.raft_group.raft.raft_log.last_index(), + peer.raft_group.raft.raft_log.persisted, )) } CasualMessage::QueryRegionLeaderResp { region, leader } => { @@ -1345,7 +1339,7 @@ where new_read_index_request(region_id, region_epoch.clone(), self.fsm.peer.peer.clone()); // Allow to capture change even is in flashback state. // TODO: add a test case for this kind of situation. 
- if self.fsm.peer.is_in_flashback { + if self.region().is_in_flashback { let mut flags = WriteBatchFlags::from_bits_check(msg.get_header().get_flags()); flags.insert(WriteBatchFlags::FLASHBACK); msg.mut_header().set_flags(flags.bits()); @@ -4021,7 +4015,6 @@ where self.ctx.raftlog_fetch_scheduler.clone(), self.ctx.engines.clone(), &new_region, - &self.ctx.coprocessor_host, ) { Ok((sender, new_peer)) => (sender, new_peer), Err(e) => { @@ -4906,9 +4899,7 @@ where } ExecResult::IngestSst { ssts } => self.on_ingest_sst_result(ssts), ExecResult::TransferLeader { term } => self.on_transfer_leader(term), - ExecResult::SetFlashbackState { region } => { - self.on_set_flashback_state(region.get_is_in_flashback()) - } + ExecResult::SetFlashbackState { region } => self.on_set_flashback_state(region), } } @@ -4999,7 +4990,7 @@ where msg: &RaftCmdRequest, ) -> Result> { // Check store_id, make sure that the msg is dispatched to the right place. - if let Err(e) = util::check_store_id(msg, self.store_id()) { + if let Err(e) = util::check_store_id(msg.get_header(), self.store_id()) { self.ctx .raft_metrics .invalid_proposal @@ -5018,7 +5009,7 @@ where let request = msg.get_requests(); // peer_id must be the same as peer's. - if let Err(e) = util::check_peer_id(msg, self.fsm.peer.peer_id()) { + if let Err(e) = util::check_peer_id(msg.get_header(), self.fsm.peer.peer_id()) { self.ctx .raft_metrics .invalid_proposal @@ -5042,13 +5033,12 @@ where // ReadIndex can be processed on the replicas. 
let is_read_index_request = request.len() == 1 && request[0].get_cmd_type() == CmdType::ReadIndex; - let mut read_only = true; - for r in msg.get_requests() { - match r.get_cmd_type() { - CmdType::Get | CmdType::Snap | CmdType::ReadIndex => (), - _ => read_only = false, - } - } + let read_only = msg.get_requests().iter().all(|r| { + matches!( + r.get_cmd_type(), + CmdType::Get | CmdType::Snap | CmdType::ReadIndex, + ) + }); let region_id = self.region_id(); let allow_replica_read = read_only && msg.get_header().get_replica_read(); let flags = WriteBatchFlags::from_bits_check(msg.get_header().get_flags()); @@ -5099,12 +5089,12 @@ where ))); } // Check whether the term is stale. - if let Err(e) = util::check_term(msg, self.fsm.peer.term()) { + if let Err(e) = util::check_term(msg.get_header(), self.fsm.peer.term()) { self.ctx.raft_metrics.invalid_proposal.stale_command.inc(); return Err(e); } - match util::check_region_epoch(msg, self.fsm.peer.region(), true) { + match util::check_req_region_epoch(msg, self.fsm.peer.region(), true) { Err(Error::EpochNotMatch(m, mut new_regions)) => { // Attach the region which might be split from the current region. But it // doesn't matter if the region is not split from the current region. If the @@ -5119,8 +5109,15 @@ where _ => {} }; // Check whether the region is in the flashback state and the request could be - // proposed. - if let Err(e) = util::check_flashback_state(self.fsm.peer.is_in_flashback, msg, region_id) { + // proposed. Skip the not prepared error because the + // `self.region().is_in_flashback` may not be the latest right after applying + // the `PrepareFlashback` admin command, we will let it pass here and check in + // the apply phase and because a read-only request doesn't need to be applied, + // so it will be allowed during the flashback progress, for example, a snapshot + // request. 
+ if let Err(e) = + util::check_flashback_state(self.region().is_in_flashback, msg, region_id, true) + { match e { Error::FlashbackInProgress(_) => self .ctx @@ -5431,12 +5428,9 @@ where fail_point!("on_entry_cache_evict_tick", |_| {}); if needs_evict_entry_cache(self.ctx.cfg.evict_cache_on_memory_ratio) { self.fsm.peer.mut_store().evict_entry_cache(true); - } - let mut _usage = 0; - if memory_usage_reaches_high_water(&mut _usage) - && !self.fsm.peer.get_store().is_entry_cache_empty() - { - self.register_entry_cache_evict_tick(); + if !self.fsm.peer.get_store().is_entry_cache_empty() { + self.register_entry_cache_evict_tick(); + } } } @@ -5561,7 +5555,34 @@ where "split_keys" => %KeysInfoFormatter(split_keys.iter()), "source" => source, ); - if let Err(e) = self.validate_split_region(®ion_epoch, &split_keys) { + + if !self.fsm.peer.is_leader() { + // region on this store is no longer leader, skipped. + info!( + "not leader, skip proposing split"; + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + ); + cb.invoke_with_response(new_error(Error::NotLeader( + self.region_id(), + self.fsm.peer.get_peer_from_cache(self.fsm.peer.leader_id()), + ))); + return; + } + if let Err(e) = util::validate_split_region( + self.fsm.region_id(), + self.fsm.peer_id(), + self.region(), + ®ion_epoch, + &split_keys, + ) { + info!( + "invalid split request"; + "err" => ?e, + "region_id" => self.fsm.region_id(), + "peer_id" => self.fsm.peer_id(), + "source" => %source + ); cb.invoke_with_response(new_error(e)); return; } @@ -5591,70 +5612,6 @@ where } } - fn validate_split_region( - &mut self, - epoch: &metapb::RegionEpoch, - split_keys: &[Vec], - ) -> Result<()> { - if split_keys.is_empty() { - error!( - "no split key is specified."; - "region_id" => self.fsm.region_id(), - "peer_id" => self.fsm.peer_id(), - ); - return Err(box_err!("{} no split key is specified.", self.fsm.peer.tag)); - } - for key in split_keys { - if key.is_empty() { - error!( - "split key should 
not be empty!!!"; - "region_id" => self.fsm.region_id(), - "peer_id" => self.fsm.peer_id(), - ); - return Err(box_err!( - "{} split key should not be empty", - self.fsm.peer.tag - )); - } - } - if !self.fsm.peer.is_leader() { - // region on this store is no longer leader, skipped. - info!( - "not leader, skip."; - "region_id" => self.fsm.region_id(), - "peer_id" => self.fsm.peer_id(), - ); - return Err(Error::NotLeader( - self.region_id(), - self.fsm.peer.get_peer_from_cache(self.fsm.peer.leader_id()), - )); - } - - let region = self.fsm.peer.region(); - let latest_epoch = region.get_region_epoch(); - - // This is a little difference for `check_region_epoch` in region split case. - // Here we just need to check `version` because `conf_ver` will be update - // to the latest value of the peer, and then send to PD. - if latest_epoch.get_version() != epoch.get_version() { - info!( - "epoch changed, retry later"; - "region_id" => self.fsm.region_id(), - "peer_id" => self.fsm.peer_id(), - "prev_epoch" => ?region.get_region_epoch(), - "epoch" => ?epoch, - ); - return Err(Error::EpochNotMatch( - format!( - "{} epoch changed {:?} != {:?}, retry later", - self.fsm.peer.tag, latest_epoch, epoch - ), - vec![region.to_owned()], - )); - } - Ok(()) - } - fn on_approximate_region_size(&mut self, size: u64) { self.fsm.peer.approximate_size = Some(size); self.register_split_region_check_tick(); @@ -6288,9 +6245,17 @@ where self.fsm.has_ready = true; } - fn on_set_flashback_state(&mut self, is_in_flashback: bool) { - // Set flashback memory - self.fsm.peer.is_in_flashback = is_in_flashback; + fn on_set_flashback_state(&mut self, region: metapb::Region) { + // Update the region meta. + self.update_region((|| { + #[cfg(feature = "failpoints")] + fail_point!("keep_peer_fsm_flashback_state_false", |_| { + let mut region = region.clone(); + region.is_in_flashback = false; + region + }); + region + })()); // Let the leader lease to None to ensure that local reads are not executed. 
self.fsm.peer.leader_lease_mut().expire_remote_lease(); } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 1e0d845f4fa..c4250ec0fd4 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -66,10 +66,7 @@ use time::{self, Timespec}; use crate::{ bytes_capacity, - coprocessor::{ - split_observer::SplitObserver, BoxAdminObserver, CoprocessorHost, RegionChangeEvent, - RegionChangeReason, - }, + coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason}, store::{ async_io::{ read::{ReadRunner, ReadTask}, @@ -109,7 +106,7 @@ use crate::{ type Key = Vec; pub const PENDING_MSG_CAP: usize = 100; -const ENTRY_CACHE_EVICT_TICK_DURATION: Duration = Duration::from_secs(1); +pub const ENTRY_CACHE_EVICT_TICK_DURATION: Duration = Duration::from_secs(1); pub const MULTI_FILES_SNAPSHOT_FEATURE: Feature = Feature::require(6, 1, 0); // it only makes sense for large region pub struct StoreInfo { @@ -118,6 +115,14 @@ pub struct StoreInfo { pub capacity: u64, } +/// A trait that provide the meta information that can be accessed outside +/// of raftstore. 
+pub trait StoreRegionMeta: Send { + fn store_id(&self) -> u64; + fn region_read_progress(&self) -> &RegionReadProgressRegistry; + fn search_region(&self, start_key: &[u8], end_key: &[u8], visitor: impl FnMut(&Region)); +} + pub struct StoreMeta { pub store_id: Option, /// region_end_key -> region_id @@ -157,6 +162,34 @@ pub struct StoreMeta { pub damaged_ranges: HashMap, Vec)>, } +impl StoreRegionMeta for StoreMeta { + #[inline] + fn store_id(&self) -> u64 { + self.store_id.unwrap() + } + + #[inline] + fn search_region(&self, start_key: &[u8], end_key: &[u8], mut visitor: impl FnMut(&Region)) { + let start_key = data_key(start_key); + for (_, id) in self + .region_ranges + .range((Excluded(start_key), Unbounded::>)) + { + let region = &self.regions[id]; + if end_key.is_empty() || end_key > region.get_start_key() { + visitor(region); + } else { + break; + } + } + } + + #[inline] + fn region_read_progress(&self) -> &RegionReadProgressRegistry { + &self.region_read_progress + } +} + impl StoreMeta { pub fn new(vote_capacity: usize) -> StoreMeta { StoreMeta { @@ -1173,7 +1206,6 @@ impl RaftPollerBuilder { self.raftlog_fetch_scheduler.clone(), self.engines.clone(), region, - &self.coprocessor_host, )); peer.peer.init_replication_mode(&mut replication_state); if local_state.get_state() == PeerState::Merging { @@ -1214,7 +1246,6 @@ impl RaftPollerBuilder { self.raftlog_fetch_scheduler.clone(), self.engines.clone(), ®ion, - &self.coprocessor_host, )?; peer.peer.init_replication_mode(&mut replication_state); peer.schedule_applying_snapshot(); @@ -1472,7 +1503,7 @@ impl RaftBatchSystem { mgr: SnapManager, pd_worker: LazyWorker>, store_meta: Arc>, - mut coprocessor_host: CoprocessorHost, + coprocessor_host: CoprocessorHost, importer: Arc, split_check_scheduler: Scheduler, background_worker: Worker, @@ -1485,12 +1516,6 @@ impl RaftBatchSystem { ) -> Result<()> { assert!(self.workers.is_none()); // TODO: we can get cluster meta regularly too later. 
- - // TODO load coprocessors from configuration - coprocessor_host - .registry - .register_admin_observer(100, BoxAdminObserver::new(SplitObserver)); - let purge_worker = if engines.raft.need_manual_purge() { let worker = Worker::new("purge-worker"); let raft_clone = engines.raft.clone(); @@ -2230,7 +2255,6 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.engines.clone(), region_id, target.clone(), - &self.ctx.coprocessor_host, )?; // WARNING: The checking code must be above this line. @@ -2886,7 +2910,6 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER self.ctx.raftlog_fetch_scheduler.clone(), self.ctx.engines.clone(), ®ion, - &self.ctx.coprocessor_host, ) { Ok((sender, peer)) => (sender, peer), Err(e) => { diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index 5d7455b2d1c..62561c63cbc 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -31,10 +31,7 @@ pub use self::msg::PeerInternalStat; pub use self::{ async_io::{ read::{AsyncReadNotifier, FetchedLogs, GenSnapRes, ReadRunner, ReadTask}, - write::{ - ExtraStates, PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, - WriteTask, - }, + write::{PersistedNotifier, StoreWriters, Worker as WriteWorker, WriteMsg, WriteTask}, write_router::{WriteRouter, WriteRouterContext, WriteSenders}, }, bootstrap::{ @@ -54,9 +51,10 @@ pub use self::{ StoreMsg, StoreTick, WriteCallback, WriteResponse, }, peer::{ - can_amend_read, get_sync_log_from_request, propose_read_index, should_renew_lease, Peer, - PeerStat, ProposalContext, ProposalQueue, RequestInspector, RequestPolicy, - SnapshotRecoveryWaitApplySyncer, + can_amend_read, get_sync_log_from_request, make_transfer_leader_response, + propose_read_index, should_renew_lease, Peer, PeerStat, ProposalContext, ProposalQueue, + RequestInspector, RequestPolicy, SnapshotRecoveryWaitApplySyncer, + 
TRANSFER_LEADER_COMMAND_REPLY_CTX, }, peer_storage::{ clear_meta, do_snapshot, write_initial_apply_state, write_initial_raft_state, @@ -76,10 +74,10 @@ pub use self::{ txn_ext::{LocksStatus, PeerPessimisticLocks, PessimisticLockPair, TxnExt}, util::{RegionReadProgress, RegionReadProgressRegistry}, worker::{ - metrics::TLS_LOCAL_READ_METRICS, AutoSplitController, Bucket, BucketRange, - CachedReadDelegate, CheckLeaderRunner, CheckLeaderTask, FlowStatistics, FlowStatsReporter, - KeyEntry, LocalReadContext, LocalReader, LocalReaderCore, PdTask, ReadDelegate, - ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, + metrics as worker_metrics, AutoSplitController, Bucket, BucketRange, CachedReadDelegate, + CheckLeaderRunner, CheckLeaderTask, FlowStatistics, FlowStatsReporter, KeyEntry, + LocalReadContext, LocalReader, LocalReaderCore, PdTask, ReadDelegate, ReadExecutor, + ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, StoreMetaDelegate, TrackVer, WriteStats, }, diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 9614161739a..a72bb59d8bf 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -76,8 +76,8 @@ use super::{ read_queue::{ReadIndexQueue, ReadIndexRequest}, transport::Transport, util::{ - self, check_region_epoch, is_initial_msg, AdminCmdEpochState, ChangePeerI, ConfChangeKind, - Lease, LeaseState, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER, + self, check_req_region_epoch, is_initial_msg, AdminCmdEpochState, ChangePeerI, + ConfChangeKind, Lease, LeaseState, NORMAL_REQ_CHECK_CONF_VER, NORMAL_REQ_CHECK_VER, }, DestroyPeerJob, LocalReadContext, }; @@ -1030,8 +1030,6 @@ where /// lead_transferee if this peer(leader) is in a leadership transferring. 
pub lead_transferee: u64, pub unsafe_recovery_state: Option, - // Used as the memory state for Flashback to reject RW/Schedule before proposing. - pub is_in_flashback: bool, pub snapshot_recovery_state: Option, } @@ -1167,7 +1165,6 @@ where last_region_buckets: None, lead_transferee: raft::INVALID_ID, unsafe_recovery_state: None, - is_in_flashback: region.get_is_in_flashback(), snapshot_recovery_state: None, }; @@ -2285,6 +2282,7 @@ where leader_id: ss.leader_id, prev_lead_transferee: self.lead_transferee, vote: self.raft_group.raft.vote, + initialized: self.is_initialized(), }, ); self.cmd_epoch_checker.maybe_update_term(self.term()); @@ -3531,7 +3529,7 @@ where self.force_leader.is_some(), ) { None - } else if self.is_in_flashback { + } else if self.region().is_in_flashback { debug!( "prevents renew lease while in flashback state"; "region_id" => self.region_id, @@ -4516,7 +4514,7 @@ where self.raft_group.raft.msgs.push(msg); } - /// Return true to if the transfer leader request is accepted. + /// Return true if the transfer leader request is accepted. /// /// When transferring leadership begins, leader sends a pre-transfer /// to target follower first to ensures it's ready to become leader. 
@@ -4711,7 +4709,7 @@ where ) -> ReadResponse { let region = self.region().clone(); if check_epoch { - if let Err(e) = check_region_epoch(&req, ®ion, true) { + if let Err(e) = check_req_region_epoch(&req, ®ion, true) { debug!("epoch not match"; "region_id" => region.get_id(), "err" => ?e); let mut response = cmd_resp::new_error(e); cmd_resp::bind_term(&mut response, self.term()); @@ -5658,7 +5656,7 @@ fn is_request_urgent(req: &RaftCmdRequest) -> bool { ) } -fn make_transfer_leader_response() -> RaftCmdResponse { +pub fn make_transfer_leader_response() -> RaftCmdResponse { let mut response = AdminResponse::default(); response.set_cmd_type(AdminCmdType::TransferLeader); response.set_transfer_leader(TransferLeaderResponse::default()); diff --git a/components/raftstore/src/store/peer_storage.rs b/components/raftstore/src/store/peer_storage.rs index 0d10b1f36cf..c9e460d1cbc 100644 --- a/components/raftstore/src/store/peer_storage.rs +++ b/components/raftstore/src/store/peer_storage.rs @@ -183,7 +183,9 @@ fn init_raft_state( raft_state.last_index = RAFT_INIT_LOG_INDEX; raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); raft_state.mut_hard_state().set_commit(RAFT_INIT_LOG_INDEX); - engines.raft.put_raft_state(region.get_id(), &raft_state)?; + let mut lb = engines.raft.log_batch(0); + lb.put_raft_state(region.get_id(), &raft_state)?; + engines.raft.consume(&mut lb, true)?; } Ok(raft_state) } @@ -2077,32 +2079,35 @@ pub mod tests { let initial_state = s.initial_state().unwrap(); assert_eq!(initial_state.hard_state, *raft_state.get_hard_state()); + let mut lb = engines.raft.log_batch(4096); // last_index < commit_index is invalid. 
raft_state.set_last_index(11); - engines - .raft - .append(1, vec![new_entry(11, RAFT_INIT_LOG_TERM)]) + lb.append(1, vec![new_entry(11, RAFT_INIT_LOG_TERM)]) .unwrap(); raft_state.mut_hard_state().set_commit(12); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); raft_state.set_last_index(20); let entries = (12..=20) .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); - engines.raft.append(1, entries).unwrap(); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.append(1, entries).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); s = build_storage().unwrap(); let initial_state = s.initial_state().unwrap(); assert_eq!(initial_state.hard_state, *raft_state.get_hard_state()); // Missing last log is invalid. raft_state.set_last_index(21); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); raft_state.set_last_index(20); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); // applied_index > commit_index is invalid. let mut apply_state = RaftApplyState::default(); @@ -2119,7 +2124,8 @@ pub mod tests { assert!(build_storage().is_err()); // It should not recover if corresponding log doesn't exist. 
- engines.raft.gc(1, 14, 15).unwrap(); + engines.raft.gc(1, 14, 15, &mut lb).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); apply_state.set_commit_index(14); apply_state.set_commit_term(RAFT_INIT_LOG_TERM); engines @@ -2131,8 +2137,9 @@ pub mod tests { let entries = (14..=20) .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); - engines.raft.gc(1, 0, 21).unwrap(); - engines.raft.append(1, entries).unwrap(); + engines.raft.gc(1, 0, 21, &mut lb).unwrap(); + lb.append(1, entries).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); raft_state.mut_hard_state().set_commit(14); s = build_storage().unwrap(); let initial_state = s.initial_state().unwrap(); @@ -2143,27 +2150,28 @@ pub mod tests { .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); entries[0].set_term(RAFT_INIT_LOG_TERM - 1); - engines.raft.append(1, entries).unwrap(); + lb.append(1, entries).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); // hard state term miss match is invalid. let entries = (14..=20) .map(|index| new_entry(index, RAFT_INIT_LOG_TERM)) .collect(); - engines.raft.append(1, entries).unwrap(); + lb.append(1, entries).unwrap(); raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM - 1); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); // last index < recorded_commit_index is invalid. 
- engines.raft.gc(1, 0, 21).unwrap(); + engines.raft.gc(1, 0, 21, &mut lb).unwrap(); raft_state.mut_hard_state().set_term(RAFT_INIT_LOG_TERM); raft_state.set_last_index(13); - engines - .raft - .append(1, vec![new_entry(13, RAFT_INIT_LOG_TERM)]) + lb.append(1, vec![new_entry(13, RAFT_INIT_LOG_TERM)]) .unwrap(); - engines.raft.put_raft_state(1, &raft_state).unwrap(); + lb.put_raft_state(1, &raft_state).unwrap(); + engines.raft.consume(&mut lb, false).unwrap(); assert!(build_storage().is_err()); } diff --git a/components/raftstore/src/store/region_meta.rs b/components/raftstore/src/store/region_meta.rs index 7de687e9dbb..4d44673e057 100644 --- a/components/raftstore/src/store/region_meta.rs +++ b/components/raftstore/src/store/region_meta.rs @@ -93,6 +93,8 @@ pub struct RaftStatus { pub applied: u64, pub voters: HashMap, pub learners: HashMap, + pub last_index: u64, + pub persisted_index: u64, } impl<'a> From> for RaftStatus { @@ -126,6 +128,8 @@ impl<'a> From> for RaftStatus { applied, voters, learners, + last_index: 0, + persisted_index: 0, } } } @@ -250,6 +254,8 @@ impl RegionMeta { apply_state: &raft_serverpb::RaftApplyState, group_state: GroupState, raft_status: Status<'_>, + last_index: u64, + persisted_index: u64, ) -> Self { let region = local_state.get_region(); let epoch = region.get_region_epoch(); @@ -270,10 +276,13 @@ impl RegionMeta { } else { None }; + let mut raft_status: RaftStatus = raft_status.into(); + raft_status.last_index = last_index; + raft_status.persisted_index = persisted_index; Self { group_state, - raft_status: raft_status.into(), + raft_status, raft_apply: RaftApplyState { applied_index: apply_state.get_applied_index(), commit_index: apply_state.get_commit_index(), diff --git a/components/raftstore/src/store/region_snapshot.rs b/components/raftstore/src/store/region_snapshot.rs index fe58a2587a7..ccf5f94e39e 100644 --- a/components/raftstore/src/store/region_snapshot.rs +++ b/components/raftstore/src/store/region_snapshot.rs @@ -460,7 
+460,7 @@ mod tests { let db = &engines.kv; for &(ref k, level) in &levels { db.put(&data_key(k), k).unwrap(); - db.flush_cfs(true).unwrap(); + db.flush_cfs(&[], true).unwrap(); data.push((k.to_vec(), k.to_vec())); db.compact_files_in_range(Some(&data_key(k)), Some(&data_key(k)), Some(level)) .unwrap(); diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 40ccac997fa..0c39288f939 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1945,20 +1945,15 @@ impl Display for TabletSnapKey { #[derive(Clone)] pub struct TabletSnapManager { // directory to store snapfile. - base: String, + base: PathBuf, } impl TabletSnapManager { - pub fn new>(path: T) -> Self { - Self { base: path.into() } - } - - pub fn init(&self) -> io::Result<()> { + pub fn new>(path: T) -> io::Result { // Initialize the directory if it doesn't exist. - let path = Path::new(&self.base); + let path = path.into(); if !path.exists() { - file_system::create_dir_all(path)?; - return Ok(()); + file_system::create_dir_all(&path)?; } if !path.is_dir() { return Err(io::Error::new( @@ -1966,7 +1961,7 @@ impl TabletSnapManager { format!("{} should be a directory", path.display()), )); } - Ok(()) + Ok(Self { base: path }) } pub fn tablet_gen_path(&self, key: &TabletSnapKey) -> PathBuf { @@ -1997,6 +1992,35 @@ impl TabletSnapManager { true } } + + pub fn total_snap_size(&self) -> Result { + let mut total_size = 0; + for entry in file_system::read_dir(&self.base)? { + let entry = match entry { + Ok(e) => e, + Err(e) if e.kind() == ErrorKind::NotFound => continue, + Err(e) => return Err(Error::from(e)), + }; + + let path = entry.path(); + // Generated snapshots are just checkpoints, only counts received snapshots. + if !path + .file_name() + .and_then(|n| n.to_str()) + .map_or(true, |n| n.starts_with(SNAP_REV_PREFIX)) + { + continue; + } + for e in file_system::read_dir(path)? 
{ + match e.and_then(|e| e.metadata()) { + Ok(m) => total_size += m.len(), + Err(e) if e.kind() == ErrorKind::NotFound => continue, + Err(e) => return Err(Error::from(e)), + } + } + } + Ok(total_size) + } } #[cfg(test)] @@ -2019,8 +2043,9 @@ pub mod tests { raft::RaftTestEngine, }; use engine_traits::{ - Engines, ExternalSstFileInfo, KvEngine, RaftEngine, Snapshot as EngineSnapshot, SstExt, - SstWriter, SstWriterBuilder, SyncMutable, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, + Engines, ExternalSstFileInfo, KvEngine, RaftEngine, RaftLogBatch, + Snapshot as EngineSnapshot, SstExt, SstWriter, SstWriterBuilder, SyncMutable, ALL_CFS, + CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use kvproto::{ encryptionpb::EncryptionMethod, @@ -2127,6 +2152,7 @@ pub mod tests { let kv: KvTestEngine = open_test_db(p.join("kv").as_path(), kv_db_opt, kv_cf_opts)?; let raft: RaftTestEngine = engine_test::raft::new_engine(p.join("raft").to_str().unwrap(), raft_db_opt)?; + let mut lb = raft.log_batch(regions.len() * 128); for ®ion_id in regions { // Put apply state into kv engine. let mut apply_state = RaftApplyState::default(); @@ -2136,7 +2162,7 @@ pub mod tests { apply_entry.set_term(0); apply_state.mut_truncated_state().set_index(10); kv.put_msg_cf(CF_RAFT, &keys::apply_state_key(region_id), &apply_state)?; - raft.append(region_id, vec![apply_entry])?; + lb.append(region_id, vec![apply_entry])?; // Put region info into kv engine. 
let region = gen_test_region(region_id, 1, 1); @@ -2144,6 +2170,7 @@ pub mod tests { region_state.set_region(region); kv.put_msg_cf(CF_RAFT, &keys::region_state_key(region_id), ®ion_state)?; } + raft.consume(&mut lb, false).unwrap(); Ok(Engines::new(kv, raft)) } diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 5f78065d32b..2d27b56fda5 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -19,7 +19,9 @@ use engine_traits::KvEngine; use kvproto::{ kvrpcpb::{self, KeyRange, LeaderInfo}, metapb::{self, Peer, PeerRole, Region, RegionEpoch}, - raft_cmdpb::{AdminCmdType, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest}, + raft_cmdpb::{ + AdminCmdType, ChangePeerRequest, ChangePeerV2Request, RaftCmdRequest, RaftRequestHeader, + }, raft_serverpb::{RaftMessage, RaftSnapshotData}, }; use protobuf::{self, Message}; @@ -35,7 +37,7 @@ use tikv_util::{ Either, }; use time::{Duration, Timespec}; -use txn_types::{TimeStamp, WriteBatchFlags}; +use txn_types::WriteBatchFlags; use super::{metrics::PEER_ADMIN_CMD_COUNTER_VEC, peer_storage, Config}; use crate::{coprocessor::CoprocessorHost, store::snap::SNAPSHOT_VERSION, Error, Result}; @@ -235,28 +237,45 @@ pub fn admin_cmd_epoch_lookup(admin_cmp_type: AdminCmdType) -> AdminCmdEpochStat pub static NORMAL_REQ_CHECK_VER: bool = true; pub static NORMAL_REQ_CHECK_CONF_VER: bool = false; -pub fn check_region_epoch( +pub fn check_req_region_epoch( req: &RaftCmdRequest, region: &metapb::Region, include_region: bool, ) -> Result<()> { - let (check_ver, check_conf_ver) = if !req.has_admin_request() { - // for get/set/delete, we don't care conf_version. 
- (NORMAL_REQ_CHECK_VER, NORMAL_REQ_CHECK_CONF_VER) + let admin_ty = if !req.has_admin_request() { + None } else { - let epoch_state = admin_cmd_epoch_lookup(req.get_admin_request().get_cmd_type()); - (epoch_state.check_ver, epoch_state.check_conf_ver) + Some(req.get_admin_request().get_cmd_type()) + }; + check_region_epoch(req.get_header(), admin_ty, region, include_region) +} + +pub fn check_region_epoch( + header: &RaftRequestHeader, + admin_ty: Option, + region: &metapb::Region, + include_region: bool, +) -> Result<()> { + let (check_ver, check_conf_ver) = match admin_ty { + None => { + // for get/set/delete, we don't care conf_version. + (NORMAL_REQ_CHECK_VER, NORMAL_REQ_CHECK_CONF_VER) + } + Some(ty) => { + let epoch_state = admin_cmd_epoch_lookup(ty); + (epoch_state.check_ver, epoch_state.check_conf_ver) + } }; if !check_ver && !check_conf_ver { return Ok(()); } - if !req.get_header().has_region_epoch() { + if !header.has_region_epoch() { return Err(box_err!("missing epoch!")); } - let from_epoch = req.get_header().get_region_epoch(); + let from_epoch = header.get_region_epoch(); compare_region_epoch( from_epoch, region, @@ -318,6 +337,7 @@ pub fn check_flashback_state( is_in_flashback: bool, req: &RaftCmdRequest, region_id: u64, + skip_not_prepared: bool, ) -> Result<()> { // The admin flashback cmd could be proposed/applied under any state. if req.has_admin_request() @@ -335,7 +355,7 @@ pub fn check_flashback_state( } // If the region is not in the flashback state, the flashback request itself // should be rejected. 
- if !is_in_flashback && is_flashback_request { + if !is_in_flashback && is_flashback_request && !skip_not_prepared { return Err(Error::FlashbackNotPrepared(region_id)); } Ok(()) @@ -350,8 +370,8 @@ pub fn is_region_epoch_equal( } #[inline] -pub fn check_store_id(req: &RaftCmdRequest, store_id: u64) -> Result<()> { - let peer = req.get_header().get_peer(); +pub fn check_store_id(header: &RaftRequestHeader, store_id: u64) -> Result<()> { + let peer = header.get_peer(); if peer.get_store_id() == store_id { Ok(()) } else { @@ -363,8 +383,7 @@ pub fn check_store_id(req: &RaftCmdRequest, store_id: u64) -> Result<()> { } #[inline] -pub fn check_term(req: &RaftCmdRequest, term: u64) -> Result<()> { - let header = req.get_header(); +pub fn check_term(header: &RaftRequestHeader, term: u64) -> Result<()> { if header.get_term() == 0 || term <= header.get_term() + 1 { Ok(()) } else { @@ -375,8 +394,7 @@ pub fn check_term(req: &RaftCmdRequest, term: u64) -> Result<()> { } #[inline] -pub fn check_peer_id(req: &RaftCmdRequest, peer_id: u64) -> Result<()> { - let header = req.get_header(); +pub fn check_peer_id(header: &RaftRequestHeader, peer_id: u64) -> Result<()> { if header.get_peer().get_id() == peer_id { Ok(()) } else { @@ -1185,11 +1203,7 @@ impl RegionReadProgress { if !core.pause { self.safe_ts.store(ts, AtomicOrdering::Release); // No need to update leader safe ts here. - coprocessor.on_update_safe_ts( - core.region_id, - TimeStamp::new(ts).physical(), - INVALID_TIMESTAMP, - ) + coprocessor.on_update_safe_ts(core.region_id, ts, INVALID_TIMESTAMP) } } } @@ -1231,11 +1245,7 @@ impl RegionReadProgress { self.safe_ts.store(ts, AtomicOrdering::Release); // After region merge, self safe ts may decrease, so leader safe ts should be // reset. 
- coprocessor.on_update_safe_ts( - core.region_id, - TimeStamp::new(ts).physical(), - TimeStamp::new(ts).physical(), - ) + coprocessor.on_update_safe_ts(core.region_id, ts, ts) } } } @@ -1260,9 +1270,7 @@ impl RegionReadProgress { } } } - let self_phy_ts = TimeStamp::new(self.safe_ts()).physical(); - let leader_phy_ts = TimeStamp::new(rs.get_safe_ts()).physical(); - coprocessor.on_update_safe_ts(leader_info.region_id, self_phy_ts, leader_phy_ts) + coprocessor.on_update_safe_ts(leader_info.region_id, self.safe_ts(), rs.get_safe_ts()) } // whether the provided `LeaderInfo` is same as ours core.leader_info.leader_term == leader_info.term @@ -1613,6 +1621,47 @@ impl LatencyInspector { } } +pub fn validate_split_region( + region_id: u64, + peer_id: u64, + region: &Region, + epoch: &RegionEpoch, + split_keys: &[Vec], +) -> Result<()> { + if split_keys.is_empty() { + return Err(box_err!( + "[region {}] {} no split key is specified.", + region_id, + peer_id + )); + } + + let latest_epoch = region.get_region_epoch(); + // This is a little difference for `check_region_epoch` in region split case. + // Here we just need to check `version` because `conf_ver` will be update + // to the latest value of the peer, and then send to PD. 
+ if latest_epoch.get_version() != epoch.get_version() { + return Err(Error::EpochNotMatch( + format!( + "[region {}] {} epoch changed {:?} != {:?}, retry later", + region_id, peer_id, latest_epoch, epoch + ), + vec![region.to_owned()], + )); + } + for key in split_keys { + if key.is_empty() { + return Err(box_err!( + "[region {}] {} split key should not be empty", + region_id, + peer_id + )); + } + check_key_in_region(key, region)?; + } + Ok(()) +} + #[cfg(test)] mod tests { use std::thread; @@ -2010,34 +2059,34 @@ mod tests { #[test] fn test_check_store_id() { - let mut req = RaftCmdRequest::default(); - req.mut_header().mut_peer().set_store_id(1); - check_store_id(&req, 1).unwrap(); - check_store_id(&req, 2).unwrap_err(); + let mut header = RaftRequestHeader::default(); + header.mut_peer().set_store_id(1); + check_store_id(&header, 1).unwrap(); + check_store_id(&header, 2).unwrap_err(); } #[test] fn test_check_peer_id() { - let mut req = RaftCmdRequest::default(); - req.mut_header().mut_peer().set_id(1); - check_peer_id(&req, 1).unwrap(); - check_peer_id(&req, 2).unwrap_err(); + let mut header = RaftRequestHeader::default(); + header.mut_peer().set_id(1); + check_peer_id(&header, 1).unwrap(); + check_peer_id(&header, 2).unwrap_err(); } #[test] fn test_check_term() { - let mut req = RaftCmdRequest::default(); - req.mut_header().set_term(7); - check_term(&req, 7).unwrap(); - check_term(&req, 8).unwrap(); + let mut header = RaftRequestHeader::default(); + header.set_term(7); + check_term(&header, 7).unwrap(); + check_term(&header, 8).unwrap(); // If header's term is 2 verions behind current term, // leadership may have been changed away. 
- check_term(&req, 9).unwrap_err(); - check_term(&req, 10).unwrap_err(); + check_term(&header, 9).unwrap_err(); + check_term(&header, 10).unwrap_err(); } #[test] - fn test_check_region_epoch() { + fn test_check_req_region_epoch() { let mut epoch = RegionEpoch::default(); epoch.set_conf_ver(2); epoch.set_version(2); @@ -2045,7 +2094,7 @@ mod tests { region.set_region_epoch(epoch.clone()); // Epoch is required for most requests even if it's empty. - check_region_epoch(&RaftCmdRequest::default(), ®ion, false).unwrap_err(); + check_req_region_epoch(&RaftCmdRequest::default(), ®ion, false).unwrap_err(); // These admin commands do not require epoch. for ty in &[ @@ -2060,11 +2109,11 @@ mod tests { req.set_admin_request(admin); // It is Okay if req does not have region epoch. - check_region_epoch(&req, ®ion, false).unwrap(); + check_req_region_epoch(&req, ®ion, false).unwrap(); req.mut_header().set_region_epoch(epoch.clone()); - check_region_epoch(&req, ®ion, true).unwrap(); - check_region_epoch(&req, ®ion, false).unwrap(); + check_req_region_epoch(&req, ®ion, true).unwrap(); + check_req_region_epoch(&req, ®ion, false).unwrap(); } // These admin commands requires epoch.version. @@ -2082,7 +2131,7 @@ mod tests { req.set_admin_request(admin); // Error if req does not have region epoch. 
- check_region_epoch(&req, ®ion, false).unwrap_err(); + check_req_region_epoch(&req, ®ion, false).unwrap_err(); let mut stale_version_epoch = epoch.clone(); stale_version_epoch.set_version(1); @@ -2090,14 +2139,14 @@ mod tests { stale_region.set_region_epoch(stale_version_epoch.clone()); req.mut_header() .set_region_epoch(stale_version_epoch.clone()); - check_region_epoch(&req, &stale_region, false).unwrap(); + check_req_region_epoch(&req, &stale_region, false).unwrap(); let mut latest_version_epoch = epoch.clone(); latest_version_epoch.set_version(3); for epoch in &[stale_version_epoch, latest_version_epoch] { req.mut_header().set_region_epoch(epoch.clone()); - check_region_epoch(&req, ®ion, false).unwrap_err(); - check_region_epoch(&req, ®ion, true).unwrap_err(); + check_req_region_epoch(&req, ®ion, false).unwrap_err(); + check_req_region_epoch(&req, ®ion, true).unwrap_err(); } } @@ -2118,21 +2167,21 @@ mod tests { req.set_admin_request(admin); // Error if req does not have region epoch. 
- check_region_epoch(&req, ®ion, false).unwrap_err(); + check_req_region_epoch(&req, ®ion, false).unwrap_err(); let mut stale_conf_epoch = epoch.clone(); stale_conf_epoch.set_conf_ver(1); let mut stale_region = metapb::Region::default(); stale_region.set_region_epoch(stale_conf_epoch.clone()); req.mut_header().set_region_epoch(stale_conf_epoch.clone()); - check_region_epoch(&req, &stale_region, false).unwrap(); + check_req_region_epoch(&req, &stale_region, false).unwrap(); let mut latest_conf_epoch = epoch.clone(); latest_conf_epoch.set_conf_ver(3); for epoch in &[stale_conf_epoch, latest_conf_epoch] { req.mut_header().set_region_epoch(epoch.clone()); - check_region_epoch(&req, ®ion, false).unwrap_err(); - check_region_epoch(&req, ®ion, true).unwrap_err(); + check_req_region_epoch(&req, ®ion, false).unwrap_err(); + check_req_region_epoch(&req, ®ion, true).unwrap_err(); } } } diff --git a/components/raftstore/src/store/worker/check_leader.rs b/components/raftstore/src/store/worker/check_leader.rs index ab83752d8c3..c4646de35a4 100644 --- a/components/raftstore/src/store/worker/check_leader.rs +++ b/components/raftstore/src/store/worker/check_leader.rs @@ -1,27 +1,25 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - collections::Bound::{Excluded, Unbounded}, fmt, sync::{Arc, Mutex}, }; use engine_traits::KvEngine; use fail::fail_point; -use keys::{data_end_key, data_key, enc_start_key}; use kvproto::kvrpcpb::{KeyRange, LeaderInfo}; use tikv_util::worker::Runnable; use crate::{ coprocessor::CoprocessorHost, - store::{fsm::store::StoreMeta, util::RegionReadProgressRegistry}, + store::{fsm::store::StoreRegionMeta, util::RegionReadProgressRegistry}, }; -pub struct Runner +pub struct Runner where E: KvEngine, { - store_meta: Arc>, + store_meta: Arc>, region_read_progress: RegionReadProgressRegistry, coprocessor: CoprocessorHost, } @@ -55,12 +53,13 @@ impl fmt::Display for Task { } } -impl Runner +impl Runner where + S: StoreRegionMeta, E: KvEngine, { - pub fn new(store_meta: Arc>, coprocessor: CoprocessorHost) -> Runner { - let region_read_progress = store_meta.lock().unwrap().region_read_progress.clone(); + pub fn new(store_meta: Arc>, coprocessor: CoprocessorHost) -> Self { + let region_read_progress = store_meta.lock().unwrap().region_read_progress().clone(); Runner { region_read_progress, store_meta, @@ -82,48 +81,39 @@ where .unwrap_or(0) }) } else { - let (start_key, end_key) = ( - data_key(key_range.get_start_key()), - data_end_key(key_range.get_end_key()), - ); // `store_safe_ts` won't be accessed frequently (like per-request or // per-transaction), also this branch won't entry because the request key range // is empty currently (in v5.1) keep this branch for robustness and future use, // so it is okay getting `store_safe_ts` from `store_meta` (behide a mutex) let meta = self.store_meta.lock().unwrap(); - meta.region_read_progress.with(|registry| { - meta.region_ranges - // get overlapped regions - .range((Excluded(start_key), Unbounded)) - .take_while(|(_, id)| end_key > enc_start_key(&meta.regions[*id])) - // get the min `safe_ts` - .map(|(_, id)| { - registry.get(id).unwrap().safe_ts() - }) - .filter(|ts| *ts != 0) // ts == 0 means the peer is uninitialized 
- .min() - .unwrap_or(0) + meta.region_read_progress().with(|registry| { + let mut min_ts = u64::MAX; + meta.search_region(key_range.get_start_key(), key_range.get_end_key(), |r| { + let ts = registry.get(&r.get_id()).unwrap().safe_ts(); + // ts == 0 means the peer is uninitialized + if ts != 0 && ts < min_ts { + min_ts = ts; + } + }); + if min_ts == u64::MAX { 0 } else { min_ts } }) } } } -impl Runnable for Runner -where - E: KvEngine, -{ +impl Runnable for Runner { type Task = Task; fn run(&mut self, task: Task) { match task { Task::CheckLeader { leaders, cb } => { fail_point!( "before_check_leader_store_2", - self.store_meta.lock().unwrap().store_id == Some(2), + self.store_meta.lock().unwrap().store_id() == 2, |_| {} ); fail_point!( "before_check_leader_store_3", - self.store_meta.lock().unwrap().store_id == Some(3), + self.store_meta.lock().unwrap().store_id() == 3, |_| {} ); let regions = self @@ -146,7 +136,7 @@ mod tests { use kvproto::metapb::Region; use super::*; - use crate::store::util::RegionReadProgress; + use crate::store::{fsm::StoreMeta, util::RegionReadProgress}; #[test] fn test_get_range_min_safe_ts() { diff --git a/components/raftstore/src/store/worker/compact.rs b/components/raftstore/src/store/worker/compact.rs index 958da2adaa6..7bc7052b277 100644 --- a/components/raftstore/src/store/worker/compact.rs +++ b/components/raftstore/src/store/worker/compact.rs @@ -108,7 +108,7 @@ where .start_coarse_timer(); box_try!( self.engine - .compact_range(cf_name, start_key, end_key, false, 1 /* threads */,) + .compact_range_cf(cf_name, start_key, end_key, false, 1 /* threads */,) ); compact_range_timer.observe_duration(); info!( diff --git a/components/raftstore/src/store/worker/consistency_check.rs b/components/raftstore/src/store/worker/consistency_check.rs index b3bd7ef32d0..fef2bae332c 100644 --- a/components/raftstore/src/store/worker/consistency_check.rs +++ b/components/raftstore/src/store/worker/consistency_check.rs @@ -9,8 +9,8 @@ use 
tikv_util::{error, info, warn, worker::Runnable}; use super::metrics::*; use crate::{ - coprocessor::CoprocessorHost, - store::{metrics::*, CasualMessage, CasualRouter}, + coprocessor::{dispatcher::StoreHandle, CoprocessorHost}, + store::metrics::*, }; /// Consistency checking task. @@ -44,12 +44,12 @@ impl Display for Task { } } -pub struct Runner> { +pub struct Runner { router: C, coprocessor_host: CoprocessorHost, } -impl> Runner { +impl Runner { pub fn new(router: C, cop_host: CoprocessorHost) -> Runner { Runner { router, @@ -85,18 +85,8 @@ impl> Runner { for (ctx, sum) in hashes { let mut checksum = Vec::with_capacity(4); checksum.write_u32::(sum).unwrap(); - let msg = CasualMessage::ComputeHashResult { - index, - context: ctx, - hash: checksum, - }; - if let Err(e) = self.router.send(region.get_id(), msg) { - warn!( - "failed to send hash compute result"; - "region_id" => region.get_id(), - "err" => %e, - ); - } + self.router + .update_compute_hash_result(region.get_id(), index, ctx, checksum); } timer.observe_duration(); @@ -106,7 +96,7 @@ impl> Runner { impl Runnable for Runner where EK: KvEngine, - C: CasualRouter, + C: StoreHandle, { type Task = Task; @@ -124,7 +114,7 @@ where #[cfg(test)] mod tests { - use std::{sync::mpsc, time::Duration}; + use std::{assert_matches::assert_matches, sync::mpsc, time::Duration}; use byteorder::{BigEndian, WriteBytesExt}; use engine_test::kv::{new_engine, KvTestEngine}; @@ -135,7 +125,8 @@ mod tests { use super::*; use crate::coprocessor::{ - BoxConsistencyCheckObserver, ConsistencyCheckMethod, RawConsistencyCheckObserver, + dispatcher::SchedTask, BoxConsistencyCheckObserver, ConsistencyCheckMethod, + RawConsistencyCheckObserver, }; #[test] @@ -177,21 +168,8 @@ mod tests { checksum_bytes.write_u32::(sum).unwrap(); let res = rx.recv_timeout(Duration::from_secs(3)).unwrap(); - match res { - ( - region_id, - CasualMessage::ComputeHashResult { - index, - hash, - context, - }, - ) => { - assert_eq!(region_id, region.get_id()); 
- assert_eq!(index, 10); - assert_eq!(context, vec![0]); - assert_eq!(hash, checksum_bytes); - } - e => panic!("unexpected {:?}", e), - } + assert_matches!(res, SchedTask::UpdateComputeHashResult { region_id, index, hash, context} if + region_id == region.get_id() && index == 10 && context == vec![0] && hash == checksum_bytes + ); } } diff --git a/components/raftstore/src/store/worker/raftlog_gc.rs b/components/raftstore/src/store/worker/raftlog_gc.rs index f93213dfa0d..ce829ed61b2 100644 --- a/components/raftstore/src/store/worker/raftlog_gc.rs +++ b/components/raftstore/src/store/worker/raftlog_gc.rs @@ -3,10 +3,9 @@ use std::{ error::Error as StdError, fmt::{self, Display, Formatter}, - sync::mpsc::Sender, }; -use engine_traits::{Engines, KvEngine, RaftEngine, RaftLogGcTask}; +use engine_traits::{Engines, KvEngine, RaftEngine}; use file_system::{IoType, WithIoType}; use thiserror::Error; use tikv_util::{ @@ -73,7 +72,6 @@ enum Error { pub struct Runner { tasks: Vec, engines: Engines, - gc_entries: Option>, compact_sync_interval: Duration, } @@ -82,25 +80,15 @@ impl Runner { Runner { engines, tasks: vec![], - gc_entries: None, compact_sync_interval: compact_log_interval, } } - /// Does the GC job and returns the count of logs collected. 
- fn gc_raft_log(&mut self, regions: Vec) -> Result { - fail::fail_point!("worker_gc_raft_log", |s| { - Ok(s.and_then(|s| s.parse().ok()).unwrap_or(0)) - }); - let deleted = box_try!(self.engines.raft.batch_gc(regions)); - fail::fail_point!("worker_gc_raft_log_finished", |_| { Ok(deleted) }); - Ok(deleted) - } - - fn report_collected(&self, collected: usize) { - if let Some(ref ch) = self.gc_entries { - ch.send(collected).unwrap(); - } + fn raft_log_gc(&mut self, mut batch: ER::LogBatch) -> Result<(), Error> { + fail::fail_point!("worker_gc_raft_log", |_| Ok(())); + box_try!(self.engines.raft.consume(&mut batch, false)); + fail::fail_point!("worker_gc_raft_log_finished"); + Ok(()) } fn flush(&mut self) { @@ -115,9 +103,11 @@ impl Runner { panic!("failed to sync kv_engine in raft_log_gc: {:?}", e); }); RAFT_LOG_GC_KV_SYNC_DURATION_HISTOGRAM.observe(start.saturating_elapsed_secs()); + let tasks = std::mem::take(&mut self.tasks); - let mut groups = Vec::with_capacity(tasks.len()); let mut cbs = Vec::new(); + let mut batch = self.engines.raft.log_batch(tasks.len()); + let start = Instant::now(); for t in tasks { debug!("gc raft log"; "region_id" => t.region_id, "start_index" => t.start_idx, "end_index" => t.end_idx); if let Some(cb) = t.cb { @@ -137,28 +127,22 @@ impl Runner { "end_index" => t.end_idx, ); } - groups.push(RaftLogGcTask { - raft_group_id: t.region_id, - from: t.start_idx, - to: t.end_idx, - }); - } - let start = Instant::now(); - match self.gc_raft_log(groups) { - Err(e) => { + if let Err(e) = self + .engines + .raft + .gc(t.region_id, t.start_idx, t.end_idx, &mut batch) + { error!("failed to gc"; "err" => %e); - self.report_collected(0); RAFT_LOG_GC_FAILED.inc(); } - Ok(n) => { - debug!("gc log entries"; "entry_count" => n); - self.report_collected(n); - RAFT_LOG_GC_DELETED_KEYS_HISTOGRAM.observe(n as f64); - } + } + if let Err(e) = self.raft_log_gc(batch) { + error!("failed to write gc task"; "err" => %e); + RAFT_LOG_GC_FAILED.inc(); } 
RAFT_LOG_GC_WRITE_DURATION_HISTOGRAM.observe(start.saturating_elapsed_secs()); for cb in cbs { - cb() + cb(); } } } @@ -201,7 +185,7 @@ where #[cfg(test)] mod tests { - use std::{sync::mpsc, time::Duration}; + use std::time::Duration; use engine_traits::{RaftEngine, RaftLogBatch, ALL_CFS}; use raft::eraftpb::Entry; @@ -218,9 +202,7 @@ mod tests { let kv_db = engine_test::kv::new_engine(path_raft.to_str().unwrap(), ALL_CFS).unwrap(); let engines = Engines::new(kv_db, raft_db.clone()); - let (tx, rx) = mpsc::channel(); let mut runner = Runner { - gc_entries: Some(tx), engines, tasks: vec![], compact_sync_interval: Duration::from_secs(5), @@ -237,17 +219,15 @@ mod tests { raft_db.consume(&mut raft_wb, false /* sync */).unwrap(); let tbls = vec![ - (Task::gc(region_id, 0, 10), 10, (0, 10), (10, 100)), - (Task::gc(region_id, 0, 50), 40, (0, 50), (50, 100)), - (Task::gc(region_id, 50, 50), 0, (0, 50), (50, 100)), - (Task::gc(region_id, 50, 60), 10, (0, 60), (60, 100)), + (Task::gc(region_id, 0, 10), (0, 10), (10, 100)), + (Task::gc(region_id, 0, 50), (0, 50), (50, 100)), + (Task::gc(region_id, 50, 50), (0, 50), (50, 100)), + (Task::gc(region_id, 50, 60), (0, 60), (60, 100)), ]; - for (task, expected_collectd, not_exist_range, exist_range) in tbls { + for (task, not_exist_range, exist_range) in tbls { runner.run(task); runner.flush(); - let res = rx.recv_timeout(Duration::from_secs(3)).unwrap(); - assert_eq!(res, expected_collectd); raft_log_must_not_exist(&raft_db, 1, not_exist_range.0, not_exist_range.1); raft_log_must_exist(&raft_db, 1, exist_range.0, exist_range.1); } diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 08e56aa7481..a8fc2e6e3df 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -294,8 +294,6 @@ pub trait ReadExecutorProvider: Send + Clone + 'static { /// get the ReadDelegate with region_id and the number of delegates in the /// 
StoreMeta fn get_executor_and_len(&self, region_id: u64) -> (usize, Option); - - fn store_meta(&self) -> &Self::StoreMeta; } #[derive(Clone)] @@ -346,10 +344,6 @@ where } (meta.readers.len(), None) } - - fn store_meta(&self) -> &Self::StoreMeta { - &self.store_meta - } } /// #[RaftstoreCommon] @@ -716,8 +710,8 @@ where } } - pub fn store_meta(&self) -> &S::StoreMeta { - self.store_meta.store_meta() + pub fn store_meta(&self) -> &S { + &self.store_meta } // Ideally `get_delegate` should return `Option<&ReadDelegate>`, but if so the @@ -760,7 +754,7 @@ where } let store_id = self.store_id.get().unwrap(); - if let Err(e) = util::check_store_id(req, store_id) { + if let Err(e) = util::check_store_id(req.get_header(), store_id) { TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.store_id_mismatch.inc()); debug!("rejected by store id not match"; "err" => %e); return Err(e); @@ -780,13 +774,13 @@ where fail_point!("localreader_on_find_delegate"); // Check peer id. - if let Err(e) = util::check_peer_id(req, delegate.peer_id) { + if let Err(e) = util::check_peer_id(req.get_header(), delegate.peer_id) { TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.peer_id_mismatch.inc()); return Err(e); } // Check term. - if let Err(e) = util::check_term(req, delegate.term) { + if let Err(e) = util::check_term(req.get_header(), delegate.term) { debug!( "check term"; "delegate_term" => delegate.term, @@ -797,7 +791,7 @@ where } // Check region epoch. - if util::check_region_epoch(req, &delegate.region, false).is_err() { + if util::check_req_region_epoch(req, &delegate.region, false).is_err() { TLS_LOCAL_READ_METRICS.with(|m| m.borrow_mut().reject_reason.epoch.inc()); // Stale epoch, redirect it to raftstore to get the latest region. debug!("rejected by epoch not match"; "tag" => &delegate.tag); @@ -813,7 +807,7 @@ where // Check whether the region is in the flashback state and the local read could // be performed. 
let is_in_flashback = delegate.region.is_in_flashback; - if let Err(e) = util::check_flashback_state(is_in_flashback, req, region_id) { + if let Err(e) = util::check_flashback_state(is_in_flashback, req, region_id, false) { TLS_LOCAL_READ_METRICS.with(|m| match e { Error::FlashbackNotPrepared(_) => { m.borrow_mut().reject_reason.flashback_not_prepared.inc() diff --git a/components/raftstore/src/store/worker/split_check.rs b/components/raftstore/src/store/worker/split_check.rs index d1c531070ac..b6bc5fca65f 100644 --- a/components/raftstore/src/store/worker/split_check.rs +++ b/components/raftstore/src/store/worker/split_check.rs @@ -7,15 +7,16 @@ use std::{ mem, }; -use engine_traits::{CfName, IterOptions, Iterable, Iterator, KvEngine, CF_WRITE, LARGE_CFS}; +use engine_traits::{ + CfName, IterOptions, Iterable, Iterator, KvEngine, TabletRegistry, CF_WRITE, LARGE_CFS, +}; use file_system::{IoType, WithIoType}; use itertools::Itertools; -use kvproto::{ - metapb::{Region, RegionEpoch}, - pdpb::CheckPolicy, -}; +use kvproto::{metapb::Region, pdpb::CheckPolicy}; use online_config::{ConfigChange, OnlineConfig}; -use tikv_util::{box_err, debug, error, info, keybuilder::KeyBuilder, warn, worker::Runnable}; +use tikv_util::{ + box_err, debug, error, info, keybuilder::KeyBuilder, warn, worker::Runnable, Either, +}; use txn_types::Key; use super::metrics::*; @@ -23,10 +24,10 @@ use super::metrics::*; use crate::coprocessor::Config; use crate::{ coprocessor::{ + dispatcher::StoreHandle, split_observer::{is_valid_split_key, strip_timestamp_if_exists}, CoprocessorHost, SplitCheckerHost, }, - store::{Callback, CasualMessage, CasualRouter}, Result, }; @@ -131,10 +132,10 @@ where } } -#[derive(Default, Clone, Debug)] +#[derive(Default, Clone, Debug, PartialEq)] pub struct BucketRange(pub Vec, pub Vec); -#[derive(Default, Clone, Debug)] +#[derive(Default, Clone, Debug, PartialEq)] pub struct Bucket { // new proposed split keys under the bucket for split // if it does not need split, 
it's empty @@ -219,23 +220,30 @@ impl Display for Task { } } -pub struct Runner -where - E: KvEngine, -{ - engine: E, +pub struct Runner { + // We can't just use `TabletRegistry` here, otherwise v1 may create many + // invalid records and cause other problems. + engine: Either>, router: S, - coprocessor: CoprocessorHost, + coprocessor: CoprocessorHost, } -impl Runner -where - E: KvEngine, - S: CasualRouter, -{ - pub fn new(engine: E, router: S, coprocessor: CoprocessorHost) -> Runner { +impl Runner { + pub fn new(engine: EK, router: S, coprocessor: CoprocessorHost) -> Runner { Runner { - engine, + engine: Either::Left(engine), + router, + coprocessor, + } + } + + pub fn with_registry( + registry: TabletRegistry, + router: S, + coprocessor: CoprocessorHost, + ) -> Runner { + Runner { + engine: Either::Right(registry), router, coprocessor, } @@ -243,8 +251,9 @@ where fn approximate_check_bucket( &self, + tablet: &EK, region: &Region, - host: &mut SplitCheckerHost<'_, E>, + host: &mut SplitCheckerHost<'_, EK>, bucket_ranges: Option>, ) -> Result<()> { let ranges = bucket_ranges.clone().unwrap_or_else(|| { @@ -258,7 +267,7 @@ where let mut bucket = region.clone(); bucket.set_start_key(range.0.clone()); bucket.set_end_key(range.1.clone()); - let bucket_entry = host.approximate_bucket_keys(&bucket, &self.engine)?; + let bucket_entry = host.approximate_bucket_keys(&bucket, tablet)?; debug!( "bucket_entry size {} keys count {}", bucket_entry.size, @@ -328,14 +337,11 @@ where region: &Region, bucket_ranges: Option>, ) { - let _ = self.router.send( + self.router.refresh_region_buckets( region.get_id(), - CasualMessage::RefreshRegionBuckets { - region_epoch: region.get_region_epoch().clone(), - buckets, - bucket_ranges, - cb: Callback::None, - }, + region.get_region_epoch().clone(), + buckets, + bucket_ranges, ); } @@ -350,6 +356,20 @@ where policy: CheckPolicy, bucket_ranges: Option>, ) { + let mut cached; + let tablet = match &self.engine { + Either::Left(e) => e, + 
Either::Right(r) => match r.get(region.get_id()) { + Some(c) => { + cached = Some(c); + match cached.as_mut().unwrap().latest() { + Some(t) => t, + None => return, + } + } + None => return, + }, + }; let region_id = region.get_id(); let is_key_range = start_key.is_some() && end_key.is_some(); let start_key = if is_key_range { @@ -372,9 +392,9 @@ where "policy" => ?policy, ); CHECK_SPILT_COUNTER.all.inc(); - let mut host = - self.coprocessor - .new_split_checker_host(region, &self.engine, auto_split, policy); + let mut host = self + .coprocessor + .new_split_checker_host(region, tablet, auto_split, policy); if host.skip() { debug!("skip split check"; @@ -390,6 +410,7 @@ where CheckPolicy::Scan => { match self.scan_split_keys( &mut host, + tablet, region, is_key_range, &start_key, @@ -408,11 +429,11 @@ where } } } - CheckPolicy::Approximate => match host.approximate_split_keys(region, &self.engine) { + CheckPolicy::Approximate => match host.approximate_split_keys(region, tablet) { Ok(keys) => { if host.enable_region_bucket() { if let Err(e) = - self.approximate_check_bucket(region, &mut host, bucket_ranges) + self.approximate_check_bucket(tablet, region, &mut host, bucket_ranges) { error!(%e; "approximate_check_bucket failed"; @@ -437,6 +458,7 @@ where ); match self.scan_split_keys( &mut host, + tablet, region, is_key_range, &start_key, @@ -461,12 +483,8 @@ where if !split_keys.is_empty() { let region_epoch = region.get_region_epoch().clone(); - let msg = new_split_region(region_epoch, split_keys, "split checker"); - let res = self.router.send(region_id, msg); - if let Err(e) = res { - warn!("failed to send check result"; "region_id" => region_id, "err" => %e); - } - + self.router + .ask_split(region_id, region_epoch, split_keys, "split checker".into()); CHECK_SPILT_COUNTER.success.inc(); } else { debug!( @@ -484,7 +502,8 @@ where /// If it's Some(vec![]), skip generating buckets. 
fn scan_split_keys( &self, - host: &mut SplitCheckerHost<'_, E>, + host: &mut SplitCheckerHost<'_, EK>, + tablet: &EK, region: &Region, is_key_range: bool, start_key: &[u8], @@ -505,12 +524,8 @@ where (!host.enable_region_bucket(), &empty_bucket) }; - MergedIterator::<::Iterator>::new( - &self.engine, - LARGE_CFS, - start_key, - end_key, - false, + MergedIterator::<::Iterator>::new( + tablet, LARGE_CFS, start_key, end_key, false, ) .map(|mut iter| { let mut size = 0; @@ -595,14 +610,8 @@ where "bucket_count" => buckets.len(), "bucket_size" => bucket_size, ); - let _ = self.router.send( - region.get_id(), - CasualMessage::RegionApproximateSize { size }, - ); - let _ = self.router.send( - region.get_id(), - CasualMessage::RegionApproximateKeys { keys }, - ); + self.router.update_approximate_size(region.get_id(), size); + self.router.update_approximate_keys(region.get_id(), keys); })?; if host.enable_region_bucket() { @@ -632,10 +641,10 @@ where } } -impl Runnable for Runner +impl Runnable for Runner where - E: KvEngine, - S: CasualRouter, + EK: KvEngine, + S: StoreHandle, { type Task = Task; fn run(&mut self, task: Task) { @@ -659,13 +668,28 @@ where Task::ChangeConfig(c) => self.change_cfg(c), Task::ApproximateBuckets(region) => { if self.coprocessor.cfg.enable_region_bucket { + let mut cached; + let tablet = match &self.engine { + Either::Left(e) => e, + Either::Right(r) => match r.get(region.get_id()) { + Some(c) => { + cached = Some(c); + match cached.as_mut().unwrap().latest() { + Some(t) => t, + None => return, + } + } + None => return, + }, + }; let mut host = self.coprocessor.new_split_checker_host( ®ion, - &self.engine, + tablet, false, CheckPolicy::Approximate, ); - if let Err(e) = self.approximate_check_bucket(®ion, &mut host, None) { + if let Err(e) = self.approximate_check_bucket(tablet, ®ion, &mut host, None) + { error!(%e; "approximate_check_bucket failed"; "region_id" => region.get_id(), @@ -678,19 +702,3 @@ where } } } - -fn new_split_region( - 
region_epoch: RegionEpoch, - split_keys: Vec>, - source: &'static str, -) -> CasualMessage -where - E: KvEngine, -{ - CasualMessage::SplitRegion { - region_epoch, - split_keys, - callback: Callback::None, - source: source.into(), - } -} diff --git a/components/server/Cargo.toml b/components/server/Cargo.toml index b27846ad5a3..acdca09b29c 100644 --- a/components/server/Cargo.toml +++ b/components/server/Cargo.toml @@ -66,6 +66,7 @@ protobuf = { version = "2.8", features = ["bytes"] } raft = { version = "0.7.0", default-features = false, features = ["protobuf-codec"] } raft_log_engine = { workspace = true } raftstore = { workspace = true, features = ["engine_rocks"] } +raftstore-v2 = { workspace = true } rand = "0.8" resolved_ts = { workspace = true } resource_metering = { workspace = true } diff --git a/components/server/src/lib.rs b/components/server/src/lib.rs index 57793792289..d5c8e352a88 100644 --- a/components/server/src/lib.rs +++ b/components/server/src/lib.rs @@ -2,6 +2,7 @@ #![allow(incomplete_features)] #![feature(specialization)] +#![feature(let_chains)] #[macro_use] extern crate tikv_util; @@ -11,4 +12,5 @@ pub mod setup; pub mod memory; pub mod raft_engine_switch; pub mod server; +pub mod server2; pub mod signal_handler; diff --git a/components/server/src/raft_engine_switch.rs b/components/server/src/raft_engine_switch.rs index 29144c8ca18..d0637a04b0a 100644 --- a/components/server/src/raft_engine_switch.rs +++ b/components/server/src/raft_engine_switch.rs @@ -193,11 +193,11 @@ fn run_dump_raft_engine_worker( new_engine: &RocksEngine, count_size: &Arc, ) { + let mut batch = new_engine.log_batch(0); while let Ok(id) = rx.recv() { let state = old_engine.get_raft_state(id).unwrap().unwrap(); - new_engine.put_raft_state(id, &state).unwrap(); + batch.put_raft_state(id, &state).unwrap(); if let Some(last_index) = old_engine.last_index(id) { - let mut batch = new_engine.log_batch(0); let mut begin = old_engine.first_index(id).unwrap(); while begin <= 
last_index { let end = std::cmp::min(begin + 1024, last_index + 1); @@ -210,6 +210,9 @@ fn run_dump_raft_engine_worker( count_size.fetch_add(size, Ordering::Relaxed); } } + if !batch.is_empty() { + new_engine.consume(&mut batch, false).unwrap(); + } } } @@ -234,6 +237,7 @@ mod tests { cfg.raft_store.raftdb_path = raftdb_path.to_str().unwrap().to_owned(); cfg.raftdb.wal_dir = raftdb_wal_path.to_str().unwrap().to_owned(); cfg.raft_engine.mut_config().dir = raft_engine_path.to_str().unwrap().to_owned(); + let cache = cfg.storage.block_cache.build_shared_cache(); // Dump logs from RocksEngine to RaftLogEngine. let raft_engine = RaftLogEngine::new( @@ -247,8 +251,8 @@ mod tests { // Prepare some data for the RocksEngine. let raftdb = engine_rocks::util::new_engine_opt( &cfg.raft_store.raftdb_path, - cfg.raftdb.build_opt(), - cfg.raftdb.build_cf_opts(&None), + cfg.raftdb.build_opt(Default::default(), None), + cfg.raftdb.build_cf_opts(&cache), ) .unwrap(); let mut batch = raftdb.log_batch(0); diff --git a/components/server/src/server.rs b/components/server/src/server.rs index b52abc960d8..3c926969ce2 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -38,14 +38,15 @@ use cdc::{CdcConfigManager, MemoryQuota}; use concurrency_manager::ConcurrencyManager; use encryption_export::{data_key_manager_from_config, DataKeyManager}; use engine_rocks::{ - from_rocks_compression_type, + flush_engine_statistics, from_rocks_compression_type, raw::{Cache, Env}, - FlowInfo, RocksEngine, + FlowInfo, RocksEngine, RocksStatistics, }; use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; use engine_traits::{ - CfOptions, CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, RaftEngine, - TabletFactory, CF_DEFAULT, CF_LOCK, CF_WRITE, + CachedTablet, CfOptions, CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, + RaftEngine, SingletonFactory, StatisticsReporter, TabletContext, TabletRegistry, CF_DEFAULT, + 
CF_LOCK, CF_WRITE, }; use error_code::ErrorCodeExt; use file_system::{ @@ -91,7 +92,6 @@ use tikv::{ read_pool::{build_yatp_read_pool, ReadPool, ReadPoolConfigManager}, server::{ config::{Config as ServerConfig, ServerConfigManager}, - create_raft_storage, gc_worker::{AutoGcConfig, GcWorker}, lock_manager::LockManager, raftkv::ReplicaReadLockChecker, @@ -107,7 +107,7 @@ use tikv::{ config_manager::StorageConfigManger, mvcc::MvccConsistencyCheckObserver, txn::flow_controller::{EngineFlowController, FlowController}, - Engine, + Engine, Storage, }, }; use tikv_util::{ @@ -123,6 +123,7 @@ use tikv_util::{ thread_group::GroupProperties, time::{Instant, Monitor}, worker::{Builder as WorkerBuilder, LazyWorker, Scheduler, Worker}, + Either, }; use tokio::runtime::Builder; @@ -169,7 +170,11 @@ fn run_impl(config: TikvConfig) { tikv.run_status_server(); tikv.init_quota_tuning_task(tikv.quota_limiter.clone()); - signal_handler::wait_for_signal(Some(tikv.engines.take().unwrap().engines)); + signal_handler::wait_for_signal( + Some(tikv.engines.take().unwrap().engines), + tikv.kv_statistics.clone(), + tikv.raft_statistics.clone(), + ); tikv.stop(); } @@ -226,6 +231,8 @@ struct TikvServer { snap_mgr: Option, // Will be filled in `init_servers`. 
encryption_key_manager: Option>, engines: Option>, + kv_statistics: Option>, + raft_statistics: Option>, servers: Option>, region_info_accessor: RegionInfoAccessor, coprocessor_host: Option>, @@ -238,7 +245,7 @@ struct TikvServer { sst_worker: Option>>, quota_limiter: Arc, causal_ts_provider: Option>, // used for rawkv apiv2 - tablet_factory: Option + Send + Sync>>, + tablet_registry: Option>, br_snap_recovery_mode: bool, // use for br snapshot recovery } @@ -376,6 +383,8 @@ where snap_mgr: None, encryption_key_manager: None, engines: None, + kv_statistics: None, + raft_statistics: None, servers: None, region_info_accessor, coprocessor_host, @@ -390,7 +399,7 @@ where sst_worker: None, quota_limiter, causal_ts_provider, - tablet_factory: None, + tablet_registry: None, br_snap_recovery_mode: is_recovering_marked, } } @@ -788,7 +797,7 @@ where storage_read_pools.handle() }; - let storage = create_raft_storage::<_, _, _, F, _>( + let storage = Storage::<_, _, F>::from_engine( engines.engine.clone(), &self.config.storage, storage_read_pool_handle, @@ -806,8 +815,7 @@ where cfg_controller.register( tikv::config::Module::Storage, Box::new(StorageConfigManger::new( - self.tablet_factory.as_ref().unwrap().clone(), - self.config.storage.block_cache.shared, + self.tablet_registry.as_ref().unwrap().clone(), ttl_scheduler, flow_controller, storage.get_scheduler(), @@ -817,7 +825,7 @@ where let (resolver, state) = resolve::new_resolver( self.pd_client.clone(), &self.background_worker, - storage.get_engine().raft_extension().clone(), + storage.get_engine().raft_extension(), ); self.resolver = Some(resolver); @@ -952,7 +960,7 @@ where ), coprocessor_v2::Endpoint::new(&self.config.coprocessor_v2), self.resolver.clone().unwrap(), - snap_mgr.clone(), + Either::Left(snap_mgr.clone()), gc_worker.clone(), check_leader_scheduler, self.env.clone(), @@ -1205,8 +1213,10 @@ where // Debug service. 
let debug_service = DebugService::new( engines.engines.clone(), + self.kv_statistics.clone(), + self.raft_statistics.clone(), servers.server.get_debug_thread_pool().clone(), - engines.engine.raft_extension().clone(), + engines.engine.raft_extension(), self.cfg_controller.as_ref().unwrap().clone(), ); if servers @@ -1357,7 +1367,11 @@ where engines_info: Arc, ) { let mut engine_metrics = EngineMetricsManager::::new( - self.engines.as_ref().unwrap().engines.clone(), + self.tablet_registry.clone().unwrap(), + self.kv_statistics.clone(), + self.config.rocksdb.titan.enabled, + self.engines.as_ref().unwrap().engines.raft.clone(), + self.raft_statistics.clone(), ); let mut io_metrics = IoMetricsManager::new(fetcher); let engines_info_clone = engines_info.clone(); @@ -1367,7 +1381,7 @@ where // for recording the latest tablet for each region. // `cached_latest_tablets` is passed to `update` to avoid memory // allocation each time when calling `update`. - let mut cached_latest_tablets: HashMap = HashMap::new(); + let mut cached_latest_tablets = HashMap::default(); self.background_worker .spawn_interval_task(DEFAULT_METRICS_FLUSH_INTERVAL, move || { let now = Instant::now(); @@ -1636,7 +1650,7 @@ where self.config.server.status_thread_pool_size, self.cfg_controller.take().unwrap(), Arc::new(self.config.security.clone()), - self.router.clone(), + self.engines.as_ref().unwrap().engine.raft_extension(), self.store_path.clone(), ) { Ok(status_server) => Box::new(status_server), @@ -1680,10 +1694,10 @@ pub trait ConfiguredRaftEngine: RaftEngine { _: &TikvConfig, _: &Arc, _: &Option>, - _: &Option, - ) -> Self; + _: &Cache, + ) -> (Self, Option>); fn as_rocks_engine(&self) -> Option<&RocksEngine>; - fn register_config(&self, _cfg_controller: &mut ConfigController, _share_cache: bool); + fn register_config(&self, _cfg_controller: &mut ConfigController); } impl ConfiguredRaftEngine for T { @@ -1691,14 +1705,14 @@ impl ConfiguredRaftEngine for T { _: &TikvConfig, _: &Arc, _: 
&Option>, - _: &Option, - ) -> Self { + _: &Cache, + ) -> (Self, Option>) { unimplemented!() } default fn as_rocks_engine(&self) -> Option<&RocksEngine> { None } - default fn register_config(&self, _cfg_controller: &mut ConfigController, _share_cache: bool) {} + default fn register_config(&self, _cfg_controller: &mut ConfigController) {} } impl ConfiguredRaftEngine for RocksEngine { @@ -1706,8 +1720,8 @@ impl ConfiguredRaftEngine for RocksEngine { config: &TikvConfig, env: &Arc, key_manager: &Option>, - block_cache: &Option, - ) -> Self { + block_cache: &Cache, + ) -> (Self, Option>) { let mut raft_data_state_machine = RaftDataStateMachine::new( &config.storage.data_dir, &config.raft_engine.config().dir, @@ -1717,13 +1731,11 @@ impl ConfiguredRaftEngine for RocksEngine { let raft_db_path = &config.raft_store.raftdb_path; let config_raftdb = &config.raftdb; - let mut raft_db_opts = config_raftdb.build_opt(); - raft_db_opts.set_env(env.clone()); + let statistics = Arc::new(RocksStatistics::new_titan()); + let raft_db_opts = config_raftdb.build_opt(env.clone(), Some(&statistics)); let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); - let mut raftdb = - engine_rocks::util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) - .expect("failed to open raftdb"); - raftdb.set_shared_block_cache(block_cache.is_some()); + let raftdb = engine_rocks::util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) + .expect("failed to open raftdb"); if should_dump { let raft_engine = @@ -1734,21 +1746,17 @@ impl ConfiguredRaftEngine for RocksEngine { drop(raft_engine); raft_data_state_machine.after_dump_data(); } - raftdb + (raftdb, Some(statistics)) } fn as_rocks_engine(&self) -> Option<&RocksEngine> { Some(self) } - fn register_config(&self, cfg_controller: &mut ConfigController, share_cache: bool) { + fn register_config(&self, cfg_controller: &mut ConfigController) { cfg_controller.register( tikv::config::Module::Raftdb, - Box::new(DbConfigManger::new( - 
Arc::new(self.clone()), - DbType::Raft, - share_cache, - )), + Box::new(DbConfigManger::new(self.clone(), DbType::Raft)), ); } } @@ -1758,8 +1766,8 @@ impl ConfiguredRaftEngine for RaftLogEngine { config: &TikvConfig, env: &Arc, key_manager: &Option>, - block_cache: &Option, - ) -> Self { + block_cache: &Cache, + ) -> (Self, Option>) { let mut raft_data_state_machine = RaftDataStateMachine::new( &config.storage.data_dir, &config.raft_store.raftdb_path, @@ -1774,8 +1782,7 @@ impl ConfiguredRaftEngine for RaftLogEngine { if should_dump { let config_raftdb = &config.raftdb; - let mut raft_db_opts = config_raftdb.build_opt(); - raft_db_opts.set_env(env.clone()); + let raft_db_opts = config_raftdb.build_opt(env.clone(), None); let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); let raftdb = engine_rocks::util::new_engine_opt( &config.raft_store.raftdb_path, @@ -1788,7 +1795,7 @@ impl ConfiguredRaftEngine for RaftLogEngine { drop(raftdb); raft_data_state_machine.after_dump_data(); } - raft_engine + (raft_engine, None) } } @@ -1804,46 +1811,44 @@ impl TikvServer { .unwrap(); // Create raft engine - let raft_engine = CER::build( + let (raft_engine, raft_statistics) = CER::build( &self.config, &env, &self.encryption_key_manager, &block_cache, ); + self.raft_statistics = raft_statistics; // Create kv engine. 
- let mut builder = KvEngineFactoryBuilder::new(env, &self.config, &self.store_path) + let builder = KvEngineFactoryBuilder::new(env, &self.config, block_cache) .compaction_event_sender(Arc::new(RaftRouterCompactedEventSender { router: Mutex::new(self.router.clone()), })) .region_info_accessor(self.region_info_accessor.clone()) .sst_recovery_sender(self.init_sst_recovery_sender()) .flow_listener(flow_listener); - if let Some(cache) = block_cache { - builder = builder.block_cache(cache); - } - let factory = Arc::new(builder.build()); + let factory = Box::new(builder.build()); let kv_engine = factory - .create_shared_db() + .create_shared_db(&self.store_path) .unwrap_or_else(|s| fatal!("failed to create kv engine: {}", s)); - let engines = Engines::new(kv_engine, raft_engine); + self.kv_statistics = Some(factory.rocks_statistics()); + let engines = Engines::new(kv_engine.clone(), raft_engine); let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( tikv::config::Module::Rocksdb, - Box::new(DbConfigManger::new( - factory.clone(), - DbType::Kv, - self.config.storage.block_cache.shared, - )), + Box::new(DbConfigManger::new(kv_engine.clone(), DbType::Kv)), ); - self.tablet_factory = Some(factory.clone()); - engines - .raft - .register_config(cfg_controller, self.config.storage.block_cache.shared); + let reg = TabletRegistry::new(Box::new(SingletonFactory::new(kv_engine)), &self.store_path) + .unwrap(); + // It always use the singleton kv_engine, use arbitrary id and suffix. + let ctx = TabletContext::with_infinite_region(0, Some(0)); + reg.load(ctx, false).unwrap(); + self.tablet_registry = Some(reg.clone()); + engines.raft.register_config(cfg_controller); let engines_info = Arc::new(EnginesResourceInfo::new( - factory, + reg, engines.raft.as_rocks_engine().cloned(), 180, // max_samples_to_preserve )); @@ -1944,13 +1949,12 @@ fn get_lock_dir() -> String { /// A small trait for components which can be trivially stopped. 
Lets us keep /// a list of these in `TiKV`, rather than storing each component individually. -trait Stop { +pub(crate) trait Stop { fn stop(self: Box); } -impl Stop for StatusServer +impl Stop for StatusServer where - E: 'static, R: 'static + Send, { fn stop(self: Box) { @@ -1970,32 +1974,65 @@ impl Stop for LazyWorker { } } -pub struct EngineMetricsManager { - engines: Engines, +pub struct EngineMetricsManager { + tablet_registry: TabletRegistry, + kv_statistics: Option>, + kv_is_titan: bool, + raft_engine: ER, + raft_statistics: Option>, last_reset: Instant, } -impl EngineMetricsManager { - pub fn new(engines: Engines) -> Self { +impl EngineMetricsManager { + pub fn new( + tablet_registry: TabletRegistry, + kv_statistics: Option>, + kv_is_titan: bool, + raft_engine: ER, + raft_statistics: Option>, + ) -> Self { EngineMetricsManager { - engines, + tablet_registry, + kv_statistics, + kv_is_titan, + raft_engine, + raft_statistics, last_reset: Instant::now(), } } pub fn flush(&mut self, now: Instant) { - KvEngine::flush_metrics(&self.engines.kv, "kv"); - self.engines.raft.flush_metrics("raft"); + let mut reporter = EK::StatisticsReporter::new("kv"); + self.tablet_registry + .for_each_opened_tablet(|_, db: &mut CachedTablet| { + if let Some(db) = db.latest() { + reporter.collect(db); + } + true + }); + reporter.flush(); + self.raft_engine.flush_metrics("raft"); + + if let Some(s) = self.kv_statistics.as_ref() { + flush_engine_statistics(s, "kv", self.kv_is_titan); + } + if let Some(s) = self.raft_statistics.as_ref() { + flush_engine_statistics(s, "raft", false); + } if now.saturating_duration_since(self.last_reset) >= DEFAULT_ENGINE_METRICS_RESET_INTERVAL { - KvEngine::reset_statistics(&self.engines.kv); - self.engines.raft.reset_statistics(); + if let Some(s) = self.kv_statistics.as_ref() { + s.reset(); + } + if let Some(s) = self.raft_statistics.as_ref() { + s.reset(); + } self.last_reset = now; } } } pub struct EnginesResourceInfo { - tablet_factory: Arc + Sync + 
Send>, + tablet_registry: TabletRegistry, raft_engine: Option, latest_normalized_pending_bytes: AtomicU32, normalized_pending_bytes_collector: MovingAvgU32, @@ -2005,12 +2042,12 @@ impl EnginesResourceInfo { const SCALE_FACTOR: u64 = 100; fn new( - tablet_factory: Arc + Sync + Send>, + tablet_registry: TabletRegistry, raft_engine: Option, max_samples_to_preserve: usize, ) -> Self { EnginesResourceInfo { - tablet_factory, + tablet_registry, raft_engine, latest_normalized_pending_bytes: AtomicU32::new(0), normalized_pending_bytes_collector: MovingAvgU32::new(max_samples_to_preserve), @@ -2020,7 +2057,7 @@ impl EnginesResourceInfo { pub fn update( &self, _now: Instant, - cached_latest_tablets: &mut HashMap, + cached_latest_tablets: &mut HashMap>, ) { let mut normalized_pending_bytes = 0; @@ -2043,19 +2080,11 @@ impl EnginesResourceInfo { fetch_engine_cf(raft_engine, CF_DEFAULT, &mut normalized_pending_bytes); } - self.tablet_factory - .for_each_opened_tablet( - &mut |id, suffix, db: &RocksEngine| match cached_latest_tablets.entry(id) { - collections::HashMapEntry::Occupied(mut slot) => { - if slot.get().0 < suffix { - slot.insert((suffix, db.clone())); - } - } - collections::HashMapEntry::Vacant(slot) => { - slot.insert((suffix, db.clone())); - } - }, - ); + self.tablet_registry + .for_each_opened_tablet(|id, db: &mut CachedTablet| { + cached_latest_tablets.insert(id, db.clone()); + true + }); // todo(SpadeA): Now, there's a potential race condition problem where the // tablet could be destroyed after the clone and before the fetching @@ -2066,7 +2095,8 @@ impl EnginesResourceInfo { // propose another PR to tackle it such as destory tablet lazily in a GC // thread. 
- for (_, (_, tablet)) in cached_latest_tablets.iter() { + for (_, cache) in cached_latest_tablets.iter_mut() { + let Some(tablet) = cache.latest() else { continue }; for cf in &[CF_DEFAULT, CF_WRITE, CF_LOCK] { fetch_engine_cf(tablet, cf, &mut normalized_pending_bytes); } @@ -2110,9 +2140,9 @@ mod test { sync::{atomic::Ordering, Arc}, }; - use engine_rocks::{raw::Env, RocksEngine}; + use engine_rocks::raw::Env; use engine_traits::{ - FlowControlFactorsExt, MiscExt, OpenOptions, SyncMutable, TabletFactory, CF_DEFAULT, + FlowControlFactorsExt, MiscExt, SyncMutable, TabletContext, TabletRegistry, CF_DEFAULT, }; use tempfile::Builder; use tikv::{config::TikvConfig, server::KvEngineFactoryBuilder}; @@ -2129,19 +2159,18 @@ mod test { config.rocksdb.lockcf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); let env = Arc::new(Env::default()); let path = Builder::new().prefix("test-update").tempdir().unwrap(); + let cache = config.storage.block_cache.build_shared_cache(); - let builder = KvEngineFactoryBuilder::new(env, &config, path.path()); - let factory = builder.build_v2(); + let factory = KvEngineFactoryBuilder::new(env, &config, cache).build(); + let reg = TabletRegistry::new(Box::new(factory), path.path().join("tablets")).unwrap(); for i in 1..6 { - let _ = factory - .open_tablet(i, Some(10), OpenOptions::default().set_create_new(true)) - .unwrap(); + let ctx = TabletContext::with_infinite_region(i, Some(10)); + reg.load(ctx, true).unwrap(); } - let tablet = factory - .open_tablet(1, Some(10), OpenOptions::default().set_cache_only(true)) - .unwrap(); + let mut cached = reg.get(1).unwrap(); + let mut tablet = cached.latest().unwrap(); // Prepare some data for two tablets of the same region. So we can test whether // we fetch the bytes from the latest one. 
for i in 1..21 { @@ -2155,9 +2184,9 @@ mod test { .unwrap() .unwrap(); - let tablet = factory - .open_tablet(1, Some(20), OpenOptions::default().set_create_new(true)) - .unwrap(); + let ctx = TabletContext::with_infinite_region(1, Some(20)); + reg.load(ctx, true).unwrap(); + tablet = cached.latest().unwrap(); for i in 1..11 { tablet.put_cf(CF_DEFAULT, b"key", b"val").unwrap(); @@ -2172,9 +2201,9 @@ mod test { assert!(old_pending_compaction_bytes > new_pending_compaction_bytes); - let engines_info = Arc::new(EnginesResourceInfo::new(Arc::new(factory), None, 10)); + let engines_info = Arc::new(EnginesResourceInfo::new(reg, None, 10)); - let mut cached_latest_tablets: HashMap = HashMap::new(); + let mut cached_latest_tablets = HashMap::default(); engines_info.update(Instant::now(), &mut cached_latest_tablets); // The memory allocation should be reserved diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs new file mode 100644 index 00000000000..5beddf60151 --- /dev/null +++ b/components/server/src/server2.rs @@ -0,0 +1,1776 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +//! This module startups all the components of a TiKV server. +//! +//! It is responsible for reading from configs, starting up the various server +//! components, and handling errors (mostly by aborting and reporting to the +//! user). +//! +//! The entry point is `run_tikv`. +//! +//! Components are often used to initialize other components, and/or must be +//! explicitly stopped. We keep these components in the `TikvServer` struct. 
+ +use std::{ + cmp, + collections::HashMap, + env, + net::SocketAddr, + path::{Path, PathBuf}, + str::FromStr, + sync::{ + atomic::{AtomicU32, AtomicU64, Ordering}, + mpsc, Arc, + }, + time::Duration, + u64, +}; + +use api_version::{dispatch_api_version, KvFormat}; +use causal_ts::CausalTsProviderImpl; +use concurrency_manager::ConcurrencyManager; +use encryption_export::{data_key_manager_from_config, DataKeyManager}; +use engine_rocks::{ + flush_engine_statistics, + raw::{Cache, Env}, + FlowInfo, RocksEngine, RocksStatistics, +}; +use engine_traits::{ + CachedTablet, CfOptions, CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, + RaftEngine, StatisticsReporter, TabletRegistry, CF_DEFAULT, CF_LOCK, CF_WRITE, +}; +use error_code::ErrorCodeExt; +use file_system::{ + get_io_rate_limiter, set_io_rate_limiter, BytesFetcher, File, IoBudgetAdjustor, + MetricsManager as IoMetricsManager, +}; +use futures::executor::block_on; +use grpcio::{EnvBuilder, Environment}; +use grpcio_health::HealthService; +use kvproto::{deadlock::create_deadlock, diagnosticspb::create_diagnostics, kvrpcpb::ApiVersion}; +use pd_client::{PdClient, RpcClient}; +use raft_log_engine::RaftLogEngine; +use raftstore::{ + coprocessor::{ + BoxConsistencyCheckObserver, ConsistencyCheckMethod, CoprocessorHost, + RawConsistencyCheckObserver, + }, + store::{ + memory::MEMTRACE_ROOT as MEMTRACE_RAFTSTORE, CheckLeaderRunner, SplitConfigManager, + TabletSnapManager, + }, + RegionInfoAccessor, +}; +use security::SecurityManager; +use tikv::{ + config::{ConfigController, DbConfigManger, DbType, LogConfigManager, TikvConfig}, + coprocessor::{self, MEMTRACE_ROOT as MEMTRACE_COPROCESSOR}, + coprocessor_v2, + read_pool::{build_yatp_read_pool, ReadPool}, + server::{ + config::{Config as ServerConfig, ServerConfigManager}, + gc_worker::{AutoGcConfig, GcWorker}, + lock_manager::LockManager, + raftkv::ReplicaReadLockChecker, + resolve, + service::DiagnosticsService, + status_server::StatusServer, + 
KvEngineFactoryBuilder, NodeV2, RaftKv2, Server, CPU_CORES_QUOTA_GAUGE, DEFAULT_CLUSTER_ID, + GRPC_THREAD_PREFIX, + }, + storage::{ + self, + config_manager::StorageConfigManger, + mvcc::MvccConsistencyCheckObserver, + txn::flow_controller::{FlowController, TabletFlowController}, + Engine, Storage, + }, +}; +use tikv_util::{ + check_environment_variables, + config::{ensure_dir_exist, RaftDataStateMachine, VersionTrack}, + math::MovingAvgU32, + metrics::INSTANCE_BACKEND_CPU_QUOTA, + quota_limiter::{QuotaLimitConfigManager, QuotaLimiter}, + sys::{ + cpu_time::ProcessStat, disk, path_in_diff_mount_point, register_memory_usage_high_water, + SysQuota, + }, + thread_group::GroupProperties, + time::{Instant, Monitor}, + worker::{Builder as WorkerBuilder, LazyWorker, Scheduler, Worker}, + Either, +}; +use tokio::runtime::Builder; + +use crate::{ + memory::*, raft_engine_switch::*, server::Stop, setup::*, signal_handler, + tikv_util::sys::thread::ThreadBuildWrapper, +}; + +// minimum number of core kept for background requests +const BACKGROUND_REQUEST_CORE_LOWER_BOUND: f64 = 1.0; +// max ratio of core quota for background requests +const BACKGROUND_REQUEST_CORE_MAX_RATIO: f64 = 0.95; +// default ratio of core quota for background requests = core_number * 0.5 +const BACKGROUND_REQUEST_CORE_DEFAULT_RATIO: f64 = 0.5; +// indication of TiKV instance is short of cpu +const SYSTEM_BUSY_THRESHOLD: f64 = 0.80; +// indication of TiKV instance in healthy state when cpu usage is in [0.5, 0.80) +const SYSTEM_HEALTHY_THRESHOLD: f64 = 0.50; +// pace of cpu quota adjustment +const CPU_QUOTA_ADJUSTMENT_PACE: f64 = 200.0; // 0.2 vcpu + +#[inline] +fn run_impl(config: TikvConfig) { + let mut tikv = TikvServer::::init::(config); + + // Must be called after `TikvServer::init`. 
+ let memory_limit = tikv.config.memory_usage_limit.unwrap().0; + let high_water = (tikv.config.memory_usage_high_water * memory_limit as f64) as u64; + register_memory_usage_high_water(high_water); + + tikv.check_conflict_addr(); + tikv.init_fs(); + tikv.init_yatp(); + tikv.init_encryption(); + let fetcher = tikv.init_io_utility(); + let listener = tikv.init_flow_receiver(); + let (raft_engine, engines_info) = tikv.init_raw_engines(listener); + tikv.init_engines(raft_engine); + let server_config = tikv.init_servers::(); + tikv.register_services(); + tikv.init_metrics_flusher(fetcher, engines_info); + tikv.init_storage_stats_task(); + tikv.run_server(server_config); + tikv.run_status_server(); + tikv.init_quota_tuning_task(tikv.quota_limiter.clone()); + + // TODO: support signal dump stats + signal_handler::wait_for_signal( + None as Option>, + tikv.kv_statistics.clone(), + tikv.raft_statistics.clone(), + ); + tikv.stop(); +} + +/// Run a TiKV server. Returns when the server is shutdown by the user, in which +/// case the server will be properly stopped. +pub fn run_tikv(config: TikvConfig) { + // Sets the global logger ASAP. + // It is okay to use the config w/o `validate()`, + // because `initial_logger()` handles various conditions. + initial_logger(&config); + + // Print version information. + let build_timestamp = option_env!("TIKV_BUILD_TIME"); + tikv::log_tikv_info(build_timestamp); + + // Print resource quota. + SysQuota::log_quota(); + CPU_CORES_QUOTA_GAUGE.set(SysQuota::cpu_cores_quota()); + + // Do some prepare works before start. 
+ pre_start(); + + let _m = Monitor::default(); + + dispatch_api_version!(config.storage.api_version(), { + if !config.raft_engine.enable { + run_impl::(config) + } else { + run_impl::(config) + } + }) +} + +const RESERVED_OPEN_FDS: u64 = 1000; + +const DEFAULT_METRICS_FLUSH_INTERVAL: Duration = Duration::from_millis(10_000); +const DEFAULT_MEMTRACE_FLUSH_INTERVAL: Duration = Duration::from_millis(1_000); +const DEFAULT_ENGINE_METRICS_RESET_INTERVAL: Duration = Duration::from_millis(60_000); +const DEFAULT_STORAGE_STATS_INTERVAL: Duration = Duration::from_secs(1); +const DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL: Duration = Duration::from_secs(5); + +/// A complete TiKV server. +struct TikvServer { + config: TikvConfig, + cfg_controller: Option, + security_mgr: Arc, + pd_client: Arc, + flow_info_sender: Option>, + flow_info_receiver: Option>, + node: Option>, + resolver: Option, + store_path: PathBuf, + snap_mgr: Option, // Will be filled in `init_servers`. + encryption_key_manager: Option>, + engines: Option>, + kv_statistics: Option>, + raft_statistics: Option>, + servers: Option>, + region_info_accessor: Option, + coprocessor_host: Option>, + to_stop: Vec>, + lock_files: Vec, + concurrency_manager: ConcurrencyManager, + env: Arc, + background_worker: Worker, + check_leader_worker: Worker, + sst_worker: Option>>, + quota_limiter: Arc, + causal_ts_provider: Option>, // used for rawkv apiv2 + tablet_registry: Option>, +} + +struct TikvEngines { + raft_engine: ER, + engine: RaftKv2, +} + +struct Servers { + lock_mgr: LockManager, + server: LocalServer, +} + +type LocalServer = Server>; + +impl TikvServer +where + ER: RaftEngine, +{ + fn init(mut config: TikvConfig) -> TikvServer { + tikv_util::thread_group::set_properties(Some(GroupProperties::default())); + // It is okay use pd config and security config before `init_config`, + // because these configs must be provided by command line, and only + // used during startup process. 
+ let security_mgr = Arc::new( + SecurityManager::new(&config.security) + .unwrap_or_else(|e| fatal!("failed to create security manager: {}", e)), + ); + let env = Arc::new( + EnvBuilder::new() + .cq_count(config.server.grpc_concurrency) + .name_prefix(thd_name!(GRPC_THREAD_PREFIX)) + .build(), + ); + let pd_client = + Self::connect_to_pd_cluster(&mut config, env.clone(), Arc::clone(&security_mgr)); + + // Initialize and check config + let cfg_controller = Self::init_config(config); + let config = cfg_controller.get_current(); + + let store_path = Path::new(&config.storage.data_dir).to_owned(); + + let thread_count = config.server.background_thread_count; + let background_worker = WorkerBuilder::new("background") + .thread_count(thread_count) + .create(); + + // Initialize concurrency manager + let latest_ts = block_on(pd_client.get_tso()).expect("failed to get timestamp from PD"); + let concurrency_manager = ConcurrencyManager::new(latest_ts); + + // use different quota for front-end and back-end requests + let quota_limiter = Arc::new(QuotaLimiter::new( + config.quota.foreground_cpu_time, + config.quota.foreground_write_bandwidth, + config.quota.foreground_read_bandwidth, + config.quota.background_cpu_time, + config.quota.background_write_bandwidth, + config.quota.background_read_bandwidth, + config.quota.max_delay_duration, + config.quota.enable_auto_tune, + )); + + let mut causal_ts_provider = None; + if let ApiVersion::V2 = F::TAG { + let tso = block_on(causal_ts::BatchTsoProvider::new_opt( + pd_client.clone(), + config.causal_ts.renew_interval.0, + config.causal_ts.alloc_ahead_buffer.0, + config.causal_ts.renew_batch_min_size, + config.causal_ts.renew_batch_max_size, + )); + if let Err(e) = tso { + fatal!("Causal timestamp provider initialize failed: {:?}", e); + } + causal_ts_provider = Some(Arc::new(tso.unwrap().into())); + info!("Causal timestamp provider startup."); + } + + // Run check leader in a dedicate thread, because it is time sensitive + // and 
crucial to TiCDC replication lag. + let check_leader_worker = WorkerBuilder::new("check_leader").thread_count(1).create(); + + TikvServer { + config, + cfg_controller: Some(cfg_controller), + security_mgr, + pd_client, + node: None, + resolver: None, + store_path, + snap_mgr: None, + encryption_key_manager: None, + engines: None, + kv_statistics: None, + raft_statistics: None, + servers: None, + region_info_accessor: None, + coprocessor_host: None, + to_stop: vec![], + lock_files: vec![], + concurrency_manager, + env, + background_worker, + check_leader_worker, + flow_info_sender: None, + flow_info_receiver: None, + sst_worker: None, + quota_limiter, + causal_ts_provider, + tablet_registry: None, + } + } + + /// Initialize and check the config + /// + /// Warnings are logged and fatal errors exist. + /// + /// # Fatal errors + /// + /// - If `dynamic config` feature is enabled and failed to register config + /// to PD + /// - If some critical configs (like data dir) are differrent from last run + /// - If the config can't pass `validate()` + /// - If the max open file descriptor limit is not high enough to support + /// the main database and the raft database. 
+ fn init_config(mut config: TikvConfig) -> ConfigController { + validate_and_persist_config(&mut config, true); + + ensure_dir_exist(&config.storage.data_dir).unwrap(); + if !config.rocksdb.wal_dir.is_empty() { + ensure_dir_exist(&config.rocksdb.wal_dir).unwrap(); + } + if config.raft_engine.enable { + ensure_dir_exist(&config.raft_engine.config().dir).unwrap(); + } else { + ensure_dir_exist(&config.raft_store.raftdb_path).unwrap(); + if !config.raftdb.wal_dir.is_empty() { + ensure_dir_exist(&config.raftdb.wal_dir).unwrap(); + } + } + + check_system_config(&config); + + tikv_util::set_panic_hook(config.abort_on_panic, &config.storage.data_dir); + + info!( + "using config"; + "config" => serde_json::to_string(&config).unwrap(), + ); + if config.panic_when_unexpected_key_or_data { + info!("panic-when-unexpected-key-or-data is on"); + tikv_util::set_panic_when_unexpected_key_or_data(true); + } + + config.write_into_metrics(); + + ConfigController::new(config) + } + + fn connect_to_pd_cluster( + config: &mut TikvConfig, + env: Arc, + security_mgr: Arc, + ) -> Arc { + let pd_client = Arc::new( + RpcClient::new(&config.pd, Some(env), security_mgr) + .unwrap_or_else(|e| fatal!("failed to create rpc client: {}", e)), + ); + + let cluster_id = pd_client + .get_cluster_id() + .unwrap_or_else(|e| fatal!("failed to get cluster id: {}", e)); + if cluster_id == DEFAULT_CLUSTER_ID { + fatal!("cluster id can't be {}", DEFAULT_CLUSTER_ID); + } + config.server.cluster_id = cluster_id; + info!( + "connect to PD cluster"; + "cluster_id" => cluster_id + ); + + pd_client + } + + fn check_conflict_addr(&mut self) { + let cur_addr: SocketAddr = self + .config + .server + .addr + .parse() + .expect("failed to parse into a socket address"); + let cur_ip = cur_addr.ip(); + let cur_port = cur_addr.port(); + let lock_dir = get_lock_dir(); + + let search_base = env::temp_dir().join(lock_dir); + file_system::create_dir_all(&search_base) + .unwrap_or_else(|_| panic!("create {} failed", 
search_base.display())); + + for entry in file_system::read_dir(&search_base).unwrap().flatten() { + if !entry.file_type().unwrap().is_file() { + continue; + } + let file_path = entry.path(); + let file_name = file_path.file_name().unwrap().to_str().unwrap(); + if let Ok(addr) = file_name.replace('_', ":").parse::() { + let ip = addr.ip(); + let port = addr.port(); + if cur_port == port + && (cur_ip == ip || cur_ip.is_unspecified() || ip.is_unspecified()) + { + let _ = try_lock_conflict_addr(file_path); + } + } + } + + let cur_path = search_base.join(cur_addr.to_string().replace(':', "_")); + let cur_file = try_lock_conflict_addr(cur_path); + self.lock_files.push(cur_file); + } + + fn init_fs(&mut self) { + let lock_path = self.store_path.join(Path::new("LOCK")); + + let f = File::create(lock_path.as_path()) + .unwrap_or_else(|e| fatal!("failed to create lock at {}: {}", lock_path.display(), e)); + if f.try_lock_exclusive().is_err() { + fatal!( + "lock {} failed, maybe another instance is using this directory.", + self.store_path.display() + ); + } + self.lock_files.push(f); + + if tikv_util::panic_mark_file_exists(&self.config.storage.data_dir) { + fatal!( + "panic_mark_file {} exists, there must be something wrong with the db. \ + Do not remove the panic_mark_file and force the TiKV node to restart. \ + Please contact TiKV maintainers to investigate the issue. \ + If needed, use scale in and scale out to replace the TiKV node. \ + https://docs.pingcap.com/tidb/stable/scale-tidb-using-tiup", + tikv_util::panic_mark_file_path(&self.config.storage.data_dir).display() + ); + } + + // We truncate a big file to make sure that both raftdb and kvdb of TiKV have + // enough space to do compaction and region migration when TiKV recover. + // This file is created in data_dir rather than db_path, because we must not + // increase store size of db_path. 
+ fn calculate_reserved_space(capacity: u64, reserved_size_from_config: u64) -> u64 { + let mut reserved_size = reserved_size_from_config; + if reserved_size_from_config != 0 { + reserved_size = + cmp::max((capacity as f64 * 0.05) as u64, reserved_size_from_config); + } + reserved_size + } + fn reserve_physical_space(data_dir: &String, available: u64, reserved_size: u64) { + let path = Path::new(data_dir).join(file_system::SPACE_PLACEHOLDER_FILE); + if let Err(e) = file_system::remove_file(path) { + warn!("failed to remove space holder on starting: {}", e); + } + + // place holder file size is 20% of total reserved space. + if available > reserved_size { + file_system::reserve_space_for_recover(data_dir, reserved_size / 5) + .map_err(|e| panic!("Failed to reserve space for recovery: {}.", e)) + .unwrap(); + } else { + warn!("no enough disk space left to create the place holder file"); + } + } + + let disk_stats = fs2::statvfs(&self.config.storage.data_dir).unwrap(); + let mut capacity = disk_stats.total_space(); + if self.config.raft_store.capacity.0 > 0 { + capacity = cmp::min(capacity, self.config.raft_store.capacity.0); + } + // reserve space for kv engine + let kv_reserved_size = + calculate_reserved_space(capacity, self.config.storage.reserve_space.0); + disk::set_disk_reserved_space(kv_reserved_size); + reserve_physical_space( + &self.config.storage.data_dir, + disk_stats.available_space(), + kv_reserved_size, + ); + + let raft_data_dir = if self.config.raft_engine.enable { + self.config.raft_engine.config().dir + } else { + self.config.raft_store.raftdb_path.clone() + }; + + let separated_raft_mount_path = + path_in_diff_mount_point(&self.config.storage.data_dir, &raft_data_dir); + if separated_raft_mount_path { + let raft_disk_stats = fs2::statvfs(&raft_data_dir).unwrap(); + // reserve space for raft engine if raft engine is deployed separately + let raft_reserved_size = calculate_reserved_space( + raft_disk_stats.total_space(), + 
self.config.storage.reserve_raft_space.0, + ); + disk::set_raft_disk_reserved_space(raft_reserved_size); + reserve_physical_space( + &raft_data_dir, + raft_disk_stats.available_space(), + raft_reserved_size, + ); + } + } + + fn init_yatp(&self) { + yatp::metrics::set_namespace(Some("tikv")); + prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL0_CHANCE.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::MULTILEVEL_LEVEL_ELAPSED.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::TASK_EXEC_DURATION.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::TASK_POLL_DURATION.clone())).unwrap(); + prometheus::register(Box::new(yatp::metrics::TASK_EXEC_TIMES.clone())).unwrap(); + } + + fn init_encryption(&mut self) { + self.encryption_key_manager = data_key_manager_from_config( + &self.config.security.encryption, + &self.config.storage.data_dir, + ) + .map_err(|e| { + panic!( + "Encryption failed to initialize: {}. code: {}", + e, + e.error_code() + ) + }) + .unwrap() + .map(Arc::new); + } + + fn init_flow_receiver(&mut self) -> engine_rocks::FlowListener { + let (tx, rx) = mpsc::channel(); + self.flow_info_sender = Some(tx.clone()); + self.flow_info_receiver = Some(rx); + engine_rocks::FlowListener::new(tx) + } + + fn init_engines(&mut self, raft_engine: ER) { + let tablet_registry = self.tablet_registry.clone().unwrap(); + let mut node = NodeV2::new( + &self.config.server, + self.pd_client.clone(), + None, + tablet_registry, + ); + node.try_bootstrap_store(&self.config.raft_store, &raft_engine) + .unwrap_or_else(|e| fatal!("failed to bootstrap store: {:?}", e)); + assert_ne!(node.id(), 0); + + let router = node.router(); + let mut coprocessor_host: CoprocessorHost = CoprocessorHost::new( + router.store_router().clone(), + self.config.coprocessor.clone(), + ); + let region_info_accessor = RegionInfoAccessor::new(&mut coprocessor_host); + + let engine = RaftKv2::new(router.clone(), region_info_accessor.region_leaders()); 
+ + self.engines = Some(TikvEngines { + raft_engine, + engine, + }); + self.node = Some(node); + self.coprocessor_host = Some(coprocessor_host); + self.region_info_accessor = Some(region_info_accessor); + } + + fn init_gc_worker(&mut self) -> GcWorker> { + let engines = self.engines.as_ref().unwrap(); + let gc_worker = GcWorker::new( + engines.engine.clone(), + self.flow_info_sender.take().unwrap(), + self.config.gc.clone(), + self.pd_client.feature_gate().clone(), + Arc::new(self.region_info_accessor.clone().unwrap()), + ); + + let cfg_controller = self.cfg_controller.as_mut().unwrap(); + cfg_controller.register( + tikv::config::Module::Gc, + Box::new(gc_worker.get_config_manager()), + ); + + gc_worker + } + + fn init_servers(&mut self) -> Arc> { + let flow_controller = Arc::new(FlowController::Tablet(TabletFlowController::new( + &self.config.storage.flow_control, + self.tablet_registry.clone().unwrap(), + self.flow_info_receiver.take().unwrap(), + ))); + let mut gc_worker = self.init_gc_worker(); + let ttl_checker = Box::new(LazyWorker::new("ttl-checker")); + let ttl_scheduler = ttl_checker.scheduler(); + + let cfg_controller = self.cfg_controller.as_mut().unwrap(); + + cfg_controller.register( + tikv::config::Module::Quota, + Box::new(QuotaLimitConfigManager::new(Arc::clone( + &self.quota_limiter, + ))), + ); + + cfg_controller.register(tikv::config::Module::Log, Box::new(LogConfigManager)); + + let lock_mgr = LockManager::new(&self.config.pessimistic_txn); + cfg_controller.register( + tikv::config::Module::PessimisticTxn, + Box::new(lock_mgr.config_manager()), + ); + lock_mgr.register_detector_role_change_observer(self.coprocessor_host.as_mut().unwrap()); + + let engines = self.engines.as_ref().unwrap(); + + let pd_worker = LazyWorker::new("pd-worker"); + let pd_sender = raftstore_v2::FlowReporter::new(pd_worker.scheduler()); + + let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { + Some(build_yatp_read_pool( + 
&self.config.readpool.unified, + pd_sender.clone(), + engines.engine.clone(), + )) + } else { + None + }; + + // The `DebugService` and `DiagnosticsService` will share the same thread pool + let props = tikv_util::thread_group::current_properties(); + let debug_thread_pool = Arc::new( + Builder::new_multi_thread() + .thread_name(thd_name!("debugger")) + .worker_threads(1) + .after_start_wrapper(move || { + tikv_alloc::add_thread_memory_accessor(); + tikv_util::thread_group::set_properties(props.clone()); + }) + .before_stop_wrapper(tikv_alloc::remove_thread_memory_accessor) + .build() + .unwrap(), + ); + + // Start resource metering. + let (recorder_notifier, collector_reg_handle, resource_tag_factory, recorder_worker) = + resource_metering::init_recorder(self.config.resource_metering.precision.as_millis()); + self.to_stop.push(recorder_worker); + let (reporter_notifier, data_sink_reg_handle, reporter_worker) = + resource_metering::init_reporter( + self.config.resource_metering.clone(), + collector_reg_handle, + ); + self.to_stop.push(reporter_worker); + let (address_change_notifier, single_target_worker) = resource_metering::init_single_target( + self.config.resource_metering.receiver_address.clone(), + self.env.clone(), + data_sink_reg_handle, + ); + self.to_stop.push(single_target_worker); + + let cfg_manager = resource_metering::ConfigManager::new( + self.config.resource_metering.clone(), + recorder_notifier, + reporter_notifier, + address_change_notifier, + ); + cfg_controller.register( + tikv::config::Module::ResourceMetering, + Box::new(cfg_manager), + ); + + let storage_read_pool_handle = if self.config.readpool.storage.use_unified_pool() { + unified_read_pool.as_ref().unwrap().handle() + } else { + let storage_read_pools = ReadPool::from(storage::build_read_pool( + &self.config.readpool.storage, + pd_sender.clone(), + engines.engine.clone(), + )); + storage_read_pools.handle() + }; + + let storage = Storage::<_, _, F>::from_engine( + 
engines.engine.clone(), + &self.config.storage, + storage_read_pool_handle, + lock_mgr.clone(), + self.concurrency_manager.clone(), + lock_mgr.get_storage_dynamic_configs(), + flow_controller.clone(), + pd_sender.clone(), + resource_tag_factory.clone(), + Arc::clone(&self.quota_limiter), + self.pd_client.feature_gate().clone(), + self.causal_ts_provider.clone(), + ) + .unwrap_or_else(|e| fatal!("failed to create raft storage: {}", e)); + cfg_controller.register( + tikv::config::Module::Storage, + Box::new(StorageConfigManger::new( + self.tablet_registry.as_ref().unwrap().clone(), + ttl_scheduler, + flow_controller, + storage.get_scheduler(), + )), + ); + + let (resolver, state) = resolve::new_resolver( + self.pd_client.clone(), + &self.background_worker, + storage.get_engine().raft_extension(), + ); + self.resolver = Some(resolver); + + ReplicaReadLockChecker::new(self.concurrency_manager.clone()) + .register(self.coprocessor_host.as_mut().unwrap()); + + // Create snapshot manager, server. + let snap_path = self + .store_path + .join(Path::new("tablet_snap")) + .to_str() + .unwrap() + .to_owned(); + + let snap_mgr = match TabletSnapManager::new(&snap_path) { + Ok(mgr) => mgr, + Err(e) => fatal!("failed to create snapshot manager at {}: {}", snap_path, e), + }; + + // Create coprocessor endpoint. 
+ let cop_read_pool_handle = if self.config.readpool.coprocessor.use_unified_pool() { + unified_read_pool.as_ref().unwrap().handle() + } else { + let cop_read_pools = ReadPool::from(coprocessor::readpool_impl::build_read_pool( + &self.config.readpool.coprocessor, + pd_sender, + engines.engine.clone(), + )); + cop_read_pools.handle() + }; + + let check_leader_runner = CheckLeaderRunner::new( + self.node.as_ref().unwrap().router().store_meta().clone(), + self.coprocessor_host.clone().unwrap(), + ); + let check_leader_scheduler = self + .check_leader_worker + .start("check-leader", check_leader_runner); + + let server_config = Arc::new(VersionTrack::new(self.config.server.clone())); + + self.config + .raft_store + .validate( + self.config.coprocessor.region_split_size, + self.config.coprocessor.enable_region_bucket, + self.config.coprocessor.region_bucket_size, + ) + .unwrap_or_else(|e| fatal!("failed to validate raftstore config {}", e)); + let raft_store = Arc::new(VersionTrack::new(self.config.raft_store.clone())); + let health_service = HealthService::default(); + + let node = self.node.as_ref().unwrap(); + + self.snap_mgr = Some(snap_mgr.clone()); + // Create server + let server = Server::new( + node.id(), + &server_config, + &self.security_mgr, + storage, + coprocessor::Endpoint::new( + &server_config.value(), + cop_read_pool_handle, + self.concurrency_manager.clone(), + resource_tag_factory, + Arc::clone(&self.quota_limiter), + ), + coprocessor_v2::Endpoint::new(&self.config.coprocessor_v2), + self.resolver.clone().unwrap(), + Either::Right(snap_mgr.clone()), + gc_worker.clone(), + check_leader_scheduler, + self.env.clone(), + unified_read_pool, + debug_thread_pool, + health_service, + ) + .unwrap_or_else(|e| fatal!("failed to create server: {}", e)); + cfg_controller.register( + tikv::config::Module::Server, + Box::new(ServerConfigManager::new( + server.get_snap_worker_scheduler(), + server_config.clone(), + server.get_grpc_mem_quota().clone(), + )), + ); + + 
let split_config_manager = + SplitConfigManager::new(Arc::new(VersionTrack::new(self.config.split.clone()))); + cfg_controller.register(tikv::config::Module::Split, Box::new(split_config_manager)); + + // `ConsistencyCheckObserver` must be registered before `Node::start`. + let safe_point = Arc::new(AtomicU64::new(0)); + let observer = match self.config.coprocessor.consistency_check_method { + ConsistencyCheckMethod::Mvcc => BoxConsistencyCheckObserver::new( + MvccConsistencyCheckObserver::new(safe_point.clone()), + ), + ConsistencyCheckMethod::Raw => { + BoxConsistencyCheckObserver::new(RawConsistencyCheckObserver::default()) + } + }; + self.coprocessor_host + .as_mut() + .unwrap() + .registry + .register_consistency_check_observer(100, observer); + + self.node + .as_mut() + .unwrap() + .start( + engines.raft_engine.clone(), + server.transport(), + snap_mgr, + self.concurrency_manager.clone(), + self.causal_ts_provider.clone(), + self.coprocessor_host.clone().unwrap(), + self.background_worker.clone(), + pd_worker, + raft_store, + &state, + ) + .unwrap_or_else(|e| fatal!("failed to start node: {}", e)); + + // Start auto gc. Must after `Node::start` because `node_id` is initialized + // there. 
+ let store_id = self.node.as_ref().unwrap().id(); + let auto_gc_config = AutoGcConfig::new( + self.pd_client.clone(), + self.region_info_accessor.clone().unwrap(), + store_id, + ); + gc_worker + .start(store_id) + .unwrap_or_else(|e| fatal!("failed to start gc worker: {}", e)); + if let Err(e) = gc_worker.start_auto_gc(auto_gc_config, safe_point) { + fatal!("failed to start auto_gc on storage, error: {}", e); + } + + initial_metric(&self.config.metric); + + self.servers = Some(Servers { lock_mgr, server }); + + server_config + } + + fn register_services(&mut self) { + let servers = self.servers.as_mut().unwrap(); + + // Create Diagnostics service + let diag_service = DiagnosticsService::new( + servers.server.get_debug_thread_pool().clone(), + self.config.log.file.filename.clone(), + self.config.slow_log_file.clone(), + ); + if servers + .server + .register_service(create_diagnostics(diag_service)) + .is_some() + { + fatal!("failed to register diagnostics service"); + } + + // Lock manager. 
+ if servers + .server + .register_service(create_deadlock(servers.lock_mgr.deadlock_service())) + .is_some() + { + fatal!("failed to register deadlock service"); + } + + servers + .lock_mgr + .start( + self.node.as_ref().unwrap().id(), + self.pd_client.clone(), + self.resolver.clone().unwrap(), + self.security_mgr.clone(), + &self.config.pessimistic_txn, + ) + .unwrap_or_else(|e| fatal!("failed to start lock manager: {}", e)); + } + + fn init_io_utility(&mut self) -> BytesFetcher { + let stats_collector_enabled = file_system::init_io_stats_collector() + .map_err(|e| warn!("failed to init I/O stats collector: {}", e)) + .is_ok(); + + let limiter = Arc::new( + self.config + .storage + .io_rate_limit + .build(!stats_collector_enabled /* enable_statistics */), + ); + let fetcher = if stats_collector_enabled { + BytesFetcher::FromIoStatsCollector() + } else { + BytesFetcher::FromRateLimiter(limiter.statistics().unwrap()) + }; + // Set up IO limiter even when rate limit is disabled, so that rate limits can + // be dynamically applied later on. + set_io_rate_limiter(Some(limiter)); + fetcher + } + + fn init_metrics_flusher( + &mut self, + fetcher: BytesFetcher, + engines_info: Arc, + ) { + let mut engine_metrics = EngineMetricsManager::::new( + self.tablet_registry.clone().unwrap(), + self.kv_statistics.clone(), + self.config.rocksdb.titan.enabled, + self.engines.as_ref().unwrap().raft_engine.clone(), + self.raft_statistics.clone(), + ); + let mut io_metrics = IoMetricsManager::new(fetcher); + let engines_info_clone = engines_info.clone(); + + // region_id -> (suffix, tablet) + // `update` of EnginesResourceInfo is called perodically which needs this map + // for recording the latest tablet for each region. + // `cached_latest_tablets` is passed to `update` to avoid memory + // allocation each time when calling `update`. 
+ let mut cached_latest_tablets = HashMap::default(); + self.background_worker + .spawn_interval_task(DEFAULT_METRICS_FLUSH_INTERVAL, move || { + let now = Instant::now(); + engine_metrics.flush(now); + io_metrics.flush(now); + engines_info_clone.update(now, &mut cached_latest_tablets); + }); + if let Some(limiter) = get_io_rate_limiter() { + limiter.set_low_priority_io_adjustor_if_needed(Some(engines_info)); + } + + let mut mem_trace_metrics = MemoryTraceManager::default(); + mem_trace_metrics.register_provider(MEMTRACE_RAFTSTORE.clone()); + mem_trace_metrics.register_provider(MEMTRACE_COPROCESSOR.clone()); + self.background_worker + .spawn_interval_task(DEFAULT_MEMTRACE_FLUSH_INTERVAL, move || { + let now = Instant::now(); + mem_trace_metrics.flush(now); + }); + } + + // Only background cpu quota tuning is implemented at present. iops and frontend + // quota tuning is on the way + fn init_quota_tuning_task(&self, quota_limiter: Arc) { + // No need to do auto tune when capacity is really low + if SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_MAX_RATIO + < BACKGROUND_REQUEST_CORE_LOWER_BOUND + { + return; + }; + + // Determine the base cpu quota + let base_cpu_quota = + // if cpu quota is not specified, start from optimistic case + if quota_limiter.cputime_limiter(false).is_infinite() { + 1000_f64 + * f64::max( + BACKGROUND_REQUEST_CORE_LOWER_BOUND, + SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_DEFAULT_RATIO, + ) + } else { + quota_limiter.cputime_limiter(false) / 1000_f64 + }; + + // Calculate the celling and floor quota + let celling_quota = f64::min( + base_cpu_quota * 2.0, + 1_000_f64 * SysQuota::cpu_cores_quota() * BACKGROUND_REQUEST_CORE_MAX_RATIO, + ); + let floor_quota = f64::max( + base_cpu_quota * 0.5, + 1_000_f64 * BACKGROUND_REQUEST_CORE_LOWER_BOUND, + ); + + let mut proc_stats: ProcessStat = ProcessStat::cur_proc_stat().unwrap(); + self.background_worker.spawn_interval_task( + DEFAULT_QUOTA_LIMITER_TUNE_INTERVAL, + move || { + if 
quota_limiter.auto_tune_enabled() { + let cputime_limit = quota_limiter.cputime_limiter(false); + let old_quota = if cputime_limit.is_infinite() { + base_cpu_quota + } else { + cputime_limit / 1000_f64 + }; + let cpu_usage = match proc_stats.cpu_usage() { + Ok(r) => r, + Err(_e) => 0.0, + }; + // Try tuning quota when cpu_usage is correctly collected. + // rule based tuning: + // - if instance is busy, shrink cpu quota for analyze by one quota pace until + // lower bound is hit; + // - if instance cpu usage is healthy, no op; + // - if instance is idle, increase cpu quota by one quota pace until upper + // bound is hit. + if cpu_usage > 0.0f64 { + let mut target_quota = old_quota; + + let cpu_util = cpu_usage / SysQuota::cpu_cores_quota(); + if cpu_util >= SYSTEM_BUSY_THRESHOLD { + target_quota = + f64::max(target_quota - CPU_QUOTA_ADJUSTMENT_PACE, floor_quota); + } else if cpu_util < SYSTEM_HEALTHY_THRESHOLD { + target_quota = + f64::min(target_quota + CPU_QUOTA_ADJUSTMENT_PACE, celling_quota); + } + + if old_quota != target_quota { + quota_limiter.set_cpu_time_limit(target_quota as usize, false); + debug!( + "cpu_time_limiter tuned for backend request"; + "cpu_util" => ?cpu_util, + "new quota" => ?target_quota); + INSTANCE_BACKEND_CPU_QUOTA.set(target_quota as i64); + } + } + } + }, + ); + } + + fn init_storage_stats_task(&self) { + let config_disk_capacity: u64 = self.config.raft_store.capacity.0; + let data_dir = self.config.storage.data_dir.clone(); + let store_path = self.store_path.clone(); + let snap_mgr = self.snap_mgr.clone().unwrap(); + let reserve_space = disk::get_disk_reserved_space(); + let reserve_raft_space = disk::get_raft_disk_reserved_space(); + if reserve_space == 0 && reserve_raft_space == 0 { + info!("disk space checker not enabled"); + return; + } + let raft_engine = self.engines.as_ref().unwrap().raft_engine.clone(); + let tablet_registry = self.tablet_registry.clone().unwrap(); + let raft_path = raft_engine.get_engine_path().to_string(); + 
let separated_raft_mount_path = + path_in_diff_mount_point(raft_path.as_str(), tablet_registry.tablet_root()); + let raft_almost_full_threshold = reserve_raft_space; + let raft_already_full_threshold = reserve_raft_space / 2; + + let almost_full_threshold = reserve_space; + let already_full_threshold = reserve_space / 2; + fn calculate_disk_usage(a: disk::DiskUsage, b: disk::DiskUsage) -> disk::DiskUsage { + match (a, b) { + (disk::DiskUsage::AlreadyFull, _) => disk::DiskUsage::AlreadyFull, + (_, disk::DiskUsage::AlreadyFull) => disk::DiskUsage::AlreadyFull, + (disk::DiskUsage::AlmostFull, _) => disk::DiskUsage::AlmostFull, + (_, disk::DiskUsage::AlmostFull) => disk::DiskUsage::AlmostFull, + (disk::DiskUsage::Normal, disk::DiskUsage::Normal) => disk::DiskUsage::Normal, + } + } + self.background_worker + .spawn_interval_task(DEFAULT_STORAGE_STATS_INTERVAL, move || { + let disk_stats = match fs2::statvfs(&store_path) { + Err(e) => { + error!( + "get disk stat for kv store failed"; + "kv path" => store_path.to_str(), + "err" => ?e + ); + return; + } + Ok(stats) => stats, + }; + let disk_cap = disk_stats.total_space(); + let snap_size = snap_mgr.total_snap_size().unwrap(); + + let mut kv_size = 0; + tablet_registry.for_each_opened_tablet(|_, cached| { + if let Some(tablet) = cached.latest() { + kv_size += tablet.get_engine_used_size().unwrap_or(0); + } + true + }); + + let raft_size = raft_engine + .get_engine_size() + .expect("get raft engine size"); + + let mut raft_disk_status = disk::DiskUsage::Normal; + if separated_raft_mount_path && reserve_raft_space != 0 { + let raft_disk_stats = match fs2::statvfs(&raft_path) { + Err(e) => { + error!( + "get disk stat for raft engine failed"; + "raft engine path" => raft_path.clone(), + "err" => ?e + ); + return; + } + Ok(stats) => stats, + }; + let raft_disk_cap = raft_disk_stats.total_space(); + let mut raft_disk_available = + raft_disk_cap.checked_sub(raft_size).unwrap_or_default(); + raft_disk_available = 
cmp::min(raft_disk_available, raft_disk_stats.available_space()); + raft_disk_status = if raft_disk_available <= raft_already_full_threshold + { + disk::DiskUsage::AlreadyFull + } else if raft_disk_available <= raft_almost_full_threshold + { + disk::DiskUsage::AlmostFull + } else { + disk::DiskUsage::Normal + }; + } + let placeholer_file_path = PathBuf::from_str(&data_dir) + .unwrap() + .join(Path::new(file_system::SPACE_PLACEHOLDER_FILE)); + + let placeholder_size: u64 = + file_system::get_file_size(placeholer_file_path).unwrap_or(0); + + let used_size = if !separated_raft_mount_path { + snap_size + kv_size + raft_size + placeholder_size + } else { + snap_size + kv_size + placeholder_size + }; + let capacity = if config_disk_capacity == 0 || disk_cap < config_disk_capacity { + disk_cap + } else { + config_disk_capacity + }; + + let mut available = capacity.checked_sub(used_size).unwrap_or_default(); + available = cmp::min(available, disk_stats.available_space()); + + let prev_disk_status = disk::get_disk_status(0); //0 no need care about failpoint. 
+ let cur_kv_disk_status = if available <= already_full_threshold { + disk::DiskUsage::AlreadyFull + } else if available <= almost_full_threshold { + disk::DiskUsage::AlmostFull + } else { + disk::DiskUsage::Normal + }; + let cur_disk_status = calculate_disk_usage(raft_disk_status, cur_kv_disk_status); + if prev_disk_status != cur_disk_status { + warn!( + "disk usage {:?}->{:?} (raft engine usage: {:?}, kv engine usage: {:?}), seperated raft mount={}, kv available={}, snap={}, kv={}, raft={}, capacity={}", + prev_disk_status, + cur_disk_status, + raft_disk_status, + cur_kv_disk_status, + separated_raft_mount_path, + available, + snap_size, + kv_size, + raft_size, + capacity + ); + } + disk::set_disk_status(cur_disk_status); + }) + } + + fn init_sst_recovery_sender(&mut self) -> Option> { + if !self + .config + .storage + .background_error_recovery_window + .is_zero() + { + let sst_worker = Box::new(LazyWorker::new("sst-recovery")); + let scheduler = sst_worker.scheduler(); + self.sst_worker = Some(sst_worker); + Some(scheduler) + } else { + None + } + } + + fn run_server(&mut self, server_config: Arc>) { + let server = self.servers.as_mut().unwrap(); + server + .server + .build_and_bind() + .unwrap_or_else(|e| fatal!("failed to build server: {}", e)); + server + .server + .start(server_config, self.security_mgr.clone()) + .unwrap_or_else(|e| fatal!("failed to start server: {}", e)); + } + + fn run_status_server(&mut self) { + // Create a status server. 
+ let status_enabled = !self.config.server.status_addr.is_empty(); + if status_enabled { + let mut status_server = match StatusServer::new( + self.config.server.status_thread_pool_size, + self.cfg_controller.take().unwrap(), + Arc::new(self.config.security.clone()), + self.engines.as_ref().unwrap().engine.raft_extension(), + self.store_path.clone(), + ) { + Ok(status_server) => Box::new(status_server), + Err(e) => { + error_unknown!(%e; "failed to start runtime for status service"); + return; + } + }; + // Start the status server. + if let Err(e) = status_server.start(self.config.server.status_addr.clone()) { + error_unknown!(%e; "failed to bind addr for status service"); + } else { + self.to_stop.push(status_server); + } + } + } + + fn stop(mut self) { + tikv_util::thread_group::mark_shutdown(); + let mut servers = self.servers.unwrap(); + servers + .server + .stop() + .unwrap_or_else(|e| fatal!("failed to stop server: {}", e)); + + self.node.as_mut().unwrap().stop(); + self.region_info_accessor.as_mut().unwrap().stop(); + + servers.lock_mgr.stop(); + + if let Some(sst_worker) = self.sst_worker { + sst_worker.stop_worker(); + } + + self.to_stop.into_iter().for_each(|s| s.stop()); + } +} + +pub trait ConfiguredRaftEngine: RaftEngine { + fn build( + _: &TikvConfig, + _: &Arc, + _: &Option>, + _: &Cache, + ) -> (Self, Option>); + fn as_rocks_engine(&self) -> Option<&RocksEngine>; + fn register_config(&self, _cfg_controller: &mut ConfigController); +} + +impl ConfiguredRaftEngine for T { + default fn build( + _: &TikvConfig, + _: &Arc, + _: &Option>, + _: &Cache, + ) -> (Self, Option>) { + unimplemented!() + } + default fn as_rocks_engine(&self) -> Option<&RocksEngine> { + None + } + default fn register_config(&self, _cfg_controller: &mut ConfigController) {} +} + +impl ConfiguredRaftEngine for RocksEngine { + fn build( + config: &TikvConfig, + env: &Arc, + key_manager: &Option>, + block_cache: &Cache, + ) -> (Self, Option>) { + let mut raft_data_state_machine = 
RaftDataStateMachine::new( + &config.storage.data_dir, + &config.raft_engine.config().dir, + &config.raft_store.raftdb_path, + ); + let should_dump = raft_data_state_machine.before_open_target(); + + let raft_db_path = &config.raft_store.raftdb_path; + let config_raftdb = &config.raftdb; + let statistics = Arc::new(RocksStatistics::new_titan()); + let raft_db_opts = config_raftdb.build_opt(env.clone(), Some(&statistics)); + let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); + let raftdb = engine_rocks::util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) + .expect("failed to open raftdb"); + + if should_dump { + let raft_engine = + RaftLogEngine::new(config.raft_engine.config(), key_manager.clone(), None) + .expect("failed to open raft engine for migration"); + dump_raft_engine_to_raftdb(&raft_engine, &raftdb, 8 /* threads */); + raft_engine.stop(); + drop(raft_engine); + raft_data_state_machine.after_dump_data(); + } + (raftdb, Some(statistics)) + } + + fn as_rocks_engine(&self) -> Option<&RocksEngine> { + Some(self) + } + + fn register_config(&self, cfg_controller: &mut ConfigController) { + cfg_controller.register( + tikv::config::Module::Raftdb, + Box::new(DbConfigManger::new(self.clone(), DbType::Raft)), + ); + } +} + +impl ConfiguredRaftEngine for RaftLogEngine { + fn build( + config: &TikvConfig, + env: &Arc, + key_manager: &Option>, + block_cache: &Cache, + ) -> (Self, Option>) { + let mut raft_data_state_machine = RaftDataStateMachine::new( + &config.storage.data_dir, + &config.raft_store.raftdb_path, + &config.raft_engine.config().dir, + ); + let should_dump = raft_data_state_machine.before_open_target(); + + let raft_config = config.raft_engine.config(); + let raft_engine = + RaftLogEngine::new(raft_config, key_manager.clone(), get_io_rate_limiter()) + .expect("failed to open raft engine"); + + if should_dump { + let config_raftdb = &config.raftdb; + let raft_db_opts = config_raftdb.build_opt(env.clone(), None); + let raft_cf_opts = 
config_raftdb.build_cf_opts(block_cache); + let raftdb = engine_rocks::util::new_engine_opt( + &config.raft_store.raftdb_path, + raft_db_opts, + raft_cf_opts, + ) + .expect("failed to open raftdb for migration"); + dump_raftdb_to_raft_engine(&raftdb, &raft_engine, 8 /* threads */); + raftdb.stop(); + drop(raftdb); + raft_data_state_machine.after_dump_data(); + } + (raft_engine, None) + } +} + +impl TikvServer { + fn init_raw_engines( + &mut self, + flow_listener: engine_rocks::FlowListener, + ) -> (CER, Arc) { + let block_cache = self.config.storage.block_cache.build_shared_cache(); + let env = self + .config + .build_shared_rocks_env(self.encryption_key_manager.clone(), get_io_rate_limiter()) + .unwrap(); + + // Create raft engine + let (raft_engine, raft_statistics) = CER::build( + &self.config, + &env, + &self.encryption_key_manager, + &block_cache, + ); + self.raft_statistics = raft_statistics; + + // Create kv engine. + let builder = KvEngineFactoryBuilder::new(env, &self.config, block_cache) + .sst_recovery_sender(self.init_sst_recovery_sender()) + .flow_listener(flow_listener); + let factory = Box::new(builder.build()); + self.kv_statistics = Some(factory.rocks_statistics()); + let registry = TabletRegistry::new(factory, self.store_path.join("tablets")) + .unwrap_or_else(|e| fatal!("failed to create tablet registry {:?}", e)); + let cfg_controller = self.cfg_controller.as_mut().unwrap(); + cfg_controller.register( + tikv::config::Module::Rocksdb, + Box::new(DbConfigManger::new(registry.clone(), DbType::Kv)), + ); + self.tablet_registry = Some(registry.clone()); + raft_engine.register_config(cfg_controller); + + let engines_info = Arc::new(EnginesResourceInfo::new( + registry, + raft_engine.as_rocks_engine().cloned(), + 180, // max_samples_to_preserve + )); + + (raft_engine, engines_info) + } +} + +/// Various sanity-checks and logging before running a server. +/// +/// Warnings are logged. 
+/// +/// # Logs +/// +/// The presence of these environment variables that affect the database +/// behavior is logged. +/// +/// - `GRPC_POLL_STRATEGY` +/// - `http_proxy` and `https_proxy` +/// +/// # Warnings +/// +/// - if `net.core.somaxconn` < 32768 +/// - if `net.ipv4.tcp_syncookies` is not 0 +/// - if `vm.swappiness` is not 0 +/// - if data directories are not on SSDs +/// - if the "TZ" environment variable is not set on unix +fn pre_start() { + check_environment_variables(); + for e in tikv_util::config::check_kernel() { + warn!( + "check: kernel"; + "err" => %e + ); + } +} + +fn check_system_config(config: &TikvConfig) { + info!("beginning system configuration check"); + let mut rocksdb_max_open_files = config.rocksdb.max_open_files; + if config.rocksdb.titan.enabled { + // Titan engine maintains yet another pool of blob files and uses the same max + // number of open files setup as rocksdb does. So we double the max required + // open files here + rocksdb_max_open_files *= 2; + } + if let Err(e) = tikv_util::config::check_max_open_fds( + RESERVED_OPEN_FDS + (rocksdb_max_open_files + config.raftdb.max_open_files) as u64, + ) { + fatal!("{}", e); + } + + // Check RocksDB data dir + if let Err(e) = tikv_util::config::check_data_dir(&config.storage.data_dir) { + warn!( + "check: rocksdb-data-dir"; + "path" => &config.storage.data_dir, + "err" => %e + ); + } + // Check raft data dir + if let Err(e) = tikv_util::config::check_data_dir(&config.raft_store.raftdb_path) { + warn!( + "check: raftdb-path"; + "path" => &config.raft_store.raftdb_path, + "err" => %e + ); + } +} + +fn try_lock_conflict_addr>(path: P) -> File { + let f = File::create(path.as_ref()).unwrap_or_else(|e| { + fatal!( + "failed to create lock at {}: {}", + path.as_ref().display(), + e + ) + }); + + if f.try_lock_exclusive().is_err() { + fatal!( + "{} already in use, maybe another instance is binding with this address.", + path.as_ref().file_name().unwrap().to_str().unwrap() + ); + } + f +} + 
+#[cfg(unix)] +fn get_lock_dir() -> String { + format!("{}_TIKV_LOCK_FILES", unsafe { libc::getuid() }) +} + +#[cfg(not(unix))] +fn get_lock_dir() -> String { + "TIKV_LOCK_FILES".to_owned() +} + +pub struct EngineMetricsManager { + tablet_registry: TabletRegistry, + kv_statistics: Option>, + kv_is_titan: bool, + raft_engine: ER, + raft_statistics: Option>, + last_reset: Instant, +} + +impl EngineMetricsManager { + pub fn new( + tablet_registry: TabletRegistry, + kv_statistics: Option>, + kv_is_titan: bool, + raft_engine: ER, + raft_statistics: Option>, + ) -> Self { + EngineMetricsManager { + tablet_registry, + kv_statistics, + kv_is_titan, + raft_engine, + raft_statistics, + last_reset: Instant::now(), + } + } + + pub fn flush(&mut self, now: Instant) { + let mut reporter = EK::StatisticsReporter::new("kv"); + self.tablet_registry + .for_each_opened_tablet(|_, db: &mut CachedTablet| { + if let Some(db) = db.latest() { + reporter.collect(db); + } + true + }); + reporter.flush(); + self.raft_engine.flush_metrics("raft"); + + if let Some(s) = self.kv_statistics.as_ref() { + flush_engine_statistics(s, "kv", self.kv_is_titan); + } + if let Some(s) = self.raft_statistics.as_ref() { + flush_engine_statistics(s, "raft", false); + } + if now.saturating_duration_since(self.last_reset) >= DEFAULT_ENGINE_METRICS_RESET_INTERVAL { + if let Some(s) = self.kv_statistics.as_ref() { + s.reset(); + } + if let Some(s) = self.raft_statistics.as_ref() { + s.reset(); + } + self.last_reset = now; + } + } +} + +pub struct EnginesResourceInfo { + tablet_registry: TabletRegistry, + raft_engine: Option, + latest_normalized_pending_bytes: AtomicU32, + normalized_pending_bytes_collector: MovingAvgU32, +} + +impl EnginesResourceInfo { + const SCALE_FACTOR: u64 = 100; + + fn new( + tablet_registry: TabletRegistry, + raft_engine: Option, + max_samples_to_preserve: usize, + ) -> Self { + EnginesResourceInfo { + tablet_registry, + raft_engine, + latest_normalized_pending_bytes: AtomicU32::new(0), + 
normalized_pending_bytes_collector: MovingAvgU32::new(max_samples_to_preserve), + } + } + + pub fn update( + &self, + _now: Instant, + cached_latest_tablets: &mut HashMap>, + ) { + let mut normalized_pending_bytes = 0; + + fn fetch_engine_cf(engine: &RocksEngine, cf: &str, normalized_pending_bytes: &mut u32) { + if let Ok(cf_opts) = engine.get_options_cf(cf) { + if let Ok(Some(b)) = engine.get_cf_pending_compaction_bytes(cf) { + if cf_opts.get_soft_pending_compaction_bytes_limit() > 0 { + *normalized_pending_bytes = std::cmp::max( + *normalized_pending_bytes, + (b * EnginesResourceInfo::SCALE_FACTOR + / cf_opts.get_soft_pending_compaction_bytes_limit()) + as u32, + ); + } + } + } + } + + if let Some(raft_engine) = &self.raft_engine { + fetch_engine_cf(raft_engine, CF_DEFAULT, &mut normalized_pending_bytes); + } + + self.tablet_registry + .for_each_opened_tablet(|id, db: &mut CachedTablet| { + cached_latest_tablets.insert(id, db.clone()); + true + }); + + // todo(SpadeA): Now, there's a potential race condition problem where the + // tablet could be destroyed after the clone and before the fetching + // which could result in a program panic. It's okay now as the single global + // kv_engine will not be destroyed in normal operation and v2 is not + // ready for operation. Furthermore, this race condition is general to v2 as + // tablet clone is not a case that happens exclusively here. We should + // propose another PR to tackle it, such as destroying tablets lazily in a GC + // thread. + + for (_, cache) in cached_latest_tablets.iter_mut() { + let Some(tablet) = cache.latest() else { continue }; + for cf in &[CF_DEFAULT, CF_WRITE, CF_LOCK] { + fetch_engine_cf(tablet, cf, &mut normalized_pending_bytes); + } + } + + // Clearing ensures that these tablets are not held forever. 
+ cached_latest_tablets.clear(); + + let (_, avg) = self + .normalized_pending_bytes_collector + .add(normalized_pending_bytes); + self.latest_normalized_pending_bytes.store( + std::cmp::max(normalized_pending_bytes, avg), + Ordering::Relaxed, + ); + } +} + +impl IoBudgetAdjustor for EnginesResourceInfo { + fn adjust(&self, total_budgets: usize) -> usize { + let score = self.latest_normalized_pending_bytes.load(Ordering::Relaxed) as f32 + / Self::SCALE_FACTOR as f32; + // Two reasons for adding `sqrt` on top: + // 1) In theory the convergence point is independent of the value of pending + // bytes (as long as backlog generating rate equals consuming rate, which is + // determined by compaction budgets), a convex helps reach that point while + // maintaining low level of pending bytes. + // 2) Variance of compaction pending bytes grows with its magnitude, a filter + // with decreasing derivative can help balance such trend. + let score = score.sqrt(); + // The target global write flow slides between Bandwidth / 2 and Bandwidth. 
+ let score = 0.5 + score / 2.0; + (total_budgets as f32 * score) as usize + } +} + +#[cfg(test)] +mod test { + use std::{ + collections::HashMap, + sync::{atomic::Ordering, Arc}, + }; + + use engine_rocks::raw::Env; + use engine_traits::{ + FlowControlFactorsExt, MiscExt, SyncMutable, TabletContext, TabletRegistry, CF_DEFAULT, + }; + use tempfile::Builder; + use tikv::{config::TikvConfig, server::KvEngineFactoryBuilder}; + use tikv_util::{config::ReadableSize, time::Instant}; + + use super::EnginesResourceInfo; + + #[test] + fn test_engines_resource_info_update() { + let mut config = TikvConfig::default(); + config.rocksdb.defaultcf.disable_auto_compactions = true; + config.rocksdb.defaultcf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); + config.rocksdb.writecf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); + config.rocksdb.lockcf.soft_pending_compaction_bytes_limit = Some(ReadableSize(1)); + let env = Arc::new(Env::default()); + let path = Builder::new().prefix("test-update").tempdir().unwrap(); + let cache = config.storage.block_cache.build_shared_cache(); + + let factory = KvEngineFactoryBuilder::new(env, &config, cache).build(); + let reg = TabletRegistry::new(Box::new(factory), path.path().join("tablets")).unwrap(); + + for i in 1..6 { + let ctx = TabletContext::with_infinite_region(i, Some(10)); + reg.load(ctx, true).unwrap(); + } + + let mut cached = reg.get(1).unwrap(); + let mut tablet = cached.latest().unwrap(); + // Prepare some data for two tablets of the same region. So we can test whether + // we fetch the bytes from the latest one. 
+ for i in 1..21 { + tablet.put_cf(CF_DEFAULT, b"key", b"val").unwrap(); + if i % 2 == 0 { + tablet.flush_cf(CF_DEFAULT, true).unwrap(); + } + } + let old_pending_compaction_bytes = tablet + .get_cf_pending_compaction_bytes(CF_DEFAULT) + .unwrap() + .unwrap(); + + let ctx = TabletContext::with_infinite_region(1, Some(20)); + reg.load(ctx, true).unwrap(); + tablet = cached.latest().unwrap(); + + for i in 1..11 { + tablet.put_cf(CF_DEFAULT, b"key", b"val").unwrap(); + if i % 2 == 0 { + tablet.flush_cf(CF_DEFAULT, true).unwrap(); + } + } + let new_pending_compaction_bytes = tablet + .get_cf_pending_compaction_bytes(CF_DEFAULT) + .unwrap() + .unwrap(); + + assert!(old_pending_compaction_bytes > new_pending_compaction_bytes); + + let engines_info = Arc::new(EnginesResourceInfo::new(reg, None, 10)); + + let mut cached_latest_tablets = HashMap::default(); + engines_info.update(Instant::now(), &mut cached_latest_tablets); + + // The memory allocation should be reserved + assert!(cached_latest_tablets.capacity() >= 5); + // The tablet cache should be cleared + assert!(cached_latest_tablets.is_empty()); + + // The latest_normalized_pending_bytes should be equal to the pending compaction + // bytes of tablet_1_20 + assert_eq!( + (new_pending_compaction_bytes * 100) as u32, + engines_info + .latest_normalized_pending_bytes + .load(Ordering::Relaxed) + ); + } +} diff --git a/components/server/src/signal_handler.rs b/components/server/src/signal_handler.rs index a92845b843d..0977a1ed814 100644 --- a/components/server/src/signal_handler.rs +++ b/components/server/src/signal_handler.rs @@ -1,18 +1,29 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
+use std::sync::Arc; + +use engine_rocks::RocksStatistics; +use engine_traits::{Engines, KvEngine, RaftEngine}; + pub use self::imp::wait_for_signal; #[cfg(unix)] mod imp { - use engine_traits::{Engines, KvEngine, MiscExt, RaftEngine}; + use engine_traits::MiscExt; use signal_hook::{ consts::{SIGHUP, SIGINT, SIGTERM, SIGUSR1, SIGUSR2}, iterator::Signals, }; use tikv_util::metrics; + use super::*; + #[allow(dead_code)] - pub fn wait_for_signal(engines: Option>) { + pub fn wait_for_signal( + engines: Option>, + kv_statistics: Option>, + raft_statistics: Option>, + ) { let mut signals = Signals::new([SIGTERM, SIGINT, SIGHUP, SIGUSR1, SIGUSR2]).unwrap(); for signal in &mut signals { match signal { @@ -25,7 +36,13 @@ mod imp { info!("{}", metrics::dump(false)); if let Some(ref engines) = engines { info!("{:?}", MiscExt::dump_stats(&engines.kv)); + if let Some(s) = kv_statistics.as_ref() && let Some(s) = s.to_string() { + info!("{:?}", s); + } info!("{:?}", RaftEngine::dump_stats(&engines.raft)); + if let Some(s) = raft_statistics.as_ref() && let Some(s) = s.to_string() { + info!("{:?}", s); + } } } // TODO: handle more signal @@ -37,7 +54,12 @@ mod imp { #[cfg(not(unix))] mod imp { - use engine_traits::{Engines, KvEngine, RaftEngine}; + use super::*; - pub fn wait_for_signal(_: Option>) {} + pub fn wait_for_signal( + _: Option>, + _: Option>, + _: Option>, + ) { + } } diff --git a/components/snap_recovery/src/init_cluster.rs b/components/snap_recovery/src/init_cluster.rs index fe6c559da27..e7818b3f888 100644 --- a/components/snap_recovery/src/init_cluster.rs +++ b/components/snap_recovery/src/init_cluster.rs @@ -10,7 +10,10 @@ use pd_client::{Error as PdError, PdClient}; use raft_log_engine::RaftLogEngine; use raftstore::store::initial_region; use thiserror::Error; -use tikv::{config::TikvConfig, server::config::Config as ServerConfig}; +use tikv::{ + config::TikvConfig, + server::{config::Config as ServerConfig, KvEngineFactoryBuilder}, +}; use 
tikv_util::config::{ReadableDuration, ReadableSize, VersionTrack}; const CLUSTER_BOOTSTRAPPED_MAX_RETRY: u64 = 60; @@ -89,6 +92,12 @@ pub fn enter_snap_recovery_mode(config: &mut TikvConfig) { // disable resolve ts during the recovery config.resolved_ts.enable = false; + // ebs volume has very poor performance during restore; it is easy to cause + // raft client timeouts and, at the same time, to clean up all messages, + // including significant ones. restore is not memory sensitive, so we keep + // as many messages as possible during network disturbances in recovery mode + config.server.raft_client_max_backoff = ReadableDuration::secs(20); + // Disable region split during recovering. config.coprocessor.region_max_size = Some(ReadableSize::gb(MAX_REGION_SIZE)); config.coprocessor.region_split_size = ReadableSize::gb(MAX_REGION_SIZE); @@ -308,37 +317,26 @@ pub fn create_local_engine_service( let block_cache = config.storage.block_cache.build_shared_cache(); // init rocksdb / kv db - let mut db_opts = config.rocksdb.build_opt(); - db_opts.set_env(env.clone()); - let cf_opts = config - .rocksdb - .build_cf_opts(&block_cache, None, config.storage.api_version()); - let db_path = config - .infer_kv_engine_path(None) - .map_err(|e| format!("infer kvdb path: {}", e))?; - let mut kv_db = match new_engine_opt(&db_path, db_opts, cf_opts) { + let factory = KvEngineFactoryBuilder::new(env.clone(), config, block_cache) + .lite(true) + .build(); + let kv_db = match factory.create_shared_db(&config.storage.data_dir) { Ok(db) => db, Err(e) => handle_engine_error(e), }; - let shared_block_cache = block_cache.is_some(); - kv_db.set_shared_block_cache(shared_block_cache); - // init raft engine, either is rocksdb or raft engine if !config.raft_engine.enable { // rocksdb - let mut raft_db_opts = config.raftdb.build_opt(); - raft_db_opts.set_env(env); - let raft_db_cf_opts = config.raftdb.build_cf_opts(&block_cache); + let raft_db_opts = config.raftdb.build_opt(env, None); + let 
raft_db_cf_opts = config.raftdb.build_cf_opts(factory.block_cache()); let raft_path = config .infer_raft_db_path(None) .map_err(|e| format!("infer raftdb path: {}", e))?; - let mut raft_db = match new_engine_opt(&raft_path, raft_db_opts, raft_db_cf_opts) { + let raft_db = match new_engine_opt(&raft_path, raft_db_opts, raft_db_cf_opts) { Ok(db) => db, Err(e) => handle_engine_error(e), }; - // let mut raft_db = RocksEngine::from_db(Arc::new(raft_db)); - raft_db.set_shared_block_cache(shared_block_cache); let local_engines = LocalEngines::new(Engines::new(kv_db, raft_db)); Ok(Box::new(local_engines) as Box) diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index f9088ff4e3b..b2330e26f93 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -12,7 +12,7 @@ use std::{ use collections::{HashMap, HashSet}; use crossbeam::channel::TrySendError; use encryption_export::DataKeyManager; -use engine_rocks::{RocksEngine, RocksSnapshot}; +use engine_rocks::{RocksEngine, RocksSnapshot, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ CompactExt, Engines, Iterable, MiscExt, Mutable, Peekable, RaftEngineReadOnly, WriteBatch, @@ -170,6 +170,8 @@ pub struct Cluster { group_props: HashMap, pub sst_workers: Vec>, pub sst_workers_map: HashMap, + pub kv_statistics: Vec>, + pub raft_statistics: Vec>>, pub sim: Arc>, pub pd_client: Arc, } @@ -205,6 +207,8 @@ impl Cluster { pd_client, sst_workers: vec![], sst_workers_map: HashMap::default(), + kv_statistics: vec![], + raft_statistics: vec![], } } @@ -240,12 +244,14 @@ impl Cluster { } fn create_engine(&mut self, router: Option>) { - let (engines, key_manager, dir, sst_worker) = + let (engines, key_manager, dir, sst_worker, kv_statistics, raft_statistics) = create_test_engine(router, self.io_rate_limiter.clone(), &self.cfg); self.dbs.push(engines); self.key_managers.push(key_manager); self.paths.push(dir); 
self.sst_workers.push(sst_worker); + self.kv_statistics.push(kv_statistics); + self.raft_statistics.push(raft_statistics); } pub fn create_engines(&mut self) { @@ -302,7 +308,8 @@ impl Cluster { pub fn compact_data(&self) { for engine in self.engines.values() { let db = &engine.kv; - db.compact_range(CF_DEFAULT, None, None, false, 1).unwrap(); + db.compact_range_cf(CF_DEFAULT, None, None, false, 1) + .unwrap(); } } diff --git a/components/test_raftstore/src/common-test.toml b/components/test_raftstore/src/common-test.toml index a121a6c1e0e..334291f7213 100644 --- a/components/test_raftstore/src/common-test.toml +++ b/components/test_raftstore/src/common-test.toml @@ -34,7 +34,6 @@ scheduler-concurrency = 10 scheduler-worker-pool-size = 1 [storage.block-cache] -shared = true capacity = "64MB" [pd] diff --git a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index ea9868afdbd..0ec60e468ee 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -52,7 +52,6 @@ use tikv::{ import::{ImportSstService, SstImporter}, read_pool::ReadPool, server::{ - create_raft_storage, gc_worker::GcWorker, load_statistics::ThreadLoadPool, lock_manager::LockManager, @@ -66,7 +65,7 @@ use tikv::{ self, kv::{FakeExtension, SnapContext}, txn::flow_controller::{EngineFlowController, FlowController}, - Engine, + Engine, Storage, }, }; use tikv_util::{ @@ -401,8 +400,8 @@ impl ServerCluster { cfg.quota.max_delay_duration, cfg.quota.enable_auto_tune, )); - let extension = engine.raft_extension().clone(); - let store = create_raft_storage::<_, _, _, F, _>( + let extension = engine.raft_extension(); + let store = Storage::<_, _, F>::from_engine( engine, &cfg.storage, storage_read_pool.handle(), @@ -483,6 +482,8 @@ impl ServerCluster { let debug_thread_handle = debug_thread_pool.handle().clone(); let debug_service = DebugService::new( engines.clone(), + None, + None, debug_thread_handle, extension, 
ConfigController::default(), @@ -522,7 +523,7 @@ impl ServerCluster { copr.clone(), copr_v2.clone(), resolver.clone(), - snap_mgr.clone(), + tikv_util::Either::Left(snap_mgr.clone()), gc_worker.clone(), check_leader_scheduler.clone(), self.env.clone(), @@ -794,6 +795,10 @@ impl Cluster { } panic!("failed to get snapshot of region {}", region_id); } + + pub fn raft_extension(&self, node_id: u64) -> SimulateRaftExtension { + self.sim.rl().storages[&node_id].raft_extension() + } } pub fn new_server_cluster(id: u64, count: usize) -> Cluster { diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 64bdca19025..d5c2eefa6d6 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -13,11 +13,10 @@ use collections::HashMap; use encryption_export::{ data_key_manager_from_config, DataKeyManager, FileConfig, MasterKeyConfig, }; -use engine_rocks::{config::BlobRunMode, RocksEngine, RocksSnapshot}; +use engine_rocks::{config::BlobRunMode, RocksEngine, RocksSnapshot, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ - Engines, Iterable, Peekable, RaftEngineDebug, RaftEngineReadOnly, TabletFactory, ALL_CFS, - CF_DEFAULT, CF_RAFT, + Engines, Iterable, Peekable, RaftEngineDebug, RaftEngineReadOnly, ALL_CFS, CF_DEFAULT, CF_RAFT, }; use file_system::IoRateLimiter; use futures::executor::block_on; @@ -576,6 +575,8 @@ pub fn create_test_engine( Option>, TempDir, LazyWorker, + Arc, + Option>, ) { let dir = test_util::temp_dir("test_cluster", cfg.prefer_mem); let mut cfg = cfg.clone(); @@ -594,22 +595,26 @@ pub fn create_test_engine( let sst_worker = LazyWorker::new("sst-recovery"); let scheduler = sst_worker.scheduler(); - let raft_engine = RaftTestEngine::build(&cfg, &env, &key_manager, &cache); + let (raft_engine, raft_statistics) = RaftTestEngine::build(&cfg, &env, &key_manager, &cache); let mut builder = - KvEngineFactoryBuilder::new(env, &cfg, 
dir.path()).sst_recovery_sender(Some(scheduler)); - if let Some(cache) = cache { - builder = builder.block_cache(cache); - } + KvEngineFactoryBuilder::new(env, &cfg, cache).sst_recovery_sender(Some(scheduler)); if let Some(router) = router { builder = builder.compaction_event_sender(Arc::new(RaftRouterCompactedEventSender { router: Mutex::new(router), })); } let factory = builder.build(); - let engine = factory.create_shared_db().unwrap(); + let engine = factory.create_shared_db(dir.path()).unwrap(); let engines = Engines::new(engine, raft_engine); - (engines, key_manager, dir, sst_worker) + ( + engines, + key_manager, + dir, + sst_worker, + factory.rocks_statistics(), + raft_statistics, + ) } pub fn configure_for_request_snapshot(cluster: &mut Cluster) { diff --git a/components/tikv_kv/src/lib.rs b/components/tikv_kv/src/lib.rs index f78b2243331..5af54ee61b6 100644 --- a/components/tikv_kv/src/lib.rs +++ b/components/tikv_kv/src/lib.rs @@ -294,8 +294,8 @@ pub struct SnapContext<'a> { // `key_ranges` is used in replica read. It will send to // the leader via raft "read index" to check memory locks. pub key_ranges: Vec, - // Marks that this read is a FlashbackToVersionReadPhase. - pub for_flashback: bool, + // Marks that this snapshot request is allowed in the flashback state. + pub allowed_in_flashback: bool, } /// Engine defines the common behaviour for a storage engine type. @@ -311,7 +311,7 @@ pub trait Engine: Send + Clone + 'static { type RaftExtension: raft_extension::RaftExtension = FakeExtension; /// Get the underlying raft extension. 
- fn raft_extension(&self) -> &Self::RaftExtension { + fn raft_extension(&self) -> Self::RaftExtension { unimplemented!() } diff --git a/components/tikv_kv/src/mock_engine.rs b/components/tikv_kv/src/mock_engine.rs index dc812e84d93..69a61d58963 100644 --- a/components/tikv_kv/src/mock_engine.rs +++ b/components/tikv_kv/src/mock_engine.rs @@ -154,7 +154,7 @@ impl Engine for MockEngine { } type RaftExtension = ::RaftExtension; - fn raft_extension(&self) -> &Self::RaftExtension { + fn raft_extension(&self) -> Self::RaftExtension { self.base.raft_extension() } diff --git a/components/tikv_kv/src/rocksdb_engine.rs b/components/tikv_kv/src/rocksdb_engine.rs index 26e2c735254..21099974d2d 100644 --- a/components/tikv_kv/src/rocksdb_engine.rs +++ b/components/tikv_kv/src/rocksdb_engine.rs @@ -114,7 +114,6 @@ impl RocksEngine { path: &str, db_opts: Option, cfs_opts: Vec<(CfName, RocksCfOptions)>, - shared_block_cache: bool, io_rate_limiter: Option>, ) -> Result { info!("RocksEngine: creating for path"; "path" => path); @@ -134,11 +133,7 @@ impl RocksEngine { let db = engine_rocks::util::new_engine_opt(&path, db_opts, cfs_opts)?; // It does not use the raft_engine, so it is ok to fill with the same // rocksdb. 
- let mut kv_engine = db.clone(); - let mut raft_engine = db; - kv_engine.set_shared_block_cache(shared_block_cache); - raft_engine.set_shared_block_cache(shared_block_cache); - let engines = Engines::new(kv_engine, raft_engine); + let engines = Engines::new(db.clone(), db); let sched = worker.start("engine-rocksdb", Runner(engines.clone())); Ok(RocksEngine { sched, @@ -238,8 +233,8 @@ impl Engine for RocksEngine { } type RaftExtension = RE; - fn raft_extension(&self) -> &Self::RaftExtension { - &self.ext + fn raft_extension(&self) -> Self::RaftExtension { + self.ext.clone() } fn modify_on_kv_engine(&self, region_modifies: HashMap>) -> Result<()> { diff --git a/components/tikv_util/Cargo.toml b/components/tikv_util/Cargo.toml index 12c3983ef2d..663eb2b681f 100644 --- a/components/tikv_util/Cargo.toml +++ b/components/tikv_util/Cargo.toml @@ -50,7 +50,7 @@ slog-async = "2.3" slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global.git", rev = "d592f88e4dbba5eb439998463054f1a44fbf17b9" } slog-json = "2.3" slog-term = "2.4" -sysinfo = "0.16" +sysinfo = "0.26" thiserror = "1.0" tikv_alloc = { workspace = true } time = "0.1" diff --git a/components/tikv_util/src/lib.rs b/components/tikv_util/src/lib.rs index 9421c0e174b..9b13250fe1e 100644 --- a/components/tikv_util/src/lib.rs +++ b/components/tikv_util/src/lib.rs @@ -4,6 +4,7 @@ #![feature(thread_id_value)] #![feature(box_patterns)] #![feature(vec_into_raw_parts)] +#![feature(let_chains)] #[cfg(test)] extern crate test; diff --git a/components/tikv_util/src/sys/mod.rs b/components/tikv_util/src/sys/mod.rs index 8b5e846592f..49e6812b81f 100644 --- a/components/tikv_util/src/sys/mod.rs +++ b/components/tikv_util/src/sys/mod.rs @@ -9,9 +9,10 @@ pub mod ioload; pub mod thread; // re-export some traits for ease of use -#[cfg(target_os = "linux")] -use std::path::PathBuf; -use std::sync::atomic::{AtomicU64, Ordering}; +use std::{ + path::Path, + sync::atomic::{AtomicU64, Ordering}, +}; use 
fail::fail_point; #[cfg(target_os = "linux")] @@ -19,7 +20,7 @@ use lazy_static::lazy_static; #[cfg(target_os = "linux")] use mnt::get_mount; use sysinfo::RefreshKind; -pub use sysinfo::{DiskExt, NetworkExt, ProcessExt, ProcessorExt, SystemExt}; +pub use sysinfo::{CpuExt, DiskExt, NetworkExt, ProcessExt, SystemExt}; use crate::config::{ReadableSize, KIB}; @@ -92,7 +93,7 @@ impl SysQuota { fn sysinfo_memory_limit_in_bytes() -> u64 { let system = sysinfo::System::new_with_specifics(RefreshKind::new().with_memory()); - system.get_total_memory() * KIB + system.total_memory() * KIB } } @@ -162,13 +163,13 @@ pub fn cache_line_size(level: usize) -> Option { } #[cfg(target_os = "linux")] -pub fn path_in_diff_mount_point(path1: &str, path2: &str) -> bool { - if path1.is_empty() || path2.is_empty() { +pub fn path_in_diff_mount_point(path1: impl AsRef, path2: impl AsRef) -> bool { + let (path1, path2) = (path1.as_ref(), path2.as_ref()); + let empty_path = |p: &Path| p.to_str().map_or(false, |s| s.is_empty()); + if empty_path(path1) || empty_path(path2) { return false; } - let path1 = PathBuf::from(path1); - let path2 = PathBuf::from(path2); - match (get_mount(&path1), get_mount(&path2)) { + match (get_mount(path1), get_mount(path2)) { (Err(e1), _) => { warn!("Get mount point error for path {}, {}", path1.display(), e1); false @@ -190,7 +191,7 @@ pub fn path_in_diff_mount_point(path1: &str, path2: &str) -> bool { } #[cfg(not(target_os = "linux"))] -pub fn path_in_diff_mount_point(_path1: &str, _path2: &str) -> bool { +pub fn path_in_diff_mount_point(_path1: impl AsRef, _path2: impl AsRef) -> bool { false } diff --git a/components/tikv_util/src/worker/pool.rs b/components/tikv_util/src/worker/pool.rs index ba4b1e27f41..e761fac8bb5 100644 --- a/components/tikv_util/src/worker/pool.rs +++ b/components/tikv_util/src/worker/pool.rs @@ -376,8 +376,11 @@ impl Worker { let mut interval = GLOBAL_TIMER_HANDLE .interval(std::time::Instant::now(), interval) .compat(); + let stop = 
self.stop.clone(); self.remote.spawn(async move { - while let Some(Ok(_)) = interval.next().await { + while !stop.load(Ordering::Relaxed) + && let Some(Ok(_)) = interval.next().await + { func(); } }); @@ -391,8 +394,11 @@ impl Worker { let mut interval = GLOBAL_TIMER_HANDLE .interval(std::time::Instant::now(), interval) .compat(); + let stop = self.stop.clone(); self.remote.spawn(async move { - while let Some(Ok(_)) = interval.next().await { + while !stop.load(Ordering::Relaxed) + && let Some(Ok(_)) = interval.next().await + { let fut = func(); fut.await; } diff --git a/components/txn_types/src/types.rs b/components/txn_types/src/types.rs index 6a2c953afc1..60e64bf444a 100644 --- a/components/txn_types/src/types.rs +++ b/components/txn_types/src/types.rs @@ -532,8 +532,8 @@ pub struct TxnExtra { // Marks that this transaction is a 1PC transaction. RaftKv should set this flag // in the raft command request. pub one_pc: bool, - // Marks that this transaction is a flashback transaction. - pub for_flashback: bool, + // Marks that this transaction is allowed in the flashback state. 
+ pub allowed_in_flashback: bool, } impl TxnExtra { diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index e0593948329..32ac9122f65 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -23,9 +23,9 @@ use raftstore::{ coprocessor::{ AdminObserver, ApplyCtxInfo, ApplySnapshotObserver, BoxAdminObserver, BoxApplySnapshotObserver, BoxPdTaskObserver, BoxQueryObserver, BoxRegionChangeObserver, - BoxUpdateSafeTsObserver, Cmd, Coprocessor, CoprocessorHost, ObserverContext, - PdTaskObserver, PeerCreateEvent, QueryObserver, RegionChangeEvent, RegionChangeObserver, - RegionState, StoreSizeInfo, UpdateSafeTsObserver, + BoxRoleObserver, BoxUpdateSafeTsObserver, Cmd, Coprocessor, CoprocessorHost, + ObserverContext, PdTaskObserver, QueryObserver, RegionChangeEvent, RegionChangeObserver, + RegionState, RoleChange, RoleObserver, StoreSizeInfo, UpdateSafeTsObserver, }, store::{ self, check_sst_for_ingestion, @@ -163,6 +163,13 @@ impl TiFlashObserver { // fast path not enabled return false; } + let inner_msg = msg.get_message(); + if inner_msg.get_commit() == 0 && inner_msg.get_msg_type() == MessageType::MsgHeartbeat { + return false; + } else if inner_msg.get_msg_type() == MessageType::MsgAppend { + } else { + return false; + } // TODO We don't need to recover all region infomation from restart, // since we have `has_already_inited`. 
let inner_msg = msg.get_message(); @@ -697,6 +704,10 @@ impl TiFlashObserver { TIFLASH_OBSERVER_PRIORITY, BoxUpdateSafeTsObserver::new(self.clone()), ); + coprocessor_host.registry.register_role_observer( + TIFLASH_OBSERVER_PRIORITY, + BoxRoleObserver::new(self.clone()), + ); } fn handle_ingest_sst_for_engine_store( @@ -1183,37 +1194,7 @@ impl RegionChangeObserver for TiFlashObs } fn should_skip_raft_message(&self, msg: &RaftMessage) -> bool { - let inner_msg = msg.get_message(); - if inner_msg.get_commit() == 0 && inner_msg.get_msg_type() == MessageType::MsgHeartbeat { - } else if inner_msg.get_msg_type() == MessageType::MsgAppend { - return self.maybe_fast_path(&msg); - } - false - } - - fn on_peer_created(&self, region_id: u64, peer_id: u64, event: PeerCreateEvent) { - if event == PeerCreateEvent::Replicate { - let f = |info: MapEntry>| match info { - MapEntry::Occupied(mut o) => { - o.get_mut() - .replicated_or_created - .store(true, Ordering::SeqCst); - } - MapEntry::Vacant(v) => { - let c = CachedRegionInfo::default(); - c.replicated_or_created.store(true, Ordering::SeqCst); - v.insert(Arc::new(c)); - } - }; - info!("fast path: ongoing {}:{} {}, peer created", - self.store_id, region_id, peer_id; - "region_id" => region_id, - ); - // TODO remove unwrap - self.get_cached_manager() - .access_cached_region_info_mut(region_id, f) - .unwrap(); - } + self.maybe_fast_path(&msg) } } @@ -1565,3 +1546,42 @@ impl ApplySnapshotObserver for TiFlashOb true } } + +impl RoleObserver for TiFlashObserver { + fn on_role_change(&self, ctx: &mut ObserverContext<'_>, r: &RoleChange) { + let region_id = ctx.region().get_id(); + let is_replicated = !r.initialized; + let f = |info: MapEntry>| match info { + MapEntry::Occupied(mut o) => { + // Note the region info may be registered by maybe_fast_path + info!("fast path: ongoing {}:{} {}, peer created", + self.store_id, region_id, 0; + "region_id" => region_id, + "is_replicated" => is_replicated, + ); + if is_replicated { + o.get_mut() 
+ .replicated_or_created + .store(true, Ordering::SeqCst); + } + } + MapEntry::Vacant(v) => { + // TODO support peer_id + info!("fast path: ongoing {}:{} {}, peer created", + self.store_id, region_id, 0; + "region_id" => region_id, + "is_replicated" => is_replicated, + ); + if is_replicated { + let c = CachedRegionInfo::default(); + c.replicated_or_created.store(true, Ordering::SeqCst); + v.insert(Arc::new(c)); + } + } + }; + // TODO remove unwrap + self.get_cached_manager() + .access_cached_region_info_mut(region_id, f) + .unwrap(); + } +} diff --git a/engine_store_ffi/src/ps_engine.rs b/engine_store_ffi/src/ps_engine.rs index 6be0ab41657..6ce5d2fb17d 100644 --- a/engine_store_ffi/src/ps_engine.rs +++ b/engine_store_ffi/src/ps_engine.rs @@ -10,7 +10,7 @@ use std::{ use engine_traits::{ Error, PerfContext, PerfContextExt, PerfContextKind, PerfLevel, RaftEngine, RaftEngineDebug, - RaftEngineReadOnly, RaftLogBatch, RaftLogGcTask, Result, + RaftEngineReadOnly, RaftLogBatch, Result, }; use kvproto::{ metapb::Region, @@ -147,7 +147,8 @@ impl RaftLogBatch for PSEngineWriteBatch { fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64) { // This function is used to clean entries that will be overwritten - // later. TODO: make sure overlapped entries will be overwritten + // later. + // TODO: make sure overlapped entries will be overwritten // by newer log. 
for index in from..to { // let key = ps_raft_log_key(raft_group_id, index); // self.del_page(&key).unwrap(); @@ -185,13 +186,37 @@ impl RaftLogBatch for PSEngineWriteBatch { self.del_page(keys::PREPARE_BOOTSTRAP_KEY) } - fn put_region_state(&mut self, raft_group_id: u64, state: &RegionLocalState) -> Result<()> { + fn put_region_state( + &mut self, + raft_group_id: u64, + _apply_index: u64, + state: &RegionLocalState, + ) -> Result<()> { self.put_msg(&keys::region_state_key(raft_group_id), state) } - fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()> { + fn put_apply_state( + &mut self, + raft_group_id: u64, + _apply_index: u64, + state: &RaftApplyState, + ) -> Result<()> { self.put_msg(&keys::apply_state_key(raft_group_id), state) } + + fn put_flushed_index( + &mut self, + _raft_group_id: u64, + _cf: &str, + _tablet_index: u64, + _apply_index: u64, + ) -> Result<()> { + panic!() + } + + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { + self.put_msg(keys::RECOVER_STATE_KEY, state) + } } #[derive(Clone)] @@ -279,7 +304,13 @@ impl PSEngine { Ok(()) } - fn gc_impl(&self, raft_group_id: u64, mut from: u64, to: u64) -> Result { + fn gc_impl( + &self, + raft_group_id: u64, + mut from: u64, + to: u64, + raft_wb: &mut PSEngineWriteBatch, + ) -> Result { if from == 0 { let start_key = keys::raft_log_key(raft_group_id, 0); let prefix = keys::raft_log_prefix(raft_group_id); @@ -372,12 +403,20 @@ impl RaftEngineReadOnly for PSEngine { self.get_msg_cf(keys::PREPARE_BOOTSTRAP_KEY) } - fn get_region_state(&self, raft_group_id: u64) -> Result> { + fn get_region_state( + &self, + _apply_index: u64, + raft_group_id: u64, + ) -> Result> { let key = keys::region_state_key(raft_group_id); self.get_msg_cf(&key) } - fn get_apply_state(&self, raft_group_id: u64) -> Result> { + fn get_apply_state( + &self, + _apply_index: u64, + raft_group_id: u64, + ) -> Result> { let key = keys::apply_state_key(raft_group_id); 
self.get_msg_cf(&key) } @@ -385,6 +424,10 @@ impl RaftEngineReadOnly for PSEngine { fn get_recover_state(&self) -> Result> { self.get_msg_cf(keys::RECOVER_STATE_KEY) } + + fn get_flushed_index(&self, _raft_group_id: u64, _cf: &str) -> Result> { + panic!() + } } impl RaftEngineDebug for PSEngine { @@ -474,39 +517,22 @@ impl RaftEngine for PSEngine { Ok(()) } - fn append(&self, raft_group_id: u64, entries: Vec) -> Result { - let mut wb = self.log_batch(0); - if let Some(max_size) = entries.iter().map(|e| e.compute_size()).max() { - let buf = Vec::with_capacity(max_size as usize); - wb.append_impl(raft_group_id, &entries, buf)?; - return self.consume(&mut wb, false); - } - Ok(0) - } - - fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { - let mut wb = self.log_batch(0); - wb.put_msg(&keys::raft_state_key(raft_group_id), state)?; - self.consume(&mut wb, false)?; + fn gc(&self, raft_group_id: u64, from: u64, to: u64, batch: &mut Self::LogBatch) -> Result<()> { + self.gc_impl(raft_group_id, from, to, batch)?; Ok(()) } - fn gc(&self, raft_group_id: u64, from: u64, to: u64) -> Result { - self.gc_impl(raft_group_id, from, to) - } - - fn batch_gc(&self, groups: Vec) -> Result { - let mut total = 0; - for task in groups { - total += self.gc(task.raft_group_id, task.from, task.to)?; - } - Ok(total) + fn delete_all_but_one_states_before( + &self, + _raft_group_id: u64, + _apply_index: u64, + _batch: &mut Self::LogBatch, + ) -> Result<()> { + panic!() } fn flush_metrics(&self, instance: &str) {} - fn reset_statistics(&self) {} - fn dump_stats(&self) -> Result { Ok(String::from("")) } @@ -519,13 +545,6 @@ impl RaftEngine for PSEngine { Ok(0) } - fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { - let mut wb = self.log_batch(0); - wb.put_msg(keys::STORE_IDENT_KEY, ident)?; - self.consume(&mut wb, false)?; - Ok(()) - } - fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> where F: FnMut(u64) -> std::result::Result<(), 
E>, @@ -553,13 +572,6 @@ impl RaftEngine for PSEngine { Some(e) => Err(e), } } - - fn put_recover_state(&self, state: &StoreRecoverState) -> Result<()> { - let mut wb = self.log_batch(0); - wb.put_msg(keys::RECOVER_STATE_KEY, state)?; - self.consume(&mut wb, false)?; - Ok(()) - } } impl PerfContextExt for PSEngine { diff --git a/engine_tiflash/src/compact.rs b/engine_tiflash/src/compact.rs index b9e3e5fe558..199b7d9f3be 100644 --- a/engine_tiflash/src/compact.rs +++ b/engine_tiflash/src/compact.rs @@ -24,7 +24,7 @@ impl CompactExt for RocksEngine { Ok(false) } - fn compact_range( + fn compact_range_cf( &self, cf: &str, start_key: Option<&[u8]>, @@ -43,18 +43,6 @@ impl CompactExt for RocksEngine { Ok(()) } - fn compact_files_in_range( - &self, - start: Option<&[u8]>, - end: Option<&[u8]>, - output_level: Option, - ) -> Result<()> { - for cf_name in self.cf_names() { - self.compact_files_in_range_cf(cf_name, start, end, output_level)?; - } - Ok(()) - } - fn compact_files_in_range_cf( &self, cf: &str, diff --git a/engine_tiflash/src/db_options.rs b/engine_tiflash/src/db_options.rs index f4044c44449..f437cc7b433 100644 --- a/engine_tiflash/src/db_options.rs +++ b/engine_tiflash/src/db_options.rs @@ -66,23 +66,29 @@ impl DbOptions for RocksDbOptions { } fn get_rate_bytes_per_sec(&self) -> Option { - self.0.get_rate_bytes_per_sec() + self.0.get_rate_limiter().map(|r| r.get_bytes_per_second()) } fn set_rate_bytes_per_sec(&mut self, rate_bytes_per_sec: i64) -> Result<()> { - self.0 - .set_rate_bytes_per_sec(rate_bytes_per_sec) - .map_err(|e| box_err!(e)) + if let Some(r) = self.0.get_rate_limiter() { + r.set_bytes_per_second(rate_bytes_per_sec); + } else { + return Err(box_err!("rate limiter not found")); + } + Ok(()) } fn get_rate_limiter_auto_tuned(&self) -> Option { - self.0.get_auto_tuned() + self.0.get_rate_limiter().map(|r| r.get_auto_tuned()) } fn set_rate_limiter_auto_tuned(&mut self, rate_limiter_auto_tuned: bool) -> Result<()> { - self.0 - 
.set_auto_tuned(rate_limiter_auto_tuned) - .map_err(|e| box_err!(e)) + if let Some(r) = self.0.get_rate_limiter() { + r.set_auto_tuned(rate_limiter_auto_tuned); + } else { + return Err(box_err!("rate limiter not found")); + } + Ok(()) } fn set_titandb_options(&mut self, opts: &Self::TitanDbOptions) { diff --git a/engine_tiflash/src/engine.rs b/engine_tiflash/src/engine.rs index 5dba0cbfc30..44dc6566353 100644 --- a/engine_tiflash/src/engine.rs +++ b/engine_tiflash/src/engine.rs @@ -171,10 +171,6 @@ impl RocksEngine { fs::read_dir(&path).unwrap().next().is_some() } - pub fn set_shared_block_cache(&mut self, enable: bool) { - self.rocks.set_shared_block_cache(enable); - } - pub fn support_multi_batch_write(&self) -> bool { self.rocks.support_multi_batch_write() } @@ -195,10 +191,6 @@ impl KvEngine for RocksEngine { self.rocks.flush_metrics(instance); } - fn reset_statistics(&self) { - self.rocks.reset_statistics(); - } - fn bad_downcast(&self) -> &T { self.rocks.bad_downcast() } diff --git a/engine_tiflash/src/lib.rs b/engine_tiflash/src/lib.rs index 022418a58ae..5911b442e20 100644 --- a/engine_tiflash/src/lib.rs +++ b/engine_tiflash/src/lib.rs @@ -16,6 +16,8 @@ //! Please read the engine_trait crate docs before hacking. 
#![allow(dead_code)] #![cfg_attr(test, feature(test))] +#![feature(option_get_or_insert_default)] +#![feature(let_chains)] #[allow(unused_extern_crates)] extern crate tikv_alloc; @@ -111,7 +113,10 @@ pub mod file_system; mod raft_engine; -pub use rocksdb::{set_perf_flags, set_perf_level, PerfContext, PerfFlag, PerfFlags, PerfLevel}; +pub use rocksdb::{ + set_perf_flags, set_perf_level, PerfContext, PerfFlag, PerfFlags, PerfLevel, + Statistics as RocksStatistics, +}; pub mod flow_control_factors; pub use flow_control_factors::*; diff --git a/engine_tiflash/src/misc.rs b/engine_tiflash/src/misc.rs index e7c9ef547d8..0393a96bd02 100644 --- a/engine_tiflash/src/misc.rs +++ b/engine_tiflash/src/misc.rs @@ -7,8 +7,8 @@ use engine_traits::{ use tikv_util::{box_try, keybuilder::KeyBuilder}; use crate::{ - engine::RocksEngine, r2e, rocks_metrics_defs::*, sst::RocksSstWriterBuilder, util, - RocksSstWriter, + engine::RocksEngine, r2e, rocks_metrics::RocksStatisticsReporter, rocks_metrics_defs::*, + sst::RocksSstWriterBuilder, util, RocksSstWriter, }; pub const MAX_DELETE_COUNT_BY_KEY: usize = 2048; @@ -125,11 +125,18 @@ impl RocksEngine { } impl MiscExt for RocksEngine { - fn flush_cfs(&self, wait: bool) -> Result<()> { + type StatisticsReporter = RocksStatisticsReporter; + + fn flush_cfs(&self, cfs: &[&str], wait: bool) -> Result<()> { let mut handles = vec![]; - for cf in self.cf_names() { + for cf in cfs { handles.push(util::get_cf_handle(self.as_inner(), cf)?); } + if handles.is_empty() { + for cf in self.cf_names() { + handles.push(util::get_cf_handle(self.as_inner(), cf)?); + } + } self.as_inner().flush_cfs(&handles, wait).map_err(r2e) } @@ -214,6 +221,24 @@ impl MiscExt for RocksEngine { Ok(false) } + fn get_sst_key_ranges(&self, cf: &str, level: usize) -> Result, Vec)>> { + let handle = util::get_cf_handle(self.as_inner(), cf)?; + let ret = self + .as_inner() + .get_column_family_meta_data(handle) + .get_level(level) + .get_files() + .iter() + .map(|sst_meta| { + ( + 
sst_meta.get_smallestkey().to_vec(), + sst_meta.get_largestkey().to_vec(), + ) + }) + .collect(); + Ok(ret) + } + fn get_engine_used_size(&self) -> Result { let mut used_size: u64 = 0; for cf in ALL_CFS { @@ -231,10 +256,20 @@ impl MiscExt for RocksEngine { self.as_inner().sync_wal().map_err(r2e) } + fn pause_background_work(&self) -> Result<()> { + self.as_inner().pause_bg_work(); + Ok(()) + } + fn exists(path: &str) -> bool { crate::util::db_exist(path) } + fn locked(path: &str) -> Result { + let env = rocksdb::Env::default(); + env.is_db_locked(path).map_err(r2e) + } + fn dump_stats(&self) -> Result { const ROCKSDB_DB_STATS_KEY: &str = "rocksdb.dbstats"; const ROCKSDB_CF_STATS_KEY: &str = "rocksdb.cfstats"; @@ -255,11 +290,6 @@ impl MiscExt for RocksEngine { s.extend_from_slice(v.as_bytes()); } - // more stats if enable_statistics is true. - if let Some(v) = self.as_inner().get_statistics() { - s.extend_from_slice(v.as_bytes()); - } - Ok(box_try!(String::from_utf8(s))) } diff --git a/engine_tiflash/src/ps_write_batch.rs b/engine_tiflash/src/ps_write_batch.rs index e76a0daa468..b7e895fb29b 100644 --- a/engine_tiflash/src/ps_write_batch.rs +++ b/engine_tiflash/src/ps_write_batch.rs @@ -5,10 +5,7 @@ use std::sync::Arc; use engine_traits::{self, Mutable, Result, WriteBatchExt, WriteOptions}; use rocksdb::{Writable, WriteBatch as RawWriteBatch, DB}; -use crate::{ - engine::RocksEngine, options::RocksWriteOptions, r2e, util::get_cf_handle, FFIHubInner, - RawPSWriteBatchWrapper, -}; +use crate::{engine::RocksEngine, r2e, FFIHubInner, RawPSWriteBatchWrapper}; const WRITE_BATCH_MAX_BATCH: usize = 16; const WRITE_BATCH_LIMIT: usize = 16; diff --git a/engine_tiflash/src/raft_engine.rs b/engine_tiflash/src/raft_engine.rs index dce56148ae6..b41064058ec 100644 --- a/engine_tiflash/src/raft_engine.rs +++ b/engine_tiflash/src/raft_engine.rs @@ -3,8 +3,8 @@ // #[PerformanceCriticalPath] use engine_traits::{ Error, Iterable, KvEngine, MiscExt, Mutable, Peekable, RaftEngine, 
RaftEngineDebug, - RaftEngineReadOnly, RaftLogBatch, RaftLogGcTask, Result, SyncMutable, WriteBatch, - WriteBatchExt, WriteOptions, CF_DEFAULT, RAFT_LOG_MULTI_GET_CNT, + RaftEngineReadOnly, RaftLogBatch, Result, WriteBatch, WriteBatchExt, WriteOptions, CF_DEFAULT, + RAFT_LOG_MULTI_GET_CNT, }; use kvproto::{ metapb::Region, @@ -144,14 +144,26 @@ impl RaftEngineReadOnly for RocksEngine { self.get_msg_cf(CF_DEFAULT, keys::PREPARE_BOOTSTRAP_KEY) } - fn get_region_state(&self, raft_group_id: u64) -> Result> { - let key = keys::region_state_key(raft_group_id); - self.get_msg_cf(CF_DEFAULT, &key) + // Following methods are used by raftstore v2 only, which always use raft log + // engine. + fn get_region_state( + &self, + _raft_group_id: u64, + _apply_index: u64, + ) -> Result> { + panic!() } - fn get_apply_state(&self, raft_group_id: u64) -> Result> { - let key = keys::apply_state_key(raft_group_id); - self.get_msg_cf(CF_DEFAULT, &key) + fn get_apply_state( + &self, + _raft_group_id: u64, + _apply_index: u64, + ) -> Result> { + panic!() + } + + fn get_flushed_index(&self, _raft_group_id: u64, _cf: &str) -> Result> { + panic!() } fn get_recover_state(&self) -> Result> { @@ -290,48 +302,24 @@ impl RaftEngine for RocksEngine { Ok(()) } - fn append(&self, raft_group_id: u64, entries: Vec) -> Result { - let mut wb = self.write_batch(); - let buf = Vec::with_capacity(1024); - wb.append_impl(raft_group_id, &entries, buf)?; - self.consume(&mut wb, false) - } - - fn put_raft_state(&self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { - self.put_msg(&keys::raft_state_key(raft_group_id), state) - } - - fn batch_gc(&self, groups: Vec) -> Result { - let mut total = 0; - let mut raft_wb = self.write_batch_with_cap(4 * 1024); - for task in groups { - total += self.gc_impl(task.raft_group_id, task.from, task.to, &mut raft_wb)?; - } - // TODO: disable WAL here. 
- if !WriteBatch::is_empty(&raft_wb) { - raft_wb.write()?; - } - Ok(total) + fn gc(&self, raft_group_id: u64, from: u64, to: u64, batch: &mut Self::LogBatch) -> Result<()> { + self.gc_impl(raft_group_id, from, to, batch)?; + Ok(()) } - fn gc(&self, raft_group_id: u64, from: u64, to: u64) -> Result { - let mut raft_wb = self.write_batch_with_cap(1024); - let total = self.gc_impl(raft_group_id, from, to, &mut raft_wb)?; - // TODO: disable WAL here. - if !WriteBatch::is_empty(&raft_wb) { - raft_wb.write()?; - } - Ok(total) + fn delete_all_but_one_states_before( + &self, + _raft_group_id: u64, + _apply_index: u64, + _batch: &mut Self::LogBatch, + ) -> Result<()> { + panic!() } fn flush_metrics(&self, instance: &str) { KvEngine::flush_metrics(self, instance) } - fn reset_statistics(&self) { - KvEngine::reset_statistics(self) - } - fn dump_stats(&self) -> Result { MiscExt::dump_stats(self) } @@ -347,10 +335,6 @@ impl RaftEngine for RocksEngine { self.as_inner().path() } - fn put_store_ident(&self, ident: &StoreIdent) -> Result<()> { - self.put_msg(keys::STORE_IDENT_KEY, ident) - } - fn for_each_raft_group(&self, f: &mut F) -> std::result::Result<(), E> where F: FnMut(u64) -> std::result::Result<(), E>, @@ -378,10 +362,6 @@ impl RaftEngine for RocksEngine { Some(e) => Err(e), } } - - fn put_recover_state(&self, state: &StoreRecoverState) -> Result<()> { - self.put_msg(keys::RECOVER_STATE_KEY, state) - } } impl RaftLogBatch for RocksWriteBatchVec { @@ -428,12 +408,38 @@ impl RaftLogBatch for RocksWriteBatchVec { self.delete(keys::PREPARE_BOOTSTRAP_KEY) } - fn put_region_state(&mut self, raft_group_id: u64, state: &RegionLocalState) -> Result<()> { - self.put_msg(&keys::region_state_key(raft_group_id), state) + // Following methods are used by raftstore v2 only, which always use raft log + // engine. 
+ fn put_region_state( + &mut self, + _raft_group_id: u64, + _apply_index: u64, + _state: &RegionLocalState, + ) -> Result<()> { + panic!() + } + + fn put_apply_state( + &mut self, + _raft_group_id: u64, + _apply_index: u64, + _state: &RaftApplyState, + ) -> Result<()> { + panic!() } - fn put_apply_state(&mut self, raft_group_id: u64, state: &RaftApplyState) -> Result<()> { - self.put_msg(&keys::apply_state_key(raft_group_id), state) + fn put_flushed_index( + &mut self, + _raft_group_id: u64, + _cf: &str, + _tablet_index: u64, + _apply_index: u64, + ) -> Result<()> { + panic!() + } + + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { + self.put_msg(keys::RECOVER_STATE_KEY, state) } } diff --git a/engine_tiflash/src/rocks_metrics.rs b/engine_tiflash/src/rocks_metrics.rs index 4a88c6675ed..4e9425be2dc 100644 --- a/engine_tiflash/src/rocks_metrics.rs +++ b/engine_tiflash/src/rocks_metrics.rs @@ -1,6 +1,6 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. - -use engine_traits::CF_DEFAULT; +use collections::HashMap; +use engine_traits::{StatisticsReporter, CF_DEFAULT}; use lazy_static::lazy_static; use prometheus::*; use prometheus_static_metric::*; @@ -8,7 +8,7 @@ use rocksdb::{ DBStatisticsHistogramType as HistType, DBStatisticsTickerType as TickerType, HistogramData, DB, }; -use crate::rocks_metrics_defs::*; +use crate::{engine::RocksEngine, rocks_metrics_defs::*}; make_auto_flush_static_metric! 
{ pub label_enum TickerName { @@ -581,12 +581,6 @@ pub fn flush_engine_ticker_metrics(t: TickerType, value: u64, name: &str) { .discardable .inc_by(value); } - TickerType::TitanGcSample => { - STORE_ENGINE_BLOB_GC_ACTION - .get(name_enum) - .sample - .inc_by(value); - } TickerType::TitanGcSmallFile => { STORE_ENGINE_BLOB_GC_ACTION .get(name_enum) @@ -1121,6 +1115,331 @@ pub fn flush_engine_properties(engine: &DB, name: &str, shared_block_cache: bool } } +#[derive(Default, Clone)] +struct CfLevelStats { + num_files: Option, + // sum(compression_ratio_i * num_files_i) + weighted_compression_ratio: Option, + num_blob_files: Option, +} + +#[derive(Default)] +struct CfStats { + used_size: Option, + blob_cache_size: Option, + readers_mem: Option, + mem_tables: Option, + num_keys: Option, + pending_compaction_bytes: Option, + num_immutable_mem_table: Option, + live_blob_size: Option, + num_live_blob_file: Option, + num_obsolete_blob_file: Option, + live_blob_file_size: Option, + obsolete_blob_file_size: Option, + blob_file_discardable_ratio_le0: Option, + blob_file_discardable_ratio_le20: Option, + blob_file_discardable_ratio_le50: Option, + blob_file_discardable_ratio_le80: Option, + blob_file_discardable_ratio_le100: Option, + levels: Vec, +} + +#[derive(Default)] +struct DbStats { + num_snapshots: Option, + oldest_snapshot_time: Option, + block_cache_size: Option, + stall_num: Option<[u64; ROCKSDB_IOSTALL_KEY.len()]>, +} + +pub struct RocksStatisticsReporter { + name: String, + db_stats: DbStats, + cf_stats: HashMap, +} + +impl StatisticsReporter for RocksStatisticsReporter { + fn new(name: &str) -> Self { + Self { + name: name.to_owned(), + db_stats: DbStats::default(), + cf_stats: HashMap::default(), + } + } + + fn collect(&mut self, engine: &RocksEngine) { + let db = engine.as_inner(); + for cf in db.cf_names() { + let cf_stats = self.cf_stats.entry(cf.to_owned()).or_default(); + let handle = crate::util::get_cf_handle(db, cf).unwrap(); + // It is important to 
monitor each cf's size, especially the "raft" and "lock" + // column families. + *cf_stats.used_size.get_or_insert_default() += + crate::util::get_engine_cf_used_size(db, handle); + *cf_stats.blob_cache_size.get_or_insert_default() += db.get_blob_cache_usage_cf(handle); + // TODO: find a better place to record these metrics. + // Refer: https://github.com/facebook/rocksdb/wiki/Memory-usage-in-RocksDB + // For index and filter blocks memory + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TABLE_READERS_MEM) { + *cf_stats.readers_mem.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_CUR_SIZE_ALL_MEM_TABLES) { + *cf_stats.mem_tables.get_or_insert_default() += v; + } + // TODO: add cache usage and pinned usage. + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_ESTIMATE_NUM_KEYS) { + *cf_stats.num_keys.get_or_insert_default() += v; + } + if let Some(v) = crate::util::get_cf_pending_compaction_bytes(db, handle) { + *cf_stats.pending_compaction_bytes.get_or_insert_default() += v; + } + if let Some(v) = crate::util::get_cf_num_immutable_mem_table(db, handle) { + *cf_stats.num_immutable_mem_table.get_or_insert_default() += v; + } + // Titan. 
+ if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_LIVE_BLOB_SIZE) { + *cf_stats.live_blob_size.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_NUM_LIVE_BLOB_FILE) { + *cf_stats.num_live_blob_file.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_NUM_OBSOLETE_BLOB_FILE) + { + *cf_stats.num_obsolete_blob_file.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_LIVE_BLOB_FILE_SIZE) { + *cf_stats.live_blob_file_size.get_or_insert_default() += v; + } + if let Some(v) = db.get_property_int_cf(handle, ROCKSDB_TITANDB_OBSOLETE_BLOB_FILE_SIZE) + { + *cf_stats.obsolete_blob_file_size.get_or_insert_default() += v; + } + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE0_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le0 + .get_or_insert_default() += v; + } + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE20_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le20 + .get_or_insert_default() += v; + } + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE50_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le50 + .get_or_insert_default() += v; + } + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE80_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le80 + .get_or_insert_default() += v; + } + if let Some(v) = + db.get_property_int_cf(handle, ROCKSDB_TITANDB_DISCARDABLE_RATIO_LE100_FILE) + { + *cf_stats + .blob_file_discardable_ratio_le100 + .get_or_insert_default() += v; + } + // Level stats. 
+ let opts = db.get_options_cf(handle); + if cf_stats.levels.len() < opts.get_num_levels() { + cf_stats + .levels + .resize(opts.get_num_levels(), CfLevelStats::default()); + } + for level in 0..opts.get_num_levels() { + if let Some(num_files) = crate::util::get_cf_num_files_at_level(db, handle, level) { + *cf_stats.levels[level].num_files.get_or_insert_default() += num_files; + if let Some(ratio) = + crate::util::get_engine_compression_ratio_at_level(db, handle, level) + { + *cf_stats.levels[level] + .weighted_compression_ratio + .get_or_insert_default() += num_files as f64 * ratio; + } + } + if let Some(v) = crate::util::get_cf_num_blob_files_at_level(db, handle, level) { + *cf_stats.levels[level] + .num_blob_files + .get_or_insert_default() += v; + } + } + + if let Some(info) = db.get_map_property_cf(handle, ROCKSDB_CFSTATS) { + let stall_num = self.db_stats.stall_num.get_or_insert_default(); + for (key, val) in ROCKSDB_IOSTALL_KEY.iter().zip(stall_num) { + *val += info.get_property_int_value(key); + } + } + } + + // For snapshot + *self.db_stats.num_snapshots.get_or_insert_default() += + db.get_property_int(ROCKSDB_NUM_SNAPSHOTS).unwrap_or(0); + let oldest_snapshot_time = + db.get_property_int(ROCKSDB_OLDEST_SNAPSHOT_TIME) + .map_or(0, |t| { + let now = time::get_time().sec as u64; + // RocksDB returns 0 if no snapshots. + if t > 0 && now > t { now - t } else { 0 } + }); + if oldest_snapshot_time > self.db_stats.oldest_snapshot_time.unwrap_or(0) { + *self.db_stats.oldest_snapshot_time.get_or_insert_default() = oldest_snapshot_time; + } + + // Since block cache is shared, getting cache size from any CF/DB is fine. Here + // we get from default CF. 
+ if self.db_stats.block_cache_size.is_none() { + let handle = crate::util::get_cf_handle(db, CF_DEFAULT).unwrap(); + *self.db_stats.block_cache_size.get_or_insert_default() = + db.get_block_cache_usage_cf(handle); + } + } + + fn flush(&mut self) { + for (cf, cf_stats) in &self.cf_stats { + if let Some(v) = cf_stats.used_size { + STORE_ENGINE_SIZE_GAUGE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_cache_size { + STORE_ENGINE_BLOB_CACHE_USAGE_GAUGE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.readers_mem { + STORE_ENGINE_MEMORY_GAUGE_VEC + .with_label_values(&[&self.name, cf, "readers-mem"]) + .set(v as i64); + } + if let Some(v) = cf_stats.mem_tables { + STORE_ENGINE_MEMORY_GAUGE_VEC + .with_label_values(&[&self.name, cf, "mem-tables"]) + .set(v as i64); + } + if let Some(v) = cf_stats.num_keys { + STORE_ENGINE_ESTIMATE_NUM_KEYS_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.pending_compaction_bytes { + STORE_ENGINE_PENDING_COMPACTION_BYTES_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + for (level, level_stats) in cf_stats.levels.iter().enumerate() { + if let Some(num_files) = level_stats.num_files { + STORE_ENGINE_NUM_FILES_AT_LEVEL_VEC + .with_label_values(&[&self.name, cf, &level.to_string()]) + .set(num_files as i64); + if num_files > 0 && let Some(ratio) = level_stats.weighted_compression_ratio { + let normalized_compression_ratio = + ratio / num_files as f64; + STORE_ENGINE_COMPRESSION_RATIO_VEC + .with_label_values(&[&self.name, cf, &level.to_string()]) + .set(normalized_compression_ratio); + } + } + if let Some(v) = level_stats.num_blob_files { + STORE_ENGINE_TITANDB_NUM_BLOB_FILES_AT_LEVEL_VEC + .with_label_values(&[&self.name, cf, &level.to_string()]) + .set(v as i64); + } + } + + if let Some(v) = cf_stats.num_immutable_mem_table { + STORE_ENGINE_NUM_IMMUTABLE_MEM_TABLE_VEC + 
.with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.live_blob_size { + STORE_ENGINE_TITANDB_LIVE_BLOB_SIZE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.num_live_blob_file { + STORE_ENGINE_TITANDB_NUM_LIVE_BLOB_FILE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.num_obsolete_blob_file { + STORE_ENGINE_TITANDB_NUM_OBSOLETE_BLOB_FILE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.live_blob_file_size { + STORE_ENGINE_TITANDB_LIVE_BLOB_FILE_SIZE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.obsolete_blob_file_size { + STORE_ENGINE_TITANDB_OBSOLETE_BLOB_FILE_SIZE_VEC + .with_label_values(&[&self.name, cf]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le0 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le0"]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le20 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le20"]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le50 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le50"]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le80 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le80"]) + .set(v as i64); + } + if let Some(v) = cf_stats.blob_file_discardable_ratio_le100 { + STORE_ENGINE_TITANDB_BLOB_FILE_DISCARDABLE_RATIO_VEC + .with_label_values(&[&self.name, cf, "le100"]) + .set(v as i64); + } + } + + if let Some(v) = self.db_stats.num_snapshots { + STORE_ENGINE_NUM_SNAPSHOTS_GAUGE_VEC + .with_label_values(&[&self.name]) + .set(v as i64); + } + if let Some(v) = 
self.db_stats.oldest_snapshot_time { + STORE_ENGINE_OLDEST_SNAPSHOT_DURATION_GAUGE_VEC + .with_label_values(&[&self.name]) + .set(v as i64); + } + if let Some(v) = self.db_stats.block_cache_size { + STORE_ENGINE_BLOCK_CACHE_USAGE_GAUGE_VEC + .with_label_values(&[&self.name, "all"]) + .set(v as i64); + } + if let Some(stall_num) = &self.db_stats.stall_num { + for (ty, val) in ROCKSDB_IOSTALL_TYPE.iter().zip(stall_num) { + STORE_ENGINE_WRITE_STALL_REASON_GAUGE_VEC + .with_label_values(&[&self.name, ty]) + .set(*val as i64); + } + } + } +} + // For property metrics #[rustfmt::skip] lazy_static! { diff --git a/engine_tiflash/src/rocks_metrics_defs.rs b/engine_tiflash/src/rocks_metrics_defs.rs index fc23871b90f..204ca61b545 100644 --- a/engine_tiflash/src/rocks_metrics_defs.rs +++ b/engine_tiflash/src/rocks_metrics_defs.rs @@ -138,7 +138,6 @@ pub const TITAN_ENGINE_TICKER_TYPES: &[TickerType] = &[ TickerType::TitanGcNoNeed, TickerType::TitanGcRemain, TickerType::TitanGcDiscardable, - TickerType::TitanGcSample, TickerType::TitanGcSmallFile, TickerType::TitanGcFailure, TickerType::TitanGcSuccess, diff --git a/engine_tiflash/src/util.rs b/engine_tiflash/src/util.rs index f749f78851c..f0bc4236bcb 100644 --- a/engine_tiflash/src/util.rs +++ b/engine_tiflash/src/util.rs @@ -11,7 +11,7 @@ use slog_global::warn; use crate::{ cf_options::RocksCfOptions, db_options::RocksDbOptions, engine::RocksEngine, r2e, - rocks_metrics_defs::*, + rocks_metrics_defs::*, RocksStatistics, }; pub fn new_temp_engine(path: &tempfile::TempDir) -> Engines { @@ -28,7 +28,7 @@ pub fn new_default_engine(path: &str) -> Result { pub fn new_engine(path: &str, cfs: &[&str]) -> Result { let mut db_opts = RocksDbOptions::default(); - db_opts.enable_statistics(true); + db_opts.set_statistics(&RocksStatistics::new_titan()); let cf_opts = cfs.iter().map(|name| (*name, Default::default())).collect(); new_engine_opt(path, db_opts, cf_opts) } diff --git a/engine_tiflash/src/write_batch.rs 
b/engine_tiflash/src/write_batch.rs index c8b68dd781c..42e218a53b3 100644 --- a/engine_tiflash/src/write_batch.rs +++ b/engine_tiflash/src/write_batch.rs @@ -113,23 +113,37 @@ impl RocksWriteBatchVec { } } } -} -impl engine_traits::WriteBatch for RocksWriteBatchVec { - fn write_opt(&mut self, opts: &WriteOptions) -> Result { + #[inline] + fn write_impl(&mut self, opts: &WriteOptions, mut cb: impl FnMut()) -> Result { let opt: RocksWriteOptions = opts.into(); - if crate::log_check_double_write(self) { - return Ok(0); - } + let mut seq = 0; if self.support_write_batch_vec { + // FIXME(tabokie): Callback for empty write batch won't be called. self.get_db() - .multi_batch_write(self.as_inner(), &opt.into_raw()) - .map_err(r2e) + .multi_batch_write_callback(self.as_inner(), &opt.into_raw(), |s| { + seq = s; + cb(); + }) + .map_err(r2e)?; } else { self.get_db() - .write_seq_opt(&self.wbs[0], &opt.into_raw()) - .map_err(r2e) + .write_callback(&self.wbs[0], &opt.into_raw(), |s| { + seq = s; + cb(); + }) + .map_err(r2e)?; + } + Ok(seq) + } +} + +impl engine_traits::WriteBatch for RocksWriteBatchVec { + fn write_opt(&mut self, opts: &WriteOptions) -> Result { + if crate::log_check_double_write(self) { + return Ok(0); } + self.write_impl(opts, || {}) } fn data_size(&self) -> usize { diff --git a/etc/config-template.toml b/etc/config-template.toml index a2b3ab13b00..62623afed0e 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -271,17 +271,11 @@ ## Set to 0 to disable this feature if you want to panic immediately when encountering such an error. # background-error-recovery-window = "1h" -[storage.block-cache] -## Whether to create a shared block cache for all RocksDB column families. -## ## Block cache is used by RocksDB to cache uncompressed blocks. Big block cache can speed up read. ## It is recommended to turn on shared block cache. Since only the total cache size need to be ## set, it is easier to config. 
In most cases it should be able to auto-balance cache usage ## between column families with standard LRU algorithm. -## -## The rest of config in the storage.block-cache session is effective only when shared block cache -## is on. -# shared = true +[storage.block-cache] ## Size of the shared block cache. Normally it should be tuned to 30%-50% of system's total memory. ## When the config is not set, it is decided by the sum of the following fields or their default @@ -565,11 +559,6 @@ ## Max RocksDB WAL size in total # max-total-wal-size = "4GB" -## RocksDB Statistics provides cumulative stats over time. -## Turning statistics on will introduce about 5%-10% overhead for RocksDB, but it can help you to -## know the internal status of RocksDB. -# enable-statistics = true - ## Dump statistics periodically in information logs. ## Same as RocksDB's default value (10 min). # stats-dump-period = "10m" @@ -637,6 +626,11 @@ ## RocksDB log levels # info-log-level = "info" +## Memory usage limit for Raft Engine. Undersized write buffers will be flushed to satisfy the +## requirement. +## No limit when not specified. +# write-buffer-limit = "1GB" + ## Options for `Titan`. [rocksdb.titan] ## Enables or disables `Titan`. Note that Titan is still an experimental feature. Once @@ -859,6 +853,9 @@ ## # checksum = "crc32c" +## The maximum number of concurrent compaction tasks. 0 stands for no limit. +# max-compactions = 0 + ## Options for "Default" Column Family for `Titan`. [rocksdb.defaultcf.titan] ## The smallest value to store in blob files. 
Value smaller than @@ -946,6 +943,7 @@ # format-version = 2 # prepopulate-block-cache = "disabled" # checksum = "crc32c" +# max-compactions = 0 [rocksdb.lockcf] # compression-per-level = ["no", "no", "no", "no", "no", "no", "no"] @@ -970,6 +968,7 @@ # format-version = 2 # prepopulate-block-cache = "disabled" # checksum = "crc32c" +# max-compactions = 0 [raftdb] # max-background-jobs = 4 @@ -978,7 +977,6 @@ # max-manifest-file-size = "20MB" # create-if-missing = true -# enable-statistics = true # stats-dump-period = "10m" ## Raft RocksDB WAL directory. @@ -1032,6 +1030,7 @@ # format-version = 2 # prepopulate-block-cache = "disabled" # checksum = "crc32c" +# max-compactions = 0 [raft-engine] ## Determines whether to use Raft Engine to store raft logs. When it is diff --git a/metrics/alertmanager/tikv.rules.yml b/metrics/alertmanager/tikv.rules.yml index 9b25637d14f..19f8085866e 100644 --- a/metrics/alertmanager/tikv.rules.yml +++ b/metrics/alertmanager/tikv.rules.yml @@ -253,17 +253,17 @@ groups: value: '{{ $value }}' summary: TiKV pending {{ $labels.type }} request is high - - alert: TiKV_batch_request_snapshot_nums - expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0 + - alert: TiKV_coprocessor_cpu_util + expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / (count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) by (instance) * 0.9) >= 1 for: 1m labels: env: ENV_LABELS_ENV level: warning - expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0 + expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / (count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) by (instance) * 0.9) >= 1 annotations: 
description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}' value: '{{ $value }}' - summary: TiKV batch request snapshot nums is high + summary: TiKV coprocessor CPU utilization exceeds 90% - alert: TiKV_pending_task expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000 diff --git a/metrics/grafana/performance_write.json b/metrics/grafana/performance_write.json index c289d979dc8..ddb9621b97a 100644 --- a/metrics/grafana/performance_write.json +++ b/metrics/grafana/performance_write.json @@ -3029,7 +3029,7 @@ "query": { "datasourceId": 1, "model": { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", + "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"(raftstore|rs)_.*\"}[1m])) by (instance)", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", diff --git a/metrics/grafana/tikv_details.json b/metrics/grafana/tikv_details.json index ccac776b508..cff4b5f7742 100644 --- a/metrics/grafana/tikv_details.json +++ b/metrics/grafana/tikv_details.json @@ -5816,7 +5816,7 @@ "query": { "datasourceId": 1, "model": { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", + "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"(raftstore|rs)_.*\"}[1m])) by (instance)", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", @@ -30428,11 +30428,11 @@ "steppedLine": false, "targets": [ { - "expr": "avg(tikv_engine_compression_ratio{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", 
instance=~\"$instance\", db=\"$db\"}) by (level)", + "expr": "avg(tikv_engine_compression_ratio{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", db=\"$db\"}) by (cf, level)", "format": "time_series", "hide": false, "intervalFactor": 2, - "legendFormat": "level - {{level}}", + "legendFormat": "{{cf}} - level - {{level}}", "metric": "", "refId": "A", "step": 10 diff --git a/metrics/grafana/tikv_summary.json b/metrics/grafana/tikv_summary.json index b19478464a2..847ac5ef289 100644 --- a/metrics/grafana/tikv_summary.json +++ b/metrics/grafana/tikv_summary.json @@ -3109,7 +3109,7 @@ "query": { "datasourceId": 1, "model": { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", + "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"(raftstore|rs)_.*\"}[1m])) by (instance)", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", diff --git a/metrics/grafana/tikv_trouble_shooting.json b/metrics/grafana/tikv_trouble_shooting.json index 735c1f305f7..bf1fd5baacf 100644 --- a/metrics/grafana/tikv_trouble_shooting.json +++ b/metrics/grafana/tikv_trouble_shooting.json @@ -1326,7 +1326,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", + "expr": "sum(rate(tikv_thread_cpu_seconds_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", name=~\"(raftstore|rs)_.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", diff --git a/new-mock-engine-store/src/mock_cluster.rs b/new-mock-engine-store/src/mock_cluster.rs index 
65278790b13..acac3e589f1 100644 --- a/new-mock-engine-store/src/mock_cluster.rs +++ b/new-mock-engine-store/src/mock_cluster.rs @@ -526,35 +526,36 @@ pub fn create_tiflash_test_engine( .map(Arc::new); let env = engine_rocks::get_env(key_manager.clone(), limiter).unwrap(); - let cache = cfg.storage.block_cache.build_shared_cache(); let kv_path = dir.path().join(tikv::config::DEFAULT_ROCKSDB_SUB_DIR); let kv_path_str = kv_path.to_str().unwrap(); - let mut kv_db_opt = cfg.rocksdb.build_opt(); - kv_db_opt.set_env(env.clone()); - - let kv_cfs_opt = cfg + let kv_db_opt = cfg .rocksdb - .build_cf_opts(&cache, None, cfg.storage.api_version()); + .build_opt(&cfg.rocksdb.build_resources(env.clone())); + + let cache = cfg.storage.block_cache.build_shared_cache(); + let raft_cfs_opt = cfg.raftdb.build_cf_opts(&cache); + + let kv_cfs_opt = cfg.rocksdb.build_cf_opts( + &cfg.rocksdb.build_cf_resources(cache), + None, + cfg.storage.api_version(), + cfg.storage.engine, + ); let engine = engine_rocks::util::new_engine_opt(kv_path_str, kv_db_opt, kv_cfs_opt).unwrap(); - let mut engine = TiFlashEngine::from_rocks(engine); + let engine = TiFlashEngine::from_rocks(engine); let raft_path = dir.path().join("raft"); let raft_path_str = raft_path.to_str().unwrap(); - let mut raft_db_opt = cfg.raftdb.build_opt(); - raft_db_opt.set_env(env); + let raft_db_opt = cfg.raftdb.build_opt(env.clone(), None); - let raft_cfs_opt = cfg.raftdb.build_cf_opts(&cache); - let mut raft_engine = + let raft_engine = engine_rocks::util::new_engine_opt(raft_path_str, raft_db_opt, raft_cfs_opt).unwrap(); // FFI is not usable, until create_engine. 
- let shared_block_cache = cache.is_some(); - engine.set_shared_block_cache(shared_block_cache); - raft_engine.set_shared_block_cache(shared_block_cache); let engines = Engines::new(engine, raft_engine); (engines, key_manager, dir) } diff --git a/new-mock-engine-store/src/server.rs b/new-mock-engine-store/src/server.rs index 69479fa4cf3..32ebea7e328 100644 --- a/new-mock-engine-store/src/server.rs +++ b/new-mock-engine-store/src/server.rs @@ -49,7 +49,6 @@ use tikv::{ import::{ImportSstService, SstImporter}, read_pool::ReadPool, server::{ - create_raft_storage, gc_worker::GcWorker, load_statistics::ThreadLoadPool, lock_manager::LockManager, @@ -62,7 +61,7 @@ use tikv::{ self, kv::{FakeExtension, SnapContext}, txn::flow_controller::{EngineFlowController, FlowController}, - Engine, + Engine, Storage, }, }; use tikv_util::{ @@ -406,7 +405,7 @@ impl ServerCluster { cfg.quota.enable_auto_tune, )); let extension = engine.raft_extension().clone(); - let store = create_raft_storage::<_, _, _, F, _>( + let store = Storage::<_, _, F>::from_engine( engine, &cfg.storage, storage_read_pool.handle(), @@ -498,7 +497,7 @@ impl ServerCluster { copr.clone(), copr_v2.clone(), resolver.clone(), - snap_mgr.clone(), + tikv_util::Either::Left(snap_mgr.clone()), gc_worker.clone(), check_leader_scheduler.clone(), self.env.clone(), diff --git a/proxy_server/src/config.rs b/proxy_server/src/config.rs index f00854df00a..106d3bfcb19 100644 --- a/proxy_server/src/config.rs +++ b/proxy_server/src/config.rs @@ -29,6 +29,7 @@ pub struct RaftstoreConfig { #[online_config(skip)] pub region_worker_tick_interval: ReadableDuration, pub apply_low_priority_pool_size: usize, + pub evict_cache_on_memory_ratio: f64, } impl Default for RaftstoreConfig { @@ -45,6 +46,7 @@ impl Default for RaftstoreConfig { // This pool is used when handling ingest SST raft messages, e.g. // when using BR / lightning. 
apply_low_priority_pool_size: (cpu_num * 0.3).clamp(2.0, 8.0) as usize, + evict_cache_on_memory_ratio: 0.1, } } } @@ -63,6 +65,7 @@ pub struct ServerConfig { #[online_config(skip)] pub background_thread_count: usize, pub status_thread_pool_size: usize, + pub reject_messages_on_memory_ratio: f64, } impl Default for ServerConfig { @@ -79,6 +82,7 @@ impl Default for ServerConfig { advertise_addr: TIFLASH_DEFAULT_ADVERTISE_LISTENING_ADDR.to_string(), background_thread_count, status_thread_pool_size: 2, + reject_messages_on_memory_ratio: 0.05, } } } @@ -252,6 +256,10 @@ pub struct ProxyConfig { #[online_config(skip)] pub enable_io_snoop: bool, + #[doc(hidden)] + #[online_config(skip)] + pub memory_usage_high_water: f64, + #[online_config(submodule)] pub readpool: ReadPoolConfig, @@ -273,6 +281,7 @@ impl Default for ProxyConfig { raftdb: RaftDbConfig::default(), storage: StorageConfig::default(), enable_io_snoop: false, + memory_usage_high_water: 0.1, readpool: ReadPoolConfig::default(), import: ImportConfig::default(), engine_store: EngineStoreConfig::default(), @@ -379,6 +388,9 @@ pub fn address_proxy_config(config: &mut TikvConfig, proxy_config: &ProxyConfig) config.raft_store.clean_stale_ranges_tick = clean_stale_ranges_tick; config.raft_store.apply_batch_system.low_priority_pool_size = proxy_config.raft_store.apply_low_priority_pool_size; + config.raft_store.evict_cache_on_memory_ratio = + proxy_config.raft_store.evict_cache_on_memory_ratio; + config.raftdb.defaultcf.block_cache_size = proxy_config.raftdb.defaultcf.block_cache_size; config.rocksdb.defaultcf.block_cache_size = proxy_config.rocksdb.defaultcf.block_cache_size; config.rocksdb.writecf.block_cache_size = proxy_config.rocksdb.writecf.block_cache_size; @@ -387,6 +399,8 @@ pub fn address_proxy_config(config: &mut TikvConfig, proxy_config: &ProxyConfig) config.storage.reserve_space = proxy_config.storage.reserve_space; config.enable_io_snoop = proxy_config.enable_io_snoop; + config.memory_usage_high_water = 
proxy_config.memory_usage_high_water; + config.server.addr = proxy_config.server.addr.clone(); config.server.advertise_addr = proxy_config.server.advertise_addr.clone(); config.server.status_addr = proxy_config.server.status_addr.clone(); @@ -399,6 +413,8 @@ pub fn address_proxy_config(config: &mut TikvConfig, proxy_config: &ProxyConfig) config.server.background_thread_count = proxy_config.server.background_thread_count; config.import.num_threads = proxy_config.import.num_threads; config.server.status_thread_pool_size = proxy_config.server.status_thread_pool_size; + config.server.reject_messages_on_memory_ratio = + proxy_config.server.reject_messages_on_memory_ratio; } pub fn validate_and_persist_config(config: &mut TikvConfig, persist: bool) { diff --git a/proxy_server/src/engine.rs b/proxy_server/src/engine.rs new file mode 100644 index 00000000000..6963b2b8aae --- /dev/null +++ b/proxy_server/src/engine.rs @@ -0,0 +1,55 @@ +use engine_store_ffi::TiFlashEngine; +use engine_traits::{CfOptionsExt, DbOptions, DbOptionsExt, CF_DEFAULT}; +use tikv::config::ConfigurableDb; + +#[derive(Clone, Debug)] +pub struct ProxyRocksEngine { + pub inner: TiFlashEngine, +} + +impl ProxyRocksEngine { + pub(crate) fn new(engine: TiFlashEngine) -> ProxyRocksEngine { + ProxyRocksEngine { inner: engine } + } +} + +pub type ConfigRes = std::result::Result<(), Box>; + +impl ConfigurableDb for ProxyRocksEngine { + fn set_db_config(&self, opts: &[(&str, &str)]) -> ConfigRes { + self.inner.set_db_options(opts).map_err(Box::from) + } + + fn set_cf_config(&self, cf: &str, opts: &[(&str, &str)]) -> ConfigRes { + self.inner.set_options_cf(cf, opts).map_err(Box::from) + } + + fn set_rate_bytes_per_sec(&self, rate_bytes_per_sec: i64) -> ConfigRes { + let mut opt = self.inner.get_db_options(); + opt.set_rate_bytes_per_sec(rate_bytes_per_sec) + .map_err(Box::from) + } + + fn set_rate_limiter_auto_tuned(&self, auto_tuned: bool) -> ConfigRes { + let mut opt = self.inner.get_db_options(); + 
opt.set_rate_limiter_auto_tuned(auto_tuned) + .map_err(Box::new)?; + // double check the new state + let new_auto_tuned = opt.get_rate_limiter_auto_tuned(); + if new_auto_tuned == Some(auto_tuned) { + Ok(()) + } else { + Err(engine_traits::Status::with_error( + engine_traits::Code::IoError, + "fail to set rate_limiter_auto_tuned", + ) + .into()) + } + } + + fn set_shared_block_cache_capacity(&self, capacity: usize) -> ConfigRes { + let opt = self.inner.get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap + opt.set_block_cache_capacity(capacity as u64) + .map_err(Box::from) + } +} diff --git a/proxy_server/src/lib.rs b/proxy_server/src/lib.rs index 72a677dabd5..073ed75024a 100644 --- a/proxy_server/src/lib.rs +++ b/proxy_server/src/lib.rs @@ -16,6 +16,7 @@ extern crate tikv_util; #[macro_use] pub mod config; +pub mod engine; pub mod hacked_lock_mgr; pub mod proxy; pub mod run; diff --git a/proxy_server/src/run.rs b/proxy_server/src/run.rs index c96941a6f67..7848cdffab4 100644 --- a/proxy_server/src/run.rs +++ b/proxy_server/src/run.rs @@ -21,9 +21,9 @@ use api_version::{dispatch_api_version, KvFormat}; use concurrency_manager::ConcurrencyManager; use encryption_export::{data_key_manager_from_config, DataKeyManager}; use engine_rocks::{ - from_rocks_compression_type, + flush_engine_statistics, from_rocks_compression_type, raw::{Cache, Env}, - FlowInfo, RocksEngine, + FlowInfo, RocksEngine, RocksStatistics, }; use engine_rocks_helper::sst_recovery::{RecoveryRunner, DEFAULT_CHECK_INTERVAL}; use engine_store_ffi::{ @@ -31,8 +31,9 @@ use engine_store_ffi::{ RaftStoreProxy, RaftStoreProxyFFI, RaftStoreProxyFFIHelper, ReadIndexClient, TiFlashEngine, }; use engine_traits::{ - CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, RaftEngine, TabletFactory, - CF_DEFAULT, CF_LOCK, CF_WRITE, + CachedTablet, CfOptionsExt, Engines, FlowControlFactorsExt, KvEngine, MiscExt, RaftEngine, + SingletonFactory, StatisticsReporter, TabletContext, TabletRegistry, CF_DEFAULT, 
CF_LOCK, + CF_WRITE, }; use error_code::ErrorCodeExt; use file_system::{ @@ -71,7 +72,6 @@ use tikv::{ read_pool::{build_yatp_read_pool, ReadPool, ReadPoolConfigManager}, server::{ config::{Config as ServerConfig, ServerConfigManager}, - create_raft_storage, gc_worker::GcWorker, raftkv::ReplicaReadLockChecker, resolve, @@ -84,7 +84,7 @@ use tikv::{ self, config_manager::StorageConfigManger, txn::flow_controller::{EngineFlowController, FlowController}, - Engine, + Engine, Storage, }, }; use tikv_util::{ @@ -97,12 +97,14 @@ use tikv_util::{ thread_group::GroupProperties, time::{Instant, Monitor}, worker::{Builder as WorkerBuilder, LazyWorker, Scheduler, Worker}, + Either, }; use tokio::runtime::Builder; use crate::{ - config::ProxyConfig, fatal, hacked_lock_mgr::HackedLockManager as LockManager, setup::*, - status_server::StatusServer, util::ffi_server_info, + config::ProxyConfig, engine::ProxyRocksEngine, fatal, + hacked_lock_mgr::HackedLockManager as LockManager, setup::*, status_server::StatusServer, + util::ffi_server_info, }; #[inline] @@ -393,34 +395,35 @@ impl TiKvServer { .unwrap(); // Create raft engine - let mut raft_engine = CER::build( + let (mut raft_engine, raft_statistics) = CER::build( &self.config, &env, &self.encryption_key_manager, &block_cache, ); + match raft_engine.as_ps_engine() { None => {} Some(ps_engine) => { ps_engine.init(engine_store_server_helper); } } + self.raft_statistics = raft_statistics; // Create kv engine. 
- let mut builder = KvEngineFactoryBuilder::new(env, &self.config, &self.store_path) + let builder = KvEngineFactoryBuilder::new(env, &self.config, block_cache) // TODO(tiflash) check if we need a old version of RocksEngine, or if we need to upgrade // .compaction_filter_router(self.router.clone()) .region_info_accessor(self.region_info_accessor.clone()) .sst_recovery_sender(self.init_sst_recovery_sender()) .flow_listener(flow_listener); - if let Some(cache) = block_cache { - builder = builder.block_cache(cache); - } - let factory = Arc::new(builder.build()); + let factory = Box::new(builder.build()); let kv_engine = factory - .create_shared_db() + .create_shared_db(&self.store_path) .unwrap_or_else(|s| fatal!("failed to create kv engine: {}", s)); + self.kv_statistics = Some(factory.rocks_statistics()); + let helper = engine_store_ffi::gen_engine_store_server_helper(engine_store_server_helper); let ffi_hub = Arc::new(engine_store_ffi::TiFlashFFIHub { engine_store_server_helper: helper, @@ -437,21 +440,25 @@ impl TiKvServer { Some(proxy_config_set), ); - let engines = Engines::new(kv_engine, raft_engine); + let engines = Engines::new(kv_engine.clone(), raft_engine); + let proxy_rocks_engine = ProxyRocksEngine::new(kv_engine.clone()); let cfg_controller = self.cfg_controller.as_mut().unwrap(); cfg_controller.register( tikv::config::Module::Rocksdb, - Box::new(DbConfigManger::new( - factory.clone(), - DbType::Kv, - self.config.storage.block_cache.shared, - )), + Box::new(DbConfigManger::new(proxy_rocks_engine, DbType::Kv)), ); - self.tablet_factory = Some(factory.clone()); - engines - .raft - .register_config(cfg_controller, self.config.storage.block_cache.shared); + + let reg = TabletRegistry::new( + Box::new(SingletonFactory::new(kv_engine.rocks.clone())), + &self.store_path, + ) + .unwrap(); + // It always use the singleton kv_engine, use arbitrary id and suffix. 
+ let ctx = TabletContext::with_infinite_region(0, Some(0)); + reg.load(ctx, false).unwrap(); + self.tablet_registry = Some(reg.clone()); + engines.raft.register_config(cfg_controller); let engines_info = Arc::new(EnginesResourceInfo::new( &engines, 180, // max_samples_to_preserve @@ -485,6 +492,8 @@ struct TiKvServer { snap_mgr: Option, // Will be filled in `init_servers`. encryption_key_manager: Option>, engines: Option>, + kv_statistics: Option>, + raft_statistics: Option>, servers: Option>, region_info_accessor: RegionInfoAccessor, coprocessor_host: Option>, @@ -495,7 +504,7 @@ struct TiKvServer { background_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, - tablet_factory: Option + Send + Sync>>, + tablet_registry: Option>, } struct TiKvEngines { @@ -590,6 +599,8 @@ impl TiKvServer { snap_mgr: None, encryption_key_manager: None, engines: None, + kv_statistics: None, + raft_statistics: None, servers: None, region_info_accessor, coprocessor_host, @@ -602,7 +613,7 @@ impl TiKvServer { flow_info_receiver: None, sst_worker: None, quota_limiter, - tablet_factory: None, + tablet_registry: None, } } @@ -979,7 +990,7 @@ impl TiKvServer { )), }; - let storage = create_raft_storage::<_, _, _, F, _>( + let storage = Storage::<_, _, F>::from_engine( engines.engine.clone(), &self.config.storage, storage_read_pool_handle, @@ -997,8 +1008,7 @@ impl TiKvServer { cfg_controller.register( tikv::config::Module::Storage, Box::new(StorageConfigManger::new( - self.tablet_factory.as_ref().unwrap().clone(), - self.config.storage.block_cache.shared, + self.tablet_registry.as_ref().unwrap().clone(), ttl_scheduler, flow_controller, storage.get_scheduler(), @@ -1209,7 +1219,7 @@ impl TiKvServer { ), coprocessor_v2::Endpoint::new(&self.config.coprocessor_v2), self.resolver.clone().unwrap(), - snap_mgr.clone(), + Either::Left(snap_mgr.clone()), gc_worker.clone(), check_leader_scheduler, self.env.clone(), @@ -1376,6 +1386,8 @@ impl TiKvServer { kv: engines.engines.kv.rocks.clone(), 
raft: engines.engines.raft.clone(), }, + self.kv_statistics.clone(), + self.raft_statistics.clone(), servers.server.get_debug_thread_pool().clone(), engines.engine.raft_extension().clone(), self.cfg_controller.as_ref().unwrap().clone(), @@ -1433,8 +1445,12 @@ impl TiKvServer { fetcher: BytesFetcher, engines_info: Arc, ) { - let mut engine_metrics = EngineMetricsManager::::new( - self.engines.as_ref().unwrap().engines.clone(), + let mut engine_metrics = EngineMetricsManager::::new( + self.tablet_registry.clone().unwrap(), + self.kv_statistics.clone(), + self.config.rocksdb.titan.enabled, + self.engines.as_ref().unwrap().engines.raft.clone(), + self.raft_statistics.clone(), ); let mut io_metrics = IOMetricsManager::new(fetcher); let engines_info_clone = engines_info.clone(); @@ -1622,15 +1638,17 @@ pub trait ConfiguredRaftEngine: RaftEngine { _: &TikvConfig, _: &Arc, _: &Option>, - _: &Option, - ) -> Self; + _: &Cache, + ) -> (Self, Option>); fn as_rocks_engine(&self) -> Option<&RocksEngine> { None } + + fn register_config(&self, _cfg_controller: &mut ConfigController) {} + fn as_ps_engine(&mut self) -> Option<&mut PSEngine> { None } - fn register_config(&self, _cfg_controller: &mut ConfigController, _share_cache: bool) {} } impl ConfiguredRaftEngine for engine_rocks::RocksEngine { @@ -1638,8 +1656,8 @@ impl ConfiguredRaftEngine for engine_rocks::RocksEngine { config: &TikvConfig, env: &Arc, key_manager: &Option>, - block_cache: &Option, - ) -> Self { + block_cache: &Cache, + ) -> (Self, Option>) { let mut raft_data_state_machine = RaftDataStateMachine::new( &config.storage.data_dir, &config.raft_engine.config().dir, @@ -1649,13 +1667,11 @@ impl ConfiguredRaftEngine for engine_rocks::RocksEngine { let raft_db_path = &config.raft_store.raftdb_path; let config_raftdb = &config.raftdb; - let mut raft_db_opts = config_raftdb.build_opt(); - raft_db_opts.set_env(env.clone()); + let statistics = Arc::new(RocksStatistics::new_titan()); + let raft_db_opts = 
config_raftdb.build_opt(env.clone(), Some(&statistics)); let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); - let mut raftdb = - engine_rocks::util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) - .expect("failed to open raftdb"); - raftdb.set_shared_block_cache(block_cache.is_some()); + let raftdb = engine_rocks::util::new_engine_opt(raft_db_path, raft_db_opts, raft_cf_opts) + .expect("failed to open raftdb"); if should_dump { let raft_engine = @@ -1666,21 +1682,17 @@ impl ConfiguredRaftEngine for engine_rocks::RocksEngine { drop(raft_engine); raft_data_state_machine.after_dump_data(); } - raftdb + (raftdb, Some(statistics)) } fn as_rocks_engine(&self) -> Option<&RocksEngine> { Some(self) } - fn register_config(&self, cfg_controller: &mut ConfigController, share_cache: bool) { + fn register_config(&self, cfg_controller: &mut ConfigController) { cfg_controller.register( tikv::config::Module::Raftdb, - Box::new(DbConfigManger::new( - Arc::new(self.clone()), - DbType::Raft, - share_cache, - )), + Box::new(DbConfigManger::new(self.clone(), DbType::Raft)), ); } } @@ -1690,8 +1702,8 @@ impl ConfiguredRaftEngine for RaftLogEngine { config: &TikvConfig, env: &Arc, key_manager: &Option>, - block_cache: &Option, - ) -> Self { + block_cache: &Cache, + ) -> (Self, Option>) { let mut raft_data_state_machine = RaftDataStateMachine::new( &config.storage.data_dir, &config.raft_store.raftdb_path, @@ -1706,8 +1718,7 @@ impl ConfiguredRaftEngine for RaftLogEngine { if should_dump { let config_raftdb = &config.raftdb; - let mut raft_db_opts = config_raftdb.build_opt(); - raft_db_opts.set_env(env.clone()); + let raft_db_opts = config_raftdb.build_opt(env.clone(), None); let raft_cf_opts = config_raftdb.build_cf_opts(block_cache); let raftdb = engine_rocks::util::new_engine_opt( &config.raft_store.raftdb_path, @@ -1720,7 +1731,7 @@ impl ConfiguredRaftEngine for RaftLogEngine { drop(raftdb); raft_data_state_machine.after_dump_data(); } - raft_engine + (raft_engine, 
None) } } @@ -1729,9 +1740,9 @@ impl ConfiguredRaftEngine for PSEngine { _config: &TikvConfig, _env: &Arc, _key_manager: &Option>, - _block_cache: &Option, - ) -> Self { - PSEngine::new() + _block_cache: &Cache, + ) -> (Self, Option>) { + (PSEngine::new(), None) } fn as_ps_engine(&mut self) -> Option<&mut PSEngine> { @@ -1857,25 +1868,58 @@ impl Stop for LazyWorker { } } -pub struct EngineMetricsManager { - engines: Engines, +pub struct EngineMetricsManager { + tablet_registry: TabletRegistry, + kv_statistics: Option>, + kv_is_titan: bool, + raft_engine: ER, + raft_statistics: Option>, last_reset: Instant, } -impl EngineMetricsManager { - pub fn new(engines: Engines) -> Self { +impl EngineMetricsManager { + pub fn new( + tablet_registry: TabletRegistry, + kv_statistics: Option>, + kv_is_titan: bool, + raft_engine: ER, + raft_statistics: Option>, + ) -> Self { EngineMetricsManager { - engines, + tablet_registry, + kv_statistics, + kv_is_titan, + raft_engine, + raft_statistics, last_reset: Instant::now(), } } pub fn flush(&mut self, now: Instant) { - KvEngine::flush_metrics(&self.engines.kv, "kv"); - self.engines.raft.flush_metrics("raft"); + let mut reporter = EK::StatisticsReporter::new("kv"); + self.tablet_registry + .for_each_opened_tablet(|_, db: &mut CachedTablet| { + if let Some(db) = db.latest() { + reporter.collect(db); + } + true + }); + reporter.flush(); + self.raft_engine.flush_metrics("raft"); + + if let Some(s) = self.kv_statistics.as_ref() { + flush_engine_statistics(s, "kv", self.kv_is_titan); + } + if let Some(s) = self.raft_statistics.as_ref() { + flush_engine_statistics(s, "raft", false); + } if now.saturating_duration_since(self.last_reset) >= DEFAULT_ENGINE_METRICS_RESET_INTERVAL { - KvEngine::reset_statistics(&self.engines.kv); - self.engines.raft.reset_statistics(); + if let Some(s) = self.kv_statistics.as_ref() { + s.reset(); + } + if let Some(s) = self.raft_statistics.as_ref() { + s.reset(); + } self.last_reset = now; } } diff --git 
a/proxy_server/src/util.rs b/proxy_server/src/util.rs index 0b929285044..aecf00ae530 100644 --- a/proxy_server/src/util.rs +++ b/proxy_server/src/util.rs @@ -24,7 +24,7 @@ fn server_info_for_ffi(req: ServerInfoRequest) -> ServerInfoResponse { let load = ( sys::cpu_time_snapshot(), system - .get_networks() + .networks() .into_iter() .map(|(n, d)| (n.to_owned(), sys::NicSnapshot::from_network_data(d))) .collect(), diff --git a/proxy_tests/proxy/config.rs b/proxy_tests/proxy/config.rs index 95b8e00d3cb..253c21d2eef 100644 --- a/proxy_tests/proxy/config.rs +++ b/proxy_tests/proxy/config.rs @@ -119,6 +119,10 @@ fn test_config_proxy_default_no_config_item() { assert_eq!(config.import.num_threads, 4); assert_eq!(config.server.status_thread_pool_size, 2); + + assert_eq!(config.raft_store.evict_cache_on_memory_ratio, 0.1); + assert_eq!(config.memory_usage_high_water, 0.1); + assert_eq!(config.server.reject_messages_on_memory_ratio, 0.05); } /// We test if the engine-label is set properly. diff --git a/proxy_tests/proxy/normal.rs b/proxy_tests/proxy/normal.rs index 28040a63e94..24a984ecff3 100644 --- a/proxy_tests/proxy/normal.rs +++ b/proxy_tests/proxy/normal.rs @@ -67,7 +67,7 @@ mod config { ProxyConfig::from_file(path, Some(&mut proxy_unrecognized_keys)).unwrap(); assert_eq!(proxy_config.raft_store.snap_handle_pool_size, 4); assert_eq!(proxy_config.server.engine_addr, "1.2.3.4:5"); - assert!(proxy_unrecognized_keys.contains(&"memory-usage-high-water".to_string())); + assert_eq!(proxy_config.memory_usage_high_water, 0.65); assert!(proxy_unrecognized_keys.contains(&"nosense".to_string())); let v1 = vec!["a.b", "b"] .iter() diff --git a/proxy_tests/proxy/region.rs b/proxy_tests/proxy/region.rs index d285496bdda..7432c53ba15 100644 --- a/proxy_tests/proxy/region.rs +++ b/proxy_tests/proxy/region.rs @@ -145,8 +145,9 @@ fn test_get_region_local_state() { } /// This test is very important. 
-/// If make sure we can add learner peer for a store which is not started +/// It make sure we can add learner peer for a store which is not started /// actually. +/// We don't start the absent learner peer in this test. #[test] fn test_add_absent_learner_peer_by_simple() { let (mut cluster, pd_client) = new_mock_cluster(0, 3); @@ -181,8 +182,9 @@ fn test_add_absent_learner_peer_by_simple() { } /// This test is very important. -/// If make sure we can add learner peer for a store which is not started +/// It make sure we can add learner peer for a store which is not started /// actually. +/// We don't start the absent learner peer in this test. #[test] fn test_add_absent_learner_peer_by_joint() { let (mut cluster, pd_client) = new_mock_cluster(0, 3); @@ -317,6 +319,8 @@ fn later_bootstrap_learner_peer( } } +/// We start the absent learner peer in this test. +/// We don't try to reuse data from other learner peer. #[test] fn test_add_delayed_started_learner_by_joint() { let (mut cluster, pd_client) = new_later_add_learner_cluster( @@ -429,6 +433,9 @@ fn recover_from_peer(cluster: &Cluster, from: u64, to: u64, region_ } } +/// We start the absent learner peer in this test. +/// We try to reuse data from other learner peer. +/// We don't use a snapshot to initialize a peer. #[test] fn test_add_delayed_started_learner_no_snapshot() { // fail::cfg("before_tiflash_check_double_write", "return").unwrap(); @@ -471,6 +478,8 @@ fn test_add_delayed_started_learner_no_snapshot() { later_bootstrap_learner_peer(&mut cluster, vec![5], 1); // After that, we manually compose data, to avoid snapshot sending. recover_from_peer(&cluster, 4, 5, 1); + + cluster.must_put(b"m1", b"v1"); // Add node 5 to cluster. pd_client.must_add_peer(1, new_learner_peer(5, 5)); @@ -515,6 +524,9 @@ fn test_add_delayed_started_learner_no_snapshot() { // fail::remove("before_tiflash_do_write"); } +/// We start the absent learner peer in this test. +/// We try to reuse data from other learner peer. 
+/// We use a snapshot to initialize a peer. #[test] fn test_add_delayed_started_learner_snapshot() { let (mut cluster, pd_client) = new_later_add_learner_cluster( diff --git a/proxy_tests/proxy/server_cluster_test.rs b/proxy_tests/proxy/server_cluster_test.rs index 815914491dd..c6d48367c4c 100644 --- a/proxy_tests/proxy/server_cluster_test.rs +++ b/proxy_tests/proxy/server_cluster_test.rs @@ -93,7 +93,7 @@ fn test_safe_ts_basic() { suite .cluster .set_expected_safe_ts(physical_time, physical_time); - suite.must_check_leader(1, TimeStamp::compose(physical_time, 10), 1, 1); + suite.must_check_leader(1, TimeStamp::new(physical_time), 1, 1); suite.stop(); } diff --git a/src/config/configurable.rs b/src/config/configurable.rs new file mode 100644 index 00000000000..7cbcc731eb6 --- /dev/null +++ b/src/config/configurable.rs @@ -0,0 +1,141 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{error::Error, io::Write}; + +use engine_rocks::RocksEngine; +use engine_traits::{ + CachedTablet, CfOptionsExt, DbOptions, DbOptionsExt, TabletRegistry, CF_DEFAULT, +}; + +pub type ConfigRes = Result<(), Box>; + +pub trait ConfigurableDb { + fn set_db_config(&self, opts: &[(&str, &str)]) -> ConfigRes; + fn set_cf_config(&self, cf: &str, opts: &[(&str, &str)]) -> ConfigRes; + fn set_rate_bytes_per_sec(&self, rate_bytes_per_sec: i64) -> ConfigRes; + fn set_rate_limiter_auto_tuned(&self, auto_tuned: bool) -> ConfigRes; + fn set_shared_block_cache_capacity(&self, capacity: usize) -> ConfigRes; +} + +impl ConfigurableDb for RocksEngine { + fn set_db_config(&self, opts: &[(&str, &str)]) -> ConfigRes { + self.set_db_options(opts).map_err(Box::from) + } + + fn set_cf_config(&self, cf: &str, opts: &[(&str, &str)]) -> ConfigRes { + self.set_options_cf(cf, opts).map_err(Box::from) + } + + fn set_rate_bytes_per_sec(&self, rate_bytes_per_sec: i64) -> ConfigRes { + let mut opt = self.get_db_options(); + opt.set_rate_bytes_per_sec(rate_bytes_per_sec) + 
.map_err(Box::from) + } + + fn set_rate_limiter_auto_tuned(&self, auto_tuned: bool) -> ConfigRes { + let mut opt = self.get_db_options(); + opt.set_rate_limiter_auto_tuned(auto_tuned) + .map_err(Box::new)?; + // double check the new state + let new_auto_tuned = opt.get_rate_limiter_auto_tuned(); + if new_auto_tuned == Some(auto_tuned) { + Ok(()) + } else { + Err(engine_traits::Status::with_error( + engine_traits::Code::IoError, + "fail to set rate_limiter_auto_tuned", + ) + .into()) + } + } + + fn set_shared_block_cache_capacity(&self, capacity: usize) -> ConfigRes { + let opt = self.get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap + opt.set_block_cache_capacity(capacity as u64) + .map_err(Box::from) + } +} + +fn loop_registry( + registry: &TabletRegistry, + mut f: impl FnMut(&mut CachedTablet) -> std::result::Result>, +) -> ConfigRes { + let mut error_count = 0; + let mut res = Ok(()); + let mut error_samples: Vec = vec![]; + registry.for_each_opened_tablet(|id, cache| match f(cache) { + Ok(b) => b, + Err(e) => { + error_count += 1; + res = Err(e); + if error_count <= 3 { + writeln!( + error_samples, + "Tablet {} {:?} encountered error: {:?}.", + id, + cache.cache().map(|c| c.as_inner().path()), + res + ) + .unwrap(); + } + true + } + }); + if error_count > 0 { + error!( + "Total count {}. 
Sample errors: {}", + error_count, + std::str::from_utf8(&error_samples).unwrap() + ); + } + res +} + +impl ConfigurableDb for TabletRegistry { + fn set_db_config(&self, opts: &[(&str, &str)]) -> ConfigRes { + loop_registry(self, |cache| { + if let Some(latest) = cache.latest() { + latest.set_db_config(opts)?; + } + Ok(true) + }) + } + + fn set_cf_config(&self, cf: &str, opts: &[(&str, &str)]) -> ConfigRes { + loop_registry(self, |cache| { + if let Some(latest) = cache.latest() { + latest.set_cf_config(cf, opts)?; + } + Ok(true) + }) + } + + fn set_rate_bytes_per_sec(&self, rate_bytes_per_sec: i64) -> ConfigRes { + loop_registry(self, |cache| { + if let Some(latest) = cache.latest() { + latest.set_rate_bytes_per_sec(rate_bytes_per_sec)? + } + Ok(true) + }) + } + + fn set_rate_limiter_auto_tuned(&self, auto_tuned: bool) -> ConfigRes { + loop_registry(self, |cache| { + if let Some(latest) = cache.latest() { + latest.set_rate_limiter_auto_tuned(auto_tuned)? + } + Ok(true) + }) + } + + fn set_shared_block_cache_capacity(&self, capacity: usize) -> ConfigRes { + loop_registry(self, |cache| { + if let Some(latest) = cache.latest() { + latest.set_shared_block_cache_capacity(capacity)?; + Ok(false) + } else { + Ok(true) + } + }) + } +} diff --git a/src/config.rs b/src/config/mod.rs similarity index 93% rename from src/config.rs rename to src/config/mod.rs index e9eca154d6e..6ed8da3f111 100644 --- a/src/config.rs +++ b/src/config/mod.rs @@ -5,6 +5,8 @@ //! TiKV is configured through the `TikvConfig` type, which is in turn //! made up of many other configuration types. 
+mod configurable; + use std::{ cmp, collections::{HashMap, HashSet}, @@ -20,25 +22,26 @@ use std::{ use api_version::ApiV1Ttl; use causal_ts::Config as CausalTsConfig; +pub use configurable::{ConfigRes, ConfigurableDb}; use encryption_export::DataKeyManager; use engine_rocks::{ config::{self as rocks_config, BlobRunMode, CompressionType, LogLevel as RocksLogLevel}, get_env, properties::MvccPropertiesCollectorFactory, raw::{ - BlockBasedOptions, Cache, ChecksumType, CompactionPriority, DBCompactionStyle, - DBCompressionType, DBRateLimiterMode, DBRecoveryMode, Env, LRUCacheOptions, - PrepopulateBlockCache, + BlockBasedOptions, Cache, ChecksumType, CompactionPriority, ConcurrentTaskLimiter, + DBCompactionStyle, DBCompressionType, DBRateLimiterMode, DBRecoveryMode, Env, + PrepopulateBlockCache, RateLimiter, WriteBufferManager, }, util::{FixedPrefixSliceTransform, FixedSuffixSliceTransform, NoopSliceTransform}, RaftDbLogger, RangePropertiesCollectorFactory, RawMvccPropertiesCollectorFactory, - RocksCfOptions, RocksDbOptions, RocksEngine, RocksEventListener, RocksTitanDbOptions, - RocksdbLogger, TtlPropertiesCollectorFactory, DEFAULT_PROP_KEYS_INDEX_DISTANCE, - DEFAULT_PROP_SIZE_INDEX_DISTANCE, + RocksCfOptions, RocksDbOptions, RocksEngine, RocksEventListener, RocksStatistics, + RocksTitanDbOptions, RocksdbLogger, TtlPropertiesCollectorFactory, + DEFAULT_PROP_KEYS_INDEX_DISTANCE, DEFAULT_PROP_SIZE_INDEX_DISTANCE, }; use engine_traits::{ - CfOptions as _, CfOptionsExt, DbOptions as _, DbOptionsExt, MiscExt, TabletAccessor, - TabletErrorCollector, TitanCfOptions as _, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, + CfOptions as _, DbOptions as _, MiscExt, TitanCfOptions as _, CF_DEFAULT, CF_LOCK, CF_RAFT, + CF_WRITE, }; use file_system::IoRateLimiter; use keys::region_raft_prefix_len; @@ -78,7 +81,7 @@ use crate::{ ttl::TtlCompactionFilterFactory, Config as ServerConfig, CONFIG_ROCKSDB_GAUGE, }, - storage::config::{Config as StorageConfig, DEFAULT_DATA_DIR}, + 
storage::config::{Config as StorageConfig, EngineType, DEFAULT_DATA_DIR}, }; pub const DEFAULT_ROCKSDB_SUB_DIR: &str = "db"; @@ -95,6 +98,8 @@ pub const MIN_BLOCK_CACHE_SHARD_SIZE: usize = 128 * MIB as usize; /// Maximum of 15% of system memory can be used by Raft Engine. Normally its /// memory usage is much smaller than that. const RAFT_ENGINE_MEMORY_LIMIT_RATE: f64 = 0.15; +/// Tentative value. +const WRITE_BUFFER_MEMORY_LIMIT_RATE: f64 = 0.25; const LOCKCF_MIN_MEM: usize = 256 * MIB as usize; const LOCKCF_MAX_MEM: usize = GIB as usize; @@ -105,6 +110,15 @@ pub const LAST_CONFIG_FILE: &str = "last_tikv.toml"; const TMP_CONFIG_FILE: &str = "tmp_tikv.toml"; const MAX_BLOCK_SIZE: usize = 32 * MIB as usize; +fn bloom_filter_ratio(et: EngineType) -> f64 { + match et { + EngineType::RaftKv => 0.1, + // In v2, every peer has its own tablet. The data scale is about tens of + // GiBs. We only need a small portion for those key. + EngineType::RaftKv2 => 0.005, + } +} + fn memory_limit_for_cf(is_raft_db: bool, cf: &str, total_mem: u64) -> ReadableSize { let (ratio, min, max) = match (is_raft_db, cf) { (true, CF_DEFAULT) => (0.02, RAFT_MIN_MEM, RAFT_MAX_MEM), @@ -350,6 +364,8 @@ macro_rules! cf_config { #[serde(with = "rocks_config::checksum_serde")] #[online_config(skip)] pub checksum: ChecksumType, + #[online_config(skip)] + pub max_compactions: u32, #[online_config(submodule)] pub titan: TitanCfConfig, } @@ -503,17 +519,17 @@ macro_rules! write_into_metrics { } macro_rules! 
build_cf_opt { - ($opt:ident, $cf_name:ident, $cache:ident, $region_info_provider:ident) => {{ + ( + $opt:ident, + $cf_name:ident, + $cache:expr, + $compaction_limiter:expr, + $region_info_provider:ident + ) => {{ let mut block_base_opts = BlockBasedOptions::new(); block_base_opts.set_block_size($opt.block_size.0 as usize); block_base_opts.set_no_block_cache($opt.disable_block_cache); - if let Some(cache) = $cache { - block_base_opts.set_block_cache(cache); - } else { - let mut cache_opts = LRUCacheOptions::new(); - cache_opts.set_capacity($opt.block_cache_size.0 as usize); - block_base_opts.set_block_cache(&Cache::new_lru_cache(cache_opts)); - } + block_base_opts.set_block_cache($cache); block_base_opts.set_cache_index_and_filter_blocks($opt.cache_index_and_filter_blocks); block_base_opts .set_pin_l0_filter_and_index_blocks_in_cache($opt.pin_l0_filter_and_index_blocks); @@ -594,10 +610,18 @@ macro_rules! build_cf_opt { warn!("compaction guard is disabled due to region info provider not available") } } + if let Some(r) = $compaction_limiter { + cf_opts.set_compaction_thread_limiter(r); + } cf_opts }}; } +pub struct CfResources { + pub cache: Cache, + pub compaction_thread_limiters: HashMap<&'static str, ConcurrentTaskLimiter>, +} + cf_config!(DefaultCfConfig); impl Default for DefaultCfConfig { @@ -656,6 +680,7 @@ impl Default for DefaultCfConfig { prepopulate_block_cache: PrepopulateBlockCache::Disabled, format_version: 2, checksum: ChecksumType::CRC32c, + max_compactions: 0, titan: TitanCfConfig::default(), } } @@ -664,11 +689,19 @@ impl Default for DefaultCfConfig { impl DefaultCfConfig { pub fn build_opt( &self, - cache: &Option, + shared: &CfResources, region_info_accessor: Option<&RegionInfoAccessor>, api_version: ApiVersion, + for_engine: EngineType, ) -> RocksCfOptions { - let mut cf_opts = build_cf_opt!(self, CF_DEFAULT, cache, region_info_accessor); + let mut cf_opts = build_cf_opt!( + self, + CF_DEFAULT, + &shared.cache, + 
shared.compaction_thread_limiters.get(CF_DEFAULT), + region_info_accessor + ); + cf_opts.set_memtable_prefix_bloom_size_ratio(bloom_filter_ratio(for_engine)); let f = RangePropertiesCollectorFactory { prop_size_index_distance: self.prop_size_index_distance, prop_keys_index_distance: self.prop_keys_index_distance, @@ -772,6 +805,7 @@ impl Default for WriteCfConfig { prepopulate_block_cache: PrepopulateBlockCache::Disabled, format_version: 2, checksum: ChecksumType::CRC32c, + max_compactions: 0, titan, } } @@ -780,10 +814,17 @@ impl Default for WriteCfConfig { impl WriteCfConfig { pub fn build_opt( &self, - cache: &Option, + shared: &CfResources, region_info_accessor: Option<&RegionInfoAccessor>, + for_engine: EngineType, ) -> RocksCfOptions { - let mut cf_opts = build_cf_opt!(self, CF_WRITE, cache, region_info_accessor); + let mut cf_opts = build_cf_opt!( + self, + CF_WRITE, + &shared.cache, + shared.compaction_thread_limiters.get(CF_WRITE), + region_info_accessor + ); // Prefix extractor(trim the timestamp at tail) for write cf. cf_opts .set_prefix_extractor( @@ -792,7 +833,7 @@ impl WriteCfConfig { ) .unwrap(); // Create prefix bloom filter for memtable. - cf_opts.set_memtable_prefix_bloom_size_ratio(0.1); + cf_opts.set_memtable_prefix_bloom_size_ratio(bloom_filter_ratio(for_engine)); // Collects user defined properties. 
cf_opts.add_table_properties_collector_factory( "tikv.mvcc-properties-collector", @@ -870,15 +911,22 @@ impl Default for LockCfConfig { prepopulate_block_cache: PrepopulateBlockCache::Disabled, format_version: 2, checksum: ChecksumType::CRC32c, + max_compactions: 0, titan, } } } impl LockCfConfig { - pub fn build_opt(&self, cache: &Option) -> RocksCfOptions { + pub fn build_opt(&self, shared: &CfResources, for_engine: EngineType) -> RocksCfOptions { let no_region_info_accessor: Option<&RegionInfoAccessor> = None; - let mut cf_opts = build_cf_opt!(self, CF_LOCK, cache, no_region_info_accessor); + let mut cf_opts = build_cf_opt!( + self, + CF_LOCK, + &shared.cache, + shared.compaction_thread_limiters.get(CF_LOCK), + no_region_info_accessor + ); cf_opts .set_prefix_extractor("NoopSliceTransform", NoopSliceTransform) .unwrap(); @@ -887,7 +935,7 @@ impl LockCfConfig { prop_keys_index_distance: self.prop_keys_index_distance, }; cf_opts.add_table_properties_collector_factory("tikv.range-properties-collector", f); - cf_opts.set_memtable_prefix_bloom_size_ratio(0.1); + cf_opts.set_memtable_prefix_bloom_size_ratio(bloom_filter_ratio(for_engine)); cf_opts.set_titan_cf_options(&self.titan.build_opts()); cf_opts } @@ -946,15 +994,22 @@ impl Default for RaftCfConfig { prepopulate_block_cache: PrepopulateBlockCache::Disabled, format_version: 2, checksum: ChecksumType::CRC32c, + max_compactions: 0, titan, } } } impl RaftCfConfig { - pub fn build_opt(&self, cache: &Option) -> RocksCfOptions { + pub fn build_opt(&self, shared: &CfResources) -> RocksCfOptions { let no_region_info_accessor: Option<&RegionInfoAccessor> = None; - let mut cf_opts = build_cf_opt!(self, CF_RAFT, cache, no_region_info_accessor); + let mut cf_opts = build_cf_opt!( + self, + CF_RAFT, + &shared.cache, + shared.compaction_thread_limiters.get(CF_RAFT), + no_region_info_accessor + ); cf_opts .set_prefix_extractor("NoopSliceTransform", NoopSliceTransform) .unwrap(); @@ -1029,6 +1084,8 @@ pub struct DbConfig { pub 
create_if_missing: bool, pub max_open_files: i32, #[online_config(skip)] + #[doc(hidden)] + #[serde(skip_serializing)] pub enable_statistics: bool, #[online_config(skip)] pub stats_dump_period: ReadableDuration, @@ -1062,14 +1119,26 @@ pub struct DbConfig { pub use_direct_io_for_flush_and_compaction: bool, #[online_config(skip)] pub enable_pipelined_write: bool, - // deprecated. TiKV will use a new write mode when set `enable_pipelined_write` false and fall - // back to write mode in 3.0 when set `enable_pipelined_write` true. The code of - // multi-batch-write in RocksDB has been removed. #[online_config(skip)] - #[serde(skip_serializing)] - pub enable_multi_batch_write: bool, + pub enable_multi_batch_write: Option, #[online_config(skip)] pub enable_unordered_write: bool, + #[online_config(skip)] + pub allow_concurrent_memtable_write: Option, + #[online_config(skip)] + pub write_buffer_limit: Option, + #[online_config(skip)] + #[doc(hidden)] + #[serde(skip_serializing)] + pub write_buffer_stall_ratio: f32, + #[online_config(skip)] + #[doc(hidden)] + #[serde(skip_serializing)] + pub write_buffer_flush_oldest_first: bool, + // Dangerous option only for programming use. + #[online_config(skip)] + #[serde(skip)] + pub paranoid_checks: Option, #[online_config(submodule)] pub defaultcf: DefaultCfConfig, #[online_config(submodule)] @@ -1082,6 +1151,15 @@ pub struct DbConfig { pub titan: TitanDbConfig, } +#[derive(Clone)] +pub struct DbResources { + // DB Options. 
+ pub env: Arc, + pub statistics: Arc, + pub rate_limiter: Option>, + pub write_buffer_manager: Option>, +} + impl Default for DbConfig { fn default() -> DbConfig { let bg_job_limits = get_background_job_limits(&KVDB_DEFAULT_BACKGROUND_JOB_LIMITS); @@ -1119,8 +1197,13 @@ impl Default for DbConfig { writable_file_max_buffer_size: ReadableSize::mb(1), use_direct_io_for_flush_and_compaction: false, enable_pipelined_write: false, - enable_multi_batch_write: true, // deprecated + enable_multi_batch_write: None, // deprecated enable_unordered_write: false, + allow_concurrent_memtable_write: None, + write_buffer_limit: None, + write_buffer_stall_ratio: 0.0, + write_buffer_flush_oldest_first: false, + paranoid_checks: None, defaultcf: DefaultCfConfig::default(), writecf: WriteCfConfig::default(), lockcf: LockCfConfig::default(), @@ -1131,7 +1214,49 @@ impl Default for DbConfig { } impl DbConfig { - pub fn build_opt(&self) -> RocksDbOptions { + pub fn optimize_for(&mut self, engine: EngineType) { + match engine { + EngineType::RaftKv => { + self.allow_concurrent_memtable_write.get_or_insert(true); + } + EngineType::RaftKv2 => { + self.enable_multi_batch_write.get_or_insert(false); + self.allow_concurrent_memtable_write.get_or_insert(false); + let total_mem = SysQuota::memory_limit_in_bytes() as f64; + self.write_buffer_limit.get_or_insert(ReadableSize( + (total_mem * WRITE_BUFFER_MEMORY_LIMIT_RATE) as u64, + )); + } + } + } + + pub fn build_resources(&self, env: Arc) -> DbResources { + let rate_limiter = if self.rate_bytes_per_sec.0 > 0 { + Some(Arc::new(RateLimiter::new_writeampbased_with_auto_tuned( + self.rate_bytes_per_sec.0 as i64, + (self.rate_limiter_refill_period.as_millis() * 1000) as i64, + 10, // fairness + self.rate_limiter_mode, + self.rate_limiter_auto_tuned, + ))) + } else { + None + }; + DbResources { + env, + statistics: Arc::new(RocksStatistics::new_titan()), + rate_limiter, + write_buffer_manager: self.write_buffer_limit.map(|limit| { + 
Arc::new(WriteBufferManager::new( + limit.0 as usize, + self.write_buffer_stall_ratio, + self.write_buffer_flush_oldest_first, + )) + }), + } + } + + pub fn build_opt(&self, shared: &DbResources) -> RocksDbOptions { let mut opts = RocksDbOptions::default(); opts.set_wal_recovery_mode(self.wal_recovery_mode); if !self.wal_dir.is_empty() { @@ -1147,30 +1272,11 @@ impl DbConfig { opts.set_max_manifest_file_size(self.max_manifest_file_size.0); opts.create_if_missing(self.create_if_missing); opts.set_max_open_files(self.max_open_files); - opts.enable_statistics(self.enable_statistics); opts.set_stats_dump_period_sec(self.stats_dump_period.as_secs() as usize); opts.set_compaction_readahead_size(self.compaction_readahead_size.0); opts.set_max_log_file_size(self.info_log_max_size.0); opts.set_log_file_time_to_roll(self.info_log_roll_time.as_secs()); opts.set_keep_log_file_num(self.info_log_keep_log_file_num); - if self.rate_bytes_per_sec.0 > 0 { - if self.rate_limiter_auto_tuned { - opts.set_writeampbasedratelimiter_with_auto_tuned( - self.rate_bytes_per_sec.0 as i64, - (self.rate_limiter_refill_period.as_millis() * 1000) as i64, - self.rate_limiter_mode, - self.rate_limiter_auto_tuned, - ); - } else { - opts.set_ratelimiter_with_auto_tuned( - self.rate_bytes_per_sec.0 as i64, - (self.rate_limiter_refill_period.as_millis() * 1000) as i64, - self.rate_limiter_mode, - self.rate_limiter_auto_tuned, - ); - } - } - opts.set_bytes_per_sync(self.bytes_per_sync.0); opts.set_wal_bytes_per_sync(self.wal_bytes_per_sync.0); opts.set_max_subcompactions(self.max_sub_compactions); @@ -1179,37 +1285,90 @@ impl DbConfig { self.use_direct_io_for_flush_and_compaction, ); opts.enable_pipelined_write(self.enable_pipelined_write); - let enable_multi_batch_write = !self.enable_pipelined_write && !self.enable_unordered_write; + let mut enable_multi_batch_write = + !self.enable_pipelined_write && !self.enable_unordered_write; + if self.allow_concurrent_memtable_write == Some(false) + && 
self.enable_multi_batch_write == Some(false) + { + enable_multi_batch_write = false + } opts.enable_multi_batch_write(enable_multi_batch_write); opts.enable_unordered_write(self.enable_unordered_write); + opts.allow_concurrent_memtable_write(self.allow_concurrent_memtable_write.unwrap_or(true)); + if let Some(b) = self.paranoid_checks { + opts.set_paranoid_checks(b); + } opts.set_info_log(RocksdbLogger::default()); opts.set_info_log_level(self.info_log_level.into()); if self.titan.enabled { opts.set_titandb_options(&self.titan.build_opts()); } + opts.set_env(shared.env.clone()); + opts.set_statistics(&shared.statistics); + if let Some(r) = &shared.rate_limiter { + opts.set_rate_limiter(r); + } + if let Some(r) = &shared.write_buffer_manager { + opts.set_write_buffer_manager(r); + } opts } + pub fn build_cf_resources(&self, cache: Cache) -> CfResources { + let mut compaction_thread_limiters = HashMap::new(); + if self.defaultcf.max_compactions > 0 { + compaction_thread_limiters.insert( + CF_DEFAULT, + ConcurrentTaskLimiter::new(CF_DEFAULT, self.defaultcf.max_compactions), + ); + } + if self.writecf.max_compactions > 0 { + compaction_thread_limiters.insert( + CF_WRITE, + ConcurrentTaskLimiter::new(CF_WRITE, self.writecf.max_compactions), + ); + } + if self.lockcf.max_compactions > 0 { + compaction_thread_limiters.insert( + CF_LOCK, + ConcurrentTaskLimiter::new(CF_LOCK, self.lockcf.max_compactions), + ); + } + if self.raftcf.max_compactions > 0 { + compaction_thread_limiters.insert( + CF_RAFT, + ConcurrentTaskLimiter::new(CF_RAFT, self.raftcf.max_compactions), + ); + } + CfResources { + cache, + compaction_thread_limiters, + } + } + pub fn build_cf_opts( &self, - cache: &Option, + shared: &CfResources, region_info_accessor: Option<&RegionInfoAccessor>, api_version: ApiVersion, + for_engine: EngineType, ) -> Vec<(&'static str, RocksCfOptions)> { - vec![ - ( - CF_DEFAULT, - self.defaultcf - .build_opt(cache, region_info_accessor, api_version), - ), - (CF_LOCK, 
self.lockcf.build_opt(cache)), - ( - CF_WRITE, - self.writecf.build_opt(cache, region_info_accessor), - ), - // TODO: remove CF_RAFT. - (CF_RAFT, self.raftcf.build_opt(cache)), - ] + let mut cf_opts = Vec::with_capacity(4); + cf_opts.push(( + CF_DEFAULT, + self.defaultcf + .build_opt(shared, region_info_accessor, api_version, for_engine), + )); + cf_opts.push((CF_LOCK, self.lockcf.build_opt(shared, for_engine))); + cf_opts.push(( + CF_WRITE, + self.writecf + .build_opt(shared, region_info_accessor, for_engine), + )); + if for_engine == EngineType::RaftKv { + cf_opts.push((CF_RAFT, self.raftcf.build_opt(shared))); + } + cf_opts } fn validate(&mut self) -> Result<(), Box> { @@ -1253,6 +1412,9 @@ impl DbConfig { ) .into()); } + if !self.enable_statistics { + warn!("kvdb: ignoring `enable_statistics`, statistics is always on.") + } Ok(()) } @@ -1322,15 +1484,27 @@ impl Default for RaftDefaultCfConfig { prepopulate_block_cache: PrepopulateBlockCache::Disabled, format_version: 2, checksum: ChecksumType::CRC32c, + max_compactions: 0, titan: TitanCfConfig::default(), } } } impl RaftDefaultCfConfig { - pub fn build_opt(&self, cache: &Option) -> RocksCfOptions { + pub fn build_opt(&self, cache: &Cache) -> RocksCfOptions { + let limiter = if self.max_compactions > 0 { + Some(ConcurrentTaskLimiter::new(CF_DEFAULT, self.max_compactions)) + } else { + None + }; let no_region_info_accessor: Option<&RegionInfoAccessor> = None; - let mut cf_opts = build_cf_opt!(self, CF_DEFAULT, cache, no_region_info_accessor); + let mut cf_opts = build_cf_opt!( + self, + CF_DEFAULT, + cache, + limiter.as_ref(), + no_region_info_accessor + ); let f = FixedPrefixSliceTransform::new(region_raft_prefix_len()); cf_opts .set_memtable_insert_hint_prefix_extractor("RaftPrefixSliceTransform", f) @@ -1368,6 +1542,8 @@ pub struct RaftDbConfig { pub create_if_missing: bool, pub max_open_files: i32, #[online_config(skip)] + #[doc(hidden)] + #[serde(skip_serializing)] pub enable_statistics: bool, 
#[online_config(skip)] pub stats_dump_period: ReadableDuration, @@ -1441,7 +1617,7 @@ impl Default for RaftDbConfig { } impl RaftDbConfig { - pub fn build_opt(&self) -> RocksDbOptions { + pub fn build_opt(&self, env: Arc, statistics: Option<&RocksStatistics>) -> RocksDbOptions { let mut opts = RocksDbOptions::default(); opts.set_wal_recovery_mode(self.wal_recovery_mode); if !self.wal_dir.is_empty() { @@ -1456,7 +1632,10 @@ impl RaftDbConfig { opts.set_max_manifest_file_size(self.max_manifest_file_size.0); opts.create_if_missing(self.create_if_missing); opts.set_max_open_files(self.max_open_files); - opts.enable_statistics(self.enable_statistics); + match statistics { + Some(s) => opts.set_statistics(s), + None => opts.set_statistics(&RocksStatistics::new_titan()), + } opts.set_stats_dump_period_sec(self.stats_dump_period.as_secs() as usize); opts.set_compaction_readahead_size(self.compaction_readahead_size.0); opts.set_max_log_file_size(self.info_log_max_size.0); @@ -1479,11 +1658,11 @@ impl RaftDbConfig { if self.titan.enabled { opts.set_titandb_options(&self.titan.build_opts()); } - + opts.set_env(env); opts } - pub fn build_cf_opts(&self, cache: &Option) -> Vec<(&'static str, RocksCfOptions)> { + pub fn build_cf_opts(&self, cache: &Cache) -> Vec<(&'static str, RocksCfOptions)> { vec![(CF_DEFAULT, self.defaultcf.build_opt(cache))] } @@ -1499,6 +1678,9 @@ impl RaftDbConfig { ); } } + if !self.enable_statistics { + warn!("raftdb: ignoring `enable_statistics`, statistics is always on.") + } Ok(()) } } @@ -1546,38 +1728,21 @@ pub enum DbType { Raft, } -pub struct DbConfigManger> { - tablet_accessor: Arc, +pub struct DbConfigManger { + db: D, db_type: DbType, - shared_block_cache: bool, } -impl> DbConfigManger { - pub fn new(tablet_accessor: Arc, db_type: DbType, shared_block_cache: bool) -> Self { - DbConfigManger { - tablet_accessor, - db_type, - shared_block_cache, - } - } - - fn set_db_config(&self, opts: &[(&str, &str)]) -> Result<(), Box> { - let mut 
error_collector = TabletErrorCollector::new(); - self.tablet_accessor - .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { - error_collector.add_result(region_id, suffix, db.set_db_options(opts)); - }); - error_collector.take_result() +impl DbConfigManger { + pub fn new(db: D, db_type: DbType) -> Self { + DbConfigManger { db, db_type } } +} +impl DbConfigManger { fn set_cf_config(&self, cf: &str, opts: &[(&str, &str)]) -> Result<(), Box> { - let mut error_collector = TabletErrorCollector::new(); self.validate_cf(cf)?; - self.tablet_accessor - .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { - error_collector.add_result(region_id, suffix, db.set_options_cf(cf, opts)); - }); - error_collector.take_result()?; + self.db.set_cf_config(cf, opts)?; // Write config to metric for (cfg_name, cfg_value) in opts { @@ -1595,100 +1760,6 @@ impl> DbConfigManger { Ok(()) } - fn set_block_cache_size(&self, cf: &str, size: ReadableSize) -> Result<(), Box> { - self.validate_cf(cf)?; - if self.shared_block_cache { - return Err("shared block cache is enabled, change cache size through \ - block-cache.capacity in storage module instead" - .into()); - } - // for multi-rocks, shared block cache has to be enabled and thus should - // shortcut in the above if statement. 
- assert!(self.tablet_accessor.is_single_engine()); - let mut error_collector = TabletErrorCollector::new(); - self.tablet_accessor - .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { - let r = db - .get_options_cf(cf) - .and_then(|opt| opt.set_block_cache_capacity(size.0)); - if r.is_err() { - error_collector.add_result(region_id, suffix, r); - } - }); - // Write config to metric - CONFIG_ROCKSDB_GAUGE - .with_label_values(&[cf, "block_cache_size"]) - .set(size.0 as f64); - error_collector.take_result() - } - - fn set_rate_bytes_per_sec(&self, rate_bytes_per_sec: i64) -> Result<(), Box> { - let mut error_collector = TabletErrorCollector::new(); - self.tablet_accessor - .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { - let mut opt = db.get_db_options(); - let r = opt.set_rate_bytes_per_sec(rate_bytes_per_sec); - if r.is_err() { - error_collector.add_result(region_id, suffix, r); - } - }); - error_collector.take_result() - } - - fn set_rate_limiter_auto_tuned( - &self, - rate_limiter_auto_tuned: bool, - ) -> Result<(), Box> { - let mut error_collector = TabletErrorCollector::new(); - self.tablet_accessor - .for_each_opened_tablet(&mut |region_id, suffix, db: &RocksEngine| { - let mut opt = db.get_db_options(); - let r = opt.set_rate_limiter_auto_tuned(rate_limiter_auto_tuned); - if r.is_err() { - error_collector.add_result(region_id, suffix, r); - } else { - // double check the new state - let new_auto_tuned = opt.get_rate_limiter_auto_tuned(); - if new_auto_tuned.is_none() - || new_auto_tuned.unwrap() != rate_limiter_auto_tuned - { - error_collector.add_result( - region_id, - suffix, - Err(engine_traits::Status::with_error( - engine_traits::Code::IoError, - "fail to set rate_limiter_auto_tuned", - ) - .into()), - ); - } - } - }); - - error_collector.take_result() - } - - fn set_max_background_jobs(&self, max_background_jobs: i32) -> Result<(), Box> { - self.set_db_config(&[("max_background_jobs", 
&max_background_jobs.to_string())])?; - Ok(()) - } - - fn set_max_background_flushes( - &self, - max_background_flushes: i32, - ) -> Result<(), Box> { - self.set_db_config(&[( - "max_background_flushes", - &max_background_flushes.to_string(), - )])?; - Ok(()) - } - - fn set_max_subcompactions(&self, max_subcompactions: u32) -> Result<(), Box> { - self.set_db_config(&[("max_subcompactions", &max_subcompactions.to_string())])?; - Ok(()) - } - fn validate_cf(&self, cf: &str) -> Result<(), Box> { match (self.db_type, cf) { (DbType::Kv, CF_DEFAULT) @@ -1701,7 +1772,7 @@ impl> DbConfigManger { } } -impl + Send + Sync> ConfigManager for DbConfigManger { +impl ConfigManager for DbConfigManger { fn dispatch(&mut self, change: ConfigChange) -> Result<(), Box> { let change_str = format!("{:?}", change); let mut change: Vec<(String, ConfigValue)> = change.into_iter().collect(); @@ -1710,9 +1781,11 @@ impl + Send + Sync> ConfigManager for DbConfigMan if let ConfigValue::Module(mut cf_change) = cf_change { // defaultcf -> default let cf_name = &cf_name[..(cf_name.len() - 2)]; - if let Some(v) = cf_change.remove("block_cache_size") { + if cf_change.remove("block_cache_size").is_some() { // currently we can't modify block_cache_size via set_options_cf - self.set_block_cache_size(cf_name, v.into())?; + return Err("shared block cache is enabled, change cache size through \ + block-cache.capacity in storage module instead" + .into()); } if let Some(ConfigValue::Module(titan_change)) = cf_change.remove("titan") { for (name, value) in titan_change { @@ -1732,7 +1805,8 @@ impl + Send + Sync> ConfigManager for DbConfigMan .next() { let rate_bytes_per_sec: ReadableSize = rate_bytes_config.1.into(); - self.set_rate_bytes_per_sec(rate_bytes_per_sec.0 as i64)?; + self.db + .set_rate_bytes_per_sec(rate_bytes_per_sec.0 as i64)?; } if let Some(rate_bytes_config) = change @@ -1740,37 +1814,43 @@ impl + Send + Sync> ConfigManager for DbConfigMan .next() { let rate_limiter_auto_tuned: bool = 
rate_bytes_config.1.into(); - self.set_rate_limiter_auto_tuned(rate_limiter_auto_tuned)?; + self.db + .set_rate_limiter_auto_tuned(rate_limiter_auto_tuned)?; } if let Some(background_jobs_config) = change .drain_filter(|(name, _)| name == "max_background_jobs") .next() { - let max_background_jobs = background_jobs_config.1.into(); - self.set_max_background_jobs(max_background_jobs)?; + let max_background_jobs: i32 = background_jobs_config.1.into(); + self.db + .set_db_config(&[("max_background_jobs", &max_background_jobs.to_string())])?; } if let Some(background_subcompactions_config) = change .drain_filter(|(name, _)| name == "max_sub_compactions") .next() { - let max_subcompactions = background_subcompactions_config.1.into(); - self.set_max_subcompactions(max_subcompactions)?; + let max_subcompactions: u32 = background_subcompactions_config.1.into(); + self.db + .set_db_config(&[("max_subcompactions", &max_subcompactions.to_string())])?; } if let Some(background_flushes_config) = change .drain_filter(|(name, _)| name == "max_background_flushes") .next() { - let max_background_flushes = background_flushes_config.1.into(); - self.set_max_background_flushes(max_background_flushes)?; + let max_background_flushes: i32 = background_flushes_config.1.into(); + self.db.set_db_config(&[( + "max_background_flushes", + &max_background_flushes.to_string(), + )])?; } if !change.is_empty() { let change = config_value_to_string(change); let change_slice = config_to_slice(&change); - self.set_db_config(&change_slice)?; + self.db.set_db_config(&change_slice)?; } info!( "rocksdb config changed"; @@ -3040,6 +3120,10 @@ impl TikvConfig { .to_owned(); } + if self.storage.engine == EngineType::RaftKv2 { + self.raft_store.store_io_pool_size = cmp::max(self.raft_store.store_io_pool_size, 1); + } + self.raft_store.raftdb_path = self.infer_raft_db_path(None)?; self.raft_engine.config.dir = self.infer_raft_engine_path(None)?; @@ -3066,12 +3150,18 @@ impl TikvConfig { return 
Err("raftdb.wal_dir can't be same as rocksdb.wal_dir".into()); } + let kv_data_exists = if self.storage.engine == EngineType::RaftKv { + RocksEngine::exists(&kv_db_path) + } else { + Path::new(&self.storage.data_dir).join("tablets").exists() + }; + RaftDataStateMachine::new( &self.storage.data_dir, &self.raft_store.raftdb_path, &self.raft_engine.config.dir, ) - .validate(RocksEngine::exists(&kv_db_path))?; + .validate(kv_data_exists)?; // Check blob file dir is empty when titan is disabled if !self.rocksdb.titan.enabled { @@ -3116,6 +3206,8 @@ impl TikvConfig { config::canonicalize_sub_path(&self.storage.data_dir, "log-backup-temp")?; } + self.rocksdb.optimize_for(self.storage.engine); + self.rocksdb.validate()?; self.raftdb.validate()?; self.raft_engine.validate()?; @@ -3228,20 +3320,11 @@ impl TikvConfig { } } else { // Adjust `memory_usage_limit` if necessary. - if self.storage.block_cache.shared { - if let Some(cap) = self.storage.block_cache.capacity { - let limit = (cap.0 as f64 / BLOCK_CACHE_RATE * MEMORY_USAGE_LIMIT_RATE) as u64; - self.memory_usage_limit = Some(ReadableSize(limit)); - } else { - self.memory_usage_limit = Some(Self::suggested_memory_usage_limit()); - } - } else { - let cap = self.rocksdb.defaultcf.block_cache_size.0 - + self.rocksdb.writecf.block_cache_size.0 - + self.rocksdb.lockcf.block_cache_size.0 - + self.raftdb.defaultcf.block_cache_size.0; - let limit = (cap as f64 / BLOCK_CACHE_RATE * MEMORY_USAGE_LIMIT_RATE) as u64; + if let Some(cap) = self.storage.block_cache.capacity { + let limit = (cap.0 as f64 / BLOCK_CACHE_RATE * MEMORY_USAGE_LIMIT_RATE) as u64; self.memory_usage_limit = Some(ReadableSize(limit)); + } else { + self.memory_usage_limit = Some(Self::suggested_memory_usage_limit()); } } @@ -3407,7 +3490,7 @@ impl TikvConfig { // individual block cache sizes. Otherwise use the sum of block cache // size of all column families as the shared cache size. 
let cache_cfg = &mut self.storage.block_cache; - if cache_cfg.shared && cache_cfg.capacity.is_none() { + if cache_cfg.capacity.is_none() { cache_cfg.capacity = Some(ReadableSize( self.rocksdb.defaultcf.block_cache_size.0 + self.rocksdb.writecf.block_cache_size.0 @@ -4061,7 +4144,8 @@ mod tests { use api_version::{ApiV1, KvFormat}; use case_macros::*; - use engine_traits::{CfOptions as _, DbOptions as _, DummyFactory}; + use engine_rocks::raw::LRUCacheOptions; + use engine_traits::{CfOptions as _, CfOptionsExt, DbOptions as _, DbOptionsExt}; use futures::executor::block_on; use grpcio::ResourceQuota; use itertools::Itertools; @@ -4321,6 +4405,15 @@ mod tests { tikv_cfg.validate().unwrap(); } + #[test] + fn test_rocks_rate_limit_zero() { + let mut tikv_cfg = TikvConfig::default(); + tikv_cfg.rocksdb.rate_bytes_per_sec = ReadableSize(0); + tikv_cfg + .rocksdb + .build_opt(&tikv_cfg.rocksdb.build_resources(Arc::new(Env::default()))); + } + #[test] fn test_parse_log_level() { #[derive(Serialize, Deserialize, Debug)] @@ -4481,13 +4574,17 @@ mod tests { assert_eq!(F::TAG, cfg.storage.api_version()); let engine = RocksDBEngine::new( &cfg.storage.data_dir, - Some(cfg.rocksdb.build_opt()), + Some( + cfg.rocksdb + .build_opt(&cfg.rocksdb.build_resources(Arc::new(Env::default()))), + ), cfg.rocksdb.build_cf_opts( - &cfg.storage.block_cache.build_shared_cache(), + &cfg.rocksdb + .build_cf_resources(cfg.storage.block_cache.build_shared_cache()), None, cfg.storage.api_version(), + cfg.storage.engine, ), - true, None, ) .unwrap(); @@ -4504,21 +4601,16 @@ mod tests { rx, ))); - let (shared, cfg_controller) = (cfg.storage.block_cache.shared, ConfigController::new(cfg)); + let cfg_controller = ConfigController::new(cfg); cfg_controller.register( Module::Rocksdb, - Box::new(DbConfigManger::new( - Arc::new(engine.clone()), - DbType::Kv, - shared, - )), + Box::new(DbConfigManger::new(engine.clone(), DbType::Kv)), ); let (scheduler, receiver) = dummy_scheduler(); cfg_controller.register( 
Module::Storage, Box::new(StorageConfigManger::new( - Arc::new(DummyFactory::new(Some(engine), "".to_string())), - shared, + engine, scheduler, flow_controller.clone(), storage.get_scheduler(), @@ -4651,7 +4743,6 @@ mod tests { cfg.rocksdb.defaultcf.block_cache_size = ReadableSize::mb(8); cfg.rocksdb.rate_bytes_per_sec = ReadableSize::mb(64); cfg.rocksdb.rate_limiter_auto_tuned = false; - cfg.storage.block_cache.shared = false; cfg.validate().unwrap(); let (storage, cfg_controller, ..) = new_engines::(cfg); let db = storage.get_engine().get_rocksdb(); @@ -4690,7 +4781,6 @@ mod tests { let cf_opts = db.get_options_cf(CF_DEFAULT).unwrap(); assert_eq!(cf_opts.get_disable_auto_compactions(), false); assert_eq!(cf_opts.get_target_file_size_base(), ReadableSize::mb(64).0); - assert_eq!(cf_opts.get_block_cache_capacity(), ReadableSize::mb(8).0); let mut change = HashMap::new(); change.insert( @@ -4701,22 +4791,11 @@ mod tests { "rocksdb.defaultcf.target-file-size-base".to_owned(), "32MB".to_owned(), ); - change.insert( - "rocksdb.defaultcf.block-cache-size".to_owned(), - "256MB".to_owned(), - ); cfg_controller.update(change).unwrap(); let cf_opts = db.get_options_cf(CF_DEFAULT).unwrap(); assert_eq!(cf_opts.get_disable_auto_compactions(), true); assert_eq!(cf_opts.get_target_file_size_base(), ReadableSize::mb(32).0); - assert_eq!(cf_opts.get_block_cache_capacity(), ReadableSize::mb(256).0); - - // Can not update block cache through storage module - // when shared block cache is disabled - cfg_controller - .update_config("storage.block-cache.capacity", "512MB") - .unwrap_err(); } #[test] @@ -4746,7 +4825,6 @@ mod tests { #[test] fn test_change_shared_block_cache() { let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); - cfg.storage.block_cache.shared = true; cfg.validate().unwrap(); let (storage, cfg_controller, ..) 
= new_engines::(cfg); let db = storage.get_engine().get_rocksdb(); @@ -4813,7 +4891,6 @@ mod tests { #[test] fn test_change_ttl_check_poll_interval() { let (mut cfg, _dir) = TikvConfig::with_tmp().unwrap(); - cfg.storage.block_cache.shared = true; cfg.validate().unwrap(); let (_, cfg_controller, mut rx, _) = new_engines::(cfg); @@ -5118,50 +5195,48 @@ mod tests { #[test] fn test_compaction_guard() { + let cache = Cache::new_lru_cache(LRUCacheOptions::new()); + let no_limiter: Option = None; // Test comopaction guard disabled. - { - let config = DefaultCfConfig { - target_file_size_base: ReadableSize::mb(16), - enable_compaction_guard: false, - ..Default::default() - }; - let provider = Some(MockRegionInfoProvider::new(vec![])); - let cf_opts = build_cf_opt!(config, CF_DEFAULT, None /* cache */, provider); - assert_eq!( - config.target_file_size_base.0, - cf_opts.get_target_file_size_base() - ); - } + let config = DefaultCfConfig { + target_file_size_base: ReadableSize::mb(16), + enable_compaction_guard: false, + ..Default::default() + }; + let provider = Some(MockRegionInfoProvider::new(vec![])); + let cf_opts = build_cf_opt!(config, CF_DEFAULT, &cache, no_limiter.as_ref(), provider); + assert_eq!( + config.target_file_size_base.0, + cf_opts.get_target_file_size_base() + ); + // Test compaction guard enabled but region info provider is missing. 
- { - let config = DefaultCfConfig { - target_file_size_base: ReadableSize::mb(16), - enable_compaction_guard: true, - ..Default::default() - }; - let provider: Option = None; - let cf_opts = build_cf_opt!(config, CF_DEFAULT, None /* cache */, provider); - assert_eq!( - config.target_file_size_base.0, - cf_opts.get_target_file_size_base() - ); - } + let config = DefaultCfConfig { + target_file_size_base: ReadableSize::mb(16), + enable_compaction_guard: true, + ..Default::default() + }; + let provider: Option = None; + let cf_opts = build_cf_opt!(config, CF_DEFAULT, &cache, no_limiter.as_ref(), provider); + assert_eq!( + config.target_file_size_base.0, + cf_opts.get_target_file_size_base() + ); + // Test compaction guard enabled. - { - let config = DefaultCfConfig { - target_file_size_base: ReadableSize::mb(16), - enable_compaction_guard: true, - compaction_guard_min_output_file_size: ReadableSize::mb(4), - compaction_guard_max_output_file_size: ReadableSize::mb(64), - ..Default::default() - }; - let provider = Some(MockRegionInfoProvider::new(vec![])); - let cf_opts = build_cf_opt!(config, CF_DEFAULT, None /* cache */, provider); - assert_eq!( - config.compaction_guard_max_output_file_size.0, - cf_opts.get_target_file_size_base() - ); - } + let config = DefaultCfConfig { + target_file_size_base: ReadableSize::mb(16), + enable_compaction_guard: true, + compaction_guard_min_output_file_size: ReadableSize::mb(4), + compaction_guard_max_output_file_size: ReadableSize::mb(64), + ..Default::default() + }; + let provider = Some(MockRegionInfoProvider::new(vec![])); + let cf_opts = build_cf_opt!(config, CF_DEFAULT, &cache, no_limiter.as_ref(), provider); + assert_eq!( + config.compaction_guard_max_output_file_size.0, + cf_opts.get_target_file_size_base() + ); } #[test] @@ -5371,9 +5446,11 @@ mod tests { ); } + static CONFIG_TEMPLATE: &str = include_str!("../../etc/config-template.toml"); + #[test] fn test_config_template_is_valid() { - let template_config = 
std::include_str!("../etc/config-template.toml") + let template_config = CONFIG_TEMPLATE .lines() .map(|l| l.strip_prefix('#').unwrap_or(l)) .join("\n"); @@ -5384,7 +5461,7 @@ mod tests { #[test] fn test_config_template_no_superfluous_keys() { - let template_config = std::include_str!("../etc/config-template.toml") + let template_config = CONFIG_TEMPLATE .lines() .map(|l| l.strip_prefix('#').unwrap_or(l)) .join("\n"); @@ -5402,7 +5479,7 @@ mod tests { #[test] fn test_config_template_matches_default() { - let template_config = std::include_str!("../etc/config-template.toml") + let template_config = CONFIG_TEMPLATE .lines() .map(|l| l.strip_prefix('#').unwrap_or(l)) .join("\n"); @@ -5460,6 +5537,7 @@ mod tests { cfg.memory_usage_limit = None; cfg.raft_engine.mut_config().memory_limit = None; cfg.coprocessor_v2.coprocessor_plugin_directory = None; // Default is `None`, which is represented by not setting the key. + cfg.rocksdb.write_buffer_limit = None; cfg.rocksdb.defaultcf.level0_slowdown_writes_trigger = None; cfg.rocksdb.defaultcf.level0_stop_writes_trigger = None; cfg.rocksdb.defaultcf.soft_pending_compaction_bytes_limit = None; diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index 3274700d812..54fcaeb0489 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -171,7 +171,7 @@ impl Endpoint { let mut input = CodedInputStream::from_bytes(&data); input.set_recursion_limit(self.recursion_limit); - let req_ctx: ReqContext; + let mut req_ctx: ReqContext; let builder: RequestHandlerBuilder; match req.get_tp() { @@ -316,6 +316,9 @@ impl Endpoint { cache_match_version, self.perf_level, ); + // Checksum is allowed during the flashback period to make sure the tool such + // like BR can work. 
+ req_ctx.allowed_in_flashback = true; with_tls_tracker(|tracker| { tracker.req_info.request_type = RequestType::CoprocessorChecksum; tracker.req_info.start_ts = start_ts; @@ -358,6 +361,7 @@ impl Endpoint { let mut snap_ctx = SnapContext { pb_ctx: &ctx.context, start_ts: Some(ctx.txn_start_ts), + allowed_in_flashback: ctx.allowed_in_flashback, ..Default::default() }; // need to pass start_ts and ranges to check memory locks for replica read diff --git a/src/coprocessor/mod.rs b/src/coprocessor/mod.rs index 8acd5325a1e..140d3c0476e 100644 --- a/src/coprocessor/mod.rs +++ b/src/coprocessor/mod.rs @@ -142,6 +142,9 @@ pub struct ReqContext { /// Perf level pub perf_level: PerfLevel, + + /// Whether the request is allowed in the flashback state. + pub allowed_in_flashback: bool, } impl ReqContext { @@ -181,6 +184,7 @@ impl ReqContext { lower_bound, upper_bound, perf_level, + allowed_in_flashback: false, } } diff --git a/src/coprocessor/tracker.rs b/src/coprocessor/tracker.rs index 24290701457..d6e146adf11 100644 --- a/src/coprocessor/tracker.rs +++ b/src/coprocessor/tracker.rs @@ -147,7 +147,11 @@ impl Tracker { _ => unreachable!(), } - self.with_perf_context(|perf_context| perf_context.start_observe()); + self.with_perf_context(|perf_context| { + if let Some(c) = perf_context { + c.start_observe(); + } + }); self.current_stage = TrackerState::ItemBegan(now); } @@ -160,7 +164,9 @@ impl Tracker { self.total_storage_stats.add(&storage_stats); } self.with_perf_context(|perf_context| { - perf_context.report_metrics(&[get_tls_tracker_token()]) + if let Some(c) = perf_context { + c.report_metrics(&[get_tls_tracker_token()]); + } }); self.current_stage = TrackerState::ItemFinished(now); } else { @@ -355,7 +361,7 @@ impl Tracker { fn with_perf_context(&self, f: F) -> T where - F: FnOnce(&mut Box) -> T, + F: FnOnce(&mut Option>) -> T, { thread_local! 
{ static SELECT: RefCell>> = RefCell::new(None); @@ -379,15 +385,19 @@ impl Tracker { }; tls_cell.with(|c| { let mut c = c.borrow_mut(); - let perf_context = c.get_or_insert_with(|| unsafe { - with_tls_engine::(|engine| { - Box::new(engine.kv_engine().unwrap().get_perf_context( - PerfLevel::Uninitialized, - PerfContextKind::Coprocessor(self.req_ctx.tag.get_str()), - )) - }) - }); - f(perf_context) + if c.is_none() { + *c = unsafe { + with_tls_engine::(|engine| { + engine.kv_engine().map(|engine| { + Box::new(engine.get_perf_context( + PerfLevel::Uninitialized, + PerfContextKind::Coprocessor(self.req_ctx.tag.get_str()), + )) as Box + }) + }) + }; + } + f(&mut c) }) } } diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index 9d45052fea9..8ce6f9961fb 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -12,6 +12,7 @@ use collections::HashSet; use engine_traits::{KvEngine, CF_DEFAULT, CF_WRITE}; use file_system::{set_io_type, IoType}; use futures::{future::join_all, sink::SinkExt, stream::TryStreamExt, TryFutureExt}; +use futures_executor::{ThreadPool, ThreadPoolBuilder}; use grpcio::{ ClientStreamingSink, RequestStream, RpcContext, ServerStreamingSink, UnarySink, WriteFlags, }; @@ -56,6 +57,12 @@ where engine: E, router: Router, threads: Arc, + // For now, PiTR cannot be executed in the tokio runtime because it is synchronous and may + // blocks. (tokio is so strict... it panics if we do insane things like blocking in an async + // context.) + // We need to execute these code in a context which allows blocking. + // FIXME: Make PiTR restore asynchronous. Get rid of this pool. 
+ block_threads: Arc, importer: Arc, limiter: Limiter, task_slots: Arc>>, @@ -92,6 +99,18 @@ where .before_stop_wrapper(move || tikv_alloc::remove_thread_memory_accessor()) .build() .unwrap(); + let props = tikv_util::thread_group::current_properties(); + let block_threads = ThreadPoolBuilder::new() + .pool_size(cfg.num_threads) + .name_prefix("sst-importer") + .after_start_wrapper(move || { + tikv_util::thread_group::set_properties(props.clone()); + tikv_alloc::add_thread_memory_accessor(); + set_io_type(IoType::Import); + }) + .before_stop_wrapper(move || tikv_alloc::remove_thread_memory_accessor()) + .create() + .unwrap(); importer.start_switch_mode_check(threads.handle(), engine.clone()); threads.spawn(Self::tick(importer.clone())); @@ -99,6 +118,7 @@ where cfg, engine, threads: Arc::new(threads), + block_threads: Arc::new(block_threads), router, importer, limiter: Limiter::new(f64::INFINITY), @@ -165,6 +185,17 @@ where .ingest_maybe_slowdown_writes(CF_WRITE) .expect("cf") { + match self.engine.get_sst_key_ranges(CF_WRITE, 0) { + Ok(l0_sst_ranges) => { + warn!( + "sst ingest is too slow"; + "sst_ranges" => ?l0_sst_ranges, + ); + } + Err(e) => { + error!("get sst key ranges failed"; "err" => ?e); + } + } let mut errorpb = errorpb::Error::default(); let err = "too many sst files are ingesting"; let mut server_is_busy_err = errorpb::ServerIsBusy::default(); @@ -596,7 +627,7 @@ where debug!("finished apply kv file with {:?}", resp); crate::send_rpc_response!(resp, sink, label, timer); }; - self.threads.spawn(handle_task); + self.block_threads.spawn_ok(handle_task); } /// Downloads the file and performs key-rewrite for later ingesting. @@ -1044,7 +1075,7 @@ where Box::new(move |k: Vec, v: Vec| { // Need to skip the empty key/value that could break the transaction or cause // data corruption. see details at https://github.com/pingcap/tiflow/issues/5468. 
- if k.is_empty() || v.is_empty() { + if k.is_empty() || (!is_delete && v.is_empty()) { return; } diff --git a/src/server/debug.rs b/src/server/debug.rs index 48435f72163..c16621f4d85 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -5,6 +5,7 @@ use std::{ iter::FromIterator, path::Path, result, + sync::Arc, thread::{Builder as ThreadBuilder, JoinHandle}, }; @@ -12,12 +13,12 @@ use collections::HashSet; use engine_rocks::{ raw::{CompactOptions, DBBottommostLevelCompaction}, util::get_cf_handle, - RocksEngine, RocksEngineIterator, RocksMvccProperties, RocksWriteBatchVec, + RocksEngine, RocksEngineIterator, RocksMvccProperties, RocksStatistics, RocksWriteBatchVec, }; use engine_traits::{ - Engines, IterOptions, Iterable, Iterator as EngineIterator, Mutable, MvccProperties, Peekable, - RaftEngine, Range, RangePropertiesExt, SyncMutable, WriteBatch, WriteBatchExt, WriteOptions, - CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, + Engines, IterOptions, Iterable, Iterator as EngineIterator, MiscExt, Mutable, MvccProperties, + Peekable, RaftEngine, RaftLogBatch, Range, RangePropertiesExt, SyncMutable, WriteBatch, + WriteBatchExt, WriteOptions, CF_DEFAULT, CF_LOCK, CF_RAFT, CF_WRITE, }; use kvproto::{ debugpb::{self, Db as DbType}, @@ -127,6 +128,8 @@ trait InnerRocksEngineExtractor { #[derive(Clone)] pub struct Debugger { engines: Engines, + kv_statistics: Option>, + raft_statistics: Option>, reset_to_version_manager: ResetToVersionManager, cfg_controller: ConfigController, } @@ -159,15 +162,41 @@ impl Debugger { let reset_to_version_manager = ResetToVersionManager::new(engines.kv.clone()); Debugger { engines, + kv_statistics: None, + raft_statistics: None, reset_to_version_manager, cfg_controller, } } + pub fn set_kv_statistics(&mut self, s: Option>) { + self.kv_statistics = s; + } + + pub fn set_raft_statistics(&mut self, s: Option>) { + self.raft_statistics = s; + } + pub fn get_engine(&self) -> &Engines { &self.engines } + pub fn dump_kv_stats(&self) -> Result { + 
let mut kv_str = box_try!(MiscExt::dump_stats(&self.engines.kv)); + if let Some(s) = self.kv_statistics.as_ref() && let Some(s) = s.to_string() { + kv_str.push_str(&s); + } + Ok(kv_str) + } + + pub fn dump_raft_stats(&self) -> Result { + let mut raft_str = box_try!(RaftEngine::dump_stats(&self.engines.raft)); + if let Some(s) = self.raft_statistics.as_ref() && let Some(s) = s.to_string() { + raft_str.push_str(&s); + } + Ok(raft_str) + } + /// Get all regions holding region meta data from raft CF in KV storage. pub fn get_all_regions_in_store(&self) -> Result> { let db = &self.engines.kv; @@ -735,9 +764,10 @@ impl Debugger { &keys::apply_state_key(region_id), &new_raft_apply_state )); - box_try!(raft.put_raft_state(region_id, &new_raft_local_state)); - let deleted_logs = box_try!(raft.gc(region_id, applied_index + 1, last_index + 1)); - raft.sync().unwrap(); + let mut lb = raft.log_batch(0); + box_try!(lb.put_raft_state(region_id, &new_raft_local_state)); + box_try!(raft.gc(region_id, applied_index + 1, last_index + 1, &mut lb)); + box_try!(raft.consume(&mut lb, true)); kv.sync().unwrap(); info!( @@ -747,7 +777,6 @@ impl Debugger { "new_raft_local_state" => ?new_raft_local_state, "old_raft_apply_state" => ?old_raft_apply_state, "new_raft_apply_state" => ?new_raft_apply_state, - "deleted logs" => deleted_logs, ); } diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 7e8a1457500..91b5178f8a0 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -1,36 +1,35 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{ - path::{Path, PathBuf}, - sync::{Arc, Mutex}, -}; +use std::{path::Path, sync::Arc}; use engine_rocks::{ raw::{Cache, Env}, - CompactedEventSender, CompactionListener, FlowListener, RocksCompactionJobInfo, RocksEngine, - RocksEventListener, + CompactedEventSender, CompactionListener, FlowListener, RocksCfOptions, RocksCompactionJobInfo, + RocksDbOptions, RocksEngine, RocksEventListener, RocksPersistenceListener, RocksStatistics, }; use engine_traits::{ - CfOptions, CfOptionsExt, CompactionJobInfo, OpenOptions, Result, TabletAccessor, TabletFactory, - CF_DEFAULT, CF_WRITE, + CompactionJobInfo, MiscExt, PersistenceListener, Result, StateStorage, TabletContext, + TabletFactory, CF_DEFAULT, CF_WRITE, }; use kvproto::kvrpcpb::ApiVersion; use raftstore::RegionInfoAccessor; use tikv_util::worker::Scheduler; -use super::engine_factory_v2::KvEngineFactoryV2; -use crate::config::{DbConfig, TikvConfig, DEFAULT_ROCKSDB_SUB_DIR}; +use crate::{ + config::{CfResources, DbConfig, DbResources, TikvConfig, DEFAULT_ROCKSDB_SUB_DIR}, + storage::config::EngineType, +}; struct FactoryInner { - env: Arc, region_info_accessor: Option, - block_cache: Option, rocksdb_config: Arc, - store_path: PathBuf, api_version: ApiVersion, flow_listener: Option, sst_recovery_sender: Option>, - root_db: Mutex>, + db_resources: DbResources, + cf_resources: CfResources, + state_storage: Option>, + lite: bool, } pub struct KvEngineFactoryBuilder { @@ -39,18 +38,18 @@ pub struct KvEngineFactoryBuilder { } impl KvEngineFactoryBuilder { - pub fn new(env: Arc, config: &TikvConfig, store_path: impl Into) -> Self { + pub fn new(env: Arc, config: &TikvConfig, cache: Cache) -> Self { Self { inner: FactoryInner { - env, region_info_accessor: None, - block_cache: None, rocksdb_config: Arc::new(config.rocksdb.clone()), - store_path: store_path.into(), api_version: config.storage.api_version(), flow_listener: None, sst_recovery_sender: None, - root_db: Mutex::default(), + db_resources: 
config.rocksdb.build_resources(env), + cf_resources: config.rocksdb.build_cf_resources(cache), + state_storage: None, + lite: false, }, compact_event_sender: None, } @@ -61,11 +60,6 @@ impl KvEngineFactoryBuilder { self } - pub fn block_cache(mut self, cache: Cache) -> Self { - self.inner.block_cache = Some(cache); - self - } - pub fn flow_listener(mut self, listener: FlowListener) -> Self { self.inner.flow_listener = Some(listener); self @@ -84,20 +78,27 @@ impl KvEngineFactoryBuilder { self } + /// Set whether enable lite mode. + /// + /// In lite mode, most listener/filters will not be installed. + pub fn lite(mut self, lite: bool) -> Self { + self.inner.lite = lite; + self + } + + /// A storage for persisting flush states, which is used for recovering when + /// disable WAL. Only work for v2. + pub fn state_storage(mut self, storage: Arc) -> Self { + self.inner.state_storage = Some(storage); + self + } + pub fn build(self) -> KvEngineFactory { KvEngineFactory { inner: Arc::new(self.inner), compact_event_sender: self.compact_event_sender.clone(), } } - - pub fn build_v2(self) -> KvEngineFactoryV2 { - let factory = KvEngineFactory { - inner: Arc::new(self.inner), - compact_event_sender: self.compact_event_sender.clone(), - }; - KvEngineFactoryV2::new(factory) - } } #[derive(Clone)] @@ -129,178 +130,154 @@ impl KvEngineFactory { )) } - pub fn create_tablet( - &self, - tablet_path: &Path, - region_id: u64, - suffix: u64, - ) -> Result { - // Create kv engine. 
- let mut kv_db_opts = self.inner.rocksdb_config.build_opt(); - kv_db_opts.set_env(self.inner.env.clone()); - kv_db_opts.add_event_listener(RocksEventListener::new( - "kv", - self.inner.sst_recovery_sender.clone(), - )); - if let Some(filter) = self.create_raftstore_compaction_listener() { - kv_db_opts.add_event_listener(filter); - } - if let Some(listener) = &self.inner.flow_listener { - kv_db_opts.add_event_listener(listener.clone_with(region_id, suffix)); - } - let kv_cfs_opts = self.inner.rocksdb_config.build_cf_opts( - &self.inner.block_cache, - self.inner.region_info_accessor.as_ref(), - self.inner.api_version, - ); - let kv_engine = engine_rocks::util::new_engine_opt( - tablet_path.to_str().unwrap(), - kv_db_opts, - kv_cfs_opts, - ); - let mut kv_engine = match kv_engine { - Ok(e) => e, - Err(e) => { - error!("failed to create kv engine"; "path" => %tablet_path.display(), "err" => ?e); - return Err(e); - } - }; - let shared_block_cache = self.inner.block_cache.is_some(); - kv_engine.set_shared_block_cache(shared_block_cache); - Ok(kv_engine) + pub fn rocks_statistics(&self) -> Arc { + self.inner.db_resources.statistics.clone() } - pub fn on_tablet_created(&self, region_id: u64, suffix: u64) { - if let Some(listener) = &self.inner.flow_listener { - let listener = listener.clone_with(region_id, suffix); - listener.on_created(); + fn db_opts(&self) -> RocksDbOptions { + // Create kv engine. + let mut db_opts = self + .inner + .rocksdb_config + .build_opt(&self.inner.db_resources); + if !self.inner.lite { + db_opts.add_event_listener(RocksEventListener::new( + "kv", + self.inner.sst_recovery_sender.clone(), + )); + if let Some(filter) = self.create_raftstore_compaction_listener() { + db_opts.add_event_listener(filter); + } } + db_opts } - pub fn destroy_tablet(&self, tablet_path: &Path) -> engine_traits::Result<()> { - info!("destroy tablet"; "path" => %tablet_path.display()); - // Create kv engine. 
- let mut kv_db_opts = self.inner.rocksdb_config.build_opt(); - kv_db_opts.set_env(self.inner.env.clone()); - if let Some(filter) = self.create_raftstore_compaction_listener() { - kv_db_opts.add_event_listener(filter); - } - let _kv_cfs_opts = self.inner.rocksdb_config.build_cf_opts( - &self.inner.block_cache, + fn cf_opts(&self, for_engine: EngineType) -> Vec<(&str, RocksCfOptions)> { + self.inner.rocksdb_config.build_cf_opts( + &self.inner.cf_resources, self.inner.region_info_accessor.as_ref(), self.inner.api_version, - ); - // TODOTODO: call rust-rocks or tirocks to destroy_engine; - // engine_rocks::util::destroy_engine( - // tablet_path.to_str().unwrap(), - // kv_db_opts, - // kv_cfs_opts, - // )?; - let _ = std::fs::remove_dir_all(tablet_path); - Ok(()) + for_engine, + ) } - pub fn on_tablet_destroy(&self, region_id: u64, suffix: u64) { - if let Some(listener) = &self.inner.flow_listener { - let listener = listener.clone_with(region_id, suffix); - listener.on_destroyed(); - } + pub fn block_cache(&self) -> &Cache { + &self.inner.cf_resources.cache } - pub fn store_path(&self) -> PathBuf { - self.inner.store_path.clone() - } - - #[inline] - fn kv_engine_path(&self) -> PathBuf { - self.inner.store_path.join(DEFAULT_ROCKSDB_SUB_DIR) + /// Create a shared db. + /// + /// It will always create in path/DEFAULT_DB_SUB_DIR. 
+ pub fn create_shared_db(&self, path: impl AsRef) -> Result { + let path = path.as_ref(); + let mut db_opts = self.db_opts(); + let cf_opts = self.cf_opts(EngineType::RaftKv); + if let Some(listener) = &self.inner.flow_listener { + db_opts.add_event_listener(listener.clone()); + } + let target_path = path.join(DEFAULT_ROCKSDB_SUB_DIR); + let kv_engine = + engine_rocks::util::new_engine_opt(target_path.to_str().unwrap(), db_opts, cf_opts); + if let Err(e) = &kv_engine { + error!("failed to create kv engine"; "path" => %path.display(), "err" => ?e); + } + kv_engine } } impl TabletFactory for KvEngineFactory { - #[inline] - fn create_shared_db(&self) -> Result { - let root_path = self.kv_engine_path(); - let tablet = self.create_tablet(&root_path, 0, 0)?; - let mut root_db = self.inner.root_db.lock().unwrap(); - root_db.replace(tablet.clone()); - Ok(tablet) - } - - /// Open the root tablet according to the OpenOptions. - /// - /// If options.create_new is true, create the root tablet. If the tablet - /// exists, it will fail. - /// - /// If options.create is true, open the the root tablet if it exists or - /// create it otherwise. - fn open_tablet( - &self, - _id: u64, - _suffix: Option, - options: OpenOptions, - ) -> Result { - if let Some(db) = self.inner.root_db.lock().unwrap().as_ref() { - if options.create_new() { - return Err(box_err!( - "root tablet {} already exists", - db.as_inner().path() - )); - } - return Ok(db.clone()); + fn open_tablet(&self, ctx: TabletContext, path: &Path) -> Result { + let mut db_opts = self.db_opts(); + let cf_opts = self.cf_opts(EngineType::RaftKv2); + if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = ctx.suffix { + db_opts.add_event_listener(listener.clone_with(ctx.id, suffix)); } - // No need for mutex protection here since root_db creation only occurs at - // tikv bootstrap time when there is no racing issue. 
- if options.create_new() || options.create() { - return self.create_shared_db(); + if let Some(storage) = &self.inner.state_storage + && let Some(flush_state) = ctx.flush_state { + let listener = PersistenceListener::new( + ctx.id, + ctx.suffix.unwrap(), + flush_state, + storage.clone(), + ); + db_opts.add_event_listener(RocksPersistenceListener::new(listener)); } - - Err(box_err!("root tablet has not been initialized")) - } - - fn open_tablet_raw( - &self, - _path: &Path, - _id: u64, - _suffix: u64, - _options: OpenOptions, - ) -> Result { - self.create_shared_db() - } - - fn exists_raw(&self, _path: &Path) -> bool { - false - } - - fn tablet_path_with_prefix(&self, _prefix: &str, _id: u64, _suffix: u64) -> PathBuf { - self.kv_engine_path() - } - - fn tablets_path(&self) -> PathBuf { - self.kv_engine_path() + let kv_engine = + engine_rocks::util::new_engine_opt(path.to_str().unwrap(), db_opts, cf_opts); + if let Err(e) = &kv_engine { + error!("failed to create tablet"; "id" => ctx.id, "suffix" => ?ctx.suffix, "path" => %path.display(), "err" => ?e); + } else if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = ctx.suffix { + listener.clone_with(ctx.id, suffix).on_created(); + } + kv_engine } - #[inline] - fn destroy_tablet(&self, _id: u64, _suffix: u64) -> engine_traits::Result<()> { + fn destroy_tablet(&self, ctx: TabletContext, path: &Path) -> Result<()> { + info!("destroy tablet"; "path" => %path.display(), "id" => ctx.id, "suffix" => ?ctx.suffix); + // Create kv engine. 
+ let _db_opts = self.db_opts(); + let _cf_opts = self.cf_opts(EngineType::RaftKv2); + // TODOTODO: call rust-rocks or tirocks to destroy_engine; + // engine_rocks::util::destroy_engine( + // path.to_str().unwrap(), + // kv_db_opts, + // kv_cfs_opts, + // )?; + let _ = std::fs::remove_dir_all(path); + if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = ctx.suffix { + listener.clone_with(ctx.id, suffix).on_destroyed(); + } Ok(()) } - fn set_shared_block_cache_capacity(&self, capacity: u64) -> Result<()> { - let db = self.inner.root_db.lock().unwrap(); - let opt = db.as_ref().unwrap().get_options_cf(CF_DEFAULT).unwrap(); // FIXME unwrap - opt.set_block_cache_capacity(capacity)?; - Ok(()) + fn exists(&self, path: &Path) -> bool { + RocksEngine::exists(path.to_str().unwrap()) } } -impl TabletAccessor for KvEngineFactory { - fn for_each_opened_tablet(&self, f: &mut dyn FnMut(u64, u64, &RocksEngine)) { - let db = self.inner.root_db.lock().unwrap(); - let db = db.as_ref().unwrap(); - f(0, 0, db); - } - - fn is_single_engine(&self) -> bool { - true +#[cfg(test)] +mod tests { + use std::path::Path; + + use engine_traits::TabletRegistry; + + use super::*; + use crate::config::TikvConfig; + + #[test] + fn test_engine_factory() { + let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); + let common_test_cfg = manifest_dir.join("components/test_raftstore/src/common-test.toml"); + let cfg = TikvConfig::from_file(&common_test_cfg, None).unwrap_or_else(|e| { + panic!( + "invalid auto generated configuration file {}, err {}", + manifest_dir.display(), + e + ); + }); + let cache = cfg.storage.block_cache.build_shared_cache(); + let dir = test_util::temp_dir("test-engine-factory", false); + let env = cfg.build_shared_rocks_env(None, None).unwrap(); + + let factory = KvEngineFactoryBuilder::new(env, &cfg, cache).build(); + let reg = TabletRegistry::new(Box::new(factory), dir.path()).unwrap(); + let path = reg.tablet_path(1, 3); + 
assert!(!reg.tablet_factory().exists(&path)); + let mut tablet_ctx = TabletContext::with_infinite_region(1, Some(3)); + let engine = reg + .tablet_factory() + .open_tablet(tablet_ctx.clone(), &path) + .unwrap(); + assert!(reg.tablet_factory().exists(&path)); + // Second attempt should fail with lock. + reg.tablet_factory() + .open_tablet(tablet_ctx.clone(), &path) + .unwrap_err(); + drop(engine); + tablet_ctx.suffix = Some(3); + reg.tablet_factory() + .destroy_tablet(tablet_ctx, &path) + .unwrap(); + assert!(!reg.tablet_factory().exists(&path)); } } diff --git a/src/server/gc_worker/compaction_filter.rs b/src/server/gc_worker/compaction_filter.rs index bd5896296bb..5d33346a844 100644 --- a/src/server/gc_worker/compaction_filter.rs +++ b/src/server/gc_worker/compaction_filter.rs @@ -20,9 +20,7 @@ use engine_rocks::{ }, RocksEngine, RocksMvccProperties, RocksWriteBatchVec, }; -use engine_traits::{ - KvEngine, MiscExt, Mutable, MvccProperties, WriteBatch, WriteBatchExt, WriteOptions, -}; +use engine_traits::{KvEngine, MiscExt, MvccProperties, WriteBatch, WriteOptions}; use file_system::{IoType, WithIoType}; use pd_client::{Feature, FeatureGate}; use prometheus::{local::*, *}; @@ -30,6 +28,7 @@ use raftstore::coprocessor::RegionInfoProvider; use tikv_util::{ time::Instant, worker::{ScheduleError, Scheduler}, + Either, }; use txn_types::{Key, TimeStamp, WriteRef, WriteType}; @@ -51,7 +50,7 @@ const COMPACTION_FILTER_GC_FEATURE: Feature = Feature::require(5, 0, 0); // these fields are not available when constructing // `WriteCompactionFilterFactory`. 
pub struct GcContext { - pub(crate) db: RocksEngine, + pub(crate) db: Option, pub(crate) store_id: u64, pub(crate) safe_point: Arc, pub(crate) cfg_tracker: GcWorkerConfigManager, @@ -154,7 +153,7 @@ where ); } -impl CompactionFilterInitializer for EK +impl CompactionFilterInitializer for Option where EK: KvEngine, { @@ -171,7 +170,7 @@ where } } -impl CompactionFilterInitializer for RocksEngine { +impl CompactionFilterInitializer for Option { fn init_compaction_filter( &self, store_id: u64, @@ -237,7 +236,10 @@ impl CompactionFilterFactory for WriteCompactionFilterFactory { "ratio_threshold" => ratio_threshold, ); - if db.is_stalled_or_stopped() { + if db + .as_ref() + .map_or(false, RocksEngine::is_stalled_or_stopped) + { debug!("skip gc in compaction filter because the DB is stalled"); return std::ptr::null_mut(); } @@ -277,13 +279,60 @@ impl CompactionFilterFactory for WriteCompactionFilterFactory { } } +pub struct DeleteBatch { + pub batch: Either>, +} + +impl DeleteBatch { + fn new(db: &Option) -> Self + where + EK: KvEngine, + { + Self { + batch: match db { + Some(db) => Either::Left(db.write_batch_with_cap(DEFAULT_DELETE_BATCH_SIZE)), + None => Either::Right(Vec::with_capacity(64)), + }, + } + } + + // `key` has prefix `DATA_KEY`. 
+ fn delete(&mut self, key: &[u8], ts: TimeStamp) -> Result<(), String> { + match &mut self.batch { + Either::Left(batch) => { + let key = Key::from_encoded_slice(key).append_ts(ts); + batch.delete(key.as_encoded())?; + } + Either::Right(keys) => { + let key = Key::from_encoded_slice(keys::origin_key(key)).append_ts(ts); + keys.push(key); + } + } + Ok(()) + } + + fn is_empty(&self) -> bool { + match &self.batch { + Either::Left(batch) => batch.is_empty(), + Either::Right(keys) => keys.is_empty(), + } + } + + pub fn count(&self) -> usize { + match &self.batch { + Either::Left(batch) => batch.count(), + Either::Right(keys) => keys.len(), + } + } +} + struct WriteCompactionFilter { safe_point: u64, - engine: RocksEngine, + engine: Option, is_bottommost_level: bool, encountered_errors: bool, - write_batch: RocksWriteBatchVec, + write_batch: DeleteBatch, gc_scheduler: Scheduler>, // A key batch which is going to be sent to the GC worker. mvcc_deletions: Vec, @@ -312,7 +361,7 @@ struct WriteCompactionFilter { impl WriteCompactionFilter { fn new( - engine: RocksEngine, + engine: Option, safe_point: u64, context: &CompactionFilterContext, gc_scheduler: Scheduler>, @@ -322,7 +371,7 @@ impl WriteCompactionFilter { assert!(safe_point > 0); debug!("gc in compaction filter"; "safe_point" => safe_point); - let write_batch = engine.write_batch_with_cap(DEFAULT_DELETE_BATCH_SIZE); + let write_batch = DeleteBatch::new(&engine); WriteCompactionFilter { safe_point, engine, @@ -469,9 +518,8 @@ impl WriteCompactionFilter { fn handle_filtered_write(&mut self, write: WriteRef<'_>) -> Result<(), String> { if write.short_value.is_none() && write.write_type == WriteType::Put { - let prefix = Key::from_encoded_slice(&self.mvcc_key_prefix); - let def_key = prefix.append_ts(write.start_ts).into_encoded(); - self.write_batch.delete(&def_key)?; + self.write_batch + .delete(&self.mvcc_key_prefix, write.start_ts)?; } Ok(()) } @@ -499,24 +547,40 @@ impl WriteCompactionFilter { } if 
self.write_batch.count() > DEFAULT_DELETE_BATCH_COUNT || force { - let mut wopts = WriteOptions::default(); - wopts.set_no_slowdown(true); - if let Err(e) = do_flush(&mut self.write_batch, &wopts) { - let wb = mem::replace( - &mut self.write_batch, - self.engine.write_batch_with_cap(DEFAULT_DELETE_BATCH_SIZE), - ); - self.orphan_versions += wb.count(); - let id = ORPHAN_VERSIONS_ID.fetch_add(1, Ordering::Relaxed); - let task = GcTask::OrphanVersions { wb, id }; + let err = match &mut self.write_batch.batch { + Either::Left(wb) => { + let mut wopts = WriteOptions::default(); + wopts.set_no_slowdown(true); + match do_flush(wb, &wopts) { + Ok(()) => { + wb.clear(); + return Ok(()); + } + Err(e) => Some(e), + } + } + Either::Right(_) => None, + }; + + let wb = mem::replace(&mut self.write_batch, DeleteBatch::new(&self.engine)); + self.orphan_versions += wb.count(); + let id = ORPHAN_VERSIONS_ID.fetch_add(1, Ordering::Relaxed); + let region_info_provider = self.regions_provider.1.clone(); + let task = GcTask::OrphanVersions { + wb, + id, + region_info_provider, + }; + if let Some(e) = &err { warn!( - "compaction filter flush fail, dispatch to gc worker"; - "task" => %task, "err" => ?e, + "compaction filter flush fail, dispatch to gc worker"; + "task" => %task, "err" => ?e, ); - self.schedule_gc_task(task, true); - return Err(e); } - self.write_batch.clear(); + self.schedule_gc_task(task, true); + if let Some(err) = err { + return Err(err); + } } Ok(()) } @@ -607,7 +671,9 @@ impl Drop for WriteCompactionFilter { if let Err(e) = self.flush_pending_writes_if_need(true) { error!("compaction filter flush writes fail"; "err" => ?e); } - self.engine.sync_wal().unwrap(); + if let Some(engine) = &self.engine { + engine.sync_wal().unwrap(); + } self.switch_key_metrics(); self.flush_metrics(); @@ -685,6 +751,15 @@ pub fn check_need_gc( context: &CompactionFilterContext, ) -> bool { let check_props = |props: &MvccProperties| -> (bool, bool /* skip_more_checks */) { + // Disable GC 
directly once the config is negative or +inf. + // Disabling GC is useful in some abnormal scenarios where the transaction model + // would be break (e.g. writes with higher commit TS would be written BEFORE + // writes with lower commit TS, or write data with TS lower than current GC safe + // point). Use this at your own risk. + if ratio_threshold.is_sign_negative() || ratio_threshold.is_infinite() { + return (false, false); + } + if props.min_ts > safe_point { return (false, false); } @@ -822,7 +897,7 @@ pub mod test_utils { let mut gc_context_opt = GC_CONTEXT.lock().unwrap(); *gc_context_opt = Some(GcContext { - db: engine.clone(), + db: Some(engine.clone()), store_id: 1, safe_point, cfg_tracker, @@ -970,6 +1045,13 @@ pub mod tests { let default_key = Key::from_encoded_slice(b"zkey").append_ts(100.into()); let default_key = default_key.into_encoded(); assert!(raw_engine.get_value(&default_key).unwrap().is_none()); + + // If the ratio threshold is less than 0, GC would be skipped. + must_prewrite_put(&mut engine, b"zkey", &value, b"zkey", 210); + must_commit(&mut engine, b"zkey", 210, 220); + gc_runner.ratio_threshold = Some(-1.0); + gc_runner.safe_point(256).gc(&raw_engine); + must_get(&mut engine, b"zkey", 210, &value); } // Test dirty versions before a deletion mark can be handled correctly. diff --git a/src/server/gc_worker/gc_worker.rs b/src/server/gc_worker/gc_worker.rs index 1ccac8860c6..106b36f61ad 100644 --- a/src/server/gc_worker/gc_worker.rs +++ b/src/server/gc_worker/gc_worker.rs @@ -38,7 +38,7 @@ use txn_types::{Key, TimeStamp}; use super::{ check_need_gc, compaction_filter::{ - CompactionFilterInitializer, GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED, + CompactionFilterInitializer, DeleteBatch, GC_COMPACTION_FILTER_MVCC_DELETION_HANDLED, GC_COMPACTION_FILTER_MVCC_DELETION_WASTED, GC_COMPACTION_FILTER_ORPHAN_VERSIONS, }, config::{GcConfig, GcWorkerConfigManager}, @@ -118,7 +118,11 @@ where /// until `DefaultCompactionFilter` is introduced. 
/// /// The tracking issue: . - OrphanVersions { wb: E::WriteBatch, id: usize }, + OrphanVersions { + wb: DeleteBatch, + id: usize, + region_info_provider: Arc, + }, #[cfg(any(test, feature = "testexport"))] Validate(Box), } @@ -162,7 +166,7 @@ where .field("start_key", &format!("{}", start_key)) .field("end_key", &format!("{}", end_key)) .finish(), - GcTask::OrphanVersions { id, wb } => f + GcTask::OrphanVersions { id, wb, .. } => f .debug_struct("OrphanVersions") .field("id", id) .field("count", &wb.count()) @@ -871,6 +875,46 @@ impl GcRunner { tikv_kv::snapshot(&mut self.engine, snap_ctx).await })?) } + + fn flush_deletes(&mut self, deletes: Vec, provider: Arc) { + let mut region_modifies = HashMap::default(); + // Should not panic. + let regions = match get_regions_for_range_of_keys(self.store_id, &deletes, provider) { + Ok(r) => r, + Err(e) => { + error!("failed to flush deletes, will leave garbage"; "err" => ?e); + return; + } + }; + if regions.is_empty() { + error!("no region is found, will leave garbage"); + return; + } + let mut keys = deletes.into_iter().peekable(); + let mut modifies = vec![]; + for region in ®ions { + let start_key = region.get_start_key(); + let end_key = region.get_end_key(); + while let Some(key) = keys.peek() { + if key.as_encoded().as_slice() < start_key { + error!("key is not in any region, will leave garbage"; "key" => %key); + keys.next(); + continue; + } + if !end_key.is_empty() && key.as_encoded().as_slice() >= end_key { + break; + } + modifies.push(Modify::Delete(CF_DEFAULT, keys.next().unwrap())); + } + if !modifies.is_empty() { + region_modifies.insert(region.id, modifies); + modifies = vec![]; + } + } + if let Err(e) = self.engine.modify_on_kv_engine(region_modifies) { + error!("failed to flush deletes, will leave garbage"; "err" => ?e); + } + } } impl Runnable for GcRunner { @@ -982,19 +1026,29 @@ impl Runnable for GcRunner { end_key ); } - GcTask::OrphanVersions { mut wb, id } => { - info!("handling 
GcTask::OrphanVersions"; "id" => id); - let mut wopts = WriteOptions::default(); - wopts.set_sync(true); - if let Err(e) = wb.write_opt(&wopts) { - error!("write GcTask::OrphanVersions fail"; "id" => id, "err" => ?e); - update_metrics(true); - return; + GcTask::OrphanVersions { + wb, + id, + region_info_provider, + } => { + let count = wb.count(); + match wb.batch { + Either::Left(mut wb) => { + info!("handling GcTask::OrphanVersions"; "id" => id); + let mut wopts = WriteOptions::default(); + wopts.set_sync(true); + if let Err(e) = wb.write_opt(&wopts) { + error!("write GcTask::OrphanVersions fail"; "id" => id, "err" => ?e); + update_metrics(true); + return; + } + info!("write GcTask::OrphanVersions success"; "id" => id); + } + Either::Right(deletes) => self.flush_deletes(deletes, region_info_provider), } - info!("write GcTask::OrphanVersions success"; "id" => id); GC_COMPACTION_FILTER_ORPHAN_VERSIONS .with_label_values(&[STAT_TXN_KEYMODE, "cleaned"]) - .inc_by(wb.count() as u64); + .inc_by(count as u64); update_metrics(false); } #[cfg(any(test, feature = "testexport"))] @@ -1144,7 +1198,7 @@ impl GcWorker { ); info!("initialize compaction filter to perform GC when necessary"); - self.engine.kv_engine().unwrap().init_compaction_filter( + self.engine.kv_engine().init_compaction_filter( cfg.self_store_id, safe_point.clone(), self.config_manager.clone(), diff --git a/src/server/gc_worker/mod.rs b/src/server/gc_worker/mod.rs index a5b8837cd2e..75b7441fbcb 100644 --- a/src/server/gc_worker/mod.rs +++ b/src/server/gc_worker/mod.rs @@ -26,6 +26,14 @@ pub use crate::storage::{Callback, Error, ErrorInner, Result}; // Returns true if it needs gc. // This is for optimization purpose, does not mean to be accurate. fn check_need_gc(safe_point: TimeStamp, ratio_threshold: f64, props: &MvccProperties) -> bool { + // Disable GC directly once the config is negative or +inf. + // Disabling GC is useful in some abnormal scenarios where the transaction model + // would be break (e.g. 
writes with higher commit TS would be written BEFORE + // writes with lower commit TS, or write data with TS lower than current GC safe + // point). Use this at your own risk. + if ratio_threshold.is_sign_negative() || ratio_threshold.is_infinite() { + return false; + } // Always GC. if ratio_threshold < 1.0 { return true; @@ -77,6 +85,14 @@ mod tests { props } + #[test] + fn test_check_need_gc() { + let props = MvccProperties::default(); + assert!(!check_need_gc(TimeStamp::max(), -1.0, &props)); + assert!(!check_need_gc(TimeStamp::max(), f64::INFINITY, &props)); + assert!(check_need_gc(TimeStamp::max(), 0.9, &props)); + } + #[test] fn test_need_gc() { let path = tempfile::Builder::new() diff --git a/src/server/gc_worker/rawkv_compaction_filter.rs b/src/server/gc_worker/rawkv_compaction_filter.rs index b1174d7d4f3..5e3913f4d40 100644 --- a/src/server/gc_worker/rawkv_compaction_filter.rs +++ b/src/server/gc_worker/rawkv_compaction_filter.rs @@ -48,7 +48,6 @@ impl CompactionFilterFactory for RawCompactionFilterFactory { }; //---------------- GC context END -------------- - let db = gc_context.db.clone(); let gc_scheduler = gc_context.gc_scheduler.clone(); let store_id = gc_context.store_id; let region_info_provider = gc_context.region_info_provider.clone(); @@ -71,7 +70,11 @@ impl CompactionFilterFactory for RawCompactionFilterFactory { "ratio_threshold" => ratio_threshold, ); - if db.is_stalled_or_stopped() { + if gc_context + .db + .as_ref() + .map_or(false, RocksEngine::is_stalled_or_stopped) + { debug!("skip gc in compaction filter because the DB is stalled"); return std::ptr::null_mut(); } @@ -91,7 +94,6 @@ impl CompactionFilterFactory for RawCompactionFilterFactory { } let filter = RawCompactionFilter::new( - db, safe_point, gc_scheduler, current, @@ -105,7 +107,6 @@ impl CompactionFilterFactory for RawCompactionFilterFactory { struct RawCompactionFilter { safe_point: u64, - engine: RocksEngine, is_bottommost_level: bool, gc_scheduler: Scheduler>, current_ts: 
u64, @@ -135,8 +136,6 @@ impl Drop for RawCompactionFilter { fn drop(&mut self) { self.raw_gc_mvcc_deletions(); - self.engine.sync_wal().unwrap(); - self.switch_key_metrics(); self.flush_metrics(); } @@ -172,7 +171,6 @@ impl CompactionFilter for RawCompactionFilter { impl RawCompactionFilter { fn new( - engine: RocksEngine, safe_point: u64, gc_scheduler: Scheduler>, ts: u64, @@ -184,7 +182,6 @@ impl RawCompactionFilter { debug!("gc in compaction filter"; "safe_point" => safe_point); RawCompactionFilter { safe_point, - engine, is_bottommost_level: context.is_bottommost_level(), gc_scheduler, current_ts: ts, diff --git a/src/server/lock_manager/waiter_manager.rs b/src/server/lock_manager/waiter_manager.rs index 467580645d3..d8271998653 100644 --- a/src/server/lock_manager/waiter_manager.rs +++ b/src/server/lock_manager/waiter_manager.rs @@ -340,7 +340,10 @@ impl WaitTable { Some(waiter) } - fn update_waiter(&mut self, update_event: &UpdateWaitForEvent) -> Option { + fn update_waiter( + &mut self, + update_event: &UpdateWaitForEvent, + ) -> Option<(KeyLockWaitInfo, DiagnosticContext)> { let waiter = self.waiter_pool.get_mut(&update_event.token)?; assert_eq!(waiter.wait_info.key, update_event.wait_info.key); @@ -351,9 +354,8 @@ impl WaitTable { } let result = std::mem::replace(&mut waiter.wait_info, update_event.wait_info.clone()); - waiter.diag_ctx = update_event.diag_ctx.clone(); - Some(result) + Some((result, waiter.diag_ctx.clone())) } fn take_waiter_by_lock_digest( @@ -542,11 +544,11 @@ impl WaiterManager { continue; } - if let Some(previous_wait_info) = previous_wait_info { + if let Some((previous_wait_info, diag_ctx)) = previous_wait_info { self.detector_scheduler .clean_up_wait_for(event.start_ts, previous_wait_info); self.detector_scheduler - .detect(event.start_ts, event.wait_info, event.diag_ctx); + .detect(event.start_ts, event.wait_info, diag_ctx); } } } diff --git a/src/server/mod.rs b/src/server/mod.rs index d926ca40b2a..0bb6da62ac7 100644 --- 
a/src/server/mod.rs +++ b/src/server/mod.rs @@ -6,7 +6,6 @@ mod raft_client; pub mod config; pub mod debug; mod engine_factory; -mod engine_factory_v2; pub mod errors; pub mod gc_worker; pub mod load_statistics; @@ -14,6 +13,7 @@ pub mod lock_manager; pub mod node; mod proxy; pub mod raftkv; +mod raftkv2; mod reset_to_version; pub mod resolve; pub mod server; @@ -32,10 +32,11 @@ pub use self::{ config::{Config, ServerConfigManager, DEFAULT_CLUSTER_ID, DEFAULT_LISTENING_ADDR}, errors::{Error, Result}, metrics::{CONFIG_ROCKSDB_GAUGE, CPU_CORES_QUOTA_GAUGE, MEM_TRACE_SUM_GAUGE}, - node::{create_raft_storage, Node}, + node::Node, proxy::{build_forward_option, get_target_address, Proxy}, raft_client::{ConnectionBuilder, RaftClient}, raftkv::RaftKv, + raftkv2::{NodeV2, RaftKv2}, resolve::{PdStoreAddrResolver, StoreAddrResolver}, server::{Server, GRPC_THREAD_PREFIX}, transport::ServerTransport, diff --git a/src/server/node.rs b/src/server/node.rs index 0b654921f59..e36e980e1d3 100644 --- a/src/server/node.rs +++ b/src/server/node.rs @@ -6,7 +6,7 @@ use std::{ time::Duration, }; -use api_version::{api_v2::TIDB_RANGES_COMPLEMENT, KvFormat}; +use api_version::api_v2::TIDB_RANGES_COMPLEMENT; use causal_ts::CausalTsProviderImpl; use concurrency_manager::ConcurrencyManager; use engine_traits::{Engines, Iterable, KvEngine, RaftEngine, DATA_CFS, DATA_KEY_PREFIX_LEN}; @@ -14,10 +14,9 @@ use grpcio_health::HealthService; use kvproto::{ kvrpcpb::ApiVersion, metapb, raft_serverpb::StoreIdent, replication_modepb::ReplicationStatus, }; -use pd_client::{Error as PdError, FeatureGate, PdClient, INVALID_ID}; +use pd_client::{Error as PdError, PdClient, INVALID_ID}; use raftstore::{ coprocessor::dispatcher::CoprocessorHost, - router::{LocalReadRouter, RaftStoreRouter}, store::{ self, fsm::{store::StoreMeta, ApplyRouter, RaftBatchSystem, RaftRouter}, @@ -25,68 +24,69 @@ use raftstore::{ RefreshConfigTask, SnapManager, SplitCheckTask, Transport, }, }; -use 
resource_metering::{CollectorRegHandle, ResourceTagFactory}; +use resource_metering::CollectorRegHandle; use tikv_util::{ config::VersionTrack, - quota_limiter::QuotaLimiter, worker::{LazyWorker, Scheduler, Worker}, }; -use super::{RaftKv, Result}; -use crate::{ - import::SstImporter, - read_pool::ReadPoolHandle, - server::Config as ServerConfig, - storage::{ - config::Config as StorageConfig, kv::FlowStatsReporter, lock_manager, - txn::flow_controller::FlowController, DynamicConfigs as StorageDynamicConfigs, Storage, - }, -}; +use super::Result; +use crate::{import::SstImporter, server::Config as ServerConfig}; const MAX_CHECK_CLUSTER_BOOTSTRAPPED_RETRY_COUNT: u64 = 60; const CHECK_CLUSTER_BOOTSTRAPPED_RETRY_INTERVAL: Duration = Duration::from_secs(3); -/// Creates a new storage engine which is backed by the Raft consensus -/// protocol. -pub fn create_raft_storage< - S, - EK, - R: FlowStatsReporter, - F: KvFormat, - LM: lock_manager::LockManager, ->( - engine: RaftKv, - cfg: &StorageConfig, - read_pool: ReadPoolHandle, - lock_mgr: LM, - concurrency_manager: ConcurrencyManager, - dynamic_configs: StorageDynamicConfigs, - flow_controller: Arc, - reporter: R, - resource_tag_factory: ResourceTagFactory, - quota_limiter: Arc, - feature_gate: FeatureGate, - causal_ts_provider: Option>, -) -> Result, LM, F>> -where - S: RaftStoreRouter + LocalReadRouter + 'static, - EK: KvEngine, -{ - let store = Storage::from_engine( - engine, - cfg, - read_pool, - lock_mgr, - concurrency_manager, - dynamic_configs, - flow_controller, - reporter, - resource_tag_factory, - quota_limiter, - feature_gate, - causal_ts_provider, - )?; - Ok(store) +pub(crate) fn init_store(store: Option, cfg: &ServerConfig) -> metapb::Store { + let mut store = store.unwrap_or_default(); + store.set_id(INVALID_ID); + if store.get_address().is_empty() { + if cfg.advertise_addr.is_empty() { + store.set_address(cfg.addr.clone()); + if store.get_peer_address().is_empty() { + 
store.set_peer_address(cfg.addr.clone()); + } + } else { + store.set_address(cfg.advertise_addr.clone()); + if store.get_peer_address().is_empty() { + store.set_peer_address(cfg.advertise_addr.clone()); + } + } + } + if store.get_status_address().is_empty() { + if cfg.advertise_status_addr.is_empty() { + store.set_status_address(cfg.status_addr.clone()); + } else { + store.set_status_address(cfg.advertise_status_addr.clone()) + } + } + if store.get_version().is_empty() { + store.set_version(env!("CARGO_PKG_VERSION").to_string()); + } + + if let Ok(path) = std::env::current_exe() { + if let Some(path) = path.parent() { + store.set_deploy_path(path.to_string_lossy().to_string()); + } + }; + + store.set_start_timestamp(chrono::Local::now().timestamp()); + if store.get_git_hash().is_empty() { + store.set_git_hash( + option_env!("TIKV_BUILD_GIT_HASH") + .unwrap_or("Unknown git hash") + .to_string(), + ); + } + + let mut labels = Vec::new(); + for (k, v) in &cfg.labels { + let mut label = metapb::StoreLabel::default(); + label.set_key(k.to_owned()); + label.set_value(v.to_owned()); + labels.push(label); + } + store.set_labels(labels.into()); + store } /// A wrapper for the raftstore which runs Multi-Raft. 
@@ -123,58 +123,7 @@ where health_service: Option, default_store: Option, ) -> Node { - let mut store = match default_store { - None => metapb::Store::default(), - Some(s) => s, - }; - store.set_id(INVALID_ID); - if store.get_address().is_empty() { - if cfg.advertise_addr.is_empty() { - store.set_address(cfg.addr.clone()); - if store.get_peer_address().is_empty() { - store.set_peer_address(cfg.addr.clone()); - } - } else { - store.set_address(cfg.advertise_addr.clone()); - if store.get_peer_address().is_empty() { - store.set_peer_address(cfg.advertise_addr.clone()); - } - } - } - if store.get_status_address().is_empty() { - if cfg.advertise_status_addr.is_empty() { - store.set_status_address(cfg.status_addr.clone()); - } else { - store.set_status_address(cfg.advertise_status_addr.clone()) - } - } - if store.get_version().is_empty() { - store.set_version(env!("CARGO_PKG_VERSION").to_string()); - } - - if let Ok(path) = std::env::current_exe() { - if let Some(path) = path.parent() { - store.set_deploy_path(path.to_string_lossy().to_string()); - } - }; - - store.set_start_timestamp(chrono::Local::now().timestamp()); - if store.get_git_hash().is_empty() { - store.set_git_hash( - option_env!("TIKV_BUILD_GIT_HASH") - .unwrap_or("Unknown git hash") - .to_string(), - ); - } - - let mut labels = Vec::new(); - for (k, v) in &cfg.labels { - let mut label = metapb::StoreLabel::default(); - label.set_key(k.to_owned()); - label.set_value(v.to_owned()); - labels.push(label); - } - store.set_labels(labels.into()); + let store = init_store(default_store, cfg); Node { cluster_id: cfg.cluster_id, diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index 6c7169d043c..c50c42c9fc6 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -84,7 +84,7 @@ pub enum Error { Timeout(Duration), } -fn get_status_kind_from_engine_error(e: &kv::Error) -> RequestStatusKind { +pub fn get_status_kind_from_engine_error(e: &kv::Error) -> RequestStatusKind { match *e { 
KvError(box KvErrorInner::Request(ref header)) => { RequestStatusKind::from(storage::get_error_kind_from_header(header)) @@ -364,8 +364,8 @@ where type RaftExtension = RaftRouterWrap; #[inline] - fn raft_extension(&self) -> &Self::RaftExtension { - &self.router + fn raft_extension(&self) -> Self::RaftExtension { + self.router.clone() } fn modify_on_kv_engine( @@ -453,7 +453,7 @@ where if txn_extra.one_pc { flags |= WriteBatchFlags::ONE_PC.bits(); } - if txn_extra.for_flashback { + if txn_extra.allowed_in_flashback { flags |= WriteBatchFlags::FLASHBACK.bits(); } header.set_flags(flags); @@ -555,7 +555,7 @@ where flags |= WriteBatchFlags::STALE_READ.bits(); header.set_flag_data(data.into()); } - if ctx.for_flashback { + if ctx.allowed_in_flashback { flags |= WriteBatchFlags::FLASHBACK.bits(); } header.set_flags(flags); diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs new file mode 100644 index 00000000000..526a1fab3ca --- /dev/null +++ b/src/server/raftkv2/mod.rs @@ -0,0 +1,296 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+ +mod node; +mod raft_extension; + +use std::{ + mem, + pin::Pin, + sync::{Arc, RwLock}, + task::Poll, +}; + +use collections::HashSet; +use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; +use futures::{Future, Stream, StreamExt}; +use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, Request}; +pub use node::NodeV2; +use raftstore::store::RegionSnapshot; +use raftstore_v2::{ + router::{ + message::SimpleWrite, CmdResChannelBuilder, CmdResEvent, CmdResStream, PeerMsg, RaftRouter, + }, + SimpleWriteBinary, SimpleWriteEncoder, +}; +use tikv_kv::{Modify, WriteEvent}; +use tikv_util::{codec::number::NumberEncoder, time::Instant}; +use txn_types::{TxnExtra, TxnExtraScheduler, WriteBatchFlags}; + +use super::{ + metrics::{ASYNC_REQUESTS_COUNTER_VEC, ASYNC_REQUESTS_DURATIONS_VEC}, + raftkv::{get_status_kind_from_engine_error, new_request_header}, +}; + +struct Transform { + resp: CmdResStream, + early_err: Option, +} + +impl Stream for Transform { + type Item = WriteEvent; + + fn poll_next( + self: Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + let stream = self.get_mut(); + if stream.early_err.is_some() { + return Poll::Ready(Some(WriteEvent::Finished(Err(stream + .early_err + .take() + .unwrap())))); + } + match stream.resp.poll_next_unpin(cx) { + Poll::Pending => Poll::Pending, + Poll::Ready(Some(CmdResEvent::Proposed)) => Poll::Ready(Some(WriteEvent::Proposed)), + Poll::Ready(Some(CmdResEvent::Committed)) => Poll::Ready(Some(WriteEvent::Committed)), + Poll::Ready(Some(CmdResEvent::Finished(mut resp))) => { + let res = if !resp.get_header().has_error() { + Ok(()) + } else { + Err(tikv_kv::Error::from(resp.take_header().take_error())) + }; + Poll::Ready(Some(WriteEvent::Finished(res))) + } + Poll::Ready(None) => Poll::Ready(None), + } + } +} + +fn modifies_to_simple_write(modifies: Vec) -> SimpleWriteBinary { + let mut encoder = SimpleWriteEncoder::with_capacity(128); + for m in modifies { + match m { + Modify::Put(cf, k, v) => encoder.put(cf, 
k.as_encoded(), &v), + Modify::Delete(cf, k) => encoder.delete(cf, k.as_encoded()), + Modify::PessimisticLock(k, lock) => { + encoder.put(CF_LOCK, k.as_encoded(), &lock.into_lock().to_bytes()) + } + Modify::DeleteRange(cf, start_key, end_key, notify_only) => encoder.delete_range( + cf, + start_key.as_encoded(), + end_key.as_encoded(), + notify_only, + ), + } + } + encoder.encode() +} + +#[derive(Clone)] +pub struct RaftKv2 { + router: RaftRouter, + txn_extra_scheduler: Option>, + region_leaders: Arc>>, +} + +impl RaftKv2 { + #[allow(unused)] + pub fn new( + router: RaftRouter, + region_leaders: Arc>>, + ) -> RaftKv2 { + RaftKv2 { + router, + region_leaders, + txn_extra_scheduler: None, + } + } + + pub fn set_txn_extra_scheduler(&mut self, txn_extra_scheduler: Arc) { + self.txn_extra_scheduler = Some(txn_extra_scheduler); + } +} + +impl tikv_kv::Engine for RaftKv2 { + type Snap = RegionSnapshot; + type Local = EK; + + #[inline] + fn kv_engine(&self) -> Option { + None + } + + type RaftExtension = raft_extension::Extension; + #[inline] + fn raft_extension(&self) -> Self::RaftExtension { + raft_extension::Extension::new(self.router.store_router().clone()) + } + + fn modify_on_kv_engine( + &self, + region_modifies: collections::HashMap>, + ) -> tikv_kv::Result<()> { + for (region_id, batch) in region_modifies { + let bin = modifies_to_simple_write(batch); + let _ = self.router.send(region_id, PeerMsg::unsafe_write(bin)); + } + Ok(()) + } + + type SnapshotRes = impl Future> + Send; + fn async_snapshot(&mut self, mut ctx: tikv_kv::SnapContext<'_>) -> Self::SnapshotRes { + let mut req = Request::default(); + req.set_cmd_type(CmdType::Snap); + if !ctx.key_ranges.is_empty() && ctx.start_ts.map_or(false, |ts| !ts.is_zero()) { + req.mut_read_index() + .set_start_ts(ctx.start_ts.as_ref().unwrap().into_inner()); + req.mut_read_index() + .set_key_ranges(mem::take(&mut ctx.key_ranges).into()); + } + ASYNC_REQUESTS_COUNTER_VEC.snapshot.all.inc(); + let begin_instant = 
Instant::now_coarse(); + + let mut header = new_request_header(ctx.pb_ctx); + let mut flags = 0; + if ctx.pb_ctx.get_stale_read() && ctx.start_ts.map_or(true, |ts| !ts.is_zero()) { + let mut data = [0u8; 8]; + (&mut data[..]) + .encode_u64(ctx.start_ts.unwrap_or_default().into_inner()) + .unwrap(); + flags |= WriteBatchFlags::STALE_READ.bits(); + header.set_flag_data(data.into()); + } + if ctx.allowed_in_flashback { + flags |= WriteBatchFlags::FLASHBACK.bits(); + } + header.set_flags(flags); + + let mut cmd = RaftCmdRequest::default(); + cmd.set_header(header); + cmd.set_requests(vec![req].into()); + let f = self.router.snapshot(cmd); + async move { + let res = f.await; + match res { + Ok(snap) => { + ASYNC_REQUESTS_DURATIONS_VEC + .snapshot + .observe(begin_instant.saturating_elapsed_secs()); + ASYNC_REQUESTS_COUNTER_VEC.snapshot.success.inc(); + Ok(snap) + } + Err(mut resp) => { + if resp + .get_responses() + .get(0) + .map_or(false, |r| r.get_read_index().has_locked()) + { + let locked = resp.mut_responses()[0].mut_read_index().take_locked(); + Err(tikv_kv::Error::from(tikv_kv::ErrorInner::KeyIsLocked( + locked, + ))) + } else if resp.get_header().has_error() { + let err = tikv_kv::Error::from(resp.take_header().take_error()); + let status_kind = get_status_kind_from_engine_error(&err); + ASYNC_REQUESTS_COUNTER_VEC.snapshot.get(status_kind).inc(); + Err(err) + } else { + Err(box_err!("unexpected response: {:?}", resp)) + } + } + } + } + } + + type WriteRes = impl Stream + Send + Unpin; + fn async_write( + &self, + ctx: &kvproto::kvrpcpb::Context, + batch: tikv_kv::WriteData, + subscribed: u8, + on_applied: Option, + ) -> Self::WriteRes { + let region_id = ctx.region_id; + ASYNC_REQUESTS_COUNTER_VEC.write.all.inc(); + let begin_instant = Instant::now_coarse(); + let mut header = Box::new(new_request_header(ctx)); + let mut flags = 0; + if batch.extra.one_pc { + flags |= WriteBatchFlags::ONE_PC.bits(); + } + if batch.extra.allowed_in_flashback { + flags |= 
WriteBatchFlags::FLASHBACK.bits(); + } + header.set_flags(flags); + + self.schedule_txn_extra(batch.extra); + let data = modifies_to_simple_write(batch.modifies); + let mut builder = CmdResChannelBuilder::default(); + if WriteEvent::subscribed_proposed(subscribed) { + builder.subscribe_proposed(); + } + if WriteEvent::subscribed_committed(subscribed) { + builder.subscribe_committed(); + } + if let Some(cb) = on_applied { + builder.before_set(move |resp| { + let mut res = if !resp.get_header().has_error() { + Ok(()) + } else { + Err(tikv_kv::Error::from(resp.get_header().get_error().clone())) + }; + cb(&mut res); + }); + } + let (ch, sub) = builder.build(); + let msg = PeerMsg::SimpleWrite(SimpleWrite { + header, + data, + ch, + send_time: Instant::now_coarse(), + }); + let res = self + .router + .store_router() + .check_send(region_id, msg) + .map_err(tikv_kv::Error::from); + (Transform { + resp: CmdResStream::new(sub), + early_err: res.err(), + }) + .inspect(move |ev| { + let WriteEvent::Finished(res) = ev else { return }; + match res { + Ok(()) => { + ASYNC_REQUESTS_COUNTER_VEC.write.success.inc(); + ASYNC_REQUESTS_DURATIONS_VEC + .write + .observe(begin_instant.saturating_elapsed_secs()); + } + Err(e) => { + let status_kind = get_status_kind_from_engine_error(e); + ASYNC_REQUESTS_COUNTER_VEC.write.get(status_kind).inc(); + } + } + }) + } + + #[inline] + fn precheck_write_with_ctx(&self, ctx: &kvproto::kvrpcpb::Context) -> tikv_kv::Result<()> { + let region_id = ctx.get_region_id(); + match self.region_leaders.read().unwrap().get(®ion_id) { + Some(_) => Ok(()), + None => Err(raftstore_v2::Error::NotLeader(region_id, None).into()), + } + } + + #[inline] + fn schedule_txn_extra(&self, txn_extra: TxnExtra) { + if let Some(tx) = self.txn_extra_scheduler.as_ref() { + if !txn_extra.is_empty() { + tx.schedule(txn_extra); + } + } + } +} diff --git a/src/server/raftkv2/node.rs b/src/server/raftkv2/node.rs new file mode 100644 index 00000000000..ed6f16e8bec --- /dev/null 
+++ b/src/server/raftkv2/node.rs @@ -0,0 +1,238 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use std::sync::{Arc, Mutex}; + +use causal_ts::CausalTsProviderImpl; +use concurrency_manager::ConcurrencyManager; +use engine_traits::{KvEngine, RaftEngine, TabletContext, TabletRegistry}; +use kvproto::{metapb, replication_modepb::ReplicationStatus}; +use pd_client::PdClient; +use raftstore::{ + coprocessor::CoprocessorHost, + store::{GlobalReplicationState, TabletSnapManager, Transport, RAFT_INIT_LOG_INDEX}, +}; +use raftstore_v2::{router::RaftRouter, Bootstrap, PdTask, StoreSystem}; +use slog::{info, o, Logger}; +use tikv_util::{ + config::VersionTrack, + worker::{LazyWorker, Worker}, +}; + +use crate::server::{node::init_store, Result}; + +// TODO: we will rename another better name like RaftStore later. +pub struct NodeV2 { + cluster_id: u64, + store: metapb::Store, + system: Option<(RaftRouter, StoreSystem)>, + has_started: bool, + + pd_client: Arc, + registry: TabletRegistry, + logger: Logger, +} + +impl NodeV2 +where + C: PdClient, + EK: KvEngine, + ER: RaftEngine, +{ + /// Creates a new Node. 
+ pub fn new( + cfg: &crate::server::Config, + pd_client: Arc, + store: Option, + registry: TabletRegistry, + ) -> NodeV2 { + let store = init_store(store, cfg); + + NodeV2 { + cluster_id: cfg.cluster_id, + store, + pd_client, + system: None, + has_started: false, + registry, + logger: slog_global::borrow_global().new(o!()), + } + } + + pub fn try_bootstrap_store( + &mut self, + cfg: &raftstore_v2::Config, + raft_engine: &ER, + ) -> Result<()> { + let store_id = Bootstrap::new( + raft_engine, + self.cluster_id, + &*self.pd_client, + self.logger.clone(), + ) + .bootstrap_store()?; + self.store.set_id(store_id); + let (router, system) = + raftstore_v2::create_store_batch_system(cfg, store_id, self.logger.clone()); + self.system = Some(( + RaftRouter::new(store_id, self.registry.clone(), router), + system, + )); + Ok(()) + } + + pub fn router(&self) -> &RaftRouter { + &self.system.as_ref().unwrap().0 + } + + /// Starts the Node. It tries to bootstrap cluster if the cluster is not + /// bootstrapped yet. Then it spawns a thread to run the raftstore in + /// background. + pub fn start( + &mut self, + raft_engine: ER, + trans: T, + snap_mgr: TabletSnapManager, + concurrency_manager: ConcurrencyManager, + causal_ts_provider: Option>, // used for rawkv apiv2 + coprocessor_host: CoprocessorHost, + background: Worker, + pd_worker: LazyWorker, + store_cfg: Arc>, + state: &Mutex, + ) -> Result<()> + where + T: Transport + 'static, + { + let store_id = self.id(); + if let Some(region) = Bootstrap::new( + &raft_engine, + self.cluster_id, + &*self.pd_client, + self.logger.clone(), + ) + .bootstrap_first_region(&self.store, store_id)? + { + let path = self + .registry + .tablet_path(region.get_id(), RAFT_INIT_LOG_INDEX); + let ctx = TabletContext::new(®ion, Some(RAFT_INIT_LOG_INDEX)); + // TODO: make follow line can recover from abort. + self.registry + .tablet_factory() + .open_tablet(ctx, &path) + .unwrap(); + } + + // Put store only if the cluster is bootstrapped. 
+ info!(self.logger, "put store to PD"; "store" => ?&self.store); + let status = self.pd_client.put_store(self.store.clone())?; + self.load_all_stores(state, status); + + self.start_store( + raft_engine, + trans, + snap_mgr, + concurrency_manager, + causal_ts_provider, + coprocessor_host, + background, + pd_worker, + store_cfg, + )?; + + Ok(()) + } + + /// Gets the store id. + pub fn id(&self) -> u64 { + self.store.get_id() + } + + pub fn logger(&self) -> Logger { + self.logger.clone() + } + + /// Gets a copy of Store which is registered to Pd. + pub fn store(&self) -> metapb::Store { + self.store.clone() + } + + // TODO: support updating dynamic configuration. + + // TODO: check api version. + // Do we really need to do the check giving we don't consider support upgrade + // ATM? + + fn load_all_stores( + &mut self, + state: &Mutex, + status: Option, + ) { + info!(self.logger, "initializing replication mode"; "status" => ?status, "store_id" => self.store.id); + let stores = match self.pd_client.get_all_stores(false) { + Ok(stores) => stores, + Err(e) => panic!("failed to load all stores: {:?}", e), + }; + let mut state = state.lock().unwrap(); + if let Some(s) = status { + state.set_status(s); + } + for mut store in stores { + state + .group + .register_store(store.id, store.take_labels().into()); + } + } + + fn start_store( + &mut self, + raft_engine: ER, + trans: T, + snap_mgr: TabletSnapManager, + concurrency_manager: ConcurrencyManager, + causal_ts_provider: Option>, // used for rawkv apiv2 + coprocessor_host: CoprocessorHost, + background: Worker, + pd_worker: LazyWorker, + store_cfg: Arc>, + ) -> Result<()> + where + T: Transport + 'static, + { + let store_id = self.store.get_id(); + info!(self.logger, "start raft store thread"; "store_id" => store_id); + + if self.has_started { + return Err(box_err!("{} is already started", store_id)); + } + self.has_started = true; + + let (router, system) = self.system.as_mut().unwrap(); + + system.start( + store_id, + 
store_cfg, + raft_engine, + self.registry.clone(), + trans, + self.pd_client.clone(), + router.store_router(), + router.store_meta().clone(), + snap_mgr, + concurrency_manager, + causal_ts_provider, + coprocessor_host, + background, + pd_worker, + )?; + Ok(()) + } + + /// Stops the Node. + pub fn stop(&mut self) { + let store_id = self.store.get_id(); + let Some((_, mut system)) = self.system.take() else { return }; + info!(self.logger, "stop raft store thread"; "store_id" => store_id); + system.shutdown(); + } +} diff --git a/src/server/raftkv2/raft_extension.rs b/src/server/raftkv2/raft_extension.rs new file mode 100644 index 00000000000..f2f433999b9 --- /dev/null +++ b/src/server/raftkv2/raft_extension.rs @@ -0,0 +1,109 @@ +// Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. + +use engine_traits::{KvEngine, RaftEngine}; +use kvproto::raft_serverpb::RaftMessage; +use raftstore_v2::{ + router::{DebugInfoChannel, PeerMsg, StoreMsg}, + StoreRouter, +}; + +#[derive(Clone)] +pub struct Extension { + router: StoreRouter, +} + +impl Extension { + pub fn new(router: StoreRouter) -> Self { + Extension { router } + } +} + +impl tikv_kv::RaftExtension for Extension { + #[inline] + fn feed(&self, msg: RaftMessage, key_message: bool) { + let region_id = msg.get_region_id(); + let msg_ty = msg.get_message().get_msg_type(); + // Channel full and region not found are ignored unless it's a key message. + if let Err(e) = self.router.send_raft_message(Box::new(msg)) && key_message { + error!("failed to send raft message"; "region_id" => region_id, "msg_ty" => ?msg_ty, "err" => ?e); + } + } + + #[inline] + fn report_reject_message(&self, _region_id: u64, _from_peer_id: u64) { + // TODO:reject the message on connection side instead of go through + // raft layer. 
+ } + + #[inline] + fn report_peer_unreachable(&self, region_id: u64, to_peer_id: u64) { + let _ = self + .router + .send(region_id, PeerMsg::PeerUnreachable { to_peer_id }); + } + + #[inline] + fn report_store_unreachable(&self, to_store_id: u64) { + let _ = self + .router + .send_control(StoreMsg::StoreUnreachable { to_store_id }); + } + + fn report_snapshot_status( + &self, + region_id: u64, + to_peer_id: u64, + status: raft::SnapshotStatus, + ) { + let _ = self + .router + .force_send(region_id, PeerMsg::SnapshotSent { to_peer_id, status }); + } + + fn report_resolved(&self, _store_id: u64, _group_id: u64) { + // TODO: support commit group + } + + fn split( + &self, + region_id: u64, + region_epoch: kvproto::metapb::RegionEpoch, + split_keys: Vec>, + source: String, + ) -> futures::future::BoxFuture<'static, tikv_kv::Result>> { + let (msg, sub) = PeerMsg::request_split(region_epoch, split_keys, source); + let res = self.router.check_send(region_id, msg); + Box::pin(async move { + res?; + let mut resp = match sub.result().await { + Some(r) => r, + None => return Err(box_err!("split is aborted")), + }; + if !resp.get_header().has_error() { + let regions = resp.mut_admin_response().mut_splits().take_regions(); + Ok(regions.into()) + } else { + Err(tikv_kv::Error::from(resp.mut_header().take_error())) + } + }) + } + + fn query_region( + &self, + region_id: u64, + ) -> futures::future::BoxFuture< + 'static, + tikv_kv::Result, + > { + let (ch, sub) = DebugInfoChannel::pair(); + let msg = PeerMsg::QueryDebugInfo(ch); + let res = self.router.check_send(region_id, msg); + Box::pin(async move { + res?; + match sub.result().await { + Some(res) => Ok(res), + None => Err(box_err!("query region is aborted")), + } + }) + } +} diff --git a/src/server/server.rs b/src/server/server.rs index 1921483e37b..4c1f5e7ef69 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -13,7 +13,7 @@ use futures::{compat::Stream01CompatExt, stream::StreamExt}; use 
grpcio::{ChannelBuilder, Environment, ResourceQuota, Server as GrpcServer, ServerBuilder}; use grpcio_health::{create_health, HealthService, ServingStatus}; use kvproto::tikvpb::*; -use raftstore::store::{CheckLeaderTask, SnapManager}; +use raftstore::store::{CheckLeaderTask, SnapManager, TabletSnapManager}; use security::SecurityManager; use tikv_util::{ config::VersionTrack, @@ -39,7 +39,7 @@ use crate::{ coprocessor::Endpoint, coprocessor_v2, read_pool::ReadPool, - server::{gc_worker::GcWorker, Proxy}, + server::{gc_worker::GcWorker, tablet_snap::TabletRunner, Proxy}, storage::{lock_manager::LockManager, Engine, Storage}, tikv_util::sys::thread::ThreadBuildWrapper, }; @@ -67,7 +67,7 @@ pub struct Server { trans: ServerTransport, raft_router: E::RaftExtension, // For sending/receiving snapshots. - snap_mgr: SnapManager, + snap_mgr: Either, snap_worker: LazyWorker, // Currently load statistics is done in the thread. @@ -94,7 +94,7 @@ where copr: Endpoint, copr_v2: coprocessor_v2::Endpoint, resolver: S, - snap_mgr: SnapManager, + snap_mgr: Either, gc_worker: GcWorker, check_leader_scheduler: Scheduler, env: Arc, @@ -122,7 +122,7 @@ where let snap_worker = Worker::new("snap-handler"); let lazy_worker = snap_worker.lazy_build("snap-handler"); - let raft_ext = storage.get_engine().raft_extension().clone(); + let raft_ext = storage.get_engine().raft_extension(); let proxy = Proxy::new(security_mgr.clone(), &env, Arc::new(cfg.value().clone())); let kv_service = KvService::new( @@ -252,14 +252,28 @@ where cfg: Arc>, security_mgr: Arc, ) -> Result<()> { - let snap_runner = SnapHandler::new( - Arc::clone(&self.env), - self.snap_mgr.clone(), - self.raft_router.clone(), - security_mgr, - Arc::clone(&cfg), - ); - self.snap_worker.start(snap_runner); + match self.snap_mgr.clone() { + Either::Left(mgr) => { + let snap_runner = SnapHandler::new( + self.env.clone(), + mgr, + self.raft_router.clone(), + security_mgr, + cfg, + ); + self.snap_worker.start(snap_runner); + } + 
Either::Right(mgr) => { + let snap_runner = TabletRunner::new( + self.env.clone(), + mgr, + self.raft_router.clone(), + security_mgr, + cfg, + ); + self.snap_worker.start(snap_runner); + } + } let mut grpc_server = self.builder_or_server.take().unwrap().right().unwrap(); info!("listening on addr"; "addr" => &self.local_addr); @@ -564,7 +578,7 @@ mod tests { quick_fail: Arc::clone(&quick_fail), addr: Arc::clone(&addr), }, - SnapManager::new(""), + Either::Left(SnapManager::new("")), gc_worker, check_leader_scheduler, env, diff --git a/src/server/service/debug.rs b/src/server/service/debug.rs index ae0d53bacda..e0ec9173ad5 100644 --- a/src/server/service/debug.rs +++ b/src/server/service/debug.rs @@ -1,7 +1,9 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. -use engine_rocks::RocksEngine; -use engine_traits::{Engines, MiscExt, RaftEngine}; +use std::sync::Arc; + +use engine_rocks::{RocksEngine, RocksStatistics}; +use engine_traits::{Engines, RaftEngine}; use futures::{ future::{Future, FutureExt, TryFutureExt}, sink::SinkExt, @@ -54,11 +56,15 @@ impl Service { /// `GcWorker`. 
pub fn new( engines: Engines, + kv_statistics: Option>, + raft_statistics: Option>, pool: Handle, raft_router: T, cfg_controller: ConfigController, ) -> Self { - let debugger = Debugger::new(engines, cfg_controller); + let mut debugger = Debugger::new(engines, cfg_controller); + debugger.set_kv_statistics(kv_statistics); + debugger.set_raft_statistics(raft_statistics); Service { pool, debugger, @@ -353,9 +359,8 @@ impl debugpb::Debug for Service NicSnapshot { NicSnapshot { - rx_bytes: data.get_total_received(), - tx_bytes: data.get_total_transmitted(), - rx_packets: data.get_total_packets_received(), - tx_packets: data.get_total_packets_transmitted(), - rx_errors: data.get_total_errors_on_received(), - tx_errors: data.get_total_errors_on_transmitted(), + rx_bytes: data.total_received(), + tx_bytes: data.total_transmitted(), + rx_packets: data.total_packets_received(), + tx_packets: data.total_packets_transmitted(), + rx_errors: data.total_errors_on_received(), + tx_errors: data.total_errors_on_transmitted(), } } @@ -62,7 +62,7 @@ fn cpu_load_info(prev_cpu: CpuTimeSnapshot, collector: &mut Vec) let infos = { let mut system = SYS_INFO.lock().unwrap(); system.refresh_system(); - let load = system.get_load_average(); + let load = system.load_average(); vec![ ("load1", load.one), ("load5", load.five), @@ -129,12 +129,12 @@ fn cpu_load_info(prev_cpu: CpuTimeSnapshot, collector: &mut Vec) fn mem_load_info(collector: &mut Vec) { let mut system = SYS_INFO.lock().unwrap(); system.refresh_memory(); - let total_memory = system.get_total_memory() * KIB; - let used_memory = system.get_used_memory() * KIB; - let free_memory = system.get_free_memory() * KIB; - let total_swap = system.get_total_swap() * KIB; - let used_swap = system.get_used_swap() * KIB; - let free_swap = system.get_free_swap() * KIB; + let total_memory = system.total_memory() * KIB; + let used_memory = system.used_memory() * KIB; + let free_memory = system.free_memory() * KIB; + let total_swap = 
system.total_swap() * KIB; + let used_swap = system.used_swap() * KIB; + let free_swap = system.free_swap() * KIB; drop(system); let used_memory_pct = (used_memory as f64) / (total_memory as f64); let free_memory_pct = (free_memory as f64) / (total_memory as f64); @@ -182,7 +182,7 @@ fn nic_load_info(prev_nic: HashMap, collector: &mut Vec) { let mut system = SYS_INFO.lock().unwrap(); system.refresh_cpu(); - let processor = match system.get_processors().iter().next() { + let processor = match system.cpus().iter().next() { Some(p) => p, None => return, }; let mut infos = vec![ ("cpu-logical-cores", SysQuota::cpu_cores_quota().to_string()), ("cpu-physical-cores", num_cpus::get_physical().to_string()), - ("cpu-frequency", format!("{}MHz", processor.get_frequency())), - ("cpu-vendor-id", processor.get_vendor_id().to_string()), + ("cpu-frequency", format!("{}MHz", processor.frequency())), + ("cpu-vendor-id", processor.vendor_id().to_string()), ]; // Depend on Rust lib return CPU arch not matching // Golang lib so need this match loop to conversion @@ -362,26 +362,23 @@ fn disk_hardware_info(collector: &mut Vec) { let mut system = SYS_INFO.lock().unwrap(); system.refresh_disks_list(); system.refresh_disks(); - let disks = system.get_disks(); + let disks = system.disks(); for disk in disks { - let file_sys = std::str::from_utf8(disk.get_file_system()).unwrap_or("unknown"); + let file_sys = std::str::from_utf8(disk.file_system()).unwrap_or("unknown"); if file_sys == "rootfs" { continue; } - let total = disk.get_total_space(); - let free = disk.get_available_space(); + let total = disk.total_space(); + let free = disk.available_space(); let used = total - free; let free_pct = (free as f64) / (total as f64); let used_pct = (used as f64) / (total as f64); let infos = vec![ - ("type", format!("{:?}", disk.get_type())), + ("type", format!("{:?}", disk.type_())), ("fstype", file_sys.to_string()), ( "path", - disk.get_mount_point() - .to_str() - .unwrap_or("unknown") - 
.to_string(), + disk.mount_point().to_str().unwrap_or("unknown").to_string(), ), ("total", total.to_string()), ("free", free.to_string()), @@ -398,7 +395,7 @@ fn disk_hardware_info(collector: &mut Vec) { } let mut item = ServerInfoItem::default(); item.set_tp("disk".to_string()); - item.set_name(disk.get_name().to_str().unwrap_or("disk").to_string()); + item.set_name(disk.name().to_str().unwrap_or("disk").to_string()); item.set_pairs(pairs.into()); collector.push(item); } @@ -515,7 +512,7 @@ fn get_transparent_hugepage() -> Option { pub fn process_info(collector: &mut Vec) { let mut system = SYS_INFO.lock().unwrap(); system.refresh_processes(); - let processes = system.get_processes(); + let processes = system.processes(); for (pid, p) in processes.iter() { if p.cmd().is_empty() { continue; @@ -555,7 +552,7 @@ mod tests { system.refresh_networks_list(); system.refresh_all(); system - .get_networks() + .networks() .into_iter() .map(|(n, d)| (n.to_owned(), NicSnapshot::from_network_data(d))) .collect() diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index db50dfe459e..6c85741f64a 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -65,6 +65,7 @@ const GRPC_MSG_NOTIFY_SIZE: usize = 8; pub struct Service { store_id: u64, /// Used to handle requests related to GC. + // TODO: make it Some after GC is supported for v2. gc_worker: GcWorker, // For handling KV requests. 
storage: Storage, @@ -590,7 +591,7 @@ impl Tikv for Service { sink: ClientStreamingSink, ) { let store_id = self.store_id; - let ch = self.storage.get_engine().raft_extension().clone(); + let ch = self.storage.get_engine().raft_extension(); let reject_messages_on_memory_ratio = self.reject_messages_on_memory_ratio; let res = async move { @@ -633,7 +634,7 @@ impl Tikv for Service { ) { info!("batch_raft RPC is called, new gRPC stream established"); let store_id = self.store_id; - let ch = self.storage.get_engine().raft_extension().clone(); + let ch = self.storage.get_engine().raft_extension(); let reject_messages_on_memory_ratio = self.reject_messages_on_memory_ratio; let res = async move { diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index 78302550fd5..2f87c5d0264 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -4,7 +4,6 @@ mod profile; use std::{ error::Error as StdError, - marker::PhantomData, net::SocketAddr, path::PathBuf, pin::Pin, @@ -16,7 +15,6 @@ use std::{ use async_stream::stream; use collections::HashMap; -use engine_traits::KvEngine; use flate2::{write::GzEncoder, Compression}; use futures::{ compat::{Compat01As03, Stream01CompatExt}, @@ -45,10 +43,10 @@ pub use profile::{ read_file, start_one_cpu_profile, start_one_heap_profile, }; use prometheus::TEXT_FORMAT; -use raftstore::store::{transport::CasualRouter, CasualMessage}; use regex::Regex; use security::{self, SecurityConfig}; use serde_json::Value; +use tikv_kv::RaftExtension; use tikv_util::{ logger::set_log_level, metrics::{dump, dump_to}, @@ -82,7 +80,7 @@ struct LogLevelRequest { pub log_level: LogLevel, } -pub struct StatusServer { +pub struct StatusServer { thread_pool: Runtime, tx: Sender<()>, rx: Option>, @@ -91,12 +89,10 @@ pub struct StatusServer { router: R, security_config: Arc, store_path: PathBuf, - _snap: PhantomData, } -impl StatusServer +impl StatusServer where - E: 'static, R: 'static + Send, { pub fn new( @@ 
-124,7 +120,6 @@ where router, security_config, store_path, - _snap: PhantomData, }) } @@ -423,10 +418,9 @@ where } } -impl StatusServer +impl StatusServer where - E: KvEngine, - R: 'static + Send + CasualRouter + Clone, + R: 'static + Send + RaftExtension + Clone, { pub async fn dump_region_meta(req: Request, router: R) -> hyper::Result> { lazy_static! { @@ -451,33 +445,18 @@ where )); } }; - let (tx, rx) = oneshot::channel(); - match router.send( - id, - CasualMessage::AccessPeer(Box::new(move |meta| { - if let Err(meta) = tx.send(meta) { - error!("receiver dropped, region meta: {:?}", meta) - } - })), - ) { - Ok(_) => (), - Err(raftstore::Error::RegionNotFound(_)) => { + let f = router.query_region(id); + let meta = match f.await { + Ok(meta) => meta, + Err(tikv_kv::Error(box tikv_kv::ErrorInner::Request(header))) + if header.has_region_not_found() => + { return not_found(format!("region({}) not found", id)); } Err(err) => { return Ok(make_response( StatusCode::INTERNAL_SERVER_ERROR, - format!("channel pending or disconnect: {}", err), - )); - } - } - - let meta = match rx.await { - Ok(meta) => meta, - Err(_) => { - return Ok(make_response( - StatusCode::INTERNAL_SERVER_ERROR, - "query cancelled", + format!("query failed: {}", err), )); } }; @@ -938,17 +917,21 @@ mod tests { use std::{env, io::Read, path::PathBuf, sync::Arc}; use collections::HashSet; - use engine_test::kv::KvTestEngine; use flate2::read::GzDecoder; - use futures::{executor::block_on, future::ok, prelude::*}; + use futures::{ + executor::block_on, + future::{ok, BoxFuture}, + prelude::*, + }; use http::header::{HeaderValue, ACCEPT_ENCODING}; use hyper::{body::Buf, client::HttpConnector, Body, Client, Method, Request, StatusCode, Uri}; use hyper_openssl::HttpsConnector; use online_config::OnlineConfig; use openssl::ssl::{SslConnector, SslFiletype, SslMethod}; - use raftstore::store::{transport::CasualRouter, CasualMessage}; + use raftstore::store::region_meta::RegionMeta; use 
security::SecurityConfig; use test_util::new_security_cfg; + use tikv_kv::RaftExtension; use tikv_util::logger::get_log_level; use crate::{ @@ -959,9 +942,9 @@ mod tests { #[derive(Clone)] struct MockRouter; - impl CasualRouter for MockRouter { - fn send(&self, region_id: u64, _: CasualMessage) -> raftstore::Result<()> { - Err(raftstore::Error::RegionNotFound(region_id)) + impl RaftExtension for MockRouter { + fn query_region(&self, region_id: u64) -> BoxFuture<'static, tikv_kv::Result> { + Box::pin(async move { Err(raftstore::Error::RegionNotFound(region_id).into()) }) } } diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs index 5dd83deb092..b5d989d5370 100644 --- a/src/server/tablet_snap.rs +++ b/src/server/tablet_snap.rs @@ -493,7 +493,7 @@ mod tests { msg.mut_message().mut_snapshot().mut_metadata().set_term(1); let send_path = TempDir::new().unwrap(); let send_snap_mgr = - TabletSnapManager::new(send_path.path().join("snap_dir").to_str().unwrap()); + TabletSnapManager::new(send_path.path().join("snap_dir").to_str().unwrap()).unwrap(); let snap_path = send_snap_mgr.tablet_gen_path(&snap_key); create_dir_all(snap_path.as_path()).unwrap(); // send file should skip directory @@ -512,7 +512,7 @@ mod tests { let recv_path = TempDir::new().unwrap(); let recv_snap_manager = - TabletSnapManager::new(recv_path.path().join("snap_dir").to_str().unwrap()); + TabletSnapManager::new(recv_path.path().join("snap_dir").to_str().unwrap()).unwrap(); let (tx, rx) = mpsc::unbounded(); let sink = tx.sink_map_err(Error::from); block_on(send_snap_files( diff --git a/src/storage/config.rs b/src/storage/config.rs index 313f86ba048..68d739c1639 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -31,12 +31,21 @@ const DEFAULT_SCHED_PENDING_WRITE_MB: u64 = 100; const DEFAULT_RESERVED_SPACE_GB: u64 = 5; const DEFAULT_RESERVED_RAFT_SPACE_GB: u64 = 1; +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "kebab-case")] +pub enum 
EngineType { + RaftKv, + RaftKv2, +} + #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, OnlineConfig)] #[serde(default)] #[serde(rename_all = "kebab-case")] pub struct Config { #[online_config(skip)] pub data_dir: String, + #[online_config(skip)] + pub engine: EngineType, // Replaced by `GcConfig.ratio_threshold`. Keep it for backward compatibility. #[online_config(skip)] pub gc_ratio_threshold: f64, @@ -75,6 +84,7 @@ impl Default for Config { let cpu_num = SysQuota::cpu_cores_quota(); Config { data_dir: DEFAULT_DATA_DIR.to_owned(), + engine: EngineType::RaftKv, gc_ratio_threshold: DEFAULT_GC_RATIO_THRESHOLD, max_key_size: DEFAULT_MAX_KEY_SIZE, scheduler_concurrency: DEFAULT_SCHED_CONCURRENCY, @@ -194,7 +204,7 @@ impl Default for FlowControlConfig { #[serde(rename_all = "kebab-case")] pub struct BlockCacheConfig { #[online_config(skip)] - pub shared: bool, + pub shared: Option, pub capacity: Option, #[online_config(skip)] pub num_shard_bits: i32, @@ -209,7 +219,7 @@ pub struct BlockCacheConfig { impl Default for BlockCacheConfig { fn default() -> BlockCacheConfig { BlockCacheConfig { - shared: true, + shared: None, capacity: None, num_shard_bits: 6, strict_capacity_limit: false, @@ -229,9 +239,9 @@ impl BlockCacheConfig { } } - pub fn build_shared_cache(&self) -> Option { - if !self.shared { - return None; + pub fn build_shared_cache(&self) -> Cache { + if self.shared == Some(false) { + warn!("storage.block-cache.shared is deprecated, cache is always shared."); } let capacity = match self.capacity { None => { @@ -248,7 +258,7 @@ impl BlockCacheConfig { if let Some(allocator) = self.new_memory_allocator() { cache_opts.set_memory_allocator(allocator); } - Some(Cache::new_lru_cache(cache_opts)) + Cache::new_lru_cache(cache_opts) } fn new_memory_allocator(&self) -> Option { diff --git a/src/storage/config_manager.rs b/src/storage/config_manager.rs index de3b13408f0..b6a5f9d58ab 100644 --- a/src/storage/config_manager.rs +++ b/src/storage/config_manager.rs @@ 
-4,7 +4,7 @@ use std::{convert::TryInto, sync::Arc}; -use engine_traits::{KvEngine, TabletFactory, CF_DEFAULT}; +use engine_traits::{ALL_CFS, CF_DEFAULT}; use file_system::{get_io_rate_limiter, IoPriority, IoType}; use online_config::{ConfigChange, ConfigManager, ConfigValue, Result as CfgResult}; use strum::IntoEnumIterator; @@ -15,32 +15,30 @@ use tikv_util::{ }; use crate::{ + config::ConfigurableDb, server::{ttl::TtlCheckerTask, CONFIG_ROCKSDB_GAUGE}, storage::{lock_manager::LockManager, txn::flow_controller::FlowController, TxnScheduler}, }; -pub struct StorageConfigManger { - tablet_factory: Arc + Send + Sync>, - shared_block_cache: bool, +pub struct StorageConfigManger { + configurable_db: K, ttl_checker_scheduler: Scheduler, flow_controller: Arc, scheduler: TxnScheduler, } -unsafe impl Send for StorageConfigManger {} -unsafe impl Sync for StorageConfigManger {} +unsafe impl Send for StorageConfigManger {} +unsafe impl Sync for StorageConfigManger {} -impl StorageConfigManger { +impl StorageConfigManger { pub fn new( - tablet_factory: Arc + Send + Sync>, - shared_block_cache: bool, + configurable_db: K, ttl_checker_scheduler: Scheduler, flow_controller: Arc, scheduler: TxnScheduler, ) -> Self { StorageConfigManger { - tablet_factory, - shared_block_cache, + configurable_db, ttl_checker_scheduler, flow_controller, scheduler, @@ -48,16 +46,16 @@ impl StorageConfigManger { } } -impl ConfigManager for StorageConfigManger { +impl ConfigManager + for StorageConfigManger +{ fn dispatch(&mut self, mut change: ConfigChange) -> CfgResult<()> { if let Some(ConfigValue::Module(mut block_cache)) = change.remove("block_cache") { - if !self.shared_block_cache { - return Err("shared block cache is disabled".into()); - } if let Some(size) = block_cache.remove("capacity") { if size != ConfigValue::None { let s: ReadableSize = size.into(); - self.tablet_factory.set_shared_block_cache_capacity(s.0)?; + self.configurable_db + .set_shared_block_cache_capacity(s.0 as usize)?; // 
Write config to metric CONFIG_ROCKSDB_GAUGE .with_label_values(&[CF_DEFAULT, "block_cache_size"]) @@ -73,15 +71,11 @@ impl ConfigManager for StorageConfigMan if let Some(v) = flow_control.remove("enable") { let enable: bool = v.into(); let enable_str = if enable { "true" } else { "false" }; - self.tablet_factory.for_each_opened_tablet( - &mut |_region_id, _suffix, tablet: &K| { - for cf in tablet.cf_names() { - tablet - .set_options_cf(cf, &[("disable_write_stall", enable_str)]) - .unwrap(); - } - }, - ); + for cf in ALL_CFS { + self.configurable_db + .set_cf_config(cf, &[("disable_write_stall", enable_str)]) + .unwrap(); + } self.flow_controller.enable(enable); } } else if let Some(v) = change.get("scheduler_worker_pool_size") { diff --git a/src/storage/kv/test_engine_builder.rs b/src/storage/kv/test_engine_builder.rs index f0192372e4b..d15a33742ba 100644 --- a/src/storage/kv/test_engine_builder.rs +++ b/src/storage/kv/test_engine_builder.rs @@ -12,7 +12,7 @@ use kvproto::kvrpcpb::ApiVersion; use tikv_util::config::ReadableSize; use crate::storage::{ - config::BlockCacheConfig, + config::{BlockCacheConfig, EngineType}, kv::{Result, RocksEngine}, }; @@ -96,22 +96,31 @@ impl TestEngineBuilder { if !enable_block_cache { cache_opt.capacity = Some(ReadableSize::kb(0)); } - let cache = cache_opt.build_shared_cache(); + let shared = cfg_rocksdb.build_cf_resources(cache_opt.build_shared_cache()); let cfs_opts = cfs .iter() .map(|cf| match *cf { CF_DEFAULT => ( CF_DEFAULT, - cfg_rocksdb.defaultcf.build_opt(&cache, None, api_version), + cfg_rocksdb + .defaultcf + .build_opt(&shared, None, api_version, EngineType::RaftKv), ), - CF_LOCK => (CF_LOCK, cfg_rocksdb.lockcf.build_opt(&cache)), - CF_WRITE => (CF_WRITE, cfg_rocksdb.writecf.build_opt(&cache, None)), - CF_RAFT => (CF_RAFT, cfg_rocksdb.raftcf.build_opt(&cache)), + CF_LOCK => ( + CF_LOCK, + cfg_rocksdb.lockcf.build_opt(&shared, EngineType::RaftKv), + ), + CF_WRITE => ( + CF_WRITE, + cfg_rocksdb + .writecf + 
.build_opt(&shared, None, EngineType::RaftKv), + ), + CF_RAFT => (CF_RAFT, cfg_rocksdb.raftcf.build_opt(&shared)), _ => (*cf, RocksCfOptions::default()), }) .collect(); - let engine = - RocksEngine::new(&path, None, cfs_opts, cache.is_some(), self.io_rate_limiter)?; + let engine = RocksEngine::new(&path, None, cfs_opts, self.io_rate_limiter)?; Ok(engine) } } diff --git a/src/storage/lock_manager/lock_waiting_queue.rs b/src/storage/lock_manager/lock_waiting_queue.rs index 663c6729962..a81248fe9e2 100644 --- a/src/storage/lock_manager/lock_waiting_queue.rs +++ b/src/storage/lock_manager/lock_waiting_queue.rs @@ -76,7 +76,7 @@ use txn_types::{Key, TimeStamp}; use crate::storage::{ lock_manager::{ lock_wait_context::{LockWaitContextSharedState, PessimisticLockKeyCallback}, - LockManager, LockWaitToken, + KeyLockWaitInfo, LockDigest, LockManager, LockWaitToken, UpdateWaitForEvent, }, metrics::*, mvcc::{Error as MvccError, ErrorInner as MvccErrorInner}, @@ -599,6 +599,36 @@ impl LockWaitQueues { result } + pub fn update_lock_wait(&self, lock_info: Vec) { + let mut update_wait_for_events = vec![]; + for lock_info in lock_info { + let key = Key::from_raw(lock_info.get_key()); + if let Some(mut key_state) = self.inner.queue_map.get_mut(&key) { + key_state.current_lock = lock_info; + update_wait_for_events.reserve(key_state.queue.len()); + for (&token, entry) in key_state.queue.iter() { + let event = UpdateWaitForEvent { + token, + start_ts: entry.parameters.start_ts, + is_first_lock: entry.parameters.is_first_lock, + wait_info: KeyLockWaitInfo { + key: key.clone(), + lock_digest: LockDigest { + ts: key_state.current_lock.lock_version.into(), + hash: entry.lock_hash, + }, + lock_info: key_state.current_lock.clone(), + }, + }; + update_wait_for_events.push(event); + } + } + } + if !update_wait_for_events.is_empty() { + self.inner.lock_mgr.update_wait_for(update_wait_for_events); + } + } + /// Gets the count of entries currently waiting in queues. 
/// /// Mind that the contents of the queues may be changed concurrently. @@ -1205,4 +1235,44 @@ mod tests { queues.must_not_contain_key(b"k1"); assert_eq!(queues.entry_count(), 0); } + + #[bench] + fn bench_update_lock_wait_empty(b: &mut test::Bencher) { + let queues = LockWaitQueues::new(MockLockManager::new()); + queues.mock_lock_wait(b"k1", 5, 6, false); + + let mut lock_info = kvrpcpb::LockInfo::default(); + let key = b"t\x00\x00\x00\x00\x00\x00\x00\x01_r\x00\x00\x00\x00\x00\x00\x00\x01"; + lock_info.set_key(key.to_vec()); + lock_info.set_primary_lock(key.to_vec()); + lock_info.set_lock_version(10); + lock_info.set_lock_for_update_ts(10); + let lock_info = vec![lock_info]; + + b.iter(|| { + queues.update_lock_wait(lock_info.clone()); + }); + } + + #[bench] + fn bench_update_lock_wait_queue_len_512(b: &mut test::Bencher) { + let queues = LockWaitQueues::new(MockLockManager::new()); + + let key = b"t\x00\x00\x00\x00\x00\x00\x00\x01_r\x00\x00\x00\x00\x00\x00\x00\x01"; + + for i in 0..512 { + queues.mock_lock_wait(key, 15 + i, 10, true); + } + + let mut lock_info = kvrpcpb::LockInfo::default(); + lock_info.set_key(key.to_vec()); + lock_info.set_primary_lock(key.to_vec()); + lock_info.set_lock_version(10); + lock_info.set_lock_for_update_ts(10); + let lock_info = vec![lock_info]; + + b.iter(|| { + queues.update_lock_wait(lock_info.clone()); + }); + } } diff --git a/src/storage/lock_manager/mod.rs b/src/storage/lock_manager/mod.rs index 75b133a808f..5c103f40f82 100644 --- a/src/storage/lock_manager/mod.rs +++ b/src/storage/lock_manager/mod.rs @@ -115,7 +115,6 @@ pub struct UpdateWaitForEvent { pub start_ts: TimeStamp, pub is_first_lock: bool, pub wait_info: KeyLockWaitInfo, - pub diag_ctx: DiagnosticContext, } /// `LockManager` manages transactions waiting for locks held by other diff --git a/src/storage/metrics.rs b/src/storage/metrics.rs index e84a7dfb4e9..080ff2c5951 100644 --- a/src/storage/metrics.rs +++ b/src/storage/metrics.rs @@ -347,17 +347,23 @@ where }; 
tls_cell.with(|c| { let mut c = c.borrow_mut(); - let perf_context = c.get_or_insert_with(|| { - with_tls_engine(|engine: &mut E| { - Box::new(engine.kv_engine().unwrap().get_perf_context( - PerfLevel::Uninitialized, - PerfContextKind::Storage(cmd.get_str()), - )) - }) - }); - perf_context.start_observe(); + if c.is_none() { + *c = with_tls_engine(|engine: &mut E| { + engine.kv_engine().map(|c| { + Box::new(c.get_perf_context( + PerfLevel::Uninitialized, + PerfContextKind::Storage(cmd.get_str()), + )) as Box + }) + }); + }; + if let Some(c) = &mut *c { + c.start_observe(); + } let res = f(); - perf_context.report_metrics(&[get_tls_tracker_token()]); + if let Some(c) = &mut *c { + c.report_metrics(&[get_tls_tracker_token()]); + } res }) } diff --git a/src/storage/mod.rs b/src/storage/mod.rs index caed0f57c91..802b0507849 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -75,7 +75,9 @@ use api_version::{ApiV1, ApiV2, KeyMode, KvFormat, RawValue}; use causal_ts::{CausalTsProvider, CausalTsProviderImpl}; use collections::HashMap; use concurrency_manager::{ConcurrencyManager, KeyHandleGuard}; -use engine_traits::{raw_ttl::ttl_to_expire_ts, CfName, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS}; +use engine_traits::{ + raw_ttl::ttl_to_expire_ts, CfName, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS, DATA_CFS_LEN, +}; use futures::prelude::*; use kvproto::{ kvrpcpb::{ @@ -1538,7 +1540,7 @@ impl Storage { [(Some(start_key.as_encoded()), Some(end_key.as_encoded()))], )?; - let mut modifies = Vec::with_capacity(DATA_CFS.len()); + let mut modifies = Vec::with_capacity(DATA_CFS_LEN); for cf in DATA_CFS { modifies.push(Modify::DeleteRange( cf, @@ -3593,6 +3595,7 @@ mod tests { use txn_types::{Mutation, PessimisticLock, WriteType, SHORT_VALUE_MAX_LEN}; use super::{ + config::EngineType, mvcc::tests::{must_unlocked, must_written}, test_util::*, txn::{ @@ -4133,24 +4136,32 @@ mod tests { let engine = { let path = "".to_owned(); let cfg_rocksdb = db_config; - let cache = 
BlockCacheConfig::default().build_shared_cache(); + let shared = + cfg_rocksdb.build_cf_resources(BlockCacheConfig::default().build_shared_cache()); let cfs_opts = vec![ ( CF_DEFAULT, + cfg_rocksdb.defaultcf.build_opt( + &shared, + None, + ApiVersion::V1, + EngineType::RaftKv, + ), + ), + ( + CF_LOCK, + cfg_rocksdb.lockcf.build_opt(&shared, EngineType::RaftKv), + ), + ( + CF_WRITE, cfg_rocksdb - .defaultcf - .build_opt(&cache, None, ApiVersion::V1), + .writecf + .build_opt(&shared, None, EngineType::RaftKv), ), - (CF_LOCK, cfg_rocksdb.lockcf.build_opt(&cache)), - (CF_WRITE, cfg_rocksdb.writecf.build_opt(&cache, None)), - (CF_RAFT, cfg_rocksdb.raftcf.build_opt(&cache)), + (CF_RAFT, cfg_rocksdb.raftcf.build_opt(&shared)), ]; RocksEngine::new( - &path, - None, - cfs_opts, - cache.is_some(), - None, // io_rate_limiter + &path, None, cfs_opts, None, // io_rate_limiter ) } .unwrap(); @@ -4875,7 +4886,7 @@ mod tests { commit_ts, version, key.clone(), - Key::from_raw(b"z"), + Some(Key::from_raw(b"z")), ); if let Mutation::Put(..) 
= write.0 { expect_value( @@ -4900,7 +4911,7 @@ mod tests { commit_ts: TimeStamp, version: TimeStamp, start_key: Key, - end_key: Key, + end_key: Option, ) { let (tx, rx) = channel(); storage @@ -4997,7 +5008,7 @@ mod tests { commit_ts, 2.into(), Key::from_raw(b"k"), - Key::from_raw(b"z"), + Some(Key::from_raw(b"z")), ); expect_value( b"v@1".to_vec(), @@ -5013,7 +5024,7 @@ mod tests { commit_ts, 1.into(), Key::from_raw(b"k"), - Key::from_raw(b"z"), + Some(Key::from_raw(b"z")), ); expect_none( block_on(storage.get(Context::default(), Key::from_raw(b"k"), commit_ts)) @@ -5104,7 +5115,7 @@ mod tests { flashback_commit_ts, TimeStamp::zero(), Key::from_raw(b"k"), - Key::from_raw(b"z"), + Some(Key::from_raw(b"z")), ); for i in 1..=FLASHBACK_BATCH_SIZE * 4 { let key = Key::from_raw(format!("k{}", i).as_bytes()); @@ -5183,7 +5194,7 @@ mod tests { flashback_commit_ts, 1.into(), Key::from_raw(b"k"), - Key::from_raw(b"z"), + Some(Key::from_raw(b"z")), ); expect_none( block_on(storage.get(Context::default(), k, flashback_commit_ts)) @@ -5192,6 +5203,74 @@ mod tests { ); } + #[test] + fn test_mvcc_flashback_retry_prepare() { + let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) + .build() + .unwrap(); + let (tx, rx) = channel(); + let mut ts = TimeStamp::zero(); + storage + .sched_txn_command( + commands::Prewrite::with_defaults( + vec![Mutation::make_put(Key::from_raw(b"k"), b"v@1".to_vec())], + b"k".to_vec(), + *ts.incr(), + ), + expect_ok_callback(tx.clone(), 0), + ) + .unwrap(); + rx.recv().unwrap(); + storage + .sched_txn_command( + commands::Commit::new( + vec![Key::from_raw(b"k")], + ts, + *ts.incr(), + Context::default(), + ), + expect_value_callback(tx.clone(), 1, TxnStatus::committed(ts)), + ) + .unwrap(); + rx.recv().unwrap(); + expect_value( + b"v@1".to_vec(), + block_on(storage.get(Context::default(), Key::from_raw(b"k"), ts)) + .unwrap() + .0, + ); + // Try to prepare flashback first. 
+ let flashback_start_ts = *ts.incr(); + let flashback_commit_ts = *ts.incr(); + storage + .sched_txn_command( + new_flashback_rollback_lock_cmd( + flashback_start_ts, + TimeStamp::zero(), + Key::from_raw(b"k"), + Some(Key::from_raw(b"z")), + Context::default(), + ), + expect_ok_callback(tx, 0), + ) + .unwrap(); + rx.recv().unwrap(); + // Mock the prepare flashback retry. + run_flashback_to_version( + &storage, + flashback_start_ts, + flashback_commit_ts, + TimeStamp::zero(), + Key::from_raw(b"k"), + Some(Key::from_raw(b"z")), + ); + expect_none( + block_on(storage.get(Context::default(), Key::from_raw(b"k"), flashback_commit_ts)) + .unwrap() + .0, + ); + } + #[test] fn test_high_priority_get_put() { let storage = TestStorageBuilderApiV1::new(MockLockManager::new()) diff --git a/src/storage/mvcc/reader/reader.rs b/src/storage/mvcc/reader/reader.rs index 4847dbb8428..e530cc56577 100644 --- a/src/storage/mvcc/reader/reader.rs +++ b/src/storage/mvcc/reader/reader.rs @@ -1028,7 +1028,7 @@ pub mod tests { pub fn compact(&mut self) { for cf in ALL_CFS { - self.db.compact_range(cf, None, None, false, 1).unwrap(); + self.db.compact_range_cf(cf, None, None, false, 1).unwrap(); } } } diff --git a/src/storage/mvcc/txn.rs b/src/storage/mvcc/txn.rs index 4cc0ab57ffb..9e87bf748b7 100644 --- a/src/storage/mvcc/txn.rs +++ b/src/storage/mvcc/txn.rs @@ -5,6 +5,7 @@ use std::fmt; use concurrency_manager::{ConcurrencyManager, KeyHandleGuard}; use engine_traits::{CF_DEFAULT, CF_LOCK, CF_WRITE}; +use kvproto::kvrpcpb::LockInfo; use txn_types::{Key, Lock, PessimisticLock, TimeStamp, Value}; use super::metrics::{GC_DELETE_VERSIONS_HISTOGRAM, MVCC_VERSIONS_HISTOGRAM}; @@ -64,6 +65,11 @@ pub struct MvccTxn { // `writes`, so it can be further processed. The elements are tuples representing // (key, lock, remove_pessimistic_lock) pub(crate) locks_for_1pc: Vec<(Key, Lock, bool)>, + // Collects the information of locks that are acquired in this MvccTxn. 
Locks that already + // exists but updated in this MvccTxn won't be collected. The collected information will be + // used to update the lock waiting information and redo deadlock detection, if there are some + // pessimistic lock requests waiting on the keys. + pub(crate) new_locks: Vec, // `concurrency_manager` is used to set memory locks for prewritten keys. // Prewritten locks of async commit transactions should be visible to // readers before they are written to the engine. @@ -84,7 +90,8 @@ impl MvccTxn { start_ts, write_size: 0, modifies: vec![], - locks_for_1pc: Vec::new(), + locks_for_1pc: vec![], + new_locks: vec![], concurrency_manager, guards: vec![], } @@ -99,6 +106,10 @@ impl MvccTxn { std::mem::take(&mut self.guards) } + pub fn take_new_locks(&mut self) -> Vec { + std::mem::take(&mut self.new_locks) + } + pub fn write_size(&self) -> usize { self.write_size } @@ -107,7 +118,12 @@ impl MvccTxn { self.modifies.len() == 0 && self.locks_for_1pc.len() == 0 } - pub(crate) fn put_lock(&mut self, key: Key, lock: &Lock) { + // Write a lock. If the key doesn't have lock before, `is_new` should be set. + pub(crate) fn put_lock(&mut self, key: Key, lock: &Lock, is_new: bool) { + if is_new { + self.new_locks + .push(lock.clone().into_lock_info(key.to_raw().unwrap())); + } let write = Modify::Put(CF_LOCK, key, lock.to_bytes()); self.write_size += write.size(); self.modifies.push(write); @@ -117,7 +133,13 @@ impl MvccTxn { self.locks_for_1pc.push((key, lock, remove_pessimstic_lock)); } - pub(crate) fn put_pessimistic_lock(&mut self, key: Key, lock: PessimisticLock) { + // Write a pessimistic lock. If the key doesn't have lock before, `is_new` + // should be set. 
+ pub(crate) fn put_pessimistic_lock(&mut self, key: Key, lock: PessimisticLock, is_new: bool) { + if is_new { + self.new_locks + .push(lock.to_lock().into_lock_info(key.to_raw().unwrap())); + } self.modifies.push(Modify::PessimisticLock(key, lock)) } @@ -198,12 +220,13 @@ impl MvccTxn { } lock.rollback_ts.push(self.start_ts); - self.put_lock(key.clone(), &lock); + self.put_lock(key.clone(), &lock, false); } pub(crate) fn clear(&mut self) { self.write_size = 0; self.modifies.clear(); + self.new_locks.clear(); self.locks_for_1pc.clear(); self.guards.clear(); } diff --git a/src/storage/txn/actions/acquire_pessimistic_lock.rs b/src/storage/txn/actions/acquire_pessimistic_lock.rs index 8e7c4d95118..fcffd500c8e 100644 --- a/src/storage/txn/actions/acquire_pessimistic_lock.rs +++ b/src/storage/txn/actions/acquire_pessimistic_lock.rs @@ -171,7 +171,7 @@ pub fn acquire_pessimistic_lock( last_change_ts: lock.last_change_ts, versions_to_last_change: lock.versions_to_last_change, }; - txn.put_pessimistic_lock(key, lock); + txn.put_pessimistic_lock(key, lock, false); } else { MVCC_DUPLICATE_CMD_COUNTER_VEC .acquire_pessimistic_lock @@ -325,7 +325,7 @@ pub fn acquire_pessimistic_lock( // When lock_only_if_exists is false, always acquire pessimistic lock, otherwise // do it when val exists if !lock_only_if_exists || val.is_some() { - txn.put_pessimistic_lock(key, lock); + txn.put_pessimistic_lock(key, lock, true); } // TODO don't we need to commit the modifies in txn? 
diff --git a/src/storage/txn/actions/check_txn_status.rs b/src/storage/txn/actions/check_txn_status.rs index 4c900e5a438..88982d6da72 100644 --- a/src/storage/txn/actions/check_txn_status.rs +++ b/src/storage/txn/actions/check_txn_status.rs @@ -70,7 +70,7 @@ pub fn check_txn_status_lock_exists( lock.min_commit_ts = current_ts; } - txn.put_lock(primary_key, &lock); + txn.put_lock(primary_key, &lock, false); MVCC_CHECK_TXN_STATUS_COUNTER_VEC.update_ts.inc(); } diff --git a/src/storage/txn/actions/flashback_to_version.rs b/src/storage/txn/actions/flashback_to_version.rs index 4b05c8eef8f..f44854159c0 100644 --- a/src/storage/txn/actions/flashback_to_version.rs +++ b/src/storage/txn/actions/flashback_to_version.rs @@ -1,12 +1,10 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. -use std::ops::Bound; - use txn_types::{Key, Lock, LockType, TimeStamp, Write, WriteType}; use crate::storage::{ - mvcc::{MvccReader, MvccTxn, SnapshotReader, MAX_TXN_WRITE_SIZE}, - txn::{actions::check_txn_status::rollback_lock, Result as TxnResult}, + mvcc::{self, MvccReader, MvccTxn, SnapshotReader, MAX_TXN_WRITE_SIZE}, + txn::{self, actions::check_txn_status::rollback_lock, Result as TxnResult}, Snapshot, }; @@ -15,12 +13,14 @@ pub const FLASHBACK_BATCH_SIZE: usize = 256 + 1 /* To store the next key for mul pub fn flashback_to_version_read_lock( reader: &mut MvccReader, next_lock_key: Key, - end_key: &Key, + end_key: Option<&Key>, + flashback_start_ts: TimeStamp, ) -> TxnResult> { let result = reader.scan_locks( Some(&next_lock_key), - Some(end_key), - |_| true, + end_key, + // Skip the `prewrite_lock`. 
This lock will appear when retrying prepare + |lock| lock.ts != flashback_start_ts, FLASHBACK_BATCH_SIZE, ); let (key_locks, _) = result?; @@ -31,20 +31,15 @@ pub fn flashback_to_version_read_write( reader: &mut MvccReader, next_write_key: Key, start_key: &Key, - end_key: &Key, + end_key: Option<&Key>, flashback_version: TimeStamp, flashback_commit_ts: TimeStamp, ) -> TxnResult> { - // Filter out the SST that does not have a newer version than - // `flashback_version` in `CF_WRITE`, i.e, whose latest `commit_ts` <= - // `flashback_version`. By doing this, we can only flashback those keys that - // have version changed since `flashback_version` as much as possible. - reader.set_hint_min_ts(Some(Bound::Excluded(flashback_version))); // To flashback the data, we need to get all the latest visible keys first by // scanning every unique key in `CF_WRITE`. let keys_result = reader.scan_latest_user_keys( Some(&next_write_key), - Some(end_key), + end_key, |key, latest_commit_ts| { // There is no any other write could happen after the flashback begins. assert!(latest_commit_ts <= flashback_commit_ts); @@ -154,6 +149,9 @@ pub fn prewrite_flashback_key( flashback_version: TimeStamp, flashback_start_ts: TimeStamp, ) -> TxnResult<()> { + if reader.load_lock(key_to_lock)?.is_some() { + return Ok(()); + } let old_write = reader.get_write(key_to_lock, flashback_version, None)?; // Flashback the value in `CF_DEFAULT` as well if the old write is a // `WriteType::Put` without the short value. @@ -188,6 +186,7 @@ pub fn prewrite_flashback_key( 1, TimeStamp::zero(), ), + false, // Assuming flashback transactions won't participate any lock conflicts. 
); Ok(()) } @@ -218,17 +217,75 @@ pub fn commit_flashback_key( lock.is_pessimistic_txn(), flashback_commit_ts, ); + } else { + return Err(txn::Error::from_mvcc(mvcc::ErrorInner::TxnLockNotFound { + start_ts: flashback_start_ts, + commit_ts: flashback_commit_ts, + key: key_to_commit.to_raw()?, + })); } Ok(()) } +// Check if the flashback has been finished before. +pub fn check_flashback_commit( + reader: &mut MvccReader, + key_to_commit: &Key, + flashback_start_ts: TimeStamp, + flashback_commit_ts: TimeStamp, +) -> TxnResult { + match reader.load_lock(key_to_commit)? { + // If the lock exists, it means the flashback hasn't been finished. + Some(lock) => { + if lock.ts == flashback_start_ts { + return Ok(false); + } + error!( + "check flashback commit exception: lock not found"; + "key_to_commit" => log_wrappers::Value::key(key_to_commit.as_encoded()), + "flashback_start_ts" => flashback_start_ts, + "flashback_commit_ts" => flashback_commit_ts, + "lock" => ?lock, + ); + } + // If the lock doesn't exist and the flashback commit record exists, it means the flashback + // has been finished. 
+ None => { + let write_res = reader.seek_write(key_to_commit, flashback_commit_ts)?; + if let Some((commit_ts, ref write)) = write_res { + if commit_ts == flashback_commit_ts && write.start_ts == flashback_start_ts { + return Ok(true); + } + } + error!( + "check flashback commit exception: write record mismatched"; + "key_to_commit" => log_wrappers::Value::key(key_to_commit.as_encoded()), + "flashback_start_ts" => flashback_start_ts, + "flashback_commit_ts" => flashback_commit_ts, + "write" => ?write_res, + ); + } + } + Err(txn::Error::from_mvcc(mvcc::ErrorInner::TxnLockNotFound { + start_ts: flashback_start_ts, + commit_ts: flashback_commit_ts, + key: key_to_commit.to_raw()?, + })) +} + pub fn get_first_user_key( reader: &mut MvccReader, start_key: &Key, - end_key: &Key, + end_key: Option<&Key>, + flashback_version: TimeStamp, ) -> TxnResult> { - let (mut keys_result, _) = - reader.scan_latest_user_keys(Some(start_key), Some(end_key), |_, _| true, 1)?; + let (mut keys_result, _) = reader.scan_latest_user_keys( + Some(start_key), + end_key, + // Make sure we will get the same first user key each time. 
+ |_, latest_commit_ts| latest_commit_ts > flashback_version, + 1, + )?; Ok(keys_result.pop()) } @@ -258,14 +315,17 @@ pub mod tests { key: &[u8], start_ts: impl Into, ) -> usize { + let start_ts = start_ts.into(); let next_key = Key::from_raw(keys::next_key(key).as_slice()); let key = Key::from_raw(key); let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new_with_ctx(snapshot.clone(), Some(ScanMode::Forward), &ctx); - let key_locks = flashback_to_version_read_lock(&mut reader, key, &next_key).unwrap(); + let key_locks = + flashback_to_version_read_lock(&mut reader, key, Some(next_key).as_ref(), start_ts) + .unwrap(); let cm = ConcurrencyManager::new(TimeStamp::zero()); - let mut txn = MvccTxn::new(start_ts.into(), cm); + let mut txn = MvccTxn::new(start_ts, cm); rollback_locks(&mut txn, snapshot, key_locks).unwrap(); let rows = txn.modifies.len(); write(engine, &ctx, txn.into_modifies()); @@ -284,8 +344,13 @@ pub mod tests { let snapshot = engine.snapshot(Default::default()).unwrap(); let ctx = Context::default(); let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); - let prewrite_key = if let Some(first_key) = - get_first_user_key(&mut reader, &Key::from_raw(key), &Key::from_raw(b"z")).unwrap() + let prewrite_key = if let Some(first_key) = get_first_user_key( + &mut reader, + &Key::from_raw(key), + Some(Key::from_raw(b"z")).as_ref(), + version, + ) + .unwrap() { first_key } else { @@ -305,7 +370,7 @@ pub mod tests { start_ts: impl Into, commit_ts: impl Into, ) -> usize { - let next_key = Key::from_raw(keys::next_key(key).as_slice()); + let next_key = Key::from_raw_maybe_unbounded(keys::next_key(key).as_slice()); let key = Key::from_raw(key); let (version, start_ts, commit_ts) = (version.into(), start_ts.into(), commit_ts.into()); let ctx = Context::default(); @@ -316,7 +381,7 @@ pub mod tests { &mut reader, key, &Key::from_raw(b""), - &next_key, + 
next_key.as_ref(), version, commit_ts, ) @@ -333,19 +398,24 @@ pub mod tests { fn must_commit_flashback_key( engine: &mut E, key: &[u8], + version: impl Into, start_ts: impl Into, commit_ts: impl Into, ) -> usize { - let (start_ts, commit_ts) = (start_ts.into(), commit_ts.into()); + let (version, start_ts, commit_ts) = (version.into(), start_ts.into(), commit_ts.into()); let cm = ConcurrencyManager::new(TimeStamp::zero()); let mut txn = MvccTxn::new(start_ts, cm); let snapshot = engine.snapshot(Default::default()).unwrap(); let ctx = Context::default(); let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); - let key_to_lock = - get_first_user_key(&mut reader, &Key::from_raw(key), &Key::from_raw(b"z")) - .unwrap() - .unwrap(); + let key_to_lock = get_first_user_key( + &mut reader, + &Key::from_raw(key), + Some(Key::from_raw(b"z")).as_ref(), + version, + ) + .unwrap() + .unwrap(); commit_flashback_key(&mut txn, &mut reader, &key_to_lock, start_ts, commit_ts).unwrap(); let rows = txn.modifies.len(); write(engine, &ctx, txn.into_modifies()); @@ -500,9 +570,11 @@ pub mod tests { let mut engine = TestEngineBuilder::new().build().unwrap(); let mut ts = TimeStamp::zero(); let (k, v) = (b"k", [u8::MAX; SHORT_VALUE_MAX_LEN + 1]); - must_prewrite_put(&mut engine, k, &v, k, *ts.incr()); - must_commit(&mut engine, k, ts, *ts.incr()); - must_get(&mut engine, k, ts, &v); + for _ in 0..2 { + must_prewrite_put(&mut engine, k, &v, k, *ts.incr()); + must_commit(&mut engine, k, ts, *ts.incr()); + must_get(&mut engine, k, ts, &v); + } let flashback_start_ts = *ts.incr(); // Rollback nothing. @@ -513,20 +585,11 @@ pub mod tests { 2 ); // Retry Prepare - // Unlock `k`, put rollback record and delete the value of `k`. - assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 3); - // Lock and write the value of `k`. 
- assert_eq!( - must_prewrite_flashback_key(&mut engine, k, 2, flashback_start_ts), - 2 - ); - // Retry Prepare - // Only unlock `k` since there is an overlapped rollback record. - assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 1); - // Only lock `k` since the value of `k` has already existed. + // Skip `k` no need to write again. + assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 0); assert_eq!( must_prewrite_flashback_key(&mut engine, k, 2, flashback_start_ts), - 1 + 0 ); } @@ -534,29 +597,26 @@ pub mod tests { fn test_prewrite_with_special_key() { let mut engine = TestEngineBuilder::new().build().unwrap(); let mut ts = TimeStamp::zero(); - let (prewrite_key, prewrite_val) = (b"b", b"val"); - must_prewrite_put( - &mut engine, - prewrite_key, - prewrite_val, - prewrite_key, - *ts.incr(), - ); - must_commit(&mut engine, prewrite_key, ts, *ts.incr()); - must_get(&mut engine, prewrite_key, ts, prewrite_val); - let (k, v1, v2) = (b"c", b"v1", b"v2"); - must_prewrite_put(&mut engine, k, v1, k, *ts.incr()); - must_commit(&mut engine, k, ts, *ts.incr()); - must_prewrite_put(&mut engine, k, v2, k, *ts.incr()); - must_commit(&mut engine, k, ts, *ts.incr()); - must_get(&mut engine, k, ts, v2); + let (prewrite_key, k, v) = (b"b", b"c", b"val"); + for k in [prewrite_key, k] { + let (start_ts, commit_ts) = (*ts.incr(), *ts.incr()); + must_prewrite_put(&mut engine, k, v, k, start_ts); + must_commit(&mut engine, k, start_ts, commit_ts); + must_get(&mut engine, k, commit_ts, v); + } // Check for prewrite key b"b". 
let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &ctx); - let first_key = get_first_user_key(&mut reader, &Key::from_raw(b""), &Key::from_raw(b"z")) - .unwrap_or_else(|_| Some(Key::from_raw(b""))) - .unwrap(); + let flashback_version = TimeStamp::zero(); + let first_key = get_first_user_key( + &mut reader, + &Key::from_raw(b""), + Some(Key::from_raw(b"z")).as_ref(), + flashback_version, + ) + .unwrap_or_else(|_| Some(Key::from_raw(b""))) + .unwrap(); assert_eq!(first_key, Key::from_raw(prewrite_key)); // case 1: start key is before all keys, flashback b"c". @@ -566,7 +626,12 @@ pub mod tests { assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 0); // Prewrite "prewrite_key" not "start_key". assert_eq!( - must_prewrite_flashback_key(&mut engine, start_key, 4, flashback_start_ts), + must_prewrite_flashback_key( + &mut engine, + start_key, + flashback_version, + flashback_start_ts + ), 1 ); // Flashback (b"c", v2) to (b"c", v1). @@ -574,7 +639,7 @@ pub mod tests { must_flashback_write_to_version( &mut engine, k, - 4, + flashback_version, flashback_start_ts, flashback_commit_ts ), @@ -585,14 +650,14 @@ pub mod tests { must_commit_flashback_key( &mut engine, start_key, + flashback_version, flashback_start_ts, flashback_commit_ts ), 2 ); - must_get(&mut engine, k, ts, v1); - must_get(&mut engine, prewrite_key, ts, prewrite_val); - + must_get_none(&mut engine, prewrite_key, ts); + must_get_none(&mut engine, k, ts); // case 2: start key is after all keys, prewrite will return None. let start_key = b"d"; let flashback_start_ts = *ts.incr(); @@ -600,12 +665,22 @@ pub mod tests { assert_eq!(must_rollback_lock(&mut engine, k, flashback_start_ts), 0); // Prewrite null. 
assert_eq!( - must_prewrite_flashback_key(&mut engine, start_key, 4, flashback_start_ts), + must_prewrite_flashback_key( + &mut engine, + start_key, + flashback_version, + flashback_start_ts + ), 0 ); - // case 3: start key is valid, end_key is invalid, prewrite key will be None. - let first_key = get_first_user_key(&mut reader, &Key::from_raw(b"a"), &Key::from_raw(b"")) - .unwrap_or_else(|_| Some(Key::from_raw(b""))); - assert_eq!(first_key, None); + must_get_none(&mut engine, prewrite_key, ts); + must_get_none(&mut engine, k, ts); + // case 3: for last region, end_key will be None, prewrite key will be valid. + assert_eq!( + get_first_user_key(&mut reader, &Key::from_raw(b"a"), None, flashback_version) + .unwrap() + .unwrap(), + Key::from_raw(prewrite_key) + ); } } diff --git a/src/storage/txn/actions/prewrite.rs b/src/storage/txn/actions/prewrite.rs index f2de9df0004..1e655846d08 100644 --- a/src/storage/txn/actions/prewrite.rs +++ b/src/storage/txn/actions/prewrite.rs @@ -153,7 +153,9 @@ pub fn prewrite( OldValue::Unspecified }; - let final_min_commit_ts = mutation.write_lock(lock_status, txn)?; + let is_new_lock = !matches!(pessimistic_action, DoPessimisticCheck) || lock_amended; + + let final_min_commit_ts = mutation.write_lock(lock_status, txn, is_new_lock)?; fail_point!("after_prewrite_one_key"); @@ -448,7 +450,12 @@ impl<'a> PrewriteMutation<'a> { Ok(None) } - fn write_lock(self, lock_status: LockStatus, txn: &mut MvccTxn) -> Result { + fn write_lock( + self, + lock_status: LockStatus, + txn: &mut MvccTxn, + is_new_lock: bool, + ) -> Result { let mut try_one_pc = self.try_one_pc(); let mut lock = Lock::new( @@ -506,7 +513,7 @@ impl<'a> PrewriteMutation<'a> { if try_one_pc { txn.put_locks_for_1pc(self.key, lock, lock_status.has_pessimistic_lock()); } else { - txn.put_lock(self.key, &lock); + txn.put_lock(self.key, &lock, is_new_lock); } final_min_commit_ts diff --git a/src/storage/txn/commands/acquire_pessimistic_lock.rs 
b/src/storage/txn/commands/acquire_pessimistic_lock.rs index 6bd147cf02e..58c33706bbc 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock.rs @@ -144,6 +144,7 @@ impl WriteCommand for AcquirePessimisticLock } } + let new_acquired_locks = txn.take_new_locks(); let modifies = txn.into_modifies(); let mut res = Ok(res); @@ -179,6 +180,7 @@ impl WriteCommand for AcquirePessimisticLock pr, lock_info: encountered_locks, released_locks: ReleasedLocks::new(), + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnProposed, }) @@ -191,7 +193,7 @@ pub(super) fn make_write_data(modifies: Vec, old_values: OldValues) -> W old_values, // One pc status is unknown in AcquirePessimisticLock stage. one_pc: false, - for_flashback: false, + allowed_in_flashback: false, }; WriteData::new(modifies, extra) } else { diff --git a/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs b/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs index a66f8228755..7640edd7c0c 100644 --- a/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs +++ b/src/storage/txn/commands/acquire_pessimistic_lock_resumed.rs @@ -82,6 +82,7 @@ impl WriteCommand for AcquirePessimisticLockR fn process_write(self, snapshot: S, context: WriteContext<'_, L>) -> Result { fail_point!("acquire_pessimistic_lock_resumed_before_process_write"); let mut modifies = vec![]; + let mut new_acquired_locks = vec![]; let mut txn = None; let mut reader: Option> = None; @@ -107,10 +108,11 @@ impl WriteCommand for AcquirePessimisticLockR .as_ref() .map_or(true, |t: &MvccTxn| t.start_ts != params.start_ts) { - if let Some(prev_txn) = txn.replace(MvccTxn::new( + if let Some(mut prev_txn) = txn.replace(MvccTxn::new( params.start_ts, context.concurrency_manager.clone(), )) { + new_acquired_locks.extend(prev_txn.take_new_locks()); modifies.extend(prev_txn.into_modifies()); } // TODO: Is it possible to reuse the same reader but 
change the start_ts stored @@ -169,8 +171,9 @@ impl WriteCommand for AcquirePessimisticLockR }; } - if let Some(txn) = txn { + if let Some(mut txn) = txn { if !txn.is_empty() { + new_acquired_locks.extend(txn.take_new_locks()); modifies.extend(txn.into_modifies()); } } @@ -188,6 +191,7 @@ impl WriteCommand for AcquirePessimisticLockR pr, lock_info: encountered_locks, released_locks: ReleasedLocks::new(), + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnProposed, }) diff --git a/src/storage/txn/commands/atomic_store.rs b/src/storage/txn/commands/atomic_store.rs index 1df5c5b2cf8..9a54895e7e2 100644 --- a/src/storage/txn/commands/atomic_store.rs +++ b/src/storage/txn/commands/atomic_store.rs @@ -60,6 +60,7 @@ impl WriteCommand for RawAtomicStore { pr: ProcessResult::Res, lock_info: vec![], released_locks: ReleasedLocks::new(), + new_acquired_locks: vec![], lock_guards: raw_ext.into_iter().map(|r| r.key_guard).collect(), response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/check_secondary_locks.rs b/src/storage/txn/commands/check_secondary_locks.rs index 4802535c054..a19a5d82bb6 100644 --- a/src/storage/txn/commands/check_secondary_locks.rs +++ b/src/storage/txn/commands/check_secondary_locks.rs @@ -146,6 +146,7 @@ impl WriteCommand for CheckSecondaryLocks { rows = 1; } let pr = ProcessResult::SecondaryLocksStatus { status: result }; + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -155,6 +156,7 @@ impl WriteCommand for CheckSecondaryLocks { pr, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index 34948109f4b..895c753b160 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ 
b/src/storage/txn/commands/check_txn_status.rs @@ -125,6 +125,7 @@ impl WriteCommand for CheckTxnStatus { released_locks.push(released); let pr = ProcessResult::TxnStatus { txn_status }; + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -134,6 +135,7 @@ impl WriteCommand for CheckTxnStatus { pr, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/cleanup.rs b/src/storage/txn/commands/cleanup.rs index a6c529420d3..302c4fe1308 100644 --- a/src/storage/txn/commands/cleanup.rs +++ b/src/storage/txn/commands/cleanup.rs @@ -67,6 +67,7 @@ impl WriteCommand for Cleanup { true, )?); + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -76,6 +77,7 @@ impl WriteCommand for Cleanup { pr: ProcessResult::Res, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/commit.rs b/src/storage/txn/commands/commit.rs index 910b7832ed1..4f05df8fe83 100644 --- a/src/storage/txn/commands/commit.rs +++ b/src/storage/txn/commands/commit.rs @@ -67,6 +67,7 @@ impl WriteCommand for Commit { let pr = ProcessResult::TxnStatus { txn_status: TxnStatus::committed(self.commit_ts), }; + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -76,6 +77,7 @@ impl WriteCommand for Commit { pr, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/compare_and_swap.rs 
b/src/storage/txn/commands/compare_and_swap.rs index 943fc6f69d1..ca9213b57d3 100644 --- a/src/storage/txn/commands/compare_and_swap.rs +++ b/src/storage/txn/commands/compare_and_swap.rs @@ -114,6 +114,7 @@ impl WriteCommand for RawCompareAndSwap { pr, lock_info: vec![], released_locks: ReleasedLocks::new(), + new_acquired_locks: vec![], lock_guards, response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/flashback_to_version.rs b/src/storage/txn/commands/flashback_to_version.rs index 13de0c9b183..da12bc8906c 100644 --- a/src/storage/txn/commands/flashback_to_version.rs +++ b/src/storage/txn/commands/flashback_to_version.rs @@ -33,7 +33,7 @@ command! { commit_ts: TimeStamp, version: TimeStamp, start_key: Key, - end_key: Key, + end_key: Option, state: FlashbackToVersionState, } } @@ -118,7 +118,7 @@ impl WriteCommand for FlashbackToVersion { let rows = txn.modifies.len(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); // To let the flashback modification could be proposed and applied successfully. - write_data.extra.for_flashback = true; + write_data.extra.allowed_in_flashback = true; // To let the CDC treat the flashback modification as an 1PC transaction. if matches!(self.state, FlashbackToVersionState::FlashbackWrite { .. }) { write_data.extra.one_pc = true; @@ -159,6 +159,7 @@ impl WriteCommand for FlashbackToVersion { })(), lock_info: vec![], released_locks: ReleasedLocks::new(), + new_acquired_locks: vec![], lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/flashback_to_version_read_phase.rs b/src/storage/txn/commands/flashback_to_version_read_phase.rs index 9ac5014b7f3..7fdc86288c2 100644 --- a/src/storage/txn/commands/flashback_to_version_read_phase.rs +++ b/src/storage/txn/commands/flashback_to_version_read_phase.rs @@ -1,12 +1,14 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
+use std::ops::Bound; + // #[PerformanceCriticalPath] use txn_types::{Key, Lock, TimeStamp}; use crate::storage::{ mvcc::MvccReader, txn::{ - actions::flashback_to_version::get_first_user_key, + actions::flashback_to_version::{check_flashback_commit, get_first_user_key}, commands::{ Command, CommandExt, FlashbackToVersion, ProcessResult, ReadCommand, TypedCommand, }, @@ -39,7 +41,7 @@ pub fn new_flashback_rollback_lock_cmd( start_ts: TimeStamp, version: TimeStamp, start_key: Key, - end_key: Key, + end_key: Option, ctx: Context, ) -> TypedCommand<()> { FlashbackToVersionReadPhase::new( @@ -61,7 +63,7 @@ pub fn new_flashback_write_cmd( commit_ts: TimeStamp, version: TimeStamp, start_key: Key, - end_key: Key, + end_key: Option, ctx: Context, ) -> TypedCommand<()> { FlashbackToVersionReadPhase::new( @@ -87,7 +89,7 @@ command! { commit_ts: TimeStamp, version: TimeStamp, start_key: Key, - end_key: Key, + end_key: Option, state: FlashbackToVersionState, } } @@ -109,25 +111,34 @@ impl CommandExt for FlashbackToVersionReadPhase { /// - Scan all locks. /// - Rollback all these locks. /// 2. [PrepareFlashback] Prewrite phase: -/// - Prewrite the `self.start_key` specifically to prevent the -/// `resolved_ts` from advancing. +/// - Prewrite the first user key after `self.start_key` specifically to +/// prevent the `resolved_ts` from advancing. /// 3. [FinishFlashback] FlashbackWrite phase: /// - Scan all the latest writes and their corresponding values at /// `self.version`. /// - Write the old MVCC version writes again for all these keys with -/// `self.commit_ts` excluding the `self.start_key`. +/// `self.commit_ts` excluding the first user key after `self.start_key`. /// 4. [FinishFlashback] Commit phase: -/// - Commit the `self.start_key` we write at the second phase to finish the -/// flashback. +/// - Commit the first user key after `self.start_key` we write at the +/// second phase to finish the flashback. 
impl ReadCommand for FlashbackToVersionReadPhase { fn process_read(self, snapshot: S, statistics: &mut Statistics) -> Result { let tag = self.tag().get_str(); let mut reader = MvccReader::new_with_ctx(snapshot, Some(ScanMode::Forward), &self.ctx); + // Filter out the SST that does not have a newer version than `self.version` in + // `CF_WRITE`, i.e, whose latest `commit_ts` <= `self.version` in the later + // scan. By doing this, we can only flashback those keys that have version + // changed since `self.version` as much as possible. + reader.set_hint_min_ts(Some(Bound::Excluded(self.version))); let mut start_key = self.start_key.clone(); let next_state = match self.state { FlashbackToVersionState::RollbackLock { next_lock_key, .. } => { - let mut key_locks = - flashback_to_version_read_lock(&mut reader, next_lock_key, &self.end_key)?; + let mut key_locks = flashback_to_version_read_lock( + &mut reader, + next_lock_key, + self.end_key.as_ref(), + self.start_ts, + )?; if key_locks.is_empty() { // - No more locks to rollback, continue to the Prewrite Phase. // - The start key from the client is actually a range which is used to limit @@ -138,9 +149,12 @@ impl ReadCommand for FlashbackToVersionReadPhase { // completion of the 2pc. // - To make sure the key locked in the latch is the same as the actual key // written, we pass it to the key in `process_write' after getting it. - let key_to_lock = if let Some(first_key) = - get_first_user_key(&mut reader, &self.start_key, &self.end_key)? - { + let key_to_lock = if let Some(first_key) = get_first_user_key( + &mut reader, + &self.start_key, + self.end_key.as_ref(), + self.version, + )? { first_key } else { // If the key is None return directly @@ -177,9 +191,12 @@ impl ReadCommand for FlashbackToVersionReadPhase { // 2pc. So When overwriting the write, we skip the immediate // write of this key and instead put it after the completion // of the 2pc. 
- next_write_key = if let Some(first_key) = - get_first_user_key(&mut reader, &self.start_key, &self.end_key)? - { + next_write_key = if let Some(first_key) = get_first_user_key( + &mut reader, + &self.start_key, + self.end_key.as_ref(), + self.version, + )? { first_key } else { // If the key is None return directly @@ -189,9 +206,14 @@ impl ReadCommand for FlashbackToVersionReadPhase { // Commit key needs to match the Prewrite key, which is set as the first user // key. start_key = next_write_key.clone(); - // If the key is not locked, it means that the key has been committed before and - // we are in a retry. - if reader.load_lock(&next_write_key)?.is_none() { + // If the key has already been committed by the flashback, it means that we are + // in a retry. It's safe to just return directly. + if check_flashback_commit( + &mut reader, + &start_key, + self.start_ts, + self.commit_ts, + )? { statistics.add(&reader.statistics); return Ok(ProcessResult::Res); } @@ -200,7 +222,7 @@ impl ReadCommand for FlashbackToVersionReadPhase { &mut reader, next_write_key, &start_key, - &self.end_key, + self.end_key.as_ref(), self.version, self.commit_ts, )?; diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 7d835462acf..2d79ebc97cc 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -372,7 +372,7 @@ impl From for TypedCommand<()> { req.get_start_ts().into(), req.get_version().into(), Key::from_raw(req.get_start_key()), - Key::from_raw(req.get_end_key()), + Key::from_raw_maybe_unbounded(req.get_end_key()), req.take_context(), ) } @@ -385,7 +385,7 @@ impl From for TypedCommand<()> { req.get_commit_ts().into(), req.get_version().into(), Key::from_raw(req.get_start_key()), - Key::from_raw(req.get_end_key()), + Key::from_raw_maybe_unbounded(req.get_end_key()), req.take_context(), ) } @@ -417,6 +417,7 @@ pub struct WriteResult { pub pr: ProcessResult, pub lock_info: Vec, pub released_locks: ReleasedLocks, + pub 
new_acquired_locks: Vec, pub lock_guards: Vec, pub response_policy: ResponsePolicy, } diff --git a/src/storage/txn/commands/pause.rs b/src/storage/txn/commands/pause.rs index 3dc7d06d5ef..5d3aa7f6d2f 100644 --- a/src/storage/txn/commands/pause.rs +++ b/src/storage/txn/commands/pause.rs @@ -50,6 +50,7 @@ impl WriteCommand for Pause { pr: ProcessResult::Res, lock_info: vec![], released_locks: ReleasedLocks::new(), + new_acquired_locks: vec![], lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/pessimistic_rollback.rs b/src/storage/txn/commands/pessimistic_rollback.rs index c35c362f19e..b34399cb64a 100644 --- a/src/storage/txn/commands/pessimistic_rollback.rs +++ b/src/storage/txn/commands/pessimistic_rollback.rs @@ -83,6 +83,7 @@ impl WriteCommand for PessimisticRollback { released_locks.push(released_lock?); } + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -92,6 +93,7 @@ impl WriteCommand for PessimisticRollback { pr: ProcessResult::MultiRes { results: vec![] }, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/prewrite.rs b/src/storage/txn/commands/prewrite.rs index cd24f54d13b..fbd4bf5984a 100644 --- a/src/storage/txn/commands/prewrite.rs +++ b/src/storage/txn/commands/prewrite.rs @@ -672,12 +672,13 @@ impl Prewriter { old_values: self.old_values, // Set one_pc flag in TxnExtra to let CDC skip handling the resolver. one_pc: self.try_one_pc, - for_flashback: false, + allowed_in_flashback: false, }; // Here the lock guards are taken and will be released after the write finishes. // If an error (KeyIsLocked or WriteConflict) occurs before, these lock guards // are dropped along with `txn` automatically. 
let lock_guards = txn.take_guards(); + let new_acquired_locks = txn.take_new_locks(); let mut to_be_write = WriteData::new(txn.into_modifies(), extra); to_be_write.set_disk_full_opt(self.ctx.get_disk_full_opt()); @@ -688,6 +689,7 @@ impl Prewriter { pr, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards, response_policy: ResponsePolicy::OnApplied, } @@ -707,6 +709,7 @@ impl Prewriter { pr, lock_info: vec![], released_locks: ReleasedLocks::new(), + new_acquired_locks: vec![], lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, } @@ -870,8 +873,9 @@ fn handle_1pc_locks(txn: &mut MvccTxn, commit_ts: TimeStamp) -> ReleasedLocks { /// Change all 1pc locks in txn to 2pc locks. pub(in crate::storage::txn) fn fallback_1pc_locks(txn: &mut MvccTxn) { - for (key, lock, _) in std::mem::take(&mut txn.locks_for_1pc) { - txn.put_lock(key, &lock); + for (key, lock, remove_pessimistic_lock) in std::mem::take(&mut txn.locks_for_1pc) { + let is_new_lock = !remove_pessimistic_lock; + txn.put_lock(key, &lock, is_new_lock); } } diff --git a/src/storage/txn/commands/resolve_lock.rs b/src/storage/txn/commands/resolve_lock.rs index 463275b2e1f..f3d141807e8 100644 --- a/src/storage/txn/commands/resolve_lock.rs +++ b/src/storage/txn/commands/resolve_lock.rs @@ -138,6 +138,7 @@ impl WriteCommand for ResolveLock { cmd: Command::ResolveLockReadPhase(next_cmd), } }; + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -147,6 +148,7 @@ impl WriteCommand for ResolveLock { pr, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/resolve_lock_lite.rs b/src/storage/txn/commands/resolve_lock_lite.rs index d336d88a9ca..63fe201596d 100644 --- a/src/storage/txn/commands/resolve_lock_lite.rs +++ 
b/src/storage/txn/commands/resolve_lock_lite.rs @@ -63,6 +63,7 @@ impl WriteCommand for ResolveLockLite { }); } + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -72,6 +73,7 @@ impl WriteCommand for ResolveLockLite { pr: ProcessResult::Res, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/rollback.rs b/src/storage/txn/commands/rollback.rs index 52c05ae34c7..f3b674f4916 100644 --- a/src/storage/txn/commands/rollback.rs +++ b/src/storage/txn/commands/rollback.rs @@ -58,6 +58,7 @@ impl WriteCommand for Rollback { released_locks.push(released_lock); } + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -67,6 +68,7 @@ impl WriteCommand for Rollback { pr: ProcessResult::Res, lock_info: vec![], released_locks, + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/commands/txn_heart_beat.rs b/src/storage/txn/commands/txn_heart_beat.rs index f965b863494..448395fc436 100644 --- a/src/storage/txn/commands/txn_heart_beat.rs +++ b/src/storage/txn/commands/txn_heart_beat.rs @@ -67,7 +67,7 @@ impl WriteCommand for TxnHeartBeat { Some(mut lock) if lock.ts == self.start_ts => { if lock.ttl < self.advise_ttl { lock.ttl = self.advise_ttl; - txn.put_lock(self.primary_key.clone(), &lock); + txn.put_lock(self.primary_key.clone(), &lock, false); } lock } @@ -83,6 +83,7 @@ impl WriteCommand for TxnHeartBeat { let pr = ProcessResult::TxnStatus { txn_status: TxnStatus::uncommitted(lock, false), }; + let new_acquired_locks = txn.take_new_locks(); let mut write_data = WriteData::from_modifies(txn.into_modifies()); 
write_data.set_allowed_on_disk_almost_full(); Ok(WriteResult { @@ -92,6 +93,7 @@ impl WriteCommand for TxnHeartBeat { pr, lock_info: vec![], released_locks: ReleasedLocks::new(), + new_acquired_locks, lock_guards: vec![], response_policy: ResponsePolicy::OnApplied, }) diff --git a/src/storage/txn/flow_controller/tablet_flow_controller.rs b/src/storage/txn/flow_controller/tablet_flow_controller.rs index 17a5900bea7..922e986874a 100644 --- a/src/storage/txn/flow_controller/tablet_flow_controller.rs +++ b/src/storage/txn/flow_controller/tablet_flow_controller.rs @@ -11,9 +11,9 @@ use std::{ time::Duration, }; -use collections::HashMap; +use collections::{HashMap, HashMapEntry}; use engine_rocks::FlowInfo; -use engine_traits::{CfNamesExt, FlowControlFactorsExt, OpenOptions, TabletFactory}; +use engine_traits::{CfNamesExt, FlowControlFactorsExt, TabletRegistry}; use rand::Rng; use tikv_util::{sys::thread::StdThreadBuildWrapper, time::Limiter}; @@ -47,9 +47,9 @@ impl Drop for TabletFlowController { } impl TabletFlowController { - pub fn new( + pub fn new( config: &FlowControlConfig, - tablet_factory: Arc + Send + Sync>, + registry: TabletRegistry, flow_info_receiver: Receiver, ) -> Self { let (tx, rx) = mpsc::sync_channel(5); @@ -69,7 +69,7 @@ impl TabletFlowController { handle: Some(FlowInfoDispatcher::start( rx, flow_info_receiver, - tablet_factory, + registry, flow_checkers, limiters, config.clone(), @@ -86,10 +86,10 @@ impl TabletFlowController { struct FlowInfoDispatcher; impl FlowInfoDispatcher { - fn start( + fn start( rx: Receiver, flow_info_receiver: Receiver, - tablet_factory: Arc + Send + Sync>, + registry: TabletRegistry, flow_checkers: Arc>>>, limiters: Limiters, config: FlowControlConfig, @@ -116,32 +116,6 @@ impl FlowInfoDispatcher { Err(_) => {} } - let insert_limiter_and_checker = |region_id, suffix| -> FlowChecker { - let engine = tablet_factory - .open_tablet( - region_id, - Some(suffix), - OpenOptions::default().set_cache_only(true), - ) - .unwrap(); 
- let mut v = limiters.as_ref().write().unwrap(); - let discard_ratio = Arc::new(AtomicU32::new(0)); - let limiter = v.entry(region_id).or_insert(( - Arc::new( - ::builder(f64::INFINITY) - .refill(Duration::from_millis(1)) - .build(), - ), - discard_ratio, - )); - FlowChecker::new_with_tablet_suffix( - &config, - engine, - limiter.1.clone(), - limiter.0.clone(), - suffix, - ) - }; let msg = flow_info_receiver.recv_deadline(deadline); match msg.clone() { Ok(FlowInfo::L0(_cf, _, region_id, suffix)) @@ -165,22 +139,43 @@ impl FlowInfoDispatcher { } Ok(FlowInfo::Created(region_id, suffix)) => { let mut checkers = flow_checkers.as_ref().write().unwrap(); - let checker = checkers - .entry(region_id) - .or_insert_with(|| insert_limiter_and_checker(region_id, suffix)); + let checker = match checkers.entry(region_id) { + HashMapEntry::Occupied(e) => e.into_mut(), + HashMapEntry::Vacant(e) => { + let engine = if let Some(mut c) = registry.get(region_id) && let Some(t) = c.latest() { + t.clone() + } else { + continue; + }; + let mut v = limiters.as_ref().write().unwrap(); + let discard_ratio = Arc::new(AtomicU32::new(0)); + let limiter = v.entry(region_id).or_insert(( + Arc::new( + ::builder(f64::INFINITY) + .refill(Duration::from_millis(1)) + .build(), + ), + discard_ratio, + )); + e.insert(FlowChecker::new_with_tablet_suffix( + &config, + engine, + limiter.1.clone(), + limiter.0.clone(), + suffix, + )) + }, + }; // check if the checker's engine is exactly (region_id, suffix) // if checker.suffix < suffix, it means its tablet is old and needs the // refresh if checker.tablet_suffix() < suffix { - let engine = tablet_factory - .open_tablet( - region_id, - Some(suffix), - OpenOptions::default().set_cache_only(true), - ) - .unwrap(); - checker.set_engine(engine); - checker.set_tablet_suffix(suffix); + let cached = registry.get(region_id); + // None means the region is destroyed. 
+ if let Some(mut c) = cached && let Some(engine) = c.latest() { + checker.set_engine(engine.clone()); + checker.set_tablet_suffix(suffix); + } } } Ok(FlowInfo::Destroyed(region_id, suffix)) => { @@ -296,35 +291,44 @@ impl TabletFlowController { #[cfg(test)] mod tests { use engine_rocks::FlowInfo; - use engine_traits::DummyFactory; + use engine_traits::{SingletonFactory, TabletContext}; + use tempfile::TempDir; use super::{ super::{singleton_flow_controller::tests::*, FlowController}, *, }; - fn create_tablet_flow_controller() -> (FlowController, mpsc::SyncSender, EngineStub) { + fn create_tablet_flow_controller() -> ( + TempDir, + FlowController, + mpsc::SyncSender, + TabletRegistry, + ) { let (tx, rx) = mpsc::sync_channel(0); - let root_path = "/tmp"; + let temp_dir = tempfile::tempdir().unwrap(); let stub = EngineStub::new(); - let factory = DummyFactory::::new(Some(stub.clone()), root_path.to_string()); - let tablet_factory = Arc::new(factory); + let factory = Box::new(SingletonFactory::new(stub)); + let registry = TabletRegistry::new(factory, temp_dir.path()).unwrap(); ( + temp_dir, FlowController::Tablet(TabletFlowController::new( &FlowControlConfig::default(), - tablet_factory, + registry.clone(), rx, )), tx, - stub, + registry, ) } #[test] fn test_tablet_flow_controller_basic() { - let (flow_controller, tx, _) = create_tablet_flow_controller(); + let (_dir, flow_controller, tx, reg) = create_tablet_flow_controller(); let region_id = 5_u64; let tablet_suffix = 5_u64; + let tablet_context = TabletContext::with_infinite_region(region_id, Some(tablet_suffix)); + reg.load(tablet_context, false).unwrap(); tx.send(FlowInfo::Created(region_id, tablet_suffix)) .unwrap(); tx.send(FlowInfo::L0Intra( @@ -348,9 +352,12 @@ mod tests { #[test] fn test_tablet_flow_controller_memtable() { - let (flow_controller, tx, stub) = create_tablet_flow_controller(); + let (_dir, flow_controller, tx, reg) = create_tablet_flow_controller(); let region_id = 5_u64; let tablet_suffix = 
5_u64; + let tablet_context = TabletContext::with_infinite_region(region_id, Some(tablet_suffix)); + let mut cached = reg.load(tablet_context, false).unwrap(); + let stub = cached.latest().unwrap().clone(); tx.send(FlowInfo::Created(region_id, tablet_suffix)) .unwrap(); tx.send(FlowInfo::L0Intra( @@ -365,9 +372,12 @@ mod tests { #[test] fn test_tablet_flow_controller_l0() { - let (flow_controller, tx, stub) = create_tablet_flow_controller(); + let (_dir, flow_controller, tx, reg) = create_tablet_flow_controller(); let region_id = 5_u64; let tablet_suffix = 5_u64; + let tablet_context = TabletContext::with_infinite_region(region_id, Some(tablet_suffix)); + let mut cached = reg.load(tablet_context, false).unwrap(); + let stub = cached.latest().unwrap().clone(); tx.send(FlowInfo::Created(region_id, tablet_suffix)) .unwrap(); tx.send(FlowInfo::L0Intra( @@ -382,9 +392,12 @@ mod tests { #[test] fn test_tablet_flow_controller_pending_compaction_bytes() { - let (flow_controller, tx, stub) = create_tablet_flow_controller(); + let (_dir, flow_controller, tx, reg) = create_tablet_flow_controller(); let region_id = 5_u64; let tablet_suffix = 5_u64; + let tablet_context = TabletContext::with_infinite_region(region_id, Some(tablet_suffix)); + let mut cached = reg.load(tablet_context, false).unwrap(); + let stub = cached.latest().unwrap().clone(); tx.send(FlowInfo::Created(region_id, tablet_suffix)) .unwrap(); tx.send(FlowInfo::L0Intra( diff --git a/src/storage/txn/scheduler.rs b/src/storage/txn/scheduler.rs index bfbb860e545..d96e3e7c97f 100644 --- a/src/storage/txn/scheduler.rs +++ b/src/storage/txn/scheduler.rs @@ -702,7 +702,7 @@ impl Scheduler { Command::FlashbackToVersionReadPhase { .. } | Command::FlashbackToVersion { .. } ) { - snap_ctx.for_flashback = true; + snap_ctx.allowed_in_flashback = true; } // The program is currently in scheduler worker threads. // Safety: `self.inner.worker_pool` should ensure that a TLS engine exists. 
@@ -798,6 +798,7 @@ impl Scheduler { lock_guards: Vec, pipelined: bool, async_apply_prewrite: bool, + new_acquired_locks: Vec, tag: CommandKind, ) { // TODO: Does async apply prewrite worth a special metric here? @@ -846,7 +847,7 @@ impl Scheduler { assert!(pipelined || async_apply_prewrite); } - // TODO: Update lock wait relationships after acquiring some locks. + self.on_acquired_locks_finished(new_acquired_locks); if do_wake_up { let woken_up_resumable_lock_requests = tctx.woken_up_resumable_lock_requests; @@ -978,6 +979,28 @@ impl Scheduler { resumable_wake_up_list } + fn on_acquired_locks_finished(&self, new_acquired_locks: Vec) { + if new_acquired_locks.is_empty() || self.inner.lock_wait_queues.is_empty() { + return; + } + + // If there are not too many new locks, do not spawn the task to the high + // priority pool since it may consume more CPU. + if new_acquired_locks.len() < 30 { + self.inner + .lock_wait_queues + .update_lock_wait(new_acquired_locks); + } else { + let lock_wait_queues = self.inner.lock_wait_queues.clone(); + self.get_sched_pool(CommandPri::High) + .pool + .spawn(async move { + lock_wait_queues.update_lock_wait(new_acquired_locks); + }) + .unwrap(); + } + } + fn wake_up_legacy_pessimistic_locks( &self, legacy_wake_up_list: impl IntoIterator, ReleasedLock)> @@ -1201,6 +1224,7 @@ impl Scheduler { pr, lock_info, released_locks, + new_acquired_locks, lock_guards, response_policy, } = match deadline @@ -1273,7 +1297,16 @@ impl Scheduler { } if to_be_write.modifies.is_empty() { - scheduler.on_write_finished(cid, pr, Ok(()), lock_guards, false, false, tag); + scheduler.on_write_finished( + cid, + pr, + Ok(()), + lock_guards, + false, + false, + new_acquired_locks, + tag, + ); return; } @@ -1294,7 +1327,16 @@ impl Scheduler { engine.schedule_txn_extra(to_be_write.extra); }) } - scheduler.on_write_finished(cid, pr, Ok(()), lock_guards, false, false, tag); + scheduler.on_write_finished( + cid, + pr, + Ok(()), + lock_guards, + false, + false, + 
new_acquired_locks, + tag, + ); return; } @@ -1478,6 +1520,7 @@ impl Scheduler { lock_guards, pipelined, is_async_apply_prewrite, + new_acquired_locks, tag, ); KV_COMMAND_KEYWRITE_HISTOGRAM_VEC diff --git a/tests/failpoints/cases/test_gc_worker.rs b/tests/failpoints/cases/test_gc_worker.rs index 3dbb7ffc7b0..d24ec85f040 100644 --- a/tests/failpoints/cases/test_gc_worker.rs +++ b/tests/failpoints/cases/test_gc_worker.rs @@ -6,7 +6,7 @@ use std::{ time::Duration, }; -use engine_traits::{Peekable, WriteBatch}; +use engine_traits::Peekable; use grpcio::{ChannelBuilder, Environment}; use keys::data_key; use kvproto::{kvrpcpb::*, metapb::Region, tikvpb::TikvClient}; diff --git a/tests/failpoints/cases/test_pd_client.rs b/tests/failpoints/cases/test_pd_client.rs index 635b199291b..ca0a473a8b7 100644 --- a/tests/failpoints/cases/test_pd_client.rs +++ b/tests/failpoints/cases/test_pd_client.rs @@ -257,10 +257,17 @@ fn test_retry() { F: FnMut(&mut RpcClientV2) -> pd_client::Result, R: std::fmt::Debug, { - run_on_bad_connection(client, |c| { - f(c).unwrap_err(); - f(c).unwrap(); - }); + let mut success = false; + for _ in 0..3 { + run_on_bad_connection(client, |c| { + f(c).unwrap_err(); + success = f(c).is_ok(); + }); + if success { + return; + } + } + panic!("failed to retry after three attempts"); } test_retry_success(&mut client, |c| { diff --git a/tests/failpoints/cases/test_snap.rs b/tests/failpoints/cases/test_snap.rs index dde25bff636..a6a4a1824f3 100644 --- a/tests/failpoints/cases/test_snap.rs +++ b/tests/failpoints/cases/test_snap.rs @@ -698,9 +698,9 @@ fn test_snapshot_clean_up_logs_with_unfinished_log_gc() { // Disable default max peer number check. pd_client.disable_default_operator(); cluster.run(); - // Simulate raft log gc are pending in queue. + // Simulate raft log gc tasks are lost during shutdown. 
let fp = "worker_gc_raft_log"; - fail::cfg(fp, "return(0)").unwrap(); + fail::cfg(fp, "return").unwrap(); let state = cluster.truncated_state(1, 3); for i in 0..30 { diff --git a/tests/failpoints/cases/test_stale_peer.rs b/tests/failpoints/cases/test_stale_peer.rs index 0321772661d..1a4ef0b0afc 100644 --- a/tests/failpoints/cases/test_stale_peer.rs +++ b/tests/failpoints/cases/test_stale_peer.rs @@ -301,9 +301,9 @@ fn test_destroy_clean_up_logs_with_unfinished_log_gc() { // Disable default max peer number check. pd_client.disable_default_operator(); cluster.run(); - // Simulate raft log gc are pending in queue. + // Simulate raft log gc tasks are lost during shutdown. let fp = "worker_gc_raft_log"; - fail::cfg(fp, "return(0)").unwrap(); + fail::cfg(fp, "return").unwrap(); let state = cluster.truncated_state(1, 3); for i in 0..30 { diff --git a/tests/failpoints/cases/test_storage.rs b/tests/failpoints/cases/test_storage.rs index 40c78dfabde..2508b544285 100644 --- a/tests/failpoints/cases/test_storage.rs +++ b/tests/failpoints/cases/test_storage.rs @@ -12,7 +12,6 @@ use std::{ use api_version::{ApiV1, ApiV2, KvFormat}; use collections::HashMap; -use engine_traits::DummyFactory; use errors::{extract_key_error, extract_region_error}; use futures::executor::block_on; use grpcio::*; @@ -262,13 +261,12 @@ fn test_scale_scheduler_pool() { rx, ))); - let cfg_controller = ConfigController::new(cfg.clone()); + let cfg_controller = ConfigController::new(cfg); let (scheduler, _receiver) = dummy_scheduler(); cfg_controller.register( Module::Storage, Box::new(StorageConfigManger::new( - Arc::new(DummyFactory::new(Some(kv_engine), "".to_string())), - cfg.storage.block_cache.shared, + kv_engine, scheduler, flow_controller, storage.get_scheduler(), diff --git a/tests/failpoints/cases/test_table_properties.rs b/tests/failpoints/cases/test_table_properties.rs index 905bcfbd690..559ad5b0746 100644 --- a/tests/failpoints/cases/test_table_properties.rs +++ 
b/tests/failpoints/cases/test_table_properties.rs @@ -82,16 +82,16 @@ fn test_check_need_gc() { // TEST 2: props.num_versions as f64 > props.num_rows as f64 * ratio_threshold // return true. do_write(&engine, false, 5); - engine.get_rocksdb().flush_cfs(true).unwrap(); + engine.get_rocksdb().flush_cfs(&[], true).unwrap(); do_gc(&raw_engine, 2, &mut gc_runner, &dir); do_write(&engine, false, 5); - engine.get_rocksdb().flush_cfs(true).unwrap(); + engine.get_rocksdb().flush_cfs(&[], true).unwrap(); // Set ratio_threshold, let (props.num_versions as f64 > props.num_rows as // f64 * ratio_threshold) return true - gc_runner.ratio_threshold = Option::Some(f64::MIN); + gc_runner.ratio_threshold = Option::Some(0.0f64); // is_bottommost_level = false do_gc(&raw_engine, 1, &mut gc_runner, &dir); @@ -185,7 +185,7 @@ fn test_skip_gc_by_check() { let mut gc_runner = TestGcRunner::new(0); do_write(&engine, false, 5); - engine.get_rocksdb().flush_cfs(true).unwrap(); + engine.get_rocksdb().flush_cfs(&[], true).unwrap(); // The min_mvcc_ts ts > gc safepoint, check_need_gc return false, don't call // dofilter @@ -208,12 +208,12 @@ fn test_skip_gc_by_check() { // TEST 2:When is_bottommost_level = false, // write data to level2 do_write(&engine, false, 5); - engine.get_rocksdb().flush_cfs(true).unwrap(); + engine.get_rocksdb().flush_cfs(&[], true).unwrap(); do_gc(&raw_engine, 2, &mut gc_runner, &dir); do_write(&engine, false, 5); - engine.get_rocksdb().flush_cfs(true).unwrap(); + engine.get_rocksdb().flush_cfs(&[], true).unwrap(); // Set ratio_threshold, let (props.num_versions as f64 > props.num_rows as // f64 * ratio_threshold) return false diff --git a/tests/integrations/backup/mod.rs b/tests/integrations/backup/mod.rs index f432fd72246..4cfd4be07be 100644 --- a/tests/integrations/backup/mod.rs +++ b/tests/integrations/backup/mod.rs @@ -598,3 +598,33 @@ fn calculated_commit_ts_after_commit() { commit_ts }); } + +#[test] +fn test_backup_in_flashback() { + let mut suite = 
TestSuite::new(3, 144 * 1024 * 1024, ApiVersion::V1); + suite.must_kv_put(3, 1); + // Prepare the flashback. + let region = suite.cluster.get_region(b"key_0"); + suite.cluster.must_send_wait_flashback_msg( + region.get_id(), + kvproto::raft_cmdpb::AdminCmdType::PrepareFlashback, + ); + // Start the backup. + let tmp = Builder::new().tempdir().unwrap(); + let backup_ts = suite.alloc_ts(); + let storage_path = make_unique_dir(tmp.path()); + let rx = suite.backup( + vec![], // start + vec![], // end + 0.into(), // begin_ts + backup_ts, + &storage_path, + ); + let resp = block_on(rx.collect::>()); + assert!(!resp[0].has_error()); + // Finish the flashback. + suite.cluster.must_send_wait_flashback_msg( + region.get_id(), + kvproto::raft_cmdpb::AdminCmdType::FinishFlashback, + ); +} diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index ff01788c370..c6f8e565218 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -32,7 +32,7 @@ use tikv::{ lock_manager::Config as PessimisticTxnConfig, Config as ServerConfig, }, storage::config::{ - BlockCacheConfig, Config as StorageConfig, FlowControlConfig, IoRateLimitConfig, + BlockCacheConfig, Config as StorageConfig, EngineType, FlowControlConfig, IoRateLimitConfig, }, }; use tikv_util::config::{LogFormat, ReadableDuration, ReadableSize}; @@ -289,7 +289,7 @@ fn test_serde_custom_tikv_config() { max_manifest_file_size: ReadableSize::mb(12), create_if_missing: false, max_open_files: 12_345, - enable_statistics: false, + enable_statistics: true, stats_dump_period: ReadableDuration::minutes(12), compaction_readahead_size: ReadableSize::kb(1), info_log_max_size: ReadableSize::kb(1), @@ -308,8 +308,13 @@ fn test_serde_custom_tikv_config() { writable_file_max_buffer_size: ReadableSize::mb(12), use_direct_io_for_flush_and_compaction: true, enable_pipelined_write: false, - enable_multi_batch_write: true, + enable_multi_batch_write: Some(true), + paranoid_checks: None, + 
allow_concurrent_memtable_write: Some(false), enable_unordered_write: true, + write_buffer_limit: Some(ReadableSize::gb(1)), + write_buffer_stall_ratio: 0.0, + write_buffer_flush_oldest_first: false, defaultcf: DefaultCfConfig { block_size: ReadableSize::kb(12), block_cache_size: ReadableSize::gb(12), @@ -363,6 +368,7 @@ fn test_serde_custom_tikv_config() { prepopulate_block_cache: PrepopulateBlockCache::FlushOnly, format_version: 5, checksum: ChecksumType::XXH3, + max_compactions: 3, }, writecf: WriteCfConfig { block_size: ReadableSize::kb(12), @@ -431,6 +437,7 @@ fn test_serde_custom_tikv_config() { prepopulate_block_cache: PrepopulateBlockCache::FlushOnly, format_version: 5, checksum: ChecksumType::XXH3, + max_compactions: 3, }, lockcf: LockCfConfig { block_size: ReadableSize::kb(12), @@ -499,6 +506,7 @@ fn test_serde_custom_tikv_config() { prepopulate_block_cache: PrepopulateBlockCache::FlushOnly, format_version: 5, checksum: ChecksumType::XXH3, + max_compactions: 3, }, raftcf: RaftCfConfig { block_size: ReadableSize::kb(12), @@ -567,6 +575,7 @@ fn test_serde_custom_tikv_config() { prepopulate_block_cache: PrepopulateBlockCache::FlushOnly, format_version: 5, checksum: ChecksumType::XXH3, + max_compactions: 3, }, titan: titan_db_config.clone(), }; @@ -582,7 +591,7 @@ fn test_serde_custom_tikv_config() { max_manifest_file_size: ReadableSize::mb(12), create_if_missing: false, max_open_files: 12_345, - enable_statistics: false, + enable_statistics: true, stats_dump_period: ReadableDuration::minutes(12), compaction_readahead_size: ReadableSize::kb(1), info_log_max_size: ReadableSize::kb(1), @@ -650,6 +659,7 @@ fn test_serde_custom_tikv_config() { prepopulate_block_cache: PrepopulateBlockCache::FlushOnly, format_version: 5, checksum: ChecksumType::XXH3, + max_compactions: 3, }, titan: titan_db_config, }; @@ -665,6 +675,7 @@ fn test_serde_custom_tikv_config() { raft_engine_config.memory_limit = Some(RaftEngineReadableSize::gb(1)); value.storage = StorageConfig { 
data_dir: "/var".to_owned(), + engine: EngineType::RaftKv2, gc_ratio_threshold: 1.2, max_key_size: 4096, scheduler_concurrency: 123, @@ -684,7 +695,7 @@ fn test_serde_custom_tikv_config() { hard_pending_compaction_bytes_limit: ReadableSize(1), }, block_cache: BlockCacheConfig { - shared: true, + shared: None, capacity: Some(ReadableSize::gb(40)), num_shard_bits: 10, strict_capacity_limit: true, @@ -758,8 +769,13 @@ fn test_serde_custom_tikv_config() { ..Default::default() }; value.backup_stream = BackupStreamConfig { - num_threads: 12, - ..Default::default() + max_flush_interval: ReadableDuration::secs(11), + num_threads: 7, + enable: true, + temp_path: "./stream".to_string(), + file_size_limit: ReadableSize::gb(5), + initial_scan_pending_memory_quota: ReadableSize::kb(2), + initial_scan_rate_limit: ReadableSize::mb(3), }; value.import = ImportConfig { num_threads: 123, @@ -817,6 +833,7 @@ fn test_serde_custom_tikv_config() { } } +#[track_caller] fn diff_config(lhs: &TikvConfig, rhs: &TikvConfig) { let lhs_str = format!("{:?}", lhs); let rhs_str = format!("{:?}", rhs); @@ -886,7 +903,6 @@ fn test_do_not_use_unified_readpool_with_legacy_config() { fn test_block_cache_backward_compatible() { let content = read_file_in_project_dir("integrations/config/test-cache-compatible.toml"); let mut cfg: TikvConfig = toml::from_str(&content).unwrap(); - assert!(cfg.storage.block_cache.shared); assert!(cfg.storage.block_cache.capacity.is_none()); cfg.compatible_adjust(); assert!(cfg.storage.block_cache.capacity.is_some()); diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index e5c896238bc..b096437e60c 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -90,6 +90,7 @@ a = "b" [storage] data-dir = "/var" +engine = "raft-kv2" gc-ratio-threshold = 1.2 max-key-size = 4096 scheduler-concurrency = 123 @@ -102,7 +103,6 @@ enable-ttl = true ttl-check-poll-interval = "0s" 
[storage.block-cache] -shared = true capacity = "40GB" num-shard-bits = 10 strict-capacity-limit = true @@ -252,7 +252,6 @@ max-background-flushes = 4 max-manifest-file-size = "12MB" create-if-missing = false max-open-files = 12345 -enable-statistics = false stats-dump-period = "12m" compaction-readahead-size = "1KB" info-log-max-size = "1KB" @@ -269,7 +268,10 @@ max-sub-compactions = 12 writable-file-max-buffer-size = "12MB" use-direct-io-for-flush-and-compaction = true enable-pipelined-write = false +enable-multi-batch-write = true enable-unordered-write = true +allow-concurrent-memtable-write = false +write-buffer-limit = "1GB" [rocksdb.titan] enabled = true @@ -330,6 +332,7 @@ compaction-guard-max-output-file-size = "34MB" prepopulate-block-cache = "flush-only" format-version = 5 checksum = "xxh3" +max-compactions = 3 [rocksdb.defaultcf.titan] min-blob-size = "2018B" @@ -392,6 +395,7 @@ compaction-guard-max-output-file-size = "34MB" prepopulate-block-cache = "flush-only" format-version = 5 checksum = "xxh3" +max-compactions = 3 [rocksdb.lockcf] block-size = "12KB" @@ -441,6 +445,7 @@ compaction-guard-max-output-file-size = "34MB" prepopulate-block-cache = "flush-only" format-version = 5 checksum = "xxh3" +max-compactions = 3 [rocksdb.raftcf] block-size = "12KB" @@ -490,6 +495,7 @@ compaction-guard-max-output-file-size = "34MB" prepopulate-block-cache = "flush-only" format-version = 5 checksum = "xxh3" +max-compactions = 3 [raftdb] wal-recovery-mode = "skip-any-corrupted-records" @@ -502,7 +508,6 @@ max-background-flushes = 4 max-manifest-file-size = "12MB" create-if-missing = false max-open-files = 12345 -enable-statistics = false stats-dump-period = "12m" compaction-readahead-size = "1KB" info-log-max-size = "1KB" @@ -572,6 +577,7 @@ compaction-guard-max-output-file-size = "34MB" prepopulate-block-cache = "flush-only" format-version = 5 checksum = "xxh3" +max-compactions = 3 [raftdb.defaultcf.titan] min-blob-size = "2018B" @@ -625,6 +631,15 @@ batch-size = 7 
s3-multi-part-size = "15MB" sst-max-size = "789MB" +[log-backup] +max-flush-interval = "11s" +num-threads = 7 +enable = true +temp-path = "./stream" +file-size-limit = "5GiB" +initial-scan-pending-memory-quota = "2KiB" +initial-scan-rate-limit = "3MiB" + [backup.hadoop] home = "/root/hadoop" linux-user = "hadoop" diff --git a/tests/integrations/pd/test_rpc_client.rs b/tests/integrations/pd/test_rpc_client.rs index 23841ba5dfd..ca37318aa8b 100644 --- a/tests/integrations/pd/test_rpc_client.rs +++ b/tests/integrations/pd/test_rpc_client.rs @@ -112,7 +112,7 @@ fn test_rpc_client() { assert_eq!(ts.logical() + 100, ts100.logical()); let mut prev_id = 0; - for _ in 0..100 { + for _ in 0..10 { let mut client = new_client_v2(eps.clone(), None); let alloc_id = client.alloc_id().unwrap(); assert!(alloc_id > prev_id); diff --git a/tests/integrations/raftstore/test_compact_lock_cf.rs b/tests/integrations/raftstore/test_compact_lock_cf.rs index c8ee96c7c67..fbc7629c73f 100644 --- a/tests/integrations/raftstore/test_compact_lock_cf.rs +++ b/tests/integrations/raftstore/test_compact_lock_cf.rs @@ -15,11 +15,9 @@ fn flush_then_check(cluster: &mut Cluster, interval: u64, writt flush(cluster); // Wait for compaction. 
sleep_ms(interval * 2); - for engines in cluster.engines.values() { - let compact_write_bytes = engines - .kv - .as_inner() - .get_statistics_ticker_count(DBStatisticsTickerType::CompactWriteBytes); + for statistics in &cluster.kv_statistics { + let compact_write_bytes = + statistics.get_ticker_count(DBStatisticsTickerType::CompactWriteBytes); if written { assert!(compact_write_bytes > 0); } else { diff --git a/tests/integrations/raftstore/test_flashback.rs b/tests/integrations/raftstore/test_flashback.rs index 89a61223fa2..e50ca59fdff 100644 --- a/tests/integrations/raftstore/test_flashback.rs +++ b/tests/integrations/raftstore/test_flashback.rs @@ -9,7 +9,7 @@ use futures::{channel::oneshot, executor::block_on}; use kvproto::{ errorpb::FlashbackInProgress, metapb, - raft_cmdpb::{AdminCmdType, RaftCmdResponse, Request}, + raft_cmdpb::{AdminCmdType, CmdType, RaftCmdResponse, Request}, }; use raftstore::store::Callback; use test_raftstore::*; @@ -18,6 +18,61 @@ use txn_types::WriteBatchFlags; const TEST_KEY: &[u8] = b"k1"; const TEST_VALUE: &[u8] = b"v1"; +#[test] +fn test_allow_read_only_request() { + let mut cluster = new_node_cluster(0, 3); + cluster.run(); + cluster.must_transfer_leader(1, new_peer(1, 1)); + + let mut region = cluster.get_region(TEST_KEY); + let mut snap_req = Request::default(); + snap_req.set_cmd_type(CmdType::Snap); + // Get snapshot normally. + let snap_resp = request(&mut cluster, &mut region.clone(), snap_req.clone(), false); + assert!(!snap_resp.get_header().has_error()); + // Get snapshot with flashback flag without in the flashback state. + let snap_resp = request(&mut cluster, &mut region.clone(), snap_req.clone(), true); + assert!(!snap_resp.get_header().has_error()); + // Get snapshot with flashback flag with in the flashback state. 
+ cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); + let snap_resp = request(&mut cluster, &mut region.clone(), snap_req.clone(), true); + assert!(!snap_resp.get_header().has_error()); + // Get snapshot without flashback flag with in the flashback state. + let snap_resp = request(&mut cluster, &mut region, snap_req, false); + assert!( + snap_resp + .get_header() + .get_error() + .has_flashback_in_progress(), + "{:?}", + snap_resp + ); + // Finish flashback. + cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); +} + +#[test] +#[cfg(feature = "failpoints")] +fn test_read_after_prepare_flashback() { + let mut cluster = new_node_cluster(0, 3); + cluster.run(); + cluster.must_transfer_leader(1, new_peer(1, 1)); + + let region = cluster.get_region(TEST_KEY); + fail::cfg("keep_peer_fsm_flashback_state_false", "return").unwrap(); + // Prepare flashback. + cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); + // Read with flashback flag will succeed even the peer fsm does not updated its + // `is_in_flashback` flag. + must_request_with_flashback_flag(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); + // Writing with flashback flag will succeed since the ApplyFSM owns the + // latest `is_in_flashback` flag. + must_request_with_flashback_flag(&mut cluster, &mut region.clone(), new_get_cmd(TEST_KEY)); + fail::remove("keep_peer_fsm_flashback_state_false"); + // Finish flashback. + cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); +} + #[test] fn test_prepare_flashback_after_split() { let mut cluster = new_node_cluster(0, 3); @@ -281,8 +336,9 @@ fn test_flashback_for_local_read() { // Check the leader does a local read. let state = cluster.raft_local_state(region.get_id(), store_id); assert_eq!(state.get_last_index(), last_index); - // A local read with flashback flag will also be blocked. 
- must_get_flashback_not_prepared_error(&mut cluster, &mut region, new_get_cmd(TEST_KEY)); + // A local read with flashback flag will not be blocked since it won't have any + // side effects. + must_request_with_flashback_flag(&mut cluster, &mut region, new_get_cmd(TEST_KEY)); } #[test] @@ -336,30 +392,62 @@ fn test_flashback_for_apply_snapshot() { must_check_flashback_state(&mut cluster, 1, 1, false); must_check_flashback_state(&mut cluster, 1, 3, false); - // Make store 3 isolated. cluster.add_send_filter(IsolationFilterFactory::new(3)); - // Write some data to trigger snapshot. - for i in 100..110 { - let key = format!("k{}", i); - let value = format!("v{}", i); - cluster.must_put_cf("write", key.as_bytes(), value.as_bytes()); + let mut region = cluster.get_region(TEST_KEY); + for _ in 0..10 { + must_request_without_flashback_flag( + &mut cluster, + &mut region.clone(), + new_put_cf_cmd("write", TEST_KEY, TEST_VALUE), + ) } - // Prepare for flashback cluster.must_send_wait_flashback_msg(1, AdminCmdType::PrepareFlashback); must_check_flashback_state(&mut cluster, 1, 1, true); must_check_flashback_state(&mut cluster, 1, 3, false); - // Add store 3 back. cluster.clear_send_filters(); must_check_flashback_state(&mut cluster, 1, 1, true); must_check_flashback_state(&mut cluster, 1, 3, true); + cluster.must_send_wait_flashback_msg(1, AdminCmdType::FinishFlashback); + must_check_flashback_state(&mut cluster, 1, 1, false); + must_check_flashback_state(&mut cluster, 1, 3, false); + // Prepare for flashback + cluster.must_send_wait_flashback_msg(1, AdminCmdType::PrepareFlashback); + must_check_flashback_state(&mut cluster, 1, 1, true); + must_check_flashback_state(&mut cluster, 1, 3, true); + // Make store 3 isolated. + cluster.add_send_filter(IsolationFilterFactory::new(3)); + // Write some flashback data to trigger snapshot. 
+ for _ in 0..10 { + must_request_with_flashback_flag( + &mut cluster, + &mut region.clone(), + new_put_cf_cmd("write", TEST_KEY, TEST_VALUE), + ) + } + // Finish flashback. cluster.must_send_wait_flashback_msg(1, AdminCmdType::FinishFlashback); must_check_flashback_state(&mut cluster, 1, 1, false); + must_check_flashback_state(&mut cluster, 1, 3, true); + // Wait for a while before adding store 3 back to make sure only it does not + // receive the `FinishFlashback` message. + sleep(Duration::from_secs(1)); + // Add store 3 back. + cluster.clear_send_filters(); + must_check_flashback_state(&mut cluster, 1, 1, false); must_check_flashback_state(&mut cluster, 1, 3, false); + // Make store 3 become leader. + cluster.must_transfer_leader(region.get_id(), new_peer(3, 3)); + // Region should not in the flashback state. + must_request_without_flashback_flag( + &mut cluster, + &mut region, + new_put_cmd(TEST_KEY, TEST_VALUE), + ); } fn must_check_flashback_state( @@ -415,7 +503,7 @@ fn must_request_with_flashback_flag( req: Request, ) { let resp = request(cluster, region, req, true); - assert!(!resp.get_header().has_error()); + assert!(!resp.get_header().has_error(), "{:?}", resp); } fn must_get_flashback_not_prepared_error( @@ -434,7 +522,7 @@ fn must_request_without_flashback_flag( req: Request, ) { let resp = request(cluster, region, req, false); - assert!(!resp.get_header().has_error()); + assert!(!resp.get_header().has_error(), "{:?}", resp); } fn must_get_flashback_in_progress_error( diff --git a/tests/integrations/raftstore/test_stats.rs b/tests/integrations/raftstore/test_stats.rs index 22d23f7adba..67e5e261dab 100644 --- a/tests/integrations/raftstore/test_stats.rs +++ b/tests/integrations/raftstore/test_stats.rs @@ -27,7 +27,7 @@ fn check_available(cluster: &mut Cluster) { for i in 0..1000 { let last_available = stats.get_available(); cluster.must_put(format!("k{}", i).as_bytes(), &value); - engine.flush_cfs(true).unwrap(); + engine.flush_cfs(&[], true).unwrap(); 
sleep_ms(20); let stats = pd_client.get_store_stats(1).unwrap(); @@ -58,7 +58,7 @@ fn test_simple_store_stats(cluster: &mut Cluster) { } let engine = cluster.get_engine(1); - engine.flush_cfs(true).unwrap(); + engine.flush_cfs(&[], true).unwrap(); let last_stats = pd_client.get_store_stats(1).unwrap(); assert_eq!(last_stats.get_region_count(), 1); @@ -67,7 +67,7 @@ fn test_simple_store_stats(cluster: &mut Cluster) { let region = pd_client.get_region(b"").unwrap(); cluster.must_split(®ion, b"k2"); - engine.flush_cfs(true).unwrap(); + engine.flush_cfs(&[], true).unwrap(); // wait report region count after split for _ in 0..100 { diff --git a/tests/integrations/raftstore/test_update_region_size.rs b/tests/integrations/raftstore/test_update_region_size.rs index ee4fb79ac62..f2ff0d4f217 100644 --- a/tests/integrations/raftstore/test_update_region_size.rs +++ b/tests/integrations/raftstore/test_update_region_size.rs @@ -9,7 +9,7 @@ use tikv_util::config::*; fn flush(cluster: &mut Cluster) { for engines in cluster.engines.values() { - engines.kv.flush_cfs(true).unwrap(); + engines.kv.flush_cfs(&[], true).unwrap(); } } diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 12cff74861d..496c587a7b9 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -11,8 +11,8 @@ use std::{ use api_version::{ApiV1, ApiV1Ttl, ApiV2, KvFormat}; use concurrency_manager::ConcurrencyManager; use engine_traits::{ - MiscExt, Peekable, RaftEngine, RaftEngineReadOnly, SyncMutable, CF_DEFAULT, CF_LOCK, CF_RAFT, - CF_WRITE, + MiscExt, Peekable, RaftEngine, RaftEngineReadOnly, RaftLogBatch, SyncMutable, CF_DEFAULT, + CF_LOCK, CF_RAFT, CF_WRITE, }; use futures::{executor::block_on, future, SinkExt, StreamExt, TryStreamExt}; use grpcio::*; @@ -606,7 +606,7 @@ fn test_mvcc_flashback_failed_after_first_batch() { fail::cfg("flashback_failed_after_first_batch", "return").unwrap(); 
must_flashback_to_version(&client, ctx.clone(), check_ts, ts + 1, ts + 2); fail::remove("flashback_failed_after_first_batch"); - // key@1 must be flahsbacked in the second batch firstly. + // key@1 must be flashbacked in the second batch firstly. must_kv_read_equal( &client, ctx.clone(), @@ -777,12 +777,75 @@ fn test_mvcc_flashback_unprepared() { req.set_context(ctx.clone()); req.set_start_ts(4); req.set_commit_ts(5); + req.set_version(0); + req.set_start_key(b"a".to_vec()); + req.set_end_key(b"z".to_vec()); + let resp = client.kv_flashback_to_version(&req).unwrap(); + assert!(resp.get_error().contains("txn lock not found")); + must_kv_read_equal(&client, ctx.clone(), k.clone(), v, 6); + // Flashback with preparing. + must_flashback_to_version(&client, ctx.clone(), 0, 6, 7); + let mut get_req = GetRequest::default(); + get_req.set_context(ctx.clone()); + get_req.key = k; + get_req.version = 7; + let get_resp = client.kv_get(&get_req).unwrap(); + assert!(!get_resp.has_region_error()); + assert!(!get_resp.has_error()); + assert_eq!(get_resp.value, b"".to_vec()); + // Mock the flashback retry. 
+ let mut req = FlashbackToVersionRequest::default(); + req.set_context(ctx); + req.set_start_ts(6); + req.set_commit_ts(7); req.version = 0; req.start_key = b"a".to_vec(); req.end_key = b"z".to_vec(); let resp = client.kv_flashback_to_version(&req).unwrap(); - assert!(resp.get_region_error().has_flashback_not_prepared()); - must_kv_read_equal(&client, ctx, k, v, 6); + assert!(!resp.has_region_error()); + assert!(resp.get_error().is_empty()); + let get_resp = client.kv_get(&get_req).unwrap(); + assert!(!get_resp.has_region_error()); + assert!(!get_resp.has_error()); + assert_eq!(get_resp.value, b"".to_vec()); +} + +#[test] +fn test_mvcc_flashback_with_unlimit_range() { + let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (k, v) = (b"key".to_vec(), b"value".to_vec()); + let mut ts = 0; + write_and_read_key(&client, &ctx, &mut ts, k.clone(), v.clone()); + must_kv_read_equal(&client, ctx.clone(), k.clone(), v, 6); + + let mut prepare_req = PrepareFlashbackToVersionRequest::default(); + prepare_req.set_context(ctx.clone()); + prepare_req.set_start_ts(6); + prepare_req.set_version(0); + prepare_req.set_start_key(b"".to_vec()); + prepare_req.set_end_key(b"".to_vec()); + client + .kv_prepare_flashback_to_version(&prepare_req) + .unwrap(); + let mut req = FlashbackToVersionRequest::default(); + req.set_context(ctx.clone()); + req.set_start_ts(6); + req.set_commit_ts(7); + req.set_version(0); + req.set_start_key(b"".to_vec()); + req.set_end_key(b"".to_vec()); + let resp = client.kv_flashback_to_version(&req).unwrap(); + assert!(!resp.has_region_error()); + assert!(resp.get_error().is_empty()); + + let mut get_req = GetRequest::default(); + get_req.set_context(ctx); + get_req.key = k; + get_req.version = 7; + let get_resp = client.kv_get(&get_req).unwrap(); + assert!(!get_resp.has_region_error()); + assert!(!get_resp.has_error()); + assert_eq!(get_resp.value, b"".to_vec()); } // raft related RPC is tested as parts of test_snapshot.rs, so skip here. 
@@ -902,7 +965,9 @@ fn test_debug_raft_log() { entry.set_index(log_index); entry.set_entry_type(eraftpb::EntryType::EntryNormal); entry.set_data(vec![42].into()); - engine.append(region_id, vec![entry.clone()]).unwrap(); + let mut lb = engine.log_batch(0); + lb.append(region_id, vec![entry.clone()]).unwrap(); + engine.consume(&mut lb, false).unwrap(); assert_eq!( engine.get_entry(region_id, log_index).unwrap().unwrap(), entry @@ -936,7 +1001,9 @@ fn test_debug_region_info() { let region_id = 100; let mut raft_state = raft_serverpb::RaftLocalState::default(); raft_state.set_last_index(42); - raft_engine.put_raft_state(region_id, &raft_state).unwrap(); + let mut lb = raft_engine.log_batch(0); + lb.put_raft_state(region_id, &raft_state).unwrap(); + raft_engine.consume(&mut lb, false).unwrap(); assert_eq!( raft_engine.get_raft_state(region_id).unwrap().unwrap(), raft_state diff --git a/tests/integrations/server/lock_manager.rs b/tests/integrations/server/lock_manager.rs index 43032dd8cc3..289b10303a8 100644 --- a/tests/integrations/server/lock_manager.rs +++ b/tests/integrations/server/lock_manager.rs @@ -1,6 +1,14 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{sync::Arc, thread, time::Duration}; +use std::{ + sync::{ + mpsc, + mpsc::{RecvTimeoutError, TryRecvError}, + Arc, + }, + thread, + time::Duration, +}; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ @@ -283,3 +291,106 @@ fn test_detect_deadlock_when_merge_region() { must_transfer_leader(&mut cluster, b"", 1); } } + +#[test] +fn test_detect_deadlock_when_updating_wait_info() { + use kvproto::kvrpcpb::PessimisticLockKeyResultType::*; + let mut cluster = new_cluster_for_deadlock_test(3); + + let key1 = b"key1"; + let key2 = b"key2"; + let (client, ctx) = build_leader_client(&mut cluster, key1); + let client = Arc::new(client); + + fn async_pessimistic_lock( + client: Arc, + ctx: Context, + key: &[u8], + ts: u64, + ) -> mpsc::Receiver { + let (tx, rx) = mpsc::channel(); + let key = vec![key.to_vec()]; + thread::spawn(move || { + let resp = + kv_pessimistic_lock_resumable(&client, ctx, key, ts, ts, Some(1000), false, false); + tx.send(resp).unwrap(); + }); + rx + } + + // key1: txn 11 and 12 waits for 10 + // key2: txn 11 waits for 12 + let resp = kv_pessimistic_lock_resumable( + &client, + ctx.clone(), + vec![key1.to_vec()], + 10, + 10, + Some(1000), + false, + false, + ); + assert!(resp.region_error.is_none()); + assert!(resp.errors.is_empty()); + assert_eq!(resp.results[0].get_type(), LockResultNormal); + let resp = kv_pessimistic_lock_resumable( + &client, + ctx.clone(), + vec![key2.to_vec()], + 12, + 12, + Some(1000), + false, + false, + ); + assert!(resp.region_error.is_none()); + assert!(resp.errors.is_empty()); + assert_eq!(resp.results[0].get_type(), LockResultNormal); + let rx_txn11_k1 = async_pessimistic_lock(client.clone(), ctx.clone(), key1, 11); + let rx_txn12_k1 = async_pessimistic_lock(client.clone(), ctx.clone(), key1, 12); + let rx_txn11_k2 = async_pessimistic_lock(client.clone(), ctx.clone(), key2, 11); + // All blocked. 
+ assert_eq!( + rx_txn11_k1 + .recv_timeout(Duration::from_millis(50)) + .unwrap_err(), + RecvTimeoutError::Timeout + ); + assert_eq!(rx_txn12_k1.try_recv().unwrap_err(), TryRecvError::Empty); + assert_eq!(rx_txn11_k2.try_recv().unwrap_err(), TryRecvError::Empty); + + // Release lock at ts=10 on key1 so that txn 11 will be granted the lock. + must_kv_pessimistic_rollback(&client, ctx.clone(), key1.to_vec(), 10, 10); + let resp = rx_txn11_k1 + .recv_timeout(Duration::from_millis(200)) + .unwrap(); + assert!(resp.region_error.is_none()); + assert!(resp.errors.is_empty()); + assert_eq!(resp.results[0].get_type(), LockResultNormal); + // And then 12 waits for k1 on key1, which forms a deadlock. + let resp = rx_txn12_k1 + .recv_timeout(Duration::from_millis(1000)) + .unwrap(); + assert!(resp.region_error.is_none()); + assert!(resp.errors[0].has_deadlock()); + assert_eq!(resp.results[0].get_type(), LockResultFailed); + // Check correctness of the wait chain. + let wait_chain = resp.errors[0].get_deadlock().get_wait_chain(); + assert_eq!(wait_chain[0].get_txn(), 11); + assert_eq!(wait_chain[0].get_wait_for_txn(), 12); + assert_eq!(wait_chain[0].get_key(), key2); + assert_eq!(wait_chain[1].get_txn(), 12); + assert_eq!(wait_chain[1].get_wait_for_txn(), 11); + assert_eq!(wait_chain[1].get_key(), key1); + + // Clean up. 
+ must_kv_pessimistic_rollback(&client, ctx.clone(), key1.to_vec(), 11, 11); + must_kv_pessimistic_rollback(&client, ctx.clone(), key2.to_vec(), 12, 12); + let resp = rx_txn11_k2 + .recv_timeout(Duration::from_millis(500)) + .unwrap(); + assert!(resp.region_error.is_none()); + assert!(resp.errors.is_empty()); + assert_eq!(resp.results[0].get_type(), LockResultNormal); + must_kv_pessimistic_rollback(&client, ctx, key2.to_vec(), 11, 11); +} diff --git a/tests/integrations/server/status_server.rs b/tests/integrations/server/status_server.rs index 455465d87cb..929a7c286ae 100644 --- a/tests/integrations/server/status_server.rs +++ b/tests/integrations/server/status_server.rs @@ -5,9 +5,8 @@ use std::{error::Error, net::SocketAddr, sync::Arc}; use hyper::{body, Client, StatusCode, Uri}; use raftstore::store::region_meta::RegionMeta; use security::SecurityConfig; -use test_raftstore::{new_server_cluster, Simulator}; +use test_raftstore::new_server_cluster; use tikv::{config::ConfigController, server::status_server::StatusServer}; -use tikv_util::HandyRwLock; async fn check(authority: SocketAddr, region_id: u64) -> Result<(), Box> { let client = Client::new(); @@ -39,13 +38,12 @@ fn test_region_meta_endpoint() { let peer = region.get_peers().get(0); assert!(peer.is_some()); let store_id = peer.unwrap().get_store_id(); - let router = cluster.sim.rl().get_router(store_id); - assert!(router.is_some()); + let router = cluster.raft_extension(store_id); let mut status_server = StatusServer::new( 1, ConfigController::default(), Arc::new(SecurityConfig::default()), - router.unwrap(), + router, std::env::temp_dir(), ) .unwrap(); diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index c0a9ee8b1ed..452bcc89238 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -159,10 +159,15 @@ fn test_delete_files_in_range_for_titan() { cfg.rocksdb.defaultcf.titan.min_gc_batch_size = 
ReadableSize(0); cfg.rocksdb.defaultcf.titan.discardable_ratio = 0.4; cfg.rocksdb.defaultcf.titan.min_blob_size = ReadableSize(0); - let kv_db_opts = cfg.rocksdb.build_opt(); - let kv_cfs_opts = cfg + let kv_db_opts = cfg .rocksdb - .build_cf_opts(&cache, None, cfg.storage.api_version()); + .build_opt(&cfg.rocksdb.build_resources(Default::default())); + let kv_cfs_opts = cfg.rocksdb.build_cf_opts( + &cfg.rocksdb.build_cf_resources(cache), + None, + cfg.storage.api_version(), + cfg.storage.engine, + ); let raft_path = path.path().join(Path::new("titan")); let engines = Engines::new( @@ -211,7 +216,7 @@ fn test_delete_files_in_range_for_titan() { .unwrap(); // Flush and compact the kvs into L6. - engines.kv.flush_cfs(true).unwrap(); + engines.kv.flush_cfs(&[], true).unwrap(); engines.kv.compact_files_in_range(None, None, None).unwrap(); let db = engines.kv.as_inner(); let value = db.get_property_int("rocksdb.num-files-at-level0").unwrap(); @@ -254,9 +259,9 @@ fn test_delete_files_in_range_for_titan() { // Used to trigger titan gc let engine = &engines.kv; engine.put(b"1", b"1").unwrap(); - engine.flush_cfs(true).unwrap(); + engine.flush_cfs(&[], true).unwrap(); engine.put(b"2", b"2").unwrap(); - engine.flush_cfs(true).unwrap(); + engine.flush_cfs(&[], true).unwrap(); engine .compact_files_in_range(Some(b"0"), Some(b"3"), Some(1)) .unwrap(); From c353910ef6a296b592db6b217ec888cee34eaffc Mon Sep 17 00:00:00 2001 From: andreid-db <103079610+andreid-db@users.noreply.github.com> Date: Sat, 28 Jan 2023 20:15:54 -0800 Subject: [PATCH 097/115] config: allow starting TiKV nodes with <1 CPU (#14084) close tikv/tikv#13586, close tikv/tikv#13752, ref tikv/tikv#14017 Signed-off-by: Andrei Dragus --- components/raftstore/src/store/config.rs | 2 +- src/config/mod.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index 34f4e159dee..d6994a16ed4 100644 --- 
a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -660,7 +660,7 @@ impl Config { // prevent mistakenly inputting too large values, the max limit is made // according to the cpu quota * 10. Notice 10 is only an estimate, not an // empirical value. - let limit = SysQuota::cpu_cores_quota() as usize * 10; + let limit = (SysQuota::cpu_cores_quota() * 10.0) as usize; if self.apply_batch_system.pool_size == 0 || self.apply_batch_system.pool_size > limit { return Err(box_err!( "apply-pool-size should be greater than 0 and less than or equal to: {}", diff --git a/src/config/mod.rs b/src/config/mod.rs index 7878696faa5..99b593e2443 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1396,7 +1396,7 @@ impl DbConfig { // prevent mistakenly inputting too large values, the max limit is made // according to the cpu quota * 10. Notice 10 is only an estimate, not an // empirical value. - let limit = SysQuota::cpu_cores_quota() as i32 * 10; + let limit = (SysQuota::cpu_cores_quota() * 10.0) as i32; if self.max_background_jobs <= 0 || self.max_background_jobs > limit { return Err(format!( "max_background_jobs should be greater than 0 and less than or equal to {:?}", From 68710b99ee8f64bb353e617745c8ddf727646913 Mon Sep 17 00:00:00 2001 From: Hu# Date: Sun, 29 Jan 2023 13:49:53 +0800 Subject: [PATCH 098/115] pd_client: replace PD_REQUEST_HISTOGRAM_VEC with static metrics (#14087) close tikv/tikv#14086 PD_REQUEST_HISTOGRAM_VEC can be changed to use static metrics to improve performance. 
Signed-off-by: husharp Co-authored-by: Ti Chi Robot --- Cargo.lock | 1 + components/batch-system/src/router.rs | 9 +- components/pd_client/Cargo.toml | 1 + components/pd_client/src/client.rs | 96 +++++++++------------ components/pd_client/src/client_v2.rs | 80 +++++++---------- components/pd_client/src/metrics.rs | 50 +++++++++-- components/raftstore/src/store/fsm/peer.rs | 4 +- components/raftstore/src/store/fsm/store.rs | 2 +- 8 files changed, 124 insertions(+), 119 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d288af846a6..95587f98565 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3696,6 +3696,7 @@ dependencies = [ "log", "log_wrappers", "prometheus", + "prometheus-static-metric", "security", "semver 0.10.0", "serde", diff --git a/components/batch-system/src/router.rs b/components/batch-system/src/router.rs index bfcb93c9d6b..ef937209531 100644 --- a/components/batch-system/src/router.rs +++ b/components/batch-system/src/router.rs @@ -12,12 +12,7 @@ use std::{ use collections::HashMap; use crossbeam::channel::{SendError, TrySendError}; -use tikv_util::{ - debug, info, - lru::LruCache, - time::{duration_to_sec, Instant}, - Either, -}; +use tikv_util::{debug, info, lru::LruCache, time::Instant, Either}; use crate::{ fsm::{Fsm, FsmScheduler, FsmState}, @@ -322,7 +317,7 @@ where for mailbox in mailboxes.map.values() { let _ = mailbox.force_send(msg_gen(), &self.normal_scheduler); } - BROADCAST_NORMAL_DURATION.observe(duration_to_sec(timer.saturating_elapsed())); + BROADCAST_NORMAL_DURATION.observe(timer.saturating_elapsed_secs()); } /// Try to notify all FSMs that the cluster is being shutdown. 
diff --git a/components/pd_client/Cargo.toml b/components/pd_client/Cargo.toml index c25e37f23b5..f46d6111c5d 100644 --- a/components/pd_client/Cargo.toml +++ b/components/pd_client/Cargo.toml @@ -19,6 +19,7 @@ lazy_static = "1.3" log = { version = "0.4", features = ["max_level_trace", "release_max_level_debug"] } log_wrappers = { workspace = true } prometheus = { version = "0.13", features = ["nightly"] } +prometheus-static-metric = "0.5" security = { workspace = true } semver = "0.10" serde = "1.0" diff --git a/components/pd_client/src/client.rs b/components/pd_client/src/client.rs index 6686c4e8a04..b0c21797a91 100644 --- a/components/pd_client/src/client.rs +++ b/components/pd_client/src/client.rs @@ -26,10 +26,8 @@ use kvproto::{ }; use security::SecurityManager; use tikv_util::{ - box_err, debug, error, info, thd_name, - time::{duration_to_sec, Instant}, - timer::GLOBAL_TIMER_HANDLE, - warn, Either, HandyRwLock, + box_err, debug, error, info, thd_name, time::Instant, timer::GLOBAL_TIMER_HANDLE, warn, Either, + HandyRwLock, }; use txn_types::TimeStamp; use yatp::{task::future::TaskCell, ThreadPool}; @@ -193,9 +191,7 @@ impl RpcClient { &self, key: &[u8], ) -> PdFuture<(metapb::Region, Option)> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_region.start_coarse_timer(); let mut req = pdpb::GetRegionRequest::default(); req.set_header(self.header()); @@ -255,8 +251,8 @@ impl RpcClient { Box::pin(async move { let mut resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_store_async"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_store_async + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; let store = resp.take_store(); if store.get_state() != metapb::StoreState::Tombstone { @@ -291,7 +287,7 @@ impl PdClient for RpcClient { items: Vec, ) -> PdFuture<()> { let _timer = 
PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["store_global_config"]) + .store_global_config .start_coarse_timer(); let mut req = pdpb::StoreGlobalConfigRequest::new(); @@ -321,7 +317,7 @@ impl PdClient for RpcClient { config_path: String, ) -> PdFuture<(Vec, i64)> { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["load_global_config"]) + .load_global_config .start_coarse_timer(); let mut req = pdpb::LoadGlobalConfigRequest::new(); @@ -355,7 +351,7 @@ impl PdClient for RpcClient { revision: i64, ) -> Result> { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["watch_global_config"]) + .watch_global_config .start_coarse_timer(); let mut req = pdpb::WatchGlobalConfigRequest::default(); @@ -377,7 +373,7 @@ impl PdClient for RpcClient { region: metapb::Region, ) -> Result> { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["bootstrap_cluster"]) + .bootstrap_cluster .start_coarse_timer(); let mut req = pdpb::BootstrapRequest::default(); @@ -394,7 +390,7 @@ impl PdClient for RpcClient { fn is_cluster_bootstrapped(&self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["is_cluster_bootstrapped"]) + .is_cluster_bootstrapped .start_coarse_timer(); let mut req = pdpb::IsBootstrappedRequest::default(); @@ -409,9 +405,7 @@ impl PdClient for RpcClient { } fn alloc_id(&self) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["alloc_id"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.alloc_id.start_coarse_timer(); let mut req = pdpb::AllocIdRequest::default(); req.set_header(self.header()); @@ -430,7 +424,7 @@ impl PdClient for RpcClient { fn is_recovering_marked(&self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["is_recovering_marked"]) + .is_recovering_marked .start_coarse_timer(); let mut req = pdpb::IsSnapshotRecoveringRequest::default(); @@ -445,9 +439,7 @@ impl PdClient for RpcClient { } fn put_store(&self, store: metapb::Store) -> Result> { 
- let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["put_store"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.put_store.start_coarse_timer(); let mut req = pdpb::PutStoreRequest::default(); req.set_header(self.header()); @@ -462,9 +454,7 @@ impl PdClient for RpcClient { } fn get_store(&self, store_id: u64) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_store"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_store.start_coarse_timer(); let mut req = pdpb::GetStoreRequest::default(); req.set_header(self.header()); @@ -488,9 +478,7 @@ impl PdClient for RpcClient { } fn get_all_stores(&self, exclude_tombstone: bool) -> Result> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_all_stores"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_all_stores.start_coarse_timer(); let mut req = pdpb::GetAllStoresRequest::default(); req.set_header(self.header()); @@ -506,7 +494,7 @@ impl PdClient for RpcClient { fn get_cluster_config(&self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_cluster_config"]) + .get_cluster_config .start_coarse_timer(); let mut req = pdpb::GetClusterConfigRequest::default(); @@ -558,8 +546,8 @@ impl PdClient for RpcClient { Box::pin(async move { let mut resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region_by_id"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_region_by_id + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; if resp.has_region() { Ok(Some(resp.take_region())) @@ -600,8 +588,8 @@ impl PdClient for RpcClient { Box::pin(async move { let mut resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region_leader_by_id"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_region_leader_by_id + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; if 
resp.has_region() && resp.has_leader() { Ok(Some((resp.take_region(), resp.take_leader()))) @@ -737,8 +725,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["ask_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .ask_split + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(resp) }) as PdFuture<_> @@ -775,8 +763,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["ask_batch_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .ask_batch_split + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(resp) }) as PdFuture<_> @@ -821,8 +809,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["store_heartbeat"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .store_heartbeat + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; match feature_gate.set_version(resp.get_cluster_version()) { Err(_) => warn!("invalid cluster version: {}", resp.get_cluster_version()), @@ -858,8 +846,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["report_batch_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .report_batch_split + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(()) }) as PdFuture<_> @@ -871,9 +859,7 @@ impl PdClient for RpcClient { } fn scatter_region(&self, mut region: RegionInfo) -> Result<()> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["scatter_region"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.scatter_region.start_coarse_timer(); let mut req = pdpb::ScatterRegionRequest::default(); req.set_header(self.header()); @@ 
-912,8 +898,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_gc_safe_point"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_gc_safe_point + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(resp.get_safe_point()) }) as PdFuture<_> @@ -929,9 +915,7 @@ impl PdClient for RpcClient { } fn get_operator(&self, region_id: u64) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_operator"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_operator.start_coarse_timer(); let mut req = pdpb::GetOperatorRequest::default(); req.set_header(self.header()); @@ -946,7 +930,7 @@ impl PdClient for RpcClient { } fn batch_get_tso(&self, count: u32) -> PdFuture { - let begin = Instant::now(); + let timer = Instant::now(); let executor = move |client: &Client, _| { // Remove Box::pin and Compat when GLOBAL_TIMER_HANDLE supports futures 0.3 let ts_fut = Compat::new(Box::pin(client.inner.rl().tso.get_timestamp(count))); @@ -965,8 +949,8 @@ impl PdClient for RpcClient { } })?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["tso"]) - .observe(duration_to_sec(begin.saturating_elapsed())); + .tso + .observe(timer.saturating_elapsed_secs()); Ok(ts) }) as PdFuture<_> }; @@ -981,7 +965,7 @@ impl PdClient for RpcClient { safe_point: TimeStamp, ttl: Duration, ) -> PdFuture<()> { - let begin = Instant::now(); + let timer = Instant::now(); let mut req = pdpb::UpdateServiceGcSafePointRequest::default(); req.set_header(self.header()); req.set_service_id(name.into()); @@ -1003,8 +987,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["update_service_safe_point"]) - .observe(duration_to_sec(begin.saturating_elapsed())); + .update_service_safe_point + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(()) 
}) as PdFuture<_> @@ -1039,8 +1023,8 @@ impl PdClient for RpcClient { Box::pin(async move { let resp = handler.await?; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["min_resolved_ts"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .min_resolved_ts + .observe(timer.saturating_elapsed_secs()); check_resp_header(resp.get_header())?; Ok(()) }) as PdFuture<_> diff --git a/components/pd_client/src/client_v2.rs b/components/pd_client/src/client_v2.rs index 35e5c3b4785..cfa0d46303c 100644 --- a/components/pd_client/src/client_v2.rs +++ b/components/pd_client/src/client_v2.rs @@ -47,12 +47,8 @@ use kvproto::{ }; use security::SecurityManager; use tikv_util::{ - box_err, error, info, - mpsc::future as mpsc, - slow_log, thd_name, - time::{duration_to_sec, Instant}, - timer::GLOBAL_TIMER_HANDLE, - warn, + box_err, error, info, mpsc::future as mpsc, slow_log, thd_name, time::Instant, + timer::GLOBAL_TIMER_HANDLE, warn, }; use tokio::sync::{broadcast, mpsc as tokio_mpsc}; use txn_types::TimeStamp; @@ -835,7 +831,7 @@ impl PdClient for RpcClient { region: metapb::Region, ) -> Result> { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["bootstrap_cluster"]) + .bootstrap_cluster .start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -856,7 +852,7 @@ impl PdClient for RpcClient { fn is_cluster_bootstrapped(&mut self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["is_cluster_bootstrapped"]) + .is_cluster_bootstrapped .start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -875,9 +871,7 @@ impl PdClient for RpcClient { } fn alloc_id(&mut self) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["alloc_id"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.alloc_id.start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -902,7 +896,7 @@ impl PdClient for RpcClient { fn is_recovering_marked(&mut self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - 
.with_label_values(&["is_recovering_marked"]) + .is_recovering_marked .start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -921,9 +915,7 @@ impl PdClient for RpcClient { } fn put_store(&mut self, store: metapb::Store) -> Result> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["put_store"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.put_store.start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -962,8 +954,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_store_and_stats"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_store_and_stats + .observe(timer.saturating_elapsed_secs()); let mut resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; let store = resp.take_store(); @@ -976,9 +968,7 @@ impl PdClient for RpcClient { } fn get_all_stores(&mut self, exclude_tombstone: bool) -> Result> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_all_stores"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_all_stores.start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -998,7 +988,7 @@ impl PdClient for RpcClient { fn get_cluster_config(&mut self) -> Result { let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_cluster_config"]) + .get_cluster_config .start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -1037,8 +1027,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_region + .observe(timer.saturating_elapsed_secs()); let mut resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; let region = if resp.has_region() { @@ -1076,8 +1066,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region_by_id"]) - 
.observe(duration_to_sec(timer.saturating_elapsed())); + .get_region_by_id + .observe(timer.saturating_elapsed_secs()); let mut resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; if resp.has_region() { @@ -1115,8 +1105,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_region_leader_by_id"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_region_leader_by_id + .observe(timer.saturating_elapsed_secs()); let mut resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; if resp.has_region() && resp.has_leader() { @@ -1145,8 +1135,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["ask_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .ask_split + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(resp) @@ -1179,8 +1169,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["ask_batch_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .ask_batch_split + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(resp) @@ -1223,8 +1213,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["store_heartbeat"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .store_heartbeat + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; match feature_gate.set_version(resp.get_cluster_version()) { @@ -1257,8 +1247,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["report_batch_split"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .report_batch_split + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; 
check_resp_header(resp.get_header())?; Ok(()) @@ -1266,9 +1256,7 @@ impl PdClient for RpcClient { } fn scatter_region(&mut self, mut region: RegionInfo) -> Result<()> { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["scatter_region"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.scatter_region.start_coarse_timer(); let mut req = pdpb::ScatterRegionRequest::default(); req.set_region_id(region.get_id()); @@ -1307,8 +1295,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_gc_saft_point"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .get_gc_safe_point + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(resp.get_safe_point()) @@ -1316,9 +1304,7 @@ impl PdClient for RpcClient { } fn get_operator(&mut self, region_id: u64) -> Result { - let _timer = PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["get_operator"]) - .start_coarse_timer(); + let _timer = PD_REQUEST_HISTOGRAM_VEC.get_operator.start_coarse_timer(); block_on(self.raw_client.wait_for_ready())?; @@ -1366,8 +1352,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["update_service_safe_point"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .update_service_safe_point + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(()) @@ -1396,8 +1382,8 @@ impl PdClient for RpcClient { }) .await; PD_REQUEST_HISTOGRAM_VEC - .with_label_values(&["min_resolved_ts"]) - .observe(duration_to_sec(timer.saturating_elapsed())); + .min_resolved_ts + .observe(timer.saturating_elapsed_secs()); let resp = raw_client.check_resp(resp)?; check_resp_header(resp.get_header())?; Ok(()) diff --git a/components/pd_client/src/metrics.rs b/components/pd_client/src/metrics.rs index 57879a57d0e..a4ef9c5ce4e 100644 --- 
a/components/pd_client/src/metrics.rs +++ b/components/pd_client/src/metrics.rs @@ -2,14 +2,52 @@ use lazy_static::lazy_static; use prometheus::*; +use prometheus_static_metric::{make_static_metric, register_static_histogram_vec}; + +make_static_metric! { + pub label_enum PDRequestEventType { + get_region, + get_region_by_id, + get_region_leader_by_id, + scatter_region, + get_store, + get_store_async, + put_store, + get_all_stores, + get_store_and_stats, + store_global_config, + load_global_config, + watch_global_config, + bootstrap_cluster, + is_cluster_bootstrapped, + get_cluster_config, + ask_split, + ask_batch_split, + report_batch_split, + get_gc_safe_point, + update_service_safe_point, + min_resolved_ts, + get_operator, + alloc_id, + is_recovering_marked, + store_heartbeat, + tso, + } + + pub struct PDRequestEventHistogramVec: Histogram { + "type" => PDRequestEventType, + } +} lazy_static! { - pub static ref PD_REQUEST_HISTOGRAM_VEC: HistogramVec = register_histogram_vec!( - "tikv_pd_request_duration_seconds", - "Bucketed histogram of PD requests duration", - &["type"] - ) - .unwrap(); + pub static ref PD_REQUEST_HISTOGRAM_VEC: PDRequestEventHistogramVec = + register_static_histogram_vec!( + PDRequestEventHistogramVec, + "tikv_pd_request_duration_seconds", + "Bucketed histogram of PD requests duration", + &["type"] + ) + .unwrap(); pub static ref PD_HEARTBEAT_COUNTER_VEC: IntCounterVec = register_int_counter_vec!( "tikv_pd_heartbeat_message_total", "Total number of PD heartbeat messages.", diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 4266e400cd3..1cc603f2490 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -55,7 +55,7 @@ use tikv_util::{ mpsc::{self, LooseBoundedSender, Receiver}, store::{find_peer, find_peer_by_id, is_learner, region_on_same_stores}, sys::disk::DiskUsage, - time::{duration_to_sec, monotonic_raw_now, Instant as TiInstant}, 
+ time::{monotonic_raw_now, Instant as TiInstant}, trace, warn, worker::{ScheduleError, Scheduler}, Either, @@ -694,7 +694,7 @@ where .raft_metrics .event_time .peer_msg - .observe(duration_to_sec(timer.saturating_elapsed())); + .observe(timer.saturating_elapsed_secs()); } #[inline] diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index e68873cadf1..26f2983998d 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -806,7 +806,7 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> .raft_metrics .event_time .store_msg - .observe(duration_to_sec(timer.saturating_elapsed())); + .observe(timer.saturating_elapsed_secs()); } fn start(&mut self, store: metapb::Store) { From 1f04c7287451cdc042c2a697f6ee4fbc798c4620 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Sun, 29 Jan 2023 14:10:12 +0800 Subject: [PATCH 099/115] adapt tikv's feature Signed-off-by: CalvinNeo --- Cargo.lock | 4 ++- engine_tiflash/src/lib.rs | 2 -- engine_tiflash/src/misc.rs | 12 +++++++ engine_tiflash/src/perf_context.rs | 2 +- engine_tiflash/src/raft_engine.rs | 34 +++++++++++++++----- engine_tiflash/src/snapshot.rs | 10 +++++- new-mock-engine-store/Cargo.toml | 2 ++ new-mock-engine-store/src/lib.rs | 2 +- new-mock-engine-store/src/mock_cluster.rs | 9 ++++-- new-mock-engine-store/src/server.rs | 1 + proxy_server/Cargo.toml | 1 + proxy_server/src/run.rs | 38 +++++++++++++++++++++-- 12 files changed, 98 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e09cdde1ec8..0b3fc49b646 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1605,7 +1605,7 @@ dependencies = [ "online_config", "ordered-float", "panic_hook", - "parking_lot 0.12.0", + "parking_lot 0.12.1", "pd_client", "prometheus", "prometheus-static-metric", @@ -3382,6 +3382,7 @@ dependencies = [ "raftstore", "rand 0.8.5", "resolved_ts", + "resource_control", "resource_metering", "security", "slog", @@ 
-4411,6 +4412,7 @@ dependencies = [ "rand 0.8.5", "regex", "resolved_ts", + "resource_control", "resource_metering", "security", "serde", diff --git a/engine_tiflash/src/lib.rs b/engine_tiflash/src/lib.rs index 4ff0baf2f8d..97d1d2e1281 100644 --- a/engine_tiflash/src/lib.rs +++ b/engine_tiflash/src/lib.rs @@ -15,9 +15,7 @@ //! //! Please read the engine_trait crate docs before hacking. #![allow(dead_code)] -#![feature(backtrace)] #![cfg_attr(test, feature(test))] -#![feature(generic_associated_types)] #![feature(let_chains)] #![feature(option_get_or_insert_default)] diff --git a/engine_tiflash/src/misc.rs b/engine_tiflash/src/misc.rs index 0393a96bd02..70d94e567ae 100644 --- a/engine_tiflash/src/misc.rs +++ b/engine_tiflash/src/misc.rs @@ -315,6 +315,18 @@ impl MiscExt for RocksEngine { .get_property_int_cf(handle, ROCKSDB_TOTAL_SST_FILES_SIZE)) } + fn get_num_keys(&self) -> Result { + let mut total = 0; + for cf in self.cf_names() { + let handle = util::get_cf_handle(self.as_inner(), cf).unwrap(); + total += self + .as_inner() + .get_property_int_cf(handle, ROCKSDB_ESTIMATE_NUM_KEYS) + .unwrap_or_default(); + } + Ok(total) + } + fn get_range_entries_and_versions( &self, cf: &str, diff --git a/engine_tiflash/src/perf_context.rs b/engine_tiflash/src/perf_context.rs index a731a9461dc..f8cfdbcc667 100644 --- a/engine_tiflash/src/perf_context.rs +++ b/engine_tiflash/src/perf_context.rs @@ -8,7 +8,7 @@ use crate::{engine::RocksEngine, perf_context_impl::PerfContextStatistics}; impl PerfContextExt for RocksEngine { type PerfContext = RocksPerfContext; - fn get_perf_context(&self, level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { + fn get_perf_context(level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { RocksPerfContext::new(level, kind) } } diff --git a/engine_tiflash/src/raft_engine.rs b/engine_tiflash/src/raft_engine.rs index d5331a2ce29..a0a5acd5dd8 100644 --- a/engine_tiflash/src/raft_engine.rs +++ b/engine_tiflash/src/raft_engine.rs @@ 
-166,6 +166,10 @@ impl RaftEngineReadOnly for RocksEngine { panic!() } + fn get_dirty_mark(&self, _raft_group_id: u64, _tablet_index: u64) -> Result { + panic!() + } + fn get_recover_state(&self) -> Result> { self.get_msg_cf(CF_DEFAULT, keys::RECOVER_STATE_KEY) } @@ -361,7 +365,19 @@ impl RaftEngine for RocksEngine { } impl RaftLogBatch for RocksWriteBatchVec { - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { + fn append( + &mut self, + raft_group_id: u64, + overwrite_to: Option, + entries: Vec, + ) -> Result<()> { + let overwrite_to = overwrite_to.unwrap_or(0); + if let Some(last) = entries.last() && last.get_index() + 1 < overwrite_to { + for index in last.get_index() + 1..overwrite_to { + let key = keys::raft_log_key(raft_group_id, index); + self.delete(&key).unwrap(); + } + } if let Some(max_size) = entries.iter().map(|e| e.compute_size()).max() { let ser_buf = Vec::with_capacity(max_size as usize); return self.append_impl(raft_group_id, &entries, ser_buf); @@ -369,13 +385,6 @@ impl RaftLogBatch for RocksWriteBatchVec { Ok(()) } - fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64) { - for index in from..to { - let key = keys::raft_log_key(raft_group_id, index); - self.delete(&key).unwrap(); - } - } - fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { self.put_msg(&keys::raft_state_key(raft_group_id), state) } @@ -434,6 +443,15 @@ impl RaftLogBatch for RocksWriteBatchVec { panic!() } + fn put_dirty_mark( + &mut self, + _raft_group_id: u64, + _tablet_index: u64, + _dirty: bool, + ) -> Result<()> { + panic!() + } + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { self.put_msg(keys::RECOVER_STATE_KEY, state) } diff --git a/engine_tiflash/src/snapshot.rs b/engine_tiflash/src/snapshot.rs index b19a32fd739..60a12c4ac6d 100644 --- a/engine_tiflash/src/snapshot.rs +++ b/engine_tiflash/src/snapshot.rs @@ -5,7 +5,9 @@ use std::{ sync::Arc, }; -use engine_traits::{self, 
IterOptions, Iterable, Peekable, ReadOptions, Result, Snapshot}; +use engine_traits::{ + self, CfNamesExt, IterOptions, Iterable, Peekable, ReadOptions, Result, Snapshot, +}; use rocksdb::{rocksdb_options::UnsafeSnap, DBIterator, DB}; use crate::{ @@ -95,3 +97,9 @@ impl Peekable for RocksSnapshot { Ok(v.map(RocksDbVector::from_raw)) } } + +impl CfNamesExt for RocksSnapshot { + fn cf_names(&self) -> Vec<&str> { + self.db.cf_names() + } +} diff --git a/new-mock-engine-store/Cargo.toml b/new-mock-engine-store/Cargo.toml index a3bdbf6a7a7..94cc6aa1311 100644 --- a/new-mock-engine-store/Cargo.toml +++ b/new-mock-engine-store/Cargo.toml @@ -45,6 +45,8 @@ raft = { version = "0.7.0", default-features = false, features = ["protobuf-code raftstore = { workspace = true, default-features = false } rand = "0.8" resolved_ts = { workspace = true } + +resource_control = { workspace = true } resource_metering = { workspace = true } security = { workspace = true, default-features = false } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } diff --git a/new-mock-engine-store/src/lib.rs b/new-mock-engine-store/src/lib.rs index 58db2bb0f2c..afb206400c6 100644 --- a/new-mock-engine-store/src/lib.rs +++ b/new-mock-engine-store/src/lib.rs @@ -87,7 +87,7 @@ pub fn copy_data_from( .unwrap(); debug!("copy raft log {:?}", entries); - raft_wb.append(region_id, entries)?; + raft_wb.append(region_id, None, entries)?; box_try!(target_engines.raft.consume(&mut raft_wb, true)); Ok(()) } diff --git a/new-mock-engine-store/src/mock_cluster.rs b/new-mock-engine-store/src/mock_cluster.rs index 9e793544ca4..d4b3c24ff4b 100644 --- a/new-mock-engine-store/src/mock_cluster.rs +++ b/new-mock-engine-store/src/mock_cluster.rs @@ -44,6 +44,7 @@ use raftstore::{ }, Error, Result, }; +use resource_control::ResourceGroupManager; use tempfile::TempDir; pub use test_pd_client::TestPdClient; use test_raftstore::FilterFactory; @@ -110,6 +111,7 @@ pub struct Cluster> { pub sim: Arc>, 
pub pd_client: Arc, pub test_data: TestData, + resource_manager: Option>, } impl> std::panic::UnwindSafe for Cluster {} @@ -154,6 +156,7 @@ impl> Cluster { expected_leader_safe_ts: 0, expected_self_safe_ts: 0, }, + resource_manager: Some(Arc::new(ResourceGroupManager::default())), } } @@ -412,7 +415,8 @@ impl> Cluster { if !skip_set.is_empty() { panic!("Error when start with skip set"); } - let (router, system) = create_raft_batch_system(&self.cfg.raft_store); + let (router, system) = + create_raft_batch_system(&self.cfg.raft_store, &self.resource_manager); self.create_engine(Some(router.clone())); let store_meta = Arc::new(Mutex::new(StoreMeta::new(PENDING_MSG_CAP))); @@ -810,7 +814,8 @@ impl> Cluster { assert_ne!(engines.kv.engine_store_server_helper, 0); let key_mgr = self.key_managers_map[&node_id].clone(); - let (router, system) = create_raft_batch_system(&self.cfg.raft_store); + let (router, system) = + create_raft_batch_system(&self.cfg.raft_store, &self.resource_manager); let mut cfg = self.cfg.clone(); if let Some(labels) = self.labels.get(&node_id) { diff --git a/new-mock-engine-store/src/server.rs b/new-mock-engine-store/src/server.rs index 3d0c78cf894..90bc76728ae 100644 --- a/new-mock-engine-store/src/server.rs +++ b/new-mock-engine-store/src/server.rs @@ -426,6 +426,7 @@ impl ServerCluster { quota_limiter.clone(), self.pd_client.feature_gate().clone(), None, + None, // TODO resource_ctl )?; self.storages.insert(node_id, raft_engine); diff --git a/proxy_server/Cargo.toml b/proxy_server/Cargo.toml index b4c42af2cd1..4770428dede 100644 --- a/proxy_server/Cargo.toml +++ b/proxy_server/Cargo.toml @@ -84,6 +84,7 @@ raftstore = { workspace = true, default-features = false } rand = "0.8" regex = "1.3" resolved_ts = { workspace = true, default-features = false } +resource_control = { workspace = true } resource_metering = { workspace = true } security = { workspace = true, default-features = false } serde = "1.0" diff --git a/proxy_server/src/run.rs 
b/proxy_server/src/run.rs index 7453b0a6034..8b34bd4a8cf 100644 --- a/proxy_server/src/run.rs +++ b/proxy_server/src/run.rs @@ -62,6 +62,9 @@ use raftstore::{ SplitCheckRunner, SplitConfigManager, StoreMetaDelegate, }, }; +use resource_control::{ + ResourceGroupManager, ResourceManagerService, MIN_PRIORITY_UPDATE_INTERVAL, +}; use security::SecurityManager; use server::{memory::*, raft_engine_switch::*}; use tikv::{ @@ -491,6 +494,7 @@ struct TiKvServer { background_worker: Worker, sst_worker: Option>>, quota_limiter: Arc, + resource_manager: Option>, tablet_registry: Option>, } @@ -540,14 +544,32 @@ impl TiKvServer { let store_path = Path::new(&config.storage.data_dir).to_owned(); - // Initialize raftstore channels. - let (router, system) = fsm::create_raft_batch_system(&config.raft_store); - let thread_count = config.server.background_thread_count; let background_worker = WorkerBuilder::new("background") .thread_count(thread_count) .create(); + let resource_manager = if config.resource_control.enabled { + let mgr = Arc::new(ResourceGroupManager::default()); + let mut resource_mgr_service = + ResourceManagerService::new(mgr.clone(), pd_client.clone()); + // spawn a task to periodically update the minimal virtual time of all resource + // groups. + let resource_mgr = mgr.clone(); + background_worker.spawn_interval_task(MIN_PRIORITY_UPDATE_INTERVAL, move || { + resource_mgr.advance_min_virtual_time(); + }); + // spawn a task to watch all resource groups update. + background_worker.spawn_async_task(async move { + resource_mgr_service.watch_resource_groups().await; + }); + Some(mgr) + } else { + None + }; + // Initialize raftstore channels. 
+ let (router, system) = fsm::create_raft_batch_system(&config.raft_store, &resource_manager); + let mut coprocessor_host = Some(CoprocessorHost::new( router.clone(), config.coprocessor.clone(), @@ -598,6 +620,7 @@ impl TiKvServer { flow_info_receiver: None, sst_worker: None, quota_limiter, + resource_manager, tablet_registry: None, } } @@ -902,10 +925,15 @@ impl TiKvServer { } let unified_read_pool = if self.config.readpool.is_unified_pool_enabled() { + let resource_ctl = self + .resource_manager + .as_ref() + .map(|m| m.derive_controller("unified-read-pool".into(), true)); Some(build_yatp_read_pool( &self.config.readpool.unified, pd_sender.clone(), engines.engine.clone(), + resource_ctl, )) } else { None @@ -988,8 +1016,12 @@ impl TiKvServer { Arc::clone(&self.quota_limiter), self.pd_client.feature_gate().clone(), None, // causal_ts_provider + self.resource_manager + .as_ref() + .map(|m| m.derive_controller("scheduler-worker-pool".to_owned(), true)), ) .unwrap_or_else(|e| fatal!("failed to create raft storage: {}", e)); + cfg_controller.register( tikv::config::Module::Storage, Box::new(StorageConfigManger::new( From b1936e6c2d73789b05545b96b32dc22fac880a79 Mon Sep 17 00:00:00 2001 From: Zhi Qi <30543181+LittleFall@users.noreply.github.com> Date: Mon, 30 Jan 2023 17:25:54 +0800 Subject: [PATCH 100/115] copr: (refactor) Lift heap struct out from top_n_executor (#14096) ref tikv/tikv#13936 Signed-off-by: Zhi Qi --- .../src/top_n_executor.rs | 210 ++--------------- .../tidb_query_executors/src/util/mod.rs | 1 + .../src/util/top_n_heap.rs | 211 ++++++++++++++++++ 3 files changed, 229 insertions(+), 193 deletions(-) create mode 100644 components/tidb_query_executors/src/util/top_n_heap.rs diff --git a/components/tidb_query_executors/src/top_n_executor.rs b/components/tidb_query_executors/src/top_n_executor.rs index 6ef8c6b2224..5ebc65baa25 100644 --- a/components/tidb_query_executors/src/top_n_executor.rs +++ b/components/tidb_query_executors/src/top_n_executor.rs @@ 
-1,20 +1,23 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::{cmp::Ordering, collections::BinaryHeap, ptr::NonNull, sync::Arc}; +use std::sync::Arc; use async_trait::async_trait; use tidb_query_common::{storage::IntervalRange, Result}; use tidb_query_datatype::{ - codec::{ - batch::{LazyBatchColumn, LazyBatchColumnVec}, - data_type::*, - }, + codec::{batch::LazyBatchColumnVec, data_type::*}, expr::{EvalConfig, EvalContext, EvalWarnings}, }; use tidb_query_expr::{RpnExpression, RpnExpressionBuilder, RpnStackNode}; use tipb::{Expr, FieldType, TopN}; -use crate::{interface::*, util::*}; +use crate::{ + interface::*, + util::{ + top_n_heap::{HeapItemSourceData, HeapItemUnsafe, TopNHeap}, + *, + }, +}; pub struct BatchTopNExecutor { /// The heap, which contains N rows at most. @@ -22,7 +25,7 @@ pub struct BatchTopNExecutor { /// This field is placed before `eval_columns_buffer_unsafe`, `order_exprs`, /// `order_is_desc` and `src` because it relies on data in those fields /// and we want this field to be dropped first. - heap: BinaryHeap, + heap: TopNHeap, /// A collection of all evaluated columns. This is to avoid repeated /// allocations in each `next_batch()`. 
@@ -97,7 +100,7 @@ impl BatchTopNExecutor { .collect(); Self { - heap: BinaryHeap::new(), + heap: TopNHeap::new(n), eval_columns_buffer_unsafe: Box::>::default(), order_exprs: order_exprs.into_boxed_slice(), order_exprs_field_type: order_exprs_field_type.into_boxed_slice(), @@ -126,7 +129,7 @@ impl BatchTopNExecutor { .collect(); Self { - heap: BinaryHeap::new(), + heap: TopNHeap::new(n), eval_columns_buffer_unsafe: Box::>::default(), order_exprs: order_exprs.into_boxed_slice(), order_exprs_field_type: order_exprs_field_type.into_boxed_slice(), @@ -140,7 +143,7 @@ impl BatchTopNExecutor { } pub fn new( - config: std::sync::Arc, + config: Arc, src: Src, order_exprs_def: Vec, order_is_desc: Vec, @@ -163,8 +166,7 @@ impl BatchTopNExecutor { .collect(); Ok(Self { - // Avoid large N causing OOM - heap: BinaryHeap::with_capacity(n.min(1024)), + heap: TopNHeap::new(n), // Simply large enough to avoid repeated allocations eval_columns_buffer_unsafe: Box::new(Vec::with_capacity(512)), order_exprs: order_exprs.into_boxed_slice(), @@ -182,7 +184,7 @@ impl BatchTopNExecutor { async fn handle_next_batch(&mut self) -> Result> { // Use max batch size from the beginning because top N // always needs to calculate over all data. - let src_result = self.src.next_batch(crate::runner::BATCH_MAX_SIZE).await; + let src_result = self.src.next_batch(BATCH_MAX_SIZE).await; self.context.warnings = src_result.warnings; @@ -193,7 +195,7 @@ impl BatchTopNExecutor { } if src_is_drained { - Ok(Some(self.heap_take_all())) + Ok(Some(self.heap.take_all())) } else { Ok(None) } @@ -240,84 +242,11 @@ impl BatchTopNExecutor { eval_columns_offset: eval_offset, logical_row_index, }; - self.heap_add_row(row)?; - } - - Ok(()) - } - - fn heap_add_row(&mut self, row: HeapItemUnsafe) -> Result<()> { - if self.heap.len() < self.n { - // HeapItemUnsafe must be checked valid to compare in advance, or else it may - // panic inside BinaryHeap. - row.cmp_sort_key(&row)?; - - // Push into heap when heap is not full. 
- self.heap.push(row); - } else { - // Swap the greatest row in the heap if this row is smaller than that row. - let mut greatest_row = self.heap.peek_mut().unwrap(); - if row.cmp_sort_key(&greatest_row)? == Ordering::Less { - *greatest_row = row; - } + self.heap.add_row(row)?; } Ok(()) } - - #[allow(clippy::clone_on_copy)] - fn heap_take_all(&mut self) -> LazyBatchColumnVec { - let heap = std::mem::take(&mut self.heap); - let sorted_items = heap.into_sorted_vec(); - if sorted_items.is_empty() { - return LazyBatchColumnVec::empty(); - } - - let mut result = sorted_items[0] - .source_data - .physical_columns - .clone_empty(sorted_items.len()); - - for (column_index, result_column) in result.as_mut_slice().iter_mut().enumerate() { - match result_column { - LazyBatchColumn::Raw(dest_column) => { - for item in &sorted_items { - let src = item.source_data.physical_columns[column_index].raw(); - dest_column - .push(&src[item.source_data.logical_rows[item.logical_row_index]]); - } - } - LazyBatchColumn::Decoded(dest_vector_value) => { - match_template::match_template! { - TT = [ - Int, - Real, - Duration, - Decimal, - DateTime, - Bytes => BytesRef, - Json => JsonRef, - Enum => EnumRef, - Set => SetRef, - ], - match dest_vector_value { - VectorValue::TT(dest_column) => { - for item in &sorted_items { - let src: &VectorValue = item.source_data.physical_columns[column_index].decoded(); - let src_ref = TT::borrow_vector_value(src); - // TODO: This clone is not necessary. - dest_column.push(src_ref.get_option_ref(item.source_data.logical_rows[item.logical_row_index]).map(|x| x.into_owned_value())); - } - }, - } - } - } - } - } - - result.assert_columns_equal_length(); - result - } } #[async_trait] @@ -402,111 +331,6 @@ impl BatchExecutor for BatchTopNExecutor { } } -struct HeapItemSourceData { - physical_columns: LazyBatchColumnVec, - logical_rows: Vec, -} - -/// The item in the heap of `BatchTopNExecutor`. 
-/// -/// WARN: The content of this structure is valid only if `BatchTopNExecutor` is -/// valid (i.e. not dropped). Thus it is called unsafe. -struct HeapItemUnsafe { - /// A pointer to the `order_is_desc` field in `BatchTopNExecutor`. - order_is_desc_ptr: NonNull<[bool]>, - - /// A pointer to the `order_exprs_field_type` field in `order_exprs`. - order_exprs_field_type_ptr: NonNull<[FieldType]>, - - /// The source data that evaluated column in this structure is using. - source_data: Arc, - - /// A pointer to the `eval_columns_buffer` field in `BatchTopNExecutor`. - eval_columns_buffer_ptr: NonNull>>, - - /// The begin offset of the evaluated columns stored in the buffer. - /// - /// The length of evaluated columns in the buffer is `order_is_desc.len()`. - eval_columns_offset: usize, - - /// Which logical row in the evaluated columns this heap item is - /// representing. - logical_row_index: usize, -} - -impl HeapItemUnsafe { - fn get_order_is_desc(&self) -> &[bool] { - unsafe { self.order_is_desc_ptr.as_ref() } - } - - fn get_order_exprs_field_type(&self) -> &[FieldType] { - unsafe { self.order_exprs_field_type_ptr.as_ref() } - } - - fn get_eval_columns(&self, len: usize) -> &[RpnStackNode<'_>] { - let offset_begin = self.eval_columns_offset; - let offset_end = offset_begin + len; - let vec_buf = unsafe { self.eval_columns_buffer_ptr.as_ref() }; - &vec_buf[offset_begin..offset_end] - } - - fn cmp_sort_key(&self, other: &Self) -> Result { - // Only debug assert because this function is called pretty frequently. 
- debug_assert_eq!(self.get_order_is_desc(), other.get_order_is_desc()); - - let order_is_desc = self.get_order_is_desc(); - let order_exprs_field_type = self.get_order_exprs_field_type(); - let columns_len = order_is_desc.len(); - let eval_columns_lhs = self.get_eval_columns(columns_len); - let eval_columns_rhs = other.get_eval_columns(columns_len); - - for column_idx in 0..columns_len { - let lhs_node = &eval_columns_lhs[column_idx]; - let rhs_node = &eval_columns_rhs[column_idx]; - let lhs = lhs_node.get_logical_scalar_ref(self.logical_row_index); - let rhs = rhs_node.get_logical_scalar_ref(other.logical_row_index); - - // There is panic inside, but will never panic, since the data type of - // corresponding column should be consistent for each - // `HeapItemUnsafe`. - let ord = lhs.cmp_sort_key(&rhs, &order_exprs_field_type[column_idx])?; - - if ord == Ordering::Equal { - continue; - } - if !order_is_desc[column_idx] { - return Ok(ord); - } else { - return Ok(ord.reverse()); - } - } - - Ok(Ordering::Equal) - } -} - -/// WARN: HeapItemUnsafe implements partial ordering. It panics when Collator -/// fails to parse. So make sure that it is valid before putting it into a heap. 
-impl Ord for HeapItemUnsafe { - fn cmp(&self, other: &Self) -> Ordering { - self.cmp_sort_key(other).unwrap() - } -} - -impl PartialOrd for HeapItemUnsafe { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl PartialEq for HeapItemUnsafe { - fn eq(&self, other: &Self) -> bool { - self.cmp(other) == Ordering::Equal - } -} - -impl Eq for HeapItemUnsafe {} - #[cfg(test)] mod tests { use futures::executor::block_on; diff --git a/components/tidb_query_executors/src/util/mod.rs b/components/tidb_query_executors/src/util/mod.rs index 6aa578459e2..ca05e49fcd3 100644 --- a/components/tidb_query_executors/src/util/mod.rs +++ b/components/tidb_query_executors/src/util/mod.rs @@ -5,6 +5,7 @@ pub mod hash_aggr_helper; #[cfg(test)] pub mod mock_executor; pub mod scan_executor; +pub mod top_n_heap; use tidb_query_common::Result; use tidb_query_datatype::{codec::batch::LazyBatchColumnVec, expr::EvalContext}; diff --git a/components/tidb_query_executors/src/util/top_n_heap.rs b/components/tidb_query_executors/src/util/top_n_heap.rs new file mode 100644 index 00000000000..0cbef103e4d --- /dev/null +++ b/components/tidb_query_executors/src/util/top_n_heap.rs @@ -0,0 +1,211 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{cmp::Ordering, collections::BinaryHeap, ptr::NonNull, sync::Arc}; + +use tidb_query_common::Result; +use tidb_query_datatype::codec::{ + batch::{LazyBatchColumn, LazyBatchColumnVec}, + data_type::*, +}; +use tidb_query_expr::RpnStackNode; +use tipb::FieldType; + +/// TopNHeap is the common data structure used in TopN-like executors. +pub struct TopNHeap { + /// The maximum number of rows in the heap. + n: usize, + /// The heap. + heap: BinaryHeap, +} + +impl TopNHeap { + /// parameters: + /// - n: The maximum number of rows in the heaps + /// note: to avoid large N causing OOM, the initial capacity will be limited + /// up to 1024. 
+ pub fn new(n: usize) -> Self { + Self { + n, + // Avoid large N causing OOM + heap: BinaryHeap::with_capacity(n.min(1024)), + } + } + + pub fn add_row(&mut self, row: HeapItemUnsafe) -> Result<()> { + if self.heap.len() < self.n { + // HeapItemUnsafe must be checked valid to compare in advance, or else it may + // panic inside BinaryHeap. + row.cmp_sort_key(&row)?; + + // Push into heap when heap is not full. + self.heap.push(row); + } else { + // Swap the greatest row in the heap if this row is smaller than that row. + let mut greatest_row = self.heap.peek_mut().unwrap(); + if row.cmp_sort_key(&greatest_row)? == Ordering::Less { + *greatest_row = row; + } + } + + Ok(()) + } + + #[allow(clippy::clone_on_copy)] + pub fn take_all(&mut self) -> LazyBatchColumnVec { + let heap = std::mem::take(&mut self.heap); + let sorted_items = heap.into_sorted_vec(); + if sorted_items.is_empty() { + return LazyBatchColumnVec::empty(); + } + + let mut result = sorted_items[0] + .source_data + .physical_columns + .clone_empty(sorted_items.len()); + + for (column_index, result_column) in result.as_mut_slice().iter_mut().enumerate() { + match result_column { + LazyBatchColumn::Raw(dest_column) => { + for item in &sorted_items { + let src = item.source_data.physical_columns[column_index].raw(); + dest_column + .push(&src[item.source_data.logical_rows[item.logical_row_index]]); + } + } + LazyBatchColumn::Decoded(dest_vector_value) => { + match_template::match_template! { + TT = [ + Int, + Real, + Duration, + Decimal, + DateTime, + Bytes => BytesRef, + Json => JsonRef, + Enum => EnumRef, + Set => SetRef, + ], + match dest_vector_value { + VectorValue::TT(dest_column) => { + for item in &sorted_items { + let src: &VectorValue = item.source_data.physical_columns[column_index].decoded(); + let src_ref = TT::borrow_vector_value(src); + // TODO: This clone is not necessary. 
+ dest_column.push(src_ref.get_option_ref(item.source_data.logical_rows[item.logical_row_index]).map(|x| x.into_owned_value())); + } + }, + } + } + } + } + } + + result.assert_columns_equal_length(); + result + } +} + +pub struct HeapItemSourceData { + pub physical_columns: LazyBatchColumnVec, + pub logical_rows: Vec, +} + +/// The item in the heap of `BatchTopNExecutor`. +/// +/// WARN: The content of this structure is valid only if `BatchTopNExecutor` is +/// valid (i.e. not dropped). Thus it is called unsafe. +pub struct HeapItemUnsafe { + /// A pointer to the `order_is_desc` field in `BatchTopNExecutor`. + pub order_is_desc_ptr: NonNull<[bool]>, + + /// A pointer to the `order_exprs_field_type` field in `order_exprs`. + pub order_exprs_field_type_ptr: NonNull<[FieldType]>, + + /// The source data that evaluated column in this structure is using. + pub source_data: Arc, + + /// A pointer to the `eval_columns_buffer` field in `BatchTopNExecutor`. + pub eval_columns_buffer_ptr: NonNull>>, + + /// The begin offset of the evaluated columns stored in the buffer. + /// + /// The length of evaluated columns in the buffer is `order_is_desc.len()`. + pub eval_columns_offset: usize, + + /// Which logical row in the evaluated columns this heap item is + /// representing. + pub logical_row_index: usize, +} + +impl HeapItemUnsafe { + fn get_order_is_desc(&self) -> &[bool] { + unsafe { self.order_is_desc_ptr.as_ref() } + } + + fn get_order_exprs_field_type(&self) -> &[FieldType] { + unsafe { self.order_exprs_field_type_ptr.as_ref() } + } + + fn get_eval_columns(&self, len: usize) -> &[RpnStackNode<'_>] { + let offset_begin = self.eval_columns_offset; + let offset_end = offset_begin + len; + let vec_buf = unsafe { self.eval_columns_buffer_ptr.as_ref() }; + &vec_buf[offset_begin..offset_end] + } + + fn cmp_sort_key(&self, other: &Self) -> Result { + // Only debug assert because this function is called pretty frequently. 
+ debug_assert_eq!(self.get_order_is_desc(), other.get_order_is_desc()); + + let order_is_desc = self.get_order_is_desc(); + let order_exprs_field_type = self.get_order_exprs_field_type(); + let columns_len = order_is_desc.len(); + let eval_columns_lhs = self.get_eval_columns(columns_len); + let eval_columns_rhs = other.get_eval_columns(columns_len); + + for column_idx in 0..columns_len { + let lhs_node = &eval_columns_lhs[column_idx]; + let rhs_node = &eval_columns_rhs[column_idx]; + let lhs = lhs_node.get_logical_scalar_ref(self.logical_row_index); + let rhs = rhs_node.get_logical_scalar_ref(other.logical_row_index); + + // There is panic inside, but will never panic, since the data type of + // corresponding column should be consistent for each + // `HeapItemUnsafe`. + let ord = lhs.cmp_sort_key(&rhs, &order_exprs_field_type[column_idx])?; + + if ord == Ordering::Equal { + continue; + } + return if !order_is_desc[column_idx] { + Ok(ord) + } else { + Ok(ord.reverse()) + }; + } + + Ok(Ordering::Equal) + } +} + +/// WARN: HeapItemUnsafe implements partial ordering. It panics when Collator +/// fails to parse. So make sure that it is valid before putting it into a heap. +impl Ord for HeapItemUnsafe { + fn cmp(&self, other: &Self) -> Ordering { + self.cmp_sort_key(other).unwrap() + } +} + +impl PartialOrd for HeapItemUnsafe { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for HeapItemUnsafe { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl Eq for HeapItemUnsafe {} From 0ce3485ca67eab8c9540a6931047478de67c48c0 Mon Sep 17 00:00:00 2001 From: JmPotato Date: Mon, 30 Jan 2023 17:43:54 +0800 Subject: [PATCH 101/115] raftstore: allow the read request with a smaller ts during flashback (#14088) close tikv/tikv#14045 - Store the flashback `start_ts` in region meta. - Allow the read request with a smaller ts during flashback. 
Signed-off-by: JmPotato Co-authored-by: Ti Chi Robot --- Cargo.lock | 2 +- components/raftstore/src/store/fsm/apply.rs | 4 +- components/raftstore/src/store/fsm/peer.rs | 10 ++- components/raftstore/src/store/util.rs | 24 ++++++- components/raftstore/src/store/worker/read.rs | 5 +- components/test_raftstore/src/util.rs | 30 ++++++--- components/tikv_kv/src/lib.rs | 7 ++- src/server/raftkv/mod.rs | 27 ++++---- src/server/raftkv2/mod.rs | 19 +++--- src/server/service/kv.rs | 9 ++- tests/integrations/server/kv_service.rs | 62 +++++++++++-------- 11 files changed, 129 insertions(+), 70 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 95587f98565..46eac5930a1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2729,7 +2729,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#009f31598ac3200dc8b32e18f96fc4deb7b32e48" +source = "git+https://github.com/pingcap/kvproto.git#1b2b4114103afb06796b7e44f45f7e55133673c0" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index 22a42393173..bb262b9ffa8 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -1649,7 +1649,8 @@ where req.get_header().get_region_epoch().get_version() >= self.last_merge_version; check_req_region_epoch(req, &self.region, include_region)?; check_flashback_state( - self.region.get_is_in_flashback(), + self.region.is_in_flashback, + self.region.flashback_start_ts, req, self.region_id(), false, @@ -2975,6 +2976,7 @@ where // Modify the region meta in memory. let mut region = self.region.clone(); region.set_is_in_flashback(is_in_flashback); + region.set_flashback_start_ts(req.get_prepare_flashback().get_start_ts()); // Modify the `RegionLocalState` persisted in disk. 
write_peer_state(ctx.kv_wb_mut(), ®ion, PeerState::Normal, None).unwrap_or_else(|e| { panic!( diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 1cc603f2490..a8232fd8322 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -5223,9 +5223,13 @@ where // the apply phase and because a read-only request doesn't need to be applied, // so it will be allowed during the flashback progress, for example, a snapshot // request. - if let Err(e) = - util::check_flashback_state(self.region().is_in_flashback, msg, region_id, true) - { + if let Err(e) = util::check_flashback_state( + self.region().is_in_flashback, + self.region().flashback_start_ts, + msg, + region_id, + true, + ) { match e { Error::FlashbackInProgress(_) => self .ctx diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index 4d8128822c7..0344adb2b92 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -31,7 +31,9 @@ use raft::{ }; use raft_proto::ConfChangeI; use tikv_util::{ - box_err, debug, info, + box_err, + codec::number::{decode_u64, NumberEncoder}, + debug, info, store::{find_peer_by_id, region}, time::monotonic_raw_now, Either, @@ -336,6 +338,7 @@ pub fn compare_region_epoch( // flashback. pub fn check_flashback_state( is_in_flashback: bool, + flashback_start_ts: u64, req: &RaftCmdRequest, region_id: u64, skip_not_prepared: bool, @@ -347,11 +350,20 @@ pub fn check_flashback_state( { return Ok(()); } + // TODO: only use `flashback_start_ts` to check flashback state. + let is_in_flashback = is_in_flashback || flashback_start_ts > 0; let is_flashback_request = WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) .contains(WriteBatchFlags::FLASHBACK); - // If the region is in the flashback state, the only allowed request is the - // flashback request itself. 
+ // If the region is in the flashback state: + // - A request with flashback flag will be allowed. + // - A read request whose `read_ts` is smaller than `flashback_start_ts` will + // be allowed. if is_in_flashback && !is_flashback_request { + if let Ok(read_ts) = decode_u64(&mut req.get_header().get_flag_data()) { + if read_ts != 0 && read_ts < flashback_start_ts { + return Ok(()); + } + } return Err(Error::FlashbackInProgress(region_id)); } // If the region is not in the flashback state, the flashback request itself @@ -362,6 +374,12 @@ pub fn check_flashback_state( Ok(()) } +pub fn encode_start_ts_into_flag_data(header: &mut RaftRequestHeader, start_ts: u64) { + let mut data = [0u8; 8]; + (&mut data[..]).encode_u64(start_ts).unwrap(); + header.set_flag_data(data.into()); +} + pub fn is_region_epoch_equal( from_epoch: &metapb::RegionEpoch, current_epoch: &metapb::RegionEpoch, diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 6b20e375786..5d6835666b4 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -822,7 +822,10 @@ where // Check whether the region is in the flashback state and the local read could // be performed. 
let is_in_flashback = delegate.region.is_in_flashback; - if let Err(e) = util::check_flashback_state(is_in_flashback, req, region_id, false) { + let flashback_start_ts = delegate.region.flashback_start_ts; + if let Err(e) = + util::check_flashback_state(is_in_flashback, flashback_start_ts, req, region_id, false) + { TLS_LOCAL_READ_METRICS.with(|m| match e { Error::FlashbackNotPrepared(_) => { m.borrow_mut().reject_reason.flashback_not_prepared.inc() diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index 8b3745120d5..4bcb99adca3 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -1246,15 +1246,9 @@ pub fn must_raw_get(client: &TikvClient, ctx: Context, key: Vec) -> Option { pub pb_ctx: &'a Context, pub read_id: Option, - // When start_ts is None and `stale_read` is true, it means acquire a snapshot without any - // consistency guarantee. + // When `start_ts` is None and `stale_read` is true, it means acquire a snapshot without any + // consistency guarantee. This filed is also used to check if a read is allowed in the + // flashback. pub start_ts: Option, // `key_ranges` is used in replica read. It will send to // the leader via raft "read index" to check memory locks. @@ -418,7 +419,7 @@ pub trait Engine: Send + Clone + 'static { /// Mark the start of flashback. // It's an infrequent API, use trait object for simplicity. 
- fn start_flashback(&self, _ctx: &Context) -> BoxFuture<'static, Result<()>> { + fn start_flashback(&self, _ctx: &Context, _start_ts: u64) -> BoxFuture<'static, Result<()>> { Box::pin(futures::future::ready(Ok(()))) } diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index c50c42c9fc6..751c07c6b65 100644 --- a/src/server/raftkv/mod.rs +++ b/src/server/raftkv/mod.rs @@ -44,14 +44,13 @@ use raftstore::{ errors::Error as RaftServerError, router::{LocalReadRouter, RaftStoreRouter}, store::{ - self, Callback as StoreCallback, RaftCmdExtraOpts, ReadIndexContext, ReadResponse, - RegionSnapshot, StoreMsg, WriteResponse, + self, util::encode_start_ts_into_flag_data, Callback as StoreCallback, RaftCmdExtraOpts, + ReadIndexContext, ReadResponse, RegionSnapshot, StoreMsg, WriteResponse, }, }; use thiserror::Error; use tikv_kv::{write_modifies, OnAppliedCb, WriteEvent}; use tikv_util::{ - codec::number::NumberEncoder, future::{paired_future_callback, paired_must_called_future_callback}, time::Instant, }; @@ -547,18 +546,21 @@ where let mut header = new_request_header(ctx.pb_ctx); let mut flags = 0; - if ctx.pb_ctx.get_stale_read() && ctx.start_ts.map_or(true, |ts| !ts.is_zero()) { - let mut data = [0u8; 8]; - (&mut data[..]) - .encode_u64(ctx.start_ts.unwrap_or_default().into_inner()) - .unwrap(); + let need_encoded_start_ts = ctx.start_ts.map_or(true, |ts| !ts.is_zero()); + if ctx.pb_ctx.get_stale_read() && need_encoded_start_ts { flags |= WriteBatchFlags::STALE_READ.bits(); - header.set_flag_data(data.into()); } if ctx.allowed_in_flashback { flags |= WriteBatchFlags::FLASHBACK.bits(); } header.set_flags(flags); + // Encode `start_ts` in `flag_data` for the check of stale read and flashback. 
+ if need_encoded_start_ts { + encode_start_ts_into_flag_data( + &mut header, + ctx.start_ts.unwrap_or_default().into_inner(), + ); + } let mut cmd = RaftCmdRequest::default(); cmd.set_header(header); @@ -637,13 +639,16 @@ where } } - fn start_flashback(&self, ctx: &Context) -> BoxFuture<'static, kv::Result<()>> { + fn start_flashback(&self, ctx: &Context, start_ts: u64) -> BoxFuture<'static, kv::Result<()>> { // Send an `AdminCmdType::PrepareFlashback` to prepare the raftstore for the // later flashback. Once invoked, we will update the persistent region meta and // the memory state of the flashback in Peer FSM to reject all read, write // and scheduling operations for this region when propose/apply before we // start the actual data flashback transaction command in the next phase. - let req = new_flashback_req(ctx, AdminCmdType::PrepareFlashback); + let mut req = new_flashback_req(ctx, AdminCmdType::PrepareFlashback); + req.mut_admin_request() + .mut_prepare_flashback() + .set_start_ts(start_ts); exec_admin(&*self.router, req) } diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index 526a1fab3ca..28f2a1d5d25 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -15,7 +15,7 @@ use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; use futures::{Future, Stream, StreamExt}; use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, Request}; pub use node::NodeV2; -use raftstore::store::RegionSnapshot; +use raftstore::store::{util::encode_start_ts_into_flag_data, RegionSnapshot}; use raftstore_v2::{ router::{ message::SimpleWrite, CmdResChannelBuilder, CmdResEvent, CmdResStream, PeerMsg, RaftRouter, @@ -23,7 +23,7 @@ use raftstore_v2::{ SimpleWriteBinary, SimpleWriteEncoder, }; use tikv_kv::{Modify, WriteEvent}; -use tikv_util::{codec::number::NumberEncoder, time::Instant}; +use tikv_util::time::Instant; use txn_types::{TxnExtra, TxnExtraScheduler, WriteBatchFlags}; use super::{ @@ -153,18 +153,21 @@ impl tikv_kv::Engine for RaftKv2 { let 
mut header = new_request_header(ctx.pb_ctx); let mut flags = 0; - if ctx.pb_ctx.get_stale_read() && ctx.start_ts.map_or(true, |ts| !ts.is_zero()) { - let mut data = [0u8; 8]; - (&mut data[..]) - .encode_u64(ctx.start_ts.unwrap_or_default().into_inner()) - .unwrap(); + let need_encoded_start_ts = ctx.start_ts.map_or(true, |ts| !ts.is_zero()); + if ctx.pb_ctx.get_stale_read() && need_encoded_start_ts { flags |= WriteBatchFlags::STALE_READ.bits(); - header.set_flag_data(data.into()); } if ctx.allowed_in_flashback { flags |= WriteBatchFlags::FLASHBACK.bits(); } header.set_flags(flags); + // Encode `start_ts` in `flag_data` for the check of stale read and flashback. + if need_encoded_start_ts { + encode_start_ts_into_flag_data( + &mut header, + ctx.start_ts.unwrap_or_default().into_inner(), + ); + } let mut cmd = RaftCmdRequest::default(); cmd.set_header(header); diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index d42eb510891..da292eca17d 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -1450,7 +1450,9 @@ fn future_prepare_flashback_to_version( ) -> impl Future> { let storage = storage.clone(); async move { - let f = storage.get_engine().start_flashback(req.get_context()); + let f = storage + .get_engine() + .start_flashback(req.get_context(), req.get_start_ts()); let mut res = f.await.map_err(storage::Error::from); if matches!(res, Ok(())) { // After the region is put into the flashback state, we need to do a special @@ -1488,10 +1490,7 @@ fn future_flashback_to_version( res = f.await.unwrap_or_else(|e| Err(box_err!(e))); } if matches!(res, Ok(())) { - // Only finish flashback when Flashback executed successfully. - fail_point!("skip_finish_flashback_to_version", |_| { - Ok(FlashbackToVersionResponse::default()) - }); + // Only finish when flashback executed successfully. 
let f = storage.get_engine().end_flashback(req.get_context()); res = f.await.map_err(storage::Error::from); } diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 8709373b766..61a3fb39097 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -711,19 +711,17 @@ fn test_mvcc_flashback() { } #[test] -#[cfg(feature = "failpoints")] fn test_mvcc_flashback_block_rw() { let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); - fail::cfg("skip_finish_flashback_to_version", "return").unwrap(); - // Flashback - must_flashback_to_version(&client, ctx.clone(), 0, 1, 2); - // Try to read. + // Prepare the flashback. + must_prepare_flashback(&client, ctx.clone(), 1, 2); + // Try to read version 3 (after flashback, FORBIDDEN). let (k, v) = (b"key".to_vec(), b"value".to_vec()); // Get let mut get_req = GetRequest::default(); get_req.set_context(ctx.clone()); get_req.key = k.clone(); - get_req.version = 1; + get_req.version = 3; let get_resp = client.kv_get(&get_req).unwrap(); assert!(get_resp.get_region_error().has_flashback_in_progress()); assert!(!get_resp.has_error()); @@ -733,28 +731,48 @@ fn test_mvcc_flashback_block_rw() { scan_req.set_context(ctx.clone()); scan_req.start_key = k.clone(); scan_req.limit = 1; - scan_req.version = 1; + scan_req.version = 3; let scan_resp = client.kv_scan(&scan_req).unwrap(); assert!(scan_resp.get_region_error().has_flashback_in_progress()); + assert!(!scan_resp.has_error()); assert!(scan_resp.pairs.is_empty()); - // Try to write. + // Try to read version 1 (before flashback, ALLOWED). 
+ // Get + let mut get_req = GetRequest::default(); + get_req.set_context(ctx.clone()); + get_req.key = k.clone(); + get_req.version = 1; + let get_resp = client.kv_get(&get_req).unwrap(); + assert!(!get_resp.has_region_error()); + assert!(!get_resp.has_error()); + assert!(get_resp.value.is_empty()); + // Scan + let mut scan_req = ScanRequest::default(); + scan_req.set_context(ctx.clone()); + scan_req.start_key = k.clone(); + scan_req.limit = 1; + scan_req.version = 1; + let scan_resp = client.kv_scan(&scan_req).unwrap(); + assert!(!scan_resp.has_region_error()); + assert!(!scan_resp.has_error()); + assert!(scan_resp.pairs.is_empty()); + // Try to write (FORBIDDEN). // Prewrite let mut mutation = Mutation::default(); mutation.set_op(Op::Put); mutation.set_key(k.clone()); mutation.set_value(v); - let prewrite_resp = try_kv_prewrite(&client, ctx, vec![mutation], k, 1); + let prewrite_resp = try_kv_prewrite(&client, ctx.clone(), vec![mutation], k, 1); assert!(prewrite_resp.get_region_error().has_flashback_in_progress()); - fail::remove("skip_finish_flashback_to_version"); + // Finish the flashback. + must_finish_flashback(&client, ctx, 1, 2, 3); } #[test] -#[cfg(feature = "failpoints")] fn test_mvcc_flashback_block_scheduling() { let (mut cluster, client, ctx) = must_new_cluster_and_kv_client(); - fail::cfg("skip_finish_flashback_to_version", "return").unwrap(); - // Flashback - must_flashback_to_version(&client, ctx, 0, 1, 2); + // Prepare the flashback. + must_prepare_flashback(&client, ctx.clone(), 0, 1); // Try to transfer leader. let transfer_leader_resp = cluster.try_transfer_leader(1, new_peer(2, 2)); assert!( @@ -763,7 +781,8 @@ fn test_mvcc_flashback_block_scheduling() { .get_error() .has_flashback_in_progress() ); - fail::remove("skip_finish_flashback_to_version"); + // Finish the flashback. 
+ must_finish_flashback(&client, ctx, 0, 1, 2); } #[test] @@ -794,16 +813,7 @@ fn test_mvcc_flashback_unprepared() { assert!(!get_resp.has_error()); assert_eq!(get_resp.value, b"".to_vec()); // Mock the flashback retry. - let mut req = FlashbackToVersionRequest::default(); - req.set_context(ctx); - req.set_start_ts(6); - req.set_commit_ts(7); - req.version = 0; - req.start_key = b"a".to_vec(); - req.end_key = b"z".to_vec(); - let resp = client.kv_flashback_to_version(&req).unwrap(); - assert!(!resp.has_region_error()); - assert!(resp.get_error().is_empty()); + must_finish_flashback(&client, ctx.clone(), 0, 6, 7); let get_resp = client.kv_get(&get_req).unwrap(); assert!(!get_resp.has_region_error()); assert!(!get_resp.has_error()); @@ -811,7 +821,7 @@ fn test_mvcc_flashback_unprepared() { } #[test] -fn test_mvcc_flashback_with_unlimit_range() { +fn test_mvcc_flashback_with_unlimited_range() { let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); let (k, v) = (b"key".to_vec(), b"value".to_vec()); let mut ts = 0; From 7ec73fdd440a1d81e2a5a8c62aa9e31828959903 Mon Sep 17 00:00:00 2001 From: iosmanthus Date: Tue, 31 Jan 2023 10:41:55 +0800 Subject: [PATCH 102/115] import: sst_importer support download SST and rewrite into keyspace data. (#14046) ref tikv/tikv#12999 import: sst_importer support download SST and rewrite into keyspace data. 
Signed-off-by: iosmanthus --- Cargo.lock | 1 + components/keys/Cargo.toml | 1 + components/keys/src/rewrite.rs | 10 +++ components/sst_importer/src/import_file.rs | 3 +- components/sst_importer/src/sst_importer.rs | 90 ++++++++++++++++----- components/txn_types/src/types.rs | 10 +++ src/import/sst_service.rs | 4 +- 7 files changed, 95 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 46eac5930a1..f2ce2ba4ce1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2724,6 +2724,7 @@ dependencies = [ "panic_hook", "thiserror", "tikv_alloc", + "tikv_util", ] [[package]] diff --git a/components/keys/Cargo.toml b/components/keys/Cargo.toml index 5f2bf5935ee..b5a6412d00a 100644 --- a/components/keys/Cargo.toml +++ b/components/keys/Cargo.toml @@ -10,6 +10,7 @@ kvproto = { workspace = true } log_wrappers = { workspace = true } thiserror = "1.0" tikv_alloc = { workspace = true } +tikv_util = { workspace = true } [dev-dependencies] panic_hook = { workspace = true } diff --git a/components/keys/src/rewrite.rs b/components/keys/src/rewrite.rs index 51f588e9732..68541bb50e0 100644 --- a/components/keys/src/rewrite.rs +++ b/components/keys/src/rewrite.rs @@ -6,11 +6,21 @@ use std::ops::Bound::{self, *}; +use tikv_util::codec::bytes::encode_bytes; + /// An error indicating the key cannot be rewritten because it does not start /// with the given prefix. #[derive(PartialEq, Debug, Clone)] pub struct WrongPrefix; +pub fn encode_bound(bound: Bound>) -> Bound> { + match bound { + Included(k) => Included(encode_bytes(&k)), + Excluded(k) => Excluded(encode_bytes(&k)), + Unbounded => Unbounded, + } +} + /// Rewrites the prefix of a byte array. 
pub fn rewrite_prefix( old_prefix: &[u8], diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index f766729a066..84d2f67bbab 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -15,6 +15,7 @@ use engine_traits::{ iter_option, EncryptionKeyManager, Iterator, KvEngine, RefIterable, SstMetaInfo, SstReader, }; use file_system::{get_io_rate_limiter, sync_dir, File, OpenOptions}; +use keys::data_key; use kvproto::{import_sstpb::*, kvrpcpb::ApiVersion}; use tikv_util::time::Instant; use uuid::{Builder as UuidBuilder, Uuid}; @@ -336,7 +337,7 @@ impl ImportDir { let sst_reader = RocksSstReader::open_with_env(path_str, Some(env))?; for &(start, end) in TIDB_RANGES_COMPLEMENT { - let opt = iter_option(start, end, false); + let opt = iter_option(&data_key(start), &data_key(end), false); let mut iter = sst_reader.iter(opt)?; if iter.seek(start)? { error!( diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 8b6d64f483f..fabe9e2a13a 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -32,7 +32,10 @@ use kvproto::{ kvrpcpb::ApiVersion, }; use tikv_util::{ - codec::stream_event::{EventEncoder, EventIterator, Iterator as EIterator}, + codec::{ + bytes::{decode_bytes_in_place, encode_bytes}, + stream_event::{EventEncoder, EventIterator, Iterator as EIterator}, + }, config::ReadableSize, stream::block_on_external_io, sys::SysQuota, @@ -53,13 +56,18 @@ use crate::{ #[derive(Default, Debug, Clone)] pub struct DownloadExt<'a> { cache_key: Option<&'a str>, + req_type: DownloadRequestType, } impl<'a> DownloadExt<'a> { - pub fn cache_key(self, key: &'a str) -> Self { - Self { - cache_key: Some(key), - } + pub fn cache_key(mut self, key: &'a str) -> Self { + self.cache_key = Some(key); + self + } + + pub fn req_type(mut self, req_type: DownloadRequestType) -> Self { + 
self.req_type = req_type; + self } } @@ -896,16 +904,20 @@ impl SstImporter { let sst_reader = RocksSstReader::open_with_env(dst_file_name, Some(env))?; sst_reader.verify_checksum()?; + // undo key rewrite so we could compare with the keys inside SST + let old_prefix = rewrite_rule.get_old_key_prefix(); + let new_prefix = rewrite_rule.get_new_key_prefix(); + let req_type = ext.req_type; + debug!("downloaded file and verified"; "meta" => ?meta, "name" => name, "path" => dst_file_name, + "old_prefix" => log_wrappers::Value::key(old_prefix), + "new_prefix" => log_wrappers::Value::key(new_prefix), + "req_type" => ?req_type, ); - // undo key rewrite so we could compare with the keys inside SST - let old_prefix = rewrite_rule.get_old_key_prefix(); - let new_prefix = rewrite_rule.get_new_key_prefix(); - let range_start = meta.get_range().get_start(); let range_end = meta.get_range().get_end(); let range_start_bound = key_to_bound(range_start); @@ -915,14 +927,14 @@ impl SstImporter { key_to_bound(range_end) }; - let range_start = + let mut range_start = keys::rewrite::rewrite_prefix_of_start_bound(new_prefix, old_prefix, range_start_bound) .map_err(|_| Error::WrongKeyPrefix { what: "SST start range", key: range_start.to_vec(), prefix: new_prefix.to_vec(), })?; - let range_end = + let mut range_end = keys::rewrite::rewrite_prefix_of_end_bound(new_prefix, old_prefix, range_end_bound) .map_err(|_| Error::WrongKeyPrefix { what: "SST end range", @@ -930,6 +942,11 @@ impl SstImporter { prefix: new_prefix.to_vec(), })?; + if req_type == DownloadRequestType::Keyspace { + range_start = keys::rewrite::encode_bound(range_start); + range_end = keys::rewrite::encode_bound(range_end); + } + let start_rename_rewrite = Instant::now(); // read the first and last keys from the SST, determine if we could // simply move the entire SST instead of iterating and generate a new one. @@ -942,9 +959,15 @@ impl SstImporter { return Ok(None); } if !iter.seek_to_first()? 
{ + let mut range = meta.get_range().clone(); + if req_type == DownloadRequestType::Keyspace { + *range.mut_start() = encode_bytes(&range.take_start()); + *range.mut_end() = encode_bytes(&range.take_end()); + } // the SST is empty, so no need to iterate at all (should be impossible?) - return Ok(Some(meta.get_range().clone())); + return Ok(Some(range)); } + let start_key = keys::origin_key(iter.key()); if is_before_start_bound(start_key, &range_start) { // SST's start is before the range to consume, so needs to iterate to skip over @@ -995,8 +1018,10 @@ impl SstImporter { } // perform iteration and key rewrite. - let mut key = keys::data_key(new_prefix); - let new_prefix_data_key_len = key.len(); + let mut data_key = keys::DATA_PREFIX_KEY.to_vec(); + let data_key_prefix_len = keys::DATA_PREFIX_KEY.len(); + let mut user_key = new_prefix.to_vec(); + let user_key_prefix_len = new_prefix.len(); let mut first_key = None; match range_start { @@ -1016,10 +1041,22 @@ impl SstImporter { .unwrap(); while iter.valid()? 
{ - let old_key = keys::origin_key(iter.key()); - if is_after_end_bound(old_key, &range_end) { + let mut old_key = Cow::Borrowed(keys::origin_key(iter.key())); + let mut ts = None; + + if is_after_end_bound(old_key.as_ref(), &range_end) { break; } + + if req_type == DownloadRequestType::Keyspace { + ts = Some(Key::decode_ts_bytes_from(old_key.as_ref())?.to_owned()); + old_key = { + let mut key = old_key.to_vec(); + decode_bytes_in_place(&mut key, false)?; + Cow::Owned(key) + }; + } + if !old_key.starts_with(old_prefix) { return Err(Error::WrongKeyPrefix { what: "Key in SST", @@ -1027,12 +1064,21 @@ impl SstImporter { prefix: old_prefix.to_vec(), }); } - key.truncate(new_prefix_data_key_len); - key.extend_from_slice(&old_key[old_prefix.len()..]); + + data_key.truncate(data_key_prefix_len); + user_key.truncate(user_key_prefix_len); + user_key.extend_from_slice(&old_key[old_prefix.len()..]); + if req_type == DownloadRequestType::Keyspace { + data_key.extend(encode_bytes(&user_key)); + data_key.extend(ts.unwrap()); + } else { + data_key.extend_from_slice(&user_key); + } + let mut value = Cow::Borrowed(iter.value()); if rewrite_rule.new_timestamp != 0 { - key = Key::from_encoded(key) + data_key = Key::from_encoded(data_key) .truncate_ts() .map_err(|e| { Error::BadFormat(format!( @@ -1056,10 +1102,10 @@ impl SstImporter { } } - sst_writer.put(&key, &value)?; + sst_writer.put(&data_key, &value)?; iter.next()?; if first_key.is_none() { - first_key = Some(keys::origin_key(&key).to_vec()); + first_key = Some(keys::origin_key(&data_key).to_vec()); } } @@ -1078,7 +1124,7 @@ impl SstImporter { let mut final_range = Range::default(); final_range.set_start(start_key); - final_range.set_end(keys::origin_key(&key).to_vec()); + final_range.set_end(keys::origin_key(&data_key).to_vec()); Ok(Some(final_range)) } else { // nothing is written: prevents finishing the SST at all. 
diff --git a/components/txn_types/src/types.rs b/components/txn_types/src/types.rs index 60e64bf444a..15779df426a 100644 --- a/components/txn_types/src/types.rs +++ b/components/txn_types/src/types.rs @@ -192,6 +192,16 @@ impl Key { Ok(number::decode_u64_desc(&mut ts)?.into()) } + /// Decode the timestamp from a ts encoded key and return in bytes. + #[inline] + pub fn decode_ts_bytes_from(key: &[u8]) -> Result<&[u8], codec::Error> { + let len = key.len(); + if len < number::U64_SIZE { + return Err(codec::Error::KeyLength); + } + Ok(&key[key.len() - number::U64_SIZE..]) + } + /// Whether the user key part of a ts encoded key `ts_encoded_key` equals to /// the encoded user key `user_key`. /// diff --git a/src/import/sst_service.rs b/src/import/sst_service.rs index ea52cad0095..08eabe32f0c 100644 --- a/src/import/sst_service.rs +++ b/src/import/sst_service.rs @@ -783,7 +783,9 @@ where cipher, limiter, engine, - DownloadExt::default().cache_key(req.get_storage_cache_id()), + DownloadExt::default() + .cache_key(req.get_storage_cache_id()) + .req_type(req.get_request_type()), ); let mut resp = DownloadResponse::default(); match res.await { From ec2f4dc5420dbdab05ea47ff1724a54e765cdca4 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Mon, 30 Jan 2023 21:47:54 -0800 Subject: [PATCH 103/115] rocksdb: reduce rocksdb block size to 16KB (#14053) close tikv/tikv#14052 The writecf and defaultcf's default block size is changed to 16KB to improve read performance (reduce read amplification) Signed-off-by: qi.xu Co-authored-by: qi.xu --- etc/config-template.toml | 4 ++-- src/config/mod.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/etc/config-template.toml b/etc/config-template.toml index 62623afed0e..59152570da1 100644 --- a/etc/config-template.toml +++ b/etc/config-template.toml @@ -679,7 +679,7 @@ ## The data block size. RocksDB compresses data based on the unit of block. 
## Similar to page in other databases, block is the smallest unit cached in block-cache. Note that ## the block size specified here corresponds to uncompressed data. -# block-size = "64KB" +# block-size = "16KB" ## If you're doing point lookups you definitely want to turn bloom filters on. We use bloom filters ## to avoid unnecessary disk reads. Default bits_per_key is 10, which yields ~1% false positive @@ -915,7 +915,7 @@ [rocksdb.writecf] ## Recommend to set it the same as `rocksdb.defaultcf.compression-per-level`. # compression-per-level = ["no", "no", "lz4", "lz4", "lz4", "zstd", "zstd"] -# block-size = "64KB" +# block-size = "16KB" ## Recommend to set it the same as `rocksdb.defaultcf.write-buffer-size`. # write-buffer-size = "128MB" diff --git a/src/config/mod.rs b/src/config/mod.rs index 99b593e2443..0a32c99f422 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -631,7 +631,7 @@ impl Default for DefaultCfConfig { let total_mem = SysQuota::memory_limit_in_bytes(); DefaultCfConfig { - block_size: ReadableSize::kb(64), + block_size: ReadableSize::kb(16), block_cache_size: memory_limit_for_cf(false, CF_DEFAULT, total_mem), disable_block_cache: false, cache_index_and_filter_blocks: true, @@ -756,7 +756,7 @@ impl Default for WriteCfConfig { }; WriteCfConfig { - block_size: ReadableSize::kb(64), + block_size: ReadableSize::kb(16), block_cache_size: memory_limit_for_cf(false, CF_WRITE, total_mem), disable_block_cache: false, cache_index_and_filter_blocks: true, From 15d6040c68eb0f2edf6b9304aebf69092657f8a4 Mon Sep 17 00:00:00 2001 From: tonyxuqqi Date: Mon, 30 Jan 2023 22:01:54 -0800 Subject: [PATCH 104/115] storage: add an alias partitioned-raft-kv for RaftKv2 (#14083) ref tikv/tikv#12842 add an alias partitioned-raft-kv for RaftKv2 Signed-off-by: qi.xu Co-authored-by: qi.xu Co-authored-by: Ti Chi Robot --- src/config/mod.rs | 2 +- src/storage/config.rs | 1 + tests/integrations/config/test-custom.toml | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) 
diff --git a/src/config/mod.rs b/src/config/mod.rs index 0a32c99f422..7e006ef2eed 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -3135,7 +3135,7 @@ impl TikvConfig { if self.storage.engine == EngineType::RaftKv2 { self.raft_store.store_io_pool_size = cmp::max(self.raft_store.store_io_pool_size, 1); if !self.raft_engine.enable { - panic!("raft-kv2 only supports raft log engine."); + panic!("partitioned-raft-kv only supports raft log engine."); } } diff --git a/src/storage/config.rs b/src/storage/config.rs index 68d739c1639..d74bd721104 100644 --- a/src/storage/config.rs +++ b/src/storage/config.rs @@ -35,6 +35,7 @@ const DEFAULT_RESERVED_RAFT_SPACE_GB: u64 = 1; #[serde(rename_all = "kebab-case")] pub enum EngineType { RaftKv, + #[serde(alias = "partitioned-raft-kv")] RaftKv2, } diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index b096437e60c..d79ec7899e2 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -90,7 +90,7 @@ a = "b" [storage] data-dir = "/var" -engine = "raft-kv2" +engine = "partitioned-raft-kv" gc-ratio-threshold = 1.2 max-key-size = 4096 scheduler-concurrency = 123 From 23a228824cb0e82cc495edb28c3276774f97aead Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Tue, 31 Jan 2023 14:35:54 +0800 Subject: [PATCH 105/115] resolved_ts: reduce network traffic by filter regions (#14098) close tikv/tikv#14092 resolved_ts: reduce network traffic by filter regions Signed-off-by: Neil Shen Co-authored-by: Ti Chi Robot --- components/cdc/src/endpoint.rs | 2 +- components/resolved_ts/src/advance.rs | 128 +++++++++++++++++++++++++- src/config/mod.rs | 2 +- 3 files changed, 128 insertions(+), 4 deletions(-) diff --git a/components/cdc/src/endpoint.rs b/components/cdc/src/endpoint.rs index 6d64754d042..2b4eb9ff226 100644 --- a/components/cdc/src/endpoint.rs +++ b/components/cdc/src/endpoint.rs @@ -1543,7 +1543,7 @@ mod tests { } let diff = 
cfg.diff(&updated_cfg); ep.run(Task::ChangeConfig(diff)); - assert_eq!(ep.config.min_ts_interval, ReadableDuration::millis(200)); + assert_eq!(ep.config.min_ts_interval, ReadableDuration::secs(1)); assert_eq!(ep.config.hibernate_regions_compatible, true); { diff --git a/components/resolved_ts/src/advance.rs b/components/resolved_ts/src/advance.rs index a78e903bc72..fd58fac1601 100644 --- a/components/resolved_ts/src/advance.rs +++ b/components/resolved_ts/src/advance.rs @@ -149,6 +149,7 @@ pub struct LeadershipResolver { region_map: HashMap>, // region_id -> peers id, record the responses. resp_map: HashMap>, + checking_regions: HashSet, valid_regions: HashSet, gc_interval: Duration, @@ -176,6 +177,7 @@ impl LeadershipResolver { region_map: HashMap::default(), resp_map: HashMap::default(), valid_regions: HashSet::default(), + checking_regions: HashSet::default(), last_gc_time: Instant::now_coarse(), gc_interval, } @@ -188,6 +190,7 @@ impl LeadershipResolver { self.region_map = HashMap::default(); self.resp_map = HashMap::default(); self.valid_regions = HashSet::default(); + self.checking_regions = HashSet::default(); self.last_gc_time = now; } } @@ -203,6 +206,7 @@ impl LeadershipResolver { for v in self.resp_map.values_mut() { v.clear(); } + self.checking_regions.clear(); self.valid_regions.clear(); } @@ -248,7 +252,11 @@ impl LeadershipResolver { // This function broadcasts a special message to all stores, gets the leader id // of them to confirm whether current peer has a quorum which accepts its // leadership. - pub async fn resolve(&mut self, _regions: Vec, min_ts: TimeStamp) -> Vec { + pub async fn resolve(&mut self, regions: Vec, min_ts: TimeStamp) -> Vec { + if regions.is_empty() { + return regions; + } + // Clear previous result before resolving. self.clear(); // GC when necessary to prevent memory leak. 
@@ -256,15 +264,22 @@ impl LeadershipResolver { PENDING_RTS_COUNT.inc(); defer!(PENDING_RTS_COUNT.dec()); - fail_point!("before_sync_replica_read_state", |_| _regions.clone()); + fail_point!("before_sync_replica_read_state", |_| regions.clone()); let store_id = self.store_id; let valid_regions = &mut self.valid_regions; let region_map = &mut self.region_map; let resp_map = &mut self.resp_map; let store_req_map = &mut self.store_req_map; + let checking_regions = &mut self.checking_regions; + for region_id in ®ions { + checking_regions.insert(*region_id); + } self.region_read_progress.with(|registry| { for (region_id, read_progress) in registry { + if !checking_regions.contains(region_id) { + continue; + } let core = read_progress.get_core(); let local_leader_info = core.get_local_leader_info(); let leader_id = local_leader_info.get_leader_id(); @@ -512,3 +527,112 @@ async fn get_tikv_client( RTS_TIKV_CLIENT_INIT_DURATION_HISTOGRAM.observe(start.saturating_elapsed_secs()); Ok(cli) } + +#[cfg(test)] +mod tests { + use std::{ + sync::{ + mpsc::{channel, Receiver, Sender}, + Arc, + }, + time::Duration, + }; + + use grpcio::{self, ChannelBuilder, EnvBuilder, Server, ServerBuilder}; + use kvproto::{metapb::Region, tikvpb::Tikv, tikvpb_grpc::create_tikv}; + use pd_client::PdClient; + use raftstore::store::util::RegionReadProgress; + use tikv_util::store::new_peer; + + use super::*; + + #[derive(Clone)] + struct MockTikv { + req_tx: Sender, + } + + impl Tikv for MockTikv { + fn check_leader( + &mut self, + ctx: grpcio::RpcContext<'_>, + req: CheckLeaderRequest, + sink: ::grpcio::UnarySink, + ) { + self.req_tx.send(req).unwrap(); + ctx.spawn(async { + sink.success(CheckLeaderResponse::default()).await.unwrap(); + }) + } + } + + struct MockPdClient {} + impl PdClient for MockPdClient {} + + fn new_rpc_suite(env: Arc) -> (Server, TikvClient, Receiver) { + let (tx, rx) = channel(); + let tikv_service = MockTikv { req_tx: tx }; + let builder = 
ServerBuilder::new(env.clone()).register_service(create_tikv(tikv_service)); + let mut server = builder.bind("127.0.0.1", 0).build().unwrap(); + server.start(); + let (_, port) = server.bind_addrs().next().unwrap(); + let addr = format!("127.0.0.1:{}", port); + let channel = ChannelBuilder::new(env).connect(&addr); + let client = TikvClient::new(channel); + (server, client, rx) + } + + #[tokio::test] + async fn test_resolve_leader_request_size() { + let env = Arc::new(EnvBuilder::new().build()); + let (mut server, tikv_client, rx) = new_rpc_suite(env.clone()); + + let mut region1 = Region::default(); + region1.id = 1; + region1.peers.push(new_peer(1, 1)); + region1.peers.push(new_peer(2, 11)); + let progress1 = RegionReadProgress::new(®ion1, 1, 1, 1); + progress1.update_leader_info(1, 1, ®ion1); + + let mut region2 = Region::default(); + region2.id = 2; + region2.peers.push(new_peer(1, 2)); + region2.peers.push(new_peer(2, 22)); + let progress2 = RegionReadProgress::new(®ion2, 1, 1, 2); + progress2.update_leader_info(2, 2, ®ion2); + + let mut leader_resolver = LeadershipResolver::new( + 1, // store id + Arc::new(MockPdClient {}), + env.clone(), + Arc::new(SecurityManager::default()), + RegionReadProgressRegistry::new(), + Duration::from_secs(1), + ); + leader_resolver + .tikv_clients + .lock() + .await + .insert(2 /* store id */, tikv_client); + leader_resolver + .region_read_progress + .insert(1, Arc::new(progress1)); + leader_resolver + .region_read_progress + .insert(2, Arc::new(progress2)); + + leader_resolver.resolve(vec![1, 2], TimeStamp::new(1)).await; + let req = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + assert_eq!(req.regions.len(), 2); + + // Checking one region only send 1 region in request. + leader_resolver.resolve(vec![1], TimeStamp::new(1)).await; + let req = rx.recv_timeout(Duration::from_secs(1)).unwrap(); + assert_eq!(req.regions.len(), 1); + + // Checking zero region does not send request. 
+ leader_resolver.resolve(vec![], TimeStamp::new(1)).await; + rx.recv_timeout(Duration::from_secs(1)).unwrap_err(); + + let _ = server.shutdown().await; + } +} diff --git a/src/config/mod.rs b/src/config/mod.rs index 7e006ef2eed..3274d5442df 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -2653,7 +2653,7 @@ pub struct CdcConfig { impl Default for CdcConfig { fn default() -> Self { Self { - min_ts_interval: ReadableDuration::millis(200), + min_ts_interval: ReadableDuration::secs(1), hibernate_regions_compatible: true, // 4 threads for incremental scan. incremental_scan_threads: 4, From a33eb2d08991f278785e8b3047c643bf07839bce Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 31 Jan 2023 14:59:54 +0800 Subject: [PATCH 106/115] raftstore-v2: fix peer not cleanup when it replicates more logs (#14101) ref tikv/tikv#12842 If it accepts more logs than conf remove itself, applied_index == commit_index will never be true. So we should check if it's a tombstone already first. Signed-off-by: Jay Lee Co-authored-by: Ti Chi Robot --- components/raftstore-v2/src/fsm/store.rs | 4 + components/raftstore-v2/src/operation/life.rs | 7 +- .../raftstore-v2/src/operation/ready/mod.rs | 26 +++--- .../src/operation/ready/snapshot.rs | 17 +++- components/raftstore-v2/src/raft/storage.rs | 7 +- components/raftstore-v2/src/router/message.rs | 10 ++- .../tests/integrations/cluster.rs | 13 ++- .../tests/integrations/test_conf_change.rs | 80 ++++++++++++++++++- .../raftstore/src/store/async_io/read.rs | 6 +- 9 files changed, 144 insertions(+), 26 deletions(-) diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index 86e3540d23c..17c0a9a50f9 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -266,6 +266,10 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { .fsm .store .on_store_unreachable(self.store_ctx, to_store_id), + #[cfg(feature = "testexport")] + 
StoreMsg::WaitFlush { region_id, ch } => { + self.fsm.store.on_wait_flush(self.store_ctx, region_id, ch) + } } } } diff --git a/components/raftstore-v2/src/operation/life.rs b/components/raftstore-v2/src/operation/life.rs index 88646f06b59..3a9f678bd8c 100644 --- a/components/raftstore-v2/src/operation/life.rs +++ b/components/raftstore-v2/src/operation/life.rs @@ -284,8 +284,11 @@ impl Peer { #[inline] pub fn postponed_destroy(&self) -> bool { let entry_storage = self.storage().entry_storage(); - // TODO: check actual split index instead of commit index. - entry_storage.applied_index() != entry_storage.commit_index() + // If it's marked as tombstone, then it must be changed by conf change. In + // this case, all following entries are skipped so applied_index never equals + // to commit_index. + (self.storage().region_state().get_state() != PeerState::Tombstone + && entry_storage.applied_index() != entry_storage.commit_index()) // Wait for critical commands like split. || self.has_pending_tombstone_tablets() } diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 38d126ac87a..e7c32e742ec 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -52,7 +52,7 @@ use crate::{ batch::StoreContext, fsm::{PeerFsmDelegate, Store}, raft::{Peer, Storage}, - router::{ApplyTask, PeerMsg, PeerTick}, + router::{PeerMsg, PeerTick}, worker::tablet_gc, }; @@ -70,6 +70,19 @@ impl Store { ctx.router .broadcast_normal(|| PeerMsg::StoreUnreachable { to_store_id }); } + + #[cfg(feature = "testexport")] + pub fn on_wait_flush( + &mut self, + ctx: &mut StoreContext, + region_id: u64, + ch: crate::router::FlushChannel, + ) where + EK: KvEngine, + ER: RaftEngine, + { + let _ = ctx.router.send(region_id, PeerMsg::WaitFlush(ch)); + } } impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, T> { @@ -455,6 +468,7 @@ impl Peer { && 
!self.raft_group().has_ready() && (self.serving() || self.postponed_destroy()) { + self.maybe_schedule_gen_snapshot(); #[cfg(feature = "testexport")] self.async_writer.notify_flush(); return; @@ -501,15 +515,7 @@ impl Peer { self.handle_raft_committed_entries(ctx, ready.take_committed_entries()); } - // Check whether there is a pending generate snapshot task, the task - // needs to be sent to the apply system. - // Always sending snapshot task after apply task, so it gets latest - // snapshot. - if let Some(gen_task) = self.storage_mut().take_gen_snap_task() { - self.apply_scheduler() - .unwrap() - .send(ApplyTask::Snapshot(gen_task)); - } + self.maybe_schedule_gen_snapshot(); let ready_number = ready.number(); let mut write_task = WriteTask::new(self.region_id(), self.peer_id(), ready_number); diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index bcbe220252b..1fae813577c 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -41,13 +41,14 @@ use raftstore::{ RAFT_INIT_LOG_INDEX, RAFT_INIT_LOG_TERM, }, }; -use slog::{error, info, warn}; +use slog::{debug, error, info, warn}; use tikv_util::{box_err, log::SlogFormat, slog_panic}; use crate::{ fsm::ApplyResReporter, operation::{command::temp_split_path, SharedReadTablet}, raft::{Apply, Peer, Storage}, + router::ApplyTask, Result, StoreContext, }; @@ -161,6 +162,19 @@ pub fn install_tablet( } impl Peer { + /// Check whether there is a pending generate snapshot task, the task + /// needs to be sent to the apply system. + /// Always sending snapshot task after apply task, so it gets latest + /// snapshot. 
+ #[inline] + pub fn maybe_schedule_gen_snapshot(&mut self) { + if let Some(gen_task) = self.storage_mut().take_gen_snap_task() { + self.apply_scheduler() + .unwrap() + .send(ApplyTask::Snapshot(gen_task)); + } + } + pub fn on_snapshot_generated(&mut self, snapshot: GenSnapRes) { if self.storage_mut().on_snapshot_generated(snapshot) { self.raft_group_mut().ping(); @@ -270,6 +284,7 @@ impl Apply { /// Will schedule a task to read worker and then generate a snapshot /// asynchronously. pub fn schedule_gen_snapshot(&mut self, snap_task: GenSnapTask) { + debug!(self.logger, "scheduling snapshot"; "task" => ?snap_task); // Do not generate, the peer is removed. if self.tombstone() { snap_task.canceled.store(true, Ordering::SeqCst); diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 1d1f53f9c53..ce15ac20621 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -9,7 +9,7 @@ use collections::HashMap; use engine_traits::{KvEngine, RaftEngine}; use kvproto::{ metapb, - raft_serverpb::{PeerState, RaftApplyState, RaftLocalState, RegionLocalState}, + raft_serverpb::{RaftApplyState, RaftLocalState, RegionLocalState}, }; use raft::{ eraftpb::{ConfState, Entry, Snapshot}, @@ -234,10 +234,7 @@ impl Storage { #[inline] pub fn tablet_index(&self) -> u64 { - match self.region_state.get_state() { - PeerState::Tombstone | PeerState::Applying => 0, - _ => self.region_state.get_tablet_index(), - } + self.region_state.get_tablet_index() } #[inline] diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index a9353e171d9..8814a97cc5f 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -260,7 +260,15 @@ pub enum StoreMsg { SplitInit(Box), Tick(StoreTick), Start, - StoreUnreachable { to_store_id: u64 }, + StoreUnreachable { + to_store_id: u64, + }, + /// A message that 
used to check if a flush is happened. + #[cfg(feature = "testexport")] + WaitFlush { + region_id: u64, + ch: super::FlushChannel, + }, } impl ResourceMetered for StoreMsg {} diff --git a/components/raftstore-v2/tests/integrations/cluster.rs b/components/raftstore-v2/tests/integrations/cluster.rs index 90f7c500903..2076272b44b 100644 --- a/components/raftstore-v2/tests/integrations/cluster.rs +++ b/components/raftstore-v2/tests/integrations/cluster.rs @@ -39,7 +39,7 @@ use raftstore::{ }; use raftstore_v2::{ create_store_batch_system, - router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter}, + router::{DebugInfoChannel, FlushChannel, PeerMsg, QueryResult, RaftRouter, StoreMsg}, Bootstrap, SimpleWriteEncoder, StateStorage, StoreSystem, }; use resource_metering::CollectorRegHandle; @@ -127,7 +127,16 @@ impl TestRouter { let res = self.send(region_id, PeerMsg::WaitFlush(ch)); match res { Ok(_) => return block_on(sub.result()).is_some(), - Err(TrySendError::Disconnected(_)) => return false, + Err(TrySendError::Disconnected(m)) => { + let PeerMsg::WaitFlush(ch) = m else { unreachable!() }; + match self + .store_router() + .send_control(StoreMsg::WaitFlush { region_id, ch }) + { + Ok(_) => return block_on(sub.result()).is_some(), + Err(_) => return false, + } + } Err(TrySendError::Full(_)) => thread::sleep(Duration::from_millis(10)), } } diff --git a/components/raftstore-v2/tests/integrations/test_conf_change.rs b/components/raftstore-v2/tests/integrations/test_conf_change.rs index 8a075bb9a35..4b3445a00ad 100644 --- a/components/raftstore-v2/tests/integrations/test_conf_change.rs +++ b/components/raftstore-v2/tests/integrations/test_conf_change.rs @@ -2,8 +2,9 @@ use std::{self, time::Duration}; -use engine_traits::{Peekable, CF_DEFAULT}; -use kvproto::raft_cmdpb::AdminCmdType; +use engine_traits::{Peekable, RaftEngineReadOnly, CF_DEFAULT}; +use futures::executor::block_on; +use kvproto::{raft_cmdpb::AdminCmdType, raft_serverpb::PeerState}; use 
raft::prelude::ConfChangeType; use raftstore_v2::{ router::{PeerMsg, PeerTick}, @@ -102,3 +103,78 @@ fn test_simple_change() { let mut cached = cluster.node(0).tablet_registry().get(2).unwrap(); check_skip_wal(cached.latest().unwrap().as_inner().path()); } + +/// Test if a peer can be destroyed by conf change if logs after conf change are +/// also replicated. +#[test] +fn test_remove_by_conf_change() { + let cluster = Cluster::with_node_count(2, None); + let region_id = 2; + let mut req = cluster.routers[0].new_request_for(2); + let admin_req = req.mut_admin_request(); + admin_req.set_cmd_type(AdminCmdType::ChangePeer); + admin_req + .mut_change_peer() + .set_change_type(ConfChangeType::AddLearnerNode); + let store_id = cluster.node(1).id(); + let new_peer = new_learner_peer(store_id, 10); + admin_req.mut_change_peer().set_peer(new_peer); + let resp = cluster.routers[0].admin_command(2, req.clone()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + // So heartbeat will create a learner. + cluster.dispatch(2, vec![]); + // Trigger the raft tick to replica the log to the learner and execute the + // snapshot task. + cluster.routers[0] + .send(region_id, PeerMsg::Tick(PeerTick::Raft)) + .unwrap(); + cluster.dispatch(region_id, vec![]); + // Wait some time so snapshot can be generated. + std::thread::sleep(Duration::from_millis(100)); + cluster.dispatch(region_id, vec![]); + + // write one kv to make flow control replicated. 
+ let (key, val) = (b"key", b"value"); + let header = Box::new(cluster.routers[0].new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, val); + let (msg, _) = PeerMsg::simple_write(header, put.encode()); + cluster.routers[0].send(region_id, msg).unwrap(); + cluster.dispatch(region_id, vec![]); + + let new_conf_ver = req.get_header().get_region_epoch().get_conf_ver() + 1; + req.mut_header() + .mut_region_epoch() + .set_conf_ver(new_conf_ver); + req.mut_admin_request() + .mut_change_peer() + .set_change_type(ConfChangeType::RemoveNode); + let (admin_msg, admin_sub) = PeerMsg::admin_command(req.clone()); + // write one kv after removal + let (key, val) = (b"key1", b"value"); + let header = Box::new(cluster.routers[0].new_request_for(region_id).take_header()); + let mut put = SimpleWriteEncoder::with_capacity(64); + put.put(CF_DEFAULT, key, val); + let (msg, sub) = PeerMsg::simple_write(header, put.encode()); + // Send them at the same time so they will be all sent to learner. + cluster.routers[0].send(region_id, admin_msg).unwrap(); + cluster.routers[0].send(region_id, msg).unwrap(); + let resp = block_on(admin_sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + let resp = block_on(sub.result()).unwrap(); + assert!(!resp.get_header().has_error(), "{:?}", resp); + + // Dispatch messages so the learner will receive conf remove and write at the + // same time. + cluster.dispatch(region_id, vec![]); + cluster.routers[1].wait_flush(region_id, Duration::from_millis(300)); + // Wait for apply. 
+ std::thread::sleep(Duration::from_millis(100)); + let raft_engine = &cluster.node(1).running_state().unwrap().raft_engine; + let region_state = raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + assert_eq!(region_state.get_state(), PeerState::Tombstone); + assert_eq!(raft_engine.get_raft_state(region_id).unwrap(), None); +} diff --git a/components/raftstore/src/store/async_io/read.rs b/components/raftstore/src/store/async_io/read.rs index b298ed3529e..45492feb294 100644 --- a/components/raftstore/src/store/async_io/read.rs +++ b/components/raftstore/src/store/async_io/read.rs @@ -227,10 +227,10 @@ where error!("failed to create checkpointer"; "region_id" => region_id, "error" => %e); SNAP_COUNTER.generate.fail.inc(); } else { + let elapsed = start.saturating_elapsed_secs(); SNAP_COUNTER.generate.success.inc(); - SNAP_HISTOGRAM - .generate - .observe(start.saturating_elapsed_secs()); + SNAP_HISTOGRAM.generate.observe(elapsed); + info!("snapshot generated"; "region_id" => region_id, "elapsed" => elapsed, "key" => ?snap_key, "for_balance" => for_balance); res = Some(Box::new((snapshot, to_peer))) } From 9c0df6d68c72d30021b36d24275fdceca9864235 Mon Sep 17 00:00:00 2001 From: you06 Date: Wed, 1 Feb 2023 15:43:55 +0800 Subject: [PATCH 107/115] cop: handle unset scan details in store batch (#14102) close tikv/tikv#14109 Signed-off-by: you06 --- src/coprocessor/endpoint.rs | 2 + tests/integrations/coprocessor/test_select.rs | 60 +++++++++++-------- 2 files changed, 37 insertions(+), 25 deletions(-) diff --git a/src/coprocessor/endpoint.rs b/src/coprocessor/endpoint.rs index b9d01419a49..6ac1bebc541 100644 --- a/src/coprocessor/endpoint.rs +++ b/src/coprocessor/endpoint.rs @@ -600,6 +600,8 @@ impl Endpoint { response.set_locked(lock_info); } response.set_other_error(resp.take_other_error()); + // keep the exec details already generated. 
+ response.set_exec_details_v2(resp.take_exec_details_v2()); GLOBAL_TRACKERS.with_tracker(cur_tracker, |tracker| { tracker.write_scan_detail( response.mut_exec_details_v2().mut_scan_detail_v2(), diff --git a/tests/integrations/coprocessor/test_select.rs b/tests/integrations/coprocessor/test_select.rs index ad195f62774..056f24b5fee 100644 --- a/tests/integrations/coprocessor/test_select.rs +++ b/tests/integrations/coprocessor/test_select.rs @@ -4,11 +4,10 @@ use std::{cmp, thread, time::Duration}; use engine_traits::CF_LOCK; use kvproto::{ - coprocessor::{Request, Response, StoreBatchTask}, - errorpb, - kvrpcpb::{Context, IsolationLevel, LockInfo}, + coprocessor::{Request, Response, StoreBatchTask, StoreBatchTaskResponse}, + kvrpcpb::{Context, IsolationLevel}, }; -use protobuf::{Message, SingularPtrField}; +use protobuf::Message; use raftstore::store::Bucket; use test_coprocessor::*; use test_raftstore::{Cluster, ServerCluster}; @@ -2151,11 +2150,14 @@ fn test_batch_request() { } req }; - let verify_response = |result: &QueryResult, - data: &[u8], - region_err: &SingularPtrField, - locked: &SingularPtrField, - other_err: &String| { + let verify_response = |result: &QueryResult, resp: &Response| { + let (data, details, region_err, locked, other_err) = ( + resp.get_data(), + resp.get_exec_details_v2(), + &resp.region_error, + &resp.locked, + &resp.other_error, + ); match result { QueryResult::Valid(res) => { let expected_len = res.len(); @@ -2179,6 +2181,12 @@ fn test_batch_request() { assert!(region_err.is_none()); assert!(locked.is_none()); assert!(other_err.is_empty()); + let scan_details = details.get_scan_detail_v2(); + assert_eq!(scan_details.processed_versions, row_count as u64); + if row_count > 0 { + assert!(scan_details.processed_versions_size > 0); + assert!(scan_details.total_versions > 0); + } } QueryResult::ErrRegion => { assert!(region_err.is_some()); @@ -2198,6 +2206,20 @@ fn test_batch_request() { } }; + let batch_resp_2_resp = |batch_resp: &mut 
StoreBatchTaskResponse| -> Response { + let mut response = Response::default(); + response.set_data(batch_resp.take_data()); + if let Some(err) = batch_resp.region_error.take() { + response.set_region_error(err); + } + if let Some(lock_info) = batch_resp.locked.take() { + response.set_locked(lock_info); + } + response.set_other_error(batch_resp.take_other_error()); + response.set_exec_details_v2(batch_resp.take_exec_details_v2()); + response + }; + for (ranges, results, invalid_epoch, key_is_locked) in cases.iter() { let mut req = prepare_req(&mut cluster, ranges); if *invalid_epoch { @@ -2229,25 +2251,13 @@ fn test_batch_request() { } } let mut resp = handle_request(&endpoint, req); - let batch_results = resp.take_batch_responses().to_vec(); + let mut batch_results = resp.take_batch_responses().to_vec(); for (i, result) in results.iter().enumerate() { if i == 0 { - verify_response( - result, - resp.get_data(), - &resp.region_error, - &resp.locked, - &resp.other_error, - ); + verify_response(result, &resp); } else { - let batch_resp = batch_results.get(i - 1).unwrap(); - verify_response( - result, - batch_resp.get_data(), - &batch_resp.region_error, - &batch_resp.locked, - &batch_resp.other_error, - ); + let batch_resp = batch_results.get_mut(i - 1).unwrap(); + verify_response(result, &batch_resp_2_resp(batch_resp)); }; } if *key_is_locked { From db14c53267ebf815d6a8ae12036bd5e20326f7ee Mon Sep 17 00:00:00 2001 From: ShuNing Date: Thu, 2 Feb 2023 10:23:55 +0800 Subject: [PATCH 108/115] resource_control: unify wru/rru to ru (#14121) close tikv/tikv#14120 resource_control: unify wru/rru to ru Signed-off-by: nolouch --- Cargo.lock | 2 +- .../resource_control/src/resource_group.rs | 43 +++++++++---------- components/resource_control/src/service.rs | 14 +++--- 3 files changed, 28 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f2ce2ba4ce1..1747e74fafa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2730,7 +2730,7 @@ dependencies = [ [[package]] 
name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#1b2b4114103afb06796b7e44f45f7e55133673c0" +source = "git+https://github.com/pingcap/kvproto.git#a7c51106dfe70ebf59221018b50d1ec6ad25da74" dependencies = [ "futures 0.3.15", "grpcio", diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index 1524ebcba5d..a0abfb11464 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -41,14 +41,10 @@ pub struct ResourceGroupManager { impl ResourceGroupManager { fn get_ru_setting(rg: &ResourceGroup, is_read: bool) -> u64 { match (rg.get_mode(), is_read) { - (GroupMode::RuMode, true) => rg + // RU mode, read and write use the same setting. + (GroupMode::RuMode, _) => rg .get_r_u_settings() - .get_r_r_u() - .get_settings() - .get_fill_rate(), - (GroupMode::RuMode, false) => rg - .get_r_u_settings() - .get_w_r_u() + .get_r_u() .get_settings() .get_fill_rate(), // TODO: currently we only consider the cpu usage in the read path, we may also take @@ -311,6 +307,10 @@ pub(crate) mod tests { use super::*; + pub fn new_resource_group_ru(name: String, ru: u64) -> ResourceGroup { + new_resource_group(name, true, ru, ru) + } + pub fn new_resource_group( name: String, is_ru_mode: bool, @@ -328,15 +328,12 @@ pub(crate) mod tests { }; group.set_mode(mode); if is_ru_mode { + assert!(read_tokens == write_tokens); let mut ru_setting = GroupRequestUnitSettings::new(); ru_setting - .mut_r_r_u() + .mut_r_u() .mut_settings() .set_fill_rate(read_tokens); - ru_setting - .mut_w_r_u() - .mut_settings() - .set_fill_rate(write_tokens); group.set_r_u_settings(ru_setting); } else { let mut resource_setting = GroupRawResourceSettings::new(); @@ -357,7 +354,7 @@ pub(crate) mod tests { fn test_resource_group() { let resource_manager = ResourceGroupManager::default(); - let group1 = new_resource_group("TEST".into(), true, 100, 100); + let group1 = 
new_resource_group_ru("TEST".into(), 100); resource_manager.add_resource_group(group1); assert!(resource_manager.get_resource_group("test1").is_none()); @@ -367,7 +364,7 @@ pub(crate) mod tests { group .value() .get_r_u_settings() - .get_r_r_u() + .get_r_u() .get_settings() .get_fill_rate(), 100 @@ -375,14 +372,14 @@ pub(crate) mod tests { drop(group); assert_eq!(resource_manager.resource_groups.len(), 1); - let group1 = new_resource_group("Test".into(), true, 200, 100); + let group1 = new_resource_group_ru("Test".into(), 200); resource_manager.add_resource_group(group1); let group = resource_manager.get_resource_group("test").unwrap(); assert_eq!( group .value() .get_r_u_settings() - .get_r_r_u() + .get_r_u() .get_settings() .get_fill_rate(), 200 @@ -390,7 +387,7 @@ pub(crate) mod tests { drop(group); assert_eq!(resource_manager.resource_groups.len(), 1); - let group2 = new_resource_group("test2".into(), true, 400, 200); + let group2 = new_resource_group_ru("test2".into(), 400); resource_manager.add_resource_group(group2); assert_eq!(resource_manager.resource_groups.len(), 2); @@ -451,7 +448,7 @@ pub(crate) mod tests { drop(group2); // test add 1 new resource group - let new_group = new_resource_group("new_group".into(), true, 500, 500); + let new_group = new_resource_group_ru("new_group".into(), 500); resource_manager.add_resource_group(new_group); assert_eq!(resource_ctl.resource_consumptions.len(), 4); @@ -466,29 +463,29 @@ pub(crate) mod tests { let resource_ctl = resource_manager.derive_controller("test_read".into(), true); let resource_ctl_write = resource_manager.derive_controller("test_write".into(), false); - let group1 = new_resource_group("test1".into(), true, 5000, 1000); + let group1 = new_resource_group_ru("test1".into(), 5000); resource_manager.add_resource_group(group1); assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 20); assert_eq!( resource_ctl_write.resource_group("test1".as_bytes()).weight, - 100 + 20 ); // add a resource 
group with big ru - let group1 = new_resource_group("test2".into(), true, 50000, 2000); + let group1 = new_resource_group_ru("test2".into(), 50000); resource_manager.add_resource_group(group1); assert_eq!(*resource_ctl.max_ru_quota.lock().unwrap(), 50000); assert_eq!(resource_ctl.resource_group("test1".as_bytes()).weight, 100); assert_eq!(resource_ctl.resource_group("test2".as_bytes()).weight, 10); // resource_ctl_write should be unchanged. - assert_eq!(*resource_ctl_write.max_ru_quota.lock().unwrap(), 10000); + assert_eq!(*resource_ctl_write.max_ru_quota.lock().unwrap(), 50000); assert_eq!( resource_ctl_write.resource_group("test1".as_bytes()).weight, 100 ); assert_eq!( resource_ctl_write.resource_group("test2".as_bytes()).weight, - 50 + 10 ); } } diff --git a/components/resource_control/src/service.rs b/components/resource_control/src/service.rs index ea9a9d724b9..2381b168987 100644 --- a/components/resource_control/src/service.rs +++ b/components/resource_control/src/service.rs @@ -125,7 +125,7 @@ pub mod tests { use test_pd::{mocker::Service, util::*, Server as MockServer}; use tikv_util::{config::ReadableDuration, worker::Builder}; - use crate::resource_group::tests::new_resource_group; + use crate::resource_group::tests::{new_resource_group, new_resource_group_ru}; fn new_test_server_and_client( update_interval: ReadableDuration, @@ -202,12 +202,12 @@ pub mod tests { s_clone.watch_resource_groups().await; }); // Mock add - let group1 = new_resource_group("TEST1".into(), true, 100, 100); + let group1 = new_resource_group_ru("TEST1".into(), 100); add_resource_group(s.pd_client.clone(), group1); - let group2 = new_resource_group("TEST2".into(), true, 100, 100); + let group2 = new_resource_group_ru("TEST2".into(), 100); add_resource_group(s.pd_client.clone(), group2); // Mock modify - let group2 = new_resource_group("TEST2".into(), true, 50, 50); + let group2 = new_resource_group_ru("TEST2".into(), 50); add_resource_group(s.pd_client.clone(), group2); let (res, 
revision) = block_on(s.list_resource_groups()); assert_eq!(res.len(), 2); @@ -227,7 +227,7 @@ pub mod tests { group .value() .get_r_u_settings() - .get_r_r_u() + .get_r_u() .get_settings() .get_fill_rate(), 50 @@ -247,7 +247,7 @@ pub mod tests { s_clone.watch_resource_groups().await; }); // Mock add - let group1 = new_resource_group("TEST1".into(), true, 100, 100); + let group1 = new_resource_group_ru("TEST1".into(), 100); add_resource_group(s.pd_client.clone(), group1); // Mock reboot watch server let watch_global_config_fp = "watch_global_config_return"; @@ -255,7 +255,7 @@ pub mod tests { std::thread::sleep(Duration::from_millis(100)); fail::remove(watch_global_config_fp); // Mock add after rebooting will success - let group1 = new_resource_group("TEST2".into(), true, 100, 100); + let group1 = new_resource_group_ru("TEST2".into(), 100); add_resource_group(s.pd_client.clone(), group1); // Wait watcher update std::thread::sleep(Duration::from_secs(1)); From df5cc1d18708ccf171a2202e977a76f9b9d94043 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Thu, 2 Feb 2023 15:32:50 +0800 Subject: [PATCH 109/115] [Cloud] Add tests for assertion of single replica migration (#264) Signed-off-by: CalvinNeo --- engine_store_ffi/src/observer.rs | 6 +++- proxy_tests/proxy/fast_add_peer.rs | 58 ++++++++++++++++++++++++++++++ proxy_tests/proxy/proxy.rs | 24 +++++++++++++ 3 files changed, 87 insertions(+), 1 deletion(-) diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 32ac9122f65..f6501fb6899 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -1189,7 +1189,11 @@ impl RegionChangeObserver for TiFlashObs } fn pre_write_apply_state(&self, _ob_ctx: &mut ObserverContext<'_>) -> bool { - fail::fail_point!("on_pre_persist_with_finish", |_| { true }); + fail::fail_point!("on_pre_persist_with_finish", |_| { + // Some test need persist apply state for Leader logic, + // including fast add peer. 
+ true + }); false } diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 47f3ee211d8..54a6b6e1e16 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -590,3 +590,61 @@ fn test_fall_back_to_slow_path() { fail::remove("go_fast_path_not_allow"); cluster.shutdown(); } + +#[test] +fn test_single_replica_migrate() { + let (mut cluster, pd_client) = new_mock_cluster_snap(0, 3); + pd_client.disable_default_operator(); + cluster.cfg.proxy_cfg.engine_store.enable_fast_add_peer = true; + + tikv_util::set_panic_hook(true, "./"); + // Can always apply snapshot immediately + fail::cfg("on_can_apply_snapshot", "return(true)").unwrap(); + fail::cfg("on_pre_persist_with_finish", "return").unwrap(); + + let _ = cluster.run_conf_change(); + + cluster.must_put(b"k1", b"v1"); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1])); + + // Fast add peer 2 + pd_client.must_add_peer(1, new_learner_peer(2, 2)); + check_key(&cluster, b"k1", b"v1", Some(true), None, Some(vec![1, 2])); + must_wait_until_cond_node(&cluster, 1, Some(vec![2]), &|states: &States| -> bool { + find_peer_by_id(states.in_disk_region_state.get_region(), 2).is_some() + }); + + fail::cfg("ffi_fast_add_peer_from_id", "return(2)").unwrap(); + + // Remove peer 2. + pd_client.must_remove_peer(1, new_learner_peer(2, 2)); + must_wait_until_cond_generic( + &cluster, + 1, + None, + &|states: &HashMap| -> bool { states.get(&2).is_none() }, + ); + + // Remove peer 2 and then add some new logs. + cluster.must_put(b"krm2", b"v"); + check_key(&cluster, b"krm2", b"v", Some(true), None, Some(vec![1])); + + // Try fast add peer from removed peer 2. + // TODO It will fallback to slow path if we don't support single replica + // migration. 
+ fail::cfg("go_fast_path_not_allow", "panic").unwrap(); + pd_client.must_add_peer(1, new_learner_peer(3, 3)); + check_key(&cluster, b"krm2", b"v", Some(true), None, Some(vec![3])); + std::thread::sleep(std::time::Duration::from_millis(2000)); + must_wait_until_cond_generic( + &cluster, + 1, + None, + &|states: &HashMap| -> bool { states.get(&3).is_some() }, + ); + fail::remove("go_fast_path_not_allow"); + + fail::remove("on_can_apply_snapshot"); + fail::remove("on_pre_persist_with_finish"); + cluster.shutdown(); +} diff --git a/proxy_tests/proxy/proxy.rs b/proxy_tests/proxy/proxy.rs index 11dc8944df9..bb09dfaee61 100644 --- a/proxy_tests/proxy/proxy.rs +++ b/proxy_tests/proxy/proxy.rs @@ -609,6 +609,9 @@ pub fn must_wait_until_cond_node( break; } } + } else { + // If region not exists in some store. + ok = false; } } if ok { @@ -622,6 +625,27 @@ pub fn must_wait_until_cond_node( } } +pub fn must_wait_until_cond_generic( + cluster: &Cluster, + region_id: u64, + store_ids: Option>, + pred: &dyn Fn(&HashMap) -> bool, +) -> HashMap { + let mut retry = 0; + loop { + let new_states = maybe_collect_states(&cluster, region_id, store_ids.clone()); + let ok = pred(&new_states); + if ok { + break new_states; + } + std::thread::sleep(std::time::Duration::from_millis(100)); + retry += 1; + if retry >= 30 { + panic!("states not as expect after timeout") + } + } +} + pub fn force_compact_log( cluster: &mut Cluster, key: &[u8], From d1d29203e6a93b05dd435ea27a9b39fb30b23f41 Mon Sep 17 00:00:00 2001 From: Hu# Date: Thu, 2 Feb 2023 16:33:56 +0800 Subject: [PATCH 110/115] pd_client: fix item value type (#14106) close tikv/tikv#14104 We need to use the new field to support item value as bytes to avoid proto string check failures. 
Signed-off-by: husharp --- Cargo.lock | 3 +-- components/resource_control/Cargo.toml | 1 - .../resource_control/src/resource_group.rs | 1 - components/resource_control/src/service.rs | 27 +++++++++++++------ components/test_pd/src/mocker/mod.rs | 5 ++-- components/test_pd/src/server.rs | 2 +- .../failpoints/cases/test_pd_client_legacy.rs | 12 ++++++--- 7 files changed, 32 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1747e74fafa..78c9e88b538 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2730,7 +2730,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#a7c51106dfe70ebf59221018b50d1ec6ad25da74" +source = "git+https://github.com/pingcap/kvproto.git#2b853bed812556901846f42820b63d8a0d9c8d24" dependencies = [ "futures 0.3.15", "grpcio", @@ -4697,7 +4697,6 @@ dependencies = [ "test_pd", "test_pd_client", "tikv_util", - "tokio", "yatp", ] diff --git a/components/resource_control/Cargo.toml b/components/resource_control/Cargo.toml index 3f796627040..39d37ac0f6b 100644 --- a/components/resource_control/Cargo.toml +++ b/components/resource_control/Cargo.toml @@ -26,5 +26,4 @@ slog-global = { version = "0.1", git = "https://github.com/breeswish/slog-global test_pd = { workspace = true } test_pd_client = { workspace = true } tikv_util = { workspace = true } -tokio = { version = "1.5", features = ["time"] } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } diff --git a/components/resource_control/src/resource_group.rs b/components/resource_control/src/resource_group.rs index a0abfb11464..c5112c13516 100644 --- a/components/resource_control/src/resource_group.rs +++ b/components/resource_control/src/resource_group.rs @@ -358,7 +358,6 @@ pub(crate) mod tests { resource_manager.add_resource_group(group1); assert!(resource_manager.get_resource_group("test1").is_none()); - let group = resource_manager.get_resource_group("test").unwrap(); assert_eq!( group diff --git 
a/components/resource_control/src/service.rs b/components/resource_control/src/service.rs index 2381b168987..fc24af4fdc4 100644 --- a/components/resource_control/src/service.rs +++ b/components/resource_control/src/service.rs @@ -2,10 +2,10 @@ use std::{sync::Arc, time::Duration}; -use futures::StreamExt; +use futures::{compat::Future01CompatExt, StreamExt}; use kvproto::{pdpb::EventType, resource_manager::ResourceGroup}; use pd_client::{Error as PdError, PdClient, RpcClient, RESOURCE_CONTROL_CONFIG_PATH}; -use tikv_util::error; +use tikv_util::{error, timer::GLOBAL_TIMER_HANDLE}; use crate::ResourceGroupManager; @@ -31,6 +31,8 @@ impl ResourceManagerService { } } +const RETRY_INTERVAL: Duration = Duration::from_secs(1); // to consistent with pd_client + impl ResourceManagerService { pub async fn watch_resource_groups(&mut self) { // Firstly, load all resource groups as of now. @@ -56,7 +58,7 @@ impl ResourceManagerService { EventType::Put => { if let Ok(group) = protobuf::parse_from_bytes::( - item.get_value().as_bytes(), + item.get_payload(), ) { self.manager.add_resource_group(group); @@ -69,7 +71,10 @@ impl ResourceManagerService { } Err(err) => { error!("failed to get stream"; "err" => ?err); - tokio::time::sleep(Duration::from_secs(1)).await; + let _ = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + RETRY_INTERVAL) + .compat() + .await; } } } @@ -85,7 +90,10 @@ impl ResourceManagerService { } Err(err) => { error!("failed to watch resource groups"; "err" => ?err); - tokio::time::sleep(Duration::from_secs(1)).await; + let _ = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + RETRY_INTERVAL) + .compat() + .await; } } } @@ -101,13 +109,16 @@ impl ResourceManagerService { Ok((items, revision)) => { let groups = items .into_iter() - .filter_map(|g| protobuf::parse_from_bytes(g.get_value().as_bytes()).ok()) + .filter_map(|g| protobuf::parse_from_bytes(g.get_payload()).ok()) .collect(); return (groups, revision); } Err(err) => { error!("failed to load 
global config"; "err" => ?err); - tokio::time::sleep(Duration::from_secs(1)).await; + let _ = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + RETRY_INTERVAL) + .compat() + .await; } } } @@ -142,7 +153,7 @@ pub mod tests { item.set_name(group.get_name().to_string()); let mut buf = Vec::new(); group.write_to_vec(&mut buf).unwrap(); - item.set_value(String::from_utf8(buf).unwrap()); + item.set_payload(buf); futures::executor::block_on(async move { pd_client diff --git a/components/test_pd/src/mocker/mod.rs b/components/test_pd/src/mocker/mod.rs index b9ae839b06e..fc257b12a9f 100644 --- a/components/test_pd/src/mocker/mod.rs +++ b/components/test_pd/src/mocker/mod.rs @@ -47,7 +47,7 @@ pub trait PdMocker { .map(|kv| { let mut item = GlobalConfigItem::default(); item.set_name(String::from_utf8(kv.key().to_vec()).unwrap()); - item.set_value(String::from_utf8(kv.value().to_vec()).unwrap()); + item.set_payload(kv.value().into()); item }) .collect(); @@ -68,7 +68,8 @@ pub trait PdMocker { block_on(async move { match item.get_kind() { EventType::Put => { - let kv = KeyValue(MetaKey(item.get_name().into()), item.get_value().into()); + let kv = + KeyValue(MetaKey(item.get_name().into()), item.get_payload().into()); cli.lock().await.set(kv).await } EventType::Delete => { diff --git a/components/test_pd/src/server.rs b/components/test_pd/src/server.rs index cb495307a1f..28d4077b674 100644 --- a/components/test_pd/src/server.rs +++ b/components/test_pd/src/server.rs @@ -242,7 +242,7 @@ impl Pd for PdMock { KvEventType::Delete => EventType::Delete, }); change.set_name(from_utf8(event.pair.key()).unwrap().to_string()); - change.set_value(from_utf8(event.pair.value()).unwrap().to_string()); + change.set_payload(event.pair.value().into()); let mut wc = WatchGlobalConfigResponse::default(); wc.set_changes(vec![change].into()); let _ = sink.send((wc, WriteFlags::default())).await; diff --git a/tests/failpoints/cases/test_pd_client_legacy.rs 
b/tests/failpoints/cases/test_pd_client_legacy.rs index 3638e448bd9..d6cf7f1817d 100644 --- a/tests/failpoints/cases/test_pd_client_legacy.rs +++ b/tests/failpoints/cases/test_pd_client_legacy.rs @@ -1,6 +1,7 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + str::from_utf8, sync::{mpsc, Arc}, thread, time::Duration, @@ -118,7 +119,7 @@ fn test_load_global_config() { .map(|(name, value)| { let mut item = GlobalConfigItem::default(); item.set_name(name.to_string()); - item.set_value(value.to_string()); + item.set_payload(value.as_bytes().into()); item }) .collect::>(), @@ -132,7 +133,7 @@ fn test_load_global_config() { assert!( res.iter() .zip(check_items) - .all(|(item1, item2)| item1.name == item2.0 && item1.value == item2.1) + .all(|(item1, item2)| item1.name == item2.0 && item1.payload == item2.1.as_bytes()) ); assert_eq!(revision, 3); } @@ -156,7 +157,10 @@ fn test_watch_global_config_on_closed_server() { Ok(r) => { for item in r.get_changes() { assert_eq!(item.get_name(), items_clone[i].0); - assert_eq!(item.get_value(), items_clone[i].1); + assert_eq!( + from_utf8(item.get_payload()).unwrap(), + items_clone[i].1 + ); i += 1; } } @@ -181,7 +185,7 @@ fn test_watch_global_config_on_closed_server() { .map(|(name, value)| { let mut item = GlobalConfigItem::default(); item.set_name(name.to_string()); - item.set_value(value.to_string()); + item.set_payload(value.as_bytes().into()); item }) .collect::>(), From 37915609defa68c174e7659f99108a0982662989 Mon Sep 17 00:00:00 2001 From: Jay Date: Fri, 3 Feb 2023 13:57:56 +0800 Subject: [PATCH 111/115] raftstore-v2: add tablet logger and update dep (#14129) ref tikv/tikv#12842 - Update raft-engine to fix data corruption during restart - Add tablet logger so we can know which tablet the logs belongs to Signed-off-by: Jay Lee --- Cargo.lock | 209 +++++++++--------- cmd/tikv-ctl/src/main.rs | 3 +- components/engine_rocks/src/logger.rs | 24 ++ components/raft_log_engine/src/engine.rs | 11 +- 
.../raftstore-v2/src/operation/ready/mod.rs | 12 +- src/config/mod.rs | 15 +- src/server/engine_factory.rs | 13 +- tests/integrations/storage/test_titan.rs | 5 +- 8 files changed, 167 insertions(+), 125 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 78c9e88b538..633194d9323 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -225,7 +225,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d962799a5863fdf06fbf594e04102130582d010379137e9a98a7e2e693a5885" dependencies = [ "error-code", - "libc 0.2.132", + "libc 0.2.139", "wasm-bindgen", "winapi 0.3.9", ] @@ -256,7 +256,7 @@ version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1803c647a3ec87095e7ae7acfca019e98de5ec9a7d01343f611cf3152ed71a90" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -447,7 +447,7 @@ dependencies = [ "addr2line", "cc", "cfg-if 1.0.0", - "libc 0.2.132", + "libc 0.2.139", "miniz_oxide 0.4.4", "object", "rustc-demangle", @@ -603,7 +603,7 @@ dependencies = [ "bcc-sys", "bitflags", "byteorder", - "libc 0.2.132", + "libc 0.2.139", "regex", "thiserror", ] @@ -735,7 +735,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" dependencies = [ "cc", - "libc 0.2.132", + "libc 0.2.139", "pkg-config", ] @@ -761,7 +761,7 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7f788eaf239475a3c1e1acf89951255a46c4b9b46cf3e866fc4d0707b4b9e36" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "valgrind_request", ] @@ -934,7 +934,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f54d78e30b388d4815220c8dd03fea5656b6c6d32adb59e89061552a102f8da1" dependencies = [ "glob", - "libc 0.2.132", + "libc 0.2.139", "libloading", ] @@ -1018,7 +1018,7 @@ dependencies = [ "byteorder", "bytes", "error_code", - "libc 0.2.132", + "libc 0.2.139", "panic_hook", 
"protobuf", "rand 0.8.5", @@ -1077,7 +1077,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0a89e2ae426ea83155dccf10c0fa6b1463ef6d5fcb44cee0b224a408fa640a62" dependencies = [ "core-foundation-sys", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -1092,7 +1092,7 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9e393a7668fe1fad3075085b86c781883000b4ede868f43627b34a87c8b7ded" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -1150,7 +1150,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63aaaf47e457badbcb376c65a49d0f182c317ebd97dc6d1ced94c8e1d09c0f3a" dependencies = [ "criterion", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -1217,7 +1217,7 @@ dependencies = [ "cfg-if 1.0.0", "crossbeam-utils 0.8.8", "lazy_static", - "memoffset", + "memoffset 0.6.4", "scopeguard", ] @@ -1229,7 +1229,7 @@ dependencies = [ "autocfg", "cfg-if 1.0.0", "crossbeam-utils 0.8.11", - "memoffset", + "memoffset 0.6.4", "once_cell", "scopeguard", ] @@ -1420,7 +1420,7 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "redox_users", "winapi 0.3.9", ] @@ -1681,7 +1681,7 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5115567ac25674e0043e472be13d14e537f37ea8aa4bdc4aef0c89add1db1ff" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "str-buf", ] @@ -1789,7 +1789,7 @@ dependencies = [ "grpcio", "kvproto", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "libloading", "matches", "nix 0.24.1", @@ -1845,7 +1845,7 @@ dependencies = [ "crossbeam-utils 0.8.8", "fs2", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "maligned", "online_config", "openssl", @@ -1870,7 +1870,7 @@ version = "0.8.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "thiserror", "winapi 0.3.9", ] @@ -1882,7 +1882,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d34cfa13a63ae058bfa601fe9e313bbdb3746427c1459185464ce0fcf62e1e8" dependencies = [ "cfg-if 1.0.0", - "libc 0.2.132", + "libc 0.2.139", "redox_syscall 0.2.11", "winapi 0.3.9", ] @@ -1895,7 +1895,7 @@ checksum = "d691fdb3f817632d259d09220d4cf0991dbb2c9e59e044a02a59194bf6e14484" dependencies = [ "cc", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -1923,7 +1923,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2adaffba6388640136149e18ed080b77a78611c1e1d6de75aedcdf78df5d4682" dependencies = [ "crc32fast", - "libc 0.2.132", + "libc 0.2.139", "libz-sys", "miniz_oxide 0.3.7", ] @@ -1964,7 +1964,7 @@ name = "fs2" version = "0.4.3" source = "git+https://github.com/tabokie/fs2-rs?branch=tikv#cd503764a19a99d74c1ab424dd13d6bcd093fcae" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -1990,7 +1990,7 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f41b048a94555da0f42f1d632e2e19510084fb8e303b0daa2816e733fb3644a0" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -2226,7 +2226,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "473a1265acc8ff1e808cd0a1af8cee3c2ee5200916058a2ca113c29f2d903571" dependencies = [ "cfg-if 0.1.10", - "libc 0.2.132", + "libc 0.2.139", "wasi 0.7.0", ] @@ -2238,7 +2238,7 @@ checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" dependencies = [ "cfg-if 1.0.0", "js-sys", - "libc 0.2.132", + "libc 0.2.139", "wasi 0.10.2+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -2287,7 +2287,7 @@ dependencies = [ "futures-executor", "futures-util", 
"grpcio-sys", - "libc 0.2.132", + "libc 0.2.139", "log", "parking_lot 0.11.1", "protobuf", @@ -2324,7 +2324,7 @@ dependencies = [ "bindgen 0.59.2", "cc", "cmake", - "libc 0.2.132", + "libc 0.2.139", "libz-sys", "openssl-sys", "pkg-config", @@ -2392,7 +2392,7 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "307c3c9f937f38e3534b1d6447ecf090cafcc9744e4a6360e8b037b2cf5af120" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -2600,7 +2600,7 @@ checksum = "4816c66d2c8ae673df83366c18341538f234a26d65a9ecea5c348b453ac1d02f" dependencies = [ "bitflags", "inotify-sys", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -2609,7 +2609,7 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -2636,7 +2636,7 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -2682,7 +2682,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2b1d42ef453b30b7387e113da1c83ab1605d90c5b4e0eb8e96d016ed3b8c160" dependencies = [ "getrandom 0.1.12", - "libc 0.2.132", + "libc 0.2.139", "log", ] @@ -2823,9 +2823,9 @@ checksum = "e32a70cf75e5846d53a673923498228bbec6a8624708a9ea5645f075d6276122" [[package]] name = "libc" -version = "0.2.132" +version = "0.2.139" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8371e4e5341c3a96db127eb2465ac681ced4c433e01dd0e938adbef26ba93ba5" +checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" [[package]] name = "libfuzzer-sys" @@ -2865,7 +2865,7 @@ dependencies = [ "bzip2-sys", "cc", "cmake", - "libc 0.2.132", + "libc 0.2.139", "libtitan_sys", "libz-sys", "lz4-sys", 
@@ -2883,7 +2883,7 @@ dependencies = [ "bzip2-sys", "cc", "cmake", - "libc 0.2.132", + "libc 0.2.139", "libz-sys", "lz4-sys", "snappy-sys", @@ -2897,7 +2897,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de5435b8549c16d423ed0c03dbaafe57cf6c3344744f1242520d59c9d8ecec66" dependencies = [ "cc", - "libc 0.2.132", + "libc 0.2.139", "pkg-config", "vcpkg", ] @@ -2953,7 +2953,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dca79aa95d8b3226213ad454d328369853be3a1382d89532a854f4d69640acae" dependencies = [ "cc", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -3008,7 +3008,7 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -3017,7 +3017,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -3027,7 +3027,7 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "057a3db23999c867821a7a59feb06a578fcb03685e983dff90daf9e7d24ac08f" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -3039,6 +3039,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg", +] + [[package]] name = "memory_trace_macros" version = "0.1.0" @@ -3098,7 +3107,7 @@ dependencies = [ "fuchsia-zircon-sys", "iovec", "kernel32-sys", - "libc 0.2.132", + "libc 0.2.139", "log", "miow", "net2", @@ -3112,7 +3121,7 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e5d732bc30207a6423068df043e3d02e0735b155ad7ce1a6f76fe2baa5b158de" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "log", "wasi 0.11.0+wasi-snapshot-preview1", "windows-sys 0.42.0", @@ -3158,7 +3167,7 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1587ebb20a5b04738f16cffa7e2526f1b8496b84f92920facd518362ff1559eb" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -3209,7 +3218,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8d96b2e1c8da3957d58100b09f102c6d9cfdfced01b7ec5a8974044bb09dbd4" dependencies = [ "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "log", "openssl", "openssl-probe", @@ -3227,7 +3236,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "391630d12b68002ae1e25e8f974306474966550ad82dac6886fb8910c19568ae" dependencies = [ "cfg-if 0.1.10", - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -3239,22 +3248,22 @@ checksum = "8f17df307904acd05aa8e32e97bb20f2a0df1728bbc2d771ae8f9a90463441e9" dependencies = [ "bitflags", "cfg-if 1.0.0", - "libc 0.2.132", - "memoffset", + "libc 0.2.139", + "memoffset 0.6.4", ] [[package]] name = "nix" -version = "0.25.0" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e322c04a9e3440c327fca7b6c8a63e6890a32fa2ad689db972425f07e0d22abb" +checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ - "autocfg", "bitflags", "cfg-if 1.0.0", - "libc 0.2.132", - "memoffset", + "libc 0.2.139", + "memoffset 0.7.1", "pin-utils", + "static_assertions", ] [[package]] @@ -3311,7 +3320,7 @@ dependencies = [ "fsevent", "fsevent-sys", "inotify", - "libc 0.2.132", + "libc 0.2.139", "mio 0.6.23", "mio-extras", "walkdir", @@ -3464,7 +3473,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" dependencies = [ "hermit-abi", - 
"libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -3542,7 +3551,7 @@ dependencies = [ "bitflags", "cfg-if 1.0.0", "foreign-types", - "libc 0.2.132", + "libc 0.2.139", "once_cell", "openssl-macros", "openssl-sys", @@ -3582,7 +3591,7 @@ checksum = "e5f9bd0c2710541a3cda73d6f9ac4f1b240de4ae261065d309dbe73d9dceb42f" dependencies = [ "autocfg", "cc", - "libc 0.2.132", + "libc 0.2.139", "openssl-src", "pkg-config", "vcpkg", @@ -3612,7 +3621,7 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eebde548fbbf1ea81a99b128872779c437752fb99f217c45245e1a61dcd9edcd" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -3649,7 +3658,7 @@ checksum = "fa7a782938e745763fe6907fc6ba86946d72f49fe7e21de074e08128a99fb018" dependencies = [ "cfg-if 1.0.0", "instant", - "libc 0.2.132", + "libc 0.2.139", "redox_syscall 0.2.11", "smallvec", "winapi 0.3.9", @@ -3662,7 +3671,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28141e0cc4143da2443301914478dc976a61ffdb3f043058310c70df2fed8954" dependencies = [ "cfg-if 1.0.0", - "libc 0.2.132", + "libc 0.2.139", "redox_syscall 0.2.11", "smallvec", "windows-sys 0.32.0", @@ -3739,7 +3748,7 @@ checksum = "b8f94885300e262ef461aa9fd1afbf7df3caf9e84e271a74925d1c6c8b24830f" dependencies = [ "bitflags", "byteorder", - "libc 0.2.132", + "libc 0.2.139", "mmap", "nom 4.2.3", "phf", @@ -3882,7 +3891,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d27361d7578b410d0eb5fe815c2b2105b01ab770a7c738cb9a231457a809fcc7" dependencies = [ "ipnetwork", - "libc 0.2.132", + "libc 0.2.139", "pnet_base", "pnet_sys", "winapi 0.2.8", @@ -3894,7 +3903,7 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "82f881a6d75ac98c5541db6144682d1773bb14c6fc50c6ebac7086c8f7f23c29" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.2.8", "ws2_32-sys", ] @@ -3909,7 +3918,7 @@ dependencies = [ "cfg-if 
1.0.0", "findshlibs", "inferno", - "libc 0.2.132", + "libc 0.2.139", "log", "nix 0.24.1", "once_cell", @@ -3993,7 +4002,7 @@ dependencies = [ "byteorder", "hex 0.4.2", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -4002,7 +4011,7 @@ version = "0.4.2" source = "git+https://github.com/tikv/procinfo-rs?rev=6599eb9dca74229b2c1fcc44118bef7eff127128#6599eb9dca74229b2c1fcc44118bef7eff127128" dependencies = [ "byteorder", - "libc 0.2.132", + "libc 0.2.139", "nom 2.2.1", "rustc_version 0.2.3", ] @@ -4027,7 +4036,7 @@ dependencies = [ "cfg-if 1.0.0", "fnv", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "memchr", "parking_lot 0.11.1", "protobuf", @@ -4192,7 +4201,7 @@ dependencies = [ [[package]] name = "raft-engine" version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#82f6da7b8dff1856483e8e72a59dda903fb2499b" +source = "git+https://github.com/tikv/raft-engine.git#33530112c3a4acaf8c50ca9d0470284109926296" dependencies = [ "byteorder", "crc32fast", @@ -4203,11 +4212,11 @@ dependencies = [ "hex 0.4.2", "if_chain", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "log", "lz4-sys", "memmap2", - "nix 0.25.0", + "nix 0.26.2", "num-derive", "num-traits", "parking_lot 0.12.1", @@ -4226,7 +4235,7 @@ dependencies = [ [[package]] name = "raft-engine-ctl" version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#82f6da7b8dff1856483e8e72a59dda903fb2499b" +source = "git+https://github.com/tikv/raft-engine.git#33530112c3a4acaf8c50ca9d0470284109926296" dependencies = [ "clap 3.1.6", "env_logger", @@ -4390,7 +4399,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "552840b97013b1a26992c11eac34bdd778e464601a4c2054b5f0bff7c6761293" dependencies = [ "fuchsia-cprng", - "libc 0.2.132", + "libc 0.2.139", "rand_core 0.3.1", "rdrand", "winapi 0.3.9", @@ -4403,7 +4412,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" dependencies = [ "getrandom 0.1.12", - "libc 0.2.132", + "libc 0.2.139", "rand_chacha 0.2.1", "rand_core 0.5.1", "rand_hc", @@ -4415,7 +4424,7 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "rand_chacha 0.3.0", "rand_core 0.6.2", ] @@ -4710,7 +4719,7 @@ dependencies = [ "grpcio", "kvproto", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "log", "online_config", "pdqselect", @@ -4773,7 +4782,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b72b84d47e8ec5a4f2872e8262b8f8256c5be1c938a7d6d3a867a3ba8f722f74" dependencies = [ "cc", - "libc 0.2.132", + "libc 0.2.139", "once_cell", "spin", "untrusted", @@ -4786,7 +4795,7 @@ name = "rocksdb" version = "0.3.0" source = "git+https://github.com/tikv/rust-rocksdb.git#14e4fe7f47054408cf3d2905beeca798c6656191" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "librocksdb_sys", ] @@ -5034,7 +5043,7 @@ dependencies = [ "bitflags", "core-foundation", "core-foundation-sys", - "libc 0.2.132", + "libc 0.2.139", "security-framework-sys", ] @@ -5045,7 +5054,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3676258fd3cfe2c9a0ec99ce3038798d847ce3e4bb17746373eb9f0f1ac16339" dependencies = [ "core-foundation-sys", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -5241,7 +5250,7 @@ dependencies = [ "hex 0.4.2", "keys", "kvproto", - "libc 0.2.132", + "libc 0.2.139", "log", "log_wrappers", "pd_client", @@ -5302,7 +5311,7 @@ version = "0.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a253b5e89e2698464fc26b545c9edceb338e18a89effeeecfea192c3025be29d" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "signal-hook-registry", ] @@ -5312,7 +5321,7 @@ version = "1.4.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -5445,7 +5454,7 @@ version = "0.1.0" source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#8c12738bad811397600455d6982aff754ea2ac44" dependencies = [ "cmake", - "libc 0.2.132", + "libc 0.2.139", "pkg-config", ] @@ -5473,7 +5482,7 @@ version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02e2d2db9033d13a1567121ddd7a095ee144db4e1ca1b1bda3419bc0da294ebd" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "winapi 0.3.9", ] @@ -5683,7 +5692,7 @@ checksum = "ade661fa5e048ada64ad7901713301c21d2dbc5b65ee7967de8826c111452960" dependencies = [ "cfg-if 1.0.0", "core-foundation-sys", - "libc 0.2.132", + "libc 0.2.139", "ntapi", "once_cell", "rayon", @@ -5766,7 +5775,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" dependencies = [ "cfg-if 1.0.0", - "libc 0.2.132", + "libc 0.2.139", "rand 0.8.5", "redox_syscall 0.2.11", "remove_dir_all", @@ -6009,7 +6018,7 @@ dependencies = [ "hyper", "keys", "kvproto", - "libc 0.2.132", + "libc 0.2.139", "log_wrappers", "more-asserts", "online_config", @@ -6310,7 +6319,7 @@ dependencies = [ "keys", "kvproto", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "libloading", "log", "log_wrappers", @@ -6410,7 +6419,7 @@ dependencies = [ "hex 0.4.2", "keys", "kvproto", - "libc 0.2.132", + "libc 0.2.139", "log", "log_wrappers", "pd_client", @@ -6445,7 +6454,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e37706572f4b151dff7a0146e040804e9c26fe3a3118591112f05cf12a4216c1" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "paste", "tikv-jemalloc-sys", ] @@ -6458,7 +6467,7 @@ checksum = 
"aeab4310214fe0226df8bfeb893a291a58b19682e8a07e1e1d4483ad4200d315" dependencies = [ "cc", "fs_extra", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -6467,7 +6476,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20612db8a13a6c06d57ec83953694185a367e16945f66565e8028d2c0bd76979" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "tikv-jemalloc-sys", ] @@ -6490,7 +6499,7 @@ version = "0.1.0" dependencies = [ "fxhash", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "mimalloc", "snmalloc-rs", "tcmalloc", @@ -6559,7 +6568,7 @@ dependencies = [ "http", "kvproto", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", "log", "log_wrappers", "mnt", @@ -6608,7 +6617,7 @@ version = "0.1.42" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "redox_syscall 0.1.56", "winapi 0.3.9", ] @@ -6651,7 +6660,7 @@ checksum = "a9e03c497dc955702ba729190dc4aac6f2a0ce97f913e5b1b5912fc5039d9099" dependencies = [ "autocfg", "bytes", - "libc 0.2.132", + "libc 0.2.139", "memchr", "mio 0.8.5", "num_cpus", @@ -7037,7 +7046,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "055058552ca15c566082fc61da433ae678f78986a6f16957e33162d1b218792a" dependencies = [ "kernel32-sys", - "libc 0.2.132", + "libc 0.2.139", "winapi 0.2.8", ] @@ -7222,7 +7231,7 @@ checksum = "2a5a7e487e921cf220206864a94a89b6c6905bfc19f1057fa26a4cb360e5c1d2" dependencies = [ "either", "lazy_static", - "libc 0.2.132", + "libc 0.2.139", ] [[package]] @@ -7461,7 +7470,7 @@ version = "5.0.2+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" dependencies = [ - "libc 0.2.132", + "libc 0.2.139", "zstd-sys", ] @@ -7472,5 +7481,5 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"9fd07cbbc53846d9145dbffdf6dd09a7a0aa52be46741825f5c97bdd4f73f12b" dependencies = [ "cc", - "libc 0.2.132", + "libc 0.2.139", ] diff --git a/cmd/tikv-ctl/src/main.rs b/cmd/tikv-ctl/src/main.rs index 30cd7035bef..e4c7be98dba 100644 --- a/cmd/tikv-ctl/src/main.rs +++ b/cmd/tikv-ctl/src/main.rs @@ -686,7 +686,8 @@ fn build_rocks_opts(cfg: &TikvConfig) -> engine_rocks::RocksDbOptions { .unwrap() .map(Arc::new); let env = get_env(key_manager, None /* io_rate_limiter */).unwrap(); - cfg.rocksdb.build_opt(&cfg.rocksdb.build_resources(env)) + let resource = cfg.rocksdb.build_resources(env); + cfg.rocksdb.build_opt(&resource, cfg.storage.engine) } fn run_ldb_command(args: Vec, cfg: &TikvConfig) { diff --git a/components/engine_rocks/src/logger.rs b/components/engine_rocks/src/logger.rs index b7b196448c5..85f4de713ac 100644 --- a/components/engine_rocks/src/logger.rs +++ b/components/engine_rocks/src/logger.rs @@ -20,6 +20,30 @@ impl Logger for RocksdbLogger { } } +pub struct TabletLogger { + tablet_name: String, +} + +impl TabletLogger { + pub fn new(tablet_name: String) -> Self { + Self { tablet_name } + } +} + +impl Logger for TabletLogger { + fn logv(&self, log_level: InfoLogLevel, log: &str) { + match log_level { + InfoLogLevel::Header => info!(#"rocksdb_log_header", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Debug => debug!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Info => info!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Warn => warn!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Error => error!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + InfoLogLevel::Fatal => crit!(#"rocksdb_log", "[{}]{}", self.tablet_name, log), + _ => {} + } + } +} + #[derive(Default)] pub struct RaftDbLogger; diff --git a/components/raft_log_engine/src/engine.rs b/components/raft_log_engine/src/engine.rs index 838fe461f4b..92d7a4f7353 100644 --- a/components/raft_log_engine/src/engine.rs +++ 
b/components/raft_log_engine/src/engine.rs @@ -472,18 +472,21 @@ impl RaftLogBatchTrait for RaftLogBatch { let key = encode_flushed_key(cf, tablet_index); let mut value = vec![0; 8]; NumberCodec::encode_u64(&mut value, apply_index); - self.0.put(raft_group_id, key.to_vec(), value); - Ok(()) + self.0 + .put(raft_group_id, key.to_vec(), value) + .map_err(transfer_error) } fn put_dirty_mark(&mut self, raft_group_id: u64, tablet_index: u64, dirty: bool) -> Result<()> { let key = encode_key(DIRTY_MARK_KEY, tablet_index); if dirty { - self.0.put(raft_group_id, key.to_vec(), vec![]); + self.0 + .put(raft_group_id, key.to_vec(), vec![]) + .map_err(transfer_error) } else { self.0.delete(raft_group_id, key.to_vec()); + Ok(()) } - Ok(()) } fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index e7c32e742ec..7f656e29210 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -535,11 +535,13 @@ impl Peer { } if !self.serving() { self.start_destroy(ctx, &mut write_task); - ctx.coprocessor_host.on_region_changed( - self.region(), - RegionChangeEvent::Destroy, - self.raft_group().raft.state, - ); + if self.persisted_index() != 0 { + ctx.coprocessor_host.on_region_changed( + self.region(), + RegionChangeEvent::Destroy, + self.raft_group().raft.state, + ); + } } // Ready number should increase monotonically. 
assert!(self.async_writer.known_largest_number() < ready.number()); diff --git a/src/config/mod.rs b/src/config/mod.rs index 3274d5442df..38d69f1ab29 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -1264,7 +1264,7 @@ impl DbConfig { } } - pub fn build_opt(&self, shared: &DbResources) -> RocksDbOptions { + pub fn build_opt(&self, shared: &DbResources, for_engine: EngineType) -> RocksDbOptions { let mut opts = RocksDbOptions::default(); opts.set_wal_recovery_mode(self.wal_recovery_mode); if !self.wal_dir.is_empty() { @@ -1306,7 +1306,9 @@ impl DbConfig { if let Some(b) = self.paranoid_checks { opts.set_paranoid_checks(b); } - opts.set_info_log(RocksdbLogger::default()); + if for_engine == EngineType::RaftKv { + opts.set_info_log(RocksdbLogger::default()); + } opts.set_info_log_level(self.info_log_level.into()); if self.titan.enabled { opts.set_titandb_options(&self.titan.build_opts()); @@ -4424,9 +4426,10 @@ mod tests { fn test_rocks_rate_limit_zero() { let mut tikv_cfg = TikvConfig::default(); tikv_cfg.rocksdb.rate_bytes_per_sec = ReadableSize(0); + let resource = tikv_cfg.rocksdb.build_resources(Arc::new(Env::default())); tikv_cfg .rocksdb - .build_opt(&tikv_cfg.rocksdb.build_resources(Arc::new(Env::default()))); + .build_opt(&resource, tikv_cfg.storage.engine); } #[test] @@ -4587,12 +4590,10 @@ mod tests { Arc, ) { assert_eq!(F::TAG, cfg.storage.api_version()); + let resource = cfg.rocksdb.build_resources(Arc::default()); let engine = RocksDBEngine::new( &cfg.storage.data_dir, - Some( - cfg.rocksdb - .build_opt(&cfg.rocksdb.build_resources(Arc::new(Env::default()))), - ), + Some(cfg.rocksdb.build_opt(&resource, cfg.storage.engine)), cfg.rocksdb.build_cf_opts( &cfg.rocksdb .build_cf_resources(cfg.storage.block_cache.build_shared_cache()), diff --git a/src/server/engine_factory.rs b/src/server/engine_factory.rs index 91b5178f8a0..ff06e41cc57 100644 --- a/src/server/engine_factory.rs +++ b/src/server/engine_factory.rs @@ -6,6 +6,7 @@ use engine_rocks::{ 
raw::{Cache, Env}, CompactedEventSender, CompactionListener, FlowListener, RocksCfOptions, RocksCompactionJobInfo, RocksDbOptions, RocksEngine, RocksEventListener, RocksPersistenceListener, RocksStatistics, + TabletLogger, }; use engine_traits::{ CompactionJobInfo, MiscExt, PersistenceListener, Result, StateStorage, TabletContext, @@ -134,12 +135,12 @@ impl KvEngineFactory { self.inner.db_resources.statistics.clone() } - fn db_opts(&self) -> RocksDbOptions { + fn db_opts(&self, for_engine: EngineType) -> RocksDbOptions { // Create kv engine. let mut db_opts = self .inner .rocksdb_config - .build_opt(&self.inner.db_resources); + .build_opt(&self.inner.db_resources, for_engine); if !self.inner.lite { db_opts.add_event_listener(RocksEventListener::new( "kv", @@ -170,7 +171,7 @@ impl KvEngineFactory { /// It will always create in path/DEFAULT_DB_SUB_DIR. pub fn create_shared_db(&self, path: impl AsRef) -> Result { let path = path.as_ref(); - let mut db_opts = self.db_opts(); + let mut db_opts = self.db_opts(EngineType::RaftKv); let cf_opts = self.cf_opts(EngineType::RaftKv); if let Some(listener) = &self.inner.flow_listener { db_opts.add_event_listener(listener.clone()); @@ -187,7 +188,9 @@ impl KvEngineFactory { impl TabletFactory for KvEngineFactory { fn open_tablet(&self, ctx: TabletContext, path: &Path) -> Result { - let mut db_opts = self.db_opts(); + let mut db_opts = self.db_opts(EngineType::RaftKv2); + let tablet_name = path.file_name().unwrap().to_str().unwrap().to_string(); + db_opts.set_info_log(TabletLogger::new(tablet_name)); let cf_opts = self.cf_opts(EngineType::RaftKv2); if let Some(listener) = &self.inner.flow_listener && let Some(suffix) = ctx.suffix { db_opts.add_event_listener(listener.clone_with(ctx.id, suffix)); @@ -215,7 +218,7 @@ impl TabletFactory for KvEngineFactory { fn destroy_tablet(&self, ctx: TabletContext, path: &Path) -> Result<()> { info!("destroy tablet"; "path" => %path.display(), "id" => ctx.id, "suffix" => ?ctx.suffix); // Create 
kv engine. - let _db_opts = self.db_opts(); + let _db_opts = self.db_opts(EngineType::RaftKv2); let _cf_opts = self.cf_opts(EngineType::RaftKv2); // TODOTODO: call rust-rocks or tirocks to destroy_engine; // engine_rocks::util::destroy_engine( diff --git a/tests/integrations/storage/test_titan.rs b/tests/integrations/storage/test_titan.rs index 452bcc89238..dc0a85bc9c2 100644 --- a/tests/integrations/storage/test_titan.rs +++ b/tests/integrations/storage/test_titan.rs @@ -159,9 +159,8 @@ fn test_delete_files_in_range_for_titan() { cfg.rocksdb.defaultcf.titan.min_gc_batch_size = ReadableSize(0); cfg.rocksdb.defaultcf.titan.discardable_ratio = 0.4; cfg.rocksdb.defaultcf.titan.min_blob_size = ReadableSize(0); - let kv_db_opts = cfg - .rocksdb - .build_opt(&cfg.rocksdb.build_resources(Default::default())); + let resource = cfg.rocksdb.build_resources(Default::default()); + let kv_db_opts = cfg.rocksdb.build_opt(&resource, cfg.storage.engine); let kv_cfs_opts = cfg.rocksdb.build_cf_opts( &cfg.rocksdb.build_cf_resources(cache), None, From c8c1ca8b8376d7f29c05cd1cf08b469ddbc4939c Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Fri, 3 Feb 2023 15:43:55 +0800 Subject: [PATCH 112/115] raftstore: Observe when receive raft message (#14043) ref tikv/tikv#13855 Introduce observers when receive raft message. 
Signed-off-by: CalvinNeo --- components/cdc/src/observer.rs | 2 ++ .../raftstore-v2/src/operation/ready/mod.rs | 1 + .../raftstore/src/coprocessor/dispatcher.rs | 35 +++++++++++++++++++ components/raftstore/src/coprocessor/mod.rs | 15 ++++++-- components/raftstore/src/store/fsm/peer.rs | 3 ++ components/raftstore/src/store/fsm/store.rs | 3 ++ components/raftstore/src/store/peer.rs | 1 + components/raftstore/src/store/snap.rs | 16 ++++++--- 8 files changed, 70 insertions(+), 6 deletions(-) diff --git a/components/cdc/src/observer.rs b/components/cdc/src/observer.rs index 696bc6341ee..aac2842e404 100644 --- a/components/cdc/src/observer.rs +++ b/components/cdc/src/observer.rs @@ -273,6 +273,7 @@ mod tests { prev_lead_transferee: raft::INVALID_ID, vote: raft::INVALID_ID, initialized: true, + peer_id: raft::INVALID_ID, }, ); match rx.recv_timeout(Duration::from_millis(10)).unwrap().unwrap() { @@ -301,6 +302,7 @@ mod tests { prev_lead_transferee: 3, vote: 3, initialized: true, + peer_id: raft::INVALID_ID, }, ); match rx.recv_timeout(Duration::from_millis(10)).unwrap().unwrap() { diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 7f656e29210..03dce74d4e7 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -768,6 +768,7 @@ impl Peer { prev_lead_transferee: target, vote: self.raft_group().raft.vote, initialized: self.storage().is_initialized(), + peer_id: self.peer().get_id(), }, ); self.proposal_control_mut().maybe_update_term(term); diff --git a/components/raftstore/src/coprocessor/dispatcher.rs b/components/raftstore/src/coprocessor/dispatcher.rs index 794a46b8e3a..0e45ef1d09d 100644 --- a/components/raftstore/src/coprocessor/dispatcher.rs +++ b/components/raftstore/src/coprocessor/dispatcher.rs @@ -8,6 +8,7 @@ use kvproto::{ metapb::{Region, RegionEpoch}, pdpb::CheckPolicy, raft_cmdpb::{ComputeHashRequest, RaftCmdRequest}, + 
raft_serverpb::RaftMessage, }; use protobuf::Message; use raft::eraftpb; @@ -278,6 +279,7 @@ impl_box_observer_g!( ConsistencyCheckObserver, WrappedConsistencyCheckObserver ); +impl_box_observer!(BoxMessageObserver, MessageObserver, WrappedMessageObserver); /// Registry contains all registered coprocessors. #[derive(Clone)] @@ -296,6 +298,7 @@ where read_index_observers: Vec>, pd_task_observers: Vec>, update_safe_ts_observers: Vec>, + message_observers: Vec>, // TODO: add endpoint } @@ -313,6 +316,7 @@ impl Default for Registry { read_index_observers: Default::default(), pd_task_observers: Default::default(), update_safe_ts_observers: Default::default(), + message_observers: Default::default(), } } } @@ -381,6 +385,10 @@ impl Registry { pub fn register_update_safe_ts_observer(&mut self, priority: u32, qo: BoxUpdateSafeTsObserver) { push!(priority, qo, self.update_safe_ts_observers); } + + pub fn register_message_observer(&mut self, priority: u32, qo: BoxMessageObserver) { + push!(priority, qo, self.message_observers); + } } /// A macro that loops over all observers and returns early when error is found @@ -780,6 +788,17 @@ impl CoprocessorHost { true } + /// Returns false if the message should not be stepped later. + pub fn on_raft_message(&self, msg: &RaftMessage) -> bool { + for observer in &self.registry.message_observers { + let observer = observer.observer.inner(); + if !observer.on_raft_message(msg) { + return false; + } + } + true + } + pub fn on_flush_applied_cmd_batch( &self, max_level: ObserveLevel, @@ -890,6 +909,7 @@ mod tests { OnUpdateSafeTs = 23, PrePersist = 24, PreWriteApplyState = 25, + OnRaftMessage = 26, } impl Coprocessor for TestCoprocessor {} @@ -1132,6 +1152,14 @@ mod tests { } } + impl MessageObserver for TestCoprocessor { + fn on_raft_message(&self, _: &RaftMessage) -> bool { + self.called + .fetch_add(ObserverIndex::OnRaftMessage as usize, Ordering::SeqCst); + true + } + } + macro_rules! 
assert_all { ($target:expr, $expect:expr) => {{ for (c, e) in ($target).iter().zip($expect) { @@ -1168,6 +1196,8 @@ mod tests { .register_cmd_observer(1, BoxCmdObserver::new(ob.clone())); host.registry .register_update_safe_ts_observer(1, BoxUpdateSafeTsObserver::new(ob.clone())); + host.registry + .register_message_observer(1, BoxMessageObserver::new(ob.clone())); let mut index: usize = 0; let region = Region::default(); @@ -1282,6 +1312,11 @@ mod tests { host.pre_write_apply_state(®ion); index += ObserverIndex::PreWriteApplyState as usize; assert_all!([&ob.called], &[index]); + + let msg = RaftMessage::default(); + host.on_raft_message(&msg); + index += ObserverIndex::OnRaftMessage as usize; + assert_all!([&ob.called], &[index]); } #[test] diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index 73110660856..98b045dbed8 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -26,14 +26,16 @@ mod metrics; pub mod region_info_accessor; mod split_check; pub mod split_observer; +use kvproto::raft_serverpb::RaftMessage; pub use self::{ config::{Config, ConsistencyCheckMethod}, consistency_check::{ConsistencyCheckObserver, Raw as RawConsistencyCheckObserver}, dispatcher::{ BoxAdminObserver, BoxApplySnapshotObserver, BoxCmdObserver, BoxConsistencyCheckObserver, - BoxPdTaskObserver, BoxQueryObserver, BoxRegionChangeObserver, BoxRoleObserver, - BoxSplitCheckObserver, BoxUpdateSafeTsObserver, CoprocessorHost, Registry, StoreHandle, + BoxMessageObserver, BoxPdTaskObserver, BoxQueryObserver, BoxRegionChangeObserver, + BoxRoleObserver, BoxSplitCheckObserver, BoxUpdateSafeTsObserver, CoprocessorHost, Registry, + StoreHandle, }, error::{Error, Result}, region_info_accessor::{ @@ -269,6 +271,7 @@ pub struct RoleChange { /// Which peer is voted by itself. 
pub vote: u64, pub initialized: bool, + pub peer_id: u64, } impl RoleChange { @@ -280,6 +283,7 @@ impl RoleChange { prev_lead_transferee: raft::INVALID_ID, vote: raft::INVALID_ID, initialized: true, + peer_id: raft::INVALID_ID, } } } @@ -334,6 +338,13 @@ pub trait RegionChangeObserver: Coprocessor { } } +pub trait MessageObserver: Coprocessor { + /// Returns false if the message should not be stepped later. + fn on_raft_message(&self, _: &RaftMessage) -> bool { + true + } +} + #[derive(Clone, Debug, Default)] pub struct Cmd { pub index: u64, diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index a8232fd8322..75da7d497e4 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -610,6 +610,9 @@ where for m in msgs.drain(..) { match m { PeerMsg::RaftMessage(msg) => { + if !self.ctx.coprocessor_host.on_raft_message(&msg.msg) { + continue; + } if let Err(e) = self.on_raft_message(msg) { error!(%e; "handle raft message err"; diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 26f2983998d..85631bebe09 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -753,6 +753,9 @@ impl<'a, EK: KvEngine + 'static, ER: RaftEngine + 'static, T: Transport> match m { StoreMsg::Tick(tick) => self.on_tick(tick), StoreMsg::RaftMessage(msg) => { + if !self.ctx.coprocessor_host.on_raft_message(&msg.msg) { + continue; + } if let Err(e) = self.on_raft_message(msg) { if matches!(&e, Error::RegionNotRegistered { .. 
}) { // This may happen in normal cases when add-peer runs slowly diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 44701fbf705..a6010a6761f 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -2335,6 +2335,7 @@ where prev_lead_transferee: self.lead_transferee, vote: self.raft_group.raft.vote, initialized: self.is_initialized(), + peer_id: self.peer.get_id(), }, ); self.cmd_epoch_checker.maybe_update_term(self.term()); diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index a9ef7df8c62..358ec716195 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -207,7 +207,9 @@ fn retry_delete_snapshot(mgr: &SnapManagerCore, key: &SnapKey, snap: &Snapshot) false } -fn gen_snapshot_meta(cf_files: &[CfFile], for_balance: bool) -> RaftStoreResult { +// Create a SnapshotMeta that can be later put into RaftSnapshotData or written +// into file. +pub fn gen_snapshot_meta(cf_files: &[CfFile], for_balance: bool) -> RaftStoreResult { let mut meta = Vec::with_capacity(cf_files.len()); for cf_file in cf_files { if !SNAPSHOT_CFS.iter().any(|cf| cf_file.cf == *cf) { @@ -663,7 +665,8 @@ impl Snapshot { Ok(snapshot_meta) } - fn set_snapshot_meta(&mut self, snapshot_meta: SnapshotMeta) -> RaftStoreResult<()> { + // Validate and set SnapshotMeta of this Snapshot. + pub fn set_snapshot_meta(&mut self, snapshot_meta: SnapshotMeta) -> RaftStoreResult<()> { let mut cf_file_count_from_meta: Vec = vec![]; let mut file_count = 0; let mut current_cf = ""; @@ -812,8 +815,9 @@ impl Snapshot { } } - // Only called in `do_build`. - fn save_meta_file(&mut self) -> RaftStoreResult<()> { + // Save `SnapshotMeta` to file. + // Used in `do_build` and by external crates. 
+ pub fn save_meta_file(&mut self) -> RaftStoreResult<()> { let v = box_try!(self.meta_file.meta.as_ref().unwrap().write_to_bytes()); if let Some(mut f) = self.meta_file.file.take() { // `meta_file` could be None for this case: in `init_for_building` the snapshot @@ -1125,6 +1129,10 @@ impl Snapshot { file_system::metadata(&self.meta_file.path) } + pub fn meta_path(&self) -> &PathBuf { + &self.meta_file.path + } + pub fn total_size(&self) -> u64 { self.cf_files .iter() From 31a0fed2549cd984bb9a089144d92bb670a5abb8 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Fri, 3 Feb 2023 16:10:30 +0800 Subject: [PATCH 113/115] adapt to tikv master Signed-off-by: CalvinNeo --- Cargo.lock | 664 +++++++++++++++------- new-mock-engine-store/src/mock_cluster.rs | 7 +- 2 files changed, 474 insertions(+), 197 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 633194d9323..f181392700a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -36,7 +36,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "59206260f98d163b3ca42fb29fe551dbcda1d43cf70a244066b2a0666a8fb2a9" dependencies = [ "cc", - "clap 2.33.0", + "clap", "rustc_version 0.2.3", "xdg", ] @@ -441,8 +441,7 @@ dependencies = [ [[package]] name = "backtrace" version = "0.3.61" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7a905d892734eea339e896738c14b9afce22b5318f64b951e70bf3844419b01" +source = "git+https://github.com/hehechen/backtrace-rs?branch=v0.3.61#d0aeebbea2298174e4c6edd3d1e54bda0e6624e4" dependencies = [ "addr2line", "cc", @@ -623,14 +622,18 @@ dependencies = [ "bitflags", "cexpr 0.4.0", "clang-sys", + "clap", + "env_logger 0.8.4", "lazy_static", "lazycell", + "log", "peeking_take_while", "proc-macro2", "quote", "regex", "rustc-hash", "shlex 0.1.1", + "which 3.1.1", ] [[package]] @@ -642,8 +645,8 @@ dependencies = [ "bitflags", "cexpr 0.6.0", "clang-sys", - "clap 2.33.0", - "env_logger", + "clap", + "env_logger 0.9.0", "lazy_static", "lazycell", "log", @@ -653,7 
+656,7 @@ dependencies = [ "regex", "rustc-hash", "shlex 1.1.0", - "which", + "which 4.2.4", ] [[package]] @@ -948,41 +951,11 @@ dependencies = [ "atty", "bitflags", "strsim 0.8.0", - "textwrap 0.11.0", + "textwrap", "unicode-width", "vec_map", ] -[[package]] -name = "clap" -version = "3.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8c93436c21e4698bacadf42917db28b23017027a4deccb35dbe47a7e7840123" -dependencies = [ - "atty", - "bitflags", - "clap_derive", - "indexmap", - "lazy_static", - "os_str_bytes", - "strsim 0.10.0", - "termcolor", - "textwrap 0.15.0", -] - -[[package]] -name = "clap_derive" -version = "3.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da95d038ede1a964ce99f49cbe27a7fb538d1da595e4b4f70b8c8f338d17bf16" -dependencies = [ - "heck 0.4.0", - "proc-macro-error", - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "cloud" version = "0.0.1" @@ -1086,6 +1059,15 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea221b5284a47e40033bf9b66f35f984ec0ea2931eb03505246cd27a963f981b" +[[package]] +name = "cpp_demangle" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eeaa953eaad386a53111e47172c2fedba671e5684c8dd601a5f474f4f118710f" +dependencies = [ + "cfg-if 1.0.0", +] + [[package]] name = "cpu-time" version = "1.0.0" @@ -1125,10 +1107,10 @@ checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" dependencies = [ "atty", "cast", - "clap 2.33.0", + "clap", "criterion-plot", "csv", - "itertools", + "itertools 0.10.0", "lazy_static", "num-traits", "oorandom", @@ -1170,7 +1152,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" dependencies = [ "cast", - "itertools", + "itertools 0.10.0", ] [[package]] @@ -1587,6 +1569,71 @@ dependencies = [ "tikv_util", ] +[[package]] 
+name = "engine_store_ffi" +version = "0.0.1" +dependencies = [ + "batch-system", + "bitflags", + "byteorder", + "bytes", + "collections", + "crossbeam", + "derivative", + "encryption", + "encryption_export", + "engine_panic", + "engine_rocks", + "engine_test", + "engine_tiflash", + "engine_traits", + "error_code", + "fail", + "file_system", + "fs2", + "futures 0.3.15", + "futures-util", + "getset", + "grpcio-health", + "into_other", + "itertools 0.10.0", + "keys", + "kvproto", + "lazy_static", + "log", + "log_wrappers", + "online_config", + "ordered-float", + "panic_hook", + "parking_lot 0.12.1", + "pd_client", + "prometheus", + "prometheus-static-metric", + "protobuf", + "raft", + "raft-proto", + "raftstore", + "rand 0.8.5", + "serde", + "serde_derive", + "serde_with", + "slog", + "slog-global", + "smallvec", + "sst_importer", + "tempfile", + "test_sst_importer", + "thiserror", + "tikv_alloc", + "tikv_util", + "time", + "tokio", + "tokio-timer", + "tracker", + "uuid 0.8.2", + "yatp", +] + [[package]] name = "engine_test" version = "0.0.1" @@ -1603,6 +1650,48 @@ dependencies = [ "tikv_util", ] +[[package]] +name = "engine_tiflash" +version = "0.0.1" +dependencies = [ + "api_version", + "case_macros", + "collections", + "derive_more", + "encryption", + "engine_rocks", + "engine_traits", + "fail", + "file_system", + "keys", + "kvproto", + "lazy_static", + "libc 0.2.139", + "log_wrappers", + "num_cpus", + "online_config", + "prometheus", + "prometheus-static-metric", + "protobuf", + "raft", + "rand 0.8.5", + "regex", + "rocksdb", + "serde", + "serde_derive", + "slog", + "slog-global", + "slog_derive", + "tempfile", + "tikv_alloc", + "tikv_util", + "time", + "toml", + "tracker", + "txn_types", + "yatp", +] + [[package]] name = "engine_traits" version = "0.0.1" @@ -1652,6 +1741,19 @@ dependencies = [ "syn", ] +[[package]] +name = "env_logger" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3" +dependencies = [ + "atty", + "humantime", + "log", + "regex", + "termcolor", +] + [[package]] name = "env_logger" version = "0.9.0" @@ -1706,7 +1808,7 @@ dependencies = [ "hyper", "hyper-openssl", "openssl", - "prost", + "prost 0.11.2", "tokio", "tokio-stream", "tonic", @@ -2209,6 +2311,15 @@ dependencies = [ "url", ] +[[package]] +name = "gen-proxy-ffi" +version = "0.1.0" +dependencies = [ + "bindgen 0.57.0", + "clap", + "walkdir", +] + [[package]] name = "generic-array" version = "0.14.4" @@ -2654,6 +2765,15 @@ dependencies = [ "serde", ] +[[package]] +name = "itertools" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.10.0" @@ -3240,6 +3360,80 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "new-mock-engine-store" +version = "0.0.1" +dependencies = [ + "api_version", + "causal_ts", + "collections", + "concurrency_manager", + "crossbeam", + "encryption", + "encryption_export", + "engine_rocks", + "engine_store_ffi", + "engine_test", + "engine_tiflash", + "engine_traits", + "fail", + "file_system", + "futures 0.3.15", + "grpcio", + "grpcio-health", + "keys", + "kvproto", + "lazy_static", + "log_wrappers", + "pd_client", + "protobuf", + "proxy_server", + "raft", + "raftstore", + "rand 0.8.5", + "resolved_ts", + "resource_control", + "resource_metering", + "security", + "slog", + "slog-global", + "tempfile", + "test_pd_client", + "test_raftstore", + "test_util", + "tikv", + "tikv_util", + "tokio", + "tokio-timer", + "txn_types", +] + +[[package]] +name = "nix" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "becb657d662f1cd2ef38c7ad480ec6b8cf9e96b27adb543e594f9cf0f2e6065c" +dependencies = [ + "bitflags", + "cc", + "cfg-if 0.1.10", + "libc 0.2.139", 
+ "void", +] + +[[package]] +name = "nix" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f3790c00a0150112de0f4cd161e3d7fc4b2d8a5542ffc35f099a2562aecb35c" +dependencies = [ + "bitflags", + "cc", + "cfg-if 1.0.0", + "libc 0.2.139", + "memoffset 0.6.4", +] + [[package]] name = "nix" version = "0.24.1" @@ -3606,15 +3800,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "os_str_bytes" -version = "6.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" -dependencies = [ - "memchr", -] - [[package]] name = "page_size" version = "0.4.2" @@ -4030,8 +4215,7 @@ dependencies = [ [[package]] name = "prometheus" version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7f64969ffd5dd8f39bd57a68ac53c163a095ed9d0fb707146da1b27025a3504" +source = "git+https://github.com/solotzg/rust-prometheus.git?rev=b4fe98a06a58d29f9b9987a0d7186f6ed5230193#b4fe98a06a58d29f9b9987a0d7186f6ed5230193" dependencies = [ "cfg-if 1.0.0", "fnv", @@ -4056,6 +4240,16 @@ dependencies = [ "syn", ] +[[package]] +name = "prost" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e6984d2f1a23009bd270b8bb56d0926810a3d483f59c987d77969e9d8e840b2" +dependencies = [ + "bytes", + "prost-derive 0.7.0", +] + [[package]] name = "prost" version = "0.11.2" @@ -4063,7 +4257,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0841812012b2d4a6145fae9a6af1534873c32aa67fff26bd09f8fa42c83f95a" dependencies = [ "bytes", - "prost-derive", + "prost-derive 0.11.2", ] [[package]] @@ -4074,18 +4268,31 @@ checksum = "1d8b442418ea0822409d9e7d047cbf1e7e9e1760b172bf9982cf29d517c93511" dependencies = [ "bytes", "heck 0.4.0", - "itertools", + "itertools 0.10.0", "lazy_static", "log", "multimap", "petgraph", "prettyplease", - "prost", + "prost 0.11.2", "prost-types", 
"regex", "syn", "tempfile", - "which", + "which 4.2.4", +] + +[[package]] +name = "prost-derive" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "169a15f3008ecb5160cba7d37bcd690a7601b6d30cfb87a117d45e59d52af5d4" +dependencies = [ + "anyhow", + "itertools 0.9.0", + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -4095,7 +4302,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "164ae68b6587001ca506d3bf7f1000bfa248d0e1217b618108fba4ec1d0cc306" dependencies = [ "anyhow", - "itertools", + "itertools 0.10.0", "proc-macro2", "quote", "syn", @@ -4108,7 +4315,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "747761bc3dc48f9a34553bf65605cf6cb6288ba219f3450b4275dbd81539551a" dependencies = [ "bytes", - "prost", + "prost 0.11.2", ] [[package]] @@ -4166,10 +4373,170 @@ dependencies = [ ] [[package]] -name = "quick-xml" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8533f14c8382aaad0d592c812ac3b826162128b65662331e1127b45c3d18536b" +name = "proxy_server" +version = "0.0.1" +dependencies = [ + "api_version", + "async-stream 0.2.0", + "backup", + "backup-stream", + "causal_ts", + "chrono", + "clap", + "collections", + "concurrency_manager", + "crossbeam", + "encryption", + "encryption_export", + "engine_rocks", + "engine_rocks_helper", + "engine_store_ffi", + "engine_traits", + "error_code", + "fail", + "file_system", + "fs2", + "futures 0.3.15", + "grpcio", + "grpcio-health", + "hex 0.4.2", + "hyper", + "itertools 0.10.0", + "keys", + "kvproto", + "lazy_static", + "libc 0.2.139", + "log", + "log_wrappers", + "mime", + "nix 0.23.2", + "online_config", + "openssl", + "pd_client", + "pin-project", + "pprof", + "prometheus", + "protobuf", + "raft", + "raft_log_engine", + "raftstore", + "rand 0.8.5", + "regex", + "resolved_ts", + "resource_control", + "resource_metering", + "security", + "serde", + "serde_derive", + 
"serde_ignored", + "serde_json", + "serde_with", + "server", + "signal", + "slog", + "slog-global", + "tempfile", + "tikv", + "tikv_alloc", + "tikv_util", + "tokio", + "tokio-openssl", + "toml", + "txn_types", + "url", + "yatp", +] + +[[package]] +name = "proxy_tests" +version = "0.0.1" +dependencies = [ + "api_version", + "arrow", + "async-trait", + "batch-system", + "byteorder", + "causal_ts", + "cdc", + "clap", + "collections", + "concurrency_manager", + "crc64fast", + "criterion", + "criterion-cpu-time", + "criterion-perf-events", + "crossbeam", + "encryption", + "engine_rocks", + "engine_rocks_helper", + "engine_store_ffi", + "engine_test", + "engine_tiflash", + "engine_traits", + "error_code", + "external_storage_export", + "fail", + "file_system", + "futures 0.3.15", + "grpcio", + "grpcio-health", + "hyper", + "keys", + "kvproto", + "libc 0.2.139", + "log_wrappers", + "more-asserts", + "new-mock-engine-store", + "online_config", + "panic_hook", + "paste", + "pd_client", + "perfcnt", + "procinfo", + "profiler", + "protobuf", + "proxy_server", + "raft", + "raft_log_engine", + "raftstore", + "rand 0.8.5", + "rand_xorshift", + "resource_metering", + "security", + "serde_json", + "slog", + "slog-global", + "sst_importer", + "tempfile", + "test_backup", + "test_coprocessor", + "test_pd", + "test_pd_client", + "test_raftstore", + "test_sst_importer", + "test_storage", + "test_util", + "tidb_query_aggr", + "tidb_query_common", + "tidb_query_datatype", + "tidb_query_executors", + "tidb_query_expr", + "tikv", + "tikv_util", + "time", + "tipb", + "tipb_helper", + "tokio", + "toml", + "txn_types", + "uuid 0.8.2", +] + +[[package]] +name = "quick-xml" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8533f14c8382aaad0d592c812ac3b826162128b65662331e1127b45c3d18536b" dependencies = [ "memchr", ] @@ -4224,7 +4591,6 @@ dependencies = [ "prometheus-static-metric", "protobuf", "rayon", - "rhai", "scopeguard", "serde", 
"serde_repr", @@ -4232,16 +4598,6 @@ dependencies = [ "thiserror", ] -[[package]] -name = "raft-engine-ctl" -version = "0.3.0" -source = "git+https://github.com/tikv/raft-engine.git#33530112c3a4acaf8c50ca9d0470284109926296" -dependencies = [ - "clap 3.1.6", - "env_logger", - "raft-engine", -] - [[package]] name = "raft-proto" version = "0.7.0" @@ -4306,7 +4662,7 @@ dependencies = [ "getset", "grpcio-health", "into_other", - "itertools", + "itertools 0.10.0", "keys", "kvproto", "lazy_static", @@ -4348,6 +4704,13 @@ dependencies = [ "yatp", ] +[[package]] +name = "raftstore-proxy" +version = "0.0.1" +dependencies = [ + "proxy_server", +] + [[package]] name = "raftstore-v2" version = "0.1.0" @@ -4749,32 +5112,6 @@ dependencies = [ "bytemuck", ] -[[package]] -name = "rhai" -version = "1.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f06953bb8b9e4307cb7ccc0d9d018e2ddd25a30d32831f631ce4fe8f17671f7" -dependencies = [ - "ahash", - "bitflags", - "instant", - "num-traits", - "rhai_codegen", - "smallvec", - "smartstring", -] - -[[package]] -name = "rhai_codegen" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75a39bc2aa9258b282ee5518dac493491a9c4c11a6d7361b9d2644c922fc6488" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "ring" version = "0.16.16" @@ -5232,7 +5569,7 @@ dependencies = [ "causal_ts", "cdc", "chrono", - "clap 2.33.0", + "clap", "collections", "concurrency_manager", "crossbeam", @@ -5305,6 +5642,16 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" +[[package]] +name = "signal" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "106428d9d96840ecdec5208c13ab8a4e28c38da1e0ccf2909fb44e41b992f897" +dependencies = [ + "libc 0.2.139", + "nix 0.11.1", +] + [[package]] name = "signal-hook" version 
= "0.3.14" @@ -5407,17 +5754,6 @@ version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" -[[package]] -name = "smartstring" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" -dependencies = [ - "autocfg", - "static_assertions", - "version_check 0.9.4", -] - [[package]] name = "snap_recovery" version = "0.1.0" @@ -5566,19 +5902,13 @@ version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "032c03039aae92b350aad2e3779c352e104d919cb192ba2fabbd7b831ce4f0f6" -[[package]] -name = "strsim" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" - [[package]] name = "structopt" version = "0.3.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "126d630294ec449fae0b16f964e35bf3c74f940da9dca17ee9b905f7b3112eb8" dependencies = [ - "clap 2.33.0", + "clap", "lazy_static", "structopt-derive", ] @@ -5663,6 +5993,7 @@ version = "10.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48808b846eef84e0ac06365dc620f028ae632355e5dcffc007bf1b2bf5eab17b" dependencies = [ + "cpp_demangle", "rustc-demangle", "symbolic-common", ] @@ -6076,12 +6407,6 @@ dependencies = [ "unicode-width", ] -[[package]] -name = "textwrap" -version = "0.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb" - [[package]] name = "thiserror" version = "1.0.30" @@ -6211,7 +6536,7 @@ dependencies = [ "collections", "fail", "futures 0.3.15", - "itertools", + "itertools 0.10.0", "kvproto", "log_wrappers", "match-template", @@ -6314,7 +6639,7 @@ dependencies = [ "hyper-openssl", "hyper-tls", "into_other", - 
"itertools", + "itertools 0.10.0", "keyed_priority_queue", "keys", "kvproto", @@ -6344,6 +6669,7 @@ dependencies = [ "procinfo", "prometheus", "prometheus-static-metric", + "prost 0.7.0", "protobuf", "raft", "raft_log_engine", @@ -6396,58 +6722,6 @@ dependencies = [ "zipf", ] -[[package]] -name = "tikv-ctl" -version = "0.0.1" -dependencies = [ - "backup", - "cc", - "cdc", - "chrono", - "clap 2.33.0", - "collections", - "concurrency_manager", - "crossbeam", - "encryption_export", - "engine_rocks", - "engine_traits", - "error_code", - "file_system", - "futures 0.3.15", - "gag", - "grpcio", - "hex 0.4.2", - "keys", - "kvproto", - "libc 0.2.139", - "log", - "log_wrappers", - "pd_client", - "prometheus", - "protobuf", - "raft", - "raft-engine-ctl", - "raft_log_engine", - "raftstore", - "rand 0.8.5", - "regex", - "security", - "serde_json", - "server", - "signal-hook", - "slog", - "slog-global", - "structopt", - "tempfile", - "tikv", - "tikv_alloc", - "tikv_util", - "time", - "tokio", - "toml", - "txn_types", -] - [[package]] name = "tikv-jemalloc-ctl" version = "0.5.0" @@ -6480,19 +6754,6 @@ dependencies = [ "tikv-jemalloc-sys", ] -[[package]] -name = "tikv-server" -version = "0.0.1" -dependencies = [ - "cc", - "clap 2.33.0", - "serde_json", - "server", - "tikv", - "time", - "toml", -] - [[package]] name = "tikv_alloc" version = "0.1.0" @@ -6791,8 +7052,8 @@ dependencies = [ "hyper-timeout", "percent-encoding", "pin-project", - "prost", - "prost-derive", + "prost 0.11.2", + "prost-derive 0.11.2", "tokio", "tokio-stream", "tokio-util", @@ -7106,6 +7367,12 @@ dependencies = [ "syn", ] +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" + [[package]] name = "walkdir" version = "2.3.1" @@ -7223,6 +7490,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "which" +version = "3.1.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d011071ae14a2f6671d0b74080ae0cd8ebf3a6f8c9589a2cd45f23126fe29724" +dependencies = [ + "libc 0.2.139", +] + [[package]] name = "which" version = "4.2.4" diff --git a/new-mock-engine-store/src/mock_cluster.rs b/new-mock-engine-store/src/mock_cluster.rs index d4b3c24ff4b..5d2e4c6ee23 100644 --- a/new-mock-engine-store/src/mock_cluster.rs +++ b/new-mock-engine-store/src/mock_cluster.rs @@ -522,9 +522,10 @@ pub fn create_tiflash_test_engine( let kv_path = dir.path().join(tikv::config::DEFAULT_ROCKSDB_SUB_DIR); let kv_path_str = kv_path.to_str().unwrap(); - let kv_db_opt = cfg - .rocksdb - .build_opt(&cfg.rocksdb.build_resources(env.clone())); + let kv_db_opt = cfg.rocksdb.build_opt( + &cfg.rocksdb.build_resources(env.clone()), + cfg.storage.engine, + ); let cache = cfg.storage.block_cache.build_shared_cache(); let raft_cfs_opt = cfg.raftdb.build_cf_opts(&cache); From b96048f88d8a97e218c887db95ec6d257d65efd1 Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Fri, 3 Feb 2023 18:11:59 +0800 Subject: [PATCH 114/115] [Cloud] Fix before merge master (#265) --- components/raftstore/src/store/snap.rs | 8 ----- engine_store_ffi/src/ffihub_impl.rs | 6 +--- engine_store_ffi/src/lib.rs | 2 +- engine_store_ffi/src/observer.rs | 29 ++++++++++--------- engine_store_ffi/src/ps_engine.rs | 17 +++++------ new-mock-engine-store/src/mock_cluster.rs | 3 +- .../src/mock_page_storage.rs | 12 ++++---- new-mock-engine-store/src/node.rs | 7 ++++- new-mock-engine-store/src/server.rs | 6 +++- proxy_server/src/run.rs | 6 +++- proxy_tests/proxy/fast_add_peer.rs | 4 ++- 11 files changed, 50 insertions(+), 50 deletions(-) diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 0c39288f939..e7f410382ea 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -1141,14 +1141,6 @@ impl Snapshot { self.cf_files.iter().map(|cf| cf.kv_count).sum() } - pub fn 
set_hold_tmp_files(&mut self, v: bool) { - self.hold_tmp_files = v; - } - - pub fn hold_tmp_files(&self) -> bool { - self.hold_tmp_files - } - pub fn save(&mut self) -> io::Result<()> { debug!( "saving to snapshot file"; diff --git a/engine_store_ffi/src/ffihub_impl.rs b/engine_store_ffi/src/ffihub_impl.rs index 0a08050b691..d6569756da7 100644 --- a/engine_store_ffi/src/ffihub_impl.rs +++ b/engine_store_ffi/src/ffihub_impl.rs @@ -80,11 +80,7 @@ impl engine_tiflash::FFIHubInner for TiFlashFFIHub { for i in 0..values.len { let value = unsafe { &*arr.offset(i as isize) }; if value.page_view.len != 0 { - f( - &value.key_view.to_slice().to_vec(), - &value.page_view.to_slice().to_vec(), - ) - .unwrap(); + f(value.key_view.to_slice(), value.page_view.to_slice()).unwrap(); } } } diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index 2d346638bb3..dcd0e097f7a 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -470,7 +470,7 @@ impl Drop for RawCppPtrArr { impl Drop for RawCppPtrCarr { fn drop(&mut self) { - if self.inner != std::ptr::null_mut() { + if !self.inner.is_null() { let helper = get_engine_store_server_helper(); helper.gc_raw_cpp_ptr_carr(self.inner as RawVoidPtr, self.type_, self.len); self.inner = std::ptr::null_mut(); diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index f6501fb6899..0bbf970240c 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -82,6 +82,11 @@ unsafe impl Sync for PrehandleTask {} // avoid being bypassed. const TIFLASH_OBSERVER_PRIORITY: u32 = 0; +pub struct PackedEnvs { + pub engine_store_cfg: crate::EngineStoreConfig, + pub pd_endpoints: Vec, +} + pub struct TiFlashObserver { pub store_id: u64, pub engine_store_server_helper: &'static EngineStoreServerHelper, @@ -95,7 +100,7 @@ pub struct TiFlashObserver { // TODO should we use a Mutex here? 
pub trans: Arc>, pub snap_mgr: Arc, - pub engine_store_cfg: crate::EngineStoreConfig, + pub packed_envs: Arc, } pub fn get_region_local_state( @@ -133,7 +138,7 @@ impl Clone for TiFlashObserver { pending_delete_ssts: self.pending_delete_ssts.clone(), trans: self.trans.clone(), snap_mgr: self.snap_mgr.clone(), - engine_store_cfg: self.engine_store_cfg.clone(), + packed_envs: self.packed_envs.clone(), } } } @@ -159,7 +164,7 @@ impl TiFlashObserver { // Returns whether we need to ignore this message and run fast path instead. pub fn maybe_fast_path(&self, msg: &RaftMessage) -> bool { - if !self.engine_store_cfg.enable_fast_add_peer { + if !self.packed_envs.engine_store_cfg.enable_fast_add_peer { // fast path not enabled return false; } @@ -204,12 +209,8 @@ impl TiFlashObserver { const FALLBACK_MILLIS: u128 = 1000 * 60 * 5; if elapsed >= TRACE_SLOW_MILLIS { let need_fallback = elapsed > FALLBACK_MILLIS; - let do_fallback = if need_fallback { - // TODO If snapshot is sent, we can't fallback? - true - } else { - false - }; + // TODO If snapshot is sent, we need fallback but can't do fallback? 
+ let do_fallback = need_fallback; info!("fast path: ongoing {}:{} {}, MsgAppend duplicated", self.store_id, region_id, new_peer_id; "to_peer_id" => msg.get_to_peer().get_id(), @@ -650,7 +651,7 @@ impl TiFlashObserver { snap_handle_pool_size: usize, trans: T, snap_mgr: SnapManager, - engine_store_cfg: crate::EngineStoreConfig, + packed_envs: PackedEnvs, ) -> Self { let engine_store_server_helper = gen_engine_store_server_helper(engine.engine_store_server_helper); @@ -671,7 +672,7 @@ impl TiFlashObserver { pending_delete_ssts: Arc::new(RwLock::new(vec![])), trans: Arc::new(Mutex::new(trans)), snap_mgr: Arc::new(snap_mgr), - engine_store_cfg, + packed_envs: Arc::new(packed_envs), } } @@ -1143,7 +1144,7 @@ impl RegionChangeObserver for TiFlashObs ); self.engine_store_server_helper .handle_destroy(ob_ctx.region().get_id()); - if self.engine_store_cfg.enable_fast_add_peer { + if self.packed_envs.engine_store_cfg.enable_fast_add_peer { self.get_cached_manager() .remove_cached_region_info(region_id); } @@ -1308,7 +1309,7 @@ impl ApplySnapshotObserver for TiFlashOb let mut should_skip = false; #[allow(clippy::collapsible_if)] - if self.engine_store_cfg.enable_fast_add_peer { + if self.packed_envs.engine_store_cfg.enable_fast_add_peer { if self.get_cached_manager().access_cached_region_info_mut( region_id, |info: MapEntry>| match info { @@ -1411,7 +1412,7 @@ impl ApplySnapshotObserver for TiFlashOb ); let mut should_skip = false; #[allow(clippy::collapsible_if)] - if self.engine_store_cfg.enable_fast_add_peer { + if self.packed_envs.engine_store_cfg.enable_fast_add_peer { if self.get_cached_manager().access_cached_region_info_mut( region_id, |info: MapEntry>| match info { diff --git a/engine_store_ffi/src/ps_engine.rs b/engine_store_ffi/src/ps_engine.rs index 6ce5d2fb17d..dae9f5a0d22 100644 --- a/engine_store_ffi/src/ps_engine.rs +++ b/engine_store_ffi/src/ps_engine.rs @@ -127,7 +127,7 @@ impl PSEngineWriteBatch { fn data_size(&self) -> usize { let helper = 
gen_engine_store_server_helper(self.engine_store_server_helper); - return helper.write_batch_size(self.raw_write_batch.ptr) as usize; + helper.write_batch_size(self.raw_write_batch.ptr) as usize } fn clear(&self) { @@ -219,7 +219,7 @@ impl RaftLogBatch for PSEngineWriteBatch { } } -#[derive(Clone)] +#[derive(Clone, Default)] pub struct PSEngine { pub engine_store_server_helper: isize, } @@ -292,13 +292,10 @@ impl PSEngine { let arr = values.inner as *mut PageAndCppStrWithView; for i in 0..values.len { let value = unsafe { &*arr.offset(i as isize) }; - if value.page_view.len != 0 { - if !f( - &value.key_view.to_slice().to_vec(), - &value.page_view.to_slice().to_vec(), - )? { - break; - } + if value.page_view.len != 0 + && !f(value.key_view.to_slice(), value.page_view.to_slice())? + { + break; } } Ok(()) @@ -376,7 +373,7 @@ impl RaftEngineReadOnly for PSEngine { Ok(total_size < max_size) })?; - return Ok(count); + Ok(count) } fn get_all_entries_to(&self, region_id: u64, buf: &mut Vec) -> Result<()> { diff --git a/new-mock-engine-store/src/mock_cluster.rs b/new-mock-engine-store/src/mock_cluster.rs index acac3e589f1..43d26a93c81 100644 --- a/new-mock-engine-store/src/mock_cluster.rs +++ b/new-mock-engine-store/src/mock_cluster.rs @@ -292,8 +292,7 @@ impl> Cluster { pub fn run_conf_change_no_start(&mut self) -> u64 { self.create_engines(); - let region_id = self.bootstrap_conf_change(); - region_id + self.bootstrap_conf_change() } /// We need to create FFIHelperSet while we create engine. 
diff --git a/new-mock-engine-store/src/mock_page_storage.rs b/new-mock-engine-store/src/mock_page_storage.rs index 68773cece7f..ef2c23813fb 100644 --- a/new-mock-engine-store/src/mock_page_storage.rs +++ b/new-mock-engine-store/src/mock_page_storage.rs @@ -39,10 +39,10 @@ pub struct MockPSUniversalPage { data: Vec, } -impl Into for BaseBuffView { - fn into(self) -> MockPSUniversalPage { +impl From for MockPSUniversalPage { + fn from(val: BaseBuffView) -> Self { MockPSUniversalPage { - data: self.to_slice().to_owned(), + data: val.to_slice().to_owned(), } } } @@ -115,13 +115,13 @@ pub unsafe extern "C" fn ffi_mockps_write_batch_size(wb: RawVoidPtr) -> u64 { pub unsafe extern "C" fn ffi_mockps_write_batch_is_empty(wb: RawVoidPtr) -> u8 { let wb: _ = <&mut MockPSWriteBatch as From>::from(wb); - if wb.data.is_empty() { 1 } else { 0 } + u8::from(wb.data.is_empty()) } pub unsafe extern "C" fn ffi_mockps_write_batch_merge(lwb: RawVoidPtr, rwb: RawVoidPtr) { let lwb: _ = <&mut MockPSWriteBatch as From>::from(lwb); let rwb: _ = <&mut MockPSWriteBatch as From>::from(rwb); - lwb.data.extend(rwb.data.drain(..)); + lwb.data.append(&mut rwb.data); } pub unsafe extern "C" fn ffi_mockps_write_batch_clear(wb: RawVoidPtr) { @@ -238,5 +238,5 @@ pub unsafe extern "C" fn ffi_mockps_ps_is_empty( .data .read() .unwrap(); - if guard.is_empty() { 1 } else { 0 } + u8::from(guard.is_empty()) } diff --git a/new-mock-engine-store/src/node.rs b/new-mock-engine-store/src/node.rs index 0a9a284a27a..23d17626e98 100644 --- a/new-mock-engine-store/src/node.rs +++ b/new-mock-engine-store/src/node.rs @@ -82,6 +82,7 @@ impl Default for ChannelTransport { impl Transport for ChannelTransport { #[allow(clippy::significant_drop_in_scrutinee)] + #[allow(clippy::redundant_closure_call)] fn send(&mut self, msg: RaftMessage) -> Result<()> { let mut from_store = msg.get_from_peer().get_store_id(); let to_store = msg.get_to_peer().get_store_id(); @@ -331,6 +332,10 @@ impl Simulator for NodeCluster { f(node_id, 
&mut coprocessor_host); } + let packed_envs = engine_store_ffi::observer::PackedEnvs { + engine_store_cfg: cfg.proxy_cfg.engine_store.clone(), + pd_endpoints: cfg.pd.endpoints.clone(), + }; let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( node_id, engines.kv.clone(), @@ -339,7 +344,7 @@ impl Simulator for NodeCluster { cfg.proxy_cfg.raft_store.snap_handle_pool_size, simulate_trans.clone(), snap_mgr.clone(), - cfg.proxy_cfg.engine_store.clone(), + packed_envs, ); tiflash_ob.register_to(&mut coprocessor_host); diff --git a/new-mock-engine-store/src/server.rs b/new-mock-engine-store/src/server.rs index 32ebea7e328..6534f2074b9 100644 --- a/new-mock-engine-store/src/server.rs +++ b/new-mock-engine-store/src/server.rs @@ -535,6 +535,10 @@ impl ServerCluster { let max_grpc_thread_count = cfg.server.grpc_concurrency; let server_cfg = Arc::new(VersionTrack::new(cfg.server.clone())); + let packed_envs = engine_store_ffi::observer::PackedEnvs { + engine_store_cfg: cfg.proxy_cfg.engine_store.clone(), + pd_endpoints: cfg.pd.endpoints.clone(), + }; let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( node_id, engines.kv.clone(), @@ -543,7 +547,7 @@ impl ServerCluster { cfg.proxy_cfg.raft_store.snap_handle_pool_size, simulate_trans.clone(), snap_mgr.clone(), - cfg.proxy_cfg.engine_store.clone(), + packed_envs, ); tiflash_ob.register_to(&mut coprocessor_host); diff --git a/proxy_server/src/run.rs b/proxy_server/src/run.rs index 7848cdffab4..ea7eb7e2916 100644 --- a/proxy_server/src/run.rs +++ b/proxy_server/src/run.rs @@ -1229,6 +1229,10 @@ impl TiKvServer { ) .unwrap_or_else(|e| fatal!("failed to create server: {}", e)); + let packed_envs = engine_store_ffi::observer::PackedEnvs { + engine_store_cfg: self.proxy_config.engine_store.clone(), + pd_endpoints: self.config.pd.endpoints.clone(), + }; let tiflash_ob = engine_store_ffi::observer::TiFlashObserver::new( node.id(), self.engines.as_ref().unwrap().engines.kv.clone(), @@ -1237,7 +1241,7 @@ impl 
TiKvServer { self.proxy_config.raft_store.snap_handle_pool_size, server.transport().clone(), snap_mgr.clone(), - self.proxy_config.engine_store.clone(), + packed_envs, ); tiflash_ob.register_to(self.coprocessor_host.as_mut().unwrap()); diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 54a6b6e1e16..5b1a9335775 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -324,7 +324,9 @@ fn test_fast_add_peer_from_delayed_learner_blocked() { #[test] fn test_fast_add_peer_from_learner_blocked_paused_build() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + fail::cfg("apply_on_handle_snapshot_sync", "return(true)").unwrap(); simple_fast_add_peer(SourceType::Learner, true, PauseType::Build); + fail::remove("apply_on_handle_snapshot_sync"); fail::remove("fallback_to_slow_path_not_allow"); } @@ -337,7 +339,7 @@ fn test_fast_add_peer_from_delayed_learner_blocked_paused_build() { // Delay when applying snapshot // This test is origianlly aimed to test multiple MsgSnapshot. -// However, +// However, we observed less repeated MsgAppend than in real cluster. 
#[test] fn test_fast_add_peer_from_learner_blocked_paused_apply() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); From 4f4fcbb51cffac02f48d05ca8e0e73a66c128127 Mon Sep 17 00:00:00 2001 From: CalvinNeo Date: Fri, 3 Feb 2023 18:45:23 +0800 Subject: [PATCH 115/115] merge tikv upstream/raftstore-proxy Signed-off-by: CalvinNeo --- Cargo.lock | 49 +++++++++++++++++++++++++ engine_store_ffi/src/lib.rs | 1 + engine_store_ffi/src/observer.rs | 19 +++++++--- engine_store_ffi/src/ps_engine.rs | 41 +++++++++++++++------ new-mock-engine-store/src/mock_store.rs | 3 +- proxy_server/src/run.rs | 3 +- proxy_tests/proxy/fast_add_peer.rs | 2 + 7 files changed, 97 insertions(+), 21 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f181392700a..aa7eb276e28 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -138,6 +138,12 @@ dependencies = [ "serde_json", ] +[[package]] +name = "assert-type-eq" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd49a41856ee21a0cfb2b1cfbfcca0f1d3e6c257c38939f0d6ecfaf177f2ea47" + [[package]] name = "async-channel" version = "1.6.1" @@ -1607,6 +1613,7 @@ dependencies = [ "panic_hook", "parking_lot 0.12.1", "pd_client", + "portable-atomic", "prometheus", "prometheus-static-metric", "protobuf", @@ -1670,6 +1677,7 @@ dependencies = [ "log_wrappers", "num_cpus", "online_config", + "portable-atomic", "prometheus", "prometheus-static-metric", "protobuf", @@ -2732,6 +2740,27 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "int-enum" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cff87d3cc4b79b4559e3c75068d64247284aceb6a038bd4bb38387f3f164476d" +dependencies = [ + "int-enum-impl", +] + +[[package]] +name = "int-enum-impl" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df1f2f068675add1a3fc77f5f5ab2e29290c841ee34d151abc007bce902e5d34" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + 
"quote", + "syn", +] + [[package]] name = "into_other" version = "0.0.1" @@ -3365,6 +3394,7 @@ name = "new-mock-engine-store" version = "0.0.1" dependencies = [ "api_version", + "assert-type-eq", "causal_ts", "collections", "concurrency_manager", @@ -3381,6 +3411,7 @@ dependencies = [ "futures 0.3.15", "grpcio", "grpcio-health", + "int-enum", "keys", "kvproto", "lazy_static", @@ -4093,6 +4124,12 @@ dependencies = [ "ws2_32-sys", ] +[[package]] +name = "portable-atomic" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26f6a7b87c2e435a3241addceeeff740ff8b7e76b74c13bf9acb17fa454ea00b" + [[package]] name = "pprof" version = "0.11.0" @@ -4132,6 +4169,17 @@ dependencies = [ "syn", ] +[[package]] +name = "proc-macro-crate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eda0fc3b0fb7c975631757e14d9049da17374063edb6ebbcbc54d880d4fe94e9" +dependencies = [ + "once_cell", + "thiserror", + "toml", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -4391,6 +4439,7 @@ dependencies = [ "engine_rocks", "engine_rocks_helper", "engine_store_ffi", + "engine_tiflash", "engine_traits", "error_code", "fail", diff --git a/engine_store_ffi/src/lib.rs b/engine_store_ffi/src/lib.rs index dcd0e097f7a..fb2ce037590 100644 --- a/engine_store_ffi/src/lib.rs +++ b/engine_store_ffi/src/lib.rs @@ -1,5 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
#![feature(drain_filter)] +#![feature(let_chains)] #[allow(dead_code)] pub mod interfaces; diff --git a/engine_store_ffi/src/observer.rs b/engine_store_ffi/src/observer.rs index 0bbf970240c..2885b89d62e 100644 --- a/engine_store_ffi/src/observer.rs +++ b/engine_store_ffi/src/observer.rs @@ -22,10 +22,11 @@ use raft::{eraftpb, eraftpb::MessageType, StateRole}; use raftstore::{ coprocessor::{ AdminObserver, ApplyCtxInfo, ApplySnapshotObserver, BoxAdminObserver, - BoxApplySnapshotObserver, BoxPdTaskObserver, BoxQueryObserver, BoxRegionChangeObserver, - BoxRoleObserver, BoxUpdateSafeTsObserver, Cmd, Coprocessor, CoprocessorHost, - ObserverContext, PdTaskObserver, QueryObserver, RegionChangeEvent, RegionChangeObserver, - RegionState, RoleChange, RoleObserver, StoreSizeInfo, UpdateSafeTsObserver, + BoxApplySnapshotObserver, BoxMessageObserver, BoxPdTaskObserver, BoxQueryObserver, + BoxRegionChangeObserver, BoxRoleObserver, BoxUpdateSafeTsObserver, Cmd, Coprocessor, + CoprocessorHost, MessageObserver, ObserverContext, PdTaskObserver, QueryObserver, + RegionChangeEvent, RegionChangeObserver, RegionState, RoleChange, RoleObserver, + StoreSizeInfo, UpdateSafeTsObserver, }, store::{ self, check_sst_for_ingestion, @@ -709,6 +710,10 @@ impl TiFlashObserver { TIFLASH_OBSERVER_PRIORITY, BoxRoleObserver::new(self.clone()), ); + coprocessor_host.registry.register_message_observer( + TIFLASH_OBSERVER_PRIORITY, + BoxMessageObserver::new(self.clone()), + ); } fn handle_ingest_sst_for_engine_store( @@ -1197,9 +1202,11 @@ impl RegionChangeObserver for TiFlashObs }); false } +} - fn should_skip_raft_message(&self, msg: &RaftMessage) -> bool { - self.maybe_fast_path(&msg) +impl MessageObserver for TiFlashObserver { + fn on_raft_message(&self, msg: &RaftMessage) -> bool { + !self.maybe_fast_path(&msg) } } diff --git a/engine_store_ffi/src/ps_engine.rs b/engine_store_ffi/src/ps_engine.rs index dae9f5a0d22..8efc061b35f 100644 --- a/engine_store_ffi/src/ps_engine.rs +++ 
b/engine_store_ffi/src/ps_engine.rs @@ -137,7 +137,21 @@ impl PSEngineWriteBatch { } impl RaftLogBatch for PSEngineWriteBatch { - fn append(&mut self, raft_group_id: u64, entries: Vec) -> Result<()> { + fn append( + &mut self, + raft_group_id: u64, + overwrite_to: Option, + entries: Vec, + ) -> Result<()> { + let overwrite_to = overwrite_to.unwrap_or(0); + if let Some(last) = entries.last() && last.get_index() + 1 < overwrite_to { + // TODO + panic!("PSEngineWriteBatch has no delete method !!!!!"); + // for index in last.get_index() + 1..overwrite_to { + // let key = keys::raft_log_key(raft_group_id, index); + // self.delete(&key).unwrap(); + // } + } if let Some(max_size) = entries.iter().map(|e| e.compute_size()).max() { let ser_buf = Vec::with_capacity(max_size as usize); return self.append_impl(raft_group_id, &entries, ser_buf); @@ -145,16 +159,6 @@ impl RaftLogBatch for PSEngineWriteBatch { Ok(()) } - fn cut_logs(&mut self, raft_group_id: u64, from: u64, to: u64) { - // This function is used to clean entries that will be overwritten - // later. - // TODO: make sure overlapped entries will be overwritten - // by newer log. 
for index in from..to { - // let key = ps_raft_log_key(raft_group_id, index); - // self.del_page(&key).unwrap(); - // } - } - fn put_raft_state(&mut self, raft_group_id: u64, state: &RaftLocalState) -> Result<()> { self.put_msg(&keys::raft_state_key(raft_group_id), state) } @@ -214,6 +218,15 @@ impl RaftLogBatch for PSEngineWriteBatch { panic!() } + fn put_dirty_mark( + &mut self, + _raft_group_id: u64, + _tablet_index: u64, + _dirty: bool, + ) -> Result<()> { + panic!() + } + fn put_recover_state(&mut self, state: &StoreRecoverState) -> Result<()> { self.put_msg(keys::RECOVER_STATE_KEY, state) } @@ -425,6 +438,10 @@ impl RaftEngineReadOnly for PSEngine { fn get_flushed_index(&self, _raft_group_id: u64, _cf: &str) -> Result> { panic!() } + + fn get_dirty_mark(&self, _raft_group_id: u64, _tablet_index: u64) -> Result { + panic!() + } } impl RaftEngineDebug for PSEngine { @@ -574,7 +591,7 @@ impl RaftEngine for PSEngine { impl PerfContextExt for PSEngine { type PerfContext = PSPerfContext; - fn get_perf_context(&self, level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { + fn get_perf_context(level: PerfLevel, kind: PerfContextKind) -> Self::PerfContext { PSPerfContext::new(level, kind) } } diff --git a/new-mock-engine-store/src/mock_store.rs b/new-mock-engine-store/src/mock_store.rs index 6814ff84d6f..ec154b8e269 100644 --- a/new-mock-engine-store/src/mock_store.rs +++ b/new-mock-engine-store/src/mock_store.rs @@ -1545,7 +1545,8 @@ unsafe extern "C" fn ffi_fast_add_peer( "from" => from, "to" => to, ); - raft_wb.cut_logs(region_id, from, to); + // raft_wb.cut_logs(region_id, from, to); + target_engines.raft.gc(region_id, from, to, &mut raft_wb).unwrap(); target_engines.raft.consume(&mut raft_wb, true).unwrap(); } let apply_state_bytes = apply_state.write_to_bytes().unwrap(); diff --git a/proxy_server/src/run.rs b/proxy_server/src/run.rs index 6e636c612f2..9d366c966e9 100644 --- a/proxy_server/src/run.rs +++ b/proxy_server/src/run.rs @@ -398,7 +398,7 @@ impl 
TiKvServer { .unwrap(); // Create raft engine - let (raft_engine, raft_statistics) = CER::build( + let (mut raft_engine, raft_statistics) = CER::build( &self.config, &env, &self.encryption_key_manager, @@ -412,7 +412,6 @@ impl TiKvServer { ps_engine.init(engine_store_server_helper); } } - self.raft_statistics = raft_statistics; // Create kv engine. let builder = KvEngineFactoryBuilder::new(env, &self.config, block_cache) diff --git a/proxy_tests/proxy/fast_add_peer.rs b/proxy_tests/proxy/fast_add_peer.rs index 5b1a9335775..c96ce750c3e 100644 --- a/proxy_tests/proxy/fast_add_peer.rs +++ b/proxy_tests/proxy/fast_add_peer.rs @@ -333,7 +333,9 @@ fn test_fast_add_peer_from_learner_blocked_paused_build() { #[test] fn test_fast_add_peer_from_delayed_learner_blocked_paused_build() { fail::cfg("fallback_to_slow_path_not_allow", "panic").unwrap(); + fail::cfg("apply_on_handle_snapshot_sync", "return(true)").unwrap(); simple_fast_add_peer(SourceType::DelayedLearner, true, PauseType::Build); + fail::remove("apply_on_handle_snapshot_sync"); fail::remove("fallback_to_slow_path_not_allow"); }