diff --git a/Makefile b/Makefile index f6d169a..38d77f2 100644 --- a/Makefile +++ b/Makefile @@ -26,9 +26,18 @@ build-cpp-raft-driver: cmake -B ${BUILD_DIR}/cmake-build-debug/drivers -S systems/WRaft-series/driver cmake --build ${BUILD_DIR}/cmake-build-debug/drivers -j $(shell nproc) +build-cpp-raft-rs-driver: + cmake -B ${BUILD_DIR}/cmake-build-debug/rsDriver -S systems/Raft-rs/driver + cmake --build ${BUILD_DIR}/cmake-build-debug/rsDriver -j $(shell nproc) + + build-cpp-raft-driver-in-docker: docker exec controller bash -c "cd /root/sandtable && make build-cpp-raft-driver" +build-cpp-raft-rs-driver-in-docker: + docker exec controller bash -c "cd /root/sandtable && make build-cpp-raft-rs-driver" + + config-network: sudo scripts/batch_config_tproxy.sh -n 20 -b docker -c controller start diff --git a/docker/Dockerfile b/docker/Dockerfile index b476c34..31effaa 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -34,10 +34,12 @@ RUN sudo add-apt-repository ppa:deadsnakes/ppa \ && sudo apt-get update \ && sudo apt-get install -y python3.7-venv + FROM tools as builder RUN apt-get install -y cmake build-essential libgflags-dev libreadline-dev libconcurrentqueue-dev FROM builder as controller RUN apt-get install -y lsof tmux jq COPY requirements.txt /requirements.txt +RUN pip config set global.index-url https://mirror.nju.edu.cn/pypi/web/simple RUN pip install -r /requirements.txt && rm -f /requirements.txt diff --git a/rustup-init.sh b/rustup-init.sh new file mode 100644 index 0000000..1813191 --- /dev/null +++ b/rustup-init.sh @@ -0,0 +1,877 @@ +#!/bin/sh +# shellcheck shell=dash +# shellcheck disable=SC2039 # local is non-POSIX + +# This is just a little script that can be downloaded from the internet to +# install rustup. It just does platform detection, downloads the installer +# and runs it. + +# It runs on Unix shells like {a,ba,da,k,z}sh. It uses the common `local` +# extension. Note: Most shells limit `local` to 1 var per line, contra bash. 
+ +# Some versions of ksh have no `local` keyword. Alias it to `typeset`, but +# beware this makes variables global with f()-style function syntax in ksh93. +# mksh has this alias by default. +has_local() { + # shellcheck disable=SC2034 # deliberately unused + local _has_local +} + +has_local 2>/dev/null || alias local=typeset + +is_zsh() { + [ -n "${ZSH_VERSION-}" ] +} + +set -u + +# If RUSTUP_UPDATE_ROOT is unset or empty, default it. +RUSTUP_UPDATE_ROOT="${RUSTUP_UPDATE_ROOT:-https://static.rust-lang.org/rustup}" +# Set quiet as a global for ease of use +RUSTUP_QUIET=no + +# NOTICE: If you change anything here, please make the same changes in setup_mode.rs +usage() { + cat < + Choose a default host triple + --default-toolchain + Choose a default toolchain to install. Use 'none' to not install any toolchains at all + --profile + [default: default] [possible values: minimal, default, complete] + -c, --component + Comma-separated list of component names to also install + -t, --target + Comma-separated list of target names to also install + --no-update-default-toolchain + Don't update any existing default toolchain after install + --no-modify-path + Don't configure the PATH environment variable + -h, --help + Print help + -V, --version + Print version +EOF +} + +main() { + downloader --check + need_cmd uname + need_cmd mktemp + need_cmd chmod + need_cmd mkdir + need_cmd rm + need_cmd rmdir + + get_architecture || return 1 + local _arch="$RETVAL" + assert_nz "$_arch" "arch" + + local _ext="" + case "$_arch" in + *windows*) + _ext=".exe" + ;; + esac + + local _url + if [ "${RUSTUP_VERSION+set}" = 'set' ]; then + say "\`RUSTUP_VERSION\` has been set to \`${RUSTUP_VERSION}\`" + _url="${RUSTUP_UPDATE_ROOT}/archive/${RUSTUP_VERSION}" + else + _url="${RUSTUP_UPDATE_ROOT}/dist" + fi + _url="${_url}/${_arch}/rustup-init${_ext}" + + + local _dir + if ! 
_dir="$(ensure mktemp -d)"; then + # Because the previous command ran in a subshell, we must manually + # propagate exit status. + exit 1 + fi + local _file="${_dir}/rustup-init${_ext}" + + local _ansi_escapes_are_valid=false + if [ -t 2 ]; then + if [ "${TERM+set}" = 'set' ]; then + case "$TERM" in + xterm*|rxvt*|urxvt*|linux*|vt*) + _ansi_escapes_are_valid=true + ;; + esac + fi + fi + + # check if we have to use /dev/tty to prompt the user + local need_tty=yes + for arg in "$@"; do + case "$arg" in + --help) + usage + exit 0 + ;; + --quiet) + RUSTUP_QUIET=yes + ;; + *) + OPTIND=1 + if [ "${arg%%--*}" = "" ]; then + # Long option (other than --help); + # don't attempt to interpret it. + continue + fi + while getopts :hqy sub_arg "$arg"; do + case "$sub_arg" in + h) + usage + exit 0 + ;; + q) + RUSTUP_QUIET=yes + ;; + y) + # user wants to skip the prompt -- + # we don't need /dev/tty + need_tty=no + ;; + *) + ;; + esac + done + ;; + esac + done + + say 'downloading installer' + + ensure mkdir -p "$_dir" + ensure downloader "$_url" "$_file" "$_arch" + ensure chmod u+x "$_file" + if [ ! -x "$_file" ]; then + err "Cannot execute $_file (likely because of mounting /tmp as noexec)." + err "Please copy the file to a location where you can execute binaries and run ./rustup-init${_ext}." + exit 1 + fi + + if [ "$need_tty" = "yes" ] && [ ! -t 0 ]; then + # The installer is going to want to ask for confirmation by + # reading stdin. This script was piped into `sh` though and + # doesn't have stdin to pass to its children. Instead we're going + # to explicitly connect /dev/tty to the installer's stdin. + if [ ! -t 1 ]; then + err "Unable to run interactively. Run with -y to accept defaults, --help for additional options" + exit 1; + fi + + ignore "$_file" "$@" < /dev/tty + else + ignore "$_file" "$@" + fi + + local _retval=$? 
+ + ignore rm "$_file" + ignore rmdir "$_dir" + + return "$_retval" +} + +get_current_exe() { + # Returns the executable used for system architecture detection + # This is only run on Linux + local _current_exe + if test -L /proc/self/exe ; then + _current_exe=/proc/self/exe + else + warn "Unable to find /proc/self/exe. System architecture detection might be inaccurate." + if test -n "$SHELL" ; then + _current_exe=$SHELL + else + need_cmd /bin/sh + _current_exe=/bin/sh + fi + warn "Falling back to $_current_exe." + fi + echo "$_current_exe" +} + +get_bitness() { + need_cmd head + # Architecture detection without dependencies beyond coreutils. + # ELF files start out "\x7fELF", and the following byte is + # 0x01 for 32-bit and + # 0x02 for 64-bit. + # The printf builtin on some shells like dash only supports octal + # escape sequences, so we use those. + local _current_exe=$1 + local _current_exe_head + _current_exe_head=$(head -c 5 "$_current_exe") + if [ "$_current_exe_head" = "$(printf '\177ELF\001')" ]; then + echo 32 + elif [ "$_current_exe_head" = "$(printf '\177ELF\002')" ]; then + echo 64 + else + err "unknown platform bitness" + exit 1; + fi +} + +is_host_amd64_elf() { + local _current_exe=$1 + + need_cmd head + need_cmd tail + # ELF e_machine detection without dependencies beyond coreutils. + # Two-byte field at offset 0x12 indicates the CPU, + # but we're interested in it being 0x3E to indicate amd64, or not that. + local _current_exe_machine + _current_exe_machine=$(head -c 19 "$_current_exe" | tail -c 1) + [ "$_current_exe_machine" = "$(printf '\076')" ] +} + +get_endianness() { + local _current_exe=$1 + local cputype=$2 + local suffix_eb=$3 + local suffix_el=$4 + + # detect endianness without od/hexdump, like get_bitness() does. 
+ need_cmd head + need_cmd tail + + local _current_exe_endianness + _current_exe_endianness="$(head -c 6 "$_current_exe" | tail -c 1)" + if [ "$_current_exe_endianness" = "$(printf '\001')" ]; then + echo "${cputype}${suffix_el}" + elif [ "$_current_exe_endianness" = "$(printf '\002')" ]; then + echo "${cputype}${suffix_eb}" + else + err "unknown platform endianness" + exit 1 + fi +} + +# Detect the Linux/LoongArch UAPI flavor, with all errors being non-fatal. +# Returns 0 or 234 in case of successful detection, 1 otherwise (/tmp being +# noexec, or other causes). +check_loongarch_uapi() { + need_cmd base64 + + local _tmp + if ! _tmp="$(ensure mktemp)"; then + return 1 + fi + + # Minimal Linux/LoongArch UAPI detection, exiting with 0 in case of + # upstream ("new world") UAPI, and 234 (-EINVAL truncated) in case of + # old-world (as deployed on several early commercial Linux distributions + # for LoongArch). + # + # See https://gist.github.com/xen0n/5ee04aaa6cecc5c7794b9a0c3b65fc7f for + # source to this helper binary. + ignore base64 -d > "$_tmp" <&1 | grep -q 'musl'; then + _clibtype="musl" + fi + fi + + if [ "$_ostype" = Darwin ]; then + # Darwin `uname -m` can lie due to Rosetta shenanigans. If you manage to + # invoke a native shell binary and then a native uname binary, you can + # get the real answer, but that's hard to ensure, so instead we use + # `sysctl` (which doesn't lie) to check for the actual architecture. + if [ "$_cputype" = i386 ]; then + # Handling i386 compatibility mode in older macOS versions (<10.15) + # running on x86_64-based Macs. + # Starting from 10.15, macOS explicitly bans all i386 binaries from running. + # See: + + # Avoid `sysctl: unknown oid` stderr output and/or non-zero exit code. + if sysctl hw.optional.x86_64 2> /dev/null || true | grep -q ': 1'; then + _cputype=x86_64 + fi + elif [ "$_cputype" = x86_64 ]; then + # Handling x86-64 compatibility mode (a.k.a. 
Rosetta 2) + # in newer macOS versions (>=11) running on arm64-based Macs. + # Rosetta 2 is built exclusively for x86-64 and cannot run i386 binaries. + + # Avoid `sysctl: unknown oid` stderr output and/or non-zero exit code. + if sysctl hw.optional.arm64 2> /dev/null || true | grep -q ': 1'; then + _cputype=arm64 + fi + fi + fi + + if [ "$_ostype" = SunOS ]; then + # Both Solaris and illumos presently announce as "SunOS" in "uname -s" + # so use "uname -o" to disambiguate. We use the full path to the + # system uname in case the user has coreutils uname first in PATH, + # which has historically sometimes printed the wrong value here. + if [ "$(/usr/bin/uname -o)" = illumos ]; then + _ostype=illumos + fi + + # illumos systems have multi-arch userlands, and "uname -m" reports the + # machine hardware name; e.g., "i86pc" on both 32- and 64-bit x86 + # systems. Check for the native (widest) instruction set on the + # running kernel: + if [ "$_cputype" = i86pc ]; then + _cputype="$(isainfo -n)" + fi + fi + + local _current_exe + case "$_ostype" in + + Android) + _ostype=linux-android + ;; + + Linux) + _current_exe=$(get_current_exe) + _ostype=unknown-linux-$_clibtype + _bitness=$(get_bitness "$_current_exe") + ;; + + FreeBSD) + _ostype=unknown-freebsd + ;; + + NetBSD) + _ostype=unknown-netbsd + ;; + + DragonFly) + _ostype=unknown-dragonfly + ;; + + Darwin) + _ostype=apple-darwin + ;; + + illumos) + _ostype=unknown-illumos + ;; + + MINGW* | MSYS* | CYGWIN* | Windows_NT) + _ostype=pc-windows-gnu + ;; + + *) + err "unrecognized OS type: $_ostype" + exit 1 + ;; + + esac + + case "$_cputype" in + + i386 | i486 | i686 | i786 | x86) + _cputype=i686 + ;; + + xscale | arm) + _cputype=arm + if [ "$_ostype" = "linux-android" ]; then + _ostype=linux-androideabi + fi + ;; + + armv6l) + _cputype=arm + if [ "$_ostype" = "linux-android" ]; then + _ostype=linux-androideabi + else + _ostype="${_ostype}eabihf" + fi + ;; + + armv7l | armv8l) + _cputype=armv7 + if [ "$_ostype" = 
"linux-android" ]; then + _ostype=linux-androideabi + else + _ostype="${_ostype}eabihf" + fi + ;; + + aarch64 | arm64) + _cputype=aarch64 + ;; + + x86_64 | x86-64 | x64 | amd64) + _cputype=x86_64 + ;; + + mips) + _cputype=$(get_endianness "$_current_exe" mips '' el) + ;; + + mips64) + if [ "$_bitness" -eq 64 ]; then + # only n64 ABI is supported for now + _ostype="${_ostype}abi64" + _cputype=$(get_endianness "$_current_exe" mips64 '' el) + fi + ;; + + ppc) + _cputype=powerpc + ;; + + ppc64) + _cputype=powerpc64 + ;; + + ppc64le) + _cputype=powerpc64le + ;; + + s390x) + _cputype=s390x + ;; + riscv64) + _cputype=riscv64gc + ;; + loongarch64) + _cputype=loongarch64 + ensure_loongarch_uapi + ;; + *) + err "unknown CPU type: $_cputype" + exit 1 + + esac + + # Detect 64-bit linux with 32-bit userland + if [ "${_ostype}" = unknown-linux-gnu ] && [ "${_bitness}" -eq 32 ]; then + case $_cputype in + x86_64) + if [ -n "${RUSTUP_CPUTYPE:-}" ]; then + _cputype="$RUSTUP_CPUTYPE" + else { + # 32-bit executable for amd64 = x32 + if is_host_amd64_elf "$_current_exe"; then { + err "This host is running an x32 userland, for which no native toolchain is provided." + err "You will have to install multiarch compatibility with i686 or amd64." + err "To do so, set the RUSTUP_CPUTYPE environment variable set to i686 or amd64 and re-run this script." + err "You will be able to add an x32 target after installation by running \`rustup target add x86_64-unknown-linux-gnux32\`." + exit 1 + }; else + _cputype=i686 + fi + }; fi + ;; + mips64) + _cputype=$(get_endianness "$_current_exe" mips '' el) + ;; + powerpc64) + _cputype=powerpc + ;; + aarch64) + _cputype=armv7 + if [ "$_ostype" = "linux-android" ]; then + _ostype=linux-androideabi + else + _ostype="${_ostype}eabihf" + fi + ;; + riscv64gc) + err "riscv64 with 32-bit userland unsupported" + exit 1 + ;; + esac + fi + + # Detect armv7 but without the CPU features Rust needs in that build, + # and fall back to arm. 
+ # See https://github.com/rust-lang/rustup.rs/issues/587. + if [ "$_ostype" = "unknown-linux-gnueabihf" ] && [ "$_cputype" = armv7 ]; then + if ! (ensure grep '^Features' /proc/cpuinfo | grep -E -q 'neon|simd') ; then + # Either `/proc/cpuinfo` is malformed or unavailable, or + # at least one processor does not have NEON (which is asimd on armv8+). + _cputype=arm + fi + fi + + _arch="${_cputype}-${_ostype}" + + RETVAL="$_arch" +} + +__print() { + if $_ansi_escapes_are_valid; then + printf '\33[1m%s:\33[0m %s\n' "$1" "$2" >&2 + else + printf '%s: %s\n' "$1" "$2" >&2 + fi +} + +warn() { + __print 'warn' "$1" >&2 +} + +say() { + if [ "$RUSTUP_QUIET" = "no" ]; then + __print 'info' "$1" >&2 + fi +} + +# NOTE: you are required to exit yourself +# we don't do it here because of multiline errors +err() { + __print 'error' "$1" >&2 +} + +need_cmd() { + if ! check_cmd "$1"; then + err "need '$1' (command not found)" + exit 1 + fi +} + +check_cmd() { + command -v "$1" > /dev/null 2>&1 +} + +assert_nz() { + if [ -z "$1" ]; then + err "assert_nz $2" + exit 1 + fi +} + +# Run a command that should never fail. If the command fails execution +# will immediately terminate with an error showing the failing +# command. +ensure() { + if ! "$@"; then + err "command failed: $*" + exit 1 + fi +} + +# This is just for indicating that commands' results are being +# intentionally ignored. Usually, because it's being executed +# as part of error handling. +ignore() { + "$@" +} + +# This wraps curl or wget. Try curl first, if not installed, +# use wget instead. +downloader() { + # zsh does not split words by default, Required for curl retry arguments below. 
+ is_zsh && setopt local_options shwordsplit + + local _dld + local _ciphersuites + local _err + local _status + local _retry + if check_cmd curl; then + _dld=curl + elif check_cmd wget; then + _dld=wget + else + _dld='curl or wget' # to be used in error message of need_cmd + fi + + if [ "$1" = --check ]; then + need_cmd "$_dld" + elif [ "$_dld" = curl ]; then + check_curl_for_retry_support + _retry="$RETVAL" + get_ciphersuites_for_curl + _ciphersuites="$RETVAL" + if [ -n "$_ciphersuites" ]; then + # shellcheck disable=SC2086 + _err=$(curl $_retry --proto '=https' --tlsv1.2 --ciphers "$_ciphersuites" --silent --show-error --fail --location "$1" --output "$2" 2>&1) + _status=$? + else + warn "Not enforcing strong cipher suites for TLS, this is potentially less secure" + if ! check_help_for "$3" curl --proto --tlsv1.2; then + warn "Not enforcing TLS v1.2, this is potentially less secure" + # shellcheck disable=SC2086 + _err=$(curl $_retry --silent --show-error --fail --location "$1" --output "$2" 2>&1) + _status=$? + else + # shellcheck disable=SC2086 + _err=$(curl $_retry --proto '=https' --tlsv1.2 --silent --show-error --fail --location "$1" --output "$2" 2>&1) + _status=$? + fi + fi + if [ -n "$_err" ]; then + warn "$_err" + if echo "$_err" | grep -q 404$; then + err "installer for platform '$3' not found, this may be unsupported" + exit 1 + fi + fi + return $_status + elif [ "$_dld" = wget ]; then + if [ "$(wget -V 2>&1|head -2|tail -1|cut -f1 -d" ")" = "BusyBox" ]; then + warn "using the BusyBox version of wget. Not enforcing strong cipher suites for TLS or TLS v1.2, this is potentially less secure" + _err=$(wget "$1" -O "$2" 2>&1) + _status=$? + else + get_ciphersuites_for_wget + _ciphersuites="$RETVAL" + if [ -n "$_ciphersuites" ]; then + _err=$(wget --https-only --secure-protocol=TLSv1_2 --ciphers "$_ciphersuites" "$1" -O "$2" 2>&1) + _status=$? + else + warn "Not enforcing strong cipher suites for TLS, this is potentially less secure" + if ! 
check_help_for "$3" wget --https-only --secure-protocol; then + warn "Not enforcing TLS v1.2, this is potentially less secure" + _err=$(wget "$1" -O "$2" 2>&1) + _status=$? + else + _err=$(wget --https-only --secure-protocol=TLSv1_2 "$1" -O "$2" 2>&1) + _status=$? + fi + fi + fi + if [ -n "$_err" ]; then + warn "$_err" + if echo "$_err" | grep -q ' 404 Not Found$'; then + err "installer for platform '$3' not found, this may be unsupported" + exit 1 + fi + fi + return $_status + else + err "Unknown downloader" # should not reach here + exit 1 + fi +} + +check_help_for() { + local _arch + local _cmd + local _arg + _arch="$1" + shift + _cmd="$1" + shift + + local _category + if "$_cmd" --help | grep -q '"--help all"'; then + _category="all" + else + _category="" + fi + + case "$_arch" in + + *darwin*) + if check_cmd sw_vers; then + local _os_version + local _os_major + _os_version=$(sw_vers -productVersion) + _os_major=$(echo "$_os_version" | cut -d. -f1) + case $_os_major in + 10) + # If we're running on macOS, older than 10.13, then we always + # fail to find these options to force fallback + if [ "$(echo "$_os_version" | cut -d. -f2)" -lt 13 ]; then + # Older than 10.13 + warn "Detected macOS platform older than 10.13" + return 1 + fi + ;; + *) + if ! { [ "$_os_major" -eq "$_os_major" ] 2>/dev/null && [ "$_os_major" -ge 11 ]; }; then + # Unknown product version, warn and continue + warn "Detected unknown macOS major version: $_os_version" + warn "TLS capabilities detection may fail" + fi + ;; # We assume that macOS v11+ will always be okay. + esac + fi + ;; + + esac + + for _arg in "$@"; do + if ! "$_cmd" --help "$_category" | grep -q -- "$_arg"; then + return 1 + fi + done + + true # not strictly needed +} + +# Check if curl supports the --retry flag, then pass it to the curl invocation. +check_curl_for_retry_support() { + local _retry_supported="" + # "unspecified" is for arch, allows for possibility old OS using macports, homebrew, etc. 
+ if check_help_for "notspecified" "curl" "--retry"; then + _retry_supported="--retry 3" + if check_help_for "notspecified" "curl" "--continue-at"; then + # "-C -" tells curl to automatically find where to resume the download when retrying. + _retry_supported="--retry 3 -C -" + fi + fi + + RETVAL="$_retry_supported" +} + +# Return cipher suite string specified by user, otherwise return strong TLS 1.2-1.3 cipher suites +# if support by local tools is detected. Detection currently supports these curl backends: +# GnuTLS and OpenSSL (possibly also LibreSSL and BoringSSL). Return value can be empty. +get_ciphersuites_for_curl() { + if [ -n "${RUSTUP_TLS_CIPHERSUITES-}" ]; then + # user specified custom cipher suites, assume they know what they're doing + RETVAL="$RUSTUP_TLS_CIPHERSUITES" + return + fi + + local _openssl_syntax="no" + local _gnutls_syntax="no" + local _backend_supported="yes" + if curl -V | grep -q ' OpenSSL/'; then + _openssl_syntax="yes" + elif curl -V | grep -iq ' LibreSSL/'; then + _openssl_syntax="yes" + elif curl -V | grep -iq ' BoringSSL/'; then + _openssl_syntax="yes" + elif curl -V | grep -iq ' GnuTLS/'; then + _gnutls_syntax="yes" + else + _backend_supported="no" + fi + + local _args_supported="no" + if [ "$_backend_supported" = "yes" ]; then + # "unspecified" is for arch, allows for possibility old OS using macports, homebrew, etc. + if check_help_for "notspecified" "curl" "--tlsv1.2" "--ciphers" "--proto"; then + _args_supported="yes" + fi + fi + + local _cs="" + if [ "$_args_supported" = "yes" ]; then + if [ "$_openssl_syntax" = "yes" ]; then + _cs=$(get_strong_ciphersuites_for "openssl") + elif [ "$_gnutls_syntax" = "yes" ]; then + _cs=$(get_strong_ciphersuites_for "gnutls") + fi + fi + + RETVAL="$_cs" +} + +# Return cipher suite string specified by user, otherwise return strong TLS 1.2-1.3 cipher suites +# if support by local tools is detected. 
Detection currently supports these wget backends: +# GnuTLS and OpenSSL (possibly also LibreSSL and BoringSSL). Return value can be empty. +get_ciphersuites_for_wget() { + if [ -n "${RUSTUP_TLS_CIPHERSUITES-}" ]; then + # user specified custom cipher suites, assume they know what they're doing + RETVAL="$RUSTUP_TLS_CIPHERSUITES" + return + fi + + local _cs="" + if wget -V | grep -q '\-DHAVE_LIBSSL'; then + # "unspecified" is for arch, allows for possibility old OS using macports, homebrew, etc. + if check_help_for "notspecified" "wget" "TLSv1_2" "--ciphers" "--https-only" "--secure-protocol"; then + _cs=$(get_strong_ciphersuites_for "openssl") + fi + elif wget -V | grep -q '\-DHAVE_LIBGNUTLS'; then + # "unspecified" is for arch, allows for possibility old OS using macports, homebrew, etc. + if check_help_for "notspecified" "wget" "TLSv1_2" "--ciphers" "--https-only" "--secure-protocol"; then + _cs=$(get_strong_ciphersuites_for "gnutls") + fi + fi + + RETVAL="$_cs" +} + +# Return strong TLS 1.2-1.3 cipher suites in OpenSSL or GnuTLS syntax. TLS 1.2 +# excludes non-ECDHE and non-AEAD cipher suites. DHE is excluded due to bad +# DH params often found on servers (see RFC 7919). Sequence matches or is +# similar to Firefox 68 ESR with weak cipher suites disabled via about:config. +# $1 must be openssl or gnutls. +get_strong_ciphersuites_for() { + if [ "$1" = "openssl" ]; then + # OpenSSL is forgiving of unknown values, no problems with TLS 1.3 values on versions that don't support it yet. + echo "TLS_AES_128_GCM_SHA256:TLS_CHACHA20_POLY1305_SHA256:TLS_AES_256_GCM_SHA384:ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-ECDSA-CHACHA20-POLY1305:ECDHE-RSA-CHACHA20-POLY1305:ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384" + elif [ "$1" = "gnutls" ]; then + # GnuTLS isn't forgiving of unknown values, so this may require a GnuTLS version that supports TLS 1.3 even if wget doesn't. 
+ # Begin with SECURE128 (and higher) then remove/add to build cipher suites. Produces same 9 cipher suites as OpenSSL but in slightly different order. + echo "SECURE128:-VERS-SSL3.0:-VERS-TLS1.0:-VERS-TLS1.1:-VERS-DTLS-ALL:-CIPHER-ALL:-MAC-ALL:-KX-ALL:+AEAD:+ECDHE-ECDSA:+ECDHE-RSA:+AES-128-GCM:+CHACHA20-POLY1305:+AES-256-GCM" + fi +} + +set +u +case "$RUSTUP_INIT_SH_PRINT" in + arch | architecture) + get_architecture || exit 1 + echo "$RETVAL" + ;; + *) + main "$@" || exit 1 + ;; +esac diff --git a/src/controller/UdpNetwork.cpp b/src/controller/UdpNetwork.cpp index 685506c..2f0ac4d 100644 --- a/src/controller/UdpNetwork.cpp +++ b/src/controller/UdpNetwork.cpp @@ -15,11 +15,7 @@ void UdpNetwork::run_epoll() { struct epoll_event events[max_events]; int nfds, udp_fd = udp->socket(); - - - // pair conn_fds; char recv_buff[1472]; // MSS: 1500(MTU)-20(IP)-8(UDP) - // ssize_t num_read; while (true) { nfds = epoll_wait(epoll_fd, events, max_events, -1); if (nfds == -1) { @@ -31,8 +27,8 @@ void UdpNetwork::run_epoll() { throw_syserror("epoll_wait"); } for (int n = 0; n < nfds; ++n) { - if (events[n].data.fd == udp_fd) { // udp 处理 - // udp读取到信息进行转发 + if (events[n].data.fd == udp_fd) { // udp handling + // udp message resend AcceptData* client = new AcceptData; wait_message(client, udp_fd, recv_buff); add_msg_to_net(*client, recv_buff); @@ -44,7 +40,6 @@ void UdpNetwork::run_epoll() { if (events[n].events & EPOLLIN) { ssh_handle(ssh_fd); } - // cout< 0) { - // //设置响应事件,设置可读和边缘(ET)模式 - // //很多人会把可写事件(EPOLLOUT)也注册了,后面会解释 - // epev.events = EPOLLIN | EPOLLET; - // epev.data.fd = fd; - // //设置连接为非阻塞模式 - // int flags = fcntl(fd, F_GETFL, 0); - // if (flags < 0) { - // cout << "set no block error, fd:" << fd << endl; - // continue; - // } - // if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) { - // cout << "set no block error, fd:" << fd << endl; - // continue; - // } - // //将新的连接添加到epoll中 - // } string node_name = configFile.get_node_name_with_addr(client_addr); cerr_verbose << 
"Client " << configFile.get_node_name_with_addr(client_addr) << " cmd fd: " << fd << endl; @@ -86,15 +64,13 @@ UdpNetwork::UdpNetwork(UdpSokcet *udp_socket) : udp(udp_socket) { epoll_fd = epoll_create1(0); if (epoll_fd == -1) throw_syserror("epoll_create1"); - // udp_socket 监听 + // udp_socket listen add_monitor_fd(udp_socket->socket()); // ssh port this->ssh_fd = ::socket(AF_INET, SOCK_STREAM, 0); if (this->ssh_fd == -1) throw_syserror("socket"); - // if (set_nonblocking(tcp_fd) == -1) - // throw_syserror("set_nonblocking"); const int opt = 1; struct sockaddr_in bind_addr{}; bind_addr.sin_family = AF_INET; @@ -135,8 +111,6 @@ const struct sockaddr_in *destination){ //printf("destination-setsockopt IP_TRANSPARENT isOK...\n"); } setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &flags, sizeof(flags)); - // todo ??? - // setsockopt(fd, SOL_SOCKET, SO_BROADCAST, &flags, sizeof(flags)); if(bind (fd, (struct sockaddr *) source,sizeof(*source))==-1){ cerr_warning <<"source-bind() error!"<client_addr.sin_addr)<<" :" << ntohs(ret->client_addr.sin_port); - // cerr_detail_cont << " Get Message source "<< configFile.get_node_name_with_addr(ret->client_addr); - // memset(&destinationAddr,0,sizeof(struct sockaddr_in)); for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg =CMSG_NXTHDR(&msg,cmsg)){ if (cmsg->cmsg_level == SOL_IP && cmsg->cmsg_type ==IP_RECVORIGADDRS){ @@ -197,13 +168,6 @@ int UdpNetwork::wait_message(AcceptData* ret, const int udp_socket, char *data) cerr_detail_cont << data<masque_addr)<origin_addr)) { - // cerr_detail_cont << endl; - // } else { - // cerr_detail_cont << " (origin: " << configFile.get_node_name_with_addr(ret->origin_addr) << ")" << endl; - // } - // 这时候buf里面已经存有数据 //send_to_transparenty(buf,n,&sourceAddr,&destinationAddr); //sendto(fd,buf,n,&sourceAddr,&destinationAddr); return n; @@ -226,21 +190,6 @@ bool UdpNetwork::do_send(const uint32_t seq_num){ return false; } UdpMsg msg = num_msg->second; - // if (configFile.is_router_addr(*msg.get_dst())) { - // 
cerr_detail << "Transfer msg to router" << configFile.get_node_name_with_addr(*msg.get_src()) << " -> " << configFile.get_node_name_with_addr(*msg.get_dst())<add_node(configFile.get_node_name(acc.client_addr), acc.socket_fd); - // // return {-2, -2}; // success, is router fd - // network.erase(num_msg); - // return true; - // } if(send_to_transparenty(msg.buffer(), msg.size(), msg.get_src(), msg.get_dst()) == -1){ throw_syserror("send transparently"); network.erase(num_msg); @@ -401,7 +350,6 @@ bool UdpNetwork::send_cmd(const string &node, const string &cmd, int lineno) { return false; } close(it->second); - // TODO: (bug) concurrent, should erase carefully fd_to_client.erase(it->second); client_to_fd.erase(it); } diff --git a/src/controller/UdpNetwork.h b/src/controller/UdpNetwork.h index 70e2ed3..828bae9 100644 --- a/src/controller/UdpNetwork.h +++ b/src/controller/UdpNetwork.h @@ -34,7 +34,7 @@ class UdpNetwork { bool send_cmd_all(const string &prefix, const string &cmd, int lineno); private: UdpSokcet *udp; - // seq -> message, 便于定位数据包 + // seq -> message map network; int epoll_fd; bool is_direct; diff --git a/systems/Raft-rs/README b/systems/Raft-rs/README new file mode 100644 index 0000000..f9dcf37 --- /dev/null +++ b/systems/Raft-rs/README @@ -0,0 +1,8 @@ +需要下载rust编译环境 +```bash +sudo apt-get install rustc cargo +``` +运行命令 +```bash +build-cpp-raft-rs-driver-in-docker +``` \ No newline at end of file diff --git a/systems/Raft-rs/driver/CMakeLists.txt b/systems/Raft-rs/driver/CMakeLists.txt new file mode 100644 index 0000000..8f86664 --- /dev/null +++ b/systems/Raft-rs/driver/CMakeLists.txt @@ -0,0 +1,54 @@ +cmake_minimum_required(VERSION 3.16) +project(rsDriver) + +set(CMAKE_CXX_STANDARD 20) +add_compile_options(-Wall -Wextra -Werror -Wno-unused-parameter -Wno-unused-function) + +# add_subdirectory(deps/raft EXCLUDE_FROM_ALL) +add_subdirectory(tlpi EXCLUDE_FROM_ALL) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") + 
+find_package(gflags REQUIRED) +find_package(Readline REQUIRED) + + +# 单独生成的库所需的源文件 +set(LIBRARY_SOURCES + Network.cpp + TcpNetwork.cpp + Config.cpp + Node.cpp + common.cpp +) +# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath,/path/to/myLibrary") + +add_library(myLibrary SHARED ${LIBRARY_SOURCES}) + +target_link_libraries(myLibrary tlpi) + +add_custom_target(world_so + COMMAND make + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/rust_so +) + +# message("Current binary directory: ${CMAKE_CURRENT_BINARY_DIR}") +# message("Source binary directory: ${CMAKE_CURRENT_SOURCE_DIR}") +set(MyLib ${CMAKE_CURRENT_BINARY_DIR}/libmyLibrary.so ) + +set(WORLD_SO ${CMAKE_CURRENT_SOURCE_DIR}/rust_so/target/debug/libworld.so) + +include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/rust_so/include) + +add_executable(RedisTMet main.cpp Network.h Config.h Repl.cpp Repl.h Raft.cpp Raft.h common.h Node.h TcpNetwork.h UdpNetwork.h) + +# Add dependencies for the custom target if needed +add_dependencies(RedisTMet world_so) + +target_link_libraries(RedisTMet PUBLIC + readline + gflags + tlpi + ${MyLib} + # myLibrary + ${WORLD_SO}) \ No newline at end of file diff --git a/systems/Raft-rs/driver/Config.cpp b/systems/Raft-rs/driver/Config.cpp new file mode 100644 index 0000000..0d7dcff --- /dev/null +++ b/systems/Raft-rs/driver/Config.cpp @@ -0,0 +1,252 @@ +// +// Created by tangruize on 2/14/23. 
+// + +#include +#include "common.h" +#include "Config.h" + +bool Config::get_addr(const string &name, sockaddr_in &addr) const { + auto it = nodes.find(name); + if (it != nodes.end()) { + addr = it->addr; + return true; + } + return false; +} + +bool Config::get_name(const sockaddr_in &addr, string &name) const { + Node n(addr); + if (get_node(n)) { + name = n.name; + return true; + } else { + return false; + } +} + +bool Config::get_node(Node &n) const { +// static cmp_addr_equal eq; +// set::iterator it; +// if (n.name.empty()) { +// auto is_eq = [&n](const Node &i){ return eq(i.addr, n.addr); }; +// it = std::find_if(nodes.begin(), nodes.end(), is_eq); +// } else { +// it = nodes.find(n); +// } + auto it = find_node(n); + if (it != nodes.end()) { + n = *it; + return true; + } + return false; +} + +void Config::load(const string &filename) { + loading = true; + if (filename.empty()) { + throw_syserror("Error: filename is empty"); + } + ifstream f; + f.exceptions(std::ifstream::failbit | std::ifstream::badbit); // throw exception if failed to open file for read + f.open(filename); + f.exceptions(std::ifstream::badbit); // disable failbit throwing exceptions (such as EOF) + + string line; + int line_no = 0; + cerr_verbose << "Reading config file" << endl; + while (getline(f, line)) { + line_no++; + read(line, line_no); + } + f.close(); + set_self_node(); + loading = false; +} + +Config::Config(const string &filename) { + load(filename); + set_self_node(); +} + +bool Config::read(const string &line, int line_no) { + vector tokens = tokenize(line); + if (tokens.empty() || tokens[0][1] == '#') + return true; + + bool ok = false; + if (tokens.size() == 3 && tokens[0] == "map-cidr") { + char delim = '/'; + struct sockaddr_in origin = convert_addr(tokens[1].c_str(), delim); + struct sockaddr_in masquerade = convert_addr(tokens[2].c_str(), delim); + if (origin.sin_port && masquerade.sin_port) { + ok = true; + cidr_map[origin] = masquerade; + rev_cidr_map[masquerade] = 
origin; + cerr_verbose_cont << " - " << tokens[0] << ' ' << addr_to_string_delim(origin, delim) + << ' ' << addr_to_string_delim(masquerade, delim) << endl; + } + } + else if (tokens.size() == 3 && tokens[0] == "node") { + struct sockaddr_in addr = get_converted_addr(convert_addr(tokens[2].c_str()), rev_cidr_map); + if (addr.sin_port == (in_port_t)-1) { + addr.sin_port = htons(DEFAULT_PORT); + } + if (addr.sin_addr.s_addr) { + ok = true; + string node = tokens[1]; + nodes.insert(Node(node, addr)); + cerr_verbose << " - " << tokens[0] << ' ' << node << ' ' << inet_ntoa((addr).sin_addr) << endl; + } + } else if (tokens.size() == 2 && tokens[0] == "selfnode") { + self_name = tokens[1]; + ok = true; + cerr_verbose << " - " << tokens[0] << ' ' << self_name << endl; + } else if (tokens[0] == "get") { + return getinfo(stringify(tokens, 1)); + } + if (!ok) { + if (!loading) { + cerr_verbose_cont << " - WARN: invalid cmd"; + if (line_no > 0) { + cerr_verbose_cont << " at line " << line_no << ": " << line; + } + cerr_verbose_cont << endl; + } + } + return ok; +} + +bool Config::set_self_node(const string& name) { + if (name.empty()) { + if (!my_name.empty()) { + self_name = my_name; + } + } else { + self_name = name; + } + if (self_name.empty()) + return false; + self_node.name = self_name; + bool ok = get_node(self_node); + if (!ok) { + cerr_warning << "Cannot find self node name: " << name << endl; + self_node.name = ""; + } else { + cerr_verbose << "Set self node " << self_node.to_string() << endl; + } + return ok; +} + +Config::Config() { + set_self_node(); +} + +bool Config::get_self_node(Node &n) { + if (!self_node.name.empty()) { + n = self_node; + return true; + } else { + if (set_self_node()) { + n = self_node; + return true; + } else { + return false; + } + } +// n.name = self_name; +// bool ok = get_node(n); +// if (ok) { +// self_node = n; +// } +// return ok; +} + +set &Config::get_all_nodes() { + return nodes; +} + +Node &Config::get_self_node() { + if 
(self_node.name.empty()) + cerr_warning << "self node is not set" << endl; + return self_node; +} + +set::iterator Config::find_node(const Node &e) const { + static cmp_addr_equal eq; + set::iterator it; + if (e.name.empty()) { + auto is_eq = [&e](const Node &i){ return eq(i.addr, e.addr); }; + it = std::find_if(nodes.begin(), nodes.end(), is_eq); + } else { + it = nodes.find(e); + } + return it; +} + +// new add +set::iterator Config::find_node(const int id) const { + set::iterator it; + auto is_eq = [id](const Node &i){ return i.id == id; }; + it = std::find_if(nodes.begin(), nodes.end(), is_eq); + return it; +} + +bool Config::getinfo(const string &cmd) { + auto tokens = tokenize(cmd); + if (tokens.empty()) { + return false; + } + bool ok; + if (tokens[0] == "name") { + Node n(convert_addr(stringify(tokens, 1).c_str())); + ok = get_node(n); + if (ok) { + cout << n.name << endl; + } else { + cerr << "failed to find " << n.to_string() << endl; + } + } else if (tokens[0] == "addr") { + Node n(stringify(tokens, 1)); + ok = get_node(n); + if (ok) { + cout << addr_to_string(n.addr) << endl; + } + } else if (tokens[0] == "selfnode") { + Node n; + ok = get_self_node(n); + if (ok) { +// cout << n.name << ' ' << addr_to_string(n.addr) << endl; + cout << n.to_string() << endl; + } + } + return ok; +} + +static inline in_addr_t addr_mask(const sockaddr_in &addr, unsigned mask) { + return ntohl(addr.sin_addr.s_addr) & mask; +} + +struct sockaddr_in Config::get_converted_addr(const sockaddr_in &to_conv, const Config::conv_map_t &b) const { + for (auto &i: b) { + unsigned mask = unsigned(-1) << (32 - i.first.sin_port); + if (addr_mask(i.first, mask) == addr_mask(to_conv, mask)) { + mask = unsigned(-1) << (32 - i.second.sin_port); + sockaddr_in ret = to_conv; + ret.sin_addr.s_addr = htonl(addr_mask(i.second, mask) | addr_mask(to_conv, ~mask)); + return ret; + } + } + return to_conv; +} + +// Library functions +extern "C" { + int getIdSelf(); +} + +int getIdSelf(){ + return 
config.get_self_node().getid(); +} + +Config config; diff --git a/systems/Raft-rs/driver/Config.h b/systems/Raft-rs/driver/Config.h new file mode 100644 index 0000000..1712aa3 --- /dev/null +++ b/systems/Raft-rs/driver/Config.h @@ -0,0 +1,54 @@ +// +// Created by tangruize on 2/14/23. +// + +#ifndef REDISTMET_CONFIG_H +#define REDISTMET_CONFIG_H + +#include +#include +#include +#include "Node.h" +//#include "deps/raft/include/raft.h" + +using namespace std; + +const int DEFAULT_PORT = 9000; + +class Config { + typedef map conv_map_t; + conv_map_t cidr_map; + conv_map_t rev_cidr_map; + set nodes; + unordered_map node_map; + string self_name; + Node self_node; + void *raft_server; + set::iterator find_node(const Node &e) const; + + bool loading = false; + struct sockaddr_in get_converted_addr(const struct sockaddr_in &to_conv, const conv_map_t &b) const; +public: + Config(); + Config(const string &filename); + void load(const string &filename); + bool read(const string &line, int line_no=-1); + bool getinfo(const string &cmd); + bool get_node(Node &n) const; + bool get_addr(const string &name, struct sockaddr_in &addr) const; + bool get_name(const struct sockaddr_in &addr, string &name) const; + bool set_self_node(const string& name = ""); + bool get_self_node(Node &n); + set::iterator find_node(const int id) const ; + Node &get_self_node(); + [[nodiscard]] set &get_all_nodes(); + void set_raft_server(void *s) { raft_server = s; } + [[nodiscard]] void *get_raft_server() const { return raft_server; } +}; + + + + +extern Config config; + +#endif //REDISTMET_CONFIG_H diff --git a/systems/Raft-rs/driver/Network.cpp b/systems/Raft-rs/driver/Network.cpp new file mode 100644 index 0000000..4036ea9 --- /dev/null +++ b/systems/Raft-rs/driver/Network.cpp @@ -0,0 +1,109 @@ +// +// Created by tangruize on 2/13/23. 
+// + +#include "Network.h" +#include + +int Network::load_config() { + bool ok = config.get_self_node(self); + if (!ok) { + cerr_warning << "Self node name is not set" << endl; + } + int count = 0; + for (auto &i: config.get_all_nodes()) { + count += add_node(i); + } + return count; +} + +bool Network::add_node(const Node &peer) { +// auto it = peers.find(peer); + auto it = find_node(peer); + if (it != peers.end() || peer.name == self.name) { + if (it != peers.end() && peer.name == self.name) { + close(it->second); + peers.erase(it); + } + return false; + } + peers[peer] = -1; + return true; +} + +Network::Network() { + sockfd = -1; + evfd = -1; + int n = load_config(); + cerr_verbose << "Load " << n << " peer nodes" << endl; +} + +void Network::shut_down() { + int fd = sockfd; + sockfd = -1; + close(fd); + close(evfd); +} + +int Network::set_nonblocking(int fd) { + int flags = fcntl(fd, F_GETFL, 0); + if (flags == -1) { +// warn_syserror("set_nonblocking fcntl F_GETFL"); + return -1; + } + flags |= O_NONBLOCK; + if (fcntl(fd, F_SETFL, flags) == -1) { +// warn_syserror("set_nonblocking fcntl F_SETFL"); + return -1; + } + return 0; +} + +bool Network::repl(const string &cmd) { + auto tokens = tokenize(cmd); + if (tokens[0] == "connectall") { + connect_all(); + return true; + } else if (tokens[0] == "connect") { + if (tokens.size() > 1) { + return connect(tokens[1]); + } else { + return false; + } + } else if (tokens[0] == "isallconnected") { + return is_all_connected(); + } else if (tokens[0] == "recv") { + // to implement epoll! 
+ return false; + } else if (tokens[0] == "recvfrom") { + if (tokens.size() > 1) { + return recv_from(tokens[1]) > 0; + } else { + return false; + } + } + return false; +} + +map::iterator Network::find_node(const Node &e) { + static cmp_addr_equal eq; + map::iterator it; + if (e.name.empty()) { + auto is_eq = [&e](const pair &i){ return eq(i.first.addr, e.addr); }; + it = std::find_if(begin(peers), end(peers), is_eq); + } else { + it = peers.find(e); + } + return it; +} + + + +map::iterator Network::find_node(const int id) { + map::iterator it; + auto is_eq = [id](const pair &i){ return i.first.id == id; }; + it = std::find_if(peers.begin(), peers.end(), is_eq); + return it; +} + +Network *net; \ No newline at end of file diff --git a/systems/Raft-rs/driver/Network.h b/systems/Raft-rs/driver/Network.h new file mode 100644 index 0000000..b8f879f --- /dev/null +++ b/systems/Raft-rs/driver/Network.h @@ -0,0 +1,40 @@ +// +// Created by tangruize on 2/13/23. +// + +#ifndef REDISTMET_NETWORK_H +#define REDISTMET_NETWORK_H + +#include "common.h" +#include "Config.h" + +class Network { +protected: + map peers; // node (for connectionless socket) -> file descriptor (for connection-based socket) + Node self; + volatile int sockfd = -1; + int evfd = -1; // auto recv, not implemented! 
+ string recvbuffer; + [[nodiscard]] map::iterator find_node(const Node &e); +public: + Network(); + int load_config(); + bool add_node(const Node &peer); + virtual bool connect(const Node &n) { return true; } // do nothing for connectionless socket + virtual void connect_all() {} // do nothing for connectionless socket + virtual bool is_all_connected() { return true; } // do nothing for connectionless socket + virtual ssize_t send_to(const Node &peer, const string &data) = 0; + virtual ssize_t recv_from(const Node &peer, string &data) = 0; + ssize_t recv_from(const Node &peer) { return recv_from(peer, recvbuffer); } + void shut_down(); + ~Network() { shut_down(); } + static int set_nonblocking(int fd); + bool repl(const string &cmd); + string &get_recv_buffer() { return recvbuffer; } + + [[nodiscard]] map::iterator find_node(const int id) ; +}; + +extern Network *net; + +#endif //REDISTMET_NETWORK_H diff --git a/systems/Raft-rs/driver/Node.cpp b/systems/Raft-rs/driver/Node.cpp new file mode 100644 index 0000000..2138940 --- /dev/null +++ b/systems/Raft-rs/driver/Node.cpp @@ -0,0 +1,65 @@ +// +// Created by tangruize on 2/21/23. 
+// + +#include +#include "common.h" +#include "Node.h" + + +bool Node::operator<(const Node &b) const { +// if (!name.empty() && !b.name.empty()) { +// return *this < b.name; +// } else { +// return *this < b.addr; +// } + return *this < b.name; +} + +bool Node::operator<(const string &b) const { + return name < b; +} + +bool Node::operator<(const sockaddr_in &b) const { + const static cmp_addr_no_port_less al; + return al(addr, b); +} + +string Node::gethost() const { + return inet_ntoa(addr.sin_addr); +} + +string Node::getport() const { + return std::to_string(ntohs(addr.sin_port)); +} + +string Node::to_string() const { + if (!name.empty() && addr.sin_addr.s_addr != 0) + return "(" + name + ", " + gethost() + ":" + getport() + ")"; + else if (name.empty()) + return "(?UNKNOWN?, " + gethost() + ":" + getport() + ")"; + else + return "(" + name + ", ?UNKNOWN?)"; +} + +int Node::getid() { + if (id < 0) { + if (name.empty()) { + cerr_warning << "getid name is empty" << endl; + return id; + } + for (unsigned i = 0; i < name.size(); i++) { + if ('0' <= name[i] && name[i] <= '9') { + if (name.substr(0, i) != NODE_PREFIX) { + cerr_warning << "node name prefix is not \"" << NODE_PREFIX << "\"" << endl; + } + id = stoi(name.substr(i)); + break; + } + } + if (id < 0) { + cerr_warning << "cannot get node id" << endl; + } + } + return id; +} diff --git a/systems/Raft-rs/driver/Node.h b/systems/Raft-rs/driver/Node.h new file mode 100644 index 0000000..e58a510 --- /dev/null +++ b/systems/Raft-rs/driver/Node.h @@ -0,0 +1,38 @@ +// +// Created by tangruize on 2/21/23. 
+// + +#ifndef REDISTMET_NODE_H +#define REDISTMET_NODE_H + +#include +#include +#include +#include "common.h" + +using namespace std; + +#define NODE_PREFIX "n" + +struct Node { + string name; + struct sockaddr_in addr; + int id = -1; + void *data = nullptr; + Node() = default; + Node(string name_, const struct sockaddr_in &addr_): name{std::move(name_)}, addr{addr_} { getid(); } + Node(string name_): name{std::move(name_)}, addr{} { getid(); } + Node(const struct sockaddr_in &addr_): addr{addr_} {} + Node(int id_): addr{}, id{id_} { name = NODE_PREFIX + std::to_string(id); } + bool operator<(const struct Node& b) const; // less + bool operator<(const string& b) const; + bool operator<(const struct sockaddr_in& b) const; + [[nodiscard]] string gethost() const; + [[nodiscard]] string getport() const; + [[nodiscard]] string to_string() const; + int getid(); + void set_data(void *d) { data = d; } + [[nodiscard]] void* get_data() const { return data; } +}; + +#endif //REDISTMET_NODE_H diff --git a/systems/Raft-rs/driver/README.md b/systems/Raft-rs/driver/README.md new file mode 100644 index 0000000..e3404c5 --- /dev/null +++ b/systems/Raft-rs/driver/README.md @@ -0,0 +1,29 @@ +# RedisTMet + +目的:封装redis raft为可执行的程序。 + +## 网络 + +TCP网络: + +- 读取配置文件并建立连接 +- 使用字符串节点名作为发送方和接收方的标识(一个程序运行一个节点,仅需接收方) +- 消息开头为4字节网络字节序的包大小,后面为包内容 +- API: connect, recv, send, accept + +## Raft驱动 + +- 需要考虑超时设置 +- API封装:RV, RVR, AE, AER, timeout, recv entry, send ae + +## 用户界面 + +- 需要REPL来读取命令 +- 配置文件,节点名和地址 + +## 配置文件 + +格式: + +- node n1 1.1.1.1:9000 +- node n1 1.1.1.1 (默认9000) diff --git a/systems/Raft-rs/driver/Raft.cpp b/systems/Raft-rs/driver/Raft.cpp new file mode 100644 index 0000000..c6ad737 --- /dev/null +++ b/systems/Raft-rs/driver/Raft.cpp @@ -0,0 +1,202 @@ +// +// Created by tangruize on 2/14/23. +// +// WARN: we do not consider endian problem while serialization. +// WARN: we do not consider 32-bit or 64-bit memory. 
+ +#include +#include +#include "Raft.h" +#include "Network.h" + +extern "C" { + void* raftInit(); + void raftRecvMsg(void* Node, const char* data, int length); + void raftPeriodic(void* Node); + void raftClientOperation(void* Node, const char* data); + char* raftGet(void* Node, const char* data); + void raftBecomePreCandidate(void* Node); + void raftBecomeCandidate(void* Node); + void raftBecomeLeader(void* Node); + void raftCampaign(void* Node, const char* data); + void raftBcastHeartbeat(void* Node); + void raftBcastAppend(void* Node); + void raftAskSnap(void* Node); +} + + + +#define DATA_LEN 8 + +string state_machine = "init"; + +enum { + MSG_AE, + MSG_AER, + MSG_RV, + MSG_RVR +}; + + + +bool RaftInit() { // 所以是一次性将所有节点都初始化好? + void *me = raftInit(); + if (!me) + return false; + cerr_verbose << "Raft init, self id: " << config.get_self_node().getid() << endl; + config.set_raft_server(me); + return true; +} + + + +bool RaftRecvMsg(Node node) { + if (!config.get_node(node)) { + cerr_warning << "RaftRecvMsg cannot find node" << endl; + return false; + } + string data; + int length = net->recv_from(node, data); + if (length < 0) + return false; + + + + char * dst = new char[length + 1]; + std::memcpy(dst, data.data(), length); + // const char* cstr = data.c_str(); + // printf("in Raft.cpp raftrecvmsg : %s", cstr); + + raftRecvMsg(config.get_raft_server(), dst, length); + return true; +} + +// 这个操作是什么 +bool RaftPeriodic() { + // const char* cstr = time.c_str(); + auto * me = config.get_raft_server(); + raftPeriodic(me); + return true; +} + +bool RaftBecomePreCandidate(){ + auto * me = config.get_raft_server(); + raftBecomePreCandidate(me); + return true; +} + +bool RaftBecomeCandidate(){ + auto * me = config.get_raft_server(); + raftBecomeCandidate(me); + return true; +} + +bool RaftBecomeLeader(){ + auto * me = config.get_raft_server(); + raftBecomeLeader(me); + return true; +} + +bool RaftBcastHeartBeat(){ + auto * me = config.get_raft_server(); + 
raftBcastHeartbeat(me); + return true; +} + +bool RaftBcastAppend(){ + auto * me = config.get_raft_server(); + raftBcastAppend(me); + return true; +} + +bool RaftAskSnap(){ + auto * me = config.get_raft_server(); + raftAskSnap(me); + return true; +} + +bool RaftCampaign(string data){ + auto * me = config.get_raft_server(); + const char* cstr = data.c_str(); + raftCampaign(me, cstr); + return true; +} + + +bool RaftClientOperation(string data) { + auto *me = config.get_raft_server(); + const char* cstr = data.c_str(); + raftClientOperation(me, cstr); + return true; +} + + + + +bool RaftRepl(const string &cmd) { + auto tokens = tokenize(cmd); + if (tokens.empty()) { + return false; + } + if (tokens[0] == "init") { + return RaftInit(); + } else if (tokens[0] == "recvfrom") { + if (tokens.size() <= 1) + return false; + else + return RaftRecvMsg(tokens[1]); + } else if (tokens[0] == "BecomePrecandidate") { + RaftBecomePreCandidate(); + return true; + } else if (tokens[0] == "BecomeCandidate") { + RaftBecomeCandidate(); + return true; + } else if (tokens[0] == "campaign") { + if (tokens.size() <= 1) + return false; + else + return RaftCampaign(tokens[1]); + } else if (tokens[0] == "BecomeLeader") { + RaftBecomeLeader(); + return true; + } else if (tokens[0] == "heartBeat") { + RaftBcastHeartBeat(); + return true; + } else if (tokens[0] == "askSnap") { + RaftAskSnap(); + return true; + } else if (tokens[0] == "bcast_append") { + RaftBcastAppend(); + return true; + } else if (tokens[0] == "periodic") { + RaftPeriodic(); + return true; + } else if (tokens[0] == "cli") { + if (tokens.size() <= 1) + return false; + return RaftClientOperation(tokens[1]); + } else if (tokens[0] == "statemachine" ) { + cerr << state_machine << endl; + return true; + } + else { + return false; + } + return false; +} + +string RaftGet(const string &variable) { + auto *me = config.get_raft_server(); + const char* cstr = variable.c_str(); + string result = raftGet(me, cstr); + return result; +} + 
+// not implemented! +//void RaftAutoRun() { +// if (!RaftInit()) { +// cerr_warning << "cannot init" << endl; +// abort(); +// } +// +//} \ No newline at end of file diff --git a/systems/Raft-rs/driver/Raft.h b/systems/Raft-rs/driver/Raft.h new file mode 100644 index 0000000..189140e --- /dev/null +++ b/systems/Raft-rs/driver/Raft.h @@ -0,0 +1,15 @@ +// +// Created by tangruize on 2/14/23. +// + +#ifndef REDISTMET_RAFT_H +#define REDISTMET_RAFT_H + +#include "common.h" + +#include "world.h" + +bool RaftRepl(const string &cmd); +string RaftGet(const string &variable); + +#endif //REDISTMET_RAFT_H diff --git a/systems/Raft-rs/driver/Repl.cpp b/systems/Raft-rs/driver/Repl.cpp new file mode 100644 index 0000000..d8e23df --- /dev/null +++ b/systems/Raft-rs/driver/Repl.cpp @@ -0,0 +1,113 @@ +// +// Created by tangruize on 2/14/23. +// + +#include "Repl.h" +#include "common.h" +#include "Raft.h" +#include "Config.h" +#include "Network.h" + +#include +#include +#include + +void Repl::readline() { + char *line; + while ((line = ::readline(prompt)) != nullptr) { + string line_str(line); + bool ok = false, invalid = false; + if (line_str.empty() || line_str.starts_with("#")) + continue; + vector tokens = tokenize(line_str); + if (tokens.empty()) + continue; + string ack_data; + if (tokens[0] == "config") { + ok = config.read(stringify(tokens, 1)); + } else if (tokens[0] == "get") { + ack_data = RaftGet(stringify(tokens, 1)); + ok = !ack_data.empty(); + if (ok) + cerr << ack_data << endl; + } else if (tokens[0] == "raft") { + ok = RaftRepl(stringify(tokens, 1)); + } else if (tokens[0] == "net") { + ok = net->repl(stringify(tokens, 1)); + } + else { + cerr_warning << "Invalid command" << endl; + invalid = true; + } + if (!invalid) { + cerr << (ok ? ok_str : fail_str) << endl; + } + if (ack_data.empty()) + ack_data = ok ? 
"True" : "False"; + ack(ack_data); + free(line); + } +} + +Repl::Repl() { + if (!isatty(STDIN_FILENO)) { + prompt = nullptr; + ok_str = "[OK]"; + fail_str = "[FAIL]"; + } + check_interceptor_fd(); +} + +bool Repl::getinfo(const string &cmd) { + auto tokens = tokenize(cmd); + if (tokens.empty()) { + return false; + } + bool ok; + if (tokens[0] == "name") { + Node n(convert_addr(stringify(tokens, 1).c_str())); + ok = config.get_node(n); + if (ok) { + cout << n.name << endl; + } else { + cerr << "failed to find " << n.to_string() << endl; + } + } else if (tokens[0] == "addr") { + Node n(stringify(tokens, 1)); + ok = config.get_node(n); + if (ok) { + cout << addr_to_string(n.addr) << endl; + } + } else if (tokens[0] == "selfnode") { + Node n; + ok = config.get_self_node(n); + if (ok) { +// cout << n.name << ' ' << addr_to_string(n.addr) << endl; + cout << n.to_string() << endl; + } + } + return ok; +} + +void Repl::check_interceptor_fd() { + if (fcntl(interceptor_fd, F_GETFD) == -1 && errno == EBADF) { + interceptor_fd = -1; + cerr_verbose << "Repl ack is disabled" << endl; + } else { + cerr_verbose << "Repl ack is enabled" << endl; + } + +} + +void Repl::ack(const string &data) const { + if (interceptor_fd == -1) + return; + string to_send; + uint32_t size = data.size(); + to_send.resize(sizeof(uint32_t)); + *(uint32_t*)&to_send.front() = htonl(size); + to_send += data; + if (write(interceptor_fd, to_send.c_str(), to_send.size()) != ssize_t(to_send.size())) { + cerr_warning << "cannot ack cmd with data: " << data << endl; + } +} diff --git a/systems/Raft-rs/driver/Repl.h b/systems/Raft-rs/driver/Repl.h new file mode 100644 index 0000000..4cde896 --- /dev/null +++ b/systems/Raft-rs/driver/Repl.h @@ -0,0 +1,27 @@ +// +// Created by tangruize on 2/14/23. 
+// + +#ifndef REDISTMET_REPL_H +#define REDISTMET_REPL_H + +#include + +using namespace std; + +class Repl { +private: + const char *prompt = "\033[1;36m" "(REPL) " "\033[0m"; + const char *ok_str = "\033[1;32m" "[OK]" "\033[0m"; // bold green + const char *fail_str = "\033[1;31m" "[FAIL]" "\033[0m"; // bold red + int interceptor_fd = 1022; + void check_interceptor_fd(); + void ack(const string &data) const; +public: + Repl(); + void readline(); + bool getinfo(const string &cmd); +}; + + +#endif //REDISTMET_REPL_H diff --git a/systems/Raft-rs/driver/TcpNetwork.cpp b/systems/Raft-rs/driver/TcpNetwork.cpp new file mode 100644 index 0000000..c05ea7a --- /dev/null +++ b/systems/Raft-rs/driver/TcpNetwork.cpp @@ -0,0 +1,267 @@ +// +// Created by tangruize on 2/23/23. +// + +#include "TcpNetwork.h" +#include +#include + +extern "C" { +#include "tlpi/inet_sockets.h" +} + +TcpNetwork::TcpNetwork(bool run_accept): Network() { + if (self.name.empty()) + return; + addrlen = 0; + // 9 ? + sockfd = inetListen(self.getport().c_str(), 9, &addrlen); + if (sockfd == -1) { + throw_syserror("inetListen"); + } + cerr_verbose << "listening port " << self.getport() << endl; + // 绑定到本节点的 listening port 然后等待其他节点来连接即可 + if (set_nonblocking(sockfd) == -1) { + throw_syserror("set_nonblocking"); + } + if (run_accept) { + accept_in_background(); +// cerr_verbose << "running accept in background" << endl; + } +} + +bool TcpNetwork::is_connected(int fd) { + if (fd == -1) + return false; + char data; + ssize_t size = recv(fd, &data, 1, MSG_PEEK | MSG_DONTWAIT); + if (size == 0 || (size == -1 && errno != EAGAIN)) + return false; + return true; +} + +bool TcpNetwork::connect(const Node &n) { +// auto it = peers.find(n); + auto it = find_node(n); + if (it == peers.end()) { + cerr_warning << "connect: cannot find peer!" 
<< endl; + return false; + } + int fd = inetConnect(n.gethost().c_str(), n.getport().c_str(), SOCK_STREAM, 1); + if (fd == -1) { + warn_syserror("inetConnect"); + return false; + } + it->second = fd; + return true; +} + +void TcpNetwork::connect_all() { + for (auto &i: peers) { + // detect is still connected + if (i.second != -1) { + if (is_connected(i.second)) { + continue; + } + close(i.second); + i.second = -1; + } + // connect peers whose name > self.name. (thus accept connections that peer name < self.name) + if (i.first.name < self.name) + continue; + if (!connect(i.first)) { + cerr_warning << "failed to connect " << i.first.to_string() << endl; + } else { + cerr_verbose << "connected " << i.first.to_string() << endl; + } + } +} + +int TcpNetwork::accept() { + if (sockfd == -1) { + cerr_warning << "sockfd is -1" << endl; + return 0; + } + char claddr[addrlen]; + auto *sa = reinterpret_cast(claddr); + socklen_t alen = addrlen; + int cfd = ::accept(sockfd, sa, &alen); + if (cfd == -1) { + if (errno == EBADF) { + cerr_verbose << "sockfd is closed" << endl; + return 0; + } else if (errno == EAGAIN) { + return -1; + } else { + warn_syserror("accept"); + return -1; + } + } + switch (sa->sa_family) { + case AF_INET: + break; + case AF_INET6: + default: + throw_syserror("sa_family not implemented!"); + } + auto *sin = reinterpret_cast(sa); +// auto it = peers.find(*sin); + auto it = find_node(*sin); + if (it == peers.end()) { + char addr_str[IS_ADDR_STR_LEN]; + inetAddressStr(sa, alen, addr_str, IS_ADDR_STR_LEN); + cerr_warning << "not accepted: cannot find peer " << addr_str << " in config" << endl; + close(cfd); + return -1; + } + const struct timeval recv_timeout = { .tv_sec = 3, .tv_usec = 0}; + if (setsockopt(cfd, SOL_SOCKET, SO_RCVTIMEO, &recv_timeout, sizeof(recv_timeout)) == -1) + warn_syserror("accept setsockopt timeout: " + it->first.to_string()); + cerr_verbose << "accept " << it->first.to_string() << endl; + close(it->second); + it->second = cfd; + return 
cfd; +} + +void TcpNetwork::accept_in_background() { + cerr_verbose << "start accept thread" << endl; + thread t1([this] { accept_loop(); }); + t1.detach(); +} + +void TcpNetwork::accept_loop() { + struct epoll_event ev{}; + struct epoll_event events[1]; + int nfds, epoll_fd = epoll_create1(0); + if (epoll_fd == -1) { + throw_syserror("epoll_create"); + } + + ev.events = EPOLLIN | EPOLLET; // edge triggered + ev.data.fd = sockfd; + if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, sockfd, &ev) == -1) + throw_syserror("epoll_ctl"); + + while (true) { + nfds = epoll_wait(epoll_fd, events, 1, -1); + if (nfds == -1) { + if (errno == EINTR) { + cerr_warning << "epoll_wait is interrupted by signal" << endl; + continue; + } else + throw_syserror("epoll_wait"); + } + + for (int n = 0; n < nfds; ++n) { + if (events[n].data.fd == sockfd) { + while (accept() > 0) { // edge trigger, accept until no request to handle + ; + } + } + } + if (sockfd == -1) + return; + } +} + +ssize_t TcpNetwork::send_to(const Node &peer, const string &data) { +// auto it = peers.find(peer); + auto it = find_node(peer); + if (it == peers.end() || it->second < 0) { + cerr_warning << "cannot send to " << peer.to_string() << endl; + return -1; + } + string to_send; + uint32_t size = htonl(data.size()); + to_send.resize(sizeof(size)); + *(uint32_t *)&to_send.front() = size; + to_send += data; + + // cerr_verbose<<"in send_to to_send"<< to_send.c_str()<second, to_send.c_str(), to_send.size()); + + if (ret != ssize_t(data.size()) + 4) { + if (ret == -1) + warn_syserror("send_to"); + else { + cerr_warning << "partial send: " + to_string(ret) + "/" + to_string(to_send.size()); + } + } + return ret; +} + +ssize_t TcpNetwork::recv_from(const Node &peer, string &data) { +// auto it = peers.find(peer); + auto it = find_node(peer); + if (it == peers.end() || it->second < 0) { + cerr_warning << "cannot find node " << peer.to_string() << endl; + return -1; + } + uint32_t size; + ssize_t ret = recv(it->second, &size, 
sizeof(size), MSG_DONTWAIT | MSG_PEEK); + if (ret != 4) { + if (ret == -1) { + if (errno != EAGAIN) { + warn_syserror("TcpNetwork::recv_from"); + close(it->second); + it->second = -1; + } + } else { + cerr_warning << "recv_from cannot get size" << endl; + } + return -1; + } + if (recv(it->second, &size, sizeof(size), MSG_DONTWAIT) != 4) { // discard 4 bytes + cerr_warning << "recv_from failed to discard 4 bytes" << endl; + return -1; + } + size = ntohl(size); + if (size > 65535) { + cerr_warning << "recv_from: size is too big!" << endl; + abort(); + } + data.resize(size); + ret = recv(it->second, &data.front(), size, MSG_WAITALL); + if (ret == -1) { + warn_syserror("recv_from"); + } + return ret; +} + +bool all_of(const int* begin, const int* end, bool (*predicate)(int)) { + for (const int* p = begin; p != end; ++p) { + if (!predicate(*p)) { + return false; + } + } + return true; +} + +bool TcpNetwork::is_all_connected() { + return all_of(peers.cbegin(), peers.cend(), [](const pair&i) { + return i.second != -1 && is_connected(i.second); + }); +} + +// Library functions +extern "C" { + ssize_t send_to(int node_id, const char* data, int length) ; + ssize_t recv_from(int node_id, string &data); +} + + +ssize_t send_to(int node_id, const char* data, int length) { + + string str(data, length); + // cerr_verbose<<"in client tcpnetwork send to str"<< str<find_node(node_id); + return net->send_to(it->first, str); +} + + +ssize_t recv_from(int node_id, string &data) { + auto it = net->find_node(node_id); + return net->recv_from(it->first, data); +} + diff --git a/systems/Raft-rs/driver/TcpNetwork.h b/systems/Raft-rs/driver/TcpNetwork.h new file mode 100644 index 0000000..70f0b42 --- /dev/null +++ b/systems/Raft-rs/driver/TcpNetwork.h @@ -0,0 +1,29 @@ +// +// Created by tangruize on 2/23/23. 
+// + +#ifndef REDISTMET_TCPNETWORK_H +#define REDISTMET_TCPNETWORK_H + +#include "common.h" +#include "Network.h" + +class TcpNetwork: public Network { +private: + socklen_t addrlen; +public: + explicit TcpNetwork(bool run_accept=true); + int accept(); + void accept_loop(); + void accept_in_background(); + static bool is_connected(int fd); + bool connect(const Node &n) override; + void connect_all() override; + bool is_all_connected() override; + ssize_t send_to(const Node &peer, const string &data) override; + ssize_t recv_from(const Node &peer, string &data) override; +}; + + + +#endif //REDISTMET_TCPNETWORK_H diff --git a/systems/Raft-rs/driver/UdpNetwork.cpp b/systems/Raft-rs/driver/UdpNetwork.cpp new file mode 100644 index 0000000..890c098 --- /dev/null +++ b/systems/Raft-rs/driver/UdpNetwork.cpp @@ -0,0 +1,112 @@ +// +// Created by tangruize on 2/23/23. +// + +#include "UdpNetwork.h" +#include +#include + +extern "C" { +#include "tlpi/inet_sockets.h" +} + +UdpNetwork::UdpNetwork(bool run_background): Network() { + if (self.name.empty()) + return; + addrlen = 0; + // 这里为什么只传 port? 
+ sockfd = inetBind(self.getport().c_str(), SOCK_DGRAM, &addrlen); + if (sockfd == -1) { + throw_syserror("inetBind"); + } + cerr_verbose << "bind port " << self.getport() << endl; + if (set_nonblocking(sockfd) == -1) { + throw_syserror("set_nonblocking"); + } +// if (run_background) { +// udp_in_background(); +// // cerr_verbose << "running accept in background" << endl; +// } +} + +bool UdpNetwork::is_connected(int fd) { + if (fd == -1) + return false; + char data; + ssize_t size = recv(fd, &data, 1, MSG_PEEK | MSG_DONTWAIT); + if (size == 0 || (size == -1 && errno != EAGAIN)) + return false; + return true; +} + + +ssize_t UdpNetwork::send_to(const Node &peer, const string &data) { +// auto it = peers.find(peer); + auto it = find_node(peer); + if (it == peers.end() || it->second < 0) { + cerr_warning << "cannot send to " << peer.to_string() << endl; + return -1; + } + string to_send; + uint32_t size = htonl(data.size()); + to_send.resize(sizeof(size)); + *(uint32_t *)&to_send.front() = size; + to_send += data; + ssize_t ret = write(it->second, to_send.c_str(), to_send.size()); + if (ret != ssize_t(data.size()) + 4) { + if (ret == -1) + warn_syserror("send_to"); + else { + cerr_warning << "partial send: " + to_string(ret) + "/" + to_string(to_send.size()); + } + } + return ret; +} + +ssize_t UdpNetwork::recv_from(const Node &peer, string &data) { +// auto it = peers.find(peer); + auto it = find_node(peer); + if (it == peers.end() || it->second < 0) { + cerr_warning << "cannot find node " << peer.to_string() << endl; + return -1; + } + uint32_t size; + struct sockaddr_in cli_addr; + socklen_t addrlen = sizeof(struct sockaddr_in); + ssize_t ret = recvfrom(it->second, &size, sizeof(size), MSG_DONTWAIT | MSG_PEEK, (struct sockaddr* )&cli_addr, &addrlen); + if (ret != 4) { + if (ret == -1) { + warn_syserror("UdpNetwork::recv_from"); + close(it->second); + it->second = -1; + } else { + cerr_warning << "recv_from cannot get size" << endl; + } + return -1; + } + if 
(recvfrom(it->second, &size, sizeof(size), MSG_DONTWAIT, (struct sockaddr* )&cli_addr, &addrlen) != 4) { // discard 4 bytes + cerr_warning << "recv_from failed to discard 4 bytes" << endl; + return -1; + } + size = ntohl(size); + if (size > 65535) { + cerr_warning << "recv_from: size is too big!" << endl; + abort(); + } + data.resize(size); + ret = recvfrom(it->second, &data.front(), size, MSG_WAITALL, (struct sockaddr* )&cli_addr, &addrlen); + if (ret == -1) { + warn_syserror("recv_from"); + } + return ret; +} + +bool all_of(const int* begin, const int* end, bool (*predicate)(int)) { + for (const int* p = begin; p != end; ++p) { + if (!predicate(*p)) { + return false; + } + } + return true; +} + diff --git a/systems/Raft-rs/driver/UdpNetwork.h b/systems/Raft-rs/driver/UdpNetwork.h new file mode 100644 index 0000000..0e23ed0 --- /dev/null +++ b/systems/Raft-rs/driver/UdpNetwork.h @@ -0,0 +1,32 @@ +// +// Created by tangruize on 2/23/23. +// + +#ifndef REDISTMET_UDPNETWORK_H +#define REDISTMET_UDPNETWORK_H + +#include "common.h" +#include "Network.h" + +class UdpNetwork: public Network { +private: + socklen_t addrlen; +public: + explicit UdpNetwork(bool run_accept=true); + int accept(); + void udp_loop(); + void udp_in_background(); + static bool is_connected(int fd); + bool connect(const Node &n) override; + void connect_all() override; + bool is_all_connected() override; + ssize_t send_to(const Node &peer, const string &data) override; + ssize_t recv_from(const Node &peer, string &data) override; +}; + +ssize_t send_to(const Node &peer, const string &data); + +ssize_t recv_from(const Node &peer, string &data); + + +#endif //REDISTMET_TCPNETWORK_H diff --git a/systems/Raft-rs/driver/cmake/FindReadline.cmake b/systems/Raft-rs/driver/cmake/FindReadline.cmake new file mode 100644 index 0000000..d1a4211 --- /dev/null +++ b/systems/Raft-rs/driver/cmake/FindReadline.cmake @@ -0,0 +1,14 @@ +find_path(Readline_INCLUDE_DIR + NAMES readline/readline.h) + 
+find_library(Readline_LIBRARY + NAMES readline) + +mark_as_advanced( + Readline_INCLUDE_DIR + Readline_LIBRARY +) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Readline + DEFAULT_MSG Readline_INCLUDE_DIR Readline_LIBRARY) diff --git a/systems/Raft-rs/driver/common.cpp b/systems/Raft-rs/driver/common.cpp new file mode 100644 index 0000000..d8690fb --- /dev/null +++ b/systems/Raft-rs/driver/common.cpp @@ -0,0 +1,87 @@ +// +// Created by tangruize on 2/14/23. +// + +#include "common.h" + +string info_str = "[INFO] "; +string warn_str = "[WARN] "; +string detl_str = "[DETL] "; + +bool my_help; +bool my_verbose; +bool my_detail; +string my_config; +string my_name; + +// convert ADDR (format: xxx.xxx.xxx.xxx:port) +struct sockaddr_in convert_addr(const char *addr, char delim) { + struct sockaddr_in res{}; + stringstream ss(addr); + string ip, port; + getline(ss, ip, delim); + getline(ss, port, delim); + inet_aton(ip.c_str(), &res.sin_addr); + if (!port.empty()) { + if (delim == ':') + res.sin_port = htons(stoi(port)); + else { + res.sin_port = stoi(port); + unsigned mask = unsigned(-1) << (32 - res.sin_port); + res.sin_addr.s_addr = htonl(ntohl(res.sin_addr.s_addr) & mask); + }; + } else { + res.sin_port = (in_port_t)-1; + } + return res; +} + +void init_prompt_color() { + if (isatty(STDERR_FILENO) && !info_str.empty() && info_str[0] != '\033') { + info_str = "\033[1;32m" + info_str + "\033[0m"; // bold green + warn_str = "\033[1;31m" + warn_str + "\033[0m"; // bold red + detl_str = "\033[1;34m" + detl_str + "\033[0m"; // bold blue + } +} + +vector tokenize(const string &s) { + vector tokens; + string token; + istringstream ss(s); + while (getline(ss, token, ' ')) { + if (token.empty()) + continue; + tokens.push_back(token); + } + return tokens; +} + +string stringify(const vector &t, size_t start, const string &delim) { + string result; + for (size_t i = 0; i < t.size(); i++) { + if (i > start) + result += delim + t[i]; + else if (i == 
start) + result += t[i]; + } + return result; +} + +bool cmp_addr_less::operator()(const sockaddr_in &a, const sockaddr_in &b) const { + if (a.sin_addr.s_addr == b.sin_addr.s_addr) + return a.sin_port < b.sin_port; + return a.sin_addr.s_addr < b.sin_addr.s_addr; +} + +bool cmp_addr_equal::operator()(const sockaddr_in &a, const sockaddr_in &b) const { +// return a.sin_addr.s_addr == b.sin_addr.s_addr && a.sin_port == b.sin_port; + return a.sin_addr.s_addr == b.sin_addr.s_addr; +} + +bool cmp_addr_no_port_less::operator()(const sockaddr_in &a, const sockaddr_in &b) const { +// if (a.sin_addr.s_addr == b.sin_addr.s_addr) +// if (a.sin_port != (in_port_t)-1 && b.sin_port != (in_port_t)-1) +// return a.sin_port < b.sin_port; + return a.sin_addr.s_addr < b.sin_addr.s_addr; +} + diff --git a/systems/Raft-rs/driver/common.h b/systems/Raft-rs/driver/common.h new file mode 100644 index 0000000..8f97f03 --- /dev/null +++ b/systems/Raft-rs/driver/common.h @@ -0,0 +1,79 @@ +// +// Created by tangruize on 2/14/23. +// + +#ifndef REDISTMET_COMMON_H +#define REDISTMET_COMMON_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//#include "Config.h" + +using namespace std; +using namespace gflags; + +extern string info_str; +extern string warn_str; +extern string detl_str; + +extern bool my_help; +extern bool my_verbose; +extern bool my_detail; +extern string my_config; +extern string my_name; + +DECLARE_bool(verbose); +DECLARE_bool(detail); +DECLARE_string(name); + +#define cerr_verbose if (my_verbose) cerr << info_str +#define cerr_verbose_cont if (my_verbose) cerr +#define cerr_detail if (my_detail) cerr << detl_str +#define cerr_detail_cont if (my_detail) cerr +#define cerr_warning cerr << warn_str +#define cerr_warning_cont cerr + +#define SHORT_FILENAME (strrchr(__FILE__, '/') ? 
strrchr(__FILE__, '/') + 1 : __FILE__) +#define SYSERROR(info) std::system_error(errno, std::generic_category(), \ + std::string(info) + ": " + std::string(SHORT_FILENAME) + ":" + std::to_string(__LINE__)) +#define throw_syserror(info) throw SYSERROR(info) +#define warn_syserror(info) cerr_warning << SYSERROR(info).what() << endl + +#define addr_to_string_delim(addr, delim) (string(inet_ntoa((addr).sin_addr)) + (((addr).sin_port != 0) ? ((delim) + \ + to_string(((delim) == ':') ? ntohs((addr).sin_port) : (addr).sin_port)) : "")) +#define addr_to_string(addr) addr_to_string_delim(addr, ':') + +void init_prompt_color(); + +// a < b +struct cmp_addr_less { + bool operator()(const struct sockaddr_in& a, const struct sockaddr_in& b) const; +}; + +struct cmp_addr_no_port_less { + bool operator()(const struct sockaddr_in& a, const struct sockaddr_in& b) const; +}; + +// a == b +struct cmp_addr_equal { + bool operator()(const struct sockaddr_in& a, const struct sockaddr_in& b) const; +}; + +struct sockaddr_in convert_addr(const char *addr, char delim = ':'); + +vector tokenize(const string &s); +string stringify(const vector &t, size_t start=0, const string &delim=" "); + +#endif //REDISTMET_COMMON_H diff --git a/systems/Raft-rs/driver/main.cpp b/systems/Raft-rs/driver/main.cpp new file mode 100644 index 0000000..e168614 --- /dev/null +++ b/systems/Raft-rs/driver/main.cpp @@ -0,0 +1,38 @@ +#include "common.h" +#include "Repl.h" +#include "TcpNetwork.h" + +DECLARE_bool(help); +DEFINE_bool(verbose, false, "Show TMET information"); +DEFINE_bool(detail, false, "Show Raft information"); +DEFINE_string(config, "", "Config file"); +DEFINE_string(name, "", "Self name"); + +void setup(int argc, char **argv) { + // Parse arguments and show help + SetUsageMessage("Redis TMET node program"); + ParseCommandLineNonHelpFlags(&argc, &argv, true); + if (FLAGS_help || argc != 1) { + ShowUsageWithFlagsRestrict(argv[0], "main"); + exit(1); + } else if (FLAGS_detail) { + FLAGS_verbose = true; 
+ } + + init_prompt_color(); + my_help = FLAGS_detail; + my_verbose = FLAGS_verbose; + my_detail = FLAGS_detail; + my_config = FLAGS_config; + my_name = FLAGS_name; +} + +int main(int argc, char **argv) { + setup(argc, argv); + if (!FLAGS_config.empty()) { + config.load(FLAGS_config); + } + net = new TcpNetwork(); + Repl repl; + repl.readline(); +} diff --git a/systems/Raft-rs/driver/rust_so/Cargo.lock b/systems/Raft-rs/driver/rust_so/Cargo.lock new file mode 100644 index 0000000..df7075f --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/Cargo.lock @@ -0,0 +1,1131 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + +[[package]] +name = "arc-swap" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi 0.1.19", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "autotools" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ef941527c41b0fc0dd48511a8154cd5fc7e29200a0ff8b7203c5d777dbc795cf" +dependencies = [ + "cc", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" + +[[package]] +name = "bumpalo" +version = "3.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cc" +version = "1.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0fc897dc1e865cc67c0e05a836d9d3f1df3cbe442aa4a9473b18e12624a4951" +dependencies = [ + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clap" +version = "2.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" +dependencies = [ + "ansi_term", + "atty", + "bitflags 1.3.2", + "strsim", + "textwrap", + 
"unicode-width", + "vec_map", +] + +[[package]] +name = "criterion" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f" +dependencies = [ + "atty", + "cast", + "clap", + "criterion-plot", + "csv", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "csv" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] 
+name = "csv-core" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" +dependencies = [ + "memchr", +] + +[[package]] +name = "deranged" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c9e6a11ca8224451684bc0d7d5a7adbf8f2fd6887261a1cfc3c0432f9d4068e" +dependencies = [ + "powerfmt", +] + +[[package]] +name = "dirs-next" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" +dependencies = [ + "cfg-if", + "dirs-sys-next", +] + +[[package]] +name = "dirs-sys-next" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getrandom" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "getset" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3586f256131df87204eb733da72e3d3eb4f343c639f4b7be279ac7c48baeafe" +dependencies = [ + "proc-macro-error2", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "half" +version = "1.8.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b43ede17f21864e81be2fa654110bf1e793774238d86ef8555c37e6519c0403" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "hermit-abi" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f154ce46856750ed433c8649605bf7ed2de3bc35fd9d2a9f30cddd873c80cb08" + +[[package]] +name = "is-terminal" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" +dependencies = [ + "hermit-abi 0.5.1", + "libc", + "windows-sys", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "js-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libc" +version = "0.2.172" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" + +[[package]] +name = "libredox" +version = "0.1.3" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" +dependencies = [ + "bitflags 2.9.1", + "libc", +] + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro-error-attr2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96de42df36bb9bba5542fe9f1a054b8cc87e172759a1868aa05c1f3acc89dfc5" +dependencies = [ + "proc-macro2", + "quote", +] + +[[package]] +name = "proc-macro-error2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11ec05c52be0a07b08061f7dd003e7d7092e0472bc731b4af7bb1ef876109802" +dependencies = [ + "proc-macro-error-attr2", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "protobuf" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" +dependencies = [ + "bytes", +] + +[[package]] +name = "protobuf-build" +version = "0.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c852d9625b912c3e50480cdc701f60f49890b5d7ad46198dd583600f15e7c6ec" +dependencies = [ + "bitflags 1.3.2", + "protobuf", + "protobuf-codegen", + "protobuf-src", + "regex", +] + +[[package]] +name = "protobuf-codegen" 
+version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "033460afb75cf755fcfc16dfaed20b86468082a2ea24e05ac35ab4a099a017d6" +dependencies = [ + "protobuf", +] + +[[package]] +name = "protobuf-src" +version = "1.1.0+21.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7ac8852baeb3cc6fb83b93646fb93c0ffe5d14bf138c945ceb4b9948ee0e3c1" +dependencies = [ + "autotools", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "raft" +version = "0.7.0" +dependencies = [ + "bytes", + "fxhash", + "getset", + "protobuf", + "raft-proto", + "rand", + "slog", + "slog-envlogger", + "slog-stdlog", + "slog-term", + "thiserror", +] + +[[package]] +name = "raft-proto" +version = "0.7.0" +dependencies = [ + "bytes", + "protobuf", + "protobuf-build", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + 
"rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "redox_users" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" +dependencies = [ + "getrandom", + "libredox", + "thiserror", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "rust_so" +version = "0.1.0" +dependencies = [ + "cc", + "clap", + "criterion", + "libc", + "protobuf", + "raft", + "rand", + "regex", + "slog", + "slog-async", + "slog-envlogger", + "slog-stdlog", + "slog-term", +] + +[[package]] +name = "rustversion" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "same-file" +version = "1.0.6" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_cbor" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" +dependencies = [ + "half", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.140" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "slog" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8347046d4ebd943127157b94d63abb990fcf729dc4e9978927fdf4ac3c998d06" + +[[package]] +name = "slog-async" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72c8038f898a2c79507940990f05386455b3a317d8f18d4caea7cbc3d5096b84" +dependencies = [ + "crossbeam-channel", + "slog", + "take_mut", + "thread_local", +] + +[[package]] +name = "slog-envlogger" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "906a1a0bc43fed692df4b82a5e2fbfc3733db8dad8bb514ab27a4f23ad04f5c0" +dependencies = [ + "log", + "regex", + "slog", + "slog-async", + "slog-scope", + "slog-stdlog", + "slog-term", +] + +[[package]] +name = "slog-scope" +version = "4.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f95a4b4c3274cd2869549da82b57ccc930859bdbf5bcea0424bc5f140b3c786" +dependencies = [ + "arc-swap", + "lazy_static", + "slog", +] + +[[package]] +name = "slog-stdlog" +version = "4.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6706b2ace5bbae7291d3f8d2473e2bfab073ccd7d03670946197aec98471fa3e" +dependencies = [ + "log", + "slog", + "slog-scope", +] + +[[package]] +name = "slog-term" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6e022d0b998abfe5c3782c1f03551a596269450ccd677ea51c56f8b214610e8" +dependencies = [ + "is-terminal", + "slog", + "term", + "thread_local", + "time", +] + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + +[[package]] +name = "syn" +version = "2.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "take_mut" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60" + +[[package]] +name = "term" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f" +dependencies = [ + "dirs-next", + "rustversion", + "winapi", +] + +[[package]] +name = "textwrap" +version = "0.11.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" +dependencies = [ + "cfg-if", + "once_cell", +] + +[[package]] +name = "time" +version = "0.3.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7619e19bc266e0f9c5e6686659d394bc57973859340060a69221e57dbc0c40" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9e9a38711f559d9e3ce1cdb06dd7c5b8ea546bc90052da6d06bb76da74bb07c" + +[[package]] +name = "time-macros" +version = "0.2.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3526739392ec93fd8b359c8e98514cb3e8e021beb4e5f597b00a0221f8ed8a49" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.100" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = 
"0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "zerocopy" +version = "0.8.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/systems/Raft-rs/driver/rust_so/Cargo.toml b/systems/Raft-rs/driver/rust_so/Cargo.toml new file mode 100644 index 0000000..11a6138 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "rust_so" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + + +[lib] +name = "world" +crate-type = ["cdylib"] + +[build-dependencies] +cc = "1.0" +libc = "0.2" + +[profile.dev] +rpath = true + +[dependencies] +clap = "2.33.0" +raft = { path = "./raft-rs" } +libc = "0.2" +rand = "0.8" +slog = "2.2" +slog-envlogger = { version = "2.1.0", optional = true } +slog-stdlog = { version = "4", optional = true } +slog-term = { version = "2.6.0" } +criterion = "0.3" +regex = "1" +slog-async = "2.1.0" +protobuf = "2" diff --git a/systems/Raft-rs/driver/rust_so/Makefile b/systems/Raft-rs/driver/rust_so/Makefile new file mode 100644 index 0000000..e683c8a --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/Makefile @@ -0,0 +1,5 @@ +target/debug/libworld.so: src/lib.rs + env RUSTFLAGS="-L $(CURDIR)/../../../../cmake-build-debug/rsDriver" cargo build + +clean: + cargo clean diff --git a/systems/Raft-rs/driver/rust_so/include/world.h b/systems/Raft-rs/driver/rust_so/include/world.h new file mode 100644 index 0000000..9061f98 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/include/world.h @@ -0,0 
+1,22 @@ + +#ifndef WORLD_H +#define WORLD_H + +extern "C"{ + void* raftInit(); + void raftRecvMsg(void* Node, const char* id, int length); + void raftPeriodic(void* Node); + void raftClientOperation(void* Node, const char* data); + char* raftGet(void* Node, const char* data); + void raftBecomePreCandidate(void* Node); + void raftBecomeCandidate(void* Node); + void raftBecomeLeader(void* Node); + void raftCampaign(void* Node, const char* data); + void raftCompact(void* Node); + void raftAskSnap(void* Node); + void raftBcastHeartbeat(void* Node); + void raftBcastAppend(void* Node); +} + + +#endif \ No newline at end of file diff --git a/systems/Raft-rs/driver/rust_so/proto/Cargo.toml b/systems/Raft-rs/driver/rust_so/proto/Cargo.toml new file mode 100644 index 0000000..1fd820e --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/proto/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "raft-proto" +version = "0.7.0" +authors = ["The TiKV Project Developers"] +edition = "2021" +license = "Apache-2.0" +keywords = ["raft", "distributed-systems", "ha"] +repository = "https://github.com/pingcap/raft-rs" +homepage = "https://github.com/pingcap/raft-rs" +documentation = "https://docs.rs/raft-proto" +description = "Protocol definitions for the rust language implementation of the Raft algorithm." 
+categories = ["algorithms", "database-implementations"] +build = "build.rs" + +[features] +default = ["protobuf-codec"] +protobuf-codec = ["protobuf-build/protobuf-codec", "bytes", "protobuf/bytes"] +prost-codec = ["protobuf-build/prost-codec", "prost", "lazy_static"] + +[build-dependencies] +protobuf-build = { version = "0.14", default-features = false } + +[dependencies] +bytes = { version = "1", optional = true } +lazy_static = { version = "1", optional = true } +prost = { version = "0.11", optional = true } +protobuf = "2" diff --git a/systems/Raft-rs/driver/rust_so/proto/README.md b/systems/Raft-rs/driver/rust_so/proto/README.md new file mode 100644 index 0000000..d9a51dc --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/proto/README.md @@ -0,0 +1,8 @@ +# Raft Proto + +[![Documentation](https://docs.rs/raft-proto/badge.svg)](https://docs.rs/raft-proto/) +[![Crates.io](https://img.shields.io/crates/v/raft-proto.svg)](https://crates.io/crates/raft-proto) + +This crate contains the protobuf structs used by raft. + +You can find the generated `eraftpb.rs` in `target/debug/build/raft-proto-***/out`. \ No newline at end of file diff --git a/systems/Raft-rs/driver/rust_so/proto/proto/eraftpb.proto b/systems/Raft-rs/driver/rust_so/proto/proto/eraftpb.proto new file mode 100644 index 0000000..dcd285d --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/proto/proto/eraftpb.proto @@ -0,0 +1,196 @@ +syntax = "proto3"; +package eraftpb; + +import "rustproto.proto"; +option (rustproto.carllerche_bytes_for_bytes_all) = true; + +enum EntryType { + EntryNormal = 0; + EntryConfChange = 1; + EntryConfChangeV2 = 2; +} + +// The entry is a type of change that needs to be applied. It contains two data fields. +// While the fields are built into the model; their usage is determined by the entry_type. +// +// For normal entries, the data field should contain the data change that should be applied. 
+// The context field can be used for any contextual data that might be relevant to the +// application of the data. +// +// For configuration changes, the data will contain the ConfChange message and the +// context will provide anything needed to assist the configuration change. The context +// if for the user to set and use in this case. +message Entry { + EntryType entry_type = 1; + uint64 term = 2; + uint64 index = 3; + bytes data = 4; + bytes context = 6; + + // Deprecated! It is kept for backward compatibility. + // TODO: remove it in the next major release. + bool sync_log = 5; +} + +message SnapshotMetadata { + // The current `ConfState`. + ConfState conf_state = 1; + // The applied index. + uint64 index = 2; + // The term of the applied index. + uint64 term = 3; +} + +message Snapshot { + bytes data = 1; + SnapshotMetadata metadata = 2; +} + +enum MessageType { + MsgHup = 0; + MsgBeat = 1; + MsgPropose = 2; + MsgAppend = 3; + MsgAppendResponse = 4; + MsgRequestVote = 5; + MsgRequestVoteResponse = 6; + MsgSnapshot = 7; + MsgHeartbeat = 8; + MsgHeartbeatResponse = 9; + MsgUnreachable = 10; + MsgSnapStatus = 11; + MsgCheckQuorum = 12; + MsgTransferLeader = 13; + MsgTimeoutNow = 14; + MsgReadIndex = 15; + MsgReadIndexResp = 16; + MsgRequestPreVote = 17; + MsgRequestPreVoteResponse = 18; +} + +message Message { + MessageType msg_type = 1; + uint64 to = 2; + uint64 from = 3; + uint64 term = 4; + // logTerm is generally used for appending Raft logs to followers. For example, + // (type=MsgAppend,index=100,log_term=5) means leader appends entries starting at + // index=101, and the term of entry at index 100 is 5. + // (type=MsgAppendResponse,reject=true,index=100,log_term=5) means follower rejects some + // entries from its leader as it already has an entry with term 5 at index 100. 
+ uint64 log_term = 5; + uint64 index = 6; + repeated Entry entries = 7; + uint64 commit = 8; + uint64 commit_term = 15; + Snapshot snapshot = 9; + uint64 request_snapshot = 13; + bool reject = 10; + uint64 reject_hint = 11; + bytes context = 12; + uint64 deprecated_priority = 14; + // If this new field is not set, then use the above old field; otherwise + // use the new field. When broadcasting request vote, both fields are + // set if the priority is larger than 0. This change is not a fully + // compatible change, but it makes minimal impact that only new priority + // is not recognized by the old nodes during rolling update. + int64 priority = 16; +} + +message HardState { + uint64 term = 1; + uint64 vote = 2; + uint64 commit = 3; +} + +enum ConfChangeTransition { + // Automatically use the simple protocol if possible, otherwise fall back + // to ConfChangeType::Implicit. Most applications will want to use this. + Auto = 0; + // Use joint consensus unconditionally, and transition out of them + // automatically (by proposing a zero configuration change). + // + // This option is suitable for applications that want to minimize the time + // spent in the joint configuration and do not store the joint configuration + // in the state machine (outside of InitialState). + Implicit = 1; + // Use joint consensus and remain in the joint configuration until the + // application proposes a no-op configuration change. This is suitable for + // applications that want to explicitly control the transitions, for example + // to use a custom payload (via the Context field). + Explicit = 2; +} + +message ConfState { + repeated uint64 voters = 1; + repeated uint64 learners = 2; + + // The voters in the outgoing config. If not empty the node is in joint consensus. + repeated uint64 voters_outgoing = 3; + // The nodes that will become learners when the outgoing config is removed. 
+ // These nodes are necessarily currently in nodes_joint (or they would have + // been added to the incoming config right away). + repeated uint64 learners_next = 4; + // If set, the config is joint and Raft will automatically transition into + // the final config (i.e. remove the outgoing config) when this is safe. + bool auto_leave = 5; +} + +enum ConfChangeType { + AddNode = 0; + RemoveNode = 1; + AddLearnerNode = 2; +} + +message ConfChange { + ConfChangeType change_type = 2; + uint64 node_id = 3; + bytes context = 4; + + uint64 id = 1; +} + +// ConfChangeSingle is an individual configuration change operation. Multiple +// such operations can be carried out atomically via a ConfChangeV2. +message ConfChangeSingle { + ConfChangeType change_type = 1; + uint64 node_id = 2; +} + +// ConfChangeV2 messages initiate configuration changes. They support both the +// simple "one at a time" membership change protocol and full Joint Consensus +// allowing for arbitrary changes in membership. +// +// The supplied context is treated as an opaque payload and can be used to +// attach an action on the state machine to the application of the config change +// proposal. Note that contrary to Joint Consensus as outlined in the Raft +// paper[1], configuration changes become active when they are *applied* to the +// state machine (not when they are appended to the log). +// +// The simple protocol can be used whenever only a single change is made. +// +// Non-simple changes require the use of Joint Consensus, for which two +// configuration changes are run. The first configuration change specifies the +// desired changes and transitions the Raft group into the joint configuration, +// in which quorum requires a majority of both the pre-changes and post-changes +// configuration. Joint Consensus avoids entering fragile intermediate +// configurations that could compromise survivability. 
For example, without the +// use of Joint Consensus and running across three availability zones with a +// replication factor of three, it is not possible to replace a voter without +// entering an intermediate configuration that does not survive the outage of +// one availability zone. +// +// The provided ConfChangeTransition specifies how (and whether) Joint Consensus +// is used, and assigns the task of leaving the joint configuration either to +// Raft or the application. Leaving the joint configuration is accomplished by +// proposing a ConfChangeV2 with only and optionally the Context field +// populated. +// +// For details on Raft membership changes, see: +// +// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf +message ConfChangeV2 { + ConfChangeTransition transition = 1; + repeated ConfChangeSingle changes = 2; + bytes context = 3; +} diff --git a/systems/Raft-rs/driver/rust_so/proto/src/confchange.rs b/systems/Raft-rs/driver/rust_so/proto/src/confchange.rs new file mode 100644 index 0000000..23b81fe --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/proto/src/confchange.rs @@ -0,0 +1,151 @@ +// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. + +use crate::eraftpb::{ + ConfChange, ConfChangeSingle, ConfChangeTransition, ConfChangeType, ConfChangeV2, +}; +use std::borrow::Cow; +use std::fmt::Write; + +/// Creates a `ConfChangeSingle`. +pub fn new_conf_change_single(node_id: u64, ty: ConfChangeType) -> ConfChangeSingle { + let mut single = ConfChangeSingle::default(); + single.node_id = node_id; + single.set_change_type(ty); + single +} + +/// Parses a Space-delimited sequence of operations into a slice of ConfChangeSingle. 
+/// The supported operations are: +/// - vn: make n a voter, +/// - ln: make n a learner, +/// - rn: remove n +pub fn parse_conf_change(s: &str) -> Result<Vec<ConfChangeSingle>, String> { + let s = s.trim(); + if s.is_empty() { + return Ok(vec![]); + } + let mut ccs = vec![]; + let splits = s.split_ascii_whitespace(); + for tok in splits { + if tok.len() < 2 { + return Err(format!("unknown token {}", tok)); + } + let mut cc = ConfChangeSingle::default(); + let mut chars = tok.chars(); + cc.set_change_type(match chars.next().unwrap() { + 'v' => ConfChangeType::AddNode, + 'l' => ConfChangeType::AddLearnerNode, + 'r' => ConfChangeType::RemoveNode, + _ => return Err(format!("unknown token {}", tok)), + }); + cc.node_id = match chars.as_str().parse() { + Ok(id) => id, + Err(e) => return Err(format!("parse token {} fail: {}", tok, e)), + }; + ccs.push(cc); + } + Ok(ccs) +} + +/// The inverse to `parse_conf_change`. +pub fn stringify_conf_change(ccs: &[ConfChangeSingle]) -> String { + let mut s = String::new(); + for (i, cc) in ccs.iter().enumerate() { + if i > 0 { + s.push(' '); + } + match cc.get_change_type() { + ConfChangeType::AddNode => s.push('v'), + ConfChangeType::AddLearnerNode => s.push('l'), + ConfChangeType::RemoveNode => s.push('r'), + } + write!(&mut s, "{}", cc.node_id).unwrap(); + } + s +} + +/// Abstracts over ConfChangeV2 and (legacy) ConfChange to allow +/// treating them in a unified manner. +pub trait ConfChangeI { + /// Converts conf change to `ConfChangeV2`. + fn into_v2(self) -> ConfChangeV2; + + /// Gets conf change as `ConfChangeV2`. + fn as_v2(&self) -> Cow<ConfChangeV2>; + + /// Converts conf change to `ConfChange`. + /// + /// `ConfChangeV2` can't be changed back to `ConfChange`. 
+ fn as_v1(&self) -> Option<&ConfChange>; +} + +impl ConfChangeI for ConfChange { + #[inline] + fn into_v2(mut self) -> ConfChangeV2 { + let mut cc = ConfChangeV2::default(); + let single = new_conf_change_single(self.node_id, self.get_change_type()); + cc.mut_changes().push(single); + cc.set_context(self.take_context()); + cc + } + + #[inline] + fn as_v2(&self) -> Cow<ConfChangeV2> { + Cow::Owned(self.clone().into_v2()) + } + + #[inline] + fn as_v1(&self) -> Option<&ConfChange> { + Some(self) + } +} + +impl ConfChangeI for ConfChangeV2 { + #[inline] + fn into_v2(self) -> ConfChangeV2 { + self + } + + #[inline] + fn as_v2(&self) -> Cow<ConfChangeV2> { + Cow::Borrowed(self) + } + + #[inline] + fn as_v1(&self) -> Option<&ConfChange> { + None + } +} + +impl ConfChangeV2 { + /// Checks if uses Joint Consensus. + /// + /// It will return Some if and only if this config change will use Joint Consensus, + /// which is the case if it contains more than one change or if the use of Joint + /// Consensus was requested explicitly. The bool indicates whether the Joint State + /// will be left automatically. + pub fn enter_joint(&self) -> Option<bool> { + // NB: in theory, more config changes could qualify for the "simple" + // protocol but it depends on the config on top of which the changes apply. + // For example, adding two learners is not OK if both nodes are part of the + // base config (i.e. two voters are turned into learners in the process of + // applying the conf change). In practice, these distinctions should not + // matter, so we keep it simple and use Joint Consensus liberally. + if self.get_transition() != ConfChangeTransition::Auto || self.changes.len() > 1 { + match self.get_transition() { + ConfChangeTransition::Auto | ConfChangeTransition::Implicit => Some(true), + ConfChangeTransition::Explicit => Some(false), + } + } else { + None + } + } + + /// Checks if the configuration change leaves a joint configuration. 
+ /// + /// This is the case if the ConfChangeV2 is zero, with the possible exception of + /// the Context field. + pub fn leave_joint(&self) -> bool { + self.get_transition() == ConfChangeTransition::Auto && self.changes.is_empty() + } +} diff --git a/systems/Raft-rs/driver/rust_so/proto/src/confstate.rs b/systems/Raft-rs/driver/rust_so/proto/src/confstate.rs new file mode 100644 index 0000000..559bbd2 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/proto/src/confstate.rs @@ -0,0 +1,40 @@ +// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. + +use crate::eraftpb::ConfState; + +fn eq_without_order(lhs: &[u64], rhs: &[u64]) -> bool { + for l in lhs { + if !rhs.contains(l) { + return false; + } + } + for r in rhs { + if !lhs.contains(r) { + return false; + } + } + true +} + +// Returns true if the inputs describe the same configuration. +#[must_use] +pub fn conf_state_eq(lhs: &ConfState, rhs: &ConfState) -> bool { + // The orders are different only when hash algorithm or insert orders are + // different. In most case, only one hash algorithm is used. Insert orders + // should be the same due to the raft protocol. So in most case, they can + // be compared directly. 
+ if lhs.get_voters() == rhs.get_voters() + && lhs.get_learners() == rhs.get_learners() + && lhs.get_voters_outgoing() == rhs.get_voters_outgoing() + && lhs.get_learners_next() == rhs.get_learners_next() + && lhs.auto_leave == rhs.auto_leave + { + return true; + } + + eq_without_order(lhs.get_voters(), rhs.get_voters()) + && eq_without_order(lhs.get_learners(), rhs.get_learners()) + && eq_without_order(lhs.get_voters_outgoing(), rhs.get_voters_outgoing()) + && eq_without_order(lhs.get_learners_next(), rhs.get_learners_next()) + && lhs.auto_leave == rhs.auto_leave +} diff --git a/systems/Raft-rs/driver/rust_so/proto/src/lib.rs b/systems/Raft-rs/driver/rust_so/proto/src/lib.rs new file mode 100644 index 0000000..56ed74a --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/proto/src/lib.rs @@ -0,0 +1,56 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// We use `default` method a lot to be support prost and rust-protobuf at the +// same time. And reassignment can be optimized by compiler. +#![allow(clippy::field_reassign_with_default)] + +mod confchange; +mod confstate; + +pub use crate::confchange::{ + new_conf_change_single, parse_conf_change, stringify_conf_change, ConfChangeI, +}; +pub use crate::confstate::conf_state_eq; +pub use crate::protos::eraftpb; + +#[allow(dead_code)] +#[allow(unknown_lints)] +#[allow(clippy::all)] +#[allow(renamed_and_removed_lints)] +#[allow(bare_trait_objects)] +mod protos { + include!(concat!(env!("OUT_DIR"), "/protos/mod.rs")); + + use self::eraftpb::Snapshot; + + impl Snapshot { + /// For a given snapshot, determine if it's empty or not. 
+ pub fn is_empty(&self) -> bool { + self.get_metadata().index == 0 + } + } +} + +pub mod prelude { + pub use crate::eraftpb::{ + ConfChange, ConfChangeSingle, ConfChangeTransition, ConfChangeType, ConfChangeV2, + ConfState, Entry, EntryType, HardState, Message, MessageType, Snapshot, SnapshotMetadata, + }; +} + +pub mod util { + use crate::eraftpb::ConfState; + + impl<Iter1, Iter2> From<(Iter1, Iter2)> for ConfState + where + Iter1: IntoIterator<Item = u64>, + Iter2: IntoIterator<Item = u64>, + { + fn from((voters, learners): (Iter1, Iter2)) -> Self { + let mut conf_state = ConfState::default(); + conf_state.mut_voters().extend(voters.into_iter()); + conf_state.mut_learners().extend(learners.into_iter()); + conf_state + } + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/.gitignore b/systems/Raft-rs/driver/rust_so/raft-rs/.gitignore new file mode 100644 index 0000000..1184afb --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/.gitignore @@ -0,0 +1,33 @@ + +# OSX leaves these everywhere on SMB shares +._* + +# OSX trash +.DS_Store + +# Eclipse files +.classpath +.project +.settings/** + +# Vim swap files +*.swp + +# Files generated by JetBrains IDEs, e.g. 
IntelliJ IDEA +.idea/ +*.iml +out/ + +# Vscode files +.vscode/** + +target +tmp +/bin + +harness/target + +Cargo.lock +rust-toolchain +*.rs.bk +*.rs.fmt diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/CHANGELOG.md b/systems/Raft-rs/driver/rust_so/raft-rs/CHANGELOG.md new file mode 100644 index 0000000..27446f2 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/CHANGELOG.md @@ -0,0 +1,98 @@ +# 0.7.0 - 2023-03-06 + +- Update prost to 0.11 (#496) +- Make priority an i64 so a lower than default priority can be configured (#495) +- Allow fetching entries in asynchronous manner (#459) +- Support setting max-inflight-msgs in runtime to reduce memory usage (#450) + +# 0.6.0 - 2021-06-16 + +- Joint Consensus became a stable feature (#379, #380, #382, #383, #385, #386, #411) +- Ported aggresive flow control from etcd/raft (#354) +- Introduced group commit to force geo replication safety (#359) +- Harden read index (#355, #363) +- Support limiting uncommitted logs (#398) +- Support asynchronous ready (#403, #410, #417, #433) +- Fast log append rejection (#367) +- bytes::Bytes is used for protos if feature protobuf-codec is enabled (by default) (#438) +- Switched to thiserror (#435) +- Implemented committed entries pagination (#440) + +# 0.6.0-alpha - 2019-07-24 + +- Protos now is a separate crate (https://github.com/pingcap/raft-rs/pull/247) +- raft-rs is rust-2018 compatible (https://github.com/pingcap/raft-rs/pull/184) +- Optional support for batch MsgAppend (https://github.com/pingcap/raft-rs/pull/179) +- Harden follower read (https://github.com/pingcap/raft-rs/pull/220) +- Migrate to slog (https://github.com/pingcap/raft-rs/pull/185) +- Several code refactor and API clean up +- Optional support for prost + +# 0.5.0 - 2019-02-11 + +- Introduced an experimental Joint Consensus based arbitrary membership change feature. (https://github.com/pingcap/raft-rs/pull/101) +- Harmonized `protobuf` dependency to match important downstreams. 
(https://github.com/pingcap/raft-rs/pull/181) +- Unified the `Progress` collections inside `ProgressSet`. (https://github.com/pingcap/raft-rs/pull/108) +- `Raft::new()` now returns a `Result`. (https://github.com/pingcap/raft-rs/pull/122) +- Removed the `Progress.is_learner` field. Check via function in `ProgressSet` instead. (https://github.com/pingcap/raft-rs/pull/119) +- Added Appvevor. Added then removed bors. (https://github.com/pingcap/raft-rs/pull/137, https://github.com/pingcap/raft-rs/pull/134) +- Introduced getters and setters for various `Ready` fields. (https://github.com/pingcap/raft-rs/pull/120) +- Reduced memory allocation on reset. (https://github.com/pingcap/raft-rs/pull/130) +- Added issue templates, more links. (https://github.com/pingcap/raft-rs/pull/133, https://github.com/pingcap/raft-rs/pull/126) +- Moved poll and quorum checking functionality into `ProgressSet`. (https://github.com/pingcap/raft-rs/pull/121) +- The leader is now trivially in the replicate state. (https://github.com/pingcap/raft-rs/pull/146) +- Fixed a problem with lease based read-only requests interacting with `check_quorum` wrong. (https://github.com/pingcap/raft-rs/pull/141) +- Corrected the `single_mem_node` example. (https://github.com/pingcap/raft-rs/pull/162) +- Fixed typos. (https://github.com/pingcap/raft-rs/pull/159) +- Adopted [Hashbrown](https://github.com/Amanieu/hashbrown) over `FxHash`. (https://github.com/pingcap/raft-rs/pull/160) +- Corrected learner checking in `handle_transfer_leader`. (https://github.com/pingcap/raft-rs/pull/165) +- Resolved some lints (https://github.com/pingcap/raft-rs/pull/174, https://github.com/pingcap/raft-rs/pull/168, https://github.com/pingcap/raft-rs/pull/142, https://github.com/pingcap/raft-rs/pull/124) +- Fixed uses of `#[feature(_)]` so that we can build on stable cleanly. (https://github.com/pingcap/raft-rs/pull/180) + +# 0.4.0 - 2018-09-18 + +- No longer scan the raft log when becoming a leader. 
(https://github.com/pingcap/raft-rs/pull/100) +- Added the ability to skip broadcast commit at runtime. (https://github.com/pingcap/raft-rs/pull/115) +- Documented all public API. (https://github.com/pingcap/raft-rs/pull/87) +- Refined a few points in the API in preparation for more work. (https://github.com/pingcap/raft-rs/pull/102) +- Configuration logic was moved into its own module. (https://github.com/pingcap/raft-rs/pull/91) +- Added [`fail-rs`](https://github.com/pingcap/fail-rs) based tests. (https://github.com/pingcap/raft-rs/pull/114) +- Added benchmarking using [`criterion`](). (https://github.com/pingcap/raft-rs/pull/110) +- Expanded tested examples. (https://github.com/pingcap/raft-rs/pull/118) +- Improved documentation. (https://github.com/pingcap/raft-rs/pull/106) +- Refined the CI scripts to ensure strict linting. (https://github.com/pingcap/raft-rs/pull/117) +- Tests now output logs. Configure it with `RUST_LOG=raft=info`. (https://github.com/pingcap/raft-rs/pull/103) +- Eased the `log` dependency. (https://github.com/pingcap/raft-rs/pull/116) +- Formatting updates. (https://github.com/pingcap/raft-rs/pull/104) +- Updated some dependencies. (https://github.com/pingcap/raft-rs/pull/97) +- Use the clippy preview from Rustup. (https://github.com/pingcap/raft-rs/pull/95) +- Adopted a Code of Conduct. 
(https://github.com/pingcap/raft-rs/pull/107) + +# 0.3.1 - 2018-07-10 + +- Bugfix: Reset leader_id when becoming precandidate to resolve `prevote` and `check_quorum` compatability (https://github.com/pingcap/raft-rs/pull/84) +- Bugfix: Becoming a precandidate should reset votes (https://github.com/pingcap/raft-rs/pull/83) +- Fixed some typos, improve variable naming, and other small documentation fixes (https://github.com/pingcap/raft-rs/pull/77, https://github.com/pingcap/raft-rs/pull/79, https://github.com/pingcap/raft-rs/pull/78, https://github.com/pingcap/raft-rs/pull/80) +- Implemented `Default` for `Config` and fleshed out an example (https://github.com/pingcap/raft-rs/pull/81) +- Improved our changelog format (https://github.com/pingcap/raft-rs/pull/85) +- Removed custom Rustfmt configuration (https://github.com/pingcap/raft-rs/pull/86) + +# 0.3.0 - 2018-06-07 + +- Supported configuring the election timeout range (https://github.com/pingcap/raft-rs/pull/63). +- Kept compatible with rust-protobuf 2.0 (https://github.com/pingcap/raft-rs/pull/64, https://github.com/pingcap/raft-rs/pull/75) +- Made Raft now `Send` (https://github.com/pingcap/raft-rs/pull/67) +- Added documentation examples (https://github.com/pingcap/raft-rs/pull/69) +- Fixed a deadlock in the prevote migration process (https://github.com/pingcap/raft-rs/pull/42) + +# 0.2.0 - 2018-05-14 + +- Deprecated sync-log and add context (https://github.com/pingcap/raft-rs/pull/59) +- Fixed learner isolation bug (https://github.com/pingcap/raft-rs/pull/58) +- Ported several tests (https://github.com/pingcap/raft-rs/pull/54, https://github.com/pingcap/raft-rs/pull/41) +- Added examples (https://github.com/pingcap/raft-rs/pull/44) +- Added fxhash (https://github.com/pingcap/raft-rs/pull/48) + +# 0.1.0 - 2018-02-26 + +- initial release diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/CODE_OF_CONDUCT.md b/systems/Raft-rs/driver/rust_so/raft-rs/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..63f400a 
--- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/CODE_OF_CONDUCT.md @@ -0,0 +1,46 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 
+ +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at coc@pingcap.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/CONTRIBUTING.md b/systems/Raft-rs/driver/rust_so/raft-rs/CONTRIBUTING.md new file mode 100644 index 0000000..31967e0 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/CONTRIBUTING.md @@ -0,0 +1,52 @@ +# Overview + +**A bit of history:** The Raft-rs library started life inside TiKV, a (now CNCF) project to develop a distributed database. It has existed in some prototype form since Rust around 0.7, and has gone through many changes and evolutions. 
At the start of 2018 PingCAP packaged this Raft implementation for others to use under the Apache 2.0 license. + +Raft, as a distributed consensus algorithm, occupies a core part of the systems that utilize it. While we do not expect contributors to be experts on the algorithm, or Rust, please be aware that the review process for Raft may be longer and more strict than some other projects you have contributed to in the past. + +**We'd love it** if you used Raft and reported (or even fixed!) any problems you find, whether it be bugs, safety flaws, or even usability issues. You are welcome to improve existing code, clean up modules, better organize tests, add in tooling and instrumentation, and help us do everything we can to make sure the project is rock solid and an absolute joy to use. + +There is a lot of work to do to get there, and we're very excited for you to consider contributing. + +# Process for Contributors + +1. **Choose, or report, any issue you want!** Try to keep things bite sized, you may have a lot of ideas, but try to limit your changes to ease future review. If your resulting pull request is too big we will ask you to split it up, and that's no fun! + * [These are good first issues.](https://github.com/pingcap/raft-rs/labels/Good%20First%20Issue) + * [These are issues we'd love to complete with your help.](https://github.com/pingcap/raft-rs/labels/Help%20Wanted) + * Many issues are "big ideas" which you are welcome to break down and partially complete. +2. **Tell us you want to tackle it.** If you tell us you're working on something we are very happy to mentor you and pair with you on problems if you want. We'll try to occasionally check in on your progress and see if you need support. Doing this also helps avoid two people doing the same thing. +3. **Tackle it!** Try to break up your work into a [story](https://about.futurelearn.com/blog/telling-stories-with-your-git-history).
+ * Once you've opened a PR and requested reviews, try to avoid rebasing (please merge instead). + * If you add any API surface, please test it. + * If you add a feature, tests are required. + * Benchmarks are highly encouraged. + * Try to avoid panicking, return `Result` instead. If you return a result, please test all possible paths do not mutate the `Raft` on failure. + * Avoid public fields, use getters and setters. In the future changing a field could break an API and force a major version update. +4. **Prepare it.** Groom your code before you send a PR. The CI will verify you did all of this. + * Run `cargo test --all`, make sure they all pass. + * Run `cargo bench --all -- --test`, make sure they all pass. + * Run `cargo clippy --all --all-targets -- -D clippy::all`, fix any lints. + * Run `cargo fmt --all`. +5. **Submit the Pull Request.** + * Make sure to link to the issue you are addressing. If there is no related issue, please describe what the PR resolves. + * If you're still working on things, or you aren't ready for a full review yet, you can put `WIP:` in the title. You can use this as a chance to ask for some feedback or help. + * Look at the pending changes, give it a quick browse. Use this as a chance to check for `TODOs` or newly incorrect documentation. + * Someone will review your code and assign another reviewer. + * Discuss and consider reflecting any feedback. If you choose to abandon the PR at this point we may choose to drive it to completion, you can let us know if you don't have more time. +6. Once you have two approving reviews: **Your PR is merged.** + +# Process for Reviewers + +1. **Find a Pull Request** requiring review, or be requested to review one. +2. **Review the description.** If the description does not clearly state the problem it resolves, ask the author to change it so it does. +3. **Thoroughly review the code.** Take your time, consider the context and possible corner cases.
Anywhere which causes confusion you should request some explanation. If you think it's better as documentation in the code please specify that. +4. If you approve of the change, **use the 'Submit Review' button to give approval.** +5. **Request a second reviewer,** if you think it's ready. If you gave a lot of change requests, or think the PR is in the wrong direction, you can wait to do this. +6. If you're the second reviewer, **review it** and if it's good, leave an approval then **merge it**. +7. If the PR is by a contributor, make sure we **thank them for their efforts**. + +# Becoming a Maintainer + +If you contribute a major feature, or help us tackle several bugs, we may invite you to become a maintainer. As a maintainer we will work with you in any project planning. As you will help reduce our maintenance responsibility, it will be our new responsibility to ensure you have a voice in the project. + +You'll also get cool swag, and any time we're in your area we'll do our best to treat you to a meal and/or drinks. :) diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/Cargo.toml b/systems/Raft-rs/driver/rust_so/raft-rs/Cargo.toml new file mode 100644 index 0000000..2bcf727 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/Cargo.toml @@ -0,0 +1,65 @@ +[package] +name = "raft" +version = "0.7.0" +authors = ["The TiKV Project Developers"] +license = "Apache-2.0" +keywords = ["raft", "distributed-systems", "ha"] +repository = "https://github.com/tikv/raft-rs" +readme = "README.md" +homepage = "https://github.com/tikv/raft-rs" +documentation = "https://docs.rs/raft" +description = "The rust language implementation of Raft algorithm."
+categories = ["algorithms", "database-implementations"] +edition = "2021" + +[workspace] +members = ["proto", "harness", "datadriven"] + +[features] +default = ["protobuf-codec", "default-logger"] +# Enable failpoints +failpoints = ["fail/failpoints"] +protobuf-codec = ["raft-proto/protobuf-codec", "bytes"] +prost-codec = ["raft-proto/prost-codec"] +default-logger = ["slog-stdlog", "slog-envlogger", "slog-term"] + +# Make sure to synchronize updates with Harness. +[dependencies] +bytes = { version = "1", optional = true } +fxhash = "0.2.1" +fail = { version = "0.4", optional = true } +getset = "0.1.1" +protobuf = "2" +thiserror = "1.0" +raft-proto = { path = "proto", version = "0.7.0", default-features = false } +rand = "0.8" +slog = "2.2" +slog-envlogger = { version = "2.1.0", optional = true } +slog-stdlog = { version = "4", optional = true } +slog-term = { version = "2.4.0", optional = true } + +[dev-dependencies] +criterion = "0.3" +regex = "1" +slog-async = "2.3.0" +slog-envlogger = "2.1.0" +slog-stdlog = "4" +slog-term = "2.4.0" +anyhow = "1.0.40" +datadriven = { path = "datadriven", version = "0.1.0" } +itertools = "0.10.0" + +[[bench]] +name = "benches" +harness = false + +[badges] +travis-ci = { repository = "tikv/raft-rs" } + +[[example]] +name = "single_mem_node" +path = "examples/single_mem_node/main.rs" + +[[example]] +name = "five_mem_node" +path = "examples/five_mem_node/main.rs" diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/LICENSE b/systems/Raft-rs/driver/rust_so/raft-rs/LICENSE new file mode 100644 index 0000000..4eedc01 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/LICENSE @@ -0,0 +1,201 @@ +Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/README.md b/systems/Raft-rs/driver/rust_so/raft-rs/README.md new file mode 100644 index 0000000..3a40e8c --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/README.md @@ -0,0 +1,109 @@ +# Raft + +[![CI](https://github.com/tikv/raft-rs/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/tikv/raft-rs/actions/workflows/ci.yml) +[![Documentation](https://docs.rs/raft/badge.svg)](https://docs.rs/raft/) +[![Crates.io](https://img.shields.io/crates/v/raft.svg)](https://crates.io/crates/raft) +[![dependency status](https://deps.rs/repo/github/tikv/raft-rs/status.svg)](https://deps.rs/repo/github/tikv/raft-rs) + +## Problem and Importance + +When building a distributed system one principal goal is often to build in *fault-tolerance*. 
That is, if one particular node in a network goes down, or if there is a network partition, the entire cluster does not fall over. The cluster of nodes taking part in a distributed consensus protocol must come to agreement regarding values, and once that decision is reached, that choice is final. + +Distributed Consensus Algorithms often take the form of a replicated state machine and log. Each state machine accepts inputs from its log, and represents the value(s) to be replicated, for example, a hash table. They allow a collection of machines to work as a coherent group that can survive the failures of some of its members. + +Two well known Distributed Consensus Algorithms are Paxos and Raft. Paxos is used in systems like [Chubby](http://research.google.com/archive/chubby.html) by Google, and Raft is used in things like [`tikv`](https://github.com/tikv/tikv) or [`etcd`](https://github.com/etcd-io/etcd). Raft is generally seen as a more understandable and simpler to implement than Paxos. + +## Design + +Raft replicates the state machine through logs. If you can ensure that all the machines have the same sequence of logs, after applying all logs in order, the state machine will reach a consistent state. + +A complete Raft model contains 4 essential parts: + +1. Consensus Module, the core consensus algorithm module; + +2. Log, the place to keep the Raft logs; + +3. State Machine, the place to save the user data; + +4. Transport, the network layer for communication. + +![The design of the Raft crate](media/the-design-of-raft-rs.png) + +> Note: This Raft implementation in Rust includes the core Consensus Module only, not the other parts. The core Consensus Module in the Raft crate is customizable, flexible, and resilient. You can directly use the Raft crate, but you will need to build your own Log, State Machine and Transport components. 
+ +## Using the raft crate + +You can use raft with either [rust-protobuf](https://github.com/pingcap/rust-protobuf) or [Prost](https://github.com/tokio-rs/prost) to encode/decode gRPC messages. We use rust-protobuf by default. To use Prost, build (or depend on) Raft using the `prost-codec` feature and without default features. + +## Developing the Raft crate + +`Raft` is built using the latest version of `stable` Rust, using [the 2018 edition](https://doc.rust-lang.org/edition-guide/rust-2018/). +Minimum supported version is `1.44.0`. + +Using `rustup` you can get started this way: + +```bash +rustup component add clippy +rustup component add rustfmt +``` + +In order to have your PR merged running the following must finish without error: + +```bash +cargo test --all && \ +cargo clippy --all --all-targets -- -D clippy::all && \ +cargo fmt --all -- --check +``` + +You may optionally want to install `cargo-watch` to allow for automated rebuilding while editing: + +```bash +cargo watch -s "cargo check" +``` + +### Modifying Protobufs + +See [instructions](proto/README.md) in the proto subdirectory. + +### Benchmarks + +We use [Criterion](https://github.com/japaric/criterion.rs) for benchmarking. + +> It's currently an ongoing effort to build an appropriate benchmarking suite. If you'd like to help out please let us know! [Interested?](https://github.com/tikv/raft-rs/issues/109) + +You can run the benchmarks by installing `gnuplot` then running: + +```bash +cargo bench +``` + +You can check `target/criterion/report/index.html` for plots and charts relating to the benchmarks. + +You can check the performance between two branches: + +```bash +git checkout master +cargo bench --bench benches -- --save-baseline master +git checkout other +cargo bench --bench benches -- --baseline master +``` + +This will report relative increases or decreased for each benchmark. 
+ +## Acknowledgments + +Thanks [etcd](https://github.com/etcd-io/raft) for providing the amazing Go implementation! + +## Projects using the Raft crate + +- [TiKV](https://github.com/tikv/tikv), a distributed transactional key value database powered by Rust and Raft. + +## Links for Further Research + +* [The Raft site](https://raftconsensus.github.io/) +* [The Secret Lives of Data - Raft](http://thesecretlivesofdata.com/raft/) +* [Raft Paper](https://raft.github.io/raft.pdf) +* [Raft Dissertation](https://github.com/ongardie/dissertation#readme) +* [Raft Refloated](https://www.cl.cam.ac.uk/~ms705/pub/papers/2015-osr-raft.pdf) +* [Implement Raft in Rust](https://www.pingcap.com/blog/implement-raft-in-rust/) +* [Using Raft in Rust at RustConf 2018](https://www.youtube.com/watch?v=MSrcdhGRsOE) diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/benches/benches.rs b/systems/Raft-rs/driver/rust_so/raft-rs/benches/benches.rs new file mode 100644 index 0000000..a9c09b0 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/benches/benches.rs @@ -0,0 +1,24 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +#![allow(dead_code)] // Due to criterion we need this to avoid warnings. + +use criterion::Criterion; +use std::time::Duration; + +mod suites; + +pub const DEFAULT_RAFT_SETS: [(usize, usize); 4] = [(0, 0), (3, 1), (5, 2), (7, 3)]; + +fn main() { + let mut c = Criterion::default() + // Configure defaults before overriding with args. 
+ .warm_up_time(Duration::from_millis(500)) + .measurement_time(Duration::from_secs(1)) + .configure_from_args(); + + suites::bench_raft(&mut c); + suites::bench_raw_node(&mut c); + suites::bench_progress(&mut c); + + c.final_summary(); +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/benches/suites/mod.rs b/systems/Raft-rs/driver/rust_so/raft-rs/benches/suites/mod.rs new file mode 100644 index 0000000..2e853be --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/benches/suites/mod.rs @@ -0,0 +1,8 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +mod raft; +pub use self::raft::*; +mod raw_node; +pub use self::raw_node::*; +mod progress; +pub use self::progress::*; diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/benches/suites/progress.rs b/systems/Raft-rs/driver/rust_so/raft-rs/benches/suites/progress.rs new file mode 100644 index 0000000..3a600b0 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/benches/suites/progress.rs @@ -0,0 +1,17 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +use criterion::{Bencher, Criterion}; +use raft::Progress; + +pub fn bench_progress(c: &mut Criterion) { + bench_progress_default(c); +} + +pub fn bench_progress_default(c: &mut Criterion) { + let bench = |b: &mut Bencher| { + // No setup. + b.iter(|| Progress::new(9, 10)); + }; + + c.bench_function("Progress::default", bench); +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/benches/suites/raft.rs b/systems/Raft-rs/driver/rust_so/raft-rs/benches/suites/raft.rs new file mode 100644 index 0000000..8e36d23 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/benches/suites/raft.rs @@ -0,0 +1,66 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use crate::DEFAULT_RAFT_SETS; +use criterion::Criterion; +use raft::eraftpb::ConfState; +use raft::{storage::MemStorage, Config, Raft}; + +pub fn bench_raft(c: &mut Criterion) { + bench_raft_new(c); + bench_raft_campaign(c); +} + +fn new_storage(voters: usize, learners: usize) -> MemStorage { + let mut cc = ConfState::default(); + for i in 1..=voters { + cc.voters.push(i as u64); + } + for i in 1..=learners { + cc.learners.push(voters as u64 + i as u64); + } + MemStorage::new_with_conf_state(cc) +} + +fn quick_raft(storage: MemStorage, logger: &slog::Logger) -> Raft { + let id = 1; + let config = Config::new(id); + Raft::new(&config, storage, logger).unwrap() +} + +pub fn bench_raft_new(c: &mut Criterion) { + DEFAULT_RAFT_SETS.iter().for_each(|(voters, learners)| { + c.bench_function(&format!("Raft::new ({}, {})", voters, learners), move |b| { + let logger = raft::default_logger(); + let storage = new_storage(*voters, *learners); + b.iter(|| quick_raft(storage.clone(), &logger)) + }); + }); +} + +pub fn bench_raft_campaign(c: &mut Criterion) { + DEFAULT_RAFT_SETS + .iter() + .skip(1) + .for_each(|(voters, learners)| { + // We don't want to make `raft::raft` public at this point. + let msgs = &[ + "CampaignPreElection", + "CampaignElection", + "CampaignTransfer", + ]; + // Skip the first since it's 0,0 + for msg in msgs { + c.bench_function( + &format!("Raft::campaign ({}, {}, {})", voters, learners, msg), + move |b| { + let logger = raft::default_logger(); + let storage = new_storage(*voters, *learners); + b.iter(|| { + let mut raft = quick_raft(storage.clone(), &logger); + raft.campaign(msg.as_bytes()); + }) + }, + ); + } + }); +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/benches/suites/raw_node.rs b/systems/Raft-rs/driver/rust_so/raft-rs/benches/suites/raw_node.rs new file mode 100644 index 0000000..a9e7c44 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/benches/suites/raw_node.rs @@ -0,0 +1,141 @@ +// Copyright 2019 TiKV Project Authors. 
Licensed under Apache-2.0. + +// We use `default` method a lot to be support prost and rust-protobuf at the +// same time. And reassignment can be optimized by compiler. +#![allow(clippy::field_reassign_with_default)] + +use criterion::{BatchSize, Bencher, BenchmarkId, Criterion, Throughput}; +use raft::eraftpb::{ConfState, Entry, Message, Snapshot, SnapshotMetadata}; +use raft::{storage::MemStorage, Config, RawNode}; +use std::time::Duration; + +pub fn bench_raw_node(c: &mut Criterion) { + bench_raw_node_new(c); + bench_raw_node_leader_propose(c); + bench_raw_node_new_ready(c); +} + +fn quick_raw_node(logger: &slog::Logger) -> RawNode { + let id = 1; + let conf_state = ConfState::from((vec![1], vec![])); + let storage = MemStorage::new_with_conf_state(conf_state); + let config = Config::new(id); + RawNode::new(&config, storage, logger).unwrap() +} + +pub fn bench_raw_node_new(c: &mut Criterion) { + let bench = |b: &mut Bencher| { + let logger = raft::default_logger(); + b.iter(|| quick_raw_node(&logger)); + }; + + c.bench_function("RawNode::new", bench); +} + +pub fn bench_raw_node_leader_propose(c: &mut Criterion) { + static KB: usize = 1024; + let mut test_sets = vec![ + 0, + 32, + 128, + 512, + KB, + 4 * KB, + 16 * KB, + 128 * KB, + 512 * KB, + KB * KB, + ]; + let mut group = c.benchmark_group("RawNode::leader_propose"); + for size in test_sets.drain(..) { + // Calculate measurement time in seconds according to the input size. + // The approximate time might not be the best but should work fine. 
+ let mtime = if size < KB { + 1 + } else if size < 128 * KB { + 3 + } else { + 7 + }; + group + .measurement_time(Duration::from_secs(mtime)) + .throughput(Throughput::Bytes(size as u64)) + .bench_with_input( + BenchmarkId::from_parameter(size), + &size, + |b: &mut Bencher, size| { + let logger = raft::default_logger(); + let mut node = quick_raw_node(&logger); + node.raft.become_candidate(); + node.raft.become_leader(); + b.iter_batched( + || (vec![0; 8], vec![0; *size]), + |(context, value)| node.propose(context, value).expect(""), + BatchSize::SmallInput, + ); + }, + ); + } +} + +pub fn bench_raw_node_new_ready(c: &mut Criterion) { + let logger = raft::default_logger(); + let mut group = c.benchmark_group("RawNode::ready"); + group + // TODO: The proper measurement time could be affected by the system and machine. + .measurement_time(Duration::from_secs(20)) + .bench_function("Default", |b: &mut Bencher| { + b.iter_batched( + || test_ready_raft_node(&logger), + |mut node| { + let _ = node.ready(); + }, + // NOTICE: SmallInput accumulates (iters + 10 - 1) / 10 samples per batch + BatchSize::SmallInput, + ); + }); +} + +// Create a raft node calling `ready()` with things below: +// - 100 new entries with 32KB data each +// - 100 committed entries with 32KB data each +// - 100 raft messages +// - A snapshot with 8MB data +// TODO: Maybe gathering all the things we need into a struct(e.g. something like `ReadyBenchOption`) and use it +// to customize the output. 
+fn test_ready_raft_node(logger: &slog::Logger) -> RawNode { + let mut node = quick_raw_node(logger); + node.raft.become_candidate(); + node.raft.become_leader(); + let unstable = node.raft.raft_log.unstable_entries().to_vec(); + node.raft.raft_log.stable_entries(1, 1); + node.raft.raft_log.store.wl().append(&unstable).expect(""); + node.raft.on_persist_entries(1, 1); + node.raft.commit_apply(1); + let mut entries = vec![]; + for i in 1..101 { + let mut e = Entry::default(); + e.data = vec![0; 32 * 1024].into(); + e.context = vec![].into(); + e.index = i; + e.term = 1; + entries.push(e); + } + let _ = node.raft.append_entry(&mut entries); + let unstable = node.raft.raft_log.unstable_entries().to_vec(); + node.raft.raft_log.stable_entries(101, 1); + node.raft.raft_log.store.wl().append(&unstable).expect(""); + // This increases 'committed_index' to `last_index` because there is only one node in quorum. + node.raft.on_persist_entries(101, 1); + + let mut snap = Snapshot::default(); + snap.set_data(vec![0; 8 * 1024 * 1024].into()); + // We don't care about the contents in snapshot here since it won't be applied. 
+ snap.set_metadata(SnapshotMetadata::default()); + for _ in 0..100 { + node.raft.msgs.push(Message::default()); + } + // Force reverting committed index to provide us some entries to be stored from next `Ready` + node.raft.raft_log.committed = 1; + node +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/bors.toml b/systems/Raft-rs/driver/rust_so/raft-rs/bors.toml new file mode 100644 index 0000000..4e6e85f --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/bors.toml @@ -0,0 +1,4 @@ +status = [ + "continuous-integration/travis-ci/push", + "continuous-integration/appveyor/branch" +] diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/Cargo.toml b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/Cargo.toml new file mode 100644 index 0000000..ce5f0d2 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "datadriven" +version = "0.1.0" +authors = ["accelsao "] +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +anyhow = "1.0.40" +regex = "1" +lazy_static = "1.4.0" +slog = "2.5.2" +slog-term = "2.6.0" +slog-async = "2.5.0" +similar-asserts = "1.1.0" diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/datadriven.rs b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/datadriven.rs new file mode 100644 index 0000000..64dda20 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/datadriven.rs @@ -0,0 +1,195 @@ +#[cfg(test)] +mod datadriven_test; + +use crate::get_dirs_or_file; +use crate::test_data::TestData; +use crate::test_data_reader::TestDataReader; +use anyhow::Result; +use regex::Regex; +use std::fs; +use std::fs::OpenOptions; +use std::io::Write; +use std::path::Path; + +use lazy_static::lazy_static; +use similar_asserts::assert_eq; +use slog::debug; + +/// The main function to run tests +/// +/// You need to pass the path of `testdata` where store the test 
cases, and your function +/// to output expected result. +/// +/// It will be compared with crate `difference` +/// +/// It invokes a data-driven test. The test cases are contained in a +/// separate test file and are dynamically loaded, parsed, and executed by this +/// testing framework. By convention, test files are typically located in a +/// sub-directory called `testdata`. Each test file has the following format: +/// +/// ```txt +/// [arg | arg=val | arg=(val1, val2, ...)]... \ +/// \ +/// +/// ---- +/// +/// +/// ```` +/// +/// The command input can contain blank lines. However, by default, the expected +/// results cannot contain blank lines. This alternate syntax allows the use of +/// blank lines: +/// ```txt +/// [arg | arg=val | arg=(val1, val2, ...)]... \ +/// \ +/// +/// ---- +/// ---- +/// +/// +/// +/// ---- +/// ---- +/// +/// ```` +/// +/// Data store in `TestData` +/// +/// `` => `String`: `cmd` +/// +/// `` => `Vec`: `cmd_args` +/// +/// see more detail for [CmdArg](struct.CmdArg.html) +/// +/// `F` is customize function: `FnOnce(&TestData) -> String` +/// +/// you will get input as type `TestData` and your expected output as type `String` +/// +/// Note: you need `end line` after each line if you have multiple lines. +/// This is used to separate them, instead of mixing the characters altogether. +/// +/// ```rust +/// use datadriven::{TestData, CmdArg}; +/// +/// fn func(d: &TestData) -> String { +/// let args : Vec = d.cmd_args.clone(); +/// let cmd : String = d.cmd.clone(); +/// +/// // DO SOMETHINGS +/// // NOTE: You need to add end line, if needed +/// +/// String::from("YOUR EXPECTED OUTPUT") +/// } +/// ``` +/// +/// Rewrite Mode: +/// +/// When the testdata is changed, you can run the rewrite mode directly to get the new testdata. +/// If there are no blank line in expected value, it will be overwritten with one separator, +/// instead of double separator, vice versa. 
+/// +pub fn run_test(path: &str, mut f: F, rewrite: bool, logger: &slog::Logger) -> Result<()> +where + F: FnMut(&TestData) -> String, +{ + let files = get_dirs_or_file(path)?; + + for path in &files { + let content = fs::read_to_string(path)?; + if let Some(rewrite_data) = run_test_internal(path, &content, &mut f, rewrite, logger)? { + let mut file = OpenOptions::new().write(true).truncate(true).open(path)?; + file.write_all(rewrite_data.as_bytes())?; + file.sync_data()?; + } else { + // test mode, skip rewriting data + } + } + + Ok(()) +} + +fn run_test_internal( + source_name: P, + content: &str, + mut f: F, + rewrite: bool, + logger: &slog::Logger, +) -> Result> +where + F: FnMut(&TestData) -> String, + P: AsRef, +{ + let mut r = TestDataReader::new(source_name, content, rewrite, logger); + + while r.next()? { + run_directive(&mut r, &mut f); + } + + // remove redundant '\n' + let data = r.rewrite_buffer.map(|mut rb| { + if rb.ends_with("\n\n") { + rb.pop(); + } + rb + }); + debug!(logger, "rewrite_buffer: {:?}", data); + Ok(data) +} + +// run_directive runs just one directive in the input. +// +fn run_directive(r: &mut TestDataReader, mut f: F) +where + F: FnMut(&TestData) -> String, +{ + let mut actual = f(&r.data); + + if !actual.is_empty() && !actual.ends_with('\n') { + actual += "\n"; + } + + // test mode + if r.rewrite_buffer.is_none() { + assert_eq!(&actual, &r.data.expected); + } else { + r.emit("----"); + if has_blank_line(&actual) { + r.emit("----"); + + if let Some(rb) = r.rewrite_buffer.as_mut() { + rb.push_str(&actual); + } + + r.emit("----"); + r.emit("----"); + r.emit(""); + } else { + // Here actual already ends in \n so emit adds a blank line. + r.emit(&actual); + } + } +} + +/// Walk goes through all the files in a subdirectory, creating subtests to match +/// the file hierarchy; for each "leaf" file, the given function is called. 
+pub fn walk(path: &str, f: F) -> Result<()> +where + F: Fn(&Path) -> Result<()>, +{ + let files = get_dirs_or_file(path)?; + for file in files { + f(file.as_path())?; + } + Ok(()) +} + +lazy_static! { + // Multi-line mode means ^ and $ no longer match just at the + // beginning/end of the input, but at the beginning/end of lines + // from regex.doc + static ref BLANK_LINE_RE: Regex = Regex::new(r"(?m)^[\t ]*\n").unwrap(); +} + +fn has_blank_line(str: &str) -> bool { + BLANK_LINE_RE.captures(str).is_some() +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/datadriven/datadriven_test.rs b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/datadriven/datadriven_test.rs new file mode 100644 index 0000000..0b54eda --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/datadriven/datadriven_test.rs @@ -0,0 +1,247 @@ +use crate::datadriven::{has_blank_line, run_test, run_test_internal}; +use crate::test_data::TestData; +use crate::{default_logger, get_dirs_or_file}; +use anyhow::Result; +use std::cmp; +use std::fs::{read_to_string, OpenOptions}; +use std::io::Write; + +use similar_asserts::assert_eq; +use slog::debug; + +fn fibonacci(n: u32) -> u32 { + match n { + 0 => 1, + 1 => 1, + _ => fibonacci(n - 1) + fibonacci(n - 2), + } +} + +fn factorial(n: u32) -> u32 { + match n { + 0 => 1, + 1 => 1, + _ => factorial(n - 1) * n, + } +} + +fn fibonacci_or_factorial_or_sum(d: &TestData) -> String { + let mut expected = String::new(); + + match d.cmd.as_str() { + "fibonacci" => { + for arg in d.cmd_args.iter() { + assert_eq!( + arg.vals.len(), + 1, + r#"expected value len is 1, check "{}""#, + d.pos + ); + // value must exists + assert!(!arg.vals[0].is_empty()); + + let v = fibonacci(arg.vals[0].parse().unwrap()); + let line = arg.key.clone() + "=" + v.to_string().as_str() + "\n"; + expected.push_str(&line); + } + } + "factorial" => { + for arg in d.cmd_args.iter() { + assert_eq!( + arg.vals.len(), + 1, + r#"expected value len is 1, check 
"{}""#, + d.pos + ); + // value must exists + assert!(!arg.vals[0].is_empty()); + let v = factorial(arg.vals[0].parse().unwrap()); + let line = arg.key.clone() + "=" + v.to_string().as_str() + "\n"; + expected.push_str(&line); + } + } + "sum" => { + for arg in &d.cmd_args { + if arg.vals.is_empty() { + // if no value, assume is 0 + let res = arg.key.clone() + "=0\n"; + expected.push_str(&res); + } else { + let mut sum = 0; + for val in &arg.vals { + if val.is_empty() { + continue; + } + let vs = val + .split(',') + .map(|v| { + v.parse::().unwrap_or_else(|_| { + panic!( + "value: {:?} can't parse, check {}", + arg.vals.clone(), + d.pos + ) + }) + }) + .sum::(); + sum += vs; + } + let line = arg.key.clone() + "=" + sum.to_string().as_str() + "\n"; + expected.push_str(&line); + } + } + } + "max" => { + for arg in d.cmd_args.iter() { + if arg.vals.is_empty() { + // if no value, assume is 0 + let res = arg.key.clone() + "=0\n"; + expected.push_str(&res); + } else { + let mut max = 0; + for val in &arg.vals { + if val.is_empty() { + continue; + } + let vs = val + .split(',') + .map(|v| { + v.parse::().unwrap_or_else(|_| { + panic!( + "value: {:?} can't parse, check {}", + arg.vals.clone(), + d.pos + ) + }) + }) + .max() + .expect("Vec is empty, this should not happen."); + max = cmp::max(max, vs); + } + let line = arg.key.clone() + "=" + max.to_string().as_str() + "\n"; + expected.push_str(&line); + } + } + } + "do_nothing" => { + // this is for testing + } + "repeat_me" => { + for arg in &d.cmd_args { + if arg.vals.is_empty() { + let res = arg.key.clone() + "=\n"; + expected.push_str(&res); + } else { + let mut res = arg.key.clone() + "="; + for v in &arg.vals { + res += v; + } + res += "\n"; + expected.push_str(&res); + } + } + } + _ => panic!("unknown command"), + } + expected +} + +#[test] +fn test_rewrite() -> Result<()> { + // set 'true' if you want to rewrite 'rewrite_test' + let rewrite_testfiles = false; + + let logger = default_logger(); + + let path = 
"src/testdata/rewrite"; + + let files = get_dirs_or_file(path).unwrap(); + + let files = files + .into_iter() + .filter(|f| f.to_str().unwrap().ends_with("-before")) + .map(|f| f.to_str().unwrap().trim_end_matches("-before").to_string()) + .collect::>(); + + fn handle(d: &TestData) -> String { + match d.cmd.as_str() { + "noop" => d.input.clone(), + "duplicate" => d.input.clone() + "\n" + d.input.as_str(), + "duplicate-with-blank" => d.input.clone() + "\n\n" + d.input.as_str(), + "no-output" => "".to_string(), + _ => panic!("unknown directive: {}", d.cmd), + } + } + + debug!(logger, "files: {:?}", files); + + for file in &files { + let content = read_to_string(file.to_owned() + "-before")?; + + if let Some(rewrite_data) = + run_test_internal(path, &content, handle, /*rewrite*/ true, &logger)? + { + if rewrite_testfiles { + let mut after_path = OpenOptions::new() + .write(true) + .truncate(true) + .open(file.to_owned() + "-after")?; + after_path.write_all(rewrite_data.as_bytes())?; + } else { + let content = read_to_string(file.to_owned() + "-after")?.replace('\r', ""); + assert_eq!(&rewrite_data, &content); + } + } + } + + Ok(()) +} + +#[test] +fn test_datadriven() -> Result<()> { + let logger = default_logger(); + let rewrite = false; + + run_test( + "src/testdata/datadriven", + fibonacci_or_factorial_or_sum, + rewrite, + &logger, + )?; + Ok(()) +} + +#[test] +fn test_unknown_data() { + let logger = default_logger(); + let rewrite = false; + + let e = run_test( + "src/testdata/unknown_data_1.txt", + fibonacci_or_factorial_or_sum, + rewrite, + &logger, + ); + assert!(e.is_err()); + let e = run_test( + "src/testdata/unknown_data_2.txt", + fibonacci_or_factorial_or_sum, + rewrite, + &logger, + ); + assert!(e.is_err()); +} + +#[test] +fn test_blank_line() { + let str = "123\n 123\n"; + assert_eq!(has_blank_line(str), false); + let str = "123\n \n123\n"; + assert_eq!(has_blank_line(str), true); + let str = "123\n\n123\n"; + assert_eq!(has_blank_line(str), true); + let 
str = "\n"; + assert_eq!(has_blank_line(str), true); + let str = ""; + assert_eq!(has_blank_line(str), false); +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/lib.rs b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/lib.rs new file mode 100644 index 0000000..9021880 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/lib.rs @@ -0,0 +1,85 @@ +/*! +# datadriven +**datadriven** is a tool for testing. Ported from [cockroachdb/datadriven](https://github.com/cockroachdb/datadriven) + +To execute data-driven tests, pass the path of the test file as well as a +function which can interpret and execute whatever commands are present in +the test file. The framework invokes the function, passing it information +about the test case in a TestData struct. + +The function must return the actual results of the case, which +run_test() compares with the expected results. If the two are not +equal, the test is marked to fail. + +[run_test()](fn.run_test.html) will run the test on a file or given folder + +Recommend usage: + +for test function `test_func_001`, place the testdata in `src/testdata/test_func_001`, `run_test(src/testdata/test_func_001, func_001)` + +for test function `test_func_002`, place the testdata in `src/testdata/test_func_002`, `run_test(src/testdata/test_func_002, func_002)` + +or just run a file `run_test(src/testdata/data.txt, func_002)` + +and so on. + +The path tree looks like the following: +```text +. +├── Cargo.toml +└── src + ├── datadriven.rs + ├── lib.rs + └── testdata + ├── data.txt + ├── test_func_001 + │   ├── data_001.txt + │   └── data_002.txt + └── test_func_002 + ├── data_001.txt + └── data_002.txt +``` + +The comparison is done by [similar-asserts](https://docs.rs/similar-asserts/1.1.0/similar_asserts/) + +The difference between [cockroachdb/datadriven](https://github.com/cockroachdb/datadriven) +1. no rewrite +2. 
no subtest + +*/ + +#![deny(missing_docs)] + +mod datadriven; +mod line_sparser; +mod test_data; +mod test_data_reader; + +pub use self::datadriven::run_test; +pub use self::datadriven::walk; +pub use self::test_data::CmdArg; +pub use self::test_data::TestData; +use anyhow::Result; +use slog::Drain; +use std::fs::read_dir; +use std::io; +use std::path::PathBuf; + +use slog::o; + +#[allow(dead_code)] +fn default_logger() -> slog::Logger { + let decorator = slog_term::TermDecorator::new().build(); + let drain = slog_term::FullFormat::new(decorator).build().fuse(); + let drain = slog_async::Async::new(drain).build().fuse(); + slog::Logger::root(drain, o!()) +} + +fn get_dirs_or_file(path: &str) -> Result> { + match read_dir(path) { + Ok(dir) => Ok(dir + .map(|res| res.map(|e| e.path())) + .collect::, io::Error>>()?), + _ => Ok(vec![PathBuf::from(path)]), + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/line_sparser.rs b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/line_sparser.rs new file mode 100644 index 0000000..4dde527 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/line_sparser.rs @@ -0,0 +1,147 @@ +use crate::test_data::CmdArg; +use anyhow::Result; +use regex::Regex; + +use anyhow::{anyhow, bail}; +use lazy_static::lazy_static; +use slog::debug; + +// Token +// (1) argument (no value) +// (2) argument= (empty value) +// (3) argument=() (empty value) +// (4) argument=a (single value) +// (5) argument=a,b,c (single value) +// (6) argument=(a,b,c,...) (multiple value) +// +// parse_line parses a line of datadriven input language and returns +// the parsed command and CmdArgs. 
+pub fn parse_line(line: &str, logger: &slog::Logger) -> Result<(String, Vec)> { + debug!(logger, "line pass to split_directives: {:?}", line); + + let fields = split_directives(line)?; + if fields.is_empty() { + return Ok((String::new(), vec![])); + } + + debug!(logger, "argument after split: {:?}", fields); + + let cmd = fields[0].clone(); + let mut cmd_args = vec![]; + + for arg in &fields[1..] { + let key_value = arg.splitn(2, '=').collect::>(); + + debug!(logger, "keyvalue: {:?}", key_value); + + match key_value.len() { + 1 => { + // key only + cmd_args.push(CmdArg { + key: key_value[0].to_string(), + vals: vec![], + }) + } + 2 => { + let (key, val) = (key_value[0].to_string(), key_value[1]); + + debug!(logger, "val: {:?}", val); + + if val.starts_with('(') && val.ends_with(')') { + // trim because white space is allow. + let vals = val[1..val.len() - 1] + .split(',') + .map(|v| v.trim().to_string()) + .collect(); + cmd_args.push(CmdArg { key, vals }) + } else { + cmd_args.push(CmdArg { + key, + vals: vec![val.to_string()], + }) + } + } + _ => bail!("unknown argument format: {}", arg), + } + } + + Ok((cmd, cmd_args)) +} + +lazy_static! 
{ + static ref RE: Regex = + Regex::new(r"^ *[-a-zA-Z0-9/_,.]+(|=[-a-zA-Z0-9_@=+/,.]*|=\([^)]*\))( |$)").unwrap(); +} + +fn split_directives(line: &str) -> Result> { + let mut res = vec![]; + + let origin_line = <&str>::clone(&line); + + let mut line = line; + while !line.is_empty() { + if let Some(l) = RE.captures(line) { + // get first captures + let (first, last) = line.split_at(l[0].len()); + res.push(first.trim().to_string()); + line = last; + } else { + return Err(anyhow!( + "cannot parse directive at column {}: {}", + origin_line.len() - line.len() + 1, + origin_line + )); + } + } + Ok(res) +} + +#[cfg(test)] +mod tests { + use crate::default_logger; + use crate::line_sparser::{parse_line, split_directives}; + use anyhow::Result; + + #[test] + fn test_parse_line() -> Result<()> { + let logger = default_logger(); + let line = "cmd a=1 b=(2,3) c= d"; + let (cmd, cmd_args) = parse_line(line, &logger)?; + assert_eq!(cmd, "cmd"); + assert_eq!(format!("{:?}", cmd_args), "[a=1, b=(2,3), c=, d]"); + + Ok(()) + } + + #[test] + fn test_split_directives() -> Result<()> { + let line = "cmd a=1 b=2,2,2 c=(3,33,3333)"; + assert_eq!( + split_directives(line)?, + ["cmd", "a=1", "b=2,2,2", "c=(3,33,3333)"], + ); + + let line = "cmd a b c"; + assert_eq!(split_directives(line)?, ["cmd", "a", "b", "c"]); + + let line = "cmd"; + assert_eq!(split_directives(line)?, ["cmd"]); + + let line = "cmd a=1\n"; + assert_eq!( + split_directives(line).unwrap_err().to_string(), + "cannot parse directive at column 5: cmd a=1\n".to_string() + ); + + let line = "cmd a=1 "; + assert_eq!(split_directives(line)?, ["cmd", "a=1"]); + + let line = "cmd a=1 "; + assert_eq!( + split_directives(line).unwrap_err().to_string(), + "cannot parse directive at column 9: cmd a=1 ".to_string() + ); + + Ok(()) + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/test_data.rs b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/test_data.rs new file mode 100644 index 0000000..f3b59bd --- 
/dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/test_data.rs @@ -0,0 +1,95 @@ +use std::fmt; + +/// CmdArg contains information about an argument on the directive line. An +/// argument is specified in one of the following forms: +/// +/// - key (no value) +/// - key= (empty value) +/// - key=() (empty value) +/// - key=a (single value) +/// - key=a,b,c (single value) +/// - key=(a,b,c) (multiple value) +#[derive(Clone)] +pub struct CmdArg { + /// Key of CmdArg + pub key: String, + /// Values of CmdArg + pub vals: Vec, +} + +impl fmt::Display for CmdArg { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.vals.len() { + 0 => write!(f, "{}", self.key), + 1 => write!(f, "{}={}", self.key, self.vals[0]), + _ => write!(f, "{}=({})", self.key, self.vals.join(",")), + } + } +} + +impl fmt::Debug for CmdArg { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} + +/// TestData contains information about datadriven testcase that was parsed from the test file. +/// Data format is text file(txt). +#[derive(Clone, Default)] +pub struct TestData { + /// Pos is a file:line prefix for the input test file, suitable for + /// inclusion in logs and error messages. + pub pos: String, + + /// Cmd is the first string on the directive line (up to the first whitespace). + pub cmd: String, + + /// CmdArgs contains the k/v arguments to the command. + pub cmd_args: Vec, + + /// Input is the text between the first directive line and the ---- separator. + pub input: String, + + /// Expected is the value below the ---- separator. In most cases, + /// tests need not check this, and instead return their own actual + /// output. + /// This field is provided so that a test can perform an early return + /// with "return d.expected" to signal that nothing has changed. 
+ pub expected: String, +} + +impl TestData { + /// Return `true` if the `cmd_args` contains a value for the specified key. + pub fn contains_key(&self, k: &str) -> bool { + for cmd_arg in self.cmd_args.iter() { + if cmd_arg.key == k { + return true; + } + } + false + } +} + +#[cfg(test)] +mod tests { + use crate::{CmdArg, TestData}; + + #[test] + fn test_contains_key() { + let cmd_arg = CmdArg { + key: "key".to_string(), + vals: vec!["123".to_string(), "92".to_string(), "92".to_string()], + }; + let mut d = TestData::default(); + d.cmd_args.push(cmd_arg); + let cmd_arg = CmdArg { + key: "key2".to_string(), + vals: vec!["some string".to_string()], + }; + d.cmd_args.push(cmd_arg); + assert!(d.contains_key("key2")); + assert!(!d.contains_key("key1")); + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/test_data_reader.rs b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/test_data_reader.rs new file mode 100644 index 0000000..9d11d22 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/test_data_reader.rs @@ -0,0 +1,214 @@ +use std::iter::Enumerate; +use std::path::{Path, PathBuf}; +use std::str::Lines; + +use crate::line_sparser::parse_line; +use crate::test_data::TestData; +use anyhow::Result; + +use anyhow::bail; +use slog::debug; + +pub struct TestDataReader<'a> { + source_name: PathBuf, + pub data: TestData, + scanner: Enumerate>, + pub logger: slog::Logger, + pub rewrite_buffer: Option, +} + +impl<'a> TestDataReader<'a> { + pub fn new

(source_name: P, content: &'a str, rewrite: bool, logger: &slog::Logger) -> Self + where + P: AsRef, + { + Self { + source_name: source_name.as_ref().to_path_buf(), + scanner: content.lines().enumerate(), + data: TestData::default(), + logger: logger.clone(), + rewrite_buffer: match rewrite { + true => Some(String::new()), + false => None, + }, + } + } + + pub fn next(&mut self) -> Result { + loop { + let line = self.scanner.next(); + if line.is_none() { + return Ok(false); + } + + let mut pos = line.unwrap().0; + let line = line.unwrap().1; + self.emit(line); + + let mut line = line.trim().to_string(); + + // Only (1) comment (2) empty line + // are accepted before argument. + if line.starts_with('#') || line.is_empty() { + // Skip comment lines. + continue; + } + + // Support wrapping directive lines using \, for example: + // build-scalar \ + // vars(int) + while line.ends_with('\\') { + line.pop(); + + let mut next_line = self + .scanner + .next() + .expect("expect argument ends without '\\'") + .1; + + self.emit(next_line); + + next_line = next_line.trim(); + + // skip blank line, if not we might get some whitespace at the end, + // without trimming it, parsing line will fails + if !next_line.is_empty() { + // Add a whitespace for separating arguments + line.push(' '); + line.push_str(next_line); + } + + // We need the last line number of argument + pos += 1; + } + + debug!(self.logger, "argument_after_cleanup: {}", line); + + // Start reading argument + // Init data + self.data = TestData::default(); + + // Save `line` information for error/debug message usage + self.data.pos = format!("{} : L{}", self.source_name.as_path().display(), pos + 1); + + let (cmd, cmd_args) = parse_line(&line, &self.logger)?; + + if cmd.is_empty() { + bail!("cmd must not be empty"); + } + + debug!(self.logger, "cmd: {}, cmd_args: {:?}", cmd, cmd_args,); + + self.data.cmd = cmd; + self.data.cmd_args = cmd_args; + + let mut buf = String::new(); + let mut separator = false; + + loop { 
+ let line = self.scanner.next(); + if line.is_none() { + break; + } + + let line = line.unwrap().1; + + debug!(self.logger, "input line: {:?}", line); + if line == "----" { + separator = true; + break; + } else { + self.emit(line); + let line = line.to_string() + "\n"; + buf.push_str(&line); + } + } + + self.data.input = buf.trim().to_string(); + + debug!(self.logger, "input before separator: {:?}", self.data.input); + + if separator { + self.read_expected(); + } + + return Ok(true); + } + } + + // check whether there is two separator or one + fn read_expected(&mut self) { + // after reading first separator + // we expected two input + // (1) second separator + // (2) non empty output + + // We need to add an blank line for each line in order to distinguish the exact output, + // otherwise we may consider an unexpected output combination to be correct + + let mut allow_blank_lines = false; + + let mut line = ""; + if let Some((_, l)) = self.scanner.next() { + if l == "----" { + allow_blank_lines = true; + } + line = l; + } else { + // In rewrite mode, no lines after first separator + } + + if allow_blank_lines { + loop { + let mut line = self.scanner.next().unwrap().1.to_string(); + if line == "----" { + let mut line2 = self.scanner.next().unwrap().1.to_string(); + if line2 == "----" { + // Read the following blank line (if we don't do this, we will emit + // an extra blank line when rewriting). + if let Some((_, line3)) = self.scanner.next() { + // we should assert line3 is empty + assert!( + line3.is_empty(), + "non-blank line after end of double ---- separator section" + ); + } else { + // if None, then it if fine. + } + + break; + } + line += "\n"; + self.data.expected.push_str(&line); + line2 += "\n"; + self.data.expected.push_str(&line2); + continue; + } + line += "\n"; + self.data.expected.push_str(&line); + } + } else { + // Terminate on first blank line. 
+ loop { + if line.trim().is_empty() { + break; + } + let l = line.to_string() + "\n"; + self.data.expected.push_str(&l); + + let l = self.scanner.next(); + if l.is_none() { + break; + } + line = l.unwrap().1; + } + } + } + + pub fn emit(&mut self, str: &str) { + if let Some(rb) = self.rewrite_buffer.as_mut() { + let str = str.to_string() + "\n"; + rb.push_str(&str); + } + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/datadriven/datadriven_1.txt b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/datadriven/datadriven_1.txt new file mode 100644 index 0000000..8b30e00 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/datadriven/datadriven_1.txt @@ -0,0 +1,62 @@ +# Datadriven Test +# Input and output are separated by separator "----" +# You are accepted to add blank line between double "----", +# and you need another double "----" to end with the expected value. +# Remember a blank line to end the expected value. +fibonacci a=3 b=4 \ +c=4 + line 1 + + input in middle + +line 2 + +---- +a=3 +b=5 +c=5 + +factorial a=3 c=4 +---- +a=6 +c=24 + +sum a=3,2,4 key,comma \ +-/_,.=(1,1) ,,= ...=() +---- +a=9 +key,comma=0 +-/_,.=2 +,,=0 +...=0 + +# Expected value is None, but we still need another blank line, otherwise panic. +do_nothing +---- + + +do_nothing +---- + + + + +# blank line is ok between test cases + + +repeat_me non_alphanumeric_characters-/,.non_alphanumeric_characters=-_@=+/,.-_@=+/,. \ +key_only \ +contiguous_equal==== \ + comma=, \ +none_none=(,) \ + type_2= \ +ws_none=( ,) \ + +---- +non_alphanumeric_characters-/,.non_alphanumeric_characters=-_@=+/,.-_@=+/,. 
+key_only= +contiguous_equal==== +comma=, +none_none= +type_2= +ws_none= diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/datadriven/datadriven_2.txt b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/datadriven/datadriven_2.txt new file mode 100644 index 0000000..8664b9c --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/datadriven/datadriven_2.txt @@ -0,0 +1,48 @@ +# Datadriven Test +# Input and output are separated by separator "----" +# You are accepted to add blank line between double "----", +# and you need another double "----" to end with the expected value. +# Remember a blank line to end the expected value. +fibonacci a=3 de=5 \ + \ + \ +b=5 +---- +a=3 +de=8 +b=8 + +factorial a=3 c=4 +---- +a=6 +c=24 + + +# test argument (5) +sum sum_num=3,2,4 a=2,90 b=() d= +---- +sum_num=9 +a=92 +b=0 +d=0 + +max a=(9,6,3) \ +b=3 c=3,2,1 \ +1 D=4,5,6 p=() +---- +a=9 +b=3 +c=3 +1=0 +D=6 +p=0 + +max a=(9,6,3)\ +b=(1,2)\ + c=(10,20)\ +d=(8,9) +---- +a=9 +b=2 +c=20 +d=9 diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/multiline b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/multiline new file mode 100644 index 0000000..bb45842 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/multiline @@ -0,0 +1,19 @@ +small +---- +---- +just +two lines of output +---- +---- + +large +---- +---- +more +than +five +lines +of +output +---- +---- \ No newline at end of file diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/basic-after b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/basic-after new file mode 100644 index 0000000..eb34c5a --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/basic-after @@ -0,0 +1,54 @@ +noop +---- + +noop +---- + +noop +---- + +noop +some input +---- +some input + +noop +some input +---- +some input + +duplicate +some 
input +---- +some input +some input + +duplicate +some input +---- +some input +some input + +duplicate-with-blank +some input +---- +---- +some input + +some input +---- +---- + +duplicate-with-blank +some input +---- +---- +some input + +some input +---- +---- + +no-output +some input +---- diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/basic-before b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/basic-before new file mode 100644 index 0000000..751f70b --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/basic-before @@ -0,0 +1,54 @@ +noop +---- + +noop +---- +xxx + +noop +---- +---- +xxx +---- +---- + +noop +some input +---- + +noop +some input +---- +xxx + +duplicate +some input +---- +yyy + +duplicate +some input +---- +---- +yyy +---- +---- + +duplicate-with-blank +some input +---- + +duplicate-with-blank +some input +---- +---- +some + +expected +---- +---- + +no-output +some input +---- +zzz \ No newline at end of file diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/eof-1-after b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/eof-1-after new file mode 100644 index 0000000..f650f08 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/eof-1-after @@ -0,0 +1,3 @@ +# Case where the last directive has blank output. +noop +---- diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/eof-1-before b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/eof-1-before new file mode 100644 index 0000000..8ed9c31 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/eof-1-before @@ -0,0 +1,3 @@ +# Case where the last directive has blank output. 
+noop +---- \ No newline at end of file diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/eof-2-after b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/eof-2-after new file mode 100644 index 0000000..4958b52 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/eof-2-after @@ -0,0 +1,4 @@ +# Case where the last directive has blank output (but the double-separator +# syntax is used in the test file). +noop +---- diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/eof-2-before b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/eof-2-before new file mode 100644 index 0000000..90b1255 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/rewrite/eof-2-before @@ -0,0 +1,8 @@ +# Case where the last directive has blank output (but the double-separator +# syntax is used in the test file). +noop +---- +---- +foo +---- +---- \ No newline at end of file diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/unknown_data_1.txt b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/unknown_data_1.txt new file mode 100644 index 0000000..e59ec04 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/unknown_data_1.txt @@ -0,0 +1,6 @@ +# `?` is not recognized. +# Because this test fails at line3, so the lines at the following is not read. +fibonacci a=3 b=?4 +--- +a=3 +b=5 diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/unknown_data_2.txt b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/unknown_data_2.txt new file mode 100644 index 0000000..50bb426 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/datadriven/src/testdata/unknown_data_2.txt @@ -0,0 +1,7 @@ +# `\` is not recognized. 
+fibonacci a=3 b=4 \\ +c=3 +---- +a=3 +b=5 +c=3 diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/examples/five_mem_node/main.rs b/systems/Raft-rs/driver/rust_so/raft-rs/examples/five_mem_node/main.rs new file mode 100644 index 0000000..eaf7202 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/examples/five_mem_node/main.rs @@ -0,0 +1,435 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// We use `default` method a lot to be support prost and rust-protobuf at the +// same time. And reassignment can be optimized by compiler. +#![allow(clippy::field_reassign_with_default)] + +use slog::Drain; +use std::collections::{HashMap, VecDeque}; +use std::sync::mpsc::{self, Receiver, Sender, SyncSender, TryRecvError}; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; +use std::{str, thread}; + +use protobuf::Message as PbMessage; +use raft::storage::MemStorage; +use raft::{prelude::*, StateRole}; +use regex::Regex; + +use slog::{error, info, o}; + +fn main() { + let decorator = slog_term::TermDecorator::new().build(); + let drain = slog_term::FullFormat::new(decorator).build().fuse(); + let drain = slog_async::Async::new(drain) + .chan_size(4096) + .overflow_strategy(slog_async::OverflowStrategy::Block) + .build() + .fuse(); + let logger = slog::Logger::root(drain, o!()); + + const NUM_NODES: u32 = 5; + // Create 5 mailboxes to send/receive messages. Every node holds a `Receiver` to receive + // messages from others, and uses the respective `Sender` to send messages to others. + let (mut tx_vec, mut rx_vec) = (Vec::new(), Vec::new()); + for _ in 0..NUM_NODES { + let (tx, rx) = mpsc::channel(); + tx_vec.push(tx); + rx_vec.push(rx); + } + + let (tx_stop, rx_stop) = mpsc::channel(); + let rx_stop = Arc::new(Mutex::new(rx_stop)); + + // A global pending proposals queue. New proposals will be pushed back into the queue, and + // after it's committed by the raft cluster, it will be poped from the queue. 
+ let proposals = Arc::new(Mutex::new(VecDeque::::new())); + + let mut handles = Vec::new(); + for (i, rx) in rx_vec.into_iter().enumerate() { + // A map[peer_id -> sender]. In the example we create 5 nodes, with ids in [1, 5]. + let mailboxes = (1..6u64).zip(tx_vec.iter().cloned()).collect(); + let mut node = match i { + // Peer 1 is the leader. + 0 => Node::create_raft_leader(1, rx, mailboxes, &logger), + // Other peers are followers. + _ => Node::create_raft_follower(rx, mailboxes), + }; + let proposals = Arc::clone(&proposals); + + // Tick the raft node per 100ms. So use an `Instant` to trace it. + let mut t = Instant::now(); + + // Clone the stop receiver + let rx_stop_clone = Arc::clone(&rx_stop); + let logger = logger.clone(); + // Here we spawn the node on a new thread and keep a handle so we can join on them later. + let handle = thread::spawn(move || loop { + thread::sleep(Duration::from_millis(10)); + loop { + // Step raft messages. + match node.my_mailbox.try_recv() { + Ok(msg) => node.step(msg, &logger), + Err(TryRecvError::Empty) => break, + Err(TryRecvError::Disconnected) => return, + } + } + + let raft_group = match node.raft_group { + Some(ref mut r) => r, + // When Node::raft_group is `None` it means the node is not initialized. + _ => continue, + }; + + if t.elapsed() >= Duration::from_millis(100) { + // Tick the raft. + raft_group.tick(); + t = Instant::now(); + } + + // Let the leader pick pending proposals from the global queue. + if raft_group.raft.state == StateRole::Leader { + // Handle new proposals. + let mut proposals = proposals.lock().unwrap(); + for p in proposals.iter_mut().skip_while(|p| p.proposed > 0) { + propose(raft_group, p); + } + } + + // Handle readies from the raft. + on_ready( + raft_group, + &mut node.kv_pairs, + &node.mailboxes, + &proposals, + &logger, + ); + + // Check control signals from the main thread. 
+ if check_signals(&rx_stop_clone) { + return; + }; + }); + handles.push(handle); + } + + // Propose some conf changes so that followers can be initialized. + add_all_followers(proposals.as_ref()); + + // Put 100 key-value pairs. + info!( + logger, + "We get a 5 nodes Raft cluster now, now propose 100 proposals" + ); + (0..100u16) + .filter(|i| { + let (proposal, rx) = Proposal::normal(*i, "hello, world".to_owned()); + proposals.lock().unwrap().push_back(proposal); + // After we got a response from `rx`, we can assume the put succeeded and following + // `get` operations can find the key-value pair. + rx.recv().unwrap() + }) + .count(); + + info!(logger, "Propose 100 proposals success!"); + + // Send terminate signals + for _ in 0..NUM_NODES { + tx_stop.send(Signal::Terminate).unwrap(); + } + + // Wait for the thread to finish + for th in handles { + th.join().unwrap(); + } +} + +enum Signal { + Terminate, +} + +fn check_signals(receiver: &Arc>>) -> bool { + match receiver.lock().unwrap().try_recv() { + Ok(Signal::Terminate) => true, + Err(TryRecvError::Empty) => false, + Err(TryRecvError::Disconnected) => true, + } +} + +struct Node { + // None if the raft is not initialized. + raft_group: Option>, + my_mailbox: Receiver, + mailboxes: HashMap>, + // Key-value pairs after applied. `MemStorage` only contains raft logs, + // so we need an additional storage engine. + kv_pairs: HashMap, +} + +impl Node { + // Create a raft leader only with itself in its configuration. + fn create_raft_leader( + id: u64, + my_mailbox: Receiver, + mailboxes: HashMap>, + logger: &slog::Logger, + ) -> Self { + let mut cfg = example_config(); + cfg.id = id; + let logger = logger.new(o!("tag" => format!("peer_{}", id))); + let mut s = Snapshot::default(); + // Because we don't use the same configuration to initialize every node, so we use + // a non-zero index to force new followers catch up logs by snapshot first, which will + // bring all nodes to the same initial state. 
+ s.mut_metadata().index = 1; + s.mut_metadata().term = 1; + s.mut_metadata().mut_conf_state().voters = vec![1]; + let storage = MemStorage::new(); + storage.wl().apply_snapshot(s).unwrap(); + let raft_group = Some(RawNode::new(&cfg, storage, &logger).unwrap()); + Node { + raft_group, + my_mailbox, + mailboxes, + kv_pairs: Default::default(), + } + } + + // Create a raft follower. + fn create_raft_follower( + my_mailbox: Receiver, + mailboxes: HashMap>, + ) -> Self { + Node { + raft_group: None, + my_mailbox, + mailboxes, + kv_pairs: Default::default(), + } + } + + // Initialize raft for followers. + fn initialize_raft_from_message(&mut self, msg: &Message, logger: &slog::Logger) { + if !is_initial_msg(msg) { + return; + } + let mut cfg = example_config(); + cfg.id = msg.to; + let logger = logger.new(o!("tag" => format!("peer_{}", msg.to))); + let storage = MemStorage::new(); + self.raft_group = Some(RawNode::new(&cfg, storage, &logger).unwrap()); + } + + // Step a raft message, initialize the raft if need. + fn step(&mut self, msg: Message, logger: &slog::Logger) { + if self.raft_group.is_none() { + if is_initial_msg(&msg) { + self.initialize_raft_from_message(&msg, logger); + } else { + return; + } + } + let raft_group = self.raft_group.as_mut().unwrap(); + let _ = raft_group.step(msg); + } +} + +fn on_ready( + raft_group: &mut RawNode, + kv_pairs: &mut HashMap, + mailboxes: &HashMap>, + proposals: &Mutex>, + logger: &slog::Logger, +) { + if !raft_group.has_ready() { + return; + } + let store = raft_group.raft.raft_log.store.clone(); + + // Get the `Ready` with `RawNode::ready` interface. + let mut ready = raft_group.ready(); + + let handle_messages = |msgs: Vec| { + for msg in msgs { + let to = msg.to; + if mailboxes[&to].send(msg).is_err() { + error!( + logger, + "send raft message to {} fail, let Raft retry it", to + ); + } + } + }; + + if !ready.messages().is_empty() { + // Send out the messages come from the node. 
+ handle_messages(ready.take_messages()); + } + + // Apply the snapshot. It's necessary because in `RawNode::advance` we stabilize the snapshot. + if *ready.snapshot() != Snapshot::default() { + let s = ready.snapshot().clone(); + if let Err(e) = store.wl().apply_snapshot(s) { + error!( + logger, + "apply snapshot fail: {:?}, need to retry or panic", e + ); + return; + } + } + + let reg = Regex::new("put ([0-9]+) (.+)").unwrap(); + let mut handle_committed_entries = + |rn: &mut RawNode, committed_entries: Vec| { + for entry in committed_entries { + if entry.data.is_empty() { + // From new elected leaders. + continue; + } + if let EntryType::EntryConfChange = entry.get_entry_type() { + // For conf change messages, make them effective. + let mut cc = ConfChange::default(); + cc.merge_from_bytes(&entry.data).unwrap(); + let cs = rn.apply_conf_change(&cc).unwrap(); + store.wl().set_conf_state(cs); + } else { + // For normal proposals, extract the key-value pair and then + // insert them into the kv engine. + let data = str::from_utf8(&entry.data).unwrap(); + if let Some(caps) = reg.captures(data) { + kv_pairs.insert(caps[1].parse().unwrap(), caps[2].to_string()); + } + } + if rn.raft.state == StateRole::Leader { + // The leader should response to the clients, tell them if their proposals + // succeeded or not. + let proposal = proposals.lock().unwrap().pop_front().unwrap(); + proposal.propose_success.send(true).unwrap(); + } + } + }; + // Apply all committed entries. + handle_committed_entries(raft_group, ready.take_committed_entries()); + + // Persistent raft logs. It's necessary because in `RawNode::advance` we stabilize + // raft logs to the latest position. + if let Err(e) = store.wl().append(ready.entries()) { + error!( + logger, + "persist raft log fail: {:?}, need to retry or panic", e + ); + return; + } + + if let Some(hs) = ready.hs() { + // Raft HardState changed, and we need to persist it. 
+ store.wl().set_hardstate(hs.clone()); + } + + if !ready.persisted_messages().is_empty() { + // Send out the persisted messages come from the node. + handle_messages(ready.take_persisted_messages()); + } + + // Call `RawNode::advance` interface to update position flags in the raft. + let mut light_rd = raft_group.advance(ready); + // Update commit index. + if let Some(commit) = light_rd.commit_index() { + store.wl().mut_hard_state().set_commit(commit); + } + // Send out the messages. + handle_messages(light_rd.take_messages()); + // Apply all committed entries. + handle_committed_entries(raft_group, light_rd.take_committed_entries()); + // Advance the apply index. + raft_group.advance_apply(); +} + +fn example_config() -> Config { + Config { + election_tick: 10, + heartbeat_tick: 3, + ..Default::default() + } +} + +// The message can be used to initialize a raft node or not. +fn is_initial_msg(msg: &Message) -> bool { + let msg_type = msg.get_msg_type(); + msg_type == MessageType::MsgRequestVote + || msg_type == MessageType::MsgRequestPreVote + || (msg_type == MessageType::MsgHeartbeat && msg.commit == 0) +} + +struct Proposal { + normal: Option<(u16, String)>, // key is an u16 integer, and value is a string. + conf_change: Option, // conf change. + transfer_leader: Option, + // If it's proposed, it will be set to the index of the entry. 
+ proposed: u64, + propose_success: SyncSender, +} + +impl Proposal { + fn conf_change(cc: &ConfChange) -> (Self, Receiver) { + let (tx, rx) = mpsc::sync_channel(1); + let proposal = Proposal { + normal: None, + conf_change: Some(cc.clone()), + transfer_leader: None, + proposed: 0, + propose_success: tx, + }; + (proposal, rx) + } + + fn normal(key: u16, value: String) -> (Self, Receiver) { + let (tx, rx) = mpsc::sync_channel(1); + let proposal = Proposal { + normal: Some((key, value)), + conf_change: None, + transfer_leader: None, + proposed: 0, + propose_success: tx, + }; + (proposal, rx) + } +} + +fn propose(raft_group: &mut RawNode, proposal: &mut Proposal) { + let last_index1 = raft_group.raft.raft_log.last_index() + 1; + if let Some((ref key, ref value)) = proposal.normal { + let data = format!("put {} {}", key, value).into_bytes(); + let _ = raft_group.propose(vec![], data); + } else if let Some(ref cc) = proposal.conf_change { + let _ = raft_group.propose_conf_change(vec![], cc.clone()); + } else if let Some(_transferee) = proposal.transfer_leader { + // TODO: implement transfer leader. + unimplemented!(); + } + + let last_index2 = raft_group.raft.raft_log.last_index() + 1; + if last_index2 == last_index1 { + // Propose failed, don't forget to respond to the client. + proposal.propose_success.send(false).unwrap(); + } else { + proposal.proposed = last_index1; + } +} + +// Proposes some conf change for peers [2, 5]. 
+fn add_all_followers(proposals: &Mutex>) { + for i in 2..6u64 { + let mut conf_change = ConfChange::default(); + conf_change.node_id = i; + conf_change.set_change_type(ConfChangeType::AddNode); + loop { + let (proposal, rx) = Proposal::conf_change(&conf_change); + proposals.lock().unwrap().push_back(proposal); + if rx.recv().unwrap() { + break; + } + thread::sleep(Duration::from_millis(100)); + } + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/examples/single_mem_node/main.rs b/systems/Raft-rs/driver/rust_so/raft-rs/examples/single_mem_node/main.rs new file mode 100644 index 0000000..c60637f --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/examples/single_mem_node/main.rs @@ -0,0 +1,205 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +use slog::{Drain, Logger}; +use std::collections::HashMap; +use std::sync::mpsc::{self, RecvTimeoutError}; +use std::thread; +use std::time::{Duration, Instant}; + +use raft::prelude::*; +use raft::storage::MemStorage; + +use slog::{info, o}; + +type ProposeCallback = Box; + +enum Msg { + Propose { + id: u8, + cb: ProposeCallback, + }, + // Here we don't use Raft Message, so use dead_code to + // avoid the compiler warning. + #[allow(dead_code)] + Raft(Message), +} + +// A simple example about how to use the Raft library in Rust. +fn main() { + // Create a storage for Raft, and here we just use a simple memory storage. + // You need to build your own persistent storage in your production. + // Please check the Storage trait in src/storage.rs to see how to implement one. 
+ let storage = MemStorage::new_with_conf_state(ConfState::from((vec![1], vec![]))); + + let decorator = slog_term::TermDecorator::new().build(); + let drain = slog_term::FullFormat::new(decorator).build().fuse(); + let drain = slog_async::Async::new(drain) + .chan_size(4096) + .overflow_strategy(slog_async::OverflowStrategy::Block) + .build() + .fuse(); + let logger = slog::Logger::root(drain, o!("tag" => format!("[{}]", 1))); + + // Create the configuration for the Raft node. + let cfg = Config { + // The unique ID for the Raft node. + id: 1, + // Election tick is for how long the follower may campaign again after + // it doesn't receive any message from the leader. + election_tick: 10, + // Heartbeat tick is for how long the leader needs to send + // a heartbeat to keep alive. + heartbeat_tick: 3, + // The max size limits the max size of each appended message. Mostly, 1 MB is enough. + max_size_per_msg: 1024 * 1024 * 1024, + // Max inflight msgs that the leader sends messages to follower without + // receiving ACKs. + max_inflight_msgs: 256, + // The Raft applied index. + // You need to save your applied index when you apply the committed Raft logs. + applied: 0, + ..Default::default() + }; + + // Create the Raft node. + let mut r = RawNode::new(&cfg, storage, &logger).unwrap(); + + let (sender, receiver) = mpsc::channel(); + + // Use another thread to propose a Raft request. + send_propose(logger.clone(), sender); + + // Loop forever to drive the Raft. + let mut t = Instant::now(); + let mut timeout = Duration::from_millis(100); + + // Use a HashMap to hold the `propose` callbacks. 
+ let mut cbs = HashMap::new(); + + loop { + match receiver.recv_timeout(timeout) { + Ok(Msg::Propose { id, cb }) => { + cbs.insert(id, cb); + r.propose(vec![], vec![id]).unwrap(); + } + Ok(Msg::Raft(m)) => r.step(m).unwrap(), + Err(RecvTimeoutError::Timeout) => (), + Err(RecvTimeoutError::Disconnected) => return, + } + + let d = t.elapsed(); + t = Instant::now(); + if d >= timeout { + timeout = Duration::from_millis(100); + // We drive Raft every 100ms. + r.tick(); + } else { + timeout -= d; + } + on_ready(&mut r, &mut cbs); + } +} + +fn on_ready(raft_group: &mut RawNode, cbs: &mut HashMap) { + if !raft_group.has_ready() { + return; + } + let store = raft_group.raft.raft_log.store.clone(); + + // Get the `Ready` with `RawNode::ready` interface. + let mut ready = raft_group.ready(); + + let handle_messages = |msgs: Vec| { + for _msg in msgs { + // Send messages to other peers. + } + }; + + if !ready.messages().is_empty() { + // Send out the messages come from the node. + handle_messages(ready.take_messages()); + } + + if !ready.snapshot().is_empty() { + // This is a snapshot, we need to apply the snapshot at first. + store.wl().apply_snapshot(ready.snapshot().clone()).unwrap(); + } + + let mut _last_apply_index = 0; + let mut handle_committed_entries = |committed_entries: Vec| { + for entry in committed_entries { + // Mostly, you need to save the last apply index to resume applying + // after restart. Here we just ignore this because we use a Memory storage. + _last_apply_index = entry.index; + + if entry.data.is_empty() { + // Empty entry, when the peer becomes Leader it will send an empty entry. + continue; + } + + if entry.get_entry_type() == EntryType::EntryNormal { + if let Some(cb) = cbs.remove(entry.data.first().unwrap()) { + cb(); + } + } + + // TODO: handle EntryConfChange + } + }; + handle_committed_entries(ready.take_committed_entries()); + + if !ready.entries().is_empty() { + // Append entries to the Raft log. 
+ store.wl().append(ready.entries()).unwrap(); + } + + if let Some(hs) = ready.hs() { + // Raft HardState changed, and we need to persist it. + store.wl().set_hardstate(hs.clone()); + } + + if !ready.persisted_messages().is_empty() { + // Send out the persisted messages come from the node. + handle_messages(ready.take_persisted_messages()); + } + + // Advance the Raft. + let mut light_rd = raft_group.advance(ready); + // Update commit index. + if let Some(commit) = light_rd.commit_index() { + store.wl().mut_hard_state().set_commit(commit); + } + // Send out the messages. + handle_messages(light_rd.take_messages()); + // Apply all committed entries. + handle_committed_entries(light_rd.take_committed_entries()); + // Advance the apply index. + raft_group.advance_apply(); +} + +fn send_propose(logger: Logger, sender: mpsc::Sender) { + thread::spawn(move || { + // Wait some time and send the request to the Raft. + thread::sleep(Duration::from_secs(10)); + + let (s1, r1) = mpsc::channel::(); + + info!(logger, "propose a request"); + + // Send a command to the Raft, wait for the Raft to apply it + // and get the result. + sender + .send(Msg::Propose { + id: 1, + cb: Box::new(move || { + s1.send(0).unwrap(); + }), + }) + .unwrap(); + + let n = r1.recv().unwrap(); + assert_eq!(n, 0); + + info!(logger, "receive the propose callback"); + }); +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/harness/Cargo.toml b/systems/Raft-rs/driver/rust_so/raft-rs/harness/Cargo.toml new file mode 100644 index 0000000..5010d76 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/harness/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "harness" +version = "0.1.0" +authors = ["The TiKV Project Developers"] +license = "Apache-2.0" +keywords = [] +repository = "https://github.com/pingcap/raft-rs/harness" +readme = "README.md" +homepage = "https://github.com/pingcap/raft-rs/harness" +description = "A testing harness for Raft." 
+categories = [] +edition = "2018" + +[features] +default = ["protobuf-codec", "raft/default-logger"] +# Enable failpoints +failpoints = ["fail/failpoints"] +protobuf-codec = ["raft/protobuf-codec"] +prost-codec = ["raft/prost-codec"] + +# Make sure to synchronize updates with Raft. +[dependencies] +raft = { path = "..", default-features = false } +raft-proto = { path = "../proto", default-features = false } +fail = { version = "0.4", optional = true } +rand = "0.8" +slog = "2.2" + +[dev-dependencies] +criterion = "0.3" +fxhash = "0.2.1" +lazy_static = "1" +protobuf = "2" +regex = "1" diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/harness/src/interface.rs b/systems/Raft-rs/driver/rust_so/raft-rs/harness/src/interface.rs new file mode 100644 index 0000000..3e86f6f --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/harness/src/interface.rs @@ -0,0 +1,101 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use raft::{eraftpb::Message, storage::MemStorage, Raft, Result}; +use std::ops::{Deref, DerefMut}; + +/// A simulated Raft facade for testing. +/// +/// If the contained value is a `Some` operations happen. If they are a `None` operations are +/// a no-op. +/// +// Compare to upstream, we use struct instead of trait here. 
+// Because to be able to cast Interface later, we have to make +// Raft derive Any, which will require a lot of dependencies to derive Any. +// That's not worthy for just testing purpose. +pub struct Interface { + /// The raft peer. + pub raft: Option>, +} + +impl Interface { + /// Create a new interface to a new raft. + pub fn new(r: Raft) -> Interface { + Interface { raft: Some(r) } + } + + /// Step the raft, if it exists. + pub fn step(&mut self, m: Message) -> Result<()> { + match self.raft { + Some(_) => Raft::step(self, m), + None => Ok(()), + } + } + + /// Read messages out of the raft. + pub fn read_messages(&mut self) -> Vec { + match self.raft { + Some(_) => self.msgs.drain(..).collect(), + None => vec![], + } + } + + /// Persist the unstable snapshot and entries. + pub fn persist(&mut self) { + if self.raft.is_some() { + if let Some(snapshot) = self.raft_log.unstable_snapshot() { + let snap = snapshot.clone(); + let index = snap.get_metadata().index; + self.raft_log.stable_snap(index); + self.mut_store().wl().apply_snapshot(snap).expect(""); + self.on_persist_snap(index); + self.commit_apply(index); + } + let unstable = self.raft_log.unstable_entries().to_vec(); + if let Some(e) = unstable.last() { + let (last_idx, last_term) = (e.get_index(), e.get_term()); + self.raft_log.stable_entries(last_idx, last_term); + self.mut_store().wl().append(&unstable).expect(""); + self.on_persist_entries(last_idx, last_term); + } + } + } +} + +impl From>> for Interface { + fn from(raft: Option>) -> Self { + Self { raft } + } +} + +impl From> for Interface { + fn from(raft: Raft) -> Self { + Self { raft: Some(raft) } + } +} + +impl Deref for Interface { + type Target = Raft; + fn deref(&self) -> &Raft { + self.raft.as_ref().unwrap() + } +} + +impl DerefMut for Interface { + fn deref_mut(&mut self) -> &mut Raft { + self.raft.as_mut().unwrap() + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/harness/src/lib.rs 
b/systems/Raft-rs/driver/rust_so/raft-rs/harness/src/lib.rs new file mode 100644 index 0000000..7d27a6c --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/harness/src/lib.rs @@ -0,0 +1,28 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/*! + +This module contains various testing harness utilities for Raft. + +> If you want to build Raft without this, disable the `harness` feature. + +*/ + +mod interface; +mod network; + +pub use self::{interface::Interface, network::Network}; diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/harness/src/network.rs b/systems/Raft-rs/driver/rust_so/raft-rs/harness/src/network.rs new file mode 100644 index 0000000..5c7c029 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/harness/src/network.rs @@ -0,0 +1,226 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use raft::{ + eraftpb::{ConfState, Message, MessageType}, + storage::MemStorage, + Config, Raft, Result, NO_LIMIT, +}; +use slog::Logger; + +use super::interface::Interface; + +/// A connection from one node to another. +/// +/// Used in by `Network` for determining drop rates on messages. +#[derive(Default, Debug, PartialEq, Eq, Hash)] +struct Connection { + from: u64, + to: u64, +} + +/// A simulated network for testing. +/// +/// You can use this to create a test network of Raft nodes. +/// +/// *Please note:* no actual network calls are made. +#[derive(Default)] +pub struct Network { + /// The set of raft peers. + pub peers: HashMap, + /// The storage of the raft peers. + pub storage: HashMap, + /// Drop messages from `from` to `to` at a rate of `f64`. + dropm: HashMap, + /// Drop messages of type `MessageType`. + ignorem: HashMap, +} + +impl Network { + /// Get a base config. Calling `Network::new` will initialize peers with this config. + pub fn default_config() -> Config { + Config { + election_tick: 10, + heartbeat_tick: 1, + max_size_per_msg: NO_LIMIT, + max_inflight_msgs: 256, + ..Default::default() + } + } + + /// Initializes a network from `peers`. + /// + /// Nodes will receive their ID based on their index in the vector, starting with 1. + /// + /// A `None` node will be replaced with a new Raft node, and its configuration will + /// be `peers`. + pub fn new(peers: Vec>, l: &Logger) -> Network { + let config = Network::default_config(); + Network::new_with_config(peers, &config, l) + } + + /// Initialize a network from `peers` with explicitly specified `config`. 
+ pub fn new_with_config( + mut peers: Vec>, + config: &Config, + l: &Logger, + ) -> Network { + let mut nstorage = HashMap::new(); + let mut npeers = HashMap::new(); + + let peer_addrs: Vec = (1..=peers.len() as u64).collect(); + for (p, id) in peers.drain(..).zip(&peer_addrs) { + match p { + None => { + let conf_state = ConfState::from((peer_addrs.clone(), vec![])); + let store = MemStorage::new_with_conf_state(conf_state); + nstorage.insert(*id, store.clone()); + let mut config = config.clone(); + config.id = *id; + let r = Raft::new(&config, store, l).unwrap().into(); + npeers.insert(*id, r); + } + Some(r) => { + if let Some(raft) = r.raft.as_ref() { + if raft.id != *id { + panic!("peer {} in peers has a wrong position", r.id); + } + let store = raft.raft_log.store.clone(); + nstorage.insert(*id, store); + } + npeers.insert(*id, r); + } + } + } + Network { + peers: npeers, + storage: nstorage, + ..Default::default() + } + } + + /// Ignore a given `MessageType`. + pub fn ignore(&mut self, t: MessageType) { + self.ignorem.insert(t, true); + } + + /// Filter out messages that should be dropped according to rules set by `ignore` or `drop`. + pub fn filter(&self, msgs: impl IntoIterator) -> Vec { + msgs.into_iter() + .filter(|m| { + if self + .ignorem + .get(&m.get_msg_type()) + .cloned() + .unwrap_or(false) + { + return false; + } + // hups never go over the network, so don't drop them but panic + assert_ne!(m.get_msg_type(), MessageType::MsgHup, "unexpected msgHup"); + let perc = self + .dropm + .get(&Connection { + from: m.from, + to: m.to, + }) + .cloned() + .unwrap_or(0f64); + rand::random::() >= perc + }) + .collect() + } + + /// Read out all messages generated by peers in the `Network`. + /// + /// Note: messages are not filtered by any configured filters. 
+ pub fn read_messages(&mut self) -> Vec { + self.peers + .iter_mut() + .flat_map(|(_peer, progress)| progress.read_messages()) + .collect() + } + + /// Instruct the cluster to `step` through the given messages. + /// + /// NOTE: the given `msgs` won't be filtered by its filters. + pub fn send(&mut self, msgs: Vec) { + let mut msgs = msgs; + while !msgs.is_empty() { + let mut new_msgs = vec![]; + for m in msgs.drain(..) { + let resp = { + let p = self.peers.get_mut(&m.to).unwrap(); + let _ = p.step(m); + // The unstable data should be persisted before sending msg. + p.persist(); + p.read_messages() + }; + new_msgs.append(&mut self.filter(resp)); + } + msgs.append(&mut new_msgs); + } + } + + /// Filter `msgs` and then instruct the cluster to `step` through the given messages. + pub fn filter_and_send(&mut self, msgs: Vec) { + self.send(self.filter(msgs)); + } + + /// Dispatches the given messages to the appropriate peers. + /// + /// Unlike `send` this does not gather and send any responses. It also does not ignore errors. + pub fn dispatch(&mut self, messages: impl IntoIterator) -> Result<()> { + for message in self.filter(messages.into_iter()) { + let to = message.to; + let peer = self.peers.get_mut(&to).unwrap(); + peer.step(message)?; + } + Ok(()) + } + + /// Ignore messages from `from` to `to` at `perc` percent chance. + /// + /// `perc` set to `1f64` is a 100% chance, `0f64` is a 0% chance. + pub fn drop(&mut self, from: u64, to: u64, perc: f64) { + self.dropm.insert(Connection { from, to }, perc); + } + + /// Cut the communication between the two given nodes. + pub fn cut(&mut self, one: u64, other: u64) { + self.drop(one, other, 1f64); + self.drop(other, one, 1f64); + } + + /// Isolate the given raft to and from all other raft in the cluster. 
+ pub fn isolate(&mut self, id: u64) { + for i in 0..self.peers.len() as u64 { + let nid = i + 1; + if nid != id { + self.drop(id, nid, 1.0); + self.drop(nid, id, 1.0); + } + } + } + + /// Recover the cluster conditions applied with `drop` and `ignore`. + pub fn recover(&mut self) { + self.dropm = HashMap::new(); + self.ignorem = HashMap::new(); + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/failpoints_cases/mod.rs b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/failpoints_cases/mod.rs new file mode 100644 index 0000000..0955d13 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/failpoints_cases/mod.rs @@ -0,0 +1,39 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +use crate::test_util::*; +use fail; +use raft::{default_logger, eraftpb::MessageType}; +use std::sync::*; + +// test_reject_stale_term_message tests that if a server receives a request with +// a stale term number, it rejects the request. +// Our implementation ignores the request instead. +// Reference: section 5.1 +#[test] +fn test_reject_stale_term_message() { + let scenario = fail::FailScenario::setup(); + let l = default_logger(); + let mut r = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + fail::cfg("before_step", "panic").unwrap(); + r.load_state(&hard_state(2, 0, 0)); + + let mut m = new_message(0, 0, MessageType::MsgAppend, 0); + m.term = r.term - 1; + r.step(m).expect(""); + scenario.teardown(); +} + +// ensure that the Step function ignores the message from old term and does not pass it to the +// actual stepX function. 
+#[test] +fn test_step_ignore_old_term_msg() { + let scenario = fail::FailScenario::setup(); + let l = default_logger(); + let mut sm = new_test_raft(1, vec![1], 10, 1, new_storage(), &l); + fail::cfg("before_step", "panic").unwrap(); + sm.term = 2; + let mut m = new_message(0, 0, MessageType::MsgAppend, 0); + m.term = 1; + sm.step(m).expect(""); + scenario.teardown(); +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/mod.rs b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/mod.rs new file mode 100644 index 0000000..5e3928f --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/mod.rs @@ -0,0 +1,7 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +mod test_raft; +mod test_raft_flow_control; +mod test_raft_paper; +mod test_raft_snap; +mod test_raw_node; diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/test_raft.rs b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/test_raft.rs new file mode 100644 index 0000000..9f73bb1 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/test_raft.rs @@ -0,0 +1,5912 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::cmp; +use std::collections::HashMap; +use std::panic::{self, AssertUnwindSafe}; + +use harness::*; +use protobuf::Message as PbMessage; +use raft::eraftpb::*; +use raft::storage::MemStorage; +use raft::*; +use raft_proto::*; +use slog::Logger; + +use crate::integration_cases::test_raft_paper::commit_noop_entry; +use crate::test_util::*; + +type HashSet = std::collections::HashSet>; + +fn read_messages(raft: &mut Raft) -> Vec { + raft.msgs.drain(..).collect() +} + +fn ents_with_config( + terms: &[u64], + pre_vote: bool, + id: u64, + peers: Vec, + l: &Logger, +) -> Interface { + let store = MemStorage::new_with_conf_state((peers.clone(), vec![])); + for (i, term) in terms.iter().enumerate() { + let mut e = Entry::default(); + e.index = i as u64 + 1; + e.term = *term; + store.wl().append(&[e]).expect(""); + } + let mut raft = new_test_raft_with_prevote(id, peers, 5, 1, store, pre_vote, l); + raft.reset(terms[terms.len() - 1]); + raft +} + +fn assert_raft_log( + prefix: &str, + raft_log: &RaftLog, + (committed, applied, last): (u64, u64, u64), +) { + assert_eq!( + raft_log.committed, committed, + "{}committed = {}, want = {}", + prefix, raft_log.committed, committed + ); + assert_eq!( + raft_log.applied, applied, + "{}applied = {}, want = {}", + prefix, raft_log.applied, applied + ); + assert_eq!( + raft_log.last_index(), + last, + "{}last_index = {}, want = {}", + prefix, + raft_log.last_index(), + last + ); +} + +// voted_with_config creates a raft state machine with vote and term set +// to the given value but no log entries (indicating that it voted in +// the given term but has not received any logs). 
+fn voted_with_config( + vote: u64, + term: u64, + pre_vote: bool, + id: u64, + peers: Vec, + l: &Logger, +) -> Interface { + let store = MemStorage::new_with_conf_state((peers.clone(), vec![])); + store.wl().mut_hard_state().vote = vote; + store.wl().mut_hard_state().term = term; + let mut raft = new_test_raft_with_prevote(id, peers, 5, 1, store, pre_vote, l); + raft.reset(term); + raft +} + +// Persist committed index and fetch next entries. +fn next_ents(r: &mut Raft, s: &MemStorage) -> Vec { + let unstable = r.raft_log.unstable_entries().to_vec(); + if let Some(e) = unstable.last() { + let (last_idx, last_term) = (e.get_index(), e.get_term()); + r.raft_log.stable_entries(last_idx, last_term); + s.wl().append(&unstable).expect(""); + r.on_persist_entries(last_idx, last_term); + } + let ents = r.raft_log.next_entries(None); + r.commit_apply(r.raft_log.committed); + ents.unwrap_or_default() +} + +#[test] +fn test_progress_committed_index() { + let l = default_logger(); + let mut nt = Network::new(vec![None, None, None], &l); + + // set node 1 as Leader + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + assert_eq!(nt.peers[&1].state, StateRole::Leader); + + assert_raft_log("#1: ", &nt.peers[&1].raft_log, (1, 0, 1)); + assert_raft_log("#2: ", &nt.peers[&2].raft_log, (1, 0, 1)); + assert_raft_log("#3: ", &nt.peers[&3].raft_log, (1, 0, 1)); + + assert_eq!( + ( + nt.peers[&1].prs().get(1).unwrap().committed_index, + nt.peers[&1].prs().get(2).unwrap().committed_index, + nt.peers[&1].prs().get(3).unwrap().committed_index + ), + (1, 1, 1) + ); + + // #1 test append entries + // append entries between 1 and 2 + let mut test_entries = Entry::default(); + test_entries.data = (b"testdata" as &'static [u8]).into(); + let m = new_message_with_entries(1, 1, MessageType::MsgPropose, vec![test_entries]); + nt.cut(1, 3); + nt.send(vec![m.clone(), m]); + nt.recover(); + + assert_raft_log("#1: ", &nt.peers[&1].raft_log, (3, 0, 3)); + assert_raft_log("#2: ", 
&nt.peers[&2].raft_log, (3, 0, 3)); + assert_raft_log("#3: ", &nt.peers[&3].raft_log, (1, 0, 1)); + + assert_eq!( + ( + nt.peers[&1].prs().get(1).unwrap().committed_index, + nt.peers[&1].prs().get(2).unwrap().committed_index, + nt.peers[&1].prs().get(3).unwrap().committed_index + ), + (3, 3, 1) + ); + + // #2 test heartbeat + let heartbeat = new_message(1, 1, MessageType::MsgBeat, 0); + nt.send(vec![heartbeat]); + + assert_raft_log("#1: ", &nt.peers[&1].raft_log, (3, 0, 3)); + assert_raft_log("#2: ", &nt.peers[&2].raft_log, (3, 0, 3)); + assert_raft_log("#3: ", &nt.peers[&3].raft_log, (3, 0, 3)); + + assert_eq!( + ( + nt.peers[&1].prs().get(1).unwrap().committed_index, + nt.peers[&1].prs().get(2).unwrap().committed_index, + nt.peers[&1].prs().get(3).unwrap().committed_index + ), + (3, 3, 3) + ); + + // set node 2 as Leader + nt.send(vec![new_message(2, 2, MessageType::MsgHup, 0)]); + assert_eq!(nt.peers[&2].state, StateRole::Leader); + + assert_raft_log("#1: ", &nt.peers[&1].raft_log, (4, 0, 4)); + assert_raft_log("#2: ", &nt.peers[&2].raft_log, (4, 0, 4)); + assert_raft_log("#3: ", &nt.peers[&3].raft_log, (4, 0, 4)); + + assert_eq!( + ( + nt.peers[&2].prs().get(1).unwrap().committed_index, + nt.peers[&2].prs().get(2).unwrap().committed_index, + nt.peers[&2].prs().get(3).unwrap().committed_index + ), + (4, 4, 4) + ); + + // #3 test append entries rejection (fails to update committed index) + nt.isolate(2); + nt.send(vec![new_message(2, 2, MessageType::MsgPropose, 2)]); + nt.recover(); + nt.dispatch(vec![new_message(2, 2, MessageType::MsgPropose, 1)]) + .expect(""); + + // [msg_type: MsgAppend to: 1 from: 2 term: 2 log_term: 2 index: 6 entries {term: 2 index: 7 data: "somedata"} commit: 4, + // msg_type: MsgAppend to: 3 from: 2 term: 2 log_term: 2 index: 6 entries {term: 2 index: 7 data: "somedata"} commit: 4] + let msg_append = nt.read_messages(); + + nt.dispatch(msg_append).expect(""); + + // [msg_type: MsgAppendResponse to: 2 from: 1 term: 2 index: 6 commit: 4 
reject: true reject_hint: 4, + // msg_type: MsgAppendResponse to: 2 from: 3 term: 2 index: 6 commit: 4 reject: true reject_hint: 4] + let msg_append_response = nt.read_messages(); + + nt.dispatch(msg_append_response).expect(""); + + // [msg_type: MsgAppend to: 3 from: 2 term: 2 log_term: 2 index: 4 entries {term: 2 index: 5 data: "somedata"} entries {term: 2 index: 6 data: "somedata"} entries {term: 2 index: 7 data: "somedata"} commit: 4, + // msg_type: MsgAppend to: 1 from: 2 term: 2 log_term: 2 index: 4 entries {term: 2 index: 5 data: "somedata"} entries {term: 2 index: 6 data: "somedata"} entries {term: 2 index: 7 data: "somedata"} commit: 4] + let msg_append = nt.read_messages(); + + // committed index remain the same + assert_eq!( + ( + nt.peers[&2].prs().get(1).unwrap().committed_index, + nt.peers[&2].prs().get(2).unwrap().committed_index, + nt.peers[&2].prs().get(3).unwrap().committed_index + ), + (4, 4, 4) + ); + + // resend append + nt.send(msg_append); + + // log is up-to-date + assert_eq!( + ( + nt.peers[&2].prs().get(1).unwrap().committed_index, + nt.peers[&2].prs().get(2).unwrap().committed_index, + nt.peers[&2].prs().get(3).unwrap().committed_index + ), + (7, 7, 7) + ); + + // set node 1 as Leader again + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + assert_eq!(nt.peers[&1].state, StateRole::Leader); + + assert_raft_log("#1: ", &nt.peers[&1].raft_log, (8, 0, 8)); + assert_raft_log("#2: ", &nt.peers[&2].raft_log, (8, 0, 8)); + assert_raft_log("#3: ", &nt.peers[&3].raft_log, (8, 0, 8)); + + // update to 8 + assert_eq!( + ( + nt.peers[&1].prs().get(1).unwrap().committed_index, + nt.peers[&1].prs().get(2).unwrap().committed_index, + nt.peers[&1].prs().get(3).unwrap().committed_index + ), + (8, 8, 8) + ); + + // #4 pass a smaller committed index, it occurs when the append response delay + + nt.dispatch(vec![ + new_message(1, 1, MessageType::MsgPropose, 1), + new_message(1, 1, MessageType::MsgPropose, 1), + ]) + .expect(""); + let msg_append = 
nt.read_messages(); + nt.dispatch(msg_append).expect(""); + let msg_append_response = nt.read_messages(); + nt.dispatch(msg_append_response).expect(""); + let msg_append = nt.read_messages(); + nt.dispatch(msg_append).expect(""); + let mut msg_append_response = nt.read_messages(); + // m1: msg_type: MsgAppendResponse to: 1 from: 3 term: 3 index: 10 commit: 10 + // m2: msg_type: MsgAppendResponse to: 1 from: 2 term: 3 index: 10 commit: 10 + let m1 = msg_append_response.remove(1); + let m2 = msg_append_response.remove(2); + nt.send(vec![m1, m2]); + + assert_eq!( + ( + nt.peers[&1].prs().get(1).unwrap().committed_index, + nt.peers[&1].prs().get(2).unwrap().committed_index, + nt.peers[&1].prs().get(3).unwrap().committed_index + ), + (10, 10, 10) + ); + + // committed index remain 10 + + // msg_type: MsgAppendResponse to: 1 from: 2 term: 3 index: 10 commit: 9, + // msg_type: MsgAppendResponse to: 1 from: 3 term: 3 index: 10 commit: 9 + nt.send(msg_append_response); + assert_eq!( + ( + nt.peers[&1].prs().get(1).unwrap().committed_index, + nt.peers[&1].prs().get(2).unwrap().committed_index, + nt.peers[&1].prs().get(3).unwrap().committed_index + ), + (10, 10, 10) + ); +} + +#[test] +fn test_progress_leader() { + let l = default_logger(); + let mut raft = new_test_raft(1, vec![1, 2], 5, 1, new_storage(), &l); + raft.become_candidate(); + raft.become_leader(); + // For no-op entry + raft.persist(); + raft.mut_prs().get_mut(2).unwrap().become_replicate(); + + let prop_msg = new_message(1, 1, MessageType::MsgPropose, 1); + for i in 0..5 { + assert_eq!( + raft.mut_prs().get_mut(1).unwrap().state, + ProgressState::Replicate + ); + + let matched = raft.mut_prs().get_mut(1).unwrap().matched; + let next_idx = raft.mut_prs().get_mut(1).unwrap().next_idx; + assert_eq!(matched, i + 1); + assert_eq!(next_idx, matched + 1); + + raft.step(prop_msg.clone()).unwrap(); + raft.persist(); + } +} + +// test_progress_resume_by_heartbeat_resp ensures raft.heartbeat reset progress.paused by +// 
heartbeat response. +#[test] +fn test_progress_resume_by_heartbeat_resp() { + let l = default_logger(); + let mut raft = new_test_raft(1, vec![1, 2], 5, 1, new_storage(), &l); + raft.become_candidate(); + raft.become_leader(); + raft.mut_prs().get_mut(2).unwrap().paused = true; + + raft.step(new_message(1, 1, MessageType::MsgBeat, 0)) + .expect(""); + assert!(raft.prs().get(2).unwrap().paused); + + raft.mut_prs().get_mut(2).unwrap().become_replicate(); + raft.step(new_message(2, 1, MessageType::MsgHeartbeatResponse, 0)) + .expect(""); + assert!(!raft.prs().get(2).unwrap().paused); +} + +#[test] +fn test_progress_paused() { + let l = default_logger(); + let mut raft = new_test_raft(1, vec![1, 2], 5, 1, new_storage(), &l); + raft.become_candidate(); + raft.become_leader(); + let mut m = Message::default(); + m.from = 1; + m.to = 1; + m.set_msg_type(MessageType::MsgPropose); + let mut e = Entry::default(); + e.data = (b"some_data" as &'static [u8]).into(); + m.entries = vec![e].into(); + raft.step(m.clone()).expect(""); + raft.step(m.clone()).expect(""); + raft.step(m).expect(""); + let ms = read_messages(&mut raft); + assert_eq!(ms.len(), 1); +} + +#[test] +fn test_progress_flow_control() { + let l = default_logger(); + let mut cfg = new_test_config(1, 5, 1); + cfg.max_inflight_msgs = 3; + cfg.max_size_per_msg = 2048; + let s = MemStorage::new_with_conf_state((vec![1, 2], vec![])); + let mut r = new_test_raft_with_config(&cfg, s, &l); + r.become_candidate(); + r.become_leader(); + + // Throw away all the messages relating to the initial election. + r.read_messages(); + + // While node 2 is in probe state, propose a bunch of entries. 
+ r.mut_prs().get_mut(2).unwrap().become_probe(); + let data: String = "a".repeat(1000); + for _ in 0..10 { + let msg = new_message_with_entries( + 1, + 1, + MessageType::MsgPropose, + vec![new_entry(0, 0, Some(&data))], + ); + r.step(msg).unwrap(); + } + + let mut ms = r.read_messages(); + // First append has two entries: the empty entry to confirm the + // election, and the first proposal (only one proposal gets sent + // because we're in probe state). + assert_eq!(ms.len(), 1); + assert_eq!(ms[0].msg_type, MessageType::MsgAppend); + assert_eq!(ms[0].entries.len(), 2); + assert_eq!(ms[0].entries[0].data.len(), 0); + assert_eq!(ms[0].entries[1].data.len(), 1000); + + // When this append is acked, we change to replicate state and can + // send multiple messages at once. + let mut msg = new_message(2, 1, MessageType::MsgAppendResponse, 0); + msg.index = ms[0].entries[1].index; + r.step(msg).unwrap(); + ms = r.read_messages(); + assert_eq!(ms.len(), 3); + for (i, m) in ms.iter().enumerate() { + if m.msg_type != MessageType::MsgAppend { + panic!("{}: expected MsgAppend, got {:?}", i, m.msg_type); + } + if m.entries.len() != 2 { + panic!("{}: expected 2 entries, got {}", i, m.entries.len()); + } + } + + // Ack all three of those messages together and get the last two + // messages (containing three entries). 
+ let mut msg = new_message(2, 1, MessageType::MsgAppendResponse, 0); + msg.index = ms[2].entries[1].index; + r.step(msg).unwrap(); + ms = r.read_messages(); + assert_eq!(ms.len(), 2); + for (i, m) in ms.iter().enumerate() { + if m.msg_type != MessageType::MsgAppend { + panic!("{}: expected MsgAppend, got {:?}", i, m.msg_type); + } + } + assert_eq!(ms[0].entries.len(), 2); + assert_eq!(ms[1].entries.len(), 1); +} + +#[test] +fn test_leader_election() { + let l = default_logger(); + test_leader_election_with_config(false, &l); +} + +#[test] +fn test_leader_election_pre_vote() { + let l = default_logger(); + test_leader_election_with_config(true, &l); +} + +fn test_leader_election_with_config(pre_vote: bool, l: &Logger) { + let mut config = Network::default_config(); + config.pre_vote = pre_vote; + let mut tests = vec![ + ( + Network::new_with_config(vec![None, None, None], &config, l), + StateRole::Leader, + 1, + ), + ( + Network::new_with_config(vec![None, None, NOP_STEPPER], &config, l), + StateRole::Leader, + 1, + ), + ( + Network::new_with_config(vec![None, NOP_STEPPER, NOP_STEPPER], &config, l), + StateRole::Candidate, + 1, + ), + ( + Network::new_with_config(vec![None, NOP_STEPPER, NOP_STEPPER, None], &config, l), + StateRole::Candidate, + 1, + ), + ( + Network::new_with_config(vec![None, NOP_STEPPER, NOP_STEPPER, None, None], &config, l), + StateRole::Leader, + 1, + ), + // three logs further along than 0, but in the same term so rejection + // are returned instead of the votes being ignored. 
+ ( + Network::new_with_config( + vec![ + None, + Some(ents_with_config(&[1], pre_vote, 2, vec![1, 2, 3, 4, 5], l)), + Some(ents_with_config(&[1], pre_vote, 3, vec![1, 2, 3, 4, 5], l)), + Some(ents_with_config( + &[1, 1], + pre_vote, + 4, + vec![1, 2, 3, 4, 5], + l, + )), + None, + ], + &config, + l, + ), + StateRole::Follower, + 1, + ), + ]; + + for (i, &mut (ref mut network, state, term)) in tests.iter_mut().enumerate() { + let mut m = Message::default(); + m.from = 1; + m.to = 1; + m.set_msg_type(MessageType::MsgHup); + network.send(vec![m]); + let raft = &network.peers[&1]; + let (exp_state, exp_term) = if state == StateRole::Candidate && pre_vote { + // In pre-vote mode, an election that fails to complete + // leaves the node in pre-candidate state without advancing + // the term. + (StateRole::PreCandidate, 0) + } else { + (state, term) + }; + if raft.state != exp_state { + panic!("#{}: state = {:?}, want {:?}", i, raft.state, exp_state); + } + if raft.term != exp_term { + panic!("#{}: term = {}, want {}", i, raft.term, exp_term) + } + } +} + +#[test] +fn test_leader_cycle() { + let l = default_logger(); + test_leader_cycle_with_config(false, &l) +} + +#[test] +fn test_leader_cycle_pre_vote() { + let l = default_logger(); + test_leader_cycle_with_config(true, &l) +} + +// test_leader_cycle verifies that each node in a cluster can campaign +// and be elected in turn. 
This ensures that elections (including +// pre-vote) work when not starting from a clean state (as they do in +// test_leader_election) +fn test_leader_cycle_with_config(pre_vote: bool, l: &Logger) { + let mut config = Network::default_config(); + config.pre_vote = pre_vote; + let mut network = Network::new_with_config(vec![None, None, None], &config, l); + for campaigner_id in 1..4 { + network.send(vec![new_message( + campaigner_id, + campaigner_id, + MessageType::MsgHup, + 0, + )]); + + for sm in network.peers.values() { + if sm.id == campaigner_id && sm.state != StateRole::Leader { + panic!( + "pre_vote={}: campaigning node {} state = {:?}, want Leader", + pre_vote, sm.id, sm.state + ); + } else if sm.id != campaigner_id && sm.state != StateRole::Follower { + panic!( + "pre_vote={}: after campaign of node {}, node {} had state = {:?}, want \ + Follower", + pre_vote, campaigner_id, sm.id, sm.state + ); + } + } + } +} + +#[test] +fn test_leader_election_overwrite_newer_logs() { + let l = default_logger(); + test_leader_election_overwrite_newer_logs_with_config(false, &l); +} + +#[test] +fn test_leader_election_overwrite_newer_logs_pre_vote() { + let l = default_logger(); + test_leader_election_overwrite_newer_logs_with_config(true, &l); +} + +// test_leader_election_overwrite_newer_logs tests a scenario in which a +// newly-elected leader does *not* have the newest (i.e. highest term) +// log entries, and must overwrite higher-term log entries with +// lower-term ones. +fn test_leader_election_overwrite_newer_logs_with_config(pre_vote: bool, l: &Logger) { + // This network represents the results of the following sequence of + // events: + // - Node 1 won the election in term 1. + // - Node 1 replicated a log entry to node 2 but died before sending + // it to other nodes. + // - Node 3 won the second election in term 2. + // - Node 3 wrote an entry to its logs but died without sending it + // to any other nodes. 
+ // + // At this point, nodes 1, 2, and 3 all have uncommitted entries in + // their logs and could win an election at term 3. The winner's log + // entry overwrites the loser's. (test_leader_sync_follower_log tests + // the case where older log entries are overwritten, so this test + // focuses on the case where the newer entries are lost). + let peers = vec![1, 2, 3, 4, 5]; + let mut config = Network::default_config(); + config.pre_vote = pre_vote; + let mut network = Network::new_with_config( + vec![ + Some(ents_with_config(&[1], pre_vote, 1, peers.clone(), l)), // Node 1: Won first election + Some(ents_with_config(&[1], pre_vote, 2, peers.clone(), l)), // Node 2: Get logs from node 1 + Some(ents_with_config(&[2], pre_vote, 3, peers.clone(), l)), // Node 3: Won second election + Some(voted_with_config(3, 2, pre_vote, 4, peers.clone(), l)), // Node 4: Voted but didn't get logs + Some(voted_with_config(3, 2, pre_vote, 5, peers, l)), // Node 5: Voted but didn't get logs + ], + &config, + l, + ); + + // Node 1 campaigns. The election fails because a quorum of nodes + // know about the election that already happened at term 2. Node 1's + // term is pushed ahead to 2. + network.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + assert_eq!(network.peers[&1].state, StateRole::Follower); + assert_eq!(network.peers[&1].term, 2); + + // Node 1 campaigns again with a higher term. this time it succeeds. + network.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + assert_eq!(network.peers[&1].state, StateRole::Leader); + assert_eq!(network.peers[&1].term, 3); + + // Now all nodes agree on a log entry with term 1 at index 1 (and + // term 3 at index 2). 
+ for (id, sm) in &network.peers { + let entries = sm.raft_log.all_entries(); + assert_eq!( + entries.len(), + 2, + "node {}: entries.len() == {}, want 2", + id, + entries.len() + ); + assert_eq!( + entries[0].term, 1, + "node {}: term at index 1 == {}, want 1", + id, entries[0].term + ); + assert_eq!( + entries[1].term, 3, + "node {}: term at index 2 == {}, want 3", + id, entries[1].term + ); + } +} + +#[test] +fn test_vote_from_any_state() { + let l = default_logger(); + test_vote_from_any_state_for_type(MessageType::MsgRequestVote, &l); +} + +#[test] +fn test_prevote_from_any_state() { + let l = default_logger(); + test_vote_from_any_state_for_type(MessageType::MsgRequestPreVote, &l); +} + +fn test_vote_from_any_state_for_type(vt: MessageType, l: &Logger) { + let all_states = vec![ + StateRole::Follower, + StateRole::Candidate, + StateRole::PreCandidate, + StateRole::Leader, + ]; + for state in all_states { + let mut r = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), l); + r.term = 1; + match state { + StateRole::Follower => { + let term = r.term; + r.become_follower(term, 3); + } + StateRole::PreCandidate => r.become_pre_candidate(), + StateRole::Candidate => r.become_candidate(), + StateRole::Leader => { + r.become_candidate(); + r.become_leader(); + } + } + // Note that setting our state above may have advanced r.term + // past its initial value. 
+ let orig_term = r.term; + let new_term = r.term + 1; + + let mut msg = new_message(2, 1, vt, 0); + msg.term = new_term; + msg.log_term = new_term; + msg.index = 42; + r.step(msg) + .unwrap_or_else(|_| panic!("{:?},{:?}: step failed", vt, state)); + assert_eq!( + r.msgs.len(), + 1, + "{:?},{:?}: {} response messages, want 1: {:?}", + vt, + state, + r.msgs.len(), + r.msgs + ); + let resp = &r.msgs[0]; + assert_eq!( + resp.get_msg_type(), + vote_resp_msg_type(vt), + "{:?},{:?}: response message is {:?}, want {:?}", + vt, + state, + resp.get_msg_type(), + vote_resp_msg_type(vt) + ); + assert!(!resp.reject, "{:?},{:?}: unexpected rejection", vt, state); + + // If this was a real vote, we reset our state and term. + if vt == MessageType::MsgRequestVote { + assert_eq!( + r.state, + StateRole::Follower, + "{:?},{:?}, state {:?}, want {:?}", + vt, + state, + r.state, + StateRole::Follower + ); + assert_eq!( + r.term, new_term, + "{:?},{:?}, term {}, want {}", + vt, state, r.term, new_term + ); + assert_eq!(r.vote, 2, "{:?},{:?}, vote {}, want 2", vt, state, r.vote); + } else { + // In a pre-vote, nothing changes. + assert_eq!( + r.state, state, + "{:?},{:?}, state {:?}, want {:?}", + vt, state, r.state, state + ); + assert_eq!( + r.term, orig_term, + "{:?},{:?}, term {}, want {}", + vt, state, r.term, orig_term + ); + // If state == Follower or PreCandidate, r hasn't voted yet. + // In Candidate or Leader, it's voted for itself. 
+ assert!( + r.vote == INVALID_ID || r.vote == 1, + "{:?},{:?}, vote {}, want {:?} or 1", + vt, + state, + r.vote, + INVALID_ID + ); + } + } +} + +#[test] +fn test_log_replication() { + let l = default_logger(); + let mut tests = vec![ + ( + Network::new(vec![None, None, None], &l), + vec![new_message(1, 1, MessageType::MsgPropose, 1)], + 2, + ), + ( + Network::new(vec![None, None, None], &l), + vec![ + new_message(1, 1, MessageType::MsgPropose, 1), + new_message(1, 2, MessageType::MsgHup, 0), + new_message(1, 2, MessageType::MsgPropose, 1), + ], + 4, + ), + ]; + + for (i, &mut (ref mut network, ref msgs, wcommitted)) in tests.iter_mut().enumerate() { + network.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + for m in msgs { + network.send(vec![m.clone()]); + } + + for (j, x) in &mut network.peers { + if x.raft_log.committed != wcommitted { + panic!( + "#{}.{}: committed = {}, want {}", + i, j, x.raft_log.committed, wcommitted + ); + } + + let mut ents = next_ents(x, &network.storage[j]); + let ents: Vec = ents.drain(..).filter(|e| !e.data.is_empty()).collect(); + for (k, m) in msgs + .iter() + .filter(|m| m.get_msg_type() == MessageType::MsgPropose) + .enumerate() + { + if ents[k].data != m.entries[0].data { + panic!( + "#{}.{}: data = {:?}, want {:?}", + i, j, ents[k].data, m.entries[0].data + ); + } + } + } + } +} + +#[test] +fn test_single_node_commit() { + let l = default_logger(); + let mut tt = Network::new(vec![None], &l); + tt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + tt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + tt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + assert_eq!(tt.peers[&1].raft_log.committed, 3); +} + +// test_cannot_commit_without_new_term_entry tests the entries cannot be committed +// when leader changes, no new proposal comes in and ChangeTerm proposal is +// filtered. 
+#[test] +fn test_cannot_commit_without_new_term_entry() { + let l = default_logger(); + let mut tt = Network::new(vec![None, None, None, None, None], &l); + tt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // 0 cannot reach 2, 3, 4 + tt.cut(1, 3); + tt.cut(1, 4); + tt.cut(1, 5); + + tt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + tt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + + assert_eq!(tt.peers[&1].raft_log.committed, 1); + + // network recovery + tt.recover(); + // avoid committing ChangeTerm proposal + tt.ignore(MessageType::MsgAppend); + + // elect 2 as the new leader with term 2 + tt.send(vec![new_message(2, 2, MessageType::MsgHup, 0)]); + + // no log entries from previous term should be committed + assert_eq!(tt.peers[&2].raft_log.committed, 1); + + tt.recover(); + // send heartbeat; reset wait + tt.send(vec![new_message(2, 2, MessageType::MsgBeat, 0)]); + // append an entry at current term + tt.send(vec![new_message(2, 2, MessageType::MsgPropose, 1)]); + // expect the committed to be advanced + assert_eq!(tt.peers[&2].raft_log.committed, 5); +} + +// test_commit_without_new_term_entry tests the entries could be committed +// when leader changes, no new proposal comes in. 
+#[test] +fn test_commit_without_new_term_entry() { + let l = default_logger(); + let mut tt = Network::new(vec![None, None, None, None, None], &l); + tt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // 0 cannot reach 2, 3, 4 + tt.cut(1, 3); + tt.cut(1, 4); + tt.cut(1, 5); + + tt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + tt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + + assert_eq!(tt.peers[&1].raft_log.committed, 1); + + // network recovery + tt.recover(); + + // elect 1 as the new leader with term 2 + // after appending a ChangeTerm entry from the current term, all entries + // should be committed + tt.send(vec![new_message(2, 2, MessageType::MsgHup, 0)]); + + assert_eq!(tt.peers[&1].raft_log.committed, 4); +} + +#[test] +fn test_dueling_candidates() { + let l = default_logger(); + let a = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + let b = new_test_raft(2, vec![1, 2, 3], 10, 1, new_storage(), &l); + let c = new_test_raft(3, vec![1, 2, 3], 10, 1, new_storage(), &l); + + let mut nt = Network::new(vec![Some(a), Some(b), Some(c)], &l); + nt.cut(1, 3); + + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + + // 1 becomes leader since it receives votes from 1 and 2 + assert_eq!(nt.peers[&1].state, StateRole::Leader); + + // 3 stays as candidate since it receives a vote from 3 and a rejection from 2 + assert_eq!(nt.peers[&3].state, StateRole::Candidate); + + nt.recover(); + + // Candidate 3 now increases its term and tries to vote again, we expect it to + // disrupt the leader 1 since it has a higher term, 3 will be follower again + // since both 1 and 2 reject its vote request since 3 does not have a long + // enough log. + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + + let tests = [ + // role, term, committed, applied, last index. 
+ (StateRole::Follower, 2, (1, 0, 1)), + (StateRole::Follower, 2, (1, 0, 1)), + (StateRole::Follower, 2, (0, 0, 0)), + ]; + + for (i, &(state, term, raft_log)) in tests.iter().enumerate() { + let id = i as u64 + 1; + if nt.peers[&id].state != state { + panic!( + "#{}: state = {:?}, want {:?}", + i, nt.peers[&id].state, state + ); + } + if nt.peers[&id].term != term { + panic!("#{}: term = {}, want {}", i, nt.peers[&id].term, term); + } + + let prefix = format!("#{}: ", i); + assert_raft_log(&prefix, &nt.peers[&id].raft_log, raft_log); + } +} + +#[test] +fn test_dueling_pre_candidates() { + let l = default_logger(); + let a = new_test_raft_with_prevote(1, vec![1, 2, 3], 10, 1, new_storage(), true, &l); + let b = new_test_raft_with_prevote(2, vec![1, 2, 3], 10, 1, new_storage(), true, &l); + let c = new_test_raft_with_prevote(3, vec![1, 2, 3], 10, 1, new_storage(), true, &l); + + let mut config = Network::default_config(); + config.pre_vote = true; + let mut nt = Network::new_with_config(vec![Some(a), Some(b), Some(c)], &config, &l); + nt.cut(1, 3); + + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + + // 1 becomes leader since it receives votes from 1 and 2 + assert_eq!(nt.peers[&1].state, StateRole::Leader); + + // 3 campaigns then reverts to follower when its pre_vote is rejected + assert_eq!(nt.peers[&3].state, StateRole::Follower); + + nt.recover(); + + // Candidate 3 now increases its term and tries to vote again. + // With pre-vote, it does not disrupt the leader. + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + + let tests = [ + // role, term, committed, applied, last index. 
+ (1, StateRole::Leader, 1, (1, 0, 1)), + (2, StateRole::Follower, 1, (1, 0, 1)), + (3, StateRole::Follower, 1, (0, 0, 0)), + ]; + for (i, &(id, state, term, raft_log)) in tests.iter().enumerate() { + if nt.peers[&id].state != state { + panic!( + "#{}: state = {:?}, want {:?}", + i, nt.peers[&id].state, state + ); + } + if nt.peers[&id].term != term { + panic!("#{}: term = {}, want {}", i, nt.peers[&id].term, term); + } + let prefix = format!("#{}: ", i); + assert_raft_log(&prefix, &nt.peers[&id].raft_log, raft_log); + } +} + +#[test] +fn test_candidate_concede() { + let l = default_logger(); + let mut tt = Network::new(vec![None, None, None], &l); + tt.isolate(1); + + tt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + tt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + + // heal the partition + tt.recover(); + // send heartbeat; reset wait + tt.send(vec![new_message(3, 3, MessageType::MsgBeat, 0)]); + + // send a proposal to 3 to flush out a MsgAppend to 1 + let data = "force follower"; + let mut m = new_message(3, 3, MessageType::MsgPropose, 0); + m.entries = vec![new_entry(0, 0, Some(data))].into(); + tt.send(vec![m]); + // send heartbeat; flush out commit + tt.send(vec![new_message(3, 3, MessageType::MsgBeat, 0)]); + + assert_eq!(tt.peers[&1].state, StateRole::Follower); + assert_eq!(tt.peers[&1].term, 1); + + for p in tt.peers.values() { + assert_eq!(p.raft_log.committed, 2); // All raft logs are committed. + assert_eq!(p.raft_log.applied, 0); // Raft logs are based on a snapshot with index 1. 
+ assert_eq!(p.raft_log.last_index(), 2); + } +} + +#[test] +fn test_single_node_candidate() { + let l = default_logger(); + let mut tt = Network::new(vec![None], &l); + tt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + assert_eq!(tt.peers[&1].state, StateRole::Leader); +} + +#[test] +fn test_sinle_node_pre_candidate() { + let l = default_logger(); + let mut config = Network::default_config(); + config.pre_vote = true; + let mut tt = Network::new_with_config(vec![None], &config, &l); + tt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + assert_eq!(tt.peers[&1].state, StateRole::Leader); +} + +#[test] +fn test_old_messages() { + let l = default_logger(); + let mut tt = Network::new(vec![None, None, None], &l); + // make 0 leader @ term 3 + tt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + tt.send(vec![new_message(2, 2, MessageType::MsgHup, 0)]); + tt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + // pretend we're an old leader trying to make progress; this entry is expected to be ignored. + let mut m = new_message(2, 1, MessageType::MsgAppend, 0); + m.term = 2; + m.entries = vec![empty_entry(2, 3)].into(); + tt.send(vec![m]); + // commit a new entry + tt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + + for p in tt.peers.values() { + let raft = p.raft.as_ref().unwrap(); + assert_eq!(raft.raft_log.committed, 4); + assert_eq!(raft.raft_log.applied, 0); + assert_eq!(raft.raft_log.last_index(), 4); + } +} + +// test_old_messages_reply - optimization - reply with new term. 
+ +#[test] +fn test_proposal() { + let l = default_logger(); + let mut tests = vec![ + (Network::new(vec![None, None, None], &l), true), + (Network::new(vec![None, None, NOP_STEPPER], &l), true), + ( + Network::new(vec![None, NOP_STEPPER, NOP_STEPPER], &l), + false, + ), + ( + Network::new(vec![None, NOP_STEPPER, NOP_STEPPER, None], &l), + false, + ), + ( + Network::new(vec![None, NOP_STEPPER, NOP_STEPPER, None, None], &l), + true, + ), + ]; + + for (j, (mut nw, success)) in tests.drain(..).enumerate() { + let send = |nw: &mut Network, m| { + let res = panic::catch_unwind(AssertUnwindSafe(|| nw.send(vec![m]))); + assert!(res.is_ok() || !success); + }; + + // promote 0 the leader + send(&mut nw, new_message(1, 1, MessageType::MsgHup, 0)); + send(&mut nw, new_message(1, 1, MessageType::MsgPropose, 1)); + + // committed index, applied index and last index. + let want_log = if success { (2, 0, 2) } else { (0, 0, 0) }; + + for p in nw.peers.values() { + if let Some(ref raft) = p.raft { + let prefix = format!("#{}: ", j); + assert_raft_log(&prefix, &raft.raft_log, want_log); + } + } + if nw.peers[&1].term != 1 { + panic!("#{}: term = {}, want: {}", j, nw.peers[&1].term, 1); + } + } +} + +#[test] +fn test_proposal_by_proxy() { + let l = default_logger(); + let mut tests = vec![ + Network::new(vec![None, None, None], &l), + Network::new(vec![None, None, NOP_STEPPER], &l), + ]; + for (j, tt) in tests.iter_mut().enumerate() { + // promote 0 the leader + tt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // propose via follower + tt.send(vec![new_message(2, 2, MessageType::MsgPropose, 1)]); + + for p in tt.peers.values() { + if p.raft.is_none() { + continue; + } + if let Some(ref raft) = p.raft { + let prefix = format!("#{}: ", j); + assert_raft_log(&prefix, &raft.raft_log, (2, 0, 2)); + } + } + if tt.peers[&1].term != 1 { + panic!("#{}: term = {}, want {}", j, tt.peers[&1].term, 1); + } + } +} + +#[test] +fn test_commit() { + let l = default_logger(); + let mut 
tests = vec![ + // single + (vec![1], vec![empty_entry(1, 1)], 1, 1), + (vec![1], vec![empty_entry(1, 1)], 2, 0), + (vec![2], vec![empty_entry(1, 1), empty_entry(2, 2)], 2, 2), + (vec![1], vec![empty_entry(2, 1)], 2, 1), + // odd + ( + vec![2, 1, 1], + vec![empty_entry(1, 1), empty_entry(2, 2)], + 1, + 1, + ), + ( + vec![2, 1, 1], + vec![empty_entry(1, 1), empty_entry(1, 2)], + 2, + 0, + ), + ( + vec![2, 1, 2], + vec![empty_entry(1, 1), empty_entry(2, 2)], + 2, + 2, + ), + ( + vec![2, 1, 2], + vec![empty_entry(1, 1), empty_entry(1, 2)], + 2, + 0, + ), + // even + ( + vec![2, 1, 1, 1], + vec![empty_entry(1, 1), empty_entry(2, 2)], + 1, + 1, + ), + ( + vec![2, 1, 1, 1], + vec![empty_entry(1, 1), empty_entry(1, 2)], + 2, + 0, + ), + ( + vec![2, 1, 1, 2], + vec![empty_entry(1, 1), empty_entry(2, 2)], + 1, + 1, + ), + ( + vec![2, 1, 1, 2], + vec![empty_entry(1, 1), empty_entry(1, 2)], + 2, + 0, + ), + ( + vec![2, 1, 2, 2], + vec![empty_entry(1, 1), empty_entry(2, 2)], + 2, + 2, + ), + ( + vec![2, 1, 2, 2], + vec![empty_entry(1, 1), empty_entry(1, 2)], + 2, + 0, + ), + ]; + + for (i, (matches, logs, sm_term, w)) in tests.drain(..).enumerate() { + let store = MemStorage::new_with_conf_state((vec![1], vec![])); + store.wl().append(&logs).unwrap(); + let mut hs = HardState::default(); + hs.term = sm_term; + store.wl().set_hardstate(hs); + let cfg = new_test_config(1, 5, 1); + let mut sm = new_test_raft_with_config(&cfg, store, &l); + + for (j, v) in matches.iter().enumerate() { + let id = j as u64 + 1; + if sm.mut_prs().get(id).is_none() { + sm.apply_conf_change(&add_node(id)).unwrap(); + let pr = sm.mut_prs().get_mut(id).unwrap(); + pr.matched = *v; + pr.next_idx = *v + 1; + } + } + sm.maybe_commit(); + if sm.raft_log.committed != w { + panic!("#{}: committed = {}, want {}", i, sm.raft_log.committed, w); + } + } +} + +#[test] +fn test_pass_election_timeout() { + let l = default_logger(); + let tests = [ + (5, 0f64, false), + (10, 0.1, true), + (13, 0.4, true), + (15, 0.6, 
true), + (18, 0.9, true), + (20, 1.0, false), + ]; + + for (i, &(elapse, wprobability, round)) in tests.iter().enumerate() { + let mut sm = new_test_raft(1, vec![1], 10, 1, new_storage(), &l); + sm.election_elapsed = elapse; + let mut c = 0; + for _ in 0..10_000 { + sm.reset_randomized_election_timeout(); + if sm.pass_election_timeout() { + c += 1; + } + } + let mut got = f64::from(c) / 10000.0; + if round { + got = (got * 10.0 + 0.5).floor() / 10.0; + } + if (got - wprobability).abs() > 0.000_001 { + panic!("#{}: probability = {}, want {}", i, got, wprobability); + } + } +} + +// test_handle_msg_append ensures: +// 1. Reply false if log doesn’t contain an entry at prevLogIndex whose term matches prevLogTerm. +// 2. If an existing entry conflicts with a new one (same index but different terms), +// delete the existing entry and all that follow it; append any new entries not already in the +// log. +// 3. If leaderCommit > commitIndex, set commitIndex = min(leaderCommit, index of last new entry). 
+#[test] +fn test_handle_msg_append() { + let l = default_logger(); + let nm = |term, log_term, index, commit, ents: Option>| { + let mut m = Message::default(); + m.set_msg_type(MessageType::MsgAppend); + m.term = term; + m.log_term = log_term; + m.index = index; + m.commit = commit; + if let Some(ets) = ents { + m.entries = ets.iter().map(|&(i, t)| empty_entry(t, i)).collect(); + } + m + }; + let mut tests = vec![ + // Ensure 1 + (nm(2, 3, 2, 3, None), 2, 0, true), // previous log mismatch + (nm(2, 3, 3, 3, None), 2, 0, true), // previous log non-exist + // Ensure 2 + (nm(2, 1, 1, 1, None), 2, 1, false), + (nm(2, 0, 0, 1, Some(vec![(1, 2)])), 1, 1, false), + (nm(2, 2, 2, 3, Some(vec![(3, 2), (4, 2)])), 4, 3, false), + (nm(2, 2, 2, 4, Some(vec![(3, 2)])), 3, 3, false), + (nm(2, 1, 1, 4, Some(vec![(2, 2)])), 2, 2, false), + // Ensure 3 + (nm(1, 1, 1, 3, None), 2, 1, false), // match entry 1, commit up to last new entry 1 + (nm(1, 1, 1, 3, Some(vec![(2, 2)])), 2, 2, false), // match entry 1, commit up to last new + // entry 2 + (nm(2, 2, 2, 3, None), 2, 2, false), // match entry 2, commit up to last new entry 2 + (nm(2, 2, 2, 4, None), 2, 2, false), // commit up to log.last() + ]; + + for (j, (m, w_index, w_commit, w_reject)) in tests.drain(..).enumerate() { + let mut sm = new_test_raft_with_logs( + 1, + vec![1], + 10, + 1, + MemStorage::new(), + &[empty_entry(1, 1), empty_entry(2, 2)], + &l, + ); + + sm.become_follower(2, INVALID_ID); + sm.handle_append_entries(&m); + if sm.raft_log.last_index() != w_index { + panic!( + "#{}: last_index = {}, want {}", + j, + sm.raft_log.last_index(), + w_index + ); + } + if sm.raft_log.committed != w_commit { + panic!( + "#{}: committed = {}, want {}", + j, sm.raft_log.committed, w_commit + ); + } + let m = sm.read_messages(); + if m.len() != 1 { + panic!("#{}: msg count = {}, want 1", j, m.len()); + } + if m[0].reject != w_reject { + panic!("#{}: reject = {}, want {}", j, m[0].reject, w_reject); + } + } +} + +// 
test_handle_heartbeat ensures that the follower commits to the commit in the message. +#[test] +fn test_handle_heartbeat() { + let l = default_logger(); + let commit = 2u64; + let nw = |f, to, term, commit| { + let mut m = new_message(f, to, MessageType::MsgHeartbeat, 0); + m.term = term; + m.commit = commit; + m + }; + let mut tests = vec![ + (nw(2, 1, 2, commit + 1), commit + 1), + (nw(2, 1, 2, commit - 1), commit), // do not decrease commit + ]; + for (i, (m, w_commit)) in tests.drain(..).enumerate() { + let store = MemStorage::new_with_conf_state((vec![1, 2], vec![])); + store + .wl() + .append(&[empty_entry(1, 1), empty_entry(2, 2), empty_entry(3, 3)]) + .unwrap(); + let cfg = new_test_config(1, 5, 1); + let mut sm = new_test_raft_with_config(&cfg, store, &l); + sm.become_follower(2, 2); + sm.raft_log.commit_to(commit); + sm.handle_heartbeat(m); + if sm.raft_log.committed != w_commit { + panic!( + "#{}: committed = {}, want = {}", + i, sm.raft_log.committed, w_commit + ); + } + let m = sm.read_messages(); + if m.len() != 1 { + panic!("#{}: msg count = {}, want 1", i, m.len()); + } + if m[0].get_msg_type() != MessageType::MsgHeartbeatResponse { + panic!( + "#{}: type = {:?}, want MsgHeartbeatResponse", + i, + m[0].get_msg_type() + ); + } + } +} + +// test_handle_heartbeat_resp ensures that we re-send log entries when we get a heartbeat response. 
+#[test] +fn test_handle_heartbeat_resp() { + let l = default_logger(); + let store = new_storage(); + store + .wl() + .append(&[empty_entry(1, 1), empty_entry(2, 2), empty_entry(3, 3)]) + .unwrap(); + let mut sm = new_test_raft(1, vec![1, 2], 5, 1, store, &l); + sm.become_candidate(); + sm.become_leader(); + let last_index = sm.raft_log.last_index(); + sm.raft_log.commit_to(last_index); + + // A heartbeat response from a node that is behind; re-send MsgApp + sm.step(new_message(2, 0, MessageType::MsgHeartbeatResponse, 0)) + .expect(""); + let mut msgs = sm.read_messages(); + assert_eq!(msgs.len(), 1); + assert_eq!(msgs[0].get_msg_type(), MessageType::MsgAppend); + + // A second heartbeat response generates another MsgApp re-send + sm.step(new_message(2, 0, MessageType::MsgHeartbeatResponse, 0)) + .expect(""); + msgs = sm.read_messages(); + assert_eq!(msgs.len(), 1); + assert_eq!(msgs[0].get_msg_type(), MessageType::MsgAppend); + + // Once we have an MsgAppResp, heartbeats no longer send MsgApp. + let mut m = new_message(2, 0, MessageType::MsgAppendResponse, 0); + m.index = msgs[0].index + msgs[0].entries.len() as u64; + sm.step(m).expect(""); + // Consume the message sent in response to MsgAppResp + sm.read_messages(); + + sm.step(new_message(2, 0, MessageType::MsgHeartbeatResponse, 0)) + .expect(""); + msgs = sm.read_messages(); + assert!(msgs.is_empty()); +} + +// test_raft_frees_read_only_mem ensures raft will free read request from +// ReadOnly read_index_queue and pending_read_index map. +// related issue: https://github.com/coreos/etcd/issues/7571 +#[test] +fn test_raft_frees_read_only_mem() { + let l = default_logger(); + let mut sm = new_test_raft(1, vec![1, 2], 5, 1, new_storage(), &l); + sm.become_candidate(); + sm.become_leader(); + let last_index = sm.raft_log.last_index(); + sm.raft_log.commit_to(last_index); + + let ctx = "ctx"; + let vec_ctx = ctx.as_bytes().to_vec(); + + // leader starts linearizable read request. 
+ // more info: raft dissertation 6.4, step 2. + let m = new_message_with_entries( + 2, + 1, + MessageType::MsgReadIndex, + vec![new_entry(0, 0, Some(ctx))], + ); + sm.step(m).expect(""); + let msgs = sm.read_messages(); + assert_eq!(msgs.len(), 1); + assert_eq!(msgs[0].get_msg_type(), MessageType::MsgHeartbeat); + assert_eq!(msgs[0].context, &vec_ctx[..]); + assert_eq!(sm.read_only.read_index_queue.len(), 1); + assert_eq!(sm.read_only.pending_read_index.len(), 1); + assert!(sm.read_only.pending_read_index.contains_key(&vec_ctx)); + + // heartbeat responses from majority of followers (1 in this case) + // acknowledge the authority of the leader. + // more info: raft dissertation 6.4, step 3. + let mut m = new_message(2, 1, MessageType::MsgHeartbeatResponse, 0); + m.context = vec_ctx.clone().into(); + sm.step(m).expect(""); + assert_eq!(sm.read_only.read_index_queue.len(), 0); + assert_eq!(sm.read_only.pending_read_index.len(), 0); + assert!(!sm.read_only.pending_read_index.contains_key(&vec_ctx)); +} + +// test_msg_append_response_wait_reset verifies the waitReset behavior of a leader +// MsgAppResp. +#[test] +fn test_msg_append_response_wait_reset() { + let l = default_logger(); + let mut sm = new_test_raft(1, vec![1, 2, 3], 5, 1, new_storage(), &l); + sm.become_candidate(); + sm.become_leader(); + // For no-op entry + sm.persist(); + // The new leader has just emitted a new Term 4 entry; consume those messages + // from the outgoing queue. + sm.bcast_append(); + sm.read_messages(); + + // Node 2 acks the first entry, making it committed. + let mut m = new_message(2, 0, MessageType::MsgAppendResponse, 0); + m.index = 1; + sm.step(m).expect(""); + assert_eq!(sm.raft_log.committed, 1); + // Also consume the MsgApp messages that update Commit on the followers. + sm.read_messages(); + + // A new command is now proposed on node 1. 
+ m = new_message(1, 0, MessageType::MsgPropose, 0); + m.entries = vec![empty_entry(0, 0)].into(); + sm.step(m).expect(""); + sm.persist(); + + // The command is broadcast to all nodes not in the wait state. + // Node 2 left the wait state due to its MsgAppResp, but node 3 is still waiting. + let mut msgs = sm.read_messages(); + assert_eq!(msgs.len(), 1); + assert_eq!(msgs[0].get_msg_type(), MessageType::MsgAppend); + assert_eq!(msgs[0].to, 2); + assert_eq!(msgs[0].entries.len(), 1); + assert_eq!(msgs[0].entries[0].index, 2); + + // Now Node 3 acks the first entry. This releases the wait and entry 2 is sent. + m = new_message(3, 0, MessageType::MsgAppendResponse, 0); + m.index = 1; + sm.step(m).expect(""); + msgs = sm.read_messages(); + assert_eq!(msgs.len(), 1); + assert_eq!(msgs[0].get_msg_type(), MessageType::MsgAppend); + assert_eq!(msgs[0].to, 3); + assert_eq!(msgs[0].entries.len(), 1); + assert_eq!(msgs[0].entries[0].index, 2); +} + +#[test] +fn test_recv_msg_request_vote() { + let l = default_logger(); + test_recv_msg_request_vote_for_type(MessageType::MsgRequestVote, &l); +} + +fn test_recv_msg_request_vote_for_type(msg_type: MessageType, l: &Logger) { + let mut tests = vec![ + (StateRole::Follower, 0, 0, INVALID_ID, true), + (StateRole::Follower, 0, 1, INVALID_ID, true), + (StateRole::Follower, 0, 2, INVALID_ID, true), + (StateRole::Follower, 0, 3, INVALID_ID, false), + (StateRole::Follower, 1, 0, INVALID_ID, true), + (StateRole::Follower, 1, 1, INVALID_ID, true), + (StateRole::Follower, 1, 2, INVALID_ID, true), + (StateRole::Follower, 1, 3, INVALID_ID, false), + (StateRole::Follower, 2, 0, INVALID_ID, true), + (StateRole::Follower, 2, 1, INVALID_ID, true), + (StateRole::Follower, 2, 2, INVALID_ID, false), + (StateRole::Follower, 2, 3, INVALID_ID, false), + (StateRole::Follower, 3, 0, INVALID_ID, true), + (StateRole::Follower, 3, 1, INVALID_ID, true), + (StateRole::Follower, 3, 2, INVALID_ID, false), + (StateRole::Follower, 3, 3, INVALID_ID, false), + 
(StateRole::Follower, 3, 2, 2, false), + (StateRole::Follower, 3, 2, 1, true), + (StateRole::Leader, 3, 3, 1, true), + (StateRole::PreCandidate, 3, 3, 1, true), + (StateRole::Candidate, 3, 3, 1, true), + ]; + + for (j, (state, index, log_term, vote_for, w_reject)) in tests.drain(..).enumerate() { + let store = MemStorage::new_with_conf_state((vec![1], vec![])); + let ents = &[empty_entry(2, 1), empty_entry(2, 2)]; + store.wl().append(ents).unwrap(); + let mut sm = new_test_raft(1, vec![1], 10, 1, store, l); + sm.state = state; + sm.vote = vote_for; + + let mut m = new_message(2, 0, msg_type, 0); + m.index = index; + m.log_term = log_term; + // raft.Term is greater than or equal to raft.raftLog.lastTerm. In this + // test we're only testing MsgVote responses when the campaigning node + // has a different raft log compared to the recipient node. + // Additionally we're verifying behaviour when the recipient node has + // already given out its vote for its current term. We're not testing + // what the recipient node does when receiving a message with a + // different term number, so we simply initialize both term numbers to + // be the same. 
+ let term = cmp::max(sm.raft_log.last_term(), log_term); + m.term = term; + sm.term = term; + sm.step(m).expect(""); + + let msgs = sm.read_messages(); + if msgs.len() != 1 { + panic!("#{}: msgs count = {}, want 1", j, msgs.len()); + } + if msgs[0].get_msg_type() != vote_resp_msg_type(msg_type) { + panic!( + "#{}: m.type = {:?}, want {:?}", + j, + msgs[0].get_msg_type(), + vote_resp_msg_type(msg_type) + ); + } + if msgs[0].reject != w_reject { + panic!( + "#{}: m.get_reject = {}, want {}", + j, msgs[0].reject, w_reject + ); + } + } +} + +#[test] +fn test_state_transition() { + let l = default_logger(); + let mut tests = vec![ + ( + StateRole::Follower, + StateRole::Follower, + true, + 1, + INVALID_ID, + ), + ( + StateRole::Follower, + StateRole::PreCandidate, + true, + 0, + INVALID_ID, + ), + ( + StateRole::Follower, + StateRole::Candidate, + true, + 1, + INVALID_ID, + ), + (StateRole::Follower, StateRole::Leader, false, 0, INVALID_ID), + ( + StateRole::PreCandidate, + StateRole::Follower, + true, + 0, + INVALID_ID, + ), + ( + StateRole::PreCandidate, + StateRole::PreCandidate, + true, + 0, + INVALID_ID, + ), + ( + StateRole::PreCandidate, + StateRole::Candidate, + true, + 1, + INVALID_ID, + ), + (StateRole::PreCandidate, StateRole::Leader, true, 0, 1), + ( + StateRole::Candidate, + StateRole::Follower, + true, + 0, + INVALID_ID, + ), + ( + StateRole::Candidate, + StateRole::PreCandidate, + true, + 0, + INVALID_ID, + ), + ( + StateRole::Candidate, + StateRole::Candidate, + true, + 1, + INVALID_ID, + ), + (StateRole::Candidate, StateRole::Leader, true, 0, 1), + (StateRole::Leader, StateRole::Follower, true, 1, INVALID_ID), + ( + StateRole::Leader, + StateRole::PreCandidate, + false, + 0, + INVALID_ID, + ), + ( + StateRole::Leader, + StateRole::Candidate, + false, + 1, + INVALID_ID, + ), + (StateRole::Leader, StateRole::Leader, true, 0, 1), + ]; + for (i, (from, to, wallow, wterm, wlead)) in tests.drain(..).enumerate() { + let sm: &mut Raft = &mut new_test_raft(1, 
vec![1], 10, 1, new_storage(), &l); + sm.state = from; + + let res = panic::catch_unwind(AssertUnwindSafe(|| match to { + StateRole::Follower => sm.become_follower(wterm, wlead), + StateRole::PreCandidate => sm.become_pre_candidate(), + StateRole::Candidate => sm.become_candidate(), + StateRole::Leader => sm.become_leader(), + })); + if res.is_ok() ^ wallow { + panic!("#{}: allow = {}, want {}", i, res.is_ok(), wallow); + } + if res.is_err() { + continue; + } + + if sm.term != wterm { + panic!("#{}: term = {}, want {}", i, sm.term, wterm); + } + if sm.leader_id != wlead { + panic!("#{}: lead = {}, want {}", i, sm.leader_id, wlead); + } + } +} + +#[test] +fn test_all_server_stepdown() { + let l = default_logger(); + let mut tests = vec![ + // state, want_state, term, last_index, entry count. + (StateRole::Follower, StateRole::Follower, 3, 0, 0), + (StateRole::PreCandidate, StateRole::Follower, 3, 0, 0), + (StateRole::Candidate, StateRole::Follower, 3, 0, 0), + (StateRole::Leader, StateRole::Follower, 3, 1, 1), + ]; + + let tmsg_types = [MessageType::MsgRequestVote, MessageType::MsgAppend]; + let tterm = 3u64; + + for (i, (state, wstate, wterm, windex, entries)) in tests.drain(..).enumerate() { + let mut sm = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + match state { + StateRole::Follower => sm.become_follower(1, INVALID_ID), + StateRole::PreCandidate => sm.become_pre_candidate(), + StateRole::Candidate => sm.become_candidate(), + StateRole::Leader => { + sm.become_candidate(); + sm.become_leader(); + } + } + + for (j, &msg_type) in tmsg_types.iter().enumerate() { + let mut m = new_message(2, 0, msg_type, 0); + m.term = tterm; + m.log_term = tterm; + sm.step(m).expect(""); + + if sm.state != wstate { + panic!("{}.{} state = {:?}, want {:?}", i, j, sm.state, wstate); + } + if sm.term != wterm { + panic!("{}.{} term = {}, want {}", i, j, sm.term, wterm); + } + if sm.raft_log.last_index() != windex { + panic!( + "{}.{} index = {}, want {}", + i, + j, + 
sm.raft_log.last_index(), + windex + ); + } + let entry_count = sm.raft_log.all_entries().len() as u64; + if entry_count != entries { + panic!("{}.{} ents count = {}, want {}", i, j, entry_count, entries); + } + let wlead = if msg_type == MessageType::MsgRequestVote { + INVALID_ID + } else { + 2 + }; + if sm.leader_id != wlead { + panic!("{}, sm.lead = {}, want {}", i, sm.leader_id, INVALID_ID); + } + } + } +} + +#[test] +fn test_candidate_reset_term_msg_heartbeat() { + let l = default_logger(); + test_candidate_reset_term(MessageType::MsgHeartbeat, &l) +} + +#[test] +fn test_candidate_reset_term_msg_append() { + let l = default_logger(); + test_candidate_reset_term(MessageType::MsgAppend, &l) +} + +// test_candidate_reset_term tests when a candidate receives a +// MsgHeartbeat or MsgAppend from leader, "step" resets the term +// with leader's and reverts back to follower. +fn test_candidate_reset_term(message_type: MessageType, l: &Logger) { + let a = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), l); + let b = new_test_raft(2, vec![1, 2, 3], 10, 1, new_storage(), l); + let c = new_test_raft(3, vec![1, 2, 3], 10, 1, new_storage(), l); + + let mut nt = Network::new(vec![Some(a), Some(b), Some(c)], l); + + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + assert_eq!(nt.peers[&1].state, StateRole::Leader); + assert_eq!(nt.peers[&2].state, StateRole::Follower); + assert_eq!(nt.peers[&3].state, StateRole::Follower); + + // isolate 3 and increase term in rest + nt.isolate(3); + nt.send(vec![new_message(2, 2, MessageType::MsgHup, 0)]); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + assert_eq!(nt.peers[&1].state, StateRole::Leader); + assert_eq!(nt.peers[&2].state, StateRole::Follower); + assert_eq!(nt.peers[&3].state, StateRole::Follower); + + // trigger campaign in isolated c + nt.peers + .get_mut(&3) + .unwrap() + .reset_randomized_election_timeout(); + let timeout = nt.peers[&3].randomized_election_timeout(); + for _ in 0..timeout { 
+ nt.peers.get_mut(&3).unwrap().tick(); + } + + assert_eq!(nt.peers[&3].state, StateRole::Candidate); + + nt.recover(); + + // leader sends to isolated candidate + // and expects candidate to revert to follower + let mut msg = new_message(1, 3, message_type, 0); + msg.term = nt.peers[&1].term; + nt.send(vec![msg]); + + assert_eq!(nt.peers[&3].state, StateRole::Follower); + + // follower c term is reset with leader's + assert_eq!( + nt.peers[&3].term, nt.peers[&1].term, + "follower term expected same term as leader's {}, got {}", + nt.peers[&1].term, nt.peers[&3].term, + ) +} + +#[test] +fn test_leader_stepdown_when_quorum_active() { + let l = default_logger(); + let mut sm = new_test_raft(1, vec![1, 2, 3], 5, 1, new_storage(), &l); + sm.check_quorum = true; + sm.become_candidate(); + sm.become_leader(); + + for _ in 0..=sm.election_timeout() { + let mut m = new_message(2, 0, MessageType::MsgHeartbeatResponse, 0); + m.term = sm.term; + sm.step(m).expect(""); + sm.tick(); + } + + assert_eq!(sm.state, StateRole::Leader); +} + +#[test] +fn test_leader_stepdown_when_quorum_lost() { + let l = default_logger(); + let mut sm = new_test_raft(1, vec![1, 2, 3], 5, 1, new_storage(), &l); + + sm.check_quorum = true; + + sm.become_candidate(); + sm.become_leader(); + sm.raft_log.max_apply_unpersisted_log_limit = 100; + + for _ in 0..=sm.election_timeout() { + sm.tick(); + } + + assert_eq!(sm.state, StateRole::Follower); + // check after become follower, the limit is reset. + // Currently we only support return committed but not persisted raft + // entries on raft leader. Due to the limitation of current interface, + // we hard code this logic in raft-rs. 
+ assert_eq!(sm.raft_log.max_apply_unpersisted_log_limit, 0); +} + +#[test] +fn test_leader_superseding_with_check_quorum() { + let l = default_logger(); + let mut a = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut b = new_test_raft(2, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut c = new_test_raft(3, vec![1, 2, 3], 10, 1, new_storage(), &l); + + a.check_quorum = true; + b.check_quorum = true; + c.check_quorum = true; + + let mut nt = Network::new(vec![Some(a), Some(b), Some(c)], &l); + + let b_election_timeout = nt.peers[&2].election_timeout(); + + // prevent campaigning from b + nt.peers + .get_mut(&2) + .unwrap() + .set_randomized_election_timeout(b_election_timeout + 1); + for _ in 0..b_election_timeout { + nt.peers.get_mut(&2).unwrap().tick(); + } + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + assert_eq!(nt.peers[&1].state, StateRole::Leader); + assert_eq!(nt.peers[&3].state, StateRole::Follower); + + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + + // Peer b rejected c's vote since its electionElapsed had not reached to electionTimeout + assert_eq!(nt.peers[&3].state, StateRole::Candidate); + + // Letting b's electionElapsed reach to electionTimeout + for _ in 0..b_election_timeout { + nt.peers.get_mut(&2).unwrap().tick(); + } + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + assert_eq!(nt.peers[&3].state, StateRole::Leader); +} + +#[test] +fn test_leader_election_with_check_quorum() { + let l = default_logger(); + let mut a = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut b = new_test_raft(2, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut c = new_test_raft(3, vec![1, 2, 3], 10, 1, new_storage(), &l); + + a.check_quorum = true; + b.check_quorum = true; + c.check_quorum = true; + + let mut nt = Network::new(vec![Some(a), Some(b), Some(c)], &l); + + // we can not let system choosing the value of randomizedElectionTimeout + // otherwise it will introduce some 
uncertainty into this test case + // we need to ensure randomizedElectionTimeout > electionTimeout here + let a_election_timeout = nt.peers[&1].election_timeout(); + let b_election_timeout = nt.peers[&2].election_timeout(); + nt.peers + .get_mut(&1) + .unwrap() + .set_randomized_election_timeout(a_election_timeout + 1); + nt.peers + .get_mut(&2) + .unwrap() + .set_randomized_election_timeout(b_election_timeout + 2); + + // Immediately after creation, votes are cast regardless of the election timeout + + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + assert_eq!(nt.peers[&1].state, StateRole::Leader); + assert_eq!(nt.peers[&3].state, StateRole::Follower); + + // need to reset randomizedElectionTimeout larger than electionTimeout again, + // because the value might be reset to electionTimeout since the last state changes + let a_election_timeout = nt.peers[&1].election_timeout(); + let b_election_timeout = nt.peers[&2].election_timeout(); + nt.peers + .get_mut(&1) + .unwrap() + .set_randomized_election_timeout(a_election_timeout + 1); + nt.peers + .get_mut(&2) + .unwrap() + .set_randomized_election_timeout(b_election_timeout + 2); + + for _ in 0..a_election_timeout { + nt.peers.get_mut(&1).unwrap().tick(); + } + for _ in 0..b_election_timeout { + nt.peers.get_mut(&2).unwrap().tick(); + } + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + + assert_eq!(nt.peers[&1].state, StateRole::Follower); + assert_eq!(nt.peers[&3].state, StateRole::Leader); +} + +// test_free_stuck_candidate_with_check_quorum ensures that a candidate with a higher term +// can disrupt the leader even if the leader still "officially" holds the lease, The +// leader is expected to step down and adopt the candidate's term +#[test] +fn test_free_stuck_candidate_with_check_quorum() { + let l = default_logger(); + let mut a = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut b = new_test_raft(2, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut c = 
new_test_raft(3, vec![1, 2, 3], 10, 1, new_storage(), &l); + + a.check_quorum = true; + b.check_quorum = true; + c.check_quorum = true; + + let mut nt = Network::new(vec![Some(a), Some(b), Some(c)], &l); + + // we can not let system choosing the value of randomizedElectionTimeout + // otherwise it will introduce some uncertainty into this test case + // we need to ensure randomizedElectionTimeout > electionTimeout here + let b_election_timeout = nt.peers[&2].election_timeout(); + nt.peers + .get_mut(&2) + .unwrap() + .set_randomized_election_timeout(b_election_timeout + 1); + + for _ in 0..b_election_timeout { + nt.peers.get_mut(&2).unwrap().tick(); + } + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + nt.isolate(1); + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + + assert_eq!(nt.peers[&2].state, StateRole::Follower); + assert_eq!(nt.peers[&3].state, StateRole::Candidate); + assert_eq!(nt.peers[&3].term, &nt.peers[&2].term + 1); + + // Vote again for safety + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + + assert_eq!(nt.peers[&2].state, StateRole::Follower); + assert_eq!(nt.peers[&3].state, StateRole::Candidate); + assert_eq!(nt.peers[&3].term, &nt.peers[&2].term + 2); + + nt.recover(); + let mut msg = new_message(1, 3, MessageType::MsgHeartbeat, 0); + msg.term = nt.peers[&1].term; + nt.send(vec![msg]); + + // Disrupt the leader so that the stuck peer is freed + assert_eq!(nt.peers[&1].state, StateRole::Follower); + assert_eq!(nt.peers[&3].term, nt.peers[&1].term); + + // Vote again, should become leader this time + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + assert_eq!(nt.peers[&3].state, StateRole::Leader); +} + +#[test] +fn test_non_promotable_voter_with_check_quorum() { + let l = default_logger(); + let mut a = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + let mut b = new_test_raft(2, vec![1], 10, 1, new_storage(), &l); + + a.check_quorum = true; + b.check_quorum = true; + + let mut nt = 
Network::new(vec![Some(a), Some(b)], &l); + + // we can not let system choosing the value of randomizedElectionTimeout + // otherwise it will introduce some uncertainty into this test case + // we need to ensure randomizedElectionTimeout > electionTimeout here + let b_election_timeout = nt.peers[&2].election_timeout(); + nt.peers + .get_mut(&2) + .unwrap() + .set_randomized_election_timeout(b_election_timeout + 1); + + // Need to remove 2 again to make it a non-promotable node since newNetwork + // overwritten some internal states + nt.peers + .get_mut(&2) + .unwrap() + .apply_conf_change(&remove_node(2)) + .unwrap(); + + assert!(!nt.peers[&2].promotable()); + + for _ in 0..b_election_timeout { + nt.peers.get_mut(&2).unwrap().tick(); + } + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + assert_eq!(nt.peers[&1].state, StateRole::Leader); + assert_eq!(nt.peers[&2].state, StateRole::Follower); + assert_eq!(nt.peers[&2].leader_id, 1); +} + +/// `test_disruptive_follower` tests isolated follower, +/// with slow network incoming from leader, election times out +/// to become a candidate with an increased term. Then, the +/// candiate's response to late leader heartbeat forces the leader +/// to step down. 
+#[test] +fn test_disruptive_follower() { + let l = default_logger(); + let mut n1 = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut n2 = new_test_raft(2, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut n3 = new_test_raft(3, vec![1, 2, 3], 10, 1, new_storage(), &l); + + n1.check_quorum = true; + n2.check_quorum = true; + n3.check_quorum = true; + + n1.become_follower(1, INVALID_ID); + n2.become_follower(1, INVALID_ID); + n3.become_follower(1, INVALID_ID); + + let mut nt = Network::new(vec![Some(n1), Some(n2), Some(n3)], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // check state + assert_eq!(nt.peers[&1].state, StateRole::Leader); + assert_eq!(nt.peers[&2].state, StateRole::Follower); + assert_eq!(nt.peers[&3].state, StateRole::Follower); + + // etcd server "advanceTicksForElection" on restart; + // this is to expedite campaign trigger when given larger + // election timeouts (e.g. multi-datacenter deploy) + // Or leader messages are being delayed while ticks elapse + let timeout = nt.peers[&3].election_timeout(); + nt.peers + .get_mut(&3) + .unwrap() + .set_randomized_election_timeout(timeout + 2); + let timeout = nt.peers[&3].randomized_election_timeout(); + for _ in 0..timeout - 1 { + nt.peers.get_mut(&3).unwrap().tick(); + } + + // ideally, before last election tick elapses, + // the follower n3 receives "pb.MsgApp" or "pb.MsgHeartbeat" + // from leader n1, and then resets its "electionElapsed" + // however, last tick may elapse before receiving any + // messages from leader, thus triggering campaign + nt.peers.get_mut(&3).unwrap().tick(); + + // n1 is still leader yet + // while its heartbeat to candidate n3 is being delayed + // check state + assert_eq!(nt.peers[&1].state, StateRole::Leader); + assert_eq!(nt.peers[&2].state, StateRole::Follower); + assert_eq!(nt.peers[&3].state, StateRole::Candidate); + + // check term + // n1.Term == 2 + // n2.Term == 2 + // n3.Term == 3 + assert_eq!(nt.peers[&1].term, 2); + 
assert_eq!(nt.peers[&2].term, 2); + assert_eq!(nt.peers[&3].term, 3); + + // while outgoing vote requests are still queued in n3, + // leader heartbeat finally arrives at candidate n3 + // however, due to delayed network from leader, leader + // heartbeat was sent with lower term than candidate's + let mut msg = new_message(1, 3, MessageType::MsgHeartbeat, 0); + msg.term = nt.peers[&1].term; + nt.send(vec![msg]); + + // then candidate n3 responds with "pb.MsgAppResp" of higher term + // and leader steps down from a message with higher term + // this is to disrupt the current leader, so that candidate + // with higher term can be freed with following election + + // check state + assert_eq!(nt.peers[&1].state, StateRole::Follower); + assert_eq!(nt.peers[&2].state, StateRole::Follower); + assert_eq!(nt.peers[&3].state, StateRole::Candidate); + + // check term + // n1.Term == 3 + // n2.Term == 2 + // n3.Term == 3 + assert_eq!(nt.peers[&1].term, 3); + assert_eq!(nt.peers[&2].term, 2); + assert_eq!(nt.peers[&3].term, 3); +} + +/// `test_disruptive_follower_pre_vote` tests isolated follower, +/// with slow network incoming from leader, election times out +/// to become a pre-candidate with less log than current leader. +/// Then pre-vote phase prevents this isolated node from forcing +/// current leader to step down, thus less disruptions. 
+#[test] +fn test_disruptive_follower_pre_vote() { + let l = default_logger(); + let mut n1 = new_test_raft_with_prevote(1, vec![1, 2, 3], 10, 1, new_storage(), true, &l); + let mut n2 = new_test_raft_with_prevote(2, vec![1, 2, 3], 10, 1, new_storage(), true, &l); + let mut n3 = new_test_raft_with_prevote(3, vec![1, 2, 3], 10, 1, new_storage(), true, &l); + + n1.check_quorum = true; + n2.check_quorum = true; + n3.check_quorum = true; + + n1.become_follower(1, INVALID_ID); + n2.become_follower(1, INVALID_ID); + n3.become_follower(1, INVALID_ID); + + let mut nt = Network::new(vec![Some(n1), Some(n2), Some(n3)], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // check state + assert_eq!(nt.peers[&1].state, StateRole::Leader); + assert_eq!(nt.peers[&2].state, StateRole::Follower); + assert_eq!(nt.peers[&3].state, StateRole::Follower); + + nt.isolate(3); + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + + nt.recover(); + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + + // check state + assert_eq!(nt.peers[&1].state, StateRole::Leader); + assert_eq!(nt.peers[&2].state, StateRole::Follower); + assert_eq!(nt.peers[&3].state, StateRole::PreCandidate); + + // check term + // n1.Term == 2 + // n2.Term == 2 + // n3.Term == 2 + assert_eq!(nt.peers[&1].term, 2); + assert_eq!(nt.peers[&2].term, 2); + assert_eq!(nt.peers[&3].term, 2); + + // delayed leader heartbeat does not force current leader to step down + let mut msg = new_message(1, 3, MessageType::MsgHeartbeat, 0); + msg.term = nt.peers[&1].term; + nt.send(vec![msg]); + assert_eq!(nt.peers[&1].state, StateRole::Leader); +} + +#[test] +fn test_read_only_option_safe() { + let l = default_logger(); + let a = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + let b = new_test_raft(2, vec![1, 2, 3], 10, 1, new_storage(), &l); + let c = 
new_test_raft(3, vec![1, 2, 3], 10, 1, new_storage(), &l); + + let mut nt = Network::new(vec![Some(a), Some(b), Some(c)], &l); + + // we can not let system choose the value of randomizedElectionTimeout + // otherwise it will introduce some uncertainty into this test case + // we need to ensure randomizedElectionTimeout > electionTimeout here + let b_election_timeout = nt.peers[&2].election_timeout(); + nt.peers + .get_mut(&2) + .unwrap() + .set_randomized_election_timeout(b_election_timeout + 1); + + for _ in 0..b_election_timeout { + nt.peers.get_mut(&2).unwrap().tick(); + } + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + assert_eq!(nt.peers[&1].state, StateRole::Leader); + + let mut tests = vec![ + (1, 10, 11, vec!["ctx1", "ctx11"], false), + (2, 10, 21, vec!["ctx2", "ctx22"], false), + (3, 10, 31, vec!["ctx3", "ctx33"], false), + (1, 10, 41, vec!["ctx4", "ctx44"], true), + (2, 10, 51, vec!["ctx5", "ctx55"], true), + (3, 10, 61, vec!["ctx6", "ctx66"], true), + ]; + + for (i, (id, proposals, wri, wctx, pending)) in tests.drain(..).enumerate() { + for _ in 0..proposals { + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + } + + let msg1 = new_message_with_entries( + id, + id, + MessageType::MsgReadIndex, + vec![new_entry(0, 0, Some(wctx[0]))], + ); + let msg2 = new_message_with_entries( + id, + id, + MessageType::MsgReadIndex, + vec![new_entry(0, 0, Some(wctx[1]))], + ); + + // `pending` indicates that a `ReadIndex` request will not get through quorum checking immediately + // so that it remains in the `read_index_queue` + if pending { + // drop MsgHeartbeatResponse here to prevent leader handling pending ReadIndex request per round + nt.ignore(MessageType::MsgHeartbeatResponse); + nt.send(vec![msg1.clone(), msg1.clone(), msg2.clone()]); + nt.recover(); + // send a ReadIndex request with the last ctx to notify leader to handle pending read requests + nt.send(vec![msg2.clone()]); + } else { + nt.send(vec![msg1.clone(), msg1.clone(), 
msg2.clone()]); + } + + let read_states: Vec = nt + .peers + .get_mut(&id) + .unwrap() + .read_states + .drain(..) + .collect(); + if read_states.is_empty() { + panic!("#{}: read_states is empty, want non-empty", i); + } + assert_eq!(read_states.len(), wctx.len()); + for (rs, wctx) in read_states.iter().zip(wctx) { + if rs.index != wri { + panic!("#{}: read_index = {}, want {}", i, rs.index, wri) + } + let ctx_bytes = wctx.as_bytes().to_vec(); + if rs.request_ctx != ctx_bytes { + panic!( + "#{}: request_ctx = {:?}, want {:?}", + i, rs.request_ctx, ctx_bytes + ) + } + } + } +} + +#[test] +fn test_read_only_with_learner() { + let l = default_logger(); + let a = new_test_learner_raft(1, vec![1], vec![2], 10, 1, new_storage(), &l); + let b = new_test_learner_raft(2, vec![1], vec![2], 10, 1, new_storage(), &l); + + let mut nt = Network::new(vec![Some(a), Some(b)], &l); + + // we can not let system choose the value of randomizedElectionTimeout + // otherwise it will introduce some uncertainty into this test case + // we need to ensure randomizedElectionTimeout > electionTimeout here + let b_election_timeout = nt.peers[&2].election_timeout(); + nt.peers + .get_mut(&2) + .unwrap() + .set_randomized_election_timeout(b_election_timeout + 1); + + for _ in 0..b_election_timeout { + nt.peers.get_mut(&2).unwrap().tick(); + } + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + assert_eq!(nt.peers[&1].state, StateRole::Leader); + assert_eq!(nt.peers[&2].state, StateRole::Follower); + + let mut tests = vec![ + (1, 10, 11, "ctx1"), + (2, 10, 21, "ctx2"), + (1, 10, 31, "ctx3"), + (2, 10, 41, "ctx4"), + ]; + + for (i, (id, proposals, wri, wctx)) in tests.drain(..).enumerate() { + for _ in 0..proposals { + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + } + + let e = new_entry(0, 0, Some(wctx)); + nt.send(vec![new_message_with_entries( + id, + id, + MessageType::MsgReadIndex, + vec![e], + )]); + + let read_states: Vec = nt + .peers + .get_mut(&id) + 
.unwrap() + .read_states + .drain(..) + .collect(); + assert!( + !read_states.is_empty(), + "#{}: read_states is empty, want non-empty", + i + ); + let rs = &read_states[0]; + assert_eq!( + rs.index, wri, + "#{}: read_index = {}, want {}", + i, rs.index, wri + ); + let vec_wctx = wctx.as_bytes().to_vec(); + assert_eq!( + rs.request_ctx, vec_wctx, + "#{}: request_ctx = {:?}, want {:?}", + i, rs.request_ctx, vec_wctx + ); + } +} + +#[test] +fn test_read_only_option_lease() { + let l = default_logger(); + let mut a = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut b = new_test_raft(2, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut c = new_test_raft(3, vec![1, 2, 3], 10, 1, new_storage(), &l); + a.read_only.option = ReadOnlyOption::LeaseBased; + b.read_only.option = ReadOnlyOption::LeaseBased; + c.read_only.option = ReadOnlyOption::LeaseBased; + a.check_quorum = true; + b.check_quorum = true; + c.check_quorum = true; + + let mut nt = Network::new(vec![Some(a), Some(b), Some(c)], &l); + + // we can not let system choose the value of randomizedElectionTimeout + // otherwise it will introduce some uncertainty into this test case + // we need to ensure randomizedElectionTimeout > electionTimeout here + let b_election_timeout = nt.peers[&2].election_timeout(); + nt.peers + .get_mut(&2) + .unwrap() + .set_randomized_election_timeout(b_election_timeout + 1); + + for _ in 0..b_election_timeout { + nt.peers.get_mut(&2).unwrap().tick(); + } + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + assert_eq!(nt.peers[&1].state, StateRole::Leader); + + let mut tests = vec![ + (1, 10, 11, "ctx1"), + (2, 10, 21, "ctx2"), + (3, 10, 31, "ctx3"), + (1, 10, 41, "ctx4"), + (2, 10, 51, "ctx5"), + (3, 10, 61, "ctx6"), + ]; + + for (i, (id, proposals, wri, wctx)) in tests.drain(..).enumerate() { + for _ in 0..proposals { + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + } + + let entry = new_entry(0, 0, Some(wctx)); + 
nt.send(vec![new_message_with_entries( + id, + id, + MessageType::MsgReadIndex, + vec![entry], + )]); + + let read_states: Vec = nt + .peers + .get_mut(&id) + .unwrap() + .read_states + .drain(..) + .collect(); + if read_states.is_empty() { + panic!("#{}: read_states is empty, want non-empty", i); + } + let rs = &read_states[0]; + if rs.index != wri { + panic!("#{}: read_index = {}, want {}", i, rs.index, wri); + } + let vec_wctx = wctx.as_bytes().to_vec(); + if rs.request_ctx != vec_wctx { + panic!( + "#{}: request_ctx = {:?}, want {:?}", + i, rs.request_ctx, vec_wctx + ); + } + } +} + +#[test] +fn test_read_only_option_lease_without_check_quorum() { + let l = default_logger(); + let mut a = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut b = new_test_raft(2, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut c = new_test_raft(3, vec![1, 2, 3], 10, 1, new_storage(), &l); + a.read_only.option = ReadOnlyOption::LeaseBased; + b.read_only.option = ReadOnlyOption::LeaseBased; + c.read_only.option = ReadOnlyOption::LeaseBased; + + let mut nt = Network::new(vec![Some(a), Some(b), Some(c)], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + let ctx = "ctx1"; + let entry = new_entry(0, 0, Some(ctx)); + nt.send(vec![new_message_with_entries( + 2, + 2, + MessageType::MsgReadIndex, + vec![entry], + )]); + + let read_states = &nt.peers[&2].read_states; + assert!(!read_states.is_empty()); + let rs = &read_states[0]; + assert_eq!(rs.index, 1); + let vec_ctx = ctx.as_bytes().to_vec(); + assert_eq!(rs.request_ctx, vec_ctx); +} + +// `test_read_only_for_new_leader` ensures that a leader only accepts MsgReadIndex message +// when it commits at least one log entry at it term. 
+#[test] +fn test_read_only_for_new_leader() { + let l = default_logger(); + let heartbeat_ticks = 1; + let node_configs = vec![(1, 1, 1, 0), (2, 2, 2, 2), (3, 2, 2, 2)]; + let mut peers = vec![]; + for (id, committed, applied, compact_index) in node_configs { + let mut cfg = new_test_config(id, 10, heartbeat_ticks); + cfg.applied = applied; + let storage = MemStorage::new_with_conf_state((vec![1, 2, 3], vec![])); + let entries = vec![empty_entry(1, 1), empty_entry(1, 2)]; + storage.wl().append(&entries).unwrap(); + let mut hs = HardState::default(); + hs.term = 1; + hs.commit = committed; + storage.wl().set_hardstate(hs); + if compact_index != 0 { + storage.wl().compact(compact_index).unwrap(); + } + let i = new_test_raft_with_config(&cfg, storage, &l); + peers.push(Some(i)); + } + let mut nt = Network::new(peers, &l); + + // Drop MsgAppend to forbid peer 1 to commit any log entry at its term + // after it becomes leader. + nt.ignore(MessageType::MsgAppend); + // Force peer 1 to become leader + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + assert_eq!(nt.peers[&1].state, StateRole::Leader); + + // Ensure peer 1 drops read only request. + let windex = 4; + let wctx = "ctx"; + nt.send(vec![new_message_with_entries( + 1, + 1, + MessageType::MsgReadIndex, + vec![new_entry(0, 0, Some(wctx))], + )]); + assert_eq!(nt.peers[&1].read_states.len(), 0); + + nt.recover(); + + // Force peer 1 to commit a log entry at its term. + for _ in 0..heartbeat_ticks { + nt.peers.get_mut(&1).unwrap().tick(); + } + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + assert_eq!(nt.peers[&1].raft_log.committed, 4); + assert_eq!( + nt.peers[&1] + .raft_log + .term(nt.peers[&1].raft_log.committed) + .unwrap_or(0), + nt.peers[&1].term + ); + + // Ensure peer 1 accepts read only request after it commits a entry at its term. 
+ nt.send(vec![new_message_with_entries( + 1, + 1, + MessageType::MsgReadIndex, + vec![new_entry(0, 0, Some(wctx))], + )]); + let read_states: Vec = nt + .peers + .get_mut(&1) + .unwrap() + .read_states + .drain(..) + .collect(); + assert_eq!(read_states.len(), 1); + let rs = &read_states[0]; + assert_eq!(rs.index, windex); + assert_eq!(rs.request_ctx, wctx.as_bytes().to_vec()); +} + +// `test_advance_commit_index_by_read_index_response` ensures that read index response +// can advance the follower's commit index if it has new enough logs +#[test] +fn test_advance_commit_index_by_read_index_response() { + let l = default_logger(); + let mut tt = Network::new(vec![None, None, None, None, None], &l); + tt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // don't commit entries + tt.cut(1, 3); + tt.cut(1, 4); + tt.cut(1, 5); + tt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + tt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + + tt.recover(); + tt.cut(1, 2); + + // commit entries for leader but not node 2 + tt.send(vec![new_message(3, 1, MessageType::MsgReadIndex, 1)]); + assert_eq!(tt.peers[&1].raft_log.committed, 3); + assert_eq!(tt.peers[&2].raft_log.committed, 1); + + tt.recover(); + // use LeaseBased so leader won't send MsgHeartbeat to advance node 2's commit index + tt.peers.get_mut(&1).unwrap().read_only.option = ReadOnlyOption::LeaseBased; + tt.send(vec![new_message(2, 1, MessageType::MsgReadIndex, 1)]); + assert_eq!(tt.peers[&2].raft_log.committed, 3); +} + +#[test] +fn test_leader_append_response() { + let l = default_logger(); + // Initial progress: match = 0, next = 4 on followers. + let mut tests = vec![ + // Stale resp; no replies. + (3, true, 0, 3, 0, 0, 0), + // Denied resp; decrease next and send probing message. + (2, true, 0, 2, 1, 1, 0), + // Accepted resp; leader commits; broadcast with committed index. 
+ (2, false, 2, 4, 2, 2, 2), + (0, false, 0, 3, 0, 0, 0), + ]; + + for (i, (index, reject, wmatch, wnext, wmsg_num, windex, wcommitted)) in + tests.drain(..).enumerate() + { + // Initial raft logs: last index = 3, committed = 1. + let store = MemStorage::new_with_conf_state((vec![1, 2, 3], vec![])); + let ents = &[empty_entry(0, 1), empty_entry(1, 2)]; + store.wl().append(ents).unwrap(); + let mut sm = new_test_raft(1, vec![1, 2, 3], 10, 1, store, &l); + + // sm term is 2 after it becomes the leader. + sm.become_candidate(); + sm.become_leader(); + + sm.read_messages(); + let mut m = new_message(2, 0, MessageType::MsgAppendResponse, 0); + m.index = index; + m.term = sm.term; + m.reject = reject; + m.reject_hint = index; + sm.step(m).expect(""); + + if sm.prs().get(2).unwrap().matched != wmatch { + panic!( + "#{}: match = {}, want {}", + i, + sm.prs().get(2).unwrap().matched, + wmatch + ); + } + if sm.prs().get(2).unwrap().next_idx != wnext { + panic!( + "#{}: next = {}, want {}", + i, + sm.prs().get(2).unwrap().next_idx, + wnext + ); + } + + let mut msgs = sm.read_messages(); + if msgs.len() != wmsg_num { + panic!("#{} msg_num = {}, want {}", i, msgs.len(), wmsg_num); + } + for (j, msg) in msgs.drain(..).enumerate() { + if msg.index != windex { + panic!("#{}.{} index = {}, want {}", i, j, msg.index, windex); + } + if msg.commit != wcommitted { + panic!("#{}.{} commit = {}, want {}", i, j, msg.commit, wcommitted); + } + } + } +} + +// When the leader receives a heartbeat tick, it should +// send a MsgApp with m.Index = 0, m.LogTerm=0 and empty entries. 
+#[test] +fn test_bcast_beat() { + let l = default_logger(); + // make a state machine with log.offset = 1000 + let offset = 1000u64; + let s = new_snapshot(offset, 1, vec![1, 2, 3]); + let store = new_storage(); + store.wl().apply_snapshot(s).expect(""); + let mut sm = new_test_raft(1, vec![1, 2, 3], 10, 1, store, &l); + sm.term = 1; + + sm.become_candidate(); + sm.become_leader(); + for i in 0..10 { + let _ = sm.append_entry(&mut [empty_entry(0, offset + i + 1)]); + } + sm.persist(); + // slow follower + let mut_pr = |sm: &mut Interface, n, matched, next_idx| { + let m = sm.mut_prs().get_mut(n).unwrap(); + m.matched = matched; + m.next_idx = next_idx; + }; + // slow follower + mut_pr(&mut sm, 2, offset + 5, offset + 6); + // normal follower + let last_index = sm.raft_log.last_index(); + mut_pr(&mut sm, 3, last_index, last_index + 1); + + sm.step(new_message(0, 0, MessageType::MsgBeat, 0)) + .expect(""); + let mut msgs = sm.read_messages(); + assert_eq!(msgs.len(), 2); + + let mut want_commit_map = HashMap::new(); + want_commit_map.insert( + 2, + cmp::min(sm.raft_log.committed, sm.prs().get(2).unwrap().matched), + ); + want_commit_map.insert( + 3, + cmp::min(sm.raft_log.committed, sm.prs().get(3).unwrap().matched), + ); + for (i, m) in msgs.drain(..).enumerate() { + if m.get_msg_type() != MessageType::MsgHeartbeat { + panic!( + "#{}: type = {:?}, want = {:?}", + i, + m.get_msg_type(), + MessageType::MsgHeartbeat + ); + } + if m.index != 0 { + panic!("#{}: prev_index = {}, want {}", i, m.index, 0); + } + if m.log_term != 0 { + panic!("#{}: prev_term = {}, want {}", i, m.log_term, 0); + } + if want_commit_map[&m.to] == 0 { + panic!("#{}: unexpected to {}", i, m.to) + } else { + if m.commit != want_commit_map[&m.to] { + panic!( + "#{}: commit = {}, want {}", + i, m.commit, want_commit_map[&m.to] + ); + } + want_commit_map.remove(&m.to); + } + if !m.entries.is_empty() { + panic!("#{}: entries count = {}, want 0", i, m.entries.len()); + } + } +} + +// tests the output 
of the statemachine when receiving MsgBeat +#[test] +fn test_recv_msg_beat() { + let l = default_logger(); + let mut tests = vec![ + (StateRole::Leader, 2), + // candidate and follower should ignore MsgBeat + (StateRole::Candidate, 0), + (StateRole::Follower, 0), + ]; + + for (i, (state, w_msg)) in tests.drain(..).enumerate() { + let store = MemStorage::new_with_conf_state((vec![1, 2, 3], vec![])); + let ents = &[empty_entry(0, 1), empty_entry(1, 2)]; + store.wl().append(ents).unwrap(); + + let mut sm = new_test_raft(1, vec![1, 2, 3], 10, 1, store, &l); + sm.state = state; + sm.step(new_message(1, 1, MessageType::MsgBeat, 0)) + .expect(""); + + let msgs = sm.read_messages(); + if msgs.len() != w_msg { + panic!("#{}: msg count = {}, want {}", i, msgs.len(), w_msg); + } + for m in msgs { + if m.get_msg_type() != MessageType::MsgHeartbeat { + panic!( + "#{}: msg.type = {:?}, want {:?}", + i, + m.get_msg_type(), + MessageType::MsgHeartbeat + ); + } + } + } +} + +#[test] +fn test_leader_increase_next() { + let l = default_logger(); + let previous_ents = vec![empty_entry(1, 1), empty_entry(1, 2), empty_entry(1, 3)]; + let mut tests = vec![ + // state replicate; optimistically increase next + // previous entries + noop entry + propose + 1 + ( + ProgressState::Replicate, + 2, + previous_ents.len() as u64 + 1 + 1 + 1, + ), + // state probe, not optimistically increase next + (ProgressState::Probe, 2, 2), + ]; + for (i, (state, next_idx, wnext)) in tests.drain(..).enumerate() { + let mut sm = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + sm.raft_log.append(&previous_ents); + sm.persist(); + sm.become_candidate(); + sm.become_leader(); + sm.mut_prs().get_mut(2).unwrap().state = state; + sm.mut_prs().get_mut(2).unwrap().next_idx = next_idx; + sm.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .expect(""); + + if sm.prs().get(2).unwrap().next_idx != wnext { + panic!( + "#{}: next = {}, want {}", + i, + sm.prs().get(2).unwrap().next_idx, + wnext + ); + } + } +} 
+ +#[test] +fn test_send_append_for_progress_probe() { + let l = default_logger(); + let mut r = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + r.become_candidate(); + r.become_leader(); + r.read_messages(); + r.mut_prs().get_mut(2).unwrap().become_probe(); + + // each round is a heartbeat + for i in 0..3 { + if i == 0 { + // we expect that raft will only send out one msgAPP on the first + // loop. After that, the follower is paused until a heartbeat response is + // received. + let _ = r.append_entry(&mut [new_entry(0, 0, SOME_DATA)]); + r.send_append(2); + let msg = r.read_messages(); + assert_eq!(msg.len(), 1); + assert_eq!(msg[0].index, 0); + } + + assert!(r.prs().get(2).unwrap().paused); + for _ in 0..10 { + let _ = r.append_entry(&mut [new_entry(0, 0, SOME_DATA)]); + r.send_append(2); + assert_eq!(r.read_messages().len(), 0); + } + + // do a heartbeat + for _ in 0..r.heartbeat_timeout() { + r.step(new_message(1, 1, MessageType::MsgBeat, 0)) + .expect(""); + } + assert!(r.prs().get(2).unwrap().paused); + + // consume the heartbeat + let msg = r.read_messages(); + assert_eq!(msg.len(), 1); + assert_eq!(msg[0].get_msg_type(), MessageType::MsgHeartbeat); + } + + // a heartbeat response will allow another message to be sent + r.step(new_message(2, 1, MessageType::MsgHeartbeatResponse, 0)) + .expect(""); + let msg = r.read_messages(); + assert_eq!(msg.len(), 1); + assert_eq!(msg[0].index, 0); + assert!(r.prs().get(2).unwrap().paused); +} + +#[test] +fn test_send_append_for_progress_replicate() { + let l = default_logger(); + let mut r = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + r.become_candidate(); + r.become_leader(); + r.read_messages(); + r.mut_prs().get_mut(2).unwrap().become_replicate(); + + for _ in 0..10 { + let _ = r.append_entry(&mut [new_entry(0, 0, SOME_DATA)]); + r.send_append(2); + assert_eq!(r.read_messages().len(), 1); + } +} + +#[test] +fn test_send_append_for_progress_snapshot() { + let l = default_logger(); + let mut r 
= new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + r.become_candidate(); + r.become_leader(); + r.read_messages(); + r.mut_prs().get_mut(2).unwrap().become_snapshot(10); + + for _ in 0..10 { + let _ = r.append_entry(&mut [new_entry(0, 0, SOME_DATA)]); + r.send_append(2); + assert_eq!(r.read_messages().len(), 0); + } +} + +#[test] +fn test_recv_msg_unreachable() { + let l = default_logger(); + let previous_ents = vec![empty_entry(1, 1), empty_entry(1, 2), empty_entry(1, 3)]; + let s = new_storage(); + s.wl().append(&previous_ents).unwrap(); + let mut r = new_test_raft(1, vec![1, 2], 10, 1, s, &l); + r.become_candidate(); + r.become_leader(); + r.read_messages(); + // set node 2 to state replicate + r.mut_prs().get_mut(2).unwrap().matched = 3; + r.mut_prs().get_mut(2).unwrap().become_replicate(); + r.mut_prs().get_mut(2).unwrap().optimistic_update(5); + + r.step(new_message(2, 1, MessageType::MsgUnreachable, 0)) + .expect(""); + + let peer_2 = r.prs().get(2).unwrap(); + assert_eq!(peer_2.state, ProgressState::Probe); + assert_eq!(peer_2.matched + 1, peer_2.next_idx); +} + +#[test] +fn test_restore() { + let l = default_logger(); + // magic number + let s = new_snapshot(11, 11, vec![1, 2, 3]); + + let mut sm = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + assert!(sm.restore(s.clone())); + assert_eq!(sm.raft_log.last_index(), s.get_metadata().index); + assert_eq!( + sm.raft_log.term(s.get_metadata().index).unwrap(), + s.get_metadata().term + ); + assert_iter_eq!( + o sm.prs().conf().voters().ids(), + s.get_metadata() + .get_conf_state() + .voters + ); + assert!(!sm.restore(s)); +} + +#[test] +fn test_restore_ignore_snapshot() { + let l = default_logger(); + let previous_ents = vec![empty_entry(1, 1), empty_entry(1, 2), empty_entry(1, 3)]; + let commit = 1u64; + let mut sm = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + sm.raft_log.append(&previous_ents); + sm.raft_log.commit_to(commit); + + let mut s = new_snapshot(commit, 1, vec![1, 
2]); + + // ingore snapshot + assert!(!sm.restore(s.clone())); + assert_eq!(sm.raft_log.committed, commit); + + // ignore snapshot and fast forward commit + s.mut_metadata().index = commit + 1; + assert!(!sm.restore(s)); + assert_eq!(sm.raft_log.committed, commit + 1); +} + +#[test] +fn test_provide_snap() { + let l = default_logger(); + // restore the state machine from a snapshot so it has a compacted log and a snapshot + let s = new_snapshot(11, 11, vec![1, 2]); // magic number + + let mut sm = new_test_raft(1, vec![1], 10, 1, new_storage(), &l); + sm.restore(s); + sm.persist(); + + sm.become_candidate(); + sm.become_leader(); + + // force set the next of node 2, so that node 2 needs a snapshot + sm.mut_prs().get_mut(2).unwrap().next_idx = sm.raft_log.first_index(); + let mut m = new_message(2, 1, MessageType::MsgAppendResponse, 0); + m.index = sm.prs().get(2).unwrap().next_idx - 1; + m.reject = true; + sm.step(m).expect(""); + + let msgs = sm.read_messages(); + assert_eq!(msgs.len(), 1); + assert_eq!(msgs[0].get_msg_type(), MessageType::MsgSnapshot); +} + +#[test] +fn test_ignore_providing_snapshot() { + let l = default_logger(); + // restore the state machine from a snapshot so it has a compacted log and a snapshot + let s = new_snapshot(11, 11, vec![1, 2]); // magic number + let mut sm = new_test_raft(1, vec![1], 10, 1, new_storage(), &l); + sm.restore(s); + sm.persist(); + + sm.become_candidate(); + sm.become_leader(); + + // force set the next of node 2, so that node 2 needs a snapshot + // change node 2 to be inactive, expect node 1 ignore sending snapshot to 2 + sm.mut_prs().get_mut(2).unwrap().next_idx = sm.raft_log.first_index() - 1; + sm.mut_prs().get_mut(2).unwrap().recent_active = false; + + sm.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .expect(""); + + assert_eq!(sm.read_messages().len(), 0); +} + +#[test] +fn test_restore_from_snap_msg() { + let l = default_logger(); + let s = new_snapshot(11, 11, vec![1, 2]); // magic number + let mut 
sm = new_test_raft(2, vec![1, 2], 10, 1, new_storage(), &l); + let mut m = new_message(1, 0, MessageType::MsgSnapshot, 0); + m.term = 2; + m.set_snapshot(s); + + sm.step(m).expect(""); + + assert_eq!(sm.leader_id, 1); + + // TODO: port the remaining if upstream completed this test. +} + +#[test] +fn test_slow_node_restore() { + let l = default_logger(); + let mut nt = Network::new(vec![None, None, None], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + nt.isolate(3); + for _ in 0..100 { + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + } + next_ents(nt.peers.get_mut(&1).unwrap(), &nt.storage[&1]); + nt.storage[&1] + .wl() + .commit_to(nt.peers[&1].raft_log.applied) + .unwrap(); + nt.storage[&1] + .wl() + .compact(nt.peers[&1].raft_log.applied) + .unwrap(); + + nt.recover(); + // send heartbeats so that the leader can learn everyone is active. + // node 3 will only be considered as active when node 1 receives a reply from it. + loop { + nt.send(vec![new_message(1, 1, MessageType::MsgBeat, 0)]); + if nt.peers[&1].prs().get(3).unwrap().recent_active { + break; + } + } + + // trigger a snapshot + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + + // trigger a commit + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + assert_eq!( + nt.peers[&3].raft_log.committed, + nt.peers[&1].raft_log.committed + ); +} + +// test_step_config tests that when raft step msgProp in EntryConfChange type, +// it appends the entry to log and sets pendingConf to be true. 
+#[test] +fn test_step_config() { + let l = default_logger(); + // a raft that cannot make progress + let mut r = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + r.become_candidate(); + r.become_leader(); + let index = r.raft_log.last_index(); + let mut m = new_message(1, 1, MessageType::MsgPropose, 0); + let mut e = Entry::default(); + e.set_entry_type(EntryType::EntryConfChange); + m.mut_entries().push(e); + r.step(m).expect(""); + assert_eq!(r.raft_log.last_index(), index + 1); +} + +// test_step_ignore_config tests that if raft step the second msgProp in +// EntryConfChange type when the first one is uncommitted, the node will set +// the proposal to noop and keep its original state. +#[test] +fn test_step_ignore_config() { + let l = default_logger(); + // a raft that cannot make progress + let mut r = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + r.become_candidate(); + r.become_leader(); + assert!(!r.has_pending_conf()); + let mut m = new_message(1, 1, MessageType::MsgPropose, 0); + let mut e = Entry::default(); + e.set_entry_type(EntryType::EntryConfChange); + m.mut_entries().push(e); + assert!(!r.has_pending_conf()); + r.step(m.clone()).expect(""); + assert!(r.has_pending_conf()); + let index = r.raft_log.last_index(); + let pending_conf_index = r.pending_conf_index; + r.step(m).expect(""); + let mut we = empty_entry(1, 3); + we.set_entry_type(EntryType::EntryNormal); + let wents = vec![we]; + let entries = r + .raft_log + .entries(index + 1, None, GetEntriesContext::empty(false)) + .expect(""); + assert_eq!(entries, wents); + assert_eq!(r.pending_conf_index, pending_conf_index); +} + +// test_new_leader_pending_config tests that new leader sets its pending_conf_index +// based on uncommitted entries. 
+#[test] +fn test_new_leader_pending_config() { + let l = default_logger(); + let mut tests = vec![(false, 0), (true, 1)]; + for (i, (add_entry, wpending_index)) in tests.drain(..).enumerate() { + let mut r = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + let mut e = Entry::default(); + if add_entry { + e.set_entry_type(EntryType::EntryNormal); + let _ = r.append_entry(&mut [e]); + r.persist(); + } + r.become_candidate(); + r.become_leader(); + if r.pending_conf_index != wpending_index { + panic!( + "#{}: pending_conf_index = {}, want {}", + i, r.pending_conf_index, wpending_index + ); + } + assert_eq!(r.has_pending_conf(), add_entry, "#{}: ", i); + } +} + +// test_add_node tests that add_node could update nodes correctly. +#[test] +fn test_add_node() -> Result<()> { + let l = default_logger(); + let mut r = new_test_raft(1, vec![1], 10, 1, new_storage(), &l); + r.apply_conf_change(&add_node(2))?; + assert_iter_eq!(o r.prs().conf().voters().ids(), + [1, 2] + ); + + Ok(()) +} + +#[test] +fn test_add_node_check_quorum() -> Result<()> { + let l = default_logger(); + let mut r = new_test_raft(1, vec![1], 10, 1, new_storage(), &l); + + r.check_quorum = true; + + r.become_candidate(); + r.become_leader(); + + for _ in 0..r.election_timeout() - 1 { + r.tick(); + } + + r.apply_conf_change(&add_node(2))?; + + // This tick will reach electionTimeout, which triggers a quorum check. + r.tick(); + + // Node 1 should still be the leader after a single tick. + assert_eq!(r.state, StateRole::Leader); + + // After another electionTimeout ticks without hearing from node 2, + // node 1 should step down. + for _ in 0..r.election_timeout() { + r.tick(); + } + + assert_eq!(r.state, StateRole::Follower); + + Ok(()) +} + +// test_remove_node tests that removeNode could update pendingConf, nodes and +// and removed list correctly. 
+#[test] +fn test_remove_node() -> Result<()> { + let l = default_logger(); + let mut r = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + r.apply_conf_change(&remove_node(2))?; + assert_iter_eq!(o r.prs().conf().voters().ids(), [1]); + + // Removing all voters is not allowed. + assert!(r.apply_conf_change(&remove_node(1)).is_err()); + assert_iter_eq!(o r.prs().conf().voters().ids(), [1]); + + Ok(()) +} + +#[test] +fn test_remove_node_itself() { + let l = default_logger(); + let mut n1 = new_test_learner_raft(1, vec![1], vec![2], 10, 1, new_storage(), &l); + + assert!(n1.apply_conf_change(&remove_node(1)).is_err()); + assert_iter_eq!(n1.prs().conf().learners(), [2]); + assert_iter_eq!(o n1.prs().conf().voters().ids(), [1]); +} + +#[test] +fn test_promotable() { + let l = default_logger(); + let id = 1u64; + let mut tests = vec![ + (vec![1], true), + (vec![1, 2, 3], true), + (vec![], false), + (vec![2, 3], false), + ]; + for (i, (peers, wp)) in tests.drain(..).enumerate() { + let r = new_test_raft(id, peers, 5, 1, new_storage(), &l); + if r.promotable() != wp { + panic!("#{}: promotable = {}, want {}", i, r.promotable(), wp); + } + } +} + +#[test] +fn test_raft_nodes() { + let l = default_logger(); + let mut tests = vec![ + (vec![1, 2, 3], vec![1, 2, 3]), + (vec![3, 2, 1], vec![1, 2, 3]), + ]; + for (i, (ids, wids)) in tests.drain(..).enumerate() { + let r = new_test_raft(1, ids, 10, 1, new_storage(), &l); + let voter_ids: HashSet<_> = r.prs().conf().voters().ids().iter().collect(); + let wids = wids.into_iter().collect::>(); + if voter_ids != wids { + panic!("#{}: nodes = {:?}, want {:?}", i, voter_ids, wids); + } + } +} + +#[test] +fn test_campaign_while_leader() { + let l = default_logger(); + test_campaign_while_leader_with_pre_vote(false, &l); +} + +#[test] +fn test_pre_campaign_while_leader() { + let l = default_logger(); + test_campaign_while_leader_with_pre_vote(true, &l); +} + +fn test_campaign_while_leader_with_pre_vote(pre_vote: bool, l: &Logger) 
{ + let mut r = new_test_raft_with_prevote(1, vec![1], 5, 1, new_storage(), pre_vote, l); + assert_eq!(r.state, StateRole::Follower); + // We don't call campaign() directly because it comes after the check + // for our current state. + r.step(new_message(1, 1, MessageType::MsgHup, 0)).expect(""); + assert_eq!(r.state, StateRole::Leader); + let term = r.term; + r.step(new_message(1, 1, MessageType::MsgHup, 0)).expect(""); + assert_eq!(r.state, StateRole::Leader); + assert_eq!(r.term, term); +} + +// test_commit_after_remove_node verifies that pending commands can become +// committed when a config change reduces the quorum requirements. +#[test] +fn test_commit_after_remove_node() -> Result<()> { + let l = default_logger(); + // Create a cluster with two nodes. + let s = new_storage(); + let mut r = new_test_raft(1, vec![1, 2], 5, 1, s.clone(), &l); + r.become_candidate(); + r.become_leader(); + + // Begin to remove the second node. + let mut msg = new_message(0, 0, MessageType::MsgPropose, 0); + let mut entry = Entry::default(); + entry.set_entry_type(EntryType::EntryConfChange); + let mut cc = ConfChange::default(); + cc.set_change_type(ConfChangeType::RemoveNode); + cc.node_id = 2; + let ccdata = cc.write_to_bytes().unwrap(); + entry.data = ccdata.into(); + msg.mut_entries().push(entry); + r.step(msg).expect(""); + // Stabilize the log and make sure nothing is committed yet. + assert_eq!(next_ents(&mut r, &s).len(), 0); + let cc_index = r.raft_log.last_index(); + + // While the config change is pending, make another proposal. + let mut msg = new_message(0, 0, MessageType::MsgPropose, 0); + let mut entry = new_entry(0, 0, Some("hello")); + entry.set_entry_type(EntryType::EntryNormal); + msg.mut_entries().push(entry); + r.step(msg).expect(""); + + // Node 2 acknowledges the config change, committing it. 
+ let mut msg = new_message(2, 0, MessageType::MsgAppendResponse, 0); + msg.index = cc_index; + r.step(msg).expect(""); + let ents = next_ents(&mut r, &s); + assert_eq!(ents.len(), 2); + assert_eq!(ents[0].get_entry_type(), EntryType::EntryNormal); + assert!(ents[0].data.is_empty()); + assert_eq!(ents[1].get_entry_type(), EntryType::EntryConfChange); + + // Apply the config change. This reduces quorum requirements so the + // pending command can now commit. + r.apply_conf_change(&remove_node(2))?; + let ents = next_ents(&mut r, &s); + assert_eq!(ents.len(), 1); + assert_eq!(ents[0].get_entry_type(), EntryType::EntryNormal); + assert_eq!(ents[0].data.as_ref(), b"hello"); + + Ok(()) +} + +// test_leader_transfer_to_uptodate_node verifies transferring should succeed +// if the transferee has the most up-to-date log entries when transfer starts. +#[test] +fn test_leader_transfer_to_uptodate_node() { + let l = default_logger(); + let mut nt = Network::new(vec![None, None, None], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + let lead_id = nt.peers[&1].leader_id; + assert_eq!(lead_id, 1); + + // Transfer leadership to peer 2. + nt.send(vec![new_message(2, 1, MessageType::MsgTransferLeader, 0)]); + check_leader_transfer_state(&nt.peers[&1], StateRole::Follower, 2); + + // After some log replication, transfer leadership back to peer 1. + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + nt.send(vec![new_message(1, 2, MessageType::MsgTransferLeader, 0)]); + check_leader_transfer_state(&nt.peers[&1], StateRole::Leader, 1); +} + +// test_leader_transfer_to_uptodate_node_from_follower verifies transferring should succeed +// if the transferee has the most up-to-date log entries when transfer starts. +// Not like test_leader_transfer_to_uptodate_node, where the leader transfer message +// is sent to the leader, in this test case every leader transfer message is sent +// to the follower. 
+#[test] +fn test_leader_transfer_to_uptodate_node_from_follower() { + let l = default_logger(); + let mut nt = Network::new(vec![None, None, None], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + let lead_id = nt.peers[&1].leader_id; + assert_eq!(lead_id, 1); + + // transfer leadership to peer 2. + nt.send(vec![new_message(2, 2, MessageType::MsgTransferLeader, 0)]); + check_leader_transfer_state(&nt.peers[&1], StateRole::Follower, 2); + + // After some log replication, transfer leadership back to peer 1. + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + nt.send(vec![new_message(1, 1, MessageType::MsgTransferLeader, 0)]); + check_leader_transfer_state(&nt.peers[&1], StateRole::Leader, 1); +} + +// TestLeaderTransferWithCheckQuorum ensures transferring leader still works +// even the current leader is still under its leader lease +#[test] +fn test_leader_transfer_with_check_quorum() { + let l = default_logger(); + let mut nt = Network::new(vec![None, None, None], &l); + for i in 1..4 { + let r = &mut nt.peers.get_mut(&i).unwrap(); + r.check_quorum = true; + let election_timeout = r.election_timeout(); + r.set_randomized_election_timeout(election_timeout + i as usize); + } + + let b_election_timeout = nt.peers[&2].election_timeout(); + nt.peers + .get_mut(&2) + .unwrap() + .set_randomized_election_timeout(b_election_timeout + 1); + + // Letting peer 2 electionElapsed reach to timeout so that it can vote for peer 1 + for _ in 0..b_election_timeout { + nt.peers.get_mut(&2).unwrap().tick(); + } + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + assert_eq!(nt.peers[&1].leader_id, 1); + + // Transfer leadership to 2. + nt.send(vec![new_message(2, 1, MessageType::MsgTransferLeader, 0)]); + check_leader_transfer_state(&nt.peers[&1], StateRole::Follower, 2); + + // After some log replication, transfer leadership back to 1. 
+ nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + nt.send(vec![new_message(1, 2, MessageType::MsgTransferLeader, 0)]); + check_leader_transfer_state(&nt.peers[&1], StateRole::Leader, 1); +} + +#[test] +fn test_leader_transfer_to_slow_follower() { + let l = default_logger(); + let mut nt = Network::new(vec![None, None, None], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + nt.isolate(3); + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + + nt.recover(); + assert_eq!(nt.peers[&1].prs().get(3).unwrap().matched, 1); + + // Transfer leadership to 3 when node 3 is lack of log. + nt.send(vec![new_message(3, 1, MessageType::MsgTransferLeader, 0)]); + + check_leader_transfer_state(&nt.peers[&1], StateRole::Follower, 3); +} + +#[test] +fn test_leader_transfer_after_snapshot() { + let l = default_logger(); + let mut nt = Network::new(vec![None, None, None], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + nt.isolate(3); + + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + next_ents(nt.peers.get_mut(&1).unwrap(), &nt.storage[&1]); + nt.storage[&1] + .wl() + .commit_to(nt.peers[&1].raft_log.applied) + .unwrap(); + nt.storage[&1] + .wl() + .compact(nt.peers[&1].raft_log.applied) + .unwrap(); + + nt.recover(); + assert_eq!(nt.peers[&1].prs().get(3).unwrap().matched, 1); + + // Transfer leadership to 3 when node 3 is lack of snapshot. + nt.send(vec![new_message(3, 1, MessageType::MsgTransferLeader, 0)]); + // Send pb.MsgHeartbeatResp to leader to trigger a snapshot for node 3. + nt.send(vec![new_message( + 3, + 1, + MessageType::MsgHeartbeatResponse, + 0, + )]); + + check_leader_transfer_state(&nt.peers[&1], StateRole::Follower, 3); +} + +#[test] +fn test_leader_transfer_to_self() { + let l = default_logger(); + let mut nt = Network::new(vec![None, None, None], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // Transfer leadership to self, there will be noop. 
+ nt.send(vec![new_message(1, 1, MessageType::MsgTransferLeader, 0)]); + check_leader_transfer_state(&nt.peers[&1], StateRole::Leader, 1); +} + +#[test] +fn test_leader_transfer_to_non_existing_node() { + let l = default_logger(); + let mut nt = Network::new(vec![None, None, None], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // Transfer leadership to non-existing node, there will be noop. + nt.send(vec![new_message(4, 1, MessageType::MsgTransferLeader, 0)]); + check_leader_transfer_state(&nt.peers[&1], StateRole::Leader, 1); +} + +#[test] +fn test_leader_transfer_to_learner() { + let l = default_logger(); + let s = MemStorage::new_with_conf_state((vec![1], vec![2])); + let c = new_test_config(1, 10, 1); + let leader = new_test_raft_with_config(&c, s, &l); + + let s = MemStorage::new_with_conf_state((vec![1], vec![2])); + let c = new_test_config(2, 10, 1); + let learner = new_test_raft_with_config(&c, s, &l); + + let mut nt = Network::new(vec![Some(leader), Some(learner)], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // Transfer leadership to learner node, there will be noop. + nt.send(vec![new_message(2, 1, MessageType::MsgTransferLeader, 0)]); + check_leader_transfer_state(&nt.peers[&1], StateRole::Leader, 1); +} + +#[test] +fn test_leader_transfer_timeout() { + let l = default_logger(); + let mut nt = Network::new(vec![None, None, None], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + nt.isolate(3); + + // Transfer leadership to isolated node, wait for timeout. 
+ nt.send(vec![new_message(3, 1, MessageType::MsgTransferLeader, 0)]); + assert_eq!(nt.peers[&1].lead_transferee.unwrap(), 3); + let heartbeat_timeout = nt.peers[&1].heartbeat_timeout(); + let election_timeout = nt.peers[&1].election_timeout(); + for _ in 0..heartbeat_timeout { + nt.peers.get_mut(&1).unwrap().tick(); + } + assert_eq!(nt.peers[&1].lead_transferee.unwrap(), 3); + for _ in 0..election_timeout - heartbeat_timeout { + nt.peers.get_mut(&1).unwrap().tick(); + } + + check_leader_transfer_state(&nt.peers[&1], StateRole::Leader, 1); +} + +#[test] +fn test_leader_transfer_ignore_proposal() { + let l = default_logger(); + let mut nt = Network::new(vec![None, None, None], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + nt.isolate(3); + + // Transfer leadership to isolated node to let transfer pending, then send proposal. + nt.send(vec![new_message(3, 1, MessageType::MsgTransferLeader, 0)]); + assert_eq!(nt.peers[&1].lead_transferee.unwrap(), 3); + + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + assert_eq!( + nt.peers + .get_mut(&1) + .unwrap() + .step(new_message(1, 1, MessageType::MsgPropose, 1)), + Err(Error::ProposalDropped), + "should return drop proposal error while transferring" + ); + + assert_eq!(nt.peers[&1].prs().get(1).unwrap().matched, 1); +} + +#[test] +fn test_leader_transfer_receive_higher_term_vote() { + let l = default_logger(); + let mut nt = Network::new(vec![None, None, None], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + nt.isolate(3); + + // Transfer leadership to isolated node to let transfer pending. 
+    nt.send(vec![new_message(3, 1, MessageType::MsgTransferLeader, 0)]);
+    assert_eq!(nt.peers[&1].lead_transferee.unwrap(), 3);
+
+    nt.send(vec![new_message_with_entries(
+        2,
+        2,
+        MessageType::MsgHup,
+        vec![new_entry(1, 2, None)],
+    )]);
+
+    check_leader_transfer_state(&nt.peers[&1], StateRole::Follower, 2);
+}
+
+#[test]
+fn test_leader_transfer_remove_node() -> Result<()> {
+    let l = default_logger();
+    let mut nt = Network::new(vec![None, None, None], &l);
+    nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]);
+
+    nt.ignore(MessageType::MsgTimeoutNow);
+
+    // The lead_transferee is removed while the leadership transfer is in progress.
+    nt.send(vec![new_message(3, 1, MessageType::MsgTransferLeader, 0)]);
+    assert_eq!(nt.peers[&1].lead_transferee.unwrap(), 3);
+
+    nt.peers
+        .get_mut(&1)
+        .unwrap()
+        .apply_conf_change(&remove_node(3))?;
+
+    check_leader_transfer_state(&nt.peers[&1], StateRole::Leader, 1);
+
+    Ok(())
+}
+
+// test_leader_transfer_back verifies leadership can transfer
+// back to self when last transfer is pending.
+#[test]
+fn test_leader_transfer_back() {
+    let l = default_logger();
+    let mut nt = Network::new(vec![None, None, None], &l);
+    nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]);
+
+    nt.isolate(3);
+
+    nt.send(vec![new_message(3, 1, MessageType::MsgTransferLeader, 0)]);
+    assert_eq!(nt.peers[&1].lead_transferee.unwrap(), 3);
+
+    // Transfer leadership back to self.
+    nt.send(vec![new_message(1, 1, MessageType::MsgTransferLeader, 0)]);
+
+    check_leader_transfer_state(&nt.peers[&1], StateRole::Leader, 1);
+}
+
+// test_leader_transfer_second_transfer_to_another_node verifies the leader can transfer to another node
+// even when the last transfer is pending.
+#[test] +fn test_leader_transfer_second_transfer_to_another_node() { + let l = default_logger(); + let mut nt = Network::new(vec![None, None, None], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + nt.isolate(3); + + nt.send(vec![new_message(3, 1, MessageType::MsgTransferLeader, 0)]); + assert_eq!(nt.peers[&1].lead_transferee.unwrap(), 3); + + // Transfer leadership to another node. + nt.send(vec![new_message(2, 1, MessageType::MsgTransferLeader, 0)]); + + check_leader_transfer_state(&nt.peers[&1], StateRole::Follower, 2); +} + +// test_leader_transfer_second_transfer_to_same_node verifies second transfer leader request +// to the same node should not extend the timeout while the first one is pending. +#[test] +fn test_leader_transfer_second_transfer_to_same_node() { + let l = default_logger(); + let mut nt = Network::new(vec![None, None, None], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + nt.isolate(3); + + nt.send(vec![new_message(3, 1, MessageType::MsgTransferLeader, 0)]); + assert_eq!(nt.peers[&1].lead_transferee.unwrap(), 3); + + let heartbeat_timeout = nt.peers[&1].heartbeat_timeout(); + for _ in 0..heartbeat_timeout { + nt.peers.get_mut(&1).unwrap().tick(); + } + + // Second transfer leadership request to the same node. 
+    nt.send(vec![new_message(3, 1, MessageType::MsgTransferLeader, 0)]);
+
+    let election_timeout = nt.peers[&1].election_timeout();
+    for _ in 0..election_timeout - heartbeat_timeout {
+        nt.peers.get_mut(&1).unwrap().tick();
+    }
+
+    check_leader_transfer_state(&nt.peers[&1], StateRole::Leader, 1);
+}
+
+fn check_leader_transfer_state(r: &Raft<MemStorage>, state: StateRole, lead: u64) {
+    if r.state != state || r.leader_id != lead {
+        panic!(
+            "after transferring, node has state {:?} lead {}, want state {:?} lead {}",
+            r.state, r.leader_id, state, lead
+        );
+    }
+    assert_eq!(r.lead_transferee, None);
+}
+
+// test_transfer_non_member verifies that when a MsgTimeoutNow arrives at
+// a node that has been removed from the group, nothing happens.
+// (previously, if the node also got votes, it would panic as it
+// transitioned to StateRole::Leader)
+#[test]
+fn test_transfer_non_member() {
+    let l = default_logger();
+    let mut raft = new_test_raft(1, vec![2, 3, 4], 5, 1, new_storage(), &l);
+    raft.step(new_message(2, 1, MessageType::MsgTimeoutNow, 0))
+        .expect("");
+
+    raft.step(new_message(2, 1, MessageType::MsgRequestVoteResponse, 0))
+        .expect("");
+    raft.step(new_message(3, 1, MessageType::MsgRequestVoteResponse, 0))
+        .expect("");
+    assert_eq!(raft.state, StateRole::Follower);
+}
+
+// TestNodeWithSmallerTermCanCompleteElection tests the scenario where a node
+// that has been partitioned away (and fallen behind) rejoins the cluster at
+// about the same time the leader node gets partitioned away.
+// Previously the cluster would come to a standstill when run with PreVote
+// enabled.
+#[test] +fn test_node_with_smaller_term_can_complete_election() { + let l = default_logger(); + let mut n1 = new_test_raft_with_prevote(1, vec![1, 2, 3], 10, 1, new_storage(), true, &l); + let mut n2 = new_test_raft_with_prevote(2, vec![1, 2, 3], 10, 1, new_storage(), true, &l); + let mut n3 = new_test_raft_with_prevote(3, vec![1, 2, 3], 10, 1, new_storage(), true, &l); + + n1.become_follower(1, INVALID_ID); + n2.become_follower(1, INVALID_ID); + n3.become_follower(1, INVALID_ID); + + // cause a network partition to isolate node 3 + let mut config = Network::default_config(); + config.pre_vote = true; + let mut nt = Network::new_with_config(vec![Some(n1), Some(n2), Some(n3)], &config, &l); + nt.cut(1, 3); + nt.cut(2, 3); + + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + assert_eq!(nt.peers[&1].state, StateRole::Leader); + assert_eq!(nt.peers[&2].state, StateRole::Follower); + + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + assert_eq!(nt.peers[&3].state, StateRole::PreCandidate); + + nt.send(vec![new_message(2, 2, MessageType::MsgHup, 0)]); + + // check whether the term values are expected + // a.Term == 3 + // b.Term == 3 + // c.Term == 1 + assert_eq!(nt.peers[&1].term, 3); + assert_eq!(nt.peers[&2].term, 3); + assert_eq!(nt.peers[&3].term, 1); + + // check state + // a == follower + // b == leader + // c == pre-candidate + assert_eq!(nt.peers[&1].state, StateRole::Follower); + assert_eq!(nt.peers[&2].state, StateRole::Leader); + assert_eq!(nt.peers[&3].state, StateRole::PreCandidate); + + // recover the network then immediately isolate b which is currently + // the leader, this is to emulate the crash of b. + nt.recover(); + nt.cut(2, 1); + nt.cut(2, 3); + + // call for election + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // do we have a leader? 
+    assert!(
+        nt.peers[&1].state == StateRole::Leader || nt.peers[&3].state == StateRole::Leader,
+        "no leader"
+    );
+}
+
+pub fn new_test_learner_raft(
+    id: u64,
+    peers: Vec<u64>,
+    learners: Vec<u64>,
+    election: usize,
+    heartbeat: usize,
+    storage: MemStorage,
+    logger: &Logger,
+) -> Interface {
+    if storage.initial_state().unwrap().initialized() && peers.is_empty() {
+        panic!("new_test_raft with empty peers on initialized store");
+    }
+    if !peers.is_empty() && !storage.initial_state().unwrap().initialized() {
+        storage.initialize_with_conf_state((peers, learners));
+    }
+    let cfg = new_test_config(id, election, heartbeat);
+    new_test_raft_with_config(&cfg, storage, logger)
+}
+
+pub fn new_test_learner_raft_with_prevote(
+    id: u64,
+    peers: Vec<u64>,
+    learners: Vec<u64>,
+    logger: &Logger,
+    prevote: bool,
+) -> Interface {
+    let storage = new_storage();
+    storage.initialize_with_conf_state((peers, learners));
+    let mut cfg = new_test_config(id, 10, 1);
+    cfg.pre_vote = prevote;
+    new_test_raft_with_config(&cfg, storage, logger)
+}
+
+// TestLearnerElectionTimeout verifies that a learner should not start an election
+// even when it times out.
+#[test]
+fn test_learner_election_timeout() {
+    let l = default_logger();
+    let mut n1 = new_test_learner_raft(1, vec![1], vec![2], 10, 1, new_storage(), &l);
+    n1.become_follower(1, INVALID_ID);
+
+    let mut n2 = new_test_learner_raft(2, vec![1], vec![2], 10, 1, new_storage(), &l);
+    n2.become_follower(1, INVALID_ID);
+
+    let timeout = n2.election_timeout();
+    n2.set_randomized_election_timeout(timeout);
+
+    // n2 is a learner. A learner should not start an election even when it times out.
+    for _ in 0..timeout {
+        n2.tick();
+    }
+    assert_eq!(n2.state, StateRole::Follower);
+}
+
+// TestLearnerPromotion verifies that a learner cannot start an election until
+// it is promoted to a normal peer.
+#[test] +fn test_learner_promotion() -> Result<()> { + let l = default_logger(); + let mut n1 = new_test_learner_raft(1, vec![1], vec![2], 10, 1, new_storage(), &l); + n1.become_follower(1, INVALID_ID); + + let mut n2 = new_test_learner_raft(2, vec![1], vec![2], 10, 1, new_storage(), &l); + n2.become_follower(1, INVALID_ID); + + let mut network = Network::new(vec![Some(n1), Some(n2)], &l); + assert_eq!(network.peers[&1].state, StateRole::Follower); + + // n1 should become leader. + let timeout = network.peers[&1].election_timeout(); + network + .peers + .get_mut(&1) + .unwrap() + .set_randomized_election_timeout(timeout); + for _ in 0..timeout { + network.peers.get_mut(&1).unwrap().tick(); + } + assert_eq!(network.peers[&1].state, StateRole::Leader); + assert_eq!(network.peers[&2].state, StateRole::Follower); + + let mut heart_beat = new_message(1, 1, MessageType::MsgBeat, 0); + network.send(vec![heart_beat.clone()]); + + // Promote n2 from learner to follower. + network + .peers + .get_mut(&1) + .unwrap() + .apply_conf_change(&add_node(2))?; + network + .peers + .get_mut(&2) + .unwrap() + .apply_conf_change(&add_node(2))?; + assert_eq!(network.peers[&2].state, StateRole::Follower); + assert!(network.peers[&2].promotable()); + + let timeout = network.peers[&2].election_timeout(); + network + .peers + .get_mut(&2) + .unwrap() + .set_randomized_election_timeout(timeout); + for _ in 0..timeout { + network.peers.get_mut(&2).unwrap().tick(); + } + + heart_beat.to = 2; + heart_beat.from = 2; + network.send(vec![heart_beat]); + assert_eq!(network.peers[&1].state, StateRole::Follower); + assert_eq!(network.peers[&2].state, StateRole::Leader); + + Ok(()) +} + +// TestLearnerLogReplication tests that a learner can receive entries from the leader. 
+#[test] +fn test_learner_log_replication() { + let l = default_logger(); + let n1 = new_test_learner_raft(1, vec![1], vec![2], 10, 1, new_storage(), &l); + let n2 = new_test_learner_raft(2, vec![1], vec![2], 10, 1, new_storage(), &l); + let mut network = Network::new(vec![Some(n1), Some(n2)], &l); + + network + .peers + .get_mut(&1) + .unwrap() + .become_follower(1, INVALID_ID); + network + .peers + .get_mut(&2) + .unwrap() + .become_follower(1, INVALID_ID); + + let timeout = network.peers[&1].election_timeout(); + network + .peers + .get_mut(&1) + .unwrap() + .set_randomized_election_timeout(timeout); + + for _ in 0..timeout { + network.peers.get_mut(&1).unwrap().tick(); + } + + let heart_beat = new_message(1, 1, MessageType::MsgBeat, 0); + network.send(vec![heart_beat]); + + assert_eq!(network.peers[&1].state, StateRole::Leader); + assert_eq!(network.peers[&2].state, StateRole::Follower); + assert!(!network.peers[&2].promotable()); + + let next_committed = network.peers[&1].raft_log.committed + 1; + + let msg = new_message(1, 1, MessageType::MsgPropose, 1); + network.send(vec![msg]); + + assert_eq!(network.peers[&1].raft_log.committed, next_committed); + assert_eq!(network.peers[&2].raft_log.committed, next_committed); + + let matched = network + .peers + .get_mut(&1) + .unwrap() + .prs() + .get(2) + .unwrap() + .matched; + assert_eq!(matched, network.peers[&2].raft_log.committed); +} + +// TestRestoreWithLearner restores a snapshot which contains learners. 
+#[test] +fn test_restore_with_learner() { + let l = default_logger(); + let mut s = new_snapshot(11, 11, vec![1, 2]); + s.mut_metadata().mut_conf_state().mut_learners().push(3); + + let mut sm = new_test_learner_raft(3, vec![1, 2], vec![3], 10, 1, new_storage(), &l); + assert!(!sm.promotable()); + assert!(sm.restore(s.clone())); + assert_eq!(sm.raft_log.last_index(), 11); + assert_eq!(sm.raft_log.term(11).unwrap(), 11); + assert_iter_eq!(o sm.prs().conf().voters().ids(), [1, 2]); + assert_iter_eq!(sm.prs().conf().learners(), [3]); + + let conf_state = s.get_metadata().get_conf_state(); + for node in &conf_state.voters { + assert!(sm.prs().get(*node).is_some()); + assert!(!sm.prs().conf().learners().contains(node)); + } + + for node in &conf_state.learners { + assert!(sm.prs().get(*node).is_some()); + assert!(sm.prs().conf().learners().contains(node)); + } + + assert!(!sm.restore(s)); +} + +// Tests if outgoing voters can restore snapshot correctly. +#[test] +fn test_restore_with_voters_outgoing() { + let l = default_logger(); + // magic number + let mut s = new_snapshot(11, 11, vec![2, 3, 4]); + s.mut_metadata().mut_conf_state().voters_outgoing = vec![1, 2, 3]; + + let mut sm = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + assert!(sm.restore(s.clone())); + assert_eq!(sm.raft_log.last_index(), s.get_metadata().index); + assert_eq!( + sm.raft_log.term(s.get_metadata().index).unwrap(), + s.get_metadata().term + ); + assert_iter_eq!( + o sm.prs().conf().voters().ids(), + [1, 2, 3, 4] + ); + assert!(!sm.restore(s)); +} + +// Verifies that a voter can be depromoted by snapshot. 
+#[test]
+fn test_restore_depromote_voter() {
+    let l = default_logger();
+    let mut s = new_snapshot(11, 11, vec![1, 2]);
+    s.mut_metadata().mut_conf_state().mut_learners().push(3);
+
+    let mut sm = new_test_raft(3, vec![1, 2, 3], 10, 1, new_storage(), &l);
+    assert!(sm.promotable());
+    assert!(sm.restore(s));
+}
+
+#[test]
+fn test_restore_learner() {
+    let l = default_logger();
+    let mut s = new_snapshot(11, 11, vec![1, 2]);
+    s.mut_metadata().mut_conf_state().mut_learners().push(3);
+
+    let mut sm = new_test_raft(3, vec![], 10, 1, new_storage(), &l);
+    assert!(!sm.promotable()); // Uninitialized peers can't be promoted.
+    assert!(sm.restore(s));
+    assert!(!sm.promotable());
+}
+
+// TestRestoreLearnerPromotion checks that a learner can become a follower after
+// restoring a snapshot.
+#[test]
+fn test_restore_learner_promotion() {
+    let l = default_logger();
+    let s = new_snapshot(11, 11, vec![1, 2, 3]);
+    let mut sm = new_test_learner_raft(3, vec![1, 2], vec![3], 10, 1, new_storage(), &l);
+    assert!(!sm.promotable());
+    assert!(sm.restore(s));
+    assert!(sm.promotable());
+}
+
+// TestLearnerReceiveSnapshot tests that a learner can receive a snapshot from the leader.
+#[test] +fn test_learner_receive_snapshot() { + let l = default_logger(); + let mut s = new_snapshot(11, 11, vec![1]); + s.mut_metadata().mut_conf_state().mut_learners().push(2); + + let mut n1 = new_test_learner_raft(1, vec![1], vec![2], 10, 1, new_storage(), &l); + let n2 = new_test_learner_raft(2, vec![1], vec![2], 10, 1, new_storage(), &l); + + n1.restore(s); + n1.persist(); + + let committed = n1.raft_log.committed; + n1.commit_apply(committed); + + let mut network = Network::new(vec![Some(n1), Some(n2)], &l); + + let timeout = network.peers[&1].election_timeout(); + network + .peers + .get_mut(&1) + .unwrap() + .set_randomized_election_timeout(timeout); + + for _ in 0..timeout { + network.peers.get_mut(&1).unwrap().tick(); + } + + let mut msg = Message::default(); + msg.from = 1; + msg.to = 1; + msg.set_msg_type(MessageType::MsgBeat); + network.send(vec![msg]); + + let n1_committed = network.peers[&1].raft_log.committed; + let n2_committed = network.peers[&2].raft_log.committed; + assert_eq!(n1_committed, n2_committed); +} + +// TestAddLearner tests that addLearner could update nodes correctly. +#[test] +fn test_add_learner() -> Result<()> { + let l = default_logger(); + let mut n1 = new_test_raft(1, vec![1], 10, 1, new_storage(), &l); + n1.apply_conf_change(&add_learner(2))?; + + assert_iter_eq!(n1.prs().conf().learners(), [2]); + assert!(n1.prs().conf().learners().contains(&2)); + + Ok(()) +} + +// TestRemoveLearner tests that removeNode could update nodes and +// and removed list correctly. +#[test] +fn test_remove_learner() -> Result<()> { + let l = default_logger(); + let mut n1 = new_test_learner_raft(1, vec![1], vec![2], 10, 1, new_storage(), &l); + n1.apply_conf_change(&remove_node(2))?; + assert_iter_eq!(o n1.prs().conf().voters().ids(), [1]); + assert!(n1.prs().conf().learners().is_empty()); + + // Remove all voters are not allowed. 
+ assert!(n1.apply_conf_change(&remove_node(1)).is_err()); + assert_iter_eq!(o n1.prs().conf().voters().ids(), [1]); + assert!(n1.prs().conf().learners().is_empty()); + + Ok(()) +} + +// simulate rolling update a cluster for Pre-Vote. cluster has 3 nodes [n1, n2, n3]. +// n1 is leader with term 2 +// n2 is follower with term 2 +// n3 is partitioned, with term 4 and less log, state is candidate +fn new_prevote_migration_cluster(l: &Logger) -> Network { + // We intentionally do not enable pre_vote for n3, this is done so in order + // to simulate a rolling restart process where it's possible to have a mixed + // version cluster with replicas with pre_vote enabled, and replicas without. + let mut n1 = new_test_raft_with_prevote(1, vec![1, 2, 3], 10, 1, new_storage(), true, l); + let mut n2 = new_test_raft_with_prevote(2, vec![1, 2, 3], 10, 1, new_storage(), true, l); + let mut n3 = new_test_raft_with_prevote(3, vec![1, 2, 3], 10, 1, new_storage(), false, l); + + n1.become_follower(1, INVALID_ID); + n2.become_follower(1, INVALID_ID); + n3.become_follower(1, INVALID_ID); + + let mut nt = Network::new(vec![Some(n1), Some(n2), Some(n3)], l); + + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // Cause a network partition to isolate n3. 
+ nt.isolate(3); + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + + // check state + // n1.state == Leader + // n2.state == Follower + // n3.state == Candidate + assert_eq!(nt.peers[&1].state, StateRole::Leader); + assert_eq!(nt.peers[&2].state, StateRole::Follower); + assert_eq!(nt.peers[&3].state, StateRole::Candidate); + + // check term + // n1.Term == 2 + // n2.Term == 2 + // n3.Term == 4 + assert_eq!(nt.peers[&1].term, 2); + assert_eq!(nt.peers[&2].term, 2); + assert_eq!(nt.peers[&3].term, 4); + + // Enable prevote on n3, then recover the network + nt.peers.get_mut(&3).unwrap().pre_vote = true; + nt.recover(); + + nt +} + +#[test] +fn test_prevote_migration_can_complete_election() { + let l = default_logger(); + // n1 is leader with term 2 + // n2 is follower with term 2 + // n3 is pre-candidate with term 4, and less log + let mut nt = new_prevote_migration_cluster(&l); + + // simulate leader down + nt.isolate(1); + + // Call for elections from both n2 and n3. + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + nt.send(vec![new_message(2, 2, MessageType::MsgHup, 0)]); + + // check state + // n2.state == Follower + // n3.state == PreCandidate + assert_eq!(nt.peers[&2].state, StateRole::Follower); + assert_eq!(nt.peers[&3].state, StateRole::PreCandidate); + + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + nt.send(vec![new_message(2, 2, MessageType::MsgHup, 0)]); + + // Do we have a leader? 
+ assert!( + (nt.peers[&2].state == StateRole::Leader) || (nt.peers[&3].state == StateRole::Follower) + ); +} + +#[test] +fn test_prevote_migration_with_free_stuck_pre_candidate() { + let l = default_logger(); + let mut nt = new_prevote_migration_cluster(&l); + + // n1 is leader with term 2 + // n2 is follower with term 2 + // n3 is pre-candidate with term 4, and less log + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + + assert_eq!(nt.peers[&1].state, StateRole::Leader); + assert_eq!(nt.peers[&2].state, StateRole::Follower); + assert_eq!(nt.peers[&3].state, StateRole::PreCandidate); + + // Pre-Vote again for safety + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + assert_eq!(nt.peers[&1].state, StateRole::Leader); + assert_eq!(nt.peers[&2].state, StateRole::Follower); + assert_eq!(nt.peers[&3].state, StateRole::PreCandidate); + + let mut to_send = new_message(1, 3, MessageType::MsgHeartbeat, 0); + to_send.term = nt.peers[&1].term; + nt.send(vec![to_send]); + + // Disrupt the leader so that the stuck peer is freed + assert_eq!(nt.peers[&1].state, StateRole::Follower); + + assert_eq!(nt.peers[&3].term, nt.peers[&1].term); +} + +#[test] +fn test_learner_respond_vote() -> Result<()> { + let l = default_logger(); + let mut n1 = new_test_learner_raft(1, vec![1, 2], vec![3], 10, 1, new_storage(), &l); + n1.become_follower(1, INVALID_ID); + n1.reset_randomized_election_timeout(); + + let mut n3 = new_test_learner_raft(3, vec![1, 2], vec![3], 10, 1, new_storage(), &l); + n3.become_follower(1, INVALID_ID); + n3.reset_randomized_election_timeout(); + + let do_campaign = |nw: &mut Network| { + let msg = new_message(1, 1, MessageType::MsgHup, 0); + nw.send(vec![msg]); + }; + + let mut network = Network::new(vec![Some(n1), None, Some(n3)], &l); + network.isolate(2); + + // Can't elect new leader because 1 won't send MsgRequestVote to 3. 
+ do_campaign(&mut network); + assert_eq!(network.peers[&1].state, StateRole::Candidate); + + // After promote 3 to voter, election should success. + network + .peers + .get_mut(&1) + .unwrap() + .apply_conf_change(&add_node(3))?; + do_campaign(&mut network); + assert_eq!(network.peers[&1].state, StateRole::Leader); + + Ok(()) +} + +#[test] +fn test_election_tick_range() { + let l = default_logger(); + let mut cfg = new_test_config(1, 10, 1); + let s = MemStorage::new_with_conf_state((vec![1, 2, 3], vec![])); + let mut raft = new_test_raft_with_config(&cfg, s, &l).raft.unwrap(); + for _ in 0..1000 { + raft.reset_randomized_election_timeout(); + let randomized_timeout = raft.randomized_election_timeout(); + assert!( + cfg.election_tick <= randomized_timeout && randomized_timeout < 2 * cfg.election_tick + ); + } + + cfg.min_election_tick = cfg.election_tick; + cfg.validate().unwrap(); + + // Too small election tick. + cfg.min_election_tick = cfg.election_tick - 1; + cfg.validate().unwrap_err(); + + // max_election_tick should be larger than min_election_tick + cfg.min_election_tick = cfg.election_tick; + cfg.max_election_tick = cfg.election_tick; + cfg.validate().unwrap_err(); + + cfg.max_election_tick = cfg.election_tick + 1; + raft = new_test_raft_with_config(&cfg, new_storage(), &l) + .raft + .unwrap(); + for _ in 0..100 { + raft.reset_randomized_election_timeout(); + let randomized_timeout = raft.randomized_election_timeout(); + assert_eq!(randomized_timeout, cfg.election_tick); + } +} + +// TestPreVoteWithSplitVote verifies that after split vote, cluster can complete +// election in next round. 
+#[test] +fn test_prevote_with_split_vote() { + let l = default_logger(); + let peers = (1..=3).map(|id| { + let mut raft = + new_test_raft_with_prevote(id, vec![1, 2, 3], 10, 1, new_storage(), true, &l); + raft.become_follower(1, INVALID_ID); + Some(raft) + }); + let mut network = Network::new(peers.collect(), &l); + network.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // simulate leader down. followers start split vote. + network.isolate(1); + network.send(vec![ + new_message(2, 2, MessageType::MsgHup, 0), + new_message(3, 3, MessageType::MsgHup, 0), + ]); + + // check whether the term values are expected + assert_eq!(network.peers[&2].term, 3, "peer 2 term",); + assert_eq!(network.peers[&3].term, 3, "peer 3 term",); + + // check state + assert_eq!( + network.peers[&2].state, + StateRole::Candidate, + "peer 2 state", + ); + assert_eq!( + network.peers[&3].state, + StateRole::Candidate, + "peer 3 state", + ); + + // node 2 election timeout first + network.send(vec![new_message(2, 2, MessageType::MsgHup, 0)]); + + // check whether the term values are expected + assert_eq!(network.peers[&2].term, 4, "peer 2 term",); + assert_eq!(network.peers[&3].term, 4, "peer 3 term",); + + // check state + assert_eq!(network.peers[&2].state, StateRole::Leader, "peer 2 state",); + assert_eq!(network.peers[&3].state, StateRole::Follower, "peer 3 state",); +} + +// ensure that after a node become pre-candidate, it will checkQuorum correctly. 
+#[test] +fn test_prevote_with_check_quorum() { + let l = default_logger(); + let bootstrap = |id| { + let mut cfg = new_test_config(id, 10, 1); + cfg.pre_vote = true; + cfg.check_quorum = true; + let s = MemStorage::new_with_conf_state((vec![1, 2, 3], vec![])); + let mut i = new_test_raft_with_config(&cfg, s, &l); + i.become_follower(1, INVALID_ID); + i + }; + let (peer1, peer2, peer3) = (bootstrap(1), bootstrap(2), bootstrap(3)); + + let mut network = Network::new(vec![Some(peer1), Some(peer2), Some(peer3)], &l); + network.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // cause a network partition to isolate node 3. node 3 has leader info + network.cut(1, 3); + network.cut(2, 3); + + assert_eq!(network.peers[&1].state, StateRole::Leader, "peer 1 state",); + assert_eq!(network.peers[&2].state, StateRole::Follower, "peer 2 state",); + + network.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + + assert_eq!( + network.peers[&3].state, + StateRole::PreCandidate, + "peer 3 state", + ); + + // term + 2, so that node 2 will ignore node 3's PreVote + network.send(vec![new_message(2, 1, MessageType::MsgTransferLeader, 0)]); + network.send(vec![new_message(1, 2, MessageType::MsgTransferLeader, 0)]); + + // check whether the term values are expected + assert_eq!(network.peers[&1].term, 4, "peer 1 term",); + assert_eq!(network.peers[&2].term, 4, "peer 2 term",); + assert_eq!(network.peers[&3].term, 2, "peer 3 term",); + + // check state + assert_eq!(network.peers[&1].state, StateRole::Leader, "peer 1 state",); + assert_eq!(network.peers[&2].state, StateRole::Follower, "peer 2 state",); + assert_eq!( + network.peers[&3].state, + StateRole::PreCandidate, + "peer 3 state", + ); + + // recover the network then immediately isolate node 1 which is currently + // the leader, this is to emulate the crash of node 1. + network.recover(); + network.cut(1, 2); + network.cut(1, 3); + + // call for election. 
node 3 shouldn't ignore node 2's PreVote + let timeout = network.peers[&3].randomized_election_timeout(); + for _ in 0..timeout { + network.peers.get_mut(&3).unwrap().tick(); + } + network.send(vec![new_message(2, 2, MessageType::MsgHup, 0)]); + + // check state + assert_eq!(network.peers[&2].state, StateRole::Leader, "peer 2 state",); + assert_eq!(network.peers[&3].state, StateRole::Follower, "peer 3 state",); +} + +// ensure a new Raft returns a Error::ConfigInvalid with an invalid config +#[test] +fn test_new_raft_with_bad_config_errors() { + let invalid_config = new_test_config(INVALID_ID, 1, 1); + let s = MemStorage::new_with_conf_state((vec![1, 2], vec![])); + let raft = Raft::new(&invalid_config, s, &default_logger()); + assert!(raft.is_err()) +} + +// tests whether MsgAppend are batched +#[test] +fn test_batch_msg_append() { + let l = default_logger(); + let storage = new_storage(); + let mut raft = new_test_raft(1, vec![1, 2, 3], 10, 1, storage.clone(), &l); + raft.become_candidate(); + raft.become_leader(); + raft.set_batch_append(true); + commit_noop_entry(&mut raft, &storage); + for _ in 0..10 { + let prop_msg = new_message(1, 1, MessageType::MsgPropose, 1); + raft.step(prop_msg).unwrap(); + } + assert_eq!(raft.msgs.len(), 2); + for msg in &raft.msgs { + assert_eq!(msg.entries.len(), 10); + assert_eq!(msg.index, 1); + } + // if the append entry is not continuous, raft should not batch the RPC + let mut reject_msg = new_message(2, 1, MessageType::MsgAppendResponse, 0); + reject_msg.reject = true; + reject_msg.index = 2; + raft.step(reject_msg).unwrap(); + assert_eq!(raft.msgs.len(), 3); +} + +/// Tests if unapplied conf change is checked before campaign. 
+#[test] +fn test_conf_change_check_before_campaign() { + let l = default_logger(); + let mut nt = Network::new(vec![None, None, None], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + assert_eq!(nt.peers[&1].state, StateRole::Leader); + + let mut m = new_message(1, 1, MessageType::MsgPropose, 0); + let mut e = Entry::default(); + e.set_entry_type(EntryType::EntryConfChange); + let mut cc = ConfChange::default(); + cc.set_change_type(ConfChangeType::RemoveNode); + cc.node_id = 3; + e.data = protobuf::Message::write_to_bytes(&cc).unwrap().into(); + m.mut_entries().push(e); + nt.send(vec![m]); + + // trigger campaign in node 2 + nt.peers + .get_mut(&2) + .unwrap() + .reset_randomized_election_timeout(); + let timeout = nt.peers[&2].randomized_election_timeout(); + for _ in 0..timeout { + nt.peers.get_mut(&2).unwrap().tick(); + } + // It's still follower because committed conf change is not applied. + assert_eq!(nt.peers[&2].state, StateRole::Follower); + + // Transfer leadership to peer 2. + nt.send(vec![new_message(2, 1, MessageType::MsgTransferLeader, 0)]); + assert_eq!(nt.peers[&1].state, StateRole::Leader); + // It's still follower because committed conf change is not applied. + assert_eq!(nt.peers[&2].state, StateRole::Follower); + // Abort transfer leader. + nt.peers.get_mut(&1).unwrap().abort_leader_transfer(); + + let committed = nt.peers[&2].raft_log.committed; + nt.peers.get_mut(&2).unwrap().commit_apply(committed); + nt.peers + .get_mut(&2) + .unwrap() + .apply_conf_change(&remove_node(3)) + .unwrap(); + + // transfer leadership to peer 2 again. 
+ nt.send(vec![new_message(2, 1, MessageType::MsgTransferLeader, 0)]); + assert_eq!(nt.peers[&1].state, StateRole::Follower); + assert_eq!(nt.peers[&2].state, StateRole::Leader); + + nt.peers.get_mut(&1).unwrap().commit_apply(committed); + nt.peers + .get_mut(&1) + .unwrap() + .apply_conf_change(&remove_node(3)) + .unwrap(); + + // trigger campaign in node 1 + nt.peers + .get_mut(&1) + .unwrap() + .reset_randomized_election_timeout(); + let timeout = nt.peers[&1].randomized_election_timeout(); + for _ in 0..timeout { + nt.peers.get_mut(&1).unwrap().tick(); + } + assert_eq!(nt.peers[&1].state, StateRole::Candidate); +} + +fn test_advance_commit_index_by_vote_request(use_prevote: bool) { + let l = default_logger(); + let mut cases: Vec> = vec![ + Box::new(conf_change(ConfChangeType::AddNode, 4)), + Box::new(conf_change_v2(vec![ + new_conf_change_single(3, ConfChangeType::AddLearnerNode), + new_conf_change_single(4, ConfChangeType::AddNode), + ])), + ]; + for (i, cc) in cases.drain(..).enumerate() { + let peers = (1..=4) + .map(|id| { + Some(new_test_learner_raft_with_prevote( + id, + vec![1, 2, 3], + vec![4], + &l, + use_prevote, + )) + }) + .collect(); + let mut nt = Network::new(peers, &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + let mut e = Entry::default(); + if let Some(v1) = cc.as_v1() { + e.set_entry_type(EntryType::EntryConfChange); + e.set_data(v1.write_to_bytes().unwrap().into()); + } else { + e.set_entry_type(EntryType::EntryConfChangeV2); + e.set_data(cc.as_v2().write_to_bytes().unwrap().into()); + } + + // propose a confchange entry but don't let it commit + nt.ignore(MessageType::MsgAppendResponse); + nt.send(vec![new_message_with_entries( + 1, + 1, + MessageType::MsgPropose, + vec![e], + )]); + let cc_index = nt.peers[&1].raft_log.last_index(); + + // let node 4 have more up to data log than other voter + nt.recover(); + nt.cut(1, 2); + nt.cut(1, 3); + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + + // let the 
confchange entry commit but don't let node 4 know + nt.recover(); + nt.cut(1, 4); + nt.ignore(MessageType::MsgAppend); + let mut msg = new_message(2, 1, MessageType::MsgAppendResponse, 0); + msg.set_index(nt.peers[&2].raft_log.last_index()); + nt.send(vec![msg, new_message(1, 1, MessageType::MsgBeat, 0)]); + + // simulate the leader down + nt.recover(); + nt.isolate(1); + + let p4 = nt.peers.get_mut(&4).unwrap(); + if p4.raft_log.committed >= cc_index { + panic!( + "#{} expected node 4 commit index less than {}, got {}", + i, cc_index, p4.raft_log.committed + ); + } + // node 4 can't start new election because it thinks itself is a learner + for _ in 0..p4.randomized_election_timeout() { + p4.tick(); + } + if p4.state != StateRole::Follower { + panic!("#{} node 4 state: {:?}, want Follower", i, p4.state); + } + let p2 = nt.peers.get_mut(&2).unwrap(); + if p2.raft_log.committed < cc_index { + panic!( + "#{} expected node 2 commit index not less than {}, got {}", + i, cc_index, p2.raft_log.committed + ); + } + p2.apply_conf_change(&cc.as_v2()).unwrap(); + p2.commit_apply(cc_index); + + // node 2 needs votes from both node 3 and node 4, but node 4 will reject it + for _ in 0..p2.randomized_election_timeout() { + p2.tick(); + } + let want = if use_prevote { + StateRole::PreCandidate + } else { + StateRole::Candidate + }; + if p2.state != want { + panic!("#{} node 2 state: {:?}, want {:?}", i, p2.state, want); + } + let msgs = nt.read_messages(); + nt.filter_and_send(msgs); + if nt.peers[&2].state == StateRole::Leader { + panic!("#{} node 2 can't campaign successfully.", i); + } + + // node 4's commit index should be advanced by node 2's vote request + let p4 = nt.peers.get_mut(&4).unwrap(); + if p4.raft_log.committed < cc_index { + panic!( + "#{} expected node 4 commit index not less than {}, got {}", + i, cc_index, p4.raft_log.committed + ); + } + p4.apply_conf_change(&cc.as_v2()).unwrap(); + p4.commit_apply(cc_index); + + // now node 4 can start new election and 
become leader + for _ in 0..p4.randomized_election_timeout() { + p4.tick(); + } + let msgs = nt.read_messages(); + nt.filter_and_send(msgs); + if nt.peers[&4].state != StateRole::Leader { + panic!("#{} node 4 state: {:?} want Leader", i, nt.peers[&4].state); + } + } +} + +/// Tests the commit index can be advanced by direct vote request +#[test] +fn test_advance_commit_index_by_direct_vote_request() { + test_advance_commit_index_by_vote_request(false) +} + +/// Tests the commit index can be advanced by prevote request +#[test] +fn test_advance_commit_index_by_prevote_request() { + test_advance_commit_index_by_vote_request(true) +} + +fn test_advance_commit_index_by_vote_response(use_prevote: bool) { + let l = default_logger(); + let mut cases: Vec> = vec![ + Box::new(conf_change(ConfChangeType::RemoveNode, 4)), + // Explicit leave joint + Box::new(conf_change_v2(vec![])), + ]; + // Enter joint confchange + let mut enter_joint = conf_change_v2(vec![ + new_conf_change_single(3, ConfChangeType::AddNode), + new_conf_change_single(4, ConfChangeType::AddLearnerNode), + ]); + enter_joint.set_transition(ConfChangeTransition::Explicit); + for (i, cc) in cases.drain(..).enumerate() { + let peers = (1..=4) + .map(|id| { + Some(new_test_raft_with_prevote( + id, + vec![1, 2, 3, 4], + 10, + 1, + new_storage(), + use_prevote, + &l, + )) + }) + .collect(); + let mut nt = Network::new(peers, &l); + + // Joint confchange, let's enter joint first + if cc.as_v1().is_none() { + for p in nt.peers.values_mut() { + p.apply_conf_change(&enter_joint).unwrap(); + } + } + + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + let mut e = Entry::default(); + if let Some(v1) = cc.as_v1() { + e.set_entry_type(EntryType::EntryConfChange); + e.set_data(v1.write_to_bytes().unwrap().into()); + } else { + e.set_entry_type(EntryType::EntryConfChangeV2); + e.set_data(cc.as_v2().write_to_bytes().unwrap().into()); + } + + // propose a confchange entry but don't let it commit + 
nt.ignore(MessageType::MsgAppendResponse); + nt.send(vec![new_message_with_entries( + 1, + 1, + MessageType::MsgPropose, + vec![e], + )]); + let cc_index = nt.peers[&1].raft_log.last_index(); + + // let node 4 have more up to data log than other voter + nt.recover(); + nt.cut(1, 2); + nt.cut(1, 3); + nt.send(vec![new_message(1, 1, MessageType::MsgPropose, 1)]); + + // A delayed MsgAppResp message make the confchange entry become committed + let mut msg = new_message(2, 1, MessageType::MsgAppendResponse, 0); + msg.set_index(nt.peers[&2].raft_log.last_index()); + nt.send(vec![msg, new_message(1, 1, MessageType::MsgBeat, 0)]); + + // simulate the leader down + nt.recover(); + nt.isolate(1); + + let p4 = nt.peers.get_mut(&4).unwrap(); + if p4.raft_log.committed < cc_index { + panic!( + "#{} expected node 4 commit index larger than {}, got {}", + i, cc_index, p4.raft_log.committed + ); + } + p4.apply_conf_change(&cc.as_v2()).unwrap(); + p4.commit_apply(cc_index); + // node 4 can't start new election because it thinks itself is a learner + for _ in 0..p4.randomized_election_timeout() { + p4.tick(); + } + if p4.state != StateRole::Follower { + panic!("#{} node 4 state: {:?}, want Follower", i, p4.state); + } + let p2 = nt.peers.get_mut(&2).unwrap(); + if p2.raft_log.committed >= cc_index { + panic!( + "#{} expected node 2 commit index less than {}, got {}", + i, cc_index, p2.raft_log.committed + ); + } + + // node 2 needs votes from both node 3 and node 4, but node 4 will reject it + for _ in 0..p2.randomized_election_timeout() { + p2.tick(); + } + let want = if use_prevote { + StateRole::PreCandidate + } else { + StateRole::Candidate + }; + if p2.state != want { + panic!("#{} node 2 state: {:?}, want {:?}", i, p2.state, want); + } + let msgs = nt.read_messages(); + nt.filter_and_send(msgs); + let p2 = nt.peers.get_mut(&2).unwrap(); + if p2.state != StateRole::Follower { + panic!( + "#{} node 2 should become follower by vote response, but got {:?}", + i, p2.state + ); + } 
+ + // node 2's commit index should be advanced by vote response + if p2.raft_log.committed < cc_index { + panic!( + "#{} expected node 2 commit index less than {}, got {}", + i, cc_index, p2.raft_log.committed + ); + } + p2.apply_conf_change(&cc.as_v2()).unwrap(); + p2.commit_apply(cc_index); + + // now node 2 only need vote from node 3 + for _ in 0..p2.randomized_election_timeout() { + p2.tick(); + } + let msgs = nt.read_messages(); + nt.filter_and_send(msgs); + if nt.peers[&2].state != StateRole::Leader { + panic!("#{} node 2 state: {:?} want Leader", i, nt.peers[&2].state); + } + } +} + +// Tests the commit index can be forwarded by direct vote response +#[test] +fn test_advance_commit_index_by_direct_vote_response() { + test_advance_commit_index_by_vote_response(false) +} + +// Tests the commit index can be forwarded by prevote response +#[test] +fn test_advance_commit_index_by_prevote_response() { + test_advance_commit_index_by_vote_response(true) +} + +fn prepare_request_snapshot() -> (Network, Snapshot) { + let l = default_logger(); + + fn index_term_11(id: u64, ids: Vec, l: &Logger) -> Interface { + let store = MemStorage::new(); + store + .wl() + .apply_snapshot(new_snapshot(11, 11, ids.clone())) + .unwrap(); + let mut raft = new_test_raft(id, ids, 5, 1, store, l); + raft.reset(11); + raft + } + + let mut nt = Network::new( + vec![ + Some(index_term_11(1, vec![1, 2, 3], &l)), + Some(index_term_11(2, vec![1, 2, 3], &l)), + Some(index_term_11(3, vec![1, 2, 3], &l)), + ], + &l, + ); + + // elect r1 as leader + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + let mut test_entries = Entry::default(); + test_entries.data = (b"testdata" as &'static [u8]).into(); + let msg = new_message_with_entries(1, 1, MessageType::MsgPropose, vec![test_entries]); + nt.send(vec![msg.clone(), msg]); + assert_eq!(nt.peers[&1].raft_log.committed, 14); + assert_eq!(nt.peers[&2].raft_log.committed, 14); + + let ents = nt + .peers + .get_mut(&1) + .unwrap() + .raft_log 
+ .unstable_entries() + .to_vec(); + nt.storage[&1].wl().append(&ents).unwrap(); + nt.storage[&1].wl().commit_to(14).unwrap(); + nt.peers.get_mut(&1).unwrap().raft_log.applied = 14; + + // Commit a new raft log. + let mut test_entries = Entry::default(); + test_entries.data = (b"testdata" as &'static [u8]).into(); + let msg = new_message_with_entries(1, 1, MessageType::MsgPropose, vec![test_entries]); + nt.send(vec![msg]); + + let s = nt.storage[&1].snapshot(0, 0).unwrap(); + (nt, s) +} + +// Test if an up-to-date follower can request a snapshot from leader. +#[test] +fn test_follower_request_snapshot() { + let (mut nt, s) = prepare_request_snapshot(); + + // Request the latest snapshot. + let prev_snapshot_idx = s.get_metadata().index; + let request_idx = nt.peers[&1].raft_log.committed; + assert!(prev_snapshot_idx < request_idx); + nt.peers.get_mut(&2).unwrap().request_snapshot().unwrap(); + + // Send the request snapshot message. + let req_snap = nt.peers.get_mut(&2).unwrap().msgs.pop().unwrap(); + assert!( + req_snap.get_msg_type() == MessageType::MsgAppendResponse + && req_snap.reject + && req_snap.request_snapshot == request_idx, + "{:?}", + req_snap + ); + nt.peers.get_mut(&1).unwrap().step(req_snap).unwrap(); + + // New proposes can not be replicated to peer 2. + let mut test_entries = Entry::default(); + test_entries.data = (b"testdata" as &'static [u8]).into(); + let msg = new_message_with_entries(1, 1, MessageType::MsgPropose, vec![test_entries]); + nt.send(vec![msg.clone()]); + assert_eq!(nt.peers[&1].raft_log.committed, 16); + assert_eq!( + nt.peers[&1].prs().get(2).unwrap().state, + ProgressState::Snapshot + ); + assert_eq!(nt.peers[&2].raft_log.committed, 15); + + // Util snapshot success or fail. 
+ let report_ok = new_message(2, 1, MessageType::MsgSnapStatus, 0); + nt.send(vec![report_ok]); + let hb_resp = new_message(2, 1, MessageType::MsgHeartbeatResponse, 0); + nt.send(vec![hb_resp]); + nt.send(vec![msg]); + + assert_eq!(nt.peers[&1].raft_log.committed, 17); + assert_eq!(nt.peers[&2].raft_log.committed, 17); +} + +// Test if request snapshot can make progress when it meets SnapshotTemporarilyUnavailable. +#[test] +fn test_request_snapshot_unavailable() { + let (mut nt, s) = prepare_request_snapshot(); + + // Request the latest snapshot. + let prev_snapshot_idx = s.get_metadata().index; + let request_idx = nt.peers[&1].raft_log.committed; + assert!(prev_snapshot_idx < request_idx); + nt.peers.get_mut(&2).unwrap().request_snapshot().unwrap(); + + // Send the request snapshot message. + let req_snap = nt.peers.get_mut(&2).unwrap().msgs.pop().unwrap(); + assert!( + req_snap.get_msg_type() == MessageType::MsgAppendResponse + && req_snap.reject + && req_snap.request_snapshot == request_idx, + "{:?}", + req_snap + ); + + // Peer 2 is still in probe state due to SnapshotTemporarilyUnavailable. + nt.peers[&1].store().wl().trigger_snap_unavailable(); + nt.peers + .get_mut(&1) + .unwrap() + .step(req_snap.clone()) + .unwrap(); + assert_eq!( + nt.peers[&1].prs().get(2).unwrap().state, + ProgressState::Probe + ); + + // Next index is decreased. + nt.peers[&1].store().wl().trigger_snap_unavailable(); + nt.peers + .get_mut(&1) + .unwrap() + .step(req_snap.clone()) + .unwrap(); + assert_eq!( + nt.peers[&1].prs().get(2).unwrap().state, + ProgressState::Probe + ); + + // Snapshot will be available if it requests again. This message must not + // be considered stale even if `reject != next - 1` + nt.peers.get_mut(&1).unwrap().step(req_snap).unwrap(); + assert_eq!( + nt.peers[&1].prs().get(2).unwrap().state, + ProgressState::Snapshot + ); +} + +// Test if request snapshot can make progress when matched is advanced. 
+#[test] +fn test_request_snapshot_matched_change() { + let (mut nt, _) = prepare_request_snapshot(); + // Let matched be greater than the committed. + nt.peers.get_mut(&2).unwrap().raft_log.committed -= 1; + + // Request the latest snapshot. + nt.peers.get_mut(&2).unwrap().request_snapshot().unwrap(); + let req_snap = nt.peers.get_mut(&2).unwrap().msgs.pop().unwrap(); + // The request snapshot is ignored because it is considered as out of order. + nt.peers.get_mut(&1).unwrap().step(req_snap).unwrap(); + assert_eq!( + nt.peers[&1].prs().get(2).unwrap().state, + ProgressState::Replicate + ); + + // Heartbeat is responded with a request snapshot message. + for _ in 0..nt.peers[&1].heartbeat_timeout() { + nt.peers.get_mut(&1).unwrap().tick(); + } + let msg_hb = nt + .peers + .get_mut(&1) + .unwrap() + .msgs + .iter() + .filter(|m| m.to == 2) + .collect::>()[0] + .clone(); + nt.peers.get_mut(&2).unwrap().step(msg_hb).unwrap(); + let req_snap = nt.peers.get_mut(&2).unwrap().msgs.pop().unwrap(); + nt.peers.get_mut(&1).unwrap().step(req_snap).unwrap(); + assert_eq!( + nt.peers[&1].prs().get(2).unwrap().state, + ProgressState::Snapshot + ); +} + +// Test if request snapshot can make progress when the peer is not Replicate. +#[test] +fn test_request_snapshot_none_replicate() { + let (mut nt, _) = prepare_request_snapshot(); + nt.peers + .get_mut(&1) + .unwrap() + .mut_prs() + .get_mut(2) + .unwrap() + .state = ProgressState::Probe; + + // Request the latest snapshot. + nt.peers.get_mut(&2).unwrap().request_snapshot().unwrap(); + let req_snap = nt.peers.get_mut(&2).unwrap().msgs.pop().unwrap(); + nt.peers.get_mut(&1).unwrap().step(req_snap).unwrap(); + assert!(nt.peers[&1].prs().get(2).unwrap().pending_request_snapshot != 0); +} + +// Test if request snapshot can make progress when leader steps down. +#[test] +fn test_request_snapshot_step_down() { + let (mut nt, _) = prepare_request_snapshot(); + + // Commit a new entry and leader steps down while peer 2 is isolated. 
+ nt.isolate(2); + let mut test_entries = Entry::default(); + test_entries.data = (b"testdata" as &'static [u8]).into(); + let msg = new_message_with_entries(1, 1, MessageType::MsgPropose, vec![test_entries]); + nt.send(vec![msg]); + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + assert_eq!(nt.peers[&3].state, StateRole::Leader); + + // Recover and request the latest snapshot. + nt.recover(); + nt.peers.get_mut(&2).unwrap().request_snapshot().unwrap(); + nt.send(vec![new_message(3, 3, MessageType::MsgBeat, 0)]); + assert!( + nt.peers[&2].pending_request_snapshot == INVALID_INDEX, + "{}", + nt.peers[&2].pending_request_snapshot + ); +} + +// Abort request snapshot if it becomes leader or candidate. +#[test] +fn test_request_snapshot_on_role_change() { + let (mut nt, _) = prepare_request_snapshot(); + + nt.peers.get_mut(&2).unwrap().request_snapshot().unwrap(); + + // Becoming follower does not reset pending_request_snapshot. + let (term, id) = (nt.peers[&1].term, nt.peers[&1].id); + nt.peers.get_mut(&2).unwrap().become_follower(term, id); + assert!( + nt.peers[&2].pending_request_snapshot != INVALID_INDEX, + "{}", + nt.peers[&2].pending_request_snapshot + ); + + // Becoming candidate resets pending_request_snapshot. + nt.peers.get_mut(&2).unwrap().become_candidate(); + assert!( + nt.peers[&2].pending_request_snapshot == INVALID_INDEX, + "{}", + nt.peers[&2].pending_request_snapshot + ); +} + +// Abort request snapshot if term change. +#[test] +fn test_request_snapshot_after_term_change() { + let (mut nt, _) = prepare_request_snapshot(); + + nt.peers.get_mut(&2).unwrap().request_snapshot().unwrap(); + + assert!( + nt.peers[&2].pending_request_snapshot != INVALID_INDEX, + "{}", + nt.peers[&2].pending_request_snapshot + ); + + let term = nt.peers[&1].term; + nt.peers.get_mut(&2).unwrap().reset(term + 1); + assert!( + nt.peers[&2].pending_request_snapshot == INVALID_INDEX, + "{}", + nt.peers[&2].pending_request_snapshot + ); +} + +/// Tests group commit. 
+/// +/// 1. Logs should be replicated to at least different groups before committed; +/// 2. all peers are configured to the same group, simple quorum should be used. +#[test] +fn test_group_commit() { + let l = default_logger(); + let mut tests = vec![ + // Single + (vec![1], vec![0], 1, 1), + (vec![1], vec![1], 1, 1), + // Odd + (vec![2, 2, 1], vec![1, 2, 1], 2, 2), + (vec![2, 2, 1], vec![1, 1, 2], 1, 2), + (vec![2, 2, 1], vec![1, 0, 1], 1, 2), + (vec![2, 2, 1], vec![0, 0, 0], 1, 2), + // Even + (vec![4, 2, 1, 3], vec![0, 0, 0, 0], 1, 2), + (vec![4, 2, 1, 3], vec![1, 0, 0, 0], 1, 2), + (vec![4, 2, 1, 3], vec![0, 1, 0, 2], 2, 2), + (vec![4, 2, 1, 3], vec![0, 2, 1, 0], 1, 2), + (vec![4, 2, 1, 3], vec![1, 1, 1, 1], 2, 2), + (vec![4, 2, 1, 3], vec![1, 1, 2, 1], 1, 2), + (vec![4, 2, 1, 3], vec![1, 2, 1, 1], 2, 2), + (vec![4, 2, 1, 3], vec![4, 3, 2, 1], 2, 2), + ]; + + for (i, (matches, group_ids, g_w, q_w)) in tests.drain(..).enumerate() { + let store = MemStorage::new_with_conf_state((vec![1], vec![])); + let min_index = *matches.iter().min().unwrap(); + let max_index = *matches.iter().max().unwrap(); + let logs: Vec<_> = (min_index..=max_index).map(|i| empty_entry(1, i)).collect(); + store.wl().append(&logs).unwrap(); + let mut hs = HardState::default(); + hs.term = 1; + store.wl().set_hardstate(hs); + let cfg = new_test_config(1, 5, 1); + let mut sm = new_test_raft_with_config(&cfg, store, &l); + + let mut groups = vec![]; + for (j, (m, g)) in matches.into_iter().zip(group_ids).enumerate() { + let id = j as u64 + 1; + if sm.mut_prs().get(id).is_none() { + sm.apply_conf_change(&add_node(id)).unwrap(); + let pr = sm.mut_prs().get_mut(id).unwrap(); + pr.matched = m; + pr.next_idx = m + 1; + } + if g != 0 { + groups.push((id, g)); + } + } + sm.enable_group_commit(true); + sm.assign_commit_groups(&groups); + if sm.raft_log.committed != 0 { + panic!( + "#{}: follower group committed {}, want 0", + i, sm.raft_log.committed + ); + } + sm.state = StateRole::Leader; + 
sm.assign_commit_groups(&groups); + if sm.raft_log.committed != g_w { + panic!( + "#{}: leader group committed {}, want {}", + i, sm.raft_log.committed, g_w + ); + } + sm.enable_group_commit(false); + if sm.raft_log.committed != q_w { + panic!( + "#{}: quorum committed {}, want {}", + i, sm.raft_log.committed, q_w + ); + } + } +} + +#[test] +fn test_group_commit_consistent() { + let l = default_logger(); + let mut logs = vec![]; + for i in 1..6 { + logs.push(empty_entry(1, i)); + } + for i in 6..=8 { + logs.push(empty_entry(2, i)); + } + let mut tests = vec![ + // Single node is not using group commit + (vec![8], vec![0], 8, 6, StateRole::Leader, Some(false)), + (vec![8], vec![1], 8, 5, StateRole::Leader, None), + (vec![8], vec![1], 8, 6, StateRole::Follower, None), + // Not commit to current term should return None, as old leader may + // have reach consistent. + (vec![8, 2, 0], vec![1, 2, 1], 2, 2, StateRole::Leader, None), + ( + vec![8, 2, 6], + vec![1, 1, 2], + 6, + 6, + StateRole::Leader, + Some(true), + ), + // Not apply to current term should return None, as there maybe pending conf change. + (vec![8, 2, 6], vec![1, 1, 2], 6, 5, StateRole::Leader, None), + // It should be false when not using group commit. + ( + vec![8, 6, 6], + vec![0, 0, 0], + 6, + 6, + StateRole::Leader, + Some(false), + ), + // It should be false when there is only one group. + ( + vec![8, 6, 6], + vec![1, 1, 1], + 6, + 6, + StateRole::Leader, + Some(false), + ), + ( + vec![8, 6, 6], + vec![1, 1, 0], + 6, + 6, + StateRole::Leader, + Some(false), + ), + // Only leader knows what's the current state. 
+ ( + vec![8, 2, 6], + vec![1, 1, 2], + 6, + 6, + StateRole::Follower, + None, + ), + ( + vec![8, 2, 6], + vec![1, 1, 2], + 6, + 6, + StateRole::Candidate, + None, + ), + ( + vec![8, 2, 6], + vec![1, 1, 2], + 6, + 6, + StateRole::PreCandidate, + None, + ), + ]; + + for (i, (matches, group_ids, committed, applied, role, exp)) in tests.drain(..).enumerate() { + let store = MemStorage::new_with_conf_state((vec![1], vec![])); + store.wl().append(&logs).unwrap(); + let mut hs = HardState::default(); + hs.term = 2; + hs.commit = committed; + store.wl().set_hardstate(hs); + let mut cfg = new_test_config(1, 5, 1); + cfg.applied = applied; + let mut sm = new_test_raft_with_config(&cfg, store, &l); + sm.state = role; + + let mut groups = vec![]; + for (j, (m, g)) in matches.into_iter().zip(group_ids).enumerate() { + let id = j as u64 + 1; + if sm.mut_prs().get(id).is_none() { + sm.apply_conf_change(&add_node(id)).unwrap(); + let pr = sm.mut_prs().get_mut(id).unwrap(); + pr.matched = m; + pr.next_idx = m + 1; + } + if g != 0 { + groups.push((id, g)); + } + } + sm.assign_commit_groups(&groups); + if Some(true) == exp { + let is_consistent = sm.check_group_commit_consistent(); + if is_consistent != Some(false) { + panic!( + "#{}: consistency = {:?}, want Some(false)", + i, is_consistent + ); + } + } + sm.enable_group_commit(true); + let is_consistent = sm.check_group_commit_consistent(); + if is_consistent != exp { + panic!("#{}: consistency = {:?}, want {:?}", i, is_consistent, exp); + } + } +} + +/// test_election_with_priority_log verifies the correctness +/// of the election with both priority and log. 
+#[test] +fn test_election_with_priority_log() { + let tests = [ + // log is up to date or not 1..3, priority 1..3, id, state + (true, false, false, 3, 1, 1, 1, StateRole::Leader), + (true, false, false, 2, 2, 2, 1, StateRole::Leader), + (true, false, false, 1, 3, 3, 1, StateRole::Leader), + (true, true, true, 3, 1, 1, 1, StateRole::Leader), + (true, true, true, 2, 2, 2, 1, StateRole::Leader), + (true, true, true, 1, 3, 3, 1, StateRole::Follower), + (false, true, true, 3, 1, 1, 1, StateRole::Follower), + (false, true, true, 2, 2, 2, 1, StateRole::Follower), + (false, true, true, 1, 3, 3, 1, StateRole::Follower), + (false, false, true, 1, 3, 1, 1, StateRole::Follower), + (false, false, true, 1, 1, 3, 1, StateRole::Leader), + ]; + + for (l1, l2, l3, p1, p2, p3, id, state) in tests { + let l = default_logger(); + let mut n1 = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut n2 = new_test_raft(2, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut n3 = new_test_raft(3, vec![1, 2, 3], 10, 1, new_storage(), &l); + n1.set_priority(p1); + n2.set_priority(p2); + n3.set_priority(p3); + let entries = vec![new_entry(1, 1, SOME_DATA), new_entry(1, 1, SOME_DATA)]; + if l1 { + n1.raft_log.append(&entries); + } + if l2 { + n2.raft_log.append(&entries); + } + if l3 { + n3.raft_log.append(&entries); + } + + let mut network = Network::new(vec![Some(n1), Some(n2), Some(n3)], &l); + + network.send(vec![new_message(id, id, MessageType::MsgHup, 0)]); + + assert_eq!(network.peers[&id].state, state); + } +} + +/// test_election_after_change_priority verifies that a peer can win an election +/// by raising its priority and lose election by lowering its priority. 
+#[test] +fn test_election_after_change_priority() { + let l = default_logger(); + let mut n1 = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut n2 = new_test_raft(2, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut n3 = new_test_raft(3, vec![1, 2, 3], 10, 1, new_storage(), &l); + // priority of n1 is 0 in default. + n2.set_priority(2); + n3.set_priority(3); + n1.become_follower(1, INVALID_ID); + n2.become_follower(1, INVALID_ID); + n3.become_follower(1, INVALID_ID); + let mut network = Network::new(vec![Some(n1), Some(n2), Some(n3)], &l); + + assert_eq!(network.peers[&1].priority, 0, "peer 1 priority"); + network.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + // check state + assert_eq!(network.peers[&1].state, StateRole::Follower, "peer 1 state"); + + let tests = [ + (1, 1, StateRole::Follower), //id, priority, state + (1, 2, StateRole::Leader), + (1, 3, StateRole::Leader), + (1, 0, StateRole::Follower), + ]; + + for (i, &(id, p, state)) in tests.iter().enumerate() { + network + .peers + .get_mut(&id) + .unwrap() + .become_follower((i + 2) as u64, INVALID_ID); + network.peers.get_mut(&id).unwrap().set_priority(p); + network.send(vec![new_message(id, id, MessageType::MsgHup, 0)]); + + // check state + assert_eq!(network.peers[&id].state, state, "peer {} state", id); + } +} + +// `test_read_when_quorum_becomes_less` tests read requests could be handled earlier +// if quorum becomes less in configuration changes. +#[test] +fn test_read_when_quorum_becomes_less() { + let l = default_logger(); + let mut network = Network::new(vec![None, None], &l); + + let mut m = Message::default(); + m.from = 1; + m.to = 1; + m.set_msg_type(MessageType::MsgHup); + network.send(vec![m]); + assert_eq!(network.peers[&1].raft_log.committed, 1); + + // Read index on the peer. 
+ let mut m = Message::default(); + m.to = 1; + m.set_msg_type(MessageType::MsgReadIndex); + let mut e = Entry::default(); + e.data = (b"abcdefg" as &'static [u8]).into(); + m.set_entries(vec![e].into()); + network.dispatch(vec![m]).unwrap(); + + // Broadcast heartbeats. + let heartbeats = network.read_messages(); + network.dispatch(heartbeats).unwrap(); + + // Drop heartbeat response from peer 2. + let heartbeat_responses = network.read_messages(); + assert_eq!(heartbeat_responses.len(), 1); + + network + .peers + .get_mut(&1) + .unwrap() + .apply_conf_change(&remove_node(2)) + .unwrap(); + assert!(!network.peers[&1].read_states.is_empty()); +} + +#[test] +fn test_uncommitted_entries_size_limit() { + let l = default_logger(); + let config = &Config { + id: 1, + max_uncommitted_size: 12, + ..Config::default() + }; + let mut nt = Network::new_with_config(vec![None, None, None], config, &l); + let data = b"hello world!".to_vec(); + let mut entry = Entry::default(); + entry.data = data.to_vec().into(); + let msg = new_message_with_entries(1, 1, MessageType::MsgPropose, vec![entry]); + + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // should return ok + nt.dispatch([msg.clone()]).unwrap(); + + // then next proposal should be dropped + let result = nt.dispatch([msg]); + assert_eq!(result.unwrap_err(), raft::Error::ProposalDropped); + + // but entry with empty size should be accepted + let entry = Entry::default(); + let empty_msg = new_message_with_entries(1, 1, MessageType::MsgPropose, vec![entry]); + nt.dispatch([empty_msg]).unwrap(); + + // after reduce, new proposal should be accepted + let mut entry = Entry::default(); + entry.data = data.into(); + entry.index = 3; + nt.peers + .get_mut(&1) + .unwrap() + .reduce_uncommitted_size(&[entry]); + assert_eq!(nt.peers.get_mut(&1).unwrap().uncommitted_size(), 0); + + // a huge proposal should be accepted when there is no uncommitted entry, + // even it's bigger than max_uncommitted_size + let mut entry = 
Entry::default(); + entry.data = (b"hello world and raft" as &'static [u8]).into(); + let long_msg = new_message_with_entries(1, 1, MessageType::MsgPropose, vec![entry]); + nt.dispatch([long_msg]).unwrap(); + + // but another huge one will be dropped + let mut entry = Entry::default(); + entry.data = (b"hello world and raft" as &'static [u8]).into(); + let long_msg = new_message_with_entries(1, 1, MessageType::MsgPropose, vec![entry]); + nt.dispatch([long_msg]).unwrap_err(); + + // entry with empty size should still be accepted + let entry = Entry::default(); + let empty_msg = new_message_with_entries(1, 1, MessageType::MsgPropose, vec![entry]); + nt.dispatch([empty_msg]).unwrap(); +} + +#[test] +fn test_uncommitted_entry_after_leader_election() { + let l = default_logger(); + let config = &Config { + id: 1, + max_uncommitted_size: 12, + ..Config::default() + }; + let mut nt = Network::new_with_config(vec![None, None, None, None, None], config, &l); + let data = b"hello world!".to_vec(); + let mut entry = Entry::default(); + entry.data = data.into(); + let msg = new_message_with_entries(1, 1, MessageType::MsgPropose, vec![entry]); + + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // create a uncommitted entry on node2 + nt.cut(1, 3); + nt.cut(1, 4); + nt.cut(1, 5); + nt.send(vec![msg]); + + // now isolate master and make node2 as master + nt.isolate(1); + // ignore message append, cluster only work on election + nt.ignore(MessageType::MsgAppend); + nt.send(vec![new_message(2, 2, MessageType::MsgHup, 0)]); + + // uncommitted log size should be 0 on node2, + // because we set uncommitted size to 0 rather than re-computing it, + // which means max_uncommitted_size is a soft limit + assert_eq!(nt.peers.get_mut(&2).unwrap().state, raft::StateRole::Leader); + assert_eq!(nt.peers.get_mut(&2).unwrap().uncommitted_size(), 0); +} + +#[test] +fn test_uncommitted_state_advance_ready_from_last_term() { + let l = default_logger(); + let config = &Config { + id: 
1, + max_uncommitted_size: 12, + ..Config::default() + }; + let mut nt = Network::new_with_config(vec![None, None, None, None, None], config, &l); + + let data = b"hello world!".to_vec(); + let mut ent = Entry::default(); + ent.data = data.clone().into(); + + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + nt.send(vec![new_message_with_entries( + 1, + 1, + MessageType::MsgPropose, + vec![ent.clone()], + )]); + nt.send(vec![new_message_with_entries( + 1, + 1, + MessageType::MsgPropose, + vec![ent.clone()], + )]); + + // now node2 has 2 committed entries + // make node2 leader + nt.send(vec![new_message(2, 2, MessageType::MsgHup, 0)]); + assert_eq!(nt.peers.get_mut(&2).unwrap().state, raft::StateRole::Leader); + + nt.isolate(2); + // create one uncommitted entry + nt.send(vec![new_message_with_entries( + 2, + 2, + MessageType::MsgPropose, + vec![ent.clone()], + )]); + + let mut ent1 = ent.clone(); + ent1.index = 1; + let mut ent2 = ent; + ent2.index = 2; + + // simulate advance 2 entries when node2 is follower + nt.peers + .get_mut(&2) + .unwrap() + .reduce_uncommitted_size(&[ent1, ent2]); + + // uncommitted size should be 12(remain unchanged since there's only one uncommitted entries) + assert_eq!(nt.peers.get_mut(&2).unwrap().uncommitted_size(), data.len()); +} + +#[test] +fn test_fast_log_rejection() { + let mut tests = vec![ + // This case tests that leader can find the conflict index quickly. + // Firstly leader appends (type=MsgApp,index=7,logTerm=4, entries=...); + // After rejected leader appends (type=MsgApp,index=3,logTerm=2). 
+ ( + vec![ + empty_entry(1, 1), + empty_entry(2, 2), + empty_entry(2, 3), + empty_entry(4, 4), + empty_entry(4, 5), + empty_entry(4, 6), + empty_entry(4, 7), + ], + vec![ + empty_entry(1, 1), + empty_entry(2, 2), + empty_entry(2, 3), + empty_entry(3, 4), + empty_entry(3, 5), + empty_entry(3, 6), + empty_entry(3, 7), + empty_entry(3, 8), + empty_entry(3, 9), + empty_entry(3, 10), + empty_entry(3, 11), + ], + 3, + 7, + 2, + 3, + ), + // This case tests that leader can find the conflict index quickly. + // Firstly leader appends (type=MsgApp,index=8,logTerm=5, entries=...); + // After rejected leader appends (type=MsgApp,index=4,logTerm=3). + ( + vec![ + empty_entry(1, 1), + empty_entry(2, 2), + empty_entry(2, 3), + empty_entry(3, 4), + empty_entry(4, 5), + empty_entry(4, 6), + empty_entry(4, 7), + empty_entry(5, 8), + ], + vec![ + empty_entry(1, 1), + empty_entry(2, 2), + empty_entry(2, 3), + empty_entry(3, 4), + empty_entry(3, 5), + empty_entry(3, 6), + empty_entry(3, 7), + empty_entry(3, 8), + empty_entry(3, 9), + empty_entry(3, 10), + empty_entry(3, 11), + ], + 3, + 8, + 3, + 4, + ), + // This case tests that follower can find the conflict index quickly. + // Firstly leader appends (type=MsgApp,index=4,logTerm=1, entries=...); + // After rejected leader appends (type=MsgApp,index=1,logTerm=1). + ( + vec![ + empty_entry(1, 1), + empty_entry(1, 2), + empty_entry(1, 3), + empty_entry(1, 4), + ], + vec![ + empty_entry(1, 1), + empty_entry(2, 2), + empty_entry(2, 3), + empty_entry(4, 4), + ], + 1, + 1, + 1, + 1, + ), + // This case is similar to the previous case. However, this time, the + // leader has a longer uncommitted log tail than the follower. + // Firstly leader appends (type=MsgApp,index=6,logTerm=1, entries=...); + // After rejected leader appends (type=MsgApp,index=1,logTerm=1). 
+ ( + vec![ + empty_entry(1, 1), + empty_entry(1, 2), + empty_entry(1, 3), + empty_entry(1, 4), + empty_entry(1, 5), + empty_entry(1, 6), + ], + vec![ + empty_entry(1, 1), + empty_entry(2, 2), + empty_entry(2, 3), + empty_entry(4, 4), + ], + 1, + 1, + 1, + 1, + ), + // This case is similar to the previous case. However, this time, the + // follower has a longer uncommitted log tail than the leader. + // Firstly leader appends (type=MsgApp,index=4,logTerm=1, entries=...); + // After rejected leader appends (type=MsgApp,index=1,logTerm=1). + ( + vec![ + empty_entry(1, 1), + empty_entry(1, 2), + empty_entry(1, 3), + empty_entry(1, 4), + ], + vec![ + empty_entry(1, 1), + empty_entry(2, 2), + empty_entry(2, 3), + empty_entry(4, 4), + empty_entry(4, 5), + empty_entry(4, 6), + ], + 1, + 1, + 1, + 1, + ), + // An normal case that there are no log conflicts. + // Firstly leader appends (type=MsgApp,index=5,logTerm=5, entries=...); + // After rejected leader appends (type=MsgApp,index=4,logTerm=4). + ( + vec![ + empty_entry(1, 1), + empty_entry(1, 2), + empty_entry(1, 3), + empty_entry(4, 4), + empty_entry(5, 5), + ], + vec![ + empty_entry(1, 1), + empty_entry(1, 2), + empty_entry(1, 3), + empty_entry(4, 4), + ], + 4, + 4, + 4, + 4, + ), + // Test case from example comment in stepLeader (on leader). + ( + vec![ + empty_entry(2, 1), + empty_entry(5, 2), + empty_entry(5, 3), + empty_entry(5, 4), + empty_entry(5, 5), + empty_entry(5, 6), + empty_entry(5, 7), + empty_entry(5, 8), + empty_entry(5, 9), + ], + vec![ + empty_entry(2, 1), + empty_entry(4, 2), + empty_entry(4, 3), + empty_entry(4, 4), + empty_entry(4, 5), + empty_entry(4, 6), + ], + 4, + 6, + 2, + 1, + ), + // Test case from example comment in handleAppendEntries (on follower). 
+ ( + vec![ + empty_entry(2, 1), + empty_entry(2, 2), + empty_entry(2, 3), + empty_entry(2, 4), + empty_entry(2, 5), + ], + vec![ + empty_entry(2, 1), + empty_entry(4, 2), + empty_entry(4, 3), + empty_entry(4, 4), + empty_entry(4, 5), + empty_entry(4, 6), + empty_entry(4, 7), + empty_entry(4, 8), + ], + 2, + 1, + 2, + 1, + ), + ]; + for ( + i, + ( + leader_log, + follower_log, + reject_hint_term, + reject_hint_index, + next_append_term, + next_append_index, + ), + ) in tests.drain(..).enumerate() + { + let l = default_logger(); + let s1 = MemStorage::new_with_conf_state((vec![1, 2, 3], vec![])); + s1.wl().append(&leader_log).unwrap(); + let s2 = MemStorage::new_with_conf_state((vec![1, 2, 3], vec![])); + s2.wl().append(&follower_log).unwrap(); + let mut n1 = new_test_raft(1, vec![1, 2, 3], 10, 1, s1, &l); + let mut n2 = new_test_raft(2, vec![1, 2, 3], 10, 1, s2, &l); + n1.become_candidate(); + n1.become_leader(); + n2.step(new_message(2, 2, MessageType::MsgHeartbeat, 0)) + .unwrap(); + + let mut msgs = n2.read_messages(); + assert_eq!(msgs.len(), 1, "#{}", i); + assert_eq!( + msgs[0].get_msg_type(), + MessageType::MsgHeartbeatResponse, + "#{}", + i + ); + // move Vec item by pop + n1.step(msgs.pop().unwrap()).unwrap(); + + let mut msgs = n1.read_messages(); + assert_eq!(msgs.len(), 1, "#{}", i); + assert_eq!(msgs[0].get_msg_type(), MessageType::MsgAppend, "#{}", i); + n2.step(msgs.pop().unwrap()).unwrap(); + + let mut msgs = n2.read_messages(); + assert_eq!(msgs.len(), 1, "#{}", i); + assert_eq!( + msgs[0].get_msg_type(), + MessageType::MsgAppendResponse, + "#{}", + i + ); + assert!(msgs[0].reject, "#{}", i); + assert_eq!(msgs[0].reject_hint, reject_hint_index, "#{}", i); + assert_eq!(msgs[0].log_term, reject_hint_term, "#{}", i); + n1.step(msgs.pop().unwrap()).unwrap(); + + let msgs = n1.read_messages(); + assert_eq!(msgs.len(), 1, "#{}", i); + assert_eq!(msgs[0].log_term, next_append_term, "#{}", i); + assert_eq!(msgs[0].index, next_append_index, "#{}", i); + } 
+} + +#[test] +fn test_switching_check_quorum() { + let l = default_logger(); + let mut sm = new_test_raft(1, vec![1, 2, 3], 5, 1, new_storage(), &l); + + sm.set_check_quorum(true); + sm.become_candidate(); + sm.become_leader(); + for _ in 0..=sm.election_timeout() { + sm.tick(); + } + assert_ne!(sm.state, StateRole::Leader); + + sm.persist(); + sm.set_check_quorum(false); + sm.become_candidate(); + sm.become_leader(); + + for _ in 0..=sm.election_timeout() { + sm.tick(); + } + assert_eq!(sm.state, StateRole::Leader); +} + +fn expect_one_message(r: &mut Interface) -> Message { + let msgs = r.read_messages(); + assert_eq!(msgs.len(), 1, "expect one message"); + msgs[0].clone() +} + +#[test] +fn test_log_replication_with_reordered_message() { + let l = default_logger(); + let mut r1 = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + r1.become_candidate(); + r1.become_leader(); + r1.read_messages(); + r1.mut_prs().get_mut(2).unwrap().become_replicate(); + + let mut r2 = new_test_raft(2, vec![1, 2], 10, 1, new_storage(), &l); + + // r1 sends 2 MsgApp messages to r2. + let _ = r1.append_entry(&mut [new_entry(0, 0, SOME_DATA)]); + r1.send_append(2); + let req1 = expect_one_message(&mut r1); + let _ = r1.append_entry(&mut [new_entry(0, 0, SOME_DATA)]); + r1.send_append(2); + let req2 = expect_one_message(&mut r1); + + // r2 receives the second MsgApp first due to reordering. + let _ = r2.step(req2); + let resp2 = expect_one_message(&mut r2); + // r2 rejects req2 + assert!(resp2.reject); + assert_eq!(resp2.reject_hint, 0); + assert_eq!(resp2.index, 2); + + // r2 handles the first MsgApp and responses to r1. + // And r1 updates match index accordingly. + let _ = r2.step(req1); + let m = expect_one_message(&mut r2); + assert!(!m.reject); + assert_eq!(m.index, 2); + let _ = r1.step(m); + assert_eq!(r1.prs().get(2).unwrap().matched, 2); + + // r1 observes a transient network issue to r2, hence transits to probe state. 
+ let _ = r1.step(new_message(2, 1, MessageType::MsgUnreachable, 0)); + assert_eq!(r1.prs().get(2).unwrap().state, ProgressState::Probe); + + // now r1 receives the delayed resp2. + let _ = r1.step(resp2); + let m = expect_one_message(&mut r1); + // r1 shall re-send MsgApp from match index even if resp2's reject hint is less than matching index. + assert_eq!(r1.prs().get(2).unwrap().matched, m.index) +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/test_raft_flow_control.rs b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/test_raft_flow_control.rs new file mode 100644 index 0000000..5f05f27 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/test_raft_flow_control.rs @@ -0,0 +1,292 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::test_util::*; +use raft::{default_logger, eraftpb::*}; + +// test_msg_app_flow_control_full ensures: +// 1. msgApp can fill the sending window until full +// 2. when the window is full, no more msgApp can be sent. 
+#[test] +fn test_msg_app_flow_control_full() { + let l = default_logger(); + let mut r = new_test_raft(1, vec![1, 2], 5, 1, new_storage(), &l); + r.become_candidate(); + r.become_leader(); + + // force the progress to be in replicate state + r.mut_prs().get_mut(2).unwrap().become_replicate(); + // fill in the inflights window + for i in 0..r.max_inflight { + r.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .expect(""); + let ms = r.read_messages(); + if ms.len() != 1 { + panic!("#{}: ms count = {}, want 1", i, ms.len()); + } + } + + // ensure 1 + assert!(r.prs().get(2).unwrap().ins.full()); + + // ensure 2 + for i in 0..10 { + r.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .expect(""); + let ms = r.read_messages(); + if !ms.is_empty() { + panic!("#{}: ms count = {}, want 0", i, ms.len()); + } + } +} + +// test_msg_app_flow_control_move_forward ensures msgAppResp can move +// forward the sending window correctly: +// 1. valid msgAppResp.index moves the windows to pass all smaller or equal index. +// 2. out-of-dated msgAppResp has no effect on the sliding window. +#[test] +fn test_msg_app_flow_control_move_forward() { + let l = default_logger(); + let mut r = new_test_raft(1, vec![1, 2], 5, 1, new_storage(), &l); + r.become_candidate(); + r.become_leader(); + + // force the progress to be in replicate state + r.mut_prs().get_mut(2).unwrap().become_replicate(); + // fill in the inflights window + for _ in 0..r.max_inflight { + r.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .expect(""); + r.read_messages(); + } + + // 1 is noop, 2 is the first proposal we just sent. + // so we start with 2. 
+ for tt in 2..r.max_inflight { + // move forward the window + let mut m = new_message(2, 1, MessageType::MsgAppendResponse, 0); + m.index = tt as u64; + r.step(m).expect(""); + r.read_messages(); + + // fill in the inflights window again + r.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .expect(""); + let ms = r.read_messages(); + if ms.len() != 1 { + panic!("#{}: ms count = {}, want 1", tt, ms.len()); + } + + // ensure 1 + assert!(r.prs().get(2).unwrap().ins.full()); + + // ensure 2 + for i in 0..tt { + let mut m = new_message(2, 1, MessageType::MsgAppendResponse, 0); + m.index = i as u64; + r.step(m).expect(""); + if !r.prs().get(2).unwrap().ins.full() { + panic!( + "#{}: inflights.full = {}, want true", + tt, + r.prs().get(2).unwrap().ins.full() + ); + } + } + } +} + +// test_msg_app_flow_control_recv_heartbeat ensures a heartbeat response +// frees one slot if the window is full. +#[test] +fn test_msg_app_flow_control_recv_heartbeat() { + let l = default_logger(); + let mut r = new_test_raft(1, vec![1, 2], 5, 1, new_storage(), &l); + r.become_candidate(); + r.become_leader(); + + // force the progress to be in replicate state + r.mut_prs().get_mut(2).unwrap().become_replicate(); + // fill in the inflights window + for _ in 0..r.max_inflight { + r.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .expect(""); + r.read_messages(); + } + + for tt in 1..5 { + if !r.prs().get(2).unwrap().ins.full() { + panic!( + "#{}: inflights.full = {}, want true", + tt, + r.prs().get(2).unwrap().ins.full() + ); + } + + // recv tt MsgHeartbeatResp and expect one free slot + for i in 0..tt { + r.step(new_message(2, 1, MessageType::MsgHeartbeatResponse, 0)) + .expect(""); + r.read_messages(); + if r.prs().get(2).unwrap().ins.full() { + panic!( + "#{}.{}: inflights.full = {}, want false", + tt, + i, + r.prs().get(2).unwrap().ins.full() + ); + } + } + + // one slot + r.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .expect(""); + let ms = r.read_messages(); + if 
ms.len() != 1 { + panic!("#{}: free slot = 0, want 1", tt); + } + + // and just one slot + for i in 0..10 { + r.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .expect(""); + let ms1 = r.read_messages(); + if !ms1.is_empty() { + panic!("#{}.{}, ms1 should be empty.", tt, i); + } + } + + // clear all pending messages + r.step(new_message(2, 1, MessageType::MsgHeartbeatResponse, 0)) + .expect(""); + r.read_messages(); + } +} + +#[test] +fn test_msg_app_flow_control_with_freeing_resources() { + let l = default_logger(); + let mut r = new_test_raft(1, vec![1, 2, 3], 5, 1, new_storage(), &l); + + r.become_candidate(); + r.become_leader(); + + for (_, pr) in r.prs().iter() { + assert!(!pr.ins.buffer_is_allocated()); + } + + for i in 1..=3 { + // Force the progress to be in replicate state. + r.mut_prs().get_mut(i).unwrap().become_replicate(); + } + + r.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .unwrap(); + + for (&id, pr) in r.prs().iter() { + if id != 1 { + assert!(pr.ins.buffer_is_allocated()); + assert_eq!(pr.ins.count(), 1); + } + } + + /* + 1: cap=0/start=0/count=0/buffer=[] + 2: cap=256/start=0/count=1/buffer=[2] + 3: cap=256/start=0/count=1/buffer=[2] + */ + + let mut resp = new_message(2, 1, MessageType::MsgAppendResponse, 0); + resp.index = r.raft_log.last_index(); + r.step(resp).unwrap(); + + assert_eq!(r.prs().get(2).unwrap().ins.count(), 0); + + /* + 1: cap=0/start=0/count=0/buffer=[] + 2: cap=256/start=1/count=0/buffer=[2] + 3: cap=256/start=0/count=1/buffer=[2] + */ + + r.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .unwrap(); + + assert_eq!(r.prs().get(2).unwrap().ins.count(), 1); + assert_eq!(r.prs().get(3).unwrap().ins.count(), 2); + + /* + 1: cap=0/start=0/count=0/buffer=[] + 2: cap=256/start=1/count=1/buffer=[2,3] + 3: cap=256/start=0/count=2/buffer=[2,3] + */ + + let mut resp = new_message(2, 1, MessageType::MsgAppendResponse, 0); + resp.index = r.raft_log.last_index(); + r.step(resp).unwrap(); + + 
assert_eq!(r.prs().get(2).unwrap().ins.count(), 0); + assert_eq!(r.prs().get(3).unwrap().ins.count(), 2); + assert_eq!(r.inflight_buffers_size(), 4096); + + /* + 1: cap=0/start=0/count=0/buffer=[] + 2: cap=256/start=2/count=0/buffer=[2,3] + 3: cap=256/start=0/count=2/buffer=[2,3] + */ + + r.maybe_free_inflight_buffers(); + + assert!(!r.prs().get(2).unwrap().ins.buffer_is_allocated()); + assert_eq!(r.prs().get(2).unwrap().ins.count(), 0); + assert_eq!(r.inflight_buffers_size(), 2048); + + /* + 1: cap=0/start=0/count=0/buffer=[] + 2: cap=0/start=0/count=0/buffer=[] + 3: cap=256/start=0/count=2/buffer=[2,3] + */ +} + +// Test progress can be disabled with `adjust_max_inflight_msgs(, 0)`. +#[test] +fn test_disable_progress() { + let l = default_logger(); + let mut r = new_test_raft(1, vec![1, 2], 5, 1, new_storage(), &l); + r.become_candidate(); + r.become_leader(); + + r.mut_prs().get_mut(2).unwrap().become_replicate(); + + // Disable the progress 2. Internal `free`s shouldn't fail. + r.adjust_max_inflight_msgs(2, 0); + r.step(new_message(2, 1, MessageType::MsgHeartbeatResponse, 0)) + .unwrap(); + assert!(r.prs().get(2).unwrap().ins.full()); + assert_eq!(r.prs().get(2).unwrap().ins.count(), 0); + + // Progress 2 is disabled. + let msgs = r.read_messages(); + assert_eq!(msgs.len(), 0); + + // After the progress gets enabled and a heartbeat response is received, + // its leader can continue to append entries to it. 
+ r.adjust_max_inflight_msgs(2, 10); + r.step(new_message(2, 1, MessageType::MsgHeartbeatResponse, 0)) + .unwrap(); + let msgs = r.read_messages(); + assert_eq!(msgs.len(), 1); + assert_eq!(msgs[0].get_msg_type(), MessageType::MsgAppend); +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/test_raft_paper.rs b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/test_raft_paper.rs new file mode 100644 index 0000000..22a29b3 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/test_raft_paper.rs @@ -0,0 +1,1052 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use crate::test_util::*; +use harness::*; +use raft::eraftpb::*; +use raft::storage::MemStorage; +use raft::*; +use slog::Logger; + +pub fn commit_noop_entry(r: &mut Interface, s: &MemStorage) { + assert_eq!(r.state, StateRole::Leader); + r.bcast_append(); + // simulate the response of MsgAppend + let msgs = r.read_messages(); + for m in msgs { + assert_eq!(m.get_msg_type(), MessageType::MsgAppend); + assert_eq!(m.entries.len(), 1); + assert!(m.entries[0].data.is_empty()); + r.step(accept_and_reply(&m)).expect(""); + } + // ignore further messages to refresh followers' commit index + r.read_messages(); + let unstable = r.raft_log.unstable_entries().to_vec(); + if let Some(e) = unstable.last() { + let (last_idx, last_term) = (e.get_index(), e.get_term()); + r.raft_log.stable_entries(last_idx, last_term); + s.wl().append(&unstable).expect(""); + r.on_persist_entries(last_idx, last_term); + let committed = r.raft_log.committed; + r.commit_apply(committed); + } +} + +fn accept_and_reply(m: &Message) -> Message { + assert_eq!(m.get_msg_type(), MessageType::MsgAppend); + let mut reply = new_message(m.to, m.from, MessageType::MsgAppendResponse, 0); + reply.term = m.term; + reply.index = m.index + m.entries.len() as u64; + reply +} + +#[test] +fn test_follower_update_term_from_message() { + let l = default_logger(); + test_update_term_from_message(StateRole::Follower, &l); +} + +#[test] +fn test_candidate_update_term_from_message() { + let l = default_logger(); + test_update_term_from_message(StateRole::Candidate, &l); +} + +#[test] +fn test_leader_update_term_from_message() { + let l = default_logger(); + test_update_term_from_message(StateRole::Leader, &l); +} + +// test_update_term_from_message tests that if one server’s current term is +// smaller than the other’s, then it updates its current term to the larger +// value. If a candidate or leader discovers that its term is out of date, +// it immediately reverts to follower state. 
+// Reference: section 5.1 +fn test_update_term_from_message(state: StateRole, l: &Logger) { + let mut r = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), l); + match state { + StateRole::Follower => r.become_follower(1, 2), + StateRole::PreCandidate => r.become_pre_candidate(), + StateRole::Candidate => r.become_candidate(), + StateRole::Leader => { + r.become_candidate(); + r.become_leader(); + } + } + + let mut m = new_message(0, 0, MessageType::MsgAppend, 0); + m.term = 2; + r.step(m).expect(""); + + assert_eq!(r.term, 2); + assert_eq!(r.state, StateRole::Follower); +} + +// test_start_as_follower tests that when servers start up, they begin as followers. +// Reference: section 5.2 +#[test] +fn test_start_as_follower() { + let l = default_logger(); + let r = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + assert_eq!(r.state, StateRole::Follower); +} + +// test_leader_bcast_beat tests that if the leader receives a heartbeat tick, +// it will send a msgApp with m.Index = 0, m.LogTerm=0 and empty entries as +// heartbeat to all followers. 
+// Reference: section 5.2 +#[test] +fn test_leader_bcast_beat() { + let l = default_logger(); + // heartbeat interval + let hi = 1; + let mut r = new_test_raft(1, vec![1, 2, 3], 10, hi, new_storage(), &l); + r.become_candidate(); + r.become_leader(); + for i in 0..10 { + let _ = r.append_entry(&mut [empty_entry(0, i as u64 + 1)]); + } + + for _ in 0..hi { + r.tick(); + } + + let mut msgs = r.read_messages(); + msgs.sort_by_key(|m| format!("{:?}", m)); + + let new_message_ext = |f, to| { + let mut m = new_message(f, to, MessageType::MsgHeartbeat, 0); + m.term = 1; + m.commit = 0; + m + }; + + let expect_msgs = vec![new_message_ext(1, 2), new_message_ext(1, 3)]; + assert_eq!(msgs, expect_msgs); +} + +#[test] +fn test_follower_start_election() { + let l = default_logger(); + test_nonleader_start_election(StateRole::Follower, &l); +} + +#[test] +fn test_candidate_start_new_election() { + let l = default_logger(); + test_nonleader_start_election(StateRole::Candidate, &l); +} + +// test_nonleader_start_election tests that if a follower receives no communication +// over election timeout, it begins an election to choose a new leader. It +// increments its current term and transitions to candidate state. It then +// votes for itself and issues RequestVote RPCs in parallel to each of the +// other servers in the cluster. +// Reference: section 5.2 +// Also if a candidate fails to obtain a majority, it will time out and +// start a new election by incrementing its term and initiating another +// round of RequestVote RPCs. 
+// Reference: section 5.2 +fn test_nonleader_start_election(state: StateRole, l: &Logger) { + // election timeout + let et = 10; + let mut r = new_test_raft(1, vec![1, 2, 3], et, 1, new_storage(), l); + match state { + StateRole::Follower => r.become_follower(1, 2), + StateRole::Candidate => r.become_candidate(), + _ => panic!("Only non-leader role is accepted."), + } + + for _ in 1..2 * et { + r.tick(); + } + + assert_eq!(r.term, 2); + assert_eq!(r.state, StateRole::Candidate); + assert!(r.prs().votes()[&r.id]); + let mut msgs = r.read_messages(); + msgs.sort_by_key(|m| format!("{:?}", m)); + let new_message_ext = |f, to| { + let mut m = new_message(f, to, MessageType::MsgRequestVote, 0); + m.term = 2; + m + }; + let expect_msgs = vec![new_message_ext(1, 2), new_message_ext(1, 3)]; + assert_eq!(msgs, expect_msgs); +} + +// test_leader_election_in_one_round_rpc tests all cases that may happen in +// leader election during one round of RequestVote RPC: +// a) it wins the election +// b) it loses the election +// c) it is unclear about the result +// Reference: section 5.2 +#[test] +fn test_leader_election_in_one_round_rpc() { + let l = default_logger(); + let mut tests = vec![ + // win the election when receiving votes from a majority of the servers + (1, map!(), StateRole::Leader), + (3, map!(2 => true, 3 => true), StateRole::Leader), + (3, map!(2 => true), StateRole::Leader), + ( + 5, + map!(2 => true, 3 => true, 4 => true, 5 => true), + StateRole::Leader, + ), + (5, map!(2 => true, 3 => true, 4 => true), StateRole::Leader), + (5, map!(2 => true, 3 => true), StateRole::Leader), + // return to follower state if it receives vote denial from a majority + (3, map!(2 => false, 3 => false), StateRole::Follower), + ( + 5, + map!(2 => false, 3 => false, 4 => false, 5 => false), + StateRole::Follower, + ), + ( + 5, + map!(2 => true, 3 => false, 4 => false, 5 => false), + StateRole::Follower, + ), + // stay in candidate if it does not obtain the majority + (3, map!(), 
StateRole::Candidate), + (5, map!(2 => true), StateRole::Candidate), + (5, map!(2 => false, 3 => false), StateRole::Candidate), + (5, map!(), StateRole::Candidate), + ]; + + for (i, (size, votes, state)) in tests.drain(..).enumerate() { + let mut r = new_test_raft(1, (1..=size as u64).collect(), 10, 1, new_storage(), &l); + + r.step(new_message(1, 1, MessageType::MsgHup, 0)).expect(""); + for (id, vote) in votes { + let mut m = new_message(id, 1, MessageType::MsgRequestVoteResponse, 0); + m.term = r.term; + m.reject = !vote; + r.step(m).expect(""); + } + + if r.state != state { + panic!("#{}: state = {:?}, want {:?}", i, r.state, state); + } + if r.term != 1 { + panic!("#{}: term = {}, want {}", i, r.term, 1); + } + } +} + +// test_follower_vote tests that each follower will vote for at most one +// candidate in a given term, on a first-come-first-served basis. +// Reference: section 5.2 +#[test] +fn test_follower_vote() { + let l = default_logger(); + let mut tests = vec![ + (INVALID_ID, 1, false), + (INVALID_ID, 2, false), + (1, 1, false), + (2, 2, false), + (1, 2, true), + (2, 1, true), + ]; + + for (i, (vote, nvote, wreject)) in tests.drain(..).enumerate() { + let mut r = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + r.load_state(&hard_state(1, 0, vote)); + + let mut m = new_message(nvote, 1, MessageType::MsgRequestVote, 0); + m.term = 1; + r.step(m).expect(""); + + let msgs = r.read_messages(); + let mut m = new_message(1, nvote, MessageType::MsgRequestVoteResponse, 0); + m.term = 1; + m.reject = wreject; + let expect_msgs = vec![m]; + if msgs != expect_msgs { + panic!("#{}: msgs = {:?}, want {:?}", i, msgs, expect_msgs); + } + } +} + +// test_candidate_fallback tests that while waiting for votes, +// if a candidate receives an AppendEntries RPC from another server claiming +// to be leader whose term is at least as large as the candidate's current term, +// it recognizes the leader as legitimate and returns to follower state. 
+// Reference: section 5.2 +#[test] +fn test_candidate_fallback() { + let l = default_logger(); + let new_message_ext = |f, to, term| { + let mut m = new_message(f, to, MessageType::MsgAppend, 0); + m.term = term; + m + }; + let mut tests = vec![new_message_ext(2, 1, 2), new_message_ext(2, 1, 3)]; + for (i, m) in tests.drain(..).enumerate() { + let mut r = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + r.step(new_message(1, 1, MessageType::MsgHup, 0)).expect(""); + assert_eq!(r.state, StateRole::Candidate); + + let term = m.term; + r.step(m).expect(""); + + if r.state != StateRole::Follower { + panic!( + "#{}: state = {:?}, want {:?}", + i, + r.state, + StateRole::Follower + ); + } + if r.term != term { + panic!("#{}: term = {}, want {}", i, r.term, term); + } + } +} + +#[test] +fn test_follower_election_timeout_randomized() { + let l = default_logger(); + test_non_leader_election_timeout_randomized(StateRole::Follower, &l); +} + +#[test] +fn test_candidate_election_timeout_randomized() { + let l = default_logger(); + test_non_leader_election_timeout_randomized(StateRole::Candidate, &l); +} + +// test_non_leader_election_timeout_randomized tests that election timeout for +// follower or candidate is randomized. 
+// Reference: section 5.2 +fn test_non_leader_election_timeout_randomized(state: StateRole, l: &Logger) { + let et = 10; + let mut r = new_test_raft(1, vec![1, 2, 3], et, 1, new_storage(), l); + let mut timeouts = map!(); + for _ in 0..1000 * et { + let term = r.term; + match state { + StateRole::Follower => r.become_follower(term + 1, 2), + StateRole::Candidate => r.become_candidate(), + _ => panic!("only non leader state is accepted!"), + } + + let mut time = 0; + while r.read_messages().is_empty() { + r.tick(); + time += 1; + } + timeouts.insert(time, true); + } + + assert!(timeouts.len() <= et && timeouts.len() >= et - 1); + for d in et + 1..2 * et { + assert!(timeouts[&d]); + } +} + +#[test] +fn test_follower_election_timeout_nonconflict() { + let l = default_logger(); + test_nonleaders_election_timeout_nonconfict(StateRole::Follower, &l); +} + +#[test] +fn test_candidates_election_timeout_nonconf() { + let l = default_logger(); + test_nonleaders_election_timeout_nonconfict(StateRole::Candidate, &l); +} + +// test_nonleaders_election_timeout_nonconfict tests that in most cases only a +// single server(follower or candidate) will time out, which reduces the +// likelihood of split vote in the new election. 
+// Reference: section 5.2 +fn test_nonleaders_election_timeout_nonconfict(state: StateRole, l: &Logger) { + let et = 10; + let size = 5; + let mut rs = Vec::with_capacity(size); + let ids: Vec = (1..=size as u64).collect(); + for id in ids.iter().take(size) { + rs.push(new_test_raft(*id, ids.clone(), et, 1, new_storage(), l)); + } + let mut conflicts = 0; + for _ in 0..1000 { + for r in &mut rs { + let term = r.term; + match state { + StateRole::Follower => r.become_follower(term + 1, INVALID_ID), + StateRole::Candidate => r.become_candidate(), + _ => panic!("non leader state is expect!"), + } + } + + let mut timeout_num = 0; + while timeout_num == 0 { + for r in &mut rs { + r.tick(); + if !r.read_messages().is_empty() { + timeout_num += 1; + } + } + } + // several rafts time out at the same tick + if timeout_num > 1 { + conflicts += 1; + } + } + + assert!(f64::from(conflicts) / 1000.0 <= 0.3); +} + +// test_leader_start_replication tests that when receiving client proposals, +// the leader appends the proposal to its log as a new entry, then issues +// AppendEntries RPCs in parallel to each of the other servers to replicate +// the entry. Also, when sending an AppendEntries RPC, the leader includes +// the index and term of the entry in its log that immediately precedes +// the new entries. +// Also, it writes the new entry into stable storage. 
+// Reference: section 5.3 +#[test] +fn test_leader_start_replication() { + let l = default_logger(); + let s = new_storage(); + let mut r = new_test_raft(1, vec![1, 2, 3], 10, 1, s.clone(), &l); + r.become_candidate(); + r.become_leader(); + commit_noop_entry(&mut r, &s); + let li = r.raft_log.last_index(); + + r.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .expect(""); + + assert_eq!(r.raft_log.last_index(), li + 1); + assert_eq!(r.raft_log.committed, li); + let mut msgs = r.read_messages(); + msgs.sort_by_key(|m| format!("{:?}", m)); + let wents = vec![new_entry(1, li + 1, SOME_DATA)]; + let new_message_ext = |f, to, ents| { + let mut m = new_message(f, to, MessageType::MsgAppend, 0); + m.term = 1; + m.index = li; + m.log_term = 1; + m.commit = li; + m.entries = ents; + m + }; + let expect_msgs = vec![ + new_message_ext(1, 2, wents.clone().into()), + new_message_ext(1, 3, wents.clone().into()), + ]; + assert_eq!(msgs, expect_msgs); + assert_eq!(r.raft_log.unstable_entries(), &*wents); +} + +// test_leader_commit_entry tests that when the entry has been safely replicated, +// the leader gives out the applied entries, which can be applied to its state +// machine. +// Also, the leader keeps track of the highest index it knows to be committed, +// and it includes that index in future AppendEntries RPCs so that the other +// servers eventually find out. 
+// Reference: section 5.3 +#[test] +fn test_leader_commit_entry() { + let l = default_logger(); + let s = new_storage(); + let mut r = new_test_raft(1, vec![1, 2, 3], 10, 1, s.clone(), &l); + r.become_candidate(); + r.become_leader(); + commit_noop_entry(&mut r, &s); + let li = r.raft_log.last_index(); + r.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .expect(""); + r.persist(); + + for m in r.read_messages() { + r.step(accept_and_reply(&m)).expect(""); + } + + assert_eq!(r.raft_log.committed, li + 1); + let wents = vec![new_entry(1, li + 1, SOME_DATA)]; + assert_eq!(r.raft_log.next_entries(None), Some(wents)); + let mut msgs = r.read_messages(); + msgs.sort_by_key(|m| format!("{:?}", m)); + for (i, m) in msgs.drain(..).enumerate() { + assert_eq!(i as u64 + 2, m.to); + assert_eq!(m.get_msg_type(), MessageType::MsgAppend); + assert_eq!(m.commit, li + 1); + } +} + +// test_leader_acknowledge_commit tests that a log entry is committed once the +// leader that created the entry has replicated it on a majority of the servers. 
+// Reference: section 5.3 +#[test] +fn test_leader_acknowledge_commit() { + let l = default_logger(); + let mut tests = vec![ + (1, map!(), true), + (3, map!(), false), + (3, map!(2 => true), true), + (3, map!(2 => true, 3 => true), true), + (5, map!(), false), + (5, map!(2 => true), false), + (5, map!(2 => true, 3 => true), true), + (5, map!(2 => true, 3 => true, 4 => true), true), + (5, map!(2 => true, 3 => true, 4 => true, 5 => true), true), + ]; + for (i, (size, acceptors, wack)) in tests.drain(..).enumerate() { + let s = new_storage(); + let mut r = new_test_raft(1, (1..=size).collect(), 10, 1, s.clone(), &l); + r.become_candidate(); + r.become_leader(); + commit_noop_entry(&mut r, &s); + let li = r.raft_log.last_index(); + r.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .expect(""); + r.persist(); + + for m in r.read_messages() { + if acceptors.contains_key(&m.to) && acceptors[&m.to] { + r.step(accept_and_reply(&m)).expect(""); + } + } + + let g = r.raft_log.committed > li; + if g ^ wack { + panic!("#{}: ack commit = {}, want {}", i, g, wack); + } + } +} + +// test_leader_commit_preceding_entries tests that when leader commits a log entry, +// it also commits all preceding entries in the leader’s log, including +// entries created by previous leaders. +// Also, it applies the entry to its local state machine (in log order). 
+// Reference: section 5.3 +#[test] +fn test_leader_commit_preceding_entries() { + let l = default_logger(); + let mut tests = vec![ + vec![], + vec![empty_entry(2, 1)], + vec![empty_entry(1, 1), empty_entry(2, 2)], + vec![empty_entry(1, 1)], + ]; + + for (i, mut tt) in tests.drain(..).enumerate() { + let mut r = { + let store = MemStorage::new_with_conf_state((vec![1, 2, 3], vec![])); + store.wl().append(&tt).unwrap(); + let cfg = new_test_config(1, 10, 1); + new_test_raft_with_config(&cfg, store, &l) + }; + r.load_state(&hard_state(2, 0, 0)); + r.become_candidate(); + r.become_leader(); + + r.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .expect(""); + r.persist(); + + for m in r.read_messages() { + r.step(accept_and_reply(&m)).expect(""); + } + + let li = tt.len() as u64; + tt.append(&mut vec![ + empty_entry(3, li + 1), + new_entry(3, li + 2, SOME_DATA), + ]); + let g = r.raft_log.next_entries(None); + let wg = Some(tt); + if g != wg { + panic!("#{}: ents = {:?}, want {:?}", i, g, wg); + } + } +} + +// test_follower_commit_entry tests that once a follower learns that a log entry +// is committed, it applies the entry to its local state machine (in log order). 
+// Reference: section 5.3 +#[test] +fn test_follower_commit_entry() { + let l = default_logger(); + let mut tests = vec![ + (vec![new_entry(1, 1, SOME_DATA)], 1), + ( + vec![ + new_entry(1, 1, SOME_DATA), + new_entry(1, 2, Some("somedata2")), + ], + 2, + ), + ( + vec![ + new_entry(1, 1, Some("somedata2")), + new_entry(1, 2, SOME_DATA), + ], + 2, + ), + ( + vec![ + new_entry(1, 1, SOME_DATA), + new_entry(1, 2, Some("somedata2")), + ], + 1, + ), + ]; + + for (i, (ents, commit)) in tests.drain(..).enumerate() { + let mut r = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + r.become_follower(1, 2); + + let mut m = new_message(2, 1, MessageType::MsgAppend, 0); + m.term = 1; + m.commit = commit; + m.entries = ents.clone().into(); + r.step(m).expect(""); + r.persist(); + + if r.raft_log.committed != commit { + panic!( + "#{}: committed = {}, want {}", + i, r.raft_log.committed, commit + ); + } + let wents = Some(ents[..commit as usize].to_vec()); + let g = r.raft_log.next_entries(None); + if g != wents { + panic!("#{}: next_ents = {:?}, want {:?}", i, g, wents); + } + } +} + +// test_follower_check_msg_append tests that if the follower does not find an +// entry in its log with the same index and term as the one in AppendEntries RPC, +// then it refuses the new entries. Otherwise it replies that it accepts the +// append entries. 
+// Reference: section 5.3 +#[test] +fn test_follower_check_msg_append() { + let l = default_logger(); + let ents = vec![empty_entry(1, 1), empty_entry(2, 2)]; + let mut tests = vec![ + // match with committed entries + (0, 0, 1, 1, false, 0, 0), + (ents[0].term, ents[0].index, 1, 1, false, 0, 0), + // match with uncommitted entries + (ents[1].term, ents[1].index, 2, 1, false, 0, 0), + // unmatch with existing entry + (ents[0].term, ents[1].index, ents[1].index, 1, true, 1, 1), + // unexisting entry + ( + ents[1].term + 1, + ents[1].index + 1, + ents[1].index + 1, + 1, + true, + 2, + 2, + ), + ]; + + for (i, (term, index, windex, w_commit, wreject, wreject_hint, w_log_term)) in + tests.drain(..).enumerate() + { + let mut r = { + let store = MemStorage::new_with_conf_state((vec![1, 2, 3], vec![])); + store.wl().append(&ents).unwrap(); + let cfg = new_test_config(1, 10, 1); + new_test_raft_with_config(&cfg, store, &l) + }; + r.load_state(&hard_state(0, 1, 0)); + r.become_follower(2, 2); + + let mut m = new_message(2, 1, MessageType::MsgAppend, 0); + m.term = 2; + m.log_term = term; + m.index = index; + r.step(m).expect(""); + + let msgs = r.read_messages(); + let mut wm = new_message(1, 2, MessageType::MsgAppendResponse, 0); + wm.term = 2; + wm.index = windex; + wm.commit = w_commit; + if wreject { + wm.reject = wreject; + wm.reject_hint = wreject_hint; + wm.log_term = w_log_term; + } + let expect_msgs = vec![wm]; + if msgs != expect_msgs { + panic!("#{}: msgs = {:?}, want {:?}", i, msgs, expect_msgs); + } + } +} + +// test_follower_append_entries tests that when AppendEntries RPC is valid, +// the follower will delete the existing conflict entry and all that follow it, +// and append any new entries not already in the log. +// Also, it writes the new entry into stable storage. 
+// Reference: section 5.3 +#[test] +fn test_follower_append_entries() { + let l = default_logger(); + let mut tests = vec![ + ( + 2, + 2, + vec![empty_entry(3, 3)], + vec![empty_entry(1, 1), empty_entry(2, 2), empty_entry(3, 3)], + vec![empty_entry(3, 3)], + ), + ( + 1, + 1, + vec![empty_entry(3, 2), empty_entry(4, 3)], + vec![empty_entry(1, 1), empty_entry(3, 2), empty_entry(4, 3)], + vec![empty_entry(3, 2), empty_entry(4, 3)], + ), + ( + 0, + 0, + vec![empty_entry(1, 1)], + vec![empty_entry(1, 1), empty_entry(2, 2)], + vec![], + ), + ( + 0, + 0, + vec![empty_entry(3, 1)], + vec![empty_entry(3, 1)], + vec![empty_entry(3, 1)], + ), + ]; + for (i, (index, term, ents, wents, wunstable)) in tests.drain(..).enumerate() { + let mut r = { + let store = MemStorage::new_with_conf_state((vec![1, 2, 3], vec![])); + store + .wl() + .append(&[empty_entry(1, 1), empty_entry(2, 2)]) + .unwrap(); + let cfg = new_test_config(1, 10, 1); + new_test_raft_with_config(&cfg, store, &l) + }; + r.become_follower(2, 2); + + let mut m = new_message(2, 1, MessageType::MsgAppend, 0); + m.term = 2; + m.log_term = term; + m.index = index; + m.entries = ents.into(); + r.step(m).expect(""); + + let g = r.raft_log.all_entries(); + if g != wents { + panic!("#{}: ents = {:?}, want {:?}", i, g, wents); + } + let g = r.raft_log.unstable_entries(); + if g != &*wunstable { + panic!("#{}: unstable_entries = {:?}, want {:?}", i, g, wunstable); + } + } +} + +// test_leader_sync_follower_log tests that the leader could bring a follower's log +// into consistency with its own. 
+// Reference: section 5.3, figure 7 +#[test] +fn test_leader_sync_follower_log() { + let l = default_logger(); + let ents = vec![ + empty_entry(1, 1), + empty_entry(1, 2), + empty_entry(1, 3), + empty_entry(4, 4), + empty_entry(4, 5), + empty_entry(5, 6), + empty_entry(5, 7), + empty_entry(6, 8), + empty_entry(6, 9), + empty_entry(6, 10), + ]; + let term = 8u64; + let mut tests = vec![ + vec![ + empty_entry(1, 1), + empty_entry(1, 2), + empty_entry(1, 3), + empty_entry(4, 4), + empty_entry(4, 5), + empty_entry(5, 6), + empty_entry(5, 7), + empty_entry(6, 8), + empty_entry(6, 9), + ], + vec![ + empty_entry(1, 1), + empty_entry(1, 2), + empty_entry(1, 3), + empty_entry(4, 4), + ], + vec![ + empty_entry(1, 1), + empty_entry(1, 2), + empty_entry(1, 3), + empty_entry(4, 4), + empty_entry(4, 5), + empty_entry(5, 6), + empty_entry(5, 7), + empty_entry(6, 8), + empty_entry(6, 9), + empty_entry(6, 10), + empty_entry(6, 11), + ], + vec![ + empty_entry(1, 1), + empty_entry(1, 2), + empty_entry(1, 3), + empty_entry(4, 4), + empty_entry(4, 5), + empty_entry(5, 6), + empty_entry(5, 7), + empty_entry(6, 8), + empty_entry(6, 9), + empty_entry(6, 10), + empty_entry(7, 11), + empty_entry(7, 12), + ], + vec![ + empty_entry(1, 1), + empty_entry(1, 2), + empty_entry(1, 3), + empty_entry(4, 4), + empty_entry(4, 5), + empty_entry(4, 6), + empty_entry(4, 7), + ], + vec![ + empty_entry(1, 1), + empty_entry(1, 2), + empty_entry(1, 3), + empty_entry(2, 4), + empty_entry(2, 5), + empty_entry(2, 6), + empty_entry(3, 7), + empty_entry(3, 8), + empty_entry(3, 9), + empty_entry(3, 10), + empty_entry(3, 11), + ], + ]; + for (i, tt) in tests.drain(..).enumerate() { + let mut lead = { + let store = MemStorage::new_with_conf_state((vec![1, 2, 3], vec![])); + store.wl().append(&ents).unwrap(); + let cfg = new_test_config(1, 10, 1); + new_test_raft_with_config(&cfg, store, &l) + }; + let last_index = lead.raft_log.last_index(); + lead.load_state(&hard_state(term, last_index, 0)); + + let mut follower 
= { + let store = MemStorage::new_with_conf_state((vec![1, 2, 3], vec![])); + store.wl().append(&tt).unwrap(); + let cfg = new_test_config(2, 10, 1); + new_test_raft_with_config(&cfg, store, &l) + }; + follower.load_state(&hard_state(term - 1, 0, 0)); + + // It is necessary to have a three-node cluster. + // The second may have more up-to-date log than the first one, so the + // first node needs the vote from the third node to become the leader. + let mut n = Network::new(vec![Some(lead), Some(follower), NOP_STEPPER], &l); + n.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + // The election occurs in the term after the one we loaded with + // lead.load_state above. + let mut m = new_message(3, 1, MessageType::MsgRequestVoteResponse, 0); + m.term = term + 1; + n.send(vec![m]); + + let mut m = new_message(1, 1, MessageType::MsgPropose, 0); + m.entries = vec![Entry::default()].into(); + n.send(vec![m]); + let lead_str = ltoa(&n.peers[&1].raft_log); + let follower_str = ltoa(&n.peers[&2].raft_log); + if lead_str != follower_str { + panic!( + "#{}: lead str: {}, follower_str: {}", + i, lead_str, follower_str + ); + } + } +} + +// test_vote_request tests that the vote request includes information about the candidate’s log +// and are sent to all of the other nodes. 
+// Reference: section 5.4.1 +#[test] +fn test_vote_request() { + let l = default_logger(); + let mut tests = vec![ + (vec![empty_entry(1, 1)], 2), + (vec![empty_entry(1, 1), empty_entry(2, 2)], 3), + ]; + for (j, (ents, wterm)) in tests.drain(..).enumerate() { + let mut r = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + let mut m = new_message(2, 1, MessageType::MsgAppend, 0); + m.term = wterm - 1; + m.log_term = 0; + m.index = 0; + m.entries = ents.clone().into(); + r.step(m).expect(""); + r.read_messages(); + + for _ in 1..r.election_timeout() * 2 { + r.tick_election(); + } + + let mut msgs = r.read_messages(); + msgs.sort_by_key(|m| format!("{:?}", m)); + if msgs.len() != 2 { + panic!("#{}: msg count = {}, want 2", j, msgs.len()); + } + for (i, m) in msgs.iter().enumerate() { + if m.get_msg_type() != MessageType::MsgRequestVote { + panic!( + "#{}.{}: msg_type = {:?}, want {:?}", + j, + i, + m.get_msg_type(), + MessageType::MsgRequestVote + ); + } + if m.to != i as u64 + 2 { + panic!("#{}.{}: to = {}, want {}", j, i, m.to, i + 2); + } + if m.term != wterm { + panic!("#{}.{}: term = {}, want {}", j, i, m.term, wterm); + } + let windex = ents.last().unwrap().index; + let wlogterm = ents.last().unwrap().term; + if m.index != windex { + panic!("#{}.{}: index = {}, want {}", j, i, m.index, windex); + } + if m.log_term != wlogterm { + panic!("#{}.{}: log_term = {}, want {}", j, i, m.log_term, wlogterm); + } + } + } +} + +// test_voter tests the voter denies its vote if its own log is more up-to-date +// than that of the candidate. 
+// Reference: section 5.4.1 +#[test] +fn test_voter() { + let l = default_logger(); + let mut tests = vec![ + // same logterm + (vec![empty_entry(1, 1)], 1, 1, false), + (vec![empty_entry(1, 1)], 1, 2, false), + (vec![empty_entry(1, 1), empty_entry(1, 2)], 1, 1, true), + // candidate higher logterm + (vec![empty_entry(1, 1)], 2, 1, false), + (vec![empty_entry(1, 1)], 2, 2, false), + (vec![empty_entry(1, 1), empty_entry(1, 2)], 2, 1, false), + // voter higher logterm + (vec![empty_entry(2, 1)], 1, 1, true), + (vec![empty_entry(2, 1)], 1, 2, true), + (vec![empty_entry(2, 1), empty_entry(1, 2)], 1, 1, true), + ]; + for (i, (ents, log_term, index, wreject)) in tests.drain(..).enumerate() { + let s = MemStorage::new_with_conf_state((vec![1, 2], vec![])); + s.wl().append(&ents).unwrap(); + let cfg = new_test_config(1, 10, 1); + let mut r = new_test_raft_with_config(&cfg, s, &l); + + let mut m = new_message(2, 1, MessageType::MsgRequestVote, 0); + m.term = 3; + m.log_term = log_term; + m.index = index; + r.step(m).expect(""); + + let msgs = r.read_messages(); + if msgs.len() != 1 { + panic!("#{}: msg count = {}, want {}", i, msgs.len(), 1); + } + if msgs[0].get_msg_type() != MessageType::MsgRequestVoteResponse { + panic!( + "#{}: msg_type = {:?}, want {:?}", + i, + msgs[0].get_msg_type(), + MessageType::MsgRequestVoteResponse + ); + } + if msgs[0].reject != wreject { + panic!("#{}: reject = {}, want {}", i, msgs[0].reject, wreject); + } + } +} + +// TestLeaderOnlyCommitsLogFromCurrentTerm tests that only log entries from the leader’s +// current term are committed by counting replicas. 
+// Reference: section 5.4.2 +#[test] +fn test_leader_only_commits_log_from_current_term() { + let l = default_logger(); + let ents = vec![empty_entry(1, 1), empty_entry(2, 2)]; + let mut tests = vec![ + // do not commit log entries in previous terms + (1, 0), + (2, 0), + // commit log in current term + (3, 3), + ]; + for (i, (index, wcommit)) in tests.drain(..).enumerate() { + let mut r = { + let store = MemStorage::new_with_conf_state((vec![1, 2], vec![])); + store.wl().append(&ents).unwrap(); + let cfg = new_test_config(1, 10, 1); + new_test_raft_with_config(&cfg, store, &l) + }; + r.load_state(&hard_state(2, 0, 0)); + + // become leader at term 3 + r.become_candidate(); + r.become_leader(); + r.read_messages(); + + // propose a entry to current term + r.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .expect(""); + r.persist(); + + let mut m = new_message(2, 1, MessageType::MsgAppendResponse, 0); + m.term = r.term; + m.index = index; + r.step(m).expect(""); + if r.raft_log.committed != wcommit { + panic!( + "#{}: commit = {}, want {}", + i, r.raft_log.committed, wcommit + ); + } + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/test_raft_snap.rs b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/test_raft_snap.rs new file mode 100644 index 0000000..e330246 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/test_raft_snap.rs @@ -0,0 +1,234 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::test_util::*; +use harness::Network; +use raft::eraftpb::*; +use raft::{default_logger, Error, ProgressState, INVALID_INDEX}; + +fn testing_snap() -> Snapshot { + new_snapshot(11, 11, vec![1, 2]) +} + +#[test] +fn test_sending_snapshot_set_pending_snapshot() { + let l = default_logger(); + let mut sm = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + sm.restore(testing_snap()); + sm.persist(); + + sm.become_candidate(); + sm.become_leader(); + + // force set the next of node 1, so that + // node 1 needs a snapshot + sm.mut_prs().get_mut(2).unwrap().next_idx = sm.raft_log.first_index(); + + let mut m = new_message(2, 1, MessageType::MsgAppendResponse, 0); + { + let voter_2 = sm.prs().get(2).unwrap(); + m.index = voter_2.next_idx - 1; + m.reject = true; + }; + sm.step(m).expect(""); + assert_eq!(sm.prs().get(2).unwrap().pending_snapshot, 11); +} + +#[test] +fn test_pending_snapshot_pause_replication() { + let l = default_logger(); + let mut sm = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + sm.restore(testing_snap()); + sm.persist(); + + sm.become_candidate(); + sm.become_leader(); + sm.mut_prs().get_mut(2).unwrap().become_snapshot(11); + + sm.step(new_message(1, 1, MessageType::MsgPropose, 1)) + .expect(""); + let msgs = sm.read_messages(); + assert!(msgs.is_empty()); +} + +#[test] +fn test_snapshot_failure() { + let l = default_logger(); + let mut sm = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + sm.restore(testing_snap()); + sm.persist(); + + sm.become_candidate(); + sm.become_leader(); + + 
sm.mut_prs().get_mut(2).unwrap().next_idx = 1; + sm.mut_prs().get_mut(2).unwrap().become_snapshot(11); + + let mut m = new_message(2, 1, MessageType::MsgSnapStatus, 0); + m.reject = true; + sm.step(m).expect(""); + let voter_2 = sm.prs().get(2).unwrap(); + assert_eq!(voter_2.pending_snapshot, 0); + assert_eq!(voter_2.next_idx, 1); + assert!(voter_2.paused); +} + +#[test] +fn test_snapshot_succeed() { + let l = default_logger(); + let mut sm = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + sm.restore(testing_snap()); + sm.persist(); + + sm.become_candidate(); + sm.become_leader(); + + sm.mut_prs().get_mut(2).unwrap().next_idx = 1; + sm.mut_prs().get_mut(2).unwrap().become_snapshot(11); + + let mut m = new_message(2, 1, MessageType::MsgSnapStatus, 0); + m.reject = false; + sm.step(m).expect(""); + let voter_2 = sm.prs().get(2).unwrap(); + assert_eq!(voter_2.pending_snapshot, 0); + assert_eq!(voter_2.next_idx, 12); + assert!(voter_2.paused); +} + +#[test] +fn test_snapshot_abort() { + let l = default_logger(); + let mut sm = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + sm.restore(testing_snap()); + sm.persist(); + + sm.become_candidate(); + sm.become_leader(); + + sm.mut_prs().get_mut(2).unwrap().next_idx = 1; + sm.mut_prs().get_mut(2).unwrap().become_snapshot(11); + + let mut m = new_message(2, 1, MessageType::MsgAppendResponse, 0); + m.index = 11; + // A successful MsgAppendResponse that has a higher/equal index than the + // pending snapshot should abort the pending snapshot. + sm.step(m).expect(""); + assert_eq!(sm.prs().get(2).unwrap().pending_snapshot, 0); + assert_eq!(sm.prs().get(2).unwrap().next_idx, 12); +} + +// Initialized storage should be at term 1 instead of 0. Otherwise the case will fail. 
+#[test] +fn test_snapshot_with_min_term() { + let l = default_logger(); + let do_test = |pre_vote: bool| { + let s = new_storage(); + s.wl() + .apply_snapshot(new_snapshot(1, 1, vec![1, 2])) + .unwrap(); + let n1 = new_test_raft_with_prevote(1, vec![1, 2], 10, 1, s, pre_vote, &l); + let n2 = new_test_raft_with_prevote(2, vec![], 10, 1, new_storage(), pre_vote, &l); + let mut nt = Network::new(vec![Some(n1), Some(n2)], &l); + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + // 1 will be elected as leader, and then send a snapshot and an empty entry to 2. + assert_eq!(nt.peers[&2].raft_log.first_index(), 2); + assert_eq!(nt.peers[&2].raft_log.last_index(), 2); + }; + do_test(true); + do_test(false); +} + +#[test] +fn test_request_snapshot() { + let l = default_logger(); + let mut sm = new_test_raft(1, vec![1, 2], 10, 1, new_storage(), &l); + sm.restore(testing_snap()); + sm.persist(); + + // Raft can not step request snapshot if there is no leader. + assert_eq!( + sm.raft.as_mut().unwrap().request_snapshot().unwrap_err(), + Error::RequestSnapshotDropped + ); + + let term = sm.term; + sm.become_follower(term + 1, 2); + + // Raft can not step request snapshot if last raft log's term mismatch current term. + assert_eq!( + sm.raft.as_mut().unwrap().request_snapshot().unwrap_err(), + Error::RequestSnapshotDropped + ); + + sm.become_candidate(); + sm.become_leader(); + + // Raft can not step request snapshot if itself is a leader. + assert_eq!( + sm.raft.as_mut().unwrap().request_snapshot().unwrap_err(), + Error::RequestSnapshotDropped + ); + + // Advance matched. 
+ let mut m = new_message(2, 1, MessageType::MsgAppendResponse, 0); + m.index = 11; + sm.step(m).unwrap(); + assert_eq!(sm.prs().get(2).unwrap().state, ProgressState::Replicate); + + let request_snapshot_idx = sm.raft_log.committed; + let mut m = new_message(2, 1, MessageType::MsgAppendResponse, 0); + m.index = 11; + m.reject = true; + m.reject_hint = INVALID_INDEX; + m.request_snapshot = request_snapshot_idx; + + // Ignore out of order request snapshot messages. + let mut out_of_order = m.clone(); + out_of_order.index = 9; + sm.step(out_of_order).unwrap(); + assert_eq!(sm.prs().get(2).unwrap().state, ProgressState::Replicate); + + // Request snapshot. + sm.step(m).unwrap(); + assert_eq!(sm.prs().get(2).unwrap().state, ProgressState::Snapshot); + assert_eq!(sm.prs().get(2).unwrap().pending_snapshot, 11); + assert_eq!(sm.prs().get(2).unwrap().next_idx, 12); + assert!(sm.prs().get(2).unwrap().is_paused()); + let snap = sm.msgs.pop().unwrap(); + assert!( + snap.get_msg_type() == MessageType::MsgSnapshot + && snap.get_snapshot().get_metadata().index == request_snapshot_idx, + "{:?}", + snap + ); + + // Append/heartbeats does not set the state from snapshot to probe. + let mut m = new_message(2, 1, MessageType::MsgAppendResponse, 0); + m.index = 11; + sm.step(m).unwrap(); + assert_eq!(sm.prs().get(2).unwrap().state, ProgressState::Snapshot); + assert_eq!(sm.prs().get(2).unwrap().pending_snapshot, 11); + assert_eq!(sm.prs().get(2).unwrap().next_idx, 12); + assert!(sm.prs().get(2).unwrap().is_paused()); + + // However snapshot status report does set the stat to probe. 
+ let m = new_message(2, 1, MessageType::MsgSnapStatus, 0); + sm.step(m).unwrap(); + assert_eq!(sm.prs().get(2).unwrap().state, ProgressState::Probe); + assert_eq!(sm.prs().get(2).unwrap().pending_snapshot, 0); + assert_eq!(sm.prs().get(2).unwrap().next_idx, 12); + assert!(sm.prs().get(2).unwrap().is_paused()); +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/test_raw_node.rs b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/test_raw_node.rs new file mode 100644 index 0000000..058321b --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/integration_cases/test_raw_node.rs @@ -0,0 +1,1985 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use harness::Network; +use protobuf::{Message as PbMessage, ProtobufEnum as _}; +use raft::eraftpb::*; +use raft::storage::MemStorage; +use raft::*; +use raft_proto::*; +use slog::Logger; + +use crate::test_util::*; + +fn conf_change(t: ConfChangeType, node_id: u64) -> ConfChange { + let mut cc = ConfChange::default(); + cc.set_change_type(t); + cc.node_id = node_id; + cc +} + +#[allow(clippy::too_many_arguments)] +fn must_cmp_ready( + r: &Ready, + ss: &Option, + hs: &Option, + entries: &[Entry], + committed_entries: &[Entry], + snapshot: &Option, + msg_is_empty: bool, + persisted_msg_is_empty: bool, + must_sync: bool, +) { + assert_eq!(r.ss(), ss.as_ref()); + assert_eq!(r.hs(), hs.as_ref()); + assert_eq!(r.entries().as_slice(), entries); + assert_eq!(r.committed_entries().as_slice(), committed_entries); + assert_eq!(r.must_sync(), must_sync); + assert!(r.read_states().is_empty()); + assert_eq!( + r.snapshot(), + snapshot.as_ref().unwrap_or(&Snapshot::default()) + ); + assert_eq!(r.messages().is_empty(), msg_is_empty); + assert_eq!(r.persisted_messages().is_empty(), persisted_msg_is_empty); +} + +fn new_raw_node( + id: u64, + peers: Vec, + election_tick: usize, + heartbeat_tick: usize, + storage: MemStorage, + logger: &Logger, +) -> RawNode { + let config = new_test_config(id, election_tick, heartbeat_tick); + new_raw_node_with_config(peers, &config, storage, logger) +} + +fn new_raw_node_with_config( + peers: Vec, + config: &Config, + storage: MemStorage, + logger: &Logger, +) -> RawNode { + if storage.initial_state().unwrap().initialized() && peers.is_empty() { + panic!("new_raw_node with empty peers on initialized store"); + } + if !peers.is_empty() && !storage.initial_state().unwrap().initialized() { + storage + .wl() + .apply_snapshot(new_snapshot(1, 1, peers)) + .unwrap(); + } + RawNode::new(config, storage, logger).unwrap() +} + +/// Ensures that RawNode::step ignore local message. 
+#[test] +fn test_raw_node_step() { + let l = default_logger(); + for msg_t in MessageType::values() { + let s = new_storage(); + s.wl().set_hardstate(hard_state(1, 1, 0)); + // Append an empty entry to make sure the non-local messages (like + // vote requests) are ignored and don't trigger assertions. + s.wl().append(&[new_entry(1, 1, None)]).unwrap(); + s.wl().apply_snapshot(new_snapshot(1, 1, vec![1])).unwrap(); + + let mut raw_node = new_raw_node(1, vec![1], 10, 1, new_storage(), &l); + let res = raw_node.step(new_message(0, 0, *msg_t, 0)); + // LocalMsg should be ignored. + if raw_node::is_local_msg(*msg_t) { + assert_eq!(res, Err(Error::StepLocalMsg), "{:?}", msg_t); + } + } +} + +/// Ensures that MsgReadIndex to old leader gets forwarded to the new leader and +/// 'send' method does not attach its term. +#[test] +fn test_raw_node_read_index_to_old_leader() { + let l = default_logger(); + let r1 = new_test_raft(1, vec![1, 2, 3], 10, 1, new_storage(), &l); + let r2 = new_test_raft(2, vec![1, 2, 3], 10, 1, new_storage(), &l); + let r3 = new_test_raft(3, vec![1, 2, 3], 10, 1, new_storage(), &l); + + let mut nt = Network::new(vec![Some(r1), Some(r2), Some(r3)], &l); + + // elect r1 as leader + nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + let mut test_entries = Entry::default(); + test_entries.data = (b"testdata" as &'static [u8]).into(); + + // send readindex request to r2(follower) + let _ = nt.peers.get_mut(&2).unwrap().step(new_message_with_entries( + 2, + 2, + MessageType::MsgReadIndex, + vec![test_entries.clone()], + )); + + // verify r2(follower) forwards this message to r1(leader) with term not set + assert_eq!(nt.peers[&2].msgs.len(), 1); + let read_index_msg1 = + new_message_with_entries(2, 1, MessageType::MsgReadIndex, vec![test_entries.clone()]); + assert_eq!(read_index_msg1, nt.peers[&2].msgs[0]); + + // send readindex request to r3(follower) + let _ = nt.peers.get_mut(&3).unwrap().step(new_message_with_entries( + 3, + 3, + 
MessageType::MsgReadIndex, + vec![test_entries.clone()], + )); + + // verify r3(follower) forwards this message to r1(leader) with term not set as well. + assert_eq!(nt.peers[&3].msgs.len(), 1); + + let read_index_msg2 = + new_message_with_entries(3, 1, MessageType::MsgReadIndex, vec![test_entries.clone()]); + assert_eq!(nt.peers[&3].msgs[0], read_index_msg2); + + // now elect r3 as leader + nt.send(vec![new_message(3, 3, MessageType::MsgHup, 0)]); + + // let r1 steps the two messages previously we got from r2, r3 + let _ = nt.peers.get_mut(&1).unwrap().step(read_index_msg1); + let _ = nt.peers.get_mut(&1).unwrap().step(read_index_msg2); + + // verify r1(follower) forwards these messages again to r3(new leader) + assert_eq!(nt.peers[&1].msgs.len(), 2); + assert_eq!( + nt.peers[&1].msgs[0], + new_message_with_entries(2, 3, MessageType::MsgReadIndex, vec![test_entries.clone()]) + ); + assert_eq!( + nt.peers[&1].msgs[1], + new_message_with_entries(3, 3, MessageType::MsgReadIndex, vec![test_entries]) + ); +} + +/// Tests the configuration change mechanism. Each test case sends a configuration +/// change which is either simple or joint, verifies that it applies and that the +/// resulting ConfState matches expectations, and for joint configurations makes +/// sure that they are exited successfully. +#[test] +fn test_raw_node_propose_and_conf_change() { + let l = default_logger(); + let mut test_cases: Vec<(Box, _, _)> = vec![ + // V1 config change. + ( + Box::new(conf_change(ConfChangeType::AddNode, 2)), + conf_state(vec![1, 2], vec![]), + None, + ), + ]; + + // Proposing the same as a V2 change works just the same, without entering + // a joint config. + let single = new_conf_change_single(2, ConfChangeType::AddNode); + test_cases.push(( + Box::new(conf_change_v2(vec![single])), + conf_state(vec![1, 2], vec![]), + None, + )); + + // Ditto if we add it as a learner instead. 
+ let single = new_conf_change_single(2, ConfChangeType::AddLearnerNode); + test_cases.push(( + Box::new(conf_change_v2(vec![single])), + conf_state(vec![1], vec![2]), + None, + )); + + // We can ask explicitly for joint consensus if we want it. + let single = new_conf_change_single(2, ConfChangeType::AddLearnerNode); + let mut cc = conf_change_v2(vec![single]); + cc.set_transition(ConfChangeTransition::Explicit); + let cs = conf_state_v2(vec![1], vec![2], vec![1], vec![], false); + test_cases.push((Box::new(cc), cs, Some(conf_state(vec![1], vec![2])))); + + // Ditto, but with implicit transition (the harness checks this). + let single = new_conf_change_single(2, ConfChangeType::AddLearnerNode); + let mut cc = conf_change_v2(vec![single]); + cc.set_transition(ConfChangeTransition::Implicit); + let cs = conf_state_v2(vec![1], vec![2], vec![1], vec![], true); + test_cases.push((Box::new(cc), cs, Some(conf_state(vec![1], vec![2])))); + + // Add a new node and demote n1. This exercises the interesting case in + // which we really need joint config changes and also need LearnersNext. + let cc = conf_change_v2(vec![ + new_conf_change_single(2, ConfChangeType::AddNode), + new_conf_change_single(1, ConfChangeType::AddLearnerNode), + new_conf_change_single(3, ConfChangeType::AddLearnerNode), + ]); + let cs = conf_state_v2(vec![2], vec![3], vec![1], vec![1], true); + test_cases.push((Box::new(cc), cs, Some(conf_state(vec![2], vec![1, 3])))); + + // Ditto explicit. + let mut cc = conf_change_v2(vec![ + new_conf_change_single(2, ConfChangeType::AddNode), + new_conf_change_single(1, ConfChangeType::AddLearnerNode), + new_conf_change_single(3, ConfChangeType::AddLearnerNode), + ]); + cc.set_transition(ConfChangeTransition::Explicit); + let cs = conf_state_v2(vec![2], vec![3], vec![1], vec![1], false); + test_cases.push((Box::new(cc), cs, Some(conf_state(vec![2], vec![1, 3])))); + + // Ditto implicit. 
+ let mut cc = conf_change_v2(vec![ + new_conf_change_single(2, ConfChangeType::AddNode), + new_conf_change_single(1, ConfChangeType::AddLearnerNode), + new_conf_change_single(3, ConfChangeType::AddLearnerNode), + ]); + cc.set_transition(ConfChangeTransition::Implicit); + let cs = conf_state_v2(vec![2], vec![3], vec![1], vec![1], true); + test_cases.push((Box::new(cc), cs, Some(conf_state(vec![2], vec![1, 3])))); + + for (cc, exp, exp2) in test_cases { + let s = new_storage(); + let mut raw_node = new_raw_node(1, vec![1], 10, 1, s.clone(), &l); + raw_node.campaign().unwrap(); + let mut proposed = false; + let mut ccdata = vec![]; + // Propose the ConfChange, wait until it applies, save the resulting ConfState. + let mut cs = None; + while cs.is_none() { + let mut rd = raw_node.ready(); + s.wl().append(rd.entries()).unwrap(); + let mut handle_committed_entries = + |rn: &mut RawNode, committed_entries: Vec| { + for e in committed_entries { + if e.get_entry_type() == EntryType::EntryConfChange { + let mut cc = ConfChange::default(); + cc.merge_from_bytes(e.get_data()).unwrap(); + cs = Some(rn.apply_conf_change(&cc).unwrap()); + } else if e.get_entry_type() == EntryType::EntryConfChangeV2 { + let mut cc = ConfChangeV2::default(); + cc.merge_from_bytes(e.get_data()).unwrap(); + cs = Some(rn.apply_conf_change(&cc).unwrap()); + } + } + }; + handle_committed_entries(&mut raw_node, rd.take_committed_entries()); + let is_leader = rd.ss().is_some_and(|ss| ss.leader_id == raw_node.raft.id); + + let mut light_rd = raw_node.advance(rd); + handle_committed_entries(&mut raw_node, light_rd.take_committed_entries()); + raw_node.advance_apply(); + + // Once we are the leader, propose a command and a ConfChange. 
+ if !proposed && is_leader { + raw_node.propose(vec![], b"somedata".to_vec()).unwrap(); + + if let Some(v1) = cc.as_v1() { + ccdata = v1.write_to_bytes().unwrap(); + raw_node.propose_conf_change(vec![], v1.clone()).unwrap(); + } else { + let v2 = cc.as_v2().clone().into_owned(); + ccdata = v2.write_to_bytes().unwrap(); + raw_node.propose_conf_change(vec![], v2).unwrap(); + } + + proposed = true; + } + } + + // Check that the last index is exactly the conf change we put in, + // down to the bits. Note that this comes from the Storage, which + // will not reflect any unstable entries that we'll only be presented + // with in the next Ready. + let last_index = s.last_index().unwrap(); + let entries = s + .entries( + last_index - 1, + last_index + 1, + NO_LIMIT, + GetEntriesContext::empty(false), + ) + .unwrap(); + assert_eq!(entries.len(), 2); + assert_eq!(entries[0].get_data(), b"somedata"); + if cc.as_v1().is_some() { + assert_eq!(entries[1].get_entry_type(), EntryType::EntryConfChange); + } else { + assert_eq!(entries[1].get_entry_type(), EntryType::EntryConfChangeV2); + } + assert_eq!(ccdata, entries[1].get_data()); + assert_eq!(exp, cs.unwrap()); + + let conf_index = if cc.as_v2().enter_joint() == Some(true) { + // If this is an auto-leaving joint conf change, it will have + // appended the entry that auto-leaves, so add one to the last + // index that forms the basis of our expectations on + // pendingConfIndex. (Recall that lastIndex was taken from stable + // storage, but this auto-leaving entry isn't on stable storage + // yet). + last_index + 1 + } else { + last_index + }; + assert_eq!(conf_index, raw_node.raft.pending_conf_index); + + // Move the RawNode along. If the ConfChange was simple, nothing else + // should happen. Otherwise, we're in a joint state, which is either + // left automatically or not. If not, we add the proposal that leaves + // it manually. 
+ let mut rd = raw_node.ready(); + let mut context = vec![]; + if !exp.auto_leave { + assert!(rd.entries().is_empty()); + if exp2.is_none() { + continue; + } + context = b"manual".to_vec(); + let mut cc = conf_change_v2(vec![]); + cc.set_context(context.clone().into()); + raw_node.propose_conf_change(vec![], cc).unwrap(); + rd = raw_node.ready(); + } + // Check that the right ConfChange comes out. + assert_eq!(rd.entries().len(), 1); + assert_eq!( + rd.entries()[0].get_entry_type(), + EntryType::EntryConfChangeV2 + ); + let mut leave_cc = ConfChangeV2::default(); + leave_cc + .merge_from_bytes(rd.entries()[0].get_data()) + .unwrap(); + assert_eq!(context, leave_cc.get_context(), "{:?}", cc.as_v2()); + // Lie and pretend the ConfChange applied. It won't do so because now + // we require the joint quorum and we're only running one node. + let cs = raw_node.apply_conf_change(&leave_cc).unwrap(); + assert_eq!(cs, exp2.unwrap()); + } +} + +/// Tests the configuration change auto leave even leader lost leadership. +#[test] +fn test_raw_node_joint_auto_leave() { + let l = default_logger(); + + let single = new_conf_change_single(2, ConfChangeType::AddLearnerNode); + let mut test_cc = conf_change_v2(vec![single]); + test_cc.set_transition(ConfChangeTransition::Implicit); + let exp_cs = conf_state_v2(vec![1], vec![2], vec![1], vec![], true); + let exp_cs2 = conf_state(vec![1], vec![2]); + + let s = new_storage(); + let mut raw_node = new_raw_node(1, vec![1], 10, 1, s.clone(), &l); + raw_node.campaign().unwrap(); + let mut proposed = false; + let ccdata = test_cc.write_to_bytes().unwrap(); + // Propose the ConfChange, wait until it applies, save the resulting ConfState. 
+ let mut cs = None; + while cs.is_none() { + let mut rd = raw_node.ready(); + s.wl().append(rd.entries()).unwrap(); + let mut handle_committed_entries = + |rn: &mut RawNode, committed_entries: Vec| { + for e in committed_entries { + if e.get_entry_type() == EntryType::EntryConfChangeV2 { + let mut cc = ConfChangeV2::default(); + cc.merge_from_bytes(e.get_data()).unwrap(); + + // Force it step down. + let mut msg = new_message(1, 1, MessageType::MsgHeartbeatResponse, 0); + msg.term = rn.raft.term + 1; + rn.step(msg).unwrap(); + + cs = Some(rn.apply_conf_change(&cc).unwrap()); + } + } + }; + handle_committed_entries(&mut raw_node, rd.take_committed_entries()); + let is_leader = rd.ss().is_some_and(|ss| ss.leader_id == raw_node.raft.id); + + let mut light_rd = raw_node.advance(rd); + handle_committed_entries(&mut raw_node, light_rd.take_committed_entries()); + raw_node.advance_apply(); + + // Once we are the leader, propose a command and a ConfChange. + if !proposed && is_leader { + raw_node.propose(vec![], b"somedata".to_vec()).unwrap(); + raw_node + .propose_conf_change(vec![], test_cc.clone()) + .unwrap(); + proposed = true; + } + } + + // Check that the last index is exactly the conf change we put in, + // down to the bits. Note that this comes from the Storage, which + // will not reflect any unstable entries that we'll only be presented + // with in the next Ready. + let last_index = s.last_index().unwrap(); + let entries = s + .entries( + last_index - 1, + last_index + 1, + NO_LIMIT, + GetEntriesContext::empty(false), + ) + .unwrap(); + assert_eq!(entries.len(), 2); + assert_eq!(entries[0].get_data(), b"somedata"); + assert_eq!(entries[1].get_entry_type(), EntryType::EntryConfChangeV2); + assert_eq!(ccdata, entries[1].get_data()); + assert_eq!(exp_cs, cs.unwrap()); + assert_eq!(0, raw_node.raft.pending_conf_index); + + // Move the RawNode along. It should not leave joint because it's follower. 
+ let mut rd = raw_node.ready(); + assert!(rd.entries().is_empty()); + let _ = raw_node.advance(rd); + + // Make it leader again. It should leave joint automatically after moving apply index. + raw_node.campaign().unwrap(); + rd = raw_node.ready(); + s.wl().append(rd.entries()).unwrap(); + let _ = raw_node.advance(rd); + + rd = raw_node.ready(); + s.wl().append(rd.entries()).unwrap(); + + // Check that the right ConfChange comes out. + assert_eq!(rd.entries().len(), 1); + assert_eq!( + rd.entries()[0].get_entry_type(), + EntryType::EntryConfChangeV2 + ); + let mut leave_cc = ConfChangeV2::default(); + leave_cc + .merge_from_bytes(rd.entries()[0].get_data()) + .unwrap(); + assert!(leave_cc.get_context().is_empty()); + // Lie and pretend the ConfChange applied. It won't do so because now + // we require the joint quorum and we're only running one node. + let cs = raw_node.apply_conf_change(&leave_cc).unwrap(); + assert_eq!(cs, exp_cs2); +} + +/// Ensures that two proposes to add the same node should not affect the later propose +/// to add new node. 
+#[test] +fn test_raw_node_propose_add_duplicate_node() { + let l = default_logger(); + let s = new_storage(); + let mut raw_node = new_raw_node(1, vec![1], 10, 1, s.clone(), &l); + raw_node.campaign().expect(""); + loop { + let rd = raw_node.ready(); + s.wl().append(rd.entries()).unwrap(); + if rd.ss().is_some_and(|ss| ss.leader_id == raw_node.raft.id) { + let _ = raw_node.advance(rd); + break; + } + let _ = raw_node.advance(rd); + } + + let mut propose_conf_change_and_apply = |cc| { + raw_node.propose_conf_change(vec![], cc).expect(""); + let mut rd = raw_node.ready(); + s.wl().append(rd.entries()).expect(""); + let handle_committed_entries = + |rn: &mut RawNode, committed_entries: Vec| { + for e in committed_entries { + if e.get_entry_type() == EntryType::EntryConfChange { + let mut conf_change = ConfChange::default(); + conf_change.merge_from_bytes(&e.data).unwrap(); + rn.apply_conf_change(&conf_change).unwrap(); + } + } + }; + handle_committed_entries(&mut raw_node, rd.take_committed_entries()); + + let mut light_rd = raw_node.advance(rd); + handle_committed_entries(&mut raw_node, light_rd.take_committed_entries()); + raw_node.advance_apply(); + }; + + let cc1 = conf_change(ConfChangeType::AddNode, 1); + let ccdata1 = cc1.write_to_bytes().unwrap(); + propose_conf_change_and_apply(cc1.clone()); + + // try to add the same node again + propose_conf_change_and_apply(cc1); + + // the new node join should be ok + let cc2 = conf_change(ConfChangeType::AddNode, 2); + let ccdata2 = cc2.write_to_bytes().unwrap(); + propose_conf_change_and_apply(cc2); + + let last_index = s.last_index().unwrap(); + + // the last three entries should be: ConfChange cc1, cc1, cc2 + let mut entries = s + .entries( + last_index - 2, + last_index + 1, + None, + GetEntriesContext::empty(false), + ) + .unwrap(); + assert_eq!(entries.len(), 3); + assert_eq!(entries[0].take_data(), ccdata1); + assert_eq!(entries[2].take_data(), ccdata2); +} + +#[test] +fn test_raw_node_propose_add_learner_node() 
-> Result<()> { + let l = default_logger(); + let s = new_storage(); + let mut raw_node = new_raw_node(1, vec![1], 10, 1, s.clone(), &l); + let rd = raw_node.ready(); + must_cmp_ready(&rd, &None, &None, &[], &[], &None, true, true, false); + let _ = raw_node.advance(rd); + + raw_node.campaign().expect(""); + loop { + let rd = raw_node.ready(); + s.wl().append(rd.entries()).unwrap(); + if rd.ss().is_some_and(|ss| ss.leader_id == raw_node.raft.id) { + let _ = raw_node.advance(rd); + break; + } + let _ = raw_node.advance(rd); + } + + // propose add learner node and check apply state + let cc = conf_change(ConfChangeType::AddLearnerNode, 2); + raw_node.propose_conf_change(vec![], cc).expect(""); + + let rd = raw_node.ready(); + s.wl().append(rd.entries()).expect(""); + + let light_rd = raw_node.advance(rd); + + assert_eq!( + light_rd.committed_entries().len(), + 1, + "should committed the conf change entry" + ); + + let e = &light_rd.committed_entries()[0]; + assert_eq!(e.get_entry_type(), EntryType::EntryConfChange); + let mut conf_change = ConfChange::default(); + conf_change.merge_from_bytes(&e.data).unwrap(); + let conf_state = raw_node.apply_conf_change(&conf_change)?; + assert_eq!(conf_state.voters, vec![1]); + assert_eq!(conf_state.learners, vec![2]); + + Ok(()) +} + +/// Ensures that RawNode.read_index sends the MsgReadIndex message to the underlying +/// raft. It also ensures that ReadState can be read out. 
#[test]
fn test_raw_node_read_index() {
    let l = default_logger();
    // Expected read state: the request context round-trips, and the read
    // becomes serveable at commit index 2 (empty leader entry + nothing else).
    let wrequest_ctx = b"somedata".to_vec();
    let wrs = vec![ReadState {
        index: 2u64,
        request_ctx: wrequest_ctx.clone(),
    }];

    let s = new_storage();
    let mut raw_node = new_raw_node(1, vec![1], 10, 1, s.clone(), &l);
    raw_node.campaign().expect("");
    // Drive the ready loop until this single node elects itself leader.
    loop {
        let rd = raw_node.ready();
        s.wl().append(rd.entries()).unwrap();
        if rd.ss().is_some_and(|ss| ss.leader_id == raw_node.raft.id) {
            let _ = raw_node.advance(rd);

            // Once we are the leader, issue a read index request
            raw_node.read_index(wrequest_ctx);
            break;
        }
        let _ = raw_node.advance(rd);
    }

    // ensure the read_states can be read out
    assert!(!raw_node.raft.read_states.is_empty());
    assert!(raw_node.has_ready());
    let rd = raw_node.ready();
    assert_eq!(*rd.read_states(), wrs);
    s.wl().append(rd.entries()).expect("");
    let _ = raw_node.advance(rd);

    // ensure raft.read_states is reset after advance
    assert!(!raw_node.has_ready());
    assert!(raw_node.raft.read_states.is_empty());
}

/// Ensures that a node can be started correctly. Note that RawNode requires the
/// application to bootstrap the state, i.e. it does not accept peers and will not
/// create faux configuration change entries.
#[test]
fn test_raw_node_start() {
    let l = default_logger();
    let store = new_storage();
    let mut raw_node = new_raw_node(1, vec![1], 10, 1, store.clone(), &l);

    // A freshly bootstrapped node has nothing to report.
    let rd = raw_node.ready();
    must_cmp_ready(&rd, &None, &None, &[], &[], &None, true, true, false);
    let _ = raw_node.advance(rd);

    // Campaigning as a single voter wins immediately: expect a Leader soft
    // state, hard state (term 2, commit 1, vote 1) and the empty leader entry.
    raw_node.campaign().expect("");
    let rd = raw_node.ready();
    must_cmp_ready(
        &rd,
        &Some(soft_state(1, StateRole::Leader)),
        &Some(hard_state(2, 1, 1)),
        &[new_entry(2, 2, None)],
        &[],
        &None,
        true,
        true,
        true,
    );
    store.wl().append(rd.entries()).expect("");
    let light_rd = raw_node.advance(rd);
    assert_eq!(light_rd.commit_index(), Some(2));
    assert_eq!(*light_rd.committed_entries(), vec![new_entry(2, 2, None)]);
    assert!(!raw_node.has_ready());

    // A proposal produces exactly one new entry which commits right away.
    raw_node.propose(vec![], b"somedata".to_vec()).expect("");
    let rd = raw_node.ready();
    must_cmp_ready(
        &rd,
        &None,
        &None,
        &[new_entry(2, 3, SOME_DATA)],
        &[],
        &None,
        true,
        true,
        true,
    );
    store.wl().append(rd.entries()).expect("");
    let light_rd = raw_node.advance(rd);
    assert_eq!(light_rd.commit_index(), Some(3));
    assert_eq!(
        *light_rd.committed_entries(),
        vec![new_entry(2, 3, SOME_DATA)]
    );

    assert!(!raw_node.has_ready());
}

/// Ensures a node restarted from pre-existing storage reports no committed
/// entries beyond the persisted commit index (here: only entries[..1]).
#[test]
fn test_raw_node_restart() {
    let l = default_logger();
    let entries = vec![empty_entry(1, 1), new_entry(1, 2, Some("foo"))];

    let mut raw_node = {
        let store = new_storage();
        store.wl().set_hardstate(hard_state(1, 1, 0));
        store.wl().append(&entries).unwrap();
        new_raw_node(1, vec![], 10, 1, store, &l)
    };

    let rd = raw_node.ready();
    // Only the committed prefix (up to hard state commit = 1) is surfaced.
    must_cmp_ready(
        &rd,
        &None,
        &None,
        &[],
        &entries[..1],
        &None,
        true,
        true,
        false,
    );
    let _ = raw_node.advance(rd);
    assert!(!raw_node.has_ready());
}

/// Ensures a node restarted from a snapshot plus trailing entries surfaces
/// those entries as committed (commit index 3 is persisted in the hard state).
#[test]
fn test_raw_node_restart_from_snapshot() {
    let l = default_logger();
    let snap = new_snapshot(2, 1, vec![1, 2]);
    let entries = vec![new_entry(1, 3, Some("foo"))];

    let mut raw_node = {
        let store = new_storage();
        store.wl().apply_snapshot(snap).unwrap();
        store.wl().append(&entries).unwrap();
        store.wl().set_hardstate(hard_state(1, 3, 0));
        RawNode::new(&new_test_config(1, 10, 1), store, &l).unwrap()
    };

    let rd = raw_node.ready();
    must_cmp_ready(&rd, &None, &None, &[], &entries, &None, true, true, false);
    let _ = raw_node.advance(rd);
    assert!(!raw_node.has_ready());
}

// test_skip_bcast_commit ensures that empty commit message is not sent out
// when skip_bcast_commit is true.
#[test]
fn test_skip_bcast_commit() {
    let l = default_logger();
    let mut config = new_test_config(1, 10, 1);
    config.skip_bcast_commit = true;
    let s = MemStorage::new_with_conf_state((vec![1, 2, 3], vec![]));
    let r1 = new_test_raft_with_config(&config, s, &l);
    let r2 = new_test_raft(2, vec![1, 2, 3], 10, 1, new_storage(), &l);
    let r3 = new_test_raft(3, vec![1, 2, 3], 10, 1, new_storage(), &l);
    let mut nt = Network::new(vec![Some(r1), Some(r2), Some(r3)], &l);

    // elect r1 as leader
    nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]);

    // Without bcast commit, followers will not update their commit index immediately.
    let mut test_entries = Entry::default();
    test_entries.data = (b"testdata" as &'static [u8]).into();
    let msg = new_message_with_entries(1, 1, MessageType::MsgPropose, vec![test_entries]);
    nt.send(vec![msg.clone()]);
    assert_eq!(nt.peers[&1].raft_log.committed, 2);
    assert_eq!(nt.peers[&2].raft_log.committed, 1);
    assert_eq!(nt.peers[&3].raft_log.committed, 1);

    // After bcast heartbeat, followers will be informed the actual commit index.
    for _ in 0..nt.peers[&1].randomized_election_timeout() {
        nt.peers.get_mut(&1).unwrap().tick();
    }
    nt.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]);
    assert_eq!(nt.peers[&2].raft_log.committed, 2);
    assert_eq!(nt.peers[&3].raft_log.committed, 2);

    // The feature should be able to be adjusted at run time.
    nt.peers.get_mut(&1).unwrap().skip_bcast_commit(false);
    nt.send(vec![msg.clone()]);
    assert_eq!(nt.peers[&1].raft_log.committed, 3);
    assert_eq!(nt.peers[&2].raft_log.committed, 3);
    assert_eq!(nt.peers[&3].raft_log.committed, 3);

    nt.peers.get_mut(&1).unwrap().skip_bcast_commit(true);

    // Later proposal should commit former proposal.
    nt.send(vec![msg.clone()]);
    nt.send(vec![msg]);
    assert_eq!(nt.peers[&1].raft_log.committed, 5);
    assert_eq!(nt.peers[&2].raft_log.committed, 4);
    assert_eq!(nt.peers[&3].raft_log.committed, 4);

    // When committing conf change, leader should always bcast commit.
    let mut cc = ConfChange::default();
    cc.set_change_type(ConfChangeType::RemoveNode);
    cc.node_id = 3;
    let data = cc.write_to_bytes().unwrap();
    let mut cc_entry = Entry::default();
    cc_entry.set_entry_type(EntryType::EntryConfChange);
    cc_entry.data = data.into();
    nt.send(vec![new_message_with_entries(
        1,
        1,
        MessageType::MsgPropose,
        vec![cc_entry],
    )]);
    assert!(nt.peers[&1].should_bcast_commit());
    assert!(nt.peers[&2].should_bcast_commit());
    assert!(nt.peers[&3].should_bcast_commit());

    assert_eq!(nt.peers[&1].raft_log.committed, 6);
    assert_eq!(nt.peers[&2].raft_log.committed, 6);
    assert_eq!(nt.peers[&3].raft_log.committed, 6);
}

/// test_set_priority checks the set_priority function in RawNode.
#[test]
fn test_set_priority() {
    let l = default_logger();
    let mut raw_node = new_raw_node(1, vec![1], 10, 1, new_storage(), &l);
    let priorities = vec![0, 1, 5, 10, 10000];
    for p in priorities {
        raw_node.set_priority(p);
        assert_eq!(raw_node.raft.priority, p);
    }
}

// TestNodeBoundedLogGrowthWithPartition tests a scenario where a leader is
// partitioned from a quorum of nodes. It verifies that the leader's log is
// protected from unbounded growth even as new entries continue to be proposed.
// This protection is provided by the max_uncommitted_size configuration.
+#[test] +fn test_bounded_uncommitted_entries_growth_with_partition() { + let l = default_logger(); + let config = &Config { + id: 1, + max_uncommitted_size: 12, + ..Config::default() + }; + let s = new_storage(); + let mut raw_node = new_raw_node_with_config(vec![1], config, s.clone(), &l); + + // wait raw_node to be leader + raw_node.campaign().unwrap(); + loop { + let rd = raw_node.ready(); + s.wl().set_hardstate(rd.hs().unwrap().clone()); + s.wl().append(rd.entries()).unwrap(); + if rd + .ss() + .is_some_and(|ss| ss.leader_id == raw_node.raft.leader_id) + { + let _ = raw_node.advance(rd); + break; + } + + let _ = raw_node.advance(rd); + } + + // should be accepted + let data = b"hello world!"; + raw_node.propose(vec![], data.to_vec()).unwrap(); + + // shoule be dropped + let result = raw_node.propose(vec![], data.to_vec()); + assert_eq!(result.unwrap_err(), Error::ProposalDropped); + + // should be accepted when previous data has been committed + let rd = raw_node.ready(); + s.wl().append(rd.entries()).unwrap(); + let _ = raw_node.advance(rd); + + let data = b"hello world!".to_vec(); + raw_node.propose(vec![], data).unwrap(); +} + +fn prepare_async_entries(raw_node: &mut RawNode, s: &MemStorage) { + raw_node.raft.become_candidate(); + raw_node.raft.become_leader(); + + let rd = raw_node.ready(); + s.wl().append(rd.entries()).unwrap(); + let _ = raw_node.advance(rd); + + let data: Vec = vec![1; 1000]; + for _ in 0..10 { + raw_node.propose(vec![], data.to_vec()).unwrap(); + } + + let rd = raw_node.ready(); + let entries = rd.entries().clone(); + assert_eq!(entries.len(), 10); + s.wl().append(&entries).unwrap(); + let msgs = rd.messages(); + // First append has two entries: the empty entry to confirm the + // election, and the first proposal (only one proposal gets sent + // because we're in probe state). 
+ assert_eq!(msgs.len(), 1); + assert_eq!(msgs[0].msg_type, MessageType::MsgAppend); + assert_eq!(msgs[0].entries.len(), 2); + let _ = raw_node.advance_append(rd); + + s.wl().trigger_log_unavailable(true); + + // Become replicate state + let mut append_response = new_message(2, 1, MessageType::MsgAppendResponse, 0); + append_response.set_term(2); + append_response.set_index(2); + raw_node.step(append_response).unwrap(); +} + +// Test entries are handled properly when they are fetched asynchronously +#[test] +fn test_raw_node_with_async_entries() { + let l = default_logger(); + let mut cfg = new_test_config(1, 10, 1); + cfg.max_size_per_msg = 2048; + let s = new_storage(); + let mut raw_node = new_raw_node_with_config(vec![1, 2], &cfg, s.clone(), &l); + + prepare_async_entries(&mut raw_node, &s); + + // No entries are sent because the entries are temporarily unavailable + let rd = raw_node.ready(); + let entries = rd.entries().clone(); + s.wl().append(&entries).unwrap(); + let msgs = rd.messages(); + assert_eq!(msgs.len(), 0); + let _ = raw_node.advance_append(rd); + + // Entries are sent when the entries are ready which is informed by `on_entries_fetched`. + s.wl().trigger_log_unavailable(false); + let context = s.wl().take_get_entries_context().unwrap(); + raw_node.on_entries_fetched(context); + let rd = raw_node.ready(); + let entries = rd.entries().clone(); + s.wl().append(&entries).unwrap(); + let msgs = rd.messages(); + assert_eq!(msgs.len(), 5); + assert_eq!(msgs[0].msg_type, MessageType::MsgAppend); + assert_eq!(msgs[0].entries.len(), 2); + let _ = raw_node.advance_append(rd); +} + +// Test if async fetch entries works well when there is a remove node conf-change. 
+#[test] +fn test_raw_node_with_async_entries_to_removed_node() { + let l = default_logger(); + let mut cfg = new_test_config(1, 10, 1); + cfg.max_size_per_msg = 2048; + let s = new_storage(); + let mut raw_node = new_raw_node_with_config(vec![1, 2], &cfg, s.clone(), &l); + + prepare_async_entries(&mut raw_node, &s); + + raw_node.apply_conf_change(&remove_node(2)).unwrap(); + + // Entries are not sent due to the node is removed. + s.wl().trigger_log_unavailable(false); + let context = s.wl().take_get_entries_context().unwrap(); + raw_node.on_entries_fetched(context); + let rd = raw_node.ready(); + assert_eq!(rd.entries().len(), 0); + assert_eq!(rd.messages().len(), 0); + let _ = raw_node.advance_append(rd); +} + +// Test if async fetch entries works well when there is a leader step-down. +#[test] +fn test_raw_node_with_async_entries_on_follower() { + let l = default_logger(); + let mut cfg = new_test_config(1, 10, 1); + cfg.max_size_per_msg = 2048; + let s = new_storage(); + let mut raw_node = new_raw_node_with_config(vec![1, 2], &cfg, s.clone(), &l); + + prepare_async_entries(&mut raw_node, &s); + + // Set recent inactive to step down leader + raw_node.raft.mut_prs().get_mut(2).unwrap().recent_active = false; + let mut msg = Message::new(); + msg.set_to(1); + msg.set_msg_type(MessageType::MsgCheckQuorum); + raw_node.raft.step(msg).unwrap(); + assert_ne!(raw_node.raft.state, StateRole::Leader); + + // Entries are not sent due to the leader is changed. 
+ s.wl().trigger_log_unavailable(false); + let context = s.wl().take_get_entries_context().unwrap(); + raw_node.on_entries_fetched(context); + let rd = raw_node.ready(); + assert_eq!(rd.entries().len(), 0); + assert_eq!(rd.messages().len(), 0); + let _ = raw_node.advance_append(rd); +} + +#[test] +fn test_raw_node_async_entries_with_leader_change() { + let l = default_logger(); + let mut cfg = new_test_config(1, 10, 1); + cfg.max_size_per_msg = 2048; + let s = new_storage(); + let mut raw_node = new_raw_node_with_config(vec![1, 2], &cfg, s.clone(), &l); + + raw_node.raft.become_candidate(); + raw_node.raft.become_leader(); + + let rd = raw_node.ready(); + s.wl().append(rd.entries()).unwrap(); + let _ = raw_node.advance(rd); + + let data: Vec = vec![1; 1000]; + for _ in 0..10 { + raw_node.propose(vec![], data.to_vec()).unwrap(); + } + + let rd = raw_node.ready(); + let entries = rd.entries().clone(); + assert_eq!(entries.len(), 10); + s.wl().append(&entries).unwrap(); + let msgs = rd.messages(); + // First append has two entries: the empty entry to confirm the + // election, and the first proposal (only one proposal gets sent + // because we're in probe state). + assert_eq!(msgs.len(), 1); + assert_eq!(msgs[0].msg_type, MessageType::MsgAppend); + assert_eq!(msgs[0].entries.len(), 2); + let _ = raw_node.advance_append(rd); + + s.wl().trigger_log_unavailable(true); + + // Become replicate state + let mut append_response = new_message(2, 1, MessageType::MsgAppendResponse, 0); + append_response.set_term(2); + append_response.set_index(2); + raw_node.step(append_response).unwrap(); + + raw_node.raft.become_follower(raw_node.raft.term + 1, 2); + raw_node.raft.become_candidate(); + raw_node.raft.become_leader(); + + // Entries are not sent due to the leadership or the term is changed. 
+ s.wl().trigger_log_unavailable(false); + let context = s.wl().take_get_entries_context().unwrap(); + raw_node.on_entries_fetched(context); + let rd = raw_node.ready(); + assert_eq!(rd.entries().len(), 1); // no-op entry + assert_eq!(rd.messages().len(), 0); + let _ = raw_node.advance_append(rd); +} + +#[test] +fn test_raw_node_with_async_apply() { + let l = default_logger(); + let s = new_storage(); + s.wl().apply_snapshot(new_snapshot(1, 1, vec![1])).unwrap(); + + let mut raw_node = new_raw_node(1, vec![1], 10, 1, s.clone(), &l); + raw_node.campaign().unwrap(); + let rd = raw_node.ready(); + // Single node should become leader. + assert!(rd + .ss() + .is_some_and(|ss| ss.leader_id == raw_node.raft.leader_id)); + s.wl().append(rd.entries()).unwrap(); + let _ = raw_node.advance(rd); + + let mut last_index = raw_node.raft.raft_log.last_index(); + + let data = b"hello world!"; + + for _ in 1..10 { + let cnt = rand::random::() % 10 + 1; + for _ in 0..cnt { + raw_node.propose(vec![], data.to_vec()).unwrap(); + } + + let rd = raw_node.ready(); + let entries = rd.entries().clone(); + assert_eq!(entries.first().unwrap().get_index(), last_index + 1); + assert_eq!(entries.last().unwrap().get_index(), last_index + cnt); + must_cmp_ready(&rd, &None, &None, &entries, &[], &None, true, true, true); + + s.wl().append(&entries).unwrap(); + + let light_rd = raw_node.advance_append(rd); + assert_eq!(entries, *light_rd.committed_entries()); + assert_eq!(light_rd.commit_index(), Some(last_index + cnt)); + + // No matter how applied index changes, the index of next committed + // entries should be the same. + raw_node.advance_apply_to(last_index + 1); + assert!(!raw_node.has_ready()); + + last_index += cnt; + } +} + +/// Test if the ready process is expected when a follower receives a snapshot +/// and some committed entries after its snapshot. 
#[test]
fn test_raw_node_entries_after_snapshot() {
    let l = default_logger();
    let s = new_storage();
    s.wl()
        .apply_snapshot(new_snapshot(1, 1, vec![1, 2]))
        .unwrap();

    let mut raw_node = new_raw_node(1, vec![1, 2], 10, 1, s.clone(), &l);

    // Follower receives an append of entries 2..19 at term 2 with commit 5.
    let mut entries = vec![];
    for i in 2..20 {
        entries.push(new_entry(2, i, Some("hello")));
    }
    let mut append_msg = new_message_with_entries(2, 1, MessageType::MsgAppend, entries.to_vec());
    append_msg.set_term(2);
    append_msg.set_index(1);
    append_msg.set_log_term(1);
    append_msg.set_commit(5);
    raw_node.step(append_msg).unwrap();

    let rd = raw_node.ready();
    must_cmp_ready(
        &rd,
        &Some(soft_state(2, StateRole::Follower)),
        &Some(hard_state(2, 5, 0)),
        &entries,
        &[],
        &None,
        true,
        false,
        true,
    );
    s.wl().set_hardstate(rd.hs().unwrap().clone());
    s.wl().append(rd.entries()).unwrap();
    let light_rd = raw_node.advance(rd);
    assert_eq!(light_rd.commit_index(), None);
    // Entries 2..=5 (the committed prefix) come out as committed.
    assert_eq!(light_rd.committed_entries().as_slice(), &entries[..4]);
    assert!(light_rd.messages().is_empty());

    // Now a snapshot at index 10 / term 3 arrives, followed by entries after it.
    let snapshot = new_snapshot(10, 3, vec![1, 2]);
    let mut snapshot_msg = new_message(2, 1, MessageType::MsgSnapshot, 0);
    snapshot_msg.set_term(3);
    snapshot_msg.set_snapshot(snapshot.clone());
    raw_node.step(snapshot_msg).unwrap();

    let mut entries = vec![];
    for i in 11..14 {
        entries.push(new_entry(3, i, Some("hello")));
    }
    let mut append_msg = new_message_with_entries(2, 1, MessageType::MsgAppend, entries.to_vec());
    append_msg.set_term(3);
    append_msg.set_index(10);
    append_msg.set_log_term(3);
    append_msg.set_commit(12);
    raw_node.step(append_msg).unwrap();

    let rd = raw_node.ready();
    // If there is a snapshot, the committed entries should be empty.
    must_cmp_ready(
        &rd,
        &None,
        &Some(hard_state(3, 12, 0)),
        &entries,
        &[],
        &Some(snapshot),
        true,
        false,
        true,
    );
    // Should have a MsgAppendResponse
    assert_eq!(
        rd.persisted_messages()[0].get_msg_type(),
        MessageType::MsgAppendResponse
    );
    s.wl().set_hardstate(rd.hs().unwrap().clone());
    s.wl().apply_snapshot(rd.snapshot().clone()).unwrap();
    s.wl().append(rd.entries()).unwrap();

    let light_rd = raw_node.advance(rd);
    assert_eq!(light_rd.commit_index(), None);
    // Entries 11..=12 (up to the new commit index) come out as committed.
    assert_eq!(light_rd.committed_entries().as_slice(), &entries[..2]);
    assert!(light_rd.messages().is_empty());
}

/// Test if the given committed entries are persisted when some persisted
/// entries are overwritten by a new leader.
#[test]
fn test_raw_node_overwrite_entries() {
    let l = default_logger();
    let s = new_storage();
    s.wl()
        .apply_snapshot(new_snapshot(1, 1, vec![1, 2, 3]))
        .unwrap();

    let mut raw_node = new_raw_node(1, vec![1, 2, 3], 10, 1, s.clone(), &l);

    // First leader (term 2) appends entries 2..=4 with commit 1.
    let entries = [
        new_entry(2, 2, Some("hello")),
        new_entry(2, 3, Some("hello")),
        new_entry(2, 4, Some("hello")),
    ];
    let mut append_msg = new_message_with_entries(2, 1, MessageType::MsgAppend, entries.to_vec());
    append_msg.set_term(2);
    append_msg.set_index(1);
    append_msg.set_log_term(1);
    append_msg.set_commit(1);
    raw_node.step(append_msg).unwrap();

    let rd = raw_node.ready();
    must_cmp_ready(
        &rd,
        &Some(soft_state(2, StateRole::Follower)),
        &Some(hard_state(2, 1, 0)),
        &entries,
        &[],
        &None,
        true,
        false,
        true,
    );
    // Should have a MsgAppendResponse
    assert_eq!(
        rd.persisted_messages()[0].get_msg_type(),
        MessageType::MsgAppendResponse
    );
    s.wl().set_hardstate(rd.hs().unwrap().clone());
    s.wl().append(rd.entries()).unwrap();

    let light_rd = raw_node.advance(rd);
    assert_eq!(light_rd.commit_index(), None);
    assert!(light_rd.committed_entries().is_empty());
    assert!(light_rd.messages().is_empty());

    // New leader (term 3) overwrites from index 4 and advances commit to 5:
    // the surviving term-2 entries (2 and 3) must be reported as committed.
    let entries_2 = [
        new_entry(3, 4, Some("hello")),
        new_entry(3, 5, Some("hello")),
        new_entry(3, 6, Some("hello")),
    ];
    let mut append_msg = new_message_with_entries(3, 1, MessageType::MsgAppend, entries_2.to_vec());
    append_msg.set_term(3);
    append_msg.set_index(3);
    append_msg.set_log_term(2);
    append_msg.set_commit(5);
    raw_node.step(append_msg).unwrap();

    let rd = raw_node.ready();
    must_cmp_ready(
        &rd,
        &Some(soft_state(3, StateRole::Follower)),
        &Some(hard_state(3, 5, 0)),
        &entries_2,
        &entries[..2],
        &None,
        true,
        false,
        true,
    );
    // Should have a MsgAppendResponse
    assert_eq!(
        rd.persisted_messages()[0].get_msg_type(),
        MessageType::MsgAppendResponse
    );
    s.wl().set_hardstate(rd.hs().unwrap().clone());
    s.wl().append(rd.entries()).unwrap();

    let light_rd = raw_node.advance(rd);
    assert_eq!(light_rd.commit_index(), None);
    assert_eq!(light_rd.committed_entries().as_slice(), &entries_2[..2]);
    assert!(light_rd.messages().is_empty());
}

/// Test if async ready process is expected when a leader receives
/// the append response and persist its entries.
+#[test] +fn test_async_ready_leader() { + let l = default_logger(); + let s = new_storage(); + s.wl() + .apply_snapshot(new_snapshot(1, 1, vec![1, 2, 3])) + .unwrap(); + + let mut raw_node = new_raw_node(1, vec![1, 2, 3], 10, 1, s.clone(), &l); + raw_node.raft.become_candidate(); + raw_node.raft.become_leader(); + let rd = raw_node.ready(); + assert!(rd + .ss() + .is_some_and(|ss| ss.leader_id == raw_node.raft.leader_id)); + s.wl().append(rd.entries()).unwrap(); + let _ = raw_node.advance(rd); + + assert_eq!(raw_node.raft.term, 2); + let mut first_index = raw_node.raft.raft_log.last_index(); + + let data = b"hello world!"; + + // Set node 2 progress to replicate + raw_node.raft.mut_prs().get_mut(2).unwrap().matched = 1; + raw_node + .raft + .mut_prs() + .get_mut(2) + .unwrap() + .become_replicate(); + for i in 0..10 { + for _ in 0..10 { + raw_node.propose(vec![], data.to_vec()).unwrap(); + } + + let mut rd = raw_node.ready(); + assert_eq!(rd.number(), i + 2); + let entries = rd.entries().clone(); + assert_eq!( + entries.first().unwrap().get_index(), + first_index + i * 10 + 1 + ); + assert_eq!( + entries.last().unwrap().get_index(), + first_index + i * 10 + 10 + ); + // Leader‘s msg can be sent immediately. 
+ must_cmp_ready(&rd, &None, &None, &entries, &[], &None, false, true, true); + for msg in rd.take_messages() { + assert_eq!(msg.get_msg_type(), MessageType::MsgAppend); + } + + s.wl().append(&entries).unwrap(); + raw_node.advance_append_async(rd); + } + // Unpersisted Ready number in range [2, 11] + raw_node.on_persist_ready(4); + // No new committed entries due to two nodes in this cluster + assert!(!raw_node.has_ready()); + + // The index of uncommitted entries in range [first_index, first_index + 100] + let mut append_response = new_message(2, 1, MessageType::MsgAppendResponse, 0); + append_response.set_term(2); + append_response.set_index(first_index + 100); + + raw_node.step(append_response).unwrap(); + + // Forward commit index due to append response + let rd = raw_node.ready(); + assert_eq!(rd.hs(), Some(&hard_state(2, first_index + 30, 1))); + assert_eq!( + rd.committed_entries().first().unwrap().get_index(), + first_index + ); + assert_eq!( + rd.committed_entries().last().unwrap().get_index(), + first_index + 30 + ); + assert!(!rd.messages().is_empty()); + s.wl().set_hardstate(rd.hs().unwrap().clone()); + raw_node.advance_append_async(rd); + + // Forward commit index due to persist ready + raw_node.on_persist_ready(8); + let rd = raw_node.ready(); + assert_eq!(rd.hs(), Some(&hard_state(2, first_index + 70, 1))); + assert_eq!( + rd.committed_entries().first().unwrap().get_index(), + first_index + 31 + ); + assert_eq!( + rd.committed_entries().last().unwrap().get_index(), + first_index + 70 + ); + assert!(!rd.messages().is_empty()); + assert!(rd.persisted_messages().is_empty()); + s.wl().set_hardstate(rd.hs().unwrap().clone()); + + // Forward commit index due to persist last ready + let light_rd = raw_node.advance_append(rd); + assert_eq!(light_rd.commit_index(), Some(first_index + 100)); + assert_eq!( + light_rd.committed_entries().first().unwrap().get_index(), + first_index + 71 + ); + assert_eq!( + 
light_rd.committed_entries().last().unwrap().get_index(), + first_index + 100 + ); + assert!(!light_rd.messages().is_empty()); + + // Test when 2 followers response the append entries msg and leader has + // not persisted them yet. + first_index += 100; + for _ in 0..10 { + raw_node.propose(vec![], data.to_vec()).unwrap(); + } + + let mut rd = raw_node.ready(); + assert_eq!(rd.number(), 14); + let entries = rd.entries().clone(); + assert_eq!(entries.first().unwrap().get_index(), first_index + 1); + assert_eq!(entries.last().unwrap().get_index(), first_index + 10); + // Leader‘s msg can be sent immediately. + must_cmp_ready(&rd, &None, &None, &entries, &[], &None, false, true, true); + for msg in rd.take_messages() { + assert_eq!(msg.get_msg_type(), MessageType::MsgAppend); + } + s.wl().append(&entries).unwrap(); + raw_node.advance_append_async(rd); + + let mut append_response = new_message(2, 1, MessageType::MsgAppendResponse, 0); + append_response.set_term(2); + append_response.set_index(first_index + 9); + + raw_node.step(append_response).unwrap(); + + let mut append_response = new_message(3, 1, MessageType::MsgAppendResponse, 0); + append_response.set_term(2); + append_response.set_index(first_index + 10); + + raw_node.step(append_response).unwrap(); + + let mut rd = raw_node.ready(); + // It should has some append msgs and its commit index should be first_index + 9. 
+ must_cmp_ready( + &rd, + &None, + &Some(hard_state(2, first_index + 9, 1)), + &[], + &[], + &None, + false, + true, + false, + ); + for msg in rd.take_messages() { + assert_eq!(msg.get_msg_type(), MessageType::MsgAppend); + assert_eq!(msg.get_commit(), first_index + 9); + } + + // Forward commit index due to peer 1's append response and persisted entries + let light_rd = raw_node.advance_append(rd); + assert_eq!(light_rd.commit_index(), Some(first_index + 10)); + assert_eq!( + light_rd.committed_entries().first().unwrap().get_index(), + first_index + 1 + ); + assert_eq!( + light_rd.committed_entries().last().unwrap().get_index(), + first_index + 10 + ); + assert!(!light_rd.messages().is_empty()); +} + +/// Test if async ready process is expected when a follower receives +/// some append msg and snapshot. +#[test] +fn test_async_ready_follower() { + let l = default_logger(); + let s = new_storage(); + s.wl() + .apply_snapshot(new_snapshot(1, 1, vec![1, 2])) + .unwrap(); + + let mut raw_node = new_raw_node(1, vec![1, 2], 10, 1, s.clone(), &l); + let mut first_index = 1; + let mut rd_number = 0; + for cnt in 0..3 { + for i in 0..10 { + let entries = [ + new_entry(2, first_index + i * 3 + 1, Some("hello")), + new_entry(2, first_index + i * 3 + 2, Some("hello")), + new_entry(2, first_index + i * 3 + 3, Some("hello")), + ]; + let mut append_msg = + new_message_with_entries(2, 1, MessageType::MsgAppend, entries.to_vec()); + append_msg.set_term(2); + append_msg.set_index(first_index + i * 3); + if cnt == 0 && i == 0 { + append_msg.set_log_term(1); + } else { + append_msg.set_log_term(2); + } + append_msg.set_commit(first_index + i * 3 + 3); + raw_node.step(append_msg).unwrap(); + + let rd = raw_node.ready(); + assert_eq!(rd.number(), rd_number + i + 1); + assert_eq!(rd.hs(), Some(&hard_state(2, first_index + i * 3 + 3, 0))); + assert_eq!(rd.entries(), &entries); + assert_eq!(rd.committed_entries().as_slice(), &[]); + assert!(rd.messages().is_empty()); + assert_eq!( + 
rd.persisted_messages()[0].get_msg_type(), + MessageType::MsgAppendResponse + ); + + s.wl().set_hardstate(rd.hs().unwrap().clone()); + s.wl().append(rd.entries()).unwrap(); + raw_node.advance_append_async(rd); + } + // Unpersisted Ready number in range [1, 10] + raw_node.on_persist_ready(rd_number + 4); + let rd = raw_node.ready(); + assert_eq!(rd.hs(), None); + assert_eq!( + rd.committed_entries().first().unwrap().get_index(), + first_index + 1 + ); + assert_eq!( + rd.committed_entries().last().unwrap().get_index(), + first_index + 3 * 3 + 3 + ); + assert!(rd.messages().is_empty()); + assert!(rd.persisted_messages().is_empty()); + + let light_rd = raw_node.advance_append(rd); + assert_eq!(light_rd.commit_index(), None); + assert_eq!( + light_rd.committed_entries().first().unwrap().get_index(), + first_index + 3 * 3 + 4 + ); + assert_eq!( + light_rd.committed_entries().last().unwrap().get_index(), + first_index + 10 * 3 + ); + assert!(light_rd.messages().is_empty()); + + first_index += 10 * 3; + rd_number += 11; + } + + let snapshot = new_snapshot(first_index + 5, 2, vec![1, 2]); + let mut snapshot_msg = new_message(2, 1, MessageType::MsgSnapshot, 0); + snapshot_msg.set_term(2); + snapshot_msg.set_snapshot(snapshot.clone()); + raw_node.step(snapshot_msg).unwrap(); + + let rd = raw_node.ready(); + assert_eq!(rd.number(), rd_number + 1); + must_cmp_ready( + &rd, + &None, + &Some(hard_state(2, first_index + 5, 0)), + &[], + &[], + &Some(snapshot.clone()), + true, + false, + true, + ); + + s.wl().set_hardstate(rd.hs().unwrap().clone()); + s.wl().apply_snapshot(snapshot).unwrap(); + s.wl().append(rd.entries()).unwrap(); + raw_node.advance_append_async(rd); + + let mut entries = vec![]; + for i in 1..10 { + entries.push(new_entry(2, first_index + 5 + i, Some("hello"))); + } + let mut append_msg = new_message_with_entries(2, 1, MessageType::MsgAppend, entries.to_vec()); + append_msg.set_term(2); + append_msg.set_index(first_index + 5); + append_msg.set_log_term(2); + 
append_msg.set_commit(first_index + 5 + 3); + raw_node.step(append_msg).unwrap(); + + let rd = raw_node.ready(); + assert_eq!(rd.number(), rd_number + 2); + must_cmp_ready( + &rd, + &None, + &Some(hard_state(2, first_index + 5 + 3, 0)), + &entries, + &[], + &None, + true, + false, + true, + ); + s.wl().set_hardstate(rd.hs().unwrap().clone()); + s.wl().append(rd.entries()).unwrap(); + raw_node.advance_append_async(rd); + + raw_node.on_persist_ready(rd_number + 1); + assert_eq!(raw_node.raft.raft_log.persisted, first_index + 5); + raw_node.advance_apply_to(first_index + 5); + + raw_node.on_persist_ready(rd_number + 2); + + let rd = raw_node.ready(); + must_cmp_ready( + &rd, + &None, + &None, + &[], + &entries[..3], + &None, + true, + true, + false, + ); +} + +/// Test if a new leader immediately sends all messages recorded before without +/// persisting. +#[test] +fn test_async_ready_become_leader() { + let l = default_logger(); + let s = new_storage(); + s.wl() + .apply_snapshot(new_snapshot(5, 5, vec![1, 2, 3])) + .unwrap(); + + let mut raw_node = new_raw_node(1, vec![1, 2, 3], 10, 1, s.clone(), &l); + for _ in 1..raw_node.raft.election_timeout() * 2 { + raw_node.raft.tick_election(); + } + let rd = raw_node.ready(); + assert_eq!(rd.number(), 1); + must_cmp_ready( + &rd, + &Some(soft_state(0, StateRole::Candidate)), + &Some(hard_state(6, 5, 1)), + &[], + &[], + &None, + true, + false, + true, + ); + s.wl().set_hardstate(rd.hs().unwrap().clone()); + + for msg in rd.persisted_messages() { + assert_eq!(msg.get_msg_type(), MessageType::MsgRequestVote); + } + + let _ = raw_node.advance_append(rd); + + // Peer 1 should reject to vote to peer 2 + let mut vote_request_2 = new_message(2, 1, MessageType::MsgRequestVote, 0); + vote_request_2.set_term(6); + vote_request_2.set_log_term(4); + vote_request_2.set_index(4); + raw_node.step(vote_request_2).unwrap(); + + let rd = raw_node.ready(); + assert_eq!(rd.number(), 2); + must_cmp_ready(&rd, &None, &None, &[], &[], &None, 
true, false, false); + assert_eq!( + rd.persisted_messages()[0].get_msg_type(), + MessageType::MsgRequestVoteResponse + ); + raw_node.advance_append_async(rd); + + // Peer 1 should reject to vote to peer 3 + let mut vote_request_3 = new_message(3, 1, MessageType::MsgRequestVote, 0); + vote_request_3.set_term(6); + vote_request_3.set_log_term(4); + vote_request_3.set_index(4); + raw_node.step(vote_request_3).unwrap(); + + let rd = raw_node.ready(); + assert_eq!(rd.number(), 3); + must_cmp_ready(&rd, &None, &None, &[], &[], &None, true, false, false); + assert_eq!( + rd.persisted_messages()[0].get_msg_type(), + MessageType::MsgRequestVoteResponse + ); + raw_node.advance_append_async(rd); + + // Peer 1 receives the vote from peer 2 + let mut vote_response_2 = new_message(2, 1, MessageType::MsgRequestVoteResponse, 0); + vote_response_2.set_term(6); + vote_response_2.set_reject(false); + raw_node.step(vote_response_2).unwrap(); + + let mut rd = raw_node.ready(); + assert_eq!(rd.number(), 4); + assert_eq!(rd.entries().len(), 1); + must_cmp_ready( + &rd, + &Some(soft_state(1, StateRole::Leader)), + &None, + rd.entries(), + &[], + &None, + false, + true, + true, + ); + assert_eq!(rd.messages().len(), 2); + for msg in rd.take_messages().iter() { + assert_eq!(msg.get_msg_type(), MessageType::MsgAppend); + } + + s.wl().append(rd.entries()).unwrap(); + + s.wl().append(rd.entries()).unwrap(); + + let light_rd = raw_node.advance_append(rd); + assert_eq!(light_rd.commit_index(), None); + assert!(light_rd.committed_entries().is_empty()); + assert!(light_rd.messages().is_empty()); +} + +#[test] +fn test_async_ready_multiple_snapshot() { + let l = default_logger(); + let s = new_storage(); + s.wl() + .apply_snapshot(new_snapshot(1, 1, vec![1, 2])) + .unwrap(); + + let mut raw_node = new_raw_node(1, vec![1, 2], 10, 1, s.clone(), &l); + + let snapshot = new_snapshot(10, 2, vec![1, 2]); + let mut snapshot_msg = new_message(2, 1, MessageType::MsgSnapshot, 0); + snapshot_msg.set_term(2); 
+ snapshot_msg.set_snapshot(snapshot.clone()); + raw_node.step(snapshot_msg).unwrap(); + + let mut entries = vec![]; + for i in 11..14 { + entries.push(new_entry(2, i, Some("hello"))); + } + let mut append_msg = new_message_with_entries(2, 1, MessageType::MsgAppend, entries.to_vec()); + append_msg.set_term(2); + append_msg.set_index(10); + append_msg.set_log_term(2); + append_msg.set_commit(12); + raw_node.step(append_msg).unwrap(); + + let rd = raw_node.ready(); + assert_eq!(rd.number(), 1); + // If there is a snapshot, the committed entries should be empty. + must_cmp_ready( + &rd, + &Some(soft_state(2, StateRole::Follower)), + &Some(hard_state(2, 12, 0)), + &entries, + &[], + &Some(snapshot), + true, + false, + true, + ); + s.wl().set_hardstate(rd.hs().unwrap().clone()); + s.wl().apply_snapshot(rd.snapshot().clone()).unwrap(); + s.wl().append(rd.entries()).unwrap(); + + raw_node.advance_append_async(rd); + + let snapshot = new_snapshot(20, 1, vec![1, 2]); + let mut snapshot_msg = new_message(2, 1, MessageType::MsgSnapshot, 0); + snapshot_msg.set_term(2); + snapshot_msg.set_snapshot(snapshot.clone()); + raw_node.step(snapshot_msg).unwrap(); + + raw_node.on_persist_ready(1); + + assert_eq!(raw_node.raft.raft_log.persisted, 13); + + raw_node.advance_apply_to(10); + + let rd = raw_node.ready(); + assert_eq!(rd.number(), 2); + must_cmp_ready( + &rd, + &None, + &Some(hard_state(2, 20, 0)), + &[], + &[], + &Some(snapshot), + true, + false, + true, + ); + s.wl().set_hardstate(rd.hs().unwrap().clone()); + s.wl().apply_snapshot(rd.snapshot().clone()).unwrap(); + + let light_rd = raw_node.advance_append(rd); + assert_eq!(light_rd.commit_index(), None); + assert!(light_rd.committed_entries().is_empty()); + assert!(light_rd.messages().is_empty()); + + raw_node.advance_apply_to(20); +} + +#[test] +fn test_committed_entries_pagination() { + let l = default_logger(); + let s = new_storage(); + let mut raw_node = new_raw_node(1, vec![1, 2, 3], 10, 1, s, &l); + + let mut entries 
= vec![]; + for i in 2..10 { + entries.push(new_entry(1, i, None)); + } + let mut msg = new_message_with_entries(3, 1, MessageType::MsgAppend, entries.to_vec()); + msg.set_term(1); + msg.set_index(1); + msg.set_log_term(1); + msg.set_commit(9); + raw_node.step(msg).unwrap(); + + // Test unpersisted entries won't be fetched. + // NOTE: maybe it's better to allow fetching unpersisted committed entries. + let rd = raw_node.ready(); + assert!(rd.committed_entries().is_empty()); + assert!(raw_node.has_ready()); + + // Persist entries. + assert!(!rd.entries().is_empty()); + raw_node.store().wl().append(rd.entries()).unwrap(); + + // Advance the ready, and we can get committed_entries as expected. + // Test using 0 as `committed_entries_max_size` works as expected. + raw_node.raft.set_max_committed_size_per_ready(0); + let rd = raw_node.advance(rd); + // `MemStorage::entries` uses `util::limit_size` to limit size of committed entries. + // So there will be at least one entry. + assert_eq!(rd.committed_entries().len(), 1); + + // Fetch a `Ready` again without size limit for committed entries. + assert!(raw_node.has_ready()); + raw_node.raft.set_max_committed_size_per_ready(u64::MAX); + let rd = raw_node.ready(); + assert_eq!(rd.committed_entries().len(), 7); + + // No more `Ready`s. + assert!(!raw_node.has_ready()); +} + +/// Test with `commit_since_index`, committed entries can be fetched correctly after restart. +/// +/// Case steps: +/// - Node learns that index 10 is committed +/// - `next_entries` returns entries [2..11) in committed_entries (but index 10 already +/// exceeds maxBytes), which isn't noticed internally by Raft +/// - Commit index gets bumped to 10 +/// - The node persists the `HardState`, but crashes before applying the entries +/// - Upon restart, the storage returns the same entries, but `slice` takes a +/// different code path and removes the last entry. 
+/// - Raft does not emit a HardState, but when the app calls advance(), it bumps +/// its internal applied index cursor to 10 (when it should be 9) +/// - The next `Ready` asks the app to apply index 11 (omitting index 10), losing a +/// write. +#[test] +fn test_committed_entries_pagination_after_restart() { + let l = default_logger(); + let s = IgnoreSizeHintMemStorage::default(); + s.inner + .wl() + .apply_snapshot(new_snapshot(1, 1, vec![1, 2, 3])) + .unwrap(); + + let (mut entries, mut size) = (vec![], 0); + for i in 2..=10 { + let e = new_entry(1, i, Some("test data")); + size += e.compute_size() as u64; + entries.push(e); + } + s.inner.wl().append(&entries).unwrap(); + s.inner.wl().mut_hard_state().commit = 10; + + s.inner + .wl() + .append(&[new_entry(1, 11, Some("boom"))]) + .unwrap(); + + let config = new_test_config(1, 10, 1); + let mut raw_node = RawNode::new(&config, s, &l).unwrap(); + + // `IgnoreSizeHintMemStorage` will ignore `max_committed_size_per_ready` but + // `RaftLog::slice won't.` + raw_node.raft.set_max_committed_size_per_ready(size - 1); + + let mut highest_applied = 1; + while highest_applied != 11 { + let mut rd = raw_node.ready(); + let committed_entries = rd.take_committed_entries(); + let next = committed_entries.first().map(|x| x.index).unwrap(); + assert_eq!(highest_applied + 1, next); + + highest_applied = committed_entries.last().map(|x| x.index).unwrap(); + raw_node.raft.raft_log.commit_to(11); + } +} + +#[test] +fn test_disable_proposal_forwarding() { + let l = default_logger(); + + let n1 = new_test_raft_with_config( + &Config { + id: 1, + heartbeat_tick: 1, + election_tick: 10, + disable_proposal_forwarding: false, + ..Default::default() + }, + MemStorage::new_with_conf_state((vec![1, 2, 3], vec![])), + &l, + ); + + let n2 = new_test_raft_with_config( + &Config { + id: 2, + heartbeat_tick: 1, + election_tick: 10, + disable_proposal_forwarding: false, + ..Default::default() + }, + MemStorage::new_with_conf_state((vec![1, 2, 3], 
vec![])), + &l, + ); + + let n3 = new_test_raft_with_config( + &Config { + id: 3, + heartbeat_tick: 1, + election_tick: 10, + disable_proposal_forwarding: true, + ..Default::default() + }, + MemStorage::new_with_conf_state((vec![1, 2, 3], vec![])), + &l, + ); + + let mut network = Network::new(vec![Some(n1), Some(n2), Some(n3)], &l); + + // node 1 starts campaign to become leader. + network.send(vec![new_message(1, 1, MessageType::MsgHup, 0)]); + + // send proposal to n2(follower) where DisableProposalForwarding is false + assert_eq!( + network + .peers + .get_mut(&2) + .unwrap() + .step(new_message(2, 2, MessageType::MsgPropose, 1)), + Ok(()) + ); + + // verify n2(follower) does forward the proposal when DisableProposalForwarding is false + assert_eq!(network.peers.get(&2).unwrap().msgs.len(), 1); + + // send proposal to n3(follower) where DisableProposalForwarding is true + assert_eq!( + network + .peers + .get_mut(&3) + .unwrap() + .step(new_message(3, 3, MessageType::MsgPropose, 1)), + Err(Error::ProposalDropped) + ); + + assert!(network.peers.get(&3).unwrap().msgs.is_empty()); +} + +#[derive(Default)] +struct IgnoreSizeHintMemStorage { + inner: MemStorage, +} + +impl Storage for IgnoreSizeHintMemStorage { + fn initial_state(&self) -> Result { + self.inner.initial_state() + } + + fn entries( + &self, + low: u64, + high: u64, + _max_size: impl Into>, + context: GetEntriesContext, + ) -> Result> { + self.inner.entries(low, high, u64::MAX, context) + } + + fn term(&self, idx: u64) -> Result { + self.inner.term(idx) + } + + fn first_index(&self) -> Result { + self.inner.first_index() + } + + fn last_index(&self) -> Result { + self.inner.last_index() + } + + fn snapshot(&self, request_index: u64, to: u64) -> Result { + self.inner.snapshot(request_index, to) + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/test_util/mod.rs b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/test_util/mod.rs new file mode 100644 index 0000000..b55412f --- 
/dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/test_util/mod.rs @@ -0,0 +1,226 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use harness::*; +use raft::eraftpb::*; +use raft::storage::MemStorage; +use raft::*; +use raft_proto::ConfChangeI; +use slog::Logger; + +#[allow(clippy::declare_interior_mutable_const)] +pub const NOP_STEPPER: Option = Some(Interface { raft: None }); + +pub fn ltoa(raft_log: &RaftLog) -> String { + let mut s = format!("committed: {}\n", raft_log.committed); + s = s + &format!("applied: {}\n", raft_log.applied); + for (i, e) in raft_log.all_entries().iter().enumerate() { + s = s + &format!("#{}: {:?}\n", i, e); + } + s +} + +pub fn new_storage() -> MemStorage { + MemStorage::new() +} + +pub fn new_test_config(id: u64, election_tick: usize, heartbeat_tick: usize) -> Config { + Config { + id, + election_tick, + heartbeat_tick, + max_size_per_msg: NO_LIMIT, + max_inflight_msgs: 256, + ..Default::default() + } +} + +pub fn new_test_raft( + id: u64, + peers: Vec, + election: usize, + heartbeat: usize, + storage: MemStorage, + l: &Logger, +) -> Interface { + let config = new_test_config(id, election, heartbeat); + if storage.initial_state().unwrap().initialized() && peers.is_empty() { + panic!("new_test_raft with empty peers on initialized store"); + } + if !peers.is_empty() && 
!storage.initial_state().unwrap().initialized() { + storage.initialize_with_conf_state((peers, vec![])); + } + new_test_raft_with_config(&config, storage, l) +} + +pub fn new_test_raft_with_prevote( + id: u64, + peers: Vec, + election: usize, + heartbeat: usize, + storage: MemStorage, + pre_vote: bool, + l: &Logger, +) -> Interface { + let mut config = new_test_config(id, election, heartbeat); + config.pre_vote = pre_vote; + if storage.initial_state().unwrap().initialized() && peers.is_empty() { + panic!("new_test_raft with empty peers on initialized store"); + } + if !peers.is_empty() && !storage.initial_state().unwrap().initialized() { + storage.initialize_with_conf_state((peers, vec![])); + } + new_test_raft_with_config(&config, storage, l) +} + +pub fn new_test_raft_with_logs( + id: u64, + peers: Vec, + election: usize, + heartbeat: usize, + storage: MemStorage, + logs: &[Entry], + l: &Logger, +) -> Interface { + let config = new_test_config(id, election, heartbeat); + if storage.initial_state().unwrap().initialized() && peers.is_empty() { + panic!("new_test_raft with empty peers on initialized store"); + } + if !peers.is_empty() && !storage.initial_state().unwrap().initialized() { + storage.initialize_with_conf_state((peers, vec![])); + } + storage.wl().append(logs).unwrap(); + new_test_raft_with_config(&config, storage, l) +} + +pub fn new_test_raft_with_config(config: &Config, storage: MemStorage, l: &Logger) -> Interface { + Interface::new(Raft::new(config, storage, l).unwrap()) +} + +pub fn hard_state(term: u64, commit: u64, vote: u64) -> HardState { + HardState { + term, + vote, + commit, + ..Default::default() + } +} + +pub fn soft_state(leader_id: u64, raft_state: StateRole) -> SoftState { + SoftState { + leader_id, + raft_state, + } +} + +pub const SOME_DATA: Option<&'static str> = Some("somedata"); + +pub fn new_message_with_entries(from: u64, to: u64, ty: MessageType, ents: Vec) -> Message { + let mut m = Message { + msg_type: ty, + to, + from, + 
..Default::default() + }; + if !ents.is_empty() { + m.entries = ents.into(); + } + m +} + +pub fn new_message(from: u64, to: u64, t: MessageType, n: usize) -> Message { + let mut m = new_message_with_entries(from, to, t, vec![]); + if n > 0 { + let mut ents = Vec::with_capacity(n); + for _ in 0..n { + ents.push(new_entry(0, 0, SOME_DATA)); + } + m.entries = ents.into(); + } + m +} + +pub fn new_entry(term: u64, index: u64, data: Option<&str>) -> Entry { + let mut e = Entry { + term, + index, + ..Default::default() + }; + if let Some(d) = data { + e.data = d.as_bytes().to_vec().into(); + } + e +} + +pub fn empty_entry(term: u64, index: u64) -> Entry { + new_entry(term, index, None) +} + +pub fn new_snapshot(index: u64, term: u64, voters: Vec) -> Snapshot { + let mut s = Snapshot::default(); + s.mut_metadata().index = index; + s.mut_metadata().term = term; + s.mut_metadata().mut_conf_state().voters = voters; + s +} + +pub fn conf_change(ty: ConfChangeType, node_id: u64) -> ConfChange { + ConfChange { + change_type: ty, + node_id, + ..Default::default() + } +} + +pub fn remove_node(node_id: u64) -> ConfChangeV2 { + conf_change(ConfChangeType::RemoveNode, node_id).into_v2() +} + +pub fn add_node(node_id: u64) -> ConfChangeV2 { + conf_change(ConfChangeType::AddNode, node_id).into_v2() +} + +pub fn add_learner(node_id: u64) -> ConfChangeV2 { + conf_change(ConfChangeType::AddLearnerNode, node_id).into_v2() +} + +pub fn conf_state(voters: Vec, learners: Vec) -> ConfState { + ConfState { + voters, + learners, + ..Default::default() + } +} + +pub fn conf_state_v2( + voters: Vec, + learners: Vec, + voters_outgoing: Vec, + learners_next: Vec, + auto_leave: bool, +) -> ConfState { + let mut cs = conf_state(voters, learners); + cs.set_voters_outgoing(voters_outgoing); + cs.set_learners_next(learners_next); + cs.auto_leave = auto_leave; + cs +} + +pub fn conf_change_v2(steps: Vec) -> ConfChangeV2 { + let mut cc = ConfChangeV2::default(); + cc.set_changes(steps.into()); + cc +} 
diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/tests.rs b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/tests.rs new file mode 100644 index 0000000..61d49a9 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/harness/tests/tests.rs @@ -0,0 +1,91 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +#![cfg_attr(feature = "failpoints", allow(dead_code, unused_imports))] +// We use `default` method a lot to be support prost and rust-protobuf at the +// same time. And reassignment can be optimized by compiler. +#![allow(clippy::field_reassign_with_default)] + +/// Get the count of macro's arguments. +/// +/// # Examples +/// +/// ``` +/// # use tikv::count_args; +/// # fn main() { +/// assert_eq!(count_args!(), 0); +/// assert_eq!(count_args!(1), 1); +/// assert_eq!(count_args!(1, 2), 2); +/// assert_eq!(count_args!(1, 2, 3), 3); +/// # } +/// ``` +#[macro_export] +macro_rules! count_args { + () => { 0 }; + ($head:expr $(, $tail:expr)*) => { 1 + count_args!($($tail),*) }; +} + +/// Initial a `HashMap` with specify key-value pairs. +/// +/// # Examples +/// +/// ``` +/// # use tikv::map; +/// # fn main() { +/// // empty map +/// let m: tikv::util::collections::HashMap = map!(); +/// assert!(m.is_empty()); +/// +/// // one initial kv pairs. +/// let m = map!("key" => "value"); +/// assert_eq!(m.len(), 1); +/// assert_eq!(m["key"], "value"); +/// +/// // initialize with multiple kv pairs. +/// let m = map!("key1" => "value1", "key2" => "value2"); +/// assert_eq!(m.len(), 2); +/// assert_eq!(m["key1"], "value1"); +/// assert_eq!(m["key2"], "value2"); +/// # } +/// ``` +#[macro_export] +macro_rules! map { + () => { + { + use std::collections::HashMap; + HashMap::new() + } + }; + ( $( $k:expr => $v:expr ),+ ) => { + { + use std::collections::HashMap; + let mut temp_map = HashMap::with_capacity(count_args!($(($k, $v)),+)); + $( + temp_map.insert($k, $v); + )+ + temp_map + } + }; +} + +#[macro_export] +macro_rules! 
assert_iter_eq { + (o $lhs:expr, $rhs:expr) => {{ + assert_iter_eq!(internal $lhs.iter(), $rhs.iter().cloned()); + }}; + ($lhs:expr, $rhs:expr) => {{ + assert_iter_eq!(internal $lhs.iter().cloned(), $rhs.iter().cloned()); + }}; + (internal $lhs:expr, $rhs:expr) => {{ + let mut lhs: Vec<_> = $lhs.collect(); + let mut rhs: Vec<_> = $rhs.collect(); + lhs.sort_unstable(); + rhs.sort_unstable(); + assert_eq!(lhs, rhs); + }}; +} + +#[cfg(feature = "failpoints")] +mod failpoints_cases; +#[cfg(not(feature = "failpoints"))] +mod integration_cases; +mod test_util; diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/media/the-design-of-raft-rs.png b/systems/Raft-rs/driver/rust_so/raft-rs/media/the-design-of-raft-rs.png new file mode 100644 index 0000000..e408d6b Binary files /dev/null and b/systems/Raft-rs/driver/rust_so/raft-rs/media/the-design-of-raft-rs.png differ diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/proto/Cargo.toml b/systems/Raft-rs/driver/rust_so/raft-rs/proto/Cargo.toml new file mode 100644 index 0000000..5efbe0d --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/proto/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "raft-proto" +version = "0.7.0" +authors = ["The TiKV Project Developers"] +edition = "2021" +license = "Apache-2.0" +keywords = ["raft", "distributed-systems", "ha"] +repository = "https://github.com/pingcap/raft-rs" +homepage = "https://github.com/pingcap/raft-rs" +documentation = "https://docs.rs/raft-proto" +description = "Protocol definitions for the rust language implementation of the Raft algorithm." 
+categories = ["algorithms", "database-implementations"] +build = "build.rs" + +[features] +default = ["protobuf-codec"] +protobuf-codec = ["protobuf-build/protobuf-codec", "bytes", "protobuf/bytes"] +prost-codec = ["protobuf-build/prost-codec", "prost", "lazy_static"] + +[build-dependencies] +protobuf-build = { version = "0.15.1", default-features = false } + +[dependencies] +bytes = { version = "1", optional = true } +lazy_static = { version = "1", optional = true } +prost = { version = "0.11", optional = true } +protobuf = "2" diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/proto/README.md b/systems/Raft-rs/driver/rust_so/raft-rs/proto/README.md new file mode 100644 index 0000000..d9a51dc --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/proto/README.md @@ -0,0 +1,8 @@ +# Raft Proto + +[![Documentation](https://docs.rs/raft-proto/badge.svg)](https://docs.rs/raft-proto/) +[![Crates.io](https://img.shields.io/crates/v/raft-proto.svg)](https://crates.io/crates/raft-proto) + +This crate contains the protobuf structs used by raft. + +You can find the generated `eraftpb.rs` in `target/debug/build/raft-proto-***/out`. \ No newline at end of file diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/proto/proto/eraftpb.proto b/systems/Raft-rs/driver/rust_so/raft-rs/proto/proto/eraftpb.proto new file mode 100644 index 0000000..dcd285d --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/proto/proto/eraftpb.proto @@ -0,0 +1,196 @@ +syntax = "proto3"; +package eraftpb; + +import "rustproto.proto"; +option (rustproto.carllerche_bytes_for_bytes_all) = true; + +enum EntryType { + EntryNormal = 0; + EntryConfChange = 1; + EntryConfChangeV2 = 2; +} + +// The entry is a type of change that needs to be applied. It contains two data fields. +// While the fields are built into the model; their usage is determined by the entry_type. +// +// For normal entries, the data field should contain the data change that should be applied. 
+// The context field can be used for any contextual data that might be relevant to the +// application of the data. +// +// For configuration changes, the data will contain the ConfChange message and the +// context will provide anything needed to assist the configuration change. The context +// if for the user to set and use in this case. +message Entry { + EntryType entry_type = 1; + uint64 term = 2; + uint64 index = 3; + bytes data = 4; + bytes context = 6; + + // Deprecated! It is kept for backward compatibility. + // TODO: remove it in the next major release. + bool sync_log = 5; +} + +message SnapshotMetadata { + // The current `ConfState`. + ConfState conf_state = 1; + // The applied index. + uint64 index = 2; + // The term of the applied index. + uint64 term = 3; +} + +message Snapshot { + bytes data = 1; + SnapshotMetadata metadata = 2; +} + +enum MessageType { + MsgHup = 0; + MsgBeat = 1; + MsgPropose = 2; + MsgAppend = 3; + MsgAppendResponse = 4; + MsgRequestVote = 5; + MsgRequestVoteResponse = 6; + MsgSnapshot = 7; + MsgHeartbeat = 8; + MsgHeartbeatResponse = 9; + MsgUnreachable = 10; + MsgSnapStatus = 11; + MsgCheckQuorum = 12; + MsgTransferLeader = 13; + MsgTimeoutNow = 14; + MsgReadIndex = 15; + MsgReadIndexResp = 16; + MsgRequestPreVote = 17; + MsgRequestPreVoteResponse = 18; +} + +message Message { + MessageType msg_type = 1; + uint64 to = 2; + uint64 from = 3; + uint64 term = 4; + // logTerm is generally used for appending Raft logs to followers. For example, + // (type=MsgAppend,index=100,log_term=5) means leader appends entries starting at + // index=101, and the term of entry at index 100 is 5. + // (type=MsgAppendResponse,reject=true,index=100,log_term=5) means follower rejects some + // entries from its leader as it already has an entry with term 5 at index 100. 
+ uint64 log_term = 5; + uint64 index = 6; + repeated Entry entries = 7; + uint64 commit = 8; + uint64 commit_term = 15; + Snapshot snapshot = 9; + uint64 request_snapshot = 13; + bool reject = 10; + uint64 reject_hint = 11; + bytes context = 12; + uint64 deprecated_priority = 14; + // If this new field is not set, then use the above old field; otherwise + // use the new field. When broadcasting request vote, both fields are + // set if the priority is larger than 0. This change is not a fully + // compatible change, but it makes minimal impact that only new priority + // is not recognized by the old nodes during rolling update. + int64 priority = 16; +} + +message HardState { + uint64 term = 1; + uint64 vote = 2; + uint64 commit = 3; +} + +enum ConfChangeTransition { + // Automatically use the simple protocol if possible, otherwise fall back + // to ConfChangeType::Implicit. Most applications will want to use this. + Auto = 0; + // Use joint consensus unconditionally, and transition out of them + // automatically (by proposing a zero configuration change). + // + // This option is suitable for applications that want to minimize the time + // spent in the joint configuration and do not store the joint configuration + // in the state machine (outside of InitialState). + Implicit = 1; + // Use joint consensus and remain in the joint configuration until the + // application proposes a no-op configuration change. This is suitable for + // applications that want to explicitly control the transitions, for example + // to use a custom payload (via the Context field). + Explicit = 2; +} + +message ConfState { + repeated uint64 voters = 1; + repeated uint64 learners = 2; + + // The voters in the outgoing config. If not empty the node is in joint consensus. + repeated uint64 voters_outgoing = 3; + // The nodes that will become learners when the outgoing config is removed. 
+ // These nodes are necessarily currently in nodes_joint (or they would have + // been added to the incoming config right away). + repeated uint64 learners_next = 4; + // If set, the config is joint and Raft will automatically transition into + // the final config (i.e. remove the outgoing config) when this is safe. + bool auto_leave = 5; +} + +enum ConfChangeType { + AddNode = 0; + RemoveNode = 1; + AddLearnerNode = 2; +} + +message ConfChange { + ConfChangeType change_type = 2; + uint64 node_id = 3; + bytes context = 4; + + uint64 id = 1; +} + +// ConfChangeSingle is an individual configuration change operation. Multiple +// such operations can be carried out atomically via a ConfChangeV2. +message ConfChangeSingle { + ConfChangeType change_type = 1; + uint64 node_id = 2; +} + +// ConfChangeV2 messages initiate configuration changes. They support both the +// simple "one at a time" membership change protocol and full Joint Consensus +// allowing for arbitrary changes in membership. +// +// The supplied context is treated as an opaque payload and can be used to +// attach an action on the state machine to the application of the config change +// proposal. Note that contrary to Joint Consensus as outlined in the Raft +// paper[1], configuration changes become active when they are *applied* to the +// state machine (not when they are appended to the log). +// +// The simple protocol can be used whenever only a single change is made. +// +// Non-simple changes require the use of Joint Consensus, for which two +// configuration changes are run. The first configuration change specifies the +// desired changes and transitions the Raft group into the joint configuration, +// in which quorum requires a majority of both the pre-changes and post-changes +// configuration. Joint Consensus avoids entering fragile intermediate +// configurations that could compromise survivability. 
For example, without the +// use of Joint Consensus and running across three availability zones with a +// replication factor of three, it is not possible to replace a voter without +// entering an intermediate configuration that does not survive the outage of +// one availability zone. +// +// The provided ConfChangeTransition specifies how (and whether) Joint Consensus +// is used, and assigns the task of leaving the joint configuration either to +// Raft or the application. Leaving the joint configuration is accomplished by +// proposing a ConfChangeV2 with only and optionally the Context field +// populated. +// +// For details on Raft membership changes, see: +// +// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf +message ConfChangeV2 { + ConfChangeTransition transition = 1; + repeated ConfChangeSingle changes = 2; + bytes context = 3; +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/proto/src/confchange.rs b/systems/Raft-rs/driver/rust_so/raft-rs/proto/src/confchange.rs new file mode 100644 index 0000000..23b81fe --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/proto/src/confchange.rs @@ -0,0 +1,151 @@ +// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. + +use crate::eraftpb::{ + ConfChange, ConfChangeSingle, ConfChangeTransition, ConfChangeType, ConfChangeV2, +}; +use std::borrow::Cow; +use std::fmt::Write; + +/// Creates a `ConfChangeSingle`. +pub fn new_conf_change_single(node_id: u64, ty: ConfChangeType) -> ConfChangeSingle { + let mut single = ConfChangeSingle::default(); + single.node_id = node_id; + single.set_change_type(ty); + single +} + +/// Parses a Space-delimited sequence of operations into a slice of ConfChangeSingle. 
+/// The supported operations are: +/// - vn: make n a voter, +/// - ln: make n a learner, +/// - rn: remove n +pub fn parse_conf_change(s: &str) -> Result, String> { + let s = s.trim(); + if s.is_empty() { + return Ok(vec![]); + } + let mut ccs = vec![]; + let splits = s.split_ascii_whitespace(); + for tok in splits { + if tok.len() < 2 { + return Err(format!("unknown token {}", tok)); + } + let mut cc = ConfChangeSingle::default(); + let mut chars = tok.chars(); + cc.set_change_type(match chars.next().unwrap() { + 'v' => ConfChangeType::AddNode, + 'l' => ConfChangeType::AddLearnerNode, + 'r' => ConfChangeType::RemoveNode, + _ => return Err(format!("unknown token {}", tok)), + }); + cc.node_id = match chars.as_str().parse() { + Ok(id) => id, + Err(e) => return Err(format!("parse token {} fail: {}", tok, e)), + }; + ccs.push(cc); + } + Ok(ccs) +} + +/// The inverse to `parse_conf_change`. +pub fn stringify_conf_change(ccs: &[ConfChangeSingle]) -> String { + let mut s = String::new(); + for (i, cc) in ccs.iter().enumerate() { + if i > 0 { + s.push(' '); + } + match cc.get_change_type() { + ConfChangeType::AddNode => s.push('v'), + ConfChangeType::AddLearnerNode => s.push('l'), + ConfChangeType::RemoveNode => s.push('r'), + } + write!(&mut s, "{}", cc.node_id).unwrap(); + } + s +} + +/// Abstracts over ConfChangeV2 and (legacy) ConfChange to allow +/// treating them in a unified manner. +pub trait ConfChangeI { + /// Converts conf change to `ConfChangeV2`. + fn into_v2(self) -> ConfChangeV2; + + /// Gets conf change as `ConfChangeV2`. + fn as_v2(&self) -> Cow; + + /// Converts conf change to `ConfChange`. + /// + /// `ConfChangeV2` can't be changed back to `ConfChange`. 
+ fn as_v1(&self) -> Option<&ConfChange>; +} + +impl ConfChangeI for ConfChange { + #[inline] + fn into_v2(mut self) -> ConfChangeV2 { + let mut cc = ConfChangeV2::default(); + let single = new_conf_change_single(self.node_id, self.get_change_type()); + cc.mut_changes().push(single); + cc.set_context(self.take_context()); + cc + } + + #[inline] + fn as_v2(&self) -> Cow { + Cow::Owned(self.clone().into_v2()) + } + + #[inline] + fn as_v1(&self) -> Option<&ConfChange> { + Some(self) + } +} + +impl ConfChangeI for ConfChangeV2 { + #[inline] + fn into_v2(self) -> ConfChangeV2 { + self + } + + #[inline] + fn as_v2(&self) -> Cow { + Cow::Borrowed(self) + } + + #[inline] + fn as_v1(&self) -> Option<&ConfChange> { + None + } +} + +impl ConfChangeV2 { + /// Checks if uses Joint Consensus. + /// + /// It will return Some if and only if this config change will use Joint Consensus, + /// which is the case if it contains more than one change or if the use of Joint + /// Consensus was requested explicitly. The bool indicates whether the Joint State + /// will be left automatically. + pub fn enter_joint(&self) -> Option { + // NB: in theory, more config changes could qualify for the "simple" + // protocol but it depends on the config on top of which the changes apply. + // For example, adding two learners is not OK if both nodes are part of the + // base config (i.e. two voters are turned into learners in the process of + // applying the conf change). In practice, these distinctions should not + // matter, so we keep it simple and use Joint Consensus liberally. + if self.get_transition() != ConfChangeTransition::Auto || self.changes.len() > 1 { + match self.get_transition() { + ConfChangeTransition::Auto | ConfChangeTransition::Implicit => Some(true), + ConfChangeTransition::Explicit => Some(false), + } + } else { + None + } + } + + /// Checks if the configuration change leaves a joint configuration. 
+ /// + /// This is the case if the ConfChangeV2 is zero, with the possible exception of + /// the Context field. + pub fn leave_joint(&self) -> bool { + self.get_transition() == ConfChangeTransition::Auto && self.changes.is_empty() + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/proto/src/confstate.rs b/systems/Raft-rs/driver/rust_so/raft-rs/proto/src/confstate.rs new file mode 100644 index 0000000..559bbd2 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/proto/src/confstate.rs @@ -0,0 +1,40 @@ +// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. + +use crate::eraftpb::ConfState; + +fn eq_without_order(lhs: &[u64], rhs: &[u64]) -> bool { + for l in lhs { + if !rhs.contains(l) { + return false; + } + } + for r in rhs { + if !lhs.contains(r) { + return false; + } + } + true +} + +// Returns true if the inputs describe the same configuration. +#[must_use] +pub fn conf_state_eq(lhs: &ConfState, rhs: &ConfState) -> bool { + // The orders are different only when hash algorithm or insert orders are + // different. In most case, only one hash algorithm is used. Insert orders + // should be the same due to the raft protocol. So in most case, they can + // be compared directly. 
+ if lhs.get_voters() == rhs.get_voters() + && lhs.get_learners() == rhs.get_learners() + && lhs.get_voters_outgoing() == rhs.get_voters_outgoing() + && lhs.get_learners_next() == rhs.get_learners_next() + && lhs.auto_leave == rhs.auto_leave + { + return true; + } + + eq_without_order(lhs.get_voters(), rhs.get_voters()) + && eq_without_order(lhs.get_learners(), rhs.get_learners()) + && eq_without_order(lhs.get_voters_outgoing(), rhs.get_voters_outgoing()) + && eq_without_order(lhs.get_learners_next(), rhs.get_learners_next()) + && lhs.auto_leave == rhs.auto_leave +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/proto/src/lib.rs b/systems/Raft-rs/driver/rust_so/raft-rs/proto/src/lib.rs new file mode 100644 index 0000000..3e29dbd --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/proto/src/lib.rs @@ -0,0 +1,56 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// We use `default` method a lot to be support prost and rust-protobuf at the +// same time. And reassignment can be optimized by compiler. +#![allow(clippy::field_reassign_with_default)] + +mod confchange; +mod confstate; + +pub use crate::confchange::{ + new_conf_change_single, parse_conf_change, stringify_conf_change, ConfChangeI, +}; +pub use crate::confstate::conf_state_eq; +pub use crate::protos::eraftpb; + +#[allow(dead_code)] +#[allow(unknown_lints)] +#[allow(clippy::all)] +#[allow(renamed_and_removed_lints)] +#[allow(bare_trait_objects)] +mod protos { + include!(concat!(env!("OUT_DIR"), "/protos/mod.rs")); + + use self::eraftpb::Snapshot; + + impl Snapshot { + /// For a given snapshot, determine if it's empty or not. 
+ pub fn is_empty(&self) -> bool { + self.get_metadata().index == 0 + } + } +} + +pub mod prelude { + pub use crate::eraftpb::{ + ConfChange, ConfChangeSingle, ConfChangeTransition, ConfChangeType, ConfChangeV2, + ConfState, Entry, EntryType, HardState, Message, MessageType, Snapshot, SnapshotMetadata, + }; +} + +pub mod util { + use crate::eraftpb::ConfState; + + impl From<(Iter1, Iter2)> for ConfState + where + Iter1: IntoIterator, + Iter2: IntoIterator, + { + fn from((voters, learners): (Iter1, Iter2)) -> Self { + let mut conf_state = ConfState::default(); + conf_state.mut_voters().extend(voters); + conf_state.mut_learners().extend(learners); + conf_state + } + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange.rs new file mode 100644 index 0000000..4068af0 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange.rs @@ -0,0 +1,15 @@ +// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. +mod changer; +#[cfg(test)] +pub mod datadriven_test; +mod restore; + +pub use self::changer::{Changer, MapChange, MapChangeType}; +pub use self::restore::restore; + +use crate::tracker::Configuration; + +#[inline] +pub(crate) fn joint(cfg: &Configuration) -> bool { + !cfg.voters().outgoing.is_empty() +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/changer.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/changer.rs new file mode 100644 index 0000000..3c3e111 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/changer.rs @@ -0,0 +1,357 @@ +// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. + +use crate::eraftpb::{ConfChangeSingle, ConfChangeType}; +use crate::tracker::{Configuration, ProgressMap, ProgressTracker}; +use crate::{Error, Result}; + +/// Change log for progress map. +pub enum MapChangeType { + Add, + Remove, +} + +/// Changes made by `Changer`. 
+pub type MapChange = Vec<(u64, MapChangeType)>; + +/// A map that stores updates instead of apply them directly. +pub struct IncrChangeMap<'a> { + changes: MapChange, + base: &'a ProgressMap, +} + +impl IncrChangeMap<'_> { + pub fn into_changes(self) -> MapChange { + self.changes + } + + fn contains(&self, id: u64) -> bool { + match self.changes.iter().rfind(|(i, _)| *i == id) { + Some((_, MapChangeType::Remove)) => false, + Some((_, MapChangeType::Add)) => true, + None => self.base.contains_key(&id), + } + } +} + +/// Changer facilitates configuration changes. +/// +/// It exposes methods to handle +/// simple and joint consensus while performing the proper validation that allows +/// refusing invalid configuration changes before they affect the active +/// configuration. +pub struct Changer<'a> { + tracker: &'a ProgressTracker, +} + +impl Changer<'_> { + /// Creates a changer. + pub fn new(tracker: &ProgressTracker) -> Changer { + Changer { tracker } + } + + /// Verifies that the outgoing (=right) majority config of the joint + /// config is empty and initializes it with a copy of the incoming (=left) + /// majority config. That is, it transitions from + /// ```text + /// (1 2 3)&&() + /// ``` + /// to + /// ```text + /// (1 2 3)&&(1 2 3) + /// ```. + /// + /// The supplied changes are then applied to the incoming majority config, + /// resulting in a joint configuration that in terms of the Raft thesis[1] + /// (Section 4.3) corresponds to `C_{new,old}`. 
+ /// + /// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf + pub fn enter_joint( + &self, + auto_leave: bool, + ccs: &[ConfChangeSingle], + ) -> Result<(Configuration, MapChange)> { + if super::joint(self.tracker.conf()) { + return Err(Error::ConfChangeError("config is already joint".to_owned())); + } + let (mut cfg, mut prs) = self.check_and_copy()?; + if cfg.voters().incoming.is_empty() { + // We allow adding nodes to an empty config for convenience (testing and + // bootstrap), but you can't enter a joint state. + return Err(Error::ConfChangeError( + "can't make a zero-voter config joint".to_owned(), + )); + } + cfg.voters + .outgoing + .extend(cfg.voters.incoming.iter().cloned()); + self.apply(&mut cfg, &mut prs, ccs)?; + cfg.auto_leave = auto_leave; + check_invariants(&cfg, &prs)?; + Ok((cfg, prs.into_changes())) + } + + /// Transitions out of a joint configuration. It is an error to call this method if + /// the configuration is not joint, i.e. if the outgoing majority config is empty. + /// + /// The outgoing majority config of the joint configuration will be removed, that is, + /// the incoming config is promoted as the sole decision maker. In the notation of + /// the Raft thesis[1] (Section 4.3), this method transitions from `C_{new,old}` into + /// `C_new`. + /// + /// At the same time, any staged learners (LearnersNext) the addition of which was + /// held back by an overlapping voter in the former outgoing config will be inserted + /// into Learners. 
+ /// + /// [1]: https://github.com/ongardie/dissertation/blob/master/online-trim.pdf + pub fn leave_joint(&self) -> Result<(Configuration, MapChange)> { + if !super::joint(self.tracker.conf()) { + return Err(Error::ConfChangeError( + "can't leave a non-joint config".to_owned(), + )); + } + let (mut cfg, mut prs) = self.check_and_copy()?; + if cfg.voters().outgoing.is_empty() { + return Err(Error::ConfChangeError(format!( + "configuration is not joint: {:?}", + cfg + ))); + } + cfg.learners.extend(cfg.learners_next.drain()); + + for id in &*cfg.voters.outgoing { + if !cfg.voters.incoming.contains(id) && !cfg.learners.contains(id) { + prs.changes.push((*id, MapChangeType::Remove)); + } + } + + cfg.voters.outgoing.clear(); + cfg.auto_leave = false; + check_invariants(&cfg, &prs)?; + Ok((cfg, prs.into_changes())) + } + + /// Carries out a series of configuration changes that (in aggregate) mutates the + /// incoming majority config `Voters[0]` by at most one. This method will return an + /// error if that is not the case, if the resulting quorum is zero, or if the + /// configuration is in a joint state (i.e. if there is an outgoing configuration). + pub fn simple(&mut self, ccs: &[ConfChangeSingle]) -> Result<(Configuration, MapChange)> { + if super::joint(self.tracker.conf()) { + return Err(Error::ConfChangeError( + "can't apply simple config change in joint config".to_owned(), + )); + } + let (mut cfg, mut prs) = self.check_and_copy()?; + self.apply(&mut cfg, &mut prs, ccs)?; + + if cfg + .voters + .incoming + .symmetric_difference(&self.tracker.conf().voters.incoming) + .count() + > 1 + { + return Err(Error::ConfChangeError( + "more than one voter changed without entering joint config".to_owned(), + )); + } + check_invariants(&cfg, &prs)?; + Ok((cfg, prs.into_changes())) + } + + /// Applies a change to the configuration. By convention, changes to voters are always + /// made to the incoming majority config. 
Outgoing is either empty or preserves the + /// outgoing majority configuration while in a joint state. + fn apply( + &self, + cfg: &mut Configuration, + prs: &mut IncrChangeMap, + ccs: &[ConfChangeSingle], + ) -> Result<()> { + for cc in ccs { + if cc.node_id == 0 { + // Replaces the NodeID with zero if it decides (downstream of + // raft) to not apply a change, so we have to have explicit code + // here to ignore these. + continue; + } + match cc.get_change_type() { + ConfChangeType::AddNode => self.make_voter(cfg, prs, cc.node_id), + ConfChangeType::AddLearnerNode => self.make_learner(cfg, prs, cc.node_id), + ConfChangeType::RemoveNode => self.remove(cfg, prs, cc.node_id), + } + } + if cfg.voters().incoming.is_empty() { + return Err(Error::ConfChangeError("removed all voters".to_owned())); + } + Ok(()) + } + + /// Adds or promotes the given ID to be a voter in the incoming majority config. + fn make_voter(&self, cfg: &mut Configuration, prs: &mut IncrChangeMap, id: u64) { + if !prs.contains(id) { + self.init_progress(cfg, prs, id, false); + return; + } + + cfg.voters.incoming.insert(id); + cfg.learners.remove(&id); + cfg.learners_next.remove(&id); + } + + /// Makes the given ID a learner or stages it to be a learner once an active joint + /// configuration is exited. + /// + /// The former happens when the peer is not a part of the outgoing config, in which + /// case we either add a new learner or demote a voter in the incoming config. + /// + /// The latter case occurs when the configuration is joint and the peer is a voter + /// in the outgoing config. In that case, we do not want to add the peer as a learner + /// because then we'd have to track a peer as a voter and learner simultaneously. + /// Instead, we add the learner to LearnersNext, so that it will be added to Learners + /// the moment the outgoing config is removed by LeaveJoint(). 
+ fn make_learner(&self, cfg: &mut Configuration, prs: &mut IncrChangeMap, id: u64) { + if !prs.contains(id) { + self.init_progress(cfg, prs, id, true); + return; + } + + if cfg.learners.contains(&id) { + return; + } + + cfg.voters.incoming.remove(&id); + cfg.learners.remove(&id); + cfg.learners_next.remove(&id); + + // Use LearnersNext if we can't add the learner to Learners directly, i.e. + // if the peer is still tracked as a voter in the outgoing config. It will + // be turned into a learner in LeaveJoint(). + // + // Otherwise, add a regular learner right away. + if cfg.voters().outgoing.contains(&id) { + cfg.learners_next.insert(id); + } else { + cfg.learners.insert(id); + } + } + + /// Removes this peer as a voter or learner from the incoming config. + fn remove(&self, cfg: &mut Configuration, prs: &mut IncrChangeMap, id: u64) { + if !prs.contains(id) { + return; + } + + cfg.voters.incoming.remove(&id); + cfg.learners.remove(&id); + cfg.learners_next.remove(&id); + + // If the peer is still a voter in the outgoing config, keep the Progress. + if !cfg.voters.outgoing.contains(&id) { + prs.changes.push((id, MapChangeType::Remove)); + } + } + + /// Initializes a new progress for the given node or learner. + fn init_progress( + &self, + cfg: &mut Configuration, + prs: &mut IncrChangeMap, + id: u64, + is_learner: bool, + ) { + if !is_learner { + cfg.voters.incoming.insert(id); + } else { + cfg.learners.insert(id); + } + prs.changes.push((id, MapChangeType::Add)); + } + + /// Copies the tracker's config. It returns an error if checkInvariants does. + /// + /// Unlike Etcd, we don't copy progress as we don't need to mutate the `is_learner` + /// flags. Additions and Removals should be done after everything is checked OK. 
+ fn check_and_copy(&self) -> Result<(Configuration, IncrChangeMap)> { + let prs = IncrChangeMap { + changes: vec![], + base: self.tracker.progress(), + }; + check_invariants(self.tracker.conf(), &prs)?; + Ok((self.tracker.conf().clone(), prs)) + } +} + +/// Makes sure that the config and progress are compatible with each other. +/// This is used to check both what the Changer is initialized with, as well +/// as what it returns. +fn check_invariants(cfg: &Configuration, prs: &IncrChangeMap) -> Result<()> { + // NB: intentionally allow the empty config. In production we'll never see a + // non-empty config (we prevent it from being created) but we will need to + // be able to *create* an initial config, for example during bootstrap (or + // during tests). Instead of having to hand-code this, we allow + // transitioning from an empty config into any other legal and non-empty + // config. + for id in cfg.voters().ids().iter() { + if !prs.contains(id) { + return Err(Error::ConfChangeError(format!( + "no progress for voter {}", + id + ))); + } + } + for id in &cfg.learners { + if !prs.contains(*id) { + return Err(Error::ConfChangeError(format!( + "no progress for learner {}", + id + ))); + } + // Conversely Learners and Voters doesn't intersect at all. + if cfg.voters().outgoing.contains(id) { + return Err(Error::ConfChangeError(format!( + "{} is in learners and outgoing voters", + id + ))); + } + if cfg.voters().incoming.contains(id) { + return Err(Error::ConfChangeError(format!( + "{} is in learners and incoming voters", + id + ))); + } + } + for id in &cfg.learners_next { + if !prs.contains(*id) { + return Err(Error::ConfChangeError(format!( + "no progress for learner(next) {}", + id + ))); + } + + // Any staged learner was staged because it could not be directly added due + // to a conflicting voter in the outgoing config. 
+ if !cfg.voters().outgoing.contains(id) { + return Err(Error::ConfChangeError(format!( + "{} is in learners_next and outgoing voters", + id + ))); + } + } + + if !super::joint(cfg) { + // Etcd enforces outgoing and learner_next to be nil map. But there is no nil + // in rust. We just check empty for simplicity. + if !cfg.learners_next().is_empty() { + return Err(Error::ConfChangeError( + "learners_next must be empty when not joint".to_owned(), + )); + } + if cfg.auto_leave { + return Err(Error::ConfChangeError( + "auto_leave must be false when not joint".to_owned(), + )); + } + } + + Ok(()) +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/datadriven_test.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/datadriven_test.rs new file mode 100644 index 0000000..96b9dcc --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/datadriven_test.rs @@ -0,0 +1,86 @@ +use std::fmt::Write; + +use crate::{default_logger, Changer, ProgressTracker}; +use datadriven::{run_test, walk}; +use itertools::Itertools; +use raft_proto::parse_conf_change; + +#[test] +fn test_conf_change_data_driven() -> anyhow::Result<()> { + walk("src/confchange/testdata", |path| -> anyhow::Result<()> { + let logger = default_logger(); + + let mut tr = ProgressTracker::new(10); + let mut idx = 0; + + run_test( + path.to_str().unwrap(), + |data| -> String { + let ccs = parse_conf_change(&data.input).unwrap(); + + let res = match data.cmd.as_str() { + "simple" => Changer::new(&tr).simple(&ccs), + "enter-joint" => { + let mut auto_leave = false; + for arg in &data.cmd_args { + match arg.key.as_str() { + "autoleave" => { + for val in &arg.vals { + auto_leave = val + .parse() + .expect("type of autoleave should be boolean") + } + } + _ => { + panic!("unknown arg: {}", arg.key); + } + } + } + Changer::new(&tr).enter_joint(auto_leave, &ccs) + } + "leave-joint" => { + assert!(data.cmd_args.is_empty()); + Changer::new(&tr).leave_joint() + } + _ => { + panic!("unknown 
arg: {}", data.cmd); + } + }; + match res { + Ok((conf, changes)) => { + tr.apply_conf(conf, changes, idx); + idx += 1; + } + Err(e) => { + idx += 1; + return e.to_string(); + } + } + + let mut buffer = String::new(); + + let conf = tr.conf(); + writeln!(buffer, "{}", conf).unwrap(); + + let prs = tr.progress(); + + // output with peer_id sorted + for (k, v) in prs.iter().sorted_by(|&(k1, _), &(k2, _)| k1.cmp(k2)) { + write!( + buffer, + "{}: {} match={} next={}", + k, v.state, v.matched, v.next_idx + ) + .unwrap(); + if conf.learners.contains(k) { + buffer.push_str(" learner"); + } + buffer.push('\n'); + } + buffer + }, + false, + &logger, + ) + }) +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/restore.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/restore.rs new file mode 100644 index 0000000..7dbb7f5 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/restore.rs @@ -0,0 +1,107 @@ +// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. + +// TODO: remove following line +#![allow(dead_code)] + +use super::changer::Changer; +use crate::eraftpb::{ConfChangeSingle, ConfChangeType, ConfState}; +use crate::tracker::ProgressTracker; +use crate::Result; + +/// Translates a conf state into 1) a slice of operations creating first the config that +/// will become the outgoing one, and then the incoming one, and b) another slice that, +/// when applied to the config resulted from 1), represents the ConfState. +fn to_conf_change_single(cs: &ConfState) -> (Vec, Vec) { + // Example to follow along this code: + // voters=(1 2 3) learners=(5) outgoing=(1 2 4 6) learners_next=(4) + // + // This means that before entering the joint config, the configuration + // had voters (1 2 4 6) and perhaps some learners that are already gone. + // The new set of voters is (1 2 3), i.e. (1 2) were kept around, and (4 6) + // are no longer voters; however 4 is poised to become a learner upon leaving + // the joint state. 
+ // We can't tell whether 5 was a learner before entering the joint config, + // but it doesn't matter (we'll pretend that it wasn't). + // + // The code below will construct + // outgoing = add 1; add 2; add 4; add 6 + // incoming = remove 1; remove 2; remove 4; remove 6 + // add 1; add 2; add 3; + // add-learner 5; + // add-learner 4; + // + // So, when starting with an empty config, after applying 'outgoing' we have + // + // quorum=(1 2 4 6) + // + // From which we enter a joint state via 'incoming' + // + // quorum=(1 2 3)&&(1 2 4 6) learners=(5) learners_next=(4) + // + // as desired. + let mut incoming = Vec::new(); + let mut outgoing = Vec::new(); + for id in cs.get_voters_outgoing() { + // If there are outgoing voters, first add them one by one so that the + // (non-joint) config has them all. + outgoing.push(raft_proto::new_conf_change_single( + *id, + ConfChangeType::AddNode, + )); + } + + // We're done constructing the outgoing slice, now on to the incoming one + // (which will apply on top of the config created by the outgoing slice). + + // First, we'll remove all of the outgoing voters. + for id in cs.get_voters_outgoing() { + incoming.push(raft_proto::new_conf_change_single( + *id, + ConfChangeType::RemoveNode, + )); + } + // Then we'll add the incoming voters and learners. + for id in cs.get_voters() { + incoming.push(raft_proto::new_conf_change_single( + *id, + ConfChangeType::AddNode, + )); + } + for id in cs.get_learners() { + incoming.push(raft_proto::new_conf_change_single( + *id, + ConfChangeType::AddLearnerNode, + )); + } + // Same for LearnersNext; these are nodes we want to be learners but which + // are currently voters in the outgoing config. 
+ for id in cs.get_learners_next() { + incoming.push(raft_proto::new_conf_change_single( + *id, + ConfChangeType::AddLearnerNode, + )); + } + (outgoing, incoming) +} + +/// Restore takes a Changer (which must represent an empty configuration), and runs a +/// sequence of changes enacting the configuration described in the ConfState. +/// +/// TODO(jay) find a way to only take `ProgressMap` instead of a whole tracker. +pub fn restore(tracker: &mut ProgressTracker, next_idx: u64, cs: &ConfState) -> Result<()> { + let (outgoing, incoming) = to_conf_change_single(cs); + if outgoing.is_empty() { + for i in incoming { + let (cfg, changes) = Changer::new(tracker).simple(&[i])?; + tracker.apply_conf(cfg, changes, next_idx); + } + } else { + for cc in outgoing { + let (cfg, changes) = Changer::new(tracker).simple(&[cc])?; + tracker.apply_conf(cfg, changes, next_idx); + } + let (cfg, changes) = Changer::new(tracker).enter_joint(cs.auto_leave, &incoming)?; + tracker.apply_conf(cfg, changes, next_idx); + } + Ok(()) +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/joint_autoleave.txt b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/joint_autoleave.txt new file mode 100644 index 0000000..9ec8cb0 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/joint_autoleave.txt @@ -0,0 +1,29 @@ +# Test the autoleave argument to EnterJoint. It defaults to false in the +# datadriven tests. The flag has no associated semantics in this package, +# it is simply passed through. +simple +v1 +---- +voters=(1) +1: StateProbe match=0 next=0 + +# Autoleave is reflected in the config. +enter-joint autoleave=true +v2 v3 +---- +voters=(1 2 3)&&(1) autoleave +1: StateProbe match=0 next=0 +2: StateProbe match=0 next=1 +3: StateProbe match=0 next=1 + +# Can't enter-joint twice, even if autoleave changes. 
+enter-joint autoleave=false +---- +config is already joint + +leave-joint +---- +voters=(1 2 3) +1: StateProbe match=0 next=0 +2: StateProbe match=0 next=1 +3: StateProbe match=0 next=1 diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/joint_idempotency.txt b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/joint_idempotency.txt new file mode 100644 index 0000000..6d1346b --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/joint_idempotency.txt @@ -0,0 +1,23 @@ +# Verify that operations upon entering the joint state are idempotent, i.e. +# removing an absent node is fine, etc. + +simple +v1 +---- +voters=(1) +1: StateProbe match=0 next=0 + +enter-joint +r1 r2 r9 v2 v3 v4 v2 v3 v4 l2 l2 r4 r4 l1 l1 +---- +voters=(3)&&(1) learners=(2) learners_next=(1) +1: StateProbe match=0 next=0 +2: StateProbe match=0 next=1 learner +3: StateProbe match=0 next=1 + +leave-joint +---- +voters=(3) learners=(1 2) +1: StateProbe match=0 next=0 learner +2: StateProbe match=0 next=1 learner +3: StateProbe match=0 next=1 diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/joint_learners_next.txt b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/joint_learners_next.txt new file mode 100644 index 0000000..df1da7d --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/joint_learners_next.txt @@ -0,0 +1,24 @@ +# Verify that when a voter is demoted in a joint config, it will show up in +# learners_next until the joint config is left, and only then will the progress +# turn into that of a learner, without resetting the progress. Note that this +# last fact is verified by `next`, which can tell us which "round" the progress +# was originally created in. 
+ +simple +v1 +---- +voters=(1) +1: StateProbe match=0 next=0 + +enter-joint +v2 l1 +---- +voters=(2)&&(1) learners_next=(1) +1: StateProbe match=0 next=0 +2: StateProbe match=0 next=1 + +leave-joint +---- +voters=(2) learners=(1) +1: StateProbe match=0 next=0 learner +2: StateProbe match=0 next=1 diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/joint_safety.txt b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/joint_safety.txt new file mode 100644 index 0000000..75d11b1 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/joint_safety.txt @@ -0,0 +1,81 @@ +leave-joint +---- +can't leave a non-joint config + +enter-joint +---- +can't make a zero-voter config joint + +enter-joint +v1 +---- +can't make a zero-voter config joint + +simple +v1 +---- +voters=(1) +1: StateProbe match=0 next=3 + +leave-joint +---- +can't leave a non-joint config + +# Can enter into joint config. +enter-joint +---- +voters=(1)&&(1) +1: StateProbe match=0 next=3 + +enter-joint +---- +config is already joint + +leave-joint +---- +voters=(1) +1: StateProbe match=0 next=3 + +leave-joint +---- +can't leave a non-joint config + +# Can enter again, this time with some ops. 
+enter-joint +r1 v2 v3 l4 +---- +voters=(2 3)&&(1) learners=(4) +1: StateProbe match=0 next=3 +2: StateProbe match=0 next=9 +3: StateProbe match=0 next=9 +4: StateProbe match=0 next=9 learner + +enter-joint +---- +config is already joint + +enter-joint +v12 +---- +config is already joint + +simple +l15 +---- +can't apply simple config change in joint config + +leave-joint +---- +voters=(2 3) learners=(4) +2: StateProbe match=0 next=9 +3: StateProbe match=0 next=9 +4: StateProbe match=0 next=9 learner + +simple +l9 +---- +voters=(2 3) learners=(4 9) +2: StateProbe match=0 next=9 +3: StateProbe match=0 next=9 +4: StateProbe match=0 next=9 learner +9: StateProbe match=0 next=14 learner diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/simple_idempotency.txt b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/simple_idempotency.txt new file mode 100644 index 0000000..2f7ca2e --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/simple_idempotency.txt @@ -0,0 +1,69 @@ +simple +v1 +---- +voters=(1) +1: StateProbe match=0 next=0 + +simple +v1 +---- +voters=(1) +1: StateProbe match=0 next=0 + +simple +v2 +---- +voters=(1 2) +1: StateProbe match=0 next=0 +2: StateProbe match=0 next=2 + +simple +l1 +---- +voters=(2) learners=(1) +1: StateProbe match=0 next=0 learner +2: StateProbe match=0 next=2 + +simple +l1 +---- +voters=(2) learners=(1) +1: StateProbe match=0 next=0 learner +2: StateProbe match=0 next=2 + +simple +r1 +---- +voters=(2) +2: StateProbe match=0 next=2 + +simple +r1 +---- +voters=(2) +2: StateProbe match=0 next=2 + +simple +v3 +---- +voters=(2 3) +2: StateProbe match=0 next=2 +3: StateProbe match=0 next=7 + +simple +r3 +---- +voters=(2) +2: StateProbe match=0 next=2 + +simple +r3 +---- +voters=(2) +2: StateProbe match=0 next=2 + +simple +r4 +---- +voters=(2) +2: StateProbe match=0 next=2 diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/simple_promote_demote.txt 
b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/simple_promote_demote.txt new file mode 100644 index 0000000..52369b4 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/simple_promote_demote.txt @@ -0,0 +1,60 @@ +# Set up three voters for this test. + +simple +v1 +---- +voters=(1) +1: StateProbe match=0 next=0 + +simple +v2 +---- +voters=(1 2) +1: StateProbe match=0 next=0 +2: StateProbe match=0 next=1 + +simple +v3 +---- +voters=(1 2 3) +1: StateProbe match=0 next=0 +2: StateProbe match=0 next=1 +3: StateProbe match=0 next=2 + +# Can atomically demote and promote without a hitch. +# This is pointless, but possible. +simple +l1 v1 +---- +voters=(1 2 3) +1: StateProbe match=0 next=0 +2: StateProbe match=0 next=1 +3: StateProbe match=0 next=2 + +# Can demote a voter. +simple +l2 +---- +voters=(1 3) learners=(2) +1: StateProbe match=0 next=0 +2: StateProbe match=0 next=1 learner +3: StateProbe match=0 next=2 + +# Can atomically promote and demote the same voter. +# This is pointless, but possible. +simple +v2 l2 +---- +voters=(1 3) learners=(2) +1: StateProbe match=0 next=0 +2: StateProbe match=0 next=1 learner +3: StateProbe match=0 next=2 + +# Can promote a voter. 
+simple +v2 +---- +voters=(1 2 3) +1: StateProbe match=0 next=0 +2: StateProbe match=0 next=1 +3: StateProbe match=0 next=2 diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/simple_safety.txt b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/simple_safety.txt new file mode 100644 index 0000000..6566c5f --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/simple_safety.txt @@ -0,0 +1,64 @@ +simple +l1 +---- +removed all voters + +simple +v1 +---- +voters=(1) +1: StateProbe match=0 next=1 + +simple +v2 l3 +---- +voters=(1 2) learners=(3) +1: StateProbe match=0 next=1 +2: StateProbe match=0 next=2 +3: StateProbe match=0 next=2 learner + +simple +r1 v5 +---- +more than one voter changed without entering joint config + +simple +r1 r2 +---- +removed all voters + +simple +v3 v4 +---- +more than one voter changed without entering joint config + +simple +l1 v5 +---- +more than one voter changed without entering joint config + +simple +l1 l2 +---- +removed all voters + +simple +l2 l3 l4 l5 +---- +voters=(1) learners=(2 3 4 5) +1: StateProbe match=0 next=1 +2: StateProbe match=0 next=2 learner +3: StateProbe match=0 next=2 learner +4: StateProbe match=0 next=8 learner +5: StateProbe match=0 next=8 learner + +simple +r1 +---- +removed all voters + +simple +r2 r3 r4 r5 +---- +voters=(1) +1: StateProbe match=0 next=1 diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/zero.txt b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/zero.txt new file mode 100644 index 0000000..5e0d46f --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/confchange/testdata/zero.txt @@ -0,0 +1,6 @@ +# NodeID zero is ignored. 
+simple +v1 r0 v0 l0 +---- +voters=(1) +1: StateProbe match=0 next=0 diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/config.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/config.rs new file mode 100644 index 0000000..3668db8 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/config.rs @@ -0,0 +1,219 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub use super::read_only::ReadOnlyOption; +use super::util::NO_LIMIT; +use super::{ + errors::{Error, Result}, + INVALID_ID, +}; + +/// Config contains the parameters to start a raft. +#[derive(Clone)] +pub struct Config { + /// The identity of the local raft. It cannot be 0, and must be unique in the group. + pub id: u64, + + /// The number of node.tick invocations that must pass between + /// elections. That is, if a follower does not receive any message from the + /// leader of current term before ElectionTick has elapsed, it will become + /// candidate and start an election. election_tick must be greater than + /// HeartbeatTick. We suggest election_tick = 10 * HeartbeatTick to avoid + /// unnecessary leader switching + pub election_tick: usize, + + /// HeartbeatTick is the number of node.tick invocations that must pass between + /// heartbeats. That is, a leader sends heartbeat messages to maintain its + /// leadership every heartbeat ticks. 
+ pub heartbeat_tick: usize, + + /// Applied is the last applied index. It should only be set when restarting + /// raft. raft will not return entries to the application smaller or equal to Applied. + /// If Applied is unset when restarting, raft might return previous applied entries. + /// This is a very application dependent configuration. + pub applied: u64, + + /// Limit the max size of each append message. Smaller value lowers + /// the raft recovery cost (initial probing and message loss during normal operation). + /// On the other side, it might affect the throughput during normal replication. + /// Note: u64::MAX for unlimited, 0 for at most one entry per message. + pub max_size_per_msg: u64, + + /// Limit the max number of in-flight append messages during optimistic + /// replication phase. The application transportation layer usually has its own sending + /// buffer over TCP/UDP. Set to avoid overflowing that sending buffer. + /// TODO: feedback to application to limit the proposal rate? + pub max_inflight_msgs: usize, + + /// Specify if the leader should check quorum activity. Leader steps down when + /// quorum is not active for an electionTimeout. + pub check_quorum: bool, + + /// Enables the Pre-Vote algorithm described in raft thesis section + /// 9.6. This prevents disruption when a node that has been partitioned away + /// rejoins the cluster. + pub pre_vote: bool, + + /// The range of election timeout. In some cases, we hope some nodes have less possibility + /// to become leader. This configuration ensures that the randomized election_timeout + /// will always lie in [min_election_tick, max_election_tick). + /// If it is 0, then election_tick will be chosen. + pub min_election_tick: usize, + + /// If it is 0, then 2 * election_tick will be chosen. + pub max_election_tick: usize, + + /// Choose the linearizability mode or the lease mode to read data.
If you don’t care about the read consistency and want a higher read performance, you can use the lease mode. + /// + /// Setting this to `LeaseBased` requires `check_quorum = true`. + pub read_only_option: ReadOnlyOption, + + /// Don't broadcast an empty raft entry to notify follower to commit an entry. + /// This may make follower wait a longer time to apply an entry. This configuration + /// May affect proposal forwarding and follower read. + pub skip_bcast_commit: bool, + + /// Batches every append msg if any append msg already exists + pub batch_append: bool, + + /// The election priority of this node. + pub priority: i64, + + /// Specify maximum of uncommitted entry size. + /// When this limit is reached, all proposals to append new log will be dropped + pub max_uncommitted_size: u64, + + /// Max size for committed entries in a `Ready`. + pub max_committed_size_per_ready: u64, + + /// Maximum raft log number that can be applied after commit but before persist. + /// The default value is 0, which means apply after both commit and persist. + pub max_apply_unpersisted_log_limit: u64, + + /// If enable, followers will not forward proposal to leader. + pub disable_proposal_forwarding: bool, +} + +impl Default for Config { + fn default() -> Self { + const HEARTBEAT_TICK: usize = 2; + Self { + id: 0, + election_tick: HEARTBEAT_TICK * 10, + heartbeat_tick: HEARTBEAT_TICK, + applied: 0, + max_size_per_msg: 0, + max_inflight_msgs: 256, + check_quorum: false, + pre_vote: false, + min_election_tick: 0, + max_election_tick: 0, + read_only_option: ReadOnlyOption::Safe, + skip_bcast_commit: false, + batch_append: false, + priority: 0, + max_uncommitted_size: NO_LIMIT, + max_committed_size_per_ready: NO_LIMIT, + max_apply_unpersisted_log_limit: 0, + disable_proposal_forwarding: false, + } + } +} + +impl Config { + /// Creates a new config. + pub fn new(id: u64) -> Self { + Self { + id, + ..Self::default() + } + } + + /// The minimum number of ticks before an election. 
+ #[inline] + pub fn min_election_tick(&self) -> usize { + if self.min_election_tick == 0 { + self.election_tick + } else { + self.min_election_tick + } + } + + /// The maximum number of ticks before an election. + #[inline] + pub fn max_election_tick(&self) -> usize { + if self.max_election_tick == 0 { + 2 * self.election_tick + } else { + self.max_election_tick + } + } + + /// Runs validations against the config. + pub fn validate(&self) -> Result<()> { + if self.id == INVALID_ID { + return Err(Error::ConfigInvalid("invalid node id".to_owned())); + } + + if self.heartbeat_tick == 0 { + return Err(Error::ConfigInvalid( + "heartbeat tick must greater than 0".to_owned(), + )); + } + + if self.election_tick <= self.heartbeat_tick { + return Err(Error::ConfigInvalid( + "election tick must be greater than heartbeat tick".to_owned(), + )); + } + + let min_timeout = self.min_election_tick(); + let max_timeout = self.max_election_tick(); + if min_timeout < self.election_tick { + return Err(Error::ConfigInvalid(format!( + "min election tick {} must not be less than election_tick {}", + min_timeout, self.election_tick + ))); + } + + if min_timeout >= max_timeout { + return Err(Error::ConfigInvalid(format!( + "min election tick {} should be less than max election tick {}", + min_timeout, max_timeout + ))); + } + + if self.max_inflight_msgs == 0 { + return Err(Error::ConfigInvalid( + "max inflight messages must be greater than 0".to_owned(), + )); + } + + if self.read_only_option == ReadOnlyOption::LeaseBased && !self.check_quorum { + return Err(Error::ConfigInvalid( + "read_only_option == LeaseBased requires check_quorum == true".into(), + )); + } + + if self.max_uncommitted_size < self.max_size_per_msg { + return Err(Error::ConfigInvalid( + "max uncommitted size should greater than max_size_per_msg".to_owned(), + )); + } + + Ok(()) + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/errors.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/errors.rs new file mode 
100644 index 0000000..2e6d247 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/errors.rs @@ -0,0 +1,174 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. +use thiserror::Error; + +/// The base error type for raft +#[derive(Debug, Error)] +pub enum Error { + /// An IO error occurred + #[error("{0}")] + Io(#[from] std::io::Error), + /// A storage error occurred. + #[error("{0}")] + Store(#[from] StorageError), + /// Raft cannot step the local message. + #[error("raft: cannot step raft local message")] + StepLocalMsg, + /// The raft peer is not found and thus cannot step. + #[error("raft: cannot step as peer not found")] + StepPeerNotFound, + /// The proposal of changes was dropped. + #[error("raft: proposal dropped")] + ProposalDropped, + /// The configuration is invalid. + #[error("{0}")] + ConfigInvalid(String), + /// A protobuf message codec failed in some manner. + #[error("protobuf codec error {0:?}")] + CodecError(#[from] protobuf::ProtobufError), + /// The node exists, but should not. + #[error("The node {id} already exists in the {set} set.")] + Exists { + /// The node id. + id: u64, + /// The node set. + set: &'static str, + }, + /// The node does not exist, but should. + #[error("The node {id} is not in the {set} set.")] + NotExists { + /// The node id. + id: u64, + /// The node set. + set: &'static str, + }, + /// ConfChange proposal is invalid. + #[error("{0}")] + ConfChangeError(String), + /// The request snapshot is dropped. 
+ #[error("raft: request snapshot dropped")] + RequestSnapshotDropped, +} + +impl PartialEq for Error { + fn eq(&self, other: &Error) -> bool { + match (self, other) { + (Error::StepPeerNotFound, Error::StepPeerNotFound) => true, + (Error::ProposalDropped, Error::ProposalDropped) => true, + (Error::Store(ref e1), Error::Store(ref e2)) => e1 == e2, + (Error::Io(ref e1), Error::Io(ref e2)) => e1.kind() == e2.kind(), + (Error::StepLocalMsg, Error::StepLocalMsg) => true, + (Error::ConfigInvalid(ref e1), Error::ConfigInvalid(ref e2)) => e1 == e2, + (Error::RequestSnapshotDropped, Error::RequestSnapshotDropped) => true, + (Error::ConfChangeError(e1), Error::ConfChangeError(e2)) => e1 == e2, + _ => false, + } + } +} + +/// An error with the storage. +#[derive(Debug, Error)] +pub enum StorageError { + /// The storage was compacted and not accessible + #[error("log compacted")] + Compacted, + /// The log is not available. + #[error("log unavailable")] + Unavailable, + /// The log is being fetched. + #[error("log is temporarily unavailable")] + LogTemporarilyUnavailable, + /// The snapshot is out of date. + #[error("snapshot out of date")] + SnapshotOutOfDate, + /// The snapshot is being created. + #[error("snapshot is temporarily unavailable")] + SnapshotTemporarilyUnavailable, + /// Some other error occurred. + #[error("unknown error {0}")] + Other(#[from] Box), +} + +impl PartialEq for StorageError { + fn eq(&self, other: &StorageError) -> bool { + matches!( + (self, other), + (StorageError::Compacted, StorageError::Compacted) + | (StorageError::Unavailable, StorageError::Unavailable) + | ( + StorageError::LogTemporarilyUnavailable, + StorageError::LogTemporarilyUnavailable + ) + | ( + StorageError::SnapshotOutOfDate, + StorageError::SnapshotOutOfDate + ) + | ( + StorageError::SnapshotTemporarilyUnavailable, + StorageError::SnapshotTemporarilyUnavailable, + ) + ) + } +} + +/// A result type that wraps up the raft errors. 
+pub type Result = std::result::Result; + +#[allow(clippy::eq_op)] +#[cfg(test)] +mod tests { + use super::*; + use std::io; + + #[test] + fn test_error_equal() { + assert_eq!(Error::StepPeerNotFound, Error::StepPeerNotFound); + assert_eq!( + Error::Store(StorageError::Compacted), + Error::Store(StorageError::Compacted) + ); + assert_eq!( + Error::Io(io::Error::new(io::ErrorKind::UnexpectedEof, "oh no!")), + Error::Io(io::Error::new(io::ErrorKind::UnexpectedEof, "oh yes!")) + ); + assert_ne!( + Error::Io(io::Error::new(io::ErrorKind::NotFound, "error")), + Error::Io(io::Error::new(io::ErrorKind::BrokenPipe, "error")) + ); + assert_eq!(Error::StepLocalMsg, Error::StepLocalMsg); + assert_eq!( + Error::ConfigInvalid(String::from("config error")), + Error::ConfigInvalid(String::from("config error")) + ); + assert_ne!( + Error::ConfigInvalid(String::from("config error")), + Error::ConfigInvalid(String::from("other error")) + ); + assert_eq!( + Error::from(io::Error::new(io::ErrorKind::Other, "oh no!")), + Error::from(io::Error::new(io::ErrorKind::Other, "oh yes!")) + ); + assert_ne!( + Error::StepPeerNotFound, + Error::Store(StorageError::Compacted) + ); + } + + #[test] + fn test_storage_error_equal() { + assert_eq!(StorageError::Compacted, StorageError::Compacted); + assert_eq!(StorageError::Unavailable, StorageError::Unavailable); + assert_eq!( + StorageError::SnapshotOutOfDate, + StorageError::SnapshotOutOfDate + ); + assert_eq!( + StorageError::SnapshotTemporarilyUnavailable, + StorageError::SnapshotTemporarilyUnavailable + ); + assert_ne!(StorageError::Compacted, StorageError::Unavailable); + assert_ne!( + StorageError::Other(Box::new(StorageError::Unavailable)), + StorageError::Unavailable + ); + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/lib.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/lib.rs new file mode 100644 index 0000000..fc01400 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/lib.rs @@ -0,0 +1,598 @@ +// Copyright 2019 
TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/*! + +## Creating a Raft node + +You can use [`RawNode::new`] to create the Raft node. To create the Raft node, you need to +provide a [`Storage`] component, and a [`Config`] to the [`RawNode::new`] function. + +```rust +use raft::{ + Config, + storage::MemStorage, + raw_node::RawNode, +}; +use slog::{Drain, o}; + +// Select some defaults, then change what we need. +let config = Config { + id: 1, + ..Default::default() +}; +// Initialize logger. +let logger = slog::Logger::root(slog_stdlog::StdLog.fuse(), o!()); +// ... Make any configuration changes. +// After, make sure it's valid! +config.validate().unwrap(); +// We'll use the built-in `MemStorage`, but you will likely want your own. +// Finally, create our Raft node! +let storage = MemStorage::new_with_conf_state((vec![1], vec![])); +let mut node = RawNode::new(&config, storage, &logger).unwrap(); +``` + +## Ticking the Raft node + +Use a timer to tick the Raft node at regular intervals. See the following example using Rust +channel `recv_timeout` to drive the Raft node at least every 100ms, calling +[`tick()`](RawNode::tick) each time. 
+ +```rust +# use slog::{Drain, o}; +# use raft::{Config, storage::MemStorage, raw_node::RawNode}; +# let config = Config { id: 1, ..Default::default() }; +# let store = MemStorage::new_with_conf_state((vec![1], vec![])); +# let logger = slog::Logger::root(slog_stdlog::StdLog.fuse(), o!()); +# let mut node = RawNode::new(&config, store, &logger).unwrap(); +# node.raft.become_candidate(); +# node.raft.become_leader(); +use std::{sync::mpsc::{channel, RecvTimeoutError}, time::{Instant, Duration}}; + +// We're using a channel, but this could be any stream of events. +let (tx, rx) = channel(); +let timeout = Duration::from_millis(100); +let mut remaining_timeout = timeout; + +// Send the `tx` somewhere else... + +loop { + let now = Instant::now(); + + match rx.recv_timeout(remaining_timeout) { + Ok(()) => { + // Let's save this for later. + unimplemented!() + }, + Err(RecvTimeoutError::Timeout) => (), + Err(RecvTimeoutError::Disconnected) => unimplemented!(), + } + + let elapsed = now.elapsed(); + if elapsed >= remaining_timeout { + remaining_timeout = timeout; + // We drive Raft every 100ms. + node.tick(); + } else { + remaining_timeout -= elapsed; + } +# break; +} +``` + +## Proposing to, and stepping the Raft node + +Using the `propose` function you can drive the Raft node when the client sends a request to the +Raft server. You can call `propose` to add the request to the Raft log explicitly. + +In most cases, the client needs to wait for a response for the request. For example, if the +client writes a value to a key and wants to know whether the write succeeds or not, but the +write flow is asynchronous in Raft, so the write log entry must be replicated to other followers, +then committed and at last applied to the state machine, so here we need a way to notify the client +after the write is finished. + +One simple way is to use a unique ID for the client request, and save the associated callback +function in a hash map. 
When the log entry is applied, we can get the ID from the decoded entry, +call the corresponding callback, and notify the client. + +You can call the `step` function when you receive the Raft messages from other nodes. + +Here is a simple example to use `propose` and `step`: + +```rust +# use raft::{Config, storage::MemStorage, raw_node::RawNode, eraftpb::Message}; +# use std::{ +# sync::mpsc::{channel, RecvTimeoutError}, +# time::{Instant, Duration}, +# collections::HashMap +# }; +# use slog::{Drain, o}; +# +# let config = Config { id: 1, ..Default::default() }; +# let store = MemStorage::new_with_conf_state((vec![1], vec![])); +# let logger = slog::Logger::root(slog_stdlog::StdLog.fuse(), o!()); +# let mut node = RawNode::new(&config, store, &logger).unwrap(); +# node.raft.become_candidate(); +# node.raft.become_leader(); +# +# let (tx, rx) = channel(); +# let timeout = Duration::from_millis(100); +# let mut remaining_timeout = timeout; +# +enum Msg { + Propose { + id: u8, + callback: Box, + }, + Raft(Message), +} + +// Simulate a message coming down the stream. +tx.send(Msg::Propose { id: 1, callback: Box::new(|| ()) }); + +let mut cbs = HashMap::new(); +loop { + let now = Instant::now(); + + match rx.recv_timeout(remaining_timeout) { + Ok(Msg::Propose { id, callback }) => { + cbs.insert(id, callback); + node.propose(vec![], vec![id]).unwrap(); + } + Ok(Msg::Raft(m)) => node.step(m).unwrap(), + Err(RecvTimeoutError::Timeout) => (), + Err(RecvTimeoutError::Disconnected) => unimplemented!(), + } + + let elapsed = now.elapsed(); + if elapsed >= remaining_timeout { + remaining_timeout = timeout; + // We drive Raft every 100ms. + node.tick(); + } else { + remaining_timeout -= elapsed; + } + break; +} +``` + +In the above example, we use a channel to receive the `propose` and `step` messages. We only +propose the request ID to the Raft log. In your own practice, you can embed the ID in your request +and propose the encoded binary request data. 
+ +## Processing the `Ready` State + +When your Raft node is ticked and running, Raft should enter a `Ready` state. You need to first use +`has_ready` to check whether Raft is ready. If yes, use the `ready` function to get a `Ready` +state: + +```rust +# use slog::{Drain, o}; +# use raft::{Config, storage::MemStorage, raw_node::RawNode}; +# +# let config = Config { id: 1, ..Default::default() }; +# config.validate().unwrap(); +# let store = MemStorage::new_with_conf_state((vec![1], vec![])); +# let logger = slog::Logger::root(slog_stdlog::StdLog.fuse(), o!()); +# let mut node = RawNode::new(&config, store, &logger).unwrap(); +# +if !node.has_ready() { + return; +} + +// The Raft is ready, we can do something now. +let mut ready = node.ready(); +``` + +The `Ready` state contains quite a bit of information, and you need to check and process them one +by one: + +1. Check whether `messages` is empty or not. If not, it means that the node will send messages to + other nodes: + + ```rust + # use slog::{Drain, o}; + # use raft::{Config, storage::MemStorage, raw_node::RawNode, StateRole}; + # + # let config = Config { id: 1, ..Default::default() }; + # config.validate().unwrap(); + # let store = MemStorage::new_with_conf_state((vec![1], vec![])); + # let logger = slog::Logger::root(slog_stdlog::StdLog.fuse(), o!()); + # let mut node = RawNode::new(&config, store, &logger).unwrap(); + # + # if !node.has_ready() { + # return; + # } + # let mut ready = node.ready(); + # + if !ready.messages().is_empty() { + for msg in ready.take_messages() { + // Send messages to other peers. + } + } + ``` + +2. Check whether `snapshot` is empty or not. 
If not empty, it means that the Raft node has received + a Raft snapshot from the leader and we must apply the snapshot: + + ```rust + # use slog::{Drain, o}; + # use raft::{Config, storage::MemStorage, raw_node::RawNode}; + # + # let config = Config { id: 1, ..Default::default() }; + # config.validate().unwrap(); + # let store = MemStorage::new_with_conf_state((vec![1], vec![])); + # let logger = slog::Logger::root(slog_stdlog::StdLog.fuse(), o!()); + # let mut node = RawNode::new(&config, store, &logger).unwrap(); + # + # if !node.has_ready() { + # return; + # } + # let mut ready = node.ready(); + # + if !ready.snapshot().is_empty() { + // This is a snapshot, we need to apply the snapshot at first. + node.mut_store() + .wl() + .apply_snapshot(ready.snapshot().clone()) + .unwrap(); + } + + ``` + +3. Check whether `committed_entries` is empty or not. If not, it means that there are some newly + committed log entries which you must apply to the state machine. Of course, after applying, you + need to update the applied index and resume `apply` later: + + ```rust + # use slog::{Drain, o}; + # use raft::{Config, storage::MemStorage, raw_node::RawNode, eraftpb::EntryType}; + # + # let config = Config { id: 1, ..Default::default() }; + # config.validate().unwrap(); + # let store = MemStorage::new_with_conf_state((vec![1], vec![])); + # let logger = slog::Logger::root(slog_stdlog::StdLog.fuse(), o!()); + # let mut node = RawNode::new(&config, store, &logger).unwrap(); + # + # if !node.has_ready() { + # return; + # } + # let mut ready = node.ready(); + # + # fn handle_conf_change(e: raft::eraftpb::Entry) { + # } + # + # fn handle_conf_change_v2(e: raft::eraftpb::Entry) { + # } + # + # fn handle_normal(e: raft::eraftpb::Entry) { + # } + # + let mut _last_apply_index = 0; + for entry in ready.take_committed_entries() { + // Mostly, you need to save the last apply index to resume applying + // after restart. Here we just ignore this because we use a Memory storage. 
+ _last_apply_index = entry.index; + + if entry.data.is_empty() { + // Empty entry, when the peer becomes Leader it will send an empty entry. + continue; + } + + match entry.get_entry_type() { + EntryType::EntryNormal => handle_normal(entry), + // It's recommended to always use `EntryType::EntryConfChangeV2`. + EntryType::EntryConfChange => handle_conf_change(entry), + EntryType::EntryConfChangeV2 => handle_conf_change_v2(entry), + } + } + ``` + + Note: although Raft guarantees that only persisted committed entries will be applied, + it doesn't guarantee that the commit index is persisted before being applied. For example, + if application is restarted after applying committed entries before persisting + commit index, apply index can be larger than commit index and cause panic. To + solve the problem, persist the commit index with or before applying entries. + You can also always assign commit index to the `max(commit_index, applied_index)` + after restarting, *it may work but potential log loss may also be ignored silently*. + +4. Check whether `entries` is empty or not. If not empty, it means that there are newly added + entries that have not been committed yet, and we must append the entries to the Raft log: + + ```rust + # use slog::{Drain, o}; + # use raft::{Config, storage::MemStorage, raw_node::RawNode}; + # + # let config = Config { id: 1, ..Default::default() }; + # config.validate().unwrap(); + # let store = MemStorage::new_with_conf_state((vec![1], vec![])); + # let logger = slog::Logger::root(slog_stdlog::StdLog.fuse(), o!()); + # let mut node = RawNode::new(&config, store, &logger).unwrap(); + # + # if !node.has_ready() { + # return; + # } + # let mut ready = node.ready(); + # + if !ready.entries().is_empty() { + // Append entries to the Raft log + node.mut_store().wl().append(ready.entries()).unwrap(); + } + + ``` + +5. Check whether `hs` is empty or not. If not empty, it means that the `HardState` of the node has + changed.
For example, the node may vote for a new leader, or the commit index has been increased. + We must persist the changed `HardState`: + + ```rust + # use slog::{Drain, o}; + # use raft::{Config, storage::MemStorage, raw_node::RawNode}; + # + # let config = Config { id: 1, ..Default::default() }; + # config.validate().unwrap(); + # let store = MemStorage::new_with_conf_state((vec![1], vec![])); + # let logger = slog::Logger::root(slog_stdlog::StdLog.fuse(), o!()); + # let mut node = RawNode::new(&config, store, &logger).unwrap(); + # + # if !node.has_ready() { + # return; + # } + # let mut ready = node.ready(); + # + if let Some(hs) = ready.hs() { + // Raft HardState changed, and we need to persist it. + node.mut_store().wl().set_hardstate(hs.clone()); + } + ``` + +6. Check whether `persisted_messages` is empty or not. If not, it means that the node will send messages to + other nodes after persisting hardstate, entries and snapshot: + + ```rust + # use slog::{Drain, o}; + # use raft::{Config, storage::MemStorage, raw_node::RawNode, StateRole}; + # + # let config = Config { id: 1, ..Default::default() }; + # config.validate().unwrap(); + # let store = MemStorage::new_with_conf_state((vec![1], vec![])); + # let logger = slog::Logger::root(slog_stdlog::StdLog.fuse(), o!()); + # let mut node = RawNode::new(&config, store, &logger).unwrap(); + # + # if !node.has_ready() { + # return; + # } + # let mut ready = node.ready(); + # + if !ready.persisted_messages().is_empty() { + for msg in ready.take_persisted_messages() { + // Send persisted messages to other peers. + } + } + ``` + +7. Call `advance` to notify that the previous work is completed. Get the return value `LightReady` + and handle its `messages` and `committed_entries` like step 1 and step 3 does. Then call `advance_apply` + to advance the applied index inside. 
+ + ```rust + # use slog::{Drain, o}; + # use raft::{Config, storage::MemStorage, raw_node::RawNode}; + # use raft::eraftpb::{EntryType, Entry, Message}; + # + # let config = Config { id: 1, ..Default::default() }; + # config.validate().unwrap(); + # let store = MemStorage::new_with_conf_state((vec![1], vec![])); + # let logger = slog::Logger::root(slog_stdlog::StdLog.fuse(), o!()); + # let mut node = RawNode::new(&config, store, &logger).unwrap(); + # + # if !node.has_ready() { + # return; + # } + # let mut ready = node.ready(); + # + # fn handle_messages(msgs: Vec) { + # } + # + # fn handle_committed_entries(committed_entries: Vec) { + # } + let mut light_rd = node.advance(ready); + // Like step 1 and 3, you can use functions to make them behave the same. + handle_messages(light_rd.take_messages()); + handle_committed_entries(light_rd.take_committed_entries()); + node.advance_apply(); + ``` + +For more information, check out an [example](examples/single_mem_node/main.rs#L113-L179). + +Sometimes it's better not to block the raft machine in IO operation, so that latency of +read/write can be more predictable and the fsync frequency can be controlled. The crate +supports async ready to offload the IO operation to other thread. The usage is the same as +above except: +1. All writes are not required to be persisted immediately, they can be written into memory caches; +2. Persisted messages should be sent after all corresponding writes are persisted; +3. [`advance_append_async`](RawNode::advance_append_async) is used when all writes are finished + instead of `advance/advance_append`. +4. Only persisted entries can be committed and applied, so to make progress, all writes should + be persisted at some point. 
+ +## Arbitrary Membership Changes + +When building a resilient, scalable distributed system there is a strong need to be able to change +the membership of a peer group *dynamically, without downtime.* This Raft crate supports this via +**Joint Consensus** +([Raft paper, section 6](https://web.stanford.edu/~ouster/cgi-bin/papers/raft-atc14)). + +It permits resilient arbitrary dynamic membership changes. A membership change can do any or all of +the following: + +* Add peer (learner or voter) *n* to the group. +* Remove a learner *n* from the group. +* Promote a learner to a voter. +* Demote a voter back to learner. +* Replace a node *n* with another node *m*. + +For example to promote a learner 4 and demote an existing voter 3: +```no_run +# use raft::{Config, storage::MemStorage, raw_node::RawNode, eraftpb::*}; +# use protobuf::Message as PbMessage; +# use slog::{Drain, o}; +# +# let mut config = Config { id: 1, ..Default::default() }; +# let store = MemStorage::new_with_conf_state((vec![1, 2], vec![])); +# let logger = slog::Logger::root(slog_stdlog::StdLog.fuse(), o!()); +# let mut node = RawNode::new(&mut config, store, &logger).unwrap(); +let steps = vec![ + raft_proto::new_conf_change_single(4, ConfChangeType::AddNode), + raft_proto::new_conf_change_single(3, ConfChangeType::RemoveNode), +]; +let mut cc = ConfChangeV2::default(); +cc.set_changes(steps.into()); +node.propose_conf_change(vec![], cc).unwrap(); +// After the log is committed and applied +// node.apply_conf_change(&cc).unwrap(); +``` + +This process is a two-phase process, during the midst of it the peer group's leader is managing +**two independent, possibly overlapping peer sets**. + +> **Note:** In order to maintain resiliency guarantees (progress while a majority of both peer sets is +> active), it is recommended to wait until the entire peer group has exited the transition phase +> before taking old, removed peers offline. 
+ +*/ + +#![deny(clippy::all)] +#![deny(missing_docs)] +#![recursion_limit = "128"] +// TODO: remove this when we update the mininum rust compatible version. +#![allow(unused_imports)] +// This is necessary to support prost and rust-protobuf at the same time. +#![allow(clippy::useless_conversion)] +// This lint recommends some bad choices sometimes. +#![allow(clippy::unnecessary_unwrap)] +// We use `default` method a lot to be support prost and rust-protobuf at the +// same time. And reassignment can be optimized by compiler. +#![allow(clippy::field_reassign_with_default)] + +macro_rules! fatal { + ($logger:expr, $msg:expr) => {{ + let owned_kv = ($logger).list(); + let s = crate::util::format_kv_list(&owned_kv); + if s.is_empty() { + panic!("{}", $msg) + } else { + panic!("{}, {}", $msg, s) + } + }}; + ($logger:expr, $fmt:expr, $($arg:tt)+) => {{ + fatal!($logger, format_args!($fmt, $($arg)+)) + }}; +} + +mod confchange; +mod config; +mod errors; +mod log_unstable; +mod quorum; +mod raft; +mod raft_log; +pub mod raw_node; +mod read_only; +mod status; +pub mod storage; +mod tracker; +pub mod util; + +pub use crate::raft::{ + vote_resp_msg_type, Raft, SoftState, StateRole, CAMPAIGN_ELECTION, CAMPAIGN_PRE_ELECTION, + CAMPAIGN_TRANSFER, INVALID_ID, INVALID_INDEX, +}; +pub use confchange::{Changer, MapChange}; +pub use config::Config; +pub use errors::{Error, Result, StorageError}; +pub use log_unstable::Unstable; +pub use quorum::joint::Configuration as JointConfig; +pub use quorum::majority::Configuration as MajorityConfig; +pub use raft_log::{RaftLog, NO_LIMIT}; +pub use raft_proto::eraftpb; +#[allow(deprecated)] +pub use raw_node::is_empty_snap; +pub use raw_node::{LightReady, Peer, RawNode, Ready, SnapshotStatus}; +pub use read_only::{ReadOnlyOption, ReadState}; +pub use status::Status; +pub use storage::{GetEntriesContext, RaftState, Storage}; +pub use tracker::{Inflights, Progress, ProgressState, ProgressTracker}; +pub use util::majority; + +pub mod prelude { + 
//! A "prelude" for crates using the `raft` crate. + //! + //! This prelude is similar to the standard library's prelude in that you'll + //! almost always want to import its entire contents, but unlike the standard + //! library's prelude you'll have to do so manually: + //! + //! ``` + //! use raft::prelude::*; + //! ``` + //! + //! The prelude may grow over time as additional items see ubiquitous use. + + pub use raft_proto::prelude::*; + + pub use crate::config::Config; + pub use crate::raft::Raft; + + pub use crate::storage::{RaftState, Storage}; + + pub use crate::raw_node::{Peer, RawNode, Ready, SnapshotStatus}; + + pub use crate::Progress; + + pub use crate::status::Status; + + pub use crate::read_only::{ReadOnlyOption, ReadState}; +} + +/// The default logger we fall back to when passed `None` in external facing constructors. +/// +/// Currently, this is a `log` adaptor behind a `OnceLock` to ensure there is no clobbering. +#[cfg(any(test, feature = "default-logger"))] +pub fn default_logger() -> slog::Logger { + use slog::{o, Drain}; + use std::sync::{Mutex, OnceLock}; + + static LOGGER_INITIALIZED: OnceLock = OnceLock::new(); + let logger = LOGGER_INITIALIZED.get_or_init(|| { + let decorator = slog_term::TermDecorator::new().build(); + let drain = slog_term::CompactFormat::new(decorator).build(); + let drain = slog_envlogger::new(drain); + slog::Logger::root(Mutex::new(drain).fuse(), o!()) + }); + + if let Some(case) = std::thread::current() + .name() + .and_then(|v| v.split(':').last()) + { + logger.new(o!("case" => case.to_string())) + } else { + logger.new(o!()) + } +} + +type DefaultHashBuilder = std::hash::BuildHasherDefault; +type HashMap = std::collections::HashMap; +type HashSet = std::collections::HashSet; diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/log_unstable.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/log_unstable.rs new file mode 100644 index 0000000..dd90615 --- /dev/null +++ 
b/systems/Raft-rs/driver/rust_so/raft-rs/src/log_unstable.rs @@ -0,0 +1,482 @@ +//! A representation of not-yet-committed log entries and state. + +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::eraftpb::{Entry, Snapshot}; +use crate::util::entry_approximate_size; +use slog::Logger; + +/// Unstable contains "unstable" log entries and snapshot state that has +/// not yet been written to Storage. +/// +/// The `unstable.entries[i]` has raft log position `i+unstable.offset`. +/// Note that `unstable.offset` may be less than the highest log +/// position in storage; this means that the next write to storage +/// might need to truncate the log before persisting unstable.entries. +#[derive(Debug)] +pub struct Unstable { + /// The incoming unstable snapshot, if any. + pub snapshot: Option, + + /// All entries that have not yet been written to storage. + pub entries: Vec, + + /// The size of entries. + pub entries_size: usize, + + /// The offset from the vector index. + pub offset: u64, + + /// The tag to use when logging. + pub logger: Logger, +} + +impl Unstable { + /// Creates a new log of unstable entries. 
+ pub fn new(offset: u64, logger: Logger) -> Unstable { + Unstable { + offset, + snapshot: None, + entries: vec![], + entries_size: 0, + logger, + } + } + + /// Returns the index of the first possible entry in entries + /// if it has a snapshot. + pub fn maybe_first_index(&self) -> Option { + self.snapshot + .as_ref() + .map(|snap| snap.get_metadata().index + 1) + } + + /// Returns the last index if it has at least one unstable entry or snapshot. + pub fn maybe_last_index(&self) -> Option { + match self.entries.len() { + 0 => self.snapshot.as_ref().map(|snap| snap.get_metadata().index), + len => Some(self.offset + len as u64 - 1), + } + } + + /// Returns the term of the entry at index idx, if there is any. + pub fn maybe_term(&self, idx: u64) -> Option { + if idx < self.offset { + let snapshot = self.snapshot.as_ref()?; + let meta = snapshot.get_metadata(); + if idx == meta.index { + Some(meta.term) + } else { + None + } + } else { + self.maybe_last_index().and_then(|last| { + if idx > last { + return None; + } + Some(self.entries[(idx - self.offset) as usize].term) + }) + } + } + + /// Clears the unstable entries and moves the stable offset up to the + /// last index, if there is any. + pub fn stable_entries(&mut self, index: u64, term: u64) { + // The snapshot must be stabled before entries + assert!(self.snapshot.is_none()); + if let Some(entry) = self.entries.last() { + if entry.get_index() != index || entry.get_term() != term { + fatal!( + self.logger, + "the last one of unstable.slice has different index {} and term {}, expect {} {}", + entry.get_index(), + entry.get_term(), + index, + term + ); + } + self.offset = entry.get_index() + 1; + self.entries.clear(); + self.entries_size = 0; + } else { + fatal!( + self.logger, + "unstable.slice is empty, expect its last one's index and term are {} and {}", + index, + term + ); + } + } + + /// Clears the unstable snapshot. 
+ pub fn stable_snap(&mut self, index: u64) { + if let Some(snap) = &self.snapshot { + if snap.get_metadata().index != index { + fatal!( + self.logger, + "unstable.snap has different index {}, expect {}", + snap.get_metadata().index, + index + ); + } + self.snapshot = None; + } else { + fatal!( + self.logger, + "unstable.snap is none, expect a snapshot with index {}", + index + ); + } + } + + /// From a given snapshot, restores the snapshot to self, but doesn't unpack. + pub fn restore(&mut self, snap: Snapshot) { + self.entries.clear(); + self.entries_size = 0; + self.offset = snap.get_metadata().index + 1; + self.snapshot = Some(snap); + } + + /// Append entries to unstable, truncate local block first if overlapped. + /// + /// # Panics + /// + /// Panics if truncate logs to the entry before snapshot + pub fn truncate_and_append(&mut self, ents: &[Entry]) { + let after = ents[0].index; + if after == self.offset + self.entries.len() as u64 { + // after is the next index in the self.entries, append directly + } else if after <= self.offset { + // The log is being truncated to before our current offset + // portion, so set the offset and replace the entries + self.offset = after; + self.entries.clear(); + self.entries_size = 0; + } else { + // truncate to after and copy to self.entries then append + let off = self.offset; + self.must_check_outofbounds(off, after); + for e in &self.entries[(after - off) as usize..] { + self.entries_size -= entry_approximate_size(e); + } + self.entries.truncate((after - off) as usize); + } + self.entries.extend_from_slice(ents); + self.entries_size += ents.iter().map(entry_approximate_size).sum::(); + } + + /// Returns a slice of entries between the high and low. + /// + /// # Panics + /// + /// Panics if the `lo` or `hi` are out of bounds. + /// Panics if `lo > hi`. 
+ pub fn slice(&self, lo: u64, hi: u64) -> &[Entry] { + self.must_check_outofbounds(lo, hi); + let l = lo as usize; + let h = hi as usize; + let off = self.offset as usize; + &self.entries[l - off..h - off] + } + + /// Asserts the `hi` and `lo` values against each other and against the + /// entries themselves. + pub fn must_check_outofbounds(&self, lo: u64, hi: u64) { + if lo > hi { + fatal!(self.logger, "invalid unstable.slice {} > {}", lo, hi) + } + let upper = self.offset + self.entries.len() as u64; + if lo < self.offset || hi > upper { + fatal!( + self.logger, + "unstable.slice[{}, {}] out of bound[{}, {}]", + lo, + hi, + self.offset, + upper + ) + } + } +} + +#[cfg(test)] +mod test { + use crate::eraftpb::{Entry, Snapshot, SnapshotMetadata}; + use crate::log_unstable::Unstable; + use crate::util::entry_approximate_size; + + fn new_entry(index: u64, term: u64) -> Entry { + let mut e = Entry::default(); + e.term = term; + e.index = index; + e + } + + fn new_snapshot(index: u64, term: u64) -> Snapshot { + let mut snap = Snapshot::default(); + let mut meta = SnapshotMetadata::default(); + meta.index = index; + meta.term = term; + snap.set_metadata(meta); + snap + } + + #[test] + fn test_maybe_first_index() { + // entry, offset, snap, wok, windex, + let tests = vec![ + // no snapshot + (Some(new_entry(5, 1)), 5, None, false, 0), + (None, 0, None, false, 0), + // has snapshot + (Some(new_entry(5, 1)), 5, Some(new_snapshot(4, 1)), true, 5), + (None, 5, Some(new_snapshot(4, 1)), true, 5), + ]; + + for (e, offset, snapshot, wok, windex) in tests { + let mut entries_size = 0; + let mut entries = vec![]; + if let Some(entry) = e { + entries_size = entry_approximate_size(&entry); + entries = vec![entry]; + } + let u = Unstable { + entries, + entries_size, + offset, + snapshot, + logger: crate::default_logger(), + }; + let index = u.maybe_first_index(); + match index { + None => assert!(!wok), + Some(index) => assert_eq!(index, windex), + } + } + } + + #[test] + fn 
test_maybe_last_index() { + // entry, offset, snap, wok, windex, + let tests = vec![ + (Some(new_entry(5, 1)), 5, None, true, 5), + (Some(new_entry(5, 1)), 5, Some(new_snapshot(4, 1)), true, 5), + // last in snapshot + (None, 5, Some(new_snapshot(4, 1)), true, 4), + // empty unstable + (None, 0, None, false, 0), + ]; + + for (e, offset, snapshot, wok, windex) in tests { + let mut entries_size = 0; + let mut entries = vec![]; + if let Some(entry) = e { + entries_size = entry_approximate_size(&entry); + entries = vec![entry]; + } + let u = Unstable { + entries, + entries_size, + offset, + snapshot, + logger: crate::default_logger(), + }; + let index = u.maybe_last_index(); + match index { + None => assert!(!wok), + Some(index) => assert_eq!(index, windex), + } + } + } + + #[test] + fn test_maybe_term() { + // entry, offset, snap, index, wok, wterm + let tests = vec![ + // term from entries + (Some(new_entry(5, 1)), 5, None, 5, true, 1), + (Some(new_entry(5, 1)), 5, None, 6, false, 0), + (Some(new_entry(5, 1)), 5, None, 4, false, 0), + ( + Some(new_entry(5, 1)), + 5, + Some(new_snapshot(4, 1)), + 5, + true, + 1, + ), + ( + Some(new_entry(5, 1)), + 5, + Some(new_snapshot(4, 1)), + 6, + false, + 0, + ), + // term from snapshot + ( + Some(new_entry(5, 1)), + 5, + Some(new_snapshot(4, 1)), + 4, + true, + 1, + ), + ( + Some(new_entry(5, 1)), + 5, + Some(new_snapshot(4, 1)), + 3, + false, + 0, + ), + (None, 5, Some(new_snapshot(4, 1)), 5, false, 0), + (None, 5, Some(new_snapshot(4, 1)), 4, true, 1), + (None, 0, None, 5, false, 0), + ]; + + for (e, offset, snapshot, index, wok, wterm) in tests { + let mut entries_size = 0; + let mut entries = vec![]; + if let Some(entry) = e { + entries_size = entry_approximate_size(&entry); + entries = vec![entry]; + } + let u = Unstable { + entries, + entries_size, + offset, + snapshot, + logger: crate::default_logger(), + }; + let term = u.maybe_term(index); + match term { + None => assert!(!wok), + Some(term) => assert_eq!(term, wterm), 
+ } + } + } + + #[test] + fn test_restore() { + let mut u = Unstable { + entries: vec![new_entry(5, 1)], + entries_size: entry_approximate_size(&new_entry(5, 1)), + offset: 5, + snapshot: Some(new_snapshot(4, 1)), + logger: crate::default_logger(), + }; + + let s = new_snapshot(6, 2); + u.restore(s.clone()); + + assert_eq!(u.offset, s.get_metadata().index + 1); + assert!(u.entries.is_empty()); + assert_eq!(u.entries_size, 0); + assert_eq!(u.snapshot.unwrap(), s); + } + + #[test] + fn test_stable_snapshot_and_entries() { + let ents = vec![new_entry(5, 1), new_entry(5, 2), new_entry(6, 3)]; + let entries_size = ents.iter().map(entry_approximate_size).sum::(); + let mut u = Unstable { + entries: ents.clone(), + entries_size, + offset: 5, + snapshot: Some(new_snapshot(4, 1)), + logger: crate::default_logger(), + }; + assert_eq!(ents, u.entries); + u.stable_snap(4); + u.stable_entries(6, 3); + assert!(u.entries.is_empty()); + assert_eq!(u.entries_size, 0); + assert_eq!(u.offset, 7); + } + + #[test] + fn test_truncate_and_append() { + // entries, offset, snap, to_append, woffset, wentries + let tests = vec![ + // replace to the end + ( + vec![new_entry(5, 1)], + 5, + None, + vec![new_entry(6, 1), new_entry(7, 1)], + 5, + vec![new_entry(5, 1), new_entry(6, 1), new_entry(7, 1)], + ), + // replace to unstable entries + ( + vec![new_entry(5, 1)], + 5, + None, + vec![new_entry(5, 2), new_entry(6, 2)], + 5, + vec![new_entry(5, 2), new_entry(6, 2)], + ), + ( + vec![new_entry(5, 1)], + 5, + None, + vec![new_entry(4, 2), new_entry(5, 2), new_entry(6, 2)], + 4, + vec![new_entry(4, 2), new_entry(5, 2), new_entry(6, 2)], + ), + // truncate existing entries and append + ( + vec![new_entry(5, 1), new_entry(6, 1), new_entry(7, 1)], + 5, + None, + vec![new_entry(6, 2)], + 5, + vec![new_entry(5, 1), new_entry(6, 2)], + ), + ( + vec![new_entry(5, 1), new_entry(6, 1), new_entry(7, 1)], + 5, + None, + vec![new_entry(7, 2), new_entry(8, 2)], + 5, + vec![ + new_entry(5, 1), + new_entry(6, 1), 
+ new_entry(7, 2), + new_entry(8, 2), + ], + ), + ]; + + for (entries, offset, snapshot, to_append, woffset, wentries) in tests { + let entries_size = entries.iter().map(entry_approximate_size).sum::(); + let mut u = Unstable { + entries, + entries_size, + offset, + snapshot, + logger: crate::default_logger(), + }; + u.truncate_and_append(&to_append); + assert_eq!(u.offset, woffset); + assert_eq!(u.entries, wentries); + let entries_size = wentries.iter().map(entry_approximate_size).sum::(); + assert_eq!(u.entries_size, entries_size); + } + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum.rs new file mode 100644 index 0000000..24c6d6a --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum.rs @@ -0,0 +1,74 @@ +// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. +#[cfg(test)] +pub mod datadriven_test; +pub mod joint; +pub mod majority; + +use std::collections::HashMap; +use std::fmt::{self, Debug, Display, Formatter}; + +/// VoteResult indicates the outcome of a vote. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum VoteResult { + /// Pending indicates that the decision of the vote depends on future + /// votes, i.e. neither "yes" or "no" has reached quorum yet. + Pending, + // Lost indicates that the quorum has voted "no". + Lost, + // Won indicates that the quorum has voted "yes". + Won, +} + +impl Display for VoteResult { + #[inline] + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + VoteResult::Won => write!(f, "VoteWon"), + VoteResult::Lost => write!(f, "VoteLost"), + VoteResult::Pending => write!(f, "VotePending"), + } + } +} + +/// Index is a Raft log position. 
+#[derive(Default, Clone, Copy)] +pub struct Index { + pub index: u64, + pub group_id: u64, +} + +impl Display for Index { + #[inline] + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self.group_id { + 0 => match self.index { + u64::MAX => write!(f, "∞"), + index => write!(f, "{}", index), + }, + group_id => match self.index { + u64::MAX => write!(f, "[{}]∞", group_id), + index => write!(f, "[{}]{}", group_id, index), + }, + } + } +} + +impl Debug for Index { + #[inline] + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + Display::fmt(self, f) + } +} + +pub trait AckedIndexer { + fn acked_index(&self, voter_id: u64) -> Option; +} + +pub type AckIndexer = HashMap; + +impl AckedIndexer for AckIndexer { + #[inline] + fn acked_index(&self, voter: u64) -> Option { + self.get(&voter).cloned() + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/datadriven_test.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/datadriven_test.rs new file mode 100644 index 0000000..df18452 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/datadriven_test.rs @@ -0,0 +1,323 @@ +use crate::quorum::{AckIndexer, AckedIndexer, Index}; +use crate::{default_logger, HashMap, HashSet, JointConfig, MajorityConfig}; +use datadriven::{run_test, TestData}; +use std::fmt::Write; + +fn test_quorum(data: &TestData) -> String { + // Two majority configs. The first one is always used (though it may + // be empty) and the second one is used iff joint is true. + let mut joint = false; + let mut ids: Vec = Vec::new(); + let mut idsj: Vec = Vec::new(); + + // The committed indexes for the nodes in the config in the order in + // which they appear in (ids,idsj), without repetition. An underscore + // denotes an omission (i.e. no information for this voter); this is + // different from 0. + // + // For example, + // cfg=(1,2) cfgj=(2,3,4) idxs=(_,5,_,7) initializes the idx for voter 2 + // to 5 and that for voter 4 to 7 (and no others). 
+ // + // cfgj=zero is specified to instruct the test harness to treat cfgj + // as zero instead of not specified (i.e. it will trigger a joint + // quorum test instead of a majority quorum test for cfg only). + let mut idxs: Vec = Vec::new(); + + // group id of each nodes in the config. + let mut gids: Vec = Vec::new(); + + // Votes. These are initialized similar to idxs except the only values + // used are 1 (voted against) and 2 (voted for). This looks awkward, + // but is convenient because it allows sharing code between the two. + let mut votes: Vec = Vec::new(); + + for arg in &data.cmd_args { + for val in &arg.vals { + match arg.key.as_str() { + "cfg" => { + let n: u64 = val.parse().expect("type of n should be u64"); + ids.push(n); + } + "cfgj" => { + joint = true; + + if val == "zero" { + assert_eq!(arg.vals.len(), 1, "cannot mix 'zero' into configuration") + } else { + let n: u64 = val.parse().expect("type of n should be u64"); + idsj.push(n); + } + } + "idx" => { + let mut n: u64 = 0; + if val != "_" { + n = val.parse().expect("type of n should be u64"); + if n == 0 { + panic!("use '_' as 0, check {}", data.pos) + } + } + idxs.push(Index { + index: n, + group_id: 0, + }); + } + "gid" => { + let mut n: u64 = 0; + if val != "_" { + n = val.parse().expect("type of n should be u64"); + if n == 0 { + panic!("use '_' as 0, check {}", data.pos) + } + } + gids.push(n); + } + "votes" => match val.as_str() { + "y" => votes.push(Index { + index: 2, + group_id: 0, + }), + "n" => votes.push(Index { + index: 1, + group_id: 0, + }), + "_" => votes.push(Index { + index: 0, + group_id: 0, + }), + _ => { + panic!("unknown arg: {}", val); + } + }, + _ => { + panic!("unknown arg: {}", arg.key); + } + } + } + } + + let ids_set: HashSet = ids.iter().cloned().collect(); + let idsj_set: HashSet = idsj.iter().cloned().collect(); + + // Build the two majority configs. 
+ let c = MajorityConfig::new(ids_set); + let cj = MajorityConfig::new(idsj_set); + + let make_lookuper = |idxs: &[Index], ids: &[u64], idsj: &[u64]| -> AckIndexer { + let mut l = AckIndexer::default(); + // next to consume from idxs + let mut p: usize = 0; + for id in ids.iter().chain(idsj) { + if !l.contains_key(id) && p < idxs.len() { + l.insert(*id, idxs[p]); + p += 1; + } + } + + // Zero entries are created by _ placeholders; we don't want + // them in the lookuper because "no entry" is different from + // "zero entry". Note that we prevent tests from specifying + // zero commit indexes, so that there's no confusion between + // the two concepts. + l.retain(|_, index| index.index > 0); + l + }; + + // verify length of voters + let mut input = idxs.len(); + + if data.cmd.as_str() == "vote" { + input = votes.len(); + } + + let voters = JointConfig::new_joint_from_majorities(c.clone(), cj.clone()) + .ids() + .len(); + + if voters != input { + return format!( + "error: mismatched input (explicit or _) for voters {:?}: {:?}", + voters, input + ); + } + + // verify length of group ids + if !gids.is_empty() { + if gids.len() != voters { + return format!( + "error: mismatched input (explicit or _) for group ids {:?}: {:?}", + voters, + gids.len() + ); + } else { + // assign group ids to idxs + for (idx, gid) in idxs.iter_mut().zip(gids) { + idx.group_id = gid; + } + } + } + + // buffer for expected value + let mut buf = String::new(); + + match data.cmd.as_str() { + "committed" => { + let use_group_commit = false; + + let mut l = make_lookuper(&idxs, &ids, &idsj); + + let idx; + + // Branch based on whether this is a majority or joint quorum + // test case. + if joint { + let cc = JointConfig::new_joint_from_majorities(c.clone(), cj.clone()); + buf.push_str(&cc.describe(&l)); + idx = cc.committed_index(use_group_commit, &l); + // Interchanging the majorities shouldn't make a difference. If it does, print. 
+ let a_idx = JointConfig::new_joint_from_majorities(cj, c) + .committed_index(use_group_commit, &l); + if a_idx != idx { + writeln!(buf, "{} <-- via symmetry", a_idx.0).unwrap(); + } + } else { + idx = c.committed_index(use_group_commit, &l); + buf.push_str(&c.describe(&l)); + + // Joining a majority with the empty majority should give same result. + let a_idx = + JointConfig::new_joint_from_majorities(c.clone(), MajorityConfig::default()) + .committed_index(use_group_commit, &l); + if a_idx != idx { + writeln!(buf, "{} <-- via zero-joint quorum", a_idx.0).unwrap(); + } + + // Joining a majority with itself should give same result. + let a_idx = JointConfig::new_joint_from_majorities(c.clone(), c.clone()) + .committed_index(use_group_commit, &l); + if a_idx != idx { + writeln!(buf, "{} <-- via self-joint quorum", a_idx.0).unwrap(); + } + + // test overlaying + // If the committed index was definitely above the currently inspected idx, + // the result shouldn't change if we lower it further. 
+ for &id in c.ids() { + if let Some(iidx) = l.acked_index(id) { + if idx.0 > iidx.index { + // try index - 1 + l.insert( + id, + Index { + index: iidx.index - 1, + group_id: iidx.group_id, + }, + ); + + let a_idx = c.committed_index(use_group_commit, &l); + if a_idx != idx { + writeln!( + buf, + "{} <-- overlaying {}->{}", + a_idx.0, + id, + iidx.index - 1 + ) + .unwrap(); + } + // try 0 + l.insert( + id, + Index { + index: 0, + group_id: iidx.group_id, + }, + ); + + let a_idx = c.committed_index(use_group_commit, &l); + if a_idx != idx { + writeln!(buf, "{} <-- overlaying {}->{}", a_idx.0, id, 0).unwrap(); + } + // recovery + l.insert(id, iidx); + } + } + } + } + writeln!( + buf, + "{}", + Index { + index: idx.0, + group_id: 0 + } + ) + .unwrap(); + } + "group_committed" => { + let use_group_commit = true; + + let l = make_lookuper(&idxs, &ids, &idsj); + + let mut idx = (0, false); + + if joint { + let cc = JointConfig::new_joint_from_majorities(c.clone(), cj.clone()); + // `describe` doesn't seem to be useful for group commit. + // buf.push_str(&cc.describe(&l)); + idx = cc.committed_index(use_group_commit, &l); + // Interchanging the majorities shouldn't make a difference. If it does, print. + let a_idx = JointConfig::new_joint_from_majorities(cj, c) + .committed_index(use_group_commit, &l); + if a_idx != idx { + writeln!(buf, "{} <-- via symmetry", a_idx.0).unwrap(); + } + } else { + // TODO: majority group commit + } + writeln!( + buf, + "{}", + Index { + index: idx.0, + group_id: 0 + } + ) + .unwrap(); + } + "vote" => { + let ll = make_lookuper(&votes, &ids, &idsj); + let mut l = HashMap::default(); + for (id, v) in ll { + l.insert(id, v.index != 1); + } + + let r; + if joint { + // Run a joint quorum test case. + r = JointConfig::new_joint_from_majorities(c.clone(), cj.clone()) + .vote_result(|id| l.get(&id).cloned()); + // Interchanging the majorities shouldn't make a difference. If it does, print. 
+ let ar = JointConfig::new_joint_from_majorities(cj, c) + .vote_result(|id| l.get(&id).cloned()); + if ar != r { + writeln!(buf, "{} <-- via symmetry", ar).unwrap(); + } + } else { + r = c.vote_result(|id| l.get(&id).cloned()); + } + writeln!(buf, "{}", r).unwrap(); + } + _ => { + panic!("unknown command: {}", data.cmd); + } + } + buf +} + +#[test] +fn test_data_driven_quorum() -> anyhow::Result<()> { + let logger = default_logger(); + run_test("src/quorum/testdata", test_quorum, false, &logger)?; + Ok(()) +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/joint.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/joint.rs new file mode 100644 index 0000000..679509f --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/joint.rs @@ -0,0 +1,98 @@ +// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. + +use super::{AckedIndexer, VoteResult}; +use crate::util::Union; +use crate::HashSet; +use crate::MajorityConfig; +use std::cmp; + +/// A configuration of two groups of (possibly overlapping) majority configurations. +/// Decisions require the support of both majorities. +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct Configuration { + pub(crate) incoming: MajorityConfig, + pub(crate) outgoing: MajorityConfig, +} + +impl Configuration { + /// Creates a new configuration using the given IDs. + pub fn new(voters: HashSet) -> Configuration { + Configuration { + incoming: MajorityConfig::new(voters), + outgoing: MajorityConfig::default(), + } + } + + #[cfg(test)] + pub(crate) fn new_joint_from_majorities( + incoming: MajorityConfig, + outgoing: MajorityConfig, + ) -> Self { + Self { incoming, outgoing } + } + + /// Creates an empty configuration with given capacity. + pub fn with_capacity(cap: usize) -> Configuration { + Configuration { + incoming: MajorityConfig::with_capacity(cap), + outgoing: MajorityConfig::default(), + } + } + + /// Returns the largest committed index for the given joint quorum. 
An index is + /// jointly committed if it is committed in both constituent majorities. + /// + /// The bool flag indicates whether the index is computed by group commit algorithm + /// successfully. It's true only when both majorities use group commit. + pub fn committed_index(&self, use_group_commit: bool, l: &impl AckedIndexer) -> (u64, bool) { + let (i_idx, i_use_gc) = self.incoming.committed_index(use_group_commit, l); + let (o_idx, o_use_gc) = self.outgoing.committed_index(use_group_commit, l); + (cmp::min(i_idx, o_idx), i_use_gc && o_use_gc) + } + + /// Takes a mapping of voters to yes/no (true/false) votes and returns a result + /// indicating whether the vote is pending, lost, or won. A joint quorum requires + /// both majority quorums to vote in favor. + pub fn vote_result(&self, check: impl Fn(u64) -> Option) -> VoteResult { + let i = self.incoming.vote_result(&check); + let o = self.outgoing.vote_result(check); + match (i, o) { + // It won if won in both. + (VoteResult::Won, VoteResult::Won) => VoteResult::Won, + // It lost if lost in either. + (VoteResult::Lost, _) | (_, VoteResult::Lost) => VoteResult::Lost, + // It remains pending if pending in both or just won in one side. + _ => VoteResult::Pending, + } + } + + /// Clears all IDs. + pub fn clear(&mut self) { + self.incoming.clear(); + self.outgoing.clear(); + } + + /// Returns true if (and only if) there is only one voting member + /// (i.e. the leader) in the current configuration. + pub fn is_singleton(&self) -> bool { + self.outgoing.is_empty() && self.incoming.len() == 1 + } + + /// Returns an iterator over two hash set without cloning. + pub fn ids(&self) -> Union<'_> { + Union::new(&self.incoming, &self.outgoing) + } + + /// Check if an id is a voter. + #[inline] + pub fn contains(&self, id: u64) -> bool { + self.incoming.contains(&id) || self.outgoing.contains(&id) + } + + /// Describe returns a (multi-line) representation of the commit indexes for the + /// given lookuper. 
+ #[cfg(test)] + pub(crate) fn describe(&self, l: &impl AckedIndexer) -> String { + MajorityConfig::new(self.ids().iter().collect()).describe(l) + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/majority.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/majority.rs new file mode 100644 index 0000000..4ea9292 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/majority.rs @@ -0,0 +1,255 @@ +// Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. + +use super::{AckedIndexer, Index, VoteResult}; +use crate::{DefaultHashBuilder, HashSet}; + +use std::collections::hash_set::Iter; +use std::fmt::Formatter; +use std::mem::MaybeUninit; +use std::ops::{Deref, DerefMut}; +use std::{cmp, slice}; + +/// A set of IDs that uses majority quorums to make decisions. +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct Configuration { + voters: HashSet, +} + +impl std::fmt::Display for Configuration { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "({})", + self.voters + .iter() + .map(|x| x.to_string()) + .collect::>() + .join(" ") + ) + } +} + +impl Configuration { + /// Creates a new configuration using the given IDs. + pub fn new(voters: HashSet) -> Configuration { + Configuration { voters } + } + + /// Creates an empty configuration with given capacity. + pub fn with_capacity(cap: usize) -> Configuration { + Configuration { + voters: HashSet::with_capacity_and_hasher(cap, DefaultHashBuilder::default()), + } + } + + /// Returns an iterator over voters. + pub fn ids(&self) -> Iter<'_, u64> { + self.voters.iter() + } + + /// Returns the MajorityConfig as a sorted slice. + pub fn slice(&self) -> Vec { + let mut voters = self.raw_slice(); + voters.sort_unstable(); + voters + } + + /// Returns the MajorityConfig as a slice. 
+ pub fn raw_slice(&self) -> Vec { + self.voters.iter().cloned().collect() + } + + /// Computes the committed index from those supplied via the + /// provided AckedIndexer (for the active config). + /// + /// The bool flag indicates whether the index is computed by group commit algorithm + /// successfully. + /// + /// Eg. If the matched indexes are `[2,2,2,4,5]`, it will return `2`. + /// If the matched indexes and groups are `[(1, 1), (2, 2), (3, 2)]`, it will return `1`. + pub fn committed_index(&self, use_group_commit: bool, l: &impl AckedIndexer) -> (u64, bool) { + if self.voters.is_empty() { + // This plays well with joint quorums which, when one half is the zero + // MajorityConfig, should behave like the other half. + return (u64::MAX, true); + } + + let mut stack_arr: [MaybeUninit; 7] = unsafe { MaybeUninit::uninit().assume_init() }; + let mut heap_arr; + let matched = if self.voters.len() <= 7 { + for (i, v) in self.voters.iter().enumerate() { + stack_arr[i] = MaybeUninit::new(l.acked_index(*v).unwrap_or_default()); + } + unsafe { + slice::from_raw_parts_mut(stack_arr.as_mut_ptr() as *mut _, self.voters.len()) + } + } else { + let mut buf = Vec::with_capacity(self.voters.len()); + for v in &self.voters { + buf.push(l.acked_index(*v).unwrap_or_default()); + } + heap_arr = Some(buf); + heap_arr.as_mut().unwrap().as_mut_slice() + }; + // Reverse sort. 
+ matched.sort_by(|a, b| b.index.cmp(&a.index)); + + let quorum = crate::majority(matched.len()); + let quorum_index = matched[quorum - 1]; + if !use_group_commit { + return (quorum_index.index, false); + } + let (quorum_commit_index, mut checked_group_id) = + (quorum_index.index, quorum_index.group_id); + let mut single_group = true; + for m in matched.iter() { + if m.group_id == 0 { + single_group = false; + continue; + } + if checked_group_id == 0 { + checked_group_id = m.group_id; + continue; + } + if checked_group_id == m.group_id { + continue; + } + return (cmp::min(m.index, quorum_commit_index), true); + } + if single_group { + (quorum_commit_index, false) + } else { + (matched.last().unwrap().index, false) + } + } + + /// Takes a mapping of voters to yes/no (true/false) votes and returns + /// a result indicating whether the vote is pending (i.e. neither a quorum of + /// yes/no has been reached), won (a quorum of yes has been reached), or lost (a + /// quorum of no has been reached). + pub fn vote_result(&self, check: impl Fn(u64) -> Option) -> VoteResult { + if self.voters.is_empty() { + // By convention, the elections on an empty config win. This comes in + // handy with joint quorums because it'll make a half-populated joint + // quorum behave like a majority quorum. + return VoteResult::Won; + } + + let (mut yes, mut missing) = (0, 0); + for v in &self.voters { + match check(*v) { + Some(true) => yes += 1, + None => missing += 1, + _ => (), + } + } + let q = crate::majority(self.voters.len()); + if yes >= q { + VoteResult::Won + } else if yes + missing >= q { + VoteResult::Pending + } else { + VoteResult::Lost + } + } + + /// Describe returns a (multi-line) representation of the commit indexes for the + /// given lookuper. + /// Including `Index`,`Id` and the number of smaller index (represented as the bar) + /// + /// Print `?` if `Index` is not exist. + /// + /// e.g. 
+ /// ```txt + /// idx + /// x> 100 (id=1) + /// xx> 101 (id=2) + /// > 99 (id=3) + /// 100 + /// ``` + #[cfg(test)] + pub(crate) fn describe(&self, l: &impl AckedIndexer) -> String { + use std::fmt::Write; + + let n = self.voters.len(); + if n == 0 { + return "".to_string(); + } + + struct Tup { + id: u64, + idx: Option, + // length of bar displayed for this Tup + bar: usize, + } + + // Below, populate .bar so that the i-th largest commit index has bar i (we + // plot this as sort of a progress bar). The actual code is a bit more + // complicated and also makes sure that equal index => equal bar. + + let mut info = Vec::with_capacity(n); + + for &id in &self.voters { + let idx = l.acked_index(id); + info.push(Tup { id, idx, bar: 0 }) + } + + info.sort_by(|a, b| { + (a.idx.unwrap_or_default().index, a.id).cmp(&(b.idx.unwrap_or_default().index, b.id)) + }); + + for i in 0..n { + if i > 0 + && info[i - 1].idx.unwrap_or_default().index < info[i].idx.unwrap_or_default().index + { + info[i].bar = i; + } + } + + info.sort_by(|a, b| a.id.cmp(&b.id)); + + let mut buf = String::new(); + buf.push_str(" ".repeat(n).as_str()); + buf.push_str(" idx\n"); + + for tup in info { + match tup.idx { + Some(idx) => { + buf.push_str("x".repeat(tup.bar).as_str()); + buf.push('>'); + buf.push_str(" ".repeat(n - tup.bar).as_str()); + writeln!(buf, " {:>5} (id={})", format!("{}", idx), tup.id) + .expect("Error occurred while trying to write in String"); + } + None => { + buf.push('?'); + buf.push_str(" ".repeat(n).as_str()); + writeln!( + buf, + " {:>5} (id={})", + format!("{}", Index::default()), + tup.id + ) + .expect("Error occurred while trying to write in String"); + } + } + } + buf + } +} + +impl Deref for Configuration { + type Target = HashSet; + + #[inline] + fn deref(&self) -> &HashSet { + &self.voters + } +} + +impl DerefMut for Configuration { + #[inline] + fn deref_mut(&mut self) -> &mut HashSet { + &mut self.voters + } +} diff --git 
a/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/testdata/joint_commit.txt b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/testdata/joint_commit.txt new file mode 100644 index 0000000..3d20964 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/testdata/joint_commit.txt @@ -0,0 +1,481 @@ +# No difference between a simple majority quorum and a simple majority quorum +# joint with an empty majority quorum. (This is asserted for all datadriven tests +# by the framework, so we don't dwell on it more). +# +# Note that by specifying cfgj explicitly we tell the test harness to treat the +# input as a joint quorum and not a majority quorum. If we didn't specify +# cfgj=zero the test would pass just the same, but it wouldn't be exercising the +# joint quorum path. +committed cfg=(1,2,3) cfgj=zero idx=(100,101,99) +---- + idx +x> 100 (id=1) +xx> 101 (id=2) +> 99 (id=3) +100 + +# Joint nonoverlapping singleton quorums. + +committed cfg=(1) cfgj=(2) idx=(_,_) +---- + idx +? 0 (id=1) +? 0 (id=2) +0 + +# Voter 1 has 100 committed, 2 nothing. This means we definitely won't commit +# past 100. +committed cfg=(1) cfgj=(2) idx=(100,_) +---- + idx +x> 100 (id=1) +? 0 (id=2) +0 + +# Committed index collapses once both majorities do, to the lower index. +committed cfg=(1) cfgj=(2) idx=(13, 100) +---- + idx +> 13 (id=1) +x> 100 (id=2) +13 + +# Joint overlapping (i.e. identical) singleton quorum. + +committed cfg=(1) cfgj=(1) idx=(_) +---- + idx +? 0 (id=1) +0 + +committed cfg=(1) cfgj=(1) idx=(100) +---- + idx +> 100 (id=1) +100 + + + +# Two-node config joint with non-overlapping single node config +committed cfg=(1,3) cfgj=(2) idx=(_,_,_) +---- + idx +? 0 (id=1) +? 0 (id=2) +? 0 (id=3) +0 + +committed cfg=(1,3) cfgj=(2) idx=(100,_,_) +---- + idx +xx> 100 (id=1) +? 0 (id=2) +? 0 (id=3) +0 + +# 1 has 100 committed, 2 has 50 (collapsing half of the joint quorum to 50). +committed cfg=(1,3) cfgj=(2) idx=(100,_,50) +---- + idx +xx> 100 (id=1) +x> 50 (id=2) +? 
0 (id=3) +0 + +# 2 reports 45, collapsing the other half (to 45). +committed cfg=(1,3) cfgj=(2) idx=(100,45,50) +---- + idx +xx> 100 (id=1) +x> 50 (id=2) +> 45 (id=3) +45 + +# Two-node config with overlapping single-node config. + +committed cfg=(1,2) cfgj=(2) idx=(_,_) +---- + idx +? 0 (id=1) +? 0 (id=2) +0 + +# 1 reports 100. +committed cfg=(1,2) cfgj=(2) idx=(100,_) +---- + idx +x> 100 (id=1) +? 0 (id=2) +0 + +# 2 reports 100. +committed cfg=(1,2) cfgj=(2) idx=(_,100) +---- + idx +? 0 (id=1) +x> 100 (id=2) +0 + +committed cfg=(1,2) cfgj=(2) idx=(50,100) +---- + idx +> 50 (id=1) +x> 100 (id=2) +50 + +committed cfg=(1,2) cfgj=(2) idx=(100,50) +---- + idx +x> 100 (id=1) +> 50 (id=2) +50 + + + +# Joint non-overlapping two-node configs. + +committed cfg=(1,2) cfgj=(3,4) idx=(50,_,_,_) +---- + idx +xxx> 50 (id=1) +? 0 (id=2) +? 0 (id=3) +? 0 (id=4) +0 + +committed cfg=(1,2) cfgj=(3,4) idx=(50,_,49,_) +---- + idx +xxx> 50 (id=1) +? 0 (id=2) +xx> 49 (id=3) +? 0 (id=4) +0 + +committed cfg=(1,2) cfgj=(3,4) idx=(50,48,49,_) +---- + idx +xxx> 50 (id=1) +x> 48 (id=2) +xx> 49 (id=3) +? 0 (id=4) +0 + +committed cfg=(1,2) cfgj=(3,4) idx=(50,48,49,47) +---- + idx +xxx> 50 (id=1) +x> 48 (id=2) +xx> 49 (id=3) +> 47 (id=4) +47 + +# Joint overlapping two-node configs. +committed cfg=(1,2) cfgj=(2,3) idx=(_,_,_) +---- + idx +? 0 (id=1) +? 0 (id=2) +? 0 (id=3) +0 + +committed cfg=(1,2) cfgj=(2,3) idx=(100,_,_) +---- + idx +xx> 100 (id=1) +? 0 (id=2) +? 0 (id=3) +0 + +committed cfg=(1,2) cfgj=(2,3) idx=(_,100,_) +---- + idx +? 0 (id=1) +xx> 100 (id=2) +? 0 (id=3) +0 + +committed cfg=(1,2) cfgj=(2,3) idx=(_,100,99) +---- + idx +? 0 (id=1) +xx> 100 (id=2) +x> 99 (id=3) +0 + +committed cfg=(1,2) cfgj=(2,3) idx=(101,100,99) +---- + idx +xx> 101 (id=1) +x> 100 (id=2) +> 99 (id=3) +99 + +# Joint identical two-node configs. +committed cfg=(1,2) cfgj=(1,2) idx=(_,_) +---- + idx +? 0 (id=1) +? 0 (id=2) +0 + +committed cfg=(1,2) cfgj=(1,2) idx=(_,40) +---- + idx +? 
0 (id=1) +x> 40 (id=2) +0 + +committed cfg=(1,2) cfgj=(1,2) idx=(41,40) +---- + idx +x> 41 (id=1) +> 40 (id=2) +40 + + + +# Joint disjoint three-node configs. + +committed cfg=(1,2,3) cfgj=(4,5,6) idx=(_,_,_,_,_,_) +---- + idx +? 0 (id=1) +? 0 (id=2) +? 0 (id=3) +? 0 (id=4) +? 0 (id=5) +? 0 (id=6) +0 + +committed cfg=(1,2,3) cfgj=(4,5,6) idx=(100,_,_,_,_,_) +---- + idx +xxxxx> 100 (id=1) +? 0 (id=2) +? 0 (id=3) +? 0 (id=4) +? 0 (id=5) +? 0 (id=6) +0 + +committed cfg=(1,2,3) cfgj=(4,5,6) idx=(100,_,_,90,_,_) +---- + idx +xxxxx> 100 (id=1) +? 0 (id=2) +? 0 (id=3) +xxxx> 90 (id=4) +? 0 (id=5) +? 0 (id=6) +0 + +committed cfg=(1,2,3) cfgj=(4,5,6) idx=(100,99,_,_,_,_) +---- + idx +xxxxx> 100 (id=1) +xxxx> 99 (id=2) +? 0 (id=3) +? 0 (id=4) +? 0 (id=5) +? 0 (id=6) +0 + +# First quorum <= 99, second one <= 97. Both quorums guarantee that 90 is +# committed. +committed cfg=(1,2,3) cfgj=(4,5,6) idx=(_,99,90,97,95,_) +---- + idx +? 0 (id=1) +xxxxx> 99 (id=2) +xx> 90 (id=3) +xxxx> 97 (id=4) +xxx> 95 (id=5) +? 0 (id=6) +90 + +# First quorum collapsed to 92. Second one already had at least 95 committed, +# so the result also collapses. +committed cfg=(1,2,3) cfgj=(4,5,6) idx=(92,99,90,97,95,_) +---- + idx +xx> 92 (id=1) +xxxxx> 99 (id=2) +x> 90 (id=3) +xxxx> 97 (id=4) +xxx> 95 (id=5) +? 0 (id=6) +92 + +# Second quorum collapses, but nothing changes in the output. +committed cfg=(1,2,3) cfgj=(4,5,6) idx=(92,99,90,97,95,77) +---- + idx +xx> 92 (id=1) +xxxxx> 99 (id=2) +x> 90 (id=3) +xxxx> 97 (id=4) +xxx> 95 (id=5) +> 77 (id=6) +92 + + +# Joint overlapping three-node configs. + +committed cfg=(1,2,3) cfgj=(1,4,5) idx=(_,_,_,_,_) +---- + idx +? 0 (id=1) +? 0 (id=2) +? 0 (id=3) +? 0 (id=4) +? 0 (id=5) +0 + +committed cfg=(1,2,3) cfgj=(1,4,5) idx=(100,_,_,_,_) +---- + idx +xxxx> 100 (id=1) +? 0 (id=2) +? 0 (id=3) +? 0 (id=4) +? 0 (id=5) +0 + +committed cfg=(1,2,3) cfgj=(1,4,5) idx=(100,101,_,_,_) +---- + idx +xxx> 100 (id=1) +xxxx> 101 (id=2) +? 0 (id=3) +? 0 (id=4) +? 
0 (id=5) +0 + +committed cfg=(1,2,3) cfgj=(1,4,5) idx=(100,101,100,_,_) +---- + idx +xx> 100 (id=1) +xxxx> 101 (id=2) +> 100 (id=3) +? 0 (id=4) +? 0 (id=5) +0 + +# Second quorum could commit either 98 or 99, but first quorum is open. +committed cfg=(1,2,3) cfgj=(1,4,5) idx=(_,100,_,99,98) +---- + idx +? 0 (id=1) +xxxx> 100 (id=2) +? 0 (id=3) +xxx> 99 (id=4) +xx> 98 (id=5) +0 + +# Additionally, first quorum can commit either 100 or 99 +committed cfg=(1,2,3) cfgj=(1,4,5) idx=(_,100,99,99,98) +---- + idx +? 0 (id=1) +xxxx> 100 (id=2) +xx> 99 (id=3) +> 99 (id=4) +x> 98 (id=5) +98 + +committed cfg=(1,2,3) cfgj=(1,4,5) idx=(1,100,99,99,98) +---- + idx +> 1 (id=1) +xxxx> 100 (id=2) +xx> 99 (id=3) +> 99 (id=4) +x> 98 (id=5) +98 + +committed cfg=(1,2,3) cfgj=(1,4,5) idx=(100,100,99,99,98) +---- + idx +xxx> 100 (id=1) +> 100 (id=2) +x> 99 (id=3) +> 99 (id=4) +> 98 (id=5) +99 + + +# More overlap. + +committed cfg=(1,2,3) cfgj=(2,3,4) idx=(_,_,_,_) +---- + idx +? 0 (id=1) +? 0 (id=2) +? 0 (id=3) +? 0 (id=4) +0 + +committed cfg=(1,2,3) cfgj=(2,3,4) idx=(_,100,99,_) +---- + idx +? 0 (id=1) +xxx> 100 (id=2) +xx> 99 (id=3) +? 0 (id=4) +99 + +committed cfg=(1,2,3) cfgj=(2,3,4) idx=(98,100,99,_) +---- + idx +x> 98 (id=1) +xxx> 100 (id=2) +xx> 99 (id=3) +? 0 (id=4) +99 + +committed cfg=(1,2,3) cfgj=(2,3,4) idx=(100,100,99,_) +---- + idx +xx> 100 (id=1) +> 100 (id=2) +x> 99 (id=3) +? 0 (id=4) +99 + +committed cfg=(1,2,3) cfgj=(2,3,4) idx=(100,100,99,98) +---- + idx +xx> 100 (id=1) +> 100 (id=2) +x> 99 (id=3) +> 98 (id=4) +99 + +committed cfg=(1,2,3) cfgj=(2,3,4) idx=(100,_,_,101) +---- + idx +xx> 100 (id=1) +? 0 (id=2) +? 0 (id=3) +xxx> 101 (id=4) +0 + +committed cfg=(1,2,3) cfgj=(2,3,4) idx=(100,99,_,101) +---- + idx +xx> 100 (id=1) +x> 99 (id=2) +? 0 (id=3) +xxx> 101 (id=4) +99 + +# Identical. This is also exercised in the test harness, so it's listed here +# only briefly. +committed cfg=(1,2,3) cfgj=(1,2,3) idx=(50,45,_) +---- + idx +xx> 50 (id=1) +x> 45 (id=2) +? 
0 (id=3) +45 \ No newline at end of file diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/testdata/joint_group_commit.txt b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/testdata/joint_group_commit.txt new file mode 100644 index 0000000..7045958 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/testdata/joint_group_commit.txt @@ -0,0 +1,76 @@ +# the same result of joint single group commit +group_committed cfg=(1,2,3) cfgj=zero idx=(100,101,99) gid=(1,1,1) +---- +100 + +# min(quorum_commit_index = 100, first index that appears in second group = 99) +group_committed cfg=(1,2,3) cfgj=zero idx=(100,101,99) gid=(1,1,2) +---- +99 + +# min(quorum_commit_index = 100, first index that appears in second group = 101) +group_committed cfg=(1,2,3) cfgj=zero idx=(100,101,99) gid=(2,1,1) +---- +100 + +# minimum index = 99 +group_committed cfg=(1,2,3) cfgj=zero idx=(100,101,99) gid=(_,1,1) +---- +99 + +# min(quorum_commit_index = 100, first index that appears in second group = 99) +group_committed cfg=(1,2,3) cfgj=zero idx=(100,101,99) gid=(_,1,2) +---- +99 + +# minimum index = 98 +group_committed cfg=(1,2,3,4,5) cfgj=zero idx=(100,101,99,102,98) gid=(_,_,_,_,1) +---- +98 + +# cfg 1 = min(quorum_commit_index = 100, first index that appears in second group = 99) +# cfg 2 = min(quorum_commit_index = 100, first index that appears in second group = 1) +group_committed cfg=(1,2,3,4) cfgj=(3,4,5,6) idx=(101,99,100,102,103,1) gid=(1,_,1,1,_,2) +---- +1 + +# cfg 1 = min(quorum_commit_index = 100, first index that appears in second group = 101) +# cfg 2 = min(quorum_commit_index = 100, first index that appears in second group = 101) +group_committed cfg=(1,2,3) cfgj=(4,5,6) idx=(99,100,101,99,100,101) gid=(1,1,2,1,2,1) +---- +100 + +# cfg 1 = min(quorum_commit_index = 100, first index that appears in second group = 101) +# cfg 2 = minimum index = 99 +group_committed cfg=(1,2,3) cfgj=(4,5,6) idx=(99,100,101,99,100,101) gid=(1,1,2,1,1,_) +---- +99 + +# 
min(quorum_commit_index = 101, first index that appears in second group = 103) +group_committed cfg=(1,2,3,4,5) cfgj=zero idx=(99,100,101,102,103) gid=(1,1,1,1,2) +---- +101 + +# cfg 1 = minimum index = 1 +# cfg 2 = minimum index = 2 +group_committed cfg=(1,2,3,4,5) cfgj=(2,3,4,5,6) idx=(1,100,101,102,103,2) gid=(1,_,1,1,1,1) +---- +1 + +# cfg 1 = minimum index = 3 +# cfg 2 = quorum_commit_index = 101 +group_committed cfg=(1,2,3,4,5) cfgj=(2,3,4,5,6) idx=(3,100,101,102,103,2) gid=(_,1,1,1,1,1) +---- +3 + +# cfg 1 = min(quorum_commit_index = 101, first index that appears in second group = 103) +# cfg 2 = min(quorum_commit_index = 101, first index that appears in second group = 103) +group_committed cfg=(1,2,3,4,5) cfgj=(2,3,4,5,6) idx=(3,100,101,102,103,2) gid=(_,1,1,1,3,1) +---- +101 + +# cfg 1 = minimum index = 3 +# cfg 2 = min(quorum_commit_index = 101, first index that appears in second group = 2) +group_committed cfg=(1,2,3,4,5) cfgj=(2,3,4,5,6) idx=(3,100,101,102,103,2) gid=(_,1,1,1,1,3) +---- +2 diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/testdata/joint_vote.txt b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/testdata/joint_vote.txt new file mode 100644 index 0000000..5324917 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/testdata/joint_vote.txt @@ -0,0 +1,165 @@ +# Empty joint config wins all votes. This isn't used in production. Note that +# by specifying cfgj explicitly we tell the test harness to treat the input as +# a joint quorum and not a majority quorum. +vote cfgj=zero +---- +VoteWon + +# More examples with close to trivial configs. 
+ +vote cfg=(1) cfgj=zero votes=(_) +---- +VotePending + +vote cfg=(1) cfgj=zero votes=(y) +---- +VoteWon + +vote cfg=(1) cfgj=zero votes=(n) +---- +VoteLost + +vote cfg=(1) cfgj=(1) votes=(_) +---- +VotePending + +vote cfg=(1) cfgj=(1) votes=(y) +---- +VoteWon + +vote cfg=(1) cfgj=(1) votes=(n) +---- +VoteLost + +vote cfg=(1) cfgj=(2) votes=(_,_) +---- +VotePending + +vote cfg=(1) cfgj=(2) votes=(y,_) +---- +VotePending + +vote cfg=(1) cfgj=(2) votes=(y,y) +---- +VoteWon + +vote cfg=(1) cfgj=(2) votes=(y,n) +---- +VoteLost + +vote cfg=(1) cfgj=(2) votes=(n,_) +---- +VoteLost + +vote cfg=(1) cfgj=(2) votes=(n,n) +---- +VoteLost + +vote cfg=(1) cfgj=(2) votes=(n,y) +---- +VoteLost + +# Two node configs. + +vote cfg=(1,2) cfgj=(3,4) votes=(_,_,_,_) +---- +VotePending + +vote cfg=(1,2) cfgj=(3,4) votes=(y,_,_,_) +---- +VotePending + +vote cfg=(1,2) cfgj=(3,4) votes=(y,y,_,_) +---- +VotePending + +vote cfg=(1,2) cfgj=(3,4) votes=(y,y,n,_) +---- +VoteLost + +vote cfg=(1,2) cfgj=(3,4) votes=(y,y,n,n) +---- +VoteLost + +vote cfg=(1,2) cfgj=(3,4) votes=(y,y,y,n) +---- +VoteLost + +vote cfg=(1,2) cfgj=(3,4) votes=(y,y,y,y) +---- +VoteWon + +vote cfg=(1,2) cfgj=(2,3) votes=(_,_,_) +---- +VotePending + +vote cfg=(1,2) cfgj=(2,3) votes=(_,n,_) +---- +VoteLost + +vote cfg=(1,2) cfgj=(2,3) votes=(y,y,_) +---- +VotePending + +vote cfg=(1,2) cfgj=(2,3) votes=(y,y,n) +---- +VoteLost + +vote cfg=(1,2) cfgj=(2,3) votes=(y,y,y) +---- +VoteWon + +vote cfg=(1,2) cfgj=(1,2) votes=(_,_) +---- +VotePending + +vote cfg=(1,2) cfgj=(1,2) votes=(y,_) +---- +VotePending + +vote cfg=(1,2) cfgj=(1,2) votes=(y,n) +---- +VoteLost + +vote cfg=(1,2) cfgj=(1,2) votes=(n,_) +---- +VoteLost + +vote cfg=(1,2) cfgj=(1,2) votes=(n,n) +---- +VoteLost + + +# Simple example for overlapping three node configs. 
+ +vote cfg=(1,2,3) cfgj=(2,3,4) votes=(_,_,_,_) +---- +VotePending + +vote cfg=(1,2,3) cfgj=(2,3,4) votes=(_,n,_,_) +---- +VotePending + +vote cfg=(1,2,3) cfgj=(2,3,4) votes=(_,n,n,_) +---- +VoteLost + +vote cfg=(1,2,3) cfgj=(2,3,4) votes=(_,y,y,_) +---- +VoteWon + +vote cfg=(1,2,3) cfgj=(2,3,4) votes=(y,y,_,_) +---- +VotePending + +vote cfg=(1,2,3) cfgj=(2,3,4) votes=(y,y,n,_) +---- +VotePending + +vote cfg=(1,2,3) cfgj=(2,3,4) votes=(y,y,n,n) +---- +VoteLost + +vote cfg=(1,2,3) cfgj=(2,3,4) votes=(y,y,n,y) +---- +VoteWon \ No newline at end of file diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/testdata/majority_commit.txt b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/testdata/majority_commit.txt new file mode 100644 index 0000000..0e8e7c6 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/testdata/majority_commit.txt @@ -0,0 +1,153 @@ +# The empty quorum commits "everything". This is useful for its use in joint +# quorums. +committed +---- +∞ + + + +# A single voter quorum is not final when no index is known. +committed cfg=(1) idx=(_) +---- + idx +? 0 (id=1) +0 + +# When an index is known, that's the committed index, and that's final. +committed cfg=(1) idx=(12) +---- + idx +> 12 (id=1) +12 + + + + +# With two nodes, start out similarly. +committed cfg=(1, 2) idx=(_,_) +---- + idx +? 0 (id=1) +? 0 (id=2) +0 + +# The first committed index becomes known (for n1). Nothing changes in the +# output because idx=12 is not known to be on a quorum (which is both nodes). +committed cfg=(1, 2) idx=(12,_) +---- + idx +x> 12 (id=1) +? 0 (id=2) +0 + +# The second index comes in and finalize the decision. The result will be the +# smaller of the two indexes. +committed cfg=(1,2) idx=(12,5) +---- + idx +x> 12 (id=1) +> 5 (id=2) +5 + + + + +# No surprises for three nodes. +committed cfg=(1,2,3) idx=(_,_,_) +---- + idx +? 0 (id=1) +? 0 (id=2) +? 0 (id=3) +0 + +committed cfg=(1,2,3) idx=(12,_,_) +---- + idx +xx> 12 (id=1) +? 0 (id=2) +? 
0 (id=3) +0 + +# We see a committed index, but a higher committed index for the last pending +# votes could change (increment) the outcome, so not final yet. +committed cfg=(1,2,3) idx=(12,5,_) +---- + idx +xx> 12 (id=1) +x> 5 (id=2) +? 0 (id=3) +5 + +# a) the case in which it does: +committed cfg=(1,2,3) idx=(12,5,6) +---- + idx +xx> 12 (id=1) +> 5 (id=2) +x> 6 (id=3) +6 + +# b) the case in which it does not: +committed cfg=(1,2,3) idx=(12,5,4) +---- + idx +xx> 12 (id=1) +x> 5 (id=2) +> 4 (id=3) +5 + +# c) a different case in which the last index is pending but it has no chance of +# swaying the outcome (because nobody in the current quorum agrees on anything +# higher than the candidate): +committed cfg=(1,2,3) idx=(5,5,_) +---- + idx +x> 5 (id=1) +> 5 (id=2) +? 0 (id=3) +5 + +# c) continued: Doesn't matter what shows up last. The result is final. +committed cfg=(1,2,3) idx=(5,5,12) +---- + idx +> 5 (id=1) +> 5 (id=2) +xx> 12 (id=3) +5 + +# With all committed idx known, the result is final. +committed cfg=(1, 2, 3) idx=(100, 101, 103) +---- + idx +> 100 (id=1) +x> 101 (id=2) +xx> 103 (id=3) +101 + + + +# Some more complicated examples. Similar to case c) above. The result is +# already final because no index higher than 103 is one short of quorum. +committed cfg=(1, 2, 3, 4, 5) idx=(101, 104, 103, 103,_) +---- + idx +x> 101 (id=1) +xxxx> 104 (id=2) +xx> 103 (id=3) +> 103 (id=4) +? 0 (id=5) +103 + +# A similar case which is not final because another vote for >= 103 would change +# the outcome. +committed cfg=(1, 2, 3, 4, 5) idx=(101, 102, 103, 103,_) +---- + idx +x> 101 (id=1) +xx> 102 (id=2) +xxx> 103 (id=3) +> 103 (id=4) +? 
0 (id=5) +102 \ No newline at end of file diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/testdata/majority_vote.txt b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/testdata/majority_vote.txt new file mode 100644 index 0000000..b338a0a --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/quorum/testdata/majority_vote.txt @@ -0,0 +1,97 @@ +# The empty config always announces a won vote. +vote +---- +VoteWon + +vote cfg=(1) votes=(_) +---- +VotePending + +vote cfg=(1) votes=(n) +---- +VoteLost + +vote cfg=(123) votes=(y) +---- +VoteWon + + + + +vote cfg=(4,8) votes=(_,_) +---- +VotePending + +# With two voters, a single rejection loses the vote. +vote cfg=(4,8) votes=(n,_) +---- +VoteLost + +vote cfg=(4,8) votes=(y,_) +---- +VotePending + +vote cfg=(4,8) votes=(n,y) +---- +VoteLost + +vote cfg=(4,8) votes=(y,y) +---- +VoteWon + + + +vote cfg=(2,4,7) votes=(_,_,_) +---- +VotePending + +vote cfg=(2,4,7) votes=(n,_,_) +---- +VotePending + +vote cfg=(2,4,7) votes=(y,_,_) +---- +VotePending + +vote cfg=(2,4,7) votes=(n,n,_) +---- +VoteLost + +vote cfg=(2,4,7) votes=(y,n,_) +---- +VotePending + +vote cfg=(2,4,7) votes=(y,y,_) +---- +VoteWon + +vote cfg=(2,4,7) votes=(y,y,n) +---- +VoteWon + +vote cfg=(2,4,7) votes=(n,y,n) +---- +VoteLost + + + +# Test some random example with seven nodes (why not). +vote cfg=(1,2,3,4,5,6,7) votes=(y,y,n,y,_,_,_) +---- +VotePending + +vote cfg=(1,2,3,4,5,6,7) votes=(_,y,y,_,n,y,n) +---- +VotePending + +vote cfg=(1,2,3,4,5,6,7) votes=(y,y,n,y,_,n,y) +---- +VoteWon + +vote cfg=(1,2,3,4,5,6,7) votes=(y,y,_,n,y,n,n) +---- +VotePending + +vote cfg=(1,2,3,4,5,6,7) votes=(y,y,n,y,n,n,n) +---- +VoteLost \ No newline at end of file diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/raft.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/raft.rs new file mode 100644 index 0000000..1b228d4 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/raft.rs @@ -0,0 +1,2957 @@ +// Copyright 2019 TiKV Project Authors. 
Licensed under Apache-2.0. + +// Copyright 2015 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::cmp; +use std::ops::{Deref, DerefMut}; + +use crate::eraftpb::{ + ConfChange, ConfChangeV2, ConfState, Entry, EntryType, HardState, Message, MessageType, + Snapshot, +}; +use protobuf::Message as _; +use raft_proto::ConfChangeI; +use rand::Rng; +use slog::Logger; + +#[cfg(feature = "failpoints")] +use fail::fail_point; +use getset::Getters; +use slog::{debug, error, info, o, trace, warn}; + +use super::errors::{Error, Result, StorageError}; +use super::raft_log::RaftLog; +use super::read_only::{ReadOnly, ReadOnlyOption, ReadState}; +use super::storage::{GetEntriesContext, GetEntriesFor, Storage}; +use super::Config; +use crate::confchange::Changer; +use crate::quorum::VoteResult; +use crate::util; +use crate::util::NO_LIMIT; +use crate::{confchange, Progress, ProgressState, ProgressTracker}; + +// CAMPAIGN_PRE_ELECTION represents the first phase of a normal election when +// Config.pre_vote is true. +#[doc(hidden)] +pub const CAMPAIGN_PRE_ELECTION: &[u8] = b"CampaignPreElection"; +#[doc(hidden)] +// CAMPAIGN_ELECTION represents a normal (time-based) election (the second phase +// of the election when Config.pre_vote is true). +#[doc(hidden)] +pub const CAMPAIGN_ELECTION: &[u8] = b"CampaignElection"; +#[doc(hidden)] +// CAMPAIGN_TRANSFER represents the type of leader transfer. 
+#[doc(hidden)] +pub const CAMPAIGN_TRANSFER: &[u8] = b"CampaignTransfer"; + +/// The role of the node. +#[derive(Debug, PartialEq, Eq, Clone, Copy, Default)] +pub enum StateRole { + /// The node is a follower of the leader. + #[default] + Follower, + /// The node could become a leader. + Candidate, + /// The node is a leader. + Leader, + /// The node could become a candidate, if `prevote` is enabled. + PreCandidate, +} + +/// A constant represents invalid id of raft. +pub const INVALID_ID: u64 = 0; +/// A constant represents invalid index of raft log. +pub const INVALID_INDEX: u64 = 0; + +/// SoftState provides state that is useful for logging and debugging. +/// The state is volatile and does not need to be persisted to the WAL. +#[derive(Default, PartialEq, Eq, Debug)] +pub struct SoftState { + /// The potential leader of the cluster. + pub leader_id: u64, + /// The soft role this node may take. + pub raft_state: StateRole, +} + +/// UncommittedState is used to keep track of information of uncommitted +/// log entries on 'leader' node +struct UncommittedState { + /// Specify maximum of uncommitted entry size. + /// When this limit is reached, all proposals to append new log will be dropped + max_uncommitted_size: usize, + + /// Record current uncommitted entries size. + uncommitted_size: usize, + + /// Record index of last log entry when node becomes leader from candidate. + /// See https://github.com/tikv/raft-rs/pull/398#discussion_r502417531 for more detail + last_log_tail_index: u64, +} + +impl UncommittedState { + #[inline] + pub fn is_no_limit(&self) -> bool { + self.max_uncommitted_size == NO_LIMIT as usize + } + + pub fn maybe_increase_uncommitted_size(&mut self, ents: &[Entry]) -> bool { + // fast path + if self.is_no_limit() { + return true; + } + + let size: usize = ents.iter().map(|ent| ent.get_data().len()).sum(); + + // 1. we should never drop an entry without any data(eg. leader election) + // 2. 
we should allow at least one uncommitted entry + // 3. add these entries will not cause size overlimit + if size == 0 + || self.uncommitted_size == 0 + || size + self.uncommitted_size <= self.max_uncommitted_size + { + self.uncommitted_size += size; + true + } else { + false + } + } + + pub fn maybe_reduce_uncommitted_size(&mut self, ents: &[Entry]) -> bool { + // fast path + if self.is_no_limit() || ents.is_empty() { + return true; + } + + // user may advance a 'Ready' which is generated before this node becomes leader + let size: usize = ents + .iter() + .skip_while(|ent| ent.index <= self.last_log_tail_index) + .map(|ent| ent.get_data().len()) + .sum(); + + if size > self.uncommitted_size { + self.uncommitted_size = 0; + false + } else { + self.uncommitted_size -= size; + true + } + } +} + +/// The core struct of raft consensus. +/// +/// It's a helper struct to get around rust borrow checks. +#[derive(Getters)] +pub struct RaftCore { + /// The current election term. + pub term: u64, + + /// Which peer this raft is voting for. + pub vote: u64, + + /// The ID of this node. + pub id: u64, + + /// The current read states. + pub read_states: Vec, + + /// The persistent log. + pub raft_log: RaftLog, + + /// The maximum number of messages that can be inflight. + pub max_inflight: usize, + + /// The maximum length (in bytes) of all the entries. + pub max_msg_size: u64, + + /// The peer is requesting snapshot, it is the index that the follower + /// needs it to be included in a snapshot. + pub pending_request_snapshot: u64, + + /// The current role of this node. + pub state: StateRole, + + /// Indicates whether state machine can be promoted to leader, + /// which is true when it's a voter and its own id is in progress list. + promotable: bool, + + /// The leader id. + pub leader_id: u64, + + /// ID of the leader transfer target when its value is not None. + /// + /// If this is Some(id), we follow the procedure defined in raft thesis 3.10. 
+ pub lead_transferee: Option, + + /// Only one conf change may be pending (in the log, but not yet + /// applied) at a time. This is enforced via `pending_conf_index`, which + /// is set to a value >= the log index of the latest pending + /// configuration change (if any). Config changes are only allowed to + /// be proposed if the leader's applied index is greater than this + /// value. + /// + /// This value is conservatively set in cases where there may be a configuration change pending, + /// but scanning the log is possibly expensive. This implies that the index stated here may not + /// necessarily be a config change entry, and it may not be a `BeginMembershipChange` entry, even if + /// we set this to one. + pub pending_conf_index: u64, + + /// The queue of read-only requests. + pub read_only: ReadOnly, + + /// Ticks since it reached last electionTimeout when it is leader or candidate. + /// Number of ticks since it reached last electionTimeout or received a + /// valid message from current leader when it is a follower. + pub election_elapsed: usize, + + /// Number of ticks since it reached last heartbeatTimeout. + /// only leader keeps heartbeatElapsed. + heartbeat_elapsed: usize, + + /// Whether to check the quorum + pub check_quorum: bool, + + /// Enable the prevote algorithm. + /// + /// This enables a pre-election vote round on Candidates prior to disrupting the cluster. + /// + /// Enable this if greater cluster stability is preferred over faster elections. + pub pre_vote: bool, + + skip_bcast_commit: bool, + batch_append: bool, + disable_proposal_forwarding: bool, + + heartbeat_timeout: usize, + election_timeout: usize, + + // randomized_election_timeout is a random number between + // [min_election_timeout, max_election_timeout - 1]. It gets reset + // when raft changes its state to follower or candidate. + randomized_election_timeout: usize, + min_election_timeout: usize, + max_election_timeout: usize, + + /// The logger for the raft structure. 
+ pub(crate) logger: slog::Logger, + + /// The election priority of this node. + pub priority: i64, + + /// Track uncommitted log entry on this node. + uncommitted_state: UncommittedState, + + /// Max size per committed entries in a `Read`. + pub(crate) max_committed_size_per_ready: u64, +} + +/// A struct that represents the raft consensus itself. Stores details concerning the current +/// and possible state the system can take. +pub struct Raft { + prs: ProgressTracker, + + /// The list of messages. + pub msgs: Vec, + /// Internal raftCore. + pub r: RaftCore, +} + +impl Deref for Raft { + type Target = RaftCore; + + #[inline] + fn deref(&self) -> &RaftCore { + &self.r + } +} + +impl DerefMut for Raft { + #[inline] + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.r + } +} + +#[allow(dead_code)] // ensure Raft is always Send +trait AssertSend: Send {} +impl AssertSend for Raft {} + +fn new_message(to: u64, field_type: MessageType, from: Option) -> Message { + let mut m = Message::default(); + m.to = to; + if let Some(id) = from { + m.from = id; + } + m.set_msg_type(field_type); + m +} + +fn get_priority(m: &Message) -> i64 { + if m.priority != 0 { + m.priority + } else { + i64::try_from(m.deprecated_priority).unwrap_or(i64::MAX) + } +} + +/// Maps vote and pre_vote message types to their correspond responses. +pub fn vote_resp_msg_type(t: MessageType) -> MessageType { + match t { + MessageType::MsgRequestVote => MessageType::MsgRequestVoteResponse, + MessageType::MsgRequestPreVote => MessageType::MsgRequestPreVoteResponse, + _ => panic!("Not a vote message: {:?}", t), + } +} + +impl Raft { + /// Creates a new raft for use on the node. 
+ #[allow(clippy::new_ret_no_self)] + pub fn new(c: &Config, store: T, logger: &Logger) -> Result { + c.validate()?; + let logger = logger.new(o!("raft_id" => c.id)); + let raft_state = store.initial_state()?; + let conf_state = &raft_state.conf_state; + let voters = &conf_state.voters; + let learners = &conf_state.learners; + + let mut r = Raft { + prs: ProgressTracker::with_capacity(voters.len(), learners.len(), c.max_inflight_msgs), + msgs: Default::default(), + r: RaftCore { + id: c.id, + read_states: Default::default(), + raft_log: RaftLog::new(store, logger.clone(), c), + max_inflight: c.max_inflight_msgs, + max_msg_size: c.max_size_per_msg, + pending_request_snapshot: INVALID_INDEX, + state: StateRole::Follower, + promotable: false, + check_quorum: c.check_quorum, + pre_vote: c.pre_vote, + read_only: ReadOnly::new(c.read_only_option), + heartbeat_timeout: c.heartbeat_tick, + election_timeout: c.election_tick, + leader_id: Default::default(), + lead_transferee: None, + term: Default::default(), + election_elapsed: Default::default(), + pending_conf_index: Default::default(), + vote: Default::default(), + heartbeat_elapsed: Default::default(), + randomized_election_timeout: Default::default(), + min_election_timeout: c.min_election_tick(), + max_election_timeout: c.max_election_tick(), + skip_bcast_commit: c.skip_bcast_commit, + batch_append: c.batch_append, + logger, + priority: c.priority, + uncommitted_state: UncommittedState { + max_uncommitted_size: c.max_uncommitted_size as usize, + uncommitted_size: 0, + last_log_tail_index: 0, + }, + max_committed_size_per_ready: c.max_committed_size_per_ready, + disable_proposal_forwarding: c.disable_proposal_forwarding, + }, + }; + confchange::restore(&mut r.prs, r.r.raft_log.last_index(), conf_state)?; + let new_cs = r.post_conf_change(); + if !raft_proto::conf_state_eq(&new_cs, conf_state) { + fatal!( + r.logger, + "invalid restore: {:?} != {:?}", + conf_state, + new_cs + ); + } + + if raft_state.hard_state != 
HardState::default() { + r.load_state(&raft_state.hard_state); + } + if c.applied > 0 { + // at initialize, it is possible that applied_index > committed_index, + // so we should skip the check at `commit_apply`. + r.commit_apply_internal(c.applied, true); + } + r.become_follower(r.term, INVALID_ID); + + info!( + r.logger, + "newRaft"; + "term" => r.term, + "commit" => r.raft_log.committed, + "applied" => r.raft_log.applied, + "last index" => r.raft_log.last_index(), + "last term" => r.raft_log.last_term(), + "peers" => ?r.prs.conf().voters, + ); + Ok(r) + } + + /// Sets priority of node. + pub fn set_priority(&mut self, priority: i64) { + self.priority = priority; + } + + /// Creates a new raft for use on the node with the default logger. + /// + /// The default logger is an `slog` to `log` adapter. + #[allow(clippy::new_ret_no_self)] + #[cfg(feature = "default-logger")] + pub fn with_default_logger(c: &Config, store: T) -> Result { + Self::new(c, store, &crate::default_logger()) + } + + /// Grabs an immutable reference to the store. + #[inline] + pub fn store(&self) -> &T { + &self.raft_log.store + } + + /// Grabs a mutable reference to the store. + #[inline] + pub fn mut_store(&mut self) -> &mut T { + &mut self.raft_log.store + } + + /// Grabs a reference to the snapshot + #[inline] + pub fn snap(&self) -> Option<&Snapshot> { + self.raft_log.unstable.snapshot.as_ref() + } + + /// Returns the number of pending read-only messages. + #[inline] + pub fn pending_read_count(&self) -> usize { + self.read_only.pending_read_count() + } + + /// Returns how many read states exist. + #[inline] + pub fn ready_read_count(&self) -> usize { + self.read_states.len() + } + + /// Returns a value representing the softstate at the time of calling. + pub fn soft_state(&self) -> SoftState { + SoftState { + leader_id: self.leader_id, + raft_state: self.state, + } + } + + /// Returns a value representing the hardstate at the time of calling. 
+ pub fn hard_state(&self) -> HardState { + let mut hs = HardState::default(); + hs.term = self.term; + hs.vote = self.vote; + hs.commit = self.raft_log.committed; + hs + } + + /// Returns whether the current raft is in lease. + pub fn in_lease(&self) -> bool { + self.state == StateRole::Leader && self.check_quorum + } + + /// For testing leader lease + #[doc(hidden)] + pub fn set_randomized_election_timeout(&mut self, t: usize) { + assert!(self.min_election_timeout <= t && t < self.max_election_timeout); + self.randomized_election_timeout = t; + } + + /// Fetch the length of the election timeout. + pub fn election_timeout(&self) -> usize { + self.election_timeout + } + + /// Fetch the length of the heartbeat timeout + pub fn heartbeat_timeout(&self) -> usize { + self.heartbeat_timeout + } + + /// Fetch the number of ticks elapsed since last heartbeat. + pub fn heartbeat_elapsed(&self) -> usize { + self.heartbeat_elapsed + } + + /// Return the length of the current randomized election timeout. + pub fn randomized_election_timeout(&self) -> usize { + self.randomized_election_timeout + } + + /// Set whether skip broadcast empty commit messages at runtime. + #[inline] + pub fn skip_bcast_commit(&mut self, skip: bool) { + self.skip_bcast_commit = skip; + } + + /// Set whether batch append msg at runtime. + #[inline] + pub fn set_batch_append(&mut self, batch_append: bool) { + self.batch_append = batch_append; + } + + /// Configures group commit. + /// + /// If group commit is enabled, only logs replicated to at least two + /// different groups are committed. + /// + /// You should use `assign_commit_groups` to configure peer groups. + pub fn enable_group_commit(&mut self, enable: bool) { + self.mut_prs().enable_group_commit(enable); + if StateRole::Leader == self.state && !enable && self.maybe_commit() { + self.bcast_append(); + } + } + + /// Whether enable group commit. 
+ pub fn group_commit(&self) -> bool { + self.prs().group_commit() + } + + /// Assigns groups to peers. + /// + /// The tuple is (`peer_id`, `group_id`). `group_id` should be larger than 0. + /// + /// The group information is only stored in memory. So you need to configure + /// it every time a raft state machine is initialized or a snapshot is applied. + pub fn assign_commit_groups(&mut self, ids: &[(u64, u64)]) { + let prs = self.mut_prs(); + for (peer_id, group_id) in ids { + assert!(*group_id > 0); + if let Some(pr) = prs.get_mut(*peer_id) { + pr.commit_group_id = *group_id; + } else { + continue; + } + } + if StateRole::Leader == self.state && self.group_commit() && self.maybe_commit() { + self.bcast_append(); + } + } + + /// Removes all commit group configurations. + pub fn clear_commit_group(&mut self) { + for (_, pr) in self.mut_prs().iter_mut() { + pr.commit_group_id = 0; + } + } + + /// Checks whether the raft group is using group commit and consistent + /// over group. + /// + /// If it can't get a correct answer, `None` is returned. + pub fn check_group_commit_consistent(&mut self) -> Option { + if self.state != StateRole::Leader { + return None; + } + // Previous leader may have reach consistency already. + // + // check applied_index instead of committed_index to avoid pending conf change. + if !self.apply_to_current_term() { + return None; + } + let (index, use_group_commit) = self.mut_prs().maximal_committed_index(); + debug!( + self.logger, + "check group commit consistent"; + "index" => index, + "use_group_commit" => use_group_commit, + "committed" => self.raft_log.committed + ); + Some(use_group_commit && index == self.raft_log.committed) + } + + /// Checks if logs are committed to its term. + /// + /// The check is useful usually when raft is leader. + pub fn commit_to_current_term(&self) -> bool { + self.raft_log + .term(self.raft_log.committed) + .is_ok_and(|t| t == self.term) + } + + /// Checks if logs are applied to current term. 
+ pub fn apply_to_current_term(&self) -> bool { + self.raft_log + .term(self.raft_log.applied) + .is_ok_and(|t| t == self.term) + } + + /// Set `max_committed_size_per_ready` to `size`. + pub fn set_max_committed_size_per_ready(&mut self, size: u64) { + self.max_committed_size_per_ready = size; + } + + /// Set whether or not `check_quorum`. + pub fn set_check_quorum(&mut self, check_quorum: bool) { + self.check_quorum = check_quorum; + } + + /// Set the maximum limit that applied index can be ahead of persisted index. + pub fn set_max_apply_unpersisted_log_limit(&mut self, limit: u64) { + self.raft_log.max_apply_unpersisted_log_limit = limit; + } +} + +impl RaftCore { + // send persists state to stable storage and then sends to its mailbox. + fn send(&mut self, mut m: Message, msgs: &mut Vec) { + debug!( + self.logger, + "Sending from {from} to {to}", + from = self.id, + to = m.to; + "msg" => ?m, + ); + if m.from == INVALID_ID { + m.from = self.id; + } + if m.get_msg_type() == MessageType::MsgRequestVote + || m.get_msg_type() == MessageType::MsgRequestPreVote + || m.get_msg_type() == MessageType::MsgRequestVoteResponse + || m.get_msg_type() == MessageType::MsgRequestPreVoteResponse + { + if m.term == 0 { + // All {pre-,}campaign messages need to have the term set when + // sending. + // - MsgVote: m.Term is the term the node is campaigning for, + // non-zero as we increment the term when campaigning. 
+ // - MsgVoteResp: m.Term is the new r.Term if the MsgVote was + // granted, non-zero for the same reason MsgVote is + // - MsgPreVote: m.Term is the term the node will campaign, + // non-zero as we use m.Term to indicate the next term we'll be + // campaigning for + // - MsgPreVoteResp: m.Term is the term received in the original + // MsgPreVote if the pre-vote was granted, non-zero for the + // same reasons MsgPreVote is + fatal!( + self.logger, + "term should be set when sending {:?}", + m.get_msg_type() + ); + } + } else { + if m.term != 0 { + fatal!( + self.logger, + "term should not be set when sending {:?} (was {})", + m.get_msg_type(), + m.term + ); + } + // do not attach term to MsgPropose, MsgReadIndex + // proposals are a way to forward to the leader and + // should be treated as local message. + // MsgReadIndex is also forwarded to leader. + if m.get_msg_type() != MessageType::MsgPropose + && m.get_msg_type() != MessageType::MsgReadIndex + { + m.term = self.term; + } + } + if m.get_msg_type() == MessageType::MsgRequestVote + || m.get_msg_type() == MessageType::MsgRequestPreVote + { + if self.priority > 0 { + m.deprecated_priority = self.priority as u64; + } + m.priority = self.priority; + } + msgs.push(m); + } + + fn prepare_send_snapshot(&mut self, m: &mut Message, pr: &mut Progress, to: u64) -> bool { + if !pr.recent_active { + debug!( + self.logger, + "ignore sending snapshot to {} since it is not recently active", + to; + ); + return false; + } + + m.set_msg_type(MessageType::MsgSnapshot); + let snapshot_r = self.raft_log.snapshot(pr.pending_request_snapshot, to); + if let Err(e) = snapshot_r { + if e == Error::Store(StorageError::SnapshotTemporarilyUnavailable) { + debug!( + self.logger, + "failed to send snapshot to {} because snapshot is temporarily \ + unavailable", + to; + ); + return false; + } + fatal!(self.logger, "unexpected error: {:?}", e); + } + let snapshot = snapshot_r.unwrap(); + if snapshot.get_metadata().index == 0 { + 
fatal!(self.logger, "need non-empty snapshot"); + } + let (sindex, sterm) = (snapshot.get_metadata().index, snapshot.get_metadata().term); + m.set_snapshot(snapshot); + debug!( + self.logger, + "[firstindex: {first_index}, commit: {committed}] sent snapshot[index: {snapshot_index}, term: {snapshot_term}] to {to}", + first_index = self.raft_log.first_index(), + committed = self.raft_log.committed, + snapshot_index = sindex, + snapshot_term = sterm, + to = to; + "progress" => ?pr, + ); + pr.become_snapshot(sindex); + debug!( + self.logger, + "paused sending replication messages to {}", + to; + "progress" => ?pr, + ); + true + } + + fn prepare_send_entries( + &mut self, + m: &mut Message, + pr: &mut Progress, + term: u64, + ents: Vec, + ) { + m.set_msg_type(MessageType::MsgAppend); + m.index = pr.next_idx - 1; + m.log_term = term; + m.set_entries(ents.into()); + m.commit = self.raft_log.committed; + if !m.entries.is_empty() { + let last = m.entries.last().unwrap().index; + pr.update_state(last); + } + } + + fn try_batching( + &mut self, + to: u64, + msgs: &mut [Message], + pr: &mut Progress, + ents: &mut Vec, + ) -> bool { + // if MsgAppend for the receiver already exists, try_batching + // will append the entries to the existing MsgAppend + let mut is_batched = false; + for msg in msgs { + if msg.get_msg_type() == MessageType::MsgAppend && msg.to == to { + if !ents.is_empty() { + if !util::is_continuous_ents(msg, ents) { + return is_batched; + } + let mut batched_entries: Vec<_> = msg.take_entries().into(); + batched_entries.append(ents); + msg.set_entries(batched_entries.into()); + let last_idx = msg.entries.last().unwrap().index; + pr.update_state(last_idx); + } + msg.commit = self.raft_log.committed; + is_batched = true; + break; + } + } + is_batched + } + + /// Sends an append RPC with new entries (if any) and the current commit index to the given + /// peer. 
+ fn send_append(&mut self, to: u64, pr: &mut Progress, msgs: &mut Vec) { + self.maybe_send_append(to, pr, true, msgs); + } + + fn send_append_aggressively(&mut self, to: u64, pr: &mut Progress, msgs: &mut Vec) { + // If we have more entries to send, send as many messages as we + // can (without sending empty messages for the commit index) + while self.maybe_send_append(to, pr, false, msgs) {} + } + + /// Sends an append RPC with new entries to the given peer, + /// if necessary. Returns true if a message was sent. The allow_empty + /// argument controls whether messages with no entries will be sent + /// ("empty" messages are useful to convey updated Commit indexes, but + /// are undesirable when we're sending multiple messages in a batch). + fn maybe_send_append( + &mut self, + to: u64, + pr: &mut Progress, + allow_empty: bool, + msgs: &mut Vec, + ) -> bool { + if pr.is_paused() { + trace!( + self.logger, + "Skipping sending to {to}, it's paused", + to = to; + "progress" => ?pr, + ); + return false; + } + let mut m = Message::default(); + m.to = to; + if pr.pending_request_snapshot != INVALID_INDEX { + // Check pending request snapshot first to avoid unnecessary loading entries. 
+ if !self.prepare_send_snapshot(&mut m, pr, to) { + return false; + } + } else { + let ents = self.raft_log.entries( + pr.next_idx, + self.max_msg_size, + GetEntriesContext(GetEntriesFor::SendAppend { + to, + term: self.term, + aggressively: !allow_empty, + }), + ); + if !allow_empty && ents.as_ref().map_or(true, |e| e.is_empty()) { + return false; + } + let term = self.raft_log.term(pr.next_idx - 1); + match (term, ents) { + (Ok(term), Ok(mut ents)) => { + if self.batch_append && self.try_batching(to, msgs, pr, &mut ents) { + return true; + } + self.prepare_send_entries(&mut m, pr, term, ents) + } + (_, Err(Error::Store(StorageError::LogTemporarilyUnavailable))) => { + // wait for storage to fetch entries asynchronously + return false; + } + _ => { + // send snapshot if we failed to get term or entries. + if !self.prepare_send_snapshot(&mut m, pr, to) { + return false; + } + } + } + } + self.send(m, msgs); + true + } + + // send_heartbeat sends an empty MsgAppend + fn send_heartbeat( + &mut self, + to: u64, + pr: &Progress, + ctx: Option>, + msgs: &mut Vec, + ) { + // Attach the commit as min(to.matched, self.raft_log.committed). + // When the leader sends out heartbeat message, + // the receiver(follower) might not be matched with the leader + // or it might not have all the committed entries. + // The leader MUST NOT forward the follower's commit to + // an unmatched index. + let mut m = Message::default(); + m.to = to; + m.set_msg_type(MessageType::MsgHeartbeat); + let commit = cmp::min(pr.matched, self.raft_log.committed); + m.commit = commit; + if let Some(context) = ctx { + m.context = context.into(); + } + self.send(m, msgs); + } +} + +impl Raft { + /// Get the inflight buffer size. 
    pub fn inflight_buffers_size(&self) -> usize {
        let mut total_size = 0;
        for (_, pr) in self.prs().iter() {
            // NOTE(review): the generic argument of `size_of` was lost in
            // extraction (`size_of::<…>()`); confirm the element type against
            // the original source before relying on this figure.
            total_size += pr.ins.buffer_capacity() * std::mem::size_of::();
        }
        total_size
    }

    /// Sends an append RPC with new entries (if any) and the current commit index to the given
    /// peer.
    pub fn send_append(&mut self, to: u64) {
        let pr = self.prs.get_mut(to).unwrap();
        self.r.send_append(to, pr, &mut self.msgs)
    }

    pub(super) fn send_append_aggressively(&mut self, to: u64) {
        let pr = self.prs.get_mut(to).unwrap();
        self.r.send_append_aggressively(to, pr, &mut self.msgs)
    }

    /// Sends RPC, with entries to all peers that are not up-to-date
    /// according to the progress recorded in r.prs().
    pub fn bcast_append(&mut self) {
        let self_id = self.id;
        let core = &mut self.r;
        let msgs = &mut self.msgs;
        // The leader itself is skipped: its own progress advances via
        // `on_persist_entries`, not via append messages.
        self.prs
            .iter_mut()
            .filter(|&(id, _)| *id != self_id)
            .for_each(|(id, pr)| core.send_append(*id, pr, msgs));
    }

    /// Broadcasts heartbeats to all the followers if it's leader.
    pub fn ping(&mut self) {
        if self.state == StateRole::Leader {
            self.bcast_heartbeat();
        }
    }

    /// Sends RPC, without entries to all the peers.
    pub fn bcast_heartbeat(&mut self) {
        // Attach the context of the oldest pending read-only request, if any,
        // so its quorum acknowledgement rides on the heartbeat responses.
        let ctx = self.read_only.last_pending_request_ctx();
        self.bcast_heartbeat_with_ctx(ctx)
    }

    // NOTE(review): the parameter type reads `Option>` here because the inner
    // generic was stripped during extraction (presumably `Option<Vec<u8>>`,
    // matching `m.context = context.into()` in `send_heartbeat`) — confirm.
    fn bcast_heartbeat_with_ctx(&mut self, ctx: Option>) {
        let self_id = self.id;
        let core = &mut self.r;
        let msgs = &mut self.msgs;
        self.prs
            .iter_mut()
            .filter(|&(id, _)| *id != self_id)
            .for_each(|(id, pr)| core.send_heartbeat(*id, pr, ctx.clone(), msgs));
    }

    /// Attempts to advance the commit index. Returns true if the commit index
    /// changed (in which case the caller should call `r.bcast_append`).
    pub fn maybe_commit(&mut self) -> bool {
        // Index the progress tracker reports as committed across the
        // configuration (first element of the returned tuple).
        let mci = self.mut_prs().maximal_committed_index().0;
        if self.r.raft_log.maybe_commit(mci, self.r.term) {
            // Mirror the new commit index into the leader's own progress entry.
            let (self_id, committed) = (self.id, self.raft_log.committed);
            self.mut_prs()
                .get_mut(self_id)
                .unwrap()
                .update_committed(committed);
            return true;
        }
        false
    }

    /// Commit that the Raft peer has applied up to the given index.
    ///
    /// Registers the new applied index to the Raft log.
    ///
    /// # Hooks
    ///
    /// * Post: Checks to see if it's time to finalize a Joint Consensus state.
    #[inline]
    pub fn commit_apply(&mut self, applied: u64) {
        self.commit_apply_internal(applied, false)
    }

    /// Commit that the Raft peer has applied up to the given index.
    ///
    /// Registers the new applied index to the Raft log.
    /// if `skip_check` is true, will skip the applied_index check, this is only
    /// used at initialization.
    ///
    /// # Hooks
    ///
    /// * Post: Checks to see if it's time to finalize a Joint Consensus state.
    fn commit_apply_internal(&mut self, applied: u64, skip_check: bool) {
        let old_applied = self.raft_log.applied;
        if !skip_check {
            #[allow(deprecated)]
            self.raft_log.applied_to(applied);
        } else {
            // skip applied_index check at initialization.
            assert!(applied > 0);
            self.raft_log.applied_to_unchecked(applied);
        }

        // TODO: it may never auto_leave if leader steps down before enter joint is applied.
        if self.prs.conf().auto_leave
            && old_applied <= self.pending_conf_index
            && applied >= self.pending_conf_index
            && self.state == StateRole::Leader
        {
            // If the current (and most recent, at least for this leader's term)
            // configuration should be auto-left, initiate that now. We use a
            // nil Data which unmarshals into an empty ConfChangeV2 and has the
            // benefit that appendEntry can never refuse it based on its size
            // (which registers as zero).
            let mut entry = Entry::default();
            entry.set_entry_type(EntryType::EntryConfChangeV2);

            // append_entry will never refuse an empty
            if !self.append_entry(&mut [entry]) {
                panic!("appending an empty EntryConfChangeV2 should never be dropped")
            }
            self.pending_conf_index = self.raft_log.last_index();
            info!(self.logger, "initiating automatic transition out of joint configuration"; "config" => ?self.prs.conf());
        }
    }

    /// Resets the current node to a given term.
    ///
    /// Clears leadership, vote bookkeeping, timers, leader-transfer and
    /// read-only state, then re-seeds every peer's progress entry; the order
    /// of these steps is intentional.
    pub fn reset(&mut self, term: u64) {
        if self.term != term {
            self.term = term;
            self.vote = INVALID_ID;
        }
        self.leader_id = INVALID_ID;
        self.reset_randomized_election_timeout();
        self.election_elapsed = 0;
        self.heartbeat_elapsed = 0;

        self.abort_leader_transfer();

        self.prs.reset_votes();

        self.pending_conf_index = 0;
        self.read_only = ReadOnly::new(self.read_only.option);
        self.pending_request_snapshot = INVALID_INDEX;

        let last_index = self.raft_log.last_index();
        let committed = self.raft_log.committed;
        let persisted = self.raft_log.persisted;
        let self_id = self.id;
        for (&id, pr) in self.mut_prs().iter_mut() {
            pr.reset(last_index + 1);
            // Our own entry tracks what we have already persisted/committed.
            if id == self_id {
                pr.matched = persisted;
                pr.committed_index = committed;
            }
        }
    }

    /// Appends a slice of entries to the log.
    /// The entries are updated to match the current index and term.
    /// Only called by leader currently
    #[must_use]
    pub fn append_entry(&mut self, es: &mut [Entry]) -> bool {
        // May drop the whole batch when the uncommitted-size quota is exceeded.
        if !self.maybe_increase_uncommitted_size(es) {
            return false;
        }

        let li = self.raft_log.last_index();
        for (i, e) in es.iter_mut().enumerate() {
            e.term = self.term;
            e.index = li + 1 + i as u64;
        }
        self.raft_log.append(es);

        // Not update self's pr.matched until on_persist_entries
        true
    }

    /// Notifies that these raft logs have been persisted.
    pub fn on_persist_entries(&mut self, index: u64, term: u64) {
        let update = self.raft_log.maybe_persist(index, term);
        if update && self.state == StateRole::Leader {
            // Actually, if it is a leader and persisted index is updated, this term
            // must be equal to self.term because the persisted index must be equal to
            // the last index of entries from previous leader when it becomes leader
            // (see the comments in become_leader), namely, the new persisted entries
            // must come from this leader.
            if term != self.term {
                // Invariant violation: logged as an error but deliberately not fatal.
                error!(
                    self.logger,
                    "leader's persisted index changed but the term {} is not the same as {}",
                    term,
                    self.term
                );
            }
            let self_id = self.id;
            let pr = self.mut_prs().get_mut(self_id).unwrap();
            // Advancing our own match index may move the commit index forward.
            if pr.maybe_update(index) && self.maybe_commit() && self.should_bcast_commit() {
                self.bcast_append();
            }
        }
    }

    /// Notifies that the snapshot have been persisted.
    pub fn on_persist_snap(&mut self, index: u64) {
        self.raft_log.maybe_persist_snap(index);
    }

    /// Returns true to indicate that there will probably be some readiness need to be handled.
    pub fn tick(&mut self) -> bool {
        match self.state {
            StateRole::Follower | StateRole::PreCandidate | StateRole::Candidate => {
                self.tick_election()
            }
            StateRole::Leader => self.tick_heartbeat(),
        }
    }

    // TODO: revoke pub when there is a better way to test.
    /// Run by followers and candidates after self.election_timeout.
    ///
    /// Returns true to indicate that there will probably be some readiness need to be handled.
    pub fn tick_election(&mut self) -> bool {
        self.election_elapsed += 1;
        // Only campaign once the randomized timeout has passed and this node
        // is allowed to be promoted (it is a voter in the current config).
        if !self.pass_election_timeout() || !self.promotable {
            return false;
        }

        self.election_elapsed = 0;
        let m = new_message(INVALID_ID, MessageType::MsgHup, Some(self.id));
        let _ = self.step(m);
        true
    }

    // tick_heartbeat is run by leaders to send a MsgBeat after self.heartbeat_timeout.
    // Returns true to indicate that there will probably be some readiness need to be handled.
    fn tick_heartbeat(&mut self) -> bool {
        self.heartbeat_elapsed += 1;
        // The leader reuses election_elapsed to pace its quorum check below.
        self.election_elapsed += 1;

        let mut has_ready = false;
        if self.election_elapsed >= self.election_timeout {
            self.election_elapsed = 0;
            if self.check_quorum {
                let m = new_message(INVALID_ID, MessageType::MsgCheckQuorum, Some(self.id));
                has_ready = true;
                let _ = self.step(m);
            }
            // An unfinished leadership transfer is abandoned after one full
            // election timeout.
            if self.state == StateRole::Leader && self.lead_transferee.is_some() {
                self.abort_leader_transfer()
            }
        }

        // The MsgCheckQuorum step above may have demoted us.
        if self.state != StateRole::Leader {
            return has_ready;
        }

        if self.heartbeat_elapsed >= self.heartbeat_timeout {
            self.heartbeat_elapsed = 0;
            has_ready = true;
            let m = new_message(INVALID_ID, MessageType::MsgBeat, Some(self.id));
            let _ = self.step(m);
        }
        has_ready
    }

    /// Converts this node to a follower.
    pub fn become_follower(&mut self, term: u64, leader_id: u64) {
        // Preserve any in-flight snapshot request across the reset, which
        // would otherwise clear it.
        let pending_request_snapshot = self.pending_request_snapshot;
        self.reset(term);
        self.leader_id = leader_id;
        let from_role = self.state;
        self.state = StateRole::Follower;
        self.pending_request_snapshot = pending_request_snapshot;
        // TODO: In theory, it's better to let the user control this after
        // leadership changes, but because the user only know this info after
        // calling `ready()` but the committed entries is fetched within `ready()`,
        // so we hard code this logic here currently. We may need to remove
        // this hard code when we want to also support apply unpersisted log
        // on follower.
        self.raft_log.max_apply_unpersisted_log_limit = 0;
        info!(
            self.logger,
            "became follower at term {term}",
            term = self.term;
            "from_role" => ?from_role,
        );
    }

    // TODO: revoke pub when there is a better way to test.
    /// Converts this node to a candidate
    ///
    /// # Panics
    ///
    /// Panics if a leader already exists.
    pub fn become_candidate(&mut self) {
        assert_ne!(
            self.state,
            StateRole::Leader,
            "invalid transition [leader -> candidate]"
        );
        // A real (non-pre-vote) candidacy bumps the term and votes for self.
        let term = self.term + 1;
        self.reset(term);
        let id = self.id;
        self.vote = id;
        self.state = StateRole::Candidate;
        info!(
            self.logger,
            "became candidate at term {term}",
            term = self.term;
        );
    }

    /// Converts this node to a pre-candidate
    ///
    /// # Panics
    ///
    /// Panics if a leader already exists.
    pub fn become_pre_candidate(&mut self) {
        assert_ne!(
            self.state,
            StateRole::Leader,
            "invalid transition [leader -> pre-candidate]"
        );
        // Becoming a pre-candidate changes our state.
        // but doesn't change anything else. In particular it does not increase
        // self.term or change self.vote.
        self.state = StateRole::PreCandidate;
        self.prs.reset_votes();
        // If a network partition happens, and leader is in minority partition,
        // it will step down, and become follower without notifying others.
        self.leader_id = INVALID_ID;
        info!(
            self.logger,
            "became pre-candidate at term {term}",
            term = self.term;
        );
    }

    // TODO: revoke pub when there is a better way to test.
    /// Makes this raft the leader.
    ///
    /// # Panics
    ///
    /// Panics if this is a follower node.
    pub fn become_leader(&mut self) {
        trace!(self.logger, "ENTER become_leader");
        assert_ne!(
            self.state,
            StateRole::Follower,
            "invalid transition [follower -> leader]"
        );
        let term = self.term;
        self.reset(term);
        self.leader_id = self.id;
        self.state = StateRole::Leader;

        let last_index = self.raft_log.last_index();
        // If there is only one peer, it becomes leader after campaigning
        // so all logs must be persisted.
        // If not, it becomes leader after sending RequestVote msg.
        // Since all logs must be persisted before sending RequestVote
        // msg and logs can not be changed when it's (pre)candidate, the
        // last index is equal to persisted index when it becomes leader.
        assert_eq!(last_index, self.raft_log.persisted);

        // Update uncommitted state
        self.uncommitted_state.uncommitted_size = 0;
        self.uncommitted_state.last_log_tail_index = last_index;

        // Followers enter replicate mode when they've been successfully probed
        // (perhaps after having received a snapshot as a result). The leader is
        // trivially in this state. Note that r.reset() has initialized this
        // progress with the last index already.
        let id = self.id;
        self.mut_prs().get_mut(id).unwrap().become_replicate();

        // Conservatively set the pending_conf_index to the last index in the
        // log. There may or may not be a pending config change, but it's
        // safe to delay any future proposals until we commit all our
        // pending log entries, and scanning the entire tail of the log
        // could be expensive.
        self.pending_conf_index = last_index;

        // No need to check result because append_entry never refuse entries
        // which size is zero
        if !self.append_entry(&mut [Entry::default()]) {
            panic!("appending an empty entry should never be dropped")
        }

        info!(
            self.logger,
            "became leader at term {term}",
            term = self.term;
        );
        trace!(self.logger, "EXIT become_leader");
    }

    // Campaign to attempt to become a leader.
    //
    // If prevote is enabled, this is handled as well.
    #[doc(hidden)]
    pub fn campaign(&mut self, campaign_type: &'static [u8]) {
        let (vote_msg, term) = if campaign_type == CAMPAIGN_PRE_ELECTION {
            self.become_pre_candidate();
            // Pre-vote RPCs are sent for next term before we've incremented self.term.
            (MessageType::MsgRequestPreVote, self.term + 1)
        } else {
            self.become_candidate();
            (MessageType::MsgRequestVote, self.term)
        };
        let self_id = self.id;
        if VoteResult::Won == self.poll(self_id, vote_msg, true) {
            // We won the election after voting for ourselves (which must mean that
            // this is a single-node cluster).
            return;
        }

        let (commit, commit_term) = self.raft_log.commit_info();
        // Fixed 7-slot scratch buffer used purely to batch the broadcast log
        // line below; it is flushed whenever it fills up.
        let mut voters = [0; 7];
        let mut voter_cnt = 0;

        // Only send vote request to voters.
        for id in self.prs.conf().voters().ids().iter() {
            if id == self_id {
                continue;
            }

            if voter_cnt == voters.len() {
                self.log_broadcast_vote(vote_msg, &voters);
                voter_cnt = 0;
            }
            voters[voter_cnt] = id;
            voter_cnt += 1;
            let mut m = new_message(id, vote_msg, None);
            m.term = term;
            m.index = self.raft_log.last_index();
            m.log_term = self.raft_log.last_term();
            m.commit = commit;
            m.commit_term = commit_term;
            // A transfer-triggered campaign carries its context so recipients
            // can bypass the leader-lease check.
            if campaign_type == CAMPAIGN_TRANSFER {
                m.context = campaign_type.into();
            }
            self.r.send(m, &mut self.msgs);
        }
        if voter_cnt > 0 {
            self.log_broadcast_vote(vote_msg, &voters[..voter_cnt]);
        }
    }

    #[inline]
    fn log_broadcast_vote(&self, t: MessageType, ids: &[u64]) {
        info!(
            self.logger,
            "broadcasting vote request";
            "type" => ?t,
            "term" => self.term,
            "log_term" => self.raft_log.last_term(),
            "log_index" => self.raft_log.last_index(),
            "to" => ?ids,
        );
    }

    /// Steps the raft along via a message. This should be called everytime your raft receives a
    /// message from a peer.
    pub fn step(&mut self, m: Message) -> Result<()> {
        // Handle the message term, which may result in our stepping down to a follower.
+ if m.term == 0 { + // local message + } else if m.term > self.term { + if m.get_msg_type() == MessageType::MsgRequestVote + || m.get_msg_type() == MessageType::MsgRequestPreVote + { + let force = m.context == CAMPAIGN_TRANSFER; + let in_lease = self.check_quorum + && self.leader_id != INVALID_ID + && self.election_elapsed < self.election_timeout; + if !force && in_lease { + // if a server receives RequestVote request within the minimum election + // timeout of hearing from a current leader, it does not update its term + // or grant its vote + // + // This is included in the 3rd concern for Joint Consensus, where if another + // peer is removed from the cluster it may try to hold elections and disrupt + // stability. + info!( + self.logger, + "[logterm: {log_term}, index: {log_index}, vote: {vote}] ignored vote from \ + {from} [logterm: {msg_term}, index: {msg_index}]: lease is not expired", + log_term = self.raft_log.last_term(), + log_index = self.raft_log.last_index(), + vote = self.vote, + from = m.from, + msg_term = m.log_term, + msg_index = m.index; + "term" => self.term, + "remaining ticks" => self.election_timeout - self.election_elapsed, + "msg type" => ?m.get_msg_type(), + "leader_id" => self.leader_id, + ); + + return Ok(()); + } + } + + if m.get_msg_type() == MessageType::MsgRequestPreVote + || (m.get_msg_type() == MessageType::MsgRequestPreVoteResponse && !m.reject) + { + // For a pre-vote request: + // Never change our term in response to a pre-vote request. + // + // For a pre-vote response with pre-vote granted: + // We send pre-vote requests with a term in our future. If the + // pre-vote is granted, we will increment our term when we get a + // quorum. If it is not, the term comes from the node that + // rejected our vote so we should become a follower at the new + // term. 
+ } else { + info!( + self.logger, + "received a message with higher term from {from}", + from = m.from; + "term" => self.term, + "message_term" => m.term, + "msg type" => ?m.get_msg_type(), + ); + if m.get_msg_type() == MessageType::MsgAppend + || m.get_msg_type() == MessageType::MsgHeartbeat + || m.get_msg_type() == MessageType::MsgSnapshot + { + self.become_follower(m.term, m.from); + } else { + self.become_follower(m.term, INVALID_ID); + } + } + } else if m.term < self.term { + if (self.check_quorum || self.pre_vote) + && (m.get_msg_type() == MessageType::MsgHeartbeat + || m.get_msg_type() == MessageType::MsgAppend) + { + // We have received messages from a leader at a lower term. It is possible + // that these messages were simply delayed in the network, but this could + // also mean that this node has advanced its term number during a network + // partition, and it is now unable to either win an election or to rejoin + // the majority on the old term. If checkQuorum is false, this will be + // handled by incrementing term numbers in response to MsgVote with a higher + // term, but if checkQuorum is true we may not advance the term on MsgVote and + // must generate other messages to advance the term. The net result of these + // two features is to minimize the disruption caused by nodes that have been + // removed from the cluster's configuration: a removed node will send MsgVotes + // which will be ignored, but it will not receive MsgApp or MsgHeartbeat, so it + // will not create disruptive term increases, by notifying leader of this node's + // activeness. + // The above comments also true for Pre-Vote + // + // When follower gets isolated, it soon starts an election ending + // up with a higher term than leader, although it won't receive enough + // votes to win the election. When it regains connectivity, this response + // with "pb.MsgAppResp" of higher term would force leader to step down. 
+ // However, this disruption is inevitable to free this stuck node with + // fresh election. This can be prevented with Pre-Vote phase. + let to_send = new_message(m.from, MessageType::MsgAppendResponse, None); + self.r.send(to_send, &mut self.msgs); + } else if m.get_msg_type() == MessageType::MsgRequestPreVote { + // Before pre_vote enable, there may be a receiving candidate with higher term, + // but less log. After update to pre_vote, the cluster may deadlock if + // we drop messages with a lower term. + info!( + self.logger, + "{} [log_term: {}, index: {}, vote: {}] rejected {:?} from {} [log_term: {}, index: {}] at term {}", + self.id, + self.raft_log.last_term(), + self.raft_log.last_index(), + self.vote, + m.get_msg_type(), + m.from, + m.log_term, + m.index, + self.term, + ); + + let mut to_send = new_message(m.from, MessageType::MsgRequestPreVoteResponse, None); + to_send.term = self.term; + to_send.reject = true; + self.r.send(to_send, &mut self.msgs); + } else { + // ignore other cases + info!( + self.logger, + "ignored a message with lower term from {from}", + from = m.from; + "term" => self.term, + "msg type" => ?m.get_msg_type(), + "msg term" => m.term + ); + } + return Ok(()); + } + + #[cfg(feature = "failpoints")] + fail_point!("before_step"); + + match m.get_msg_type() { + MessageType::MsgHup => self.hup(false), + MessageType::MsgRequestVote | MessageType::MsgRequestPreVote => { + // We can vote if this is a repeat of a vote we've already cast... + let can_vote = (self.vote == m.from) || + // ...we haven't voted and we don't think there's a leader yet in this term... + (self.vote == INVALID_ID && self.leader_id == INVALID_ID) || + // ...or this is a PreVote for a future term... + (m.get_msg_type() == MessageType::MsgRequestPreVote && m.term > self.term); + // ...and we believe the candidate is up to date. 
+ if can_vote + && self.raft_log.is_up_to_date(m.index, m.log_term) + && (m.index > self.raft_log.last_index() || self.priority <= get_priority(&m)) + { + // When responding to Msg{Pre,}Vote messages we include the term + // from the message, not the local term. To see why consider the + // case where a single node was previously partitioned away and + // it's local term is now of date. If we include the local term + // (recall that for pre-votes we don't update the local term), the + // (pre-)campaigning node on the other end will proceed to ignore + // the message (it ignores all out of date messages). + // The term in the original message and current local term are the + // same in the case of regular votes, but different for pre-votes. + self.log_vote_approve(&m); + let mut to_send = + new_message(m.from, vote_resp_msg_type(m.get_msg_type()), None); + to_send.reject = false; + to_send.term = m.term; + self.r.send(to_send, &mut self.msgs); + if m.get_msg_type() == MessageType::MsgRequestVote { + // Only record real votes. + self.election_elapsed = 0; + self.vote = m.from; + } + } else { + self.log_vote_reject(&m); + let mut to_send = + new_message(m.from, vote_resp_msg_type(m.get_msg_type()), None); + to_send.reject = true; + to_send.term = self.term; + let (commit, commit_term) = self.raft_log.commit_info(); + to_send.commit = commit; + to_send.commit_term = commit_term; + self.r.send(to_send, &mut self.msgs); + self.maybe_commit_by_vote(&m); + } + } + _ => match self.state { + StateRole::PreCandidate | StateRole::Candidate => self.step_candidate(m)?, + StateRole::Follower => self.step_follower(m)?, + StateRole::Leader => self.step_leader(m)?, + }, + } + Ok(()) + } + + fn hup(&mut self, transfer_leader: bool) { + if self.state == StateRole::Leader { + debug!( + self.logger, + "ignoring MsgHup because already leader"; + ); + return; + } + + // Scan all unapplied committed entries to find a config change. 
+ // Paginate the scan, to avoid a potentially unlimited memory spike. + // + // If there is a pending snapshot, its index will be returned by + // `maybe_first_index`. Note that snapshot updates configuration + // already, so as long as pending entries don't contain conf change + // it's safe to start campaign. + let low = match self.raft_log.unstable.maybe_first_index() { + Some(idx) => idx, + None => self.raft_log.applied + 1, + }; + let high = self.raft_log.committed + 1; + let ctx = GetEntriesContext(GetEntriesFor::TransferLeader); + if self.has_unapplied_conf_changes(low, high, ctx) { + warn!( + self.logger, + "cannot campaign at term {} since there are still pending configuration changes to apply", self.term + ); + return; + } + + info!( + self.logger, + "starting a new election"; + "term" => self.term, + ); + if transfer_leader { + self.campaign(CAMPAIGN_TRANSFER); + } else if self.pre_vote { + self.campaign(CAMPAIGN_PRE_ELECTION); + } else { + self.campaign(CAMPAIGN_ELECTION); + } + } + + fn has_unapplied_conf_changes(&self, lo: u64, hi: u64, context: GetEntriesContext) -> bool { + if self.raft_log.applied >= self.raft_log.committed { + // in fact applied == committed + return false; + } + let mut found = false; + // Reuse the max_committed_size_per_ready limit because it is used for + // similar purposes (limiting the read of unapplied committed entries) + // when raft sends entries via the Ready struct for application. + // TODO(pavelkalinnikov): find a way to budget memory/bandwidth for this scan + // outside the raft package. 
+ let page_size = self.max_committed_size_per_ready; + if let Err(err) = self.raft_log.scan(lo, hi, page_size, context, |ents| { + for e in ents { + if e.get_entry_type() == EntryType::EntryConfChange + || e.get_entry_type() == EntryType::EntryConfChangeV2 + { + found = true; + return false; + } + } + true + }) { + fatal!( + self.logger, + "error scanning unapplied entries [{}, {}): {:?}", + lo, + hi, + err + ); + } + found + } + + fn log_vote_approve(&self, m: &Message) { + info!( + self.logger, + "[logterm: {log_term}, index: {log_index}, vote: {vote}] cast vote for {from} [logterm: {msg_term}, index: {msg_index}] \ + at term {term}", + log_term = self.raft_log.last_term(), + log_index = self.raft_log.last_index(), + vote = self.vote, + from = m.from, + msg_term = m.log_term, + msg_index = m.index, + term = self.term; + "msg type" => ?m.get_msg_type(), + ); + } + + fn log_vote_reject(&self, m: &Message) { + info!( + self.logger, + "[logterm: {log_term}, index: {log_index}, vote: {vote}] rejected vote from {from} [logterm: {msg_term}, index: \ + {msg_index}] at term {term}", + log_term = self.raft_log.last_term(), + log_index = self.raft_log.last_index(), + vote = self.vote, + from = m.from, + msg_term = m.log_term, + msg_index = m.index, + term = self.term; + "msg type" => ?m.get_msg_type(), + ); + } + + fn handle_append_response(&mut self, m: &Message) { + let mut next_probe_index: u64 = m.reject_hint; + // pull out find_conflict_by_term for immutable borrow + if m.reject && m.log_term > 0 { + // If the follower has an uncommitted log tail, we would end up + // probing one by one until we hit the common prefix. + // + // For example, if the leader has: + // + // idx 1 2 3 4 5 6 7 8 9 + // ----------------- + // term (L) 1 3 3 3 5 5 5 5 5 + // term (F) 1 1 1 1 2 2 + // + // Then, after sending an append anchored at (idx=9,term=5) we + // would receive a RejectHint of 6 and LogTerm of 2. 
Without the + // code below, we would try an append at index 6, which would + // fail again. + // + // However, looking only at what the leader knows about its own + // log and the rejection hint, it is clear that a probe at index + // 6, 5, 4, 3, and 2 must fail as well: + // + // For all of these indexes, the leader's log term is larger than + // the rejection's log term. If a probe at one of these indexes + // succeeded, its log term at that index would match the leader's, + // i.e. 3 or 5 in this example. But the follower already told the + // leader that it is still at term 2 at index 6, and since the + // log term only ever goes up (within a log), this is a contradiction. + // + // At index 1, however, the leader can draw no such conclusion, + // as its term 1 is not larger than the term 2 from the + // follower's rejection. We thus probe at 1, which will succeed + // in this example. In general, with this approach we probe at + // most once per term found in the leader's log. + // + // There is a similar mechanism on the follower (implemented in + // handleAppendEntries via a call to findConflictByTerm) that is + // useful if the follower has a large divergent uncommitted log + // tail[1], as in this example: + // + // idx 1 2 3 4 5 6 7 8 9 + // ----------------- + // term (L) 1 3 3 3 3 3 3 3 7 + // term (F) 1 3 3 4 4 5 5 5 6 + // + // Naively, the leader would probe at idx=9, receive a rejection + // revealing the log term of 6 at the follower. Since the leader's + // term at the previous index is already smaller than 6, the leader- + // side optimization discussed above is ineffective. The leader thus + // probes at index 8 and, naively, receives a rejection for the same + // index and log term 5. 
Again, the leader optimization does not improve + // over linear probing as term 5 is above the leader's term 3 for that + // and many preceding indexes; the leader would have to probe linearly + // until it would finally hit index 3, where the probe would succeed. + // + // Instead, we apply a similar optimization on the follower. When the + // follower receives the probe at index 8 (log term 3), it concludes + // that all of the leader's log preceding that index has log terms of + // 3 or below. The largest index in the follower's log with a log term + // of 3 or below is index 3. The follower will thus return a rejection + // for index=3, log term=3 instead. The leader's next probe will then + // succeed at that index. + // + // [1]: more precisely, if the log terms in the large uncommitted + // tail on the follower are larger than the leader's. At first, + // it may seem unintuitive that a follower could even have such + // a large tail, but it can happen: + // + // 1. Leader appends (but does not commit) entries 2 and 3, crashes. + // idx 1 2 3 4 5 6 7 8 9 + // ----------------- + // term (L) 1 2 2 [crashes] + // term (F) 1 + // term (F) 1 + // + // 2. a follower becomes leader and appends entries at term 3. + // ----------------- + // term (x) 1 2 2 [down] + // term (F) 1 3 3 3 3 + // term (F) 1 + // + // 3. term 3 leader goes down, term 2 leader returns as term 4 + // leader. It commits the log & entries at term 4. + // + // ----------------- + // term (L) 1 2 2 2 + // term (x) 1 3 3 3 3 [down] + // term (F) 1 + // ----------------- + // term (L) 1 2 2 2 4 4 4 + // term (F) 1 3 3 3 3 [gets probed] + // term (F) 1 2 2 2 4 4 4 + // + // 4. the leader will now probe the returning follower at index + // 7, the rejection points it at the end of the follower's log + // which is at a higher log term than the actually committed + // log. 
+ next_probe_index = self + .raft_log + .find_conflict_by_term(m.reject_hint, m.log_term) + .0; + } + + let pr = match self.prs.get_mut(m.from) { + Some(pr) => pr, + None => { + debug!( + self.logger, + "no progress available for {}", + m.from; + ); + return; + } + }; + pr.recent_active = true; + + // update followers committed index via append response + pr.update_committed(m.commit); + + if m.reject { + // RejectHint is the suggested next base entry for appending (i.e. + // we try to append entry RejectHint+1 next), and LogTerm is the + // term that the follower has at index RejectHint. Older versions + // of this library did not populate LogTerm for rejections and it + // is zero for followers with an empty log. + // + // Under normal circumstances, the leader's log is longer than the + // follower's and the follower's log is a prefix of the leader's + // (i.e. there is no divergent uncommitted suffix of the log on the + // follower). In that case, the first probe reveals where the + // follower's log ends (RejectHint=follower's last index) and the + // subsequent probe succeeds. + // + // However, when networks are partitioned or systems overloaded, + // large divergent log tails can occur. The naive attempt, probing + // entry by entry in decreasing order, will be the product of the + // length of the diverging tails and the network round-trip latency, + // which can easily result in hours of time spent probing and can + // even cause outright outages. The probes are thus optimized as + // described below. 
+ debug!( + self.r.logger, + "received msgAppend rejection"; + "reject_hint_index" => m.reject_hint, + "reject_hint_term" => m.log_term, + "from" => m.from, + "index" => m.index, + ); + + if pr.maybe_decr_to(m.index, next_probe_index, m.request_snapshot) { + debug!( + self.r.logger, + "decreased progress of {}", + m.from; + "progress" => ?pr, + ); + if pr.state == ProgressState::Replicate { + pr.become_probe(); + } + self.send_append(m.from); + } + return; + } + + let old_paused = pr.is_paused(); + if !pr.maybe_update(m.index) { + return; + } + + match pr.state { + ProgressState::Probe => pr.become_replicate(), + ProgressState::Snapshot => { + if pr.is_snapshot_caught_up() { + debug!( + self.r.logger, + "snapshot caught up, resumed sending replication messages to {from}", + from = m.from; + "progress" => ?pr, + ); + pr.become_probe(); + } + } + ProgressState::Replicate => pr.ins.free_to(m.get_index()), + } + + if self.maybe_commit() { + if self.should_bcast_commit() { + self.bcast_append() + } + } else if old_paused { + self.send_append(m.from); + } + + // We've updated flow control information above, which may + // allow us to send multiple (size-limited) in-flight messages + // at once (such as when transitioning from probe to + // replicate, or when freeTo() covers multiple messages). If + // we have more entries to send, send as many messages as we + // can (without sending empty messages for the commit index) + self.send_append_aggressively(m.from); + + // Transfer leadership is in progress. + if Some(m.from) == self.r.lead_transferee { + let last_index = self.r.raft_log.last_index(); + let pr = self.prs.get_mut(m.from).unwrap(); + if pr.matched == last_index { + info!( + self.logger, + "sent MsgTimeoutNow to {from} after received MsgAppResp", + from = m.from; + ); + self.send_timeout_now(m.from); + } + } + } + + fn handle_heartbeat_response(&mut self, m: &Message) { + // Update the node. Drop the value explicitly since we'll check the qourum after. 
+ let pr = match self.prs.get_mut(m.from) { + Some(pr) => pr, + None => { + debug!( + self.logger, + "no progress available for {}", + m.from; + ); + return; + } + }; + // update followers committed index via heartbeat response + pr.update_committed(m.commit); + pr.recent_active = true; + pr.resume(); + + // free one slot for the full inflights window to allow progress. + if pr.state == ProgressState::Replicate && pr.ins.full() { + pr.ins.free_first_one(); + } + // Does it request snapshot? + if pr.matched < self.r.raft_log.last_index() || pr.pending_request_snapshot != INVALID_INDEX + { + self.r.send_append(m.from, pr, &mut self.msgs); + } + + if self.read_only.option != ReadOnlyOption::Safe || m.context.is_empty() { + return; + } + + match self.r.read_only.recv_ack(m.from, &m.context) { + Some(acks) if self.prs.has_quorum(acks) => {} + _ => return, + } + + for rs in self.r.read_only.advance(&m.context, &self.r.logger) { + if let Some(m) = self.handle_ready_read_index(rs.req, rs.index) { + self.r.send(m, &mut self.msgs); + } + } + } + + fn handle_transfer_leader(&mut self, m: &Message) { + if self.prs().get(m.from).is_none() { + debug!( + self.logger, + "no progress available for {}", + m.from; + ); + return; + } + + let from = m.from; + if self.prs.conf().learners.contains(&from) { + debug!( + self.logger, + "ignored transferring leadership"; + "to" => from, + ); + return; + } + let lead_transferee = from; + if let Some(last_lead_transferee) = self.lead_transferee { + if last_lead_transferee == lead_transferee { + info!( + self.logger, + "[term {term}] transfer leadership to {lead_transferee} is in progress, ignores request \ + to same node {lead_transferee}", + term = self.term, + lead_transferee = lead_transferee; + ); + return; + } + self.abort_leader_transfer(); + info!( + self.logger, + "[term {term}] abort previous transferring leadership to {last_lead_transferee}", + term = self.term, + last_lead_transferee = last_lead_transferee; + ); + } + if 
lead_transferee == self.id { + debug!( + self.logger, + "already leader; ignored transferring leadership to self"; + ); + return; + } + // Transfer leadership to third party. + info!( + self.logger, + "[term {term}] starts to transfer leadership to {lead_transferee}", + term = self.term, + lead_transferee = lead_transferee; + ); + // Transfer leadership should be finished in one electionTimeout + // so reset r.electionElapsed. + self.election_elapsed = 0; + self.lead_transferee = Some(lead_transferee); + let pr = self.prs.get_mut(from).unwrap(); + if pr.matched == self.r.raft_log.last_index() { + self.send_timeout_now(lead_transferee); + info!( + self.logger, + "sends MsgTimeoutNow to {lead_transferee} immediately as {lead_transferee} already has up-to-date log", + lead_transferee = lead_transferee; + ); + } else { + self.r.send_append(lead_transferee, pr, &mut self.msgs); + } + } + + fn handle_snapshot_status(&mut self, m: &Message) { + let pr = match self.prs.get_mut(m.from) { + Some(pr) => pr, + None => { + debug!( + self.logger, + "no progress available for {}", + m.from; + ); + return; + } + }; + if pr.state != ProgressState::Snapshot { + return; + } + if m.reject { + pr.snapshot_failure(); + pr.become_probe(); + debug!( + self.r.logger, + "snapshot failed, resumed sending replication messages to {from}", + from = m.from; + "progress" => ?pr, + ); + } else { + pr.become_probe(); + debug!( + self.r.logger, + "snapshot succeeded, resumed sending replication messages to {from}", + from = m.from; + "progress" => ?pr, + ); + } + // If snapshot finish, wait for the msgAppResp from the remote node before sending + // out the next msgAppend. 
+ // If snapshot failure, wait for a heartbeat interval before next try + pr.pause(); + pr.pending_request_snapshot = INVALID_INDEX; + } + + fn handle_unreachable(&mut self, m: &Message) { + let pr = match self.prs.get_mut(m.from) { + Some(pr) => pr, + None => { + debug!( + self.logger, + "no progress available for {}", + m.from; + ); + return; + } + }; + // During optimistic replication, if the remote becomes unreachable, + // there is huge probability that a MsgAppend is lost. + if pr.state == ProgressState::Replicate { + pr.become_probe(); + } + debug!( + self.r.logger, + "failed to send message to {from} because it is unreachable", + from = m.from; + "progress" => ?pr, + ); + } + + fn step_leader(&mut self, mut m: Message) -> Result<()> { + // These message types do not require any progress for m.From. + match m.get_msg_type() { + MessageType::MsgBeat => { + self.bcast_heartbeat(); + return Ok(()); + } + MessageType::MsgCheckQuorum => { + if !self.check_quorum_active() { + warn!( + self.logger, + "stepped down to follower since quorum is not active"; + ); + let term = self.term; + self.become_follower(term, INVALID_ID); + } + return Ok(()); + } + MessageType::MsgPropose => { + if m.entries.is_empty() { + fatal!(self.logger, "stepped empty MsgProp"); + } + if !self.prs.progress().contains_key(&self.id) { + // If we are not currently a member of the range (i.e. this node + // was removed from the configuration while serving as leader), + // drop any new proposals. 
+ return Err(Error::ProposalDropped); + } + if self.lead_transferee.is_some() { + debug!( + self.logger, + "[term {term}] transfer leadership to {lead_transferee} is in progress; dropping \ + proposal", + term = self.term, + lead_transferee = self.lead_transferee.unwrap(); + ); + return Err(Error::ProposalDropped); + } + + for (i, e) in m.mut_entries().iter_mut().enumerate() { + let mut cc; + if e.get_entry_type() == EntryType::EntryConfChange { + let mut cc_v1 = ConfChange::default(); + if let Err(e) = cc_v1.merge_from_bytes(e.get_data()) { + error!(self.logger, "invalid confchange"; "error" => ?e); + return Err(Error::ProposalDropped); + } + cc = cc_v1.into_v2(); + } else if e.get_entry_type() == EntryType::EntryConfChangeV2 { + cc = ConfChangeV2::default(); + if let Err(e) = cc.merge_from_bytes(e.get_data()) { + error!(self.logger, "invalid confchangev2"; "error" => ?e); + return Err(Error::ProposalDropped); + } + } else { + continue; + } + + let reason = if self.has_pending_conf() { + "possible unapplied conf change" + } else { + let already_joint = confchange::joint(self.prs.conf()); + let want_leave = cc.changes.is_empty(); + if already_joint && !want_leave { + "must transition out of joint config first" + } else if !already_joint && want_leave { + "not in joint state; refusing empty conf change" + } else { + "" + } + }; + + if reason.is_empty() { + self.pending_conf_index = self.raft_log.last_index() + i as u64 + 1; + } else { + info!( + self.logger, + "ignoring conf change"; + "conf change" => ?cc, + "reason" => reason, + "config" => ?self.prs.conf(), + "index" => self.pending_conf_index, + "applied" => self.raft_log.applied, + ); + *e = Entry::default(); + e.set_entry_type(EntryType::EntryNormal); + } + } + if !self.append_entry(m.mut_entries()) { + // return ProposalDropped when uncommitted size limit is reached + debug!( + self.logger, + "entries are dropped due to overlimit of max uncommitted size, uncommitted_size: {}", + self.uncommitted_size() + ); + 
return Err(Error::ProposalDropped); + } + self.bcast_append(); + return Ok(()); + } + MessageType::MsgReadIndex => { + if !self.commit_to_current_term() { + // Reject read only request when this leader has not committed any log entry + // in its term. + return Ok(()); + } + + if self.prs().is_singleton() { + let read_index = self.raft_log.committed; + if let Some(m) = self.handle_ready_read_index(m, read_index) { + self.r.send(m, &mut self.msgs); + } + return Ok(()); + } + + // thinking: use an internally defined context instead of the user given context. + // We can express this in terms of the term and index instead of + // a user-supplied value. + // This would allow multiple reads to piggyback on the same message. + match self.read_only.option { + ReadOnlyOption::Safe => { + let ctx = m.entries[0].data.to_vec(); + self.r + .read_only + .add_request(self.r.raft_log.committed, m, self.r.id); + self.bcast_heartbeat_with_ctx(Some(ctx)); + } + ReadOnlyOption::LeaseBased => { + let read_index = self.raft_log.committed; + if let Some(m) = self.handle_ready_read_index(m, read_index) { + self.r.send(m, &mut self.msgs); + } + } + } + return Ok(()); + } + _ => {} + } + + match m.get_msg_type() { + MessageType::MsgAppendResponse => { + self.handle_append_response(&m); + } + MessageType::MsgHeartbeatResponse => { + self.handle_heartbeat_response(&m); + } + MessageType::MsgSnapStatus => { + self.handle_snapshot_status(&m); + } + MessageType::MsgUnreachable => { + self.handle_unreachable(&m); + } + MessageType::MsgTransferLeader => { + self.handle_transfer_leader(&m); + } + _ => { + if self.prs().get(m.from).is_none() { + debug!( + self.logger, + "no progress available for {}", + m.from; + ); + } + } + } + + Ok(()) + } + + /// Commits the logs using commit info in vote message. 
+ fn maybe_commit_by_vote(&mut self, m: &Message) { + if m.commit == 0 || m.commit_term == 0 { + return; + } + let last_commit = self.raft_log.committed; + if m.commit <= last_commit || self.state == StateRole::Leader { + return; + } + if !self.raft_log.maybe_commit(m.commit, m.commit_term) { + return; + } + + let log = &mut self.r.raft_log; + info!(self.r.logger, "[commit: {}, lastindex: {}, lastterm: {}] fast-forwarded commit to vote request [index: {}, term: {}]", + log.committed, log.last_index(), log.last_term(), m.commit, m.commit_term); + + if self.state != StateRole::Candidate && self.state != StateRole::PreCandidate { + return; + } + + // Scan all unapplied committed entries to find a config change. + // Paginate the scan, to avoid a potentially unlimited memory spike. + let low = last_commit + 1; + let high = self.raft_log.committed + 1; + let ctx = GetEntriesContext(GetEntriesFor::CommitByVote); + if self.has_unapplied_conf_changes(low, high, ctx) { + // The candidate doesn't have to step down in theory, here just for best + // safety as we assume quorum won't change during election. + let term = self.term; + self.become_follower(term, INVALID_ID); + } + } + + fn poll(&mut self, from: u64, t: MessageType, vote: bool) -> VoteResult { + self.prs.record_vote(from, vote); + let (gr, rj, res) = self.prs.tally_votes(); + // Unlike etcd, we log when necessary. 
+ if from != self.id { + info!( + self.logger, + "received votes response"; + "vote" => vote, + "from" => from, + "rejections" => rj, + "approvals" => gr, + "type" => ?t, + "term" => self.term, + ); + } + + match res { + VoteResult::Won => { + if self.state == StateRole::PreCandidate { + self.campaign(CAMPAIGN_ELECTION); + } else { + self.become_leader(); + self.bcast_append(); + } + } + VoteResult::Lost => { + // pb.MsgPreVoteResp contains future term of pre-candidate + // m.term > self.term; reuse self.term + let term = self.term; + self.become_follower(term, INVALID_ID); + } + VoteResult::Pending => (), + } + res + } + + // step_candidate is shared by state Candidate and PreCandidate; the difference is + // whether they respond to MsgRequestVote or MsgRequestPreVote. + fn step_candidate(&mut self, m: Message) -> Result<()> { + match m.get_msg_type() { + MessageType::MsgPropose => { + info!( + self.logger, + "no leader at term {term}; dropping proposal", + term = self.term; + ); + return Err(Error::ProposalDropped); + } + MessageType::MsgAppend => { + debug_assert_eq!(self.term, m.term); + self.become_follower(m.term, m.from); + self.handle_append_entries(&m); + } + MessageType::MsgHeartbeat => { + debug_assert_eq!(self.term, m.term); + self.become_follower(m.term, m.from); + self.handle_heartbeat(m); + } + MessageType::MsgSnapshot => { + debug_assert_eq!(self.term, m.term); + self.become_follower(m.term, m.from); + self.handle_snapshot(m); + } + MessageType::MsgRequestPreVoteResponse | MessageType::MsgRequestVoteResponse => { + // Only handle vote responses corresponding to our candidacy (while in + // state Candidate, we may get stale MsgPreVoteResp messages in this term from + // our pre-candidate state). 
+ if (self.state == StateRole::PreCandidate + && m.get_msg_type() != MessageType::MsgRequestPreVoteResponse) + || (self.state == StateRole::Candidate + && m.get_msg_type() != MessageType::MsgRequestVoteResponse) + { + return Ok(()); + } + + self.poll(m.from, m.get_msg_type(), !m.reject); + self.maybe_commit_by_vote(&m); + } + MessageType::MsgTimeoutNow => debug!( + self.logger, + "{term} ignored MsgTimeoutNow from {from}", + term = self.term, + from = m.from; + "state" => ?self.state, + ), + _ => {} + } + Ok(()) + } + + fn step_follower(&mut self, mut m: Message) -> Result<()> { + match m.get_msg_type() { + MessageType::MsgPropose => { + if self.leader_id == INVALID_ID { + info!( + self.logger, + "no leader at term {term}; dropping proposal", + term = self.term; + ); + return Err(Error::ProposalDropped); + } else if self.disable_proposal_forwarding { + info!( + self.logger, + "{from} not forwarding to leader {to} at term {term}; dropping proposal", + from = self.id, + to = self.leader_id, + term = self.term; + ); + return Err(Error::ProposalDropped); + } + m.to = self.leader_id; + self.r.send(m, &mut self.msgs); + } + MessageType::MsgAppend => { + self.election_elapsed = 0; + self.leader_id = m.from; + self.handle_append_entries(&m); + } + MessageType::MsgHeartbeat => { + self.election_elapsed = 0; + self.leader_id = m.from; + self.handle_heartbeat(m); + } + MessageType::MsgSnapshot => { + self.election_elapsed = 0; + self.leader_id = m.from; + self.handle_snapshot(m); + } + MessageType::MsgTransferLeader => { + if self.leader_id == INVALID_ID { + info!( + self.logger, + "no leader at term {term}; dropping leader transfer msg", + term = self.term; + ); + return Ok(()); + } + m.to = self.leader_id; + self.r.send(m, &mut self.msgs); + } + MessageType::MsgTimeoutNow => { + if self.promotable { + info!( + self.logger, + "[term {term}] received MsgTimeoutNow from {from} and starts an election to \ + get leadership.", + term = self.term, + from = m.from; + ); + // 
Leadership transfers never use pre-vote even if self.pre_vote is true; we + // know we are not recovering from a partition so there is no need for the + // extra round trip. + self.hup(true); + } else { + info!( + self.logger, + "received MsgTimeoutNow from {} but is not promotable", + m.from; + ); + } + } + MessageType::MsgReadIndex => { + if self.leader_id == INVALID_ID { + info!( + self.logger, + "no leader at term {term}; dropping index reading msg", + term = self.term; + ); + return Ok(()); + } + m.to = self.leader_id; + self.r.send(m, &mut self.msgs); + } + MessageType::MsgReadIndexResp => { + if m.entries.len() != 1 { + error!( + self.logger, + "invalid format of MsgReadIndexResp from {}", + m.from; + "entries count" => m.entries.len(), + ); + return Ok(()); + } + let rs = ReadState { + index: m.index, + request_ctx: m.take_entries()[0].take_data().to_vec(), + }; + self.read_states.push(rs); + // `index` and `term` in MsgReadIndexResp is the leader's commit index and its current term, + // the log entry in the leader's commit index will always have the leader's current term, + // because the leader only handle MsgReadIndex after it has committed log entry in its term. + self.raft_log.maybe_commit(m.index, m.term); + } + _ => {} + } + Ok(()) + } + + /// Request a snapshot from a leader. 
+ pub fn request_snapshot(&mut self) -> Result<()> { + if self.state == StateRole::Leader { + info!( + self.logger, + "can not request snapshot on leader; dropping request snapshot"; + ); + } else if self.leader_id == INVALID_ID { + info!( + self.logger, + "no leader; dropping request snapshot"; + "term" => self.term, + ); + } else if self.snap().is_some() { + info!( + self.logger, + "there is a pending snapshot; dropping request snapshot"; + ); + } else if self.pending_request_snapshot != INVALID_INDEX { + info!( + self.logger, + "there is a pending snapshot; dropping request snapshot"; + ); + } else { + let request_index = self.raft_log.last_index(); + let request_index_term = self.raft_log.term(request_index).unwrap(); + if self.term == request_index_term { + self.pending_request_snapshot = request_index; + self.send_request_snapshot(); + return Ok(()); + } + info! { + self.logger, + "mismatched term; dropping request snapshot"; + "term" => self.term, + "last_term" => request_index_term, + }; + } + Err(Error::RequestSnapshotDropped) + } + + // TODO: revoke pub when there is a better way to test. + /// For a given message, append the entries to the log. 
+ pub fn handle_append_entries(&mut self, m: &Message) { + if self.pending_request_snapshot != INVALID_INDEX { + self.send_request_snapshot(); + return; + } + if m.index < self.raft_log.committed { + debug!( + self.logger, + "got message with lower index than committed."; + ); + let mut to_send = Message::default(); + to_send.set_msg_type(MessageType::MsgAppendResponse); + to_send.to = m.from; + to_send.index = self.raft_log.committed; + to_send.commit = self.raft_log.committed; + self.r.send(to_send, &mut self.msgs); + return; + } + + let mut to_send = Message::default(); + to_send.to = m.from; + to_send.set_msg_type(MessageType::MsgAppendResponse); + + if let Some((_, last_idx)) = self + .raft_log + .maybe_append(m.index, m.log_term, m.commit, &m.entries) + { + to_send.set_index(last_idx); + } else { + debug!( + self.logger, + "rejected msgApp [logterm: {msg_log_term}, index: {msg_index}] \ + from {from}", + msg_log_term = m.log_term, + msg_index = m.index, + from = m.from; + "index" => m.index, + "logterm" => ?self.raft_log.term(m.index), + ); + + let hint_index = cmp::min(m.index, self.raft_log.last_index()); + let (hint_index, hint_term) = + self.raft_log.find_conflict_by_term(hint_index, m.log_term); + + if hint_term.is_none() { + fatal!( + self.logger, + "term({index}) must be valid", + index = hint_index + ) + } + + to_send.index = m.index; + to_send.reject = true; + to_send.reject_hint = hint_index; + to_send.log_term = hint_term.unwrap(); + } + to_send.set_commit(self.raft_log.committed); + self.r.send(to_send, &mut self.msgs); + } + + // TODO: revoke pub when there is a better way to test. + /// For a message, commit and send out heartbeat. 
+ pub fn handle_heartbeat(&mut self, mut m: Message) { + self.raft_log.commit_to(m.commit); + if self.pending_request_snapshot != INVALID_INDEX { + self.send_request_snapshot(); + return; + } + let mut to_send = Message::default(); + to_send.set_msg_type(MessageType::MsgHeartbeatResponse); + to_send.to = m.from; + to_send.context = m.take_context(); + to_send.commit = self.raft_log.committed; + self.r.send(to_send, &mut self.msgs); + } + + fn handle_snapshot(&mut self, mut m: Message) { + let metadata = m.get_snapshot().get_metadata(); + let (sindex, sterm) = (metadata.index, metadata.term); + if self.restore(m.take_snapshot()) { + info!( + self.logger, + "[commit: {commit}, term: {term}] restored snapshot [index: {snapshot_index}, term: {snapshot_term}]", + term = self.term, + commit = self.raft_log.committed, + snapshot_index = sindex, + snapshot_term = sterm; + ); + let mut to_send = Message::default(); + to_send.set_msg_type(MessageType::MsgAppendResponse); + to_send.to = m.from; + to_send.index = self.raft_log.last_index(); + self.r.send(to_send, &mut self.msgs); + } else { + info!( + self.logger, + "[commit: {commit}] ignored snapshot [index: {snapshot_index}, term: {snapshot_term}]", + commit = self.raft_log.committed, + snapshot_index = sindex, + snapshot_term = sterm; + ); + let mut to_send = Message::default(); + to_send.set_msg_type(MessageType::MsgAppendResponse); + to_send.to = m.from; + to_send.index = self.raft_log.committed; + self.r.send(to_send, &mut self.msgs); + } + } + + /// Recovers the state machine from a snapshot. It restores the log and the + /// configuration of state machine. + pub fn restore(&mut self, snap: Snapshot) -> bool { + if snap.get_metadata().index < self.raft_log.committed { + return false; + } + if self.state != StateRole::Follower { + // This is defense-in-depth: if the leader somehow ended up applying a + // snapshot, it could move into a new term without moving into a + // follower state. 
This should never fire, but if it did, we'd have + // prevented damage by returning early, so log only a loud warning. + // + // At the time of writing, the instance is guaranteed to be in follower + // state when this method is called. + warn!(self.logger, "non-follower attempted to restore snapshot"; "state" => ?self.state); + self.become_follower(self.term + 1, INVALID_INDEX); + return false; + } + + // More defense-in-depth: throw away snapshot if recipient is not in the + // config. This shouldn't ever happen (at the time of writing) but lots of + // code here and there assumes that r.id is in the progress tracker. + let meta = snap.get_metadata(); + let (snap_index, snap_term) = (meta.index, meta.term); + let cs = meta.get_conf_state(); + if cs + .get_voters() + .iter() + .chain(cs.get_learners()) + .chain(cs.get_voters_outgoing()) + // `learners_next` doesn't need to be checked. According to the rules, if a peer in + // `learners_next`, it has to be in `voters_outgoing`. + .all(|id| *id != self.id) + { + warn!(self.logger, "attempted to restore snapshot but it is not in the ConfState"; "conf_state" => ?cs); + return false; + } + + // Now go ahead and actually restore. + + if self.pending_request_snapshot == INVALID_INDEX + && self.raft_log.match_term(meta.index, meta.term) + { + info!( + self.logger, + "fast-forwarded commit to snapshot"; + "commit" => self.raft_log.committed, + "last_index" => self.raft_log.last_index(), + "last_term" => self.raft_log.last_term(), + "snapshot_index" => snap_index, + "snapshot_term" => snap_term + ); + self.raft_log.commit_to(meta.index); + return false; + } + + self.raft_log.restore(snap); + let cs = self + .r + .raft_log + .pending_snapshot() + .unwrap() + .get_metadata() + .get_conf_state(); + + self.prs.clear(); + let last_index = self.raft_log.last_index(); + if let Err(e) = confchange::restore(&mut self.prs, last_index, cs) { + // This should never happen. 
Either there's a bug in our config change + // handling or the client corrupted the conf change. + fatal!(self.logger, "unable to restore config {:?}: {}", cs, e); + } + let new_cs = self.post_conf_change(); + let cs = self + .r + .raft_log + .pending_snapshot() + .unwrap() + .get_metadata() + .get_conf_state(); + if !raft_proto::conf_state_eq(cs, &new_cs) { + fatal!(self.logger, "invalid restore: {:?} != {:?}", cs, new_cs); + } + + // TODO: this is untested and likely unneeded + let pr = self.prs.get_mut(self.id).unwrap(); + pr.maybe_update(pr.next_idx - 1); + + self.pending_request_snapshot = INVALID_INDEX; + + info!( + self.logger, + "restored snapshot"; + "commit" => self.raft_log.committed, + "last_index" => self.raft_log.last_index(), + "last_term" => self.raft_log.last_term(), + "snapshot_index" => snap_index, + "snapshot_term" => snap_term, + ); + + true + } + + /// Updates the in-memory state and, when necessary, carries out additional actions + /// such as reacting to the removal of nodes or changed quorum requirements. + pub fn post_conf_change(&mut self) -> ConfState { + info!(self.logger, "switched to configuration"; "config" => ?self.prs.conf()); + // TODO: instead of creating a conf state, validating conf state inside + // progress tracker is better. + let cs = self.prs.conf().to_conf_state(); + let is_voter = self.prs.conf().voters.contains(self.id); + self.promotable = is_voter; + if !is_voter && self.state == StateRole::Leader { + // This node is leader and was removed or demoted. We prevent demotions + // at the time writing but hypothetically we handle them the same way as + // removing the leader: stepping down into the next Term. + // + // TODO(tbg): step down (for sanity) and ask follower with largest Match + // to TimeoutNow (to avoid interruption). This might still drop some + // proposals but it's better than nothing. + // + // TODO(tbg): test this branch. It is untested at the time of writing. 
+ return cs; + } + + // The remaining steps only make sense if this node is the leader and there + // are other nodes. + if self.state != StateRole::Leader || cs.voters.is_empty() { + return cs; + } + + if self.maybe_commit() { + // If the configuration change means that more entries are committed now, + // broadcast/append to everyone in the updated config. + self.bcast_append(); + } else { + // Otherwise, still probe the newly added replicas; there's no reason to + // let them wait out a heartbeat interval (or the next incoming proposal). + let self_id = self.id; + let core = &mut self.r; + let msgs = &mut self.msgs; + self.prs + .iter_mut() + .filter(|&(id, _)| *id != self_id) + .for_each(|(id, pr)| { + core.maybe_send_append(*id, pr, false, msgs); + }); + } + + // The quorum size is now smaller, consider to response some read requests. + // If there is only one peer, all pending read requests must be responded. + if let Some(ctx) = self.read_only.last_pending_request_ctx() { + let prs = &self.prs; + if self + .r + .read_only + .recv_ack(self.id, &ctx) + .is_some_and(|acks| prs.has_quorum(acks)) + { + for rs in self.r.read_only.advance(&ctx, &self.r.logger) { + if let Some(m) = self.handle_ready_read_index(rs.req, rs.index) { + self.r.send(m, &mut self.msgs); + } + } + } + } + + if self + .lead_transferee + .is_some_and(|e| !self.prs.conf().voters.contains(e)) + { + self.abort_leader_transfer(); + } + cs + } + + /// Check if there is any pending confchange. + /// + /// This method can be false positive. + #[inline] + pub fn has_pending_conf(&self) -> bool { + self.pending_conf_index > self.raft_log.applied + } + + /// Specifies if the commit should be broadcast. + pub fn should_bcast_commit(&self) -> bool { + !self.skip_bcast_commit || self.has_pending_conf() + } + + /// Indicates whether state machine can be promoted to leader, + /// which is true when it's a voter and its own id is in progress list. 
+ pub fn promotable(&self) -> bool { + self.promotable + } + + #[doc(hidden)] + pub fn apply_conf_change(&mut self, cc: &ConfChangeV2) -> Result { + let mut changer = Changer::new(&self.prs); + let (cfg, changes) = if cc.leave_joint() { + changer.leave_joint()? + } else if let Some(auto_leave) = cc.enter_joint() { + changer.enter_joint(auto_leave, &cc.changes)? + } else { + changer.simple(&cc.changes)? + }; + self.prs + .apply_conf(cfg, changes, self.raft_log.last_index()); + Ok(self.post_conf_change()) + } + + /// Returns a read-only reference to the progress set. + pub fn prs(&self) -> &ProgressTracker { + &self.prs + } + + /// Returns a mutable reference to the progress set. + pub fn mut_prs(&mut self) -> &mut ProgressTracker { + &mut self.prs + } + + // TODO: revoke pub when there is a better way to test. + /// For a given hardstate, load the state into self. + pub fn load_state(&mut self, hs: &HardState) { + if hs.commit < self.raft_log.committed || hs.commit > self.raft_log.last_index() { + fatal!( + self.logger, + "hs.commit {} is out of range [{}, {}]", + hs.commit, + self.raft_log.committed, + self.raft_log.last_index() + ) + } + self.raft_log.committed = hs.commit; + self.term = hs.term; + self.vote = hs.vote; + } + + /// `pass_election_timeout` returns true iff `election_elapsed` is greater + /// than or equal to the randomized election timeout in + /// [`election_timeout`, 2 * `election_timeout` - 1]. + pub fn pass_election_timeout(&self) -> bool { + self.election_elapsed >= self.randomized_election_timeout + } + + /// Regenerates and stores the election timeout. 
+ pub fn reset_randomized_election_timeout(&mut self) { + let prev_timeout = self.randomized_election_timeout; + let timeout = + rand::thread_rng().gen_range(self.min_election_timeout..self.max_election_timeout); + debug!( + self.logger, + "reset election timeout {prev_timeout} -> {timeout} at {election_elapsed}", + prev_timeout = prev_timeout, + timeout = timeout, + election_elapsed = self.election_elapsed; + ); + self.randomized_election_timeout = timeout; + } + + // check_quorum_active returns true if the quorum is active from + // the view of the local raft state machine. Otherwise, it returns + // false. + // check_quorum_active also resets all recent_active to false. + // check_quorum_active can only called by leader. + fn check_quorum_active(&mut self) -> bool { + let self_id = self.id; + self.mut_prs().quorum_recently_active(self_id) + } + + /// Issues a message to timeout immediately. + pub fn send_timeout_now(&mut self, to: u64) { + let msg = new_message(to, MessageType::MsgTimeoutNow, None); + self.r.send(msg, &mut self.msgs); + } + + /// Stops the transfer of a leader. 
+ pub fn abort_leader_transfer(&mut self) { + self.lead_transferee = None; + } + + fn send_request_snapshot(&mut self) { + let mut m = Message::default(); + m.set_msg_type(MessageType::MsgAppendResponse); + m.index = self.raft_log.committed; + m.reject = true; + m.reject_hint = self.raft_log.last_index(); + m.to = self.leader_id; + m.request_snapshot = self.pending_request_snapshot; + m.log_term = self.raft_log.term(m.reject_hint).unwrap(); + self.r.send(m, &mut self.msgs); + } + + fn handle_ready_read_index(&mut self, mut req: Message, index: u64) -> Option { + if req.from == INVALID_ID || req.from == self.id { + let rs = ReadState { + index, + request_ctx: req.take_entries()[0].take_data().to_vec(), + }; + self.read_states.push(rs); + return None; + } + let mut to_send = Message::default(); + to_send.set_msg_type(MessageType::MsgReadIndexResp); + to_send.to = req.from; + to_send.index = index; + to_send.set_entries(req.take_entries()); + Some(to_send) + } + + /// Reduce size of 'ents' from uncommitted size. + pub fn reduce_uncommitted_size(&mut self, ents: &[Entry]) { + // fast path for non-leader endpoint + if self.state != StateRole::Leader { + return; + } + + if !self.uncommitted_state.maybe_reduce_uncommitted_size(ents) { + // this will make self.uncommitted size not accurate. + // but in most situation, this behaviour will not cause big problem + warn!( + self.r.logger, + "try to reduce uncommitted size less than 0, first index of pending ents is {}", + ents[0].get_index() + ); + } + } + + /// Increase size of 'ents' to uncommitted size. Return true when size limit + /// is satisfied. Otherwise return false and uncommitted size remains unchanged. + /// For raft with no limit(or non-leader raft), it always return true. + #[inline] + pub fn maybe_increase_uncommitted_size(&mut self, ents: &[Entry]) -> bool { + self.uncommitted_state.maybe_increase_uncommitted_size(ents) + } + + /// Return current uncommitted size recorded by uncommitted_state. 
+ #[inline] + pub fn uncommitted_size(&self) -> usize { + self.uncommitted_state.uncommitted_size + } + + /// A Raft leader allocates a vector with capacity `max_inflight_msgs` for every peer. + /// It takes a lot of memory if there are too many Raft groups. `maybe_free_inflight_buffers` + /// is used to free memory if necessary. + pub fn maybe_free_inflight_buffers(&mut self) { + for (_, pr) in self.mut_prs().iter_mut() { + pr.ins.maybe_free_buffer(); + } + } + + /// To adjust `max_inflight_msgs` for the specified peer. + /// Set to `0` will disable the progress. + pub fn adjust_max_inflight_msgs(&mut self, target: u64, cap: usize) { + if let Some(pr) = self.mut_prs().get_mut(target) { + pr.ins.set_cap(cap); + } + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/raft_log.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/raft_log.rs new file mode 100644 index 0000000..b54e3dd --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/raft_log.rs @@ -0,0 +1,1918 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::cmp; +use std::fmt::{Display, Formatter}; + +use slog::warn; +use slog::Logger; +use slog::{debug, info, trace}; + +use crate::config::Config; +use crate::eraftpb::{Entry, Snapshot}; +use crate::errors::{Error, Result, StorageError}; +use crate::log_unstable::Unstable; +use crate::storage::{GetEntriesContext, GetEntriesFor, Storage}; +use crate::util; +pub use crate::util::NO_LIMIT; + +/// Raft log implementation +pub struct RaftLog { + /// Contains all stable entries since the last snapshot. + pub store: T, + + /// Contains all unstable entries and snapshot. + /// they will be saved into storage. + pub unstable: Unstable, + + /// The highest log position that is known to be in stable storage + /// on a quorum of nodes. + /// + /// Invariant: applied <= committed + /// NOTE: this invariant can be break after restart if max_apply_unpersisted_log_limit > 0, + /// but once the committed catches up with applied, it should never fall behind again. + pub committed: u64, + + /// The highest log position that is known to be persisted in stable + /// storage. It's used for limiting the upper bound of committed and + /// persisted entries. + /// + /// Invariant: persisted < unstable.offset + pub persisted: u64, + + /// The highest log position that the application has been instructed + /// to apply to its state machine. + /// + /// Invariant: applied <= committed. + /// NOTE: + /// - this invariant can be break after restart if max_apply_unpersisted_log_limit > 0, + /// but once the committed catches up with applied, it should never fall behind again. + /// - if `max_apply_unpersisted_log_limit` is 0, applied < persisted is also ensured + /// (if it is changed from >0 to 0, it is ensured after persisted catching up with applied). + pub applied: u64, + + /// The maximum log gap between persisted and applied. 
+ /// + /// NOTE: We force reset `max_apply_unpersisted_log_limit` value to 0 when + /// raft role demote from leader currently to ensure only allow applying + /// not persisted raft logs on leader. + pub max_apply_unpersisted_log_limit: u64, +} + +impl Display for RaftLog { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "committed={}, persisted={}, applied={}, unstable.offset={}, unstable.entries.len()={}", + self.committed, + self.persisted, + self.applied, + self.unstable.offset, + self.unstable.entries.len() + ) + } +} + +impl RaftLog { + /// Creates a new raft log with a given storage and tag. + pub fn new(store: T, logger: Logger, cfg: &Config) -> RaftLog { + let first_index = store.first_index().unwrap(); + let last_index = store.last_index().unwrap(); + + // Initialize committed and applied pointers to the time of the last compaction. + RaftLog { + store, + committed: first_index - 1, + persisted: last_index, + applied: first_index - 1, + unstable: Unstable::new(last_index + 1, logger), + max_apply_unpersisted_log_limit: cfg.max_apply_unpersisted_log_limit, + } + } + + /// Grabs the term from the last entry. + /// + /// # Panics + /// + /// Panics if there are entries but the last term has been discarded. + pub fn last_term(&self) -> u64 { + match self.term(self.last_index()) { + Ok(t) => t, + Err(e) => fatal!( + self.unstable.logger, + "unexpected error when getting the last term: {:?}", + e + ), + } + } + + /// Grab a read-only reference to the underlying storage. + #[inline] + pub fn store(&self) -> &T { + &self.store + } + + /// Grab a mutable reference to the underlying storage. + #[inline] + pub fn mut_store(&mut self) -> &mut T { + &mut self.store + } + + /// For a given index, finds the term associated with it. 
+ pub fn term(&self, idx: u64) -> Result { + // the valid term range is [index of dummy entry, last index] + let dummy_idx = self.first_index() - 1; + if idx < dummy_idx || idx > self.last_index() { + return Ok(0u64); + } + + match self.unstable.maybe_term(idx) { + Some(term) => Ok(term), + _ => self.store.term(idx).map_err(|e| { + match e { + Error::Store(StorageError::Compacted) + | Error::Store(StorageError::Unavailable) => {} + _ => fatal!(self.unstable.logger, "unexpected error: {:?}", e), + } + e + }), + } + } + + /// Returns th first index in the store that is available via entries + /// + /// # Panics + /// + /// Panics if the store doesn't have a first index. + pub fn first_index(&self) -> u64 { + match self.unstable.maybe_first_index() { + Some(idx) => idx, + None => self.store.first_index().unwrap(), + } + } + + /// Returns the last index in the store that is available via entries. + /// + /// # Panics + /// + /// Panics if the store doesn't have a last index. + pub fn last_index(&self) -> u64 { + match self.unstable.maybe_last_index() { + Some(idx) => idx, + None => self.store.last_index().unwrap(), + } + } + + /// Finds the index of the conflict. + /// + /// It returns the first index of conflicting entries between the existing + /// entries and the given entries, if there are any. + /// + /// If there are no conflicting entries, and the existing entries contain + /// all the given entries, zero will be returned. + /// + /// If there are no conflicting entries, but the given entries contains new + /// entries, the index of the first new entry will be returned. + /// + /// An entry is considered to be conflicting if it has the same index but + /// a different term. + /// + /// The first entry MUST have an index equal to the argument 'from'. + /// The index of the given entries MUST be continuously increasing. 
+ pub fn find_conflict(&self, ents: &[Entry]) -> u64 { + for e in ents { + if !self.match_term(e.index, e.term) { + if e.index <= self.last_index() { + info!( + self.unstable.logger, + "found conflict at index {index}", + index = e.index; + "existing term" => self.term(e.index).unwrap_or(0), + "conflicting term" => e.term, + ); + } + return e.index; + } + } + 0 + } + + /// find_conflict_by_term takes an (`index`, `term`) pair (indicating a conflicting log + /// entry on a leader/follower during an append) and finds the largest index in + /// log with log.term <= `term` and log.index <= `index`. If no such index exists + /// in the log, the log's first index is returned. + /// + /// The index provided MUST be equal to or less than self.last_index(). Invalid + /// inputs log a warning and the input index is returned. + /// + /// Return (index, term) + pub fn find_conflict_by_term(&self, index: u64, term: u64) -> (u64, Option) { + let mut conflict_index = index; + + let last_index = self.last_index(); + if index > last_index { + warn!( + self.unstable.logger, + "index({}) is out of range [0, last_index({})] in find_conflict_by_term", + index, + last_index, + ); + return (index, None); + } + + loop { + match self.term(conflict_index) { + Ok(t) => { + if t > term { + conflict_index -= 1 + } else { + return (conflict_index, Some(t)); + } + } + Err(_) => return (conflict_index, None), + } + } + } + + /// Answers the question: Does this index belong to this term? + pub fn match_term(&self, idx: u64, term: u64) -> bool { + self.term(idx).map(|t| t == term).unwrap_or(false) + } + + // TODO: revoke pub when there is a better way to append without proposals. + /// Returns None if the entries cannot be appended. Otherwise, + /// it returns Some((conflict_index, last_index)). + /// + /// # Panics + /// + /// Panics if it finds a conflicting index less than committed index. 
+ pub fn maybe_append( + &mut self, + idx: u64, + term: u64, + committed: u64, + ents: &[Entry], + ) -> Option<(u64, u64)> { + if self.match_term(idx, term) { + let conflict_idx = self.find_conflict(ents); + if conflict_idx == 0 { + } else if conflict_idx <= self.committed { + fatal!( + self.unstable.logger, + "entry {} conflict with committed entry {}", + conflict_idx, + self.committed + ) + } else { + let start = (conflict_idx - (idx + 1)) as usize; + self.append(&ents[start..]); + // persisted should be decreased because entries are changed + if self.persisted > conflict_idx - 1 { + self.persisted = conflict_idx - 1; + } + } + let last_new_index = idx + ents.len() as u64; + self.commit_to(cmp::min(committed, last_new_index)); + return Some((conflict_idx, last_new_index)); + } + None + } + + /// Sets the last committed value to the passed in value. + /// + /// # Panics + /// + /// Panics if the index goes past the last index. + pub fn commit_to(&mut self, to_commit: u64) { + // never decrease commit + if self.committed >= to_commit { + return; + } + if self.last_index() < to_commit { + fatal!( + self.unstable.logger, + "to_commit {} is out of range [last_index {}]", + to_commit, + self.last_index() + ) + } + self.committed = to_commit; + } + + /// Advance the applied index to the passed in value. + /// + /// # Panics + /// + /// Panics if the value passed in is not new or known. + #[deprecated = "Call raft::commit_apply(idx) instead. Joint Consensus requires an on-apply hook to + finalize a configuration change. This will become internal API in future versions."] + pub fn applied_to(&mut self, idx: u64) { + if idx == 0 { + return; + } + // NOTE: here we must use `commmitted` instead of `min(committed, perssited + max_apply_unpersisted_log_limit)` + // as the uppper bound because the `max_apply_unpersisted_log_limit` can be adjusted dynamically. 
+ if idx > self.committed || idx < self.applied { + fatal!( + self.unstable.logger, + "applied({}) is out of range [prev_applied({}), committed({})]", + idx, + self.applied, + self.committed, + ) + } + self.applied_to_unchecked(idx); + } + + #[inline] + pub(crate) fn applied_to_unchecked(&mut self, idx: u64) { + self.applied = idx; + } + + /// Returns the last applied index. + pub fn applied(&self) -> u64 { + self.applied + } + + /// Clears the unstable entries and moves the stable offset up to the + /// last index, if there is any. + pub fn stable_entries(&mut self, index: u64, term: u64) { + self.unstable.stable_entries(index, term); + } + + /// Clears the unstable snapshot. + pub fn stable_snap(&mut self, index: u64) { + self.unstable.stable_snap(index); + } + + /// Returns a reference to the unstable log. + pub fn unstable(&self) -> &Unstable { + &self.unstable + } + + /// Returns slice of entries that are not persisted. + pub fn unstable_entries(&self) -> &[Entry] { + &self.unstable.entries + } + + /// Returns the snapshot that are not persisted. + pub fn unstable_snapshot(&self) -> &Option { + &self.unstable.snapshot + } + + /// Appends a set of entries to the unstable list. + pub fn append(&mut self, ents: &[Entry]) -> u64 { + trace!( + self.unstable.logger, + "Entries being appended to unstable list"; + "ents" => ?ents, + ); + if ents.is_empty() { + return self.last_index(); + } + + let after = ents[0].index - 1; + if after < self.committed { + fatal!( + self.unstable.logger, + "after {} is out of range [committed {}]", + after, + self.committed + ) + } + self.unstable.truncate_and_append(ents); + self.last_index() + } + + /// Returns entries starting from a particular index and not exceeding a bytesize. 
+ pub fn entries( + &self, + idx: u64, + max_size: impl Into>, + context: GetEntriesContext, + ) -> Result> { + let max_size = max_size.into(); + let last = self.last_index(); + if idx > last { + return Ok(Vec::new()); + } + self.slice(idx, last + 1, max_size, context) + } + + /// Returns all the entries. Only used by tests. + #[doc(hidden)] + pub fn all_entries(&self) -> Vec { + let first_index = self.first_index(); + match self.entries(first_index, None, GetEntriesContext::empty(false)) { + Err(e) => { + // try again if there was a racing compaction + if e == Error::Store(StorageError::Compacted) { + return self.all_entries(); + } + fatal!(self.unstable.logger, "unexpected error: {:?}", e); + } + Ok(ents) => ents, + } + } + + /// Determines if the given (lastIndex,term) log is more up-to-date + /// by comparing the index and term of the last entry in the existing logs. + /// If the logs have last entry with different terms, then the log with the + /// later term is more up-to-date. If the logs end with the same term, then + /// whichever log has the larger last_index is more up-to-date. If the logs are + /// the same, the given log is up-to-date. + pub fn is_up_to_date(&self, last_index: u64, term: u64) -> bool { + term > self.last_term() || (term == self.last_term() && last_index >= self.last_index()) + } + + /// Returns committed and persisted entries since max(`since_idx` + 1, first_index). 
+ pub fn next_entries_since(&self, since_idx: u64, max_size: Option) -> Option> { + let offset = cmp::max(since_idx + 1, self.first_index()); + let high = self.applied_index_upper_bound() + 1; + if high > offset { + match self.slice( + offset, + high, + max_size, + GetEntriesContext(GetEntriesFor::GenReady), + ) { + Ok(vec) => return Some(vec), + Err(e) => fatal!(self.unstable.logger, "{}", e), + } + } + None + } + + #[inline] + fn applied_index_upper_bound(&self) -> u64 { + std::cmp::min( + self.committed, + self.persisted + self.max_apply_unpersisted_log_limit, + ) + } + + /// Returns all the available entries for execution. + /// If applied is smaller than the index of snapshot, it returns all committed + /// entries after the index of snapshot. + pub fn next_entries(&self, max_size: Option) -> Option> { + self.next_entries_since(self.applied, max_size) + } + + /// Returns whether there are committed and persisted entries since + /// max(`since_idx` + 1, first_index). + pub fn has_next_entries_since(&self, since_idx: u64) -> bool { + let offset = cmp::max(since_idx + 1, self.first_index()); + let high = self.applied_index_upper_bound() + 1; + high > offset + } + + /// Returns whether there are new entries. 
+ pub fn has_next_entries(&self) -> bool { + self.has_next_entries_since(self.applied) + } + + /// Returns the current snapshot + pub fn snapshot(&self, request_index: u64, to: u64) -> Result { + if let Some(snap) = self.unstable.snapshot.as_ref() { + if snap.get_metadata().index >= request_index { + return Ok(snap.clone()); + } + } + self.store.snapshot(request_index, to) + } + + pub(crate) fn pending_snapshot(&self) -> Option<&Snapshot> { + self.unstable.snapshot.as_ref() + } + + fn must_check_outofbounds(&self, low: u64, high: u64) -> Option { + if low > high { + fatal!(self.unstable.logger, "invalid slice {} > {}", low, high) + } + let first_index = self.first_index(); + if low < first_index { + return Some(Error::Store(StorageError::Compacted)); + } + + let length = self.last_index() + 1 - first_index; + if low < first_index || high > first_index + length { + fatal!( + self.unstable.logger, + "slice[{},{}] out of bound[{},{}]", + low, + high, + first_index, + self.last_index() + ) + } + None + } + + /// Attempts to commit the index and term and returns whether it did. + pub fn maybe_commit(&mut self, max_index: u64, term: u64) -> bool { + if max_index > self.committed && self.term(max_index).is_ok_and(|t| t == term) { + debug!( + self.unstable.logger, + "committing index {index}", + index = max_index + ); + self.commit_to(max_index); + true + } else { + false + } + } + + /// Attempts to persist the index and term and returns whether it did. + pub fn maybe_persist(&mut self, index: u64, term: u64) -> bool { + // It's possible that the term check can be passed but index is greater + // than or equal to the first_update_index in some corner cases. + // For example, there are 5 nodes, A B C D E. + // 1. A is leader and it proposes some raft logs but only B receives these logs. + // 2. B gets the Ready and the logs are persisted asynchronously. + // 2. A crashes and C becomes leader after getting the vote from D and E. + // 3. 
C proposes some raft logs and B receives these logs. + // 4. C crashes and A restarts and becomes leader again after getting the vote from D and E. + // 5. B receives the logs from A which are the same to the ones from step 1. + // 6. The logs from Ready has been persisted on B so it calls on_persist_ready and comes to here. + // + // We solve this problem by not forwarding the persisted index. It's pretty intuitive + // because the first_update_index means there are snapshot or some entries whose indexes + // are greater than or equal to the first_update_index have not been persisted yet. + let first_update_index = match &self.unstable.snapshot { + Some(s) => s.get_metadata().index, + None => self.unstable.offset, + }; + if index > self.persisted + && index < first_update_index + && self.store.term(index).is_ok_and(|t| t == term) + { + debug!(self.unstable.logger, "persisted index {}", index); + self.persisted = index; + true + } else { + false + } + } + + /// Attempts to persist the snapshot and returns whether it did. + pub fn maybe_persist_snap(&mut self, index: u64) -> bool { + if index > self.persisted { + // commit index should not be less than snapshot's index + if index > self.committed { + fatal!( + self.unstable.logger, + "snapshot's index {} > committed {}", + index, + self.committed, + ) + } + // All of the indexes of later entries must be greater than snapshot's index + if index >= self.unstable.offset { + fatal!( + self.unstable.logger, + "snapshot's index {} >= offset {}", + index, + self.unstable.offset, + ); + } + + debug!(self.unstable.logger, "snapshot's persisted index {}", index); + self.persisted = index; + true + } else { + false + } + } + + // scan visits all log entries in the [lo, hi) range, returning them via the + // given callback. The callback can be invoked multiple times, with consecutive + // sub-ranges of the requested range. Returns up to page_size bytes worth of + // entries at a time. 
May return more if a single entry size exceeds the limit. + // + // The entries in [lo, hi) must exist, otherwise scan() eventually returns an + // error. + // + // If the callback returns false, scan terminates. + pub(crate) fn scan( + &self, + mut lo: u64, + hi: u64, + page_size: u64, + context: GetEntriesContext, + mut v: F, + ) -> Result<()> + where + F: FnMut(Vec) -> bool, + { + while lo < hi { + let ents = self.slice(lo, hi, page_size, context)?; + if ents.is_empty() { + return Err(Error::Store(StorageError::Other( + format!("got 0 entries in [{}, {})", lo, hi).into(), + ))); + } + lo += ents.len() as u64; + if !v(ents) { + return Ok(()); + } + } + Ok(()) + } + + /// Grabs a slice of entries from the raft. Unlike a rust slice pointer, these are + /// returned by value. The result is truncated to the max_size in bytes. + pub fn slice( + &self, + low: u64, + high: u64, + max_size: impl Into>, + context: GetEntriesContext, + ) -> Result> { + let max_size = max_size.into(); + if let Some(err) = self.must_check_outofbounds(low, high) { + return Err(err); + } + + let mut ents = vec![]; + if low == high { + return Ok(ents); + } + + if low < self.unstable.offset { + let unstable_high = cmp::min(high, self.unstable.offset); + match self.store.entries(low, unstable_high, max_size, context) { + Err(e) => match e { + Error::Store(StorageError::Compacted) + | Error::Store(StorageError::LogTemporarilyUnavailable) => return Err(e), + Error::Store(StorageError::Unavailable) => fatal!( + self.unstable.logger, + "entries[{}:{}] is unavailable from storage", + low, + unstable_high, + ), + _ => fatal!(self.unstable.logger, "unexpected error: {:?}", e), + }, + Ok(entries) => { + ents = entries; + if (ents.len() as u64) < unstable_high - low { + return Ok(ents); + } + } + } + } + + if high > self.unstable.offset { + let offset = self.unstable.offset; + let unstable = self.unstable.slice(cmp::max(low, offset), high); + ents.extend_from_slice(unstable); + } + util::limit_size(&mut 
ents, max_size); + Ok(ents) + } + + /// Restores the current log from a snapshot. + pub fn restore(&mut self, snapshot: Snapshot) { + info!( + self.unstable.logger, + "log [{log}] starts to restore snapshot [index: {snapshot_index}, term: {snapshot_term}]", + log = self.to_string(), + snapshot_index = snapshot.get_metadata().index, + snapshot_term = snapshot.get_metadata().term, + ); + let index = snapshot.get_metadata().index; + assert!(index >= self.committed, "{} < {}", index, self.committed); + // If `persisted` is greater than `committed`, reset it to `committed`. + // It's because only the persisted entries whose index are less than `committed` can be + // considered the same as the data from snapshot. + // Although there may be some persisted entries with greater index are also committed, + // we can not judge them nor do we care about them because these entries can not be applied + // thus the invariant which is `applied` <= min(`persisted`, `committed`) is satisfied. + if self.persisted > self.committed { + self.persisted = self.committed; + } + self.committed = index; + self.unstable.restore(snapshot); + } + + /// Returns the committed index and its term. 
    pub fn commit_info(&self) -> (u64, u64) {
        match self.term(self.committed) {
            Ok(t) => (self.committed, t),
            Err(e) => fatal!(
                self.unstable.logger,
                "last committed entry at {} is missing: {:?}",
                self.committed,
                e
            ),
        }
    }
}

#[cfg(test)]
mod test {
    use std::{
        cmp,
        panic::{self, AssertUnwindSafe},
    };

    use protobuf::Message as PbMessage;

    use crate::config::Config;
    use crate::default_logger;
    use crate::eraftpb;
    use crate::errors::{Error, StorageError};
    use crate::raft_log::{self, RaftLog};
    use crate::storage::{GetEntriesContext, MemStorage};
    use crate::NO_LIMIT;

    // Builds a bare entry with the given index and term.
    fn new_entry(index: u64, term: u64) -> eraftpb::Entry {
        let mut e = eraftpb::Entry::default();
        e.term = term;
        e.index = index;
        e
    }

    // Builds a snapshot whose metadata carries the given index and term.
    fn new_snapshot(meta_index: u64, meta_term: u64) -> eraftpb::Snapshot {
        let mut meta = eraftpb::SnapshotMetadata::default();
        meta.index = meta_index;
        meta.term = meta_term;
        let mut snapshot = eraftpb::Snapshot::default();
        snapshot.set_metadata(meta);
        snapshot
    }

    #[test]
    fn test_find_conflict() {
        let l = default_logger();
        let previous_ents = vec![new_entry(1, 1), new_entry(2, 2), new_entry(3, 3)];
        // (candidate entries, expected first conflicting index; 0 = none)
        let tests = vec![
            // no conflict, empty ent
            (vec![], 0),
            (vec![], 0),
            // no conflict
            (vec![new_entry(1, 1), new_entry(2, 2), new_entry(3, 3)], 0),
            (vec![new_entry(2, 2), new_entry(3, 3)], 0),
            (vec![new_entry(3, 3)], 0),
            // no conflict, but has new entries
            (
                vec![
                    new_entry(1, 1),
                    new_entry(2, 2),
                    new_entry(3, 3),
                    new_entry(4, 4),
                    new_entry(5, 4),
                ],
                4,
            ),
            (
                vec![
                    new_entry(2, 2),
                    new_entry(3, 3),
                    new_entry(4, 4),
                    new_entry(5, 4),
                ],
                4,
            ),
            (vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 4)], 4),
            (vec![new_entry(4, 4), new_entry(5, 4)], 4),
            // conflicts with existing entries
            (vec![new_entry(1, 4), new_entry(2, 4)], 1),
            (vec![new_entry(2, 1), new_entry(3, 4), new_entry(4, 4)], 2),
            (
                vec![
                    new_entry(3, 1),
                    new_entry(4, 2),
                    new_entry(5, 4),
                    new_entry(6, 4),
                ],
                3,
            ),
        ];
        for (i, &(ref ents, wconflict)) in tests.iter().enumerate() {
            let store = MemStorage::new();
            let mut raft_log = RaftLog::new(store, l.clone(), &Config::default());
            raft_log.append(&previous_ents);
            let gconflict = raft_log.find_conflict(ents);
            if gconflict != wconflict {
                panic!("#{}: conflict = {}, want {}", i, gconflict, wconflict)
            }
        }
    }

    #[test]
    fn test_is_up_to_date() {
        let previous_ents = vec![new_entry(1, 1), new_entry(2, 2), new_entry(3, 3)];
        let store = MemStorage::new();
        let mut raft_log = RaftLog::new(store, default_logger(), &Config::default());
        raft_log.append(&previous_ents);
        // (candidate last_index, candidate term, expected up-to-date?)
        let tests = vec![
            // greater term, ignore lastIndex
            (raft_log.last_index() - 1, 4, true),
            (raft_log.last_index(), 4, true),
            (raft_log.last_index() + 1, 4, true),
            // smaller term, ignore lastIndex
            (raft_log.last_index() - 1, 2, false),
            (raft_log.last_index(), 2, false),
            (raft_log.last_index() + 1, 2, false),
            // equal term, larger lastIndex wins
            (raft_log.last_index() - 1, 3, false),
            (raft_log.last_index(), 3, true),
            (raft_log.last_index() + 1, 3, true),
        ];
        for (i, &(last_index, term, up_to_date)) in tests.iter().enumerate() {
            let g_up_to_date = raft_log.is_up_to_date(last_index, term);
            if g_up_to_date != up_to_date {
                panic!("#{}: uptodate = {}, want {}", i, g_up_to_date, up_to_date);
            }
        }
    }

    #[test]
    fn test_append() {
        let l = default_logger();
        let previous_ents = vec![new_entry(1, 1), new_entry(2, 2)];
        // (entries to append, want last index, want log contents, want unstable offset)
        let tests = vec![
            (vec![], 2, vec![new_entry(1, 1), new_entry(2, 2)], 3),
            (
                vec![new_entry(3, 2)],
                3,
                vec![new_entry(1, 1), new_entry(2, 2), new_entry(3, 2)],
                3,
            ),
            // conflicts with index 1
            (vec![new_entry(1, 2)], 1, vec![new_entry(1, 2)], 1),
            // conflicts with index 2
            (
                vec![new_entry(2, 3), new_entry(3, 3)],
                3,
                vec![new_entry(1, 1), new_entry(2, 3), new_entry(3, 3)],
                2,
            ),
        ];
        for (i, &(ref ents, windex, ref wents, wunstable)) in tests.iter().enumerate() {
            let store = MemStorage::new();
            store.wl().append(&previous_ents).expect("append failed");
            let mut raft_log = RaftLog::new(store, l.clone(), &Config::default());
            let index = raft_log.append(ents);
            if index != windex {
                panic!("#{}: last_index = {}, want {}", i, index, windex);
            }
            match raft_log.entries(1, None, GetEntriesContext::empty(false)) {
                Err(e) => panic!("#{}: unexpected error {}", i, e),
                Ok(ref g) if g != wents => panic!("#{}: logEnts = {:?}, want {:?}", i, &g, &wents),
                _ => {
                    let goff = raft_log.unstable.offset;
                    if goff != wunstable {
                        panic!("#{}: unstable = {}, want {}", i, goff, wunstable);
                    }
                }
            }
        }
    }

    #[test]
    fn test_compaction_side_effects() {
        let last_index = 1000u64;
        let unstable_index = 750u64;
        let last_term = last_index;
        let storage = MemStorage::new();
        // Entries 1..=750 are stable in storage; 751..=1000 are unstable.
        for i in 1..=unstable_index {
            storage
                .wl()
                .append(&[new_entry(i, i)])
                .expect("append failed");
        }
        let mut raft_log = RaftLog::new(storage, default_logger(), &Config::default());
        for i in unstable_index..last_index {
            raft_log.append(&[new_entry(i + 1, i + 1)]);
        }
        assert!(
            raft_log.maybe_commit(last_index, last_term),
            "maybe_commit return false"
        );

        let offset = 500u64;
        raft_log.store.wl().compact(offset).expect("compact failed");

        // Compaction must not change the visible tail of the log.
        assert_eq!(last_index, raft_log.last_index());

        for j in offset..=raft_log.last_index() {
            assert_eq!(j, raft_log.term(j).expect(""));
            if !raft_log.match_term(j, j) {
                panic!("match_term({}) = false, want true", j);
            }
        }

        {
            let unstable_ents = raft_log.unstable_entries();
            assert_eq!(last_index - unstable_index, unstable_ents.len() as u64);
            assert_eq!(unstable_index + 1, unstable_ents[0].index);
        }

        let mut prev = raft_log.last_index();
        raft_log.append(&[new_entry(prev + 1, prev + 1)]);
        assert_eq!(prev + 1, raft_log.last_index());

        prev = raft_log.last_index();
        let ents = raft_log
            .entries(prev, None, GetEntriesContext::empty(false))
            .expect("unexpected error");
        assert_eq!(1, ents.len());
    }

    #[test]
    fn test_term_with_unstable_snapshot() {
        let storagesnapi = 10064;
        let unstablesnapi = storagesnapi + 5;
        let store = MemStorage::new();
        store
            .wl()
            .apply_snapshot(new_snapshot(storagesnapi, 1))
            .expect("apply failed.");
        let mut raft_log = RaftLog::new(store, default_logger(), &Config::default());
        raft_log.restore(new_snapshot(unstablesnapi, 1));
        assert_eq!(raft_log.committed, unstablesnapi);
        assert_eq!(raft_log.persisted, storagesnapi);

        // (query index, expected term; 0 = unavailable)
        let tests = [
            // cannot get term from storage
            (storagesnapi, 0),
            // cannot get term from the gap between storage ents and unstable snapshot
            (storagesnapi + 1, 0),
            (unstablesnapi - 1, 0),
            // get term from unstable snapshot index
            (unstablesnapi, 1),
        ];

        for (i, &(index, w)) in tests.iter().enumerate() {
            let term = raft_log.term(index).expect("");
            if term != w {
                panic!("#{}: at = {}, want {}", i, term, w);
            }
        }
    }

    #[test]
    fn test_term() {
        let offset = 100u64;
        let num = 100u64;

        let store = MemStorage::new();
        store
            .wl()
            .apply_snapshot(new_snapshot(offset, 1))
            .expect("apply failed.");
        let mut raft_log = RaftLog::new(store, default_logger(), &Config::default());
        for i in 1..num {
            raft_log.append(&[new_entry(offset + i, i)]);
        }

        // (query index, expected term; 0 = outside the log)
        let tests = [
            (offset - 1, 0),
            (offset, 1),
            (offset + num / 2, num / 2),
            (offset + num - 1, num - 1),
            (offset + num, 0),
        ];

        for (i, &(index, w)) in tests.iter().enumerate() {
            let term = raft_log.term(index).expect("");
            if term != w {
                panic!("#{}: at = {}, want {}", i, term, w);
            }
        }
    }

    #[test]
    fn test_log_restore() {
        let (index, term) = (1000u64, 1000u64);
        let store = MemStorage::new();
        store
            .wl()
            .apply_snapshot(new_snapshot(index, term))
            .expect("apply failed.");
        let entries = vec![new_entry(index + 1, term), new_entry(index + 2, term + 1)];
        store.wl().append(&entries).expect("");
        let raft_log = RaftLog::new(store, default_logger(), &Config::default());

        assert_eq!(raft_log.all_entries(), entries);
        assert_eq!(index + 1, raft_log.first_index());
        assert_eq!(index, raft_log.committed);
        assert_eq!(index + 2, raft_log.persisted);
        assert_eq!(index + 3, raft_log.unstable.offset);

        assert_eq!(term, raft_log.term(index).unwrap());
        assert_eq!(term, raft_log.term(index + 1).unwrap());
        assert_eq!(term + 1, raft_log.term(index + 2).unwrap());
    }

    #[test]
    fn test_maybe_persist_with_snap() {
        let l = default_logger();
        let (snap_index, snap_term) = (5u64, 2u64);
        // persisted_index, persisted_term, new_entries, wpersisted
        let tests = vec![
            (snap_index + 1, snap_term, vec![], snap_index),
            (snap_index, snap_term, vec![], snap_index),
            (snap_index - 1, snap_term, vec![], snap_index),
            (snap_index + 1, snap_term + 1, vec![], snap_index),
            (snap_index, snap_term + 1, vec![], snap_index),
            (snap_index - 1, snap_term + 1, vec![], snap_index),
            (
                snap_index + 1,
                snap_term,
                vec![new_entry(snap_index + 1, snap_term)],
                snap_index + 1,
            ),
            (
                snap_index,
                snap_term,
                vec![new_entry(snap_index + 1, snap_term)],
                snap_index,
            ),
            (
                snap_index - 1,
                snap_term,
                vec![new_entry(snap_index + 1, snap_term)],
                snap_index,
            ),
            (
                snap_index + 1,
                snap_term + 1,
                vec![new_entry(snap_index + 1, snap_term)],
                snap_index,
            ),
            (
                snap_index,
                snap_term + 1,
                vec![new_entry(snap_index + 1, snap_term)],
                snap_index,
            ),
            (
                snap_index - 1,
                snap_term + 1,
                vec![new_entry(snap_index + 1, snap_term)],
                snap_index,
            ),
        ];

        for (i, &(stablei, stablet, ref new_ents, wpersist)) in tests.iter().enumerate() {
            let store = MemStorage::new();
            store
                .wl()
                .apply_snapshot(new_snapshot(snap_index, snap_term))
                .expect("");
            let mut raft_log = RaftLog::new(store, l.clone(), &Config::default());
            assert_eq!(raft_log.persisted, snap_index);
            raft_log.append(new_ents);
            let unstable = raft_log.unstable_entries().to_vec();
            if let Some(e) = unstable.last() {
                raft_log.stable_entries(e.get_index(), e.get_term());
                raft_log.mut_store().wl().append(&unstable).expect("");
            }
            let is_changed = raft_log.persisted != wpersist;
            assert_eq!(raft_log.maybe_persist(stablei, stablet), is_changed);
            if raft_log.persisted != wpersist {
                panic!(
                    "#{}: persisted = {}, want {}",
                    i, raft_log.persisted, wpersist
                );
            }
        }

        let mut raft_log = RaftLog::new(MemStorage::new(), default_logger(), &Config::default());
        raft_log.restore(new_snapshot(100, 1));
        assert_eq!(raft_log.unstable.offset, 101);
        raft_log.append(&[new_entry(101, 1)]);
        assert_eq!(raft_log.term(101), Ok(1));
        // 101 == offset, should not forward persisted
        assert!(!raft_log.maybe_persist(101, 1));
        raft_log.append(&[new_entry(102, 1)]);
        assert_eq!(raft_log.term(102), Ok(1));
        // 102 > offset, should not forward persisted
        assert!(!raft_log.maybe_persist(102, 1));
    }

    // TestUnstableEnts ensures unstableEntries returns the unstable part of the
    // entries correctly.
    #[test]
    fn test_unstable_ents() {
        let l = default_logger();
        let previous_ents = vec![new_entry(1, 1), new_entry(2, 2)];
        // (index of first unstable entry, expected unstable slice)
        let tests = [(3, vec![]), (1, previous_ents.clone())];

        for (i, &(unstable, ref wents)) in tests.iter().enumerate() {
            // append stable entries to storage
            let store = MemStorage::new();
            store
                .wl()
                .append(&previous_ents[..(unstable - 1)])
                .expect("");

            // append unstable entries to raftlog
            let mut raft_log = RaftLog::new(store, l.clone(), &Config::default());
            raft_log.append(&previous_ents[(unstable - 1)..]);

            let ents = raft_log.unstable_entries().to_vec();
            if let Some(e) = ents.last() {
                raft_log.stable_entries(e.get_index(), e.get_term());
            }
            if &ents != wents {
                panic!("#{}: unstableEnts = {:?}, want {:?}", i, ents, wents);
            }
            let w = previous_ents[previous_ents.len() - 1].index + 1;
            let g = raft_log.unstable.offset;
            if g != w {
                panic!("#{}: unstable = {}, want {}", i, g, w);
            }
        }
    }

    #[test]
    fn test_has_next_ents_and_next_ents() {
        let l = default_logger();
        let ents = [
            new_entry(4, 1),
            new_entry(5, 1),
            new_entry(6, 1),
            new_entry(7, 1),
        ];
        // applied, persisted, committed, expect_entries
        let tests = vec![
            (0, 3, 3, None),
            (0, 3, 4, None),
            (0, 4, 6, Some(&ents[..1])),
            (0, 6, 4, Some(&ents[..1])),
            (0, 5, 5, Some(&ents[..2])),
            (0, 5, 7, Some(&ents[..2])),
            (0, 7, 5, Some(&ents[..2])),
            (3, 4, 3, None),
            (3, 5, 5, Some(&ents[..2])),
            (3, 6, 7, Some(&ents[..3])),
            (3, 7, 6, Some(&ents[..3])),
            (4, 5, 5, Some(&ents[1..2])),
            (4, 5, 7, Some(&ents[1..2])),
            (4, 7, 5, Some(&ents[1..2])),
            (4, 7, 7, Some(&ents[1..4])),
            (5, 5, 5, None),
            (5, 7, 7, Some(&ents[2..4])),
            (7, 7, 7, None),
        ];
        for (i, &(applied, persisted, committed, ref expect_entries)) in tests.iter().enumerate() {
            let store = MemStorage::new();
            store.wl().apply_snapshot(new_snapshot(3, 1)).expect("");
            let mut raft_log = RaftLog::new(store, l.clone(), &Config::default());
            raft_log.append(&ents);
            let unstable = raft_log.unstable_entries().to_vec();
            if let Some(e) = unstable.last() {
                raft_log.stable_entries(e.get_index(), e.get_term());
                raft_log.mut_store().wl().append(&unstable).expect("");
            }
            raft_log.maybe_persist(persisted, 1);
            assert_eq!(
                persisted, raft_log.persisted,
                "#{}: persisted = {}, want {}",
                i, raft_log.persisted, persisted
            );
            raft_log.maybe_commit(committed, 1);
            assert_eq!(
                committed, raft_log.committed,
                "#{}: committed = {}, want {}",
                i, raft_log.committed, committed
            );
            #[allow(deprecated)]
            raft_log.applied_to(applied);

            let expect_has_next = expect_entries.is_some();
            let actual_has_next = raft_log.has_next_entries();
            if actual_has_next != expect_has_next {
                panic!(
                    "#{}: hasNext = {}, want {}",
                    i, actual_has_next, expect_has_next
                );
            }

            let next_entries = raft_log.next_entries(None);
            if next_entries != expect_entries.map(|n| n.to_vec()) {
                panic!(
                    "#{}: next_entries = {:?}, want {:?}",
                    i, next_entries, expect_entries
                );
            }
        }

        // Second pass: exercise max_apply_unpersisted_log_limit as well.
        let ents = [
            new_entry(4, 1),
            new_entry(5, 1),
            new_entry(6, 1),
            new_entry(7, 1),
            new_entry(8, 1),
            new_entry(9, 1),
            new_entry(10, 1),
        ];
        const UNLIMITED: u64 = u32::MAX as u64;
        // applied, persisted, committed, limit, expect_entries
        let tests = vec![
            (0, 3, 3, 0, None),
            (0, 3, 4, 0, None),
            (0, 3, 4, UNLIMITED, Some(&ents[..1])),
            (0, 4, 6, 0, Some(&ents[..1])),
            (0, 4, 6, 2, Some(&ents[..3])),
            (0, 4, 6, 6, Some(&ents[..3])),
            (0, 4, 10, 0, Some(&ents[..1])),
            (0, 4, 10, 2, Some(&ents[..3])),
            (0, 4, 10, 6, Some(&ents)),
            (0, 4, 10, 7, Some(&ents)),
            (0, 6, 4, 0, Some(&ents[..1])),
            (0, 6, 4, UNLIMITED, Some(&ents[..1])),
            (0, 5, 5, 0, Some(&ents[..2])),
            (3, 4, 3, UNLIMITED, None),
            (3, 5, 5, UNLIMITED, Some(&ents[..2])),
            (3, 6, 7, UNLIMITED, Some(&ents[..4])),
            (3, 7, 6, UNLIMITED, Some(&ents[..3])),
            (4, 5, 5, UNLIMITED, Some(&ents[1..2])),
            (4, 5, 5, UNLIMITED, Some(&ents[1..2])),
            (4, 5, 7, UNLIMITED, Some(&ents[1..4])),
            (4, 5, 9, UNLIMITED, Some(&ents[1..6])),
            (4, 5, 10, UNLIMITED, Some(&ents[1..])),
            (4, 7, 5, UNLIMITED, Some(&ents[1..2])),
            (4, 7, 7, 0, Some(&ents[1..4])),
            (5, 5, 5, 0, None),
            (5, 7, 7, UNLIMITED, Some(&ents[2..4])),
            (7, 7, 7, UNLIMITED, None),
            // test applied can be bigger than `persisted + limit`(when limit is changed)
            (8, 6, 8, 0, None),
        ];
        for (i, &(applied, persisted, committed, limit, ref expect_entries)) in
            tests.iter().enumerate()
        {
            let store = MemStorage::new();
            store.wl().apply_snapshot(new_snapshot(3, 1)).expect("");
            let mut raft_log = RaftLog::new(store, l.clone(), &Config::default());
            raft_log.max_apply_unpersisted_log_limit = limit;
            raft_log.append(&ents);
            let unstable = raft_log.unstable_entries().to_vec();
            if let Some(e) = unstable.last() {
                raft_log.stable_entries(e.get_index(), e.get_term());
                raft_log.mut_store().wl().append(&unstable).expect("");
            }
            raft_log.maybe_persist(persisted, 1);
            assert_eq!(
                persisted, raft_log.persisted,
                "#{}: persisted = {}, want {}",
                i, raft_log.persisted, persisted
            );
            raft_log.maybe_commit(committed, 1);
            assert_eq!(
                committed, raft_log.committed,
                "#{}: committed = {}, want {}",
                i, raft_log.committed, committed
            );
            #[allow(deprecated)]
            raft_log.applied_to(applied);

            let expect_has_next = expect_entries.is_some();
            let actual_has_next = raft_log.has_next_entries();
            if actual_has_next != expect_has_next {
                panic!(
                    "#{}: hasNext = {}, want {}",
                    i, actual_has_next, expect_has_next
                );
            }

            let next_entries = raft_log.next_entries(None);
            if next_entries != expect_entries.map(|n| n.to_vec()) {
                panic!(
                    "#{}: next_entries = {:?}, want {:?}",
                    i, next_entries, expect_entries
                );
            }
        }
    }

    #[test]
    fn test_slice() {
        let (offset, num) = (100u64, 100u64);
        let (last, half) = (offset + num, offset + num / 2);
        let halfe = new_entry(half, half);

        let halfe_size = u64::from(halfe.compute_size());

        let store = MemStorage::new();
        store
            .wl()
            .apply_snapshot(new_snapshot(offset, 0))
            .expect("");
        for i in 1..(num / 2) {
            store
                .wl()
                .append(&[new_entry(offset + i, offset + i)])
                .expect("");
        }
        let mut raft_log = RaftLog::new(store, default_logger(), &Config::default());
        for i in (num / 2)..num {
            raft_log.append(&[new_entry(offset + i, offset + i)]);
        }

        // (from, to, size limit, expected entries, expect panic?)
        let tests = vec![
            // test no limit
            (offset - 1, offset + 1, raft_log::NO_LIMIT, vec![], false),
            (offset, offset + 1, raft_log::NO_LIMIT, vec![], false),
            (
                half - 1,
                half + 1,
                raft_log::NO_LIMIT,
                vec![new_entry(half - 1, half - 1), new_entry(half, half)],
                false,
            ),
            (
                half,
                half + 1,
                raft_log::NO_LIMIT,
                vec![new_entry(half, half)],
                false,
            ),
            (
                last - 1,
                last,
                raft_log::NO_LIMIT,
                vec![new_entry(last - 1, last - 1)],
                false,
            ),
            (last, last + 1, raft_log::NO_LIMIT, vec![], true),
            // test limit
            (
                half - 1,
                half + 1,
                0,
                vec![new_entry(half - 1, half - 1)],
                false,
            ),
            (
                half - 1,
                half + 1,
                halfe_size + 1,
                vec![new_entry(half - 1, half - 1)],
                false,
            ),
            (
                half - 2,
                half + 1,
                halfe_size + 1,
                vec![new_entry(half - 2, half - 2)],
                false,
            ),
            (
                half - 1,
                half + 1,
                halfe_size * 2,
                vec![new_entry(half - 1, half - 1), new_entry(half, half)],
                false,
            ),
            (
                half - 1,
                half + 2,
                halfe_size * 3,
                vec![
                    new_entry(half - 1, half - 1),
                    new_entry(half, half),
                    new_entry(half + 1, half + 1),
                ],
                false,
            ),
            (
                half,
                half + 2,
                halfe_size,
                vec![new_entry(half, half)],
                false,
            ),
            (
                half,
                half + 2,
                halfe_size * 2,
                vec![new_entry(half, half), new_entry(half + 1, half + 1)],
                false,
            ),
        ];

        for (i, &(from, to, limit, ref w, wpanic)) in tests.iter().enumerate() {
            let res = panic::catch_unwind(AssertUnwindSafe(|| {
                raft_log.slice(from, to, Some(limit), GetEntriesContext::empty(false))
            }));
            if res.is_err() ^ wpanic {
                panic!("#{}: panic = {}, want {}: {:?}", i, true, false, res);
            }
            if res.is_err() {
                continue;
            }
            let slice_res = res.unwrap();
            if from <= offset && slice_res != Err(Error::Store(StorageError::Compacted)) {
                let err = slice_res.err();
                panic!("#{}: err = {:?}, want {}", i, err, StorageError::Compacted);
            }
            if from > offset && slice_res.is_err() {
                panic!("#{}: unexpected error {}", i, slice_res.unwrap_err());
            }
            if let Ok(ref g) = slice_res {
                if g != w {
                    panic!("#{}: from {} to {} = {:?}, want {:?}", i, from, to, g, w);
                }
            }
        }
    }

    // Sums the serialized byte size of the given entries.
    fn ents_size(ents: &[eraftpb::Entry]) -> u64 {
        let mut size = 0;
        for ent in ents {
            size += ent.compute_size() as u64;
        }
        size
    }

    #[test]
    fn test_scan() {
        let offset = 47;
        let num = 20;
        let last = offset + num;
        let half = offset + num / 2;
        let entries = |from, to| {
            let mut ents = vec![];
            for i in from..to {
                ents.push(new_entry(i, i));
            }
            ents
        };
        let entry_size = ents_size(&entries(half, half + 1));

        let store = MemStorage::new();
        store.wl().apply_snapshot(new_snapshot(offset, 0)).unwrap();
        store.wl().append(&entries(offset + 1, half)).unwrap();
        let mut raft_log = RaftLog::new(store, default_logger(), &Config::default());
        raft_log.append(&entries(half, last));

        // Test that scan() returns the same entries as slice(), on all inputs.
        for page_size in [0, 1, 10, 100, entry_size, entry_size + 1] {
            for lo in offset + 1..last {
                for hi in lo..=last {
                    let mut got = vec![];
                    raft_log
                        .scan(lo, hi, page_size, GetEntriesContext::empty(false), |e| {
                            assert!(
                                e.len() == 1 || ents_size(&e) < page_size,
                                "{} {} {}",
                                e.len(),
                                ents_size(&e),
                                page_size
                            );
                            got.extend(e);
                            true
                        })
                        .unwrap();
                    let want = raft_log
                        .slice(lo, hi, NO_LIMIT, GetEntriesContext::empty(false))
                        .unwrap();
                    assert_eq!(
                        got, want,
                        "scan() and slice() mismatch on [{}, {}) @ {}",
                        lo, hi, page_size
                    );
                }
            }
        }

        // Test that the callback early return.
        let mut iters = 0;
        raft_log
            .scan(offset + 1, half, 0, GetEntriesContext::empty(false), |_| {
                iters += 1;
                if iters == 2 {
                    return false;
                }
                true
            })
            .unwrap();
        assert_eq!(iters, 2);

        // Test that we max out the limit, and not just always return a single entry.
        // NB: this test works only because the requested range length is even.
        raft_log
            .scan(
                offset + 1,
                offset + 11,
                entry_size * 2,
                GetEntriesContext::empty(false),
                |e| {
                    assert_eq!(e.len(), 2);
                    assert_eq!(entry_size * 2, ents_size(&e));
                    true
                },
            )
            .unwrap();
    }

    /// `test_log_maybe_append` ensures:
    /// If the given (index, term) matches with the existing log:
    /// 1. If an existing entry conflicts with a new one (same index
    ///    but different terms), delete the existing entry and all that
    ///    follow it and decrease the persisted
    /// 2. Append any new entries not already in the log
    /// If the given (index, term) does not match with the existing log:
    ///    return false
    #[test]
    fn test_log_maybe_append() {
        let l = default_logger();
        let previous_ents = vec![new_entry(1, 1), new_entry(2, 2), new_entry(3, 3)];
        let (last_index, last_term, commit, persist) = (3u64, 3u64, 1u64, 3u64);

        // (log_term, index, committed, entries, want last index,
        //  want committed, want persisted, expect panic?)
        let tests = vec![
            // not match: term is different
            (
                last_term - 1,
                last_index,
                last_index,
                vec![new_entry(last_index + 1, 4)],
                None,
                commit,
                persist,
                false,
            ),
            // not match: index out of bound
            (
                last_term,
                last_index + 1,
                last_index,
                vec![new_entry(last_index + 2, 4)],
                None,
                commit,
                persist,
                false,
            ),
            // match with the last existing entry
            (
                last_term,
                last_index,
                last_index,
                vec![],
                Some(last_index),
                last_index,
                persist,
                false,
            ),
            // do not increase commit higher than lastnewi
            (
                last_term,
                last_index,
                last_index + 1,
                vec![],
                Some(last_index),
                last_index,
                persist,
                false,
            ),
            // commit up to the commit in the message
            (
                last_term,
                last_index,
                last_index - 1,
                vec![],
                Some(last_index),
                last_index - 1,
                persist,
                false,
            ),
            // commit do not decrease
            (
                last_term,
                last_index,
                0,
                vec![],
                Some(last_index),
                commit,
                persist,
                false,
            ),
            // commit do not decrease
            (0, 0, last_index, vec![], Some(0), commit, persist, false),
            (
                last_term,
                last_index,
                last_index,
                vec![new_entry(last_index + 1, 4)],
                Some(last_index + 1),
                last_index,
                persist,
                false,
            ),
            (
                last_term,
                last_index,
                last_index + 1,
                vec![new_entry(last_index + 1, 4)],
                Some(last_index + 1),
                last_index + 1,
                persist,
                false,
            ),
            // do not increase commit higher than lastnewi
            (
                last_term,
                last_index,
                last_index + 2,
                vec![new_entry(last_index + 1, 4)],
                Some(last_index + 1),
                last_index + 1,
                persist,
                false,
            ),
            (
                last_term,
                last_index,
                last_index + 2,
                vec![new_entry(last_index + 1, 4), new_entry(last_index + 2, 4)],
                Some(last_index + 2),
                last_index + 2,
                persist,
                false,
            ),
            // match with the entry in the middle
            (
                last_term - 1,
                last_index - 1,
                last_index,
                vec![new_entry(last_index, 4)],
                Some(last_index),
                last_index,
                cmp::min(last_index - 1, persist),
                false,
            ),
            (
                last_term - 2,
                last_index - 2,
                last_index,
                vec![new_entry(last_index - 1, 4)],
                Some(last_index - 1),
                last_index - 1,
                cmp::min(last_index - 2, persist),
                false,
            ),
            // conflict with existing committed entry
            (
                last_term - 3,
                last_index - 3,
                last_index,
                vec![new_entry(last_index - 2, 4)],
                Some(last_index - 2),
                last_index - 2,
                cmp::min(last_index - 3, persist),
                true,
            ),
            (
                last_term - 2,
                last_index - 2,
                last_index,
                vec![new_entry(last_index - 1, 4), new_entry(last_index, 4)],
                Some(last_index),
                last_index,
                cmp::min(last_index - 2, persist),
                false,
            ),
            (
                last_term - 2,
                last_index - 2,
                last_index + 2,
                vec![
                    new_entry(last_index - 1, last_term - 1),
                    new_entry(last_index, 4),
                    new_entry(last_index + 1, 4),
                ],
                Some(last_index + 1),
                last_index + 1,
                cmp::min(last_index - 1, persist),
                false,
            ),
        ];

        for (i, &(log_term, index, committed, ref ents, wlasti, wcommit, wpersist, wpanic)) in
            tests.iter().enumerate()
        {
            let store = MemStorage::new();
            let mut raft_log = RaftLog::new(store, l.clone(), &Config::default());
            raft_log.append(&previous_ents);
            raft_log.committed = commit;
            raft_log.persisted = persist;
            let res = panic::catch_unwind(AssertUnwindSafe(|| {
                raft_log
                    .maybe_append(index, log_term, committed, ents)
                    .map(|(_, last_idx)| last_idx)
            }));
            if res.is_err() ^ wpanic {
                panic!("#{}: panic = {}, want {}", i, res.is_err(), wpanic);
            }
            if res.is_err() {
                continue;
            }
            let glasti = res.unwrap();
            let gcommitted = raft_log.committed;
            let gpersisted = raft_log.persisted;
            if glasti != wlasti {
                panic!("#{}: lastindex = {:?}, want {:?}", i, glasti, wlasti);
            }
            if gcommitted != wcommit {
                panic!("#{}: committed = {}, want {}", i, gcommitted, wcommit);
            }
            if gpersisted != wpersist {
                panic!("#{}: persisted = {}, want {}", i, gpersisted, wpersist);
            }
            let ents_len = ents.len() as u64;
            if glasti.is_some() && ents_len != 0 {
                let (from, to) = (
                    raft_log.last_index() - ents_len + 1,
                    raft_log.last_index() + 1,
                );
                let gents = raft_log
                    .slice(from, to, None, GetEntriesContext::empty(false))
                    .expect("");
                if &gents != ents {
                    panic!("#{}: appended entries = {:?}, want {:?}", i, gents, ents);
                }
            }
        }
    }

    #[test]
    fn test_commit_to() {
        let l = default_logger();
        let previous_ents = vec![new_entry(1, 1), new_entry(2, 2), new_entry(3, 3)];
        let previous_commit = 2u64;
        // (commit target, want committed, expect panic?)
        let tests = [
            (3, 3, false),
            (1, 2, false), // never decrease
            (4, 0, true),  // commit out of range -> panic
        ];
        for (i, &(commit, wcommit, wpanic)) in tests.iter().enumerate() {
            let store = MemStorage::new();
            let mut raft_log = RaftLog::new(store, l.clone(), &Config::default());
            raft_log.append(&previous_ents);
            raft_log.committed = previous_commit;
            let has_panic =
                panic::catch_unwind(AssertUnwindSafe(|| raft_log.commit_to(commit))).is_err();
            if has_panic ^ wpanic {
                panic!("#{}: panic = {}, want {}", i, has_panic, wpanic)
            }
            if !has_panic && raft_log.committed != wcommit {
                let actual_committed = raft_log.committed;
                panic!("#{}: committed = {}, want {}", i, actual_committed, wcommit);
            }
        }
    }

    // TestCompaction ensures that the number of log entries is correct after compactions.
    #[test]
    fn test_compaction() {
        let l = default_logger();
        // (log size, compaction points, want remaining lens, expect panic?)
        let tests = [
            // out of upper bound
            (1000, vec![1001u64], vec![0usize], true),
            (
                1000,
                vec![300, 500, 800, 900],
                vec![700, 500, 200, 100],
                false,
            ),
            // out of lower bound
            (1000, vec![300, 299], vec![700, 700], false),
        ];

        for (i, &(index, ref compact, ref wleft, should_panic)) in tests.iter().enumerate() {
            let store = MemStorage::new();
            for i in 1u64..index {
                store.wl().append(&[new_entry(i, 0)]).expect("");
            }
            let mut raft_log = RaftLog::new(store, l.clone(), &Config::default());
            raft_log.maybe_commit(index - 1, 0);
            let committed = raft_log.committed;
            #[allow(deprecated)]
            raft_log.applied_to(committed);

            for (j, idx) in compact.iter().enumerate() {
                let res =
                    panic::catch_unwind(AssertUnwindSafe(|| raft_log.store.wl().compact(*idx)));
                if !(should_panic ^ res.is_ok()) {
                    panic!("#{}: should_panic: {}, but got: {:?}", i, should_panic, res);
                }
                if !should_panic {
                    let l = raft_log.all_entries().len();
                    if l != wleft[j] {
                        panic!("#{}.{} len = {}, want {}", i, j, l, wleft[j]);
                    }
                }
            }
        }
    }

    #[test]
    fn test_is_outofbounds() {
        let (offset, num) = (100u64, 100u64);
        let store = MemStorage::new();
        store
            .wl()
            .apply_snapshot(new_snapshot(offset, 0))
            .expect("");
        let mut raft_log = RaftLog::new(store, default_logger(), &Config::default());
        for i in 1u64..=num {
            raft_log.append(&[new_entry(i + offset, 0)]);
        }
        let first = offset + 1;
        // (lo, hi, expect panic?, expect Compacted error?)
        let tests = [
            (first - 2, first + 1, false, true),
            (first - 1, first + 1, false, true),
            (first, first, false, false),
            (first + num / 2, first + num / 2, false, false),
            (first + num - 1, first + num - 1, false, false),
            (first + num, first + num, false, false),
            (first + num, first + num + 1, true, false),
            (first + num + 1, first + num + 1, true, false),
        ];

        for (i, &(lo, hi, wpanic, w_err_compacted)) in tests.iter().enumerate() {
            let res =
                panic::catch_unwind(AssertUnwindSafe(|| raft_log.must_check_outofbounds(lo, hi)));
            if res.is_err() ^ wpanic {
                panic!(
                    "#{}: panic = {}, want {}: {:?}",
                    i,
                    res.is_err(),
                    wpanic,
                    res
                );
            }
            if res.is_err() {
                continue;
            }
            let check_res = res.unwrap();
            if w_err_compacted && check_res != Some(Error::Store(StorageError::Compacted)) {
                panic!(
                    "#{}: err = {:?}, want {}",
                    i,
                    check_res,
                    StorageError::Compacted
                );
            }
            if !w_err_compacted && check_res.is_some() {
                panic!("#{}: unexpected err {:?}", i, check_res)
            }
        }
    }

    #[test]
    fn test_restore_snap() {
        let store = MemStorage::new();
        store.wl().apply_snapshot(new_snapshot(100, 1)).expect("");
        let mut raft_log = RaftLog::new(store, default_logger(), &Config::default());
        assert_eq!(raft_log.committed, 100);
        assert_eq!(raft_log.persisted, 100);
        raft_log.restore(new_snapshot(200, 1));
        assert_eq!(raft_log.committed, 200);
        assert_eq!(raft_log.persisted, 100);

        for i in 201..210 {
            raft_log.append(&[new_entry(i, 1)]);
        }
        raft_log
            .mut_store()
            .wl()
            .apply_snapshot(new_snapshot(200, 1))
            .expect("");
        raft_log.stable_snap(200);
        let unstable = raft_log.unstable_entries().to_vec();
        raft_log.stable_entries(209, 1);
        raft_log.mut_store().wl().append(&unstable).expect("");
        raft_log.maybe_persist(209, 1);
        assert_eq!(raft_log.persisted, 209);

        raft_log.restore(new_snapshot(205, 1));
        assert_eq!(raft_log.committed, 205);
        // persisted should reset to previous commit index(200)
assert_eq!(raft_log.persisted, 200); + + // use smaller commit index, should panic + assert!( + panic::catch_unwind(AssertUnwindSafe(|| raft_log.restore(new_snapshot(204, 1)))) + .is_err() + ); + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/raw_node.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/raw_node.rs new file mode 100644 index 0000000..15cae0b --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/raw_node.rs @@ -0,0 +1,831 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! The raw node of the raft module. +//! +//! This module contains the value types for the node and it's connection to other +//! nodes but not the raft consensus itself. Generally, you'll interact with the +//! RawNode first and use it to access the inner workings of the consensus protocol. + +use std::{collections::VecDeque, mem}; + +use protobuf::Message as PbMessage; +use raft_proto::ConfChangeI; +use slog::Logger; + +use crate::eraftpb::{ConfState, Entry, EntryType, HardState, Message, MessageType, Snapshot}; +use crate::errors::{Error, Result}; +use crate::read_only::ReadState; +use crate::{config::Config, StateRole}; +use crate::{storage::GetEntriesFor, GetEntriesContext, Raft, SoftState, Status, Storage}; + +use slog::info; + +/// Represents a Peer node in the cluster. +#[derive(Debug, Default)] +pub struct Peer { + /// The ID of the peer. 
+ pub id: u64, + /// If there is context associated with the peer (like connection information), it can be + /// serialized and stored here. + pub context: Option>, +} + +/// The status of the snapshot. +#[derive(Debug, PartialEq, Eq, Copy, Clone)] +pub enum SnapshotStatus { + /// Represents that the snapshot is finished being created. + Finish, + /// Indicates that the snapshot failed to build or is not ready. + Failure, +} + +/// Checks if certain message type should be used internally. +pub fn is_local_msg(t: MessageType) -> bool { + matches!( + t, + MessageType::MsgHup + | MessageType::MsgBeat + | MessageType::MsgUnreachable + | MessageType::MsgSnapStatus + | MessageType::MsgCheckQuorum + ) +} + +fn is_response_msg(t: MessageType) -> bool { + matches!( + t, + MessageType::MsgAppendResponse + | MessageType::MsgRequestVoteResponse + | MessageType::MsgHeartbeatResponse + | MessageType::MsgUnreachable + | MessageType::MsgRequestPreVoteResponse + ) +} + +/// For a given snapshot, determine if it's empty or not. +#[deprecated(since = "0.6.0", note = "Please use `Snapshot::is_empty` instead")] +pub fn is_empty_snap(s: &Snapshot) -> bool { + s.is_empty() +} + +/// Ready encapsulates the entries and messages that are ready to read, +/// be saved to stable storage, committed or sent to other peers. +#[derive(Default, Debug, PartialEq)] +pub struct Ready { + number: u64, + + ss: Option, + + hs: Option, + + read_states: Vec, + + entries: Vec, + + snapshot: Snapshot, + + is_persisted_msg: bool, + + light: LightReady, + + must_sync: bool, +} + +impl Ready { + /// The number of current Ready. + /// It is used for identifying the different Ready and ReadyRecord. + #[inline] + pub fn number(&self) -> u64 { + self.number + } + + /// The current volatile state of a Node. + /// SoftState will be None if there is no update. + /// It is not required to consume or store SoftState. 
+ #[inline] + pub fn ss(&self) -> Option<&SoftState> { + self.ss.as_ref() + } + + /// The current state of a Node to be saved to stable storage. + /// HardState will be None state if there is no update. + #[inline] + pub fn hs(&self) -> Option<&HardState> { + self.hs.as_ref() + } + + /// ReadStates specifies the state for read only query. + #[inline] + pub fn read_states(&self) -> &Vec { + &self.read_states + } + + /// Take the ReadStates. + #[inline] + pub fn take_read_states(&mut self) -> Vec { + mem::take(&mut self.read_states) + } + + /// Entries specifies entries to be saved to stable storage. + #[inline] + pub fn entries(&self) -> &Vec { + &self.entries + } + + /// Take the Entries. + #[inline] + pub fn take_entries(&mut self) -> Vec { + mem::take(&mut self.entries) + } + + /// Snapshot specifies the snapshot to be saved to stable storage. + #[inline] + pub fn snapshot(&self) -> &Snapshot { + &self.snapshot + } + + /// CommittedEntries specifies entries to be committed to a + /// store/state-machine. These have previously been committed to stable + /// store. + #[inline] + pub fn committed_entries(&self) -> &Vec { + self.light.committed_entries() + } + + /// Take the CommitEntries. + #[inline] + pub fn take_committed_entries(&mut self) -> Vec { + self.light.take_committed_entries() + } + + /// Messages specifies outbound messages to be sent. + /// If it contains a MsgSnap message, the application MUST report back to raft + /// when the snapshot has been received or has failed by calling ReportSnapshot. + #[inline] + pub fn messages(&self) -> &[Message] { + if !self.is_persisted_msg { + self.light.messages() + } else { + &[] + } + } + + /// Take the Messages. + #[inline] + pub fn take_messages(&mut self) -> Vec { + if !self.is_persisted_msg { + self.light.take_messages() + } else { + Vec::new() + } + } + + /// Persisted Messages specifies outbound messages to be sent AFTER the HardState, + /// Entries and Snapshot are persisted to stable storage. 
+ #[inline] + pub fn persisted_messages(&self) -> &[Message] { + if self.is_persisted_msg { + self.light.messages() + } else { + &[] + } + } + + /// Take the Persisted Messages. + #[inline] + pub fn take_persisted_messages(&mut self) -> Vec { + if self.is_persisted_msg { + self.light.take_messages() + } else { + Vec::new() + } + } + + /// MustSync is false if and only if + /// 1. no HardState or only its commit is different from before + /// 2. no Entries and Snapshot + /// + /// If it's false, an asynchronous write of HardState is permissible before calling + /// [`RawNode::on_persist_ready`] or [`RawNode::advance`] or its families. + #[inline] + pub fn must_sync(&self) -> bool { + self.must_sync + } +} + +/// ReadyRecord encapsulates some needed data from the corresponding Ready. +#[derive(Default, Debug, PartialEq)] +struct ReadyRecord { + number: u64, + // (index, term) of the last entry from the entries in Ready + last_entry: Option<(u64, u64)>, + // (index, term) of the snapshot in Ready + snapshot: Option<(u64, u64)>, +} + +/// LightReady encapsulates the commit index, committed entries and +/// messages that are ready to be applied or be sent to other peers. +#[derive(Default, Debug, PartialEq)] +pub struct LightReady { + commit_index: Option, + committed_entries: Vec, + messages: Vec, +} + +impl LightReady { + /// The current commit index. + /// It will be None state if there is no update. + /// It is not required to save it to stable storage. + #[inline] + pub fn commit_index(&self) -> Option { + self.commit_index + } + + /// CommittedEntries specifies entries to be committed to a + /// store/state-machine. These have previously been committed to stable + /// store. + #[inline] + pub fn committed_entries(&self) -> &Vec { + &self.committed_entries + } + + /// Take the CommittedEntries. + #[inline] + pub fn take_committed_entries(&mut self) -> Vec { + mem::take(&mut self.committed_entries) + } + + /// Messages specifies outbound messages to be sent. 
+ #[inline] + pub fn messages(&self) -> &[Message] { + &self.messages + } + + /// Take the Messages. + #[inline] + pub fn take_messages(&mut self) -> Vec { + mem::take(&mut self.messages) + } +} + +/// RawNode is a thread-unsafe Node. +/// The methods of this struct correspond to the methods of Node and are described +/// more fully there. +pub struct RawNode { + /// The internal raft state. + pub raft: Raft, + prev_ss: SoftState, + prev_hs: HardState, + // Current max number of Record and ReadyRecord. + max_number: u64, + records: VecDeque, + // Index which the given committed entries should start from. + commit_since_index: u64, +} + +impl RawNode { + #[allow(clippy::new_ret_no_self)] + /// Create a new RawNode given some [`Config`]. + pub fn new(config: &Config, store: T, logger: &Logger) -> Result { + assert_ne!(config.id, 0, "config.id must not be zero"); + let r = Raft::new(config, store, logger)?; + let mut rn = RawNode { + raft: r, + prev_hs: Default::default(), + prev_ss: Default::default(), + max_number: 0, + records: VecDeque::new(), + commit_since_index: config.applied, + }; + rn.prev_hs = rn.raft.hard_state(); + rn.prev_ss = rn.raft.soft_state(); + info!( + rn.raft.logger, + "RawNode created with id {id}.", + id = rn.raft.id + ); + Ok(rn) + } + + /// Create a new RawNode given some [`Config`] and the default logger. + /// + /// The default logger is an `slog` to `log` adapter. + #[cfg(feature = "default-logger")] + #[allow(clippy::new_ret_no_self)] + pub fn with_default_logger(c: &Config, store: T) -> Result { + Self::new(c, store, &crate::default_logger()) + } + + /// Sets priority of node. + #[inline] + pub fn set_priority(&mut self, priority: i64) { + self.raft.set_priority(priority); + } + + /// Tick advances the internal logical clock by a single tick. + /// + /// Returns true to indicate that there will probably be some readiness which + /// needs to be handled. 
+ pub fn tick(&mut self) -> bool { + self.raft.tick() + } + + /// Campaign causes this RawNode to transition to candidate state. + pub fn campaign(&mut self) -> Result<()> { + let mut m = Message::default(); + m.set_msg_type(MessageType::MsgHup); + self.raft.step(m) + } + + /// Propose proposes data to be appended to the raft log. + pub fn propose(&mut self, context: Vec, data: Vec) -> Result<()> { + let mut m = Message::default(); + m.set_msg_type(MessageType::MsgPropose); + m.from = self.raft.id; + let mut e = Entry::default(); + e.data = data.into(); + e.context = context.into(); + m.set_entries(vec![e].into()); + self.raft.step(m) + } + + /// Broadcast heartbeats to all the followers. + /// + /// If it's not leader, nothing will happen. + pub fn ping(&mut self) { + self.raft.ping() + } + + /// ProposeConfChange proposes a config change. + /// + /// If the node enters joint state with `auto_leave` set to true, it's + /// caller's responsibility to propose an empty conf change again to force + /// leaving joint state. + pub fn propose_conf_change(&mut self, context: Vec, cc: impl ConfChangeI) -> Result<()> { + let (data, ty) = if let Some(cc) = cc.as_v1() { + (cc.write_to_bytes()?, EntryType::EntryConfChange) + } else { + (cc.as_v2().write_to_bytes()?, EntryType::EntryConfChangeV2) + }; + let mut m = Message::default(); + m.set_msg_type(MessageType::MsgPropose); + let mut e = Entry::default(); + e.set_entry_type(ty); + e.data = data.into(); + e.context = context.into(); + m.set_entries(vec![e].into()); + self.raft.step(m) + } + + /// Applies a config change to the local node. The app must call this when it + /// applies a configuration change, except when it decides to reject the + /// configuration change, in which case no call must take place. + pub fn apply_conf_change(&mut self, cc: &impl ConfChangeI) -> Result { + self.raft.apply_conf_change(&cc.as_v2()) + } + + /// Step advances the state machine using the given message. 
+ pub fn step(&mut self, m: Message) -> Result<()> { + // Ignore unexpected local messages receiving over network + if is_local_msg(m.get_msg_type()) { + return Err(Error::StepLocalMsg); + } + if self.raft.prs().get(m.from).is_some() || !is_response_msg(m.get_msg_type()) { + return self.raft.step(m); + } + Err(Error::StepPeerNotFound) + } + + /// A callback when entries are fetched asynchronously. + /// The context should provide the context passed from Storage.entries(). + /// See more in the comment of Storage.entries(). + /// + /// # Panics + /// + /// Panics if passed with the context of context.can_async() == false + pub fn on_entries_fetched(&mut self, context: GetEntriesContext) { + match context.0 { + GetEntriesFor::SendAppend { + to, + term, + aggressively, + } => { + if self.raft.term != term || self.raft.state != StateRole::Leader { + // term or leadership has changed + return; + } + if self.raft.prs().get(to).is_none() { + // the peer has been removed, do nothing + return; + } + + if aggressively { + self.raft.send_append_aggressively(to) + } else { + self.raft.send_append(to) + } + } + GetEntriesFor::Empty(can_async) if can_async => {} + _ => panic!("shouldn't call callback on non-async context"), + } + } + + /// Generates a LightReady that has the committed entries and messages but no commit index. 
+ fn gen_light_ready(&mut self) -> LightReady { + let mut rd = LightReady::default(); + let max_size = Some(self.raft.max_committed_size_per_ready); + let raft = &mut self.raft; + rd.committed_entries = raft + .raft_log + .next_entries_since(self.commit_since_index, max_size) + .unwrap_or_default(); + // Update raft uncommitted entries size + raft.reduce_uncommitted_size(&rd.committed_entries); + if let Some(e) = rd.committed_entries.last() { + assert!(self.commit_since_index < e.get_index()); + self.commit_since_index = e.get_index(); + } + + if !raft.msgs.is_empty() { + rd.messages = mem::take(&mut raft.msgs); + } + + rd + } + + /// Returns the outstanding work that the application needs to handle. + /// + /// This includes appending and applying entries or a snapshot, updating the HardState, + /// and sending messages. The returned `Ready` *MUST* be handled and subsequently + /// passed back via `advance` or its families. Before that, *DO NOT* call any function like + /// `step`, `propose`, `campaign` to change internal state. + /// + /// [`Self::has_ready`] should be called first to check if it's necessary to handle the ready. + pub fn ready(&mut self) -> Ready { + let raft = &mut self.raft; + + self.max_number += 1; + let mut rd = Ready { + number: self.max_number, + ..Default::default() + }; + let mut rd_record = ReadyRecord { + number: self.max_number, + ..Default::default() + }; + + if self.prev_ss.raft_state != StateRole::Leader && raft.state == StateRole::Leader { + // The vote msg which makes this peer become leader has been sent after persisting. + // So the remaining records must be generated during being candidate which can not + // have last_entry and snapshot(if so, it should become follower). + for record in self.records.drain(..) 
{ + assert_eq!(record.last_entry, None); + assert_eq!(record.snapshot, None); + } + } + + let ss = raft.soft_state(); + if ss != self.prev_ss { + rd.ss = Some(ss); + } + let hs = raft.hard_state(); + if hs != self.prev_hs { + if hs.vote != self.prev_hs.vote || hs.term != self.prev_hs.term { + rd.must_sync = true; + } + rd.hs = Some(hs); + } + + if !raft.read_states.is_empty() { + rd.read_states = mem::take(&mut raft.read_states); + } + + if let Some(snapshot) = &raft.raft_log.unstable_snapshot() { + rd.snapshot = snapshot.clone(); + assert!(self.commit_since_index <= rd.snapshot.get_metadata().index); + self.commit_since_index = rd.snapshot.get_metadata().index; + // If there is a snapshot, the latter entries can not be persisted + // so there is no committed entries. + assert!( + !raft + .raft_log + .has_next_entries_since(self.commit_since_index), + "has snapshot but also has committed entries since {}", + self.commit_since_index + ); + rd_record.snapshot = Some(( + rd.snapshot.get_metadata().index, + rd.snapshot.get_metadata().term, + )); + rd.must_sync = true; + } + + rd.entries = raft.raft_log.unstable_entries().to_vec(); + if let Some(e) = rd.entries.last() { + // If the last entry exists, the entries must not empty, vice versa. + rd.must_sync = true; + rd_record.last_entry = Some((e.get_index(), e.get_term())); + } + + // Leader can send messages immediately to make replication concurrently. + // For more details, check raft thesis 10.2.1. + rd.is_persisted_msg = raft.state != StateRole::Leader; + rd.light = self.gen_light_ready(); + self.records.push_back(rd_record); + rd + } + + /// HasReady called when RawNode user need to check if any Ready pending. 
+ pub fn has_ready(&self) -> bool { + let raft = &self.raft; + if !raft.msgs.is_empty() { + return true; + } + + if raft.soft_state() != self.prev_ss { + return true; + } + if raft.hard_state() != self.prev_hs { + return true; + } + + if !raft.read_states.is_empty() { + return true; + } + + if !raft.raft_log.unstable_entries().is_empty() { + return true; + } + + if self.snap().is_some_and(|s| !s.is_empty()) { + return true; + } + + if raft + .raft_log + .has_next_entries_since(self.commit_since_index) + { + return true; + } + + false + } + + fn commit_ready(&mut self, rd: Ready) { + if let Some(ss) = rd.ss { + self.prev_ss = ss; + } + if let Some(hs) = rd.hs { + self.prev_hs = hs; + } + let rd_record = self.records.back().unwrap(); + assert!(rd_record.number == rd.number); + let raft = &mut self.raft; + if let Some((index, _)) = rd_record.snapshot { + raft.raft_log.stable_snap(index); + } + if let Some((index, term)) = rd_record.last_entry { + raft.raft_log.stable_entries(index, term); + } + } + + fn commit_apply(&mut self, applied: u64) { + self.raft.commit_apply(applied); + } + + /// Notifies that the ready of this number has been persisted. + /// + /// Since Ready must be persisted in order, calling this function implicitly means + /// all readies with numbers smaller than this one have been persisted. + /// + /// [`Self::has_ready`] and [`Self::ready`] should be called later to handle further + /// updates that become valid after ready being persisted. 
+ pub fn on_persist_ready(&mut self, number: u64) { + let (mut index, mut term) = (0, 0); + let mut snap_index = 0; + while let Some(record) = self.records.front() { + if record.number > number { + break; + } + let record = self.records.pop_front().unwrap(); + + if let Some((i, _)) = record.snapshot { + snap_index = i; + index = 0; + term = 0; + } + + if let Some((i, t)) = record.last_entry { + index = i; + term = t; + } + } + if snap_index != 0 { + self.raft.on_persist_snap(snap_index); + } + if index != 0 { + self.raft.on_persist_entries(index, term); + } + } + + /// Advances the ready after fully processing it. + /// + /// Fully processing a ready requires to persist snapshot, entries and hard states, apply all + /// committed entries, send all messages. + /// + /// Returns the LightReady that contains commit index, committed entries and messages. [`LightReady`] + /// contains updates that are only valid after persisting last ready. It should also be fully processed. + /// Then [`Self::advance_apply`] or [`Self::advance_apply_to`] should be used later to update applying + /// progress. + pub fn advance(&mut self, rd: Ready) -> LightReady { + let applied = self.commit_since_index; + let light_rd = self.advance_append(rd); + self.advance_apply_to(applied); + light_rd + } + + /// Advances the ready without applying committed entries. [`Self::advance_apply`] or + /// [`Self::advance_apply_to`] should be used later to update applying progress. + /// + /// Returns the LightReady that contains commit index, committed entries and messages. + /// + /// Since Ready must be persisted in order, calling this function implicitly means + /// all ready collected before have been persisted. 
+ #[inline] + pub fn advance_append(&mut self, rd: Ready) -> LightReady { + self.commit_ready(rd); + self.on_persist_ready(self.max_number); + let mut light_rd = self.gen_light_ready(); + if self.raft.state != StateRole::Leader && !light_rd.messages().is_empty() { + fatal!(self.raft.logger, "not leader but has new msg after advance"); + } + // Set commit index if it's updated + let hard_state = self.raft.hard_state(); + if hard_state.commit > self.prev_hs.commit { + light_rd.commit_index = Some(hard_state.commit); + self.prev_hs.commit = hard_state.commit; + } else { + assert!(hard_state.commit == self.prev_hs.commit); + light_rd.commit_index = None; + } + assert_eq!(hard_state, self.prev_hs, "hard state != prev_hs"); + light_rd + } + + /// Same as [`Self::advance_append`] except that it allows to only store the updates in cache. + /// [`Self::on_persist_ready`] should be used later to update the persisting progress. + /// + /// Raft works on an assumption persisted updates should not be lost, which usually requires expensive + /// operations like `fsync`. `advance_append_async` allows you to control the rate of such operations and + /// get a reasonable batch size. However, it's still required that the updates can be read by raft from the + /// `Storage` trait before calling `advance_append_async`. + #[inline] + pub fn advance_append_async(&mut self, rd: Ready) { + self.commit_ready(rd); + } + + /// Advance apply to the index of the last committed entries given before. + #[inline] + pub fn advance_apply(&mut self) { + self.commit_apply(self.commit_since_index); + } + + /// Advance apply to the passed index. + #[inline] + pub fn advance_apply_to(&mut self, applied: u64) { + self.commit_apply(applied); + } + + /// Grabs the snapshot from the raft if available. + #[inline] + pub fn snap(&self) -> Option<&Snapshot> { + self.raft.snap() + } + + /// Status returns the current status of the given group. 
+ #[inline] + pub fn status(&self) -> Status { + Status::new(&self.raft) + } + + /// ReportUnreachable reports the given node is not reachable for the last send. + pub fn report_unreachable(&mut self, id: u64) { + let mut m = Message::default(); + m.set_msg_type(MessageType::MsgUnreachable); + m.from = id; + // we don't care if it is ok actually + let _ = self.raft.step(m); + } + + /// ReportSnapshot reports the status of the sent snapshot. + pub fn report_snapshot(&mut self, id: u64, status: SnapshotStatus) { + let rej = status == SnapshotStatus::Failure; + let mut m = Message::default(); + m.set_msg_type(MessageType::MsgSnapStatus); + m.from = id; + m.reject = rej; + // we don't care if it is ok actually + let _ = self.raft.step(m); + } + + /// Request a snapshot from a leader. + /// The snapshot's index must be greater or equal to the request_index (last_index) or + /// the leader's term must be greater than the request term (last_index's term). + pub fn request_snapshot(&mut self) -> Result<()> { + self.raft.request_snapshot() + } + + /// TransferLeader tries to transfer leadership to the given transferee. + pub fn transfer_leader(&mut self, transferee: u64) { + let mut m = Message::default(); + m.set_msg_type(MessageType::MsgTransferLeader); + m.from = transferee; + let _ = self.raft.step(m); + } + + /// ReadIndex requests a read state. The read state will be set in ready. + /// Read State has a read index. Once the application advances further than the read + /// index, any linearizable read requests issued before the read request can be + /// processed safely. The read state will have the same rctx attached. + pub fn read_index(&mut self, rctx: Vec) { + let mut m = Message::default(); + m.set_msg_type(MessageType::MsgReadIndex); + let mut e = Entry::default(); + e.data = rctx.into(); + m.set_entries(vec![e].into()); + let _ = self.raft.step(m); + } + + /// Returns the store as an immutable reference. 
+ #[inline] + pub fn store(&self) -> &T { + self.raft.store() + } + + /// Returns the store as a mutable reference. + #[inline] + pub fn mut_store(&mut self) -> &mut T { + self.raft.mut_store() + } + + /// Set whether skip broadcast empty commit messages at runtime. + #[inline] + pub fn skip_bcast_commit(&mut self, skip: bool) { + self.raft.skip_bcast_commit(skip) + } + + /// Set whether to batch append msg at runtime. + #[inline] + pub fn set_batch_append(&mut self, batch_append: bool) { + self.raft.set_batch_append(batch_append) + } +} + +#[cfg(test)] +mod test { + use crate::eraftpb::MessageType; + + use super::is_local_msg; + + #[test] + fn test_is_local_msg() { + let tests = vec![ + (MessageType::MsgHup, true), + (MessageType::MsgBeat, true), + (MessageType::MsgUnreachable, true), + (MessageType::MsgSnapStatus, true), + (MessageType::MsgCheckQuorum, true), + (MessageType::MsgPropose, false), + (MessageType::MsgAppend, false), + (MessageType::MsgAppendResponse, false), + (MessageType::MsgRequestVote, false), + (MessageType::MsgRequestVoteResponse, false), + (MessageType::MsgSnapshot, false), + (MessageType::MsgHeartbeat, false), + (MessageType::MsgHeartbeatResponse, false), + (MessageType::MsgTransferLeader, false), + (MessageType::MsgTimeoutNow, false), + (MessageType::MsgReadIndex, false), + (MessageType::MsgReadIndexResp, false), + (MessageType::MsgRequestPreVote, false), + (MessageType::MsgRequestPreVoteResponse, false), + ]; + for (msg_type, result) in tests { + assert_eq!(is_local_msg(msg_type), result); + } + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/read_only.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/read_only.rs new file mode 100644 index 0000000..1dd0ffa --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/read_only.rs @@ -0,0 +1,136 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
+ +// Copyright 2016 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::VecDeque; + +use slog::Logger; + +use crate::eraftpb::Message; +use crate::{HashMap, HashSet}; + +/// Determines the relative safety of and consistency of read only requests. +#[derive(Debug, PartialEq, Eq, Clone, Copy, Default)] +pub enum ReadOnlyOption { + /// Safe guarantees the linearizability of the read only request by + /// communicating with the quorum. It is the default and suggested option. + #[default] + Safe, + /// LeaseBased ensures linearizability of the read only request by + /// relying on the leader lease. It can be affected by clock drift. + /// If the clock drift is unbounded, leader might keep the lease longer than it + /// should (clock can move backward/pause without any bound). ReadIndex is not safe + /// in that case. + LeaseBased, +} + +/// ReadState provides state for read only query. +/// +/// It's caller's responsibility to send MsgReadIndex first before getting +/// this state from ready. It's also caller's duty to differentiate if this +/// state is what it requests through request_ctx, e.g. given a unique id as +/// request_ctx. +#[derive(Default, Debug, PartialEq, Eq, Clone)] +pub struct ReadState { + /// The index of the read state. + pub index: u64, + /// A datagram consisting of context about the request. 
+ pub request_ctx: Vec, +} + +#[derive(Default, Debug, Clone)] +pub struct ReadIndexStatus { + pub req: Message, + pub index: u64, + pub acks: HashSet, +} + +#[derive(Default, Debug, Clone)] +pub struct ReadOnly { + pub option: ReadOnlyOption, + pub pending_read_index: HashMap, ReadIndexStatus>, + pub read_index_queue: VecDeque>, +} + +impl ReadOnly { + pub fn new(option: ReadOnlyOption) -> ReadOnly { + ReadOnly { + option, + pending_read_index: HashMap::default(), + read_index_queue: VecDeque::new(), + } + } + + /// Adds a read only request into readonly struct. + /// + /// `index` is the commit index of the raft state machine when it received + /// the read only request. + /// + /// `m` is the original read only request message from the local or remote node. + pub fn add_request(&mut self, index: u64, req: Message, self_id: u64) { + let ctx = { + let key: &[u8] = req.entries[0].data.as_ref(); + if self.pending_read_index.contains_key(key) { + return; + } + key.to_vec() + }; + let mut acks = HashSet::::default(); + acks.insert(self_id); + let status = ReadIndexStatus { req, index, acks }; + self.pending_read_index.insert(ctx.clone(), status); + self.read_index_queue.push_back(ctx); + } + + /// Notifies the ReadOnly struct that the raft state machine received + /// an acknowledgment of the heartbeat that attached with the read only request + /// context. + pub fn recv_ack(&mut self, id: u64, ctx: &[u8]) -> Option<&HashSet> { + self.pending_read_index.get_mut(ctx).map(|rs| { + rs.acks.insert(id); + &rs.acks + }) + } + + /// Advances the read only request queue kept by the ReadOnly struct. + /// It dequeues the requests until it finds the read only request that has + /// the same context as the given `ctx`. 
+ pub fn advance(&mut self, ctx: &[u8], logger: &Logger) -> Vec { + let mut rss = vec![]; + if let Some(i) = self.read_index_queue.iter().position(|x| { + if !self.pending_read_index.contains_key(x) { + fatal!(logger, "cannot find correspond read state from pending map"); + } + *x == ctx + }) { + for _ in 0..=i { + let rs = self.read_index_queue.pop_front().unwrap(); + let status = self.pending_read_index.remove(&rs).unwrap(); + rss.push(status); + } + } + rss + } + + /// Returns the context of the last pending read only request in ReadOnly struct. + pub fn last_pending_request_ctx(&self) -> Option> { + self.read_index_queue.back().cloned() + } + + #[inline] + pub fn pending_read_count(&self) -> usize { + self.read_index_queue.len() + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/status.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/status.rs new file mode 100644 index 0000000..1df0038 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/status.rs @@ -0,0 +1,53 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::eraftpb::HardState; + +use crate::raft::{Raft, SoftState, StateRole}; +use crate::storage::Storage; +use crate::ProgressTracker; + +/// Represents the current status of the raft +#[derive(Default)] +pub struct Status<'a> { + /// The ID of the current node. 
+ pub id: u64, + /// The hardstate of the raft, representing voted state. + pub hs: HardState, + /// The softstate of the raft, representing proposed state. + pub ss: SoftState, + /// The index of the last entry to have been applied. + pub applied: u64, + /// The progress towards catching up and applying logs. + pub progress: Option<&'a ProgressTracker>, +} + +impl<'a> Status<'a> { + /// Gets a copy of the current raft status. + pub fn new(raft: &'a Raft) -> Status<'a> { + let mut s = Status { + id: raft.id, + ..Default::default() + }; + s.hs = raft.hard_state(); + s.ss = raft.soft_state(); + s.applied = raft.raft_log.applied; + if s.ss.raft_state == StateRole::Leader { + s.progress = Some(raft.prs()); + } + s + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/storage.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/storage.rs new file mode 100644 index 0000000..99e02e3 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/storage.rs @@ -0,0 +1,813 @@ +//! Represents the storage trait and example implementation. +//! +//! The storage trait is used to house and eventually serialize the state of the system. +//! Custom implementations of this are normal and this is likely to be a key integration +//! point for your distributed storage. + +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::cmp; +use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; + +use crate::eraftpb::*; + +use crate::errors::{Error, Result, StorageError}; +use crate::util::limit_size; + +use getset::{Getters, Setters}; + +/// Holds both the hard state (commit index, vote leader, term) and the configuration state +/// (Current node IDs) +#[derive(Debug, Clone, Default, Getters, Setters)] +pub struct RaftState { + /// Contains the last meta information including commit index, the vote leader, and the vote term. + pub hard_state: HardState, + + /// Records the current node IDs like `[1, 2, 3]` in the cluster. Every Raft node must have a + /// unique ID in the cluster; + pub conf_state: ConfState, +} + +impl RaftState { + /// Create a new RaftState. + pub fn new(hard_state: HardState, conf_state: ConfState) -> RaftState { + RaftState { + hard_state, + conf_state, + } + } + /// Indicates the `RaftState` is initialized or not. + pub fn initialized(&self) -> bool { + self.conf_state != ConfState::default() + } +} + +/// Records the context of the caller who calls entries() of Storage trait. +#[derive(Debug, Clone, Copy)] +pub struct GetEntriesContext(pub(crate) GetEntriesFor); + +impl GetEntriesContext { + /// Used for callers out of raft. Caller can customize if it supports async. + pub fn empty(can_async: bool) -> Self { + GetEntriesContext(GetEntriesFor::Empty(can_async)) + } + + /// Check if the caller's context support fetching entries asynchronously. + pub fn can_async(&self) -> bool { + match self.0 { + GetEntriesFor::SendAppend { .. 
} => true, + GetEntriesFor::Empty(can_async) => can_async, + _ => false, + } + } +} + +#[derive(Debug, Clone, Copy)] +pub(crate) enum GetEntriesFor { + // for sending entries to followers + SendAppend { + /// the peer id to which the entries are going to send + to: u64, + /// the term when the request is issued + term: u64, + /// whether to exhaust all the entries + aggressively: bool, + }, + // for getting committed entries in a ready + GenReady, + // for getting entries to check pending conf when transferring leader + TransferLeader, + // for getting entries to check pending conf when forwarding commit index by vote messages + CommitByVote, + // It's not called by the raft itself + Empty(bool), +} + +/// Storage saves all the information about the current Raft implementation, including Raft Log, +/// commit index, the leader to vote for, etc. +/// +/// If any Storage method returns an error, the raft instance will +/// become inoperable and refuse to participate in elections; the +/// application is responsible for cleanup and recovery in this case. +pub trait Storage { + /// `initial_state` is called when Raft is initialized. This interface will return a `RaftState` + /// which contains `HardState` and `ConfState`. + /// + /// `RaftState` could be initialized or not. If it's initialized it means the `Storage` is + /// created with a configuration, and its last index and term should be greater than 0. + fn initial_state(&self) -> Result; + + /// Returns a slice of log entries in the range `[low, high)`. + /// max_size limits the total size of the log entries returned if not `None`, however + /// the slice of entries returned will always have length at least 1 if entries are + /// found in the range. + /// + /// Entries are supported to be fetched asynchronously depending on the context. Async is optional. + /// Storage should check context.can_async() first and decide whether to fetch entries asynchronously + /// based on its own implementation. 
If the entries are fetched asynchronously, storage should return + /// LogTemporarilyUnavailable, and application needs to call `on_entries_fetched(context)` to trigger + /// re-fetch of the entries after the storage finishes fetching the entries. + /// + /// # Panics + /// + /// Panics if `high` is higher than `Storage::last_index(&self) + 1`. + fn entries( + &self, + low: u64, + high: u64, + max_size: impl Into>, + context: GetEntriesContext, + ) -> Result>; + + /// Returns the term of entry idx, which must be in the range + /// [first_index()-1, last_index()]. The term of the entry before + /// first_index is retained for matching purpose even though the + /// rest of that entry may not be available. + fn term(&self, idx: u64) -> Result; + + /// Returns the index of the first log entry that is possible available via entries, which will + /// always equal to `truncated index` plus 1. + /// + /// New created (but not initialized) `Storage` can be considered as truncated at 0 so that 1 + /// will be returned in this case. + fn first_index(&self) -> Result; + + /// The index of the last entry replicated in the `Storage`. + fn last_index(&self) -> Result; + + /// Returns the most recent snapshot. + /// + /// If snapshot is temporarily unavailable, it should return SnapshotTemporarilyUnavailable, + /// so raft state machine could know that Storage needs some time to prepare + /// snapshot and call snapshot later. + /// A snapshot's index must not be less than the `request_index`. + /// `to` indicates which peer is requesting the snapshot. + fn snapshot(&self, request_index: u64, to: u64) -> Result; +} + +/// The Memory Storage Core instance holds the actual state of the storage struct. To access this +/// value, use the `rl` and `wl` functions on the main MemStorage implementation. 
+#[derive(Default)] +pub struct MemStorageCore { + raft_state: RaftState, + // entries[i] has raft log position i+snapshot.get_metadata().index + entries: Vec, + // Metadata of the last snapshot received. + snapshot_metadata: SnapshotMetadata, + // If it is true, the next snapshot will return a + // SnapshotTemporarilyUnavailable error. + trigger_snap_unavailable: bool, + // Peers that are fetching entries asynchronously. + trigger_log_unavailable: bool, + // Stores get entries context. + get_entries_context: Option, +} + +impl MemStorageCore { + /// Saves the current HardState. + pub fn set_hardstate(&mut self, hs: HardState) { + self.raft_state.hard_state = hs; + } + + /// Get the hard state. + pub fn hard_state(&self) -> &HardState { + &self.raft_state.hard_state + } + + /// Get the mut hard state. + pub fn mut_hard_state(&mut self) -> &mut HardState { + &mut self.raft_state.hard_state + } + + /// Commit to an index. + /// + /// # Panics + /// + /// Panics if there is no such entry in raft logs. + pub fn commit_to(&mut self, index: u64) -> Result<()> { + assert!( + self.has_entry_at(index), + "commit_to {} but the entry does not exist", + index + ); + + let diff = (index - self.entries[0].index) as usize; + self.raft_state.hard_state.commit = index; + self.raft_state.hard_state.term = self.entries[diff].term; + Ok(()) + } + + /// Saves the current conf state. + pub fn set_conf_state(&mut self, cs: ConfState) { + self.raft_state.conf_state = cs; + } + + #[inline] + fn has_entry_at(&self, index: u64) -> bool { + !self.entries.is_empty() && index >= self.first_index() && index <= self.last_index() + } + + fn first_index(&self) -> u64 { + match self.entries.first() { + Some(e) => e.index, + None => self.snapshot_metadata.index + 1, + } + } + + fn last_index(&self) -> u64 { + match self.entries.last() { + Some(e) => e.index, + None => self.snapshot_metadata.index, + } + } + + /// Overwrites the contents of this Storage object with those of the given snapshot. 
+ /// + /// # Panics + /// + /// Panics if the snapshot index is less than the storage's first index. + pub fn apply_snapshot(&mut self, mut snapshot: Snapshot) -> Result<()> { + let mut meta = snapshot.take_metadata(); + let index = meta.index; + + if self.first_index() > index { + return Err(Error::Store(StorageError::SnapshotOutOfDate)); + } + + self.snapshot_metadata = meta.clone(); + + self.raft_state.hard_state.term = cmp::max(self.raft_state.hard_state.term, meta.term); + self.raft_state.hard_state.commit = index; + self.entries.clear(); + + // Update conf states. + self.raft_state.conf_state = meta.take_conf_state(); + Ok(()) + } + + fn snapshot(&self) -> Snapshot { + let mut snapshot = Snapshot::default(); + + // We assume all entries whose indexes are less than `hard_state.commit` + // have been applied, so use the latest commit index to construct the snapshot. + // TODO: This is not true for async ready. + let meta = snapshot.mut_metadata(); + meta.index = self.raft_state.hard_state.commit; + meta.term = match meta.index.cmp(&self.snapshot_metadata.index) { + cmp::Ordering::Equal => self.snapshot_metadata.term, + cmp::Ordering::Greater => { + let offset = self.entries[0].index; + self.entries[(meta.index - offset) as usize].term + } + cmp::Ordering::Less => { + panic!( + "commit {} < snapshot_metadata.index {}", + meta.index, self.snapshot_metadata.index + ); + } + }; + + meta.set_conf_state(self.raft_state.conf_state.clone()); + snapshot + } + + /// Discards all log entries prior to compact_index. + /// It is the application's responsibility to not attempt to compact an index + /// greater than RaftLog.applied. + /// + /// # Panics + /// + /// Panics if `compact_index` is higher than `Storage::last_index(&self) + 1`. + pub fn compact(&mut self, compact_index: u64) -> Result<()> { + if compact_index <= self.first_index() { + // Don't need to treat this case as an error. 
+ return Ok(()); + } + + if compact_index > self.last_index() + 1 { + panic!( + "compact not received raft logs: {}, last index: {}", + compact_index, + self.last_index() + ); + } + + if let Some(entry) = self.entries.first() { + let offset = compact_index - entry.index; + self.entries.drain(..offset as usize); + } + Ok(()) + } + + /// Append the new entries to storage. + /// + /// # Panics + /// + /// Panics if `ents` contains compacted entries, or there's a gap between `ents` and the last + /// received entry in the storage. + pub fn append(&mut self, ents: &[Entry]) -> Result<()> { + if ents.is_empty() { + return Ok(()); + } + if self.first_index() > ents[0].index { + panic!( + "overwrite compacted raft logs, compacted: {}, append: {}", + self.first_index() - 1, + ents[0].index, + ); + } + if self.last_index() + 1 < ents[0].index { + panic!( + "raft logs should be continuous, last index: {}, new appended: {}", + self.last_index(), + ents[0].index, + ); + } + + // Remove all entries overwritten by `ents`. + let diff = ents[0].index - self.first_index(); + self.entries.drain(diff as usize..); + self.entries.extend_from_slice(ents); + Ok(()) + } + + /// Commit to `idx` and set configuration to the given states. Only used for tests. + pub fn commit_to_and_set_conf_states(&mut self, idx: u64, cs: Option) -> Result<()> { + self.commit_to(idx)?; + if let Some(cs) = cs { + self.raft_state.conf_state = cs; + } + Ok(()) + } + + /// Trigger a SnapshotTemporarilyUnavailable error. + pub fn trigger_snap_unavailable(&mut self) { + self.trigger_snap_unavailable = true; + } + + /// Set a LogTemporarilyUnavailable error. + pub fn trigger_log_unavailable(&mut self, v: bool) { + self.trigger_log_unavailable = v; + } + + /// Take get entries context. + pub fn take_get_entries_context(&mut self) -> Option { + self.get_entries_context.take() + } +} + +/// `MemStorage` is a thread-safe but incomplete implementation of `Storage`, mainly for tests. 
+/// +/// A real `Storage` should save both raft logs and applied data. However `MemStorage` only +/// contains raft logs. So you can call `MemStorage::append` to persist newly received unstable raft +/// logs and then access them with `Storage` APIs. The only exception is `Storage::snapshot`. There +/// is no data in `Snapshot` returned by `MemStorage::snapshot` because applied data is not stored +/// in `MemStorage`. +#[derive(Clone, Default)] +pub struct MemStorage { + core: Arc>, +} + +impl MemStorage { + /// Returns a new memory storage value. + pub fn new() -> MemStorage { + MemStorage { + ..Default::default() + } + } + + /// Create a new `MemStorage` with a given `Config`. The given `Config` will be used to + /// initialize the storage. + /// + /// You should use the same input to initialize all nodes. + pub fn new_with_conf_state(conf_state: T) -> MemStorage + where + ConfState: From, + { + let store = MemStorage::new(); + store.initialize_with_conf_state(conf_state); + store + } + + /// Initialize a `MemStorage` with a given `Config`. + /// + /// You should use the same input to initialize all nodes. + pub fn initialize_with_conf_state(&self, conf_state: T) + where + ConfState: From, + { + assert!(!self.initial_state().unwrap().initialized()); + let mut core = self.wl(); + // Setting initial state is very important to build a correct raft, as raft algorithm + // itself only guarantees logs consistency. Typically, you need to ensure either all start + // states are the same on all nodes, or new nodes always catch up logs by snapshot first. + // + // In practice, we choose the second way by assigning non-zero index to first index. Here + // we choose the first way for historical reason and easier to write tests. + core.raft_state.conf_state = ConfState::from(conf_state); + } + + /// Opens up a read lock on the storage and returns a guard handle. Use this + /// with functions that don't require mutation. 
+ pub fn rl(&self) -> RwLockReadGuard<'_, MemStorageCore> { + self.core.read().unwrap() + } + + /// Opens up a write lock on the storage and returns guard handle. Use this + /// with functions that take a mutable reference to self. + pub fn wl(&self) -> RwLockWriteGuard<'_, MemStorageCore> { + self.core.write().unwrap() + } +} + +impl Storage for MemStorage { + /// Implements the Storage trait. + fn initial_state(&self) -> Result { + Ok(self.rl().raft_state.clone()) + } + + /// Implements the Storage trait. + fn entries( + &self, + low: u64, + high: u64, + max_size: impl Into>, + context: GetEntriesContext, + ) -> Result> { + let max_size = max_size.into(); + let mut core = self.wl(); + if low < core.first_index() { + return Err(Error::Store(StorageError::Compacted)); + } + + if high > core.last_index() + 1 { + panic!( + "index out of bound (last: {}, high: {})", + core.last_index() + 1, + high + ); + } + + if core.trigger_log_unavailable && context.can_async() { + core.get_entries_context = Some(context); + return Err(Error::Store(StorageError::LogTemporarilyUnavailable)); + } + + let offset = core.entries[0].index; + let lo = (low - offset) as usize; + let hi = (high - offset) as usize; + let mut ents = core.entries[lo..hi].to_vec(); + limit_size(&mut ents, max_size); + Ok(ents) + } + + /// Implements the Storage trait. + fn term(&self, idx: u64) -> Result { + let core = self.rl(); + if idx == core.snapshot_metadata.index { + return Ok(core.snapshot_metadata.term); + } + + let offset = core.first_index(); + if idx < offset { + return Err(Error::Store(StorageError::Compacted)); + } + + if idx > core.last_index() { + return Err(Error::Store(StorageError::Unavailable)); + } + Ok(core.entries[(idx - offset) as usize].term) + } + + /// Implements the Storage trait. + fn first_index(&self) -> Result { + Ok(self.rl().first_index()) + } + + /// Implements the Storage trait. 
+ fn last_index(&self) -> Result { + Ok(self.rl().last_index()) + } + + /// Implements the Storage trait. + fn snapshot(&self, request_index: u64, _to: u64) -> Result { + let mut core = self.wl(); + if core.trigger_snap_unavailable { + core.trigger_snap_unavailable = false; + Err(Error::Store(StorageError::SnapshotTemporarilyUnavailable)) + } else { + let mut snap = core.snapshot(); + if snap.get_metadata().index < request_index { + snap.mut_metadata().index = request_index; + } + Ok(snap) + } + } +} + +#[cfg(test)] +mod test { + use std::panic::{self, AssertUnwindSafe}; + + use protobuf::Message as PbMessage; + + use crate::eraftpb::{ConfState, Entry, Snapshot}; + use crate::errors::{Error as RaftError, StorageError}; + + use super::{GetEntriesContext, MemStorage, Storage}; + + fn new_entry(index: u64, term: u64) -> Entry { + let mut e = Entry::default(); + e.term = term; + e.index = index; + e + } + + fn size_of(m: &T) -> u32 { + m.compute_size() + } + + fn new_snapshot(index: u64, term: u64, voters: Vec) -> Snapshot { + let mut s = Snapshot::default(); + s.mut_metadata().index = index; + s.mut_metadata().term = term; + s.mut_metadata().mut_conf_state().voters = voters; + s + } + + #[test] + fn test_storage_term() { + let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; + let mut tests = vec![ + (2, Err(RaftError::Store(StorageError::Compacted))), + (3, Ok(3)), + (4, Ok(4)), + (5, Ok(5)), + (6, Err(RaftError::Store(StorageError::Unavailable))), + ]; + + for (i, (idx, wterm)) in tests.drain(..).enumerate() { + let storage = MemStorage::new(); + storage.wl().entries = ents.clone(); + + let t = storage.term(idx); + if t != wterm { + panic!("#{}: expect res {:?}, got {:?}", i, wterm, t); + } + } + } + + #[test] + fn test_storage_entries() { + let ents = vec![ + new_entry(3, 3), + new_entry(4, 4), + new_entry(5, 5), + new_entry(6, 6), + ]; + let max_u64 = u64::MAX; + let mut tests = vec![ + ( + 2, + 6, + max_u64, + 
Err(RaftError::Store(StorageError::Compacted)), + ), + (3, 4, max_u64, Ok(vec![new_entry(3, 3)])), + (4, 5, max_u64, Ok(vec![new_entry(4, 4)])), + (4, 6, max_u64, Ok(vec![new_entry(4, 4), new_entry(5, 5)])), + ( + 4, + 7, + max_u64, + Ok(vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 6)]), + ), + // even if maxsize is zero, the first entry should be returned + (4, 7, 0, Ok(vec![new_entry(4, 4)])), + // limit to 2 + ( + 4, + 7, + u64::from(size_of(&ents[1]) + size_of(&ents[2])), + Ok(vec![new_entry(4, 4), new_entry(5, 5)]), + ), + ( + 4, + 7, + u64::from(size_of(&ents[1]) + size_of(&ents[2]) + size_of(&ents[3]) / 2), + Ok(vec![new_entry(4, 4), new_entry(5, 5)]), + ), + ( + 4, + 7, + u64::from(size_of(&ents[1]) + size_of(&ents[2]) + size_of(&ents[3]) - 1), + Ok(vec![new_entry(4, 4), new_entry(5, 5)]), + ), + // all + ( + 4, + 7, + u64::from(size_of(&ents[1]) + size_of(&ents[2]) + size_of(&ents[3])), + Ok(vec![new_entry(4, 4), new_entry(5, 5), new_entry(6, 6)]), + ), + ]; + for (i, (lo, hi, maxsize, wentries)) in tests.drain(..).enumerate() { + let storage = MemStorage::new(); + storage.wl().entries = ents.clone(); + let e = storage.entries(lo, hi, maxsize, GetEntriesContext::empty(false)); + if e != wentries { + panic!("#{}: expect entries {:?}, got {:?}", i, wentries, e); + } + } + } + + #[test] + fn test_storage_last_index() { + let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; + let storage = MemStorage::new(); + storage.wl().entries = ents; + + let wresult = Ok(5); + let result = storage.last_index(); + if result != wresult { + panic!("want {:?}, got {:?}", wresult, result); + } + + storage.wl().append(&[new_entry(6, 5)]).unwrap(); + let wresult = Ok(6); + let result = storage.last_index(); + if result != wresult { + panic!("want {:?}, got {:?}", wresult, result); + } + } + + #[test] + fn test_storage_first_index() { + let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; + let storage = MemStorage::new(); + 
storage.wl().entries = ents; + + assert_eq!(storage.first_index(), Ok(3)); + storage.wl().compact(4).unwrap(); + assert_eq!(storage.first_index(), Ok(4)); + } + + #[test] + fn test_storage_compact() { + let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; + let mut tests = vec![(2, 3, 3, 3), (3, 3, 3, 3), (4, 4, 4, 2), (5, 5, 5, 1)]; + for (i, (idx, windex, wterm, wlen)) in tests.drain(..).enumerate() { + let storage = MemStorage::new(); + storage.wl().entries = ents.clone(); + + storage.wl().compact(idx).unwrap(); + let index = storage.first_index().unwrap(); + if index != windex { + panic!("#{}: want {}, index {}", i, windex, index); + } + let term = if let Ok(v) = + storage.entries(index, index + 1, 1, GetEntriesContext::empty(false)) + { + v.first().map_or(0, |e| e.term) + } else { + 0 + }; + if term != wterm { + panic!("#{}: want {}, term {}", i, wterm, term); + } + let last = storage.last_index().unwrap(); + let len = storage + .entries(index, last + 1, 100, GetEntriesContext::empty(false)) + .unwrap() + .len(); + if len != wlen { + panic!("#{}: want {}, term {}", i, wlen, len); + } + } + } + + #[test] + fn test_storage_create_snapshot() { + let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; + let nodes = vec![1, 2, 3]; + let mut conf_state = ConfState::default(); + conf_state.voters = nodes.clone(); + + let unavailable = Err(RaftError::Store( + StorageError::SnapshotTemporarilyUnavailable, + )); + let mut tests = vec![ + (4, Ok(new_snapshot(4, 4, nodes.clone())), 0), + (5, Ok(new_snapshot(5, 5, nodes.clone())), 5), + (5, Ok(new_snapshot(6, 5, nodes)), 6), + (5, unavailable, 6), + ]; + for (i, (idx, wresult, windex)) in tests.drain(..).enumerate() { + let storage = MemStorage::new(); + storage.wl().entries = ents.clone(); + storage.wl().raft_state.hard_state.commit = idx; + storage.wl().raft_state.hard_state.term = idx; + storage.wl().raft_state.conf_state = conf_state.clone(); + + if wresult.is_err() { + 
storage.wl().trigger_snap_unavailable(); + } + + let result = storage.snapshot(windex, 0); + if result != wresult { + panic!("#{}: want {:?}, got {:?}", i, wresult, result); + } + } + } + + #[test] + fn test_storage_append() { + let ents = vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]; + let mut tests = vec![ + ( + vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)], + Some(vec![new_entry(3, 3), new_entry(4, 4), new_entry(5, 5)]), + ), + ( + vec![new_entry(3, 3), new_entry(4, 6), new_entry(5, 6)], + Some(vec![new_entry(3, 3), new_entry(4, 6), new_entry(5, 6)]), + ), + ( + vec![ + new_entry(3, 3), + new_entry(4, 4), + new_entry(5, 5), + new_entry(6, 5), + ], + Some(vec![ + new_entry(3, 3), + new_entry(4, 4), + new_entry(5, 5), + new_entry(6, 5), + ]), + ), + // overwrite compacted raft logs is not allowed + ( + vec![new_entry(2, 3), new_entry(3, 3), new_entry(4, 5)], + None, + ), + // truncate the existing entries and append + ( + vec![new_entry(4, 5)], + Some(vec![new_entry(3, 3), new_entry(4, 5)]), + ), + // direct append + ( + vec![new_entry(6, 6)], + Some(vec![ + new_entry(3, 3), + new_entry(4, 4), + new_entry(5, 5), + new_entry(6, 6), + ]), + ), + ]; + for (i, (entries, wentries)) in tests.drain(..).enumerate() { + let storage = MemStorage::new(); + storage.wl().entries = ents.clone(); + let res = panic::catch_unwind(AssertUnwindSafe(|| storage.wl().append(&entries))); + if let Some(wentries) = wentries { + let _ = res.unwrap(); + let e = &storage.wl().entries; + if *e != wentries { + panic!("#{}: want {:?}, entries {:?}", i, wentries, e); + } + } else { + res.unwrap_err(); + } + } + } + + #[test] + fn test_storage_apply_snapshot() { + let nodes = vec![1, 2, 3]; + let storage = MemStorage::new(); + + // Apply snapshot successfully + let snap = new_snapshot(4, 4, nodes.clone()); + storage.wl().apply_snapshot(snap).unwrap(); + + // Apply snapshot fails due to StorageError::SnapshotOutOfDate + let snap = new_snapshot(3, 3, nodes); + 
storage.wl().apply_snapshot(snap).unwrap_err(); + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/tracker.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/tracker.rs new file mode 100644 index 0000000..4814d38 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/tracker.rs @@ -0,0 +1,388 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +// Copyright 2015 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod inflights; +mod progress; +mod state; + +pub use self::inflights::Inflights; +pub use self::progress::Progress; +pub use self::state::ProgressState; + +use crate::confchange::{MapChange, MapChangeType}; +use crate::eraftpb::ConfState; +use crate::quorum::{AckedIndexer, Index, VoteResult}; +use crate::{DefaultHashBuilder, HashMap, HashSet, JointConfig}; +use getset::Getters; +use std::fmt::Debug; + +/// Config reflects the configuration tracked in a ProgressTracker. +#[derive(Clone, Debug, Default, PartialEq, Eq, Getters)] +pub struct Configuration { + #[get = "pub"] + pub(crate) voters: JointConfig, + /// Learners is a set of IDs corresponding to the learners active in the + /// current configuration. + /// + /// Invariant: Learners and Voters does not intersect, i.e. if a peer is in + /// either half of the joint config, it can't be a learner; if it is a + /// learner it can't be in either half of the joint config. 
This invariant + /// simplifies the implementation since it allows peers to have clarity about + /// its current role without taking into account joint consensus. + #[get = "pub"] + pub(crate) learners: HashSet, + /// When we turn a voter into a learner during a joint consensus transition, + /// we cannot add the learner directly when entering the joint state. This is + /// because this would violate the invariant that the intersection of + /// voters and learners is empty. For example, assume a Voter is removed and + /// immediately re-added as a learner (or in other words, it is demoted): + /// + /// Initially, the configuration will be + /// + /// voters: {1 2 3} + /// learners: {} + /// + /// and we want to demote 3. Entering the joint configuration, we naively get + /// + /// voters: {1 2} & {1 2 3} + /// learners: {3} + /// + /// but this violates the invariant (3 is both voter and learner). Instead, + /// we get + /// + /// voters: {1 2} & {1 2 3} + /// learners: {} + /// next_learners: {3} + /// + /// Where 3 is now still purely a voter, but we are remembering the intention + /// to make it a learner upon transitioning into the final configuration: + /// + /// voters: {1 2} + /// learners: {3} + /// next_learners: {} + /// + /// Note that next_learners is not used while adding a learner that is not + /// also a voter in the joint config. In this case, the learner is added + /// right away when entering the joint configuration, so that it is caught up + /// as soon as possible. + #[get = "pub"] + pub(crate) learners_next: HashSet, + /// True if the configuration is joint and a transition to the incoming + /// configuration should be carried out automatically by Raft when this is + /// possible. If false, the configuration will be joint until the application + /// initiates the transition manually. 
+ #[get = "pub"] + pub(crate) auto_leave: bool, +} + +// Display and crate::itertools used only for test +#[cfg(test)] +impl std::fmt::Display for Configuration { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use itertools::Itertools; + if self.voters.outgoing.is_empty() { + write!(f, "voters={}", self.voters.incoming)? + } else { + write!( + f, + "voters={}&&{}", + self.voters.incoming, self.voters.outgoing + )? + } + if !self.learners.is_empty() { + write!( + f, + " learners=({})", + self.learners + .iter() + .sorted_by(|&a, &b| a.cmp(b)) + .map(|x| x.to_string()) + .collect::>() + .join(" ") + )? + } + if !self.learners_next.is_empty() { + write!( + f, + " learners_next=({})", + self.learners_next + .iter() + .map(|x| x.to_string()) + .collect::>() + .join(" ") + )? + } + if self.auto_leave { + write!(f, " autoleave")? + } + Ok(()) + } +} + +impl Configuration { + /// Create a new configuration with the given configuration. + pub fn new( + voters: impl IntoIterator, + learners: impl IntoIterator, + ) -> Self { + Self { + voters: JointConfig::new(voters.into_iter().collect()), + auto_leave: false, + learners: learners.into_iter().collect(), + learners_next: HashSet::default(), + } + } + + fn with_capacity(voters: usize, learners: usize) -> Self { + Self { + voters: JointConfig::with_capacity(voters), + learners: HashSet::with_capacity_and_hasher(learners, DefaultHashBuilder::default()), + learners_next: HashSet::default(), + auto_leave: false, + } + } + + /// Create a new `ConfState` from the configuration itself. + pub fn to_conf_state(&self) -> ConfState { + // Note: Different from etcd, we don't sort. 
+ let mut state = ConfState::default(); + state.set_voters(self.voters.incoming.raw_slice()); + state.set_voters_outgoing(self.voters.outgoing.raw_slice()); + state.set_learners(self.learners.iter().cloned().collect()); + state.set_learners_next(self.learners_next.iter().cloned().collect()); + state.auto_leave = self.auto_leave; + state + } + + fn clear(&mut self) { + self.voters.clear(); + self.learners.clear(); + self.learners_next.clear(); + self.auto_leave = false; + } +} + +pub type ProgressMap = HashMap; + +impl AckedIndexer for ProgressMap { + fn acked_index(&self, voter_id: u64) -> Option { + self.get(&voter_id).map(|p| Index { + index: p.matched, + group_id: p.commit_group_id, + }) + } +} + +/// `ProgressTracker` contains several `Progress`es, +/// which could be `Leader`, `Follower` and `Learner`. +#[derive(Clone, Getters)] +pub struct ProgressTracker { + progress: ProgressMap, + + /// The current configuration state of the cluster. + #[get = "pub"] + conf: Configuration, + #[doc(hidden)] + #[get = "pub"] + votes: HashMap, + #[get = "pub(crate)"] + max_inflight: usize, + + group_commit: bool, +} + +impl ProgressTracker { + /// Creates a new ProgressTracker. + pub fn new(max_inflight: usize) -> Self { + Self::with_capacity(0, 0, max_inflight) + } + + /// Create a progress set with the specified sizes already reserved. + pub fn with_capacity(voters: usize, learners: usize, max_inflight: usize) -> Self { + ProgressTracker { + progress: HashMap::with_capacity_and_hasher( + voters + learners, + DefaultHashBuilder::default(), + ), + conf: Configuration::with_capacity(voters, learners), + votes: HashMap::with_capacity_and_hasher(voters, DefaultHashBuilder::default()), + max_inflight, + group_commit: false, + } + } + + /// Configures group commit. + pub fn enable_group_commit(&mut self, enable: bool) { + self.group_commit = enable; + } + + /// Whether enable group commit. 
+ pub fn group_commit(&self) -> bool { + self.group_commit + } + + pub(crate) fn clear(&mut self) { + self.progress.clear(); + self.conf.clear(); + self.votes.clear(); + } + + /// Returns true if (and only if) there is only one voting member + /// (i.e. the leader) in the current configuration. + pub fn is_singleton(&self) -> bool { + self.conf.voters.is_singleton() + } + + /// Grabs a reference to the progress of a node. + #[inline] + pub fn get(&self, id: u64) -> Option<&Progress> { + self.progress.get(&id) + } + + /// Grabs a mutable reference to the progress of a node. + #[inline] + pub fn get_mut(&mut self, id: u64) -> Option<&mut Progress> { + self.progress.get_mut(&id) + } + + /// Returns an iterator across all the nodes and their progress. + /// + /// **Note:** Do not use this for majority/quorum calculation. The Raft node may be + /// transitioning to a new configuration and have two quorums. Use `has_quorum` instead. + #[inline] + pub fn iter(&self) -> impl ExactSizeIterator { + self.progress.iter() + } + + /// Returns a mutable iterator across all the nodes and their progress. + /// + /// **Note:** Do not use this for majority/quorum calculation. The Raft node may be + /// transitioning to a new configuration and have two quorums. Use `has_quorum` instead. + #[inline] + pub fn iter_mut(&mut self) -> impl ExactSizeIterator { + self.progress.iter_mut() + } + + /// Returns the maximal committed index for the cluster. The bool flag indicates whether + /// the index is computed by group commit algorithm successfully. + /// + /// Eg. If the matched indexes are `[2,2,2,4,5]`, it will return `2`. + /// If the matched indexes and groups are `[(1, 1), (2, 2), (3, 2)]`, it will return `1`. + pub fn maximal_committed_index(&mut self) -> (u64, bool) { + self.conf + .voters + .committed_index(self.group_commit, &self.progress) + } + + /// Prepares for a new round of vote counting via recordVote. 
+ pub fn reset_votes(&mut self) { + self.votes.clear(); + } + + /// Records that the node with the given id voted for this Raft + /// instance if v == true (and declined it otherwise). + pub fn record_vote(&mut self, id: u64, vote: bool) { + self.votes.entry(id).or_insert(vote); + } + + /// TallyVotes returns the number of granted and rejected Votes, and whether the + /// election outcome is known. + pub fn tally_votes(&self) -> (usize, usize, VoteResult) { + // Make sure to populate granted/rejected correctly even if the Votes slice + // contains members no longer part of the configuration. This doesn't really + // matter in the way the numbers are used (they're informational), but might + // as well get it right. + let (mut granted, mut rejected) = (0, 0); + for (id, vote) in &self.votes { + if !self.conf.voters.contains(*id) { + continue; + } + if *vote { + granted += 1; + } else { + rejected += 1; + } + } + let result = self.vote_result(&self.votes); + (granted, rejected, result) + } + + /// Returns the Candidate's eligibility in the current election. + /// + /// If it is still eligible, it should continue polling nodes and checking. + /// Eventually, the election will result in this returning either `Elected` + /// or `Ineligible`, meaning the election can be concluded. + pub fn vote_result(&self, votes: &HashMap) -> VoteResult { + self.conf.voters.vote_result(|id| votes.get(&id).cloned()) + } + + /// Determines if the current quorum is active according to the this raft node. + /// Doing this will set the `recent_active` of each peer to false. + /// + /// This should only be called by the leader. 
+ pub fn quorum_recently_active(&mut self, perspective_of: u64) -> bool { + let mut active = + HashSet::with_capacity_and_hasher(self.progress.len(), DefaultHashBuilder::default()); + for (id, pr) in &mut self.progress { + if *id == perspective_of { + pr.recent_active = true; + active.insert(*id); + } else if pr.recent_active { + // It doesn't matter whether it's learner. As we calculate quorum + // by actual ids instead of count. + active.insert(*id); + pr.recent_active = false; + } + } + self.has_quorum(&active) + } + + /// Determine if a quorum is formed from the given set of nodes. + /// + /// This is the only correct way to verify you have reached a quorum for the whole group. + #[inline] + pub fn has_quorum(&self, potential_quorum: &HashSet) -> bool { + self.conf + .voters + .vote_result(|id| potential_quorum.get(&id).map(|_| true)) + == VoteResult::Won + } + + #[inline] + pub(crate) fn progress(&self) -> &ProgressMap { + &self.progress + } + + /// Applies configuration and updates progress map to match the configuration. + pub fn apply_conf(&mut self, conf: Configuration, changes: MapChange, next_idx: u64) { + self.conf = conf; + for (id, change_type) in changes { + match change_type { + MapChangeType::Add => { + let mut pr = Progress::new(next_idx, self.max_inflight); + // When a node is first added, we should mark it as recently active. + // Otherwise, CheckQuorum may cause us to step down if it is invoked + // before the added node has had a chance to communicate with us. + pr.recent_active = true; + self.progress.insert(id, pr); + } + MapChangeType::Remove => { + self.progress.remove(&id); + } + } + } + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/tracker/inflights.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/tracker/inflights.rs new file mode 100644 index 0000000..8ff73dc --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/tracker/inflights.rs @@ -0,0 +1,426 @@ +// Copyright 2019 TiKV Project Authors. 
Licensed under Apache-2.0. + +// Copyright 2015 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::cmp::Ordering; + +/// A buffer of inflight messages. +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct Inflights { + // the starting index in the buffer + start: usize, + // number of inflights in the buffer + count: usize, + + // ring buffer + buffer: Vec, + + // capacity + cap: usize, + + // To support dynamically change inflight size. + incoming_cap: Option, +} + +impl Inflights { + /// Creates a new buffer for inflight messages. + pub fn new(cap: usize) -> Inflights { + Inflights { + buffer: Vec::with_capacity(cap), + start: 0, + count: 0, + cap, + incoming_cap: None, + } + } + + /// Adjust inflight buffer capacity. Set it to `0` will disable the progress. + // Calling it between `self.full()` and `self.add()` can cause a panic. 
+ pub fn set_cap(&mut self, incoming_cap: usize) { + match self.cap.cmp(&incoming_cap) { + Ordering::Equal => self.incoming_cap = None, + Ordering::Less => { + if self.start + self.count <= self.cap { + if self.buffer.capacity() > 0 { + self.buffer.reserve(incoming_cap - self.buffer.len()); + } + } else { + debug_assert_eq!(self.cap, self.buffer.len()); + let mut buffer = Vec::with_capacity(incoming_cap); + buffer.extend_from_slice(&self.buffer[self.start..]); + buffer.extend_from_slice(&self.buffer[0..self.count - (self.cap - self.start)]); + self.buffer = buffer; + self.start = 0; + } + self.cap = incoming_cap; + self.incoming_cap = None; + } + Ordering::Greater => { + if self.count == 0 { + self.cap = incoming_cap; + self.incoming_cap = None; + self.start = 0; + if self.buffer.capacity() > 0 { + self.buffer = Vec::with_capacity(incoming_cap); + } + } else { + self.incoming_cap = Some(incoming_cap); + } + } + } + } + + /// Returns true if the inflights is full. + #[inline] + pub fn full(&self) -> bool { + self.count == self.cap || self.incoming_cap.is_some_and(|cap| self.count >= cap) + } + + /// Adds an inflight into inflights + pub fn add(&mut self, inflight: u64) { + if self.full() { + panic!("cannot add into a full inflights") + } + + if self.buffer.capacity() == 0 { + debug_assert_eq!(self.count, 0); + debug_assert_eq!(self.start, 0); + debug_assert!(self.incoming_cap.is_none()); + self.buffer = Vec::with_capacity(self.cap); + } + + let mut next = self.start + self.count; + if next >= self.cap { + next -= self.cap; + } + assert!(next <= self.buffer.len()); + if next == self.buffer.len() { + self.buffer.push(inflight); + } else { + self.buffer[next] = inflight; + } + self.count += 1; + } + + /// Frees the inflights smaller or equal to the given `to` flight. 
+ pub fn free_to(&mut self, to: u64) { + if self.count == 0 || to < self.buffer[self.start] { + // out of the left side of the window + return; + } + + let mut i = 0usize; + let mut idx = self.start; + while i < self.count { + if to < self.buffer[idx] { + // found the first large inflight + break; + } + + // increase index and maybe rotate + idx += 1; + if idx >= self.cap { + idx -= self.cap; + } + + i += 1; + } + + // free i inflights and set new start index + self.count -= i; + self.start = idx; + + if self.count == 0 { + if let Some(incoming_cap) = self.incoming_cap.take() { + self.start = 0; + self.cap = incoming_cap; + self.buffer = Vec::with_capacity(self.cap); + } + } + } + + /// Frees the first buffer entry. + #[inline] + pub fn free_first_one(&mut self) { + if self.count > 0 { + let start = self.buffer[self.start]; + self.free_to(start); + } + } + + /// Frees all inflights. + #[inline] + pub fn reset(&mut self) { + self.count = 0; + self.start = 0; + self.buffer = vec![]; + self.cap = self.incoming_cap.take().unwrap_or(self.cap); + } + + // Number of inflight messages. It's for tests. + #[doc(hidden)] + #[inline] + pub fn count(&self) -> usize { + self.count + } + + // Capacity of the internal buffer. + #[doc(hidden)] + #[inline] + pub fn buffer_capacity(&self) -> usize { + self.buffer.capacity() + } + + // Whether buffer is allocated or not. It's for tests. 
+ #[doc(hidden)] + #[inline] + pub fn buffer_is_allocated(&self) -> bool { + self.buffer_capacity() > 0 + } + + /// Free unused memory + #[inline] + pub fn maybe_free_buffer(&mut self) { + if self.count == 0 { + self.start = 0; + self.buffer = vec![]; + debug_assert_eq!(self.buffer.capacity(), 0); + } + } +} + +#[cfg(test)] +mod tests { + use super::Inflights; + + #[test] + fn test_inflight_add() { + let mut inflight = Inflights::new(10); + for i in 0..5 { + inflight.add(i); + } + + let wantin = Inflights { + start: 0, + count: 5, + buffer: vec![0, 1, 2, 3, 4], + cap: 10, + incoming_cap: None, + }; + + assert_eq!(inflight, wantin); + + for i in 5..10 { + inflight.add(i); + } + + let wantin2 = Inflights { + start: 0, + count: 10, + buffer: vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + cap: 10, + incoming_cap: None, + }; + + assert_eq!(inflight, wantin2); + + let mut inflight2 = Inflights::new(10); + inflight2.start = 5; + inflight2.buffer.extend_from_slice(&[0, 0, 0, 0, 0]); + + for i in 0..5 { + inflight2.add(i); + } + + let wantin21 = Inflights { + start: 5, + count: 5, + buffer: vec![0, 0, 0, 0, 0, 0, 1, 2, 3, 4], + cap: 10, + incoming_cap: None, + }; + + assert_eq!(inflight2, wantin21); + + for i in 5..10 { + inflight2.add(i); + } + + let wantin22 = Inflights { + start: 5, + count: 10, + buffer: vec![5, 6, 7, 8, 9, 0, 1, 2, 3, 4], + cap: 10, + incoming_cap: None, + }; + + assert_eq!(inflight2, wantin22); + } + + #[test] + fn test_inflight_free_to() { + let mut inflight = Inflights::new(10); + for i in 0..10 { + inflight.add(i); + } + + inflight.free_to(4); + + let wantin = Inflights { + start: 5, + count: 5, + buffer: vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + cap: 10, + incoming_cap: None, + }; + + assert_eq!(inflight, wantin); + + inflight.free_to(8); + + let wantin2 = Inflights { + start: 9, + count: 1, + buffer: vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + cap: 10, + incoming_cap: None, + }; + + assert_eq!(inflight, wantin2); + + for i in 10..15 { + inflight.add(i); + } + + 
inflight.free_to(12); + + let wantin3 = Inflights { + start: 3, + count: 2, + buffer: vec![10, 11, 12, 13, 14, 5, 6, 7, 8, 9], + cap: 10, + incoming_cap: None, + }; + + assert_eq!(inflight, wantin3); + + inflight.free_to(14); + + let wantin4 = Inflights { + start: 5, + count: 0, + buffer: vec![10, 11, 12, 13, 14, 5, 6, 7, 8, 9], + cap: 10, + incoming_cap: None, + }; + + assert_eq!(inflight, wantin4); + } + + #[test] + fn test_inflight_free_first_one() { + let mut inflight = Inflights::new(10); + for i in 0..10 { + inflight.add(i); + } + + inflight.free_first_one(); + + let wantin = Inflights { + start: 1, + count: 9, + buffer: vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + cap: 10, + incoming_cap: None, + }; + + assert_eq!(inflight, wantin); + } + + #[test] + fn test_inflights_set_cap() { + // Prepare 3 `Inflights` with 16 items, but start at 16, 112 and 120. + let mut inflights = Vec::with_capacity(3); + for &start in &[16, 112, 120] { + let mut inflight = Inflights::new(128); + (0..start).for_each(|i| inflight.add(i)); + inflight.free_to(start - 1); + (0..16).for_each(|i| inflight.add(i)); + assert_eq!(inflight.count(), 16); + assert_eq!(inflight.start, start as usize); + inflights.push(inflight); + } + + // Adjust cap to a larger value. + for (i, inflight) in inflights.iter_mut().enumerate() { + inflight.set_cap(1024); + assert_eq!(inflight.cap, 1024); + assert_eq!(inflight.incoming_cap, None); + assert_eq!(inflight.buffer_capacity(), 1024); + if i < 2 { + // The internal buffer is extended directly. + assert_ne!(inflight.start, 0); + } else { + // The internal buffer is re-allocated instead of extended. + assert_eq!(inflight.start, 0); + } + } + + // Prepare 3 `Inflights` with given `start`, `count` and `buffer_cap`. 
+ let mut inflights = Vec::with_capacity(3); + for &(start, count, buffer_cap) in &[(1, 0, 0), (1, 0, 128), (1, 8, 128)] { + let mut inflight = Inflights::new(128); + inflight.start = start; + inflight.buffer = vec![0; buffer_cap]; + (0..count).for_each(|i| inflight.add(i)); + inflights.push(inflight); + } + + // Adjust cap to a less value. + for (i, inflight) in inflights.iter_mut().enumerate() { + inflight.set_cap(64); + if i == 0 || i == 1 { + assert_eq!(inflight.cap, 64); + assert_eq!(inflight.incoming_cap, None); + assert_eq!(inflight.start, 0); + if i == 0 { + assert_eq!(inflight.buffer.capacity(), 0) + } else { + assert_eq!(inflight.buffer.capacity(), 64) + } + } else { + assert_eq!(inflight.cap, 128); + assert_eq!(inflight.incoming_cap, Some(64)); + assert_eq!(inflight.start, 1); + assert_eq!(inflight.buffer.capacity(), 128) + } + } + + // `incoming_cap` can be cleared if the buffer is freed totally. + let mut inflight = inflights[2].clone(); + inflight.free_to(7); + assert_eq!(inflight.cap, 64); + assert_eq!(inflight.incoming_cap, None); + assert_eq!(inflight.start, 0); + + // `incoming_cap` can be cleared when `cap` is enlarged. + for &new_cap in &[128, 1024] { + let mut inflight = inflights[2].clone(); + inflight.set_cap(new_cap); + assert_eq!(inflight.cap, new_cap); + assert_eq!(inflight.incoming_cap, None); + } + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/tracker/progress.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/tracker/progress.rs new file mode 100644 index 0000000..dc19bb8 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/tracker/progress.rs @@ -0,0 +1,411 @@ +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +use crate::{Inflights, ProgressState, INVALID_INDEX}; +use std::cmp; + +/// The progress of catching up from a restart. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Progress { + /// How much state is matched. 
+ pub matched: u64, + /// The next index to apply + pub next_idx: u64, + /// When in ProgressStateProbe, leader sends at most one replication message + /// per heartbeat interval. It also probes actual progress of the follower. + /// + /// When in ProgressStateReplicate, leader optimistically increases next + /// to the latest entry sent after sending replication message. This is + /// an optimized state for fast replicating log entries to the follower. + /// + /// When in ProgressStateSnapshot, leader should have sent out snapshot + /// before and stop sending any replication message. + pub state: ProgressState, + /// Paused is used in ProgressStateProbe. + /// When Paused is true, raft should pause sending replication message to this peer. + pub paused: bool, + /// This field is used in ProgressStateSnapshot. + /// If there is a pending snapshot, the pendingSnapshot will be set to the + /// index of the snapshot. If pendingSnapshot is set, the replication process of + /// this Progress will be paused. raft will not resend snapshot until the pending one + /// is reported to be failed. + pub pending_snapshot: u64, + /// This field is used in request snapshot. + /// If there is a pending request snapshot, this will be set to the request + /// index of the snapshot. + pub pending_request_snapshot: u64, + + /// This is true if the progress is recently active. Receiving any messages + /// from the corresponding follower indicates the progress is active. + /// RecentActive can be reset to false after an election timeout. + pub recent_active: bool, + + /// Inflights is a sliding window for the inflight messages. + /// When inflights is full, no more message should be sent. + /// When a leader sends out a message, the index of the last + /// entry should be added to inflights. The index MUST be added + /// into inflights in order. + /// When a leader receives a reply, the previous inflights should + /// be freed by calling inflights.freeTo. 
+ pub ins: Inflights, + + /// Only logs replicated to different group will be committed if any group is configured. + pub commit_group_id: u64, + + /// Committed index in raft_log + pub committed_index: u64, +} + +impl Progress { + /// Creates a new progress with the given settings. + pub fn new(next_idx: u64, ins_size: usize) -> Self { + Progress { + matched: 0, + next_idx, + state: ProgressState::default(), + paused: false, + pending_snapshot: 0, + pending_request_snapshot: 0, + recent_active: false, + ins: Inflights::new(ins_size), + commit_group_id: 0, + committed_index: 0, + } + } + + fn reset_state(&mut self, state: ProgressState) { + self.paused = false; + self.pending_snapshot = 0; + self.state = state; + self.ins.reset(); + } + + pub(crate) fn reset(&mut self, next_idx: u64) { + self.matched = 0; + self.next_idx = next_idx; + self.state = ProgressState::default(); + self.paused = false; + self.pending_snapshot = 0; + self.pending_request_snapshot = INVALID_INDEX; + self.recent_active = false; + self.ins.reset(); + } + + /// Changes the progress to a probe. + pub fn become_probe(&mut self) { + // If the original state is ProgressStateSnapshot, progress knows that + // the pending snapshot has been sent to this peer successfully, then + // probes from pendingSnapshot + 1. + if self.state == ProgressState::Snapshot { + let pending_snapshot = self.pending_snapshot; + self.reset_state(ProgressState::Probe); + self.next_idx = cmp::max(self.matched + 1, pending_snapshot + 1); + } else { + self.reset_state(ProgressState::Probe); + self.next_idx = self.matched + 1; + } + } + + /// Changes the progress to a Replicate. + #[inline] + pub fn become_replicate(&mut self) { + self.reset_state(ProgressState::Replicate); + self.next_idx = self.matched + 1; + } + + /// Changes the progress to a snapshot. 
+ #[inline] + pub fn become_snapshot(&mut self, snapshot_idx: u64) { + self.reset_state(ProgressState::Snapshot); + self.pending_snapshot = snapshot_idx; + } + + /// Sets the snapshot to failure. + #[inline] + pub fn snapshot_failure(&mut self) { + self.pending_snapshot = 0; + } + + /// Returns true if Match is equal or higher than the pendingSnapshot. + #[inline] + pub fn is_snapshot_caught_up(&self) -> bool { + self.state == ProgressState::Snapshot && self.matched >= self.pending_snapshot + } + + /// Returns false if the given n index comes from an outdated message. + /// Otherwise it updates the progress and returns true. + pub fn maybe_update(&mut self, n: u64) -> bool { + let need_update = self.matched < n; + if need_update { + self.matched = n; + self.resume(); + }; + + if self.next_idx < n + 1 { + self.next_idx = n + 1 + } + + need_update + } + + /// update committed_index. + pub fn update_committed(&mut self, committed_index: u64) { + if committed_index > self.committed_index { + self.committed_index = committed_index + } + } + + /// Optimistically advance the index + #[inline] + pub fn optimistic_update(&mut self, n: u64) { + self.next_idx = n + 1; + } + + /// Returns false if the given index comes from an out of order message. + /// Otherwise it decreases the progress next index to min(rejected, last) + /// and returns true. + pub fn maybe_decr_to(&mut self, rejected: u64, match_hint: u64, request_snapshot: u64) -> bool { + if self.state == ProgressState::Replicate { + // the rejection must be stale if the progress has matched and "rejected" + // is smaller than "match". + // Or rejected equals to matched and request_snapshot is the INVALID_INDEX. 
+ if rejected < self.matched + || (rejected == self.matched && request_snapshot == INVALID_INDEX) + { + return false; + } + if request_snapshot == INVALID_INDEX { + self.next_idx = self.matched + 1; + } else { + self.pending_request_snapshot = request_snapshot; + } + return true; + } + + // The rejection must be stale if "rejected" does not match next - 1. + // Do not consider it stale if it is a request snapshot message. + if (self.next_idx == 0 || self.next_idx - 1 != rejected) + && request_snapshot == INVALID_INDEX + { + return false; + } + + // Do not decrease next index if it's requesting snapshot. + if request_snapshot == INVALID_INDEX { + self.next_idx = cmp::min(rejected, match_hint + 1); + if self.next_idx < self.matched + 1 { + self.next_idx = self.matched + 1; + } + } else if self.pending_request_snapshot == INVALID_INDEX { + // Allow requesting snapshot even if it's not Replicate. + self.pending_request_snapshot = request_snapshot; + } + self.resume(); + true + } + + /// Determine whether progress is paused. + #[inline] + pub fn is_paused(&self) -> bool { + match self.state { + ProgressState::Probe => self.paused, + ProgressState::Replicate => self.ins.full(), + ProgressState::Snapshot => true, + } + } + + /// Resume progress + #[inline] + pub fn resume(&mut self) { + self.paused = false; + } + + /// Pause progress. 
+ #[inline] + pub fn pause(&mut self) { + self.paused = true; + } + + /// Update inflight msgs and next_idx + pub fn update_state(&mut self, last: u64) { + match self.state { + ProgressState::Replicate => { + self.optimistic_update(last); + self.ins.add(last); + } + ProgressState::Probe => self.pause(), + ProgressState::Snapshot => panic!( + "updating progress state in unhandled state {:?}", + self.state + ), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn new_progress( + state: ProgressState, + matched: u64, + next_idx: u64, + pending_snapshot: u64, + ins_size: usize, + ) -> Progress { + let mut p = Progress::new(next_idx, ins_size); + p.state = state; + p.matched = matched; + p.pending_snapshot = pending_snapshot; + p + } + + #[test] + fn test_progress_is_paused() { + let tests = [ + (ProgressState::Probe, false, false), + (ProgressState::Probe, true, true), + (ProgressState::Replicate, false, false), + (ProgressState::Replicate, true, false), + (ProgressState::Snapshot, false, true), + (ProgressState::Snapshot, true, true), + ]; + for (i, &(state, paused, w)) in tests.iter().enumerate() { + let mut p = new_progress(state, 0, 0, 0, 256); + p.paused = paused; + if p.is_paused() != w { + panic!("#{}: shouldwait = {}, want {}", i, p.is_paused(), w) + } + } + } + + // test_progress_resume ensures that progress.maybeUpdate and progress.maybeDecrTo + // will reset progress.paused. 
+ #[test] + fn test_progress_resume() { + let mut p = Progress::new(2, 256); + p.paused = true; + p.maybe_decr_to(1, 1, INVALID_INDEX); + assert!(!p.paused, "paused= true, want false"); + p.paused = true; + p.maybe_update(2); + assert!(!p.paused, "paused= true, want false"); + } + + #[test] + fn test_progress_become_probe() { + let matched = 1u64; + let mut tests = vec![ + ( + new_progress(ProgressState::Replicate, matched, 5, 0, 256), + 2, + ), + // snapshot finish + ( + new_progress(ProgressState::Snapshot, matched, 5, 10, 256), + 11, + ), + // snapshot failure + (new_progress(ProgressState::Snapshot, matched, 5, 0, 256), 2), + ]; + for (i, &mut (ref mut p, wnext)) in tests.iter_mut().enumerate() { + p.become_probe(); + if p.state != ProgressState::Probe { + panic!( + "#{}: state = {:?}, want {:?}", + i, + p.state, + ProgressState::Probe + ); + } + if p.matched != matched { + panic!("#{}: match = {:?}, want {:?}", i, p.matched, matched); + } + if p.next_idx != wnext { + panic!("#{}: next = {}, want {}", i, p.next_idx, wnext); + } + } + } + + #[test] + fn test_progress_become_replicate() { + let mut p = new_progress(ProgressState::Probe, 1, 5, 0, 256); + p.become_replicate(); + + assert_eq!(p.state, ProgressState::Replicate); + assert_eq!(p.matched, 1); + assert_eq!(p.matched + 1, p.next_idx); + } + + #[test] + fn test_progress_become_snapshot() { + let mut p = new_progress(ProgressState::Probe, 1, 5, 0, 256); + p.become_snapshot(10); + assert_eq!(p.state, ProgressState::Snapshot); + assert_eq!(p.matched, 1); + assert_eq!(p.pending_snapshot, 10); + } + + #[test] + fn test_progress_update() { + let (prev_m, prev_n) = (3u64, 5u64); + let tests = [ + (prev_m - 1, prev_m, prev_n, false), + (prev_m, prev_m, prev_n, false), + (prev_m + 1, prev_m + 1, prev_n, true), + (prev_m + 2, prev_m + 2, prev_n + 1, true), + ]; + for (i, &(update, wm, wn, wok)) in tests.iter().enumerate() { + let mut p = Progress::new(prev_n, 256); + p.matched = prev_m; + let ok = 
p.maybe_update(update); + if ok != wok { + panic!("#{}: ok= {}, want {}", i, ok, wok); + } + if p.matched != wm { + panic!("#{}: match= {}, want {}", i, p.matched, wm); + } + if p.next_idx != wn { + panic!("#{}: next= {}, want {}", i, p.next_idx, wn); + } + } + } + + #[test] + fn test_progress_maybe_decr() { + let tests = vec![ + // state replicate and rejected is not greater than match + (ProgressState::Replicate, 5, 10, 5, 5, false, 10), + // state replicate and rejected is not greater than match + (ProgressState::Replicate, 5, 10, 4, 4, false, 10), + // state replicate and rejected is greater than match + // directly decrease to match+1 + (ProgressState::Replicate, 5, 10, 9, 9, true, 6), + // next-1 != rejected is always false + (ProgressState::Probe, 0, 0, 0, 0, false, 0), + // next-1 != rejected is always false + (ProgressState::Probe, 0, 10, 5, 5, false, 10), + // next>1 = decremented by 1 + (ProgressState::Probe, 0, 10, 9, 9, true, 9), + // next>1 = decremented by 1 + (ProgressState::Probe, 0, 2, 1, 1, true, 1), + // next<=1 = reset to 1 + (ProgressState::Probe, 0, 1, 0, 0, true, 1), + // decrease to min(rejected, last+1) + (ProgressState::Probe, 0, 10, 9, 2, true, 3), + // rejected < 1, reset to 1 + (ProgressState::Probe, 0, 10, 9, 0, true, 1), + ]; + for (i, &(state, m, n, rejected, last, w, wn)) in tests.iter().enumerate() { + let mut p = new_progress(state, m, n, 0, 0); + if p.maybe_decr_to(rejected, last, 0) != w { + panic!("#{}: maybeDecrTo= {}, want {}", i, !w, w); + } + if p.matched != m { + panic!("#{}: match= {}, want {}", i, p.matched, m); + } + if p.next_idx != wn { + panic!("#{}: next= {}, want {}", i, p.next_idx, wn); + } + } + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/tracker/state.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/tracker/state.rs new file mode 100644 index 0000000..5c6b4d4 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/tracker/state.rs @@ -0,0 +1,40 @@ +// Copyright 2019 TiKV Project Authors. 
Licensed under Apache-2.0. + +// Copyright 2015 The etcd Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt; +use std::fmt::{Display, Formatter}; + +/// The state of the progress. +#[derive(Debug, PartialEq, Eq, Clone, Copy, Default)] +pub enum ProgressState { + /// Whether it's probing. + #[default] + Probe, + /// Whether it's replicating. + Replicate, + /// Whether it's a snapshot. + Snapshot, +} + +impl Display for ProgressState { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + ProgressState::Probe => write!(f, "StateProbe"), + ProgressState::Replicate => write!(f, "StateReplicate"), + ProgressState::Snapshot => write!(f, "StateSnapshot"), + } + } +} diff --git a/systems/Raft-rs/driver/rust_so/raft-rs/src/util.rs b/systems/Raft-rs/driver/rust_so/raft-rs/src/util.rs new file mode 100644 index 0000000..c4fd582 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/raft-rs/src/util.rs @@ -0,0 +1,178 @@ +//! This module contains a collection of various tools to use to manipulate +//! and control messages and data associated with raft. + +// Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. + +use std::fmt; +use std::fmt::Write; + +use slog::{OwnedKVList, Record, KV}; + +use crate::eraftpb::{Entry, Message}; +use crate::HashSet; +use protobuf::Message as PbMessage; + +use slog::{b, record_static}; + +/// A number to represent that there is no limit. 
+pub const NO_LIMIT: u64 = u64::MAX; + +/// Truncates the list of entries down to a specific byte-length of +/// all entries together. +/// +/// # Examples +/// +/// ``` +/// use raft::{util::limit_size, prelude::*}; +/// +/// let template = { +/// let mut entry = Entry::default(); +/// entry.data = "*".repeat(100).into_bytes().into(); +/// entry +/// }; +/// +/// // Make a bunch of entries that are ~100 bytes long +/// let mut entries = vec![ +/// template.clone(), +/// template.clone(), +/// template.clone(), +/// template.clone(), +/// template.clone(), +/// ]; +/// +/// assert_eq!(entries.len(), 5); +/// limit_size(&mut entries, Some(220)); +/// assert_eq!(entries.len(), 2); +/// +/// // `entries` will always have at least 1 Message +/// limit_size(&mut entries, Some(0)); +/// assert_eq!(entries.len(), 1); +/// ``` +pub fn limit_size(entries: &mut Vec, max: Option) { + if entries.len() <= 1 { + return; + } + let max = match max { + None | Some(NO_LIMIT) => return, + Some(max) => max, + }; + + let mut size = 0; + let limit = entries + .iter() + .take_while(|&e| { + if size == 0 { + size += u64::from(e.compute_size()); + return true; + } + size += u64::from(e.compute_size()); + size <= max + }) + .count(); + + entries.truncate(limit); +} + +/// Check whether the entry is continuous to the message. 
+/// i.e msg's next entry index should be equal to the index of the first entry in `ents` +pub fn is_continuous_ents(msg: &Message, ents: &[Entry]) -> bool { + if !msg.entries.is_empty() && !ents.is_empty() { + let expected_next_idx = msg.entries.last().unwrap().index + 1; + return expected_next_idx == ents.first().unwrap().index; + } + true +} + +struct FormatKeyValueList { + pub buffer: String, +} + +impl slog::Serializer for FormatKeyValueList { + fn emit_arguments(&mut self, key: slog::Key, val: &fmt::Arguments) -> slog::Result { + if !self.buffer.is_empty() { + write!(&mut self.buffer, ", {}: {}", key, val).unwrap(); + } else { + write!(&mut self.buffer, "{}: {}", key, val).unwrap(); + } + Ok(()) + } +} + +pub(crate) fn format_kv_list(kv_list: &OwnedKVList) -> String { + let mut formatter = FormatKeyValueList { + buffer: "".to_owned(), + }; + let record = record_static!(slog::Level::Trace, ""); + kv_list + .serialize( + &Record::new(&record, &format_args!(""), b!()), + &mut formatter, + ) + .unwrap(); + formatter.buffer +} + +/// Get the majority number of given nodes count. +#[inline] +pub fn majority(total: usize) -> usize { + (total / 2) + 1 +} + +/// A convenient struct that handles queries to both HashSet. +pub struct Union<'a> { + first: &'a HashSet, + second: &'a HashSet, +} + +impl<'a> Union<'a> { + /// Creates a union. + pub fn new(first: &'a HashSet, second: &'a HashSet) -> Union<'a> { + Union { first, second } + } + + /// Checks if id shows up in either HashSet. + #[inline] + pub fn contains(&self, id: u64) -> bool { + self.first.contains(&id) || self.second.contains(&id) + } + + /// Returns an iterator iterates the distinct values in two sets. + pub fn iter(&self) -> impl Iterator + '_ { + self.first.union(self.second).cloned() + } + + /// Checks if union is empty. + pub fn is_empty(&self) -> bool { + self.first.is_empty() && self.second.is_empty() + } + + /// Gets the count of the union. + /// + /// The time complexity is O(n). 
+ pub fn len(&self) -> usize { + // Usually, second is empty. + self.first.len() + self.second.len() - self.second.intersection(self.first).count() + } +} + +/// Get the approximate size of entry +#[inline] +pub fn entry_approximate_size(e: &Entry) -> usize { + // message Entry { + // EntryType entry_type = 1; + // uint64 term = 2; + // uint64 index = 3; + // bytes data = 4; + // bytes context = 6; + // bool sync_log = 5;(Deprecated) + // } + // Each field has tag(1 byte) if it's not default value. + // Tips: x bytes can represent a value up to 1 << x*7 - 1, + // So 1 byte => 127, 2 bytes => 16383, 3 bytes => 2097151. + // If entry_type is normal(default), in general, the size should + // be tag(4) + term(1) + index(2) + data(2) + context(1) = 10. + // If entry_type is conf change, in general, the size should be + // tag(5) + entry_type(1) + term(1) + index(2) + data(1) + context(1) = 11. + // We choose 12 in case of large index or large data for normal entry. + e.data.len() + e.context.len() + 12 +} diff --git a/systems/Raft-rs/driver/rust_so/src/lib.rs b/systems/Raft-rs/driver/rust_so/src/lib.rs new file mode 100644 index 0000000..8582750 --- /dev/null +++ b/systems/Raft-rs/driver/rust_so/src/lib.rs @@ -0,0 +1,778 @@ +// 导入 raft-rs 库 +#![allow(clippy::field_reassign_with_default)] +extern crate slog_term; +extern crate slog; + +#[allow(unused_imports)] +use std::slice; +// use slog::*; + +#[allow(unused_imports)] +extern crate raft; +use raft::eraftpb::Snapshot; +use protobuf::Message as PbMessage; +use raft::{Config, storage::MemStorage, raw_node::RawNode, StateRole, ProgressState}; +use raft::eraftpb::{ Entry, Message, MessageType}; +use raft::{CAMPAIGN_PRE_ELECTION, CAMPAIGN_ELECTION, CAMPAIGN_TRANSFER}; +// use raft::{prelude::*}; + +// use protobuf::Message; +use std::ffi::CString; +use libc::{c_int, ssize_t, c_char}; + +use slog::Drain; +// use std::collections::{HashMap}; +use std::time::{Duration, Instant}; +use std::{str}; +use std::ffi::CStr; +// use 
slog::*; + +// use regex::Regex; + +use slog::{error, info, o}; + +// #[link(name = "config")] +#[link(name = "myLibrary")] +extern "C" { + fn getIdSelf() -> u64; + fn send_to(node_id: c_int, data: *const libc::c_char, length: ssize_t) -> ssize_t; + #[allow(dead_code)] + fn recv_from(node_id: c_int, data: *const libc::c_char) -> ssize_t; +} + + + +#[no_mangle] +pub extern "C" fn raftInit() -> *mut Node { + + let _node = create_node().unwrap(); + let node = Box::new(_node); + + Box::into_raw(node) +} + + +#[no_mangle] +pub extern "C" fn raftRecvMsg(ptr: *mut Node, data: *const libc::c_char, length: usize) { + + // 将 C 字符串指针转换为 Rust 字符串切片 + // let c_str = unsafe { + // assert!(!data.is_null()); + // CStr::from_ptr(data) + // }; + + let data_slice: &[u8] = unsafe { + assert!(!data.is_null()); + std::slice::from_raw_parts(data as *const u8, length) + }; + // 将 CStr 转换为 Rust 的 str 类型 + // let data = c_str.to_str().expect("Invalid UTF-8"); + + // 尝试将字节切片转换为 &str + match std::str::from_utf8(data_slice) { + + Ok(data_str) => { + unsafe{ + // println!("data {} node {}", data, (*ptr).get_id()); + + (*ptr).recv_msg(data_str); + }; + }, + Err(e) => { + // 处理转换失败的情况,例如不合法的UTF-8数据 + println!("Failed to convert data to str: {}", e); + } + } + // unsafe{ + // // println!("data {} node {}", data, (*ptr).get_id()); + + // (*ptr).recv_msg(data); + // }; + + +} + +#[no_mangle] +pub extern "C" fn raftClientOperation(ptr: *mut Node, data: *const libc::c_char) { + let c_str = unsafe { + assert!(!data.is_null()); + CStr::from_ptr(data) + }; + + + // 将 CStr 转换为 Rust 的 str 类型 + let data = c_str.to_str().expect("Invalid UTF-8"); + println!("propose {}", data); + unsafe{ + // println!("data {} node {}", data, (*ptr).get_id()); + (*ptr).propose(data); + }; + + + +} +#[allow(unused_imports)] +use std::{ptr::addr_of}; +use libc::{c_uint, printf}; + +#[allow(dead_code)] +fn print_buffer(addr: *const u8, size: isize) { + let c_fmt = CString::new("%02X ").unwrap(); + let c_fmt_endl = 
CString::new("%02X\n").unwrap(); + unsafe { + for i in 0..size { + if (i + 1) % 8 == 0 { + printf(c_fmt_endl.as_ptr(), *addr.offset(i) as c_uint); + } else { + printf(c_fmt.as_ptr(), *addr.offset(i) as c_uint); + } + } + } +} + + +#[no_mangle] +pub extern "C" fn raftPeriodic(ptr: *mut Node) { + unsafe{ + // println!("node {}", (*ptr).get_id()); + (*ptr).tick(); + }; +} + +#[no_mangle] +pub extern "C" fn raftCampaign(ptr: *mut Node, data: *const libc::c_char) { + let c_str = unsafe { + assert!(!data.is_null()); + CStr::from_ptr(data) + }; + // 将 CStr 转换为 Rust 的 str 类型 + let data = c_str.to_str().expect("Invalid UTF-8"); + unsafe{ + (*ptr).campaign(data); + }; +} + +#[no_mangle] +pub extern "C" fn raftBcastHeartbeat(ptr: *mut Node) { + unsafe{ + (*ptr).bcast_heartbeat(); + }; +} +#[no_mangle] +pub extern "C" fn raftBcastAppend(ptr: *mut Node) { + unsafe{ + (*ptr).bcast_append(); + }; +} + +#[no_mangle] +pub extern "C" fn raftBecomePreCandidate(ptr: *mut Node) { + unsafe{ + // println!("node {}", (*ptr).get_id()); + (*ptr).become_pre_candidate(); + }; +} + +#[no_mangle] +pub extern "C" fn raftBecomeCandidate(ptr: *mut Node) { + unsafe{ + // println!("node {}", (*ptr).get_id()); + (*ptr).become_candidate(); + }; +} + + + +#[no_mangle] +pub extern "C" fn raftBecomeLeader(ptr: *mut Node) { + unsafe{ + // println!("node {}", (*ptr).get_id()); + (*ptr).become_leader(); + }; +} + +#[no_mangle] +pub extern "C" fn raftCompact(ptr: *mut Node) { + unsafe{ + // println!("node {}", (*ptr).get_id()); + (*ptr).compact(); + }; +} + +#[no_mangle] +pub extern "C" fn raftAskSnap(ptr: *mut Node) { + unsafe{ + // println!("node {}", (*ptr).get_id()); + (*ptr).askSanp(); + }; +} + +#[no_mangle] +pub extern "C" fn raftGet(ptr: *mut Node, data: *const libc::c_char) -> *mut c_char{ + // print_buffer(ptr as *const u8, 32); + + // 将输入的 C 字符串转换为 Rust 的字符串切片 + let input = unsafe { + assert!(!data.is_null()); + CStr::from_ptr(data).to_str().unwrap() + }; + + let c_result: *mut c_char; + unsafe{ + 
+ let result = (*ptr).get_var(input); + // 将结果字符串转换为 C 字符串(CString) + let c_string = CString::new(result).unwrap(); + c_result = c_string.into_raw(); + // 返回结果的原始指针 + + }; + c_result + +} + + +fn create_node() -> Option{ + let id = unsafe{getIdSelf()}; + let decorator = slog_term::TermDecorator::new().build(); + let drain = slog_term::FullFormat::new(decorator).build().fuse(); + let drain = slog_async::Async::new(drain) + .chan_size(4096) + .overflow_strategy(slog_async::OverflowStrategy::Block) + .build() + .fuse(); + let logger = slog::Logger::root(drain, o!()); + + let node = Node::create_node(id, &logger); + return Some(node); +} + +#[allow(dead_code)] +fn is_initial_msg(msg: &Message) -> bool { + let msg_type = msg.get_msg_type(); + msg_type == MessageType::MsgRequestVote + || msg_type == MessageType::MsgRequestPreVote + || (msg_type == MessageType::MsgHeartbeat && msg.commit == 0) +} + + +#[repr(C)] +pub struct Node { + // None if the raft is not initialized. + id : u64, + raft_group: Option>, + timer : Instant, + timeout : Duration, + logger : slog::Logger, +} + +impl Node { + + fn create_node( + id: u64, + logger : &slog::Logger, + ) -> Self { + let mut cfg = example_config(); + cfg.id = id; + + let logger = logger.new(o!("tag" => format!("peer_{}", id))); + let mut s = Snapshot::default(); + // Because we don't use the same configuration to initialize every node, so we use + // a non-zero index to force new followers catch up logs by snapshot first, which will + // bring all nodes to the same initial state. 
+ s.mut_metadata().index = 1; + s.mut_metadata().term = 1; + + // 在这个地方设置voters + s.mut_metadata().mut_conf_state().voters = vec![1, 2, 3]; + let storage = MemStorage::new(); + storage.wl().apply_snapshot(s).unwrap(); + let raft_group = Some(RawNode::new(&cfg, storage, &logger).unwrap()); + + // let id = id as i32; + let timer = Instant::now(); + let timeout = Duration::from_millis(100); + Node { + id , + raft_group, + timer, + timeout, + logger, + } + } + + // Initialize raft for followers. + #[allow(dead_code)] + fn initialize_raft_from_message(&mut self, msg: &Message, logger: &slog::Logger) { + if !is_initial_msg(msg) { + return; + } + let mut cfg = example_config(); + cfg.id = msg.to; + self.id = msg.to; + let logger = logger.new(o!("tag" => format!("peer_{}", self.id))); + let storage = MemStorage::new(); + self.raft_group = Some(RawNode::new(&cfg, storage, &logger).unwrap()); + } + + #[allow(dead_code)] + fn step(&mut self, msg: Message, logger: &slog::Logger) { + if self.raft_group.is_none() { + if is_initial_msg(&msg) { + self.initialize_raft_from_message(&msg, logger); + } else { + return; + } + } + let raft_group = self.raft_group.as_mut().unwrap(); + let _ = raft_group.step(msg); + } + + #[allow(dead_code)] + fn get_id(& self) -> u64{ + return self.id; + } + + fn recv_msg(&mut self, input : &str){ + if self.raft_group.is_none() { + return; + } + + let raft_group = self.raft_group.as_mut().unwrap(); + let logger = &self.logger; + + let _size = 4025; + + let bytes = input.as_bytes(); + match Message::parse_from_bytes(&bytes) { + Ok(deserialized_message) => { + // 成功将字节数组反序列化为消息对象 + // 在这里处理反序列化后的消息对象 + println!("step message: {:?}", deserialized_message); + let _ = raft_group.step(deserialized_message); + + } + Err(err) => { + // 发生错误 + error!(logger, "Error deserializing message: {:?}", err); + return; + } + } + on_ready(raft_group, logger); + } + + fn campaign(&mut self, input : &str){ + + if self.raft_group.is_none() { + return; + } + + let raft_group 
= self.raft_group.as_mut().unwrap(); + let logger = &self.logger; + if input == "normal"{ + raft_group.campaign(); + } else if input == "pre"{ + raft_group.raft.campaign(CAMPAIGN_PRE_ELECTION); + } else if input == "transfer"{ + raft_group.raft.campaign(CAMPAIGN_TRANSFER); + } + on_ready(raft_group, logger); + } + +// 实际上 在 raft-rs中不需要时钟驱动 我们通过tick就可以驱动 +// 实际上后续 heartBeat campaign 都不需要专门时钟一次次推进,我这里想做的就是在处理完事件之后有一个更新到raft节点的过程 + fn tick(&mut self){ + + if self.raft_group.is_none() { + return; + } + + let raft_group = self.raft_group.as_mut().unwrap(); + let logger = &self.logger; + + // raft_group.tick(); + + on_ready(raft_group, logger); + } + + fn bcast_heartbeat(&mut self){ + + if self.raft_group.is_none() { + return; + } + + let raft_group = self.raft_group.as_mut().unwrap(); + let logger = &self.logger; + + println!("raft ping"); + raft_group.raft.ping(); + + on_ready(raft_group, logger); + } + + fn bcast_append(&mut self){ + + if self.raft_group.is_none() { + return; + } + + let raft_group = self.raft_group.as_mut().unwrap(); + let logger = &self.logger; + + println!("raft bcast append"); + raft_group.raft.bcast_append(); + + on_ready(raft_group, logger); + } + + fn become_candidate(&mut self){ + if self.raft_group.is_none() { + return; + } + + let raft_group = self.raft_group.as_mut().unwrap(); + let logger = &self.logger; + raft_group.raft.become_candidate(); + + on_ready(raft_group, logger); + } + + fn become_leader(&mut self){ + if self.raft_group.is_none() { + return; + } + let raft_group = self.raft_group.as_mut().unwrap(); + let logger = &self.logger; + + raft_group.raft.become_candidate(); + raft_group.raft.become_leader(); + raft_group.raft.bcast_append(); + + on_ready(raft_group, logger); + } + + + fn compact(&mut self){ + if self.raft_group.is_none() { + return; + } + let raft_group = self.raft_group.as_mut().unwrap(); + let logger = &self.logger; + + let store = raft_group.raft.raft_log.store.clone(); + + 
store.wl().compact(raft_group.raft.r.raft_log.committed).unwrap(); + + on_ready(raft_group, logger); + } + + fn askSanp(&mut self){ + if self.raft_group.is_none() { + return; + } + let raft_group = self.raft_group.as_mut().unwrap(); + let logger = &self.logger; + + raft_group.request_snapshot(); + + on_ready(raft_group, logger); + } + + + + fn become_pre_candidate(&mut self){ + if self.raft_group.is_none() { + return; + } + + let raft_group = self.raft_group.as_mut().unwrap(); + let logger = &self.logger; + + raft_group.raft.become_pre_candidate(); + + on_ready(raft_group, logger); + } + // let mut timeout = Duration::from_millis(100); + // let d = self.timer.elapsed(); + // self.timer = Instant::now(); + // let res = d >= self.timeout; + // if d >= self.timeout { + // self.timeout = Duration::from_millis(100); + // // We drive Raft every 100ms. + // info!(logger, "raft_group {} tick", self.id); + + // } else { + // timeout -= d; + // } + + fn propose(&mut self, data :&str){ + if self.raft_group.is_none() { + return; + } + + let raft_group = self.raft_group.as_mut().unwrap(); + let logger = &self.logger; + + // 如果我不设置回调函数呢?其实应该也没什么,毕竟就是单纯 + let data_bytes = data.as_bytes().to_vec(); + raft_group.propose(vec![], data_bytes).unwrap(); + on_ready(raft_group, logger); + } + + #[warn(unused_must_use)] + fn get_var(&mut self, data :&str)-> String { + let logger = &self.logger; + info!(logger, "node {} get {}", self.id, data); + if let Some(raw_node) = &self.raft_group{ + match data { + "state" => { + match raw_node.raft.soft_state().raft_state{ + StateRole::Follower =>{ + return "1".to_string(); + }StateRole::Candidate =>{ + return "3".to_string(); + }StateRole::Leader =>{ + return "4".to_string(); + }StateRole::PreCandidate =>{ + return "2".to_string(); + } + } + }"current_term" => { + return raw_node.raft.hard_state().term.to_string(); + } + "log" => { + let mut res = "[(1, 'Nil', 1)".to_string(); + let all_ents = raw_node.raft.raft_log.all_entries(); + for (index, 
ent) in all_ents.iter().enumerate() { + // if res == "".to_string(){ + // res = format!{"[({}, \'{}\')", ent.get_term(), String::from_utf8_lossy(ent.get_data())}; + // } else { + // res = format!{"{}, ({}, \'{}\')", res, ent.get_term(), String::from_utf8_lossy(ent.get_data())}; + // } + let mut ent_data = String::from_utf8_lossy(ent.get_data()).into_owned(); + ent_data = if ent_data.is_empty() { + "Nil".to_string() + } else{ + ent_data + }; + res = format!{"{}, ({}, \'{}\', {})", res, ent.get_term(), ent_data, index + 2}; + } + // if res != "".to_string(){ + // res = format!{"{}]", res}; + // } + res = format!{"{}]", res}; + + + println!("raft log {:?}", all_ents); + println!("raft_log first_index {:?}", raw_node.raft.raft_log.first_index()); + println!("raft_log last_index {:?}", raw_node.raft.raft_log.last_index()); + println!("raft_log committed {:?}", raw_node.raft.raft_log.committed); + println!("raft_log persisted {:?}", raw_node.raft.raft_log.persisted); + return res.to_string(); + } + "commit_idx" => { + return raw_node.raft.raft_log.committed.to_string(); + }"match_idx" => { + let mut res = "".to_string(); + raw_node.raft.prs() + .iter() + .filter(|&(id, _)| *id != self.get_id()) + .for_each(|(id, pr)| { + if res == "".to_string(){ + res = format!{"{{\'n{}\': {}", id, pr.matched} + } else { + res = format!{"{}, \'n{}\': {}", res, id, pr.matched} + } }); + res = format!{"{}}}", res}; + return res.to_string(); + }"next_idx" => { + let mut res = "".to_string(); + raw_node.raft.prs() + .iter() + .filter(|&(id, _)| *id != self.get_id()) + .for_each(|(id, pr)| { + if res == "".to_string(){ + res = format!{"{{\'n{}\': {}", id, pr.next_idx} + } else { + res = format!{"{}, \'n{}\': {}", res, id, pr.next_idx} + } + }); + res = format!{"{}}}", res}; + return res.to_string(); + }"pr_pending" => { + let mut res = "".to_string(); + raw_node.raft.prs() + .iter() + .filter(|&(id, _)| *id != self.get_id()) + .for_each(|(id, pr)| { + if res == "".to_string(){ + res = 
format!{"{{\'n{}\': {}", id, pr.pending_request_snapshot} + } else { + res = format!{"{}, \'n{}\': {}", res, id, pr.pending_request_snapshot} + } + }); + res = format!{"{}}}", res}; + return res.to_string(); + } + "pending_snapshot" => { + return raw_node.raft.pending_request_snapshot.to_string(); + }"progress" => { + let mut res = "".to_string(); + raw_node.raft.prs() + .iter() + .filter(|&(id, _)| *id != self.get_id()) + .for_each(|(id, pr)| { + let state_str = match pr.state{ + ProgressState::Probe => "Probe".to_string(), + ProgressState::Replicate =>"Replicate".to_string(), + ProgressState::Snapshot => "Snapshot".to_string(), + + }; + let pause_flag = if pr.is_paused(){ + "pause" + } else { + "running" + }; + + if res == "".to_string(){ + res = format!{"{{\'n{}\': {}, {}", id, state_str, pause_flag} + } else { + res = format!{"{}, \'n{}\': {}, {}", res, id, state_str, pause_flag} + } + }); + res = format!{"{}}}", res}; + return res.to_string(); + } + &_ => todo!() + } + } else { + error!(self.logger, "raft group empty"); + return "".to_string(); + } + } +} + + +fn on_ready(raft_group: &mut RawNode, + logger: &slog::Logger,) { + if !raft_group.has_ready() { + return; + } + println!( "on_ready start"); + let store = raft_group.raft.raft_log.store.clone(); + + // Get the `Ready` with `RawNode::ready` interface. + let mut ready = raft_group.ready(); + + println!( "ready {:?}", ready); + + + let handle_messages = |msgs: Vec| { + + for msg in msgs { + let to = msg.to as i32; + match msg.write_to_bytes() { + Ok(mut serialized_message) => { + println!("to {}", to); + println!("serialized_message {:?}", serialized_message); + + let length = serialized_message.len(); + unsafe{send_to(to, serialized_message.as_mut_ptr() as *const libc::c_char, length.try_into().unwrap())}; + } + Err(err) => { + error!(logger, "Error serializing message: {:?}", err); + } + } + } + }; + + + + if !ready.messages().is_empty() { + // Send out the messages come from the node. 
+ handle_messages(ready.take_messages()); + } + + // Apply the snapshot. It's necessary because in `RawNode::advance` we stabilize the snapshot. + if *ready.snapshot() != Snapshot::default() { + let s = ready.snapshot().clone(); + if let Err(e) = store.wl().apply_snapshot(s) { + error!( + logger, + "apply snapshot fail: {:?}, need to retry or panic", e + ); + return; + } + } + + let mut _last_apply_index = 0; + let mut handle_committed_entries = |committed_entries: Vec| { + + + for entry in committed_entries { + // Mostly, you need to save the last apply index to resume applying + // after restart. Here we just ignore this because we use a Memory storage. + _last_apply_index = entry.index; + + // 感觉这里对于commited data 应该是对 commited 是不是不需要操作 + // if entry.data.is_empty() { + // // Emtpy entry, when the peer becomes Leader it will send an empty entry. + // continue; + // } + + // if entry.get_entry_type() == EntryType::EntryNormal { + // if let Some(cb) = cbs.remove(entry.data.first().unwrap()) { + // cb(); + // } + // } + + + // TODO: handle EntryConfChange + } + }; + handle_committed_entries(ready.take_committed_entries()); + // Persistent raft logs. It's necessary because in `RawNode::advance` we stabilize + // raft logs to the latest position. + if let Err(e) = store.wl().append(ready.entries()) { + error!( + logger, + "persist raft log fail: {:?}, need to retry or panic", e + ); + return; + } + + if let Some(hs) = ready.hs() { + // Raft HardState changed, and we need to persist it. + store.wl().set_hardstate(hs.clone()); + } + + if !ready.persisted_messages().is_empty() { + // Send out the persisted messages come from the node. + handle_messages(ready.take_persisted_messages()); + } + + // Call `RawNode::advance` interface to update flags in the raft. + let mut light_rd = raft_group.advance(ready); + // Update commit index. + if let Some(commit) = light_rd.commit_index() { + store.wl().mut_hard_state().set_commit(commit); + } + // Send out the messages. 
+ handle_messages(light_rd.take_messages()); + // Apply all committed entries. + handle_committed_entries(light_rd.take_committed_entries()); + // Advance the apply index. + raft_group.advance_apply(); +} + + +fn example_config() -> Config { + Config { + election_tick: 10, + heartbeat_tick: 3, + max_inflight_msgs : 1, // 只允许一个 + // skip_bcast_commit: true, + ..Default::default() + } +} + diff --git a/systems/Raft-rs/driver/tlpi/CMakeLists.txt b/systems/Raft-rs/driver/tlpi/CMakeLists.txt new file mode 100644 index 0000000..ab7d671 --- /dev/null +++ b/systems/Raft-rs/driver/tlpi/CMakeLists.txt @@ -0,0 +1,7 @@ +cmake_minimum_required(VERSION 3.16) +project(tlpi) + +set(CMAKE_C_STANDARD 99) +add_compile_options(-Wall -Wextra -Werror) + +add_library(tlpi inet_sockets.c inet_sockets.h tlpi_hdr.h) diff --git a/systems/Raft-rs/driver/tlpi/inet_sockets.c b/systems/Raft-rs/driver/tlpi/inet_sockets.c new file mode 100644 index 0000000..2ada306 --- /dev/null +++ b/systems/Raft-rs/driver/tlpi/inet_sockets.c @@ -0,0 +1,157 @@ +/*************************************************************************\ +* Copyright (C) Michael Kerrisk, 2022. * +* * +* This program is free software. You may use, modify, and redistribute it * +* under the terms of the GNU Lesser General Public License as published * +* by the Free Software Foundation, either version 3 or (at your option) * +* any later version. This program is distributed without any warranty. * +* See the files COPYING.lgpl-v3 and COPYING.gpl-v3 for details. 
* +\*************************************************************************/ + +/* Listing 59-9 */ + +//#define _BSD_SOURCE /* To get NI_MAXHOST and NI_MAXSERV +// definitions from */ +#define _DEFAULT_SOURCE /* To get NI_MAXHOST and NI_MAXSERV + definitions from */ +#include +#include +#include +#include +#include "inet_sockets.h" /* Declares functions defined here */ +#include "tlpi_hdr.h" + +#include + +int +inetConnect(const char *host, const char *service, int type, int retry_times) +{ + struct addrinfo hints; + struct addrinfo *result, *rp; + int sfd, s; + + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_canonname = NULL; + hints.ai_addr = NULL; + hints.ai_next = NULL; + hints.ai_family = AF_UNSPEC; /* Allows IPv4 or IPv6 */ + hints.ai_socktype = type; + + s = getaddrinfo(host, service, &hints, &result); + if (s != 0) { + errno = ENOSYS; + return -1; + } + + /* Walk through returned list until we find an address structure + that can be used to successfully connect a socket */ + + for (rp = result; rp != NULL; rp = rp->ai_next) { + sfd = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); + if (sfd == -1) + continue; /* On error, try next address */ + + if (retry_times > 0) { + setsockopt(sfd, IPPROTO_TCP, TCP_SYNCNT, &retry_times, sizeof(retry_times)); + } + + if (connect(sfd, rp->ai_addr, rp->ai_addrlen) != -1) + break; /* Success */ + + /* Connect failed: close this socket and try next address */ + + close(sfd); + } + + freeaddrinfo(result); + + return (rp == NULL) ? 
-1 : sfd; +} + +static int /* Public interfaces: inetBind() and inetListen() */ +inetPassiveSocket(const char *service, int type, socklen_t *addrlen, + Boolean doListen, int backlog) +{ + struct addrinfo hints; + struct addrinfo *result, *rp; + int sfd, optval, s; + + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_canonname = NULL; + hints.ai_addr = NULL; + hints.ai_next = NULL; + hints.ai_socktype = type; + hints.ai_family = AF_UNSPEC; /* Allows IPv4 or IPv6 */ + hints.ai_flags = AI_PASSIVE; /* Use wildcard IP address */ + + s = getaddrinfo(NULL, service, &hints, &result); + if (s != 0) + return -1; + + /* Walk through returned list until we find an address structure + that can be used to successfully create and bind a socket */ + + optval = 1; + for (rp = result; rp != NULL; rp = rp->ai_next) { + sfd = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); + if (sfd == -1) + continue; /* On error, try next address */ + + if (doListen) { + if (setsockopt(sfd, SOL_SOCKET, SO_REUSEADDR, &optval, + sizeof(optval)) == -1) { + close(sfd); + freeaddrinfo(result); + return -1; + } + } + + if (bind(sfd, rp->ai_addr, rp->ai_addrlen) == 0) + break; /* Success */ + + /* bind() failed: close this socket and try next address */ + + close(sfd); + } + + if (rp != NULL && doListen) { + if (listen(sfd, backlog) == -1) { + freeaddrinfo(result); + return -1; + } + } + + if (rp != NULL && addrlen != NULL) + *addrlen = rp->ai_addrlen; /* Return address structure size */ + + freeaddrinfo(result); + + return (rp == NULL) ? 
-1 : sfd; +} + +int +inetListen(const char *service, int backlog, socklen_t *addrlen) +{ + return inetPassiveSocket(service, SOCK_STREAM, addrlen, TRUE, backlog); +} + +int +inetBind(const char *service, int type, socklen_t *addrlen) +{ + return inetPassiveSocket(service, type, addrlen, FALSE, 0); +} + +char * +inetAddressStr(const struct sockaddr *addr, socklen_t addrlen, + char *addrStr, int addrStrLen) +{ + char host[NI_MAXHOST], service[NI_MAXSERV]; + + if (getnameinfo(addr, addrlen, host, NI_MAXHOST, + service, NI_MAXSERV, NI_NUMERICSERV) == 0) + snprintf(addrStr, addrStrLen, "(%s, %s)", host, service); + else + snprintf(addrStr, addrStrLen, "(?UNKNOWN?)"); + + return addrStr; +} diff --git a/systems/Raft-rs/driver/tlpi/inet_sockets.h b/systems/Raft-rs/driver/tlpi/inet_sockets.h new file mode 100644 index 0000000..9b62f92 --- /dev/null +++ b/systems/Raft-rs/driver/tlpi/inet_sockets.h @@ -0,0 +1,32 @@ +/*************************************************************************\ +* Copyright (C) Michael Kerrisk, 2022. * +* * +* This program is free software. You may use, modify, and redistribute it * +* under the terms of the GNU Lesser General Public License as published * +* by the Free Software Foundation, either version 3 or (at your option) * +* any later version. This program is distributed without any warranty. * +* See the files COPYING.lgpl-v3 and COPYING.gpl-v3 for details. 
* +\*************************************************************************/ + +/* Listing 59-8 */ + +#ifndef INET_SOCKETS_H +#define INET_SOCKETS_H /* Prevent accidental double inclusion */ + +#include +#include + +int inetConnect(const char *host, const char *service, int type, int retry_times); + +int inetListen(const char *service, int backlog, socklen_t *addrlen); + +int inetBind(const char *service, int type, socklen_t *addrlen); + +char *inetAddressStr(const struct sockaddr *addr, socklen_t addrlen, + char *addrStr, int addrStrLen); + +#define IS_ADDR_STR_LEN 4096 + /* Suggested length for string buffer that caller + should pass to inetAddressStr(). Must be greater + than (NI_MAXHOST + NI_MAXSERV + 4) */ +#endif diff --git a/systems/Raft-rs/driver/tlpi/tlpi_hdr.h b/systems/Raft-rs/driver/tlpi/tlpi_hdr.h new file mode 100644 index 0000000..580eed2 --- /dev/null +++ b/systems/Raft-rs/driver/tlpi/tlpi_hdr.h @@ -0,0 +1,34 @@ +/*************************************************************************\ +* Copyright (C) Michael Kerrisk, 2022. * +* * +* This program is free software. You may use, modify, and redistribute it * +* under the terms of the GNU Lesser General Public License as published * +* by the Free Software Foundation, either version 3 or (at your option) * +* any later version. This program is distributed without any warranty. * +* See the files COPYING.lgpl-v3 and COPYING.gpl-v3 for details. 
* +\*************************************************************************/ + +/* Listing 3-1 */ + +#ifndef TLPI_HDR_H +#define TLPI_HDR_H /* Prevent accidental double inclusion */ + +#include /* Type definitions used by many programs */ +#include /* Standard I/O functions */ +#include /* Prototypes of commonly used library functions, + plus EXIT_SUCCESS and EXIT_FAILURE constants */ +#include /* Prototypes for many system calls */ +#include /* Declares errno and defines error constants */ +#include /* Commonly used string-handling functions */ + +//#include "get_num.h" /* Declares our functions for handling numeric +// arguments (getInt(), getLong()) */ +// +//#include "error_functions.h" /* Declares our error-handling functions */ + +typedef enum { FALSE, TRUE } Boolean; + +#define min(m,n) ((m) < (n) ? (m) : (n)) +#define max(m,n) ((m) > (n) ? (m) : (n)) + +#endif diff --git a/systems/Raft-rs/scripts/Makefile b/systems/Raft-rs/scripts/Makefile new file mode 100644 index 0000000..a85ec50 --- /dev/null +++ b/systems/Raft-rs/scripts/Makefile @@ -0,0 +1,10 @@ +MC_DIR ?= $(shell realpath $(shell pwd)/../model-simulation/model) +TARGET_MC_DIR ?= $(shell ls -td ${MC_DIR}/*/ | head -1) +TARGET_DIR ?= $(shell ls -td ${TARGET_MC_DIR}/*/ | head -1) +TARGET ?= + +run: build + if test ${TARGET}; then ./start.sh ${TARGET}; else for i in $$(ls ${TARGET_DIR} | grep '^trace.*\.dir$$'); do ./start.sh ${TARGET_DIR}/$$i; tmux a; done; fi + +build: + cd ../.. 
&& cmake -B cmake-build-debug && cmake --build cmake-build-debug -j $(shell nproc) \ No newline at end of file diff --git a/systems/Raft-rs/scripts/raft-rs.ini b/systems/Raft-rs/scripts/raft-rs.ini new file mode 100644 index 0000000..f9cc784 --- /dev/null +++ b/systems/Raft-rs/scripts/raft-rs.ini @@ -0,0 +1,64 @@ +[options] +target: ../../specs/rsRaft.tla +model name: ../scripts/model-simulation/model/sim +; model name: /mnt/data/model-simulation/model/sim +workers: 12 +workers: SHOW_IN_TABLE +simulation depth: 60 +simulation depth: SHOW_IN_TABLE +simulation traces: 10000 +simulation traces: SHOW_IN_TABLE +simulation dump traces: true +check deadlock: false +memory ratio: 0.3 +memory ratio: SHOW_IN_TABLE + +[behavior] +temporal formula: Spec + +[invariants] +TypeOk: TypeOk +INV: INV + +[state constraint] +SC: SC + +[constants] +M_AER: [model value] +M_AE: [model value] +M_PRV: [model value] +M_PRVR: [model value] +M_RVR: [model value] +M_RV: [model value] +M_HB: [model value] +M_HBR: [model value] +M_SNAP: [model value] +Leader: [model value] +Follower: [model value] +PreCandidate: [model value] +Candidate: [model value] +Probe: [model value] +Replicate: [model value] +Snapshot: [model value] +NoOp: [model value] +Nil: [model value] +Commands: [model value]{v1, v2} +Servers: [model value]{n1, n2, n3} +; Parameters: [MaxElectionTimes |-> 2, +; MaxAppendEntriesTimes |-> 3, +; MaxHeartBeatTimes |-> 3, +; MaxPartitionTimes |-> 1, +; MaxClientOperationsTimes |-> 3, +; MaxWireMsgs |-> 6] +; Parameters: SHOW_IN_TABLE +; 这里只能模块化检测,因为直接变为leader 没有投票过程,其他很可能再次出现投票成功选举多个leader +Parameters: [MaxElectionTimes |-> 0, + MaxAppendEntriesTimes |-> 1, + MaxHeartBeatTimes |-> 1, + MaxPartitionTimes |-> 1, + MaxClientOperationsTimes |-> 1, + MaxBecomeLeaderTimes |-> 1, + MaxSnapRequest |-> 1, + MaxWireMsgs |-> 8] +Parameters: SHOW_IN_TABLE + diff --git a/systems/Raft-rs/scripts/run copy.sh b/systems/Raft-rs/scripts/run copy.sh new file mode 100755 index 0000000..ef4ae14 --- /dev/null 
+++ b/systems/Raft-rs/scripts/run copy.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +RUN_SH_DIR=$(dirname "$(realpath "$0")") + +# set RUN_CMDS variable, which is an array of two cmds +function get_run_cmds() { + local tlc_cmd_dir=$RUN_SH_DIR/../../deps/tlc-cmd + python3 "$tlc_cmd_dir/tlcwrapper.py" -d + local run_cmds=$(python3 "$tlc_cmd_dir/tlcwrapper.py" -sg $RUN_SH_DIR/raft-rs.ini) + local ifsbak="$IFS" + IFS=$'\n' + RUN_CMDS=($run_cmds) + IFS="$ifsbak" +} + +function run() { + # cd {{model dir}} + eval ${RUN_CMDS[0]} + if test -d "$RUN_SH_DIR/../../venv"; then + source "$RUN_SH_DIR/../../venv/bin/activate" + fi + python3 $RUN_SH_DIR/run.py . & + PID_RUN_PY=$! + bash -c "set -x; ${RUN_CMDS[1]} -tool" | tee MC.out + trap "trap '' TERM INT" TERM INT + trap "kill $PID_RUN_PY 2>/dev/null" EXIT + local max_secs=60 + for ((i=0;i<$max_secs;i++)); do + if kill -0 $PID_RUN_PY 2>/dev/null; then + sleep 1 + else + break + fi + done + if test "$i" -eq $max_secs; then +# kill $(ps -ef | grep model-simulation/run.py | grep -v grep | awk '{print $2}') 2>/dev/null + kill $PID_RUN_PY 2>/dev/null + fi +} + +# See https://stackoverflow.com/questions/2683279/how-to-detect-if-a-script-is-being-sourced +function is_sourced() { + if [ -n "$ZSH_VERSION" ]; then + case $ZSH_EVAL_CONTEXT in *:file:*) return 0;; esac + else # Add additional POSIX-compatible shell names here, if needed. + case ${0##*/} in dash|-dash|bash|-bash|ksh|-ksh|sh|-sh) return 0;; esac + fi + return 1 # NOT sourced. +} + +if ! 
is_sourced; then + get_run_cmds + run +fi diff --git a/systems/Raft-rs/scripts/run-valgrind.py b/systems/Raft-rs/scripts/run-valgrind.py new file mode 100644 index 0000000..ece5947 --- /dev/null +++ b/systems/Raft-rs/scripts/run-valgrind.py @@ -0,0 +1,279 @@ +import sys +import os +import signal +import argparse + +disable_check = False + +script_dir = os.path.dirname(__file__) +deps_dir = os.path.join(os.path.dirname(os.path.dirname(script_dir)), 'deps') +generator_script = os.path.join(script_dir, 'testcase_generator.py') +config_file = os.path.join(script_dir, 'config.txt') +gen_config_script_lxd = os.path.join(os.path.dirname(script_dir), 'configuration', 'gen_3_config_lxd.sh') +gen_config_script_docker = os.path.join(os.path.dirname(script_dir), 'configuration', 'gen_3_config.sh') +check_script_lxd = os.path.join(os.path.dirname(script_dir), 'run-testcase', 'start-lxd-valgrind.sh') +check_script_docker = os.path.join(os.path.dirname(script_dir), 'run-testcase', 'start.sh') +num_servers = 3 + +if os.path.exists('/.dockerenv'): + check_script = check_script_docker + gen_config_script = gen_config_script_docker + subnet_prefix = '1' +else: + check_script = check_script_lxd + gen_config_script = gen_config_script_lxd + subnet_prefix = '2' + +sys.path.append(os.path.join(deps_dir, 'tlc-cmd')) +# print(sys.path) + +from trace_reader import TraceReader + +import jq +import json +import inotify.adapters +import time +from datetime import datetime +from multiprocessing import Pool, Value +import subprocess + +def _sigint_handler(): + signal.signal(signal.SIGINT, signal.SIG_DFL) + print('Caught signal, exiting...') + for proc in processes: + proc.terminate() + print_progress(False) + print_unreached_branches(counter) + print_inv_violated_files() + for proc in processes: + proc.join() + exit(0) + +def sigint_handler(signum, frame): + _sigint_handler() + + +signal.signal(signal.SIGINT, sigint_handler) + + +counter = { + "Init": 0, + "BecomePrecandidate": 0, + 
"RecvRequestVote": { + "not-vote: has leader": 0, + "not-vote: term bigger": 0, + "not-vote: already voted": 0, + "not-vote: log newer": 0, + "voted": 0 + }, + "RecvRequestVoteResponse": { + "BecomeCandidate": 0, + "BecomeLeader": 0, + "term is smaller": 0, + "vote is stale": 0, + "granted": 0, + "not granted": 0 + }, + "SendAppendentriesAll": 0, + "RecvAppendentries": { + "term is bigger": 0, + "no prev log": 0, + "term mismatch": 0, + "success": 0 + }, + "RecvAppendentriesResponse": { + "not leader": 0, + "msg_id is bigger": 0, + "term is smaller": 0, + "stale response": 0, + "retry": 0, + "success": 0 + }, + "RecvEntry": 0, + "DoNetworkPartition": 0, + "DoNetworkCure": 0, +} + +def is_trace_file(fn): + return fn.startswith("trace_") and not fn.endswith(".dir") + +tr = TraceReader() +processed_files = 0 +jq_cmd = jq.compile('.[] | .netcmd | .[0] | .[0:2]') +jq_inv = jq.compile('.[] | .inv | .[]') +# jq_length = jq.compile('length') +inv_violated = False +inv_violated_files = {} +total_states = 0 +test_case_dir = 'testcase{}'.format(datetime.now().strftime("_%Y-%m-%d_%H-%M-%S")) + +n_process = 10 + +def init_worker(): + def handler(signum, frame): + print('Pool: process killed by signal:', signum) + exit(1) + signal.signal(signal.SIGINT, handler) + signal.signal(signal.SIGTERM, handler) + +config_file_list = [] + +def gen_config(): + global config_file_list + for i in range(n_process): + cur_config_dir = os.path.join(test_case_dir, str(i)) + cur_config = os.path.join(cur_config_dir, 'config.txt') + os.makedirs(cur_config_dir, exist_ok=True) + # os.system('bash {} {} 10.{}.{}.0/24 {}'.format(gen_config_script, num_servers, subnet_prefix, i, cur_config)) + ret = subprocess.run(['bash', gen_config_script, str(num_servers), '10.{}.{}.0/24'.format(subnet_prefix, i), cur_config], shell=False, check=False) + if ret.returncode < 0: + _sigint_handler() + config_file_list.append(cur_config) + +def run_testcase_worker(fn): + # print('env -u TMUX bash {} 
{}'.format(check_script, os.path.realpath(os.path.join('.', test_case_dir, fn+'.dir')))) + os.system('env -u TMUX bash {} {}'.format(check_script, os.path.realpath(os.path.join('.', test_case_dir, fn+'.dir')))) + +def run_testcase(fn): + slot = processed_files % n_process + processes[slot].apply_async(run_testcase_worker, (fn,)) + +from testcase_generator import gen_trace +def process_file(fn): + global processed_files, inv_violated, total_states + processed_files += 1 + states = list(tr.trace_reader(fn)) + for i in jq_cmd.input(states): + level_1 = counter[i[0]] + if isinstance(level_1, int): + counter[i[0]] += 1 + else: + level_1[i[1]] += 1 + if not all(jq_inv.input(states)): + # inv_violated_files.append(fn) + inv_violated = True + inv_violated_files[fn] = list(jq.compile('.[-1] | .inv | .[]').input(states)) + # total_states += jq_length.input(states).all()[0] + total_states += len(states) + # gen_trace(states) + if fn == 'MC.out' and not inv_violated: + return + # os.system('python3 {} -I {} -c {} {}'.format(generator_script, test_case_dir, config_file_list[processed_files%n_process], fn)) + ret = subprocess.run(['python3', generator_script, '-I', test_case_dir, '-c', config_file_list[processed_files%n_process], fn], shell=False, check=False) + if ret.returncode < 0: + _sigint_handler() + # print(os.path.realpath(os.path.join('.', test_case_dir, fn+'.dir'))) + # os.system('bash {} {}'.format(check_script, os.path.realpath(os.path.join('.', test_case_dir, fn+'.dir')))) + if (not disable_check) and ret.returncode == 0: + run_testcase(fn) + +def print_inv_violated_files(): + if inv_violated_files: + print('# Invariants violated files: ') + print(json.dumps(inv_violated_files, indent=4)) + +prev_time = 0 +period = 5 +def print_progress(wait_period=True): + global prev_time, period + current_time = time.time() + if current_time - prev_time >= period or not wait_period: + print('# Total states:', total_states) + cache = 0 + for i in processes: + cache += 
len(i._cache) + run_files = processed_files - cache + print('# Run testcases: {}/{} ({})'.format(run_files, processed_files, (run_files + 1) / (processed_files + 1))) + print("# COVERAGE of {} traces".format(processed_files)) + print(json.dumps(counter, indent=4)) + prev_time = current_time + print_inv_violated_files() + +def print_unreached_branches(d, keys=None): + tmp_keys = [] if keys is None else keys + if keys is None: + print("# Unreached branches") + for k,v in d.items(): + if isinstance(v, int): + if v == 0: + print(tmp_keys + [k]) + else: + print_unreached_branches(v, tmp_keys + [k]) + + +use_inotify=False +trace_dir = '' + + +def process_dir(): + global trace_dir + gen_config() + finish_file = "MC.out" + if use_inotify: + i = inotify.adapters.Inotify() + i.add_watch('.') + while True: + event = None + + for event in i.event_gen(yield_nones=False, timeout_s=1): + (_, type_names, _, filename) = event + event = None + if type_names == ['IN_CLOSE_WRITE']: + if not is_trace_file(filename) and filename != finish_file: + continue + process_file(filename) + print_progress() + if filename == finish_file: + event = "" + break + print_progress() + if event is not None: + break + else: + file_list = [i for i in os.listdir() if is_trace_file(i) or i == finish_file] + for i in file_list: + process_file(i) + print_progress() + +#TEST +# process_file('/mnt/data/GitHub/Work/PySyncObjTLA/model/ae_2022-09-26_20-42-23_1/trace_0_0') +# print_progress(False) + + +def parse_args(): + global n_process, num_servers, disable_check, use_inotify, trace_dir + parser = argparse.ArgumentParser(description='Convert TLA+ trace to test cases and run them!') + parser.add_argument(dest='trace_dir', action='store', help='TLA trace dir') + parser.add_argument('-p', dest='proc_num', action='store', type=int, required=False, + help='Number of parallel running test case') + parser.add_argument('-s', dest='server_num', action='store', type=int, required=False, + help='Number of servers') + 
parser.add_argument('-d', dest='disable_check', action='store_true', required=False, + help='Disable check') + parser.add_argument('-i', dest='iterate_dir', action='store_true', required=False, + help='Iterate dir instead of use inotify') + args = parser.parse_args() + if args.proc_num: + n_process = args.proc_num + if args.server_num: + num_servers = args.server_num + if args.disable_check: + disable_check = True + if args.iterate_dir: + use_inotify = False + trace_dir = args.trace_dir + os.chdir(trace_dir) + + +if __name__ == '__main__': + # iterate_dir(False) + parse_args() + processes = [] + for _ in range(n_process): + processes.append(Pool(1, init_worker)) + process_dir() + for proc in processes: + proc.close() + proc.join() + print_unreached_branches(counter) + print_inv_violated_files() diff --git a/systems/Raft-rs/scripts/run.py b/systems/Raft-rs/scripts/run.py new file mode 100644 index 0000000..655ed03 --- /dev/null +++ b/systems/Raft-rs/scripts/run.py @@ -0,0 +1,306 @@ +import sys +import os +import signal +import argparse + +disable_check = False + +script_dir = os.path.dirname(__file__) +deps_dir = os.path.join(os.path.dirname(os.path.dirname(script_dir)), 'deps') +generator_script = os.path.join(script_dir, 'testcase_generator.py') +config_file = os.path.join(script_dir, 'config.txt') +gen_config_script_lxd = os.path.join(os.path.dirname(script_dir), 'configuration', 'gen_3_config_lxd.sh') +gen_config_script_docker = os.path.join(os.path.dirname(script_dir), 'configuration', 'gen_3_config.sh') +check_script_lxd = os.path.join(os.path.dirname(script_dir), 'run-testcase', 'start-lxd.sh') +check_script_docker = os.path.join(os.path.dirname(script_dir), 'run-testcase', 'start.sh') +num_servers = 3 + +if os.path.exists('/.dockerenv'): + check_script = check_script_docker + gen_config_script = gen_config_script_docker + subnet_prefix = '1' +else: + check_script = check_script_lxd + gen_config_script = gen_config_script_lxd + subnet_prefix = '2' + 
+sys.path.append(os.path.join(deps_dir, 'tlc-cmd')) +# print(sys.path) + +from trace_reader import TraceReader + +import jq +import json +import inotify.adapters +import time +from datetime import datetime +from multiprocessing import Pool +import subprocess + +def _sigint_handler(): + signal.signal(signal.SIGINT, signal.SIG_DFL) + print('Caught signal, exiting...') + for proc in processes: + proc.terminate() + print_progress(False) + print_unreached_branches(counter) + print_inv_violated_files() + for proc in processes: + proc.join() + exit(0) + +def sigint_handler(signum, frame): + _sigint_handler() + + +signal.signal(signal.SIGINT, sigint_handler) + + +counter = { + "Init": 0, + "BecomeCandidate": 0, + "RecvRequestVote": { + "stale message ignore": 0, + "not-vote: term bigger": 0, + "not-vote: can not vote": 0, + "not-vote: log newer": 0, + "voted": 0 + }, + "RecvRequestVoteResponse": { + "Won-BecomeLeader": 0, + "Lost": 0, + "Pending": 0, + "not candidate": 0, + "term is smaller": 0, + "vote is stale": 0 + }, + "SendAppendentriesAll": 0, + "SendHeartBeatAll": 0, + "RecvAppendentries": { + "stale message ignore": 0, + "log stale commit": 0, + "success": 0, + "term Mismatch": 0, + "snap_shot ask": 0, + }, + "RecvAppendentriesResponse": { + "stale message ignore": 0, + "not leader": 0, + "term is smaller": 0, + "commit change": 0, + "maybe_update_fail": 0, + "old_pause_pause": 0, + "old_pause_send": 0, + "aggressive_pause": 0, + "aggressive_empty": 0, + "aggressive_has": 0, + "decr_replicate stale": 0, + "decr_replicate send": 0, + "decr_probe stale": 0, + "decr_probe pause": 0, + "decr_probe send": 0, + }, + "RecvHeartBeat": { + "stale message ignore": 0, + "success": 0 + }, + "RecvHeartBeatResponse": { + "stale message ignore": 0, + "term is smaller": 0, + "send append": 0, + "not send" : 0 + }, + "RecvSnapShot": { + "stale message ignore": 0, + "restore fail-commit_already": 0, + "restore success": 0, + }, + "RecvEntry": 0, + "DoBecomeLeader": 0, + 
"DoNetworkPartition": 0, + "DoNetworkCure": 0, + "DoSnapRequest": 0, +} + +def is_trace_file(fn): + return fn.startswith("trace_") and not fn.endswith(".dir") + +tr = TraceReader() +processed_files = 0 +jq_cmd = jq.compile('.[] | .netcmd | .[0] | .[0:2]') +jq_inv = jq.compile('.[] | .inv | .[]') +# jq_length = jq.compile('length') +inv_violated = False +inv_violated_files = {} +total_states = 0 +test_case_dir = 'testcase{}'.format(datetime.now().strftime("_%Y-%m-%d_%H-%M-%S")) + +n_process = 10 + +def init_worker(): + def handler(signum, frame): + print('Pool: process killed by signal:', signum) + exit(1) + signal.signal(signal.SIGINT, handler) + signal.signal(signal.SIGTERM, handler) + +config_file_list = [] + +def gen_config(): + global config_file_list + for i in range(n_process): + cur_config_dir = os.path.join(test_case_dir, str(i)) + cur_config = os.path.join(cur_config_dir, 'config.txt') + os.makedirs(cur_config_dir, exist_ok=True) + # os.system('bash {} {} 10.{}.{}.0/24 {}'.format(gen_config_script, num_servers, subnet_prefix, i, cur_config)) + ret = subprocess.run(['bash', gen_config_script, str(num_servers), '10.{}.{}.0/24'.format(subnet_prefix, i), cur_config], shell=False, check=False) + if ret.returncode < 0: + _sigint_handler() + config_file_list.append(cur_config) + +def run_testcase_worker(fn): + # print('env -u TMUX bash {} {}'.format(check_script, os.path.realpath(os.path.join('.', test_case_dir, fn+'.dir')))) + os.system('env -u TMUX bash {} {}'.format(check_script, os.path.realpath(os.path.join('.', test_case_dir, fn+'.dir')))) + +def run_testcase(fn): + slot = processed_files % n_process + processes[slot].apply_async(run_testcase_worker, (fn,)) + +def process_file(fn): + global processed_files, inv_violated, total_states + processed_files += 1 + states = list(tr.trace_reader(fn)) + for i in jq_cmd.input(states): + level_1 = counter[i[0]] + if isinstance(level_1, int): + counter[i[0]] += 1 + else: + level_1[i[1]] += 1 + if not 
all(jq_inv.input(states)): + # inv_violated_files.append(fn) + inv_violated = True + inv_violated_files[fn] = list(jq.compile('.[-1] | .inv | .[]').input(states)) + # total_states += jq_length.input(states).all()[0] + total_states += len(states) + # gen_trace(states) + if fn == 'MC.out' and not inv_violated: + return + # os.system('python3 {} -I {} -c {} {}'.format(generator_script, test_case_dir, config_file_list[processed_files%n_process], fn)) + ret = subprocess.run(['python3', generator_script, '-I', test_case_dir, '-c', config_file_list[processed_files%n_process], fn], shell=False, check=False) + if ret.returncode < 0: + _sigint_handler() + # print(os.path.realpath(os.path.join('.', test_case_dir, fn+'.dir'))) + # os.system('bash {} {}'.format(check_script, os.path.realpath(os.path.join('.', test_case_dir, fn+'.dir')))) + if (not disable_check) and ret.returncode == 0: + run_testcase(fn) + +def print_inv_violated_files(): + if inv_violated_files: + print('# Invariants violated files: ') + print(json.dumps(inv_violated_files, indent=4)) + +prev_time = 0 +period = 5 +def print_progress(wait_period=True): + global prev_time, period + current_time = time.time() + if current_time - prev_time >= period or not wait_period: + print('# Total states:', total_states) + cache = 0 + for i in processes: + cache += len(i._cache) + run_files = processed_files - cache + print('# Run testcases: {}/{} ({})'.format(run_files, processed_files, (run_files + 1) / (processed_files + 1))) + print("# COVERAGE of {} traces".format(processed_files)) + print(json.dumps(counter, indent=4)) + prev_time = current_time + print_inv_violated_files() + +def print_unreached_branches(d, keys=None): + tmp_keys = [] if keys is None else keys + if keys is None: + print("# Unreached branches") + for k,v in d.items(): + if isinstance(v, int): + if v == 0: + print(tmp_keys + [k]) + else: + print_unreached_branches(v, tmp_keys + [k]) + + +use_inotify=True +trace_dir = '' + + +def process_dir(): + global 
trace_dir + gen_config() + finish_file = "MC.out" + if use_inotify: + i = inotify.adapters.Inotify() + i.add_watch('.') + while True: + event = None + + for event in i.event_gen(yield_nones=False, timeout_s=1): + (_, type_names, _, filename) = event + event = None + if type_names == ['IN_CLOSE_WRITE']: + if not is_trace_file(filename) and filename != finish_file: + continue + process_file(filename) + print_progress() + if filename == finish_file: + event = "" + break + print_progress() + if event is not None: + break + else: + file_list = [i for i in os.listdir() if is_trace_file(i) or i == finish_file] + for i in file_list: + process_file(i) + print_progress() + +#TEST +# process_file('/mnt/data/GitHub/Work/PySyncObjTLA/model/ae_2022-09-26_20-42-23_1/trace_0_0') +# print_progress(False) + + +def parse_args(): + global n_process, num_servers, disable_check, use_inotify, trace_dir + parser = argparse.ArgumentParser(description='Convert TLA+ trace to test cases and run them!') + parser.add_argument(dest='trace_dir', action='store', help='TLA trace dir') + parser.add_argument('-p', dest='proc_num', action='store', type=int, required=False, + help='Number of parallel running test case') + parser.add_argument('-s', dest='server_num', action='store', type=int, required=False, + help='Number of servers') + parser.add_argument('-d', dest='disable_check', action='store_true', required=False, + help='Disable check') + parser.add_argument('-i', dest='iterate_dir', action='store_true', required=False, + help='Iterate dir instead of use inotify') + args = parser.parse_args() + if args.proc_num: + n_process = args.proc_num + if args.server_num: + num_servers = args.server_num + if args.disable_check: + disable_check = True + if args.iterate_dir: + use_inotify = False + trace_dir = args.trace_dir + os.chdir(trace_dir) + + +if __name__ == '__main__': + # iterate_dir(False) + parse_args() + processes = [] + for _ in range(n_process): + processes.append(Pool(1, init_worker)) + 
process_dir() + for proc in processes: + proc.close() + proc.join() + print_unreached_branches(counter) + print_inv_violated_files() diff --git a/systems/Raft-rs/scripts/run.sh b/systems/Raft-rs/scripts/run.sh new file mode 100755 index 0000000..a35c82c --- /dev/null +++ b/systems/Raft-rs/scripts/run.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +PROJECT_DIR=$(realpath $(dirname "$(realpath "$0")")/../..) +MC_DIR=$PROJECT_DIR/scripts/model-simulation/model +TARGET_MC_DIR=$(ls -td ${MC_DIR}/*/ | head -1) +TARGET_DIR=$(ls -td ${TARGET_MC_DIR}/*/ | head -1) +START_SCRIPT=$(dirname "$(realpath "$0")")/start-lxd.sh + +inotifywait -e close_write -m -r -q --format '%w %f' $TARGET_DIR | while read line; do + a=($line) + if [[ ${a[0]} =~ .*MC.out.dir/$ ]]; then + is_exit=true + elif test ${a[1]} != 'traces.txt'; then + continue + fi + echo "${a[0]}" + $START_SCRIPT "${a[0]}" + if test "$is_exit" = true; then + kill -10 $$ + fi +done \ No newline at end of file diff --git a/systems/Raft-rs/scripts/run_one_testcase.sh b/systems/Raft-rs/scripts/run_one_testcase.sh new file mode 100755 index 0000000..f95c5dc --- /dev/null +++ b/systems/Raft-rs/scripts/run_one_testcase.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +PROJECT_DIR=$(realpath $(dirname "$(realpath "$0")")/../..) 
+SCRIPTS_DIR=$PROJECT_DIR/scripts + +gen_config_script=$SCRIPTS_DIR/configuration/gen_3_config.sh +generator_script=$SCRIPTS_DIR/model-simulation/testcase_generator.py +check_script=$SCRIPTS_DIR/run-testcase/start.sh + +TRACE=$1 +TRACE_DIR=$(realpath $(dirname $TRACE)) +TEST_DIR=$TRACE_DIR/test +TEST_TRACE=$TEST_DIR/trace +CONFIG_FILE=$TEST_DIR/config/config.txt +DEBUG=${2:+-d} + +set -e +mkdir -p $TEST_DIR/config +cp $TRACE $TEST_TRACE +bash $gen_config_script 3 10.1.0.0/24 $CONFIG_FILE +cd $TEST_DIR +python3 $generator_script -i -c $CONFIG_FILE $TEST_TRACE + +env -u TMUX bash $check_script $DEBUG $TEST_DIR diff --git a/systems/Raft-rs/scripts/start.sh b/systems/Raft-rs/scripts/start.sh new file mode 100755 index 0000000..a6596b4 --- /dev/null +++ b/systems/Raft-rs/scripts/start.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +PROJECT_DIR=$(realpath $(dirname "$(realpath "$0")")/../..) +CONTROLLER=$PROJECT_DIR/cmake-build-debug/controller/controller +INTERCEPTRO_SH=$PROJECT_DIR/cmake-build-debug/interceptor/run.sh +SPSSH_SH=$PROJECT_DIR/deps/spssh/spssh.sh +CLIENT=$PROJECT_DIR/cmake-build-debug/client/RedisTMet + +function usage() { + echo "Usage: start.sh trace_xxx.dir" + exit 1 +} + +TESTCASE_DIR=$1 +if ! 
test -d "$TESTCASE_DIR"; then + usage +fi +TESTCASE_DIR=$(realpath $TESTCASE_DIR) + +export TMPDIR=$(mktemp -u -d -p $TESTCASE_DIR) +mkdir -p $TMPDIR + +cd $TESTCASE_DIR +CONFIG_FILE=$(realpath config/config.txt) + +HOST_CMD="$CONTROLLER -detail -config $CONFIG_FILE -tmpdir $TMPDIR; exit" + +cat </dev/null +tmux wait-for "$TMPDIR" \ No newline at end of file diff --git a/systems/Raft-rs/scripts/testcase_generator.py b/systems/Raft-rs/scripts/testcase_generator.py new file mode 100644 index 0000000..fd15dcd --- /dev/null +++ b/systems/Raft-rs/scripts/testcase_generator.py @@ -0,0 +1,326 @@ +import sys +import os +import jq +import json +import argparse +import shutil + +show_status=True + +def set_path(): + script_dir = os.path.dirname(os.path.realpath(__file__)) + deps_dir = os.path.join(os.path.dirname(os.path.dirname(script_dir)), 'deps') + sys.path.append(os.path.realpath(os.path.join(deps_dir, 'tlc-cmd'))) + # print(sys.path) + +set_path() + +from trace_reader import TraceReader + +# def sigint_handler(signum, frame): +# print('TC-Gen Caught SIGINT, Ctrl+C again to exit') +# signal.signal(signal.SIGINT, signal.SIG_DFL) +# +# +# signal.signal(signal.SIGINT, sigint_handler) + +default_config = 'config.txt' +default_output = 'traces.txt' +default_conn_fd = 1022 +default_node_port = 9000 +default_debug = False +nodes = dict() +node_port = None + + +def parse_args(): + parser = argparse.ArgumentParser(description="Test case generator for PySyncObj") + parser.add_argument(dest='trace_file', action='store', help='Trace file') + parser.add_argument('-c', dest='config', action='store', required=True, help='Config file', + default=default_config) + parser.add_argument('-o', dest='output', action='store', required=False, help='Output trace file', + default=default_output) + parser.add_argument('-f', dest='conn_fd', action='store', required=False, + help='Interceptor<->Controller connection fd', default=default_conn_fd) + parser.add_argument('-n', dest='node_port', 
action='store', required=False, help='Port of test nodes', + default=default_node_port) + parser.add_argument('-d', dest='debug', action='store_true', required=False, help='Print debug msg', + default=default_debug) + parser.add_argument('-i', dest='in_place', action='store_true', required=False, help='Generate in current dir', + default=False) + parser.add_argument('-I', dest='in_dir', action='store', required=False, + help='Generate in the specific dir under trace dir', default='') + arg_parser = parser.parse_args() + if not arg_parser.in_place: + arg_parser.trace_file = os.path.realpath(arg_parser.trace_file) + arg_parser.config = os.path.realpath(arg_parser.config) + + if arg_parser.in_dir: + dir_name = os.path.join(os.path.dirname(arg_parser.trace_file), arg_parser.in_dir) + os.makedirs(dir_name, mode=0o755, exist_ok=True) + else: + dir_name = os.path.dirname(arg_parser.trace_file) + base_name = os.path.basename(arg_parser.trace_file) + test_case_dir = base_name + '.dir' + # new_name = os.path.join(test_case_dir, base_name) + + os.chdir(dir_name) + os.makedirs(test_case_dir, mode=0o755, exist_ok=True) + # os.mkdir(test_case_dir, mode=0o755) + # os.rename(arg_parser.trace_file, new_name) + # arg_parser.trace_file = new_name + os.chdir(test_case_dir) + + return arg_parser + + +def eprint(*largs, **kvargs): + if args.debug: + print(*largs, **kvargs, file=sys.stderr) + + +def read_config(): + global nodes, node_port + with open(args.config) as f: + map_cidr = dict() + for line in f: + line = line.strip() + if line.startswith('map-cidr'): + _, fake, real = [cidr.replace('.0/24', '') for cidr in line.split(' ', 3)] + map_cidr[real] = fake + eprint('Read cmd:', 'map-cidr', fake + '.0/24', real + '.0/24') + elif line.startswith('node'): + _, name, ip = line.split(' ', 3) + for k, v in map_cidr.items(): + if k in ip: + nodes[name] = ip.replace(k, v) + eprint('Read cmd:', 'node', name, nodes[name]) + break + elif line.startswith('router'): + _, router_addr = 
line.split(' ', 2) + _, router_port = router_addr.split(':', 2) + try: + router_port = int(router_port) + node_port = router_port + except Exception: + eprint('Router port is invalid') + pass + else: + eprint('Ignored cmd:', line) + if node_port is None: + node_port = args.node_port + + +def check_test_code_is_generated(config_file, testcase_in_parent_dir): + if args.in_place: + return False + for i in nodes.keys(): + if os.path.exists(os.path.join(os.path.dirname(config_file) if testcase_in_parent_dir else '.', i + '.py')): + return True + return False + + +def yield_trace(states): + model_value = '' + model_value_replace = {"Follower": "1", "PreCandidate": "2", "Candidate": "3", "Leader": "4"} + # for k,v in nodes.items(): + # model_value_replace["TCPNode('{}:{}')".format(v, node_port)] = k + def get_converted_model_value(n, model_var_name): + nonlocal model_value + if model_var_name == 'log': + model_value = list(map(lambda t: tuple(t.values()), model_value)) + elif model_var_name == 'raftState': + model_value = model_value_replace[model_value] + elif model_var_name in {'nextIndex', 'matchIndex', 'progress', 'pr_pending'}: + if n in model_value: + model_value.pop(n) + + return str(model_value) + def compare(n, code_var_name, model_var_name, no_compare=False): + nonlocal model_value + yield ['execute', n, 'get {}'.format(code_var_name)] + model_value = cur_state[model_var_name][n] + yield ['#', 'variable', model_var_name, n, get_converted_model_value(n, model_var_name)] + if not no_compare: + yield ['compare', 'variable'] + else: + yield ['compare', 'none'] + + def do_tick(n, is_compare=True): + nonlocal model_value + yield ['execute', n, 'raft periodic'] + yield ['nop'] + if is_compare: + yield from compare(n, 'commit_idx', 'commitIndex') + yield from compare(n, 'current_term', 'currentTerm') + yield from compare(n, 'pending_snapshot', 'pending_snapshot') + yield from compare(n, 'log', 'log') + yield from compare(n, 'state', 'raftState') + if model_value == "4": 
# leader + yield from compare(n, 'progress', 'progress', no_compare=True) + yield from compare(n, 'pr_pending', 'pr_pending') + yield from compare(n, 'next_idx', 'nextIndex') + yield from compare(n, 'match_idx', 'matchIndex') + + def deliver(src, dst): + yield ['deliver', src, dst] + # yield ['loop', 'intercept', dst, 'check_has_recv_queue', nodes[src]+':0'] + yield ['loop', 'intercept', dst, 'check_has_recv_queue', src] + yield ['execute', dst, 'raft recvfrom {}'.format(src)] + yield from do_tick(dst) + partitioned_nodes = [] + jq_trace = jq.compile('.[].netcmd') + jq_msgs = jq.compile('.[].msgs') + states_counter = 0 + for i, msg, cur_state in zip(jq_trace.input(states), jq_msgs.input(states), states): + states_counter += 1 + + if len(i) > 1: + comment = i[0] + netcmd = i[1] + cmd, *parameters = netcmd + # print("comment\n", comment) + # print("cmd\n", cmd) + # print("parameters\n", parameters) + + yield ['#', '[' + str(states_counter) + ']'] + [str(comment)] + yield ['#', json.dumps(netcmd)] + if cmd == 'conn_part_flush': # network partition + # TODO: currently only one node is partitioned + partitioned_nodes.append(parameters[0][0]) + yield ['partition', parameters[0][0]] + elif cmd == 'conn_cure': # network cure + for n in partitioned_nodes: + yield ['recover', n] + for _ in range(2): + for j in nodes: + yield ['execute', j, 'net connectall'] + # yield ['intercept', j, 'inc_time_ms', '60'] # > 0.05s reconnection + yield from do_tick(j) + yield ['wait-recover'] + for j in nodes: + yield ['loop', 'execute', j, 'net isallconnected'] + partitioned_nodes = [] + elif cmd == 'msg_del': # recv msg (deliver to node) + # yield ['deliver', parameters[0], parameters[1]] + yield from deliver(parameters[1], parameters[0]) + elif cmd == 'msg_add': # send msg (enqueued in controller) + if comment[0] == 'DoSnapRequest': + # yield from batch_tick(election_tick) + yield ['execute', comment[1], 'raft askSnap'] + yield ['nop'] + yield from do_tick(comment[1]) + # assert False + # 
pass # not used + elif cmd == 'msg_add_dropped': # send msg but dropped due to partition + assert False + pass # not used + elif cmd == 'msg_reply': # recv (deliver) msg and send (enqueue) msg + # yield ['deliver', parameters[1], parameters[0]] + yield from deliver(parameters[1], parameters[0]) + elif cmd == 'msg_reply_dropped': # reply but dropped + # yield ['deliver', parameters[1], parameters[0]] + yield from deliver(parameters[1], parameters[0]) + elif cmd == 'msg_do_nothing': + if comment[0] == 'RecvEntry': + yield ['execute', comment[1], "raft cli {}".format(comment[2])] + yield ['nop'] + elif comment[0] == 'DoBecomeLeader': + yield ['execute', comment[1], 'raft BecomeLeader'] + yield ['nop'] + yield from do_tick(comment[1]) + elif cmd == 'msg_batch_add': # batch send msgs + def batch_tick(n) : + for _ in range(n): + yield ['execute', comment[1], 'raft periodic'] + # yield ['execute', comment[1], 'raft periodic'] + # election_tick heart_beat_tick 都是和rust相统一 + election_tick = 11 + heart_beat_tick = 4 + if comment[0] == 'BecomePrecandidate': + # yield from batch_tick(election_tick) + yield ['execute', comment[1], 'raft campaign pre'] + yield ['nop'] + elif comment[0] == 'ElectionTimeout': + yield ['execute', comment[1], 'raft campaign normal'] + yield ['nop'] + # yield ['execute', comment[1], 'raft campaign normal'] + elif comment[0] == 'BecomeCandidate': + yield ['execute', comment[1], 'raft campaign normal'] + yield ['nop'] + # yield ['execute', comment[1], 'raft campaign normal'] + elif comment[0] == 'DoBecomeLeader': + yield ['execute', comment[1], 'raft BecomeLeader'] + yield ['nop'] + elif comment[0] == 'SendAppendentriesAll': + yield ['execute', comment[1], 'raft bcast_append'] + yield ['nop'] + elif comment[0] == 'SendHeartBeatAll': + yield ['execute', comment[1], 'raft heartBeat'] + yield ['nop'] + # yield ['intercept', comment[1], 'inc_time_ms', '300'] # > 0.2s + elif comment[0] == 'RecvEntry': + yield ['execute', comment[1], "raft cli 
{}".format(comment[2])] + yield from do_tick(comment[1]) + elif cmd == 'msg_batch_add_reply': # recv msg and batch send msgs + # yield ['deliver', parameters[1], parameters[0]] + yield from deliver(parameters[1], parameters[0]) + else: + yield ['init', str(i[0][1])] + for j in nodes: + yield ['execute', j, 'net connectall'] + yield ['execute', j, 'raft init'] + # yield from do_tick(j) + yield ['wait-init', str(i[0][1])] + # yield ['deliver-all', str(len(nodes))] + for j in nodes: + yield ['loop', 'execute', j, 'net isallconnected'] + for j in nodes: + yield from do_tick(j) + if show_status: + yield ['status'] + for src in nodes: + msgs_info_str = 'msgs {}:'.format(src) + for dst in nodes: + if src != dst: + msgs_info_str += ' {}({})'.format(dst, len(msg[src][dst])) + yield ['#', msgs_info_str] + yield ['compare', 'net'] + yield ['finish one step'] + for i in nodes: + yield ['execute', i, "raft statemachine"] + eprint("Finish write:", args.output) + +def copy_config_file(config_file, testcase_in_parent_dir): + try: + if testcase_in_parent_dir: + os.symlink(os.path.dirname(config_file), 'config') + else: + shutil.copy2(config_file, os.path.join('..' 
if testcase_in_parent_dir else '.', os.path.basename(config_file))) + except: + pass + +def gen_trace(): + tr = TraceReader(True) + try: + os.symlink(args.trace_file, 'tlc_trace.txt') + except: + pass + states = list(tr.trace_reader(args.trace_file)) + eprint('Read states:', len(states)) + traces = list(yield_trace(states)) + # for index, trace in enumerate(traces): + # for sub_index, item in enumerate(trace): + # if not isinstance(item, str): + # print(f"在 traces 的第 {index} 项的第 {sub_index} 个元素是非字符串类型,其值为 {item},类型为 {type(item)}") + + # print('\n'.join(' '.join(i) for i in traces)) + with open('traces.txt', 'w') as f: + f.write('\n'.join(' '.join(i) for i in traces)) + + +if __name__ == '__main__': + args = parse_args() + read_config() + copy_config_file(args.config, not args.in_place) + gen_trace() diff --git a/systems/Raft-rs/specs/FifoNetwork.tla b/systems/Raft-rs/specs/FifoNetwork.tla new file mode 100644 index 0000000..d1bae76 --- /dev/null +++ b/systems/Raft-rs/specs/FifoNetwork.tla @@ -0,0 +1,376 @@ +---------------------------- MODULE FifoNetwork ---------------------------- + +EXTENDS Sequences, Naturals, FiniteSets, TLC + +(*************************************************************************** + VARIABLES definitions: see InitFifoNetwork + ***************************************************************************) +VARIABLES _msgs, \* Messages in the network + _netman, \* Network manager + _netcmd \* Current network cmd + +(*************************************************************************** + FLUSH_DISCONN: + * If true, drop all the network wired msgs that are not accessible + * If false, do not drop, and msgs can still be delivered + ***************************************************************************) +CONSTANT FLUSH_DISCONN + +(*************************************************************************** + NULL_MSG: represent a null msg in some condition checkings + * Should be a model value if its type is CONSTANT + 
***************************************************************************) +CONSTANT NULL_MSG +\*NULL_MSG == [ NULL_MSG |-> "" ] + +---- \* Common functions + +(*************************************************************************** + API InitFifoNetwork(nodes): + - _msgs: init to emtpy sequences of [src][dst] records + * format: [ seq |-> 0, src |-> s0, dst |-> s1, type |-> sth, data |-> sth] + * src and dst will be dropped when storing in _msgs + * type and data are user defined fields + - _netman: + - n_sent: number of msgs sent to network, to indicate next msg seq + - n_recv: number of msgs delivered to server + - n_wire: number of msgs in network but not delivered yet + - conn: network connections + * format: {n0, n1}, represents n0 is connected with n1 + * default all connnected + - _netcmd: <<"init">> + ***************************************************************************) +InitFifoNetworkAddNetman(nodes, cmd, additionalNetman) == + /\ _msgs = [ sender \in nodes |-> [ recver \in nodes \ {sender} |-> <<>> ]] + /\ _netman = additionalNetman @@ + [ n_sent |-> 0, n_recv |-> 0, n_wire |-> 0, conn |-> <>, + n_part |-> 0, n_cure |-> 0] + /\ _netcmd = <> + +InitFifoNetwork(nodes) == InitFifoNetworkAddNetman(nodes, "init", <<>>) + +(*************************************************************************** + _GetNodes: get all nodes in msg channels + ***************************************************************************) +_GetNodes == DOMAIN _msgs + +(*************************************************************************** + _Pick: choose any one + ***************************************************************************) +_Pick(S) == CHOOSE s \in S : TRUE + +(*************************************************************************** + API IsNullMsg: check if msg m is NULL + ***************************************************************************) +IsNullMsg(m) == m = NULL_MSG + +---- \* Update _netman functions + 
+(*************************************************************************** + _NetGetHelper and _NetIncHelper: get, inc and dec member of _netman records + ***************************************************************************) +_NetGetHelper(member) == _netman[member] +_NetIncHelper(member) == (member :> _netman[member] + 1) +_NetDecHelper(member) == (member :> _netman[member] - 1) +NetIncBy(member, number) == (member :> _netman[member] + number) + +NetGetSent == _NetGetHelper("n_sent") +NetIncSent == _NetIncHelper("n_sent") +NetGetRecv == _NetGetHelper("n_recv") +NetIncRecv == _NetIncHelper("n_recv") +NetGetWire == _NetGetHelper("n_wire") +NetIncWire == _NetIncHelper("n_wire") +NetDecWire == _NetDecHelper("n_wire") +NetGetPart == _NetGetHelper("n_part") +NetIncPart == _NetIncHelper("n_part") +NetGetCure == _NetGetHelper("n_cure") +NetIncCure == _NetIncHelper("n_cure") + +(*************************************************************************** + NetmanIncField: increase a field that is not a standard netman member + * updater is the return value of NetDelMsg/NetAddMsg .. 
+ ***************************************************************************) +NetmanIncField(field, updater) == + <<_NetIncHelper(field) @@ updater[1]>> @@ updater + +(*************************************************************************** + _WireReduce, _WireNodeSumHelper, _WireSumHelper, NetSumWire: + * Sum up wired msgs + ***************************************************************************) +RECURSIVE _WireReduce(_, _, _, _) +_WireReduce(Helper(_, _, _), nodes, res, msgs) == + IF nodes = {} THEN res + ELSE LET n == _Pick(nodes) + IN _WireReduce(Helper, nodes \ {n}, Helper(n, res, msgs), msgs) + +_WireNodeSumHelper(n, res, msgs) == + LET node_msgs_list == msgs[n] + IN res + Len(node_msgs_list) + +_WireSumHelper(n, res, msgs) == + LET node_msgs == msgs[n] + to_nodes == DOMAIN node_msgs + IN res + _WireReduce(_WireNodeSumHelper, to_nodes, 0, node_msgs) + +_WireSum(msgs) == + [ n_wire |-> _WireReduce(_WireSumHelper, DOMAIN msgs, 0, msgs) ] + +NetSumWire == _WireSum(_msgs) + +(*************************************************************************** + API NetUpdate(args): update _netman with args[1], update _msgs with args[2] + - e.g. 
NetUpdate(<>) + ***************************************************************************) +NetUpdate(args) == + /\ _netman' = args[1] @@ _netman + /\ _msgs' = args[2] + /\ IF Len(args) = 3 + THEN _netcmd' = args[3] + ELSE _netcmd' = <<"noop">> + +NetUpdate2(args, cmd) == + /\ _netman' = args[1] @@ _netman + /\ _msgs' = args[2] + /\ IF Len(args) = 3 + THEN _netcmd' = <> + ELSE _netcmd' = <> + +---- \* Network partition functions + +(*************************************************************************** + _AddConn: add nodes connections and return connected nodes + * no change: + * if nodes contain only one node, or + * if nodes already connected + ***************************************************************************) +_AddConn(nodes, conn) == + IF \/ Cardinality(nodes) <= 1 + \/ Len(SelectSeq(conn, LAMBDA p: nodes \subseteq p)) > 0 + THEN conn + ELSE Append(SelectSeq(conn, LAMBDA p: \neg (p \subseteq nodes)), nodes) + +(*************************************************************************** + _DelConn: isolate nodes from others and return connected nodes + * delete node in nodes from all connections + * if after deleting, the nodes set has no more than 1 ndoe, delete the set + ***************************************************************************) +_DelConn(nodes, conn) == + LET F[i \in 0..Len(conn)] == + IF i = 0 THEN <<>> + ELSE IF Cardinality(conn[i] \ nodes) <= 1 THEN F[i-1] + ELSE Append(F[i-1], conn[i] \ nodes) + IN F[Len(conn)] + +(*************************************************************************** + _PartConn: delete nodes from other connections and then connect nodes + ***************************************************************************) +_PartConn(nodes, conn) == + _AddConn(nodes, _DelConn(nodes, conn)) + +(*************************************************************************** + _FlushReduce, _FlushMsgsHelper, _FlushMsgs: + * Flush disconnected msgs in wire + * _FlushMsgs: Return flushed msgs + 
_FlushMsgsDelHelper, _FlushMsgsDel: + * All msgs are flushed in delete_nodes (not inner-connnected) + ***************************************************************************) +RECURSIVE _FlushReduce(_, _, _, _, _) +_FlushReduce(Helper(_, _, _, _), nodes, res, part, msgs) == + IF nodes = {} THEN res + ELSE LET n == _Pick(nodes) + IN _FlushReduce(Helper, nodes \ {n}, Helper(n, res, part, msgs), part, msgs) + +_FlushMsgsHelper(n, res, part, msgs) == + LET node_msgs == msgs[n] + to_nodes == DOMAIN node_msgs + flush_nodes == IF n \in part THEN to_nodes \ part ELSE part + IN ( n :> ([ x \in flush_nodes |-> <<>> ] @@ node_msgs) ) @@ res + +_FlushMsgsDelHelper(n, res, delete_nodes, msgs) == + LET node_msgs == msgs[n] + to_nodes == DOMAIN node_msgs + flush_nodes == IF n \in delete_nodes THEN to_nodes ELSE delete_nodes + IN ( n :> ([ x \in flush_nodes |-> <<>> ] @@ node_msgs) ) @@ res + +_FlushMsgs(part, msgs) == + _FlushReduce(_FlushMsgsHelper, DOMAIN msgs, msgs, part, msgs) + +_FlushMsgsDel(delete_nodes, msgs) == + _FlushReduce(_FlushMsgsDelHelper, DOMAIN msgs, msgs, delete_nodes, msgs) + +(*************************************************************************** + API NetAddConn: add a network connection + * return <> + ***************************************************************************) +NetAddConn(nodes) == + <<[ conn |-> _AddConn(nodes, _netman.conn) ], _msgs, <<"conn_add", nodes>>>> + +(*************************************************************************** + API NetDelConn: isolate nodes from all other nodes + * unlike NetPartConn, nodes in the deletion set are not connected + ***************************************************************************) +NetDelConn(nodes) == + LET conn == [ conn |-> _DelConn(nodes, _netman.conn) ] + IN IF FLUSH_DISCONN + THEN LET msgs == _FlushMsgsDel(nodes, _msgs) + msgs_sum == NetSumWire + IN <>>> + ELSE <>>> + +(*************************************************************************** + API NetPartConn: add a 
network partition + ***************************************************************************) +NetPartConn(nodes) == + LET conn == [ conn |-> _PartConn(nodes, _netman.conn) ] + IN IF FLUSH_DISCONN + THEN LET msgs == _FlushMsgs(nodes, _msgs) + msgs_sum == NetSumWire + IN <>>> + ELSE <>>> + +(*************************************************************************** + API NetCureConn: connect all nodes + ***************************************************************************) +NetCureConn == <<[ conn |-> <<_GetNodes>> ] @@ NetIncCure, _msgs, <<"conn_cure">>>> + +(*************************************************************************** + API NetIsConn: check s0 and s1 are connected + ***************************************************************************) +NetIsConn(s0, s1) == + Len(SelectSeq(_netman.conn, LAMBDA p: {s0, s1} \subseteq p)) /= 0 + +(*************************************************************************** + API NetIsParted: check if network is partitioned + ***************************************************************************) +NetIsParted == + IF \/ Len(_netman.conn) /= 1 + \/ _netman.conn[1] /= _GetNodes + THEN TRUE + ELSE FALSE + +---- \* Network send and recv functions + +(*************************************************************************** + _AddMsgSrcDstSeq, _AddMsgSrcDst, _AddMsg: add msg m to msgs + * return <> + * set global seq to msg m + ***************************************************************************) +_AddMsgSrcDstSeq(src, dst, seq, m, msgs) == + LET m_ == IF NetIsConn(src, dst) + THEN [ x \in ((DOMAIN m \union {"seq"}) \ {"src", "dst"}) |-> + IF x = "seq" THEN seq ELSE m[x] ] + ELSE NULL_MSG \* Dropped. 
+ IN IF m_ = NULL_MSG THEN <<0, msgs>> + ELSE <<1, [ msgs EXCEPT ![src][dst] = Append(@, m_) ]>> +_AddMsgSrcDst(src, dst, m, msgs) == + LET seq == NetGetSent + 1 + IN _AddMsgSrcDstSeq(src, dst, seq, m, msgs) +_AddMsg(m, msgs) == _AddMsgSrcDst(m.src, m.dst, m, msgs) + +(*************************************************************************** + _BatchAddMsgs: batch add multi messages to msgs + * return <> + * set global seq to each msg m + ***************************************************************************) +_BatchAddMsgs(ms, msgs) == + LET F[i \in 0..Len(ms)] == + IF i = 0 THEN <<0, msgs, <<"msg_batch_add">>>> + ELSE LET m == ms[i] + seq == NetGetSent + F[i-1][1] + 1 + res == _AddMsgSrcDstSeq(m.src, m.dst, seq, m, F[i-1][2]) + IN <> + ELSE <<"dropped", m.src, m.dst>>)>> + IN F[Len(ms)] + +(*************************************************************************** + _DelMsg: delete m from msgs return <> + ***************************************************************************) +_DelMsg(m, msgs) == + LET m_ == msgs[m.src][m.dst][1] + IN IF m.seq = m_.seq THEN <<1, [ msgs EXCEPT ![m.src][m.dst] = Tail(@)]>> + ELSE Assert(FALSE, "DelMsg: seq mismatch") + +(*************************************************************************** + _GetMsg: get m from msgs[src][dst] + * since it is fifo network, only head msg can be obtained + ***************************************************************************) +_GetMsg(src, dst, msgs) == + LET m_ == msgs[src][dst] + len == Len(m_) + IN IF len > 0 THEN [ src |-> src, dst |-> dst] @@ m_[1] ELSE NULL_MSG + +(*************************************************************************** + _ReplyMsg: delete request from msgs and then add reponse to msgs + * return <> + ***************************************************************************) +_ReplyMsg(reponse, request, msgs) == + LET del == _DelMsg(request, msgs) + add == _AddMsgSrcDst(request.dst, request.src, reponse, del[2]) + IN <> + 
+(*************************************************************************** + API NetGetMsg: Get msg from src -> dst FIFO head + * return msg m + ***************************************************************************) +NetGetMsg(src, dst) == _GetMsg(src, dst, _msgs) + +(*************************************************************************** + API NetDelMsg: Del first msg of m.src -> m.dst + * return <> + * update with NetUpdate + ***************************************************************************) +NetDelMsg(m) == + LET res == _DelMsg(m, _msgs) + IN <>>> + +(*************************************************************************** + API NetAddMsgSrcDst, NetAddMsg: Add m to the end of m.src -> m.dst + * return <> + ***************************************************************************) +NetAddMsgSrcDst(src, dst, m) == + LET res == _AddMsgSrcDst(src, dst, m, _msgs) + IN IF res[1] = 1 + THEN <>>> + ELSE <<_netman, res[2], <<"msg_add_dropped", src, dst>>>> +NetAddMsg(m) == NetAddMsgSrcDst(m.src, m.dst, m) + +(*************************************************************************** + API NetReplyMsg: delete request and try to add response to network + * return <> + ***************************************************************************) +NetReplyMsg(response, request) == + LET res == _ReplyMsg(response, request, _msgs) + IN IF res[1] = 0 + THEN <>>> + ELSE <>>> + +(*************************************************************************** + API NetBatchAddMsg: batch add messages ms to msgs + ***************************************************************************) +NetBatchAddMsg(ms) == + LET res == _BatchAddMsgs(ms, _msgs) + IN <> + +(*************************************************************************** + API NetReplyBatchAddMsg: remove request and batch add ms to msgs + ***************************************************************************) +NetReplyBatchAddMsg(ms, request) == + LET del == _DelMsg(request, _msgs) + add 
== _BatchAddMsgs(ms, del[2]) + IN <>, add[3])>> + +(*************************************************************************** + API NetNoAction: Network state unchanged + * return <> + ***************************************************************************) +NetNoAction1 == <<_netman, _msgs>> +NetNoAction2(cmd) == <<_netman, _msgs, <>>> + +============================================================================= +\* Modification History +\* Last modified Sun Sep 25 23:05:07 CST 2022 by tangruize +\* Created Fri Apr 22 17:27:57 CST 2022 by tangruize diff --git a/systems/Raft-rs/specs/RedisRaftUdp.tla b/systems/Raft-rs/specs/RedisRaftUdp.tla new file mode 100644 index 0000000..c0e836e --- /dev/null +++ b/systems/Raft-rs/specs/RedisRaftUdp.tla @@ -0,0 +1,813 @@ +---------------------------- MODULE RedisRaftUdp ---------------------------- +(***************************************************************************) +(* Model assumptions: *) +(* - No snapshots *) +(* - No read-only requests *) +(* - No non-voting nodes *) +(* - No disk failures *) +(* - No membership change *) +(***************************************************************************) + +EXTENDS Sequences, Naturals, Integers, FiniteSets, TLC, SequencesExt + +(***************************************************************************) +(* Constants definitions *) +(***************************************************************************) +CONSTANTS Servers \* set of servers +CONSTANTS Follower, PreCandidate, Candidate, Leader \* server states +CONSTANTS Commands, NoOp \* commands of normal log entries +CONSTANTS M_RV, M_RVR, M_AE, M_AER \* basic raft msg types +CONSTANTS Nil \* a placeholder + +(***************************************************************************) +(* Variables definitions *) +(***************************************************************************) +VARIABLES current_term, voted_for, log \* persistent vars +VARIABLES commit_idx, state \* volatile vars 
+VARIABLES next_idx, match_idx \* leader vars +VARIABLES voted_for_me \* candidate vars +VARIABLES leader_id, match_msgid \* node vars + +(***************************************************************************) +(* Network variables and instance *) +(***************************************************************************) +VARIABLES netman, netcmd, msgs +INSTANCE UdpNetwork WITH NULL_MSG <- Nil, + _msgs <- msgs, _netman <- netman, _netcmd <- netcmd + +(***************************************************************************) +(* Self manipulated invariants checking *) +(***************************************************************************) +VARIABLES inv + +(***************************************************************************) +(* Vars groups *) +(***************************************************************************) +serverVars == <> +leaderVars == <> +candidateVars == <> +logVars == <> +nodeVars == <> +netVars == <> +noNetVars == <> +vars == <> + +(***************************************************************************) +(* State constraints helper *) +(***************************************************************************) +CONSTANTS Parameters \* to control the model scale + +GetParameterSet(p) == IF p \in DOMAIN Parameters THEN Parameters[p] ELSE {} + +CheckParameterHelper(n, p, Test(_,_)) == + IF p \in DOMAIN Parameters + THEN Test(n, Parameters[p]) + ELSE TRUE +CheckParameterMax(n, p) == CheckParameterHelper(n, p, LAMBDA i, j: i <= j) + +PrePrune(n, p) == CheckParameterHelper(n, p, LAMBDA i, j: i < j) + +(***************************************************************************) +(* Type Ok *) +(***************************************************************************) +TypeOkServerVars == + /\ current_term \in [ Servers -> Nat ] + /\ voted_for \in [ Servers -> Servers \cup {Nil} ] + /\ state \in [ Servers -> { Follower, PreCandidate, Candidate, Leader } ] +TypeOkLeaderVars == + /\ next_idx \in [ Servers -> [ 
Servers -> Nat \ {0} ]] + /\ match_idx \in [ Servers -> [ Servers -> Nat ]] +TypeOkCandidateVars == + /\ voted_for_me \in [ Servers -> SUBSET Servers ] +TypeOkLogVars == + \* log data structure is complex, we skip checking it + /\ commit_idx \in [ Servers -> Nat ] +TypeOkNodeVars == + /\ leader_id \in [ Servers -> Servers \cup {Nil} ] + /\ match_msgid \in [ Servers -> [ Servers -> Nat ]] +TypeOk == + /\ TypeOkServerVars + /\ TypeOkLeaderVars + /\ TypeOkCandidateVars + /\ TypeOkLogVars + /\ TypeOkNodeVars + +(***************************************************************************) +(* Init variables *) +(***************************************************************************) +InitServerVars == \* func: raft_new/raft_new_with_log + /\ current_term = [ i \in Servers |-> 0 ] + /\ voted_for = [ i \in Servers |-> Nil ] + /\ state = [ i \in Servers |-> Follower ] +InitLeaderVars == \* func: raft_node_new/raft_become_leader + /\ next_idx = [ i \in Servers |-> [ j \in Servers |-> 1 ]] + /\ match_idx = [ i \in Servers |-> [ j \in Servers |-> 0 ]] +InitCandidateVars == \* func: raft_node_new + /\ voted_for_me = [ i \in Servers |-> {} ] +InitLogVars == \* func: raft_new_with_log + /\ log = [ i \in Servers |-> <<>> ] + /\ commit_idx = [ i \in Servers |-> 0 ] +InitNodeVars == \* raft_new_with_log + /\ leader_id = [ i \in Servers |-> Nil ] + /\ match_msgid = [ i \in Servers |-> [ j \in Servers |-> 0 ] ] +InitNetVars == + /\ InitUdpNetworkNetman(Servers, <<"Init", Cardinality(Servers)>>, + [ n_op |-> 0, n_ae |-> 0, n_elec |-> 0, no_inv |-> GetParameterSet("NoInv")]) +InitInv == + /\ inv = <<>> + +Init == + /\ InitServerVars + /\ InitLeaderVars + /\ InitCandidateVars + /\ InitLogVars + /\ InitNodeVars + /\ InitNetVars + /\ InitInv + +(***************************************************************************) +(* Helper functions *) +(***************************************************************************) +NumServer == Cardinality(Servers) +Max(a, b) == IF a > b THEN 
a ELSE b +Min(a, b) == IF a < b THEN a ELSE b +IsQuorum(ss) == Cardinality(ss) * 2 > NumServer +IsQuorumNum(num) == num * 2 > NumServer +Update(var, n, value) == [ var EXCEPT ![n] = value ] +UpdateCurrentTerm(n, term) == current_term' = Update(current_term, n, term) +UpdateVotedFor(n, node) == voted_for' = Update(voted_for, n, node) +UpdateState(n, s) == state' = Update(state, n, s) +UpdateLeaderId(n, id) == leader_id' = Update(leader_id, n, id) +AddVotedForMe(me, node) == voted_for_me' = [ voted_for_me EXCEPT ![me] = @ \cup {node} ] +ClearVotedForMe(me) == voted_for_me' = [ voted_for_me EXCEPT ![me] = {} ] +UpdateMatchIdx(me, node, idx) == match_idx' = [ match_idx EXCEPT ![me][node] = idx ] +UpdateNextIdx(me, node, idx) == next_idx' = [ next_idx EXCEPT ![me][node] = IF idx < 1 THEN 1 ELSE idx ] +UpdateCommitIdx(n, idx) == commit_idx' = Update(commit_idx, n, idx) +UpdateMatchMsgid(me, node, id) == match_msgid' = [ match_msgid EXCEPT ![me][node] = id ] + + +(***************************************************************************) +(* Log helpers *) +(***************************************************************************) +\* Currently, the log won't be compacted +LogAppend(log_, entry) == Append(log_, entry) +LogCount(log_) == Len(log_) +LogGetEntry(log_, idx) == + IF idx > LogCount(log_) \/ idx <= 0 THEN Nil ELSE log_[idx] +LogGetEntriesFrom(log_, idx) == + IF idx > LogCount(log_) \/ idx <= 0 THEN <<>> + ELSE SubSeq(log_, idx, LogCount(log_)) +LogGetEntriesTo(log_, idx) == + IF Len(log_) < idx THEN log_ + ELSE SubSeq(log_, 1, idx) +LogDeleteEntriesFrom(log_, idx) == SubSeq(log_, 1, idx - 1) +LogCurrentIdx(log_) == LogCount(log_) +LogLastTerm(log_) == + LET idx == LogCount(log_) + term == IF idx = 0 THEN 0 ELSE log_[idx].term + IN term +LogGetTerm(log_, idx) == + IF LogCount(log_) < idx + THEN Assert(FALSE, <<"no such log entry", log_, idx>>) + ELSE IF idx = 0 THEN 0 ELSE log_[idx].term +LogGetMatchEntries(log_, entries, prevLogIdx) == + LET F[i \in 
0..Len(entries)] == + IF i = 0 THEN Nil + ELSE LET ety1 == LogGetEntry(log_, prevLogIdx + i) + ety2 == LogGetEntry(entries, i) + entries1 == LogGetEntriesTo(log_, prevLogIdx + i - 1) + entries2 == LogGetEntriesFrom(entries, i) + IN IF /\ F[i-1] = Nil + /\ \/ ety1 = Nil + \/ ety1.term /= ety2.term + THEN entries1 \o entries2 + ELSE F[i-1] + result == F[Len(entries)] + IN IF result = Nil THEN log_ ELSE result + +(***************************************************************************) +(* Msg constructors *) +(***************************************************************************) +_BatchExcludesReqMsgsArg(n, excludes, Constructor2(_, _), Constructor3(_, _, _), arg) == + LET dsts == Servers \ excludes + size == Cardinality(dsts) + F[i \in 0..size] == + IF i = 0 THEN <<<<>>, dsts>> + ELSE LET ms == F[i-1][1] + s == CHOOSE j \in F[i-1][2]: TRUE + m == IF arg = Nil + THEN Constructor2(n, s) + ELSE Constructor3(n, s, arg) + remaining == F[i-1][2] \ {s} + IN <> + IN F[size][1] + +_Dummy2(a, b) == TRUE +_Dummy3(a, b, c) == TRUE + +BatchReqMsgs(n, Constructor(_, _)) == + _BatchExcludesReqMsgsArg(n, {n}, Constructor, _Dummy3, Nil) +BatchReqMsgsArg(n, Constructor(_, _, _), arg) == + _BatchExcludesReqMsgsArg(n, {n}, _Dummy2, Constructor, arg) + +ConstructMsg(src, dst, type, body) == + [ src |-> src, dst |-> dst, type |-> type, body |-> body ] + +RequestVote(i, j) == \* func:raft_send_requestvote + LET isPreCandidate == PreCandidate = state'[i] + body == [ prevote |-> isPreCandidate, + term |-> IF isPreCandidate + THEN current_term'[i] + 1 + ELSE current_term'[i], + candidate_id |-> i, + last_log_idx |-> LogCurrentIdx(log[i]), + last_log_term |-> LogLastTerm(log[i]) ] + IN ConstructMsg(i, j, M_RV, body) + +RequestVoteResponse(m, voted) == \* func: raft_recv_requestvote + LET i == m.dst + j == m.src + req == m.body + isPreVote == req.prevote + rejectHasLeaderId == /\ isPreVote + /\ leader_id[i] /= Nil + /\ leader_id[i] /= req.candidate_id + meTerm == current_term'[i] + 
rejectMeTermIsBigger == meTerm > req.term + rejectVotedOther == /\ meTerm = req.term + /\ voted /= Nil + /\ voted /= req.candidate_id + meLastTerm == LogLastTerm(log[i]) + rejectMeLogNewer == \/ req.last_log_term < meLastTerm + \/ /\ req.last_log_term = meLastTerm + /\ req.last_log_idx < LogCurrentIdx(log[i]) + voteStatus == IF rejectHasLeaderId THEN "not-vote: has leader" ELSE + IF rejectMeTermIsBigger THEN "not-vote: term bigger" ELSE + IF rejectVotedOther THEN "not-vote: already voted" ELSE + IF rejectMeLogNewer THEN "not-vote: log newer" ELSE "voted" + granted == voteStatus = "voted" + body == [ prevote |-> req.prevote, + request_term |-> req.term, + term |-> IF isPreVote THEN meTerm ELSE Max(req.term, meTerm), + vote_granted |-> granted ] + IN ConstructMsg(i, j, M_RVR, body) @@ [ status |-> voteStatus ] + +AppendEntriesNext(i, j, next) == \* func: raft_send_appendentries + LET prev_log_idx == next[i][j] - 1 + body == [ term |-> current_term[i], + leader_id |-> i, + leader_commit |-> commit_idx'[i], + prev_log_idx |-> prev_log_idx, + prev_log_term |-> LogGetTerm(log'[i], prev_log_idx), + entries |-> LogGetEntriesFrom(log'[i], next[i][j]) ] + IN ConstructMsg(i, j, M_AE, body) + +AppendEntries(i, j) == AppendEntriesNext(i, j, next_idx) + +AppendEntriesResponseFail(m) == \* func: raft_recv_appendentries + LET body == [ success |-> FALSE, + term |-> Max(current_term[m.dst], m.body.term), + current_idx |-> LogCurrentIdx(log[m.dst]), + msg_id |-> m.seq ] + IN ConstructMsg(m.dst, m.src, M_AER, body) + +AppendEntriesResponseSuccess(m) == \* func: raft_recv_appendentries + LET req == m.body + body == [ success |-> TRUE, + term |-> current_term[m.dst], + current_idx |-> req.prev_log_idx + Len(m.body.entries), + msg_id |-> m.seq ] + IN ConstructMsg(m.dst, m.src, M_AER, body) + + +(***************************************************************************) +(* Raft actions *) +(***************************************************************************) + 
+(***************************************************************************) +(* Become precandidate *) +(***************************************************************************) +BecomePrecandidate(i) == \* func: raft_become_precandidate + /\ state[i] /= Leader + /\ UpdateState(i, PreCandidate) + /\ ClearVotedForMe(i) + /\ UpdateLeaderId(i, Nil) \* func: raft_election_start + /\ UNCHANGED <> + /\ LET ms == BatchReqMsgs(i, RequestVote) + IN NetUpdate2(NetmanIncField("n_elec", NetBatchAddMsg(ms)), <<"BecomePrecandidate", i>>) + +(***************************************************************************) +(* Recv requestvote *) +(***************************************************************************) +SetCurrentTerm(i, term) == \* func: raft_set_current_term + /\ UpdateCurrentTerm(i, term) + /\ UpdateVotedFor(i, Nil) + +_BecomeFollower(i) == \* func: raft_become_follower + /\ UpdateState(i, Follower) + /\ UpdateLeaderId(i, Nil) + +BecomeFollower(i, term) == + /\ SetCurrentTerm(i, term) + /\ _BecomeFollower(i) + +RecvRequestVote(m) == \* func: raft_recv_requestvote + LET req == m.body + src == m.src + dst == m.dst + demote == ~req.prevote /\ current_term[dst] < req.term + msg == RequestVoteResponse(m, IF demote THEN Nil ELSE voted_for[dst]) + IN /\ IF demote \* Update the term only if this is not a prevote request + THEN /\ UpdateCurrentTerm(dst, req.term) + /\ UpdateState(dst, Follower) + ELSE UNCHANGED <> + /\ IF msg.body.vote_granted /\ ~req.prevote + THEN /\ Assert(~(state'[dst] \in {Leader, Candidate}), + <<"Leader/Candidate cannot vote", m, state'[dst], current_term'>>) + /\ UpdateLeaderId(dst, Nil) + /\ UpdateVotedFor(dst, src) + ELSE IF demote + THEN /\ UpdateLeaderId(dst, Nil) + /\ UpdateVotedFor(dst, Nil) + ELSE UNCHANGED <> + /\ UNCHANGED <> + /\ NetUpdate2(NetReplyMsg(msg, m), + <<"RecvRequestVote", msg.status, dst, src, IF req.prevote THEN "prevote" ELSE "not-prevote">>) + 
+(***************************************************************************) +(* Recv requestvote response *) +(***************************************************************************) +BecomeCandidate(i, m) == \* func: raft_become_candidate + /\ UpdateCurrentTerm(i, current_term[i] + 1) + /\ ClearVotedForMe(i) + /\ UpdateVotedFor(i, i) + /\ UpdateLeaderId(i, Nil) + /\ UpdateState(i, Candidate) + /\ LET ms == BatchReqMsgs(i, RequestVote) + IN NetUpdate2(NetReplyBatchAddMsg(ms, m), <<"RecvRequestVoteResponse", "BecomeCandidate", i>>) + +BecomeLeader(i, m) == \* func: raft_become_leader + /\ LET noop == [ term |-> current_term[i], data |-> Nil ] + IN log' = Update(log, i, LogAppend(log[i], noop)) + /\ UpdateState(i, Leader) + /\ UpdateLeaderId(i, i) + /\ match_idx' = [ match_idx EXCEPT ![i] = ( i :> LogCurrentIdx(log'[i]) ) @@ [ j \in Servers |-> 0 ] ] + /\ LET next == [ next_idx EXCEPT ![i] = ( i :> 1 ) @@ [ j \in Servers |-> LogCurrentIdx(log'[i]) ] ] + ms == BatchReqMsgsArg(i, AppendEntriesNext, next) + IN /\ next_idx' = [ next EXCEPT ![i] = ( i :> 1 ) @@ [ j \in Servers |-> LogCurrentIdx(log'[i]) + 1 ] ] + /\ NetUpdate2(NetReplyBatchAddMsg(ms, m), <<"RecvRequestVoteResponse", "BecomeLeader", i>>) + +RecvRequestVoteResponse(m) == \* func: raft_recv_requestvote_response + LET resp == m.body + src == m.src + dst == m.dst + IN /\ IF resp.term > current_term[dst] + THEN /\ UNCHANGED <> + /\ BecomeFollower(dst, resp.term) + /\ NetUpdate2(NetDelMsg(m), <<"RecvRequestVoteResponse", "term is smaller", dst, src>>) + ELSE IF \/ /\ resp.prevote + /\ \/ ~(state[dst] = PreCandidate) + \/ resp.request_term /= current_term[dst] + 1 + \/ /\ ~resp.prevote + /\ \/ ~(state[dst] = Candidate) + \/ resp.request_term /= current_term[dst] + THEN /\ UNCHANGED noNetVars + /\ NetUpdate2(NetDelMsg(m), <<"RecvRequestVoteResponse", "vote is stale", dst, src>>) + ELSE IF resp.vote_granted + THEN LET votes == Cardinality(voted_for_me[dst] \cup {src}) + 1 \* +1 is itself + IN IF 
IsQuorumNum(votes) + THEN IF state[dst] = PreCandidate + THEN /\ UNCHANGED <> + /\ BecomeCandidate(dst, m) + ELSE /\ UNCHANGED <> + /\ AddVotedForMe(dst, src) + /\ BecomeLeader(dst, m) + ELSE /\ UNCHANGED <> + /\ AddVotedForMe(dst, src) + /\ NetUpdate2(NetDelMsg(m), <<"RecvRequestVoteResponse", "granted", dst, src>>) + ELSE /\ UNCHANGED noNetVars + /\ NetUpdate2(NetDelMsg(m), <<"RecvRequestVoteResponse", "not granted", dst, src>>) + +(***************************************************************************) +(* Send appendentries to all other nodes *) +(***************************************************************************) +SendAppendentriesAll(n) == \* func: raft_send_appendentries_all + /\ UNCHANGED <> + /\ LET ms == BatchReqMsgsArg(n, AppendEntriesNext, next_idx) + IN /\ next_idx' = [ next_idx EXCEPT ![n] = ( n :> 1 ) @@ [ j \in Servers |-> LogCurrentIdx(log[n]) + 1 ] ] + /\ NetUpdate2(NetmanIncField("n_ae", NetBatchAddMsg(ms)), <<"SendAppendentriesAll", n>>) + +(***************************************************************************) +(* Recv appendentries *) +(***************************************************************************) +AcceptLeader(me, leader) == \* func: raft_accept_leader + /\ UpdateState(me, Follower) + /\ UpdateLeaderId(me, leader) + +SetCommitIdx(n, idx) == \* func: raft_set_commit_idx + /\ Assert(commit_idx[n] <= idx, "SetCommitIdx: commit_idx[n] <= idx") + /\ Assert(idx <= LogCurrentIdx(log'[n]), <<"SetCommitIdx: idx <= LogCurrentIdx(log'[n])", n, idx, log'>>) + /\ UpdateCommitIdx(n, idx) + +RecvAppendentries(m) == \* func: raft_recv_appendentries + LET req == m.body + src == m.src + dst == m.dst + fail == AppendEntriesResponseFail(m) + success == AppendEntriesResponseSuccess(m) + IN IF req.term < current_term[dst] + THEN /\ UNCHANGED noNetVars + /\ NetUpdate2(NetReplyMsg(fail, m), <<"RecvAppendentries", "term is bigger", dst, src>>) + ELSE /\ IF req.term > current_term[dst] + THEN /\ UpdateCurrentTerm(dst, req.term) + /\ 
UpdateVotedFor(dst, Nil) + ELSE UNCHANGED <> + /\ AcceptLeader(dst, req.leader_id) + /\ LET prevLogIsLastSnapshot == req.prev_log_idx = 0 \* snapshot is not implemented + ety == LogGetEntry(log[dst], req.prev_log_idx) + noPrevLog == ety = Nil + termMismatch == ety.term /= req.prev_log_term + IN IF /\ ~prevLogIsLastSnapshot + /\ \/ noPrevLog + \/ termMismatch + THEN IF noPrevLog + THEN /\ UNCHANGED <> + /\ NetUpdate2(NetReplyMsg(fail, m), <<"RecvAppendentries", "no prev log", dst, src>>) + ELSE \* term mismatch + /\ UNCHANGED <> + /\ log' = Update(log, dst, LogDeleteEntriesFrom(log[dst], req.prev_log_idx)) + /\ NetUpdate2(NetReplyMsg(fail, m), <<"RecvAppendentries", "term mismatch", dst, src>>) + ELSE \* success + /\ UNCHANGED <> + /\ log' = Update(log, dst, LogGetMatchEntries(log[dst], req.entries, req.prev_log_idx)) + /\ IF commit_idx[dst] < req.leader_commit + THEN LET lastLogIdx == Max(LogCurrentIdx(log'[dst]), 1) + idxToCommit == Min(lastLogIdx, req.leader_commit) + IN SetCommitIdx(dst, idxToCommit) + ELSE UNCHANGED commit_idx + /\ NetUpdate2(NetReplyMsg(success, m), <<"RecvAppendentries", "success", dst, src>>) + +(***************************************************************************) +(* Recv appendentries response *) +(***************************************************************************) +AdvanceCommitIdx(me) == \* func: raft_update_commit_idx + LET F[i \in 0..NumServer] == + IF i = 0 THEN <<<<>>, Servers>> + ELSE LET n == CHOOSE n \in F[i-1][2]: TRUE + IN <> + sorted_match_idx == SortSeq(F[NumServer][1], LAMBDA x, y: x > y) + commit == sorted_match_idx[NumServer \div 2 + 1] + IN IF /\ commit > commit_idx[me] + /\ current_term[me] = LogGetTerm(log[me], commit) + THEN SetCommitIdx(me, commit) + ELSE UNCHANGED commit_idx + +\* syncIndex<0: not to set match_idx' +FlushAdvanceCommitIdx(me, syncIndex) == \* func: raft_flush + IF state[me] /= Leader THEN TRUE + ELSE /\ IF syncIndex >= 0 + THEN IF syncIndex > match_idx[me][me] + THEN UpdateMatchIdx(me, 
me, syncIndex) + ELSE UNCHANGED match_idx + ELSE TRUE + /\ AdvanceCommitIdx(me) + +FlushSendAppendentries(me, m, info) == \* \* func: raft_flush + LET F[i \in 0..NumServer] == + IF i = 0 THEN <<{}, Servers>> + ELSE LET n == CHOOSE n \in F[i-1][2]: TRUE + idx == LogCurrentIdx(log'[me]) + IN IF \/ n = me + \/ next_idx[me][n] > idx + THEN <> + ELSE <> + excludes == F[NumServer][1] + ms == _BatchExcludesReqMsgsArg(me, excludes, _Dummy2, AppendEntriesNext, next_idx) + next_keep == [ s \in excludes |-> next_idx[me][s] ] + next_update == [ s \in Servers \ excludes |-> LogCurrentIdx(log'[me]) + 1 ] + IN /\ next_idx' = [ next_idx EXCEPT ![me] = next_keep @@ next_update ] + /\ IF m = Nil \* RecvEntry: client request + THEN NetUpdate2(NetmanIncField("n_op", NetBatchAddMsg(ms)), info) + ELSE NetUpdate2(NetReplyBatchAddMsg(ms, m), info) + +RecvAppendentriesResponse(m) == \* func: raft_recv_appendentries_response + LET resp == m.body + src == m.src + dst == m.dst + failReason == + IF state[dst] /= Leader THEN "not leader" ELSE + IF resp.msg_id < match_msgid[dst][src] THEN "msg_id is bigger" ELSE + IF resp.term > current_term[dst] THEN "term is smaller" ELSE + IF ~resp.success /\ resp.current_idx < match_idx[dst][src] THEN "stale response" ELSE + IF ~resp.success THEN "retry" ELSE "success" + IN IF failReason /= "success" + THEN IF failReason = "retry" + THEN LET next == Min(resp.current_idx + 1, LogCurrentIdx(log[dst])) + nextForAe == [next_idx EXCEPT ![dst][src] = next] + nextToUpdate == LogCurrentIdx(log'[dst]) + 1 + retryAe == AppendEntriesNext(dst, src, nextForAe) + IN /\ UNCHANGED <> + /\ UpdateNextIdx(dst, src, nextToUpdate) + /\ NetUpdate2(NetReplyMsg(retryAe, m), <<"RecvAppendentriesResponse", "retry", dst, src>>) + ELSE /\ UNCHANGED <> + /\ IF failReason = "term is smaller" + THEN BecomeFollower(dst, resp.term) + ELSE UNCHANGED <> + /\ NetUpdate2(NetDelMsg(m), <<"RecvAppendentriesResponse", failReason, dst, src>>) + ELSE \* success + /\ UNCHANGED <> + /\ IF 
resp.current_idx > match_idx[dst][src] + THEN UpdateMatchIdx(dst, src, resp.current_idx) + ELSE UNCHANGED match_idx + /\ IF resp.msg_id > match_msgid[dst][src] + THEN UpdateMatchMsgid(dst, src, resp.msg_id) + ELSE UNCHANGED match_msgid + /\ FlushAdvanceCommitIdx(dst, -1) + /\ FlushSendAppendentries(dst, m, <<"RecvAppendentriesResponse", "success", dst, src>>) + +(***************************************************************************) +(* Recv client entry on Leader *) +(***************************************************************************) +RecvEntry(n, data) == \* func: raft_recv_entry + /\ state[n] = Leader + /\ UNCHANGED <> + /\ LET ety == [ term |-> current_term[n], data |-> data ] + IN log' = Update(log, n, LogAppend(log[n], ety)) + /\ FlushAdvanceCommitIdx(n, LogCurrentIdx(log'[n])) + /\ FlushSendAppendentries(n, Nil, <<"RecvEntry", n, data>>) + +(***************************************************************************) +(* Invariants *) +(***************************************************************************) +ElectionSafety == + LET TwoLeader == + \E i, j \in Servers: + /\ i /= j + /\ current_term'[i] = current_term'[j] + /\ state'[i] = Leader + /\ state'[j] = Leader + IN ~TwoLeader + +LeaderAppendOnly == + \A i \in Servers: + IF state[i] = Leader /\ state'[i] = Leader + THEN LET curLog == log[i] + nextLog == log'[i] + IN IF Len(nextLog) >= Len(curLog) + THEN SubSeq(nextLog, 1, Len(curLog)) = curLog + ELSE FALSE + ELSE TRUE + +LogMatching == + \A i, j \in Servers: + IF i /= j + THEN LET iLog == log'[i] + jLog == log'[j] + len == Min(Len(iLog), Len(jLog)) + F[k \in 0..len] == + IF k = 0 THEN <<>> + ELSE LET key1 == <> + value1 == iLog[k].data + key2 == <> + value2 == jLog[k].data + F1 == IF key1 \in DOMAIN F[k-1] + THEN IF F[k-1][key1] = value1 + THEN F[k-1] + ELSE F[k-1] @@ ( <<-1, -1>> :> <> ) + ELSE F[k-1] @@ (key1 :> value1) + F2 == IF key2 \in DOMAIN F1 + THEN IF F1[key2] = value2 + THEN F1 + ELSE F1 @@ ( <<-1, -1>> :> <> ) + ELSE F1 
@@ (key2 :> value2) + IN F2 + IN IF << -1, -1>> \notin DOMAIN F[len] THEN TRUE + ELSE Assert(FALSE, <>) + ELSE TRUE + +MonotonicCurrentTerm == \A i \in Servers: current_term' [i] >= current_term[i] + +MonotonicCommitIdx == \A i \in Servers: commit_idx'[i] >= commit_idx[i] + +MonotonicMatchIdx == + \A i \in Servers: + IF state[i] = Leader + THEN \A j \in Servers: match_idx'[i][j] >= match_idx[i][j] + ELSE TRUE + +CommittedLogDurable == + \A i \in Servers: + LET len == Min(commit_idx'[i], commit_idx[i]) + logNext == SubSeq(log'[i], 1, len) + logCur == SubSeq(log[i], 1, len) + IN IF len = 1 THEN TRUE + ELSE /\ Len(logNext) >= len + /\ Len(logCur) >= len + /\ logNext = logCur + +CommittedLogReplicatedMajority == + \A i \in Servers: + IF state'[i] /= Leader \/ commit_idx'[i] <= 1 + THEN TRUE + ELSE LET entries == SubSeq(log'[i], 1, commit_idx'[i]) + len == Len(entries) + nServer == Cardinality(Servers) + F[j \in 0..nServer] == + IF j = 0 + THEN <<{}, {}>> + ELSE LET k == CHOOSE k \in Servers: k \notin F[j-1][1] + logLenOk == LogCount(log'[k]) >= commit_idx'[i] + kEntries == SubSeq(log'[k], 1, commit_idx'[i]) + IN IF /\ logLenOk + /\ entries = kEntries + THEN <> + ELSE <> + IN IsQuorum(F[nServer][2]) + +NextIdxGtMatchIdx == + \A i \in Servers: + IF state'[i] = Leader + THEN \A j \in Servers \ {i}: next_idx'[i][j] > match_idx'[i][j] + ELSE TRUE + +NextIdxGtZero == + \A i \in Servers: + IF state'[i] = Leader + THEN \A j \in Servers: next_idx'[i][j] > 0 + ELSE TRUE + +SelectSeqWithIdx(s, Test(_,_)) == + LET F[i \in 0..Len(s)] == + IF i = 0 + THEN <<>> + ELSE IF Test(s[i], i) + THEN Append(F[i-1], s[i]) + ELSE F[i-1] + IN F[Len(s)] + +FollowerLogLELeaderLogAfterAE == + LET cmd == netcmd'[1] + cmd1 == cmd[1] + cmd2 == cmd[2] + follower == cmd[3] + leader == cmd[4] + IN IF cmd1 = "RecvAppendentries" /\ cmd2 \in { "success", "no prev log" } + THEN IF log[follower] /= log'[follower] + THEN LogCount(log'[follower]) <= LogCount(log'[leader]) + ELSE TRUE + ELSE TRUE + 
+CommitIdxLELogLen == + \A i \in Servers: commit_idx'[i] <= LogCount(log'[i]) + +LeaderCommitCurrentTermLogs == + \A i \in Servers: + IF state'[i] = Leader + THEN IF commit_idx[i] /= commit_idx'[i] + THEN log'[i][commit_idx'[i]].term = current_term'[i] + ELSE TRUE + ELSE TRUE + +NewLeaderTermNotInLog == + \A i \in Servers: + IF state'[i] = Leader /\ state[i] /= Leader + THEN \A j \in Servers \ {i}: + \A n \in DOMAIN log'[j]: + log'[j][n].term /= current_term'[i] + ELSE TRUE + +LeaderTermLogHasGreatestIdx == + \A i \in Servers: + IF state'[i] = Leader + THEN \A j \in Servers \ {i}: + LET IncTermLogCount(a, b) == IF a.term = current_term'[i] THEN b + 1 ELSE b + IN FoldSeq(IncTermLogCount, 0, log'[i]) >= FoldSeq(IncTermLogCount, 0, log'[j]) + ELSE TRUE + +InvSequence == << + ElectionSafety, + LeaderAppendOnly, + LogMatching, + MonotonicCurrentTerm, + MonotonicCommitIdx, + MonotonicMatchIdx, + CommittedLogDurable, + CommittedLogReplicatedMajority, + NextIdxGtMatchIdx, + NextIdxGtZero, + FollowerLogLELeaderLogAfterAE, + CommitIdxLELogLen, + LeaderCommitCurrentTermLogs, + NewLeaderTermNotInLog, + LeaderTermLogHasGreatestIdx +>> + +INV == Len(SelectSeqWithIdx(inv, LAMBDA x, y: ~x /\ y \notin netman.no_inv)) = 0 + +(***************************************************************************) +(* State contraints *) +(***************************************************************************) + +\*CONSTANTS MaxSentMsgs, +\* MaxRecvMsgs, +\* MaxWireMsgs, +\* MaxClientOperationsTimes, +\* MaxAppendEntriesTimes, +\* MaxElectionTimes, +\* MaxLogLength, +\* MaxTerm, +\* MaxDropTimes, +\* MaxDupTimes, +\* MaxUnorderTimes + +GetRealLogLen(curLog) == SelectSeq(curLog, LAMBDA i: i.data /= NoOp) +GetMaxLogLen == Len(log[CHOOSE i \in Servers: \A j \in Servers \ {i}: + GetRealLogLen(log[i]) >= GetRealLogLen(log[j])]) +GetMaxTerm == current_term[CHOOSE i \in Servers: \A j \in Servers \ {i}: + current_term[i] >= current_term[j]] + +ScSent == CheckParameterMax(netman.n_sent, 
"MaxSentMsgs") +ScRecv == CheckParameterMax(netman.n_recv, "MaxRecvMsgs") +ScWire == CheckParameterMax(netman.n_wire, "MaxWireMsgs") +ScLog == CheckParameterMax(GetMaxLogLen, "MaxLogLength") +ScTerm == CheckParameterMax(GetMaxTerm, "MaxTerm") +ScOp == CheckParameterMax(netman.n_op, "MaxClientOperationsTimes") +ScAe == CheckParameterMax(netman.n_ae, "MaxAppendEntriesTimes") +ScElec == CheckParameterMax(netman.n_elec, "MaxElectionTimes") +ScDrop == CheckParameterMax(netman.n_drop, "MaxDropTimes") +ScDup == CheckParameterMax(netman.n_dup, "MaxDupTimes") +ScUnorder == CheckParameterMax(netman.n_unorder, "MaxUnorderTimes") + +SC == /\ ScSent /\ ScRecv /\ ScWire /\ ScLog + /\ ScTerm /\ ScOp /\ ScAe /\ ScElec + /\ ScDrop /\ ScDup /\ ScUnorder + +(***************************************************************************) +(* Next actions *) +(***************************************************************************) + +_DoRecvM(type, func(_)) == + /\ \E m \in msgs: + /\ m /= Nil + /\ m.type = type + /\ LET unorder == IF IsFirstMsg(m) THEN 0 ELSE 1 + IN CheckParameterMax(NetGetUnorder + unorder, "MaxUnorderTimes") + /\ func(m) + /\ inv' = InvSequence + +DoRecvRequestVote == /\ _DoRecvM(M_RV, RecvRequestVote) + +DoRecvRequestVoteResponse == /\ _DoRecvM(M_RVR, RecvRequestVoteResponse) + +DoRecvAppendentries == /\ _DoRecvM(M_AE, RecvAppendentries) + +DoRecvAppendentriesResponse == /\ _DoRecvM(M_AER, RecvAppendentriesResponse) + +DoBecomePrecandidate == + /\ PrePrune(netman.n_elec, "MaxElectionTimes") + /\ \E n \in Servers: BecomePrecandidate(n) + /\ inv' = InvSequence + +DoRecvEntry == + /\ PrePrune(netman.n_op, "MaxClientOperationsTimes") + /\ \E n \in Servers, v \in Commands: RecvEntry(n, v) + /\ inv' = InvSequence + +DoSendAppendentriesAll == + /\ PrePrune(netman.n_ae, "MaxAppendEntriesTimes") + /\ \E n \in Servers: + /\ state[n] = Leader + /\ SendAppendentriesAll(n) + /\ inv' = InvSequence + +DoNetworkDrop == + /\ PrePrune(NetGetDrop, "MaxDropTimes") + /\ \E m \in 
msgs: + /\ NetUpdate2(NetDropMsg(m), <<"DoNetworkDrop", m.dst, m.src, m.seq, m.dup>>) + /\ UNCHANGED noNetVars + /\ inv' = InvSequence + +DoNetworkDup == + /\ PrePrune(NetGetDup, "MaxDupTimes") + /\ \E m \in msgs: + /\ NetUpdate2(NetDupMsg(m), <<"DoNetworkDup", m.dst, m.src, m.seq, m.dup>>) + /\ UNCHANGED noNetVars + /\ inv' = InvSequence + +Next == + \/ DoRecvRequestVote + \/ DoRecvRequestVoteResponse + \/ DoRecvAppendentries + \/ DoRecvAppendentriesResponse + \/ DoBecomePrecandidate + \/ DoRecvEntry + \/ DoSendAppendentriesAll + \/ DoNetworkDrop + \/ DoNetworkDup + +Spec == Init /\ [][Next]_vars + +============================================================================= +\* Modification History +\* Last modified Thu Mar 16 10:12:48 CST 2023 by tangruize +\* Created Tue Jan 03 16:38:47 CST 2023 by tangruize diff --git a/systems/Raft-rs/specs/UdpNetwork.tla b/systems/Raft-rs/specs/UdpNetwork.tla new file mode 100644 index 0000000..0bc5feb --- /dev/null +++ b/systems/Raft-rs/specs/UdpNetwork.tla @@ -0,0 +1,238 @@ +----------------- MODULE UdpNetwork ---------------------- +EXTENDS TLC, Naturals, FiniteSets, Sequences +(*************************************************************************** + VARIABLES definitions: see InitUdpNetwork + ***************************************************************************) + +VARIABLES _msgs, \* Messages in the network + _netman, \* Network manager + _netcmd \* Current network cmd + + +(* NULL_MSG: represent a null msg in some condition checkings + shoule be a model value if its type is constant *) +CONSTANT NULL_MSG +\*NULL_MSG == [ NULL_MSG |-> "" ] + +---- \* Common functions + +(*****) +(* API InitUdpNetwork(nodes): + - _msgs: init to empty set of msgs + * format [seq |-> 0, src |-> s0, dst |-> s1, type |-> sth, data -> sth] + * src and dst will be dropped when storing in _msgs + * type and data are user defined fields + _ _netman: + - n_sent: number of msgs sent to network, to indicate next msg seq + - n_recv: 
number of msgs delivered to server + - n_wire: number of msgs in network but not delivered yet + - n_unorder: unordered failure times + - n_drop: drop failure times + - n_dup: duplicate failure times + - _netcmd: <<"Init">> *) +(*****) + + +InitUdpNetworkNetman(nodes, cmd, additonalNetman) == + /\ _msgs = {} + /\ _netman = additonalNetman @@ + [ n_sent |-> 0, n_recv |-> 0, n_wire |-> 0, + n_unorder |-> 0, n_drop |-> 0, n_dup |-> 0 ] + /\ _netcmd = <> + +InitUdpNetwork(nodes) == InitUdpNetworkNetman(nodes, "init", <<>>) + +------ \* Update _netman functions + +(*************************************************************************** + _NetGetHelper and _NetIncHelper: get, inc and dec member of _netman records + ***************************************************************************) +_NetGetHelper(member) == _netman[member] +_NetIncHelper(member) == (member :> _netman[member] + 1) +_NetDecHelper(member) == (member :> _netman[member] - 1) +NetIncBy(member, number) == (member :> _netman[member] + number) + +NetGetSent == _NetGetHelper("n_sent") +NetIncSent == _NetIncHelper("n_sent") +NetGetRecv == _NetGetHelper("n_recv") +NetIncRecv == _NetIncHelper("n_recv") +NetGetWire == _NetGetHelper("n_wire") +NetIncWire == _NetIncHelper("n_wire") +NetDecWire == _NetDecHelper("n_wire") +NetGetUnorder == _NetGetHelper("n_unorder") +NetIncUnorder == _NetIncHelper("n_unorder") +NetGetDrop == _NetGetHelper("n_drop") +NetIncDrop == _NetIncHelper("n_drop") +NetGetDup == _NetGetHelper("n_dup") +NetIncDup == _NetIncHelper("n_dup") + +---- \* Network send and recv functions + +(****) +(* API updater : + * return <> *) +(****) +NetmanIncField(field, updater) == + <<_NetIncHelper(field) @@ updater[1]>> @@ updater + + +\*NetmanIncFieldWithoutUpdate(field) == +\* <<_NetIncHelper(field) , _msgs ,_netcmd>> + + +\* return <> +_AddMsgSeq(m, seq, msgs) == LET m_ == IF "seq" \in DOMAIN m \* TODO: add partition, see wraft + THEN {[ m EXCEPT !["dup"] = @+1 ]} \* inc dup to indicate it is a 
duplicate msg + ELSE {m @@ [ seq |-> seq, dup |-> 0 ]} + IN <<1, msgs \union m_>> + +\* Add msg to msgs, increase scr.nMessage. +_AddMsg(m, msgs) == LET seq == NetGetSent + 1 + IN _AddMsgSeq(m, seq, msgs) + + + +(****) +(* _BatchAddMsgs: batch add multi messages to msgs + * return <> + * set global seq to each msg m +*) +(****) +_BatchAddMsgs(ms, msgs)== + LET F[i \in 0 .. Len(ms)] == + IF i = 0 THEN <<0, msgs, <<"msg_batch_add">> >> + ELSE LET m == ms[i] + seq == NetGetSent + F[i-1][1] + 1 + res == _AddMsgSeq(m, seq ,F[i-1][2]) + IN << res[1] + F[i-1][1], res[2], Append(F[i-1][3], + IF res[1] = 1 THEN <<"ok", m.src, m.dst, seq>> + ELSE <<"dropped", m.src, m.dst, seq>>) >> + IN F[Len(ms)] + + +(*************************************************************************** + _DelMsg: delete m from msgs return <> + ***************************************************************************) +\* Del msg from msgs. +_DelMsg(m, msgs) == + IF m \in msgs + THEN <<1, msgs\ {m}>> + ELSE Assert(FALSE, "Delmsg: not in network") + +(*************************************************************************** + _ReplyMsg: delete request from msgs and then add response to msgs + * return <> + ***************************************************************************) +\* Combination of Send and Discard. +_ReplyMsg(response, request, msgs) == + LET del == _DelMsg(request, msgs) + add == _AddMsg(response, del[2]) + IN <> + +(*************************************************************************** + API NetGetMsg: Get msg from src -> dst FIFO head + * return msg m + Is this really needed? To be discussed. + ***************************************************************************) +\* NetGetMsg(src, dst) == _GetMsg(src, dst, _msgs) + +(* inc unorder *) + +IsFirstMsg(m) == + LET myMsg == { i \in _msgs: i.dst = m.dst } \* should src be considered here too? 
+ first == CHOOSE i \in myMsg: i.seq <= m.seq + IN first = m + +NetIncRecvCheckUnorder(m) == + IF IsFirstMsg(m) + THEN NetIncRecv + ELSE NetIncRecv @@ NetIncUnorder + + +(*************************************************************************** + API NetDelMsg: Del msgs of m + * return <> + * update with NetUpdate + ***************************************************************************) +NetDelMsg(m) == + LET res == _DelMsg(m, _msgs) + IN <> >> + +(*************************************************************************** + API NetDropMsg: Drop msgs of m + * return <> + * update with NetUpdate + ***************************************************************************) +NetDropMsg(m) == + LET res == _DelMsg(m, _msgs) + IN <> >> + + +(*************************************************************************** + API NetDupMsg: Duplicate msgs of m + * return <> + * update with NetUpdate + ***************************************************************************) +NetDupMsg(m) == + LET res == _AddMsg(m, _msgs) + IN <> >> + + +(****) +(* API NetAddMsg : add m into msgs + * return <> *) +(****) +NetAddMsg(m) == + LET res == _AddMsg(m, _msgs) + IN IF res[1] = 1 + THEN <> >> \* here we do not need seq, because we put in network then sort + ELSE <<_netman, res[2], <<"msg_add_dropped", m.src, m.dst>> >> + +NetReplyMsg(response, request) == + LET res == _ReplyMsg(response, request, _msgs) + IN IF res[1] = 0 + THEN <> >> + ELSE <> >> + + +(*************************************************************************** + API NetBatchAddMsg: batch add messages ms to msgs + ***************************************************************************) +NetBatchAddMsg(ms) == + LET res == _BatchAddMsgs(ms, _msgs) + IN <> + +(*************************************************************************** + API NetReplyBatchAddMsg: remove request and batch add ms to msgs + ***************************************************************************) +NetReplyBatchAddMsg(ms, request) 
== + LET del == _DelMsg(request, _msgs) + add == _BatchAddMsgs(ms, del[2]) + IN <>, add[3])>> + +(*************************************************************************** + API NetNoAction: Network state unchanged + * return <> + ***************************************************************************) +NetNoAction(cmd) == <<_netman, _msgs, cmd>> + + +NetUpdate(args) == + /\ _netman' = args[1] @@ _netman + /\ _msgs' = args[2] + /\ IF Len(args) = 3 + THEN _netcmd' = args[3] + ELSE _netcmd' = <<"noop">> + +NetUpdate2(args, cmd) == + /\ _netman' = args[1] @@ _netman + /\ _msgs' = args[2] + /\ IF Len(args) = 3 + THEN _netcmd' = <> + ELSE _netcmd' = <> + + +==== \ No newline at end of file diff --git a/systems/Raft-rs/specs/rsRaft.tla b/systems/Raft-rs/specs/rsRaft.tla new file mode 100644 index 0000000..1a08020 --- /dev/null +++ b/systems/Raft-rs/specs/rsRaft.tla @@ -0,0 +1,1630 @@ +---------------------------- MODULE rsRaft ----------------------------- +(***************************************************************************) +(* This is the TLA+ specification for Raft-RS in TiKV with version 0.7.0 *) +(* *) +(* - Leader election: *) +(* - Log replication: *) +(* *) +(* Currently, the specification assumes: *) +(* - No snapshots *) +(* - No read-only requests *) +(* - No non-voting nodes *) +(* - No disk failures *) +(* - No membership change *) +(***************************************************************************) + +EXTENDS Sequences, Naturals, Integers, FiniteSets, TLC, SequencesExt + +(***************************************************************************) +(* Constants definitions *) +(***************************************************************************) +\* The set of servers +CONSTANT Servers +\* Server states, Corresponding to raft-rs StateRole +CONSTANTS Follower, Candidate, Leader +\* Raft message types +CONSTANTS M_RV, M_RVR, M_AE, M_AER, M_PRV, M_PRVR, M_HB, M_HBR, M_SNAP +\* The set of commands +CONSTANTS Commands +\* The 
abstraction of null operation +CONSTANTS NoOp +\* Misc: state constraint parameters and placeholder +CONSTANTS Nil +\* The set of ProgressState +CONSTANTS Probe, Replicate, Snapshot + +(*************************************************************************** + Variables definitions + ***************************************************************************) +\* Persistent state on all servers +VARIABLES currentTerm, \* Latest term server has seen (initialized to 0 on first boot, increases monotonically) , Corresponding to raft-rs RaftCore.term + votedFor, \* CandidateId that received vote in current term (or null if none), Corresponding to raft-rs RaftCore.vote + log \* Log entries; each entry contains command for state machine, and term when entry was received by leader, Corresponding to raft-rs RaftCore.raft_log + +\* Snapshot metadata +VARIABLES snapshotLastIdx, \* the last entry in the log the snapshot replaces + snapshotLastTerm \* the term of this entry + +\* Volatile state on all servers +VARIABLES raftState, \* State of servers, in {Follower, Candidate, Leader} , Corresponding to raft-rs RaftCore.state + commitIndex, \* Index of highest log entry known to be committed + leader_id \* The potential leader of the cluster, Corresponding to raft-rs RaftCore.leader_id + + +\* Volatile state on leader +VARIABLES nextIndex, \* for each server, index of the next log entry to send to that server, Corresponding to raft-rs Progress.next_idx + matchIndex \* for each server, index of highest log entry known to be replicated on server, Corresponding to raft-rs Progress.matched + +\* intermediate variable +VARIABLES voted_for_me \* Record nodes that have voted for me, Corresponding to raft-rs Progress.voted +VARIABLES voted_reject \* Record nodes that have not voted for me, Corresponding to raft-rs Progress.voted +VARIABLES check_quorum \* check_quorum variables +VARIABLE progress \* The status of each follower's receive log, which is used in receiving append, which 
contains probe and replicate. Corresponding to raft-rs Progress.state +VARIABLE inflight \* Number of letters transmitted during the recording process. Corresponding to raft-rs Progress.int (Inflights) +VARIABLE pending_snapshot +VARIABLES pr_pending + + + +(*************************************************************************** + Network variables and instance + ***************************************************************************) +\* The network is modelled through these variables +VARIABLES netman, + netcmd, + msgs +INSTANCE FifoNetwork WITH FLUSH_DISCONN <- TRUE, NULL_MSG <- Nil, + _msgs <- msgs, _netman <- netman, _netcmd <- netcmd + + +(***************************************************************************) +(* Self manipulated invariants checking *) +(***************************************************************************) +VARIABLES inv \* Invariants that guarantee correctness + +(***************************************************************************) +(* Vars groups *) +(***************************************************************************) +serverVars == <> +leaderVars == <> +candidateVars == <> +logVars == <> +snapVars == <> +nodeVars == <> +netVars == <> +noNetVars == <> +vars == <> + + +(***************************************************************************) +(* State constraints helper *) +(***************************************************************************) +CONSTANTS Parameters \* to control the model scale + +GetParameterSet(p) == IF p \in DOMAIN Parameters THEN Parameters[p] ELSE {} + +CheckParameterHelper(n, p, Test(_,_)) == + IF p \in DOMAIN Parameters + THEN Test(n, Parameters[p]) + ELSE TRUE +CheckParameterMax(n, p) == CheckParameterHelper(n, p, LAMBDA i, j: i <= j) + +PrePrune(n, p) == CheckParameterHelper(n, p, LAMBDA i, j: i < j) + + +(***************************************************************************) +(* Type Ok. 
Used as a check on writing format *) +(***************************************************************************) + +TypeOkServerVars == + /\ currentTerm \in [ Servers -> Nat ] + /\ votedFor \in [ Servers -> Servers \cup {Nil} ] + /\ raftState \in [ Servers -> { Follower, Candidate, Leader } ] + +TypeOkLeaderVars == + /\ nextIndex \in [ Servers -> [ Servers -> Nat \ {0} ]] + /\ matchIndex \in [ Servers -> [ Servers -> Nat ]] + +\* TypeOkCandidateVars == +\* /\ votesGranted \in [ Servers -> {} ] + +TypeOkLogVars == + \* log data structure is complex, we skip checking it + /\ commitIndex \in [ Servers -> Nat ] + +TypeOk == + /\ TypeOkServerVars + /\ TypeOkLeaderVars + /\ TypeOkLogVars + + +(*************************************************************************** + Init variables + ***************************************************************************) +InitServerVars == + /\ currentTerm = [ i \in Servers |-> 1 ] + /\ votedFor = [ i \in Servers |-> Nil ] + /\ raftState = [ i \in Servers |-> Follower ] + +InitLeaderVars == + /\ nextIndex = [ i \in Servers |-> [ j \in Servers |-> 1 ]] + /\ matchIndex = [ i \in Servers |-> [ j \in Servers |-> 0 ]] + +InitCandidateVars == + /\ voted_for_me = [ i \in Servers |-> {} ] + /\ voted_reject = [ i \in Servers |-> {} ] + +InitLogVars == + /\ log = [ i \in Servers |-> << [term |-> 1, data |-> Nil, index |-> 1]>> ] + /\ commitIndex = [ i \in Servers |-> 1 ] +InitInv == inv = <<>> + +InitSnapShotVars == + /\ snapshotLastIdx = [ i \in Servers |-> 0 ] + /\ snapshotLastTerm = [ i \in Servers |-> 0 ] + +InitNodeVars == + /\ leader_id = [ i \in Servers |-> Nil] + /\ check_quorum = [i \in Servers |-> FALSE] \* Used to determine if check_quorum is on + /\ progress = [ i \in Servers |-> [ j \in Servers |-> <>]] + /\ inflight = [ i \in Servers |-> [ j \in Servers |-> 0 ]] + /\ pr_pending = [ i \in Servers |-> [ j \in Servers |-> 0 ]] + /\ pending_snapshot = [ i \in Servers |-> 0] + +InitNetVars == + /\ 
InitFifoNetworkAddNetman(Servers, <<"Init", Cardinality(Servers)>>, + [n_elec |-> 0, n_ae |-> 0, n_hb |-> 0, n_op |-> 0, n_restart |-> 0, n_rqSnap |-> 0, n_becomeLeader |-> 0, no_inv |-> GetParameterSet("NoInv")]) + + +Init == + /\ InitServerVars + /\ InitLeaderVars + /\ InitCandidateVars + /\ InitLogVars + /\ InitInv + /\ InitNodeVars + /\ InitNetVars + /\ InitSnapShotVars + +(*************************************************************************** + Helper functions + ***************************************************************************) +NumServer == Cardinality(Servers) + +Min(x,y) == IF x < y THEN x ELSE y +Max(x,y) == IF x < y THEN y ELSE x + +IsQuorum(ss) == Cardinality(ss) * 2 > Cardinality(Servers) +IsQuorumNum(num) == num * 2 > Cardinality(Servers) + +CheckStateIs(n, s) == raftState[n] = s +CheckStateIsNot(n, s) == raftState[n] /= s + +Update(var, n, value) == [var EXCEPT ![n] = value] +UpdateCurrentTerm(n, term) == currentTerm' = Update(currentTerm, n, term) +UpdateLeaderId(n, id) == leader_id' = Update(leader_id, n, id) +UpdatePendingSnap(n, snap_num) == pending_snapshot' = Update(pending_snapshot, n, snap_num) +UpdateVotedFor(n, node) == votedFor' = Update(votedFor, n, node) +UpdateState(n, s) == raftState' = Update(raftState, n, s) +UpdateVotedForMe(n, value) == voted_for_me' = Update(voted_for_me, n, value) +AddVotedForMe(me, node) == voted_for_me' = [ voted_for_me EXCEPT ![me] = @ \cup {node} ] +ClearVotedForMe(me) == voted_for_me' = [ voted_for_me EXCEPT ![me] = {} ] +UpdateVotesReject(n, value) == voted_reject' = Update(voted_reject, n, value) +AddVotesReject(me, node) == voted_reject' = [ voted_reject EXCEPT ![me] = @ \cup {node}] +ClearVotesReject(me) == voted_reject' = [ voted_reject EXCEPT ![me] = {} ] +UpdateMatchIdx(me, node, idx) == matchIndex' = [ matchIndex EXCEPT ![me][node] = idx ] +UpdatePrPending(me, node, idx) == pr_pending' = [ pr_pending EXCEPT ![me][node] = idx ] +UpdateNextIdx(me, node, idx) == nextIndex' = [ nextIndex 
EXCEPT ![me][node] = IF idx < 1 THEN 1 ELSE idx ] +UpdateProgress(me, node, state) == progress' = [progress EXCEPT ![me][node] = state ] +UpdateInflight(me, node, num) == inflight' = [inflight EXCEPT ![me][node] = num ] +UpdateCommitIdx(n, idx) == commitIndex' = Update(commitIndex, n, idx) +UpdateSnapIdx(me, idx) == snapshotLastIdx' = Update(snapshotLastIdx, me, idx) +UpdateSnapTerm(me, term) == snapshotLastTerm' = Update(snapshotLastTerm, me, term) +AllUpdateNextIdx(me, idx) == + LET f == [i \in Servers |-> idx] + IN nextIndex' = [nextIndex EXCEPT ![me] = f] +AllUpdateMatchIdx(me, idx) == + LET f == [i \in Servers |-> idx] + IN matchIndex' = [matchIndex EXCEPT ![me] = f] +AllUpdateProgress(me, prstate) == + LET f == [i \in Servers |-> prstate] + IN progress' = [progress EXCEPT ![me] = f] +AllUpdateInflight(me, num_msg) == + LET f == [i \in Servers |-> num_msg] + IN inflight' = [inflight EXCEPT ![me] = f] +AllUpdatePrPending(me, pending) == + LET f == [i \in Servers |-> pending] + IN pr_pending' = [pr_pending EXCEPT ![me] = f] + +(***************************************************************************) +(* Log helpers *) +(***************************************************************************) +\* idx = 1, data = Nil +LogAppend(log_, entry) == Append(log_, entry) + +LogCount(log_) == Len(log_) + +\* original version, without snapshot, only for simple log +LogGetEntry(log_, idx) == + IF idx > LogCount(log_) \/ idx <= 0 + THEN Nil + ELSE log_[idx] +LogGetEntryOne(log_, idx) == + IF idx > LogCount(log_) \/ idx <= 0 + THEN <<>> + ELSE SubSeq(log_, idx, idx) +LogGetEntriesFrom(log_, idx) == + IF idx > LogCount(log_) \/ idx <= 0 THEN <<>> + ELSE SubSeq(log_, idx, LogCount(log_)) +LogGetEntriesTo(log_, idx) == + IF Len(log_) < idx THEN log_ + ELSE SubSeq(log_, 1, idx) + + +\* Get last idx of server s. 
+LastIdx(log_, n) == + IF LogCount(log_) = 0 + THEN snapshotLastIdx'[n] + ELSE log_[LogCount(log_)].index \* last entry's idx + +FirstIdx(log_, n) == + IF LogCount(log_) = 0 + THEN snapshotLastIdx[n] + 1 + ELSE log_[1].index + +TruncatedIdx(log_, idx, n) == \* maps a given idx to its real position in the in-memory log after compaction + LET firstIdx == FirstIdx(log_, n) + lastIdx == LastIdx(log_, n) + newIndex == idx - firstIdx + 1 \* unlike raft-rs, we count from index 1 here; raft-rs counts from index 0 + IN IF idx > lastIdx \/ idx < firstIdx + THEN -1 + ELSE newIndex + + +\* used in server's real log, because of snapshot +LogGetEntry2(log_, idx, n) == + LET newIndex == TruncatedIdx(log_, idx, n) + IN IF newIndex = -1 + THEN Nil + ELSE log_[newIndex] + +LogGetEntryOne2(log_, idx, n) == + LET newIndex == TruncatedIdx(log_, idx, n) + IN IF newIndex = -1 + THEN <<>> + ELSE SubSeq(log_, newIndex, newIndex) + +LogGetEntriesFrom2(log_, idx, n) == + LET newIndex == TruncatedIdx(log_, idx, n) + IN IF newIndex = -1 + THEN <<>> + ELSE SubSeq(log_, newIndex, LogCount(log_)) + + +LogGetEntriesTo2(log_, idx, n) == + LET newIndex == TruncatedIdx(log_, idx, n) + IN IF newIndex = -1 + THEN <<>> + ELSE SubSeq(log_, 1, newIndex) + +LogGetTerm2(log_, idx, info, n) == + LET newIndex == TruncatedIdx(log_, idx, n) + IN IF newIndex = -1 + THEN 0 + ELSE IF newIndex = 0 THEN 0 ELSE log_[newIndex].term + +\* LogDeleteEntriesFrom(log_, idx, base ) == +\* LET newIdx == idx - base +\* IN SubSeq(log_, 1, newIdx - 1) + +LogCurrentIdx(log_, n) == LastIdx(log_, n) + + +LogLastTerm(log_, n) == + LET idx == TruncatedIdx(log_, LastIdx(log_, n), n) + term == + IF idx = -1 + THEN snapshotLastTerm[n] + ELSE IF idx = 0 + THEN 0 + ELSE log_[idx].term + IN term + +\* just for temp log +LogLastIdx(log_) == + LET idx == LogCount(log_) + index == IF idx = 0 THEN 0 ELSE log_[idx].index + IN index + + + + +\* log_ is the log of the original node, entries is the logs that need to be added in the AE letter, we need to find a suitable location to overwrite the conflicting logs according to 
the incoming prevLogIdx, and add the subsequent logs. +\* in maybe_append@raft_log.rs +LogGetMatchEntries(log_, entries, prevLogIdx, n) == + LET F[i \in 0..Len(entries)] == + IF i = 0 THEN Nil + ELSE LET ety1 == LogGetEntry2(log_, prevLogIdx + i, n) \* Original log Entry at prevLogIdx + i + ety2 == LogGetEntry(entries, i)\* The entries ith one to be added + entries1 == LogGetEntriesTo2(log_, prevLogIdx + i - 1, n) \* log_ from first_index to prevLogIdx + i - 1 + entries2 == LogGetEntriesFrom(entries, i) \* entries from i to Len(entries) + IN IF /\ F[i-1] = Nil + /\ \/ ety1 = Nil \* The original log does not have the ith one, indicating that all subsequent ones need to be added directly. + \/ ety1.term /= ety2.term \* The i-th mismatch of the original log indicates that it needs to be overwritten from the i-th onwards with all newly added + THEN entries1 \o entries2 + ELSE F[i-1] + result == F[Len(entries)] + IN IF result = Nil THEN log_ ELSE result + + +(***************************************************************************) +(* Msg constructors *) +(***************************************************************************) +\* Send the letter to the remaining nodes, constructing the letter according to the rules of the Contrustor2/3 function +_BatchExcludesReqMsgsArg(n, excludes, Constructor2(_, _), Constructor3(_, _, _), arg) == + LET dsts == Servers \ excludes + size == Cardinality(dsts) + F[i \in 0..size] == + IF i = 0 THEN <<<<>>, dsts>> + ELSE LET ms == F[i-1][1] + s == CHOOSE j \in F[i-1][2]: TRUE + m == IF arg = Nil + THEN Constructor2(n, s) + ELSE Constructor3(n, s, arg) + remaining == F[i-1][2] \ {s} + IN <> + IN F[size][1] + +_Dummy2(a, b) == TRUE +_Dummy3(a, b, c) == TRUE + +BatchReqMsgs(n, Constructor(_, _)) == + _BatchExcludesReqMsgsArg(n, {n}, Constructor, _Dummy3, Nil) +BatchReqMsgsArg(n, Constructor(_, _, _), arg) == + _BatchExcludesReqMsgsArg(n, {n}, _Dummy2, Constructor, arg) +ConstructMsg(src, dst, type, body) == + [ src |-> src, dst |-> 
dst, type |-> type, data |-> body ] + +\* func:new_message(MsgRequestVote)@raft.rs +RequestVote(i, j) == + LET body == [ term |-> currentTerm'[i], + candidate_id |-> i, + index |-> LogCurrentIdx(log[i], i), + log_term |-> LogLastTerm(log[i], i), + commit |-> commitIndex[i], + commitTerm |-> LogGetTerm2(log[i], commitIndex[i], "RequestVote", i)] + msg_type == M_RV + IN ConstructMsg(i, j, msg_type, body) + +\* func:new_message(MsgRequestVoteResponse)@raft.rs +RequestVoteResponse(m, voted, tempLeaderId) == + LET i == m.dst + j == m.src + req == m.data + \* can_vote corresponding to step()@raft.rs, which define the situation it can vote or not + can_vote == \/ voted = j + \/ /\ voted = Nil + /\ tempLeaderId = Nil + meTerm == currentTerm'[i] + rejectMeTermIsBigger == meTerm > req.term + meLastTerm == LogLastTerm(log[i], i) + rejectMeLogNewer == \/ req.log_term < meLastTerm + \/ /\ req.log_term = meLastTerm + /\ req.index < LogCurrentIdx(log[i], i) + voteStatus == IF rejectMeTermIsBigger THEN "not-vote: term bigger" ELSE + IF ~can_vote THEN "not-vote: can not vote" ELSE + IF rejectMeLogNewer THEN "not-vote: log newer" ELSE "voted" + granted == voteStatus = "voted" + reject == ~granted + send_commit == IF reject THEN commitIndex[i] ELSE 0 + send_commit_term == IF reject THEN LogGetTerm2(log[i], commitIndex[i], "RequestVoteResponse", i) ELSE 0 + body == [ request_term |-> req.term, + term |-> Max(req.term, meTerm), + reject |-> reject, + commit |-> send_commit, + commitTerm |-> send_commit_term] + IN ConstructMsg(i, j, M_RVR, body) @@ [ status |-> voteStatus ] + + + +SendSnapshot(i, j) == + LET meteIdx == commitIndex'[i] + metaTerm == IF commitIndex'[i] > snapshotLastIdx'[i] + THEN LogGetTerm2(log[i], commitIndex'[i], "SendSnapshot", i) + ELSE snapshotLastTerm'[i] + body == [ term |-> currentTerm[i], + metaIndex |-> meteIdx, + metaTerm |-> metaTerm ] + IN ConstructMsg(i, j, M_SNAP, body) + +\* func: prepare_send_entries +AppendEntriesNext(i, j, next) == + LET prev_log_idx 
== next[i][j] - 1 + body == [ term |-> currentTerm'[i], + leader_id |-> i, + commit |-> commitIndex'[i], + index |-> prev_log_idx, \* prev_log_idx + log_term |-> IF LastIdx(log'[i], i) >= prev_log_idx + THEN LogGetTerm2(log'[i], prev_log_idx, "AppendEntriesNext", i) + ELSE 0 , + entries |-> LogGetEntryOne2(log'[i], next[i][j], i) ] \* The model restricts AppendEntry messages to one entry at a time. + IN ConstructMsg(i, j, M_AE, body) + + +\* func: send_heartbeat +HeartBeatNext(i, j, next) == + LET body == [ term |-> currentTerm[i], + commit |-> Min(matchIndex[i][j], commitIndex[i])] + IN ConstructMsg(i, j, M_HB, body) + +HeartBeatResponse(m) == + LET body == [ term |-> currentTerm'[m.dst], + commitIdx |-> commitIndex'[m.dst] ] + IN ConstructMsg(m.dst, m.src, M_HBR, body) + +\* new_message(MsgAppendResponse)@raft.rs +AERFailLogStale(m) == \* func: handle_append_entries + LET body == [ reject |-> FALSE, + term |-> Max(currentTerm[m.dst], m.data.term), + index |-> commitIndex[m.dst], + re |-> Nil, + commit |-> commitIndex[m.dst] ] + IN ConstructMsg(m.dst, m.src, M_AER, body) + +\* new_message(MsgAppendResponse)@raft.rs +AERFailTermMismatch(m, hint_index, hint_term) == + LET body == [ reject |-> TRUE, + term |-> Max(currentTerm[m.dst], m.data.term), + index |-> m.data.index, + reject_hint |-> hint_index, + log_term |-> hint_term, + request_snapshot |-> 0, + commit |-> commitIndex[m.dst] ] + IN ConstructMsg(m.dst, m.src, M_AER, body) + +SnapFail(m) == + LET body == [ reject |-> FALSE, + term |-> currentTerm'[m.dst], + request_snapshot |-> 0, + index |-> commitIndex'[m.dst] ] + IN ConstructMsg(m.dst, m.src, M_AER, body) + +SnapSuccess(m) == + LET body == [ reject |-> FALSE, + term |-> currentTerm'[m.dst], + request_snapshot |-> 0, + index |-> snapshotLastIdx'[m.dst] ] \* 这里不能使用 lastidx 因为我们的log 设计问题,这里log清除之后一定是这个使用snap 我们简化为这个 + IN ConstructMsg(m.dst, m.src, M_AER, body) + +\* new_message(MsgAppendResponse)@raft.rs +AppendEntriesResponseSuccess(m) == + LET data == 
m.data + body == [ reject |-> FALSE, + term |-> currentTerm'[m.dst], + index |-> data.index + Len(data.entries), + commitIdx |-> commitIndex'[m.dst]] + IN ConstructMsg(m.dst, m.src, M_AER, body) + + +\* At bcast_append the next_index of the node to the target node is updated for each letter.(in prepare_send_entries@raft.rs) +BatchUpdateNextWithMsg(n, new_msgs) == + LET lenMsg == Len(new_msgs) + F[i \in 0..lenMsg] == + IF i = 0 THEN <<{}, Servers, (n :> 1)>> + ELSE LET dst == new_msgs[i].dst + ety == new_msgs[i].data.entries + etyLastIdx == LogLastIdx(ety) + IN IF \/ ety = <<>> \* If the content of the letter is empty, no need to update + \/ progress[n][dst][1] = Probe \* If a node is in the Probe state, sending at this point will block( maybe_send_append().is_paused() @ raft.rs) + THEN < etyLastIdx) >> + ELSE < etyLastIdx)>> + updateServer == F[lenMsg][1] + remainServer == F[lenMsg][2] + updateMap == F[lenMsg][3] + next_keep == [ s \in remainServer |-> nextIndex[n][s] ] + next_update == [ s \in updateServer |-> updateMap[s] ] + IN nextIndex' = [ nextIndex EXCEPT ![n] = next_keep @@ next_update ] + + + +(***************************************************************************) +(* Raft actions *) +(***************************************************************************) + +\* func reset +reset(i) == + /\ ClearVotedForMe(i) + /\ ClearVotesReject(i) + /\ AllUpdateNextIdx(i, LastIdx(log[i], i) + 1) + /\ AllUpdateMatchIdx(i, 0) + /\ AllUpdateProgress(i, <>) + /\ AllUpdateInflight(i, 0) + /\ AllUpdatePrPending(i, 0) + +(***************************************************************************) +(* Become candidate *) +(***************************************************************************) + +\* func: become_candidate +BecomeCandidate(i) == + /\ UpdateCurrentTerm(i, currentTerm[i] + 1) + /\ UpdateVotedFor(i, i) + /\ UNCHANGED << check_quorum, logVars, snapVars>> + /\ reset(i) + /\ UpdatePendingSnap(i, 0) + /\ UpdateLeaderId(i, Nil) + /\ UpdateState(i, 
Candidate) + /\ LET ms == BatchReqMsgs(i, RequestVote) + IN NetUpdate2(NetmanIncField("n_elec", NetBatchAddMsg(ms)), <<"BecomeCandidate", i>>) + + + +(***************************************************************************) +(* Become leader *) +(***************************************************************************) + +\* func: become_leader@raft.rs +BecomeLeader(i, m) == + /\ LET noop == [ term |-> currentTerm[i], data |-> Nil, index |-> LastIdx(log[i], i) + 1 ] + IN log' = Update(log, i, LogAppend(log[i], noop)) + /\ UpdateState(i, Leader) + /\ UpdateLeaderId(i, i) + /\ ClearVotedForMe(i) + /\ UpdatePendingSnap(i, 0) + /\ ClearVotesReject(i) + /\ matchIndex' = [ matchIndex EXCEPT ![i] = ( i :> LastIdx(log'[i], i) ) @@ [ j \in Servers |-> 0 ] ] + /\ AllUpdateProgress(i, <>) \* All progress needs to be in probe mode + /\ AllUpdateInflight(i, 0) \* All inflight needs to be 0 (no message send) + /\ AllUpdatePrPending(i, 0) \* All pr.pending_request_snapshot needs to be 0 + /\ LET next == [ nextIndex EXCEPT ![i] = ( i :> matchIndex'[i][i] + 1 ) @@ [ j \in Servers |-> LastIdx(log[i], i) + 1] ] + ms == BatchReqMsgsArg(i, AppendEntriesNext, next) + IN /\ nextIndex' = next + /\ NetUpdate2(NetReplyBatchAddMsg(ms, m), <<"RecvRequestVoteResponse", "Won-BecomeLeader", i>>) \* bcast_send + +\* func: become_leader@raft.rs +BecomeLeader1(i) == + /\ UNCHANGED << commitIndex, snapVars, votedFor, check_quorum>> + /\ LET new_term == currentTerm[i] + 1 + noop == [ term |-> new_term, data |-> Nil, index |-> LastIdx(log[i], i) + 1 ] + IN log' = Update(log, i, LogAppend(log[i], noop)) + /\ UpdateState(i, Leader) + /\ UpdateLeaderId(i, i) + /\ UpdatePendingSnap(i, 0) + /\ UpdateCurrentTerm(i, currentTerm[i] + 1) + /\ ClearVotedForMe(i) + /\ ClearVotesReject(i) + /\ matchIndex' = [ matchIndex EXCEPT ![i] = ( i :> LastIdx(log'[i], i) ) @@ [ j \in Servers |-> 0 ] ] + /\ AllUpdateProgress(i, <>) \* All progress needs to be in probe mode + /\ AllUpdateInflight(i, 0) \* All inflight 
needs to be 0 (no message send) + /\ AllUpdatePrPending(i, 0) \* All pr.pending_request_snapshot needs to be 0 + /\ LET next == [ nextIndex EXCEPT ![i] = ( i :> matchIndex'[i][i] + 1 ) @@ [ j \in Servers |-> LastIdx(log[i], i) + 1] ] + ms == BatchReqMsgsArg(i, AppendEntriesNext, next) + IN /\ nextIndex' = next + /\ NetUpdate2(NetmanIncField("n_becomeLeader", NetBatchAddMsg(ms)), <<"DoBecomeLeader", i>>) + +(***************************************************************************) +(* Become follower *) +(***************************************************************************) + +SetCurrentTerm(i, term) == + /\ UpdateCurrentTerm(i, term) + /\ UpdateVotedFor(i, Nil) + +_BecomeFollower(i) == + /\ UpdateState(i, Follower) + /\ UpdateLeaderId(i, Nil) + /\ reset(i) + +\* func : become_follower@raft.rs +BecomeFollower(i, term) == + /\ SetCurrentTerm(i, term) + /\ _BecomeFollower(i) + /\ UNCHANGED pending_snapshot + +BecomeFollowerInLost(i, term) == + /\ UNCHANGED <> + /\ UpdateCurrentTerm(i, term) + /\ _BecomeFollower(i) + + +BecomeFollowerWithLeader(i, term, leaderId) == + /\ SetCurrentTerm(i, term) + /\ UpdateState(i, Follower) + /\ UpdateLeaderId(i, leaderId) + /\ UNCHANGED pending_snapshot + /\ reset(i) + +BecomeFollowerWithLeaderWithouPending(i, term, leaderId) == + /\ SetCurrentTerm(i, term) + /\ UpdateState(i, Follower) + /\ UpdateLeaderId(i, leaderId) + /\ reset(i) + +(***************************************************************************) +(* Recv requestvote *) +(***************************************************************************) + +\* func: maybe_commit_by_vote@raft.rs +maybe_commit_by_vote(n, commitIdx, commitTerm) == + IF \/ commitIdx = 0 + \/ commitTerm = 0 + \/ raftState'[n] = Leader + THEN UNCHANGED commitIndex + ELSE IF \/ commitIdx <= commitIndex[n] + THEN UNCHANGED commitIndex + ELSE IF /\ commitIdx > commitIndex[n] + /\ commitTerm = LogGetTerm2(log[n], commitIdx, "maybe_commit_by_vote", n) + THEN UpdateCommitIdx(n, commitIdx) + 
ELSE UNCHANGED commitIndex + +HandleMsgRV(m) == + LET data == m.data + dst == m.dst + src == m.src + demote == currentTerm[dst] < data.term + stale == currentTerm[dst] > data.term + msg == RequestVoteResponse(m, IF demote THEN Nil ELSE votedFor[dst], IF demote THEN Nil ELSE leader_id[dst]) \* Pass in intermediate values based on demote status. + IN IF stale \* stale message drop + THEN /\ UNCHANGED noNetVars + /\ NetUpdate2(NetDelMsg(m), + <<"RecvRequestVote", "stale message ignore", dst, src, m.seq>>) + ELSE /\ UNCHANGED <> + /\ IF demote \* Received a newerletter and became a follower. + THEN /\ UpdateCurrentTerm(dst, data.term) + /\ UpdateState(dst, Follower) + /\ UpdateLeaderId(dst, Nil) + /\ reset(dst) + ELSE UNCHANGED <> + /\ IF ~msg.data.reject \* Determine whether to vote based on RequestVote letter + THEN /\ UpdateVotedFor(dst, src) + /\ UNCHANGED <> + ELSE /\ IF demote \* If there is a no vote the default is not to change the vote value, but due to the demote state, the node will reset and thus the vote will become nil + THEN UpdateVotedFor(dst, Nil) + ELSE UNCHANGED <> + /\ maybe_commit_by_vote(dst, data.commit, data.commitTerm) \* func: maybe_commit_by_vote @ raft.rs + /\ UNCHANGED <> + /\ NetUpdate2(NetReplyMsg(msg, m), + <<"RecvRequestVote", msg.status, dst, src, m, IF ~msg.data.reject THEN "vote" ELSE "not-vote">>) + + +(***************************************************************************) +(* Recv requestvote response *) +(***************************************************************************) + +\* func : poll@raft.rs +Poll(grant, reject) == + LET grantNum == Cardinality(grant) + 1 \* +1 is voted for myself + rejectNum == Cardinality(reject) + IN IF IsQuorumNum(grantNum) + THEN "Won" + ELSE IF IsQuorumNum(rejectNum) + THEN "Lost" + ELSE "Pending" + + + + +HandleMsgRVR( m) == + LET resp == m.data + src == m.src + dst == m.dst + demote == resp.term > currentTerm[dst] + isCandidate == raftState[dst] = Candidate + stale == resp.term < 
currentTerm[dst] + IN /\ IF demote \* Received a newerletter and became a follower. + THEN /\ UNCHANGED <> + /\ BecomeFollower(dst, resp.term) + /\ NetUpdate2(NetDelMsg(m), <<"RecvRequestVoteResponse", "term is smaller", dst, src, m>>) + ELSE IF stale \* stale message drop + THEN /\ UNCHANGED noNetVars + /\ NetUpdate2(NetDelMsg(m), <<"RecvRequestVoteResponse", "vote is stale", dst, src, m>>) + ELSE IF ~isCandidate \* only candidate process M_RVR + THEN /\ UNCHANGED noNetVars + /\ NetUpdate2(NetDelMsg(m), <<"RecvRequestVoteResponse", "not candidate", dst, src, m>>) + ELSE /\ UNCHANGED <> + /\ LET newVotedForMe == IF ~resp.reject + THEN voted_for_me[dst] \cup {src} + ELSE voted_for_me[dst] + newVotedReject == IF ~resp.reject + THEN voted_reject[dst] + ELSE voted_reject[dst]\cup {src} + res == Poll(newVotedForMe, newVotedReject) + IN IF res = "Won" + THEN /\ UNCHANGED << commitIndex>> \* The reason for this is that in becomeLeader we need to broadcast the AE letter globally, and the AE letter carries the latest commitIndex, but we don't update the commitIndex until below in maybe_commit_by_vote, and it has to use the latest commitIndex, so we need to write it here. 
+ /\ UNCHANGED << votedFor, currentTerm>> + /\ BecomeLeader(dst, m) + ELSE /\ UNCHANGED <> + /\ IF res = "Lost" + THEN /\ BecomeFollowerInLost(dst, currentTerm[dst]) + /\ NetUpdate2(NetDelMsg(m), <<"RecvRequestVoteResponse", "Lost", dst, src, m>>) + ELSE /\ NetUpdate2(NetDelMsg(m), <<"RecvRequestVoteResponse", "Pending", dst, src, m>>) + /\ UpdateVotedForMe(dst, newVotedForMe) + /\ UpdateVotesReject(dst, newVotedReject) + /\ UNCHANGED << serverVars, leader_id, progress, leaderVars>> + /\ maybe_commit_by_vote(dst, resp.commit, resp.commitTerm) + +(***************************************************************************) +(* Send appendentries to all other nodes *) +(***************************************************************************) +SendAppendentriesAll(n) == \* func: bcast_append + /\ UNCHANGED <> + /\ LET ms == BatchReqMsgsArg(n, AppendEntriesNext, nextIndex) + IN /\ BatchUpdateNextWithMsg(n, ms) + /\ NetUpdate2(NetmanIncField("n_ae", NetBatchAddMsg(ms)), <<"SendAppendentriesAll", n>>) + +(***************************************************************************) +(* Send heartbeat(empty log appendentries) to all other nodes *) +(***************************************************************************) +SendHeartBeatAll(n) == \* func: bcast_heart + /\ UNCHANGED <> + /\ LET ms == BatchReqMsgsArg(n, HeartBeatNext, nextIndex) + IN NetUpdate2(NetmanIncField("n_hb", NetBatchAddMsg(ms)), <<"SendHeartBeatAll", n>>) + +(***************************************************************************) +(* Recv appendentries *) +(***************************************************************************) +AcceptLeader(me, leader) == + /\ UpdateState(me, Follower) + /\ UpdateLeaderId(me, leader) + /\ IF raftState[me] = Follower + THEN UNCHANGED <> + ELSE reset(me) + +\* func: find_conflict_by_term +find_conflict_by_term(me, index, term) == + LET hint_index == Min(index, LastIdx(log[me], me)) + F[i \in 0..hint_index ] == + IF hint_index = 0 + THEN <<0, 0>> + 
ELSE IF i = 0 + THEN << >> + ELSE IF term >= LogGetTerm2(log[me] ,i, "find_conflict_by_term", me) + THEN <> + ELSE F[i-1] + IN F[hint_index] + +\* func: raft_log.maybe_commit() +SetCommitIdx(n, idx) == + \* /\ Assert(idx <= LastIdx(log'[n], n), <<"SetCommitIdx: idx <= LogCurrentIdx(log'[n])", n, idx, log', LastIdx(log'[n], n)>>) + /\ IF idx > commitIndex[n] + THEN UpdateCommitIdx(n, idx) + ELSE UNCHANGED <> + + \* new_message(send_request_snapshot)@raft.rs +SendRequestSnapshot(n) == + LET body == [ reject |-> TRUE, + reject_hint |-> LastIdx(log[n], n), + term |-> currentTerm[n], + index |-> commitIndex'[n], + request_snapshot |-> pending_snapshot'[n], + log_term |-> LogLastTerm(log[n], n)] + to == leader_id[n] + IN ConstructMsg(n, to, M_AER, body) + +HandleMsgAE(m) == \* func: handle_append + LET data == m.data + src == m.src + dst == m.dst + demote == data.term > currentTerm[dst] + stale == data.term < currentTerm[dst] + snap_req == pending_snapshot[dst] /= 0 + log_stale == data.index < commitIndex[dst] + ask_snap_msg == SendRequestSnapshot(dst) + log_stale_msg == AERFailLogStale(m) + success == AppendEntriesResponseSuccess(m) + IN IF stale \* drop stale message + THEN /\ UNCHANGED noNetVars + /\ NetUpdate2(NetDelMsg(m), <<"RecvAppendentries", "stale message ignore", dst, src, m>>) + ELSE /\ UNCHANGED <> + /\ IF demote \* Received a newer letter and became a follower, but there are related variables that need to be updated later, so only their term values are updated here. 
+ THEN SetCurrentTerm(dst, data.term) + ELSE UNCHANGED <> + /\ AcceptLeader(dst, data.leader_id) \* Update the leader_id and make sure the node state is follower + /\ IF snap_req + THEN /\ UNCHANGED <> + /\ NetUpdate2(NetReplyMsg(ask_snap_msg, m), <<"RecvAppendentries", "snap_shot ask", dst, src, m>>) + ELSE /\ IF log_stale \* if m.index < self.raft_log.committed @ raft.rs + THEN /\ UNCHANGED <> + /\ NetUpdate2(NetReplyMsg(log_stale_msg, m), <<"RecvAppendentries", "log stale commit", dst, src, m>>) + ELSE LET ety == LogGetEntry2(log[dst], data.index, dst) + noPrevLog == ety = Nil + termMatch == \/ /\ noPrevLog + /\ data.log_term = 0 + \/ /\ ~noPrevLog + /\ ety.term = data.log_term + IN IF termMatch \* maybe_append@raft_log.rs + THEN /\ log' = Update(log, dst, LogGetMatchEntries(log[dst], data.entries, data.index, dst)) + /\ IF commitIndex[dst] < data.commit + THEN LET lastLogIdx == Max(LastIdx(log'[dst], dst), 1) + idxToCommit == Min(lastLogIdx, data.commit) + IN SetCommitIdx(dst, idxToCommit) + ELSE UNCHANGED commitIndex + /\ NetUpdate2(NetReplyMsg(success, m), <<"RecvAppendentries", "success", dst, src, m>>) + ELSE LET conflict == find_conflict_by_term(dst, data.index, data.log_term) \* find_conflict_by_term @ raft_log.rs + fail == AERFailTermMismatch(m, conflict[1], conflict[2]) + IN /\ UNCHANGED <> + /\ NetUpdate2(NetReplyMsg(fail, m), <<"RecvAppendentries", "term Mismatch", dst, src, m>>) + + + +(***************************************************************************) +(* Recv appendentries response *) +(***************************************************************************) +\* The reason for this is that we have multiple designs for calculating whether or not a paused operation has occurred, +\* and handle_aer needs to calculate both old and new paused for the unification step. +IsPaused(me, node, _inflight, _progress) == + \/ /\ _progress[me][node][1] = Probe + /\ _progress[me][node][2] = TRUE \* Here, true means pause. 
+ \/ _progress[me][node][1] = Snapshot + \/ /\ _progress[me][node][1] = Replicate + /\ _inflight[me][node] /= 0 \* We only send one packet at a time, so it must be FULL when there is data, i.e. it will PAUSE + + +FlushSendAppendentries(me, m, tempNextIdx, tempInflight, info) == + LET F[i \in 0..NumServer] == + IF i = 0 THEN <<{}, Servers>> + ELSE LET n == CHOOSE n \in F[i-1][2]: TRUE + idx == LastIdx(log'[me], me) + IN IF n = me + THEN <> + ELSE LET pause == IsPaused(me, n, tempInflight, progress') + IN IF pause + THEN <> \* 不会发送 + ELSE <> \* 未暂停 就会发送 + excludes == F[NumServer][1] + excludes2 == F[NumServer][1] \ {me} + ms == _BatchExcludesReqMsgsArg(me, excludes, _Dummy2, AppendEntriesNext, tempNextIdx) + next_keep == [ s \in excludes2 |-> tempNextIdx[me][s] ] + next_me == IF tempNextIdx[me][me] < LastIdx(log'[me], me) + 1 + THEN (me :> LastIdx(log'[me], me) + 1) + ELSE (me :> tempNextIdx[me][me] ) + next_update == [ s \in Servers \ excludes |-> IF tempNextIdx[me][s] <= LastIdx(log'[me], me) + THEN tempNextIdx[me][s] + 1 + ELSE tempNextIdx[me][s] ] + inflight_keep == [ s \in excludes |-> tempInflight[me][s]] + inflight_update == [ s \in Servers \ excludes |-> IF tempNextIdx[me][s] <= LastIdx(log'[me], me) + THEN tempNextIdx[me][s] + ELSE 0] + IN /\ nextIndex' = [ nextIndex EXCEPT ![me] = next_keep @@ next_update @@ next_me] + /\ inflight' = [inflight EXCEPT ![me] = inflight_keep @@ inflight_update] + /\ IF m = Nil \* RecvEntry: client request + THEN NetUpdate2(NetmanIncField("n_op", NetBatchAddMsg(ms)), info) + ELSE NetUpdate2(NetReplyBatchAddMsg(ms, m), info) + +\* (maybe_update + maybe_commit) in handle_append_response@raft.rs +AdvanceCommitIdx(me, m, succ_rsp, tempNextIndex, tempInflight, old_pause) == + LET F[i \in 0..NumServer] == + IF i = 0 THEN <<<<>>, Servers>> + ELSE LET n == CHOOSE n \in F[i-1][2]: TRUE + IN <> + sorted_match_idx == SortSeq(F[NumServer][1], LAMBDA x, y: x > y) + commit == sorted_match_idx[NumServer \div 2 + 1] + pause == IsPaused(me, 
m.src, tempInflight, progress') + is_ae == succ_rsp.type = M_AE + empty_entries == Len(succ_rsp.data.entries) = 0 + IN IF /\ commit > commitIndex[me] + /\ currentTerm[me] = LogGetTerm2(log[me], commit, "AdvanceCommitIdx", me) + THEN /\ SetCommitIdx(me, commit) \* commit change, maybe send_bcast + /\ FlushSendAppendentries(me, m, tempNextIndex, tempInflight, <<"RecvAppendentriesResponse", "commit change", m.dst, m.src, m>>) \* bcast_append + ELSE /\ UNCHANGED commitIndex + /\ IF old_pause \* If it's an "old pause", it sends empty_entries regardless. + THEN IF pause + THEN /\ NetUpdate2(NetDelMsg(m), <<"RecvAppendentriesResponse", "old_pause_pause", m.dst, m.src, m>>) + /\ UNCHANGED inflight + /\ nextIndex' = tempNextIndex + ELSE /\ NetUpdate2(NetReplyMsg(succ_rsp, m), <<"RecvAppendentriesResponse", "old_pause_send", m.dst, m.src, m>>) + /\ IF is_ae + THEN /\ IF empty_entries + THEN /\ UpdateInflight(me, m.src, 0) + /\ nextIndex' = tempNextIndex + ELSE /\ UpdateInflight(me, m.src, succ_rsp.data.entries[1].index) + /\ UpdateNextIdx(me, m.src, succ_rsp.data.entries[1].index + 1) + ELSE /\ UNCHANGED inflight + /\ nextIndex' = tempNextIndex + ELSE \* 否则 只在不空的时候发送 (aggressive) + IF pause \* If it's an "old pause", it sends empty_entries regardless. 
+ THEN /\ NetUpdate2(NetDelMsg(m), <<"RecvAppendentriesResponse", "aggressive_pause", m.dst, m.src, m>>) + /\ UNCHANGED inflight + /\ nextIndex' = tempNextIndex + ELSE IF is_ae /\ empty_entries + THEN /\ NetUpdate2(NetDelMsg(m), <<"RecvAppendentriesResponse", "aggressive_empty", m.dst, m.src, m>>) + /\ UNCHANGED inflight + /\ nextIndex' = tempNextIndex + ELSE /\ NetUpdate2(NetReplyMsg(succ_rsp, m), <<"RecvAppendentriesResponse", "aggressive_has", m.dst, m.src, m>>) + /\ IF is_ae + THEN /\ UpdateInflight(me, m.src, succ_rsp.data.entries[1].index) \* replicate + /\ UpdateNextIdx(me, m.src, succ_rsp.data.entries[1].index + 1) + ELSE /\ UNCHANGED inflight + /\ nextIndex' = tempNextIndex + \* ELSE /\ UNCHANGED commitIndex + \* /\ IF ~pause + \* THEN IF old_pause + \* THEN /\ NetUpdate2(NetReplyMsg(succ_rsp, m), <<"RecvAppendentriesResponse", "commit still send", m.dst, m.src, m>>) + \* /\ IF ~empty_entries + \* THEN UpdateInflight(me, m.src, succ_rsp.data.entries[1].index) + \* ELSE UpdateInflight(me, m.src, 0) + \* /\ IF empty_entries + \* THEN nextIndex' = tempNextIndex + \* ELSE UpdateNextIdx(me, m.src, succ_rsp.data.entries[1].index + 1) + \* ELSE IF empty_entries + \* THEN /\ NetUpdate2(NetDelMsg(m), <<"RecvAppendentriesResponse", "commit still pause", m.dst, m.src, m>>) + \* /\ UpdateInflight(me, m.src, 0) + \* /\ nextIndex' = tempNextIndex + \* ELSE /\ NetUpdate2(NetReplyMsg(succ_rsp, m), <<"RecvAppendentriesResponse", "commit still send", m.dst, m.src, m>>) + \* /\ UpdateInflight(me, m.src, succ_rsp.data.entries[1].index) + \* /\ UpdateNextIdx(me, m.src, succ_rsp.data.entries[1].index + 1) + \* ELSE /\ NetUpdate2(NetDelMsg(m), <<"RecvAppendentriesResponse", "commit still pause", m.dst, m.src, m>>) + \* /\ UNCHANGED inflight + \* /\ nextIndex' = tempNextIndex + +\* maybe_decr_to @ progress.rs +maybe_decr_to(dst, src, m, next_probe_index) == + LET rejected == m.data.index + match_hint == m.data.reject_hint + pending == m.data.request_snapshot + IN /\ IF 
progress[dst][src][1] = Replicate + THEN IF \/ rejected < matchIndex[dst][src] + \/ /\ rejected = matchIndex[dst][src] + /\ pending = 0 + THEN /\ UNCHANGED << nextIndex, progress, pr_pending>> + /\ NetUpdate2(NetDelMsg(m), <<"RecvAppendentriesResponse", "decr_replicate stale", dst, src, m>>) + ELSE /\ IF pending = 0 + THEN UNCHANGED pr_pending + ELSE UpdatePrPending(dst, src, pending) + /\ UpdateNextIdx(dst, src, matchIndex[dst][src] + 1) \* 不涉及 snapshot 所以 become_probe中 直接进行 else分支 + /\ LET need_snap == \/ nextIndex'[dst][src] < FirstIdx(log'[dst], dst) + \/ pr_pending'[dst][src] /= 0 \* 根据 progress中的pending_request_snapshot进行对比 + one_rsp == IF need_snap + THEN SendSnapshot(dst, src) + ELSE AppendEntriesNext(dst, src, nextIndex') + IN /\ NetUpdate2(NetReplyMsg(one_rsp, m), <<"RecvAppendentriesResponse", "decr_replicate send", dst, src, m>>) \* 一定会变为probe become_probe + /\ IF need_snap + THEN UpdateProgress(dst, src, <>) \* pr.become_snapshot + ELSE IF Len(one_rsp.data.entries) = 0 \* simulate prepare_send_entries determines if the sent entries are empty to update the progress + THEN UpdateProgress(dst, src, <>) + ELSE UpdateProgress(dst, src, <>) + ELSE /\ IF /\ \/ nextIndex[dst][src] = 0 \* probe or snapshot + \/ nextIndex[dst][src] - 1 /= rejected + /\ pending = 0 + THEN /\ UNCHANGED << nextIndex, progress, pr_pending>> + /\ NetUpdate2(NetDelMsg(m), <<"RecvAppendentriesResponse", "decr_probe stale", dst, src, m>>) + ELSE /\ IF pending /= 0 + THEN /\ IF pr_pending[dst][src] = 0 + THEN UpdatePrPending(dst, src, pending) + ELSE UNCHANGED pr_pending + /\ UNCHANGED nextIndex + ELSE /\ LET new_match == Min(rejected, next_probe_index + 1) + new_next_idx == Max(new_match, 1) + IN UpdateNextIdx(dst, src, new_next_idx) + /\ UNCHANGED pr_pending + /\ LET need_snap == \/ nextIndex'[dst][src] < FirstIdx(log'[dst], dst) + \/ pr_pending'[dst][src] /= 0 + one_rsp == IF need_snap + THEN SendSnapshot(dst, src) + ELSE AppendEntriesNext(dst, src, nextIndex') + pause == 
IsPaused(dst, src, inflight, progress) + IN /\ IF pause + THEN NetUpdate2(NetDelMsg(m), <<"RecvAppendentriesResponse", "decr_probe pause", dst, src, m>>) + ELSE NetUpdate2(NetReplyMsg(one_rsp, m), <<"RecvAppendentriesResponse", "decr_probe send", dst, src, m>>) + /\ IF need_snap + THEN UpdateProgress(dst, src, <>) \* pr.become_snapshot + ELSE IF Len(one_rsp.data.entries) = 0 + THEN UpdateProgress(dst, src, <>) + ELSE UpdateProgress(dst, src, <>) + +\* func: handle_append +HandleMsgAER(m) == + LET resp == m.data + src == m.src + dst == m.dst + stale == resp.term < currentTerm[dst] + demote == resp.term > currentTerm[dst] + need_optimize == resp.reject /\ resp.log_term > 0 + next_probe_index == find_conflict_by_term(dst, resp.reject_hint, resp.log_term)[1] + failReason == + IF stale THEN "stale message ignore" ELSE + IF resp.term > currentTerm[dst] THEN "term is smaller" ELSE + IF raftState[dst] /= Leader THEN "not leader" ELSE + IF need_optimize THEN "retry" ELSE "success" + IN IF failReason /= "success" + THEN IF failReason = "stale message ignore" \* drop stale message + THEN /\ UNCHANGED <> + /\ NetUpdate2(NetDelMsg(m), <<"RecvAppendentriesResponse", "stale message ignore", dst, src, m>>) + ELSE IF failReason = "term is smaller" \* Received a newer letter and became a follower + THEN /\ UNCHANGED <> + /\ BecomeFollower(dst, resp.term) + /\ NetUpdate2(NetDelMsg(m), <<"RecvAppendentriesResponse", "term is smaller", dst, src, m>>) + ELSE IF failReason = "not leader" \* node not leader, drop the message + THEN /\ UNCHANGED <> + /\ NetUpdate2(NetDelMsg(m), <<"RecvAppendentriesResponse", "not leader", dst, src, m>>) + ELSE IF failReason = "retry" \* m.reject + THEN /\ UNCHANGED <> + /\ maybe_decr_to(dst, src, m, next_probe_index) + ELSE Assert(FALSE, <<"handle aer Unseen error situation", failReason>>) + ELSE \* success + \* 因为不涉及其他状态变换,我感觉在这里处理了snapshot的progress变化比较合适 + /\ UNCHANGED <> + /\ LET pending == m.data.request_snapshot + prboeToReplicate == 
progress[dst][src][1] = Probe
           \* Snapshot -> Probe: models is_snapshot_caught_up updating the progress
           \* on success (pending_snapshot is empty).
           snapToProbe == /\ progress[dst][src][1] = Snapshot \* Simulate is_snapshot_caught_up will update if successful, pending_snapshot is empty
                          /\ matchIndex'[dst][src] >= progress[dst][src][2]
           \* Written this way because: if resp.index < match the handler returns early
           \* and no packet is sent; essentially nextIndex will change first and the
           \* progress state is updated afterwards, so we can use this directly here.
           \* In maybe_update, match always becomes resp.index.
           nnextBymaybeUpdate == Max(resp.index + 1, nextIndex[dst][src])
           nextToUpdate == IF snapToProbe
                           THEN Max(nnextBymaybeUpdate, pr_pending[dst][src] + 1)
                           ELSE nnextBymaybeUpdate
           \* Pause status before this step, needed by AdvanceCommitIdx below.
           old_pause == IsPaused(dst, src, inflight, progress)
           \* The simulation here is that a call to maybe_update in handle_append_response may update next_idx, but since it will be changed again in prepare_entries, a temporary variable is needed to retrieve the corresponding entries.
           tempNextIndex == [nextIndex EXCEPT ![dst][src] = nextToUpdate]
           \* The temp nextIndex is also needed here.
           need_snap == \/ nextIndex[dst][src] < FirstIdx(log'[dst], dst)
                        \/ /\ ~snapToProbe \* If snapToProbe is true, a snapshot is never needed.
                           /\ pr_pending[dst][src] /= 0
           temp_entries == LogGetEntryOne2(log'[dst], tempNextIndex[dst][src], dst)
           \* Reply is either a snapshot or the next AppendEntries batch.
           one_rsp == IF need_snap
                      THEN SendSnapshot(dst, src)
                      ELSE AppendEntriesNext(dst, src, tempNextIndex)
           repCanSend == inflight[dst][src] <= resp.index \* The number of the arriving packet is stored in inflight, and in raft.rs, the replicate state will be free_to, so we'll simulate it directly here.
           \* We use tempInflight for the pause decision, because everything is changed
           \* in this intermediate state.
           \* NOTE(review): "prboeToReplicate" is a misspelling of "probeToReplicate";
           \* it is defined above this chunk, so renaming needs a coordinated change.
           tempInflight == IF snapToProbe
                           THEN [inflight EXCEPT ![dst][src] = 0]
                           ELSE IF prboeToReplicate
                                THEN [inflight EXCEPT ![dst][src] = 0]
                                ELSE IF repCanSend
                                     THEN [inflight EXCEPT ![dst][src] = 0]
                                     ELSE inflight
       IN IF resp.index > matchIndex[dst][src] \* maybe_update returned true
          THEN /\ UpdateMatchIdx(dst, src, resp.index)
               /\ UNCHANGED <> \* NOTE(review): tuple contents lost in extraction (was <<...>>); restore from VCS.
               \* Here we need to update the progress and nextIndex status according to the content of the message, corresponding to the handle_append_response maybe_update-to-maybe_commit processing logic.
               /\ IF snapToProbe
                  THEN /\ IF Len(temp_entries) = 0 \* To break a circular dependency: the original used the contents of one_rsp.entries, which needs commitIndex', but commitIndex is only updated in advance.
                          THEN UpdateProgress(dst, src, <>) \* NOTE(review): progress tuple lost in extraction.
                          ELSE UpdateProgress(dst, src, <>) \* NOTE(review): progress tuple lost in extraction.
                  ELSE /\ IF prboeToReplicate
                          THEN UpdateProgress(dst, src, <>) \* NOTE(review): progress tuple lost in extraction.
                          ELSE UNCHANGED progress
               /\ AdvanceCommitIdx(dst, m, one_rsp, tempNextIndex, tempInflight, old_pause)
          ELSE /\ UNCHANGED << matchIndex, commitIndex, inflight, nextIndex, pending_snapshot, progress, pr_pending>> \* A direct return doesn't do anything.
               /\ NetUpdate2(NetDelMsg(m), <<"RecvAppendentriesResponse", "maybe_update_fail", dst, src, m>>)

(***************************************************************************)
(* Recv heartBeat                                                          *)
(***************************************************************************)

\* func: handle_heartbeat
\* Follower/candidate handling of a leader heartbeat: ignore stale terms, demote
\* on a newer term, adopt the leader's commit index, and reply (or, if a local
\* snapshot request is pending, ask the leader for a snapshot instead).
HandleMsgHB(m) ==
    LET data == m.data
        src == m.src
        dst == m.dst
        demote == currentTerm[dst] < data.term
        stale == data.term < currentTerm[dst]
        \* A node with a pending local snapshot request answers the heartbeat with
        \* a snapshot request rather than a normal heartbeat response.
        ask_snapReq == pending_snapshot[dst] /= 0
        rsp == IF ask_snapReq
               THEN SendRequestSnapshot(dst)
               ELSE HeartBeatResponse(m)
    IN IF stale
       THEN /\ UNCHANGED noNetVars
            /\ NetUpdate2(NetDelMsg(m), <<"RecvHeartBeat", "stale message ignore", dst, src, m>>)
       ELSE /\ UNCHANGED <> \* NOTE(review): tuple contents lost in extraction.
            /\ IF \/ demote
                  \/ raftState[dst] = Candidate
               THEN /\ BecomeFollowerWithLeader(dst, data.term, src)
                    /\ UNCHANGED <> \* NOTE(review): tuple contents lost in extraction.
               ELSE UNCHANGED <> \* NOTE(review): tuple contents lost in extraction.
            /\ SetCommitIdx(dst, data.commit)
            /\ NetUpdate2(NetReplyMsg(rsp, m), <<"RecvHeartBeat", "success", dst, src, m>>)

(***************************************************************************)
(* Recv HeartBeatResponse                                                  *)
(***************************************************************************)
\* func: handle_heartbeat_response
\* Leader handling of a heartbeat response: drop stale replies, demote on a newer
\* term, otherwise send the follower the next entries (or a snapshot) when it is
\* behind, updating the inflight/nextIndex bookkeeping accordingly.
HandleMsgHBR(m) ==
    LET resp == m.data
        src == m.src
        dst == m.dst
        demote == resp.term > currentTerm[dst]
        stale == resp.term < currentTerm[dst]
    IN IF stale
       THEN /\ UNCHANGED noNetVars
            /\ NetUpdate2(NetDelMsg(m), <<"RecvHeartBeatResponse", "stale message ignore", dst, src, m>>)
       ELSE IF demote
            THEN /\ UNCHANGED << logVars, check_quorum, snapVars, pending_snapshot>>
                 /\ BecomeFollower(dst, resp.term)
                 /\ NetUpdate2(NetDelMsg(m), <<"RecvHeartBeatResponse", "term is smaller", dst, src, m>>)
            ELSE /\ UNCHANGED <> \* NOTE(review): tuple contents lost in extraction.
                 /\ IF matchIndex[dst][src] < LastIdx(log[dst], dst)
                    \* Follower is behind: replicate via snapshot or AppendEntries.
                    THEN LET need_snap == \/ nextIndex[dst][src] < FirstIdx(log'[dst], dst)
                                          \/ pr_pending'[dst][src] /= 0
                             req_msg == IF need_snap
                                        THEN SendSnapshot(dst, src)
                                        ELSE AppendEntriesNext(dst, src, nextIndex)
                             send_entry == IF need_snap
                                           THEN <<>>
                                           ELSE req_msg.data.entries
                             isReplicate == progress[dst][src][1] = Replicate
                             \* Record the index of the entry put on the wire (0 if none).
                             inflightToUpdate == IF send_entry /= <<>>
                                                 THEN send_entry[1].index
                                                 ELSE 0
                             \* Only the Replicate state advances nextIndex optimistically.
                             nextIndexToUpdate == IF isReplicate
                                                  THEN IF send_entry /= <<>>
                                                       THEN nextIndex[dst][src] + 1
                                                       ELSE nextIndex[dst][src]
                                                  ELSE nextIndex[dst][src]
                         IN /\ NetUpdate2(NetReplyMsg(req_msg, m), <<"RecvHeartBeatResponse", "send append", dst, src, m>>)
                            /\ UpdateInflight(dst, src ,inflightToUpdate)
                            /\ UpdateNextIdx(dst, src, nextIndexToUpdate)
                    ELSE /\ NetUpdate2(NetDelMsg(m), <<"RecvHeartBeatResponse", "not send", dst, src, m>>)
                         /\ UpdateInflight(dst, src ,0)
                         /\ UNCHANGED nextIndex


\* in step_leader: msg_propose
\* Leader accepts a client command: append it to the local log, bump the leader's
\* own matchIndex, and broadcast AppendEntries.
RecvEntry(n, data) ==
    /\ raftState[n] = Leader
    /\ UNCHANGED <> \* NOTE(review): tuple contents lost in extraction.
    /\ LET ety == [ term |-> currentTerm[n], data |-> data, index |-> LastIdx(log[n], n) + 1]
       IN log' = Update(log, n, LogAppend(log[n], ety))
    /\ IF matchIndex[n][n] < LastIdx(log'[n], n)
       THEN UpdateMatchIdx(n, n, LastIdx(log'[n], n))
       ELSE UNCHANGED matchIndex
    /\ FlushSendAppendentries(n, Nil, nextIndex, inflight, <<"RecvEntry", n, data>>)


(***************************************************************************
    restart node
 ***************************************************************************)

\* Server i restarts. Only currentTerm/votedFor/log restored (i.e. unchanged).
\* NOTE: snapshot variables are considered as parts of log
\* NOTE: last applied index should be cleared here if modelled.
Restart(i) ==
    /\ UNCHANGED <> \* NOTE(review): tuple contents lost in extraction.
    /\ raftState' = [raftState EXCEPT ![i] = Follower ]
    /\ leader_id' = [ leader_id EXCEPT ![i] = Nil]
    /\ voted_for_me' = [ voted_for_me EXCEPT ![i] = {} ]
    /\ voted_reject' = [ voted_reject EXCEPT ![i] = {} ]
    /\ nextIndex' = [ nextIndex EXCEPT ![i] = [j \in Servers |-> 1 ]]
    /\ matchIndex' = [ matchIndex EXCEPT ![i] = [j \in Servers |-> 0 ]]
    /\ progress' = [ progress EXCEPT ![i] = [j \in Servers |-> <>]] \* NOTE(review): initial progress tuple lost in extraction.
    /\ inflight' = [ inflight EXCEPT ![i] = [j \in Servers |-> 0 ]]


(***************************************************************************
    handle_snapshot
 ***************************************************************************)

\* Log load from snapshot. Update logInfo and log
LogLoadFromSnapshot(s, lastIdx) ==
    \* apply_snapshot: entries.clear(), but the last snapshot's data is kept.
    /\ log' = [ log EXCEPT ![s] = <<[term |-> 1, data |-> Nil, index |-> 1]>> ]
    /\ SetCommitIdx(s, lastIdx)

\* Receiver handling of an incoming snapshot message: ignore stale terms, demote
\* if needed, reject (SnapFail) when the snapshot is already committed or can be
\* fast-forwarded, otherwise install it and reply SnapSuccess.
HandleMsgSnap(m) ==
    LET data == m.data
        src == m.src
        dst == m.dst
        demote == currentTerm[dst] < data.term
        isCandidate == raftState[dst] = Candidate
        stale == data.term < currentTerm[dst]
        \* Fast-forward: the snapshot's last entry already matches our log term
        \* and no local snapshot request is pending.
        fast_forward ==
            /\ LogGetTerm2(log[dst], data.metaIndex, "HandleSnap", dst) = data.metaTerm
            /\ pending_snapshot[dst] = 0
        commit_already == data.metaIndex < commitIndex[dst]
    IN IF stale
       THEN /\ UNCHANGED noNetVars
            /\ NetUpdate2(NetDelMsg(m), <<"RecvSnapShot", "stale message ignore", dst, src, m>>)
       ELSE /\ IF demote \/ isCandidate
               THEN /\ BecomeFollowerWithLeaderWithouPending(dst, data.term, src)
                    /\ UNCHANGED <> \* NOTE(review): tuple contents lost in extraction.
               ELSE UNCHANGED <> \* NOTE(review): tuple contents lost in extraction.
            /\ IF commit_already \/ fast_forward \* snapshot restore false
               THEN /\ UNCHANGED <> \* NOTE(review): tuple contents lost in extraction.
                    /\ IF commit_already
                       THEN UNCHANGED commitIndex
                       ELSE SetCommitIdx(dst, data.metaIndex)
                    /\ LET fail_msg == SnapFail(m)
                       IN NetUpdate2(NetReplyMsg(fail_msg, m), <<"RecvSnapShot", "restore fail-commit_already", dst, src, m>>)
               ELSE /\ UpdatePendingSnap(dst, 0)
                    /\ UpdateSnapIdx(dst, data.metaIndex) \* snapshot restore true
                    /\ UpdateSnapTerm(dst, data.metaTerm)
                    /\ LogLoadFromSnapshot(dst, data.metaIndex)
                    /\ LET success_msg == SnapSuccess(m)
                       IN NetUpdate2(NetReplyMsg(success_msg, m), <<"RecvSnapShot", "restore success", dst, src, m>>)


(***************************************************************************
    Log compact
 ***************************************************************************)

\* Compact node n's log up to its commit index.
CompactLog(n) ==
    /\ UNCHANGED <> \* NOTE(review): tuple contents lost in extraction.
    /\ LET index == commitIndex[n]
       IN log' = Update(log, n, LogGetEntriesFrom2(log[n], index, n))

(***************************************************************************
    Snapshot request
 ***************************************************************************)

\* A non-leader that knows a leader and has no pending request asks the leader
\* for a snapshot up to its last log index.
SnapRequest(n) ==
    /\ raftState[n] /= Leader
    /\ leader_id[n] /= Nil
    /\ pending_snapshot[n] = 0
    /\ UNCHANGED <> \* NOTE(review): tuple contents lost in extraction.
    /\ LogLastTerm(log[n], n) = currentTerm[n]
    /\ UpdatePendingSnap(n , LastIdx(log[n], n))
    /\ LET msg == SendRequestSnapshot(n)
       IN NetUpdate2(NetmanIncField("n_rqSnap", NetAddMsg(msg)), <<"DoSnapRequest", n, msg.src, msg.dst, msg>>)




(***************************************************************************
    State constraints
 ***************************************************************************)
\* Here are some state limits to prevent state explosion due to control tla+
\* Log length ignoring NoOp entries.
GetRealLogLen(curLog) == SelectSeq(curLog, LAMBDA i: i.data /= NoOp)
GetMaxLogLen == Len(log[CHOOSE i \in Servers: \A j \in Servers \ {i}:
                        GetRealLogLen(log[i]) >= GetRealLogLen(log[j])])
GetMaxTerm == currentTerm[CHOOSE i \in Servers: \A j \in Servers \ {i}:
                        currentTerm[i] >= currentTerm[j]]

\* Per-counter bounds checked against the model's configuration constants.
ScSent == CheckParameterMax(netman.n_sent, "MaxSentMsgs")
ScRecv == CheckParameterMax(netman.n_recv, "MaxRecvMsgs")
ScWire == CheckParameterMax(netman.n_wire, "MaxWireMsgs")
\* ScLog == CheckParameterMax(GetMaxLogLen, "MaxLogLength")
\* ScTerm == CheckParameterMax(GetMaxTerm, "MaxTerm")
ScPart == CheckParameterMax(netman.n_part, "MaxPartitionTimes")
ScCure == CheckParameterMax(netman.n_cure, "MaxCureTimes")
ScOp == CheckParameterMax(netman.n_op, "MaxClientOperationsTimes")
ScAe == CheckParameterMax(netman.n_ae, "MaxAppendEntriesTimes")
ScElec == CheckParameterMax(netman.n_elec, "MaxElectionTimes")
ScDrop == CheckParameterMax(netman.n_drop, "MaxDropTimes")
ScDup == CheckParameterMax(netman.n_dup, "MaxDupTimes")
ScRestart == CheckParameterMax(netman.n_restart, "MaxRestart")
ScUnorder == CheckParameterMax(netman.n_unorder, "MaxUnorderTimes")

\* Combined state constraint handed to TLC.
SC == /\ ScSent /\ ScRecv /\ ScWire /\ ScRestart
      /\ ScPart /\ ScCure /\ ScOp /\ ScAe /\ ScElec
(***************************************************************************)
(* Invariants                                                              *)
(***************************************************************************)
\* At most one leader per term (checked on the primed state).
ElectionSafety ==
    LET TwoLeader ==
            \E i, j \in Servers:
                /\ i /= j
                /\ currentTerm'[i] = currentTerm'[j]
                /\ raftState'[i] = Leader
                /\ raftState'[j] = Leader
    IN ~TwoLeader

\* A leader never overwrites or deletes entries in its own log.
LeaderAppendOnly ==
    \* Compaction steps are not checked.
    IF netcmd'[1][1] = "DoCompact" THEN TRUE
    ELSE \A i \in Servers:
            IF raftState[i] = Leader /\ raftState'[i] = Leader
            THEN LET curLog == log[i]
                     nextLog == log'[i]
                 IN IF Len(nextLog) >= Len(curLog)
                    THEN SubSeq(nextLog, 1, Len(curLog)) = curLog
                    ELSE FALSE
            ELSE TRUE

\* Every (index, term) pair determines a log prefix.
\* From page 8 of the Raft paper: "If two logs contain an entry with the same index and term, then the logs are identical in all preceding entries."
\* LogMatching ==
\*     \A i, j \in Servers : i /= j =>
\*         \A n \in 1..min(Len(log[i]), Len(log[j])) :
\*             log[i][n].term = log[j][n].term =>
\*             SubSeq(log[i],1,n) = SubSeq(log[j],1,n)

LogMatching ==
    ~UNCHANGED log => \* check the safety only if log has unchanged to avoid unnecessary evaluation cost
    \A i, j \in Servers:
        IF i /= j
        THEN LET iLog == log'[i]
                 jLog == log'[j]
                 len == Min(Len(iLog), Len(jLog))
                 \* F builds a map from (index, term) keys to entry data; a clash
                 \* is recorded under the sentinel key <<-1, -1>>.
                 F[k \in 0..len] ==
                    IF k = 0 THEN <<>>
                    ELSE LET key1 == <> \* NOTE(review): key tuple lost in extraction (likely <<index, term>>).
                             value1 == iLog[k].data
                             key2 == <> \* NOTE(review): key tuple lost in extraction.
                             value2 == jLog[k].data
                             F1 == IF key1 \in DOMAIN F[k-1]
                                   THEN IF F[k-1][key1] = value1
                                        THEN F[k-1]
                                        ELSE F[k-1] @@ ( <<-1, -1>> :> <> ) \* NOTE(review): value tuple lost in extraction.
                                   ELSE F[k-1] @@ (key1 :> value1)
                             F2 == IF key2 \in DOMAIN F1
                                   THEN IF F1[key2] = value2
                                        THEN F1
                                        ELSE F1 @@ ( <<-1, -1>> :> <> ) \* NOTE(review): value tuple lost in extraction.
                                   ELSE F1 @@ (key2 :> value2)
                         IN F2
             IN IF << -1, -1>> \notin DOMAIN F[len] THEN TRUE
                ELSE FALSE
        ELSE TRUE

MonotonicCurrentTerm == \A i \in Servers: currentTerm' [i] >= currentTerm[i]

MonotonicCommitIdx == \A i \in Servers: commitIndex'[i] >= commitIndex[i]

\* matchIndex may only grow while a node stays leader across the step.
MonotonicMatchIdx ==
    \A i \in Servers:
        IF raftState[i] = Leader /\ raftState'[i] = Leader \* change
        THEN \A j \in Servers: matchIndex'[i][j] >= matchIndex[i][j]
        ELSE TRUE


\* CommittedLogDurable ==
\*     \A i \in Servers:
\*         LET len == Min(commitIndex'[i], commitIndex[i])
\*             logNext == SubSeq(log'[i], 1, len)
\*             logCur == SubSeq(log[i], 1, len)
\*         IN IF len = 1 THEN TRUE
\*            ELSE /\ Len(logNext) >= len
\*                 /\ Len(logCur) >= len
\*                 /\ logNext = logCur


\* Inv 3: Committed log should be durable (i.e. cannot be rolled back)
\* CommittedLogDurable ==
\*     \A i \in Servers:
\*         LET lenNext == commitIndex'[i] - snapshotLastIdx'[i]
\*             lenCur == commitIndex[i] - snapshotLastIdx[i]
\*             len == Min(lenNext, lenCur)
\*             idx == Min(commitIndex'[i], commitIndex[i])
\*             logNext == LogGetEntriesTo2(log'[i], idx, i)
\*             logCur == LogGetEntriesTo2(log[i], idx, i)
\*         IN IF len = 0 \/ Len(logNext) = 0 \/ Len(logCur) = 0 THEN TRUE
\*            ELSE /\ Len(logNext) >= len
\*                 /\ Len(logCur) >= len
\*                 /\ SubSeq(logNext, Len(logNext) + 1 - len, Len(logNext)) =
\*                    SubSeq(logCur, Len(logCur) + 1 - len, Len(logCur))

\* Committed log should be durable (i.e. cannot be rolled back).
CommittedLogDurable ==
    \A i \in Servers:
        LET len == Min(commitIndex'[i] - snapshotLastIdx'[i], commitIndex[i] - snapshotLastIdx[i])
            logNext == LogGetEntriesTo2(log'[i], len, i)
            logCur == LogGetEntriesTo2(log[i], len, i)
        IN IF len = 1 \/ Len(logNext) = 0 \/ Len(logCur) = 0 THEN TRUE
           ELSE /\ Len(logNext) >= len
                /\ Len(logCur) >= len
                /\ logNext = logCur

\* Everything a leader considers committed is replicated on a quorum.
CommittedLogReplicatedMajority ==
    \A i \in Servers:
        IF raftState'[i] /= Leader \/ commitIndex'[i] <= 1
        THEN TRUE
        ELSE LET entries == LogGetEntriesTo2(log'[i], commitIndex'[i], i)
                 len == Len(entries)
                 nServer == Cardinality(Servers)
                 \* F iterates over the servers, accumulating <<visited, agreeing>> sets.
                 F[j \in 0..nServer] ==
                    IF j = 0
                    THEN <<{}, {}>>
                    ELSE LET k == CHOOSE k \in Servers: k \notin F[j-1][1]
                             logLenOk == LastIdx(log'[k], k) >= commitIndex'[i]
                             kEntries == LogGetEntriesTo2(log'[k], commitIndex'[i], k)
                             minLen == Min(len, Len(kEntries))
                         IN IF /\ logLenOk
                               /\ \/ minLen = 0
                                  \/ SubSeq(entries, Len(entries) + 1 - minLen,
                                            Len(entries)) =
                                     SubSeq(kEntries, Len(kEntries)+1 - minLen,
                                            Len(kEntries))
                            THEN <> \* NOTE(review): accumulator tuple lost in extraction.
                            ELSE <> \* NOTE(review): accumulator tuple lost in extraction.
             IN IsQuorum(F[nServer][2])

NextIdxGtMatchIdx ==
    \A i \in Servers:
        IF raftState'[i] = Leader
        THEN \A j \in Servers \ {i}: nextIndex'[i][j] > matchIndex'[i][j]
        ELSE TRUE

NextIdxGtZero ==
    \A i \in Servers:
        IF raftState'[i] = Leader
        THEN \A j \in Servers: nextIndex'[i][j] > 0
        ELSE TRUE

\* Like SelectSeq, but the test also receives the element's index.
SelectSeqWithIdx(s, Test(_,_)) ==
    LET F[i \in 0..Len(s)] ==
        IF i = 0
        THEN <<>>
        ELSE IF Test(s[i], i)
             THEN Append(F[i-1], s[i])
             ELSE F[i-1]
    IN F[Len(s)]

\* After an AppendEntries that changed a follower's log, the follower's log is
\* no longer than the leader's.
FollowerLogLELeaderLogAfterAE ==
    LET cmd == netcmd'[1]
        cmd1 == cmd[1]
        cmd2 == cmd[2]
        follower == cmd[3]
        leader == cmd[4]
    IN IF cmd1 = "RecvAppendentries" /\ cmd2 \in { "success", "term Mismatch" }
       THEN IF log[follower] /= log'[follower]
            THEN LastIdx(log'[follower], follower) <= LastIdx(log'[leader], leader)
            ELSE TRUE
       ELSE TRUE

CommitIdxLELogLen ==
    IF netcmd'[1][1] = "RecvSnapShot" THEN TRUE
    ELSE \A i \in Servers: commitIndex'[i] <= LastIdx(log'[i], i)

\* A leader only advances commitIndex over entries from its own term.
LeaderCommitCurrentTermLogs ==
    \A i \in Servers:
        IF raftState'[i] = Leader
        THEN IF commitIndex[i] /= commitIndex'[i]
             THEN LogGetTerm2(log'[i], commitIndex'[i], "LeaderCommitCurrentTermLogs", i) = currentTerm'[i]
             ELSE TRUE
        ELSE TRUE

\* A freshly elected leader's term does not yet appear in any other node's log.
NewLeaderTermNotInLog ==
    \A i \in Servers:
        IF raftState'[i] = Leader /\ raftState[i] /= Leader
        THEN \A j \in Servers \ {i}:
                \A n \in DOMAIN log'[j]:
                    log'[j][n].term /= currentTerm'[i]
        ELSE TRUE

\* The leader owns the most entries of its own term.
LeaderTermLogHasGreatestIdx ==
    \A i \in Servers:
        IF raftState'[i] = Leader
        THEN \A j \in Servers \ {i}:
                LET IncTermLogCount(a, b) == IF a.term = currentTerm'[i] THEN b + 1 ELSE b
                IN FoldSeq(IncTermLogCount, 0, log'[i]) >= FoldSeq(IncTermLogCount, 0, log'[j])
        ELSE TRUE

CheckLeader ==
    \A i \in Servers:
        raftState[i] /= Leader

\* Tuple of invariants evaluated after every action; all entries are currently
\* disabled (commented out).
InvSequence == <<
    \* ElectionSafety,
    \* LeaderAppendOnly,
    \* \* LogMatching,
    \* MonotonicCurrentTerm,
    \* MonotonicCommitIdx,
    \* MonotonicMatchIdx,
    \* CommittedLogDurable,
    \* CommittedLogReplicatedMajority,
    \* NextIdxGtMatchIdx,
    \* NextIdxGtZero,
    \* \* CheckLeader
    \* FollowerLogLELeaderLogAfterAE,
    \* CommitIdxLELogLen,
    \* LeaderCommitCurrentTermLogs,
    \* \* NewLeaderTermNotInLog, \\ violated by per-module testing, omitted for now
    \* LeaderTermLogHasGreatestIdx
>>

\* Holds iff no enabled (non-suppressed) invariant in inv evaluated to FALSE.
INV == Len(SelectSeqWithIdx(inv, LAMBDA x, y: ~x /\ y \notin netman.no_inv)) = 0




(***************************************************************************
    Next actions
 ***************************************************************************)

\* A follower times out and starts an election.
DoElectionTimeout ==
    /\ PrePrune(netman.n_elec, "MaxElectionTimes")
    /\ \E n \in Servers: CheckStateIs(n, Follower) /\ BecomeCandidate(n)
    /\ inv' = InvSequence

\* Promote some node to leader while every node is still a follower.
DoBecomeLeader ==
    /\ PrePrune(netman.n_becomeLeader, "MaxBecomeLeaderTimes")
    /\ \A i \in Servers:
        raftState[i] = Follower
    /\ \E n \in Servers:
        /\ BecomeLeader1(n)
        /\ inv' = InvSequence


\* The leader broadcasts heartbeats.
DoHeartBeat ==
    /\ PrePrune(netman.n_hb, "MaxHeartBeatTimes")
    /\ \E n \in Servers:
        /\ raftState[n] = Leader
        /\ SendHeartBeatAll(n)
    /\ inv' = InvSequence


\* Generic receive: deliver the next wire message of the given type to func.
_DoRecvM(type, func(_)) ==
    /\ \E src, dst \in Servers:
        /\ src /= dst
        /\ LET m == NetGetMsg(src, dst)
           IN /\ m /= Nil
              /\ m.type = type
              /\ func(m)
    /\ inv' = InvSequence


DoHandleMsgRV == /\ _DoRecvM(M_RV, HandleMsgRV)

DoHandleMsgRVR == /\ _DoRecvM(M_RVR, HandleMsgRVR)

DoHandleMsgAE == /\ _DoRecvM(M_AE, HandleMsgAE)

DoHandleMsgAER == /\ _DoRecvM(M_AER, HandleMsgAER)

DoHandleMsgHB == /\ _DoRecvM(M_HB, HandleMsgHB)

DoHandleMsgHBR == /\ _DoRecvM(M_HBR, HandleMsgHBR)

DoHandleMsgSnap == /\ _DoRecvM(M_SNAP, HandleMsgSnap)

\* A client proposes a command to some node.
DoRecvEntry ==
    /\ PrePrune(netman.n_op, "MaxClientOperationsTimes")
    /\ \E n \in Servers, v \in Commands: RecvEntry(n, v)
    /\ inv' = InvSequence

\* DoNetworkDrop ==
\*     /\ PrePrune(NetGetDrop, "MaxDropTimes")
\*     /\ \E m \in msgs:
\*         /\ NetUpdate2(NetDropMsg(m), <<"DoNetworkDrop", m.dst, m.src, m.seq>>)
\*         /\ UNCHANGED noNetVars
\*     /\ inv' = InvSequence

\* DoNetworkDup ==
\*     /\ PrePrune(NetGetDup, "MaxDupTimes")
\*     /\ \E m \in msgs:
\*         /\ NetUpdate2(NetDupMsg(m), <<"DoNetworkDup", m.dst, m.src, m.seq>>)
\*         /\ UNCHANGED noNetVars
\*     /\ inv' = InvSequence

\* Partition a single node away from the rest.
DoNetworkPartition ==
    /\ PrePrune(netman.n_part, "MaxPartitionTimes")
    /\ \E n \in Servers:
        /\ NetUpdate2(NetPartConn({n}), <<"DoNetworkPartition", n>>)
        /\ UNCHANGED noNetVars
    /\ inv' = InvSequence

\* Heal all network partitions.
DoNetworkCure ==
    /\ PrePrune(netman.n_cure, "MaxCureTimes")
    /\ NetIsParted
    /\ NetUpdate2(NetCureConn, <<"DoNetworkCure">>)
    /\ UNCHANGED noNetVars
    /\ inv' = InvSequence

\* A follower spontaneously requests a snapshot from the leader.
DoSnapRequest ==
    /\ PrePrune(netman.n_rqSnap, "MaxSnapRequest")
    /\ \E n \in Servers:
        /\ raftState[n] = Follower
        /\ LET index == commitIndex[n]
           IN /\ UNCHANGED <> \* NOTE(review): tuple contents lost in extraction.
              /\ FirstIdx(log[n], n) < index \* Ensure that operations are not repeated
              /\ SnapRequest(n)
    /\ inv' = InvSequence

\* Crash-and-restart a node (volatile state reset in Restart).
DoRestart ==
    /\ PrePrune(netman.n_restart, "MaxRestart")
    /\ \E n \in Servers:
        /\ Restart(n)
        /\ NetUpdate2(NetmanIncField("n_restart", NetNoAction2("msg_do_nothing")), <<"Dorestart", n>>)
    /\ inv' = InvSequence

\* Next-state relation: disjunction of all enabled actions.
Next ==
    \/ DoRestart
    \/ DoElectionTimeout
    \/ DoBecomeLeader
    \/ DoHeartBeat
    \/ DoHandleMsgRV
    \/ DoHandleMsgRVR
    \/ DoHandleMsgHB
    \/ DoHandleMsgHBR
    \/ DoHandleMsgAE
    \/ DoHandleMsgAER
    \/ DoHandleMsgSnap
    \/ DoRecvEntry
    \/ DoSnapRequest
    \/ DoNetworkPartition
    \/ DoNetworkCure



Spec == Init /\ [][Next]_vars
====