diff --git a/rs/tests/consensus/dual_workload_test.rs b/rs/tests/consensus/dual_workload_test.rs index 89466eda2b76..a39497633687 100644 --- a/rs/tests/consensus/dual_workload_test.rs +++ b/rs/tests/consensus/dual_workload_test.rs @@ -222,8 +222,8 @@ async fn make_xnet_call( fn main() -> Result<()> { SystemTestGroup::new() .with_setup(setup) - .without_assert_no_critical_errors() .add_test(systest!(test)) + .remove_metrics_to_check("critical_errors") .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/max_xnet_payload_size_test.rs b/rs/tests/consensus/max_xnet_payload_size_test.rs index eb696e52e9ff..1e87d95196d9 100644 --- a/rs/tests/consensus/max_xnet_payload_size_test.rs +++ b/rs/tests/consensus/max_xnet_payload_size_test.rs @@ -175,8 +175,8 @@ async fn stable_grow(unican: &UniversalCanister<'_>, num_pages: u32) { fn main() -> Result<()> { SystemTestGroup::new() .with_setup(setup) - .without_assert_no_critical_errors() .add_test(systest!(test)) + .remove_metrics_to_check("critical_errors") .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/replica_determinism_test.rs b/rs/tests/consensus/replica_determinism_test.rs index 283652c6b877..05f23cb15f94 100644 --- a/rs/tests/consensus/replica_determinism_test.rs +++ b/rs/tests/consensus/replica_determinism_test.rs @@ -158,6 +158,9 @@ fn main() -> Result<()> { .with_setup(setup) .without_assert_no_replica_restarts() .add_test(systest!(test)) + // One of the nodes has a corrupted state and proposes a CUP share which will be invalidated + // by the peers (and vice versa), so it's expected that the metric is increased. + .remove_metrics_to_check("consensus_invalidated_artifacts") .execute_from_args()?; Ok(()) } diff --git a/rs/tests/consensus/request_auth_malicious_replica_test.rs b/rs/tests/consensus/request_auth_malicious_replica_test.rs index f3e94f25fe56..6a8e6aab81f7 100644 --- a/rs/tests/consensus/request_auth_malicious_replica_test.rs +++ b/rs/tests/consensus/request_auth_malicious_replica_test.rs @@ -554,6 +554,9 @@ fn main() -> Result<()> { SystemTestGroup::new() .with_setup(setup) .add_test(systest!(test)) + // One of the nodes produces blocks with invalid ingress messages which should be invaldated + // by the peers, so it's expected that the metric is increased. + .remove_metrics_to_check("consensus_invalidated_artifacts") .execute_from_args()?; Ok(()) diff --git a/rs/tests/consensus/safety_test.rs b/rs/tests/consensus/safety_test.rs index f35f6b410cb7..905fb26da1a3 100644 --- a/rs/tests/consensus/safety_test.rs +++ b/rs/tests/consensus/safety_test.rs @@ -58,6 +58,9 @@ fn main() -> Result<()> { SystemTestGroup::new() .with_setup(setup) .add_test(systest!(test)) + // There is a malicious node which proposes invalid blocks, so it's expected that the metric + // is increased. + .remove_metrics_to_check("consensus_invalidated_artifacts") .execute_from_args()?; Ok(()) } diff --git a/rs/tests/driver/src/driver/group.rs b/rs/tests/driver/src/driver/group.rs index a922c1555598..82fd193e212e 100644 --- a/rs/tests/driver/src/driver/group.rs +++ b/rs/tests/driver/src/driver/group.rs @@ -52,7 +52,7 @@ pub const MAX_RUNTIME_THREADS: usize = 16; const REPORT_TASK_NAME: &str = "report"; const SETUP_TASK_NAME: &str = "setup"; const TEARDOWN_TASK_NAME: &str = "teardown"; -const ASSERT_NO_CRITICAL_ERRORS_TASK_NAME: &str = "assert_no_critical_errors"; +const ASSERT_NO_METRICS_ERRORS_TASK_NAME: &str = "assert_no_metrics_errors"; const ASSERT_NO_REPLICA_RESTARTS_TASK_NAME: &str = "assert_no_replica_restarts"; const LIFETIME_GUARD_TASK_PREFIX: &str = "lifetime_guard_"; @@ -424,12 +424,12 @@ impl SystemTestSubGroup { pub struct SystemTestGroup { setup: Option>, teardown: Option>, - assert_no_critical_errors: bool, assert_no_replica_restarts: bool, tests: Vec, timeout_per_test: Option, overall_timeout: Option, with_farm: bool, + metrics_to_check: Vec, } impl Default for SystemTestGroup { @@ -464,12 +464,15 @@ impl SystemTestGroup { Self { setup: Default::default(), teardown: Default::default(), - assert_no_critical_errors: true, assert_no_replica_restarts: true, tests: Default::default(), timeout_per_test: None, overall_timeout: None, with_farm: true, + metrics_to_check: vec![ + String::from("critical_errors"), + String::from("consensus_invalidated_artifacts"), + ], } } @@ -497,8 +500,17 @@ impl SystemTestGroup { self } - pub fn without_assert_no_critical_errors(mut self) -> Self { - self.assert_no_critical_errors = false; + /// If the provided metric will have a non-zero value for any of the nodes, the test will + /// fail. + pub fn add_metrics_to_check(mut self, metric_name: impl ToString) -> Self { + self.metrics_to_check.push(metric_name.to_string()); + self + } + + pub fn remove_metrics_to_check(mut self, metric_name: impl ToString) -> Self { + let metric_name_to_remove = metric_name.to_string(); + self.metrics_to_check + .retain(|metric_name| *metric_name != metric_name_to_remove); self } @@ -699,10 +711,11 @@ impl SystemTestGroup { false, ); - let assert_no_critical_errors_fn: Option<(String, Box)> = if self - .assert_no_critical_errors + let assert_no_metric_errors_fn: Option<(String, Box)> = if !self + .metrics_to_check + .is_empty() { - let teardown_fn = |env: TestEnv| { + let teardown_fn = move |env: TestEnv| { let topology = match env.safe_topology_snapshot() { Ok(topology) => topology, Err(e) => { @@ -718,11 +731,11 @@ impl SystemTestGroup { .flat_map(|subnet| subnet.nodes()) .collect(); for node in nodes { - node.assert_no_critical_errors(); + node.assert_no_metrics_errors(self.metrics_to_check.clone()); } }; Some(( - ASSERT_NO_CRITICAL_ERRORS_TASK_NAME.to_string(), + ASSERT_NO_METRICS_ERRORS_TASK_NAME.to_string(), Box::new(teardown_fn), )) } else { @@ -763,7 +776,7 @@ impl SystemTestGroup { .teardown .into_iter() .map(|teardown| (TEARDOWN_TASK_NAME.to_string(), teardown)) - .chain(assert_no_critical_errors_fn) + .chain(assert_no_metric_errors_fn) .chain(assert_no_replica_restarts_fn) .map(|(teardown_name, teardown_fn)| { let logger = logger.clone(); diff --git a/rs/tests/driver/src/driver/test_env_api.rs b/rs/tests/driver/src/driver/test_env_api.rs index 3cd643b07e2c..f059101a1053 100644 --- a/rs/tests/driver/src/driver/test_env_api.rs +++ b/rs/tests/driver/src/driver/test_env_api.rs @@ -1178,13 +1178,10 @@ impl IcNodeSnapshot { )) } - pub fn assert_no_critical_errors(&self) { + pub fn assert_no_metrics_errors(&self, metrics_to_check: Vec) { block_on(async { - let replica_metric_name_prefix = "critical_errors"; - let replica_metrics_fetcher = MetricsFetcher::new( - std::iter::once(self.clone()), - vec![replica_metric_name_prefix.to_string()], - ); + let replica_metrics_fetcher = + MetricsFetcher::new(std::iter::once(self.clone()), metrics_to_check); let replica_metrics_result = replica_metrics_fetcher.fetch::().await; let replica_metrics = match replica_metrics_result { Ok(replica_metrics) => replica_metrics, @@ -1198,14 +1195,16 @@ impl IcNodeSnapshot { }; assert!( !replica_metrics.is_empty(), - "No critical error counters were found in replica metrics for node {}", + "No error counters were found in replica metrics for node {}", self.node_id ); for (name, value) in replica_metrics { assert_eq!( value[0], 0, - "Critical error {} raised by node {}. Create `SystemTestGroup` using `without_assert_no_critical_errors` if critical errors are expected in your test", - name, self.node_id + "The metric `{name}` on node {} has non-zero value. \ + If the metric is allowed to be non-zero in the test, \ + create `SystemTestGroup` with `remove_metrics_to_check(\"{name}\")", + self.node_id, ); } });