From bef74c0475a40dcc6d673cfe639200ac859a5434 Mon Sep 17 00:00:00 2001
From: Eric Schoeller
Date: Fri, 3 Apr 2026 18:58:37 -0600
Subject: [PATCH] module: Reset peer notification state on normal recoveries

handle_notification_data() propagates notification metadata from the
sending peer to receivers. For normal recovery notifications, that
metadata carries current_notification_number as it existed at
NEBTYPE_NOTIFICATION_END: after the recovery notification increment,
but before Naemon's local post-recovery reset.

On receiving peers, the recovery check result is replayed through
Naemon first, which correctly resets current_notification_number to 0.
The later recovery notification packet then overwrites that reset with
the sender's stale nonzero value.

That stale counter persists on non-sender peers. If notification
ownership later shifts for the same host, the new sender inherits the
stale nonzero current_notification_number and Naemon skips the
first_notification_delay gate, causing an immediate notification at
HARD DOWN.

Context:
- 94f8aabf introduced cross-peer notification-state sync for later
  renotify/escalation.
- e32d4f5b added add_notified_on() so recoveries could be sent from a
  different node than the one that sent the problem notification.
- Normal recoveries are different because they represent the terminal
  post-problem state and should not preserve the pre-reset counter.

Fix this by treating NOTIFICATION_NORMAL + STATE_UP/STATE_OK as a reset
on receipt: clear current_notification_number and notified_on.
Problem-state notifications continue to sync normally.
Ref: https://github.com/ITRS-Group/monitor-merlin/issues/126 --- module/module.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/module/module.c b/module/module.c index 681e8058..7b30defe 100644 --- a/module/module.c +++ b/module/module.c @@ -565,20 +565,39 @@ static int handle_notification_data(__attribute__((unused)) merlin_node *node, v struct host *h = find_host(ds->host_name); if (!h) return -1; - h->current_notification_number = (int)(uintptr_t)(ds->object_ptr); + /* + * Normal recovery notifications carry a stale nonzero + * current_notification_number from the sender (serialized + * at NEBTYPE_NOTIFICATION_END before the sender's local + * reset). Blindly applying it overwrites the receiver's + * correct post-recovery reset of 0, causing + * first_notification_delay to be bypassed on the next + * incident. Treat normal recovery as a reset instead. + */ + if (ds->reason_type == NOTIFICATION_NORMAL && ds->state == STATE_UP) { + h->current_notification_number = 0; + h->notified_on = 0; + } else { + h->current_notification_number = (int)(uintptr_t)(ds->object_ptr); + add_notified_on(h, ds->state); + } h->last_notification = ds->start_time.tv_sec; h->next_notification = ds->end_time.tv_sec; h->no_more_notifications = ds->start_time.tv_usec; - add_notified_on(h, ds->state); } else { struct service *s = find_service(ds->host_name, ds->service_description); if (!s) return -1; - s->current_notification_number = (int)(uintptr_t)(ds->object_ptr); + if (ds->reason_type == NOTIFICATION_NORMAL && ds->state == STATE_OK) { + s->current_notification_number = 0; + s->notified_on = 0; + } else { + s->current_notification_number = (int)(uintptr_t)(ds->object_ptr); + add_notified_on(s, ds->state); + } s->last_notification = ds->start_time.tv_sec; s->next_notification = ds->end_time.tv_sec; s->no_more_notifications = ds->start_time.tv_usec; - add_notified_on(s, ds->state); } return 0; }