From bef74c0475a40dcc6d673cfe639200ac859a5434 Mon Sep 17 00:00:00 2001
From: Eric Schoeller <schoelle@colorado.edu>
Date: Fri, 3 Apr 2026 18:58:37 -0600
Subject: [PATCH] module: Reset peer notification state on normal recoveries

handle_notification_data() propagates notification metadata from the
sending peer to receivers. For normal recovery notifications, that
metadata carries current_notification_number as it existed at
NEBTYPE_NOTIFICATION_END: after the recovery notification increment,
but before Naemon's local post-recovery reset.

On receiving peers, the recovery check result is replayed through
Naemon first, which correctly resets current_notification_number to 0.
The later recovery notification packet then overwrites that reset with
the sender's stale nonzero value.

That stale counter persists on non-sender peers. If notification
ownership later shifts for the same host or service, the new sender
inherits the stale nonzero current_notification_number and Naemon skips
the first_notification_delay gate, causing an immediate notification at
the first HARD problem state (e.g. HARD DOWN for a host).

Context:
- 94f8aabf introduced cross-peer notification-state sync for later
  renotify/escalation.
- e32d4f5b added add_notified_on() so recoveries could be sent from a
  different node than the one that sent the problem notification.
- Normal recoveries are different because they represent the terminal
  post-problem state and should not preserve the pre-reset counter.

Fix this by treating NOTIFICATION_NORMAL + STATE_UP/STATE_OK as a reset
on receipt: clear current_notification_number and notified_on.
Problem-state notifications continue to sync normally.

Ref: https://github.com/ITRS-Group/monitor-merlin/issues/126
---
 module/module.c | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/module/module.c b/module/module.c
index 681e8058..7b30defe 100644
--- a/module/module.c
+++ b/module/module.c
@@ -565,20 +565,39 @@ static int handle_notification_data(__attribute__((unused)) merlin_node *node, v
 		struct host *h = find_host(ds->host_name);
 		if (!h)
 			return -1;
-		h->current_notification_number = (int)(uintptr_t)(ds->object_ptr);
+		/*
+		 * Normal recovery notifications carry a stale nonzero
+		 * current_notification_number from the sender (serialized
+		 * at NEBTYPE_NOTIFICATION_END before the sender's local
+		 * reset). Blindly applying it overwrites the receiver's
+		 * correct post-recovery reset of 0, causing
+		 * first_notification_delay to be bypassed on the next
+		 * incident. Treat normal recovery as a reset instead.
+		 */
+		if (ds->reason_type == NOTIFICATION_NORMAL && ds->state == STATE_UP) {
+			h->current_notification_number = 0;
+			h->notified_on = 0;
+		} else {
+			h->current_notification_number = (int)(uintptr_t)(ds->object_ptr);
+			add_notified_on(h, ds->state);
+		}
 		h->last_notification = ds->start_time.tv_sec;
 		h->next_notification = ds->end_time.tv_sec;
 		h->no_more_notifications = ds->start_time.tv_usec;
-		add_notified_on(h, ds->state);
 	} else {
 		struct service *s = find_service(ds->host_name, ds->service_description);
 		if (!s)
 			return -1;
-		s->current_notification_number = (int)(uintptr_t)(ds->object_ptr);
+		if (ds->reason_type == NOTIFICATION_NORMAL && ds->state == STATE_OK) {
+			s->current_notification_number = 0;
+			s->notified_on = 0;
+		} else {
+			s->current_notification_number = (int)(uintptr_t)(ds->object_ptr);
+			add_notified_on(s, ds->state);
+		}
 		s->last_notification = ds->start_time.tv_sec;
 		s->next_notification = ds->end_time.tv_sec;
 		s->no_more_notifications = ds->start_time.tv_usec;
-		add_notified_on(s, ds->state);
 	}
 	return 0;
 }