From 2a0c32baebe4fa81432955fcc5ab9d2d2f9cd918 Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Tue, 8 Apr 2025 10:38:24 +0200 Subject: [PATCH 01/21] We implement a watchdog system. The system inherits from the heartbeat service. Till now we added the ability to send messages to slack using a slack_hook. Next we will do some diagnostics on docker containers itself. --- Dockerfile | 1 + dripline/extensions/__init__.py | 1 + dripline/extensions/watchdog_service.py | 139 ++++++++++++++++++++++++ 3 files changed, 141 insertions(+) create mode 100644 dripline/extensions/watchdog_service.py diff --git a/Dockerfile b/Dockerfile index 2287f91..1ff0f10 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,7 @@ FROM ${img_user}/${img_repo}:${img_tag} COPY . /usr/local/src_dragonfly WORKDIR /usr/local/src_dragonfly +RUN pip install docker RUN pip install . WORKDIR / diff --git a/dripline/extensions/__init__.py b/dripline/extensions/__init__.py index b62df1e..722b758 100644 --- a/dripline/extensions/__init__.py +++ b/dripline/extensions/__init__.py @@ -7,3 +7,4 @@ # Modules in this directory from .add_auth_spec import * +from .watchdog_service import * diff --git a/dripline/extensions/watchdog_service.py b/dripline/extensions/watchdog_service.py new file mode 100644 index 0000000..d539191 --- /dev/null +++ b/dripline/extensions/watchdog_service.py @@ -0,0 +1,139 @@ +import time +import json +import requests +import docker +from datetime import datetime, timedelta +from enum import Enum +from dripline.core import ThrowReply +from dripline.implementations import HeartbeatMonitor, HeartbeatTracker + +import logging +logger = logging.getLogger(__name__) + +__all__ = [] + + +__all__.append('WatchDogTracker') +class WatchDogTracker(HeartbeatTracker): + ''' + ''' + def __init__(self, **kwargs): + ''' + ''' + HeartbeatTracker.__init__(self, **kwargs) + + def process_heartbeat(self, timestamp): + ''' + ''' + logger.debug(f'New timestamp for {self.name}: {timestamp}') + dt = datetime.fromisoformat(timestamp) + posix_time = dt.timestamp() + logger.debug(f'Time since epoch: {posix_time}') + self.last_timestamp = posix_time + + def check_delay(self): + ''' + ''' + diff = time.time() - self.last_timestamp + if self.is_active: + if diff > self.service.critical_threshold_s: + # report critical + logger.critical(f'Missing heartbeat: {self.name}') + self.status = HeartbeatTracker.Status.CRITICAL + else: + if diff > self.service.warning_threshold_s: + # report warning + logger.warning(f'Missing heartbeat: {self.name}') + self.status = HeartbeatTracker.Status.WARNING + else: + logger.debug(f'Heartbeat status ok: {self.name}') + self.status = HeartbeatTracker.Status.OK + else: + # report inactive heartbeat received + logger.debug(f'Inactive heartbeat: time difference: {diff}') + self.status = HeartbeatTracker.Status.UNKNOWN + return {'status': self.status, 'time_since_last_hb': diff} + + class Status(Enum): + OK = 0 + WARNING = 1 + CRITICAL = 2 + UNKNOWN = -1 + + + +__all__.append('WatchDogService') +class WatchDogService(HeartbeatMonitor): + ''' + An alert consumer which listens to heartbeat messages and keeps track of the time since the last was received + + ''' + def __init__(self, **kwargs): + ''' + Args: + time_between_checks_s (int): number of seconds between heartbeat status checks + warning_threshold_s (int): warning threshold for missing heartbeats (in seconds) + critical_threshold_s (int): critical threshold for missing heartbeats (in seconds) + add_unknown_heartbeats (bool): whether or not to add a new endpoint 
if an unknown heartbeat is received + socket_timeout (int): number of seconds to wait for a reply from the device before timeout. + ''' + self.slack_hook = kwargs.pop("slack_hook", None) + self.blacklist = kwargs.pop("blacklist_containers", []) + HeartbeatMonitor.__init__(self, **kwargs) + self.slack_message("Started alert script") + + def slack_message(self, text): + if self.slack_hook is not None: + post = {"text": "{0}".format(text)} + response = requests.post(self.slack_hook, headers={'Content-Type': 'application/json'}, data=json.dumps(post)) + + if response.status_code != 200: + logger.error('Request to slack returned an error %s, the response is:\n%s' % (response.status_code, response.text) ) + + + def run_checks(self): + ''' + Checks all endpoints and collects endpoint names by heartbeat tracker status. + ''' + report_data = { + HeartbeatTracker.Status.OK: [], + HeartbeatTracker.Status.WARNING: [], + HeartbeatTracker.Status.CRITICAL: [], + HeartbeatTracker.Status.UNKNOWN: [], + } + for an_endpoint in self.sync_children.values(): + try: + endpoint_report = an_endpoint.check_delay() + report_data[endpoint_report['status']].append( + { + 'name': an_endpoint.name, + 'time_since_last_hb': endpoint_report['time_since_last_hb'], + } + ) + except Exception as err: + logger.error(f'Unable to get status of endpoint {an_endpoint.name}: {err}') + return report_data + + def process_report(self, report_data): + ''' + Print out the information from the monitoring report data. + + This function can be overridden to handle the monitoring report differently. + ''' + logger.info('Heartbeat Monitor Status Check') + if report_data[HeartbeatTracker.Status.CRITICAL]: + logger.error('Services with CRITICAL status:') + for endpoint_data in report_data[HeartbeatTracker.Status.CRITICAL]: + logger.error(f'\t{endpoint_data['name']} -- TSLH: {timedelta(seconds=endpoint_data['time_since_last_hb'])}') + if report_data[HeartbeatTracker.Status.WARNING]: + logger.warning('Services with WARNING status:') + for endpoint_data in report_data[HeartbeatTracker.Status.WARNING]: + logger.warning(f'\t{endpoint_data['name']} -- TSLH: {timedelta(seconds=endpoint_data['time_since_last_hb'])}') + if report_data[HeartbeatTracker.Status.OK]: + logger.info(f'Services with OK status:') + for endpoint_data in report_data[HeartbeatTracker.Status.OK]: + logger.info(f'\t{endpoint_data['name']} -- TSLH: {timedelta(seconds=endpoint_data['time_since_last_hb'])}') + if report_data[HeartbeatTracker.Status.UNKNOWN]: + logger.info(f'Services with UNKNOWN status:') + for endpoint_data in report_data[HeartbeatTracker.Status.UNKNOWN]: + logger.info(f'\t{endpoint_data['name']} -- TSLH: {timedelta(seconds=endpoint_data['time_since_last_hb'])}') From b786c57a7bb815c0d4aa7c1431ce4a72d67ebaa1 Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Tue, 8 Apr 2025 11:36:13 +0200 Subject: [PATCH 02/21] added checking of docker containers --- dripline/extensions/watchdog_service.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/dripline/extensions/watchdog_service.py b/dripline/extensions/watchdog_service.py index d539191..67267c2 100644 --- a/dripline/extensions/watchdog_service.py +++ b/dripline/extensions/watchdog_service.py @@ -81,6 +81,7 @@ def __init__(self, **kwargs): self.blacklist = kwargs.pop("blacklist_containers", []) HeartbeatMonitor.__init__(self, **kwargs) self.slack_message("Started alert script") + self.client = docker.from_env() def slack_message(self, text): if self.slack_hook is not None: @@ -90,11 +91,24 @@ def 
slack_message(self, text): if response.status_code != 200: logger.error('Request to slack returned an error %s, the response is:\n%s' % (response.status_code, response.text) ) + def check_docker(self): + for container in self.client.containers.list(all=True): + if any([v in container.name for v in self.blacklist]): + logger.info(f"Skip container {container.name} as it is in blacklist") + continue + if container.status != "running": + logger.info(f"Container {container.name} is not running.") + self.slack_message(f"Container {container.name} is not running.") + if container.attrs["State"]["ExitCode"] != 0: + self.slack_message(f"Container {container.name} has exit code {container.attrs['State']['ExitCode']}") + if container.attrs["State"]["Error"] != "": + self.slack_message(f"Container {container.name} has error {container.attrs['State']['Error']}") def run_checks(self): ''' Checks all endpoints and collects endpoint names by heartbeat tracker status. ''' + self.check_docker() report_data = { HeartbeatTracker.Status.OK: [], HeartbeatTracker.Status.WARNING: [], From ff74880c237b1a72ace509d84a22e217f880e905 Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Wed, 9 Apr 2025 12:38:54 +0200 Subject: [PATCH 03/21] I decided to not make this a dripline extension but to make it a dragonfly stand alone script. That has several advantages, one of it being that we do not depend on rabbit broker, and thus the checks still work if rabbit broker went down. --- dripline/extensions/__init__.py | 1 - dripline/extensions/watchdog_service.py | 153 ------------------------ 2 files changed, 154 deletions(-) delete mode 100644 dripline/extensions/watchdog_service.py diff --git a/dripline/extensions/__init__.py b/dripline/extensions/__init__.py index 722b758..b62df1e 100644 --- a/dripline/extensions/__init__.py +++ b/dripline/extensions/__init__.py @@ -7,4 +7,3 @@ # Modules in this directory from .add_auth_spec import * -from .watchdog_service import * diff --git a/dripline/extensions/watchdog_service.py b/dripline/extensions/watchdog_service.py deleted file mode 100644 index 67267c2..0000000 --- a/dripline/extensions/watchdog_service.py +++ /dev/null @@ -1,153 +0,0 @@ -import time -import json -import requests -import docker -from datetime import datetime, timedelta -from enum import Enum -from dripline.core import ThrowReply -from dripline.implementations import HeartbeatMonitor, HeartbeatTracker - -import logging -logger = logging.getLogger(__name__) - -__all__ = [] - - -__all__.append('WatchDogTracker') -class WatchDogTracker(HeartbeatTracker): - ''' - ''' - def __init__(self, **kwargs): - ''' - ''' - HeartbeatTracker.__init__(self, **kwargs) - - def process_heartbeat(self, timestamp): - ''' - ''' - logger.debug(f'New timestamp for {self.name}: {timestamp}') - dt = datetime.fromisoformat(timestamp) - posix_time = dt.timestamp() - logger.debug(f'Time since epoch: {posix_time}') - self.last_timestamp = posix_time - - def check_delay(self): - ''' - ''' - diff = time.time() - self.last_timestamp - if self.is_active: - if diff > self.service.critical_threshold_s: - # report critical - logger.critical(f'Missing heartbeat: {self.name}') - self.status = HeartbeatTracker.Status.CRITICAL - else: - if diff > self.service.warning_threshold_s: - # report warning - logger.warning(f'Missing heartbeat: {self.name}') - self.status = HeartbeatTracker.Status.WARNING - else: - logger.debug(f'Heartbeat status ok: {self.name}') - self.status = HeartbeatTracker.Status.OK - else: - # report inactive heartbeat received - 
logger.debug(f'Inactive heartbeat: time difference: {diff}') - self.status = HeartbeatTracker.Status.UNKNOWN - return {'status': self.status, 'time_since_last_hb': diff} - - class Status(Enum): - OK = 0 - WARNING = 1 - CRITICAL = 2 - UNKNOWN = -1 - - - -__all__.append('WatchDogService') -class WatchDogService(HeartbeatMonitor): - ''' - An alert consumer which listens to heartbeat messages and keeps track of the time since the last was received - - ''' - def __init__(self, **kwargs): - ''' - Args: - time_between_checks_s (int): number of seconds between heartbeat status checks - warning_threshold_s (int): warning threshold for missing heartbeats (in seconds) - critical_threshold_s (int): critical threshold for missing heartbeats (in seconds) - add_unknown_heartbeats (bool): whether or not to add a new endpoint if an unknown heartbeat is received - socket_timeout (int): number of seconds to wait for a reply from the device before timeout. - ''' - self.slack_hook = kwargs.pop("slack_hook", None) - self.blacklist = kwargs.pop("blacklist_containers", []) - HeartbeatMonitor.__init__(self, **kwargs) - self.slack_message("Started alert script") - self.client = docker.from_env() - - def slack_message(self, text): - if self.slack_hook is not None: - post = {"text": "{0}".format(text)} - response = requests.post(self.slack_hook, headers={'Content-Type': 'application/json'}, data=json.dumps(post)) - - if response.status_code != 200: - logger.error('Request to slack returned an error %s, the response is:\n%s' % (response.status_code, response.text) ) - - def check_docker(self): - for container in self.client.containers.list(all=True): - if any([v in container.name for v in self.blacklist]): - logger.info(f"Skip container {container.name} as it is in blacklist") - continue - if container.status != "running": - logger.info(f"Container {container.name} is not running.") - self.slack_message(f"Container {container.name} is not running.") - if container.attrs["State"]["ExitCode"] != 0: - self.slack_message(f"Container {container.name} has exit code {container.attrs['State']['ExitCode']}") - if container.attrs["State"]["Error"] != "": - self.slack_message(f"Container {container.name} has error {container.attrs['State']['Error']}") - - def run_checks(self): - ''' - Checks all endpoints and collects endpoint names by heartbeat tracker status. - ''' - self.check_docker() - report_data = { - HeartbeatTracker.Status.OK: [], - HeartbeatTracker.Status.WARNING: [], - HeartbeatTracker.Status.CRITICAL: [], - HeartbeatTracker.Status.UNKNOWN: [], - } - for an_endpoint in self.sync_children.values(): - try: - endpoint_report = an_endpoint.check_delay() - report_data[endpoint_report['status']].append( - { - 'name': an_endpoint.name, - 'time_since_last_hb': endpoint_report['time_since_last_hb'], - } - ) - except Exception as err: - logger.error(f'Unable to get status of endpoint {an_endpoint.name}: {err}') - return report_data - - def process_report(self, report_data): - ''' - Print out the information from the monitoring report data. - - This function can be overridden to handle the monitoring report differently. 
- ''' - logger.info('Heartbeat Monitor Status Check') - if report_data[HeartbeatTracker.Status.CRITICAL]: - logger.error('Services with CRITICAL status:') - for endpoint_data in report_data[HeartbeatTracker.Status.CRITICAL]: - logger.error(f'\t{endpoint_data['name']} -- TSLH: {timedelta(seconds=endpoint_data['time_since_last_hb'])}') - if report_data[HeartbeatTracker.Status.WARNING]: - logger.warning('Services with WARNING status:') - for endpoint_data in report_data[HeartbeatTracker.Status.WARNING]: - logger.warning(f'\t{endpoint_data['name']} -- TSLH: {timedelta(seconds=endpoint_data['time_since_last_hb'])}') - if report_data[HeartbeatTracker.Status.OK]: - logger.info(f'Services with OK status:') - for endpoint_data in report_data[HeartbeatTracker.Status.OK]: - logger.info(f'\t{endpoint_data['name']} -- TSLH: {timedelta(seconds=endpoint_data['time_since_last_hb'])}') - if report_data[HeartbeatTracker.Status.UNKNOWN]: - logger.info(f'Services with UNKNOWN status:') - for endpoint_data in report_data[HeartbeatTracker.Status.UNKNOWN]: - logger.info(f'\t{endpoint_data['name']} -- TSLH: {timedelta(seconds=endpoint_data['time_since_last_hb'])}')
From 163c08fe5b249c4fbe8324d4f362c9a6c96ea11d Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Wed, 9 Apr 2025 12:40:02 +0200 Subject: [PATCH 04/21] had to change the version of the base image since we need a fix that was implemented by Paul K. but not yet rolled out. We should update this once the fix is rolled out. --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 1ff0f10..27455bb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,7 @@ ARG img_user=ghcr.io/driplineorg ARG img_repo=dripline-python -ARG img_tag=develop-dev +#ARG img_tag=develop-dev +ARG img_tag=receiver-test FROM ${img_user}/${img_repo}:${img_tag}
From c2d4bed244f72d6148f498ea0e326ebe30167a2b Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Wed, 9 Apr 2025 12:49:04 +0200 Subject: [PATCH 05/21] This is a stand-alone script that can send out alerts if things go wrong in your setup. There are two different types of checks: 1. we check that all docker containers are running and have no errors, 2. we check some endpoints and see if they fulfill some condition. If there are problems we send a message to slack. The script is configured by a yaml file. An example yaml file is included, as well as a docker-compose file that shows how to set up the script with docker compose. Testing this script worked well. --- dragonfly/alert.yaml | 36 ++++++++++++ dragonfly/docker-compose.yaml | 12 ++++ dragonfly/watchdog.py | 101 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 149 insertions(+) create mode 100644 dragonfly/alert.yaml create mode 100755 dragonfly/docker-compose.yaml create mode 100755 dragonfly/watchdog.py diff --git a/dragonfly/alert.yaml b/dragonfly/alert.yaml new file mode 100644 index 0000000..4b79974 --- /dev/null +++ b/dragonfly/alert.yaml @@ -0,0 +1,36 @@ +dripline_mesh: + broker: rabbit-broker + broker_port: 5672 + +dripline_username: + value: dripline + +dripline_password: + value: dripline + +check_interval_s: 30 + + +# To create a slack webhook see https://api.slack.com/messaging/webhooks steps 1. to 3. 
+slack_hook: "https://hooks.slack.com/services/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" + +blacklist_containers: + # containers listed here will not be checked if they are running or having error messages + - mainzdripline3-dls10ZTranslator + - mainzdripline3-chillerInterface + - mainzdripline3-CoolingLoopSensor1 + - mainzdripline3-slowdash2 + - mainzdripline3-BakeoutController + - mainzdripline3-key-value-store + - mainzdripline3-AlarmSystem + - mainzdripline3-Brainboxes_ED593 + +check_endpoints: + # read this as: if 'endpoint' 'method' 'reference' send 'message' + # e.g. if 'habs_error_status' 'not_equal' '00' send 'HABS power supply issue! Error status: {value}' + # methods can be one of ["not_equal", "equal", "lower", "greater"] + - endpoint: habs_error_status + method: not_equal + reference: "00" + message: "HABS power supply issue! Error status: {value}" + diff --git a/dragonfly/docker-compose.yaml b/dragonfly/docker-compose.yaml new file mode 100755 index 0000000..a2d0798 --- /dev/null +++ b/dragonfly/docker-compose.yaml @@ -0,0 +1,12 @@ +#version: '3' + +services: + + AlarmSystem: + image: dragonfly_docker:latest + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - ./alert.yaml:/root/alert.yaml + - ./watchdog.py:/root/watchdog.py + command: + bash -c "python3 /root/watchdog.py --config /root/alert.yaml" diff --git a/dragonfly/watchdog.py b/dragonfly/watchdog.py new file mode 100755 index 0000000..5a438b8 --- /dev/null +++ b/dragonfly/watchdog.py @@ -0,0 +1,101 @@ +#!/user/bin/env python3 +import requests +import json +import time +import docker +import dripline +import yaml +from pathlib import Path +import argparse + +from dripline.core import Interface + +class WatchDog(object): + def __init__(self, config_path): + self.config_path = config_path + self.load_configuration() + self.setup_docker_client() + self.setup_dripline_connection() + + def load_configuration(self): + with open(Path(args.config), "r") as open_file: + self.config = yaml.safe_load( open_file.read() ) + + if not "slack_hook" in self.config.keys(): + self.config["slack_hook"] = None + + print("Configuration is:", flush=True) + print(self.config, flush=True) + + def setup_docker_client(self): + self.client = docker.from_env() + + def setup_dripline_connection(self): + self.connection = Interface(username=self.config["dripline_username"], + password=self.config["dripline_password"], + dripline_mesh=self.config["dripline_mesh"]) + + def send_slack_message(self, message): + if self.config["slack_hook"] is None: + print("Slack hook not configured. No message will be send!") + return + post = {"text": "{0}".format(message)} + response = requests.post(self.config["slack_hook"], headers={'Content-Type': 'application/json'}, data=json.dumps(post)) + + if response.status_code != 200: + print(f'Request to slack returned an error {response.status_code}, the response is:\n{response.text}') + + + def get_endpoint(self, endpoint, calibrated=False): + val = self.connection.get(endpoint) + return val["value_raw" if not calibrated else "value_cal"] + + def compare(self, value, reference, method): + if method == "not_equal": + return value != reference + elif method == "equal": + return value == reference + elif method == "lower": + return value < reference + elif method == "greater": + return value > reference + else: + raise ValueError(f"Comparison method {method} is not defined. 
You can use one of ['not_equal', 'equal', 'lower', 'greater'].") + + def run(self): + + while True: + for entry in self.config["check_endpoints"]: + value = self.get_endpoint(entry["endpoint"]) + print(entry["endpoint"], value, flush=True) + if self.compare(value, entry["reference"], "not_equal"): + self.send_slack_message(entry["message"].format(**locals())) + # status = get_endpoint("chiller_status") + #status = " ".join([bin(int(status[i*2:(i+1)*2], 16))[2:].zfill(8)[::-1] for i in range(len(status)//2)]) + #print("Chiller status:", status) + #if status != "10000000 00000000 00000000 00000000": + # send_slack_message("Chiller issue! Error code %s"%status) + + for container in self.client.containers.list(all=True): + if any([container.name.startswith(black) for black in self.config["blacklist_containers"]]): + continue + if container.status != "running": + send_slack_message(f"Container {container.name} is not running!") + if int(container.attrs["State"]["ExitCode"]) != 0: + send_slack_message(f"Containeri {container.name} has exit code {container.attrs['State']['ExitCode']}!") + #if "seconds" in container["Status"]: + # send_slack_message(f"Container {container.name} has been restarted!") + + print("Checks done", flush=True) + time.sleep(int(self.config["check_interval_s"])) + + +if __name__ == "__main__": + print("Welcome to Watchdog", flush=True) + + parser = argparse.ArgumentParser() + parser.add_argument("--config", type=str, required=True, help="Path of the yaml config file.") + args = parser.parse_args() + + dog = WatchDog(args.config) + dog.run() From 752a36edfeff25ca7bc59613e3b4846e07b13ca1 Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Wed, 9 Apr 2025 14:32:11 +0200 Subject: [PATCH 06/21] remove webhook, slack notices when your webhook is on github and disables it to prevent others posting to your workspace, this is a place holder and just for demonstration purpose --- dragonfly/alert.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dragonfly/alert.yaml b/dragonfly/alert.yaml index 4b79974..7a09737 100644 --- a/dragonfly/alert.yaml +++ b/dragonfly/alert.yaml @@ -12,7 +12,7 @@ check_interval_s: 30 # To create a slack webhook see https://api.slack.com/messaging/webhooks steps 1. to 3. 
-slack_hook: "https://hooks.slack.com/services/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" +slack_hook: "https://hooks.slack.com/services/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" blacklist_containers: # containers listed here will not be checked if they are running or having error messages From 5abe06fa50773f7162853c7be9d320e579b8663f Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Wed, 9 Apr 2025 14:49:44 +0200 Subject: [PATCH 07/21] send a test message at start up, that also helps to track if things are running --- dragonfly/watchdog.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dragonfly/watchdog.py b/dragonfly/watchdog.py index 5a438b8..781eefb 100755 --- a/dragonfly/watchdog.py +++ b/dragonfly/watchdog.py @@ -16,6 +16,7 @@ def __init__(self, config_path): self.load_configuration() self.setup_docker_client() self.setup_dripline_connection() + self.send_slack_message("Started alarm system!") def load_configuration(self): with open(Path(args.config), "r") as open_file: From fa5e330c94533ee21b006370e8f2487a5b2a10f3 Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Wed, 9 Apr 2025 14:50:36 +0200 Subject: [PATCH 08/21] use the script directly from dragonfly as installed in Docker image, you do not need to bind it from external, just the config file is needed --- dragonfly/docker-compose.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dragonfly/docker-compose.yaml b/dragonfly/docker-compose.yaml index a2d0798..545b903 100755 --- a/dragonfly/docker-compose.yaml +++ b/dragonfly/docker-compose.yaml @@ -7,6 +7,5 @@ services: volumes: - /var/run/docker.sock:/var/run/docker.sock - ./alert.yaml:/root/alert.yaml - - ./watchdog.py:/root/watchdog.py command: - bash -c "python3 /root/watchdog.py --config /root/alert.yaml" + bash -c "python3 /usr/local/src_dragonfly/dragonfly/watchdog.py --config /root/alert.yaml" From 8eed009daed45cb41206a276e26c0607e005a244 Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Wed, 9 Apr 2025 14:55:42 +0200 Subject: [PATCH 09/21] remove commented out historic left overs --- dragonfly/watchdog.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/dragonfly/watchdog.py b/dragonfly/watchdog.py index 781eefb..ffab3b0 100755 --- a/dragonfly/watchdog.py +++ b/dragonfly/watchdog.py @@ -71,11 +71,6 @@ def run(self): print(entry["endpoint"], value, flush=True) if self.compare(value, entry["reference"], "not_equal"): self.send_slack_message(entry["message"].format(**locals())) - # status = get_endpoint("chiller_status") - #status = " ".join([bin(int(status[i*2:(i+1)*2], 16))[2:].zfill(8)[::-1] for i in range(len(status)//2)]) - #print("Chiller status:", status) - #if status != "10000000 00000000 00000000 00000000": - # send_slack_message("Chiller issue! 
Error code %s"%status) for container in self.client.containers.list(all=True): if any([container.name.startswith(black) for black in self.config["blacklist_containers"]]): @@ -84,8 +79,6 @@ def run(self): send_slack_message(f"Container {container.name} is not running!") if int(container.attrs["State"]["ExitCode"]) != 0: send_slack_message(f"Containeri {container.name} has exit code {container.attrs['State']['ExitCode']}!") - #if "seconds" in container["Status"]: - # send_slack_message(f"Container {container.name} has been restarted!") print("Checks done", flush=True) time.sleep(int(self.config["check_interval_s"])) From 51f86e1ee11bc86f8bf965ddb71fb9c3bd837fb7 Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Wed, 9 Apr 2025 14:56:50 +0200 Subject: [PATCH 10/21] add comment about webhooks on github --- dragonfly/alert.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/dragonfly/alert.yaml b/dragonfly/alert.yaml index 7a09737..a997bc3 100644 --- a/dragonfly/alert.yaml +++ b/dragonfly/alert.yaml @@ -12,6 +12,7 @@ check_interval_s: 30 # To create a slack webhook see https://api.slack.com/messaging/webhooks steps 1. to 3. +# Do not push your webhook to github. Slack does not like that and will disable the webhood due to security reasons. slack_hook: "https://hooks.slack.com/services/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" blacklist_containers: From 2773ee43fbd30deba03f869465883fd4aedb8dba Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Thu, 8 May 2025 17:05:02 +0200 Subject: [PATCH 11/21] fix function call, was missing a self, now is working properly --- dragonfly/watchdog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dragonfly/watchdog.py b/dragonfly/watchdog.py index ffab3b0..f3dc868 100755 --- a/dragonfly/watchdog.py +++ b/dragonfly/watchdog.py @@ -76,9 +76,9 @@ def run(self): if any([container.name.startswith(black) for black in self.config["blacklist_containers"]]): continue if container.status != "running": - send_slack_message(f"Container {container.name} is not running!") + self.send_slack_message(f"Container {container.name} is not running!") if int(container.attrs["State"]["ExitCode"]) != 0: - send_slack_message(f"Containeri {container.name} has exit code {container.attrs['State']['ExitCode']}!") + self.send_slack_message(f"Containeri {container.name} has exit code {container.attrs['State']['ExitCode']}!") print("Checks done", flush=True) time.sleep(int(self.config["check_interval_s"])) From d2d826f7f95fdc4e70fe700f79fc24770fe4afed Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Fri, 23 May 2025 12:48:18 +0200 Subject: [PATCH 12/21] handle errors thrown while checking endpoints, this could be if the device or rabbit broker is down, we want this script to not crash at all --- dragonfly/watchdog.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dragonfly/watchdog.py b/dragonfly/watchdog.py index f3dc868..8ade0f0 100755 --- a/dragonfly/watchdog.py +++ b/dragonfly/watchdog.py @@ -67,10 +67,14 @@ def run(self): while True: for entry in self.config["check_endpoints"]: - value = self.get_endpoint(entry["endpoint"]) - print(entry["endpoint"], value, flush=True) - if self.compare(value, entry["reference"], "not_equal"): - self.send_slack_message(entry["message"].format(**locals())) + try: + value = self.get_endpoint(entry["endpoint"]) + print(entry["endpoint"], value, flush=True) + if self.compare(value, entry["reference"], "not_equal"): + self.send_slack_message(entry["message"].format(**locals())) + except Exception 
as e: + self.send_slack_message("Could not get endpoint %s. Got error %s."%(entry["endpoint"], e.message)) + for container in self.client.containers.list(all=True): if any([container.name.startswith(black) for black in self.config["blacklist_containers"]]): From 7ef8be87d9a13423c4d7112a77722e74e3a39659 Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Tue, 27 May 2025 16:37:28 +0200 Subject: [PATCH 13/21] fixing a few issues in checking endpoints that came up while checking presure gauge values. There were no type conversion for numbers, the method was fixed / hard coded to 'not_equal' and the error message came not through correctly. We fixed it by adding type conversion based on the value type itself, using the method provided in the config file and getting the error messages through. --- dragonfly/watchdog.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dragonfly/watchdog.py b/dragonfly/watchdog.py index 8ade0f0..8dea66f 100755 --- a/dragonfly/watchdog.py +++ b/dragonfly/watchdog.py @@ -52,6 +52,7 @@ def get_endpoint(self, endpoint, calibrated=False): return val["value_raw" if not calibrated else "value_cal"] def compare(self, value, reference, method): + if type(value) == float: reference = float(reference) if method == "not_equal": return value != reference elif method == "equal": @@ -70,10 +71,10 @@ def run(self): try: value = self.get_endpoint(entry["endpoint"]) print(entry["endpoint"], value, flush=True) - if self.compare(value, entry["reference"], "not_equal"): + if self.compare(value, entry["reference"], entry["method"]): self.send_slack_message(entry["message"].format(**locals())) except Exception as e: - self.send_slack_message("Could not get endpoint %s. Got error %s."%(entry["endpoint"], e.message)) + self.send_slack_message("Could not get endpoint %s. Got error %s."%(entry["endpoint"], str(e) )) for container in self.client.containers.list(all=True): From c6fe0465edcbdbd55c1f1bc24e069f9ba86bd4b9 Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Wed, 9 Jul 2025 11:13:34 +0200 Subject: [PATCH 14/21] add signal handling. This results in sending messages to slack if we receive a signal and also notifies when the alarm system stopped (which is also caused by a signal). --- dragonfly/watchdog.py | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/dragonfly/watchdog.py b/dragonfly/watchdog.py index 8dea66f..f96d581 100755 --- a/dragonfly/watchdog.py +++ b/dragonfly/watchdog.py @@ -1,6 +1,7 @@ #!/user/bin/env python3 import requests import json +import signal import time import docker import dripline @@ -11,11 +12,15 @@ from dripline.core import Interface class WatchDog(object): + kill_now = False + def __init__(self, config_path): self.config_path = config_path self.load_configuration() self.setup_docker_client() self.setup_dripline_connection() + signal.signal(signal.SIGINT, self.exit_gracefully) + signal.signal(signal.SIGTERM, self.exit_gracefully) self.send_slack_message("Started alarm system!") def load_configuration(self): @@ -36,6 +41,11 @@ def setup_dripline_connection(self): password=self.config["dripline_password"], dripline_mesh=self.config["dripline_mesh"]) + def exit_gracefully(self, signum, frame): + self.kill_now = True + print("Got a signal %d"%signum, flush=True) + self.send_slack_message("Stopping, received signal: %d"%signum) + def send_slack_message(self, message): if self.config["slack_hook"] is None: print("Slack hook not configured. 
No message will be send!") @@ -66,18 +76,21 @@ def compare(self, value, reference, method): def run(self): - while True: - for entry in self.config["check_endpoints"]: - try: - value = self.get_endpoint(entry["endpoint"]) - print(entry["endpoint"], value, flush=True) - if self.compare(value, entry["reference"], entry["method"]): - self.send_slack_message(entry["message"].format(**locals())) - except Exception as e: - self.send_slack_message("Could not get endpoint %s. Got error %s."%(entry["endpoint"], str(e) )) + while not self.kill_now: + if self.config["check_endpoints"] is not None: + for entry in self.config["check_endpoints"]: + if self.kill_now: break + try: + value = self.get_endpoint(entry["endpoint"]) + print(entry["endpoint"], value, flush=True) + if self.compare(value, entry["reference"], entry["method"]): + self.send_slack_message(entry["message"].format(**locals())) + except Exception as e: + self.send_slack_message("Could not get endpoint %s. Got error %s."%(entry["endpoint"], str(e) )) for container in self.client.containers.list(all=True): + if self.kill_now: break if any([container.name.startswith(black) for black in self.config["blacklist_containers"]]): continue if container.status != "running": @@ -86,7 +99,10 @@ def run(self): self.send_slack_message(f"Containeri {container.name} has exit code {container.attrs['State']['ExitCode']}!") print("Checks done", flush=True) - time.sleep(int(self.config["check_interval_s"])) + for i in range(int(self.config["check_interval_s"])): + if self.kill_now: break + time.sleep(1) + self.send_slack_message(f"Stopping alarm system") if __name__ == "__main__": From 1d608bd0431ec4f31b79082b22ba222200922b65 Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Tue, 15 Jul 2025 22:55:43 +0200 Subject: [PATCH 15/21] remove authentication by hand and use scarab authentication via environment variables instead --- dragonfly/watchdog.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dragonfly/watchdog.py b/dragonfly/watchdog.py index f96d581..86091db 100755 --- a/dragonfly/watchdog.py +++ b/dragonfly/watchdog.py @@ -37,9 +37,7 @@ def setup_docker_client(self): self.client = docker.from_env() def setup_dripline_connection(self): - self.connection = Interface(username=self.config["dripline_username"], - password=self.config["dripline_password"], - dripline_mesh=self.config["dripline_mesh"]) + self.connection = Interface(dripline_mesh=self.config["dripline_mesh"]) def exit_gracefully(self, signum, frame): self.kill_now = True From 792999472ec240caa8aac516a50015603f4005e3 Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Tue, 15 Jul 2025 22:56:08 +0200 Subject: [PATCH 16/21] adding a sample configuration file --- AlarmSystem.yaml | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 AlarmSystem.yaml diff --git a/AlarmSystem.yaml b/AlarmSystem.yaml new file mode 100644 index 0000000..60d6809 --- /dev/null +++ b/AlarmSystem.yaml @@ -0,0 +1,44 @@ +dripline_mesh: + broker: rabbit-broker + broker_port: 5672 + +check_interval_s: 30 + + +# To create a slack webhook see https://api.slack.com/messaging/webhooks steps 1. to 3. 
+slack_hook: "https://hooks.slack.com/services/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" + +blacklist_containers: + # containers listed here will not be checked if they are running or having error messages + - mainzdripline3-dls10ZTranslator + - mainzdripline3-pneumaticValve_dl3 + - mainzdripline3-Pressure_gauge_70 + - mainzdripline3-habs_tc + - mainzdripline3-Checklist + - mainzdripline3-slowdash + - mainzdripline3-dripline-bash + - mainzdripline3-SignalTest +check_endpoints: + # read this as: if 'endpoint' 'method' 'reference' send 'message' + # e.g. if 'habs_error_status' 'not_equal' '00' send 'HABS power supply issue! Error status: {value}' + # methods can be one of ["not_equal", "equal", "lower", "greater"] + #- endpoint: habs_error_status + # method: not_equal + # reference: "00" + # message: "HABS power supply issue! Error status: {value}" + #- endpoint: pg8_pressure_mbar + # method: greater + # reference: 2e-5 + # message: "PG8 above 2e-5 mbar (too high)" + #- endpoint: pg60_pressure_mbar + # method: greater + # reference: 1e-4 + # message: "PG60 above 1e-4 mbar (too high)" + #- endpoint: read_C_Temperature_CoolingLoopSensor1_MATS + #method: lower + #reference: 0 + #message: "Cooling loop water is below freeze point (sensor 1)" + #- endpoint: read_C_Temperature_CoolingLoopSensor2_MATS + # method: lower + #reference: 0 + #message: "Cooling loop water is below freeze point (sensor 2)" From 3e91a8f8d85d8063da4bc86680d24feb8cd9231c Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Tue, 15 Jul 2025 22:56:45 +0200 Subject: [PATCH 17/21] adding a sample entry for docker compose --- docker-compose.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docker-compose.yaml b/docker-compose.yaml index 9ff66ea..dfd23e9 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -40,3 +40,18 @@ services: - DRIPLINE_PASSWORD=dripline command: > bash -c "dl-serve -vv -c /root/jitter_example.yaml" + + AlarmSystem: + # this image is build from this branch + image: dragonfly_docker:latest + volumes: + - /var/run/docker.sock:/var/run/docker.sock + - ./AlarmSystem.yaml:/root/AlarmSystem.yaml + environment: + - DRIPLINE_USER=dripline + - DRIPLINE_PASSWORD=dripline + command: + - python3 + - /usr/local/src_dragonfly/dragonfly/watchdog.py + - --config + - /root/AlarmSystem.yaml From c33534dbabf2d64db6b8cd491861e42121403992 Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Wed, 16 Jul 2025 00:06:51 +0200 Subject: [PATCH 18/21] moved AlarmSystem.yaml to example folder --- AlarmSystem.yaml => examples/AlarmSystem.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) rename AlarmSystem.yaml => examples/AlarmSystem.yaml (87%) diff --git a/AlarmSystem.yaml b/examples/AlarmSystem.yaml similarity index 87% rename from AlarmSystem.yaml rename to examples/AlarmSystem.yaml index 60d6809..58eee89 100644 --- a/AlarmSystem.yaml +++ b/examples/AlarmSystem.yaml @@ -6,7 +6,8 @@ check_interval_s: 30 # To create a slack webhook see https://api.slack.com/messaging/webhooks steps 1. to 3. -slack_hook: "https://hooks.slack.com/services/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" +# Do not push your webhook to github. Slack does not like that and will disable the webhood due to security reasons. 
+slack_hook: "https://hooks.slack.com/services/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" blacklist_containers: # containers listed here will not be checked if they are running or having error messages From f2e50d7328f6b035e0c2cfb7f6402332fe21adf0 Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Wed, 16 Jul 2025 00:07:29 +0200 Subject: [PATCH 19/21] removing dragonfly/alert.yaml which is a duplicate of the example in the example folder --- dragonfly/alert.yaml | 37 ------------------------------------- 1 file changed, 37 deletions(-) delete mode 100644 dragonfly/alert.yaml diff --git a/dragonfly/alert.yaml b/dragonfly/alert.yaml deleted file mode 100644 index a997bc3..0000000 --- a/dragonfly/alert.yaml +++ /dev/null @@ -1,37 +0,0 @@ -dripline_mesh: - broker: rabbit-broker - broker_port: 5672 - -dripline_username: - value: dripline - -dripline_password: - value: dripline - -check_interval_s: 30 - - -# To create a slack webhook see https://api.slack.com/messaging/webhooks steps 1. to 3. -# Do not push your webhook to github. Slack does not like that and will disable the webhood due to security reasons. -slack_hook: "https://hooks.slack.com/services/XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" - -blacklist_containers: - # containers listed here will not be checked if they are running or having error messages - - mainzdripline3-dls10ZTranslator - - mainzdripline3-chillerInterface - - mainzdripline3-CoolingLoopSensor1 - - mainzdripline3-slowdash2 - - mainzdripline3-BakeoutController - - mainzdripline3-key-value-store - - mainzdripline3-AlarmSystem - - mainzdripline3-Brainboxes_ED593 - -check_endpoints: - # read this as: if 'endpoint' 'method' 'reference' send 'message' - # e.g. if 'habs_error_status' 'not_equal' '00' send 'HABS power supply issue! Error status: {value}' - # methods can be one of ["not_equal", "equal", "lower", "greater"] - - endpoint: habs_error_status - method: not_equal - reference: "00" - message: "HABS power supply issue! Error status: {value}" - From fdfb45d7aad8cb5cea87e925a73fd3fd4846b36c Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Wed, 16 Jul 2025 00:09:12 +0200 Subject: [PATCH 20/21] adding watchdog to __init__ --- dragonfly/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dragonfly/__init__.py b/dragonfly/__init__.py index ae9daaf..eb89dbb 100644 --- a/dragonfly/__init__.py +++ b/dragonfly/__init__.py @@ -15,3 +15,5 @@ def __get_version(): return version version = __get_version() __version__ = version.version + +from .watchdog import * From b85ba245c41a132209dcfebb0907b156b9cea74a Mon Sep 17 00:00:00 2001 From: Rene Reimann Date: Wed, 16 Jul 2025 00:10:29 +0200 Subject: [PATCH 21/21] removing doublicate of the docker-compose.yaml --- dragonfly/docker-compose.yaml | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100755 dragonfly/docker-compose.yaml diff --git a/dragonfly/docker-compose.yaml b/dragonfly/docker-compose.yaml deleted file mode 100755 index 545b903..0000000 --- a/dragonfly/docker-compose.yaml +++ /dev/null @@ -1,11 +0,0 @@ -#version: '3' - -services: - - AlarmSystem: - image: dragonfly_docker:latest - volumes: - - /var/run/docker.sock:/var/run/docker.sock - - ./alert.yaml:/root/alert.yaml - command: - bash -c "python3 /usr/local/src_dragonfly/dragonfly/watchdog.py --config /root/alert.yaml"
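To try out the final state of this series, a minimal workflow could look like the following. The local build step and the image tag dragonfly_docker:latest are assumptions taken from the AlarmSystem compose entry above, so adjust them to however you build and tag the dragonfly image:

    # build the image from this branch; the AlarmSystem compose entry expects dragonfly_docker:latest
    docker build -t dragonfly_docker:latest .
    # copy the sample configuration next to docker-compose.yaml and fill in slack_hook,
    # blacklist_containers and check_endpoints for your setup
    cp examples/AlarmSystem.yaml ./AlarmSystem.yaml
    # start the watchdog and follow its output; every cycle ends with "Checks done"
    # and any problems are posted to the configured slack webhook
    docker compose up -d AlarmSystem
    docker compose logs -f AlarmSystem

Stopping the container with docker compose stop sends SIGTERM, which the signal handler added in patch 14 reports to slack ("Stopping, received signal ...") before the check loop exits with "Stopping alarm system".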