From 8bc8cb60e5ea97eb755dc8757da713c855a2b018 Mon Sep 17 00:00:00 2001 From: Gustavo Lima Date: Sun, 1 Mar 2026 12:33:50 +0100 Subject: [PATCH 1/7] Add HealthCheckRuntime context manager for shared health check boilerplate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract the ~30 lines of repeated setup code (logger init, GPU node ID detection, derived cluster resolution, TelemetryContext + OutputContext nesting, killswitch check) into a reusable HealthCheckRuntime dataclass context manager. This reduces per-subcommand boilerplate from ~30 lines to ~5 lines. The helper is purely additive — existing checks continue to work unchanged. New checks can use `with HealthCheckRuntime(...) as rt:` instead of manually wiring up the setup ceremony. Includes comprehensive tests covering field initialization, killswitch behavior, context manager nesting, GPU node ID failure handling, and the finish() convenience method. Refs: #75 --- gcm/health_checks/check_utils/runtime.py | 117 ++++++++++++ gcm/tests/health_checks_tests/test_runtime.py | 174 ++++++++++++++++++ 2 files changed, 291 insertions(+) create mode 100644 gcm/health_checks/check_utils/runtime.py create mode 100644 gcm/tests/health_checks_tests/test_runtime.py diff --git a/gcm/health_checks/check_utils/runtime.py b/gcm/health_checks/check_utils/runtime.py new file mode 100644 index 0000000..abdb312 --- /dev/null +++ b/gcm/health_checks/check_utils/runtime.py @@ -0,0 +1,117 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +import logging +import socket +import sys +import types +from collections.abc import Collection +from contextlib import ExitStack +from dataclasses import dataclass, field +from pathlib import Path +from typing import Callable, ContextManager, Literal, NoReturn, Optional, Type + +import gni_lib +from gcm.health_checks.check_utils.output_context_manager import OutputContext +from gcm.health_checks.check_utils.telem import TelemetryContext +from gcm.health_checks.types import CHECK_TYPE, ExitCode, LOG_LEVEL +from gcm.monitoring.slurm.derived_cluster import get_derived_cluster +from gcm.monitoring.utils.monitor import init_logger +from gcm.schemas.health_check.health_check_name import HealthCheckName + + +@dataclass +class HealthCheckRuntime(ContextManager["HealthCheckRuntime"]): + cluster: str + type: CHECK_TYPE + log_level: LOG_LEVEL + log_folder: str + sink: str + sink_opts: Collection[str] + verbose_out: bool + heterogeneous_cluster_v1: bool + health_check_name: HealthCheckName + killswitch_getter: Callable[[], bool] + + logger: logging.Logger = field(init=False) + node: str = field(init=False) + gpu_node_id: Optional[str] = field(init=False) + derived_cluster: str = field(init=False) + exit_code: ExitCode = field(init=False, default=ExitCode.UNKNOWN) + msg: str = field(init=False, default="") + _stack: ExitStack = field(init=False) + + def __enter__(self) -> "HealthCheckRuntime": + self.node = socket.gethostname() + self.logger, _ = init_logger( + logger_name=self.type, + log_dir=str(Path(self.log_folder) / self.type / "_logs"), + log_name=self.node + ".log", + log_level=getattr(logging, self.log_level), + ) + self.logger.info( + "%s: cluster: %s, node: %s, type: %s", + self.health_check_name.value, + self.cluster, + self.node, + self.type, + ) + try: + self.gpu_node_id = gni_lib.get_gpu_node_id() + except Exception as e: + self.gpu_node_id = None + self.logger.warning(f"Could not get gpu_node_id, likely not a GPU host: {e}") + + self.derived_cluster = get_derived_cluster( + cluster=self.cluster, + heterogeneous_cluster_v1=self.heterogeneous_cluster_v1, + data={"Node": self.node}, + ) + + self._stack = ExitStack() + self._stack.__enter__() + self._stack.enter_context( + TelemetryContext( + sink=self.sink, + sink_opts=self.sink_opts, + logger=self.logger, + cluster=self.cluster, + derived_cluster=self.derived_cluster, + type=self.type, + name=self.health_check_name.value, + node=self.node, + get_exit_code_msg=lambda: (self.exit_code, self.msg), + gpu_node_id=self.gpu_node_id, + ) + ) + self._stack.enter_context( + OutputContext( + self.type, + self.health_check_name, + lambda: (self.exit_code, self.msg), + self.verbose_out, + ) + ) + + if self.killswitch_getter(): + self.exit_code = ExitCode.OK + self.logger.info( + "%s is disabled by killswitch", + self.health_check_name.value, + ) + sys.exit(0) + + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_val: Optional[BaseException], + exc_tb: Optional[types.TracebackType], + ) -> Literal[False]: + self._stack.__exit__(exc_type, exc_val, exc_tb) + return False + + def finish(self, exit_code: ExitCode, msg: str) -> NoReturn: + self.exit_code = exit_code + self.msg = msg + sys.exit(exit_code.value) diff --git a/gcm/tests/health_checks_tests/test_runtime.py b/gcm/tests/health_checks_tests/test_runtime.py new file mode 100644 index 0000000..bf5d4af --- /dev/null +++ b/gcm/tests/health_checks_tests/test_runtime.py @@ -0,0 +1,174 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Tests for the HealthCheckRuntime context manager.""" + +import logging +from unittest.mock import MagicMock, patch + +import pytest +from gcm.health_checks.check_utils.runtime import HealthCheckRuntime +from gcm.health_checks.types import ExitCode +from gcm.schemas.health_check.health_check_name import HealthCheckName + + +def _make_runtime(**kwargs) -> HealthCheckRuntime: + defaults = dict( + cluster="test_cluster", + type="prolog", + log_level="INFO", + log_folder="/tmp", + sink="do_nothing", + sink_opts=(), + verbose_out=False, + heterogeneous_cluster_v1=False, + health_check_name=HealthCheckName.CHECK_SENSORS, + killswitch_getter=lambda: False, + ) + defaults.update(kwargs) + return HealthCheckRuntime(**defaults) + + +@patch("gcm.health_checks.check_utils.runtime.get_derived_cluster", return_value="derived_test") +@patch("gcm.health_checks.check_utils.runtime.gni_lib") +@patch("gcm.health_checks.check_utils.runtime.init_logger") +@patch("gcm.health_checks.check_utils.runtime.socket") +def test_enter_initializes_fields( + mock_socket: MagicMock, + mock_init_logger: MagicMock, + mock_gni: MagicMock, + mock_derived: MagicMock, +) -> None: + """Verify __enter__ populates logger, node, gpu_node_id, and derived_cluster.""" + mock_socket.gethostname.return_value = "testnode01" + test_logger = logging.getLogger("test") + mock_init_logger.return_value = (test_logger, MagicMock()) + mock_gni.get_gpu_node_id.return_value = "gpu-0" + + rt = _make_runtime() + with rt as runtime: + assert runtime.node == "testnode01" + assert runtime.logger is test_logger + assert runtime.gpu_node_id == "gpu-0" + assert runtime.derived_cluster == "derived_test" + + +@patch("gcm.health_checks.check_utils.runtime.get_derived_cluster", return_value="test_cluster") +@patch("gcm.health_checks.check_utils.runtime.gni_lib") +@patch("gcm.health_checks.check_utils.runtime.init_logger") +@patch("gcm.health_checks.check_utils.runtime.socket") +def test_killswitch_enabled_exits_ok( + mock_socket: MagicMock, + mock_init_logger: MagicMock, + mock_gni: MagicMock, + mock_derived: MagicMock, +) -> None: + """When killswitch_getter returns True, sys.exit should be called with 0.""" + mock_socket.gethostname.return_value = "testnode01" + mock_init_logger.return_value = (logging.getLogger("test"), MagicMock()) + mock_gni.get_gpu_node_id.return_value = "gpu-0" + + with pytest.raises(SystemExit) as exc_info: + with _make_runtime(killswitch_getter=lambda: True): + pytest.fail("With block body should not be reached when killswitch is enabled") + + assert exc_info.value.code == ExitCode.OK.value + + +@patch("gcm.health_checks.check_utils.runtime.get_derived_cluster", return_value="test_cluster") +@patch("gcm.health_checks.check_utils.runtime.gni_lib") +@patch("gcm.health_checks.check_utils.runtime.init_logger") +@patch("gcm.health_checks.check_utils.runtime.socket") +def test_killswitch_disabled_continues( + mock_socket: MagicMock, + mock_init_logger: MagicMock, + mock_gni: MagicMock, + mock_derived: MagicMock, +) -> None: + """When killswitch_getter returns False, the with block body should execute normally.""" + mock_socket.gethostname.return_value = "testnode01" + mock_init_logger.return_value = (logging.getLogger("test"), MagicMock()) + mock_gni.get_gpu_node_id.return_value = "gpu-0" + + body_executed = False + with _make_runtime(killswitch_getter=lambda: False) as rt: + body_executed = True + rt.exit_code = ExitCode.OK + rt.msg = "all good" + + assert body_executed + + +@patch("gcm.health_checks.check_utils.runtime.get_derived_cluster", return_value="test_cluster") +@patch("gcm.health_checks.check_utils.runtime.gni_lib") +@patch("gcm.health_checks.check_utils.runtime.init_logger") +@patch("gcm.health_checks.check_utils.runtime.socket") +def test_finish_sets_code_and_exits( + mock_socket: MagicMock, + mock_init_logger: MagicMock, + mock_gni: MagicMock, + mock_derived: MagicMock, +) -> None: + """finish() should set exit_code and msg, then call sys.exit with the code value.""" + mock_socket.gethostname.return_value = "testnode01" + mock_init_logger.return_value = (logging.getLogger("test"), MagicMock()) + mock_gni.get_gpu_node_id.return_value = "gpu-0" + + with pytest.raises(SystemExit) as exc_info: + with _make_runtime() as rt: + rt.finish(ExitCode.CRITICAL, "something broke") + + assert exc_info.value.code == ExitCode.CRITICAL.value + assert rt.exit_code == ExitCode.CRITICAL + assert rt.msg == "something broke" + + +@patch("gcm.health_checks.check_utils.runtime.OutputContext") +@patch("gcm.health_checks.check_utils.runtime.TelemetryContext") +@patch("gcm.health_checks.check_utils.runtime.get_derived_cluster", return_value="test_cluster") +@patch("gcm.health_checks.check_utils.runtime.gni_lib") +@patch("gcm.health_checks.check_utils.runtime.init_logger") +@patch("gcm.health_checks.check_utils.runtime.socket") +def test_telemetry_and_output_contexts_entered( + mock_socket: MagicMock, + mock_init_logger: MagicMock, + mock_gni: MagicMock, + mock_derived: MagicMock, + mock_telemetry_cls: MagicMock, + mock_output_cls: MagicMock, +) -> None: + """Both TelemetryContext and OutputContext should be entered during __enter__.""" + mock_socket.gethostname.return_value = "testnode01" + mock_init_logger.return_value = (logging.getLogger("test"), MagicMock()) + mock_gni.get_gpu_node_id.return_value = "gpu-0" + + mock_telem_instance = MagicMock() + mock_telemetry_cls.return_value = mock_telem_instance + mock_output_instance = MagicMock() + mock_output_cls.return_value = mock_output_instance + + with _make_runtime() as rt: + rt.exit_code = ExitCode.OK + + mock_telem_instance.__enter__.assert_called_once() + mock_output_instance.__enter__.assert_called_once() + + +@patch("gcm.health_checks.check_utils.runtime.get_derived_cluster", return_value="test_cluster") +@patch("gcm.health_checks.check_utils.runtime.gni_lib") +@patch("gcm.health_checks.check_utils.runtime.init_logger") +@patch("gcm.health_checks.check_utils.runtime.socket") +def test_gpu_node_id_failure_handled( + mock_socket: MagicMock, + mock_init_logger: MagicMock, + mock_gni: MagicMock, + mock_derived: MagicMock, +) -> None: + """When gni_lib.get_gpu_node_id raises, gpu_node_id should be None and a warning logged.""" + mock_socket.gethostname.return_value = "testnode01" + test_logger = logging.getLogger("test_gpu_failure") + mock_init_logger.return_value = (test_logger, MagicMock()) + mock_gni.get_gpu_node_id.side_effect = RuntimeError("not a GPU host") + + with _make_runtime() as rt: + assert rt.gpu_node_id is None + rt.exit_code = ExitCode.OK From a076093fed0c4fb99bf7694fee90e155b2f22911 Mon Sep 17 00:00:00 2001 From: Gustavo Lima Date: Sun, 1 Mar 2026 12:51:58 +0100 Subject: [PATCH 2/7] Fix formatting (ufmt) and type annotations (mypy) Apply ufmt formatting and fix mypy errors in test helper by using explicit typed parameters instead of **kwargs dict unpacking. --- gcm/health_checks/check_utils/runtime.py | 4 +- gcm/tests/health_checks_tests/test_runtime.py | 45 ++++++++++++++----- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/gcm/health_checks/check_utils/runtime.py b/gcm/health_checks/check_utils/runtime.py index abdb312..96cd3fa 100644 --- a/gcm/health_checks/check_utils/runtime.py +++ b/gcm/health_checks/check_utils/runtime.py @@ -59,7 +59,9 @@ def __enter__(self) -> "HealthCheckRuntime": self.gpu_node_id = gni_lib.get_gpu_node_id() except Exception as e: self.gpu_node_id = None - self.logger.warning(f"Could not get gpu_node_id, likely not a GPU host: {e}") + self.logger.warning( + f"Could not get gpu_node_id, likely not a GPU host: {e}" + ) self.derived_cluster = get_derived_cluster( cluster=self.cluster, diff --git a/gcm/tests/health_checks_tests/test_runtime.py b/gcm/tests/health_checks_tests/test_runtime.py index bf5d4af..f665d90 100644 --- a/gcm/tests/health_checks_tests/test_runtime.py +++ b/gcm/tests/health_checks_tests/test_runtime.py @@ -3,6 +3,7 @@ """Tests for the HealthCheckRuntime context manager.""" import logging +from typing import Callable from unittest.mock import MagicMock, patch import pytest @@ -11,8 +12,10 @@ from gcm.schemas.health_check.health_check_name import HealthCheckName -def _make_runtime(**kwargs) -> HealthCheckRuntime: - defaults = dict( +def _make_runtime( + killswitch_getter: Callable[[], bool] = lambda: False, +) -> HealthCheckRuntime: + return HealthCheckRuntime( cluster="test_cluster", type="prolog", log_level="INFO", @@ -22,13 +25,14 @@ def _make_runtime(**kwargs) -> HealthCheckRuntime: verbose_out=False, heterogeneous_cluster_v1=False, health_check_name=HealthCheckName.CHECK_SENSORS, - killswitch_getter=lambda: False, + killswitch_getter=killswitch_getter, ) - defaults.update(kwargs) - return HealthCheckRuntime(**defaults) -@patch("gcm.health_checks.check_utils.runtime.get_derived_cluster", return_value="derived_test") +@patch( + "gcm.health_checks.check_utils.runtime.get_derived_cluster", + return_value="derived_test", +) @patch("gcm.health_checks.check_utils.runtime.gni_lib") @patch("gcm.health_checks.check_utils.runtime.init_logger") @patch("gcm.health_checks.check_utils.runtime.socket") @@ -52,7 +56,10 @@ def test_enter_initializes_fields( assert runtime.derived_cluster == "derived_test" -@patch("gcm.health_checks.check_utils.runtime.get_derived_cluster", return_value="test_cluster") +@patch( + "gcm.health_checks.check_utils.runtime.get_derived_cluster", + return_value="test_cluster", +) @patch("gcm.health_checks.check_utils.runtime.gni_lib") @patch("gcm.health_checks.check_utils.runtime.init_logger") @patch("gcm.health_checks.check_utils.runtime.socket") @@ -69,12 +76,17 @@ def test_killswitch_enabled_exits_ok( with pytest.raises(SystemExit) as exc_info: with _make_runtime(killswitch_getter=lambda: True): - pytest.fail("With block body should not be reached when killswitch is enabled") + pytest.fail( + "With block body should not be reached when killswitch is enabled" + ) assert exc_info.value.code == ExitCode.OK.value -@patch("gcm.health_checks.check_utils.runtime.get_derived_cluster", return_value="test_cluster") +@patch( + "gcm.health_checks.check_utils.runtime.get_derived_cluster", + return_value="test_cluster", +) @patch("gcm.health_checks.check_utils.runtime.gni_lib") @patch("gcm.health_checks.check_utils.runtime.init_logger") @patch("gcm.health_checks.check_utils.runtime.socket") @@ -98,7 +110,10 @@ def test_killswitch_disabled_continues( assert body_executed -@patch("gcm.health_checks.check_utils.runtime.get_derived_cluster", return_value="test_cluster") +@patch( + "gcm.health_checks.check_utils.runtime.get_derived_cluster", + return_value="test_cluster", +) @patch("gcm.health_checks.check_utils.runtime.gni_lib") @patch("gcm.health_checks.check_utils.runtime.init_logger") @patch("gcm.health_checks.check_utils.runtime.socket") @@ -124,7 +139,10 @@ def test_finish_sets_code_and_exits( @patch("gcm.health_checks.check_utils.runtime.OutputContext") @patch("gcm.health_checks.check_utils.runtime.TelemetryContext") -@patch("gcm.health_checks.check_utils.runtime.get_derived_cluster", return_value="test_cluster") +@patch( + "gcm.health_checks.check_utils.runtime.get_derived_cluster", + return_value="test_cluster", +) @patch("gcm.health_checks.check_utils.runtime.gni_lib") @patch("gcm.health_checks.check_utils.runtime.init_logger") @patch("gcm.health_checks.check_utils.runtime.socket") @@ -153,7 +171,10 @@ def test_telemetry_and_output_contexts_entered( mock_output_instance.__enter__.assert_called_once() -@patch("gcm.health_checks.check_utils.runtime.get_derived_cluster", return_value="test_cluster") +@patch( + "gcm.health_checks.check_utils.runtime.get_derived_cluster", + return_value="test_cluster", +) @patch("gcm.health_checks.check_utils.runtime.gni_lib") @patch("gcm.health_checks.check_utils.runtime.init_logger") @patch("gcm.health_checks.check_utils.runtime.socket") From b22f79863fb588705e758b8951f0f25e6865c044 Mon Sep 17 00:00:00 2001 From: Gustavo Lima Date: Sun, 1 Mar 2026 12:39:13 +0100 Subject: [PATCH 3/7] Add scaffold tool for creating new health checks Introduce bin/create_new_health_check.py that automates the creation of new health checks by generating all required files (check module, test skeleton, documentation stub) and registering the check across all touchpoints (checks/__init__.py, CLI entry point, HealthCheckName enum, killswitch feature flag). The tool supports single-command and grouped-command checks, has a dry-run mode, and is idempotent (safe to re-run). Generated checks use the new HealthCheckRuntime context manager from the previous commit. Also updates the "Adding New Health Check" guide with a Quick Start section pointing to the scaffold tool. Refs: #75 --- bin/create_new_health_check.py | 689 ++++++++++++++++++ gcm/tests/test_create_health_check.py | 355 +++++++++ .../adding_new_health_check.md | 27 + 3 files changed, 1071 insertions(+) create mode 100755 bin/create_new_health_check.py create mode 100644 gcm/tests/test_create_health_check.py diff --git a/bin/create_new_health_check.py b/bin/create_new_health_check.py new file mode 100755 index 0000000..9afbd10 --- /dev/null +++ b/bin/create_new_health_check.py @@ -0,0 +1,689 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Scaffold tool for creating new GCM health checks.""" + +import argparse +import re +import sys +from pathlib import Path + +PROJECT_ROOT = Path(__file__).resolve().parent.parent + +# --------------------------------------------------------------------------- +# Templates +# --------------------------------------------------------------------------- + +CHECK_FILE_SIMPLE = '''\ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""TODO: describe what {check_name} checks.""" + +import sys +from collections.abc import Collection +from dataclasses import dataclass +from typing import Optional, Protocol + +import click + +from gcm.health_checks.check_utils.runtime import HealthCheckRuntime +from gcm.health_checks.click import ( + common_arguments, + telemetry_argument, +) +from gcm.health_checks.types import CHECK_TYPE, CheckEnv, ExitCode, LOG_LEVEL +from gcm.monitoring.click import heterogeneous_cluster_v1_option +from gcm.monitoring.features.gen.generated_features_healthchecksfeatures import ( + FeatureValueHealthChecksFeatures, +) +from gcm.schemas.health_check.health_check_name import HealthCheckName +from typeguard import typechecked + + +class {PascalName}Check(CheckEnv, Protocol): + """Provide a class stub definition.""" + + ... + + +@dataclass +class {PascalName}CheckImpl: + """Implement the {check_name} check.""" + + cluster: str + type: str + log_level: str + log_folder: str + + def run(self) -> None: + """TODO: implement the check logic.""" + raise NotImplementedError + + +@click.command() +@common_arguments +@telemetry_argument +@heterogeneous_cluster_v1_option +@click.pass_obj +@typechecked +def {check_name}( + obj: Optional[{PascalName}Check], + cluster: str, + type: CHECK_TYPE, + log_level: LOG_LEVEL, + log_folder: str, + sink: str, + sink_opts: Collection[str], + verbose_out: bool, + heterogeneous_cluster_v1: bool, +) -> None: + """TODO: short description of {check_name}.""" + if not obj: + obj = {PascalName}CheckImpl(cluster, type, log_level, log_folder) + + with HealthCheckRuntime( + cluster=cluster, + type=type, + log_level=log_level, + log_folder=log_folder, + sink=sink, + sink_opts=sink_opts, + verbose_out=verbose_out, + heterogeneous_cluster_v1=heterogeneous_cluster_v1, + health_check_name=HealthCheckName.CHECK_{UPPER_NAME}, + killswitch_getter=lambda: FeatureValueHealthChecksFeatures() + .get_healthchecksfeatures_disable_check_{name}(), + ) as rt: + # TODO: implement check logic, call rt.finish(ExitCode.OK, "msg") when done + rt.finish(ExitCode.UNKNOWN, "Not implemented yet") +''' + +CHECK_FILE_GROUP = '''\ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""TODO: describe what {check_name} checks.""" + +import sys +from collections.abc import Collection +from dataclasses import dataclass +from typing import Optional, Protocol + +import click + +from gcm.health_checks.check_utils.runtime import HealthCheckRuntime +from gcm.health_checks.click import ( + common_arguments, + telemetry_argument, +) +from gcm.health_checks.types import CHECK_TYPE, CheckEnv, ExitCode, LOG_LEVEL +from gcm.monitoring.click import heterogeneous_cluster_v1_option +from gcm.monitoring.features.gen.generated_features_healthchecksfeatures import ( + FeatureValueHealthChecksFeatures, +) +from gcm.schemas.health_check.health_check_name import HealthCheckName +from typeguard import typechecked + + +class {PascalName}Check(CheckEnv, Protocol): + """Provide a class stub definition.""" + + ... + + +@dataclass +class {PascalName}CheckImpl: + """Implement the {check_name} check.""" + + cluster: str + type: str + log_level: str + log_folder: str + + def run(self) -> None: + """TODO: implement the check logic.""" + raise NotImplementedError + + +@click.group() +def {check_name}() -> None: + """TODO: short description of the {check_name} group.""" + pass + + +@{check_name}.command() +@common_arguments +@telemetry_argument +@heterogeneous_cluster_v1_option +@click.pass_obj +@typechecked +def example_subcommand( + obj: Optional[{PascalName}Check], + cluster: str, + type: CHECK_TYPE, + log_level: LOG_LEVEL, + log_folder: str, + sink: str, + sink_opts: Collection[str], + verbose_out: bool, + heterogeneous_cluster_v1: bool, +) -> None: + """TODO: short description of this subcommand.""" + if not obj: + obj = {PascalName}CheckImpl(cluster, type, log_level, log_folder) + + with HealthCheckRuntime( + cluster=cluster, + type=type, + log_level=log_level, + log_folder=log_folder, + sink=sink, + sink_opts=sink_opts, + verbose_out=verbose_out, + heterogeneous_cluster_v1=heterogeneous_cluster_v1, + health_check_name=HealthCheckName.CHECK_{UPPER_NAME}, + killswitch_getter=lambda: FeatureValueHealthChecksFeatures() + .get_healthchecksfeatures_disable_check_{name}(), + ) as rt: + # TODO: implement check logic, call rt.finish(ExitCode.OK, "msg") when done + rt.finish(ExitCode.UNKNOWN, "Not implemented yet") +''' + +TEST_FILE = '''\ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Test the {check_name} health-check.""" + +import logging +from dataclasses import dataclass +from pathlib import Path + +import pytest +from click.testing import CliRunner +from gcm.health_checks.checks.{check_name} import {check_name} +from gcm.health_checks.types import ExitCode + + +@dataclass +class Fake{PascalName}CheckImpl: + """Supply pregenerated output instead of running the real check.""" + + cluster: str = "test cluster" + type: str = "prolog" + log_level: str = "INFO" + log_folder: str = "/tmp" + + def run(self) -> None: + """No-op for testing.""" + pass + + +@pytest.fixture +def {name}_tester( + request: pytest.FixtureRequest, +) -> Fake{PascalName}CheckImpl: + """Create Fake{PascalName}CheckImpl object.""" + return Fake{PascalName}CheckImpl() + + +@pytest.mark.parametrize( + ("expected_exit_code",), + [ + # TODO: add real test cases + (ExitCode.UNKNOWN,), + ], +) +def test_{check_name}( + caplog: pytest.LogCaptureFixture, + tmp_path: Path, + expected_exit_code: ExitCode, +) -> None: + """Invoke the {check_name} command.""" + runner = CliRunner(mix_stderr=False) + caplog.at_level(logging.INFO) + + fake_impl = Fake{PascalName}CheckImpl() + result = runner.invoke( + {check_name}, + f"fair_cluster prolog --log-folder={{tmp_path}} --sink=do_nothing", + obj=fake_impl, + ) + + assert result.exit_code == expected_exit_code.value +''' + +DOC_FILE = '''\ +# {dash_name} + +## Overview +TODO: describe what this health check monitors. + +## Command-Line Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `--sink` | String | do_nothing | Telemetry sink destination | +| `--sink-opts` | Multiple | - | Sink-specific configuration | +| `--verbose-out` | Flag | False | Display detailed output | +| `--log-level` | Choice | INFO | DEBUG, INFO, WARNING, ERROR, CRITICAL | +| `--log-folder` | String | `/var/log/fb-monitoring` | Log directory | +| `--heterogeneous-cluster-v1` | Flag | False | Enable heterogeneous cluster support | + +## Exit Conditions + +| Exit Code | Description | +|-----------|-------------| +| **OK (0)** | Feature flag disabled (killswitch active) | +| **OK (0)** | TODO: describe passing condition | +| **CRITICAL (2)** | TODO: describe failing condition | +| **UNKNOWN (3)** | Check not yet implemented | + +## Usage Examples + +### Basic Check +```shell +health_checks {dash_name} [CLUSTER] app +``` + +### With Telemetry +```shell +health_checks {dash_name} \\ + --sink otel \\ + --sink-opts "log_resource_attributes={{\'attr_1\': \'value1\'}}" \\ + [CLUSTER] \\ + app +``` +''' + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def validate_name(name: str) -> str: + """Validate check_name and return it if valid, else exit with error.""" + if not re.match(r"^check_[a-z][a-z0-9_]*$", name): + print( + f"Error: '{name}' is not a valid check name.\n" + "Name must be lowercase with underscores only and match: " + "^check_[a-z][a-z0-9_]*$\n" + "Example: check_ntp_sync", + file=sys.stderr, + ) + sys.exit(1) + return name + + +def to_pascal_case(snake: str) -> str: + """Convert snake_case to PascalCase.""" + return "".join(word.capitalize() for word in snake.split("_")) + + +def _render(template: str, **kwargs: str) -> str: + """Simple {key} substitution without touching unrelated braces.""" + result = template + for key, value in kwargs.items(): + result = result.replace("{" + key + "}", value) + return result + + +# --------------------------------------------------------------------------- +# File-creation helpers +# --------------------------------------------------------------------------- + + +def create_check_file(check_name: str, group: bool, dry_run: bool) -> None: + name = check_name[len("check_"):] + upper_name = name.upper() + pascal_name = to_pascal_case(name) + + dest = PROJECT_ROOT / "gcm" / "health_checks" / "checks" / f"{check_name}.py" + + if dest.exists(): + print(f"Skipping check file: {dest} already exists") + return + + template = CHECK_FILE_GROUP if group else CHECK_FILE_SIMPLE + content = _render( + template, + check_name=check_name, + name=name, + UPPER_NAME=upper_name, + PascalName=pascal_name, + ) + + if dry_run: + print(f"[dry-run] Would create: {dest}") + return + + dest.write_text(content) + print(f"Created: {dest}") + + +def create_test_file(check_name: str, dry_run: bool) -> None: + name = check_name[len("check_"):] + pascal_name = to_pascal_case(name) + + dest = ( + PROJECT_ROOT + / "gcm" + / "tests" + / "health_checks_tests" + / f"test_{check_name}.py" + ) + + if dest.exists(): + print(f"Skipping test file: {dest} already exists") + return + + content = _render( + TEST_FILE, + check_name=check_name, + name=name, + PascalName=pascal_name, + ) + + if dry_run: + print(f"[dry-run] Would create: {dest}") + return + + dest.write_text(content) + print(f"Created: {dest}") + + +def create_doc_file(check_name: str, dry_run: bool) -> None: + dash_name = check_name.replace("_", "-") + + dest = ( + PROJECT_ROOT + / "website" + / "docs" + / "GCM_Health_Checks" + / "health_checks" + / f"{dash_name}.md" + ) + + if dest.exists(): + print(f"Skipping doc file: {dest} already exists") + return + + content = _render(DOC_FILE, dash_name=dash_name) + + if dry_run: + print(f"[dry-run] Would create: {dest}") + return + + dest.write_text(content) + print(f"Created: {dest}") + + +# --------------------------------------------------------------------------- +# File-modification helpers +# --------------------------------------------------------------------------- + + +def update_init(check_name: str, dry_run: bool) -> None: + """Add import and __all__ entry to checks/__init__.py (alphabetical).""" + init_path = PROJECT_ROOT / "gcm" / "health_checks" / "checks" / "__init__.py" + content = init_path.read_text() + + import_line = ( + f"from gcm.health_checks.checks.{check_name} import {check_name}\n" + ) + all_entry = f' "{check_name}",\n' + + # --- import block --- + if import_line.strip() in content: + print(f"Skipping __init__.py import: already exists") + else: + # Find all existing check_ import lines and insert in alphabetical order. + import_pattern = re.compile( + r"^from gcm\.health_checks\.checks\.check_\S+ import \S+$", + re.MULTILINE, + ) + existing = list(import_pattern.finditer(content)) + if existing: + # Find the correct insertion point. + inserted = False + for match in existing: + if import_line.strip() < match.group().strip(): + # Insert before this match. + pos = match.start() + content = content[:pos] + import_line + content[pos:] + inserted = True + break + if not inserted: + # Insert after the last existing import. + last = existing[-1] + pos = last.end() + 1 # after the newline + content = content[:pos] + import_line + content[pos:] + else: + # No existing check_ imports; prepend before __all__. + content = import_line + content + + if dry_run: + print(f"[dry-run] Would add import to {init_path}") + else: + init_path.write_text(content) + print(f"Updated imports in {init_path}") + # Re-read after writing so __all__ update is consistent. + content = init_path.read_text() + + # --- __all__ entry --- + if f'"{check_name}"' in content: + print(f"Skipping __init__.py __all__: already exists") + else: + # Append before the closing ]. + content = content.replace( + "]\n", + f"{all_entry}]\n", + 1, + ) + if dry_run: + print(f"[dry-run] Would add __all__ entry to {init_path}") + else: + init_path.write_text(content) + print(f"Updated __all__ in {init_path}") + + +def update_cli(check_name: str, dry_run: bool) -> None: + """Add entry to list_of_checks in health_checks.py.""" + cli_path = ( + PROJECT_ROOT / "gcm" / "health_checks" / "cli" / "health_checks.py" + ) + content = cli_path.read_text() + + entry = f" checks.{check_name},\n" + + if f"checks.{check_name}" in content: + print(f"Skipping health_checks.py: checks.{check_name} already exists") + return + + # Insert before the closing ] of list_of_checks. + content = content.replace("]\n\nfor check", f"{entry}]\n\nfor check", 1) + + if dry_run: + print(f"[dry-run] Would add checks.{check_name} to {cli_path}") + return + + cli_path.write_text(content) + print(f"Updated {cli_path}") + + +def update_enum(check_name: str, dry_run: bool) -> None: + """Add entry to HealthCheckName enum in alphabetical order.""" + enum_path = ( + PROJECT_ROOT / "gcm" / "schemas" / "health_check" / "health_check_name.py" + ) + content = enum_path.read_text() + + name = check_name[len("check_"):] + upper_name = name.upper() + space_name = name.replace("_", " ") + enum_key = f"CHECK_{upper_name}" + enum_value = f"check {space_name}" + new_line = f' {enum_key} = "{enum_value}"\n' + + if enum_key in content: + print(f"Skipping health_check_name.py: {enum_key} already exists") + return + + # Find existing CHECK_ entries and insert alphabetically. + check_pattern = re.compile(r"^ CHECK_\w+ = \".+\"$", re.MULTILINE) + existing = list(check_pattern.finditer(content)) + + if existing: + inserted = False + for match in existing: + existing_key = match.group().strip().split(" = ")[0] + if enum_key < existing_key: + pos = match.start() + content = content[:pos] + new_line + content[pos:] + inserted = True + break + if not inserted: + # Append after last CHECK_ entry. + last = existing[-1] + pos = last.end() + 1 + content = content[:pos] + new_line + content[pos:] + else: + # No existing CHECK_ entries; append before end of class. + content = content.rstrip("\n") + "\n" + new_line + + if dry_run: + print(f"[dry-run] Would add {enum_key} to {enum_path}") + return + + enum_path.write_text(content) + print(f"Updated {enum_path}") + + +def update_features(check_name: str, dry_run: bool) -> None: + """Add disable_check_{name} field to HealthChecksFeatures alphabetically.""" + features_path = ( + PROJECT_ROOT + / "gcm" + / "monitoring" + / "features" + / "feature_definitions" + / "health_checks_features.py" + ) + content = features_path.read_text() + + field_name = f"disable_{check_name}" + new_line = f" {field_name}: bool\n" + + if field_name in content: + print(f"Skipping health_checks_features.py: {field_name} already exists") + return + + # Find all disable_ field lines and insert alphabetically. + field_pattern = re.compile(r"^ disable_\w+: bool$", re.MULTILINE) + existing = list(field_pattern.finditer(content)) + + if existing: + inserted = False + for match in existing: + existing_field = match.group().strip().split(":")[0] + if field_name < existing_field: + pos = match.start() + content = content[:pos] + new_line + content[pos:] + inserted = True + break + if not inserted: + # Append after the last disable_ field. + last = existing[-1] + pos = last.end() + 1 + content = content[:pos] + new_line + content[pos:] + else: + content = content.rstrip("\n") + "\n" + new_line + + if dry_run: + print(f"[dry-run] Would add {field_name} to {features_path}") + return + + features_path.write_text(content) + print(f"Updated {features_path}") + + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Scaffold a new GCM health check.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=( + "Examples:\n" + " python bin/create_new_health_check.py check_ntp_sync\n" + " python bin/create_new_health_check.py check_ntp_sync --group\n" + " python bin/create_new_health_check.py check_ntp_sync --dry-run\n" + ), + ) + parser.add_argument( + "check_name", + help=( + "Snake_case name of the new health check (e.g. check_ntp_sync). " + "Must start with 'check_' and contain only lowercase letters and underscores." + ), + ) + parser.add_argument( + "--group", + action="store_true", + default=False, + help="Generate a @click.group() command instead of a @click.command().", + ) + parser.add_argument( + "--dry-run", + action="store_true", + default=False, + help="Print what would be created/modified without writing any files.", + ) + + args = parser.parse_args() + check_name = validate_name(args.check_name) + + if args.dry_run: + print(f"[dry-run] Scaffolding health check: {check_name}") + + # 1. Create check file + create_check_file(check_name, group=args.group, dry_run=args.dry_run) + # 2. Create test file + create_test_file(check_name, dry_run=args.dry_run) + # 3. Create doc file + create_doc_file(check_name, dry_run=args.dry_run) + # 4. Update checks/__init__.py + update_init(check_name, dry_run=args.dry_run) + # 5. Update health_checks.py CLI + update_cli(check_name, dry_run=args.dry_run) + # 6. Update health_check_name.py enum + update_enum(check_name, dry_run=args.dry_run) + # 7. Update health_checks_features.py feature flags + update_features(check_name, dry_run=args.dry_run) + + if not args.dry_run: + print(f"\nDone! Health check '{check_name}' scaffolded successfully.") + print("Next steps:") + print( + f" 1. Implement the check logic in " + f"gcm/health_checks/checks/{check_name}.py" + ) + print( + f" 2. Update the test in " + f"gcm/tests/health_checks_tests/test_{check_name}.py" + ) + print( + f" 3. Update the doc stub in " + f"website/docs/GCM_Health_Checks/health_checks/" + f"{check_name.replace('_', '-')}.md" + ) + print( + f" 4. Regenerate feature flags if needed " + f"(health_checks_features.py may need regen)." + ) + + +if __name__ == "__main__": + main() diff --git a/gcm/tests/test_create_health_check.py b/gcm/tests/test_create_health_check.py new file mode 100644 index 0000000..19cf262 --- /dev/null +++ b/gcm/tests/test_create_health_check.py @@ -0,0 +1,355 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +"""Tests for the create_new_health_check scaffold tool.""" + +import importlib.util +import shutil +from io import StringIO +from pathlib import Path + +import pytest + +# --------------------------------------------------------------------------- +# Load the scaffold module from bin/ via importlib so we can test its +# individual functions without making it a package. +# --------------------------------------------------------------------------- + +_BIN_SCRIPT = Path(__file__).resolve().parent.parent.parent / "bin" / "create_new_health_check.py" + +spec = importlib.util.spec_from_file_location("create_new_health_check", str(_BIN_SCRIPT)) +scaffold = importlib.util.module_from_spec(spec) +spec.loader.exec_module(scaffold) + +# --------------------------------------------------------------------------- +# Fixture: replicate the relevant project directories under tmp_path and +# redirect scaffold.PROJECT_ROOT there so no real files are touched. +# --------------------------------------------------------------------------- + +_REAL_ROOT = Path(__file__).resolve().parent.parent.parent + + +@pytest.fixture +def scaffold_env(tmp_path: Path): + """Set up temp directory with copies of files the scaffold modifies.""" + # Mirror required directory structure + checks_dir = tmp_path / "gcm" / "health_checks" / "checks" + checks_dir.mkdir(parents=True) + cli_dir = tmp_path / "gcm" / "health_checks" / "cli" + cli_dir.mkdir(parents=True) + schema_dir = tmp_path / "gcm" / "schemas" / "health_check" + schema_dir.mkdir(parents=True) + features_dir = tmp_path / "gcm" / "monitoring" / "features" / "feature_definitions" + features_dir.mkdir(parents=True) + tests_dir = tmp_path / "gcm" / "tests" / "health_checks_tests" + tests_dir.mkdir(parents=True) + docs_dir = tmp_path / "website" / "docs" / "GCM_Health_Checks" / "health_checks" + docs_dir.mkdir(parents=True) + + # Copy real source files that the scaffold modifies + shutil.copy(_REAL_ROOT / "gcm/health_checks/checks/__init__.py", checks_dir) + shutil.copy(_REAL_ROOT / "gcm/health_checks/cli/health_checks.py", cli_dir) + shutil.copy(_REAL_ROOT / "gcm/schemas/health_check/health_check_name.py", schema_dir) + shutil.copy( + _REAL_ROOT / "gcm/monitoring/features/feature_definitions/health_checks_features.py", + features_dir, + ) + + # Redirect scaffold to temp root + original_root = scaffold.PROJECT_ROOT + scaffold.PROJECT_ROOT = tmp_path + yield tmp_path + scaffold.PROJECT_ROOT = original_root + + +# --------------------------------------------------------------------------- +# Name validation +# --------------------------------------------------------------------------- + + +def test_validate_name_valid() -> None: + """A correctly prefixed lowercase name is accepted and returned as-is.""" + assert scaffold.validate_name("check_ntp_sync") == "check_ntp_sync" + + +@pytest.mark.parametrize( + "bad_name", + [ + "Check_ntp_sync", # uppercase first letter + "CHECK_NTP_SYNC", # all uppercase + "ntp_sync", # missing check_ prefix + "check_Ntp_Sync", # mixed case after prefix + "check ntp sync", # spaces + "check_", # nothing after prefix + "", # empty string + ], +) +def test_validate_name_invalid(bad_name: str) -> None: + """Invalid names cause sys.exit(1).""" + with pytest.raises(SystemExit) as exc_info: + scaffold.validate_name(bad_name) + assert exc_info.value.code == 1 + + +# --------------------------------------------------------------------------- +# PascalCase conversion +# --------------------------------------------------------------------------- + + +def test_to_pascal_case_multi_word() -> None: + """Multi-word snake_case is converted to PascalCase.""" + assert scaffold.to_pascal_case("ntp_sync") == "NtpSync" + + +def test_to_pascal_case_single_word() -> None: + """A single word is capitalised.""" + assert scaffold.to_pascal_case("sensors") == "Sensors" + + +# --------------------------------------------------------------------------- +# create_check_file — simple (command) variant +# --------------------------------------------------------------------------- + + +def test_create_check_file(scaffold_env: Path) -> None: + """Scaffold creates the check file with expected content.""" + scaffold.create_check_file("check_ntp_sync", group=False, dry_run=False) + + dest = scaffold_env / "gcm" / "health_checks" / "checks" / "check_ntp_sync.py" + assert dest.exists(), "Check file was not created" + + content = dest.read_text() + assert "HealthCheckRuntime" in content + assert "@click.command()" in content + assert "check_ntp_sync" in content + assert "NtpSync" in content + + +# --------------------------------------------------------------------------- +# create_check_file — group variant +# --------------------------------------------------------------------------- + + +def test_create_check_file_group(scaffold_env: Path) -> None: + """With group=True the scaffold emits a @click.group() instead.""" + scaffold.create_check_file("check_ntp_sync", group=True, dry_run=False) + + dest = scaffold_env / "gcm" / "health_checks" / "checks" / "check_ntp_sync.py" + content = dest.read_text() + assert "@click.group()" in content + assert "@click.command()" not in content + + +# --------------------------------------------------------------------------- +# Idempotency — calling create helpers twice skips gracefully +# --------------------------------------------------------------------------- + + +def test_create_files_idempotent(scaffold_env: Path, capsys: pytest.CaptureFixture) -> None: + """Running the file-creation helpers twice does not raise and prints skip.""" + scaffold.create_check_file("check_ntp_sync", group=False, dry_run=False) + scaffold.create_test_file("check_ntp_sync", dry_run=False) + scaffold.create_doc_file("check_ntp_sync", dry_run=False) + + # Second call — must not raise + scaffold.create_check_file("check_ntp_sync", group=False, dry_run=False) + scaffold.create_test_file("check_ntp_sync", dry_run=False) + scaffold.create_doc_file("check_ntp_sync", dry_run=False) + + captured = capsys.readouterr() + assert "Skipping" in captured.out + + +# --------------------------------------------------------------------------- +# Dry-run — no files must be created or modified +# --------------------------------------------------------------------------- + + +def test_dry_run_no_changes(scaffold_env: Path, capsys: pytest.CaptureFixture) -> None: + """dry_run=True must not create or modify any files.""" + check_dest = scaffold_env / "gcm" / "health_checks" / "checks" / "check_ntp_sync.py" + test_dest = scaffold_env / "gcm" / "tests" / "health_checks_tests" / "test_check_ntp_sync.py" + doc_dest = ( + scaffold_env / "website" / "docs" / "GCM_Health_Checks" / "health_checks" / "check-ntp-sync.md" + ) + + init_before = (scaffold_env / "gcm" / "health_checks" / "checks" / "__init__.py").read_text() + cli_before = (scaffold_env / "gcm" / "health_checks" / "cli" / "health_checks.py").read_text() + enum_before = (scaffold_env / "gcm" / "schemas" / "health_check" / "health_check_name.py").read_text() + features_before = ( + scaffold_env + / "gcm" + / "monitoring" + / "features" + / "feature_definitions" + / "health_checks_features.py" + ).read_text() + + scaffold.create_check_file("check_ntp_sync", group=False, dry_run=True) + scaffold.create_test_file("check_ntp_sync", dry_run=True) + scaffold.create_doc_file("check_ntp_sync", dry_run=True) + scaffold.update_init("check_ntp_sync", dry_run=True) + scaffold.update_cli("check_ntp_sync", dry_run=True) + scaffold.update_enum("check_ntp_sync", dry_run=True) + scaffold.update_features("check_ntp_sync", dry_run=True) + + # No new files + assert not check_dest.exists() + assert not test_dest.exists() + assert not doc_dest.exists() + + # Existing files unchanged + assert (scaffold_env / "gcm" / "health_checks" / "checks" / "__init__.py").read_text() == init_before + assert (scaffold_env / "gcm" / "health_checks" / "cli" / "health_checks.py").read_text() == cli_before + assert (scaffold_env / "gcm" / "schemas" / "health_check" / "health_check_name.py").read_text() == enum_before + assert ( + scaffold_env + / "gcm" + / "monitoring" + / "features" + / "feature_definitions" + / "health_checks_features.py" + ).read_text() == features_before + + captured = capsys.readouterr() + assert "[dry-run]" in captured.out + + +# --------------------------------------------------------------------------- +# update_init +# --------------------------------------------------------------------------- + + +def test_update_init(scaffold_env: Path) -> None: + """update_init adds the import line and __all__ entry in alphabetical order.""" + scaffold.update_init("check_ntp_sync", dry_run=False) + + init_path = scaffold_env / "gcm" / "health_checks" / "checks" / "__init__.py" + content = init_path.read_text() + + assert "from gcm.health_checks.checks.check_ntp_sync import check_ntp_sync" in content + assert '"check_ntp_sync"' in content + + # Verify import is in alphabetical position: check_ntp_sync should come + # after check_node and before check_nvidia_smi. + node_pos = content.find("from gcm.health_checks.checks.check_node") + ntp_pos = content.find("from gcm.health_checks.checks.check_ntp_sync") + nvidia_pos = content.find("from gcm.health_checks.checks.check_nvidia_smi") + assert node_pos < ntp_pos < nvidia_pos + + +def test_update_init_idempotent(scaffold_env: Path) -> None: + """Running update_init twice does not duplicate entries.""" + scaffold.update_init("check_ntp_sync", dry_run=False) + scaffold.update_init("check_ntp_sync", dry_run=False) + + init_path = scaffold_env / "gcm" / "health_checks" / "checks" / "__init__.py" + content = init_path.read_text() + + assert content.count("check_ntp_sync") == content.count("check_ntp_sync") + # Specifically, the import line must appear exactly once. + assert content.count("from gcm.health_checks.checks.check_ntp_sync import check_ntp_sync") == 1 + + +# --------------------------------------------------------------------------- +# update_enum +# --------------------------------------------------------------------------- + + +def test_update_enum(scaffold_env: Path) -> None: + """update_enum adds the CHECK_NTP_SYNC entry in alphabetical order.""" + scaffold.update_enum("check_ntp_sync", dry_run=False) + + enum_path = scaffold_env / "gcm" / "schemas" / "health_check" / "health_check_name.py" + content = enum_path.read_text() + + assert 'CHECK_NTP_SYNC = "check ntp sync"' in content + + # Should appear after CHECK_NODE (alphabetically CHECK_NODE < CHECK_NTP_SYNC) + # but there is no CHECK_NODE in the enum; verify it sorts before CHECK_PCI. + ntp_pos = content.find("CHECK_NTP_SYNC") + pci_pos = content.find("CHECK_PCI") + assert ntp_pos < pci_pos + + +def test_update_enum_idempotent(scaffold_env: Path) -> None: + """Running update_enum twice does not duplicate the entry.""" + scaffold.update_enum("check_ntp_sync", dry_run=False) + scaffold.update_enum("check_ntp_sync", dry_run=False) + + enum_path = scaffold_env / "gcm" / "schemas" / "health_check" / "health_check_name.py" + content = enum_path.read_text() + + assert content.count("CHECK_NTP_SYNC") == 1 + + +# --------------------------------------------------------------------------- +# update_features +# --------------------------------------------------------------------------- + + +def test_update_features(scaffold_env: Path) -> None: + """update_features adds the disable_ field in alphabetical order.""" + scaffold.update_features("check_ntp_sync", dry_run=False) + + features_path = ( + scaffold_env + / "gcm" + / "monitoring" + / "features" + / "feature_definitions" + / "health_checks_features.py" + ) + content = features_path.read_text() + + assert "disable_check_ntp_sync: bool" in content + + # Alphabetically, disable_check_ntp_sync should come after + # disable_check_nccl (if present) or be in correct relative position. + # It must appear before disable_check_pci. + ntp_pos = content.find("disable_check_ntp_sync") + pci_pos = content.find("disable_check_pci") + assert ntp_pos < pci_pos + + +def test_update_features_idempotent(scaffold_env: Path) -> None: + """Running update_features twice does not duplicate the field.""" + scaffold.update_features("check_ntp_sync", dry_run=False) + scaffold.update_features("check_ntp_sync", dry_run=False) + + features_path = ( + scaffold_env + / "gcm" + / "monitoring" + / "features" + / "feature_definitions" + / "health_checks_features.py" + ) + content = features_path.read_text() + + assert content.count("disable_check_ntp_sync") == 1 + + +# --------------------------------------------------------------------------- +# update_cli +# --------------------------------------------------------------------------- + + +def test_update_cli(scaffold_env: Path) -> None: + """update_cli appends the check entry to list_of_checks.""" + scaffold.update_cli("check_ntp_sync", dry_run=False) + + cli_path = scaffold_env / "gcm" / "health_checks" / "cli" / "health_checks.py" + content = cli_path.read_text() + + assert "checks.check_ntp_sync," in content + + +def test_update_cli_idempotent(scaffold_env: Path) -> None: + """Running update_cli twice does not duplicate the entry.""" + scaffold.update_cli("check_ntp_sync", dry_run=False) + scaffold.update_cli("check_ntp_sync", dry_run=False) + + cli_path = scaffold_env / "gcm" / "health_checks" / "cli" / "health_checks.py" + content = cli_path.read_text() + + assert content.count("checks.check_ntp_sync") == 1 diff --git a/website/docs/GCM_Health_Checks/adding_new_health_check.md b/website/docs/GCM_Health_Checks/adding_new_health_check.md index de30146..e74d015 100644 --- a/website/docs/GCM_Health_Checks/adding_new_health_check.md +++ b/website/docs/GCM_Health_Checks/adding_new_health_check.md @@ -8,6 +8,32 @@ GCM Health Checks are designed to be easily extensible. Each check follows the s For a deep dive into the boilerplate and annotated code examples, see the [Deep Dive](health_checks_deep_dive.md). +## Quick Start with Scaffold Tool + +Generate all required files and registrations automatically: + +```bash +python bin/create_new_health_check.py check_my_check +``` + +For a grouped check (multiple sub-commands): +```bash +python bin/create_new_health_check.py check_my_check --group +``` + +Preview changes without modifying files: +```bash +python bin/create_new_health_check.py check_my_check --dry-run +``` + +Then run feature generation and formatting: +```bash +python bin/generate_features.py +ufmt format gcm +``` + +The tool creates the check file, test file, and documentation stub, and registers the check in all required locations (steps 1, 5, 6, 7, and 8 below). You still need to implement the actual check logic (steps 2-4) and run verification (step 9). + ## 1. Create the check file Create a new file under [`gcm/health_checks/checks/`](https://github.com/facebookresearch/gcm/tree/main/gcm/health_checks/checks). The naming convention is `check_.py`. @@ -354,3 +380,4 @@ nox -s typecheck # mypy type checking | Telemetry context | [`gcm/health_checks/check_utils/telem.py`](https://github.com/facebookresearch/gcm/blob/main/gcm/health_checks/check_utils/telem.py) | | Deep dive | [Health Checks Deep Dive](health_checks_deep_dive.md) | | Killswitch tests | [`gcm/tests/health_checks_tests/test_killswitches.py`](https://github.com/facebookresearch/gcm/blob/main/gcm/tests/health_checks_tests/test_killswitches.py) | +| Scaffold tool | [`bin/create_new_health_check.py`](https://github.com/facebookresearch/gcm/blob/main/bin/create_new_health_check.py) | From 411040060463e1e4198987d21964c4dd325e4cd2 Mon Sep 17 00:00:00 2001 From: Gustavo Lima Date: Sun, 1 Mar 2026 12:58:19 +0100 Subject: [PATCH 4/7] Fix lint (flake8) and formatting (ufmt) issues - Replace len() slice with constant to avoid E203 whitespace-before-colon - Remove placeholder-less f-strings (F541) - Remove unused StringIO import (F401) --- bin/create_new_health_check.py | 34 ++++------ gcm/tests/test_create_health_check.py | 92 +++++++++++++++++++-------- 2 files changed, 81 insertions(+), 45 deletions(-) diff --git a/bin/create_new_health_check.py b/bin/create_new_health_check.py index 9afbd10..3234322 100755 --- a/bin/create_new_health_check.py +++ b/bin/create_new_health_check.py @@ -9,6 +9,8 @@ from pathlib import Path PROJECT_ROOT = Path(__file__).resolve().parent.parent +_CHECK_PREFIX = "check_" +_PREFIX_LEN = len(_CHECK_PREFIX) # --------------------------------------------------------------------------- # Templates @@ -251,7 +253,7 @@ def test_{check_name}( assert result.exit_code == expected_exit_code.value ''' -DOC_FILE = '''\ +DOC_FILE = """\ # {dash_name} ## Overview @@ -292,7 +294,7 @@ def test_{check_name}( [CLUSTER] \\ app ``` -''' +""" # --------------------------------------------------------------------------- @@ -333,7 +335,7 @@ def _render(template: str, **kwargs: str) -> str: def create_check_file(check_name: str, group: bool, dry_run: bool) -> None: - name = check_name[len("check_"):] + name = check_name[_PREFIX_LEN:] upper_name = name.upper() pascal_name = to_pascal_case(name) @@ -361,15 +363,11 @@ def create_check_file(check_name: str, group: bool, dry_run: bool) -> None: def create_test_file(check_name: str, dry_run: bool) -> None: - name = check_name[len("check_"):] + name = check_name[_PREFIX_LEN:] pascal_name = to_pascal_case(name) dest = ( - PROJECT_ROOT - / "gcm" - / "tests" - / "health_checks_tests" - / f"test_{check_name}.py" + PROJECT_ROOT / "gcm" / "tests" / "health_checks_tests" / f"test_{check_name}.py" ) if dest.exists(): @@ -427,14 +425,12 @@ def update_init(check_name: str, dry_run: bool) -> None: init_path = PROJECT_ROOT / "gcm" / "health_checks" / "checks" / "__init__.py" content = init_path.read_text() - import_line = ( - f"from gcm.health_checks.checks.{check_name} import {check_name}\n" - ) + import_line = f"from gcm.health_checks.checks.{check_name} import {check_name}\n" all_entry = f' "{check_name}",\n' # --- import block --- if import_line.strip() in content: - print(f"Skipping __init__.py import: already exists") + print("Skipping __init__.py import: already exists") else: # Find all existing check_ import lines and insert in alphabetical order. import_pattern = re.compile( @@ -471,7 +467,7 @@ def update_init(check_name: str, dry_run: bool) -> None: # --- __all__ entry --- if f'"{check_name}"' in content: - print(f"Skipping __init__.py __all__: already exists") + print("Skipping __init__.py __all__: already exists") else: # Append before the closing ]. content = content.replace( @@ -488,9 +484,7 @@ def update_init(check_name: str, dry_run: bool) -> None: def update_cli(check_name: str, dry_run: bool) -> None: """Add entry to list_of_checks in health_checks.py.""" - cli_path = ( - PROJECT_ROOT / "gcm" / "health_checks" / "cli" / "health_checks.py" - ) + cli_path = PROJECT_ROOT / "gcm" / "health_checks" / "cli" / "health_checks.py" content = cli_path.read_text() entry = f" checks.{check_name},\n" @@ -517,7 +511,7 @@ def update_enum(check_name: str, dry_run: bool) -> None: ) content = enum_path.read_text() - name = check_name[len("check_"):] + name = check_name[_PREFIX_LEN:] upper_name = name.upper() space_name = name.replace("_", " ") enum_key = f"CHECK_{upper_name}" @@ -680,8 +674,8 @@ def main() -> None: f"{check_name.replace('_', '-')}.md" ) print( - f" 4. Regenerate feature flags if needed " - f"(health_checks_features.py may need regen)." + " 4. Regenerate feature flags if needed " + "(health_checks_features.py may need regen)." ) diff --git a/gcm/tests/test_create_health_check.py b/gcm/tests/test_create_health_check.py index 19cf262..ce07824 100644 --- a/gcm/tests/test_create_health_check.py +++ b/gcm/tests/test_create_health_check.py @@ -4,7 +4,6 @@ import importlib.util import shutil -from io import StringIO from pathlib import Path import pytest @@ -14,9 +13,13 @@ # individual functions without making it a package. # --------------------------------------------------------------------------- -_BIN_SCRIPT = Path(__file__).resolve().parent.parent.parent / "bin" / "create_new_health_check.py" +_BIN_SCRIPT = ( + Path(__file__).resolve().parent.parent.parent / "bin" / "create_new_health_check.py" +) -spec = importlib.util.spec_from_file_location("create_new_health_check", str(_BIN_SCRIPT)) +spec = importlib.util.spec_from_file_location( + "create_new_health_check", str(_BIN_SCRIPT) +) scaffold = importlib.util.module_from_spec(spec) spec.loader.exec_module(scaffold) @@ -48,9 +51,12 @@ def scaffold_env(tmp_path: Path): # Copy real source files that the scaffold modifies shutil.copy(_REAL_ROOT / "gcm/health_checks/checks/__init__.py", checks_dir) shutil.copy(_REAL_ROOT / "gcm/health_checks/cli/health_checks.py", cli_dir) - shutil.copy(_REAL_ROOT / "gcm/schemas/health_check/health_check_name.py", schema_dir) shutil.copy( - _REAL_ROOT / "gcm/monitoring/features/feature_definitions/health_checks_features.py", + _REAL_ROOT / "gcm/schemas/health_check/health_check_name.py", schema_dir + ) + shutil.copy( + _REAL_ROOT + / "gcm/monitoring/features/feature_definitions/health_checks_features.py", features_dir, ) @@ -74,13 +80,13 @@ def test_validate_name_valid() -> None: @pytest.mark.parametrize( "bad_name", [ - "Check_ntp_sync", # uppercase first letter - "CHECK_NTP_SYNC", # all uppercase - "ntp_sync", # missing check_ prefix - "check_Ntp_Sync", # mixed case after prefix - "check ntp sync", # spaces - "check_", # nothing after prefix - "", # empty string + "Check_ntp_sync", # uppercase first letter + "CHECK_NTP_SYNC", # all uppercase + "ntp_sync", # missing check_ prefix + "check_Ntp_Sync", # mixed case after prefix + "check ntp sync", # spaces + "check_", # nothing after prefix + "", # empty string ], ) def test_validate_name_invalid(bad_name: str) -> None: @@ -144,7 +150,9 @@ def test_create_check_file_group(scaffold_env: Path) -> None: # --------------------------------------------------------------------------- -def test_create_files_idempotent(scaffold_env: Path, capsys: pytest.CaptureFixture) -> None: +def test_create_files_idempotent( + scaffold_env: Path, capsys: pytest.CaptureFixture +) -> None: """Running the file-creation helpers twice does not raise and prints skip.""" scaffold.create_check_file("check_ntp_sync", group=False, dry_run=False) scaffold.create_test_file("check_ntp_sync", dry_run=False) @@ -167,14 +175,31 @@ def test_create_files_idempotent(scaffold_env: Path, capsys: pytest.CaptureFixtu def test_dry_run_no_changes(scaffold_env: Path, capsys: pytest.CaptureFixture) -> None: """dry_run=True must not create or modify any files.""" check_dest = scaffold_env / "gcm" / "health_checks" / "checks" / "check_ntp_sync.py" - test_dest = scaffold_env / "gcm" / "tests" / "health_checks_tests" / "test_check_ntp_sync.py" + test_dest = ( + scaffold_env + / "gcm" + / "tests" + / "health_checks_tests" + / "test_check_ntp_sync.py" + ) doc_dest = ( - scaffold_env / "website" / "docs" / "GCM_Health_Checks" / "health_checks" / "check-ntp-sync.md" + scaffold_env + / "website" + / "docs" + / "GCM_Health_Checks" + / "health_checks" + / "check-ntp-sync.md" ) - init_before = (scaffold_env / "gcm" / "health_checks" / "checks" / "__init__.py").read_text() - cli_before = (scaffold_env / "gcm" / "health_checks" / "cli" / "health_checks.py").read_text() - enum_before = (scaffold_env / "gcm" / "schemas" / "health_check" / "health_check_name.py").read_text() + init_before = ( + scaffold_env / "gcm" / "health_checks" / "checks" / "__init__.py" + ).read_text() + cli_before = ( + scaffold_env / "gcm" / "health_checks" / "cli" / "health_checks.py" + ).read_text() + enum_before = ( + scaffold_env / "gcm" / "schemas" / "health_check" / "health_check_name.py" + ).read_text() features_before = ( scaffold_env / "gcm" @@ -198,9 +223,15 @@ def test_dry_run_no_changes(scaffold_env: Path, capsys: pytest.CaptureFixture) - assert not doc_dest.exists() # Existing files unchanged - assert (scaffold_env / "gcm" / "health_checks" / "checks" / "__init__.py").read_text() == init_before - assert (scaffold_env / "gcm" / "health_checks" / "cli" / "health_checks.py").read_text() == cli_before - assert (scaffold_env / "gcm" / "schemas" / "health_check" / "health_check_name.py").read_text() == enum_before + assert ( + scaffold_env / "gcm" / "health_checks" / "checks" / "__init__.py" + ).read_text() == init_before + assert ( + scaffold_env / "gcm" / "health_checks" / "cli" / "health_checks.py" + ).read_text() == cli_before + assert ( + scaffold_env / "gcm" / "schemas" / "health_check" / "health_check_name.py" + ).read_text() == enum_before assert ( scaffold_env / "gcm" @@ -226,7 +257,9 @@ def test_update_init(scaffold_env: Path) -> None: init_path = scaffold_env / "gcm" / "health_checks" / "checks" / "__init__.py" content = init_path.read_text() - assert "from gcm.health_checks.checks.check_ntp_sync import check_ntp_sync" in content + assert ( + "from gcm.health_checks.checks.check_ntp_sync import check_ntp_sync" in content + ) assert '"check_ntp_sync"' in content # Verify import is in alphabetical position: check_ntp_sync should come @@ -247,7 +280,12 @@ def test_update_init_idempotent(scaffold_env: Path) -> None: assert content.count("check_ntp_sync") == content.count("check_ntp_sync") # Specifically, the import line must appear exactly once. - assert content.count("from gcm.health_checks.checks.check_ntp_sync import check_ntp_sync") == 1 + assert ( + content.count( + "from gcm.health_checks.checks.check_ntp_sync import check_ntp_sync" + ) + == 1 + ) # --------------------------------------------------------------------------- @@ -259,7 +297,9 @@ def test_update_enum(scaffold_env: Path) -> None: """update_enum adds the CHECK_NTP_SYNC entry in alphabetical order.""" scaffold.update_enum("check_ntp_sync", dry_run=False) - enum_path = scaffold_env / "gcm" / "schemas" / "health_check" / "health_check_name.py" + enum_path = ( + scaffold_env / "gcm" / "schemas" / "health_check" / "health_check_name.py" + ) content = enum_path.read_text() assert 'CHECK_NTP_SYNC = "check ntp sync"' in content @@ -276,7 +316,9 @@ def test_update_enum_idempotent(scaffold_env: Path) -> None: scaffold.update_enum("check_ntp_sync", dry_run=False) scaffold.update_enum("check_ntp_sync", dry_run=False) - enum_path = scaffold_env / "gcm" / "schemas" / "health_check" / "health_check_name.py" + enum_path = ( + scaffold_env / "gcm" / "schemas" / "health_check" / "health_check_name.py" + ) content = enum_path.read_text() assert content.count("CHECK_NTP_SYNC") == 1 From cacc3a0c6ce588a569b08005c1c4929a533bdf20 Mon Sep 17 00:00:00 2001 From: Gustavo Lima Date: Sun, 1 Mar 2026 13:05:03 +0100 Subject: [PATCH 5/7] Fix mypy type errors in scaffold test module Add type narrowing asserts for importlib spec/loader (which return Optional types), type the dynamically-loaded scaffold module as Any to allow attribute access, and annotate the scaffold_env fixture return type as Iterator[Path]. --- gcm/tests/test_create_health_check.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gcm/tests/test_create_health_check.py b/gcm/tests/test_create_health_check.py index ce07824..88c410c 100644 --- a/gcm/tests/test_create_health_check.py +++ b/gcm/tests/test_create_health_check.py @@ -4,7 +4,9 @@ import importlib.util import shutil +from collections.abc import Iterator from pathlib import Path +from typing import Any import pytest @@ -20,7 +22,9 @@ spec = importlib.util.spec_from_file_location( "create_new_health_check", str(_BIN_SCRIPT) ) -scaffold = importlib.util.module_from_spec(spec) +assert spec is not None, f"Could not load spec from {_BIN_SCRIPT}" +assert spec.loader is not None, "spec.loader is None" +scaffold: Any = importlib.util.module_from_spec(spec) spec.loader.exec_module(scaffold) # --------------------------------------------------------------------------- @@ -32,7 +36,7 @@ @pytest.fixture -def scaffold_env(tmp_path: Path): +def scaffold_env(tmp_path: Path) -> Iterator[Path]: """Set up temp directory with copies of files the scaffold modifies.""" # Mirror required directory structure checks_dir = tmp_path / "gcm" / "health_checks" / "checks" From 9781e65b74b9251fc90a2c392c7063944c0759ce Mon Sep 17 00:00:00 2001 From: Gustavo Lima Date: Sun, 1 Mar 2026 22:02:59 +0100 Subject: [PATCH 6/7] Run generate_features and ufmt format automatically after scaffolding The scaffold tool now runs generate_features.py and ufmt format gcm as a post-scaffold step, so users no longer need to run them manually. Updated docs to reflect the automated workflow. --- bin/create_new_health_check.py | 42 +++++++++++++++++-- .../adding_new_health_check.md | 8 +--- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/bin/create_new_health_check.py b/bin/create_new_health_check.py index 3234322..fdfd656 100755 --- a/bin/create_new_health_check.py +++ b/bin/create_new_health_check.py @@ -5,6 +5,7 @@ import argparse import re +import subprocess import sys from pathlib import Path @@ -600,6 +601,40 @@ def update_features(check_name: str, dry_run: bool) -> None: print(f"Updated {features_path}") +# --------------------------------------------------------------------------- +# Post-scaffold automation +# --------------------------------------------------------------------------- + + +def run_post_scaffold(check_name: str) -> None: + """Run feature generation and code formatting after scaffolding.""" + gen_script = PROJECT_ROOT / "bin" / "generate_features.py" + if gen_script.exists(): + print("Running feature generation...") + result = subprocess.run( + [sys.executable, str(gen_script)], + cwd=str(PROJECT_ROOT), + ) + if result.returncode != 0: + print( + f"Warning: generate_features.py exited with code {result.returncode}", + file=sys.stderr, + ) + else: + print(f"Skipping feature generation: {gen_script} not found") + + print("Running ufmt format...") + result = subprocess.run( + [sys.executable, "-m", "ufmt", "format", "gcm"], + cwd=str(PROJECT_ROOT), + ) + if result.returncode != 0: + print( + f"Warning: ufmt format exited with code {result.returncode}", + file=sys.stderr, + ) + + # --------------------------------------------------------------------------- # Main # --------------------------------------------------------------------------- @@ -658,6 +693,9 @@ def main() -> None: update_features(check_name, dry_run=args.dry_run) if not args.dry_run: + # 8. Regenerate feature flags and format generated code + run_post_scaffold(check_name) + print(f"\nDone! Health check '{check_name}' scaffolded successfully.") print("Next steps:") print( @@ -673,10 +711,6 @@ def main() -> None: f"website/docs/GCM_Health_Checks/health_checks/" f"{check_name.replace('_', '-')}.md" ) - print( - " 4. Regenerate feature flags if needed " - "(health_checks_features.py may need regen)." - ) if __name__ == "__main__": diff --git a/website/docs/GCM_Health_Checks/adding_new_health_check.md b/website/docs/GCM_Health_Checks/adding_new_health_check.md index e74d015..d52dedf 100644 --- a/website/docs/GCM_Health_Checks/adding_new_health_check.md +++ b/website/docs/GCM_Health_Checks/adding_new_health_check.md @@ -26,13 +26,7 @@ Preview changes without modifying files: python bin/create_new_health_check.py check_my_check --dry-run ``` -Then run feature generation and formatting: -```bash -python bin/generate_features.py -ufmt format gcm -``` - -The tool creates the check file, test file, and documentation stub, and registers the check in all required locations (steps 1, 5, 6, 7, and 8 below). You still need to implement the actual check logic (steps 2-4) and run verification (step 9). +The tool creates the check file, test file, and documentation stub, registers the check in all required locations, and automatically runs `generate_features.py` and `ufmt format gcm`. You still need to implement the actual check logic (steps 2-4) and run verification (step 9). ## 1. Create the check file From fe67af411895a2a9485ab2541b10837f8e2c982f Mon Sep 17 00:00:00 2001 From: Gustavo Lima Date: Sun, 1 Mar 2026 22:06:54 +0100 Subject: [PATCH 7/7] Address code review feedback on scaffold tool - Remove unused import sys from generated templates (would fail lint) - Remove unused check_name parameter from run_post_scaffold - Add guard for silent no-op when CLI anchor string is not found - Fix dry-run inconsistency in update_init __all__ block - Remove tautological assertion in test_update_init_idempotent - Add test for missing CLI anchor warning - Update Quick Start docs to list all remaining manual steps --- bin/create_new_health_check.py | 34 +++++++++++++------ gcm/tests/test_create_health_check.py | 18 ++++++++-- .../adding_new_health_check.md | 2 +- 3 files changed, 41 insertions(+), 13 deletions(-) diff --git a/bin/create_new_health_check.py b/bin/create_new_health_check.py index fdfd656..af23461 100755 --- a/bin/create_new_health_check.py +++ b/bin/create_new_health_check.py @@ -22,7 +22,6 @@ # All rights reserved. """TODO: describe what {check_name} checks.""" -import sys from collections.abc import Collection from dataclasses import dataclass from typing import Optional, Protocol @@ -106,7 +105,6 @@ def {check_name}( # All rights reserved. """TODO: describe what {check_name} checks.""" -import sys from collections.abc import Collection from dataclasses import dataclass from typing import Optional, Protocol @@ -463,23 +461,32 @@ def update_init(check_name: str, dry_run: bool) -> None: else: init_path.write_text(content) print(f"Updated imports in {init_path}") - # Re-read after writing so __all__ update is consistent. - content = init_path.read_text() + + # Re-read on-disk content so __all__ check is consistent regardless + # of whether the import block was written or skipped due to dry-run. + content = init_path.read_text() # --- __all__ entry --- if f'"{check_name}"' in content: print("Skipping __init__.py __all__: already exists") else: # Append before the closing ]. - content = content.replace( + new_content = content.replace( "]\n", f"{all_entry}]\n", 1, ) + if new_content == content: + print( + f"Warning: could not find insertion point in {init_path}. " + "Please add the __all__ entry manually.", + file=sys.stderr, + ) + return if dry_run: print(f"[dry-run] Would add __all__ entry to {init_path}") else: - init_path.write_text(content) + init_path.write_text(new_content) print(f"Updated __all__ in {init_path}") @@ -495,13 +502,20 @@ def update_cli(check_name: str, dry_run: bool) -> None: return # Insert before the closing ] of list_of_checks. - content = content.replace("]\n\nfor check", f"{entry}]\n\nfor check", 1) + new_content = content.replace("]\n\nfor check", f"{entry}]\n\nfor check", 1) + if new_content == content: + print( + f"Warning: could not find insertion point in {cli_path}. " + "Please add the entry manually.", + file=sys.stderr, + ) + return if dry_run: print(f"[dry-run] Would add checks.{check_name} to {cli_path}") return - cli_path.write_text(content) + cli_path.write_text(new_content) print(f"Updated {cli_path}") @@ -606,7 +620,7 @@ def update_features(check_name: str, dry_run: bool) -> None: # --------------------------------------------------------------------------- -def run_post_scaffold(check_name: str) -> None: +def run_post_scaffold() -> None: """Run feature generation and code formatting after scaffolding.""" gen_script = PROJECT_ROOT / "bin" / "generate_features.py" if gen_script.exists(): @@ -694,7 +708,7 @@ def main() -> None: if not args.dry_run: # 8. Regenerate feature flags and format generated code - run_post_scaffold(check_name) + run_post_scaffold() print(f"\nDone! Health check '{check_name}' scaffolded successfully.") print("Next steps:") diff --git a/gcm/tests/test_create_health_check.py b/gcm/tests/test_create_health_check.py index 88c410c..f1eae17 100644 --- a/gcm/tests/test_create_health_check.py +++ b/gcm/tests/test_create_health_check.py @@ -282,8 +282,7 @@ def test_update_init_idempotent(scaffold_env: Path) -> None: init_path = scaffold_env / "gcm" / "health_checks" / "checks" / "__init__.py" content = init_path.read_text() - assert content.count("check_ntp_sync") == content.count("check_ntp_sync") - # Specifically, the import line must appear exactly once. + # The import line must appear exactly once. assert ( content.count( "from gcm.health_checks.checks.check_ntp_sync import check_ntp_sync" @@ -399,3 +398,18 @@ def test_update_cli_idempotent(scaffold_env: Path) -> None: content = cli_path.read_text() assert content.count("checks.check_ntp_sync") == 1 + + +def test_update_cli_warns_on_missing_anchor( + scaffold_env: Path, capsys: pytest.CaptureFixture +) -> None: + """update_cli warns when the expected anchor string is absent.""" + cli_path = scaffold_env / "gcm" / "health_checks" / "cli" / "health_checks.py" + # Remove the anchor that update_cli relies on. + cli_path.write_text("# empty file with no anchor\n") + + scaffold.update_cli("check_ntp_sync", dry_run=False) + + captured = capsys.readouterr() + assert "Warning" in captured.err + assert "checks.check_ntp_sync" not in cli_path.read_text() diff --git a/website/docs/GCM_Health_Checks/adding_new_health_check.md b/website/docs/GCM_Health_Checks/adding_new_health_check.md index d52dedf..61a4156 100644 --- a/website/docs/GCM_Health_Checks/adding_new_health_check.md +++ b/website/docs/GCM_Health_Checks/adding_new_health_check.md @@ -26,7 +26,7 @@ Preview changes without modifying files: python bin/create_new_health_check.py check_my_check --dry-run ``` -The tool creates the check file, test file, and documentation stub, registers the check in all required locations, and automatically runs `generate_features.py` and `ufmt format gcm`. You still need to implement the actual check logic (steps 2-4) and run verification (step 9). +The tool creates the check file, test file, and documentation stub, registers the check in all required locations, and automatically runs `generate_features.py` and `ufmt format gcm`. You still need to implement the actual check logic (steps 2-4), write real tests (step 7), fill in the documentation (step 8), add your check to `test_killswitches.py` (step 6), and run verification (step 9). ## 1. Create the check file