diff --git a/gcm/monitoring/kubernetes/__init__.py b/gcm/monitoring/kubernetes/__init__.py new file mode 100644 index 0000000..ae1b0cf --- /dev/null +++ b/gcm/monitoring/kubernetes/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. diff --git a/gcm/monitoring/kubernetes/client.py b/gcm/monitoring/kubernetes/client.py new file mode 100644 index 0000000..60e15ae --- /dev/null +++ b/gcm/monitoring/kubernetes/client.py @@ -0,0 +1,30 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +from __future__ import annotations + +from typing import Iterable, Protocol + +from gcm.schemas.kubernetes.node import KubernetesNodeConditionRow +from gcm.schemas.kubernetes.pod import KubernetesPodRow + + +class KubernetesClient(Protocol): + """A low-level Kubernetes client for pod and node monitoring.""" + + def list_pods( + self, namespace: str = "", label_selector: str = "" + ) -> Iterable[KubernetesPodRow]: + """Get pod information from the Kubernetes API. + + Args: + namespace: Kubernetes namespace to filter pods. Empty string means all namespaces. + label_selector: Kubernetes label selector to filter pods. + + If an error occurs during execution, RuntimeError should be raised. + """ + + def list_node_conditions(self) -> Iterable[KubernetesNodeConditionRow]: + """Get node condition information from the Kubernetes API. + + If an error occurs during execution, RuntimeError should be raised. + """ diff --git a/gcm/schemas/kubernetes/__init__.py b/gcm/schemas/kubernetes/__init__.py new file mode 100644 index 0000000..ae1b0cf --- /dev/null +++ b/gcm/schemas/kubernetes/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. diff --git a/gcm/schemas/kubernetes/node.py b/gcm/schemas/kubernetes/node.py new file mode 100644 index 0000000..aed5067 --- /dev/null +++ b/gcm/schemas/kubernetes/node.py @@ -0,0 +1,27 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +from dataclasses import dataclass + +from gcm.schemas.slurm.derived_cluster import DerivedCluster + + +@dataclass +class KubernetesNodeConditionRow: + """Kubernetes node condition schema. + + Fields correspond to node condition data from the Kubernetes API. + """ + + name: str | None = None + condition_type: str | None = None + status: str | None = None + reason: str | None = None + message: str | None = None + + +@dataclass(kw_only=True) +class KubernetesNodePayload(DerivedCluster): + ds: str + collection_unixtime: int + cluster: str + node_condition: KubernetesNodeConditionRow diff --git a/gcm/schemas/kubernetes/pod.py b/gcm/schemas/kubernetes/pod.py new file mode 100644 index 0000000..ada3898 --- /dev/null +++ b/gcm/schemas/kubernetes/pod.py @@ -0,0 +1,29 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +from dataclasses import dataclass + +from gcm.schemas.slurm.derived_cluster import DerivedCluster + + +@dataclass +class KubernetesPodRow: + """Kubernetes pod status schema. + + Fields correspond to pod metadata and container status from the Kubernetes API. + """ + + name: str | None = None + namespace: str | None = None + node_name: str | None = None + phase: str | None = None + restart_count: int | None = None + container_name: str | None = None + slurm_job_id: str | None = None + + +@dataclass(kw_only=True) +class KubernetesPodPayload(DerivedCluster): + ds: str + collection_unixtime: int + cluster: str + pod: KubernetesPodRow diff --git a/gcm/tests/test_kubernetes_schemas.py b/gcm/tests/test_kubernetes_schemas.py new file mode 100644 index 0000000..e26dba5 --- /dev/null +++ b/gcm/tests/test_kubernetes_schemas.py @@ -0,0 +1,118 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +import unittest + +from gcm.monitoring.kubernetes.client import KubernetesClient +from gcm.schemas.kubernetes.node import ( + KubernetesNodeConditionRow, + KubernetesNodePayload, +) +from gcm.schemas.kubernetes.pod import KubernetesPodPayload, KubernetesPodRow + + +class TestKubernetesPodRow(unittest.TestCase): + def test_default_values(self) -> None: + row = KubernetesPodRow() + self.assertIsNone(row.name) + self.assertIsNone(row.namespace) + self.assertIsNone(row.node_name) + self.assertIsNone(row.phase) + self.assertIsNone(row.restart_count) + self.assertIsNone(row.container_name) + self.assertIsNone(row.slurm_job_id) + + def test_with_values(self) -> None: + row = KubernetesPodRow( + name="my-pod", + namespace="default", + node_name="node-1", + phase="Running", + restart_count=3, + container_name="main", + slurm_job_id="12345", + ) + self.assertEqual(row.name, "my-pod") + self.assertEqual(row.namespace, "default") + self.assertEqual(row.node_name, "node-1") + self.assertEqual(row.phase, "Running") + self.assertEqual(row.restart_count, 3) + self.assertEqual(row.container_name, "main") + self.assertEqual(row.slurm_job_id, "12345") + + +class TestKubernetesPodPayload(unittest.TestCase): + def test_payload_with_pod(self) -> None: + row = KubernetesPodRow(name="my-pod", phase="Running") + payload = KubernetesPodPayload( + ds="2026-02-27", + collection_unixtime=1740700000, + cluster="test-cluster", + pod=row, + ) + self.assertEqual(payload.ds, "2026-02-27") + self.assertEqual(payload.collection_unixtime, 1740700000) + self.assertEqual(payload.cluster, "test-cluster") + self.assertEqual(payload.pod.name, "my-pod") + self.assertIsNone(payload.derived_cluster) + + def test_payload_with_derived_cluster(self) -> None: + row = KubernetesPodRow(name="my-pod") + payload = KubernetesPodPayload( + ds="2026-02-27", + collection_unixtime=1740700000, + cluster="test-cluster", + derived_cluster="derived-1", + pod=row, + ) + self.assertEqual(payload.derived_cluster, "derived-1") + + +class TestKubernetesNodeConditionRow(unittest.TestCase): + def test_default_values(self) -> None: + row = KubernetesNodeConditionRow() + self.assertIsNone(row.name) + self.assertIsNone(row.condition_type) + self.assertIsNone(row.status) + self.assertIsNone(row.reason) + self.assertIsNone(row.message) + + def test_with_values(self) -> None: + row = KubernetesNodeConditionRow( + name="node-1", + condition_type="Ready", + status="True", + reason="KubeletReady", + message="kubelet is posting ready status", + ) + self.assertEqual(row.name, "node-1") + self.assertEqual(row.condition_type, "Ready") + self.assertEqual(row.status, "True") + self.assertEqual(row.reason, "KubeletReady") + + +class TestKubernetesNodePayload(unittest.TestCase): + def test_payload_with_node_condition(self) -> None: + row = KubernetesNodeConditionRow( + name="node-1", condition_type="Ready", status="True" + ) + payload = KubernetesNodePayload( + ds="2026-02-27", + collection_unixtime=1740700000, + cluster="test-cluster", + node_condition=row, + ) + self.assertEqual(payload.ds, "2026-02-27") + self.assertEqual(payload.cluster, "test-cluster") + self.assertEqual(payload.node_condition.name, "node-1") + self.assertIsNone(payload.derived_cluster) + + +class TestKubernetesClientProtocol(unittest.TestCase): + def test_protocol_is_importable(self) -> None: + """Verify the protocol can be imported and used as a type.""" + self.assertTrue(hasattr(KubernetesClient, "list_pods")) + self.assertTrue(hasattr(KubernetesClient, "list_node_conditions")) + + +if __name__ == "__main__": + unittest.main()