Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions gcm/monitoring/kubernetes/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
30 changes: 30 additions & 0 deletions gcm/monitoring/kubernetes/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
from __future__ import annotations

from typing import Iterable, Protocol

from gcm.schemas.kubernetes.node import KubernetesNodeConditionRow
from gcm.schemas.kubernetes.pod import KubernetesPodRow


class KubernetesClient(Protocol):
"""A low-level Kubernetes client for pod and node monitoring."""

def list_pods(
self, namespace: str = "", label_selector: str = ""
) -> Iterable[KubernetesPodRow]:
"""Get pod information from the Kubernetes API.
Args:
namespace: Kubernetes namespace to filter pods. Empty string means all namespaces.
label_selector: Kubernetes label selector to filter pods.
If an error occurs during execution, RuntimeError should be raised.
"""

def list_node_conditions(self) -> Iterable[KubernetesNodeConditionRow]:
"""Get node condition information from the Kubernetes API.
If an error occurs during execution, RuntimeError should be raised.
"""
2 changes: 2 additions & 0 deletions gcm/schemas/kubernetes/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
27 changes: 27 additions & 0 deletions gcm/schemas/kubernetes/node.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
from dataclasses import dataclass

from gcm.schemas.slurm.derived_cluster import DerivedCluster


@dataclass
class KubernetesNodeConditionRow:
"""Kubernetes node condition schema.

Fields correspond to node condition data from the Kubernetes API.
"""

name: str | None = None
condition_type: str | None = None
status: str | None = None
reason: str | None = None
message: str | None = None


@dataclass(kw_only=True)
class KubernetesNodePayload(DerivedCluster):
ds: str
collection_unixtime: int
cluster: str
node_condition: KubernetesNodeConditionRow
29 changes: 29 additions & 0 deletions gcm/schemas/kubernetes/pod.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
from dataclasses import dataclass

from gcm.schemas.slurm.derived_cluster import DerivedCluster


@dataclass
class KubernetesPodRow:
"""Kubernetes pod status schema.
Fields correspond to pod metadata and container status from the Kubernetes API.
"""

name: str | None = None
namespace: str | None = None
node_name: str | None = None
phase: str | None = None
restart_count: int | None = None
container_name: str | None = None
slurm_job_id: str | None = None


@dataclass(kw_only=True)
class KubernetesPodPayload(DerivedCluster):
ds: str
collection_unixtime: int
cluster: str
pod: KubernetesPodRow
118 changes: 118 additions & 0 deletions gcm/tests/test_kubernetes_schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
import unittest

from gcm.monitoring.kubernetes.client import KubernetesClient
from gcm.schemas.kubernetes.node import (
KubernetesNodeConditionRow,
KubernetesNodePayload,
)
from gcm.schemas.kubernetes.pod import KubernetesPodPayload, KubernetesPodRow


class TestKubernetesPodRow(unittest.TestCase):
def test_default_values(self) -> None:
row = KubernetesPodRow()
self.assertIsNone(row.name)
self.assertIsNone(row.namespace)
self.assertIsNone(row.node_name)
self.assertIsNone(row.phase)
self.assertIsNone(row.restart_count)
self.assertIsNone(row.container_name)
self.assertIsNone(row.slurm_job_id)

def test_with_values(self) -> None:
row = KubernetesPodRow(
name="my-pod",
namespace="default",
node_name="node-1",
phase="Running",
restart_count=3,
container_name="main",
slurm_job_id="12345",
)
self.assertEqual(row.name, "my-pod")
self.assertEqual(row.namespace, "default")
self.assertEqual(row.node_name, "node-1")
self.assertEqual(row.phase, "Running")
self.assertEqual(row.restart_count, 3)
self.assertEqual(row.container_name, "main")
self.assertEqual(row.slurm_job_id, "12345")


class TestKubernetesPodPayload(unittest.TestCase):
def test_payload_with_pod(self) -> None:
row = KubernetesPodRow(name="my-pod", phase="Running")
payload = KubernetesPodPayload(
ds="2026-02-27",
collection_unixtime=1740700000,
cluster="test-cluster",
pod=row,
)
self.assertEqual(payload.ds, "2026-02-27")
self.assertEqual(payload.collection_unixtime, 1740700000)
self.assertEqual(payload.cluster, "test-cluster")
self.assertEqual(payload.pod.name, "my-pod")
self.assertIsNone(payload.derived_cluster)

def test_payload_with_derived_cluster(self) -> None:
row = KubernetesPodRow(name="my-pod")
payload = KubernetesPodPayload(
ds="2026-02-27",
collection_unixtime=1740700000,
cluster="test-cluster",
derived_cluster="derived-1",
pod=row,
)
self.assertEqual(payload.derived_cluster, "derived-1")


class TestKubernetesNodeConditionRow(unittest.TestCase):
def test_default_values(self) -> None:
row = KubernetesNodeConditionRow()
self.assertIsNone(row.name)
self.assertIsNone(row.condition_type)
self.assertIsNone(row.status)
self.assertIsNone(row.reason)
self.assertIsNone(row.message)

def test_with_values(self) -> None:
row = KubernetesNodeConditionRow(
name="node-1",
condition_type="Ready",
status="True",
reason="KubeletReady",
message="kubelet is posting ready status",
)
self.assertEqual(row.name, "node-1")
self.assertEqual(row.condition_type, "Ready")
self.assertEqual(row.status, "True")
self.assertEqual(row.reason, "KubeletReady")


class TestKubernetesNodePayload(unittest.TestCase):
def test_payload_with_node_condition(self) -> None:
row = KubernetesNodeConditionRow(
name="node-1", condition_type="Ready", status="True"
)
payload = KubernetesNodePayload(
ds="2026-02-27",
collection_unixtime=1740700000,
cluster="test-cluster",
node_condition=row,
)
self.assertEqual(payload.ds, "2026-02-27")
self.assertEqual(payload.cluster, "test-cluster")
self.assertEqual(payload.node_condition.name, "node-1")
self.assertIsNone(payload.derived_cluster)


class TestKubernetesClientProtocol(unittest.TestCase):
def test_protocol_is_importable(self) -> None:
"""Verify the protocol can be imported and used as a type."""
self.assertTrue(hasattr(KubernetesClient, "list_pods"))
self.assertTrue(hasattr(KubernetesClient, "list_node_conditions"))


if __name__ == "__main__":
unittest.main()
Loading