From dbe51abe560d83b4711f95ce0776d7c6ad51a36c Mon Sep 17 00:00:00 2001 From: Harshal Patil <12152047+harche@users.noreply.github.com> Date: Tue, 7 Oct 2025 11:01:37 -0400 Subject: [PATCH 1/5] Add nodes_debug_exec tool in pkg/ocp package --- README.md | 24 +- internal/tools/update-readme/main.go | 1 + pkg/config/config_default_overrides.go | 3 +- pkg/config/config_test.go | 4 +- pkg/kubernetes-mcp-server/cmd/root_test.go | 4 +- pkg/mcp/nodes_test.go | 253 +++++++++++ pkg/mcp/openshift_modules.go | 3 + ...toolsets-full-tools-multicluster-enum.json | 46 +- .../toolsets-full-tools-multicluster.json | 46 +- .../toolsets-full-tools-openshift.json | 46 +- pkg/mcp/testdata/toolsets-full-tools.json | 46 +- .../toolsets-openshift-core-tools.json | 46 ++ pkg/ocp/nodes_debug.go | 326 +++++++++++++ pkg/ocp/nodes_debug_test.go | 429 ++++++++++++++++++ pkg/ocp/ocp_client.go | 44 ++ pkg/ocp/testhelpers.go | 163 +++++++ pkg/toolsets/openshift/nodes.go | 126 +++++ pkg/toolsets/openshift/nodes_test.go | 95 ++++ pkg/toolsets/openshift/toolset.go | 31 ++ 19 files changed, 1722 insertions(+), 14 deletions(-) create mode 100644 pkg/mcp/openshift_modules.go create mode 100644 pkg/mcp/testdata/toolsets-openshift-core-tools.json create mode 100644 pkg/ocp/nodes_debug.go create mode 100644 pkg/ocp/nodes_debug_test.go create mode 100644 pkg/ocp/ocp_client.go create mode 100644 pkg/ocp/testhelpers.go create mode 100644 pkg/toolsets/openshift/nodes.go create mode 100644 pkg/toolsets/openshift/nodes_test.go create mode 100644 pkg/toolsets/openshift/toolset.go diff --git a/README.md b/README.md index a81daea16..7a28da124 100644 --- a/README.md +++ b/README.md @@ -208,11 +208,12 @@ The following sets of tools are available (all on by default): -| Toolset | Description | -|---------|-------------------------------------------------------------------------------------| -| config | View and manage the current local Kubernetes configuration (kubeconfig) | -| core | Most common tools for Kubernetes management (Pods, Generic Resources, Events, etc.) | -| helm | Tools for managing Helm charts and releases | +| Toolset | Description | +|----------------|-------------------------------------------------------------------------------------| +| config | View and manage the current local Kubernetes configuration (kubeconfig) | +| core | Most common tools for Kubernetes management (Pods, Generic Resources, Events, etc.) | +| helm | Tools for managing Helm charts and releases | +| openshift-core | Core OpenShift-specific tools (Node debugging, etc.) | @@ -336,6 +337,19 @@ In case multi-cluster support is enabled (default) and you have access to multip +
+<details>
+
+<summary>openshift-core</summary>
+
+- **nodes_debug_exec** - Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.
+  - `command` (`array`) **(required)** - Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host <command>' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).
+  - `image` (`string`) - Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.
+  - `namespace` (`string`) - Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').
+  - `node` (`string`) **(required)** - Name of the node to debug (e.g. worker-0).
+  - `timeout_seconds` (`integer`) - Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).
+
+</details>
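For illustration only, a call to the new tool over MCP could look like the following `tools/call` payload (a hypothetical sketch; the node name, namespace, and command values are invented for this example):

```json
{
  "method": "tools/call",
  "params": {
    "name": "nodes_debug_exec",
    "arguments": {
      "node": "worker-0",
      "namespace": "default",
      "command": ["chroot", "/host", "journalctl", "-u", "kubelet", "--since", "1 hour ago"],
      "timeout_seconds": 120
    }
  }
}
```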
+ diff --git a/internal/tools/update-readme/main.go b/internal/tools/update-readme/main.go index cdf695fc9..b425a1d3a 100644 --- a/internal/tools/update-readme/main.go +++ b/internal/tools/update-readme/main.go @@ -15,6 +15,7 @@ import ( _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/config" _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/core" _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/helm" + _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/openshift" ) type OpenShift struct{} diff --git a/pkg/config/config_default_overrides.go b/pkg/config/config_default_overrides.go index 70d065bce..05c3b4ac4 100644 --- a/pkg/config/config_default_overrides.go +++ b/pkg/config/config_default_overrides.go @@ -3,6 +3,7 @@ package config func defaultOverrides() StaticConfig { return StaticConfig{ // IMPORTANT: this file is used to override default config values in downstream builds. - // This is intentionally left blank. + // OpenShift-specific defaults: add openshift-core toolset + Toolsets: []string{"core", "config", "helm", "openshift-core"}, } } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index afdde1910..caa4f95da 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -167,8 +167,8 @@ func (s *ConfigSuite) TestReadConfigValidPreservesDefaultsForMissingFields() { s.Equalf("table", config.ListOutput, "Expected ListOutput to be table, got %s", config.ListOutput) }) s.Run("toolsets defaulted correctly", func() { - s.Require().Lenf(config.Toolsets, 3, "Expected 3 toolsets, got %d", len(config.Toolsets)) - for _, toolset := range []string{"core", "config", "helm"} { + s.Require().Lenf(config.Toolsets, 4, "Expected 4 toolsets, got %d", len(config.Toolsets)) + for _, toolset := range []string{"core", "config", "helm", "openshift-core"} { s.Containsf(config.Toolsets, toolset, "Expected toolsets to contain %s", toolset) } }) diff --git a/pkg/kubernetes-mcp-server/cmd/root_test.go b/pkg/kubernetes-mcp-server/cmd/root_test.go index 225216671..e5342cd5c 100644 --- a/pkg/kubernetes-mcp-server/cmd/root_test.go +++ b/pkg/kubernetes-mcp-server/cmd/root_test.go @@ -137,7 +137,7 @@ func TestToolsets(t *testing.T) { rootCmd := NewMCPServer(ioStreams) rootCmd.SetArgs([]string{"--help"}) o, err := captureOutput(rootCmd.Execute) // --help doesn't use logger/klog, cobra prints directly to stdout - if !strings.Contains(o, "Comma-separated list of MCP toolsets to use (available toolsets: config, core, helm).") { + if !strings.Contains(o, "Comma-separated list of MCP toolsets to use (available toolsets: config, core, helm, openshift-core).") { t.Fatalf("Expected all available toolsets, got %s %v", o, err) } }) @@ -145,7 +145,7 @@ func TestToolsets(t *testing.T) { ioStreams, out := testStream() rootCmd := NewMCPServer(ioStreams) rootCmd.SetArgs([]string{"--version", "--port=1337", "--log-level=1"}) - if err := rootCmd.Execute(); !strings.Contains(out.String(), "- Toolsets: core, config, helm") { + if err := rootCmd.Execute(); !strings.Contains(out.String(), "- Toolsets: core, config, helm, openshift-core") { t.Fatalf("Expected toolsets 'full', got %s %v", out, err) } }) diff --git a/pkg/mcp/nodes_test.go b/pkg/mcp/nodes_test.go index 62ac55e95..e7626a18a 100644 --- a/pkg/mcp/nodes_test.go +++ b/pkg/mcp/nodes_test.go @@ -1,14 +1,21 @@ package mcp import ( + "encoding/json" + "io" "net/http" "strconv" + "strings" "testing" "github.com/BurntSushi/toml" "github.com/containers/kubernetes-mcp-server/internal/test" 
"github.com/mark3labs/mcp-go/mcp" "github.com/stretchr/testify/suite" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/serializer" ) type NodesSuite struct { @@ -334,3 +341,249 @@ func (s *NodesSuite) TestNodesStatsSummaryDenied() { func TestNodes(t *testing.T) { suite.Run(t, new(NodesSuite)) } + +// Tests below are for the nodes_debug_exec tool (OpenShift-specific) + +type NodesDebugExecSuite struct { + BaseMcpSuite + mockServer *test.MockServer +} + +func (s *NodesDebugExecSuite) SetupTest() { + s.BaseMcpSuite.SetupTest() + s.mockServer = test.NewMockServer() + s.Cfg.KubeConfig = s.mockServer.KubeconfigFile(s.T()) +} + +func (s *NodesDebugExecSuite) TearDownTest() { + s.BaseMcpSuite.TearDownTest() + if s.mockServer != nil { + s.mockServer.Close() + } +} + +func (s *NodesDebugExecSuite) TestNodesDebugExecTool() { + s.Run("nodes_debug_exec with successful execution", func() { + + var ( + createdPod v1.Pod + deleteCalled bool + ) + const namespace = "debug" + const logOutput = "filesystem repaired" + + scheme := runtime.NewScheme() + _ = v1.AddToScheme(scheme) + codec := serializer.NewCodecFactory(scheme).UniversalDeserializer() + + s.mockServer.Handle(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { + switch { + case req.URL.Path == "/api": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIVersions","versions":["v1"],"serverAddressByClientCIDRs":[{"clientCIDR":"0.0.0.0/0"}]}`)) + case req.URL.Path == "/apis": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIGroupList","apiVersion":"v1","groups":[]}`)) + case req.URL.Path == "/api/v1": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIResourceList","apiVersion":"v1","resources":[{"name":"pods","singularName":"","namespaced":true,"kind":"Pod","verbs":["get","list","watch","create","update","patch","delete"]}]}`)) + case req.Method == http.MethodPatch && strings.HasPrefix(req.URL.Path, "/api/v1/namespaces/"+namespace+"/pods/"): + // Handle server-side apply (PATCH with fieldManager query param) + body, err := io.ReadAll(req.Body) + if err != nil { + s.T().Fatalf("failed to read apply body: %v", err) + } + created := &v1.Pod{} + if _, _, err = codec.Decode(body, nil, created); err != nil { + s.T().Fatalf("failed to decode apply body: %v", err) + } + createdPod = *created + // Keep the name from the request URL if it was provided + pathParts := strings.Split(req.URL.Path, "/") + if len(pathParts) > 0 { + createdPod.Name = pathParts[len(pathParts)-1] + } + createdPod.Namespace = namespace + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(&createdPod) + case req.Method == http.MethodPost && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods": + body, err := io.ReadAll(req.Body) + if err != nil { + s.T().Fatalf("failed to read create body: %v", err) + } + created := &v1.Pod{} + if _, _, err = codec.Decode(body, nil, created); err != nil { + s.T().Fatalf("failed to decode create body: %v", err) + } + createdPod = *created + createdPod.ObjectMeta = metav1.ObjectMeta{ + Namespace: namespace, + Name: createdPod.GenerateName + "abc", + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(&createdPod) + case req.Method == http.MethodGet && createdPod.Name != "" && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods/"+createdPod.Name: + podStatus := 
createdPod.DeepCopy() + podStatus.Status = v1.PodStatus{ + Phase: v1.PodSucceeded, + ContainerStatuses: []v1.ContainerStatus{{ + Name: "debug", + State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{ + ExitCode: 0, + }}, + }}, + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(podStatus) + case req.Method == http.MethodDelete && createdPod.Name != "" && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods/"+createdPod.Name: + deleteCalled = true + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(&metav1.Status{Status: "Success"}) + case req.Method == http.MethodGet && createdPod.Name != "" && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods/"+createdPod.Name+"/log": + w.Header().Set("Content-Type", "text/plain") + _, _ = w.Write([]byte(logOutput)) + } + })) + + s.InitMcpClient() + toolResult, err := s.CallTool("nodes_debug_exec", map[string]interface{}{ + "node": "worker-0", + "namespace": namespace, + "command": []interface{}{"uname", "-a"}, + }) + + s.Run("call succeeds", func() { + s.Nilf(err, "call tool should not error: %v", err) + s.Falsef(toolResult.IsError, "tool should not return error: %v", toolResult.Content) + s.NotEmpty(toolResult.Content, "expected output content") + text := toolResult.Content[0].(mcp.TextContent).Text + s.Equalf(logOutput, text, "unexpected tool output %q", text) + }) + + s.Run("debug pod shaped correctly", func() { + s.Require().NotNil(createdPod.Spec.Containers, "expected containers in debug pod") + s.Require().Len(createdPod.Spec.Containers, 1, "expected single container in debug pod") + container := createdPod.Spec.Containers[0] + expectedCommand := []string{"uname", "-a"} + s.Truef(equalStringSlices(container.Command, expectedCommand), + "unexpected debug command: %v", container.Command) + s.Require().NotNil(container.SecurityContext, "expected security context") + s.Require().NotNil(container.SecurityContext.Privileged, "expected privileged field") + s.Truef(*container.SecurityContext.Privileged, "expected privileged container") + s.Require().NotEmpty(createdPod.Spec.Volumes, "expected volumes on debug pod") + s.Require().NotNil(createdPod.Spec.Volumes[0].HostPath, "expected hostPath volume") + s.Truef(deleteCalled, "expected debug pod to be deleted") + }) + }) +} + +func equalStringSlices(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +func (s *NodesDebugExecSuite) TestNodesDebugExecToolNonZeroExit() { + s.Run("nodes_debug_exec with non-zero exit code", func() { + const namespace = "default" + const errorMessage = "failed" + + scheme := runtime.NewScheme() + _ = v1.AddToScheme(scheme) + codec := serializer.NewCodecFactory(scheme).UniversalDeserializer() + + s.mockServer.Handle(http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { + switch { + case req.URL.Path == "/api": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIVersions","versions":["v1"],"serverAddressByClientCIDRs":[{"clientCIDR":"0.0.0.0/0"}]}`)) + case req.URL.Path == "/apis": + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"kind":"APIGroupList","apiVersion":"v1","groups":[]}`)) + case req.URL.Path == "/api/v1": + w.Header().Set("Content-Type", "application/json") + _, _ = 
w.Write([]byte(`{"kind":"APIResourceList","apiVersion":"v1","resources":[{"name":"pods","singularName":"","namespaced":true,"kind":"Pod","verbs":["get","list","watch","create","update","patch","delete"]}]}`)) + case req.Method == http.MethodPatch && strings.HasPrefix(req.URL.Path, "/api/v1/namespaces/"+namespace+"/pods/"): + // Handle server-side apply (PATCH with fieldManager query param) + body, err := io.ReadAll(req.Body) + if err != nil { + s.T().Fatalf("failed to read apply body: %v", err) + } + pod := &v1.Pod{} + if _, _, err = codec.Decode(body, nil, pod); err != nil { + s.T().Fatalf("failed to decode apply body: %v", err) + } + // Keep the name from the request URL if it was provided + pathParts := strings.Split(req.URL.Path, "/") + if len(pathParts) > 0 { + pod.Name = pathParts[len(pathParts)-1] + } + pod.Namespace = namespace + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(pod) + case req.Method == http.MethodPost && req.URL.Path == "/api/v1/namespaces/"+namespace+"/pods": + body, err := io.ReadAll(req.Body) + if err != nil { + s.T().Fatalf("failed to read create body: %v", err) + } + pod := &v1.Pod{} + if _, _, err = codec.Decode(body, nil, pod); err != nil { + s.T().Fatalf("failed to decode create body: %v", err) + } + pod.ObjectMeta = metav1.ObjectMeta{Name: pod.GenerateName + "xyz", Namespace: namespace} + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(pod) + case strings.HasPrefix(req.URL.Path, "/api/v1/namespaces/"+namespace+"/pods/") && strings.HasSuffix(req.URL.Path, "/log"): + w.Header().Set("Content-Type", "text/plain") + _, _ = w.Write([]byte(errorMessage)) + case req.Method == http.MethodGet && strings.HasPrefix(req.URL.Path, "/api/v1/namespaces/"+namespace+"/pods/"): + pathParts := strings.Split(req.URL.Path, "/") + podName := pathParts[len(pathParts)-1] + pod := &v1.Pod{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "Pod", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: namespace, + }, + } + pod.Status = v1.PodStatus{ + Phase: v1.PodSucceeded, + ContainerStatuses: []v1.ContainerStatus{{ + Name: "debug", + State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{ + ExitCode: 2, + Reason: "Error", + }}, + }}, + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(pod) + } + })) + + s.InitMcpClient() + toolResult, err := s.CallTool("nodes_debug_exec", map[string]interface{}{ + "node": "infra-1", + "command": []interface{}{"journalctl"}, + }) + + s.Nilf(err, "call tool should not error: %v", err) + s.Truef(toolResult.IsError, "expected tool to return error") + text := toolResult.Content[0].(mcp.TextContent).Text + s.Containsf(text, "command exited with code 2", "expected exit code message, got %q", text) + s.Containsf(text, "Error", "expected error reason included, got %q", text) + }) +} + +func TestNodesDebugExec(t *testing.T) { + suite.Run(t, new(NodesDebugExecSuite)) +} diff --git a/pkg/mcp/openshift_modules.go b/pkg/mcp/openshift_modules.go new file mode 100644 index 000000000..84d982f4a --- /dev/null +++ b/pkg/mcp/openshift_modules.go @@ -0,0 +1,3 @@ +package mcp + +import _ "github.com/containers/kubernetes-mcp-server/pkg/toolsets/openshift" diff --git a/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json b/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json index 1551b4c28..474bb1d05 100644 --- a/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json +++ 
b/pkg/mcp/testdata/toolsets-full-tools-multicluster-enum.json
@@ -195,6 +195,50 @@
     },
     "name": "namespaces_list"
   },
+  {
+    "annotations": {
+      "title": "Nodes: Debug Exec",
+      "readOnlyHint": false,
+      "destructiveHint": true,
+      "idempotentHint": false,
+      "openWorldHint": true
+    },
+    "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.",
+    "inputSchema": {
+      "type": "object",
+      "properties": {
+        "node": {
+          "description": "Name of the node to debug (e.g. worker-0).",
+          "type": "string"
+        },
+        "command": {
+          "description": "Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host <command>' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).",
+          "items": {
+            "type": "string"
+          },
+          "type": "array"
+        },
+        "namespace": {
+          "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').",
+          "type": "string"
+        },
+        "image": {
+          "description": "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.",
+          "type": "string"
+        },
+        "timeout_seconds": {
+          "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).",
+          "minimum": 1,
+          "type": "integer"
+        }
+      },
+      "required": [
+        "node",
+        "command"
+      ]
+    },
+    "name": "nodes_debug_exec"
+  },
   {
     "annotations": {
       "title": "Node: Log",
@@ -220,7 +264,7 @@
       "type": "string"
     },
     "query": {
-      "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/\u003clog-file-name\u003e\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")",
+      "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/<log-file-name>\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")",
       "type": "string"
     },
     "tailLines": {
diff --git a/pkg/mcp/testdata/toolsets-full-tools-multicluster.json b/pkg/mcp/testdata/toolsets-full-tools-multicluster.json
index 6e85e4010..13a956f73 100644
--- a/pkg/mcp/testdata/toolsets-full-tools-multicluster.json
+++ b/pkg/mcp/testdata/toolsets-full-tools-multicluster.json
@@ -175,6 +175,50 @@
     },
     "name": "namespaces_list"
   },
+  {
+    "annotations": {
+      "title": "Nodes: Debug Exec",
+      "readOnlyHint": false,
+      "destructiveHint": true,
+      "idempotentHint": false,
+      "openWorldHint": true
+    },
+    "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.",
+    "inputSchema": {
+      "type": "object",
+      "properties": {
+        "node": {
+          "description": "Name of the node to debug (e.g. worker-0).",
+          "type": "string"
+        },
+        "command": {
+          "description": "Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host <command>' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).",
+          "items": {
+            "type": "string"
+          },
+          "type": "array"
+        },
+        "namespace": {
+          "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').",
+          "type": "string"
+        },
+        "image": {
+          "description": "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.",
+          "type": "string"
+        },
+        "timeout_seconds": {
+          "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).",
+          "minimum": 1,
+          "type": "integer"
+        }
+      },
+      "required": [
+        "node",
+        "command"
+      ]
+    },
+    "name": "nodes_debug_exec"
+  },
   {
     "annotations": {
       "title": "Node: Log",
@@ -196,7 +240,7 @@
       "type": "string"
     },
     "query": {
-      "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/\u003clog-file-name\u003e\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")",
+      "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/<log-file-name>\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")",
       "type": "string"
     },
     "tailLines": {
diff --git a/pkg/mcp/testdata/toolsets-full-tools-openshift.json b/pkg/mcp/testdata/toolsets-full-tools-openshift.json
index fb24138ef..1ac08ca79 100644
--- a/pkg/mcp/testdata/toolsets-full-tools-openshift.json
+++ b/pkg/mcp/testdata/toolsets-full-tools-openshift.json
@@ -139,6 +139,50 @@
     },
     "name": "namespaces_list"
   },
+  {
+    "annotations": {
+      "title": "Nodes: Debug Exec",
+      "readOnlyHint": false,
+      "destructiveHint": true,
+      "idempotentHint": false,
+      "openWorldHint": true
+    },
+    "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.",
+    "inputSchema": {
+      "type": "object",
+      "properties": {
+        "node": {
+          "description": "Name of the node to debug (e.g. worker-0).",
+          "type": "string"
+        },
+        "command": {
+          "description": "Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host <command>' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).",
+          "items": {
+            "type": "string"
+          },
+          "type": "array"
+        },
+        "namespace": {
+          "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').",
+          "type": "string"
+        },
+        "image": {
+          "description": "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.",
+          "type": "string"
+        },
+        "timeout_seconds": {
+          "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).",
+          "minimum": 1,
+          "type": "integer"
+        }
+      },
+      "required": [
+        "node",
+        "command"
+      ]
+    },
+    "name": "nodes_debug_exec"
+  },
   {
     "annotations": {
       "title": "Node: Log",
@@ -156,7 +200,7 @@
       "type": "string"
     },
     "query": {
-      "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/\u003clog-file-name\u003e\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")",
+      "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/<log-file-name>\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")",
       "type": "string"
     },
     "tailLines": {
diff --git a/pkg/mcp/testdata/toolsets-full-tools.json b/pkg/mcp/testdata/toolsets-full-tools.json
index 5a4b51122..961fbb55d 100644
--- a/pkg/mcp/testdata/toolsets-full-tools.json
+++ b/pkg/mcp/testdata/toolsets-full-tools.json
@@ -139,6 +139,50 @@
     },
     "name": "namespaces_list"
   },
+  {
+    "annotations": {
+      "title": "Nodes: Debug Exec",
+      "readOnlyHint": false,
+      "destructiveHint": true,
+      "idempotentHint": false,
+      "openWorldHint": true
+    },
+    "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.",
+    "inputSchema": {
+      "type": "object",
+      "properties": {
+        "node": {
+          "description": "Name of the node to debug (e.g. worker-0).",
+          "type": "string"
+        },
+        "command": {
+          "description": "Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host <command>' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).",
+          "items": {
+            "type": "string"
+          },
+          "type": "array"
+        },
+        "namespace": {
+          "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').",
+          "type": "string"
+        },
+        "image": {
+          "description": "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.",
+          "type": "string"
+        },
+        "timeout_seconds": {
+          "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).",
+          "minimum": 1,
+          "type": "integer"
+        }
+      },
+      "required": [
+        "node",
+        "command"
+      ]
+    },
+    "name": "nodes_debug_exec"
+  },
   {
     "annotations": {
       "title": "Node: Log",
@@ -156,7 +200,7 @@
       "type": "string"
     },
     "query": {
-      "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/\u003clog-file-name\u003e\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")",
+      "description": "query specifies services(s) or files from which to return logs (required). Example: \"kubelet\" to fetch kubelet logs, \"/<log-file-name>\" to fetch a specific log file from the node (e.g., \"/var/log/kubelet.log\" or \"/var/log/kube-proxy.log\")",
      "type": "string"
     },
     "tailLines": {
diff --git a/pkg/mcp/testdata/toolsets-openshift-core-tools.json b/pkg/mcp/testdata/toolsets-openshift-core-tools.json
new file mode 100644
index 000000000..65f3203d7
--- /dev/null
+++ b/pkg/mcp/testdata/toolsets-openshift-core-tools.json
@@ -0,0 +1,46 @@
+[
+  {
+    "annotations": {
+      "title": "Nodes: Debug Exec",
+      "readOnlyHint": false,
+      "destructiveHint": true,
+      "idempotentHint": false,
+      "openWorldHint": true
+    },
+    "description": "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). Commands execute in a chroot of the host filesystem, providing full access to node-level diagnostics. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.",
+    "inputSchema": {
+      "type": "object",
+      "properties": {
+        "node": {
+          "description": "Name of the node to debug (e.g. worker-0).",
+          "type": "string"
+        },
+        "command": {
+          "description": "Command to execute on the node via chroot. All standard debugging utilities are available including systemctl, journalctl, ss, ip, ping, traceroute, nmap, ps, top, lsof, strace, find, tar, rsync, gdb, and more. Provide each argument as a separate array item (e.g. ['systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).",
+          "items": {
+            "type": "string"
+          },
+          "type": "array"
+        },
+        "namespace": {
+          "description": "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').",
+          "type": "string"
+        },
+        "image": {
+          "description": "Container image to use for the debug pod (optional). Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.",
+          "type": "string"
+        },
+        "timeout_seconds": {
+          "description": "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).",
+          "minimum": 1,
+          "type": "integer"
+        }
+      },
+      "required": [
+        "node",
+        "command"
+      ]
+    },
+    "name": "nodes_debug_exec"
+  }
+]
diff --git a/pkg/ocp/nodes_debug.go b/pkg/ocp/nodes_debug.go
new file mode 100644
index 000000000..cb079a9ae
--- /dev/null
+++ b/pkg/ocp/nodes_debug.go
@@ -0,0 +1,326 @@
+package ocp
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/containers/kubernetes-mcp-server/pkg/kubernetes"
+	"github.com/containers/kubernetes-mcp-server/pkg/version"
+
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	utilrand "k8s.io/apimachinery/pkg/util/rand"
+	"k8s.io/utils/ptr"
+)
+
+const (
+	// DefaultNodeDebugImage is the UBI9 toolbox image that provides comprehensive debugging and troubleshooting utilities.
+	// This image includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap),
+	// process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), debugging tools (gdb),
+	// and many other utilities commonly needed for node-level debugging and diagnostics.
+	DefaultNodeDebugImage = "registry.access.redhat.com/ubi9/toolbox:latest"
+	// NodeDebugContainerName is the name used for the debug container, matching 'oc debug node' defaults.
+	NodeDebugContainerName = "debug"
+	// DefaultNodeDebugTimeout is the maximum time to wait for the debug pod to finish executing.
+	DefaultNodeDebugTimeout = 1 * time.Minute
+)
+
+// NodesDebugExec mimics `oc debug node/<node> -- <command>` by creating a privileged pod on the target
+// node, running the provided command, collecting its output, and removing the pod afterwards.
+// The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node resources.
+//
+// When namespace is empty, the configured namespace (or "default" if none) is used. When image is empty the
+// default debug image is used. Timeout controls how long we wait for the pod to complete.
+func NodesDebugExec(
+	ctx context.Context,
+	k OpenshiftClient,
+	namespace string,
+	nodeName string,
+	image string,
+	command []string,
+	timeout time.Duration,
+) (string, error) {
+	if nodeName == "" {
+		return "", errors.New("node name is required")
+	}
+	if len(command) == 0 {
+		return "", errors.New("command is required")
+	}
+
+	ns := k.NamespaceOrDefault(namespace)
+	if ns == "" {
+		ns = "default"
+	}
+	debugImage := image
+	if debugImage == "" {
+		debugImage = DefaultNodeDebugImage
+	}
+	if timeout <= 0 {
+		timeout = DefaultNodeDebugTimeout
+	}
+
+	// Create the debug pod
+	created, err := createDebugPod(ctx, k, nodeName, ns, debugImage, command)
+	if err != nil {
+		return "", err
+	}
+
+	// Ensure the pod is deleted regardless of completion state.
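+	// The deletion below deliberately uses context.Background() with its own
+	// 30s timeout so the pod is still cleaned up even if the caller's ctx has
+	// already been cancelled or has timed out.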
+ defer func() { + deleteCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + pods, err := k.AccessControlClientset().Pods(ns) + if err == nil { + _ = pods.Delete(deleteCtx, created.Name, metav1.DeleteOptions{}) + } + }() + + // Poll for debug pod completion + terminated, lastPod, waitMsg, err := pollForCompletion(ctx, k, ns, created.Name, timeout) + if err != nil { + return "", err + } + + // Retrieve the logs + logs, err := retrieveLogs(ctx, k, ns, created.Name) + if err != nil { + return "", err + } + + // Process the results + return processResults(terminated, lastPod, waitMsg, logs) +} + +// createDebugPod creates a privileged pod on the target node to run debug commands. +func createDebugPod( + ctx context.Context, + k OpenshiftClient, + nodeName string, + namespace string, + image string, + command []string, +) (*corev1.Pod, error) { + sanitizedNode := sanitizeForName(nodeName) + hostPathType := corev1.HostPathDirectory + + // Generate a unique name + suffix := utilrand.String(5) + maxNodeLen := 63 - len("node-debug-") - 1 - len(suffix) + if maxNodeLen < 1 { + maxNodeLen = 1 + } + if len(sanitizedNode) > maxNodeLen { + sanitizedNode = sanitizedNode[:maxNodeLen] + } + podName := fmt.Sprintf("node-debug-%s-%s", sanitizedNode, suffix) + + debugPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: namespace, + Labels: map[string]string{ + kubernetes.AppKubernetesManagedBy: version.BinaryName, + kubernetes.AppKubernetesComponent: "node-debug", + kubernetes.AppKubernetesName: fmt.Sprintf("node-debug-%s", sanitizedNode), + }, + }, + Spec: corev1.PodSpec{ + AutomountServiceAccountToken: ptr.To(false), + HostNetwork: true, + HostPID: true, + HostIPC: true, + NodeName: nodeName, + RestartPolicy: corev1.RestartPolicyNever, + SecurityContext: &corev1.PodSecurityContext{ + RunAsUser: ptr.To[int64](0), + }, + Tolerations: []corev1.Toleration{ + {Operator: corev1.TolerationOpExists}, + {Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoSchedule}, + {Operator: corev1.TolerationOpExists, Effect: corev1.TaintEffectNoExecute}, + }, + Volumes: []corev1.Volume{ + { + Name: "host-root", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: "/", + Type: &hostPathType, + }, + }, + }, + }, + Containers: []corev1.Container{ + { + Name: NodeDebugContainerName, + Image: image, + ImagePullPolicy: corev1.PullIfNotPresent, + Command: command, + SecurityContext: &corev1.SecurityContext{ + Privileged: ptr.To(true), + RunAsUser: ptr.To[int64](0), + }, + VolumeMounts: []corev1.VolumeMount{ + {Name: "host-root", MountPath: "/host"}, + }, + }, + }, + }, + } + + // Create the pod using AccessControlClientset + pods, err := k.AccessControlClientset().Pods(namespace) + if err != nil { + return nil, fmt.Errorf("failed to get pods interface: %w", err) + } + + created, err := pods.Create(ctx, debugPod, metav1.CreateOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to create debug pod: %w", err) + } + + return created, nil +} + +// pollForCompletion polls the debug pod until it completes or times out. 
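+// It returns the terminated state of the debug container (nil if it never
+// terminated), the last pod object observed, and the most recent waiting
+// message seen, so the caller can build a precise error for each outcome.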
+func pollForCompletion( + ctx context.Context, + k OpenshiftClient, + namespace string, + podName string, + timeout time.Duration, +) (*corev1.ContainerStateTerminated, *corev1.Pod, string, error) { + pollCtx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + var ( + lastPod *corev1.Pod + terminated *corev1.ContainerStateTerminated + waitMsg string + ) + + for { + // Get pod status using AccessControlClientset + pods, getErr := k.AccessControlClientset().Pods(namespace) + if getErr != nil { + return nil, nil, "", fmt.Errorf("failed to get pods interface: %w", getErr) + } + + current, err := pods.Get(pollCtx, podName, metav1.GetOptions{}) + if err != nil { + return nil, nil, "", fmt.Errorf("failed to get debug pod status: %w", err) + } + lastPod = current + + if status := containerStatusByName(current.Status.ContainerStatuses, NodeDebugContainerName); status != nil { + if status.State.Waiting != nil { + waitMsg = fmt.Sprintf("container waiting: %s", status.State.Waiting.Reason) + // Image pull issues should fail fast. + if status.State.Waiting.Reason == "ErrImagePull" || status.State.Waiting.Reason == "ImagePullBackOff" { + return nil, nil, "", fmt.Errorf("debug container failed to start (%s): %s", status.State.Waiting.Reason, status.State.Waiting.Message) + } + } + if status.State.Terminated != nil { + terminated = status.State.Terminated + break + } + } + + if current.Status.Phase == corev1.PodFailed { + break + } + + // Wait for the next tick interval before checking pod status again, or timeout if context is done. + select { + case <-pollCtx.Done(): + return nil, nil, "", fmt.Errorf("timed out waiting for debug pod %s to complete: %w", podName, pollCtx.Err()) + case <-ticker.C: + } + } + + return terminated, lastPod, waitMsg, nil +} + +// retrieveLogs retrieves the logs from the debug pod. +func retrieveLogs(ctx context.Context, k OpenshiftClient, namespace, podName string) (string, error) { + logCtx, logCancel := context.WithTimeout(ctx, 30*time.Second) + defer logCancel() + logs, logErr := k.PodsLog(logCtx, namespace, podName, NodeDebugContainerName, false, 0) + if logErr != nil { + return "", fmt.Errorf("failed to retrieve debug pod logs: %w", logErr) + } + return strings.TrimSpace(logs), nil +} + +// processResults processes the debug pod completion status and returns the appropriate result. 
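+// A zero exit code yields the collected logs as the result; any other outcome
+// (non-zero exit, pod failure, a still-waiting container, or no terminal state
+// at all) is converted into an error that embeds the logs when available.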
+func processResults(terminated *corev1.ContainerStateTerminated, lastPod *corev1.Pod, waitMsg, logs string) (string, error) { + if terminated != nil { + if terminated.ExitCode != 0 { + errMsg := fmt.Sprintf("command exited with code %d", terminated.ExitCode) + if terminated.Reason != "" { + errMsg = fmt.Sprintf("%s (%s)", errMsg, terminated.Reason) + } + if terminated.Message != "" { + errMsg = fmt.Sprintf("%s: %s", errMsg, terminated.Message) + } + if logs != "" { + errMsg = fmt.Sprintf("%s\nOutput:\n%s", errMsg, logs) + } + return "", errors.New(errMsg) + } + return logs, nil + } + + if lastPod != nil && lastPod.Status.Reason != "" { + if logs != "" { + return "", fmt.Errorf("debug pod failed: %s\nOutput:\n%s", lastPod.Status.Reason, logs) + } + return "", fmt.Errorf("debug pod failed: %s", lastPod.Status.Reason) + } + if waitMsg != "" { + if logs != "" { + return "", fmt.Errorf("debug container did not complete: %s\nOutput:\n%s", waitMsg, logs) + } + return "", fmt.Errorf("debug container did not complete: %s", waitMsg) + } + if logs != "" { + return "", fmt.Errorf("debug container did not reach a terminal state\nOutput:\n%s", logs) + } + return "", errors.New("debug container did not reach a terminal state") +} + +func sanitizeForName(name string) string { + lower := strings.ToLower(name) + var b strings.Builder + b.Grow(len(lower)) + for _, r := range lower { + if (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' { + b.WriteRune(r) + continue + } + b.WriteRune('-') + } + sanitized := strings.Trim(b.String(), "-") + if sanitized == "" { + sanitized = "node" + } + if len(sanitized) > 40 { + sanitized = sanitized[:40] + } + return sanitized +} + +func containerStatusByName(statuses []corev1.ContainerStatus, name string) *corev1.ContainerStatus { + for idx := range statuses { + if statuses[idx].Name == name { + return &statuses[idx] + } + } + return nil +} diff --git a/pkg/ocp/nodes_debug_test.go b/pkg/ocp/nodes_debug_test.go new file mode 100644 index 000000000..1c86c92f8 --- /dev/null +++ b/pkg/ocp/nodes_debug_test.go @@ -0,0 +1,429 @@ +package ocp + +import ( + "context" + "fmt" + "strings" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" +) + +func TestNodesDebugExecCreatesPrivilegedPod(t *testing.T) { + env := NewNodeDebugTestEnv(t) + env.Pods.Logs = "kernel 6.8" + + out, err := NodesDebugExec(context.Background(), env.Kubernetes, "", "worker-0", "", []string{"uname", "-a"}, 2*time.Minute) + if err != nil { + t.Fatalf("NodesDebugExec returned error: %v", err) + } + if out != "kernel 6.8" { + t.Fatalf("unexpected command output: %q", out) + } + + created := env.Pods.Created + if created == nil { + t.Fatalf("expected debug pod to be created") + } + if created.Namespace != "default" { + t.Fatalf("expected default namespace fallback, got %q", created.Namespace) + } + if created.Spec.NodeName != "worker-0" { + t.Fatalf("expected pod to target node worker-0, got %q", created.Spec.NodeName) + } + if !env.Pods.Deleted { + t.Fatalf("expected debug pod to be deleted after execution") + } + + if len(created.Spec.Containers) != 1 { + t.Fatalf("expected single container in debug pod") + } + container := created.Spec.Containers[0] + if container.Image != DefaultNodeDebugImage { + t.Fatalf("expected default image %q, got %q", DefaultNodeDebugImage, container.Image) + } + expectedCommand := []string{"uname", "-a"} + if len(container.Command) != len(expectedCommand) { + t.Fatalf("unexpected command length, got %v", container.Command) + } + for i, part := range expectedCommand { + if 
container.Command[i] != part { + t.Fatalf("command[%d] = %q, expected %q", i, container.Command[i], part) + } + } + if container.SecurityContext == nil || container.SecurityContext.Privileged == nil || !*container.SecurityContext.Privileged { + t.Fatalf("expected container to run privileged") + } + if len(container.VolumeMounts) != 1 || container.VolumeMounts[0].MountPath != "/host" { + t.Fatalf("expected container to mount host root at /host") + } + + if created.Spec.SecurityContext == nil || created.Spec.SecurityContext.RunAsUser == nil || *created.Spec.SecurityContext.RunAsUser != 0 { + t.Fatalf("expected pod security context to run as root") + } + + if len(created.Spec.Volumes) != 1 || created.Spec.Volumes[0].HostPath == nil { + t.Fatalf("expected host root volume to be configured") + } +} + +func TestNodesDebugExecReturnsErrorForNonZeroExit(t *testing.T) { + env := NewNodeDebugTestEnv(t) + env.Pods.ExitCode = 5 + env.Pods.TerminatedReason = "Error" + env.Pods.TerminatedMessage = "some failure" + env.Pods.Logs = "bad things happened" + + out, err := NodesDebugExec(context.Background(), env.Kubernetes, "debug-ns", "infra-node", "registry.example/custom:latest", []string{"journalctl", "-xe"}, time.Minute) + if err == nil { + t.Fatalf("expected error for non-zero exit code") + } + // Logs should be included in the error message + if !strings.Contains(err.Error(), "bad things happened") { + t.Fatalf("expected error to contain logs, got: %v", err) + } + if !strings.Contains(err.Error(), "command exited with code 5") { + t.Fatalf("expected error to contain exit code, got: %v", err) + } + if out != "" { + t.Fatalf("expected empty output on error, got %q", out) + } + + created := env.Pods.Created + if created == nil { + t.Fatalf("expected pod to be created") + } + if created.Namespace != "debug-ns" { + t.Fatalf("expected provided namespace to be used, got %q", created.Namespace) + } + if containerImage := created.Spec.Containers[0].Image; containerImage != "registry.example/custom:latest" { + t.Fatalf("expected custom image to be used, got %q", containerImage) + } +} + +func TestCreateDebugPod(t *testing.T) { + env := NewNodeDebugTestEnv(t) + + created, err := createDebugPod(context.Background(), env.Kubernetes, "worker-1", "test-ns", "custom:v1", []string{"ls", "-la"}) + if err != nil { + t.Fatalf("createDebugPod failed: %v", err) + } + if created == nil { + t.Fatalf("expected pod to be created") + } + if created.Namespace != "test-ns" { + t.Fatalf("expected namespace test-ns, got %q", created.Namespace) + } + if created.Spec.NodeName != "worker-1" { + t.Fatalf("expected node worker-1, got %q", created.Spec.NodeName) + } + if !strings.HasPrefix(created.Name, "node-debug-worker-1-") { + t.Fatalf("unexpected pod name: %q", created.Name) + } + if len(created.Name) > 63 { + t.Fatalf("pod name exceeds DNS label length: %d characters", len(created.Name)) + } + if len(created.Spec.Containers) != 1 { + t.Fatalf("expected 1 container, got %d", len(created.Spec.Containers)) + } + container := created.Spec.Containers[0] + if container.Image != "custom:v1" { + t.Fatalf("expected image custom:v1, got %q", container.Image) + } + expectedCmd := []string{"ls", "-la"} + if len(container.Command) != len(expectedCmd) { + t.Fatalf("expected %d command parts, got %d", len(expectedCmd), len(container.Command)) + } + for i, part := range expectedCmd { + if container.Command[i] != part { + t.Fatalf("command[%d] = %q, expected %q", i, container.Command[i], part) + } + } + if container.SecurityContext == nil || 
!*container.SecurityContext.Privileged { + t.Fatalf("expected privileged container") + } +} + +func TestPollForCompletion(t *testing.T) { + tests := []struct { + name string + exitCode int32 + terminatedReason string + waitingReason string + waitingMessage string + expectError bool + expectTerminated bool + errorContains []string + expectedExitCode int32 + expectedReason string + }{ + { + name: "successful completion", + exitCode: 0, + expectTerminated: true, + expectedExitCode: 0, + }, + { + name: "non-zero exit code", + exitCode: 42, + terminatedReason: "Error", + expectTerminated: true, + expectedExitCode: 42, + expectedReason: "Error", + }, + { + name: "image pull error", + waitingReason: "ErrImagePull", + waitingMessage: "image not found", + expectError: true, + errorContains: []string{"ErrImagePull", "image not found"}, + }, + { + name: "image pull backoff", + waitingReason: "ImagePullBackOff", + waitingMessage: "back-off pulling image", + expectError: true, + errorContains: []string{"ImagePullBackOff", "back-off pulling image"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + env := NewNodeDebugTestEnv(t) + env.Pods.ExitCode = tt.exitCode + env.Pods.TerminatedReason = tt.terminatedReason + env.Pods.WaitingReason = tt.waitingReason + env.Pods.WaitingMessage = tt.waitingMessage + + created, _ := createDebugPod(context.Background(), env.Kubernetes, "node-1", "default", DefaultNodeDebugImage, []string{"echo", "test"}) + + terminated, lastPod, waitMsg, err := pollForCompletion(context.Background(), env.Kubernetes, "default", created.Name, time.Minute) + + if tt.expectError { + if err == nil { + t.Fatalf("expected error but got none") + } + for _, substr := range tt.errorContains { + if !strings.Contains(err.Error(), substr) { + t.Fatalf("expected error to contain %q, got: %v", substr, err) + } + } + return + } + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + if tt.expectTerminated { + if terminated == nil { + t.Fatalf("expected terminated state") + } + if terminated.ExitCode != tt.expectedExitCode { + t.Fatalf("expected exit code %d, got %d", tt.expectedExitCode, terminated.ExitCode) + } + if tt.expectedReason != "" && terminated.Reason != tt.expectedReason { + t.Fatalf("expected reason %q, got %q", tt.expectedReason, terminated.Reason) + } + if lastPod == nil { + t.Fatalf("expected lastPod to be set") + } + } + + if tt.waitingReason == "" && waitMsg != "" { + t.Fatalf("expected no wait message, got %q", waitMsg) + } + }) + } +} + +func TestRetrieveLogs(t *testing.T) { + env := NewNodeDebugTestEnv(t) + env.Pods.Logs = " some output with whitespace \n" + + created, _ := createDebugPod(context.Background(), env.Kubernetes, "node-1", "default", DefaultNodeDebugImage, []string{"echo", "test"}) + + logs, err := retrieveLogs(context.Background(), env.Kubernetes, "default", created.Name) + if err != nil { + t.Fatalf("retrieveLogs failed: %v", err) + } + if logs != "some output with whitespace" { + t.Fatalf("expected trimmed logs, got %q", logs) + } +} + +func TestProcessResults(t *testing.T) { + tests := []struct { + name string + terminated *corev1.ContainerStateTerminated + pod *corev1.Pod + waitMsg string + logs string + expectError bool + errorContains []string + expectLogs bool + expectedResult string + }{ + { + name: "successful completion", + terminated: &corev1.ContainerStateTerminated{ + ExitCode: 0, + }, + logs: "success output", + expectError: false, + expectLogs: true, + expectedResult: "success output", + }, + { + name: "non-zero 
exit code with logs", + terminated: &corev1.ContainerStateTerminated{ + ExitCode: 127, + Reason: "CommandNotFound", + Message: "command not found", + }, + logs: "error logs", + expectError: true, + errorContains: []string{"127", "CommandNotFound", "command not found", "error logs", "Output:"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "non-zero exit code without reason or message but with logs", + terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + }, + logs: "failed output", + expectError: true, + errorContains: []string{"command exited with code 1", "failed output", "Output:"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "non-zero exit code without logs", + terminated: &corev1.ContainerStateTerminated{ + ExitCode: 1, + }, + logs: "", + expectError: true, + errorContains: []string{"command exited with code 1"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "pod failed with logs", + pod: &corev1.Pod{ + Status: corev1.PodStatus{ + Reason: "Evicted", + }, + }, + logs: "pod evicted logs", + expectError: true, + errorContains: []string{"Evicted", "pod evicted logs", "Output:"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "pod failed without logs", + pod: &corev1.Pod{ + Status: corev1.PodStatus{ + Reason: "Evicted", + }, + }, + logs: "", + expectError: true, + errorContains: []string{"Evicted"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "container waiting with logs", + waitMsg: "container waiting: ImagePullBackOff", + logs: "waiting logs", + expectError: true, + errorContains: []string{"did not complete", "waiting logs", "Output:"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "container waiting without logs", + waitMsg: "container waiting: ImagePullBackOff", + logs: "", + expectError: true, + errorContains: []string{"did not complete"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "no terminal state with logs", + logs: "incomplete logs", + expectError: true, + errorContains: []string{"did not reach a terminal state", "incomplete logs", "Output:"}, + expectLogs: false, + expectedResult: "", + }, + { + name: "no terminal state without logs", + logs: "", + expectError: true, + errorContains: []string{"did not reach a terminal state"}, + expectLogs: false, + expectedResult: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := processResults(tt.terminated, tt.pod, tt.waitMsg, tt.logs) + + if tt.expectError { + if err == nil { + t.Fatalf("expected error but got none") + } + for _, substr := range tt.errorContains { + if !strings.Contains(err.Error(), substr) { + t.Fatalf("expected error to contain %q, got: %v", substr, err) + } + } + // Verify logs are NOT in the result when there's an error + if result != "" { + t.Fatalf("expected empty result on error, got %q", result) + } + } else { + if err != nil { + t.Fatalf("expected no error, got: %v", err) + } + // Verify logs ARE in the result when successful + if result != tt.expectedResult { + t.Fatalf("expected result %q, got %q", tt.expectedResult, result) + } + } + }) + } +} + +func TestSanitizeForName(t *testing.T) { + tests := []struct { + input string + expected string + }{ + {"worker-0", "worker-0"}, + {"WORKER-0", "worker-0"}, + {"worker.0", "worker-0"}, + {"worker_0", "worker-0"}, + {"ip-10-0-1-42.ec2.internal", "ip-10-0-1-42-ec2-internal"}, + {"", "node"}, + {"---", "node"}, + {strings.Repeat("a", 50), strings.Repeat("a", 40)}, + {"Worker-Node_123.domain", 
"worker-node-123-domain"}, + } + + for _, tt := range tests { + t.Run(fmt.Sprintf("sanitize(%q)", tt.input), func(t *testing.T) { + result := sanitizeForName(tt.input) + if result != tt.expected { + t.Fatalf("expected %q, got %q", tt.expected, result) + } + }) + } +} diff --git a/pkg/ocp/ocp_client.go b/pkg/ocp/ocp_client.go new file mode 100644 index 000000000..5c2766b4a --- /dev/null +++ b/pkg/ocp/ocp_client.go @@ -0,0 +1,44 @@ +package ocp + +import ( + "context" + + "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" + + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" +) + +// ClientsetInterface defines the interface for access-controlled clientset operations. +// This allows code to work with kubernetes.AccessControlClientset through an interface, +// making it easier to test and decouple from the concrete implementation. +type ClientsetInterface interface { + Pods(namespace string) (corev1client.PodInterface, error) +} + +// OpenshiftClient defines a minimal interface for kubernetes operations commonly needed +// by OCP toolsets. This allows for easier testing and decoupling from the concrete +// kubernetes.Kubernetes type. +type OpenshiftClient interface { + NamespaceOrDefault(namespace string) string + AccessControlClientset() ClientsetInterface + PodsLog(ctx context.Context, namespace, name, container string, previous bool, tail int64) (string, error) +} + +// OpenshiftClientAdapter adapts kubernetes.Kubernetes to implement OpenshiftClient. +// This allows production code to use the concrete *kubernetes.Kubernetes type +// while tests can use a mock implementation. +type OpenshiftClientAdapter struct { + *kubernetes.Kubernetes +} + +// NewOpenshiftClient creates a new adapter that wraps kubernetes.Kubernetes +// to implement the OpenshiftClient interface. +func NewOpenshiftClient(k *kubernetes.Kubernetes) *OpenshiftClientAdapter { + return &OpenshiftClientAdapter{Kubernetes: k} +} + +// AccessControlClientset returns the access control clientset as an interface. +// This satisfies the OpenshiftClient interface. +func (c *OpenshiftClientAdapter) AccessControlClientset() ClientsetInterface { + return c.Kubernetes.AccessControlClientset() +} diff --git a/pkg/ocp/testhelpers.go b/pkg/ocp/testhelpers.go new file mode 100644 index 000000000..e35d1d9fc --- /dev/null +++ b/pkg/ocp/testhelpers.go @@ -0,0 +1,163 @@ +package ocp + +import ( + "context" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + schemek8s "k8s.io/client-go/kubernetes/scheme" + corev1client "k8s.io/client-go/kubernetes/typed/core/v1" + restclient "k8s.io/client-go/rest" +) + +// NodeDebugTestEnv bundles a test Kubernetes client with a controllable pods client for tests. +type NodeDebugTestEnv struct { + Kubernetes *FakeKubernetesClient + Pods *FakePodInterface +} + +// NewNodeDebugTestEnv constructs a testing harness for exercising NodesDebugExec. 
+func NewNodeDebugTestEnv(t *testing.T) *NodeDebugTestEnv { + t.Helper() + + podsClient := &FakePodInterface{} + fakeK8s := &FakeKubernetesClient{ + pods: podsClient, + namespace: "default", + } + + return &NodeDebugTestEnv{ + Kubernetes: fakeK8s, + Pods: podsClient, + } +} + +// FakeKubernetesClient implements the OpenshiftClient interface for testing +type FakeKubernetesClient struct { + pods *FakePodInterface + namespace string +} + +// AccessControlClientset returns a fake clientset for testing +func (f *FakeKubernetesClient) AccessControlClientset() ClientsetInterface { + return &FakeAccessControlClientset{pods: f.pods} +} + +func (f *FakeKubernetesClient) NamespaceOrDefault(namespace string) string { + if namespace == "" { + return f.namespace + } + return namespace +} + +func (f *FakeKubernetesClient) PodsLog(ctx context.Context, namespace, name, container string, previous bool, tail int64) (string, error) { + req := f.pods.GetLogs(name, &corev1.PodLogOptions{Container: container, Previous: previous}) + res := req.Do(ctx) + if res.Error() != nil { + return "", res.Error() + } + rawData, err := res.Raw() + if err != nil { + return "", err + } + return string(rawData), nil +} + +// FakeAccessControlClientset mimics kubernetes.AccessControlClientset for testing +type FakeAccessControlClientset struct { + pods *FakePodInterface +} + +func (f *FakeAccessControlClientset) Pods(namespace string) (corev1client.PodInterface, error) { + return f.pods, nil +} + +// FakePodInterface implements corev1client.PodInterface with deterministic behaviour for tests. +type FakePodInterface struct { + corev1client.PodInterface + Created *corev1.Pod + Deleted bool + ExitCode int32 + TerminatedReason string + TerminatedMessage string + WaitingReason string + WaitingMessage string + Logs string +} + +func (f *FakePodInterface) Create(ctx context.Context, pod *corev1.Pod, opts metav1.CreateOptions) (*corev1.Pod, error) { + copy := pod.DeepCopy() + if copy.Name == "" && copy.GenerateName != "" { + copy.Name = copy.GenerateName + "test" + } + f.Created = copy + return copy.DeepCopy(), nil +} + +func (f *FakePodInterface) Get(ctx context.Context, name string, opts metav1.GetOptions) (*corev1.Pod, error) { + if f.Created == nil { + return nil, fmt.Errorf("pod not created yet") + } + pod := f.Created.DeepCopy() + + // If waiting state is set, return that instead of terminated + if f.WaitingReason != "" { + waiting := &corev1.ContainerStateWaiting{Reason: f.WaitingReason} + if f.WaitingMessage != "" { + waiting.Message = f.WaitingMessage + } + pod.Status.ContainerStatuses = []corev1.ContainerStatus{{ + Name: NodeDebugContainerName, + State: corev1.ContainerState{Waiting: waiting}, + }} + pod.Status.Phase = corev1.PodPending + return pod, nil + } + + // Otherwise return terminated state + terminated := &corev1.ContainerStateTerminated{ExitCode: f.ExitCode} + if f.TerminatedReason != "" { + terminated.Reason = f.TerminatedReason + } + if f.TerminatedMessage != "" { + terminated.Message = f.TerminatedMessage + } + pod.Status.ContainerStatuses = []corev1.ContainerStatus{{ + Name: NodeDebugContainerName, + State: corev1.ContainerState{Terminated: terminated}, + }} + pod.Status.Phase = corev1.PodSucceeded + return pod, nil +} + +func (f *FakePodInterface) Delete(ctx context.Context, name string, opts metav1.DeleteOptions) error { + f.Deleted = true + return nil +} + +func (f *FakePodInterface) GetLogs(name string, opts *corev1.PodLogOptions) *restclient.Request { + body := io.NopCloser(strings.NewReader(f.Logs)) + 
client := &http.Client{Transport: roundTripperFunc(func(*http.Request) (*http.Response, error) { + return &http.Response{StatusCode: http.StatusOK, Body: body}, nil + })} + content := restclient.ClientContentConfig{ + ContentType: runtime.ContentTypeJSON, + GroupVersion: schema.GroupVersion{Version: "v1"}, + Negotiator: runtime.NewClientNegotiator(schemek8s.Codecs.WithoutConversion(), schema.GroupVersion{Version: "v1"}), + } + return restclient.NewRequestWithClient(&url.URL{Scheme: "https", Host: "localhost"}, "", content, client).Verb("GET") +} + +type roundTripperFunc func(*http.Request) (*http.Response, error) + +func (f roundTripperFunc) RoundTrip(req *http.Request) (*http.Response, error) { + return f(req) +} diff --git a/pkg/toolsets/openshift/nodes.go b/pkg/toolsets/openshift/nodes.go new file mode 100644 index 000000000..481cd53bb --- /dev/null +++ b/pkg/toolsets/openshift/nodes.go @@ -0,0 +1,126 @@ +package openshift + +import ( + "errors" + "fmt" + "time" + + "github.com/google/jsonschema-go/jsonschema" + "k8s.io/utils/ptr" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + "github.com/containers/kubernetes-mcp-server/pkg/ocp" +) + +func initNodes() []api.ServerTool { + return []api.ServerTool{ + { + Tool: api.Tool{ + Name: "nodes_debug_exec", + Description: "Run commands on an OpenShift node using a privileged debug pod with comprehensive troubleshooting utilities. The debug pod uses the UBI9 toolbox image which includes: systemd tools (systemctl, journalctl), networking tools (ss, ip, ping, traceroute, nmap), process tools (ps, top, lsof, strace), file system tools (find, tar, rsync), and debugging tools (gdb). The host filesystem is mounted at /host, allowing commands to chroot /host if needed to access node-level resources. Output is truncated to the most recent 100 lines, so prefer filters like grep when expecting large logs.", + InputSchema: &jsonschema.Schema{ + Type: "object", + Properties: map[string]*jsonschema.Schema{ + "node": { + Type: "string", + Description: "Name of the node to debug (e.g. worker-0).", + }, + "command": { + Type: "array", + Description: "Command to execute on the node. All standard debugging utilities from the UBI9 toolbox are available. The host filesystem is mounted at /host - use 'chroot /host ' to access node-level resources, or run commands directly in the toolbox environment. Provide each argument as a separate array item (e.g. ['chroot', '/host', 'systemctl', 'status', 'kubelet'] or ['journalctl', '-u', 'kubelet', '--since', '1 hour ago']).", + Items: &jsonschema.Schema{Type: "string"}, + }, + "namespace": { + Type: "string", + Description: "Namespace to create the temporary debug pod in (optional, defaults to the current namespace or 'default').", + }, + "image": { + Type: "string", + Description: "Container image to use for the debug pod (optional). 
Defaults to registry.access.redhat.com/ubi9/toolbox:latest which provides comprehensive debugging and troubleshooting utilities.", + }, + "timeout_seconds": { + Type: "integer", + Description: "Maximum time to wait for the command to complete before timing out (optional, defaults to 60 seconds).", + Minimum: ptr.To(float64(1)), + }, + }, + Required: []string{"node", "command"}, + }, + Annotations: api.ToolAnnotations{ + Title: "Nodes: Debug Exec", + ReadOnlyHint: ptr.To(false), + DestructiveHint: ptr.To(true), + IdempotentHint: ptr.To(false), + OpenWorldHint: ptr.To(true), + }, + }, + Handler: nodesDebugExec, + }, + } +} + +func nodesDebugExec(params api.ToolHandlerParams) (*api.ToolCallResult, error) { + nodeArg := params.GetArguments()["node"] + nodeName, ok := nodeArg.(string) + if nodeArg == nil || !ok || nodeName == "" { + return api.NewToolCallResult("", errors.New("missing required argument: node")), nil + } + + commandArg := params.GetArguments()["command"] + command, err := toStringSlice(commandArg) + if err != nil { + return api.NewToolCallResult("", fmt.Errorf("invalid command argument: %w", err)), nil + } + + namespace := "" + if nsArg, ok := params.GetArguments()["namespace"].(string); ok { + namespace = nsArg + } + + image := "" + if imageArg, ok := params.GetArguments()["image"].(string); ok { + image = imageArg + } + + var timeout time.Duration + if timeoutRaw, exists := params.GetArguments()["timeout_seconds"]; exists && timeoutRaw != nil { + switch v := timeoutRaw.(type) { + case float64: + timeout = time.Duration(int64(v)) * time.Second + case int: + timeout = time.Duration(v) * time.Second + case int64: + timeout = time.Duration(v) * time.Second + default: + return api.NewToolCallResult("", errors.New("timeout_seconds must be a numeric value")), nil + } + } + + output, execErr := ocp.NodesDebugExec(params.Context, ocp.NewOpenshiftClient(params.Kubernetes), namespace, nodeName, image, command, timeout) + if output == "" && execErr == nil { + output = fmt.Sprintf("Command executed successfully on node %s but produced no output.", nodeName) + } + return api.NewToolCallResult(output, execErr), nil +} + +func toStringSlice(arg any) ([]string, error) { + if arg == nil { + return nil, errors.New("command is required") + } + raw, ok := arg.([]interface{}) + if !ok { + return nil, errors.New("command must be an array of strings") + } + if len(raw) == 0 { + return nil, errors.New("command array cannot be empty") + } + command := make([]string, 0, len(raw)) + for _, item := range raw { + str, ok := item.(string) + if !ok { + return nil, errors.New("command items must be strings") + } + command = append(command, str) + } + return command, nil +} diff --git a/pkg/toolsets/openshift/nodes_test.go b/pkg/toolsets/openshift/nodes_test.go new file mode 100644 index 000000000..e3cddf2ca --- /dev/null +++ b/pkg/toolsets/openshift/nodes_test.go @@ -0,0 +1,95 @@ +package openshift + +import ( + "context" + "testing" + "time" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + "github.com/containers/kubernetes-mcp-server/pkg/ocp" +) + +type staticRequest struct { + args map[string]any +} + +func (s staticRequest) GetArguments() map[string]any { + return s.args +} + +func TestNodesDebugExecHandlerValidatesInput(t *testing.T) { + t.Run("missing node", func(t *testing.T) { + params := api.ToolHandlerParams{ + Context: context.Background(), + ToolCallRequest: staticRequest{args: map[string]any{}}, + } + result, err := nodesDebugExec(params) + if err != nil { + t.Fatalf("handler returned 
error: %v", err) + } + if result.Error == nil || result.Error.Error() != "missing required argument: node" { + t.Fatalf("unexpected error: %v", result.Error) + } + }) + + t.Run("invalid command type", func(t *testing.T) { + params := api.ToolHandlerParams{ + Context: context.Background(), + ToolCallRequest: staticRequest{args: map[string]any{ + "node": "worker-0", + "command": "ls -la", + }}, + } + result, err := nodesDebugExec(params) + if err != nil { + t.Fatalf("handler returned error: %v", err) + } + if result.Error == nil || result.Error.Error() != "invalid command argument: command must be an array of strings" { + t.Fatalf("unexpected error: %v", result.Error) + } + }) +} + +func TestNodesDebugExecHandlerExecutesCommand(t *testing.T) { + env := ocp.NewNodeDebugTestEnv(t) + env.Pods.Logs = "done" + + // Call NodesDebugExec directly instead of going through the handler + // This avoids the need to mock the full kubernetes.Kubernetes type + output, err := ocp.NodesDebugExec( + context.Background(), + env.Kubernetes, + "debug", + "infra-node", + "registry.local/debug:latest", + []string{"systemctl", "status", "kubelet"}, + 15*time.Second, + ) + + if err != nil { + t.Fatalf("NodesDebugExec returned error: %v", err) + } + if output != "done" { + t.Fatalf("unexpected output: %q", output) + } + + created := env.Pods.Created + if created == nil { + t.Fatalf("expected pod creation") + } + if created.Namespace != "debug" { + t.Fatalf("expected namespace override, got %q", created.Namespace) + } + if created.Spec.Containers[0].Image != "registry.local/debug:latest" { + t.Fatalf("expected custom image, got %q", created.Spec.Containers[0].Image) + } + expectedCommand := []string{"systemctl", "status", "kubelet"} + if len(created.Spec.Containers[0].Command) != len(expectedCommand) { + t.Fatalf("unexpected command length: %v", created.Spec.Containers[0].Command) + } + for i, part := range expectedCommand { + if created.Spec.Containers[0].Command[i] != part { + t.Fatalf("command[%d]=%q expected %q", i, created.Spec.Containers[0].Command[i], part) + } + } +} diff --git a/pkg/toolsets/openshift/toolset.go b/pkg/toolsets/openshift/toolset.go new file mode 100644 index 000000000..d11e4b668 --- /dev/null +++ b/pkg/toolsets/openshift/toolset.go @@ -0,0 +1,31 @@ +package openshift + +import ( + "slices" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" + "github.com/containers/kubernetes-mcp-server/pkg/toolsets" +) + +type Toolset struct{} + +var _ api.Toolset = (*Toolset)(nil) + +func (t *Toolset) GetName() string { + return "openshift-core" +} + +func (t *Toolset) GetDescription() string { + return "Core OpenShift-specific tools (Node debugging, etc.)" +} + +func (t *Toolset) GetTools(o internalk8s.Openshift) []api.ServerTool { + return slices.Concat( + initNodes(), + ) +} + +func init() { + toolsets.Register(&Toolset{}) +} From abd1c31c82c8c42f1def14c970d95e829aea4366 Mon Sep 17 00:00:00 2001 From: Swarup Ghosh Date: Wed, 5 Nov 2025 02:30:21 +0530 Subject: [PATCH 2/5] move node tools into pkg/toolsets/openshift/nodes Signed-off-by: Swarup Ghosh --- README.md | 3 +++ pkg/ocp/{ => nodes}/nodes_debug.go | 2 +- pkg/ocp/{ => nodes}/nodes_debug_test.go | 2 +- pkg/ocp/{ => nodes}/ocp_client.go | 2 +- pkg/ocp/{ => nodes}/testhelpers.go | 2 +- pkg/toolsets/openshift/{ => nodes}/nodes.go | 8 ++++---- pkg/toolsets/openshift/{ => nodes}/nodes_test.go | 8 ++++---- pkg/toolsets/openshift/toolset.go | 3 ++- 8 files changed, 17 
insertions(+), 13 deletions(-) rename pkg/ocp/{ => nodes}/nodes_debug.go (99%) rename pkg/ocp/{ => nodes}/nodes_debug_test.go (99%) rename pkg/ocp/{ => nodes}/ocp_client.go (99%) rename pkg/ocp/{ => nodes}/testhelpers.go (99%) rename pkg/toolsets/openshift/{ => nodes}/nodes.go (94%) rename pkg/toolsets/openshift/{ => nodes}/nodes_test.go (94%) diff --git a/README.md b/README.md index 7a28da124..71ffaa9e4 100644 --- a/README.md +++ b/README.md @@ -250,6 +250,9 @@ In case multi-cluster support is enabled (default) and you have access to multip - `query` (`string`) **(required)** - query specifies services(s) or files from which to return logs (required). Example: "kubelet" to fetch kubelet logs, "/" to fetch a specific log file from the node (e.g., "/var/log/kubelet.log" or "/var/log/kube-proxy.log") - `tailLines` (`integer`) - Number of lines to retrieve from the end of the logs (Optional, 0 means all logs) +- **nodes_stats_summary** - Get detailed resource usage statistics from a Kubernetes node via the kubelet's Summary API. Provides comprehensive metrics including CPU, memory, filesystem, and network usage at the node, pod, and container levels. On systems with cgroup v2 and kernel 4.20+, also includes PSI (Pressure Stall Information) metrics that show resource pressure for CPU, memory, and I/O. See https://kubernetes.io/docs/reference/instrumentation/understand-psi-metrics/ for details on PSI metrics + - `name` (`string`) **(required)** - Name of the node to get stats from + - **pods_list** - List all the Kubernetes pods in the current cluster from all namespaces - `labelSelector` (`string`) - Optional Kubernetes label selector (e.g. 'app=myapp,env=prod' or 'app in (myapp,yourapp)'), use this option when you want to filter the pods by label diff --git a/pkg/ocp/nodes_debug.go b/pkg/ocp/nodes/nodes_debug.go similarity index 99% rename from pkg/ocp/nodes_debug.go rename to pkg/ocp/nodes/nodes_debug.go index cb079a9ae..f8b7e412a 100644 --- a/pkg/ocp/nodes_debug.go +++ b/pkg/ocp/nodes/nodes_debug.go @@ -1,4 +1,4 @@ -package ocp +package nodes import ( "context" diff --git a/pkg/ocp/nodes_debug_test.go b/pkg/ocp/nodes/nodes_debug_test.go similarity index 99% rename from pkg/ocp/nodes_debug_test.go rename to pkg/ocp/nodes/nodes_debug_test.go index 1c86c92f8..8fb415f5f 100644 --- a/pkg/ocp/nodes_debug_test.go +++ b/pkg/ocp/nodes/nodes_debug_test.go @@ -1,4 +1,4 @@ -package ocp +package nodes import ( "context" diff --git a/pkg/ocp/ocp_client.go b/pkg/ocp/nodes/ocp_client.go similarity index 99% rename from pkg/ocp/ocp_client.go rename to pkg/ocp/nodes/ocp_client.go index 5c2766b4a..096447366 100644 --- a/pkg/ocp/ocp_client.go +++ b/pkg/ocp/nodes/ocp_client.go @@ -1,4 +1,4 @@ -package ocp +package nodes import ( "context" diff --git a/pkg/ocp/testhelpers.go b/pkg/ocp/nodes/testhelpers.go similarity index 99% rename from pkg/ocp/testhelpers.go rename to pkg/ocp/nodes/testhelpers.go index e35d1d9fc..7ab6a0fdb 100644 --- a/pkg/ocp/testhelpers.go +++ b/pkg/ocp/nodes/testhelpers.go @@ -1,4 +1,4 @@ -package ocp +package nodes import ( "context" diff --git a/pkg/toolsets/openshift/nodes.go b/pkg/toolsets/openshift/nodes/nodes.go similarity index 94% rename from pkg/toolsets/openshift/nodes.go rename to pkg/toolsets/openshift/nodes/nodes.go index 481cd53bb..b4a23e023 100644 --- a/pkg/toolsets/openshift/nodes.go +++ b/pkg/toolsets/openshift/nodes/nodes.go @@ -1,4 +1,4 @@ -package openshift +package nodes import ( "errors" @@ -9,10 +9,10 @@ import ( "k8s.io/utils/ptr" 
"github.com/containers/kubernetes-mcp-server/pkg/api" - "github.com/containers/kubernetes-mcp-server/pkg/ocp" + "github.com/containers/kubernetes-mcp-server/pkg/ocp/nodes" ) -func initNodes() []api.ServerTool { +func NodeTools() []api.ServerTool { return []api.ServerTool{ { Tool: api.Tool{ @@ -96,7 +96,7 @@ func nodesDebugExec(params api.ToolHandlerParams) (*api.ToolCallResult, error) { } } - output, execErr := ocp.NodesDebugExec(params.Context, ocp.NewOpenshiftClient(params.Kubernetes), namespace, nodeName, image, command, timeout) + output, execErr := nodes.NodesDebugExec(params.Context, nodes.NewOpenshiftClient(params.Kubernetes), namespace, nodeName, image, command, timeout) if output == "" && execErr == nil { output = fmt.Sprintf("Command executed successfully on node %s but produced no output.", nodeName) } diff --git a/pkg/toolsets/openshift/nodes_test.go b/pkg/toolsets/openshift/nodes/nodes_test.go similarity index 94% rename from pkg/toolsets/openshift/nodes_test.go rename to pkg/toolsets/openshift/nodes/nodes_test.go index e3cddf2ca..45929f9ae 100644 --- a/pkg/toolsets/openshift/nodes_test.go +++ b/pkg/toolsets/openshift/nodes/nodes_test.go @@ -1,4 +1,4 @@ -package openshift +package nodes import ( "context" @@ -6,7 +6,7 @@ import ( "time" "github.com/containers/kubernetes-mcp-server/pkg/api" - "github.com/containers/kubernetes-mcp-server/pkg/ocp" + "github.com/containers/kubernetes-mcp-server/pkg/ocp/nodes" ) type staticRequest struct { @@ -51,12 +51,12 @@ func TestNodesDebugExecHandlerValidatesInput(t *testing.T) { } func TestNodesDebugExecHandlerExecutesCommand(t *testing.T) { - env := ocp.NewNodeDebugTestEnv(t) + env := nodes.NewNodeDebugTestEnv(t) env.Pods.Logs = "done" // Call NodesDebugExec directly instead of going through the handler // This avoids the need to mock the full kubernetes.Kubernetes type - output, err := ocp.NodesDebugExec( + output, err := nodes.NodesDebugExec( context.Background(), env.Kubernetes, "debug", diff --git a/pkg/toolsets/openshift/toolset.go b/pkg/toolsets/openshift/toolset.go index d11e4b668..911bda950 100644 --- a/pkg/toolsets/openshift/toolset.go +++ b/pkg/toolsets/openshift/toolset.go @@ -6,6 +6,7 @@ import ( "github.com/containers/kubernetes-mcp-server/pkg/api" internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" "github.com/containers/kubernetes-mcp-server/pkg/toolsets" + "github.com/containers/kubernetes-mcp-server/pkg/toolsets/openshift/nodes" ) type Toolset struct{} @@ -22,7 +23,7 @@ func (t *Toolset) GetDescription() string { func (t *Toolset) GetTools(o internalk8s.Openshift) []api.ServerTool { return slices.Concat( - initNodes(), + nodes.NodeTools(), ) } From 9b6ac5bb04762b8cd7ba437fa03c2d797f40e106 Mon Sep 17 00:00:00 2001 From: Swarup Ghosh Date: Wed, 5 Nov 2025 03:04:15 +0530 Subject: [PATCH 3/5] make k.canIUse and exported func Signed-off-by: Swarup Ghosh --- pkg/kubernetes/resources.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pkg/kubernetes/resources.go b/pkg/kubernetes/resources.go index 1f559e126..d816ed3dd 100644 --- a/pkg/kubernetes/resources.go +++ b/pkg/kubernetes/resources.go @@ -3,10 +3,11 @@ package kubernetes import ( "context" "fmt" - "k8s.io/apimachinery/pkg/runtime" "regexp" "strings" + "k8s.io/apimachinery/pkg/runtime" + "github.com/containers/kubernetes-mcp-server/pkg/version" authv1 "k8s.io/api/authorization/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -36,7 +37,7 @@ func (k *Kubernetes) ResourcesList(ctx context.Context, gvk *schema.GroupVersion // 
Check if operation is allowed for all namespaces (applicable for namespaced resources) isNamespaced, _ := k.isNamespaced(gvk) - if isNamespaced && !k.canIUse(ctx, gvr, namespace, "list") && namespace == "" { + if isNamespaced && !k.CanIUse(ctx, gvr, namespace, "list") && namespace == "" { namespace = k.manager.configuredNamespace() } if options.AsTable { @@ -187,7 +188,7 @@ func (k *Kubernetes) supportsGroupVersion(groupVersion string) bool { return true } -func (k *Kubernetes) canIUse(ctx context.Context, gvr *schema.GroupVersionResource, namespace, verb string) bool { +func (k *Kubernetes) CanIUse(ctx context.Context, gvr *schema.GroupVersionResource, namespace, verb string) bool { accessReviews, err := k.manager.accessControlClientSet.SelfSubjectAccessReviews() if err != nil { return false From cb29fed44d4445204bb5a74e2c34811b49635fcd Mon Sep 17 00:00:00 2001 From: Swarup Ghosh Date: Wed, 5 Nov 2025 23:44:15 +0530 Subject: [PATCH 4/5] implement plan_mustgather in must-gather OCP toolset --- pkg/ocp/mustgather/mustgather_plan.go | 432 ++++++++++++++++++ .../openshift/mustgather/mustgather.go | 81 ++++ pkg/toolsets/openshift/toolset.go | 4 +- 3 files changed, 516 insertions(+), 1 deletion(-) create mode 100644 pkg/ocp/mustgather/mustgather_plan.go create mode 100644 pkg/toolsets/openshift/mustgather/mustgather.go diff --git a/pkg/ocp/mustgather/mustgather_plan.go b/pkg/ocp/mustgather/mustgather_plan.go new file mode 100644 index 000000000..584980197 --- /dev/null +++ b/pkg/ocp/mustgather/mustgather_plan.go @@ -0,0 +1,432 @@ +package mustgather + +import ( + "fmt" + "path" + "strings" + "time" + + "github.com/containers/kubernetes-mcp-server/pkg/api" + internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/rand" + "sigs.k8s.io/yaml" +) + +const ( + defaultGatherSourceDir = "/must-gather/" + defaultMustGatherImage = "registry.redhat.io/openshift4/ose-must-gather:latest" + defaultGatherCmd = "/usr/bin/gather" + mgAnnotation = "operators.openshift.io/must-gather-image" + maxConcurrentGathers = 8 +) + +func PlanMustGather(params api.ToolHandlerParams) (*api.ToolCallResult, error) { + args := params.GetArguments() + + var nodeName, sourceDir, namespace, gatherCmd, timeout, since string + var hostNetwork, keepResources, allImages bool + var images []string + var nodeSelector map[string]string + + if args["node_name"] != nil { + nodeName = args["node_name"].(string) + } + + if args["node_selector"] != nil { + nodeSelector = parseNodeSelector(args["node_selector"].(string)) + } + + if args["host_network"] != nil { + hostNetwork = args["host_network"].(bool) + } + + sourceDir = defaultGatherSourceDir + if args["source_dir"] != nil { + sourceDir = path.Clean(args["source_dir"].(string)) + } + + namespace = fmt.Sprintf("openshift-must-gather-%s", rand.String(6)) + if args["namespace"] != nil { + namespace = args["namespace"].(string) + } + + if args["keep_resources"] != nil { + keepResources = args["keep_resources"].(bool) + } + + gatherCmd = defaultGatherCmd + if args["gather_command"] != nil { + gatherCmd = args["gather_command"].(string) + } + + if args["all_component_images"] != nil { + allImages = args["all_component_images"].(bool) + } + + if args["images"] != nil { + if imagesArg, ok := 
args["images"].([]interface{}); ok {
+			for _, img := range imagesArg {
+				if imgStr, ok := img.(string); ok {
+					images = append(images, imgStr)
+				}
+			}
+		}
+	}
+
+	if allImages {
+		componentImages, err := getComponentImages(params)
+		if err != nil {
+			return api.NewToolCallResult("",
+				fmt.Errorf("failed to get operator images: %v", err),
+			), nil
+		}
+
+		images = append(images, componentImages...)
+	}
+
+	if len(images) > maxConcurrentGathers {
+		return api.NewToolCallResult("",
+			fmt.Errorf("more than %d gather images are not supported", maxConcurrentGathers),
+		), nil
+	}
+
+	if args["timeout"] != nil {
+		timeout = args["timeout"].(string)
+
+		_, err := time.ParseDuration(timeout)
+		if err != nil {
+			return api.NewToolCallResult("", fmt.Errorf("timeout duration is not valid")), nil
+		}
+
+		gatherCmd = fmt.Sprintf("/usr/bin/timeout %s %s", timeout, gatherCmd)
+	}
+
+	if args["since"] != nil {
+		since = args["since"].(string)
+
+		_, err := time.ParseDuration(since)
+		if err != nil {
+			return api.NewToolCallResult("", fmt.Errorf("since duration is not valid")), nil
+		}
+	}
+
+	envVars := []corev1.EnvVar{}
+	if since != "" {
+		envVars = append(envVars, corev1.EnvVar{
+			Name:  "MUST_GATHER_SINCE",
+			Value: since,
+		})
+	}
+
+	// Template container for gather; if multiple images are provided,
+	// multiple containers will be spun up in the same pod.
+	gatherContainerTemplate := corev1.Container{
+		Name:            "gather",
+		Image:           defaultMustGatherImage,
+		ImagePullPolicy: corev1.PullIfNotPresent,
+		Command:         []string{gatherCmd},
+		Env:             envVars,
+		VolumeMounts: []corev1.VolumeMount{
+			{
+				Name:      "must-gather-output",
+				MountPath: sourceDir,
+			},
+		},
+	}
+
+	var gatherContainers = []corev1.Container{
+		*gatherContainerTemplate.DeepCopy(),
+	}
+
+	if len(images) > 0 {
+		gatherContainers = make([]corev1.Container, len(images))
+	}
+
+	for i, image := range images {
+		gatherContainers[i] = *gatherContainerTemplate.DeepCopy()
+
+		// If more than one gather container is added,
+		// suffix each container name with an integer id.
+		if len(images) > 1 {
+			gatherContainers[i].Name = fmt.Sprintf("gather-%d", i+1)
+		}
+		gatherContainers[i].Image = image
+	}
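The fan-out rule above is easy to get wrong: mutating a shared template in place would leak state between containers, which is why each entry is a `DeepCopy` of the template. A minimal, standalone sketch of the same clone-then-rename behaviour follows; it assumes only the k8s.io/api types this patch already imports, and `buildGatherContainers` is an illustrative helper, not part of the patch:

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// buildGatherContainers mirrors the fan-out used by PlanMustGather: one
// container per requested image, each cloned from a shared template, with
// an integer suffix added only when more than one image is requested.
func buildGatherContainers(template corev1.Container, images []string) []corev1.Container {
	if len(images) == 0 {
		// No custom images: a single container running the default template.
		return []corev1.Container{*template.DeepCopy()}
	}
	containers := make([]corev1.Container, len(images))
	for i, image := range images {
		containers[i] = *template.DeepCopy()
		if len(images) > 1 {
			containers[i].Name = fmt.Sprintf("gather-%d", i+1)
		}
		containers[i].Image = image
	}
	return containers
}

func main() {
	tmpl := corev1.Container{Name: "gather", Image: "registry.redhat.io/openshift4/ose-must-gather:latest"}
	for _, c := range buildGatherContainers(tmpl, []string{"img-a", "img-b"}) {
		fmt.Println(c.Name, c.Image) // prints gather-1 img-a, then gather-2 img-b
	}
}
```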
+	serviceAccountName := "must-gather-collector"
+
+	pod := &corev1.Pod{
+		TypeMeta: metav1.TypeMeta{
+			APIVersion: "v1",
+			Kind:       "Pod",
+		},
+		ObjectMeta: metav1.ObjectMeta{
+			// Avoiding generateName as resources_create_or_update fails without an explicit name.
+			Name:      fmt.Sprintf("must-gather-%s", rand.String(6)),
+			Namespace: namespace,
+		},
+		Spec: corev1.PodSpec{
+			ServiceAccountName: serviceAccountName,
+			NodeName:           nodeName,
+			PriorityClassName:  "system-cluster-critical",
+			RestartPolicy:      corev1.RestartPolicyNever,
+			Volumes: []corev1.Volume{
+				{
+					Name: "must-gather-output",
+					VolumeSource: corev1.VolumeSource{
+						EmptyDir: &corev1.EmptyDirVolumeSource{},
+					},
+				},
+			},
+			Containers: append(gatherContainers, corev1.Container{
+				Name:            "wait",
+				Image:           "registry.redhat.io/ubi9/ubi-minimal",
+				ImagePullPolicy: corev1.PullIfNotPresent,
+				Command:         []string{"/bin/bash", "-c", "sleep infinity"},
+				VolumeMounts: []corev1.VolumeMount{
+					{
+						Name:      "must-gather-output",
+						MountPath: "/must-gather",
+					},
+				},
+			}),
+			HostNetwork:  hostNetwork,
+			NodeSelector: nodeSelector,
+			Tolerations: []corev1.Toleration{
+				{
+					Operator: "Exists",
+				},
+			},
+		},
+	}
+
+	namespaceExists := false
+
+	_, err := params.ResourcesGet(params, &schema.GroupVersionKind{
+		Group:   "",
+		Version: "v1",
+		Kind:    "Namespace",
+	}, "", namespace)
+	if err == nil {
+		namespaceExists = true
+	}
+
+	var namespaceObj *corev1.Namespace
+	if !namespaceExists {
+		namespaceObj = &corev1.Namespace{
+			TypeMeta: metav1.TypeMeta{
+				APIVersion: "v1",
+				Kind:       "Namespace",
+			},
+			ObjectMeta: metav1.ObjectMeta{
+				Name: namespace,
+			},
+		}
+	}
+
+	serviceAccount := &corev1.ServiceAccount{
+		TypeMeta: metav1.TypeMeta{
+			APIVersion: "v1",
+			Kind:       "ServiceAccount",
+		},
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      serviceAccountName,
+			Namespace: namespace,
+		},
+	}
+
+	clusterRoleBindingName := fmt.Sprintf("%s-must-gather-collector", namespace)
+	clusterRoleBinding := &rbacv1.ClusterRoleBinding{
+		TypeMeta: metav1.TypeMeta{
+			APIVersion: "rbac.authorization.k8s.io/v1",
+			Kind:       "ClusterRoleBinding",
+		},
+		ObjectMeta: metav1.ObjectMeta{
+			Name: clusterRoleBindingName,
+		},
+		RoleRef: rbacv1.RoleRef{
+			APIGroup: "rbac.authorization.k8s.io",
+			Kind:     "ClusterRole",
+			Name:     "cluster-admin",
+		},
+		Subjects: []rbacv1.Subject{
+			{
+				Kind:      "ServiceAccount",
+				Name:      serviceAccountName,
+				Namespace: namespace,
+			},
+		},
+	}
+
+	allowChecks := map[string]struct {
+		schema.GroupVersionResource
+		name string
+		verb string
+	}{
+		"create_namespace": {
+			GroupVersionResource: schema.GroupVersionResource{Version: "v1", Resource: "namespace"},
+			verb:                 "create",
+		},
+		"create_serviceaccount": {
+			GroupVersionResource: schema.GroupVersionResource{Version: "v1", Resource: "serviceaccount"},
+			verb:                 "create",
+		},
+		"create_clusterrolebinding": {
+			GroupVersionResource: schema.GroupVersionResource{Group: "rbac.authorization.k8s.io", Version: "v1", Resource: "clusterrolebindings"},
+			verb:                 "create",
+		},
+		"create_pod": {
+			GroupVersionResource: schema.GroupVersionResource{Version: "v1", Resource: "pod"},
+			verb:                 "create",
+		},
+		"use_scc_hostnetwork": {
+			GroupVersionResource: schema.GroupVersionResource{Group: "security.openshift.io", Version: "v1", Resource: "securitycontextconstraints"},
+			name:                 "hostnetwork-v2",
+			verb:                 "use",
+		},
+	}
+	isAllowed := make(map[string]bool)
+
+	for k, check := range allowChecks {
+		isAllowed[k] = params.CanIUse(params, &check.GroupVersionResource, "", check.verb)
+	}
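These pre-flight permission checks go through `CanIUse`, which the previous commit exported from pkg/kubernetes. For readers outside the codebase, a minimal client-go sketch of the same SelfSubjectAccessReview probe might look like the following; the `canIUse` helper here is illustrative, not the project's implementation:

```go
package main

import (
	"context"
	"fmt"

	authv1 "k8s.io/api/authorization/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// canIUse reports whether the current identity may perform verb on the given
// resource, by creating a SelfSubjectAccessReview and reading back Allowed.
func canIUse(ctx context.Context, cs kubernetes.Interface, gvr schema.GroupVersionResource, namespace, verb string) bool {
	sar := &authv1.SelfSubjectAccessReview{
		Spec: authv1.SelfSubjectAccessReviewSpec{
			ResourceAttributes: &authv1.ResourceAttributes{
				Group:     gvr.Group,
				Version:   gvr.Version,
				Resource:  gvr.Resource,
				Namespace: namespace,
				Verb:      verb,
			},
		},
	}
	resp, err := cs.AuthorizationV1().SelfSubjectAccessReviews().Create(ctx, sar, metav1.CreateOptions{})
	if err != nil {
		// Treat an unanswerable review as "not allowed", like CanIUse does.
		return false
	}
	return resp.Status.Allowed
}

func main() {
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	cs := kubernetes.NewForConfigOrDie(config)
	ok := canIUse(context.Background(), cs, schema.GroupVersionResource{Version: "v1", Resource: "pods"}, "", "create")
	fmt.Println("can create pods:", ok)
}
```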
+	var result strings.Builder
+	result.WriteString("The generated plan contains YAML manifests for must-gather pods and required resources (namespace, serviceaccount, clusterrolebinding). " +
+		"Suggest how the user can apply the manifest and copy results locally (`oc cp` / `kubectl cp`). \n\n",
+	)
+	result.WriteString("Ask the user if they want to apply the plan \n" +
+		"- use the resource_create_or_update tool to apply the manifest \n" +
+		"- alternatively, advise the user to execute `oc apply` / `kubectl apply` instead. \n\n",
+	)
+
+	if !keepResources {
+		result.WriteString("Once the must-gather collection is completed, the user may wish to clean up the created resources. \n" +
+			"- use the resources_delete tool to delete the namespace and the clusterrolebinding \n" +
+			"- or, execute the cleanup using `kubectl delete`. \n\n")
+	}
+
+	if !namespaceExists && isAllowed["create_namespace"] {
+		namespaceYaml, err := yaml.Marshal(namespaceObj)
+		if err != nil {
+			return nil, fmt.Errorf("failed to marshal namespace to yaml: %w", err)
+		}
+
+		result.WriteString("```yaml\n")
+		result.Write(namespaceYaml)
+		result.WriteString("```\n\n")
+	}
+
+	if !namespaceExists && !isAllowed["create_namespace"] {
+		result.WriteString("WARNING: The resources_create_or_update call does not have permission to create namespace(s).\n")
+	}
+
+	// Each YAML manifest is dumped into its own ``` ``` code block because the
+	// resources_create_or_update tool call fails when the content holds more than
+	// one resource; some models are smart enough to detect the error and retry
+	// with one resource at a time, though.
+
+	serviceAccountYaml, err := yaml.Marshal(serviceAccount)
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal service account to yaml: %w", err)
+	}
+	result.WriteString("```yaml\n")
+	result.Write(serviceAccountYaml)
+	result.WriteString("```\n\n")
+
+	if !isAllowed["create_serviceaccount"] {
+		result.WriteString("WARNING: The resources_create_or_update call does not have permission to create serviceaccount(s).\n")
+	}
+
+	clusterRoleBindingYaml, err := yaml.Marshal(clusterRoleBinding)
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal cluster role binding to yaml: %w", err)
+	}
+
+	result.WriteString("```yaml\n")
+	result.Write(clusterRoleBindingYaml)
+	result.WriteString("```\n\n")
+
+	if !isAllowed["create_clusterrolebinding"] {
+		result.WriteString("WARNING: The resources_create_or_update call does not have permission to create clusterrolebinding(s).\n")
+	}
+
+	podYaml, err := yaml.Marshal(pod)
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal pod to yaml: %w", err)
+	}
+
+	result.WriteString("```yaml\n")
+	result.Write(podYaml)
+	result.WriteString("```\n")
+
+	if !isAllowed["create_pod"] {
+		result.WriteString("WARNING: The resources_create_or_update call does not have permission to create pod(s).\n")
+	}
+
+	if hostNetwork && !isAllowed["use_scc_hostnetwork"] {
+		result.WriteString("WARNING: The resources_create_or_update call does not have permission to create pod(s) with hostNetwork: true.\n")
+	}
+
+	return api.NewToolCallResult(result.String(), nil), nil
+}
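The one-manifest-per-code-block convention above can be factored into a small helper. A self-contained sketch using `sigs.k8s.io/yaml` (the `writeManifest` name and the sample objects are hypothetical, not part of the patch):

```go
package main

import (
	"fmt"
	"strings"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/yaml"
)

// writeManifest emits one object per fenced block, mirroring the convention
// PlanMustGather uses so a follow-up resources_create_or_update call receives
// a single YAML document at a time.
func writeManifest(out *strings.Builder, obj any) error {
	data, err := yaml.Marshal(obj)
	if err != nil {
		return err
	}
	out.WriteString("```yaml\n")
	out.Write(data)
	out.WriteString("```\n\n")
	return nil
}

func main() {
	sa := &corev1.ServiceAccount{
		TypeMeta:   metav1.TypeMeta{APIVersion: "v1", Kind: "ServiceAccount"},
		ObjectMeta: metav1.ObjectMeta{Name: "must-gather-collector", Namespace: "demo"},
	}
	var b strings.Builder
	if err := writeManifest(&b, sa); err != nil {
		panic(err)
	}
	fmt.Print(b.String())
}
```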
+func getComponentImages(params api.ToolHandlerParams) ([]string, error) {
+	var images []string
+	appendImageFromAnnotation := func(obj runtime.Object) error {
+		unstruct, err := runtime.DefaultUnstructuredConverter.ToUnstructured(obj)
+		if err != nil {
+			return err
+		}
+
+		u := unstructured.Unstructured{Object: unstruct}
+		annotations := u.GetAnnotations()
+		if annotations[mgAnnotation] != "" {
+			images = append(images, annotations[mgAnnotation])
+		}
+
+		return nil
+	}
+
+	clusterOperatorsList, err := params.ResourcesList(params, &schema.GroupVersionKind{
+		Group:   "config.openshift.io",
+		Version: "v1",
+		Kind:    "ClusterOperator",
+	}, "", internalk8s.ResourceListOptions{})
+	if err != nil {
+		return nil, err
+	}
+
+	if err := clusterOperatorsList.EachListItem(appendImageFromAnnotation); err != nil {
+		return images, err
+	}
+
+	csvList, err := params.ResourcesList(params, &schema.GroupVersionKind{
+		Group:   "operators.coreos.com",
+		Version: "v1alpha1",
+		Kind:    "ClusterServiceVersion",
+	}, "", internalk8s.ResourceListOptions{})
+	if err != nil {
+		return images, err
+	}
+
+	err = csvList.EachListItem(appendImageFromAnnotation)
+	return images, err
+}
+
+func parseNodeSelector(selector string) map[string]string {
+	result := make(map[string]string)
+	pairs := strings.Split(selector, ",")
+	for _, pair := range pairs {
+		kv := strings.SplitN(strings.TrimSpace(pair), "=", 2)
+		if len(kv) == 2 && strings.TrimSpace(kv[0]) != "" {
+			result[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1])
+		}
+	}
+	return result
+}
diff --git a/pkg/toolsets/openshift/mustgather/mustgather.go b/pkg/toolsets/openshift/mustgather/mustgather.go
new file mode 100644
index 000000000..67f731daf
--- /dev/null
+++ b/pkg/toolsets/openshift/mustgather/mustgather.go
@@ -0,0 +1,81 @@
+package mustgather
+
+import (
+	"github.com/google/jsonschema-go/jsonschema"
+	"k8s.io/utils/ptr"
+
+	"github.com/containers/kubernetes-mcp-server/pkg/api"
+	"github.com/containers/kubernetes-mcp-server/pkg/ocp/mustgather"
+)
+
+func MustGatherTools() []api.ServerTool {
+	return []api.ServerTool{{
+		Tool: api.Tool{
+			Name:        "plan_mustgather",
+			Description: "Plan the collection of a must-gather archive from an OpenShift cluster. must-gather is a tool for collecting cluster data used in debugging and troubleshooting, such as logs and Kubernetes resources.",
+			InputSchema: &jsonschema.Schema{
+				Type: "object",
+				Properties: map[string]*jsonschema.Schema{
+					"node_name": {
+						Type:        "string",
+						Description: "Optional node to run the must-gather pod on. If not provided, a random control-plane node will be selected automatically",
+					},
+					"node_selector": {
+						Type:        "string",
+						Description: "Optional node label selector to use; only relevant when specifying a command and image which needs to capture data on a set of cluster nodes simultaneously",
+					},
+					"host_network": {
+						Type:        "boolean",
+						Description: "Optionally run the must-gather pods in the host network of the node. This is only relevant if a specific gather image needs to capture host-level data",
+					},
+					"gather_command": {
+						Type:        "string",
+						Description: "Optionally specify a custom gather command to run a specialized script, e.g. /usr/bin/gather_audit_logs",
+						Default:     api.ToRawMessage("/usr/bin/gather"),
+					},
+					"all_component_images": {
+						Type:        "boolean",
+						Description: "Optional. When enabled, collects and runs multiple must-gathers for all operators and components on the cluster that have an annotated must-gather image available",
+					},
+					"images": {
+						Type:        "array",
+						Description: "Optional list of images to use for gathering custom information about specific operators or cluster components. If not specified, OpenShift's default must-gather image will be used",
+						Items: &jsonschema.Schema{
+							Type: "string",
+						},
+					},
+					"source_dir": {
+						Type:        "string",
+						Description: "Optional. Set a specific directory for the pod to copy gathered data from",
+						Default:     api.ToRawMessage("/must-gather"),
+					},
+					"timeout": {
+						Type:        "string",
+						Description: "Timeout of the gather process, e.g. 30s, 6m20s, or 2h10m30s",
+					},
+					"namespace": {
+						Type:        "string",
+						Description: "Optional. Specify an existing privileged namespace where must-gather pods should run.
If not provided, a temporary namespace will be created", + }, + "keep_resources": { + Type: "boolean", + Description: "Optional to retain all temporary resources when the mustgather completes, otherwise temporary resources created will be advised to be cleaned up", + }, + "since": { + Type: "string", + Description: "Optional to collect logs newer than a relative duration like 5s, 2m5s, or 3h6m10s. If unspecified, all available logs will be collected", + }, + }, + }, + Annotations: api.ToolAnnotations{ + Title: "MustGather: Plan", + ReadOnlyHint: ptr.To(true), + DestructiveHint: ptr.To(false), + IdempotentHint: ptr.To(false), + OpenWorldHint: ptr.To(true), + }, + }, + + Handler: mustgather.PlanMustGather, + }} +} diff --git a/pkg/toolsets/openshift/toolset.go b/pkg/toolsets/openshift/toolset.go index 911bda950..999825d5b 100644 --- a/pkg/toolsets/openshift/toolset.go +++ b/pkg/toolsets/openshift/toolset.go @@ -6,6 +6,7 @@ import ( "github.com/containers/kubernetes-mcp-server/pkg/api" internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" "github.com/containers/kubernetes-mcp-server/pkg/toolsets" + "github.com/containers/kubernetes-mcp-server/pkg/toolsets/openshift/mustgather" "github.com/containers/kubernetes-mcp-server/pkg/toolsets/openshift/nodes" ) @@ -18,11 +19,12 @@ func (t *Toolset) GetName() string { } func (t *Toolset) GetDescription() string { - return "Core OpenShift-specific tools (Node debugging, etc.)" + return "Core OpenShift-specific tools (must-gather, Node debugging, etc.)" } func (t *Toolset) GetTools(o internalk8s.Openshift) []api.ServerTool { return slices.Concat( + mustgather.MustGatherTools(), nodes.NodeTools(), ) } From d2bf1e91b587547bc4396f2d098fe4680b0704fa Mon Sep 17 00:00:00 2001 From: Swarup Ghosh Date: Thu, 6 Nov 2025 00:20:16 +0530 Subject: [PATCH 5/5] add persistent storage for plan_mg optionally Signed-off-by: Swarup Ghosh --- pkg/ocp/mustgather/mustgather_plan.go | 98 ++++++++++++++++--- .../openshift/mustgather/mustgather.go | 4 + 2 files changed, 90 insertions(+), 12 deletions(-) diff --git a/pkg/ocp/mustgather/mustgather_plan.go b/pkg/ocp/mustgather/mustgather_plan.go index 584980197..107f46041 100644 --- a/pkg/ocp/mustgather/mustgather_plan.go +++ b/pkg/ocp/mustgather/mustgather_plan.go @@ -10,6 +10,7 @@ import ( internalk8s "github.com/containers/kubernetes-mcp-server/pkg/kubernetes" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" @@ -30,7 +31,7 @@ func PlanMustGather(params api.ToolHandlerParams) (*api.ToolCallResult, error) { args := params.GetArguments() var nodeName, sourceDir, namespace, gatherCmd, timeout, since string - var hostNetwork, keepResources, allImages bool + var hostNetwork, keepResources, allImages, withStorage bool var images []string var nodeSelector map[string]string @@ -46,6 +47,10 @@ func PlanMustGather(params api.ToolHandlerParams) (*api.ToolCallResult, error) { hostNetwork = args["host_network"].(bool) } + if args["with_storage"] != nil { + withStorage = args["with_storage"].(bool) + } + sourceDir = defaultGatherSourceDir if args["source_dir"] != nil { sourceDir = path.Clean(args["source_dir"].(string)) @@ -160,6 +165,31 @@ func PlanMustGather(params api.ToolHandlerParams) (*api.ToolCallResult, error) { } serviceAccountName := "must-gather-collector" + pvcName := fmt.Sprintf("must-gather-pvc-%s", rand.String(6)) + + // Configure 
volume based on storage type + var volumes []corev1.Volume + if withStorage { + volumes = []corev1.Volume{ + { + Name: "must-gather-output", + VolumeSource: corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ + ClaimName: pvcName, + }, + }, + }, + } + } else { + volumes = []corev1.Volume{ + { + Name: "must-gather-output", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, + } + } pod := &corev1.Pod{ TypeMeta: metav1.TypeMeta{ @@ -176,14 +206,7 @@ func PlanMustGather(params api.ToolHandlerParams) (*api.ToolCallResult, error) { NodeName: nodeName, PriorityClassName: "system-cluster-critical", RestartPolicy: corev1.RestartPolicyNever, - Volumes: []corev1.Volume{ - { - Name: "must-gather-output", - VolumeSource: corev1.VolumeSource{ - EmptyDir: &corev1.EmptyDirVolumeSource{}, - }, - }, - }, + Volumes: volumes, Containers: append(gatherContainers, corev1.Container{ Name: "wait", Image: "registry.redhat.io/ubi9/ubi-minimal", @@ -264,6 +287,31 @@ func PlanMustGather(params api.ToolHandlerParams) (*api.ToolCallResult, error) { }, } + // Create PVC if persistent storage is requested + var pvc *corev1.PersistentVolumeClaim + if withStorage { + pvc = &corev1.PersistentVolumeClaim{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "v1", + Kind: "PersistentVolumeClaim", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: pvcName, + Namespace: namespace, + }, + Spec: corev1.PersistentVolumeClaimSpec{ + AccessModes: []corev1.PersistentVolumeAccessMode{ + corev1.ReadWriteOnce, + }, + Resources: corev1.VolumeResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceStorage: resource.MustParse("10Gi"), + }, + }, + }, + } + } + allowChecks := map[string]struct { schema.GroupVersionResource name string @@ -285,6 +333,10 @@ func PlanMustGather(params api.ToolHandlerParams) (*api.ToolCallResult, error) { GroupVersionResource: schema.GroupVersionResource{Version: "v1", Resource: "pod"}, verb: "create", }, + "create_persistentvolumeclaim": { + GroupVersionResource: schema.GroupVersionResource{Version: "v1", Resource: "persistentvolumeclaims"}, + verb: "create", + }, "use_scc_hostnetwork": { GroupVersionResource: schema.GroupVersionResource{Group: "security.openshift.io", Version: "v1", Resource: "securitycontextconstraints"}, name: "hostnetwork-v2", @@ -298,9 +350,15 @@ func PlanMustGather(params api.ToolHandlerParams) (*api.ToolCallResult, error) { } var result strings.Builder - result.WriteString("The generated plan contains YAML manifests for must-gather pods and required resources (namespace, serviceaccount, clusterrolebinding). " + - "Suggest how the user can apply the manifest and copy results locally (`oc cp` / `kubectl cp`). \n\n", - ) + result.WriteString("The generated plan contains YAML manifests for must-gather pods and required resources (namespace, serviceaccount, clusterrolebinding") + if withStorage { + result.WriteString(", persistentvolumeclaim). " + + "The data will be stored on a persistent volume (10Gi) and preserved even after pod deletion. ") + } else { + result.WriteString("). ") + } + result.WriteString("Suggest how the user can apply the manifest and copy results locally (`oc cp` / `kubectl cp`). \n\n") + result.WriteString("Ask the user if they want to apply the plan \n" + "- use the resource_create_or_update tool to apply the manifest \n" + "- alternatively, advise the user to execute `oc apply` / `kubectl apply` instead. 
\n\n", @@ -343,6 +401,22 @@ func PlanMustGather(params api.ToolHandlerParams) (*api.ToolCallResult, error) { result.WriteString("WARNING: The resources_create_or_update call does not have permission to create serviceaccount(s).\n") } + // Output PVC YAML if persistent storage is requested + if withStorage { + pvcYaml, err := yaml.Marshal(pvc) + if err != nil { + return nil, fmt.Errorf("failed to marshal PVC to yaml: %w", err) + } + + result.WriteString("```yaml\n") + result.Write(pvcYaml) + result.WriteString("```\n\n") + + if !isAllowed["create_persistentvolumeclaim"] { + result.WriteString("WARNING: The resources_create_or_update call does not have permission to create persistentvolumeclaim(s).\n") + } + } + clusterRoleBindingYaml, err := yaml.Marshal(clusterRoleBinding) if err != nil { return nil, fmt.Errorf("failed to marshal cluster role binding to yaml: %w", err) diff --git a/pkg/toolsets/openshift/mustgather/mustgather.go b/pkg/toolsets/openshift/mustgather/mustgather.go index 67f731daf..680d2adad 100644 --- a/pkg/toolsets/openshift/mustgather/mustgather.go +++ b/pkg/toolsets/openshift/mustgather/mustgather.go @@ -65,6 +65,10 @@ func MustGatherTools() []api.ServerTool { Type: "string", Description: "Optional to collect logs newer than a relative duration like 5s, 2m5s, or 3h6m10s. If unspecified, all available logs will be collected", }, + "with_storage": { + Type: "boolean", + Description: "Optional to persist the collected must-gather data to a storage volume.", + }, }, }, Annotations: api.ToolAnnotations{
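For reference, the PVC that the plan emits when `with_storage` is enabled can be reproduced in isolation. A minimal sketch, assuming the same k8s.io/api and sigs.k8s.io/yaml dependencies the patch already uses; `mustGatherPVC` is an illustrative helper and the names passed in `main` are placeholders:

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/yaml"
)

// mustGatherPVC builds the kind of claim the plan emits for with_storage:
// a ReadWriteOnce PVC requesting 10Gi for the must-gather output volume.
func mustGatherPVC(name, namespace string) *corev1.PersistentVolumeClaim {
	return &corev1.PersistentVolumeClaim{
		TypeMeta:   metav1.TypeMeta{APIVersion: "v1", Kind: "PersistentVolumeClaim"},
		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
		Spec: corev1.PersistentVolumeClaimSpec{
			AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce},
			Resources: corev1.VolumeResourceRequirements{
				Requests: corev1.ResourceList{
					corev1.ResourceStorage: resource.MustParse("10Gi"),
				},
			},
		},
	}
}

func main() {
	out, err := yaml.Marshal(mustGatherPVC("must-gather-pvc-abc123", "openshift-must-gather-xyz"))
	if err != nil {
		panic(err)
	}
	fmt.Print(string(out))
}
```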