diff --git a/Dockerfile b/Dockerfile index 7c6add4..dad8e52 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=$BUILDPLATFORM golang:1.23 as builder +FROM --platform=$BUILDPLATFORM golang:1.24 as builder ARG TARGETPLATFORM ARG ENVVAR=CGO_ENABLED=0 WORKDIR /go/src/github.com/atlassian-labs/cyclops diff --git a/Makefile b/Makefile index 20d789f..3e4b245 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -VERSION = 1.10.2 +VERSION = 1.10.3 # IMPORTANT! Update api version if a new release affects cnr API_VERSION = 1.0.0 IMAGE = cyclops:$(VERSION) diff --git a/pkg/controller/cyclenoderequest/transitioner/transitioner.go b/pkg/controller/cyclenoderequest/transitioner/transitioner.go index 653052f..abefeff 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitioner.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitioner.go @@ -17,6 +17,25 @@ type transitionFunc func() (reconcile.Result, error) // different request types. const cycleNodeLabel = "cyclops.atlassian.com/terminate" +// cyclopsManagedAnnotation marks nodes where Cyclops added the scale-down-disabled annotation. +// Used to track which annotations we added vs pre-existing ones (for safe cleanup). +// Only remove the cluster-autoscaler annotation if this marker annotation also exists. +const cyclopsManagedAnnotation = "cyclops.atlassian.com/annotation-managed" + +// clusterAutoscalerScaleDownDisabledAnnotation is the annotation key used to prevent +// Cluster Autoscaler from scaling down a node. This is used to protect new nodes +// during the cycling process from being removed by Cluster Autoscaler before +// the corresponding old nodes are fully terminated. +// See: https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#how-can-i-prevent-cluster-autoscaler-from-scaling-down-a-particular-node +const clusterAutoscalerScaleDownDisabledAnnotation = "cluster-autoscaler.kubernetes.io/scale-down-disabled" +const clusterAutoscalerScaleDownDisabledValue = "true" + +// nodeGroupAnnotationKey is the annotation key on NodeGroup resources that controls whether +// Cluster Autoscaler annotation management is enabled or disabled. +// Value: "true" → opt-out (disable annotation management) +// Value: "false" or missing/empty → default enabled (annotation management enabled) +const nodeGroupAnnotationKey = "cyclops.atlassian.com/disable-annotation-management" + var ( transitionDuration = 10 * time.Second requeueDuration = 30 * time.Second diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions.go b/pkg/controller/cyclenoderequest/transitioner/transitions.go index bfb39ea..176bb85 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitions.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitions.go @@ -408,6 +408,27 @@ func (t *CycleNodeRequestTransitioner) transitionScalingUp() (reconcile.Result, } } + // Add scale-down-disabled annotation to new nodes if enabled + if t.shouldManageAnnotations() { + newNodes := findNodesCreatedAfter(kubeNodes, scaleUpStarted.Time) + for _, newNode := range newNodes { + if err := t.addScaleDownDisabledAnnotation(newNode.Name); err != nil { + // Log warning but don't fail - annotation is best-effort and shouldn't block cycling + t.rm.Logger.Info("Failed to add scale-down-disabled annotation to new node", + "nodeName", newNode.Name, + "error", err) + t.rm.LogEvent(t.cycleNodeRequest, "AnnotationWarning", + "Failed to add scale-down-disabled annotation to node %s: %v", newNode.Name, err) + } else { + t.rm.Logger.Info("Added scale-down-disabled annotation to new node", + "nodeName", newNode.Name) + } + } + } else { + t.rm.Logger.Info("Node annotation management disabled for this CNR", + "cnr", t.cycleNodeRequest.Name) + } + t.rm.LogEvent(t.cycleNodeRequest, "ScalingUpCompleted", "New nodes are now ready") return t.transitionObject(v1.CycleNodeRequestCordoningNode) } @@ -572,6 +593,14 @@ func (t *CycleNodeRequestTransitioner) transitionHealing() (reconcile.Result, er } } + // Cleanup scale-down-disabled annotations if enabled + if t.shouldManageAnnotations() { + t.cleanupScaleDownDisabledAnnotations() + } else { + t.rm.Logger.Info("Skipping annotation cleanup (disabled)", + "cnr", t.cycleNodeRequest.Name) + } + return t.transitionToFailed(nil) } @@ -599,6 +628,14 @@ func (t *CycleNodeRequestTransitioner) transitionSuccessful() (reconcile.Result, return reconcile.Result{Requeue: true, RequeueAfter: transitionDuration}, nil } + // Cleanup scale-down-disabled annotations if enabled + if t.shouldManageAnnotations() { + t.cleanupScaleDownDisabledAnnotations() + } else { + t.rm.Logger.Info("Skipping annotation cleanup (disabled)", + "cnr", t.cycleNodeRequest.Name) + } + // Delete failed sibling CNRs regardless of whether the CNR for the // transitioner should be deleted. If failed CNRs pile up that will prevent // Cyclops observer from auto-generating new CNRs for a nodegroup. diff --git a/pkg/controller/cyclenoderequest/transitioner/util.go b/pkg/controller/cyclenoderequest/transitioner/util.go index e4cf9a9..a08d74f 100644 --- a/pkg/controller/cyclenoderequest/transitioner/util.go +++ b/pkg/controller/cyclenoderequest/transitioner/util.go @@ -18,6 +18,8 @@ import ( v1 "github.com/atlassian-labs/cyclops/pkg/apis/atlassian/v1" "github.com/atlassian-labs/cyclops/pkg/cloudprovider" + "github.com/atlassian-labs/cyclops/pkg/k8s" + "github.com/atlassian-labs/cyclops/pkg/metrics" ) // transitionToHealing transitions the current cycleNodeRequest to healing which will always transiting to failed @@ -589,3 +591,255 @@ func countNodesCreatedAfter(nodes map[string]corev1.Node, cutoffTime time.Time) } return count } + +// findNodesCreatedAfter returns all nodes in the given map that are created after the specified time +func findNodesCreatedAfter(nodes map[string]corev1.Node, cutoffTime time.Time) []corev1.Node { + var result []corev1.Node + for _, node := range nodes { + if node.CreationTimestamp.Time.After(cutoffTime) { + result = append(result, node) + } + } + return result +} + +// getNodegroupForMetrics returns a nodegroup identifier for metrics labeling. +// Since a CycleNodeRequest can have multiple nodegroups, we use the first one or "unknown" if none. +func (t *CycleNodeRequestTransitioner) getNodegroupForMetrics() string { + nodeGroups := t.cycleNodeRequest.GetNodeGroupNames() + if len(nodeGroups) > 0 { + return nodeGroups[0] + } + return "unknown" +} + +// getNodeGroup finds the NodeGroup resource that matches this CNR. +func (t *CycleNodeRequestTransitioner) getNodeGroup() (*v1.NodeGroup, error) { + var nodeGroupList v1.NodeGroupList + if err := t.rm.Client.List(context.TODO(), &nodeGroupList); err != nil { + return nil, err + } + + for i := range nodeGroupList.Items { + ng := &nodeGroupList.Items[i] + if t.cycleNodeRequest.IsFromNodeGroup(*ng) { + return ng, nil + } + } + + return nil, nil +} + +// shouldManageAnnotations checks if Cyclops should manage cluster-autoscaler annotations on nodes. +// Returns true if annotations should be added/removed (default behavior). +// Returns false if opt-out is enabled via NodeGroup annotation. +// +// Opt-out mechanism: +// - NodeGroup annotation: cyclops.atlassian.com/disable-annotation-management +// - Value "true" → opt-out (do NOT add/remove annotations) +// - Value "false" or missing → default enabled (add/remove annotations) +// +// Default behavior (when NodeGroup not found or annotation missing): enabled (return true) +func (t *CycleNodeRequestTransitioner) shouldManageAnnotations() bool { + nodeGroup, err := t.getNodeGroup() + if err != nil { + t.rm.Logger.Info("Failed to get NodeGroup, defaulting to annotation management enabled", + "cnr", t.cycleNodeRequest.Name, + "error", err) + return true + } + if nodeGroup == nil { + t.rm.Logger.Info("NodeGroup not found for CNR, defaulting to annotation management enabled", + "cnr", t.cycleNodeRequest.Name, + "nodeGroupNames", t.cycleNodeRequest.GetNodeGroupNames()) + return true + } + + annotationValue := nodeGroup.Annotations[nodeGroupAnnotationKey] + t.rm.Logger.Info("Checking NodeGroup annotation for opt-out", + "cnr", t.cycleNodeRequest.Name, + "nodeGroup", nodeGroup.Name, + "annotation", nodeGroupAnnotationKey, + "value", annotationValue) + if annotationValue == "true" { + t.rm.Logger.Info("Node annotations disabled via NodeGroup annotation", + "nodeGroup", nodeGroup.Name, + "annotation", nodeGroupAnnotationKey) + return false + } + + return true +} + +// addScaleDownDisabledAnnotation adds the cluster-autoscaler scale-down-disabled annotation +// to a node to prevent Cluster Autoscaler from removing it during cycling. +// This is a best-effort operation and failures should not block the cycling process. +// Note: This function should only be called when shouldManageAnnotations() returns true. +// If the annotation already exists, it is preserved (not overwritten) for backward compatibility. +func (t *CycleNodeRequestTransitioner) addScaleDownDisabledAnnotation(nodeName string) error { + // Defensive check: ensure annotation management is enabled + if !t.shouldManageAnnotations() { + t.rm.Logger.Info("Skipping annotation addition - management disabled", + "nodeName", nodeName, + "cnr", t.cycleNodeRequest.Name) + return nil + } + + // Check if annotation already exists - preserve existing value for backward compatibility + node, err := t.rm.RawClient.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) + if err == nil && node.Annotations != nil { + if existingValue, exists := node.Annotations[clusterAutoscalerScaleDownDisabledAnnotation]; exists { + t.rm.Logger.Info("Annotation already exists on node, preserving existing value", + "nodeName", nodeName, + "existingValue", existingValue, + "cnr", t.cycleNodeRequest.Name) + return nil + } + } + + nodegroup := t.getNodegroupForMetrics() + + err = k8s.AddAnnotationToNode( + nodeName, + clusterAutoscalerScaleDownDisabledAnnotation, + clusterAutoscalerScaleDownDisabledValue, + t.rm.RawClient, + ) + + if err != nil { + // Extract error type for better categorization + errorType := "unknown" + errStr := err.Error() + if strings.Contains(errStr, "not found") { + errorType = "node_not_found" + } else if strings.Contains(errStr, "forbidden") || strings.Contains(errStr, "unauthorized") { + errorType = "permission_denied" + } else if strings.Contains(errStr, "conflict") { + errorType = "conflict" + } + + metrics.AnnotationAddFailure.WithLabelValues(nodegroup, errorType).Inc() + return err + } + + // Add marker annotation to track that we added the cluster-autoscaler annotation + // This ensures we only remove annotations we added, not pre-existing ones + if markerErr := k8s.AddAnnotationToNode( + nodeName, + cyclopsManagedAnnotation, + "true", + t.rm.RawClient, + ); markerErr != nil { + // Log but don't fail - marker is for tracking, not critical + t.rm.Logger.Info("Failed to add managed annotation marker to node", + "nodeName", nodeName, + "error", markerErr) + } else { + t.rm.Logger.Info("Added managed annotation marker to node", + "nodeName", nodeName, + "cnr", t.cycleNodeRequest.Name) + } + + metrics.AnnotationAddSuccess.WithLabelValues(nodegroup).Inc() + metrics.NodesWithAnnotation.WithLabelValues(nodegroup, nodeName).Set(1) + + return nil +} + +// removeScaleDownDisabledAnnotation removes the cluster-autoscaler scale-down-disabled annotation +// from a node to allow Cluster Autoscaler to consider it for scale-down again. +// This is a best-effort operation and failures should not block the cycling process. +// Uses retry logic to handle transient API errors. +func (t *CycleNodeRequestTransitioner) removeScaleDownDisabledAnnotation(nodeName string) error { + nodegroup := t.getNodegroupForMetrics() + + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + return k8s.RemoveAnnotationFromNode( + nodeName, + clusterAutoscalerScaleDownDisabledAnnotation, + t.rm.RawClient, + ) + }) + + if err != nil { + // Extract error type for better categorization + errorType := "unknown" + errStr := err.Error() + if strings.Contains(errStr, "not found") { + errorType = "node_not_found" + } else if strings.Contains(errStr, "forbidden") || strings.Contains(errStr, "unauthorized") { + errorType = "permission_denied" + } else if strings.Contains(errStr, "conflict") { + errorType = "conflict" + } + + metrics.AnnotationRemoveFailure.WithLabelValues(nodegroup, errorType).Inc() + return err + } + + // Remove the marker annotation since we're removing the cluster-autoscaler annotation + if markerErr := k8s.RemoveAnnotationFromNode( + nodeName, + cyclopsManagedAnnotation, + t.rm.RawClient, + ); markerErr != nil { + // Log but don't fail - marker cleanup is best-effort + t.rm.Logger.Info("Failed to remove managed annotation marker from node", + "nodeName", nodeName, + "error", markerErr) + } + + metrics.AnnotationRemoveSuccess.WithLabelValues(nodegroup).Inc() + metrics.NodesWithAnnotation.WithLabelValues(nodegroup, nodeName).Set(0) + + return nil +} + +// cleanupScaleDownDisabledAnnotations removes the cluster-autoscaler scale-down-disabled annotation +// from nodes that were created during the cycle. This is called during both Healing and Successful +// phases to ensure annotations are cleaned up regardless of how the cycle completes. +// This is a best-effort operation and failures should not block the transition. +// Uses listNodes instead of listReadyNodes to ensure cleanup happens even if nodes are not Ready yet, +// which is important when cycling fails in the Healing phase. +func (t *CycleNodeRequestTransitioner) cleanupScaleDownDisabledAnnotations() { + if t.cycleNodeRequest.Status.ScaleUpStarted == nil { + return + } + + kubeNodes, err := t.listNodes(true) + if err != nil { + return + } + + phase := string(t.cycleNodeRequest.Status.Phase) + nodesCreatedDuringCycle := findNodesCreatedAfter(kubeNodes, t.cycleNodeRequest.Status.ScaleUpStarted.Time) + for _, newNode := range nodesCreatedDuringCycle { + if newNode.Annotations != nil { + if val, exists := newNode.Annotations[clusterAutoscalerScaleDownDisabledAnnotation]; exists && val == clusterAutoscalerScaleDownDisabledValue { + // Only remove annotations we added (marked with cyclopsManagedAnnotation). + // This ensures we don't remove pre-existing annotations that may have been set by + // ASG Launch Template user data, cloud-init, or other bootstrap scripts. + if newNode.Annotations[cyclopsManagedAnnotation] != "true" { + t.rm.Logger.Info("Skipping annotation removal - annotation was not added by Cyclops", + "phase", phase, + "nodeName", newNode.Name) + continue + } + + if err := t.removeScaleDownDisabledAnnotation(newNode.Name); err != nil { + // Log warning but don't fail - annotation removal is best-effort + t.rm.Logger.Info("Failed to remove scale-down-disabled annotation from node", + "phase", phase, + "nodeName", newNode.Name, + "error", err) + t.rm.LogEvent(t.cycleNodeRequest, "AnnotationCleanupWarning", + "Failed to remove scale-down-disabled annotation from node %s during %s: %v", newNode.Name, phase, err) + } else { + t.rm.Logger.Info("Removed scale-down-disabled annotation from node", + "phase", phase, + "nodeName", newNode.Name) + } + } + } + } +} \ No newline at end of file diff --git a/pkg/controller/cyclenoderequest/transitioner/util_test.go b/pkg/controller/cyclenoderequest/transitioner/util_test.go index 6c33489..00c2f5a 100644 --- a/pkg/controller/cyclenoderequest/transitioner/util_test.go +++ b/pkg/controller/cyclenoderequest/transitioner/util_test.go @@ -1,11 +1,14 @@ package transitioner import ( + "context" "reflect" "testing" "time" + v1 "github.com/atlassian-labs/cyclops/pkg/apis/atlassian/v1" "github.com/atlassian-labs/cyclops/pkg/cloudprovider" + "github.com/atlassian-labs/cyclops/pkg/mock" "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -213,3 +216,298 @@ func buildNodeWithTimestamp(name string, timestamp time.Time) corev1.Node { }, } } + +// TestShouldManageAnnotations tests the shouldManageAnnotations function with different NodeGroup annotation values +func TestShouldManageAnnotations(t *testing.T) { + tests := []struct { + name string + nodeGroupAnnotation map[string]string // NodeGroup annotations + expectedResult bool // Expected return value + description string + }{ + { + name: "no annotation - default enabled", + nodeGroupAnnotation: nil, + expectedResult: true, + description: "When NodeGroup has no annotation, annotations should be managed (default enabled)", + }, + { + name: "annotation missing - default enabled", + nodeGroupAnnotation: map[string]string{}, + expectedResult: true, + description: "When NodeGroup annotation is missing, annotations should be managed (default enabled)", + }, + { + name: "annotation value false - enabled", + nodeGroupAnnotation: map[string]string{"cyclops.atlassian.com/disable-annotation-management": "false"}, + expectedResult: true, + description: "When annotation value is 'false', annotations should be managed", + }, + { + name: "annotation value true - opt-out disabled", + nodeGroupAnnotation: map[string]string{"cyclops.atlassian.com/disable-annotation-management": "true"}, + expectedResult: false, + description: "When annotation value is 'true', annotations should NOT be managed (opt-out)", + }, + { + name: "annotation value disabled - opt-out disabled", + nodeGroupAnnotation: map[string]string{"cyclops.atlassian.com/disable-annotation-management": "disabled"}, + expectedResult: true, // "disabled" is not recognized, defaults to enabled + description: "When annotation value is 'disabled' (not recognized), defaults to enabled", + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + // Create a CNR + cnr := &v1.CycleNodeRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cnr", + Namespace: "kube-system", + }, + Spec: v1.CycleNodeRequestSpec{ + NodeGroupsList: []string{"ng-1"}, + }, + } + + // Create a NodeGroup with the test annotation + nodeGroup := &v1.NodeGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "ng-1", + Annotations: test.nodeGroupAnnotation, + }, + Spec: v1.NodeGroupSpec{ + NodeGroupName: "ng-1", + }, + } + + // Create fake transitioner with NodeGroup + fakeTransitioner := NewFakeTransitioner(cnr, + WithExtraKubeObject(nodeGroup), + ) + + // Test shouldManageAnnotations + result := fakeTransitioner.shouldManageAnnotations() + + assert.Equal(t, test.expectedResult, result, test.description) + }) + } +} + +// TestShouldManageAnnotationsNoNodeGroup tests behavior when NodeGroup is not found +func TestShouldManageAnnotationsNoNodeGroup(t *testing.T) { + cnr := &v1.CycleNodeRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cnr", + Namespace: "kube-system", + }, + Spec: v1.CycleNodeRequestSpec{ + NodeGroupsList: []string{"non-existent-ng"}, + }, + } + + // Create fake transitioner WITHOUT NodeGroup + fakeTransitioner := NewFakeTransitioner(cnr) + + // Should default to enabled when NodeGroup not found + result := fakeTransitioner.shouldManageAnnotations() + assert.True(t, result, "When NodeGroup is not found, should default to enabled") +} + +// TestAnnotationNotAddedWhenOptOut tests that annotations are NOT added to nodes when opt-out is enabled +func TestAnnotationNotAddedWhenOptOut(t *testing.T) { + // Create a NodeGroup with opt-out enabled + nodeGroup := &v1.NodeGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "ng-1", + Annotations: map[string]string{ + "cyclops.atlassian.com/disable-annotation-management": "true", + }, + }, + Spec: v1.NodeGroupSpec{ + NodeGroupName: "ng-1", + }, + } + + // Create nodes + nodegroup, err := mock.NewNodegroup("ng-1", 1) + assert.NoError(t, err) + + // Create CNR in ScalingUp phase with ScaleUpStarted timestamp + cnr := &v1.CycleNodeRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cnr", + Namespace: "kube-system", + }, + Spec: v1.CycleNodeRequestSpec{ + NodeGroupsList: []string{"ng-1"}, + }, + Status: v1.CycleNodeRequestStatus{ + Phase: v1.CycleNodeRequestScalingUp, + ScaleUpStarted: &metav1.Time{ + Time: time.Now().Add(-1 * time.Minute), + }, + }, + } + + // Create fake transitioner with NodeGroup and nodes + fakeTransitioner := NewFakeTransitioner(cnr, + WithKubeNodes(nodegroup), + WithCloudProviderInstances(nodegroup), + WithExtraKubeObject(nodeGroup), + ) + + // Verify opt-out is detected + assert.False(t, fakeTransitioner.shouldManageAnnotations(), "Opt-out should be detected") + + // Try to add annotation to a new node (simulating transitionScalingUp behavior) + newNodeName := nodegroup[0].Name + err = fakeTransitioner.addScaleDownDisabledAnnotation(newNodeName) + assert.NoError(t, err, "addScaleDownDisabledAnnotation should not error even when opt-out") + + // Verify annotation was NOT added to the node + node, err := fakeTransitioner.rm.RawClient.CoreV1().Nodes().Get( + context.TODO(), newNodeName, metav1.GetOptions{}, + ) + assert.NoError(t, err) + + // Check that the cluster-autoscaler annotation does NOT exist + annotationValue, exists := node.Annotations["cluster-autoscaler.kubernetes.io/scale-down-disabled"] + assert.False(t, exists, "Annotation should NOT exist when opt-out is enabled") + assert.Empty(t, annotationValue, "Annotation value should be empty when opt-out is enabled") +} + +// TestAnnotationAddedWhenNotExists tests Scenario 1: Cyclops adds both annotations when cluster-autoscaler annotation doesn't exist +func TestAnnotationAddedWhenNotExists(t *testing.T) { + // Create a NodeGroup without opt-out (default enabled) + nodeGroup := &v1.NodeGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "ng-1", + Annotations: map[string]string{}, // No opt-out annotation + }, + Spec: v1.NodeGroupSpec{ + NodeGroupName: "ng-1", + }, + } + + // Create nodes + nodegroup, err := mock.NewNodegroup("ng-1", 1) + assert.NoError(t, err) + + // Create CNR + cnr := &v1.CycleNodeRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cnr", + Namespace: "kube-system", + }, + Spec: v1.CycleNodeRequestSpec{ + NodeGroupsList: []string{"ng-1"}, + }, + } + + // Create fake transitioner with NodeGroup and nodes + fakeTransitioner := NewFakeTransitioner(cnr, + WithKubeNodes(nodegroup), + WithCloudProviderInstances(nodegroup), + WithExtraKubeObject(nodeGroup), + ) + + // Verify annotation management is enabled + assert.True(t, fakeTransitioner.shouldManageAnnotations(), "Annotation management should be enabled") + + // Add annotation to a node (simulating transitionScalingUp behavior) + newNodeName := nodegroup[0].Name + err = fakeTransitioner.addScaleDownDisabledAnnotation(newNodeName) + assert.NoError(t, err, "addScaleDownDisabledAnnotation should succeed") + + // Verify both annotations were added to the node + node, err := fakeTransitioner.rm.RawClient.CoreV1().Nodes().Get( + context.TODO(), newNodeName, metav1.GetOptions{}, + ) + assert.NoError(t, err) + + // Check that the cluster-autoscaler annotation exists with value "true" + annotationValue, exists := node.Annotations["cluster-autoscaler.kubernetes.io/scale-down-disabled"] + assert.True(t, exists, "Cluster Autoscaler annotation should exist") + assert.Equal(t, "true", annotationValue, "Cluster Autoscaler annotation should have value 'true'") + + // Check that the marker annotation exists + markerValue, markerExists := node.Annotations["cyclops.atlassian.com/annotation-managed"] + assert.True(t, markerExists, "Marker annotation should exist") + assert.Equal(t, "true", markerValue, "Marker annotation should have value 'true'") +} + +// TestAnnotationPreservedWhenExists tests Scenario 2: Cyclops preserves existing annotation and doesn't add marker +func TestAnnotationPreservedWhenExists(t *testing.T) { + // Create a NodeGroup without opt-out (default enabled) + nodeGroup := &v1.NodeGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "ng-1", + Annotations: map[string]string{}, // No opt-out annotation + }, + Spec: v1.NodeGroupSpec{ + NodeGroupName: "ng-1", + }, + } + + // Create nodes + nodegroup, err := mock.NewNodegroup("ng-1", 1) + assert.NoError(t, err) + + // Create CNR + cnr := &v1.CycleNodeRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cnr", + Namespace: "kube-system", + }, + Spec: v1.CycleNodeRequestSpec{ + NodeGroupsList: []string{"ng-1"}, + }, + } + + // Create fake transitioner with NodeGroup and nodes + fakeTransitioner := NewFakeTransitioner(cnr, + WithKubeNodes(nodegroup), + WithCloudProviderInstances(nodegroup), + WithExtraKubeObject(nodeGroup), + ) + + // Manually add the cluster-autoscaler annotation to simulate ASG setting it + // We need to add it to the Kubernetes node before Cyclops tries to add it + newNodeName := nodegroup[0].Name + node, err := fakeTransitioner.rm.RawClient.CoreV1().Nodes().Get( + context.TODO(), newNodeName, metav1.GetOptions{}, + ) + assert.NoError(t, err) + + // Add pre-existing annotation to simulate ASG Launch Template + if node.Annotations == nil { + node.Annotations = make(map[string]string) + } + node.Annotations["cluster-autoscaler.kubernetes.io/scale-down-disabled"] = "true" + _, err = fakeTransitioner.rm.RawClient.CoreV1().Nodes().Update(context.TODO(), node, metav1.UpdateOptions{}) + assert.NoError(t, err, "Should be able to update node with pre-existing annotation") + + // Verify annotation management is enabled + assert.True(t, fakeTransitioner.shouldManageAnnotations(), "Annotation management should be enabled") + + // Try to add annotation to the node (should preserve existing) + err = fakeTransitioner.addScaleDownDisabledAnnotation(newNodeName) + assert.NoError(t, err, "addScaleDownDisabledAnnotation should succeed even when annotation exists") + + // Verify the existing annotation was preserved + node, err = fakeTransitioner.rm.RawClient.CoreV1().Nodes().Get( + context.TODO(), newNodeName, metav1.GetOptions{}, + ) + assert.NoError(t, err) + + // Check that the cluster-autoscaler annotation still exists with value "true" + annotationValue, exists := node.Annotations["cluster-autoscaler.kubernetes.io/scale-down-disabled"] + assert.True(t, exists, "Cluster Autoscaler annotation should still exist") + assert.Equal(t, "true", annotationValue, "Cluster Autoscaler annotation should have value 'true'") + + // Check that the marker annotation was NOT added (Cyclops didn't add the annotation, so no marker) + markerValue, markerExists := node.Annotations["cyclops.atlassian.com/annotation-managed"] + assert.False(t, markerExists, "Marker annotation should NOT exist when Cyclops preserves existing annotation") + assert.Empty(t, markerValue, "Marker annotation value should be empty") +} diff --git a/pkg/k8s/node.go b/pkg/k8s/node.go index 83ec2eb..bfa56fd 100644 --- a/pkg/k8s/node.go +++ b/pkg/k8s/node.go @@ -74,6 +74,28 @@ func AddAnnotationToNode(nodeName string, annotationName string, annotationValue return PatchNode(nodeName, patches, client) } +// RemoveAnnotationFromNode performs a patch operation on a node to remove an annotation from the node +func RemoveAnnotationFromNode(nodeName string, annotationName string, client kubernetes.Interface) error { + patches := []Patch{ + { + Op: "remove", + Path: fmt.Sprintf("/metadata/annotations/%s", strings.Replace(annotationName, "/", "~1", -1)), + }, + } + return PatchNode(nodeName, patches, client) +} + +// RemoveLabelFromNode performs a patch operation on a node to remove a label from the node +func RemoveLabelFromNode(nodeName string, labelName string, client kubernetes.Interface) error { + patches := []Patch{ + { + Op: "remove", + Path: fmt.Sprintf("/metadata/labels/%s", strings.Replace(labelName, "/", "~1", -1)), + }, + } + return PatchNode(nodeName, patches, client) +} + // AddFinalizerToNode updates a node to add a finalizer to it func AddFinalizerToNode(node *v1.Node, finalizerName string, client kubernetes.Interface) error { controllerutil.AddFinalizer(node, finalizerName) diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 7557cc7..61632ce 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -64,6 +64,75 @@ var ( ) ) +var ( + // AnnotationAddSuccess tracks successful annotation additions + // Note: Using only nodegroup label to keep cardinality low (consistent with existing Cyclops metrics) + AnnotationAddSuccess = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: fmt.Sprintf("%v_annotation_add_success_total", namespace), + Help: "Total number of successful cluster-autoscaler scale-down-disabled annotation additions", + }, + []string{"nodegroup"}, + ) + + // AnnotationAddFailure tracks failed annotation additions + AnnotationAddFailure = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: fmt.Sprintf("%v_annotation_add_failure_total", namespace), + Help: "Total number of failed cluster-autoscaler scale-down-disabled annotation additions", + }, + []string{"nodegroup", "error_type"}, + ) + + // AnnotationRemoveSuccess tracks successful annotation removals + AnnotationRemoveSuccess = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: fmt.Sprintf("%v_annotation_remove_success_total", namespace), + Help: "Total number of successful cluster-autoscaler scale-down-disabled annotation removals", + }, + []string{"nodegroup"}, + ) + + // AnnotationRemoveFailure tracks failed annotation removals + AnnotationRemoveFailure = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: fmt.Sprintf("%v_annotation_remove_failure_total", namespace), + Help: "Total number of failed cluster-autoscaler scale-down-disabled annotation removals", + }, + []string{"nodegroup", "error_type"}, + ) + + // NodesWithAnnotation tracks current nodes with the annotation + // Note: Includes node_name here because we need to track which specific nodes currently have the annotation + // Cardinality is bounded by the number of nodes with annotations (typically small) + NodesWithAnnotation = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: fmt.Sprintf("%v_nodes_with_scale_down_disabled_annotation", namespace), + Help: "Current number of nodes with cluster-autoscaler scale-down-disabled annotation (1 = has annotation, 0 = does not have annotation)", + }, + []string{"nodegroup", "node_name"}, + ) + + // AnnotationCleanupAttempts tracks cleanup operation attempts + AnnotationCleanupAttempts = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: fmt.Sprintf("%v_annotation_cleanup_attempts_total", namespace), + Help: "Total number of annotation cleanup attempts", + }, + []string{"nodegroup", "result"}, // result: "success", "partial_failure", "failure" + ) + + // AnnotationCleanupDuration tracks cleanup operation duration + AnnotationCleanupDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: fmt.Sprintf("%v_annotation_cleanup_duration_seconds", namespace), + Help: "Duration of annotation cleanup operations in seconds", + Buckets: prometheus.ExponentialBuckets(0.1, 2, 10), // 0.1s to ~51s + }, + []string{"nodegroup"}, + ) +) + // Register registers the custom metrics with prometheus func Register(client client.Client, logger logr.Logger, namespace string) { collector := cyclopsCollector{ @@ -76,6 +145,17 @@ func Register(client client.Client, logger logr.Logger, namespace string) { } metrics.Registry.MustRegister(collector) + // Register annotation metrics + metrics.Registry.MustRegister( + AnnotationAddSuccess, + AnnotationAddFailure, + AnnotationRemoveSuccess, + AnnotationRemoveFailure, + NodesWithAnnotation, + AnnotationCleanupAttempts, + AnnotationCleanupDuration, + ) + go func() { for { collector.fetch()