Merged
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,4 +1,4 @@
-FROM --platform=$BUILDPLATFORM golang:1.23 as builder
+FROM --platform=$BUILDPLATFORM golang:1.24 as builder
ARG TARGETPLATFORM
ARG ENVVAR=CGO_ENABLED=0
WORKDIR /go/src/github.com/atlassian-labs/cyclops
2 changes: 1 addition & 1 deletion Makefile
@@ -1,4 +1,4 @@
-VERSION = 1.10.2
+VERSION = 1.10.3
# IMPORTANT! Update api version if a new release affects cnr
API_VERSION = 1.0.0
IMAGE = cyclops:$(VERSION)
19 changes: 19 additions & 0 deletions pkg/controller/cyclenoderequest/transitioner/transitioner.go
@@ -17,6 +17,25 @@ type transitionFunc func() (reconcile.Result, error)
// different request types.
const cycleNodeLabel = "cyclops.atlassian.com/terminate"

// cyclopsManagedAnnotation marks nodes where Cyclops added the scale-down-disabled annotation.
// Used to track which annotations we added vs pre-existing ones (for safe cleanup).
// Only remove the cluster-autoscaler annotation if this marker annotation also exists.
const cyclopsManagedAnnotation = "cyclops.atlassian.com/annotation-managed"

// clusterAutoscalerScaleDownDisabledAnnotation is the annotation key used to prevent
// Cluster Autoscaler from scaling down a node. This is used to protect new nodes
// during the cycling process from being removed by Cluster Autoscaler before
// the corresponding old nodes are fully terminated.
// See: https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#how-can-i-prevent-cluster-autoscaler-from-scaling-down-a-particular-node
const clusterAutoscalerScaleDownDisabledAnnotation = "cluster-autoscaler.kubernetes.io/scale-down-disabled"
Member:
could we generalise this behaviour and give the annotation via the CNR instead?

Contributor Author:
A few reasons why I didn't:

The annotation is tied to a node's lifecycle, not a CNR's lifecycle. If annotations were in the CNR spec, deleting a CNR before cleanup could leave stale annotations.

The autoscaler reads annotations from the node object, not the CNR. If we put them in the CNR spec, we would have to sync them to nodes.

CNRs can affect overlapping nodegroups; having annotations in the CNR spec can create race conditions and ownership issues.

Member:
> If we put them in the CNR spec, we would have to sync them to nodes.

This is exactly what I'm suggesting. We are already doing this in the code; I am just proposing that the annotation(s) get defined in the CNR rather than hardcoded.

> CNRs can affect overlapping nodegroups

Each CNR is generated by the observer for a single nodegroup. If a CNR affected nodes across nodegroups, that would be a misconfiguration somewhere.

Collaborator:

I don't think we need to be generic here; the annotations are very specific to cluster-autoscaler. If we need another label, we probably want a code change, because it will need to be used under specific conditions.

Collaborator:

What this comment has caused me to re-evaluate is that we should add a field on the CNR to enable/disable this behaviour and default it to true. That way if any users of the software have an issue with this particular annotation/workflow, they can disable the feature and keep their cluster working.

Member:

If we have the annotation defined in the CNR then that achieves it. It wouldn't be enabled by default, because the annotations need to be added for the behaviour to take effect. I'd argue this feature is just about adding annotations to new instances and then removing them later. We are doing it for cluster-autoscaler, of course, but it can apply more widely.

Contributor Author (@dnorman3, Jan 29, 2026):

I added the option to opt out of it in the nodegroup (not the CNR); see 67d6d0f#commitcomment-175885640 for the reasoning.

Collaborator:

I like putting it on the nodegroup; it feels like something that should be associated with a base nodegroup config.

const clusterAutoscalerScaleDownDisabledValue = "true"

// nodeGroupAnnotationKey is the annotation key on NodeGroup resources that controls whether
// Cluster Autoscaler annotation management is enabled or disabled.
// Value: "true" → opt-out (disable annotation management)
// Value: "false" or missing/empty → default enabled (annotation management enabled)
const nodeGroupAnnotationKey = "cyclops.atlassian.com/disable-annotation-management"

var (
	transitionDuration = 10 * time.Second
	requeueDuration    = 30 * time.Second
37 changes: 37 additions & 0 deletions pkg/controller/cyclenoderequest/transitioner/transitions.go
@@ -408,6 +408,27 @@ func (t *CycleNodeRequestTransitioner) transitionScalingUp() (reconcile.Result,
}
}

	// Add scale-down-disabled annotation to new nodes if enabled
	if t.shouldManageAnnotations() {
		newNodes := findNodesCreatedAfter(kubeNodes, scaleUpStarted.Time)
		for _, newNode := range newNodes {
			if err := t.addScaleDownDisabledAnnotation(newNode.Name); err != nil {
				// Log warning but don't fail - annotation is best-effort and shouldn't block cycling
				t.rm.Logger.Info("Failed to add scale-down-disabled annotation to new node",
					"nodeName", newNode.Name,
					"error", err)
				t.rm.LogEvent(t.cycleNodeRequest, "AnnotationWarning",
					"Failed to add scale-down-disabled annotation to node %s: %v", newNode.Name, err)
			} else {
				t.rm.Logger.Info("Added scale-down-disabled annotation to new node",
					"nodeName", newNode.Name)
			}
		}
	} else {
		t.rm.Logger.Info("Node annotation management disabled for this CNR",
			"cnr", t.cycleNodeRequest.Name)
	}

	t.rm.LogEvent(t.cycleNodeRequest, "ScalingUpCompleted", "New nodes are now ready")
	return t.transitionObject(v1.CycleNodeRequestCordoningNode)
}
@@ -572,6 +593,14 @@ func (t *CycleNodeRequestTransitioner) transitionHealing() (reconcile.Result, error)
}
}

	// Cleanup scale-down-disabled annotations if enabled
	if t.shouldManageAnnotations() {
		t.cleanupScaleDownDisabledAnnotations()
	} else {
		t.rm.Logger.Info("Skipping annotation cleanup (disabled)",
			"cnr", t.cycleNodeRequest.Name)
	}

	return t.transitionToFailed(nil)
}

@@ -599,6 +628,14 @@ func (t *CycleNodeRequestTransitioner) transitionSuccessful() (reconcile.Result, error)
return reconcile.Result{Requeue: true, RequeueAfter: transitionDuration}, nil
}

	// Cleanup scale-down-disabled annotations if enabled
	if t.shouldManageAnnotations() {
		t.cleanupScaleDownDisabledAnnotations()
	} else {
		t.rm.Logger.Info("Skipping annotation cleanup (disabled)",
			"cnr", t.cycleNodeRequest.Name)
	}

	// Delete failed sibling CNRs regardless of whether the CNR for the
	// transitioner should be deleted. If failed CNRs pile up that will prevent
	// Cyclops observer from auto-generating new CNRs for a nodegroup.
254 changes: 254 additions & 0 deletions pkg/controller/cyclenoderequest/transitioner/util.go
@@ -18,6 +18,8 @@ import (

	v1 "github.com/atlassian-labs/cyclops/pkg/apis/atlassian/v1"
	"github.com/atlassian-labs/cyclops/pkg/cloudprovider"
	"github.com/atlassian-labs/cyclops/pkg/k8s"
	"github.com/atlassian-labs/cyclops/pkg/metrics"
)

// transitionToHealing transitions the current cycleNodeRequest to healing, which will always transition to failed
@@ -589,3 +591,255 @@ func countNodesCreatedAfter(nodes map[string]corev1.Node, cutoffTime time.Time)
	}
	return count
}

// findNodesCreatedAfter returns all nodes in the given map that were created after the specified time
func findNodesCreatedAfter(nodes map[string]corev1.Node, cutoffTime time.Time) []corev1.Node {
	var result []corev1.Node
	for _, node := range nodes {
		if node.CreationTimestamp.Time.After(cutoffTime) {
			result = append(result, node)
		}
	}
	return result
}

// getNodegroupForMetrics returns a nodegroup identifier for metrics labeling.
// Since a CycleNodeRequest can have multiple nodegroups, we use the first one or "unknown" if none.
func (t *CycleNodeRequestTransitioner) getNodegroupForMetrics() string {
	nodeGroups := t.cycleNodeRequest.GetNodeGroupNames()
	if len(nodeGroups) > 0 {
		return nodeGroups[0]
	}
	return "unknown"
}

// getNodeGroup finds the NodeGroup resource that matches this CNR.
func (t *CycleNodeRequestTransitioner) getNodeGroup() (*v1.NodeGroup, error) {
	var nodeGroupList v1.NodeGroupList
	if err := t.rm.Client.List(context.TODO(), &nodeGroupList); err != nil {
		return nil, err
	}

	for i := range nodeGroupList.Items {
		ng := &nodeGroupList.Items[i]
		if t.cycleNodeRequest.IsFromNodeGroup(*ng) {
			return ng, nil
		}
	}

	return nil, nil
}

// shouldManageAnnotations checks if Cyclops should manage cluster-autoscaler annotations on nodes.
// Returns true if annotations should be added/removed (default behavior).
// Returns false if opt-out is enabled via NodeGroup annotation.
//
// Opt-out mechanism:
//   - NodeGroup annotation: cyclops.atlassian.com/disable-annotation-management
//   - Value "true" → opt-out (do NOT add/remove annotations)
//   - Value "false" or missing → default enabled (add/remove annotations)
//
// Default behavior (when NodeGroup not found or annotation missing): enabled (return true)
func (t *CycleNodeRequestTransitioner) shouldManageAnnotations() bool {
	nodeGroup, err := t.getNodeGroup()
	if err != nil {
		t.rm.Logger.Info("Failed to get NodeGroup, defaulting to annotation management enabled",
			"cnr", t.cycleNodeRequest.Name,
			"error", err)
		return true
	}
	if nodeGroup == nil {
		t.rm.Logger.Info("NodeGroup not found for CNR, defaulting to annotation management enabled",
			"cnr", t.cycleNodeRequest.Name,
			"nodeGroupNames", t.cycleNodeRequest.GetNodeGroupNames())
		return true
	}

	annotationValue := nodeGroup.Annotations[nodeGroupAnnotationKey]
	t.rm.Logger.Info("Checking NodeGroup annotation for opt-out",
		"cnr", t.cycleNodeRequest.Name,
		"nodeGroup", nodeGroup.Name,
		"annotation", nodeGroupAnnotationKey,
		"value", annotationValue)
	if annotationValue == "true" {
		t.rm.Logger.Info("Node annotations disabled via NodeGroup annotation",
			"nodeGroup", nodeGroup.Name,
			"annotation", nodeGroupAnnotationKey)
		return false
	}

	return true
}
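The opt-out semantics here are narrow: only the literal string "true" disables management, while "false", an empty value, a missing key, or a missing NodeGroup all leave it enabled. A minimal, self-contained sketch of that decision, with the real client plumbing and logging stripped out (the `shouldManage` helper is illustrative, not from the PR):

```go
package main

import "fmt"

const nodeGroupAnnotationKey = "cyclops.atlassian.com/disable-annotation-management"

// shouldManage mirrors the opt-out check: only the exact value "true"
// disables annotation management; anything else keeps it enabled.
func shouldManage(annotations map[string]string) bool {
	return annotations[nodeGroupAnnotationKey] != "true"
}

func main() {
	fmt.Println(shouldManage(map[string]string{nodeGroupAnnotationKey: "true"}))  // false (opted out)
	fmt.Println(shouldManage(map[string]string{nodeGroupAnnotationKey: "false"})) // true
	fmt.Println(shouldManage(nil))                                                // true (default enabled)
}
```

Indexing a nil map in Go returns the zero value, so the missing-annotation and missing-NodeGroup cases fall through to the enabled default without a separate branch.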

// addScaleDownDisabledAnnotation adds the cluster-autoscaler scale-down-disabled annotation
// to a node to prevent Cluster Autoscaler from removing it during cycling.
// This is a best-effort operation and failures should not block the cycling process.
// Note: This function should only be called when shouldManageAnnotations() returns true.
// If the annotation already exists, it is preserved (not overwritten) for backward compatibility.
func (t *CycleNodeRequestTransitioner) addScaleDownDisabledAnnotation(nodeName string) error {
	// Defensive check: ensure annotation management is enabled
	if !t.shouldManageAnnotations() {
		t.rm.Logger.Info("Skipping annotation addition - management disabled",
			"nodeName", nodeName,
			"cnr", t.cycleNodeRequest.Name)
		return nil
	}

	// Check if annotation already exists - preserve existing value for backward compatibility
	node, err := t.rm.RawClient.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
	if err == nil && node.Annotations != nil {
		if existingValue, exists := node.Annotations[clusterAutoscalerScaleDownDisabledAnnotation]; exists {
			t.rm.Logger.Info("Annotation already exists on node, preserving existing value",
				"nodeName", nodeName,
				"existingValue", existingValue,
				"cnr", t.cycleNodeRequest.Name)
			return nil
		}
	}

	nodegroup := t.getNodegroupForMetrics()

	err = k8s.AddAnnotationToNode(
		nodeName,
		clusterAutoscalerScaleDownDisabledAnnotation,
		clusterAutoscalerScaleDownDisabledValue,
		t.rm.RawClient,
	)

	if err != nil {
		// Extract error type for better categorization
		errorType := "unknown"
		errStr := err.Error()
		if strings.Contains(errStr, "not found") {
			errorType = "node_not_found"
		} else if strings.Contains(errStr, "forbidden") || strings.Contains(errStr, "unauthorized") {
			errorType = "permission_denied"
		} else if strings.Contains(errStr, "conflict") {
			errorType = "conflict"
		}

		metrics.AnnotationAddFailure.WithLabelValues(nodegroup, errorType).Inc()
		return err
	}

	// Add marker annotation to track that we added the cluster-autoscaler annotation
	// This ensures we only remove annotations we added, not pre-existing ones
	if markerErr := k8s.AddAnnotationToNode(
		nodeName,
		cyclopsManagedAnnotation,
		"true",
		t.rm.RawClient,
	); markerErr != nil {
		// Log but don't fail - marker is for tracking, not critical
		t.rm.Logger.Info("Failed to add managed annotation marker to node",
			"nodeName", nodeName,
			"error", markerErr)
	} else {
		t.rm.Logger.Info("Added managed annotation marker to node",
			"nodeName", nodeName,
			"cnr", t.cycleNodeRequest.Name)
	}

	metrics.AnnotationAddSuccess.WithLabelValues(nodegroup).Inc()
	metrics.NodesWithAnnotation.WithLabelValues(nodegroup, nodeName).Set(1)

	return nil
}

// removeScaleDownDisabledAnnotation removes the cluster-autoscaler scale-down-disabled annotation
// from a node to allow Cluster Autoscaler to consider it for scale-down again.
// This is a best-effort operation and failures should not block the cycling process.
// Uses retry logic to handle transient API errors.
func (t *CycleNodeRequestTransitioner) removeScaleDownDisabledAnnotation(nodeName string) error {
	nodegroup := t.getNodegroupForMetrics()

	err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
		return k8s.RemoveAnnotationFromNode(
			nodeName,
			clusterAutoscalerScaleDownDisabledAnnotation,
			t.rm.RawClient,
		)
	})

	if err != nil {
		// Extract error type for better categorization
		errorType := "unknown"
		errStr := err.Error()
		if strings.Contains(errStr, "not found") {
			errorType = "node_not_found"
		} else if strings.Contains(errStr, "forbidden") || strings.Contains(errStr, "unauthorized") {
			errorType = "permission_denied"
		} else if strings.Contains(errStr, "conflict") {
			errorType = "conflict"
		}

		metrics.AnnotationRemoveFailure.WithLabelValues(nodegroup, errorType).Inc()
		return err
	}

	// Remove the marker annotation since we're removing the cluster-autoscaler annotation
	if markerErr := k8s.RemoveAnnotationFromNode(
		nodeName,
		cyclopsManagedAnnotation,
		t.rm.RawClient,
	); markerErr != nil {
		// Log but don't fail - marker cleanup is best-effort
		t.rm.Logger.Info("Failed to remove managed annotation marker from node",
			"nodeName", nodeName,
			"error", markerErr)
	}

	metrics.AnnotationRemoveSuccess.WithLabelValues(nodegroup).Inc()
	metrics.NodesWithAnnotation.WithLabelValues(nodegroup, nodeName).Set(0)

	return nil
}

// cleanupScaleDownDisabledAnnotations removes the cluster-autoscaler scale-down-disabled annotation
// from nodes that were created during the cycle. This is called during both Healing and Successful
// phases to ensure annotations are cleaned up regardless of how the cycle completes.
// This is a best-effort operation and failures should not block the transition.
// Uses listNodes instead of listReadyNodes to ensure cleanup happens even if nodes are not Ready yet,
// which is important when cycling fails in the Healing phase.
func (t *CycleNodeRequestTransitioner) cleanupScaleDownDisabledAnnotations() {
	if t.cycleNodeRequest.Status.ScaleUpStarted == nil {
		return
	}

	kubeNodes, err := t.listNodes(true)
	if err != nil {
		return
	}

	phase := string(t.cycleNodeRequest.Status.Phase)
	nodesCreatedDuringCycle := findNodesCreatedAfter(kubeNodes, t.cycleNodeRequest.Status.ScaleUpStarted.Time)
	for _, newNode := range nodesCreatedDuringCycle {
		if newNode.Annotations != nil {
			if val, exists := newNode.Annotations[clusterAutoscalerScaleDownDisabledAnnotation]; exists && val == clusterAutoscalerScaleDownDisabledValue {
				// Only remove annotations we added (marked with cyclopsManagedAnnotation).
				// This ensures we don't remove pre-existing annotations that may have been set by
				// ASG Launch Template user data, cloud-init, or other bootstrap scripts.
				if newNode.Annotations[cyclopsManagedAnnotation] != "true" {
					t.rm.Logger.Info("Skipping annotation removal - annotation was not added by Cyclops",
						"phase", phase,
						"nodeName", newNode.Name)
					continue
				}

				if err := t.removeScaleDownDisabledAnnotation(newNode.Name); err != nil {
					// Log warning but don't fail - annotation removal is best-effort
					t.rm.Logger.Info("Failed to remove scale-down-disabled annotation from node",
						"phase", phase,
						"nodeName", newNode.Name,
						"error", err)
					t.rm.LogEvent(t.cycleNodeRequest, "AnnotationCleanupWarning",
						"Failed to remove scale-down-disabled annotation from node %s during %s: %v", newNode.Name, phase, err)
				} else {
					t.rm.Logger.Info("Removed scale-down-disabled annotation from node",
						"phase", phase,
						"nodeName", newNode.Name)
				}
			}
		}
	}
}
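The ownership guard in the cleanup loop reduces to a small predicate: the autoscaler annotation is only removed when it is set to "true" and the node also carries the Cyclops marker proving Cyclops added it. A self-contained sketch (the `shouldRemove` helper is illustrative, not part of the PR):

```go
package main

import "fmt"

const (
	scaleDownDisabled = "cluster-autoscaler.kubernetes.io/scale-down-disabled"
	cyclopsManaged    = "cyclops.atlassian.com/annotation-managed"
)

// shouldRemove mirrors the cleanup guard: remove the autoscaler annotation
// only when it is "true" AND the Cyclops marker shows we added it ourselves.
func shouldRemove(annotations map[string]string) bool {
	return annotations[scaleDownDisabled] == "true" &&
		annotations[cyclopsManaged] == "true"
}

func main() {
	// Pre-existing annotation (e.g. set by ASG user data) without the marker: preserved.
	fmt.Println(shouldRemove(map[string]string{scaleDownDisabled: "true"})) // false
	// Annotation added by Cyclops, marker present: safe to remove.
	fmt.Println(shouldRemove(map[string]string{
		scaleDownDisabled: "true",
		cyclopsManaged:    "true",
	})) // true
}
```

This is why the marker annotation exists at all: without it, cleanup could strip a scale-down-disabled annotation that an operator or bootstrap script had set deliberately.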