From e2f20b86732d5ac024b64c4d47ff214f977f6557 Mon Sep 17 00:00:00 2001 From: Josh Clifford <37558619+jsclifford@users.noreply.github.com> Date: Thu, 27 Feb 2025 13:56:02 -0700 Subject: [PATCH] SRE-671 - Fixing restart pods and added more options to set autoscale after scaling down. --- .../im-run-aks-scale-cluster.yml | 170 +++++++++++++----- 1 file changed, 122 insertions(+), 48 deletions(-) diff --git a/workflow-templates/im-run-aks-scale-cluster.yml b/workflow-templates/im-run-aks-scale-cluster.yml index f80c299..21afe16 100644 --- a/workflow-templates/im-run-aks-scale-cluster.yml +++ b/workflow-templates/im-run-aks-scale-cluster.yml @@ -1,4 +1,4 @@ -# Workflow Code: ElatedAnoconda_v3 DO NOT REMOVE +# Workflow Code: ElatedAnoconda_v4 DO NOT REMOVE # Purpose: # Scales AKS Cluster Node Pools down to a specified minimum after work hours # to save costs. This workflow will switch node pool scaling to manual then @@ -50,6 +50,9 @@ permissions: env: TIMEZONE: 'America/Denver' + SCALE_UP_RESTART_FAILED_PODS_RETRY_COUNT: 5 + # Recommended to be 60 seconds or more + SCALE_UP_RESTART_FAILED_PODS_WAIT_TIME_SECONDS: 60 # TODO: Set wait time for Restarting pods. # The environments that are scheduled to scale up and down. SCHEDULED_ENVIRONMENTS: '["dev","dev-secondary","prod","prod-secondary"]' # TODO: Remove any environments you don't want scaling on a schedule # TODO: Update the scaling schedule start and end times to desired times. @@ -220,6 +223,7 @@ jobs: # TODO: Update the NODE_POOL_SCALING variable to match the node pools in the AKS cluster. This is where you set what the workflow will scale to. # The manualMin is what the workflow will scale down to when run. The autoScaleMin and autoScaleMax are the # values that the workflow will set the node pool to when set to autoscale on the node pool. + # enableAutoScaleAfterScaling is used after scaling down to set the node pool to autoscale. # List of Node Pools and their autoscale min/max values. 
NODE_POOL_SCALING@dev: | [ @@ -227,13 +231,15 @@ jobs: "name": "default-or-system-nodepool-name-here", "autoScaleMin": 1, "autoScaleMax": 3, - "manualMin": 2 + "manualMin": 2, + "enableAutoScaleAfterScaling": false }, { "name": "user-nodepool-name-here", "autoScaleMin": 0, "autoScaleMax": 10, - "manualMin": 0 + "manualMin": 0, + "enableAutoScaleAfterScaling": false } ] @@ -288,6 +294,37 @@ jobs: | Scale Action | `${{ matrix.scaleAction }}` | | Current Scale Type | `${{ env.CURRENT_SCALE_TYPE }}` | | New Scale Type | `${{ env.NEW_SCALE_TYPE }}` |' >> $GITHUB_STEP_SUMMARY + + - name: Setup Kubectl + uses: azure/setup-kubectl@v4 + if: ${{ matrix.scaleAction == 'up' }} + with: + version: latest + + - name: Setup Kubelogin + uses: azure/use-kubelogin@v1 + if: ${{ matrix.scaleAction == 'up' }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + kubelogin-version: 'latest' + + - name: Setup kubectl config + if: ${{ matrix.scaleAction == 'up' }} + id: kube-config + run: | + base_path=$(pwd) + kube_config_file_path="$base_path/.kube/config-sp" + export KUBECONFIG=$kube_config_file_path + az aks get-credentials --name ${{ env.CLUSTER_NAME }} --resource-group ${{ env.TARGET_RESOURCE_GROUP }} --format exec --overwrite-existing --public-fqdn --file $kube_config_file_path + kubelogin convert-kubeconfig --login azurecli --kubeconfig $kube_config_file_path + + echo "kube-config-file=$kube_config_file_path" >> $GITHUB_OUTPUT + + - name: Pre-Scaling Commands # TODO: add any commands you want to run before scaling + run: | + echo "Running Pre-Scaling Commands" + echo "No Commands to run." - name: Set Scaling to ${{ env.NEW_SCALE_TYPE }} - Scale ${{ matrix.scaleAction }} uses: actions/github-script@v7 @@ -345,37 +382,16 @@ jobs: }else{ core.info(`Node Pool ${pool.name} is already at the minimum scale of ${pool.manualMin}. 
Skipping executing scale down command.`); } + + if(pool.enableAutoScaleAfterScaling && !poolStatus.enableAutoScaling){ + core.info(`Setting Node Pool ${pool.name} to autoscale between ${pool.autoScaleMin} and ${pool.autoScaleMax} after scaling down`); + execSync(`az aks nodepool update --resource-group ${{ env.TARGET_RESOURCE_GROUP }} --cluster-name ${{ env.CLUSTER_NAME }} --name ${pool.name} --enable-cluster-autoscaler --min-count ${pool.autoScaleMin} --max-count ${pool.autoScaleMax}`, { stdio: 'inherit' }); + } }catch(error){ core.setFailed(`Error Scaling Down Cluster Pool ${pool.name}: ${error}`); } }); } - - - name: Setup Kubectl - uses: azure/setup-kubectl@v4 - if: ${{ matrix.scaleAction == 'up' }} - with: - version: latest - - - name: Setup Kubelogin - uses: azure/use-kubelogin@v1 - if: ${{ matrix.scaleAction == 'up' }} - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - kubelogin-version: 'latest' - - - name: Setup kubectl config - if: ${{ matrix.scaleAction == 'up' }} - id: kube-config - run: | - base_path=$(pwd) - kube_config_file_path="$base_path/.kube/config-sp" - export KUBECONFIG=$kube_config_file_path - az aks get-credentials --name ${{ env.CLUSTER_NAME }} --resource-group ${{ env.TARGET_RESOURCE_GROUP }} --format exec --overwrite-existing --public-fqdn --file $kube_config_file_path - kubelogin convert-kubeconfig --login azurecli --kubeconfig $kube_config_file_path - - echo "kube-config-file=$kube_config_file_path" >> $GITHUB_OUTPUT - name: Restart Pods in Error or Pending State after Scale Up if: ${{ matrix.scaleAction == 'up' }} @@ -383,28 +399,86 @@ jobs: with: script: | const { execSync } = require('child_process'); - try { - const pods = execSync(`kubectl get pods --all-namespaces -o json --kubeconfig ${{ steps.kube-config.outputs.kube-config-file }}`, { encoding: 'utf-8', maxBuffer: 1024 * 1024 * 10 }); - const podsJson = JSON.parse(pods); - const podStatusToRestart = ['Pending', 'Failed', "Unknown"]; - const podContainerStatusToRestart 
= ['CrashLoopBackOff', 'Error', 'ImagePullBackOff']; - - const podsToRestart = podsJson.items.filter(pod => - podStatusToRestart.includes(pod.status.phase) || - podContainerStatusToRestart.some( - containerStatus => pod.status.containerStatuses.some(container => - !container.started && container.state.waiting && container.state.waiting.reason === containerStatus + const podStatusToRestart = ['Pending', 'Failed', "Unknown"]; + const podContainerStatusToRestart = ['CrashLoopBackOff', 'Error', 'ImagePullBackOff']; + const excludedNamespaces = ['default','kube-node-lease','kube-public', 'kube-system','calico-system','gatekeeper-system','tigera-operator']; + const kubeConfigFile = '${{ steps.kube-config.outputs.kube-config-file }}'; + const maxRetries = ${{ env.SCALE_UP_RESTART_FAILED_PODS_RETRY_COUNT }}; + const waitTime = ${{ env.SCALE_UP_RESTART_FAILED_PODS_WAIT_TIME_SECONDS }}; + + function sleep(seconds) { + core.info(`Waiting for ${seconds} seconds before checking the pods status again...`); + execSync(`sleep ${seconds}`, { stdio: 'inherit' }); + } + + function getPodsToRestart(retryCount, kubeConfigFile) { + try { + const pods = execSync(`kubectl get pods --all-namespaces -o json --kubeconfig ${kubeConfigFile}`, { encoding: 'utf-8', maxBuffer: 1024 * 1024 * 10 }); + const podsJson = JSON.parse(pods); + + let podsToRestart = podsJson.items.filter(pod => + !excludedNamespaces.includes(pod.metadata.namespace) && + (podStatusToRestart.includes(pod.status.phase) || + (pod.status.containerStatuses ?? []).some(container => + container.state.waiting && podContainerStatusToRestart.includes(container.state.waiting.reason) + ) + )
{ - core.setFailed(`Error restarting pods: ${error}`); + ); + if(retryCount > 2 && podsToRestart.length > 0){ + core.info('Filtering out pods in pending state.'); + podsToRestart = podsToRestart.filter(pod => pod.status.phase != 'Pending' && !pod.metadata.name.includes('node-exporter')); + } + return podsToRestart; + } catch (error) { + core.setFailed(`Error getting pods in getPodsToRestart function: ${error}`); return []; + } + } + + function restartPods(podsToRestart,kubeConfigFile) { + try { + core.info(`Pods to Restart: ${podsToRestart.map(pod => `${pod.metadata.namespace}/${pod.metadata.name}`).join(', ')}`); + podsToRestart.forEach(pod => { + execSync(`kubectl delete pod ${pod.metadata.name} -n ${pod.metadata.namespace} --kubeconfig ${kubeConfigFile}`, { stdio: 'inherit' }); + }); + } catch (error) { + core.setFailed(`Error restarting pods in restartPods function: ${error}`); + } + } + let podsToRestart = getPodsToRestart(0,kubeConfigFile); + if(podsToRestart.length > 0){ + sleep(120); + } + let retryCount = 1; + if (podsToRestart.length > 0) { + while (retryCount < maxRetries && podsToRestart.length > 0) { + const waitTimeMultiplier = retryCount > 0 ? retryCount * waitTime : waitTime; + if(retryCount > 1){ + podsToRestart = getPodsToRestart(retryCount, kubeConfigFile); + } + core.info(`Found ${podsToRestart.length} pods in error or pending state. Restarting pods... 
Retry count: ${retryCount}`); + restartPods(podsToRestart,kubeConfigFile); + + sleep(waitTimeMultiplier); + + podsToRestart = getPodsToRestart(retryCount, kubeConfigFile); + if(podsToRestart.length > 0 && retryCount < maxRetries -1){ + core.info(`Checking pods again...`); + }else{ + core.info(`No more pods to restart or reached maximum retry count of ${maxRetries}.`); + } + retryCount++; + } + } else { + core.info('No pods in error or pending state found.'); + core.info(`Restarted Pods Retry Count: ${retryCount}`); + } + + - name: Post-Scaling Commands # TODO: Run any commands you want to run after scaling + run: | + echo "Running Post-Scaling Commands" + echo "No Commands to run." + - name: Azure logout if: always() run: |