From 35bfecad1a15346090457f5eb317ddde1bd809c3 Mon Sep 17 00:00:00 2001
From: Farshad Ghodsian <47931571+farshadghodsian@users.noreply.github.com>
Date: Fri, 11 Apr 2025 13:34:30 -0400
Subject: [PATCH] Added steps to install slinky on K8s and example training workload
---
 slinky-example/Readme.md                  | 167 ++++
 slinky-example/test.py                    |  12 +
 slinky-example/train_fashion_mnist.py     | 120 +++++
 slinky-example/train_mnist_distributed.py | 117 +++++
 slinky-example/values-operator.yaml       | 162 ++++++
 slinky-example/values-slurm.yaml          | 607 ++++++++++++++++++++++
 6 files changed, 1185 insertions(+)
 create mode 100644 slinky-example/Readme.md
 create mode 100644 slinky-example/test.py
 create mode 100644 slinky-example/train_fashion_mnist.py
 create mode 100644 slinky-example/train_mnist_distributed.py
 create mode 100644 slinky-example/values-operator.yaml
 create mode 100644 slinky-example/values-slurm.yaml

diff --git a/slinky-example/Readme.md b/slinky-example/Readme.md
new file mode 100644
index 0000000..0018ecd
--- /dev/null
+++ b/slinky-example/Readme.md
@@ -0,0 +1,167 @@
# Example Slinky Training Workload on Kubernetes

The following outlines the steps to get up and running with Slinky on Kubernetes and to run a simple image classification training workload that verifies the GPUs are accessible.

## Clone this repo and go into the slinky-example folder

```bash
git clone https://github.com/amd/ada.git
cd ada/slinky-example
```

## Installing Slinky Prerequisites

The following steps for installing the prerequisites and installing Slinky are taken from the SlinkyProject/slurm-operator repo's [quick-start guide](https://github.com/SlinkyProject/slurm-operator/blob/main/docs/quickstart.md).

```bash
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo add metrics-server https://kubernetes-sigs.github.io/metrics-server/
helm repo add bitnami https://charts.bitnami.com/bitnami
helm repo add jetstack https://charts.jetstack.io
helm repo update
helm install cert-manager jetstack/cert-manager \
  --namespace cert-manager --create-namespace --set crds.enabled=true
helm install prometheus prometheus-community/kube-prometheus-stack \
  --namespace prometheus --create-namespace --set installCRDs=true
```

## Installing Slinky Operator

```bash
helm install slurm-operator oci://ghcr.io/slinkyproject/charts/slurm-operator \
  --values=values-operator.yaml --version=0.1.0 --namespace=slinky --create-namespace
```

Make sure the operator deployed successfully with:

```sh
kubectl --namespace=slinky get pods
```

Output should be similar to:

```sh
NAME                                      READY   STATUS    RESTARTS   AGE
slurm-operator-7444c844d5-dpr5h           1/1     Running   0          5m00s
slurm-operator-webhook-6fd8d7857d-zcvqh   1/1     Running   0          5m00s
```

## Installing Slurm Cluster

Build a Slurm Docker image to be used for the Slurm compute node. See the [Dockerfile from the Slinky repo](https://github.com/SlinkyProject/containers/blob/main/schedmd/slurm/24.05/ubuntu24.04/Dockerfile) for how to create the base Docker image. ROCm and PyTorch then need to be added on top of this base image.

Once the image has been built and pushed to a registry, update the `values-slurm.yaml` file to specify the compute node image you will be using:

```yaml
# Slurm compute (slurmd) configurations.
compute:
  #
  # -- (string)
  # Set the image pull policy.
  imagePullPolicy: IfNotPresent
  #
  # Default image for the nodeset pod (slurmd)
  # Each nodeset may override this setting.
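  #
  # NOTE: the repository and tag below are placeholders, not a published image.
  # Point them at the slurmd image you built with ROCm and PyTorch in the step
  # above, for example (hypothetical names):
  #   repository: my-registry.example.com/slurm/slurmd-rocm
  #   tag: 24.05-ubuntu24.04-rocm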
  image:
    #
    # -- (string)
    # Set the image repository to use.
    repository: docker-registry/docker-repository/docker-image
    #
    # -- (string)
    # Set the image tag to use.
    # @default -- The Release appVersion.
    tag: image-tag
```

Install the Slurm cluster Helm chart:

```bash
helm install slurm oci://ghcr.io/slinkyproject/charts/slurm \
  --values=values-slurm.yaml --version=0.1.0 --namespace=slurm --create-namespace
```

Make sure the Slurm cluster deployed successfully with:

```sh
kubectl --namespace=slurm get pods
```

Output should be similar to:

```sh
NAME                              READY   STATUS    RESTARTS   AGE
slurm-accounting-0                1/1     Running   0          5m00s
slurm-compute-gpu-node            1/1     Running   0          5m00s
slurm-controller-0                2/2     Running   0          5m00s
slurm-exporter-7b44b6d856-d86q5   1/1     Running   0          5m00s
slurm-mariadb-0                   1/1     Running   0          5m00s
slurm-restapi-5f75db85d9-67gpl    1/1     Running   0          5m00s
```

## Prepping the Compute Node

1. Get the Slurm compute node pod name

    ```bash
    SLURM_COMPUTE_POD=$(kubectl get pods -n slurm | grep ^slurm-compute-gpu-node | awk '{print $1}'); echo $SLURM_COMPUTE_POD
    ```

2. Add the Slurm user to the video and render groups and create the Slurm user's home directory on the Slurm compute node

    ```bash
    kubectl exec -it -n slurm $SLURM_COMPUTE_POD -- bash -c "
    usermod -aG video,render slurm
    mkdir -p /home/slurm
    chown slurm:slurm /home/slurm"
    ```

3. Copy the PyTorch test script to the Slurm compute node

    ```bash
    kubectl cp test.py slurm/$SLURM_COMPUTE_POD:/tmp/test.py
    ```

4. Copy the Fashion MNIST image classification training scripts to the Slurm compute node (the distributed script is used in step 7)

    ```bash
    kubectl cp train_fashion_mnist.py slurm/$SLURM_COMPUTE_POD:/tmp/train_fashion_mnist.py
    kubectl cp train_mnist_distributed.py slurm/$SLURM_COMPUTE_POD:/tmp/train_mnist_distributed.py
    ```

5. Run the test.py script on the compute node to confirm GPUs are accessible

    ```bash
    kubectl exec -it slurm-controller-0 -n slurm -- srun python3 test.py
    ```

6. Run the single-GPU training script on the compute node

    ```bash
    kubectl exec -it slurm-controller-0 -n slurm -- srun python3 train_fashion_mnist.py
    ```

7. 
Run multi-GPU training script on compute node + + ```bash + kubectl exec -it slurm-controller-0 -n slurm -- srun apptainer exec --rocm --bind /tmp:/tmp torch_rocm.sif torchrun --standalone --nnodes=1 --nproc_per_node=8 --master-addr localhost train_mnist_distributed.py + ``` + +## Other Useful Slurm Commands + +### Check Slurm Node Info + +```bash +kubectl exec -it slurm-controller-0 -n slurm -- sinfo +``` + +### Check Job Queue + +```bash +kubectl exec -it slurm-controller-0 -n slurm -- squeue +``` + +### Check Node Resources + +```bash +kubectl exec -it slurm-controller-0 -n slurm -- sinfo -N -o "%N %G" +``` diff --git a/slinky-example/test.py b/slinky-example/test.py new file mode 100644 index 0000000..dbcba18 --- /dev/null +++ b/slinky-example/test.py @@ -0,0 +1,12 @@ +# run this command to check if the GPUs are available +# srun -N 2 --gpus=16 -t 00:02:00 python3 test.py +import torch + +if torch.cuda.is_available(): + print(f"GPUs available: {torch.cuda.device_count()}") + for i in range(torch.cuda.device_count()): + print(f" - GPU {i}: {torch.cuda.get_device_name(i)}") + print(f" - GPU {i} Pytorch and rocm version: {torch.__version__}") + print(f" - GPU {i} Nccl version: {torch.cuda.nccl.version()}") +else: + print("No GPUs available.") diff --git a/slinky-example/train_fashion_mnist.py b/slinky-example/train_fashion_mnist.py new file mode 100644 index 0000000..a3d9333 --- /dev/null +++ b/slinky-example/train_fashion_mnist.py @@ -0,0 +1,120 @@ +import os + +# Set the Torch Distributed env variables so the training function can be run locally in the Notebook. +# See https://pytorch.org/docs/stable/elastic/run.html#environment-variables +os.environ["RANK"] = "0" +os.environ["LOCAL_RANK"] = "0" +os.environ["WORLD_SIZE"] = "1" +os.environ["MASTER_ADDR"] = "localhost" +os.environ["MASTER_PORT"] = "1234" + +def train_fashion_mnist(): + import torch + import torch.distributed as dist + import torch.nn.functional as F + from torch import nn + from torch.utils.data import DataLoader, DistributedSampler + from torchvision import datasets, transforms + + # Define the PyTorch CNN model to be trained + class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 20, 5, 1) + self.conv2 = nn.Conv2d(20, 50, 5, 1) + self.fc1 = nn.Linear(4 * 4 * 50, 500) + self.fc2 = nn.Linear(500, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = F.max_pool2d(x, 2, 2) + x = F.relu(self.conv2(x)) + x = F.max_pool2d(x, 2, 2) + x = x.view(-1, 4 * 4 * 50) + x = F.relu(self.fc1(x)) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + + # Use NCCL if a GPU is available, otherwise use Gloo as communication backend. + device, backend = ("cuda", "nccl") if torch.cuda.is_available() else ("cpu", "gloo") + print(f"Using Device: {device}, Backend: {backend}") + + # Setup PyTorch distributed. + local_rank = int(os.getenv("LOCAL_RANK", 0)) + dist.init_process_group(backend=backend) + print( + "Distributed Training for WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}".format( + dist.get_world_size(), + dist.get_rank(), + local_rank, + ) + ) + + # Create the model and load it into the device. + device = torch.device(f"{device}:{local_rank}") + model = nn.parallel.DistributedDataParallel(Net().to(device)) + optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9) + + + # Download FashionMNIST dataset only on local_rank=0 process. 
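    # Rank 0 downloads the dataset into ./data; any other ranks wait at the
    # barrier below and then load the already-downloaded files with
    # download=False, so workers never race on the download.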
    if local_rank == 0:
        dataset = datasets.FashionMNIST(
            "./data",
            train=True,
            download=True,
            transform=transforms.Compose([transforms.ToTensor()]),
        )
    dist.barrier()
    dataset = datasets.FashionMNIST(
        "./data",
        train=True,
        download=False,
        transform=transforms.Compose([transforms.ToTensor()]),
    )

    # Shard the dataset across workers.
    train_loader = DataLoader(
        dataset,
        batch_size=100,
        sampler=DistributedSampler(dataset)
    )

    # TODO(astefanutti): add parameters to the training function
    dist.barrier()
    for epoch in range(1, 10):
        model.train()

        # Iterate over mini-batches from the training set
        for batch_idx, (inputs, labels) in enumerate(train_loader):
            # Copy the data to the GPU device if available
            inputs, labels = inputs.to(device), labels.to(device)
            # Forward pass
            outputs = model(inputs)
            loss = F.nll_loss(outputs, labels)
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch_idx % 10 == 0 and dist.get_rank() == 0:
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * len(inputs),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )

    # Wait for the distributed training to complete
    dist.barrier()
    if dist.get_rank() == 0:
        print("Training is finished")

    # Finally clean up PyTorch distributed
    dist.destroy_process_group()

# Run the training function locally.
train_fashion_mnist()

diff --git a/slinky-example/train_mnist_distributed.py b/slinky-example/train_mnist_distributed.py
new file mode 100644
index 0000000..807dbc9
--- /dev/null
+++ b/slinky-example/train_mnist_distributed.py
@@ -0,0 +1,117 @@
import os

# Provide defaults for the Torch Distributed env variables so the script can also
# be run without torchrun. setdefault keeps the values injected by torchrun
# (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT) when it launches multiple workers.
# See https://pytorch.org/docs/stable/elastic/run.html#environment-variables
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "1234")

def train_fashion_mnist():
    import torch
    import torch.distributed as dist
    import torch.nn.functional as F
    from torch import nn
    from torch.utils.data import DataLoader, DistributedSampler
    from torchvision import datasets, transforms

    # Define the PyTorch CNN model to be trained
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 20, 5, 1)
            self.conv2 = nn.Conv2d(20, 50, 5, 1)
            self.fc1 = nn.Linear(4 * 4 * 50, 500)
            self.fc2 = nn.Linear(500, 10)

        def forward(self, x):
            x = F.relu(self.conv1(x))
            x = F.max_pool2d(x, 2, 2)
            x = F.relu(self.conv2(x))
            x = F.max_pool2d(x, 2, 2)
            x = x.view(-1, 4 * 4 * 50)
            x = F.relu(self.fc1(x))
            x = self.fc2(x)
            return F.log_softmax(x, dim=1)

    # Use NCCL if a GPU is available, otherwise use Gloo as communication backend.
    device, backend = ("cuda", "nccl") if torch.cuda.is_available() else ("cpu", "gloo")
    print(f"Using Device: {device}, Backend: {backend}")

    # Setup PyTorch distributed.
    local_rank = int(os.getenv("LOCAL_RANK", 0))
    dist.init_process_group(backend=backend)
    print(
        "Distributed Training for WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}".format(
            dist.get_world_size(),
            dist.get_rank(),
            local_rank,
        )
    )

    # Create the model and load it into the device.
    device = torch.device(f"{device}:{local_rank}")
    model = nn.parallel.DistributedDataParallel(Net().to(device))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

    # Download FashionMNIST dataset only on local_rank=0 process.
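    # Rank 0 (local_rank == 0) downloads FashionMNIST into ./data first; the
    # barrier below holds the remaining ranks until the files exist, after which
    # every rank re-creates the dataset with download=False.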
+ if local_rank == 0: + dataset = datasets.FashionMNIST( + "./data", + train=True, + download=True, + transform=transforms.Compose([transforms.ToTensor()]), + ) + dist.barrier() + dataset = datasets.FashionMNIST( + "./data", + train=True, + download=False, + transform=transforms.Compose([transforms.ToTensor()]), + ) + + + # Shard the dataset accross workers. + train_loader = DataLoader( + dataset, + batch_size=100, + sampler=DistributedSampler(dataset) + ) + + # TODO(astefanutti): add parameters to the training function + dist.barrier() + for epoch in range(1, 10): + model.train() + + # Iterate over mini-batches from the training set + for batch_idx, (inputs, labels) in enumerate(train_loader): + # Copy the data to the GPU device if available + inputs, labels = inputs.to(device), labels.to(device) + # Forward pass + outputs = model(inputs) + loss = F.nll_loss(outputs, labels) + # Backward pass + optimizer.zero_grad() + loss.backward() + optimizer.step() + + if batch_idx % 10 == 0 and dist.get_rank() == 0: + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(inputs), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) + + # Wait for the distributed training to complete + dist.barrier() + if dist.get_rank() == 0: + print("Training is finished") + + # Finally clean up PyTorch distributed + dist.destroy_process_group() + +# Run the training function locally. +train_fashion_mnist() diff --git a/slinky-example/values-operator.yaml b/slinky-example/values-operator.yaml new file mode 100644 index 0000000..8850a1e --- /dev/null +++ b/slinky-example/values-operator.yaml @@ -0,0 +1,162 @@ +# SPDX-FileCopyrightText: Copyright (C) SchedMD LLC. +# SPDX-License-Identifier: Apache-2.0 + +# +# -- (string) +# Overrides the name of the release. +nameOverride: "" + +# +# -- (string) +# Overrides the full name of the release. +fullnameOverride: "" + +# +# -- (string) +# Overrides the namespace of the release. +namespaceOverride: "" + +# +# -- (list) +# Sets the image pull secrets. +# Ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ +imagePullSecrets: [] + # - name: regcred + +# +# -- (string) +# Set the image pull policy. +imagePullPolicy: IfNotPresent + +# +# Image configurations. +image: + # + # -- (string) + # Sets the image repository to use. + repository: ghcr.io/slinkyproject/slurm-operator + # + # -- (string) + # Sets the image tag to use. + # @default -- The Release appVersion. + tag: "" + +# +# -- (string) +# Set the priority class to use. +# Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass +priorityClassName: "" + +# +# Operator configurations. +operator: + # + # -- (bool) + # Enables the operator. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # Service account configurations. + serviceAccount: + # + # -- (bool) + # Allows chart to create the service account. + create: true + # + # -- (string) + # Set the service account to use (and create). + name: "" + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. 
+ # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + # + # -- (integer) + # Set the max concurrent workers for the Cluster controller. + clusterWorkers: 1 + # + # -- (integer) + # Set the max concurrent workers for the NodeSet controller. + nodesetWorkers: 1 + # + # -- (string) + # Set the log level by string (e.g. error, info, debug) or number (e.g. 1..5). + logLevel: info + +# +# Webhook configurations. +webhook: + # + # -- (bool) + # Enables the webhook. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # Service account configurations. + serviceAccount: + # + # -- (bool) + # Allows chart to create the service account. + create: true + # + # -- (string) + # Set the service account to use (and create). + name: "" + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + # + # -- (string) + # Set the log level by string (e.g. error, info, debug) or number (e.g. 1..5). + logLevel: info + +# +# Cert-Manager certificate configurations. +certManager: + # + # -- (bool) + # Enables cert-manager for certificate management. + enabled: true + # + # -- (string) + # The secret to be (created and) mounted. + secretName: slurm-operator-webhook-ca + # + # -- (string) + # Duration of certificate life. + duration: 43800h0m0s # 5 year + # + # -- (string) + # Certificate renewal time. Should be before the expiration. + renewBefore: 8760h0m0s # 1 year diff --git a/slinky-example/values-slurm.yaml b/slinky-example/values-slurm.yaml new file mode 100644 index 0000000..9d091dd --- /dev/null +++ b/slinky-example/values-slurm.yaml @@ -0,0 +1,607 @@ +# SPDX-FileCopyrightText: Copyright (C) SchedMD LLC. +# SPDX-License-Identifier: Apache-2.0 + +# +# Debug configuration. +# @ignored +debug: + # + # -- (bool) + # Enables debug configuration. + enabled: false + # + # -- (bool) + # Allow a locally running operator to communicate with slurm cluster via port-forward. + # NOTE: use when running the operator in a local debugger. + localOperator: true + +# +# -- (string) +# Overrides the name of the release. +nameOverride: "" + +# +# -- (string) +# Overrides the full name of the release. +fullnameOverride: "" + +# +# -- (string) +# Overrides the namespace of the release. +namespaceOverride: "" + +# +# -- (list) +# Set the secrets for image pull. +# Ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ +imagePullSecrets: [] + # - name: regcred + +# +# -- (string) +# Set the image pull policy. +imagePullPolicy: IfNotPresent + +# +# -- (string) +# Set the priority class to use. +# Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass +priorityClassName: "" + +# +# Slurm JWT authentication. +jwt: + # + # JWT hs256 configurations. + hs256: + # + # -- (string) + # The existing secret to use otherwise one will be generated. 
+ existingSecret: "" + +# +# Slurm configurations. +slurm: + # + # Slurm authentication configurations. + auth: + # + # -- (string) + # The existing secret to use otherwise one will be generated. + existingSecret: "" + # + # -- (string) + # Extra slurmdbd configuration lines to append to `slurmdbd.conf`. + # WARNING: Values can override existing ones. + # Ref: https://slurm.schedmd.com/slurmdbd.conf.html + extraSlurmdbdConf: |- + CommitDelay=1 + # + # -- (string) + # Extra slurm configuration lines to append to `slurm.conf`. + # WARNING: Values can override existing ones. + # Ref: https://slurm.schedmd.com/slurm.conf.html + extraSlurmConf: |- + SchedulerParameters=batch_sched_delay=20,bf_continue,bf_interval=300,bf_min_age_reserve=10800,bf_resolution=600,bf_yield_interval=1000000,partition_job_depth=500,sched_max_job_start=200,sched_min_interval=2000000 + DefMemPerCPU=1 + # + # -- (map[string]string) + # Optional raw Slurm configuration files, as a map. + # The map key represents the config file by name; the map value represents config file contents as a string. + # Ref: https://slurm.schedmd.com/man_index.html#configuration_files + configFiles: {} + # acct_gather.conf: | + # # Ref: https://slurm.schedmd.com/acct_gather.conf.html + # burst_buffer.conf: | + # # Ref: https://slurm.schedmd.com/burst_buffer.conf.html + # gres.conf: | + # # Ref: https://slurm.schedmd.com/gres.conf.html + # helpers.conf: | + # # Ref: https://slurm.schedmd.com/helpers.conf.html + # job_container.conf: | + # # Ref: https://slurm.schedmd.com/job_container.conf.html + # mpi.conf: | + # # Ref: https://slurm.schedmd.com/mpi.conf.html + # oci.conf: | + # # Ref: https://slurm.schedmd.com/oci.conf.html + # plugstack.conf: | + # # Ref: https://slurm.schedmd.com/plugstack.conf.html + # topology.conf: | + # # Ref: https://slurm.schedmd.com/topology.conf.html + # + # -- (map[string]string) + # The Prolog scripts for compute nodesets, as a map. + # The map key represents the filename; the map value represents the script contents. + # WARNING: The script must include a shebang (!) so it can be executed correctly by Slurm. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog + # Ref: https://slurm.schedmd.com/prolog_epilog.html + # Ref: https://en.wikipedia.org/wiki/Shebang_(Unix) + prologScripts: {} + # empty: | + # #!/usr/bin/env bash + # exit 0 + # + # -- (map[string]string) + # The Epilog scripts for compute nodesets, as a map. + # The map key represents the filename; the map value represents the script contents. + # WARNING: The script must include a shebang (!) so it can be executed correctly by Slurm. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog + # Ref: https://slurm.schedmd.com/prolog_epilog.html + # Ref: https://en.wikipedia.org/wiki/Shebang_(Unix) + epilogScripts: {} + # empty: | + # #!/usr/bin/env bash + # exit 0 + +# +# Slurm authcred (sackd) configurations. +authcred: + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/sackd + # + # -- (string) + # Set the image tag to use. + tag: 24.05-ubuntu-24.04 + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + +# +# Slurm controller (slurmctld) configurations. 
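# NOTE (for this example): the controller persistence.storageClass below is set
# to `standard`; if your cluster does not have a StorageClass with that name
# (for example, MicroK8s with hostpath-storage uses `microk8s-hostpath`),
# change it before installing the chart.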
+controller: + # + # -- (bool) + # Enables the controller node. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/slurmctld + # + # -- (string) + # Set the image tag to use. + tag: 24.05-ubuntu-24.04 + # + # -- (string) + # Set the priority class to use. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass + priorityClassName: + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + # + # Define a persistent volume for the slurm controller to store its save-state. + # Used to recover from system failures or from pod upgrades. + persistence: + # + # -- (string) + # Name of an existing `PersistentVolumeClaim` to use instead of creating one from definition. + # NOTE: When not empty, the other persistence fields will be ignored. + existingClaim: "" + # + # -- (object) + # Create a `PersistentVolumeClaim` with these annotations. + annotations: {} + # + # -- (object) + # Create a `PersistentVolumeClaim` with these labels. + labels: {} + # + # -- (string) + # Create a `PersistentVolumeClaim` with this storage class. + # Note if running on Microk8s and using the hostpath-storage, set storageClass to `microk8s-hostpath`. + storageClass: standard + # + # -- (list) + # Create a `PersistentVolumeClaim` with these access modes. + accessModes: + - ReadWriteOnce + # + # -- (string) + # Create a `PersistentVolumeClaim` with this storage size. + size: 4Gi + # + # -- (object) + # Selector to match an existing `PersistentVolume`. + selector: {} + # matchLabels: + # app: foo + +# +# Slurm compute (slurmd) configurations. +compute: + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Default image for the nodeset pod (slurmd) + # Each nodeset may override this setting. + image: + # + # -- (string) + # Set the image repository to use. + repository: #docker-registry/docker-repository/docker-image + # + # -- (string) + # Set the image tag to use. + # @default -- The Release appVersion. + tag: #image-tag + # + # -- (list) + # Slurm NodeSets by object list. + nodesets: + # + # -- (string) + # Name of NodeSet. Must be unique. + - name: gpu-node + # + # -- (bool) + # Enables the NodeSet in Slurm. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + # NOTE: if empty, all nodes matching affinity will have a replica (like DaemonSet). + replicas: 1 + # + # -- (int) + # The minimum number of seconds for which a newly created NodeSet Pod should be ready + # without any of its container crashing, for it to be considered available. + minReadySeconds: 0 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: "" + # + # -- (string) + # Set the image tag to use. 
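        # NOTE: left empty here (together with repository above) so this nodeset
        # should fall back to the default compute image configured earlier in
        # this file, per the chart's default/override comments.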
+ tag: "" + # + # -- (string) + # Set the priority class to use. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass + priorityClassName: "" + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: + limits: + cpu: 16 + memory: 132Gi + amd.com/gpu: 8 + # + # -- (map) + # Selector which must match a node's labels for the pod to be scheduled on that node. + nodeSelector: + kubernetes.io/os: linux + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + affinity: {} + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: "kubernetes.io/os" + # operator: In + # values: + # - linux + # podAntiAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # - topologyKey: "kubernetes.io/hostname" + # labelSelector: + # matchExpressions: + # - key: "app.kubernetes.io/name" + # operator: In + # values: + # - slurmctld + # - slurmdbd + # - slurmrestd + # + # -- (object) + # Set the update strategy configuration. + updateStrategy: + # + # -- (string) + # Set the update strategy type. + # Can be either: "RollingUpdate"; "OnDelete". + type: RollingUpdate + # + # -- (object) + # Define the rolling update policy. + # Only used when "updateStrategy.type=RollingUpdate". + rollingUpdate: + # + # -- (string) + # The maximum number of pods that can be unavailable during the update. + # Value can be an absolute number (ex: 5) or a percentage of desired + # pods (ex: 10%). Absolute number is calculated from percentage by + # rounding up. This can not be 0. Defaults to 1. + maxUnavailable: 20% + # + # -- (int) + # Partition indicates the number of NodeSet pods that should be + # not be updated to the latest version. + partition: 0 + # + # -- (bool) + # Pause will halt rollingUpdate while this value is true. + paused: false + # + # -- (object) + # The policy used for PVCs created from the NodeSet VolumeClaimTemplates. + persistentVolumeClaimRetentionPolicy: + # + # -- (string) + # WhenDeleted specifies what happens to PVCs created from NodeSet + # VolumeClaimTemplates when the NodeSet is deleted. The default policy + # of `Retain` causes PVCs to not be affected by NodeSet deletion. The + # `Delete` policy causes those PVCs to be deleted. + whenDeleted: Retain + # + # --(list) + # List of claims that pods are allowed to reference. + # The NodeSet controller is responsible for mapping network identities to + # claims in a way that maintains the identity of a pod. + volumeClaimTemplates: [] + # - metadata: + # name: data + # spec: + # storageClassName: standard + # mountPath: /mnt/data + # accessModes: + # - ReadWriteOnce + # resources: + # requests: + # storage: 1Gi + # + # -- (object) + # Partition describes the partition created specifically for this NodeSet to be added. + partition: + # + # -- (bool) + # Enables this NodeSet's partition line to be added in Slurm. + enabled: true + # + # -- (string) + # Extra Slurm partition configuration appended onto the partition line. + # Ref: https://slurm.schedmd.com/slurm.conf.html#lbAI + config: >- + State=UP + MaxTime=INFINITE + # + # -- (string) + # Set Slurm node GRES. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1 + # nodeGres: "gpus:amd=8" + # + # -- (list) + # Set Slurm node Features as a list(string). 
+ # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Features + nodeFeatures: [] + # + # -- (string) + # Set Slurm node weight for Slurm scheduling. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Weight + nodeWeight: 1 + # + # -- (list) + # Slurm Partitions by object list. + partitions: + # + # -- (string) + # Name of Partition. Must be unique. + - name: all + # + # -- (bool) + # Enables the partition in Slurm. + enabled: true + # + # -- (list) + # NodeSets to put into this Partition by name/key. + # NOTE: 'ALL' is a Slurm meta value to mean all nodes in the system. + nodesets: + - ALL + # + # -- (string) + # Extra Slurm partition configuration appended onto the partition line. + # Ref: https://slurm.schedmd.com/slurm.conf.html#lbAI + config: >- + State=UP + Default=YES + MaxTime=INFINITE + +# +# Slurm accounting (slurmdbd) configurations. +accounting: + # + # -- (bool) + # Enables accounting services. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/slurmdbd + # + # -- (string) + # Set the image tag to use. + tag: 24.05-ubuntu-24.04 + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + # + # Configuration for an external accounting instance (slurmdbd). + external: + # + # -- (bool) + # Use an external acounting instance (slurmdbd) instead of deploying one. + enabled: false + # + # -- (string) + # The external acounting instance (slurmdbd) host. + host: "" + # + # -- (integer) + # The external acounting instance (slurmdbd) port. + port: 6819 + +# +# `bitnami/mariadb` subchart configurations. +# Ref: https://github.com/bitnami/charts/blob/main/bitnami/mariadb/values.yaml +mariadb: + enabled: true + auth: + username: slurm + database: slurm_acct_db + existingSecret: "slurm-mariadb-passwords" + initdbScripts: + # NOTE: https://slurm.schedmd.com/accounting.html#slurm-accounting-configuration-before-build + slurm-accounting.sql: |- + SET GLOBAL innodb_buffer_pool_size=(4 * 1024 * 1024 * 1024); + SET GLOBAL innodb_log_file_size=(64 * 1024 * 1024); + SET GLOBAL innodb_lock_wait_timeout=900; + SET GLOBAL max_allowed_packet=(16 * 1024 * 1024); + primary: + persistence: + enabled: false + existingClaim: "" + storageClass: standard + labels: {} + annotations: {} + accessModes: + - ReadWriteOnce + size: 8Gi + selector: {} + priorityClassName: "" + metrics: + enabled: false + serviceMonitor: + enabled: false + affinity: {} + resources: {} + +# +# Slurm REST API (slurmrestd) configurations. +restapi: + # + # -- (bool) + # Enables restapi services. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. 
+ repository: ghcr.io/slinkyproject/slurmrestd + # + # -- (string) + # Set the image tag to use. + tag: 24.05-ubuntu-24.04 + # + # -- (string) + # Set the priority class to use. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass + priorityClassName: "" + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + +# +# `slurm-exporter` subchart configurations. +# Ref: https://github.com/SlinkyProject/slurm-exporter/-/blob/main/helm/slurm-exporter/values.yaml +slurm-exporter: + exporter: + enabled: true + secretName: "slurm-token-exporter"