From 35bfecad1a15346090457f5eb317ddde1bd809c3 Mon Sep 17 00:00:00 2001
From: Farshad Ghodsian <47931571+farshadghodsian@users.noreply.github.com>
Date: Fri, 11 Apr 2025 13:34:30 -0400
Subject: [PATCH] Added steps to install slinky on K8s and example training workload
---
 slinky-example/Readme.md                  | 167 ++++
 slinky-example/test.py                    |  12 +
 slinky-example/train_fashion_mnist.py     | 120 +++++
 slinky-example/train_mnist_distributed.py | 117 +++++
 slinky-example/values-operator.yaml       | 162 ++++++
 slinky-example/values-slurm.yaml          | 607 ++++++++++++++++++++++
 6 files changed, 1185 insertions(+)
 create mode 100644 slinky-example/Readme.md
 create mode 100644 slinky-example/test.py
 create mode 100644 slinky-example/train_fashion_mnist.py
 create mode 100644 slinky-example/train_mnist_distributed.py
 create mode 100644 slinky-example/values-operator.yaml
 create mode 100644 slinky-example/values-slurm.yaml

diff --git a/slinky-example/Readme.md b/slinky-example/Readme.md
new file mode 100644
index 0000000..0018ecd
--- /dev/null
+++ b/slinky-example/Readme.md
@@ -0,0 +1,167 @@
# Example Slinky Training Workload on Kubernetes

The following outlines the steps to get up and running with Slinky on Kubernetes and to run a simple image classification training workload that verifies the GPUs are accessible.

## Clone this repo and go into the slinky-example folder

```bash
git clone https://github.com/amd/ada.git
cd ada/slinky-example
```

## Installing Slinky Prerequisites

The following steps for installing the prerequisites and installing Slinky are taken from the SlinkyProject/slurm-operator repo's [quick-start guide](https://github.com/SlinkyProject/slurm-operator/blob/main/docs/quickstart.md).

```bash
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo add metrics-server https://kubernetes-sigs.github.io/metrics-server/
helm repo add bitnami https://charts.bitnami.com/bitnami
helm repo add jetstack https://charts.jetstack.io
helm repo update
helm install cert-manager jetstack/cert-manager \
  --namespace cert-manager --create-namespace --set crds.enabled=true
helm install prometheus prometheus-community/kube-prometheus-stack \
  --namespace prometheus --create-namespace --set installCRDs=true
```

## Installing Slinky Operator

```bash
helm install slurm-operator oci://ghcr.io/slinkyproject/charts/slurm-operator \
  --values=values-operator.yaml --version=0.1.0 --namespace=slinky --create-namespace
```

Make sure the operator deployed successfully with:

```sh
kubectl --namespace=slinky get pods
```

Output should be similar to:

```sh
NAME                                      READY   STATUS    RESTARTS   AGE
slurm-operator-7444c844d5-dpr5h           1/1     Running   0          5m00s
slurm-operator-webhook-6fd8d7857d-zcvqh   1/1     Running   0          5m00s
```

## Installing Slurm Cluster

Build a Slurm Docker image to be used for the Slurm compute node. See the [Dockerfile from the Slinky repo](https://github.com/SlinkyProject/containers/blob/main/schedmd/slurm/24.05/ubuntu24.04/Dockerfile) for how to create the base Docker image. ROCm and PyTorch then need to be added on top of this base image.

Once the image has been built and pushed to a registry, update the `values-slurm.yaml` file to specify the compute node image you will be using:

```yaml
# Slurm compute (slurmd) configurations.
compute:
  #
  # -- (string)
  # Set the image pull policy.
  imagePullPolicy: IfNotPresent
  #
  # Default image for the nodeset pod (slurmd)
  # Each nodeset may override this setting.
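  #
  # NOTE: the repository and tag below are placeholders, not a published image.
  # Point them at the slurmd image you built with ROCm and PyTorch in the step
  # above, for example (hypothetical names):
  #   repository: my-registry.example.com/slurm/slurmd-rocm
  #   tag: 24.05-ubuntu24.04-rocm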
  image:
    #
    # -- (string)
    # Set the image repository to use.
    repository: docker-registry/docker-repository/docker-image
    #
    # -- (string)
    # Set the image tag to use.
    # @default -- The Release appVersion.
    tag: image-tag
```

Install the Slurm cluster Helm chart:

```bash
helm install slurm oci://ghcr.io/slinkyproject/charts/slurm \
  --values=values-slurm.yaml --version=0.1.0 --namespace=slurm --create-namespace
```

Make sure the Slurm cluster deployed successfully with:

```sh
kubectl --namespace=slurm get pods
```

Output should be similar to:

```sh
NAME                              READY   STATUS    RESTARTS   AGE
slurm-accounting-0                1/1     Running   0          5m00s
slurm-compute-gpu-node            1/1     Running   0          5m00s
slurm-controller-0                2/2     Running   0          5m00s
slurm-exporter-7b44b6d856-d86q5   1/1     Running   0          5m00s
slurm-mariadb-0                   1/1     Running   0          5m00s
slurm-restapi-5f75db85d9-67gpl    1/1     Running   0          5m00s
```

## Prepping the Compute Node

1. Get the Slurm compute node pod name

    ```bash
    SLURM_COMPUTE_POD=$(kubectl get pods -n slurm | grep ^slurm-compute-gpu-node | awk '{print $1}'); echo $SLURM_COMPUTE_POD
    ```

2. Add the Slurm user to the video and render groups and create the Slurm user's home directory on the Slurm compute node

    ```bash
    kubectl exec -it -n slurm $SLURM_COMPUTE_POD -- bash -c "
    usermod -aG video,render slurm
    mkdir -p /home/slurm
    chown slurm:slurm /home/slurm"
    ```

3. Copy the PyTorch test script to the Slurm compute node

    ```bash
    kubectl cp test.py slurm/$SLURM_COMPUTE_POD:/tmp/test.py
    ```

4. Copy the Fashion MNIST image classification training scripts to the Slurm compute node (the distributed script is used in step 7)

    ```bash
    kubectl cp train_fashion_mnist.py slurm/$SLURM_COMPUTE_POD:/tmp/train_fashion_mnist.py
    kubectl cp train_mnist_distributed.py slurm/$SLURM_COMPUTE_POD:/tmp/train_mnist_distributed.py
    ```

5. Run the test.py script on the compute node to confirm GPUs are accessible

    ```bash
    kubectl exec -it slurm-controller-0 -n slurm -- srun python3 test.py
    ```

6. Run the single-GPU training script on the compute node

    ```bash
    kubectl exec -it slurm-controller-0 -n slurm -- srun python3 train_fashion_mnist.py
    ```

7. 
Run multi-GPU training script on compute node + + ```bash + kubectl exec -it slurm-controller-0 -n slurm -- srun apptainer exec --rocm --bind /tmp:/tmp torch_rocm.sif torchrun --standalone --nnodes=1 --nproc_per_node=8 --master-addr localhost train_mnist_distributed.py + ``` + +## Other Useful Slurm Commands + +### Check Slurm Node Info + +```bash +kubectl exec -it slurm-controller-0 -n slurm -- sinfo +``` + +### Check Job Queue + +```bash +kubectl exec -it slurm-controller-0 -n slurm -- squeue +``` + +### Check Node Resources + +```bash +kubectl exec -it slurm-controller-0 -n slurm -- sinfo -N -o "%N %G" +``` diff --git a/slinky-example/test.py b/slinky-example/test.py new file mode 100644 index 0000000..dbcba18 --- /dev/null +++ b/slinky-example/test.py @@ -0,0 +1,12 @@ +# run this command to check if the GPUs are available +# srun -N 2 --gpus=16 -t 00:02:00 python3 test.py +import torch + +if torch.cuda.is_available(): + print(f"GPUs available: {torch.cuda.device_count()}") + for i in range(torch.cuda.device_count()): + print(f" - GPU {i}: {torch.cuda.get_device_name(i)}") + print(f" - GPU {i} Pytorch and rocm version: {torch.__version__}") + print(f" - GPU {i} Nccl version: {torch.cuda.nccl.version()}") +else: + print("No GPUs available.") diff --git a/slinky-example/train_fashion_mnist.py b/slinky-example/train_fashion_mnist.py new file mode 100644 index 0000000..a3d9333 --- /dev/null +++ b/slinky-example/train_fashion_mnist.py @@ -0,0 +1,120 @@ +import os + +# Set the Torch Distributed env variables so the training function can be run locally in the Notebook. +# See https://pytorch.org/docs/stable/elastic/run.html#environment-variables +os.environ["RANK"] = "0" +os.environ["LOCAL_RANK"] = "0" +os.environ["WORLD_SIZE"] = "1" +os.environ["MASTER_ADDR"] = "localhost" +os.environ["MASTER_PORT"] = "1234" + +def train_fashion_mnist(): + import torch + import torch.distributed as dist + import torch.nn.functional as F + from torch import nn + from torch.utils.data import DataLoader, DistributedSampler + from torchvision import datasets, transforms + + # Define the PyTorch CNN model to be trained + class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 20, 5, 1) + self.conv2 = nn.Conv2d(20, 50, 5, 1) + self.fc1 = nn.Linear(4 * 4 * 50, 500) + self.fc2 = nn.Linear(500, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = F.max_pool2d(x, 2, 2) + x = F.relu(self.conv2(x)) + x = F.max_pool2d(x, 2, 2) + x = x.view(-1, 4 * 4 * 50) + x = F.relu(self.fc1(x)) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + + # Use NCCL if a GPU is available, otherwise use Gloo as communication backend. + device, backend = ("cuda", "nccl") if torch.cuda.is_available() else ("cpu", "gloo") + print(f"Using Device: {device}, Backend: {backend}") + + # Setup PyTorch distributed. + local_rank = int(os.getenv("LOCAL_RANK", 0)) + dist.init_process_group(backend=backend) + print( + "Distributed Training for WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}".format( + dist.get_world_size(), + dist.get_rank(), + local_rank, + ) + ) + + # Create the model and load it into the device. + device = torch.device(f"{device}:{local_rank}") + model = nn.parallel.DistributedDataParallel(Net().to(device)) + optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9) + + + # Download FashionMNIST dataset only on local_rank=0 process. 
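    # Rank 0 downloads the dataset into ./data; any other ranks wait at the
    # barrier below and then load the already-downloaded files with
    # download=False, so workers never race on the download.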
    if local_rank == 0:
        dataset = datasets.FashionMNIST(
            "./data",
            train=True,
            download=True,
            transform=transforms.Compose([transforms.ToTensor()]),
        )
    dist.barrier()
    dataset = datasets.FashionMNIST(
        "./data",
        train=True,
        download=False,
        transform=transforms.Compose([transforms.ToTensor()]),
    )

    # Shard the dataset across workers.
    train_loader = DataLoader(
        dataset,
        batch_size=100,
        sampler=DistributedSampler(dataset)
    )

    # TODO(astefanutti): add parameters to the training function
    dist.barrier()
    for epoch in range(1, 10):
        model.train()

        # Iterate over mini-batches from the training set
        for batch_idx, (inputs, labels) in enumerate(train_loader):
            # Copy the data to the GPU device if available
            inputs, labels = inputs.to(device), labels.to(device)
            # Forward pass
            outputs = model(inputs)
            loss = F.nll_loss(outputs, labels)
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch_idx % 10 == 0 and dist.get_rank() == 0:
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * len(inputs),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )

    # Wait for the distributed training to complete
    dist.barrier()
    if dist.get_rank() == 0:
        print("Training is finished")

    # Finally clean up PyTorch distributed
    dist.destroy_process_group()

# Run the training function locally.
train_fashion_mnist()

diff --git a/slinky-example/train_mnist_distributed.py b/slinky-example/train_mnist_distributed.py
new file mode 100644
index 0000000..807dbc9
--- /dev/null
+++ b/slinky-example/train_mnist_distributed.py
@@ -0,0 +1,117 @@
import os

# Provide defaults for the Torch Distributed env variables so the script can also
# be run without torchrun. setdefault keeps the values injected by torchrun
# (RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT) when it launches multiple workers.
# See https://pytorch.org/docs/stable/elastic/run.html#environment-variables
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "1234")

def train_fashion_mnist():
    import torch
    import torch.distributed as dist
    import torch.nn.functional as F
    from torch import nn
    from torch.utils.data import DataLoader, DistributedSampler
    from torchvision import datasets, transforms

    # Define the PyTorch CNN model to be trained
    class Net(nn.Module):
        def __init__(self):
            super(Net, self).__init__()
            self.conv1 = nn.Conv2d(1, 20, 5, 1)
            self.conv2 = nn.Conv2d(20, 50, 5, 1)
            self.fc1 = nn.Linear(4 * 4 * 50, 500)
            self.fc2 = nn.Linear(500, 10)

        def forward(self, x):
            x = F.relu(self.conv1(x))
            x = F.max_pool2d(x, 2, 2)
            x = F.relu(self.conv2(x))
            x = F.max_pool2d(x, 2, 2)
            x = x.view(-1, 4 * 4 * 50)
            x = F.relu(self.fc1(x))
            x = self.fc2(x)
            return F.log_softmax(x, dim=1)

    # Use NCCL if a GPU is available, otherwise use Gloo as communication backend.
    device, backend = ("cuda", "nccl") if torch.cuda.is_available() else ("cpu", "gloo")
    print(f"Using Device: {device}, Backend: {backend}")

    # Setup PyTorch distributed.
    local_rank = int(os.getenv("LOCAL_RANK", 0))
    dist.init_process_group(backend=backend)
    print(
        "Distributed Training for WORLD_SIZE: {}, RANK: {}, LOCAL_RANK: {}".format(
            dist.get_world_size(),
            dist.get_rank(),
            local_rank,
        )
    )

    # Create the model and load it into the device.
    device = torch.device(f"{device}:{local_rank}")
    model = nn.parallel.DistributedDataParallel(Net().to(device))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

    # Download FashionMNIST dataset only on local_rank=0 process.
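    # Rank 0 (local_rank == 0) downloads FashionMNIST into ./data first; the
    # barrier below holds the remaining ranks until the files exist, after which
    # every rank re-creates the dataset with download=False.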
+ if local_rank == 0: + dataset = datasets.FashionMNIST( + "./data", + train=True, + download=True, + transform=transforms.Compose([transforms.ToTensor()]), + ) + dist.barrier() + dataset = datasets.FashionMNIST( + "./data", + train=True, + download=False, + transform=transforms.Compose([transforms.ToTensor()]), + ) + + + # Shard the dataset accross workers. + train_loader = DataLoader( + dataset, + batch_size=100, + sampler=DistributedSampler(dataset) + ) + + # TODO(astefanutti): add parameters to the training function + dist.barrier() + for epoch in range(1, 10): + model.train() + + # Iterate over mini-batches from the training set + for batch_idx, (inputs, labels) in enumerate(train_loader): + # Copy the data to the GPU device if available + inputs, labels = inputs.to(device), labels.to(device) + # Forward pass + outputs = model(inputs) + loss = F.nll_loss(outputs, labels) + # Backward pass + optimizer.zero_grad() + loss.backward() + optimizer.step() + + if batch_idx % 10 == 0 and dist.get_rank() == 0: + print( + "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_idx * len(inputs), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) + + # Wait for the distributed training to complete + dist.barrier() + if dist.get_rank() == 0: + print("Training is finished") + + # Finally clean up PyTorch distributed + dist.destroy_process_group() + +# Run the training function locally. +train_fashion_mnist() diff --git a/slinky-example/values-operator.yaml b/slinky-example/values-operator.yaml new file mode 100644 index 0000000..8850a1e --- /dev/null +++ b/slinky-example/values-operator.yaml @@ -0,0 +1,162 @@ +# SPDX-FileCopyrightText: Copyright (C) SchedMD LLC. +# SPDX-License-Identifier: Apache-2.0 + +# +# -- (string) +# Overrides the name of the release. +nameOverride: "" + +# +# -- (string) +# Overrides the full name of the release. +fullnameOverride: "" + +# +# -- (string) +# Overrides the namespace of the release. +namespaceOverride: "" + +# +# -- (list) +# Sets the image pull secrets. +# Ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ +imagePullSecrets: [] + # - name: regcred + +# +# -- (string) +# Set the image pull policy. +imagePullPolicy: IfNotPresent + +# +# Image configurations. +image: + # + # -- (string) + # Sets the image repository to use. + repository: ghcr.io/slinkyproject/slurm-operator + # + # -- (string) + # Sets the image tag to use. + # @default -- The Release appVersion. + tag: "" + +# +# -- (string) +# Set the priority class to use. +# Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass +priorityClassName: "" + +# +# Operator configurations. +operator: + # + # -- (bool) + # Enables the operator. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # Service account configurations. + serviceAccount: + # + # -- (bool) + # Allows chart to create the service account. + create: true + # + # -- (string) + # Set the service account to use (and create). + name: "" + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. 
+ # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + # + # -- (integer) + # Set the max concurrent workers for the Cluster controller. + clusterWorkers: 1 + # + # -- (integer) + # Set the max concurrent workers for the NodeSet controller. + nodesetWorkers: 1 + # + # -- (string) + # Set the log level by string (e.g. error, info, debug) or number (e.g. 1..5). + logLevel: info + +# +# Webhook configurations. +webhook: + # + # -- (bool) + # Enables the webhook. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # Service account configurations. + serviceAccount: + # + # -- (bool) + # Allows chart to create the service account. + create: true + # + # -- (string) + # Set the service account to use (and create). + name: "" + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + # + # -- (string) + # Set the log level by string (e.g. error, info, debug) or number (e.g. 1..5). + logLevel: info + +# +# Cert-Manager certificate configurations. +certManager: + # + # -- (bool) + # Enables cert-manager for certificate management. + enabled: true + # + # -- (string) + # The secret to be (created and) mounted. + secretName: slurm-operator-webhook-ca + # + # -- (string) + # Duration of certificate life. + duration: 43800h0m0s # 5 year + # + # -- (string) + # Certificate renewal time. Should be before the expiration. + renewBefore: 8760h0m0s # 1 year diff --git a/slinky-example/values-slurm.yaml b/slinky-example/values-slurm.yaml new file mode 100644 index 0000000..9d091dd --- /dev/null +++ b/slinky-example/values-slurm.yaml @@ -0,0 +1,607 @@ +# SPDX-FileCopyrightText: Copyright (C) SchedMD LLC. +# SPDX-License-Identifier: Apache-2.0 + +# +# Debug configuration. +# @ignored +debug: + # + # -- (bool) + # Enables debug configuration. + enabled: false + # + # -- (bool) + # Allow a locally running operator to communicate with slurm cluster via port-forward. + # NOTE: use when running the operator in a local debugger. + localOperator: true + +# +# -- (string) +# Overrides the name of the release. +nameOverride: "" + +# +# -- (string) +# Overrides the full name of the release. +fullnameOverride: "" + +# +# -- (string) +# Overrides the namespace of the release. +namespaceOverride: "" + +# +# -- (list) +# Set the secrets for image pull. +# Ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ +imagePullSecrets: [] + # - name: regcred + +# +# -- (string) +# Set the image pull policy. +imagePullPolicy: IfNotPresent + +# +# -- (string) +# Set the priority class to use. +# Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass +priorityClassName: "" + +# +# Slurm JWT authentication. +jwt: + # + # JWT hs256 configurations. + hs256: + # + # -- (string) + # The existing secret to use otherwise one will be generated. 
+ existingSecret: "" + +# +# Slurm configurations. +slurm: + # + # Slurm authentication configurations. + auth: + # + # -- (string) + # The existing secret to use otherwise one will be generated. + existingSecret: "" + # + # -- (string) + # Extra slurmdbd configuration lines to append to `slurmdbd.conf`. + # WARNING: Values can override existing ones. + # Ref: https://slurm.schedmd.com/slurmdbd.conf.html + extraSlurmdbdConf: |- + CommitDelay=1 + # + # -- (string) + # Extra slurm configuration lines to append to `slurm.conf`. + # WARNING: Values can override existing ones. + # Ref: https://slurm.schedmd.com/slurm.conf.html + extraSlurmConf: |- + SchedulerParameters=batch_sched_delay=20,bf_continue,bf_interval=300,bf_min_age_reserve=10800,bf_resolution=600,bf_yield_interval=1000000,partition_job_depth=500,sched_max_job_start=200,sched_min_interval=2000000 + DefMemPerCPU=1 + # + # -- (map[string]string) + # Optional raw Slurm configuration files, as a map. + # The map key represents the config file by name; the map value represents config file contents as a string. + # Ref: https://slurm.schedmd.com/man_index.html#configuration_files + configFiles: {} + # acct_gather.conf: | + # # Ref: https://slurm.schedmd.com/acct_gather.conf.html + # burst_buffer.conf: | + # # Ref: https://slurm.schedmd.com/burst_buffer.conf.html + # gres.conf: | + # # Ref: https://slurm.schedmd.com/gres.conf.html + # helpers.conf: | + # # Ref: https://slurm.schedmd.com/helpers.conf.html + # job_container.conf: | + # # Ref: https://slurm.schedmd.com/job_container.conf.html + # mpi.conf: | + # # Ref: https://slurm.schedmd.com/mpi.conf.html + # oci.conf: | + # # Ref: https://slurm.schedmd.com/oci.conf.html + # plugstack.conf: | + # # Ref: https://slurm.schedmd.com/plugstack.conf.html + # topology.conf: | + # # Ref: https://slurm.schedmd.com/topology.conf.html + # + # -- (map[string]string) + # The Prolog scripts for compute nodesets, as a map. + # The map key represents the filename; the map value represents the script contents. + # WARNING: The script must include a shebang (!) so it can be executed correctly by Slurm. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog + # Ref: https://slurm.schedmd.com/prolog_epilog.html + # Ref: https://en.wikipedia.org/wiki/Shebang_(Unix) + prologScripts: {} + # empty: | + # #!/usr/bin/env bash + # exit 0 + # + # -- (map[string]string) + # The Epilog scripts for compute nodesets, as a map. + # The map key represents the filename; the map value represents the script contents. + # WARNING: The script must include a shebang (!) so it can be executed correctly by Slurm. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog + # Ref: https://slurm.schedmd.com/prolog_epilog.html + # Ref: https://en.wikipedia.org/wiki/Shebang_(Unix) + epilogScripts: {} + # empty: | + # #!/usr/bin/env bash + # exit 0 + +# +# Slurm authcred (sackd) configurations. +authcred: + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/sackd + # + # -- (string) + # Set the image tag to use. + tag: 24.05-ubuntu-24.04 + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + +# +# Slurm controller (slurmctld) configurations. 
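# NOTE (for this example): the controller persistence.storageClass below is set
# to `standard`; if your cluster does not have a StorageClass with that name
# (for example, MicroK8s with hostpath-storage uses `microk8s-hostpath`),
# change it before installing the chart.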
+controller: + # + # -- (bool) + # Enables the controller node. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/slurmctld + # + # -- (string) + # Set the image tag to use. + tag: 24.05-ubuntu-24.04 + # + # -- (string) + # Set the priority class to use. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass + priorityClassName: + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + # + # Define a persistent volume for the slurm controller to store its save-state. + # Used to recover from system failures or from pod upgrades. + persistence: + # + # -- (string) + # Name of an existing `PersistentVolumeClaim` to use instead of creating one from definition. + # NOTE: When not empty, the other persistence fields will be ignored. + existingClaim: "" + # + # -- (object) + # Create a `PersistentVolumeClaim` with these annotations. + annotations: {} + # + # -- (object) + # Create a `PersistentVolumeClaim` with these labels. + labels: {} + # + # -- (string) + # Create a `PersistentVolumeClaim` with this storage class. + # Note if running on Microk8s and using the hostpath-storage, set storageClass to `microk8s-hostpath`. + storageClass: standard + # + # -- (list) + # Create a `PersistentVolumeClaim` with these access modes. + accessModes: + - ReadWriteOnce + # + # -- (string) + # Create a `PersistentVolumeClaim` with this storage size. + size: 4Gi + # + # -- (object) + # Selector to match an existing `PersistentVolume`. + selector: {} + # matchLabels: + # app: foo + +# +# Slurm compute (slurmd) configurations. +compute: + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Default image for the nodeset pod (slurmd) + # Each nodeset may override this setting. + image: + # + # -- (string) + # Set the image repository to use. + repository: #docker-registry/docker-repository/docker-image + # + # -- (string) + # Set the image tag to use. + # @default -- The Release appVersion. + tag: #image-tag + # + # -- (list) + # Slurm NodeSets by object list. + nodesets: + # + # -- (string) + # Name of NodeSet. Must be unique. + - name: gpu-node + # + # -- (bool) + # Enables the NodeSet in Slurm. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + # NOTE: if empty, all nodes matching affinity will have a replica (like DaemonSet). + replicas: 1 + # + # -- (int) + # The minimum number of seconds for which a newly created NodeSet Pod should be ready + # without any of its container crashing, for it to be considered available. + minReadySeconds: 0 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: "" + # + # -- (string) + # Set the image tag to use. 
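        # NOTE: left empty here (together with repository above) so this nodeset
        # should fall back to the default compute image configured earlier in
        # this file, per the chart's default/override comments.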
+ tag: "" + # + # -- (string) + # Set the priority class to use. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass + priorityClassName: "" + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: + limits: + cpu: 16 + memory: 132Gi + amd.com/gpu: 8 + # + # -- (map) + # Selector which must match a node's labels for the pod to be scheduled on that node. + nodeSelector: + kubernetes.io/os: linux + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + affinity: {} + # nodeAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # nodeSelectorTerms: + # - matchExpressions: + # - key: "kubernetes.io/os" + # operator: In + # values: + # - linux + # podAntiAffinity: + # requiredDuringSchedulingIgnoredDuringExecution: + # - topologyKey: "kubernetes.io/hostname" + # labelSelector: + # matchExpressions: + # - key: "app.kubernetes.io/name" + # operator: In + # values: + # - slurmctld + # - slurmdbd + # - slurmrestd + # + # -- (object) + # Set the update strategy configuration. + updateStrategy: + # + # -- (string) + # Set the update strategy type. + # Can be either: "RollingUpdate"; "OnDelete". + type: RollingUpdate + # + # -- (object) + # Define the rolling update policy. + # Only used when "updateStrategy.type=RollingUpdate". + rollingUpdate: + # + # -- (string) + # The maximum number of pods that can be unavailable during the update. + # Value can be an absolute number (ex: 5) or a percentage of desired + # pods (ex: 10%). Absolute number is calculated from percentage by + # rounding up. This can not be 0. Defaults to 1. + maxUnavailable: 20% + # + # -- (int) + # Partition indicates the number of NodeSet pods that should be + # not be updated to the latest version. + partition: 0 + # + # -- (bool) + # Pause will halt rollingUpdate while this value is true. + paused: false + # + # -- (object) + # The policy used for PVCs created from the NodeSet VolumeClaimTemplates. + persistentVolumeClaimRetentionPolicy: + # + # -- (string) + # WhenDeleted specifies what happens to PVCs created from NodeSet + # VolumeClaimTemplates when the NodeSet is deleted. The default policy + # of `Retain` causes PVCs to not be affected by NodeSet deletion. The + # `Delete` policy causes those PVCs to be deleted. + whenDeleted: Retain + # + # --(list) + # List of claims that pods are allowed to reference. + # The NodeSet controller is responsible for mapping network identities to + # claims in a way that maintains the identity of a pod. + volumeClaimTemplates: [] + # - metadata: + # name: data + # spec: + # storageClassName: standard + # mountPath: /mnt/data + # accessModes: + # - ReadWriteOnce + # resources: + # requests: + # storage: 1Gi + # + # -- (object) + # Partition describes the partition created specifically for this NodeSet to be added. + partition: + # + # -- (bool) + # Enables this NodeSet's partition line to be added in Slurm. + enabled: true + # + # -- (string) + # Extra Slurm partition configuration appended onto the partition line. + # Ref: https://slurm.schedmd.com/slurm.conf.html#lbAI + config: >- + State=UP + MaxTime=INFINITE + # + # -- (string) + # Set Slurm node GRES. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1 + # nodeGres: "gpus:amd=8" + # + # -- (list) + # Set Slurm node Features as a list(string). 
+ # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Features + nodeFeatures: [] + # + # -- (string) + # Set Slurm node weight for Slurm scheduling. + # Ref: https://slurm.schedmd.com/slurm.conf.html#OPT_Weight + nodeWeight: 1 + # + # -- (list) + # Slurm Partitions by object list. + partitions: + # + # -- (string) + # Name of Partition. Must be unique. + - name: all + # + # -- (bool) + # Enables the partition in Slurm. + enabled: true + # + # -- (list) + # NodeSets to put into this Partition by name/key. + # NOTE: 'ALL' is a Slurm meta value to mean all nodes in the system. + nodesets: + - ALL + # + # -- (string) + # Extra Slurm partition configuration appended onto the partition line. + # Ref: https://slurm.schedmd.com/slurm.conf.html#lbAI + config: >- + State=UP + Default=YES + MaxTime=INFINITE + +# +# Slurm accounting (slurmdbd) configurations. +accounting: + # + # -- (bool) + # Enables accounting services. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. + repository: ghcr.io/slinkyproject/slurmdbd + # + # -- (string) + # Set the image tag to use. + tag: 24.05-ubuntu-24.04 + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + # + # Configuration for an external accounting instance (slurmdbd). + external: + # + # -- (bool) + # Use an external acounting instance (slurmdbd) instead of deploying one. + enabled: false + # + # -- (string) + # The external acounting instance (slurmdbd) host. + host: "" + # + # -- (integer) + # The external acounting instance (slurmdbd) port. + port: 6819 + +# +# `bitnami/mariadb` subchart configurations. +# Ref: https://github.com/bitnami/charts/blob/main/bitnami/mariadb/values.yaml +mariadb: + enabled: true + auth: + username: slurm + database: slurm_acct_db + existingSecret: "slurm-mariadb-passwords" + initdbScripts: + # NOTE: https://slurm.schedmd.com/accounting.html#slurm-accounting-configuration-before-build + slurm-accounting.sql: |- + SET GLOBAL innodb_buffer_pool_size=(4 * 1024 * 1024 * 1024); + SET GLOBAL innodb_log_file_size=(64 * 1024 * 1024); + SET GLOBAL innodb_lock_wait_timeout=900; + SET GLOBAL max_allowed_packet=(16 * 1024 * 1024); + primary: + persistence: + enabled: false + existingClaim: "" + storageClass: standard + labels: {} + annotations: {} + accessModes: + - ReadWriteOnce + size: 8Gi + selector: {} + priorityClassName: "" + metrics: + enabled: false + serviceMonitor: + enabled: false + affinity: {} + resources: {} + +# +# Slurm REST API (slurmrestd) configurations. +restapi: + # + # -- (bool) + # Enables restapi services. + enabled: true + # + # -- (integer) + # Set the number of replicas to deploy. + replicas: 1 + # + # -- (string) + # Set the image pull policy. + imagePullPolicy: IfNotPresent + # + # Set the image to use. + image: + # + # -- (string) + # Set the image repository to use. 
+ repository: ghcr.io/slinkyproject/slurmrestd + # + # -- (string) + # Set the image tag to use. + tag: 24.05-ubuntu-24.04 + # + # -- (string) + # Set the priority class to use. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/#priorityclass + priorityClassName: "" + # + # -- (object) + # Set affinity for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity + affinity: {} + # + # -- (object) + # Set container resource requests and limits for Kubernetes Pod scheduling. + # Ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#resource-requests-and-limits-of-pod-and-container + resources: {} + # requests: + # cpu: 1 + # memory: 1Gi + # limits: + # cpu: 2 + # memory: 4Gi + +# +# `slurm-exporter` subchart configurations. +# Ref: https://github.com/SlinkyProject/slurm-exporter/-/blob/main/helm/slurm-exporter/values.yaml +slurm-exporter: + exporter: + enabled: true + secretName: "slurm-token-exporter"