From def97965b9262e66546e49d0079ed549cb3bded6 Mon Sep 17 00:00:00 2001 From: lg Date: Mon, 3 Mar 2025 13:31:07 +0000 Subject: [PATCH 1/2] Squashed commit of the following: commit d7194ebf04f13e580daf7e9e5204a0497edf4bcc Author: lg Date: Mon Mar 3 13:23:51 2025 +0000 revert commit aedea1a5b623032c6c42022260655219eaac78b8 Author: lg Date: Mon Mar 3 13:02:39 2025 +0000 remove plots and results commit fbdfc1cea98104ba27ea661fe5b83c86b3231fa4 Author: lg Date: Mon Feb 10 13:07:34 2025 +0000 add results commit 8c1a390c4ccda98634fbc57fd8c9dc950d404a6f Author: lg Date: Wed Feb 5 14:13:05 2025 +0000 migration commit 031a5d42db546fb13814551efeb6c0fa07335aef Author: lg Date: Wed Feb 5 11:04:36 2025 +0000 fix makedirs commit a3a77fa37d1032c51486f4ec387d8017b0bfe670 Author: lg Date: Tue Feb 4 16:58:46 2025 +0000 fix key error: data-file -> data_file commit fd05bf41748408e099f5a431e4fb0929e55f726a Author: lg Date: Tue Feb 4 16:54:53 2025 +0000 fix md link commit 4d562800abf28609d3288bf4ce414e8c5e3cfb68 Author: lg Date: Tue Feb 4 16:50:58 2025 +0000 include plots commit 544905061fcb2b2f36ae32c3f4ccfc93e9f330cb Author: lg Date: Tue Feb 4 16:39:45 2025 +0000 pass over experiments (WIP) commit a21bf558fd2f950d8ef23c53d9f7ae30163e7627 Author: lg Date: Thu Jan 23 20:11:25 2025 +0000 pass over documents commit 8cb10f1f0f6e5fbe238446ac736f2b6df0cd1b3e Author: lg Date: Wed Jan 22 11:02:35 2025 +0000 simplify README commit f235f0e8d0ac71644f55abacb042c51e73200007 Author: lg Date: Wed Jan 22 01:20:09 2025 +0000 inv cluster.provision/delete works --- K8S_VERSION | 1 + README.md | 18 +--- config/granny_aks_kubelet_config.json | 1 + config/granny_aks_os_config.json | 13 +++ tasks/__init__.py | 4 + tasks/cluster.py | 143 ++++++++++++++++++++++++++ tasks/elastic/README.md | 7 +- tasks/k8s.py | 91 ++++++++++++++++ tasks/kernels_mpi/README.md | 19 ++-- tasks/kernels_omp/README.md | 19 ++-- tasks/lammps/README.md | 5 +- tasks/lammps/run.py | 4 +- tasks/lulesh/README.md | 16 +-- 
tasks/makespan/README.md | 18 ++-- tasks/makespan/eviction.md | 20 ++-- tasks/migration/README.md | 15 ++- tasks/migration/plot.py | 2 + tasks/openmpi/README.md | 14 +-- tasks/polybench/README.md | 6 +- tasks/polybench/native.py | 7 +- tasks/util/env.py | 15 ++- tasks/util/version.py | 23 +++++ 22 files changed, 372 insertions(+), 89 deletions(-) create mode 100644 K8S_VERSION create mode 100644 config/granny_aks_kubelet_config.json create mode 100644 config/granny_aks_os_config.json create mode 100644 tasks/cluster.py create mode 100644 tasks/k8s.py create mode 100644 tasks/util/version.py diff --git a/K8S_VERSION b/K8S_VERSION new file mode 100644 index 0000000..82a5f3b --- /dev/null +++ b/K8S_VERSION @@ -0,0 +1 @@ +1.28.5 diff --git a/README.md b/README.md index 91c82c6..c2a4bbb 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,10 @@ # Granny Experiments -This repo contains the experiments for the [Granny paper]( -https://arxiv.org/abs/2302.11358). +This repo contains the experiments for the [Granny paper](https://arxiv.org/abs/2302.11358). -When following any instructions in this repository, it is recommended to -have two open terminals: -* One on the [`experiment-base`](https://github.com/faasm/experiment-base) repo - with the virtual environment activated (`source ./bin/workon.sh`). From now - onward, we will refer to this shell by its venv name: `faasm-exp-base`. -* One with this repo and the virtual environment activated - (`source ./bin/workon.sh`). From now onward, we will refer to this shell by - its venv name: `faasm-exp-faabric`. +When following any instructions in this repository, it is recommended to have a dedicated terminal with the virtual environment of this repo activated (`source ./bin/workon.sh`). -The former is used to provision/deprovision K8s clusters on Azure (with AKS), -and also to access low-level monitoring tools (we recommend `k9s`). - -The latter is used to deploy Faabric clusters, run the experiments, and plot -the results. 
+This virtual environment provides commands for provisioning/deprovisioning K8s clusters on Azure (with AKS), accessing low-level monitoring tools (we recommend `k9s`), and also commands for deploying Faabric clusters, running the experiments, and plotting the results. ## Experiments in this repository diff --git a/config/granny_aks_kubelet_config.json b/config/granny_aks_kubelet_config.json new file mode 100644 index 0000000..0b11bb8 --- /dev/null +++ b/config/granny_aks_kubelet_config.json @@ -0,0 +1 @@ +{ "allowedUnsafeSysctls": ["net.*"] } diff --git a/config/granny_aks_os_config.json b/config/granny_aks_os_config.json new file mode 100644 index 0000000..84e27c7 --- /dev/null +++ b/config/granny_aks_os_config.json @@ -0,0 +1,13 @@ +{ + "sysctls": { + "netCoreRmemMax": 16777216, + "netCoreWmemMax": 16777216, + "netIpv4TcpRmem": "4096 87380 16777216", + "netIpv4TcpWmem": "4096 65536 16777216", + "netCoreNetdevMaxBacklog": "30000", + "netCoreRmemDefault": 16777216, + "netCoreWmemDefault": 16777216, + "netIpv4TcpMem": "16777216 16777216 16777216", + "netIpv4RouteFlush": 1 + } +} diff --git a/tasks/__init__.py b/tasks/__init__.py index a25b3b8..74d8f83 100644 --- a/tasks/__init__.py +++ b/tasks/__init__.py @@ -1,7 +1,9 @@ from invoke import Collection +from . import cluster from . import docker from . import format_code +from . 
import k8s import logging @@ -20,8 +22,10 @@ logging.getLogger().setLevel(logging.DEBUG) ns = Collection( + cluster, docker, format_code, + k8s, ) ns.add_collection(elastic_ns, name="elastic") diff --git a/tasks/cluster.py b/tasks/cluster.py new file mode 100644 index 0000000..f1df932 --- /dev/null +++ b/tasks/cluster.py @@ -0,0 +1,143 @@ +from invoke import task +from os.path import join +from subprocess import run +from tasks.util.env import ( + ACR_NAME, + AKS_CLUSTER_NAME, + AKS_NODE_COUNT, + AKS_REGION, + AKS_VM_SIZE, + AZURE_PUB_SSH_KEY, + AZURE_RESOURCE_GROUP, + CONFIG_DIR, + KUBECTL_BIN, +) +from tasks.util.version import get_k8s_version + + +# AKS commandline reference here: +# https://docs.microsoft.com/en-us/cli/azure/aks?view=azure-cli-latest +def _run_aks_cmd(name, az_args=None): + cmd = [ + "az", + "aks {}".format(name), + "--resource-group {}".format(AZURE_RESOURCE_GROUP), + ] + + if az_args: + cmd.extend(az_args) + + cmd = " ".join(cmd) + print(cmd) + run(cmd, shell=True, check=True) + + +@task +def list(ctx): + """ + List all AKS resources + """ + _run_aks_cmd("list") + + +@task(optional=["sgx"]) +def provision( + ctx, + nodes=AKS_NODE_COUNT, + vm=AKS_VM_SIZE, + location=AKS_REGION, + name=AKS_CLUSTER_NAME, + sgx=False, + granny=True, +): + """ + Provision the AKS cluster + """ + k8s_ver = get_k8s_version() + sgx = sgx and (sgx.lower() != "false") + granny_kubelet_config = join(CONFIG_DIR, "granny_aks_kubelet_config.json") + granny_os_config = join(CONFIG_DIR, "granny_aks_os_config.json") + + if sgx and "Standard_DC" not in vm: + print( + "Error provisioning SGX cluster: only `Standard_DC` VMs are supported" + ) + return + + _run_aks_cmd( + "create", + [ + "--name {}".format(name), + "--node-count {}".format(nodes), + "--node-vm-size {}".format(vm), + "--os-sku Ubuntu", + "--kubernetes-version {}".format(k8s_ver), + "--ssh-key-value {}".format(AZURE_PUB_SSH_KEY), + "--location {}".format(location), + # Could not create a role assignment for ACR. 
Are you an Owner on this subscription? + # "--attach-acr {}".format(ACR_NAME.split(".")[0]), + "{}".format( + "--kubelet-config {}".format(granny_kubelet_config) + if granny + else "" + ), + "{}".format( + "--linux-os-config {}".format(granny_os_config) + if granny + else "" + ), + "{}".format( + "--enable-addons confcom --enable-sgxquotehelper" + if sgx + else "" + ), + ], + ) + + +@task +def details(ctx): + """ + Show the details of the cluster + """ + _run_aks_cmd( + "show", + [ + "--name {}".format(AKS_CLUSTER_NAME), + ], + ) + + +@task +def delete(ctx, name=AKS_CLUSTER_NAME): + """ + Delete the AKS cluster + """ + _run_aks_cmd( + "delete", + [ + "--name {}".format(name), + "--yes", + ], + ) + + +@task +def credentials(ctx, name=AKS_CLUSTER_NAME, out_file=None): + """ + Get credentials for the AKS cluster + """ + # Set up the credentials + _run_aks_cmd( + "get-credentials", + [ + "--name {}".format(name), + "--overwrite-existing", + "--file {}".format(out_file) if out_file else "", + ], + ) + + # Check we can access the cluster + cmd = "{} get nodes".format(KUBECTL_BIN) + print(cmd) + run(cmd, shell=True, check=True) diff --git a/tasks/elastic/README.md b/tasks/elastic/README.md index eab7b6c..bb95383 100644 --- a/tasks/elastic/README.md +++ b/tasks/elastic/README.md @@ -1,4 +1,4 @@ -# Elastic Scaling Micro-Benchmark +# Elastic Scaling Micro-Benchmark (Fig.12) In this experiment we measure the benefits of elastically scaling-up OpenMP applications to benefit from idle resources. 
We run a pipe-lined algorithm @@ -44,6 +44,11 @@ You may now plot the results using: inv elastic.plot ``` +the plot will be available in [`/plots/elastic/elastic_speedup.pdf`](/plots/elastic/elastic_speedup.pdf), we also include it below: + +![Elastic Scaling Plot](/plots/elastic/elastic_speedup.png) + + ## Clean-Up Finally, delete the Granny cluster: diff --git a/tasks/k8s.py b/tasks/k8s.py new file mode 100644 index 0000000..9e6be95 --- /dev/null +++ b/tasks/k8s.py @@ -0,0 +1,91 @@ +from invoke import task +from os.path import join, exists +from os import makedirs +from shutil import copy, rmtree +from subprocess import run + +from tasks.util.env import ( + BIN_DIR, + GLOBAL_BIN_DIR, + K9S_VERSION, +) + +from tasks.util.version import get_k8s_version + + +def _download_binary(url, binary_name): + makedirs(BIN_DIR, exist_ok=True) + cmd = "curl -LO {}".format(url) + run(cmd, shell=True, check=True, cwd=BIN_DIR) + run("chmod +x {}".format(binary_name), shell=True, check=True, cwd=BIN_DIR) + + return join(BIN_DIR, binary_name) + + +def _symlink_global_bin(binary_path, name): + global_path = join(GLOBAL_BIN_DIR, name) + if exists(global_path): + print("Removing existing binary at {}".format(global_path)) + run( + "sudo rm -f {}".format(global_path), + shell=True, + check=True, + ) + + print("Symlinking {} -> {}".format(global_path, binary_path)) + run( + "sudo ln -s {} {}".format(binary_path, name), + shell=True, + check=True, + cwd=GLOBAL_BIN_DIR, + ) + + +@task +def install_kubectl(ctx, system=False): + """ + Install the k8s CLI (kubectl) + """ + k8s_ver = get_k8s_version() + url = "https://dl.k8s.io/release/v{}/bin/linux/amd64/kubectl".format( + k8s_ver + ) + + binary_path = _download_binary(url, "kubectl") + + # Symlink for kubectl globally + if system: + _symlink_global_bin(binary_path, "kubectl") + + +@task +def install_k9s(ctx, system=False): + """ + Install the K9s CLI + """ + tar_name = "k9s_Linux_amd64.tar.gz" + url = 
"https://github.com/derailed/k9s/releases/download/v{}/{}".format( + K9S_VERSION, tar_name + ) + print(url) + + # Download the TAR + workdir = "/tmp/k9s-csg" + makedirs(workdir, exist_ok=True) + + cmd = "curl -LO {}".format(url) + run(cmd, shell=True, check=True, cwd=workdir) + + # Untar + run("tar -xf {}".format(tar_name), shell=True, check=True, cwd=workdir) + + # Copy k9s into place + binary_path = join(BIN_DIR, "k9s") + copy(join(workdir, "k9s"), binary_path) + + # Remove tar + rmtree(workdir) + + # Symlink for k9s command globally + if system: + _symlink_global_bin(binary_path, "k9s") diff --git a/tasks/kernels_mpi/README.md b/tasks/kernels_mpi/README.md index f0ea50e..2a7b526 100644 --- a/tasks/kernels_mpi/README.md +++ b/tasks/kernels_mpi/README.md @@ -1,14 +1,14 @@ -# ParRes Kernels Experiment (MPI) +# ParRes Kernels Experiment - MPI (Fig.9b) This experiment runs a set of the [ParRes Kernels](https://github.com/ParRes/Kernels) as a microbenchmark for Granny's MPI implementation. 
## Start AKS cluster -In the `experiment-base` terminal, run: +Create a new cluster: ```bash -(faasm-exp-base) inv cluster.provision --vm Standard_D8_v5 --nodes 3 cluster.credentials +inv cluster.provision --vm Standard_D8_v5 --nodes 3 cluster.credentials ``` ## Granny @@ -16,19 +16,19 @@ In the `experiment-base` terminal, run: Deploy the cluster: ```bash -(faasm-exp-faabric) faasmctl deploy.k8s --workers=2 +faasmctl deploy.k8s --workers=2 ``` Upload the WASM file: ```bash -(faasm-exp-faabric) inv kernels-mpi.wasm.upload +inv kernels-mpi.wasm.upload ``` and run the experiment with: ```bash -(faasm-exp-faabric) inv kernels-mpi.run.wasm +inv kernels-mpi.run.wasm ``` finally, delete the Granny cluster: @@ -63,15 +63,14 @@ To plot the results, just run: inv kernels-mpi.plot ``` -the plot will be available in [`./plots/kernels-mpi/mpi_kernels_slowdown.pdf`]( -./plots/kernels-mpi/mpi_kernels_slowdown.pdf), we also include it below: +the plot will be available in [`/plots/kernels-mpi/mpi_kernels_slowdown.pdf`](/plots/kernels-mpi/mpi_kernels_slowdown.pdf), we also include it below: -![MPI Kernels Slowdown Plot](./plots/kernels-mpi/mpi_kernels_slowdown.png) +![MPI Kernels Slowdown Plot](/plots/kernels-mpi/mpi_kernels_slowdown.png) ## Clean-up Finally, delete the AKS cluster: ```bash -(faasm-exp-base) inv cluster.delete +inv cluster.delete ``` diff --git a/tasks/kernels_omp/README.md b/tasks/kernels_omp/README.md index 3d0f3ef..c176b0c 100644 --- a/tasks/kernels_omp/README.md +++ b/tasks/kernels_omp/README.md @@ -1,14 +1,14 @@ -# ParRes Kernels Experiment (OpenMP) +# ParRes Kernels Experiment - OpenMP (Fig.10) This experiment runs a set of the [ParRes Kernels](https://github.com/ParRes/Kernels) as a microbenchmark for Granny's OpenMP implementation. 
## Start AKS cluster -In the `experiment-base` terminal, run: +Create a new cluster: ```bash -(faasm-exp-base) inv cluster.provision --vm Standard_D8_v5 --nodes 2 cluster.credentials +inv cluster.provision --vm Standard_D8_v5 --nodes 2 cluster.credentials ``` ## Faasm @@ -16,19 +16,19 @@ In the `experiment-base` terminal, run: Deploy the cluster: ```bash -(faasm-exp-faabric) faasmctl deploy.k8s --workers=1 +faasmctl deploy.k8s --workers=1 ``` Upload the WASM file: ```bash -(faasm-exp-faabric) inv kernels-omp.wasm.upload +inv kernels-omp.wasm.upload ``` and run the experiment with: ```bash -(faasm-exp-faabric) inv kernels-omp.run.wasm +inv kernels-omp.run.wasm ``` finally, delete the cluster: @@ -63,15 +63,14 @@ To plot the results, just run: inv kernels-omp.plot ``` -the plot will be available in [`./plots/kernels-omp/openmp_kernels_slowdown.pdf`]( -./plots/kernels-omp/openmp_kernels_slowdown.pdf), we also include it below: +the plot will be available in [`/plots/kernels-omp/openmp_kernels_slowdown.pdf`](/plots/kernels-omp/openmp_kernels_slowdown.pdf), we also include it below: -![OpenMP Kernels Slowdown Plot](./plots/kernels-omp/openmp_kernels_slowdown.png) +![OpenMP Kernels Slowdown Plot](/plots/kernels-omp/openmp_kernels_slowdown.png) ## Clean-up Finally, delete the AKS cluster: ```bash -(faasm-exp-base) inv cluster.delete +inv cluster.delete ``` diff --git a/tasks/lammps/README.md b/tasks/lammps/README.md index 132f45b..4ccd6a5 100644 --- a/tasks/lammps/README.md +++ b/tasks/lammps/README.md @@ -5,11 +5,10 @@ as part of the array experiment. 
## Start AKS cluster -In the `experiment-base` terminal, run: +Create a new cluster: ```bash -inv cluster.provision --vm Standard_D8_v5 --nodes 3 -inv cluster.credentials +inv cluster.provision --vm Standard_D8_v5 --nodes 3 cluster.credentials ``` ## Granny diff --git a/tasks/lammps/run.py b/tasks/lammps/run.py index c78f838..fb50c37 100644 --- a/tasks/lammps/run.py +++ b/tasks/lammps/run.py @@ -60,7 +60,7 @@ def wasm(ctx, w, repeats=1): ) workload_config = LAMMPS_SIM_WORKLOAD_CONFIGS[workload] data_file = basename( - get_lammps_data_file(workload_config["data-file"])["data"][0] + get_lammps_data_file(workload_config["data_file"])["data"][0] ) csv_name = "lammps_granny_{}.csv".format(workload) @@ -111,7 +111,7 @@ def native(ctx, w, repeats=1): ) ) workload_config = LAMMPS_SIM_WORKLOAD_CONFIGS[workload] - data_file = get_lammps_data_file(workload_config["data-file"])["data"][ + data_file = get_lammps_data_file(workload_config["data_file"])["data"][ 0 ] diff --git a/tasks/lulesh/README.md b/tasks/lulesh/README.md index 0043969..7236088 100644 --- a/tasks/lulesh/README.md +++ b/tasks/lulesh/README.md @@ -10,11 +10,11 @@ This experiment is a single execution of the LULESH simulation using OpenMP. 
## Start AKS cluster -In the `experiment-base` terminal, run: +Create a new cluster: ```bash -(faasm-exp-base) inv cluster.provision --vm Standard_D8_v5 --nodes 1 -(faasm-exp-base) inv cluster.credentials +inv cluster.provision --vm Standard_D8_v5 --nodes 1 +inv cluster.credentials ``` ## Granny @@ -22,25 +22,25 @@ In the `experiment-base` terminal, run: Deploy the cluster: ```bash -(faasm-exp-faabric) faasmctl deploy.k8s --workers=1 +faasmctl deploy.k8s --workers=1 ``` Upload the WASM file: ```bash -(faasm-exp-faabric) inv lammps.wasm.upload +inv lammps.wasm.upload ``` and run the experiment with: ```bash -(faasm-exp-faabric) inv lammps.run.granny -w compute -w network +inv lammps.run.granny -w compute -w network ``` To remove the cluster, run: ```bash -(faasm-exp-mpi) faasmctl delete +faasmctl delete ``` ## Native @@ -78,7 +78,7 @@ which will generate a plot in [`./plots/lammps/runtime.png`]( ## Clean-Up -Remember to delete the cluster. From the experiment base terminal: +Remember to delete the cluster. ```bash inv cluster.delete diff --git a/tasks/makespan/README.md b/tasks/makespan/README.md index c2d112b..573fac0 100644 --- a/tasks/makespan/README.md +++ b/tasks/makespan/README.md @@ -7,11 +7,11 @@ NOTE: we only compare to ourselves! 
TODO: add README for the conservative plot -First, from the `faasm-exp-base` shell, deploy the VM cluster: +Create a new cluster: ```bash -(faasm-exp-base) inv cluster.provision --vm Standard_D8_v5 --nodes 33 -(faasm-exp-base) inv cluster.credentials +inv cluster.provision --vm Standard_D8_v5 --nodes 33 +inv cluster.credentials ``` ## Native @@ -19,14 +19,14 @@ First, from the `faasm-exp-base` shell, deploy the VM cluster: First, deploy the native `k8s` cluster: ```bash -(faasm-exp-base) inv makespan.native.deploy --num-vms 32 +inv makespan.native.deploy --num-vms 32 ``` Now, you can run the different baselines: ```bash -(faasm-exp-base) inv makespan.run.native-batch --workload mpi-migrate --num-vms 32 --num-tasks 100 -(faasm-exp-base) inv makespan.run.native-slurm --workload mpi-migrate --num-vms 32 --num-tasks 100 +inv makespan.run.native-batch --workload mpi-migrate --num-vms 32 --num-tasks 100 +inv makespan.run.native-slurm --workload mpi-migrate --num-vms 32 --num-tasks 100 ``` Lastly, remove the native `k8s` cluster: @@ -46,20 +46,20 @@ faasmctl deploy.k8s --workers=32 Second, upload the corresponding WASM files: ```bash -(faasm-exp-faabric) inv makespan.wasm.upload +inv makespan.wasm.upload ``` Third, run the experiment: ```bash -(faasm-exp-faabric) inv makespan.run.granny --num-vms 32 --num-tasks 100 --workload mpi-migrate [--migrate] +inv makespan.run.granny --num-vms 32 --num-tasks 100 --workload mpi-migrate [--migrate] ``` During an experiment, you may monitor the state of the cluster (in a separete shell) by using: ```bash -(faasm-exp-faabric) faasmctl monitor.planner +faasmctl monitor.planner ``` ## Plot the results diff --git a/tasks/makespan/eviction.md b/tasks/makespan/eviction.md index b8e9c01..016af35 100644 --- a/tasks/makespan/eviction.md +++ b/tasks/makespan/eviction.md @@ -1,10 +1,10 @@ # Makespan Experiment (Eviction version) -First, from the `faasm-exp-base` shell, deploy the VM cluster: +Create a new cluster: ```bash -(faasm-exp-base) inv 
cluster.provision --vm Standard_D8_v5 --nodes 33 -(faasm-exp-base) inv cluster.credentials +inv cluster.provision --vm Standard_D8_v5 --nodes 33 +inv cluster.credentials ``` ## Native @@ -12,14 +12,14 @@ First, from the `faasm-exp-base` shell, deploy the VM cluster: First, deploy the native `k8s` cluster: ```bash -(faasm-exp-base) inv makespan.native.deploy --num-vms 32 +inv makespan.native.deploy --num-vms 32 ``` Now, you can run the different baselines: ```bash -(faasm-exp-base) inv makespan.run.native-batch --workload mpi-evict --num-vms 32 --num-tasks 200 -(faasm-exp-base) inv makespan.run.native-slurm --workload mpi-evict --num-vms 32 --num-tasks 200 +inv makespan.run.native-batch --workload mpi-evict --num-vms 32 --num-tasks 200 +inv makespan.run.native-slurm --workload mpi-evict --num-vms 32 --num-tasks 200 ``` Lastly, remove the native `k8s` cluster: @@ -39,24 +39,24 @@ faasmctl deploy.k8s --workers=32 Second, upload the corresponding WASM files: ```bash -(faasm-exp-faabric) inv makespan.wasm.upload +inv makespan.wasm.upload ``` Third, run the experiment: ```bash # TODO: will probably ditch --workload=mpi -# (faasm-exp-faabric) inv makespan.run.granny --workload mpi +# inv makespan.run.granny --workload mpi # Set the --migrate flag to enable migrating Granules at runtime # TODO: rename the workload to `mpi` -(faasm-exp-faabric) inv makespan.run.granny --num-vms 32 --num-tasks 100 --workload mpi-migrate [--migrate] +inv makespan.run.granny --num-vms 32 --num-tasks 100 --workload mpi-migrate [--migrate] ``` During an experiment, you may monitor the state of the cluster (in a separete shell) by using: ```bash -(faasm-exp-faabric) faasmctl monitor.planner +faasmctl monitor.planner ``` ## Plot the results diff --git a/tasks/migration/README.md b/tasks/migration/README.md index aaed8b1..afc44c1 100644 --- a/tasks/migration/README.md +++ b/tasks/migration/README.md @@ -1,4 +1,4 @@ -# Migration Experiment +# Migration Experiment (Fig.11) This experiment explores 
the benefits of migrating the execution of scientific applications to benefit from dynamic changes in the compute environment. @@ -6,38 +6,37 @@ applications to benefit from dynamic changes in the compute environment. First, provision the cluster: ```bash -inv cluster.provision --vm Standard_D8_v5 --nodes 3 --name ${CLUSTER_NAME} -inv cluster.credentials --name ${CLUSTER_NAME} +inv cluster.provision --vm Standard_D8_v5 --nodes 3 cluster.credentials ``` Second, deploy the cluster ```bash -(faasm-exp-faabric) faasmctl deploy.k8s --workers 2 +faasmctl deploy.k8s --workers 2 ``` Second, upload the WASM files: ```bash -(faasm-exp-faabric) inv migration.wasm.upload +inv migration.wasm.upload ``` Third, run the experiments: ```bash -(faasm-exp-faabric) inv migration.run -w compute -w network +inv migration.run -w all-to-all -w very-network ``` Lastly, plot the results: ```bash -(faasm-exp-faabric) inv migration.plot +inv migration.plot ``` and clean up: ```bash -(faasm-exp-faabric) faasmctl delete +faasmctl delete ``` ## Migration Oracle diff --git a/tasks/migration/plot.py b/tasks/migration/plot.py index 7d3133f..040907c 100644 --- a/tasks/migration/plot.py +++ b/tasks/migration/plot.py @@ -2,6 +2,7 @@ from invoke import task from matplotlib.pyplot import hlines, subplots import matplotlib.pyplot as plt +from os import makedirs from numpy import arange from os.path import join from pandas import read_csv @@ -59,6 +60,7 @@ def plot(ctx): Plot migration figure """ migration_results = _read_results() + makedirs(MIGRATION_PLOTS_DIR, exist_ok=True) do_plot("all-to-all", migration_results) # do_plot("compute", migration_results) diff --git a/tasks/openmpi/README.md b/tasks/openmpi/README.md index 85dccac..a608d8f 100644 --- a/tasks/openmpi/README.md +++ b/tasks/openmpi/README.md @@ -5,11 +5,11 @@ as part of the array experiment. 
## Start AKS cluster -In the `experiment-base` terminal, run: +Create a new cluster: ```bash -(faasm-exp-base) inv cluster.provision --vm Standard_D8_v5 --nodes 3 -(faasm-exp-base) inv cluster.credentials +inv cluster.provision --vm Standard_D8_v5 --nodes 3 +inv cluster.credentials ``` ## Granny @@ -17,25 +17,25 @@ In the `experiment-base` terminal, run: Deploy the cluster: ```bash -(faasm-exp-faabric) faasmctl deploy.k8s --workers=2 +faasmctl deploy.k8s --workers=2 ``` Upload the WASM file: ```bash -(faasm-exp-faabric) inv lammps.wasm.upload +inv lammps.wasm.upload ``` and run the experiment with: ```bash -(faasm-exp-faabric) inv lammps.run.wasm -w compute -w network +inv lammps.run.wasm -w compute -w network ``` To remove the cluster, run: ```bash -(faasm-exp-mpi) faasmctl delete +faasmctl delete ``` ## Native diff --git a/tasks/polybench/README.md b/tasks/polybench/README.md index 3ecb70d..0b56c36 100644 --- a/tasks/polybench/README.md +++ b/tasks/polybench/README.md @@ -12,8 +12,8 @@ First, provision the cluster. For ease of deployment, we still deploy a K8s cluster of just one node, which we will access directly. ```bash -inv cluster.provision --vm Standard_D8_v5 --nodes 1 --name ${CLUSTER_NAME} -inv cluster.credentials --name ${CLUSTER_NAME} +inv cluster.provision --vm Standard_D8_v5 --nodes 1 +inv cluster.credentials ``` ## Native @@ -81,5 +81,5 @@ which will generate a `.pdf` file in `./plots/polybench/slowdown.pdf`. 
Lastly, clean the cluster: ```bash -inv cluster.delete --name ${CLUSTER_NAME} +inv cluster.delete ``` diff --git a/tasks/polybench/native.py b/tasks/polybench/native.py index 1528b15..1d292c7 100644 --- a/tasks/polybench/native.py +++ b/tasks/polybench/native.py @@ -17,8 +17,10 @@ def deploy(ctx, backend="k8s", num_vms=1, num_cores_per_vm=8, ctrs_per_vm=1): num_ctrs = int(num_vms) * int(ctrs_per_vm) num_cores_per_ctr = int(num_cores_per_vm / ctrs_per_vm) if backend == "k8s": + print('deploy_native_mpi') deploy_native_mpi( - "polybench", FAABRIC_EXP_IMAGE_NAME, num_ctrs, num_cores_per_ctr + "polybench", FAABRIC_EXP_IMAGE_NAME, num_ctrs, + # num_cores_per_ctr, ) wait_for_pods( @@ -39,7 +41,8 @@ def delete(ctx, backend="k8s", num_vms=2, num_cores_per_vm=8, ctrs_per_vm=1): if backend == "k8s": delete_native_mpi( - "polybench", FAABRIC_EXP_IMAGE_NAME, num_ctrs, num_cores_per_ctr + "polybench", FAABRIC_EXP_IMAGE_NAME, num_ctrs, + # num_cores_per_ctr, ) else: raise RuntimeError("Compose backend not implemented!") diff --git a/tasks/util/env.py b/tasks/util/env.py index d44584c..973da5d 100644 --- a/tasks/util/env.py +++ b/tasks/util/env.py @@ -1,14 +1,27 @@ from os.path import dirname, realpath, expanduser, join, exists from shutil import rmtree -from os import makedirs +from os import getenv, makedirs from subprocess import run HOME_DIR = expanduser("~") PROJ_ROOT = dirname(dirname(dirname(realpath(__file__)))) +BIN_DIR = join(PROJ_ROOT, "bin") +GLOBAL_BIN_DIR = "/usr/local/bin" +CONFIG_DIR = join(PROJ_ROOT, "config") FAASM_ROOT = join(HOME_DIR, "faasm") SYSTEM_NAME = "Granny" +K9S_VERSION = "0.32.2" + +AZURE_RESOURCE_GROUP = "faasm" ACR_NAME = "faasm.azurecr.io" +AKS_CLUSTER_NAME = getenv('USER') + "-faasm-cluster" +AKS_VM_SIZE = "Standard_DS5_v2" +AKS_NODE_COUNT = 4 +AKS_REGION = "eastus" +AZURE_PUB_SSH_KEY = "~/.ssh/id_rsa.pub" +KUBECTL_BIN = join(PROJ_ROOT, "bin", "kubectl") + FAABRIC_EXP_IMAGE_NAME = "faabric-experiments" NATIVE_BUILD_DIR = join(PROJ_ROOT, "build", 
"native") diff --git a/tasks/util/version.py b/tasks/util/version.py new file mode 100644 index 0000000..46baaaa --- /dev/null +++ b/tasks/util/version.py @@ -0,0 +1,23 @@ +from os.path import join + +from tasks.util.env import PROJ_ROOT + +# Note - this must match the version used by Faasm +KNATIVE_VERSION = "1.1.0" +K9S_VERSION = "0.24.15" + + +def _read_ver_file(file_path): + with open(file_path, "r") as fh: + ver = fh.read() + ver = ver.strip() + + return ver + + +def get_version(): + return _read_ver_file(join(PROJ_ROOT, "VERSION")) + + +def get_k8s_version(): + return _read_ver_file(join(PROJ_ROOT, "K8S_VERSION")) From d39ed19b38ad525fb77bd2dd8200bb95cea02f65 Mon Sep 17 00:00:00 2001 From: lg Date: Mon, 3 Mar 2025 13:45:35 +0000 Subject: [PATCH 2/2] fix fig links --- tasks/lammps/README.md | 8 ++++---- tasks/migration/README.md | 6 +++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tasks/lammps/README.md b/tasks/lammps/README.md index 4ccd6a5..b050589 100644 --- a/tasks/lammps/README.md +++ b/tasks/lammps/README.md @@ -1,4 +1,4 @@ -# LAMMPS Experiment +# LAMMPS Experiment (Fig.9a) This experiment is a single execution of the LAMMPS simulation stress tested as part of the array experiment. 
@@ -65,10 +65,10 @@ To plot the results, you may run: inv lammps.plot ``` -which will generate a plot in [`./plots/lammps/runtime.png`]( -./plots/lammps/runtime.png), we also include it below: +which will generate a plot in [`/plots/lammps/lammps_slowdown.png`]( +/plots/lammps/lammps_slowdown.png), we also include it below: -![LAMMPS Runtime Plot](./plots/lammps/runtime.png) +![LAMMPS Runtime Plot](/plots/lammps/lammps_slowdown.png) ## Clean-Up diff --git a/tasks/migration/README.md b/tasks/migration/README.md index afc44c1..02df4b1 100644 --- a/tasks/migration/README.md +++ b/tasks/migration/README.md @@ -1,4 +1,4 @@ -# Migration Experiment (Fig.11) +# Migration Experiment (Fig.11b) This experiment explores the benefits of migrating the execution of scientific applications to benefit from dynamic changes in the compute environment. @@ -33,6 +33,10 @@ Lastly, plot the results: inv migration.plot ``` +which will generate a plot in [`/plots/migration/migration_speedup_all-to-all.png`](/plots/migration/migration_speedup_all-to-all.png), we also include it below: + +![migration plot](/plots/migration/migration_speedup_all-to-all.png) + and clean up: ```bash