diff --git a/K8S_VERSION b/K8S_VERSION new file mode 100644 index 0000000..82a5f3b --- /dev/null +++ b/K8S_VERSION @@ -0,0 +1 @@ +1.28.5 diff --git a/config/granny_aks_kubelet_config.json b/config/granny_aks_kubelet_config.json new file mode 100644 index 0000000..0b11bb8 --- /dev/null +++ b/config/granny_aks_kubelet_config.json @@ -0,0 +1 @@ +{ "allowedUnsafeSysctls": ["net.*"] } diff --git a/config/granny_aks_os_config.json b/config/granny_aks_os_config.json new file mode 100644 index 0000000..84e27c7 --- /dev/null +++ b/config/granny_aks_os_config.json @@ -0,0 +1,13 @@ +{ + "sysctls": { + "netCoreRmemMax": 16777216, + "netCoreWmemMax": 16777216, + "netIpv4TcpRmem": "4096 87380 16777216", + "netIpv4TcpWmem": "4096 65536 16777216", + "netCoreNetdevMaxBacklog": "30000", + "netCoreRmemDefault": 16777216, + "netCoreWmemDefault": 16777216, + "netIpv4TcpMem": "16777216 16777216 16777216", + "netIpv4RouteFlush": 1 + } +} diff --git a/tasks/__init__.py b/tasks/__init__.py index a25b3b8..74d8f83 100644 --- a/tasks/__init__.py +++ b/tasks/__init__.py @@ -1,7 +1,9 @@ from invoke import Collection +from . import cluster from . import docker from . import format_code +from . 
import k8s import logging @@ -20,8 +22,10 @@ logging.getLogger().setLevel(logging.DEBUG) ns = Collection( + cluster, docker, format_code, + k8s, ) ns.add_collection(elastic_ns, name="elastic") diff --git a/tasks/cluster.py b/tasks/cluster.py new file mode 100644 index 0000000..f410502 --- /dev/null +++ b/tasks/cluster.py @@ -0,0 +1,140 @@ +from invoke import task +from os.path import join +from subprocess import run +from tasks.util.env import ( + AKS_CLUSTER_NAME, + AKS_NODE_COUNT, + AKS_REGION, + AKS_VM_SIZE, + AZURE_PUB_SSH_KEY, + AZURE_RESOURCE_GROUP, + CONFIG_DIR, + KUBECTL_BIN, +) +from tasks.util.version import get_k8s_version + + +# AKS commandline reference here: +# https://docs.microsoft.com/en-us/cli/azure/aks?view=azure-cli-latest +def _run_aks_cmd(name, az_args=None): + cmd = [ + "az", + "aks {}".format(name), + "--resource-group {}".format(AZURE_RESOURCE_GROUP), + ] + + if az_args: + cmd.extend(az_args) + + cmd = " ".join(cmd) + print(cmd) + run(cmd, shell=True, check=True) + + +@task +def list(ctx): + """ + List all AKS resources + """ + _run_aks_cmd("list") + + +@task(optional=["sgx"]) +def provision( + ctx, + nodes=AKS_NODE_COUNT, + vm=AKS_VM_SIZE, + location=AKS_REGION, + name=AKS_CLUSTER_NAME, + sgx=False, + granny=True, +): + """ + Provision the AKS cluster + """ + k8s_ver = get_k8s_version() + sgx = sgx and (sgx.lower() != "false") + granny_kubelet_config = join(CONFIG_DIR, "granny_aks_kubelet_config.json") + granny_os_config = join(CONFIG_DIR, "granny_aks_os_config.json") + + if sgx and "Standard_DC" not in vm: + print( + "Error provisioning SGX cluster: only `Standard_DC` VMs are supported" + ) + return + + _run_aks_cmd( + "create", + [ + "--name {}".format(name), + "--node-count {}".format(nodes), + "--node-vm-size {}".format(vm), + "--os-sku Ubuntu", + "--kubernetes-version {}".format(k8s_ver), + "--ssh-key-value {}".format(AZURE_PUB_SSH_KEY), + "--location {}".format(location), + "{}".format( + "--kubelet-config 
{}".format(granny_kubelet_config) + if granny + else "" + ), + "{}".format( + "--linux-os-config {}".format(granny_os_config) + if granny + else "" + ), + "{}".format( + "--enable-addons confcom --enable-sgxquotehelper" + if sgx + else "" + ), + ], + ) + + +@task +def details(ctx): + """ + Show the details of the cluster + """ + _run_aks_cmd( + "show", + [ + "--name {}".format(AKS_CLUSTER_NAME), + ], + ) + + +@task +def delete(ctx, name=AKS_CLUSTER_NAME): + """ + Delete the AKS cluster + """ + _run_aks_cmd( + "delete", + [ + "--name {}".format(name), + "--yes", + ], + ) + + +@task +def credentials(ctx, name=AKS_CLUSTER_NAME, out_file=None): + """ + Get credentials for the AKS cluster + """ + # Set up the credentials + _run_aks_cmd( + "get-credentials", + [ + "--name {}".format(name), + "--overwrite-existing", + "--file {}".format(out_file) if out_file else "", + ], + ) + + # Check we can access the cluster + cmd = "{} get nodes".format(KUBECTL_BIN) + print(cmd) + run(cmd, shell=True, check=True) diff --git a/tasks/elastic/README.md b/tasks/elastic/README.md index eab7b6c..bb95383 100644 --- a/tasks/elastic/README.md +++ b/tasks/elastic/README.md @@ -1,4 +1,4 @@ -# Elastic Scaling Micro-Benchmark +# Elastic Scaling Micro-Benchmark (Fig.12) In this experiment we measure the benefits of elastically scaling-up OpenMP applications to benefit from idle resources. 
We run a pipe-lined algorithm @@ -44,6 +44,11 @@ You may now plot the results using: inv elastic.plot ``` +the plot will be available in [`/plots/elastic/elastic_speedup.pdf`](/plots/elastic/elastic_speedup.pdf), we also include it below: + +![Elastic Scaling Plot](/plots/elastic/elastic_speedup.png) + + ## Clean-Up Finally, delete the Granny cluster: diff --git a/tasks/k8s.py b/tasks/k8s.py new file mode 100644 index 0000000..9e6be95 --- /dev/null +++ b/tasks/k8s.py @@ -0,0 +1,91 @@ +from invoke import task +from os.path import join, exists +from os import makedirs +from shutil import copy, rmtree +from subprocess import run + +from tasks.util.env import ( + BIN_DIR, + GLOBAL_BIN_DIR, + K9S_VERSION, +) + +from tasks.util.version import get_k8s_version + + +def _download_binary(url, binary_name): + makedirs(BIN_DIR, exist_ok=True) + cmd = "curl -LO {}".format(url) + run(cmd, shell=True, check=True, cwd=BIN_DIR) + run("chmod +x {}".format(binary_name), shell=True, check=True, cwd=BIN_DIR) + + return join(BIN_DIR, binary_name) + + +def _symlink_global_bin(binary_path, name): + global_path = join(GLOBAL_BIN_DIR, name) + if exists(global_path): + print("Removing existing binary at {}".format(global_path)) + run( + "sudo rm -f {}".format(global_path), + shell=True, + check=True, + ) + + print("Symlinking {} -> {}".format(global_path, binary_path)) + run( + "sudo ln -s {} {}".format(binary_path, name), + shell=True, + check=True, + cwd=GLOBAL_BIN_DIR, + ) + + +@task +def install_kubectl(ctx, system=False): + """ + Install the k8s CLI (kubectl) + """ + k8s_ver = get_k8s_version() + url = "https://dl.k8s.io/release/v{}/bin/linux/amd64/kubectl".format( + k8s_ver + ) + + binary_path = _download_binary(url, "kubectl") + + # Symlink for kubectl globally + if system: + _symlink_global_bin(binary_path, "kubectl") + + +@task +def install_k9s(ctx, system=False): + """ + Install the K9s CLI + """ + tar_name = "k9s_Linux_amd64.tar.gz" + url = 
"https://github.com/derailed/k9s/releases/download/v{}/{}".format( + K9S_VERSION, tar_name + ) + print(url) + + # Download the TAR + workdir = "/tmp/k9s-csg" + makedirs(workdir, exist_ok=True) + + cmd = "curl -LO {}".format(url) + run(cmd, shell=True, check=True, cwd=workdir) + + # Untar + run("tar -xf {}".format(tar_name), shell=True, check=True, cwd=workdir) + + # Copy k9s into place + binary_path = join(BIN_DIR, "k9s") + copy(join(workdir, "k9s"), binary_path) + + # Remove tar + rmtree(workdir) + + # Symlink for k9s command globally + if system: + _symlink_global_bin(binary_path, "k9s") diff --git a/tasks/kernels_mpi/README.md b/tasks/kernels_mpi/README.md index 650fff2..2a7b526 100644 --- a/tasks/kernels_mpi/README.md +++ b/tasks/kernels_mpi/README.md @@ -1,11 +1,11 @@ -# ParRes Kernels Experiment (MPI) +# ParRes Kernels Experiment - MPI (Fig.9b) This experiment runs a set of the [ParRes Kernels](https://github.com/ParRes/Kernels) as a microbenchmark for Granny's MPI implementation. ## Start AKS cluster -In the `experiment-base` terminal, run: +Create a new cluster: ```bash inv cluster.provision --vm Standard_D8_v5 --nodes 3 cluster.credentials @@ -63,8 +63,9 @@ To plot the results, just run: inv kernels-mpi.plot ``` -the plot will be available in [`./plots/kernels-mpi/mpi_kernels_slowdown.pdf`]( -./plots/kernels-mpi/mpi_kernels_slowdown.pdf). 
+the plot will be available in [`/plots/kernels-mpi/mpi_kernels_slowdown.pdf`](/plots/kernels-mpi/mpi_kernels_slowdown.pdf), we also include it below: + +![MPI Kernels Slowdown Plot](/plots/kernels-mpi/mpi_kernels_slowdown.png) ## Clean-up diff --git a/tasks/kernels_omp/README.md b/tasks/kernels_omp/README.md index 73a89cc..c176b0c 100644 --- a/tasks/kernels_omp/README.md +++ b/tasks/kernels_omp/README.md @@ -1,11 +1,11 @@ -# ParRes Kernels Experiment (OpenMP) +# ParRes Kernels Experiment - OpenMP (Fig.10) This experiment runs a set of the [ParRes Kernels](https://github.com/ParRes/Kernels) as a microbenchmark for Granny's OpenMP implementation. ## Start AKS cluster -In the `experiment-base` terminal, run: +Create a new cluster: ```bash inv cluster.provision --vm Standard_D8_v5 --nodes 2 cluster.credentials @@ -63,8 +63,9 @@ To plot the results, just run: inv kernels-omp.plot ``` -the plot will be available in [`./plots/kernels-omp/openmp_kernels_slowdown.pdf`]( -./plots/kernels-omp/openmp_kernels_slowdown.pdf). +the plot will be available in [`/plots/kernels-omp/openmp_kernels_slowdown.pdf`](/plots/kernels-omp/openmp_kernels_slowdown.pdf), we also include it below: + +![OpenMP Kernels Slowdown Plot](/plots/kernels-omp/openmp_kernels_slowdown.png) ## Clean-up diff --git a/tasks/lammps/README.md b/tasks/lammps/README.md index 132f45b..b050589 100644 --- a/tasks/lammps/README.md +++ b/tasks/lammps/README.md @@ -1,15 +1,14 @@ -# LAMMPS Experiment +# LAMMPS Experiment (Fig.9a) This experiment is a single execution of the LAMMPS simulation stress tested as part of the array experiment. 
## Start AKS cluster -In the `experiment-base` terminal, run: +Create a new cluster: ```bash -inv cluster.provision --vm Standard_D8_v5 --nodes 3 -inv cluster.credentials +inv cluster.provision --vm Standard_D8_v5 --nodes 3 cluster.credentials ``` ## Granny @@ -66,10 +65,10 @@ To plot the results, you may run: inv lammps.plot ``` -which will generate a plot in [`./plots/lammps/runtime.png`]( -./plots/lammps/runtime.png), we also include it below: +which will generate a plot in [`/plots/lammps/lammps_slowdown.png`]( +/plots/lammps/lammps_slowdown.png), we also include it below: -![LAMMPS Runtime Plot](./plots/lammps/runtime.png) +![LAMMPS Runtime Plot](/plots/lammps/lammps_slowdown.png) ## Clean-Up diff --git a/tasks/lammps/run.py b/tasks/lammps/run.py index c78f838..fb50c37 100644 --- a/tasks/lammps/run.py +++ b/tasks/lammps/run.py @@ -60,7 +60,7 @@ def wasm(ctx, w, repeats=1): ) workload_config = LAMMPS_SIM_WORKLOAD_CONFIGS[workload] data_file = basename( - get_lammps_data_file(workload_config["data-file"])["data"][0] + get_lammps_data_file(workload_config["data_file"])["data"][0] ) csv_name = "lammps_granny_{}.csv".format(workload) @@ -111,7 +111,7 @@ def native(ctx, w, repeats=1): ) ) workload_config = LAMMPS_SIM_WORKLOAD_CONFIGS[workload] - data_file = get_lammps_data_file(workload_config["data-file"])["data"][ + data_file = get_lammps_data_file(workload_config["data_file"])["data"][ 0 ] diff --git a/tasks/lulesh/README.md b/tasks/lulesh/README.md index 0043969..7236088 100644 --- a/tasks/lulesh/README.md +++ b/tasks/lulesh/README.md @@ -10,11 +10,11 @@ This experiment is a single execution of the LULESH simulation using OpenMP. 
## Start AKS cluster -In the `experiment-base` terminal, run: +Create a new cluster: ```bash -(faasm-exp-base) inv cluster.provision --vm Standard_D8_v5 --nodes 1 -(faasm-exp-base) inv cluster.credentials +inv cluster.provision --vm Standard_D8_v5 --nodes 1 +inv cluster.credentials ``` ## Granny @@ -22,25 +22,25 @@ In the `experiment-base` terminal, run: Deploy the cluster: ```bash -(faasm-exp-faabric) faasmctl deploy.k8s --workers=1 +faasmctl deploy.k8s --workers=1 ``` Upload the WASM file: ```bash -(faasm-exp-faabric) inv lammps.wasm.upload +inv lammps.wasm.upload ``` and run the experiment with: ```bash -(faasm-exp-faabric) inv lammps.run.granny -w compute -w network +inv lammps.run.granny -w compute -w network ``` To remove the cluster, run: ```bash -(faasm-exp-mpi) faasmctl delete +faasmctl delete ``` ## Native @@ -78,7 +78,7 @@ which will generate a plot in [`./plots/lammps/runtime.png`]( ## Clean-Up -Remember to delete the cluster. From the experiment base terminal: +Remember to delete the cluster. ```bash inv cluster.delete diff --git a/tasks/migration/README.md b/tasks/migration/README.md index 3cd49f6..53d4cbd 100644 --- a/tasks/migration/README.md +++ b/tasks/migration/README.md @@ -1,4 +1,4 @@ -# Migration Experiment +# Migration Experiment (Fig.11b) This experiment explores the benefits of migrating the execution of scientific applications to benefit from dynamic changes in the compute environment. @@ -6,14 +6,13 @@ applications to benefit from dynamic changes in the compute environment. 
First, provision the cluster: ```bash -inv cluster.provision --vm Standard_D8_v5 --nodes 3 --name ${CLUSTER_NAME} -inv cluster.credentials --name ${CLUSTER_NAME} +inv cluster.provision --vm Standard_D8_v5 --nodes 3 cluster.credentials ``` Second, deploy the cluster ```bash -(faasm-exp-faabric) faasmctl deploy.k8s --workers 2 +faasmctl deploy.k8s --workers 2 ``` Second, upload the WASM files: @@ -25,7 +24,7 @@ inv migration.wasm.upload Third, run the experiments: ```bash -inv migration.run -w compute -w network +inv migration.run -w all-to-all -w very-network ``` Lastly, plot the results: @@ -34,6 +33,10 @@ Lastly, plot the results: inv migration.plot ``` +which will generate a plot in [`/plots/migration/migration_speedup_all-to-all.png`](/plots/migration/migration_speedup_all-to-all.png), we also include it below: + +![migration plot](/plots/migration/migration_speedup_all-to-all.png) + and clean up: ```bash diff --git a/tasks/migration/plot.py b/tasks/migration/plot.py index 496d2b8..c5321c7 100644 --- a/tasks/migration/plot.py +++ b/tasks/migration/plot.py @@ -2,6 +2,7 @@ from invoke import task from matplotlib.pyplot import hlines, subplots import matplotlib.pyplot as plt +from os import makedirs from numpy import arange from os.path import join from pandas import read_csv @@ -68,6 +69,7 @@ def plot(ctx): Plot migration figure """ migration_results = _read_results() + makedirs(MIGRATION_PLOTS_DIR, exist_ok=True) do_plot("all-to-all", migration_results) # do_plot("compute", migration_results) diff --git a/tasks/openmpi/README.md b/tasks/openmpi/README.md index 85dccac..a608d8f 100644 --- a/tasks/openmpi/README.md +++ b/tasks/openmpi/README.md @@ -5,11 +5,11 @@ as part of the array experiment. 
## Start AKS cluster -In the `experiment-base` terminal, run: +Create a new cluster: ```bash -(faasm-exp-base) inv cluster.provision --vm Standard_D8_v5 --nodes 3 -(faasm-exp-base) inv cluster.credentials +inv cluster.provision --vm Standard_D8_v5 --nodes 3 +inv cluster.credentials ``` ## Granny @@ -17,25 +17,25 @@ In the `experiment-base` terminal, run: Deploy the cluster: ```bash -(faasm-exp-faabric) faasmctl deploy.k8s --workers=2 +faasmctl deploy.k8s --workers=2 ``` Upload the WASM file: ```bash -(faasm-exp-faabric) inv lammps.wasm.upload +inv lammps.wasm.upload ``` and run the experiment with: ```bash -(faasm-exp-faabric) inv lammps.run.wasm -w compute -w network +inv lammps.run.wasm -w compute -w network ``` To remove the cluster, run: ```bash -(faasm-exp-mpi) faasmctl delete +faasmctl delete ``` ## Native diff --git a/tasks/polybench/README.md b/tasks/polybench/README.md index 3ecb70d..0b56c36 100644 --- a/tasks/polybench/README.md +++ b/tasks/polybench/README.md @@ -12,8 +12,8 @@ First, provision the cluster. For ease of deployment, we still deploy a K8s cluster of just one node, which we will access directly. ```bash -inv cluster.provision --vm Standard_D8_v5 --nodes 1 --name ${CLUSTER_NAME} -inv cluster.credentials --name ${CLUSTER_NAME} +inv cluster.provision --vm Standard_D8_v5 --nodes 1 +inv cluster.credentials ``` ## Native @@ -81,5 +81,5 @@ which will generate a `.pdf` file in `./plots/polybench/slowdown.pdf`. 
Lastly, clean the cluster: ```bash -inv cluster.delete --name ${CLUSTER_NAME} +inv cluster.delete ``` diff --git a/tasks/polybench/native.py b/tasks/polybench/native.py index 1528b15..911bf84 100644 --- a/tasks/polybench/native.py +++ b/tasks/polybench/native.py @@ -18,7 +18,9 @@ def deploy(ctx, backend="k8s", num_vms=1, num_cores_per_vm=8, ctrs_per_vm=1): num_cores_per_ctr = int(num_cores_per_vm / ctrs_per_vm) if backend == "k8s": deploy_native_mpi( - "polybench", FAABRIC_EXP_IMAGE_NAME, num_ctrs, num_cores_per_ctr + "polybench", + FAABRIC_EXP_IMAGE_NAME, + num_ctrs, ) wait_for_pods( @@ -35,11 +37,12 @@ def delete(ctx, backend="k8s", num_vms=2, num_cores_per_vm=8, ctrs_per_vm=1): Delete the LAMMPS native MPI setup from K8s """ num_ctrs = int(num_vms) * int(ctrs_per_vm) - num_cores_per_ctr = int(num_cores_per_vm / ctrs_per_vm) if backend == "k8s": delete_native_mpi( - "polybench", FAABRIC_EXP_IMAGE_NAME, num_ctrs, num_cores_per_ctr + "polybench", + FAABRIC_EXP_IMAGE_NAME, + num_ctrs, ) else: raise RuntimeError("Compose backend not implemented!") diff --git a/tasks/util/env.py b/tasks/util/env.py index d44584c..70bc989 100644 --- a/tasks/util/env.py +++ b/tasks/util/env.py @@ -1,14 +1,27 @@ from os.path import dirname, realpath, expanduser, join, exists from shutil import rmtree -from os import makedirs +from os import getenv, makedirs from subprocess import run HOME_DIR = expanduser("~") PROJ_ROOT = dirname(dirname(dirname(realpath(__file__)))) +BIN_DIR = join(PROJ_ROOT, "bin") +GLOBAL_BIN_DIR = "/usr/local/bin" +CONFIG_DIR = join(PROJ_ROOT, "config") FAASM_ROOT = join(HOME_DIR, "faasm") SYSTEM_NAME = "Granny" +K9S_VERSION = "0.32.2" + +AZURE_RESOURCE_GROUP = "faasm" ACR_NAME = "faasm.azurecr.io" +AKS_CLUSTER_NAME = getenv("USER") + "-faasm-cluster" +AKS_VM_SIZE = "Standard_DS5_v2" +AKS_NODE_COUNT = 4 +AKS_REGION = "eastus" +AZURE_PUB_SSH_KEY = "~/.ssh/id_rsa.pub" +KUBECTL_BIN = join(PROJ_ROOT, "bin", "kubectl") + FAABRIC_EXP_IMAGE_NAME = "faabric-experiments" 
NATIVE_BUILD_DIR = join(PROJ_ROOT, "build", "native") diff --git a/tasks/util/version.py new file mode 100644 index 0000000..46baaaa --- /dev/null +++ b/tasks/util/version.py @@ -0,0 +1,23 @@ +from os.path import join + +from tasks.util.env import PROJ_ROOT + +# Note - this must match the version used by Faasm. NOTE(review): K9S_VERSION here ("0.24.15") diverges from tasks/util/env.py ("0.32.2"), which is the one tasks/k8s.py actually imports - confirm which is canonical and drop the duplicate. +KNATIVE_VERSION = "1.1.0" +K9S_VERSION = "0.24.15" + + +def _read_ver_file(file_path): + with open(file_path, "r") as fh: + ver = fh.read() + ver = ver.strip() + + return ver + + +def get_version(): + return _read_ver_file(join(PROJ_ROOT, "VERSION")) + + +def get_k8s_version(): + return _read_ver_file(join(PROJ_ROOT, "K8S_VERSION"))