From 05e4c784febb74b405ad97cb975a4663fdc86e3c Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Mon, 20 Jan 2025 18:38:27 +0000 Subject: [PATCH 01/34] nydus: support host-sharing --- tasks/nydus_snapshotter.py | 123 ++++++++++++++++++++++++++++++++++--- 1 file changed, 113 insertions(+), 10 deletions(-) diff --git a/tasks/nydus_snapshotter.py b/tasks/nydus_snapshotter.py index d5fbf64e..97104743 100644 --- a/tasks/nydus_snapshotter.py +++ b/tasks/nydus_snapshotter.py @@ -1,14 +1,24 @@ from invoke import task -from os.path import join +from os.path import exists, join +from shutil import rmtree from subprocess import run from tasks.util.docker import copy_from_ctr_image, is_ctr_running from tasks.util.env import COCO_ROOT, GHCR_URL, GITHUB_ORG, PROJ_ROOT, print_dotted_line from tasks.util.toml import update_toml from tasks.util.versions import NYDUS_SNAPSHOTTER_VERSION -NYDUS_SNAPSHOTTER_CONFIG_FILE = join( - COCO_ROOT, "share", "nydus-snapshotter", "config-coco-guest-pulling.toml" +NYDUS_SNAPSHOTTER_CONFIG_DIR = join(COCO_ROOT, "share", "nydus-snapshotter") +NYDUS_SNAPSHOTTER_GUEST_PULL_CONFIG = join( + NYDUS_SNAPSHOTTER_CONFIG_DIR, "config-coco-guest-pulling.toml" ) +NYDUS_SNAPSHOTTER_HOST_SHARING_CONFIG = join( + NYDUS_SNAPSHOTTER_CONFIG_DIR, "config-coco-host-sharing.toml" +) + +NYDUS_SNAPSHOTTER_CONFIG_FILES = [ + NYDUS_SNAPSHOTTER_GUEST_PULL_CONFIG, + NYDUS_SNAPSHOTTER_HOST_SHARING_CONFIG, +] NYDUS_SNAPSHOTTER_CTR_NAME = "nydus-snapshotter-workon" NYDUS_SNAPSHOTTER_IMAGE_TAG = ( join(GHCR_URL, GITHUB_ORG, "nydus-snapshotter") + f":{NYDUS_SNAPSHOTTER_VERSION}" @@ -47,6 +57,47 @@ def install(debug=False, clean=False): NYDUS_SNAPSHOTTER_IMAGE_TAG, ctr_binaries, host_binaries, requires_sudo=True ) + # Populate the host-sharing config file + if clean: + rmtree(NYDUS_SNAPSHOTTER_HOST_SHARING_CONFIG) + + if not exists(NYDUS_SNAPSHOTTER_HOST_SHARING_CONFIG): + host_sharing_config = """ +version = 1 +root = "/var/lib/containerd-nydus" +address = "/run/containerd-nydus/containerd-nydus-grpc.sock" +daemon_mode = "none" + +[system] +enable = true +address = "/run/containerd-nydus/system.sock" + +[daemon] +fs_driver = "blockdev" +nydusimage_path = "/usr/local/bin/nydus-image" + +[remote] +skip_ssl_verify = true + +[snapshot] +enable_kata_volume = true + +[experimental.tarfs] +enable_tarfs = true +mount_tarfs_on_host = false +export_mode = "image_block_with_verity" +""" + cmd = """ +sudo sh -c 'cat < {destination_file} +{file_contents} +EOF' +""".format( + destination_file=NYDUS_SNAPSHOTTER_HOST_SHARING_CONFIG, + file_contents=host_sharing_config, + ) + + run(cmd, shell=True, check=True) + # Remove all nydus config for a clean start if clean: run("sudo rm -rf /var/lib/containerd-nydus", shell=True, check=True) @@ -87,13 +138,14 @@ def set_log_level(ctx, log_level): ) return - updated_toml_str = """ - [log] - level = "{log_level}" - """.format( - log_level=log_level - ) - update_toml(NYDUS_SNAPSHOTTER_CONFIG_FILE, updated_toml_str) + for config_file in NYDUS_SNAPSHOTTER_CONFIG_FILES: + updated_toml_str = """ + [log] + level = "{log_level}" + """.format( + log_level=log_level + ) + update_toml(config_file, updated_toml_str) restart_nydus_snapshotter() @@ -148,3 +200,54 @@ def hot_replace(ctx): run(docker_cmd, shell=True, check=True) restart_nydus_snapshotter() + + +@task +def set_mode(ctx, mode): + """ + Set the nydus-snapshotter operation mode: 'guest-pulling', or 'host-sharing' + """ + if mode not in ["guest-pulling", "host-sharing"]: + print(f"ERROR: unrecognised nydus-snapshotter mode: {mode}") + print("ERROR: mode must be one in: ['guest-pulling', 'host-sharing']") + return + + config_file = ( + NYDUS_SNAPSHOTTER_HOST_SHARING_CONFIG + if mode == "host-sharing" + else NYDUS_SNAPSHOTTER_GUEST_PULL_CONFIG + ) + exec_start = ( + f"{NYDUS_SNAPSHOTTER_HOST_BINPATH}/containerd-nydus-grpc-hybrid " + f"--config ${config_file} --log-to-stdout" + ) + + service_config = """ +[Unit] +Description=Nydus snapshotter +After=network.target local-fs.target +Before=containerd.service + +[Service] +ExecStart={} + +[Install] +RequiredBy=containerd.service +""".format( + exec_start + ) + + service_path = "/etc/systemd/system/nydus-snapshotter.service" + cmd = """ +sudo sh -c 'cat < {destination_file} +{file_contents} +EOF' +""".format( + destination_file=service_path, + file_contents=service_config, + ) + run(cmd, shell=True, check=True) + + # Reload systemd to apply the new service configuration + run("sudo systemctl daemon-reload", shell=True, check=True) + run("sudo systemctl restart nydus-snapshotter.service", shell=True, check=True) From b26531299e6d0130b2d2422eb5ddeeac6d7301ed Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Wed, 22 Jan 2025 18:58:09 +0000 Subject: [PATCH 02/34] nydus: minor fix --- tasks/nydus_snapshotter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/nydus_snapshotter.py b/tasks/nydus_snapshotter.py index 97104743..f6280daf 100644 --- a/tasks/nydus_snapshotter.py +++ b/tasks/nydus_snapshotter.py @@ -218,8 +218,8 @@ def set_mode(ctx, mode): else NYDUS_SNAPSHOTTER_GUEST_PULL_CONFIG ) exec_start = ( - f"{NYDUS_SNAPSHOTTER_HOST_BINPATH}/containerd-nydus-grpc-hybrid " - f"--config ${config_file} --log-to-stdout" + f"{NYDUS_SNAPSHOTTER_HOST_BINPATH}/containerd-nydus-grpc " + f"--config {config_file} --log-to-stdout" ) service_config = """ From 5ae8036d0e813bf10405852e0bc3757de0a94745 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Fri, 24 Jan 2025 16:39:23 +0000 Subject: [PATCH 03/34] nydus: change names in set mode --- tasks/nydus_snapshotter.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tasks/nydus_snapshotter.py b/tasks/nydus_snapshotter.py index f6280daf..509e17c6 100644 --- a/tasks/nydus_snapshotter.py +++ b/tasks/nydus_snapshotter.py @@ -205,16 +205,16 @@ def hot_replace(ctx): @task def set_mode(ctx, mode): """ - Set the nydus-snapshotter operation mode: 'guest-pulling', or 'host-sharing' + Set the nydus-snapshotter operation mode: 'guest-pull', or 'host-share' """ - if mode not in ["guest-pulling", "host-sharing"]: + if mode not in ["guest-pull", "host-share"]: print(f"ERROR: unrecognised nydus-snapshotter mode: {mode}") - print("ERROR: mode must be one in: ['guest-pulling', 'host-sharing']") + print("ERROR: mode must be one in: ['guest-pull', 'host-share']") return config_file = ( NYDUS_SNAPSHOTTER_HOST_SHARING_CONFIG - if mode == "host-sharing" + if mode == "host-share" else NYDUS_SNAPSHOTTER_GUEST_PULL_CONFIG ) exec_start = ( From a3a029f2eca0fbf3bdf8d3af7250dfee97aa77df Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Fri, 31 Jan 2025 16:03:41 +0000 Subject: [PATCH 04/34] single entrypoint to set log level --- tasks/containerd.py | 12 +---------- tasks/kata.py | 13 ++---------- tasks/kernel.py | 7 ++++++- tasks/nydus.py | 12 +++++++++-- tasks/nydus_snapshotter.py | 43 +++++++++++++++++++++----------------- tasks/sc2.py | 30 ++++++++++++++++++++++++-- 6 files changed, 71 insertions(+), 46 deletions(-) diff --git a/tasks/containerd.py b/tasks/containerd.py index 1abf3eed..1cb9f6c2 100644 --- a/tasks/containerd.py +++ b/tasks/containerd.py @@ -72,20 +72,10 @@ def cli(ctx, mount_path=join(PROJ_ROOT, "..", "containerd")): run("docker exec -it {} bash".format(CONTAINERD_CTR_NAME), shell=True, check=True) -@task -def set_log_level(ctx, log_level): +def set_log_level(log_level): """ Set containerd's log level, must be one in: info, debug """ - allowed_log_levels = ["info", "debug"] - if log_level not in allowed_log_levels: - print( - "Unsupported log level '{}'. Must be one in: {}".format( - log_level, allowed_log_levels - ) - ) - return - updated_toml_str = """ [debug] level = "{log_level}" diff --git a/tasks/kata.py b/tasks/kata.py index 1128fb67..3548e45a 100644 --- a/tasks/kata.py +++ b/tasks/kata.py @@ -67,20 +67,10 @@ def stop(ctx): stop_kata_workon_ctr() -@task -def set_log_level(ctx, log_level): +def set_log_level(log_level): """ Set kata's log level, must be one in: info, debug """ - allowed_log_levels = ["info", "debug"] - if log_level not in allowed_log_levels: - print( - "Unsupported log level '{}'. Must be one in: {}".format( - log_level, allowed_log_levels - ) - ) - return - enable_debug = str(log_level == "debug").lower() for runtime in KATA_RUNTIMES + SC2_RUNTIMES: @@ -146,6 +136,7 @@ def hot_replace_shim(ctx, runtime="qemu-snp-sc2"): ), ), sc2=runtime in SC2_RUNTIMES, + hot_replace=True, ) restart_containerd() diff --git a/tasks/kernel.py b/tasks/kernel.py index d8af7980..9b07acb8 100644 --- a/tasks/kernel.py +++ b/tasks/kernel.py @@ -54,8 +54,10 @@ def build_guest(debug=False, hot_replace=False): ctr_path, host_path, sudo=False, debug=debug, hot_replace=hot_replace ) + # The -V option enables dm-verity support in the guest (technically only + # needed for SC2) build_kernel_base_cmd = [ - f"./build-kernel.sh -x -f -v {GUEST_KERNEL_VERSION}", + f"./build-kernel.sh -x -V -f -v {GUEST_KERNEL_VERSION}", "-u 'https://cdn.kernel.org/pub/linux/kernel/v{}.x/'".format( GUEST_KERNEL_VERSION.split(".")[0] ), @@ -117,4 +119,7 @@ def build_guest(debug=False, hot_replace=False): @task def hot_replace_guest(ctx, debug=False): + """ + Hot-replace guest kernel + """ build_guest(debug=debug, hot_replace=True) diff --git a/tasks/nydus.py b/tasks/nydus.py index 63d70647..a8aef3cb 100644 --- a/tasks/nydus.py +++ b/tasks/nydus.py @@ -2,7 +2,7 @@ from os.path import join from subprocess import run from tasks.util.docker import copy_from_ctr_image -from tasks.util.env import GHCR_URL, GITHUB_ORG, PROJ_ROOT, print_dotted_line +from tasks.util.env import COCO_ROOT, GHCR_URL, GITHUB_ORG, PROJ_ROOT, print_dotted_line from tasks.util.nydus import NYDUSIFY_PATH from tasks.util.versions import NYDUS_VERSION @@ -27,12 +27,20 @@ def build(ctx, nocache=False, push=False): def do_install(): - print_dotted_line(f"Installing nydusify (v{NYDUS_VERSION})") + print_dotted_line(f"Installing nydus image services (v{NYDUS_VERSION})") + # Non root-owned binaries ctr_bin = ["/go/src/github.com/sc2-sys/nydus/contrib/nydusify/cmd/nydusify"] host_bin = [NYDUSIFY_PATH] copy_from_ctr_image(NYDUS_IMAGE_TAG, ctr_bin, host_bin, requires_sudo=False) + # Root-owned binaries + # The host-pull functionality requires nydus-image >= 2.3.0, but the one + # installed with the daemon is 2.2.4 + ctr_bin = ["/go/src/github.com/sc2-sys/nydus/target/release/nydus-image"] + host_bin = [join(COCO_ROOT, "bin", "nydus-image")] + copy_from_ctr_image(NYDUS_IMAGE_TAG, ctr_bin, host_bin, requires_sudo=True) + print("Success!") diff --git a/tasks/nydus_snapshotter.py b/tasks/nydus_snapshotter.py index 509e17c6..f6937841 100644 --- a/tasks/nydus_snapshotter.py +++ b/tasks/nydus_snapshotter.py @@ -1,6 +1,5 @@ from invoke import task from os.path import exists, join -from shutil import rmtree from subprocess import run from tasks.util.docker import copy_from_ctr_image, is_ctr_running from tasks.util.env import COCO_ROOT, GHCR_URL, GITHUB_ORG, PROJ_ROOT, print_dotted_line @@ -39,6 +38,23 @@ def restart_nydus_snapshotter(): run("sudo service nydus-snapshotter restart", shell=True, check=True) +def do_purge(): + # TODO: is this too much/too little? + # Seems not enough, we need to delete the images manually with crictl rmi + # TODO: delete pause image manually -> something intersting happens!! + run("sudo rm -rf /var/lib/containerd-nydus", shell=True, check=True) + + restart_nydus_snapshotter() + + +@task +def purge(ctx): + """ + Remove all cached snapshots in the snapshotter cache + """ + do_purge() + + def install(debug=False, clean=False): """ Install the nydus snapshotter binaries @@ -57,10 +73,6 @@ def install(debug=False, clean=False): NYDUS_SNAPSHOTTER_IMAGE_TAG, ctr_binaries, host_binaries, requires_sudo=True ) - # Populate the host-sharing config file - if clean: - rmtree(NYDUS_SNAPSHOTTER_HOST_SHARING_CONFIG) - if not exists(NYDUS_SNAPSHOTTER_HOST_SHARING_CONFIG): host_sharing_config = """ version = 1 @@ -74,7 +86,7 @@ def install(debug=False, clean=False): [daemon] fs_driver = "blockdev" -nydusimage_path = "/usr/local/bin/nydus-image" +nydusimage_path = "{}" [remote] skip_ssl_verify = true @@ -86,7 +98,10 @@ def install(debug=False, clean=False): enable_tarfs = true mount_tarfs_on_host = false export_mode = "image_block_with_verity" -""" +""".format( + join(COCO_ROOT, "bin", "nydus-image") + ) + cmd = """ sudo sh -c 'cat < {destination_file} {file_contents} @@ -100,7 +115,7 @@ def install(debug=False, clean=False): # Remove all nydus config for a clean start if clean: - run("sudo rm -rf /var/lib/containerd-nydus", shell=True, check=True) + do_purge() # Restart the nydus service restart_nydus_snapshotter() @@ -124,20 +139,10 @@ def build(ctx, nocache=False, push=False): run(f"docker push {NYDUS_SNAPSHOTTER_IMAGE_TAG}", shell=True, check=True) -@task -def set_log_level(ctx, log_level): +def set_log_level(log_level): """ Set the log level for the nydus snapshotter """ - allowed_log_levels = ["info", "debug"] - if log_level not in allowed_log_levels: - print( - "Unsupported log level '{}'. Must be one in: {}".format( - log_level, allowed_log_levels - ) - ) - return - for config_file in NYDUS_SNAPSHOTTER_CONFIG_FILES: updated_toml_str = """ [log] diff --git a/tasks/sc2.py b/tasks/sc2.py index 3fbe3b3a..56fa4beb 100644 --- a/tasks/sc2.py +++ b/tasks/sc2.py @@ -2,16 +2,23 @@ from os import environ, makedirs from os.path import exists, join from subprocess import run -from tasks.containerd import install as containerd_install +from tasks.containerd import ( + install as containerd_install, + set_log_level as containerd_set_log_level, +) from tasks.demo_apps import ( do_push_to_local_registry as push_demo_apps_to_local_registry, ) from tasks.k8s import install as k8s_tooling_install from tasks.k9s import install as k9s_install +from tasks.kata import set_log_level as kata_set_log_level from tasks.kernel import build_guest as build_guest_kernel from tasks.knative import install as knative_install from tasks.kubeadm import create as k8s_create, destroy as k8s_destroy -from tasks.nydus_snapshotter import install as nydus_snapshotter_install +from tasks.nydus_snapshotter import ( + install as nydus_snapshotter_install, + set_log_level as nydus_snapshotter_set_log_level, +) from tasks.nydus import do_install as nydus_install from tasks.operator import ( install as operator_install, @@ -353,3 +360,22 @@ def destroy(ctx, debug=False): assert result.returncode == 0, print(result.stderr.decode("utf-8").strip()) if debug: print(result.stdout.decode("utf-8").strip()) + + +@task +def set_log_level(ctx, log_level): + """ + Set log level for all SC2 containers: containerd, kata, and nydus-snapshotter + """ + allowed_log_levels = ["info", "debug"] + if log_level not in allowed_log_levels: + print( + "Unsupported log level '{}'. Must be one in: {}".format( + log_level, allowed_log_levels + ) + ) + return + + containerd_set_log_level(log_level) + kata_set_log_level(log_level) + nydus_snapshotter_set_log_level(log_level) From 7fc5a49b9bcb8975dbeff6b88dac684b9b8f46bb Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Fri, 31 Jan 2025 16:38:03 +0000 Subject: [PATCH 05/34] add method to stop containerd and nydus-sn containers --- tasks/containerd.py | 14 ++++++++++++++ tasks/nydus_snapshotter.py | 14 ++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/tasks/containerd.py b/tasks/containerd.py index 1cb9f6c2..e079605f 100644 --- a/tasks/containerd.py +++ b/tasks/containerd.py @@ -72,6 +72,20 @@ def cli(ctx, mount_path=join(PROJ_ROOT, "..", "containerd")): run("docker exec -it {} bash".format(CONTAINERD_CTR_NAME), shell=True, check=True) +@task +def stop(ctx): + """ + Stop the containerd work-on container + """ + result = run( + "docker rm -f {}".format(CONTAINERD_CTR_NAME), + shell=True, + check=True, + capture_output=True, + ) + assert result.returncode == 0 + + def set_log_level(log_level): """ Set containerd's log level, must be one in: info, debug diff --git a/tasks/nydus_snapshotter.py b/tasks/nydus_snapshotter.py index f6937841..6adb28c4 100644 --- a/tasks/nydus_snapshotter.py +++ b/tasks/nydus_snapshotter.py @@ -181,6 +181,20 @@ def cli(ctx, mount_path=join(PROJ_ROOT, "..", "nydus-snapshotter")): ) +@task +def stop(ctx): + """ + Stop the nydus-snapshotter work-on container + """ + result = run( + "docker rm -f {}".format(NYDUS_SNAPSHOTTER_CTR_NAME), + shell=True, + check=True, + capture_output=True, + ) + assert result.returncode == 0 + + @task def hot_replace(ctx): """ From 25236dcd6c4bbccf80ec8c947401a7e4d13dcc7e Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Fri, 31 Jan 2025 18:01:44 +0000 Subject: [PATCH 06/34] nydus: install host-sharing as separate snapshotter --- .github/workflows/tests.yml | 62 +++++++++++++++++++++ docs/troubleshooting.md | 23 ++++++-- tasks/containerd.py | 17 +++--- tasks/nydus_snapshotter.py | 104 ++++++++++++++++++++++++++++++------ 4 files changed, 177 insertions(+), 29 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 93d35d7a..42251cc7 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -171,6 +171,68 @@ jobs: sleep 5 echo "Knative test succesful!" + - name: "Run nydus host-share test" + run: | + # Change the snapshotter mode + ./bin/inv_wrapper.sh nydus-snapshotter.set-mode host-share + + export SC2_RUNTIME_CLASS=qemu-${{ matrix.tee }}-sc2 + export POD_LABEL="apps.sc2.io/name=helloworld-py" + + # ----- Python Test ---- + + echo "Running python test..." + envsubst < ./demo-apps/helloworld-py-nydus/deployment.yaml | ./bin/kubectl apply -f - + + # Wait for pod to be ready + until [ "$(./bin/kubectl get pods -l ${POD_LABEL} -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}')" = "True" ]; do echo "Waiting for pod to be ready..."; sleep 2; done + sleep 1 + + # Get the pod's IP + service_ip=$(./bin/kubectl get services -o jsonpath='{.items[?(@.metadata.name=="coco-helloworld-py-node-port")].spec.clusterIP}') + [ "$(curl --retry 3 -X GET ${service_ip}:8080)" = "Hello World!" ] + envsubst < ./demo-apps/helloworld-py-nydus/deployment.yaml | ./bin/kubectl delete -f - + + # Wait for pod to be deleted + ./bin/kubectl wait --for=delete -l ${POD_LABEL} pod --timeout=30s + + # Extra cautionary sleep + sleep 5 + echo "Python test succesful!" + + # ----- Knative Test ---- + envsubst < ./demo-apps/helloworld-knative-nydus/service.yaml | ./bin/kubectl apply -f - + sleep 1 + + # Get the service URL + service_url=$(./bin/kubectl get ksvc helloworld-knative --output=custom-columns=URL:.status.url --no-headers) + [ "$(curl --retry 3 ${service_url})" = "Hello World!" ] + + # Wait for pod to be deleted + envsubst < ./demo-apps/helloworld-knative-nydus/service.yaml | ./bin/kubectl delete -f - + ./bin/kubectl wait --for=delete -l ${POD_LABEL} pod --timeout=60s + + # Extra cautionary sleep + sleep 5 + echo "Knative test succesful!" + + # Change the snapshotter mode back again + ./bin/inv_wrapper.sh nydus-snapshotter.set-mode guest-pull + + - name: "Enable default-memory annotation" + run: | + for runtime_class in ${{ matrix.runtime_classes }}; do + ./bin/inv_wrapper.sh kata.enable-annotation default_memory --runtime ${runtime_class} + # Here we benefit that the last variable is the one we want to use + # for vm-cache + export SC2_RUNTIME_CLASS=${runtime_class} + done + + # Aftre changing the annotation of the qemu-snp-sc2 runtime class we + # need to restart the VM cache + sudo -E ./vm-cache/target/release/vm-cache stop + sudo -E ./vm-cache/target/release/vm-cache background + - name: "Enable default-memory annotation" run: | for runtime_class in ${{ matrix.runtime_classes }}; do diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index d1b4aa54..a6cca897 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -151,14 +151,27 @@ ctr -n k8s.io content fetch ${IMAGE_NAME} the image name is the image tag appearing right before the error message in the pod logs. -### Nydus snapshot corruption +### Rootfs Mount Issue -Sometimes, after hot-replacing the nydus-snapshotter, snapshots become corrupted, -and we can see the error below. +Sometimes, if we are mixing and matching different snapshotters, we may run +into the following error: ``` Failed to create pod sandbox: rpc error: code = Unknown desc = failed to create containerd task: failed to create shim task: failed to mount /run/kata-containers/shared/containers/0a583f0691d78e2036425f99bdac8e03302158320c1c55a5c6482cae7e729009/rootfs to /run/kata-containers/0a583f0691d78e2036425f99bdac8e03302158320c1c55a5c6482cae7e729009/rootfs, with error: ENOENT: No such file or directory ``` -The only solution I found was to bump to a more up-to-date version of nydus. -This seemed to fix the issue. +this is because the pause image bundle has not been unpacked correctly. Note +that the pause image bundle is unpacked into the `/run/kata-containers/shared` +directory, and then mounted into the `/run/kata-containers/` one. + +This usually happens when containerd believes that we already have the pause +image, so we do not need to pull it. This prevents the snapshotter from +generating the respective Kata virtual volumes. + +As a rule of thumb, a good fix is to remove all images involved in the app +from the content store, and purge snapshotter caches: + +```bash +sudo crictl rmi +inv nydus-snapshotter.purge +``` diff --git a/tasks/containerd.py b/tasks/containerd.py index e079605f..c12ab5d9 100644 --- a/tasks/containerd.py +++ b/tasks/containerd.py @@ -31,23 +31,24 @@ CONTAINERD_HOST_BINPATH = "/usr/bin" -def do_build(debug=False): - docker_cmd = "docker build -t {} -f {} .".format( +def do_build(nocache=False): + docker_cmd = "docker build{} -t {} -f {} .".format( + " --no-cache" if nocache else "", CONTAINERD_IMAGE_TAG, join(PROJ_ROOT, "docker", "containerd.dockerfile"), ) - result = run(docker_cmd, shell=True, capture_output=True, cwd=PROJ_ROOT) - assert result.returncode == 0, print(result.stderr.decode("utf-8").strip()) - if debug: - print(result.stdout.decode("utf-8").strip()) + run(docker_cmd, shell=True, check=True, cwd=PROJ_ROOT) @task -def build(ctx): +def build(ctx, nocache=False, push=False): """ Build the containerd fork for CoCo """ - do_build(debug=True) + do_build(nocache=nocache) + + if push: + run(f"docker push {CONTAINERD_IMAGE_TAG}", shell=True, check=True) @task diff --git a/tasks/nydus_snapshotter.py b/tasks/nydus_snapshotter.py index 6adb28c4..7f859e40 100644 --- a/tasks/nydus_snapshotter.py +++ b/tasks/nydus_snapshotter.py @@ -2,21 +2,34 @@ from os.path import exists, join from subprocess import run from tasks.util.docker import copy_from_ctr_image, is_ctr_running -from tasks.util.env import COCO_ROOT, GHCR_URL, GITHUB_ORG, PROJ_ROOT, print_dotted_line -from tasks.util.toml import update_toml +from tasks.util.env import ( + COCO_ROOT, + CONTAINERD_CONFIG_FILE, + CONTAINERD_CONFIG_ROOT, + GHCR_URL, + GITHUB_ORG, + KATA_RUNTIMES, + PROJ_ROOT, + SC2_RUNTIMES, + print_dotted_line, +) +from tasks.util.toml import read_value_from_toml, update_toml from tasks.util.versions import NYDUS_SNAPSHOTTER_VERSION +NYDUS_SNAPSHOTTER_GUEST_PULL_NAME = "nydus" +NYDUS_SNAPSHOTTER_HOST_SHARE_NAME = "nydus-hs" + NYDUS_SNAPSHOTTER_CONFIG_DIR = join(COCO_ROOT, "share", "nydus-snapshotter") NYDUS_SNAPSHOTTER_GUEST_PULL_CONFIG = join( NYDUS_SNAPSHOTTER_CONFIG_DIR, "config-coco-guest-pulling.toml" ) -NYDUS_SNAPSHOTTER_HOST_SHARING_CONFIG = join( +NYDUS_SNAPSHOTTER_HOST_SHARE_CONFIG = join( NYDUS_SNAPSHOTTER_CONFIG_DIR, "config-coco-host-sharing.toml" ) NYDUS_SNAPSHOTTER_CONFIG_FILES = [ NYDUS_SNAPSHOTTER_GUEST_PULL_CONFIG, - NYDUS_SNAPSHOTTER_HOST_SHARING_CONFIG, + NYDUS_SNAPSHOTTER_HOST_SHARE_CONFIG, ] NYDUS_SNAPSHOTTER_CTR_NAME = "nydus-snapshotter-workon" NYDUS_SNAPSHOTTER_IMAGE_TAG = ( @@ -39,10 +52,10 @@ def restart_nydus_snapshotter(): def do_purge(): - # TODO: is this too much/too little? - # Seems not enough, we need to delete the images manually with crictl rmi - # TODO: delete pause image manually -> something intersting happens!! - run("sudo rm -rf /var/lib/containerd-nydus", shell=True, check=True) + # Sometimes this may not be enough, and we need to manually delete images + # using something like `sudo crictl rmi ...` + for snap in [NYDUS_SNAPSHOTTER_HOST_SHARE_NAME, NYDUS_SNAPSHOTTER_GUEST_PULL_NAME]: + run(f"sudo rm -rf /var/lib/containerd-{snap}", shell=True, check=True) restart_nydus_snapshotter() @@ -59,7 +72,7 @@ def install(debug=False, clean=False): """ Install the nydus snapshotter binaries """ - print_dotted_line(f"Installing nydus-snapshotter (v{NYDUS_SNAPSHOTTER_VERSION})") + print_dotted_line(f"Installing nydus-snapshotter(s) (v{NYDUS_SNAPSHOTTER_VERSION})") host_binaries = [ join(NYDUS_SNAPSHOTTER_HOST_BINPATH, binary) @@ -73,10 +86,46 @@ def install(debug=False, clean=False): NYDUS_SNAPSHOTTER_IMAGE_TAG, ctr_binaries, host_binaries, requires_sudo=True ) - if not exists(NYDUS_SNAPSHOTTER_HOST_SHARING_CONFIG): + # We install nydus with host-sharing as a "different" snapshotter + imports = read_value_from_toml(CONTAINERD_CONFIG_FILE, "imports") + host_share_import_path = join( + CONTAINERD_CONFIG_ROOT, + "config.toml.d", + f"{NYDUS_SNAPSHOTTER_HOST_SHARE_NAME}-snapshotter.toml", + ) + if host_share_import_path not in imports: + config_file = """ +[proxy_plugins] + [proxy_plugins.{}] + type = "snapshot" + address = "/run/containerd-nydus/containerd-nydus-grpc.sock" +""".format( + NYDUS_SNAPSHOTTER_HOST_SHARE_NAME + ) + + cmd = """ +sudo sh -c 'cat < {destination_file} +{file_contents} +EOF' +""".format( + destination_file=host_share_import_path, + file_contents=config_file, + ) + + run(cmd, shell=True, check=True) + + imports += [host_share_import_path] + updated_toml_str = """ + imports = [ {sn} ] + """.format( + sn=",".join([f'"{s}"' for s in imports]) + ) + update_toml(CONTAINERD_CONFIG_FILE, updated_toml_str) + + if not exists(NYDUS_SNAPSHOTTER_HOST_SHARE_CONFIG): host_sharing_config = """ version = 1 -root = "/var/lib/containerd-nydus" +root = "/var/lib/containerd-{nydus_hs_name}" address = "/run/containerd-nydus/containerd-nydus-grpc.sock" daemon_mode = "none" @@ -86,7 +135,7 @@ def install(debug=False, clean=False): [daemon] fs_driver = "blockdev" -nydusimage_path = "{}" +nydusimage_path = "{nydus_image_path}" [remote] skip_ssl_verify = true @@ -99,7 +148,8 @@ def install(debug=False, clean=False): mount_tarfs_on_host = false export_mode = "image_block_with_verity" """.format( - join(COCO_ROOT, "bin", "nydus-image") + nydus_hs_name=NYDUS_SNAPSHOTTER_HOST_SHARE_NAME, + nydus_image_path=join(COCO_ROOT, "bin", "nydus-image"), ) cmd = """ @@ -107,7 +157,7 @@ def install(debug=False, clean=False): {file_contents} EOF' """.format( - destination_file=NYDUS_SNAPSHOTTER_HOST_SHARING_CONFIG, + destination_file=NYDUS_SNAPSHOTTER_HOST_SHARE_CONFIG, file_contents=host_sharing_config, ) @@ -123,6 +173,11 @@ def install(debug=False, clean=False): print("Success!") +@task +def foo(ctx): + install(clean=True, debug=False) + + @task def build(ctx, nocache=False, push=False): """ @@ -232,7 +287,7 @@ def set_mode(ctx, mode): return config_file = ( - NYDUS_SNAPSHOTTER_HOST_SHARING_CONFIG + NYDUS_SNAPSHOTTER_HOST_SHARE_CONFIG if mode == "host-share" else NYDUS_SNAPSHOTTER_GUEST_PULL_CONFIG ) @@ -267,6 +322,23 @@ def set_mode(ctx, mode): ) run(cmd, shell=True, check=True) + # Update all runtime configurations to use the right snapshotter. We + # _always_ avoid having both snapshotters co-existing + snap_name = ( + NYDUS_SNAPSHOTTER_HOST_SHARE_NAME + if mode == "host-share" + else NYDUS_SNAPSHOTTER_GUEST_PULL_NAME + ) + for runtime in KATA_RUNTIMES + SC2_RUNTIMES: + updated_toml_str = """ + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata-{runtime_name}] + snapshotter = "{snapshotter_name}" + """.format( + runtime_name=runtime, snapshotter_name=snap_name + ) + update_toml(CONTAINERD_CONFIG_FILE, updated_toml_str) + # Reload systemd to apply the new service configuration run("sudo systemctl daemon-reload", shell=True, check=True) - run("sudo systemctl restart nydus-snapshotter.service", shell=True, check=True) + + restart_nydus_snapshotter() From 0b4028f6f440e1ab5163007aabd189a002e4cb5c Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Mon, 3 Feb 2025 12:15:42 +0000 Subject: [PATCH 07/34] improve purge + add doc --- .github/workflows/tests.yml | 17 +--------------- docs/image_pull.md | 28 ++++++++++++++++++++++++++ tasks/nydus_snapshotter.py | 40 +++++++++++++++++++++++++++++-------- 3 files changed, 61 insertions(+), 24 deletions(-) create mode 100644 docs/image_pull.md diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 42251cc7..354c6486 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -230,22 +230,7 @@ jobs: # Aftre changing the annotation of the qemu-snp-sc2 runtime class we # need to restart the VM cache - sudo -E ./vm-cache/target/release/vm-cache stop - sudo -E ./vm-cache/target/release/vm-cache background - - - name: "Enable default-memory annotation" - run: | - for runtime_class in ${{ matrix.runtime_classes }}; do - ./bin/inv_wrapper.sh kata.enable-annotation default_memory --runtime ${runtime_class} - # Here we benefit that the last variable is the one we want to use - # for vm-cache - export SC2_RUNTIME_CLASS=${runtime_class} - done - - # Aftre changing the annotation of the qemu-snp-sc2 runtime class we - # need to restart the VM cache - sudo -E ./vm-cache/target/release/vm-cache stop - sudo -E ./vm-cache/target/release/vm-cache background + sudo -E ./vm-cache/target/release/vm-cache restart - name: "Run knative chaining demo" run: | diff --git a/docs/image_pull.md b/docs/image_pull.md new file mode 100644 index 00000000..d4bf8f04 --- /dev/null +++ b/docs/image_pull.md @@ -0,0 +1,28 @@ +## Image Pull + +This document describes the different mechanisms to get a container image +inside a cVM in SC2. We _always_ assume that the integrity of container images +must be validated. We also consider the situation in which their confidentiality +must also be preserved. + +### Guest Pull + +The guest pull mechanism always pulls the container image inside the guest cVM. +This is the default mechanism in CoCo as it allows the most secure, and simplest +deployment: users sign (and encrypt) container images locally, they upload +them to a container registry, pull them inside the cVM, and decrypt them inside +the cVM. + +Albeit secure, this mechanism has high performance overheads as the image must +be pulled every single time, precluding any caching benefits. + +### Host Share + +The host share mechanism mounts a container image from the host to the guest. +Given that the host is untrusted, this mechanism only works for images that +do not have confidentiality requirements. To maintain integrity, we mount +the image with `dm-verity`, and validate the `dm-verity` device as part of +attestation. + +We could mount encrypted images from the host to the guest, but we would be +losing on the de-duplication opportunities in the host. diff --git a/tasks/nydus_snapshotter.py b/tasks/nydus_snapshotter.py index 7f859e40..7d234bd1 100644 --- a/tasks/nydus_snapshotter.py +++ b/tasks/nydus_snapshotter.py @@ -1,4 +1,5 @@ from invoke import task +from json import loads as json_loads from os.path import exists, join from subprocess import run from tasks.util.docker import copy_from_ctr_image, is_ctr_running @@ -9,6 +10,7 @@ GHCR_URL, GITHUB_ORG, KATA_RUNTIMES, + LOCAL_REGISTRY_URL, PROJ_ROOT, SC2_RUNTIMES, print_dotted_line, @@ -52,11 +54,38 @@ def restart_nydus_snapshotter(): def do_purge(): - # Sometimes this may not be enough, and we need to manually delete images - # using something like `sudo crictl rmi ...` + """ + Purging the snapshotters for a fresh-start is a two step process. First, + we need to remove all nydus metadata. This can be achieved by just + bluntly removing `/var/lib/containerd-nydus-*`. Secondly, we need to + reset a map that we keep in containerd's image store of what images + have we pulled with which snapshotters. This is, essentially, what + we see when we run `sudo crictl images`. There's no easy way to clear + just this map, so what we do is remove all the images that we may have + used. + """ + + # Clear nydus-snapshots for snap in [NYDUS_SNAPSHOTTER_HOST_SHARE_NAME, NYDUS_SNAPSHOTTER_GUEST_PULL_NAME]: run(f"sudo rm -rf /var/lib/containerd-{snap}", shell=True, check=True) + # Clear all possibly used images (only images in our registry, or the + # pause container images) + cmd = ( + "sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock" + " images -o json" + ) + rm_cmd = "sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock rmi" + data = json_loads(run(cmd, shell=True, capture_output=True).stdout.decode("utf-8")) + for image_data in data["images"]: + if any([tag.startswith(LOCAL_REGISTRY_URL) for tag in image_data["repoTags"]]): + run("{} {}".format(rm_cmd, image_data["id"]), shell=True, check=True) + + if any( + [tag.startswith("registry.k8s.io/pause") for tag in image_data["repoTags"]] + ): + run("{} {}".format(rm_cmd, image_data["id"]), shell=True, check=True) + restart_nydus_snapshotter() @@ -146,7 +175,7 @@ def install(debug=False, clean=False): [experimental.tarfs] enable_tarfs = true mount_tarfs_on_host = false -export_mode = "image_block_with_verity" +export_mode = "layer_block_with_verity" """.format( nydus_hs_name=NYDUS_SNAPSHOTTER_HOST_SHARE_NAME, nydus_image_path=join(COCO_ROOT, "bin", "nydus-image"), @@ -173,11 +202,6 @@ def install(debug=False, clean=False): print("Success!") -@task -def foo(ctx): - install(clean=True, debug=False) - - @task def build(ctx, nocache=False, push=False): """ From b46a1a45447774f11cc575574637d2bcc8113671 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Mon, 3 Feb 2025 12:21:12 +0000 Subject: [PATCH 08/34] docs: update --- README.md | 1 + docs/image_pull.md | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/README.md b/README.md index 630a1b43..6bdd7f8d 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,7 @@ For further documentation, you may want to check these other documents: * [CoCo Upgrade](./docs/upgrade_coco.md) - upgrade the current CoCo version. * [Guest Components](./docs/guest_components.md) - instructions to patch components inside SC2 guests. * [Host Kernel](./docs/host_kernel.md) - bump the kernel version in the host. +* [Image Pull](./docs/image_pull.md) - details on the image-pulling mechanisms supported in SC2. * [K8s](./docs/k8s.md) - documentation about configuring a single-node Kubernetes cluster. * [Kata](./docs/kata.md) - instructions to build our custom Kata fork and `initrd` images. * [Key Broker Service](./docs/kbs.md) - docs on using and patching the KBS. diff --git a/docs/image_pull.md b/docs/image_pull.md index d4bf8f04..84448156 100644 --- a/docs/image_pull.md +++ b/docs/image_pull.md @@ -16,6 +16,9 @@ the cVM. Albeit secure, this mechanism has high performance overheads as the image must be pulled every single time, precluding any caching benefits. +To mitigate the performance overheads, we can convert the OCI image to a +Nydus image, that supports lazy loading of container data. + ### Host Share The host share mechanism mounts a container image from the host to the guest. @@ -26,3 +29,23 @@ attestation. We could mount encrypted images from the host to the guest, but we would be losing on the de-duplication opportunities in the host. + +### Usage + +Each image pull mechanism is implemented as a different remote snapshotter +in containerd, all of them based on the [nydus-snapshotter]( +https://github.com/containerd/nydus-snapshotter/) plus our modifications. + +To switch between different image-pulling mechanisms, you only need to change +the snapshotter mode: + +```bash +inv nydus-snapshotter.set-mode [guest-pull,host-share] +``` + +If you see any snapshotter related issues (either in the `containerd` or the +`nydus-snapshotter` journal logs), you can purge the snapshotters: + +```bash +inv nydus-snapshotter.purge +``` From 668c4ebb7651ec28b4e87483ca61ede415b29a17 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Mon, 3 Feb 2025 12:22:28 +0000 Subject: [PATCH 09/34] docs: update --- docs/image_pull.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/image_pull.md b/docs/image_pull.md index 84448156..e31b4f0d 100644 --- a/docs/image_pull.md +++ b/docs/image_pull.md @@ -27,6 +27,9 @@ do not have confidentiality requirements. To maintain integrity, we mount the image with `dm-verity`, and validate the `dm-verity` device as part of attestation. +We choose to mount individual layers separately (rather than whole images), +but we should measure that the former is actually better than the latter. + We could mount encrypted images from the host to the guest, but we would be losing on the de-duplication opportunities in the host. From 95e373f1157576b7b848c6b04c0de54962a22aec Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Thu, 6 Feb 2025 10:28:49 +0000 Subject: [PATCH 10/34] gha: purge when setting snapshotter mode --- .github/workflows/tests.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 354c6486..69ff4388 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -173,8 +173,10 @@ jobs: - name: "Run nydus host-share test" run: | - # Change the snapshotter mode + # Change the snapshotter mode and purge (necessary to clear + # containred's content store) ./bin/inv_wrapper.sh nydus-snapshotter.set-mode host-share + ./bin/inv_wrapper.sh nydus-snapshotter.purge export SC2_RUNTIME_CLASS=qemu-${{ matrix.tee }}-sc2 export POD_LABEL="apps.sc2.io/name=helloworld-py" @@ -216,8 +218,9 @@ jobs: sleep 5 echo "Knative test succesful!" - # Change the snapshotter mode back again + # Change the snapshotter mode back again (and purge) ./bin/inv_wrapper.sh nydus-snapshotter.set-mode guest-pull + ./bin/inv_wrapper.sh nydus-snapshotter.purge - name: "Enable default-memory annotation" run: | From eb324d01b87e47876e5bf071613adcf7609d05fc Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Thu, 6 Feb 2025 10:47:56 +0000 Subject: [PATCH 11/34] nydus-snapshotter: better clean-up --- tasks/nydus_snapshotter.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/tasks/nydus_snapshotter.py b/tasks/nydus_snapshotter.py index 7d234bd1..d25d91f4 100644 --- a/tasks/nydus_snapshotter.py +++ b/tasks/nydus_snapshotter.py @@ -78,13 +78,33 @@ def do_purge(): rm_cmd = "sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock rmi" data = json_loads(run(cmd, shell=True, capture_output=True).stdout.decode("utf-8")) for image_data in data["images"]: - if any([tag.startswith(LOCAL_REGISTRY_URL) for tag in image_data["repoTags"]]): - run("{} {}".format(rm_cmd, image_data["id"]), shell=True, check=True) + # Try matching both by repoTags and repoDigests (the former is sometimes + # empty) + if any( + [ + tag.startswith(LOCAL_REGISTRY_URL) + for tag in image_data["repoTags"] + image_data["repoDigests"] + ] + ): + run( + "{} {} 2> /dev/null".format(rm_cmd, image_data["id"]), + shell=True, + check=True, + ) + continue if any( - [tag.startswith("registry.k8s.io/pause") for tag in image_data["repoTags"]] + [ + tag.startswith("registry.k8s.io/pause") + for tag in image_data["repoTags"] + image_data["repoDigests"] + ] ): - run("{} {}".format(rm_cmd, image_data["id"]), shell=True, check=True) + run( + "{} {} 2> /dev/null".format(rm_cmd, image_data["id"]), + shell=True, + check=True, + ) + continue restart_nydus_snapshotter() From e2a569d1a3bc4d08cd353777ff8c7647f95dbe43 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Thu, 6 Feb 2025 11:20:27 +0000 Subject: [PATCH 12/34] ns: cleanup and debug --- tasks/nydus_snapshotter.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/tasks/nydus_snapshotter.py b/tasks/nydus_snapshotter.py index d25d91f4..54946c51 100644 --- a/tasks/nydus_snapshotter.py +++ b/tasks/nydus_snapshotter.py @@ -1,5 +1,5 @@ from invoke import task -from json import loads as json_loads +from json import JSONDecodeError, loads as json_loads from os.path import exists, join from subprocess import run from tasks.util.docker import copy_from_ctr_image, is_ctr_running @@ -53,7 +53,7 @@ def restart_nydus_snapshotter(): run("sudo service nydus-snapshotter restart", shell=True, check=True) -def do_purge(): +def do_purge(debug=False): """ Purging the snapshotters for a fresh-start is a two step process. First, we need to remove all nydus metadata. This can be achieved by just @@ -76,7 +76,13 @@ def do_purge(): " images -o json" ) rm_cmd = "sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock rmi" - data = json_loads(run(cmd, shell=True, capture_output=True).stdout.decode("utf-8")) + try: + stdout = run(cmd, shell=True, capture_output=True).stdout.decode("utf-8") + data = json_loads(stdout) + except JSONDecodeError as e: + stderr = run(cmd, shell=True, capture_output=True).stderr.decode("utf-8") + print(f"ERROR: run command: {cmd}, got stdout: {stdout}, stderr: {stderr}") + raise e for image_data in data["images"]: # Try matching both by repoTags and repoDigests (the former is sometimes # empty) @@ -86,11 +92,15 @@ def do_purge(): for tag in image_data["repoTags"] + image_data["repoDigests"] ] ): - run( + result = run( "{} {} 2> /dev/null".format(rm_cmd, image_data["id"]), shell=True, - check=True, + capture_output=True, ) + assert result.returncode == 0, print(result.stderr.decode("utf-8").strip()) + if debug: + print(result.stdout.decode("utf-8").strip()) + continue if any( @@ -99,11 +109,15 @@ def do_purge(): for tag in image_data["repoTags"] + image_data["repoDigests"] ] ): - run( + result = run( "{} {} 2> /dev/null".format(rm_cmd, image_data["id"]), shell=True, - check=True, + capture_output=True, ) + assert result.returncode == 0, print(result.stderr.decode("utf-8").strip()) + if debug: + print(result.stdout.decode("utf-8").strip()) + continue restart_nydus_snapshotter() @@ -114,7 +128,7 @@ def purge(ctx): """ Remove all cached snapshots in the snapshotter cache """ - do_purge() + do_purge(debug=True) def install(debug=False, clean=False): @@ -214,7 +228,7 @@ def install(debug=False, clean=False): # Remove all nydus config for a clean start if clean: - do_purge() + do_purge(debug=debug) # Restart the nydus service restart_nydus_snapshotter() From 0d0017063bc03283ea5d5b07ebc6771da5c23f62 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Thu, 6 Feb 2025 11:46:57 +0000 Subject: [PATCH 13/34] gha: add sleep before purge --- .github/workflows/tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 69ff4388..4a08b110 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -176,6 +176,7 @@ jobs: # Change the snapshotter mode and purge (necessary to clear # containred's content store) ./bin/inv_wrapper.sh nydus-snapshotter.set-mode host-share + sleep 2 ./bin/inv_wrapper.sh nydus-snapshotter.purge export SC2_RUNTIME_CLASS=qemu-${{ matrix.tee }}-sc2 @@ -220,6 +221,7 @@ jobs: # Change the snapshotter mode back again (and purge) ./bin/inv_wrapper.sh nydus-snapshotter.set-mode guest-pull + sleep 2 ./bin/inv_wrapper.sh nydus-snapshotter.purge - name: "Enable default-memory annotation" From 0a7ba150feabfbf8f789b3c5e2667aaab08a0806 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Thu, 6 Feb 2025 13:10:04 +0000 Subject: [PATCH 14/34] gha: more debugging --- .github/workflows/tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 4a08b110..ae3bbecd 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -177,7 +177,9 @@ jobs: # containred's content store) ./bin/inv_wrapper.sh nydus-snapshotter.set-mode host-share sleep 2 + ./bin/inv_wrapper.sh nydus-snapshotter.purge + sleep 2 export SC2_RUNTIME_CLASS=qemu-${{ matrix.tee }}-sc2 export POD_LABEL="apps.sc2.io/name=helloworld-py" From a655fed980c40fadd4835d521b09e28ce950abb8 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Thu, 6 Feb 2025 13:39:32 +0000 Subject: [PATCH 15/34] gha: restart vm-cache after change snapshotter mode --- .github/workflows/tests.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ae3bbecd..02410483 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -177,9 +177,11 @@ jobs: # containred's content store) ./bin/inv_wrapper.sh nydus-snapshotter.set-mode host-share sleep 2 - ./bin/inv_wrapper.sh nydus-snapshotter.purge - sleep 2 + + # When updating the runtime we update all the config files, so we + # need to re-start the cache + sudo -E ./vm-cache/target/release/vm-cache restart export SC2_RUNTIME_CLASS=qemu-${{ matrix.tee }}-sc2 export POD_LABEL="apps.sc2.io/name=helloworld-py" From 7c0a11ccc2025020684b726941ab4762e08209bd Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Thu, 6 Feb 2025 15:10:13 +0000 Subject: [PATCH 16/34] gha: export variable before --- .github/workflows/tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 02410483..d3b77732 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -179,13 +179,13 @@ jobs: sleep 2 ./bin/inv_wrapper.sh nydus-snapshotter.purge + export SC2_RUNTIME_CLASS=qemu-${{ matrix.tee }}-sc2 + export POD_LABEL="apps.sc2.io/name=helloworld-py" + # When updating the runtime we update all the config files, so we # need to re-start the cache sudo -E ./vm-cache/target/release/vm-cache restart - export SC2_RUNTIME_CLASS=qemu-${{ matrix.tee }}-sc2 - export POD_LABEL="apps.sc2.io/name=helloworld-py" - # ----- Python Test ---- echo "Running python test..." From 294fa97e57f206392a268cb2b1c4438e3d6ec3ef Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Thu, 6 Feb 2025 15:39:34 +0000 Subject: [PATCH 17/34] gha: fix --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d3b77732..bf63f53f 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -189,7 +189,7 @@ jobs: # ----- Python Test ---- echo "Running python test..." - envsubst < ./demo-apps/helloworld-py-nydus/deployment.yaml | ./bin/kubectl apply -f - + envsubst < ./demo-apps/helloworld-py/deployment.yaml | ./bin/kubectl apply -f - # Wait for pod to be ready until [ "$(./bin/kubectl get pods -l ${POD_LABEL} -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}')" = "True" ]; do echo "Waiting for pod to be ready..."; sleep 2; done @@ -198,7 +198,7 @@ jobs: # Get the pod's IP service_ip=$(./bin/kubectl get services -o jsonpath='{.items[?(@.metadata.name=="coco-helloworld-py-node-port")].spec.clusterIP}') [ "$(curl --retry 3 -X GET ${service_ip}:8080)" = "Hello World!" ] - envsubst < ./demo-apps/helloworld-py-nydus/deployment.yaml | ./bin/kubectl delete -f - + envsubst < ./demo-apps/helloworld-py/deployment.yaml | ./bin/kubectl delete -f - # Wait for pod to be deleted ./bin/kubectl wait --for=delete -l ${POD_LABEL} pod --timeout=30s From ae794d2470360abd97148336d9eda9c50efa2243 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Fri, 7 Feb 2025 12:08:04 +0000 Subject: [PATCH 18/34] tools: update check-fork-hashes tool --- tools/check-fork-hashes/src/main.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/check-fork-hashes/src/main.rs b/tools/check-fork-hashes/src/main.rs index 15a15e47..550b8ab6 100644 --- a/tools/check-fork-hashes/src/main.rs +++ b/tools/check-fork-hashes/src/main.rs @@ -110,6 +110,14 @@ fn main() { dict.insert("branches", "sc2-main"); dict }, + { + let mut dict = HashMap::new(); + dict.insert("repo_name", "nydus-snapshotter"); + dict.insert("version_str", "NYDUS_SNAPSHOTTER_VERSION"); + dict.insert("ctr_src_paths", "/go/src/github.com/sc2-sys/nydus-snapshotter"); + dict.insert("branches", "sc2-main"); + dict + }, ]; let mut all_match = true; From b23cf7a41a965b6ab6485de16e0a30d03e62afa2 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Fri, 7 Feb 2025 16:09:57 +0000 Subject: [PATCH 19/34] nydus-image: add support for hot-replacing and patching --- tasks/nydus.py | 67 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 3 deletions(-) diff --git a/tasks/nydus.py b/tasks/nydus.py index a8aef3cb..a8a61f25 100644 --- a/tasks/nydus.py +++ b/tasks/nydus.py @@ -1,7 +1,7 @@ from invoke import task from os.path import join from subprocess import run -from tasks.util.docker import copy_from_ctr_image +from tasks.util.docker import copy_from_ctr_image, is_ctr_running from tasks.util.env import COCO_ROOT, GHCR_URL, GITHUB_ORG, PROJ_ROOT, print_dotted_line from tasks.util.nydus import NYDUSIFY_PATH from tasks.util.versions import NYDUS_VERSION @@ -9,6 +9,9 @@ NYDUS_CTR_NAME = "nydus-workon" NYDUS_IMAGE_TAG = join(GHCR_URL, GITHUB_ORG, "nydus") + f":{NYDUS_VERSION}" +NYDUS_IMAGE_CTR_PATH = "/go/src/github.com/sc2-sys/nydus/target/release/nydus-image" +NYDUS_IMAGE_HOST_PATH = join(COCO_ROOT, "bin", "nydus-image") + @task def build(ctx, nocache=False, push=False): @@ -37,8 +40,8 @@ def do_install(): # Root-owned binaries # The host-pull functionality requires nydus-image >= 2.3.0, but the one # installed with the daemon is 2.2.4 - ctr_bin = ["/go/src/github.com/sc2-sys/nydus/target/release/nydus-image"] - host_bin = [join(COCO_ROOT, "bin", "nydus-image")] + ctr_bin = [NYDUS_IMAGE_CTR_PATH] + host_bin = [NYDUS_IMAGE_HOST_PATH] copy_from_ctr_image(NYDUS_IMAGE_TAG, ctr_bin, host_bin, requires_sudo=True) print("Success!") @@ -50,3 +53,61 @@ def install(ctx): Install the nydusify CLI tool """ do_install() + + +@task +def cli(ctx, mount_path=join(PROJ_ROOT, "..", "nydus")): + """ + Get a working environemnt for nydusd + """ + if not is_ctr_running(NYDUS_CTR_NAME): + docker_cmd = [ + "docker run", + "-d -it", + # The container path comes from the dockerfile in: + # ./docker/nydus.dockerfile + f"-v {mount_path}:/go/src/github.com/sc2-sys/nydus", + "--name {}".format(NYDUS_CTR_NAME), + NYDUS_IMAGE_TAG, + "bash", + ] + docker_cmd = " ".join(docker_cmd) + run(docker_cmd, shell=True, check=True, cwd=PROJ_ROOT) + + run( + "docker exec -it {} bash".format(NYDUS_CTR_NAME), + shell=True, + check=True, + ) + + +@task +def stop(ctx): + """ + Remove the Kata developement environment + """ + result = run( + "docker rm -f {}".format(NYDUS_CTR_NAME), + shell=True, + check=True, + capture_output=True, + ) + assert result.returncode == 0 + + +@task +def hot_replace(ctx): + """ + Replace nydus-image binary from running workon container + """ + if not is_ctr_running(NYDUS_CTR_NAME): + print("Must have the work-on container running to hot replace!") + print("Consider running: inv nydus-snapshotter.cli ") + + print("cp {NYDUS_CTR_NAME}:{NYDUS_IMAGE_CTR_PATH} {NYDUS_IMAGE_HOST_PATH}") + docker_cmd = ( + f"sudo docker cp {NYDUS_CTR_NAME}:{NYDUS_IMAGE_CTR_PATH} " + f"{NYDUS_IMAGE_HOST_PATH}" + ) + result = run(docker_cmd, shell=True, capture_output=True) + assert result.returncode == 0 From 337bccc86c0a47bc2a6db6ac869c895234aca95f Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Fri, 7 Feb 2025 16:12:51 +0000 Subject: [PATCH 20/34] nydus-snapshotter: fix purge by waiting on metadata to be gc-ed --- .gitignore | 1 + tasks/containerd.py | 43 ++++++++++++++++++++++++- tasks/nydus_snapshotter.py | 65 ++++++++++++++++++++++++++++++++++++++ tasks/sc2.py | 2 ++ 4 files changed, 110 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6dfae5ca..015dc671 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ # Installed binaries +bbolt cosign crictl kubeadm diff --git a/tasks/containerd.py b/tasks/containerd.py index c12ab5d9..4aceb1c8 100644 --- a/tasks/containerd.py +++ b/tasks/containerd.py @@ -5,6 +5,7 @@ from tasks.util.containerd import is_containerd_active, restart_containerd from tasks.util.docker import copy_from_ctr_image, is_ctr_running from tasks.util.env import ( + BIN_DIR, CONF_FILES_DIR, CONTAINERD_CONFIG_FILE, CONTAINERD_CONFIG_ROOT, @@ -14,7 +15,7 @@ print_dotted_line, ) from tasks.util.toml import update_toml -from tasks.util.versions import CONTAINERD_VERSION +from tasks.util.versions import CONTAINERD_VERSION, GO_VERSION CONTAINERD_CTR_NAME = "containerd-workon" CONTAINERD_IMAGE_TAG = ( @@ -175,3 +176,43 @@ def install(debug=False, clean=False): raise RuntimeError("containerd config file is empty!") print("Success!") + + +def install_bbolt(debug=False, clean=False): + print_dotted_line("Installing bbolt") + + tmp_ctr_name = "bbolt_install" + result = run( + f"docker run -d -it --name {tmp_ctr_name} golang:{GO_VERSION} bash", + shell=True, + capture_output=True, + ) + assert result.returncode == 0 + + def rm_container(): + result = run(f"docker rm -f {tmp_ctr_name}", shell=True, capture_output=True) + assert result.returncode == 0 + + result = run( + f"docker exec {tmp_ctr_name} go install go.etcd.io/bbolt/cmd/bbolt@latest", + shell=True, + capture_output=True, + ) + if result.returncode != 0: + print(result.stderr.decode("utf-8").strip()), + rm_container() + raise RuntimeError("Error execing into container") + if debug: + print(result.stdout.decode("utf-8").strip()) + + result = run( + f"docker cp {tmp_ctr_name}:/go/bin/bbolt {BIN_DIR}/bbolt", + shell=True, + capture_output=True, + ) + if result.returncode != 0: + print(result.stderr.decode("utf-8").strip()), + rm_container() + raise RuntimeError("Error cp-ing from container") + if debug: + print(result.stdout.decode("utf-8").strip()) diff --git a/tasks/nydus_snapshotter.py b/tasks/nydus_snapshotter.py index 54946c51..784acba4 100644 --- a/tasks/nydus_snapshotter.py +++ b/tasks/nydus_snapshotter.py @@ -1,9 +1,11 @@ from invoke import task from json import JSONDecodeError, loads as json_loads +from os import getgid, getuid from os.path import exists, join from subprocess import run from tasks.util.docker import copy_from_ctr_image, is_ctr_running from tasks.util.env import ( + BIN_DIR, COCO_ROOT, CONTAINERD_CONFIG_FILE, CONTAINERD_CONFIG_ROOT, @@ -17,6 +19,7 @@ ) from tasks.util.toml import read_value_from_toml, update_toml from tasks.util.versions import NYDUS_SNAPSHOTTER_VERSION +from time import sleep NYDUS_SNAPSHOTTER_GUEST_PULL_NAME = "nydus" NYDUS_SNAPSHOTTER_HOST_SHARE_NAME = "nydus-hs" @@ -53,6 +56,63 @@ def restart_nydus_snapshotter(): run("sudo service nydus-snapshotter restart", shell=True, check=True) +def wait_for_snapshot_metadata_to_be_gced(snapshotter, debug=False): + """ + After restarting containerd it may take a while for the GC to kick in and + delete the metadata corresponding to previous snapshots. This metadata + is stored in a Bolt DB in /var/lib/containerd/io.containerd.metadata.v1.bolt/meta.db + + Annoyingly, it is hard to manually delete files from the database w/out + writting a small Go script. Instead, we rely on the bbolt CLI tool to + poll the DB until the GC has done its job. + """ + bbolt_path = join(BIN_DIR, "bbolt") + db_path = "/var/lib/containerd/io.containerd.metadata.v1.bolt/meta.db" + tmp_db_path = "/tmp/containerd_meta_copy.db" + bbolt_cmd = f"{bbolt_path} keys {tmp_db_path} v1 k8s.io snapshots {snapshotter}" + + while True: + # Make a user-owned copy of the DB (bbolt complains otherwise) + run(f"sudo cp {db_path} {tmp_db_path}", shell=True, check=True) + run( + "sudo chown {}:{} {}".format(getuid(), getgid(), tmp_db_path), + shell=True, + check=True, + ) + + result = run(bbolt_cmd, shell=True, capture_output=True) + stdout = result.stdout.decode("utf-8").strip() + + if result.returncode == 1: + # This can be a benign error if the snapshotter has not been used + # at all, never + if stdout == "bucket not found": + if debug: + print("WARNING: bucket {snapsotter} not found in metadata") + run(f"rm {tmp_db_path}", shell=True, check=True) + return + elif result.returncode == 0: + if len(stdout) == 0: + run(f"rm {tmp_db_path}", shell=True, check=True) + return + + print( + "Got {} snapshot's metadata for snapshotter: {}".format( + len(stdout.split("\n")), snapshotter + ) + ) + sleep(2) + else: + print( + "ERROR: running bbolt command: stdout: {}, stderr: {}".format( + stdout, result.stderr.decode("utf-8").strip() + ) + ) + run(f"rm {tmp_db_path}", shell=True, check=True) + + raise RuntimeError("Error running bbolt command!") + + def do_purge(debug=False): """ Purging the snapshotters for a fresh-start is a two step process. First, @@ -122,6 +182,11 @@ def do_purge(debug=False): restart_nydus_snapshotter() + # After restarting we need to wait for containerd's GC to clean-up the + # metadata database + for snap in [NYDUS_SNAPSHOTTER_HOST_SHARE_NAME, NYDUS_SNAPSHOTTER_GUEST_PULL_NAME]: + wait_for_snapshot_metadata_to_be_gced(snap, debug=debug) + @task def purge(ctx): diff --git a/tasks/sc2.py b/tasks/sc2.py index 56fa4beb..bab92fc9 100644 --- a/tasks/sc2.py +++ b/tasks/sc2.py @@ -4,6 +4,7 @@ from subprocess import run from tasks.containerd import ( install as containerd_install, + install_bbolt as bbolt_install, set_log_level as containerd_set_log_level, ) from tasks.demo_apps import ( @@ -259,6 +260,7 @@ def deploy(ctx, debug=False, clean=False): # Build and install containerd containerd_install(debug=debug, clean=clean) + bbolt_install(debug=debug, clean=clean) # Install k8s tooling (including k9s) k8s_tooling_install(debug=debug, clean=clean) From b2a5d0807d2456130a104a5df18a91f7fa72366d Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Fri, 7 Feb 2025 16:23:41 +0000 Subject: [PATCH 21/34] check-hashes: run cargo fmt --- tools/check-fork-hashes/src/main.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/check-fork-hashes/src/main.rs b/tools/check-fork-hashes/src/main.rs index 550b8ab6..986362cf 100644 --- a/tools/check-fork-hashes/src/main.rs +++ b/tools/check-fork-hashes/src/main.rs @@ -114,7 +114,10 @@ fn main() { let mut dict = HashMap::new(); dict.insert("repo_name", "nydus-snapshotter"); dict.insert("version_str", "NYDUS_SNAPSHOTTER_VERSION"); - dict.insert("ctr_src_paths", "/go/src/github.com/sc2-sys/nydus-snapshotter"); + dict.insert( + "ctr_src_paths", + "/go/src/github.com/sc2-sys/nydus-snapshotter", + ); dict.insert("branches", "sc2-main"); dict }, From 05daf5901faf6409fcedde78410bc7e3b2c99cc8 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Fri, 7 Feb 2025 18:35:40 +0000 Subject: [PATCH 22/34] containerd: fix nit in bbolt install --- tasks/containerd.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tasks/containerd.py b/tasks/containerd.py index 4aceb1c8..34e078d6 100644 --- a/tasks/containerd.py +++ b/tasks/containerd.py @@ -216,3 +216,5 @@ def rm_container(): raise RuntimeError("Error cp-ing from container") if debug: print(result.stdout.decode("utf-8").strip()) + + print("Success!") From 1f1ebe678cb0463cef49f5b55734a0d0410a61ca Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Fri, 7 Feb 2025 19:10:56 +0000 Subject: [PATCH 23/34] containerd: fix bbolt clean-up --- tasks/containerd.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tasks/containerd.py b/tasks/containerd.py index 34e078d6..b0e7de7f 100644 --- a/tasks/containerd.py +++ b/tasks/containerd.py @@ -182,6 +182,10 @@ def install_bbolt(debug=False, clean=False): print_dotted_line("Installing bbolt") tmp_ctr_name = "bbolt_install" + if is_ctr_running(tmp_ctr_name): + result = run(f"docker rm -f {tmp_ctr_name}", shell=True, capture_output=True) + assert result.returncode == 0 + result = run( f"docker run -d -it --name {tmp_ctr_name} golang:{GO_VERSION} bash", shell=True, @@ -217,4 +221,6 @@ def rm_container(): if debug: print(result.stdout.decode("utf-8").strip()) + rm_container() + print("Success!") From 96647c7f10de69247ecf70bb6ada2bda215ccda6 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Mon, 10 Feb 2025 12:38:23 +0000 Subject: [PATCH 24/34] gha: add debug logging --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index bf63f53f..0ce57427 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -67,7 +67,7 @@ jobs: docker pull ghcr.io/sc2-sys/nydus-snapshotter:$(grep -oP 'NYDUS_SNAPSHOTTER_VERSION\s*=\s*"\K[^"]+' ./tasks/util/versions.py) - name: "Install SC2" - run: ./bin/inv_wrapper.sh sc2.deploy --clean + run: ./bin/inv_wrapper.sh sc2.deploy --clean --debug - name: "Run python hello world (cold and warm starts)" run: | From 34d7e417183ce98310c796969d68bfce1fd3e07c Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Mon, 10 Feb 2025 16:42:10 +0000 Subject: [PATCH 25/34] bbolt: more installation fix-ups --- tasks/containerd.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/tasks/containerd.py b/tasks/containerd.py index b0e7de7f..647e57ee 100644 --- a/tasks/containerd.py +++ b/tasks/containerd.py @@ -2,7 +2,11 @@ from os import stat from os.path import join from subprocess import run -from tasks.util.containerd import is_containerd_active, restart_containerd +from tasks.util.containerd import ( + is_containerd_active, + restart_containerd, + wait_for_containerd_socket, +) from tasks.util.docker import copy_from_ctr_image, is_ctr_running from tasks.util.env import ( BIN_DIR, @@ -16,6 +20,7 @@ ) from tasks.util.toml import update_toml from tasks.util.versions import CONTAINERD_VERSION, GO_VERSION +from time import sleep CONTAINERD_CTR_NAME = "containerd-workon" CONTAINERD_IMAGE_TAG = ( @@ -164,7 +169,9 @@ def install(debug=False, clean=False): # Populate the default config file for a clean start run(f"sudo mkdir -p {CONTAINERD_CONFIG_ROOT}", shell=True, check=True) if clean: - config_cmd = "containerd config default > {}".format(CONTAINERD_CONFIG_FILE) + config_cmd = "{}/containerd config default > {}".format( + host_base_path, CONTAINERD_CONFIG_FILE + ) config_cmd = "sudo bash -c '{}'".format(config_cmd) run(config_cmd, shell=True, check=True) @@ -175,27 +182,43 @@ def install(debug=False, clean=False): if stat(CONTAINERD_CONFIG_FILE).st_size == 0: raise RuntimeError("containerd config file is empty!") + # Wait for containerd to be ready + sleep(2) + while not is_containerd_active(): + if debug: + print("Waiting for containerd to be active...") + + sleep(2) + + # Then make sure we can dial the socket + wait_for_containerd_socket() + print("Success!") def install_bbolt(debug=False, clean=False): print_dotted_line("Installing bbolt") + wait_for_containerd_socket() + tmp_ctr_name = "bbolt_install" if is_ctr_running(tmp_ctr_name): result = run(f"docker rm -f {tmp_ctr_name}", shell=True, capture_output=True) assert result.returncode == 0 + def rm_container(): + result = run(f"docker rm -f {tmp_ctr_name}", shell=True, capture_output=True) + assert result.returncode == 0 + result = run( f"docker run -d -it --name {tmp_ctr_name} golang:{GO_VERSION} bash", shell=True, capture_output=True, ) - assert result.returncode == 0 - - def rm_container(): - result = run(f"docker rm -f {tmp_ctr_name}", shell=True, capture_output=True) - assert result.returncode == 0 + if result.returncode != 0: + print(result.stderr.decode("utf-8").strip()), + rm_container() + raise RuntimeError("Error running container") result = run( f"docker exec {tmp_ctr_name} go install go.etcd.io/bbolt/cmd/bbolt@latest", From d1ef0ecb4a7b05df5683845ead1d2ff3842c7524 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Mon, 10 Feb 2025 16:47:33 +0000 Subject: [PATCH 26/34] ns: fix typo --- tasks/nydus_snapshotter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/nydus_snapshotter.py b/tasks/nydus_snapshotter.py index 784acba4..d8f12f48 100644 --- a/tasks/nydus_snapshotter.py +++ b/tasks/nydus_snapshotter.py @@ -88,7 +88,7 @@ def wait_for_snapshot_metadata_to_be_gced(snapshotter, debug=False): # at all, never if stdout == "bucket not found": if debug: - print("WARNING: bucket {snapsotter} not found in metadata") + print(f"WARNING: bucket {snapshotter} not found in metadata") run(f"rm {tmp_db_path}", shell=True, check=True) return elif result.returncode == 0: From 1d18319f3e345ed0913aa6c165181591d3b52fdc Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Mon, 10 Feb 2025 17:01:34 +0000 Subject: [PATCH 27/34] gha: temporarily disable host-share tests with tdx --- .github/workflows/tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0ce57427..b22ed890 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -172,6 +172,8 @@ jobs: echo "Knative test succesful!" - name: "Run nydus host-share test" + # Host-share mechanisms seem not to work with TDX + if: ${{ matrix.tee != 'tdx' }} run: | # Change the snapshotter mode and purge (necessary to clear # containred's content store) From 68cfe5e543ac77213ff226a934658b064f3a6b80 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Mon, 10 Feb 2025 19:28:39 +0000 Subject: [PATCH 28/34] gha: remove --debug --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b22ed890..f73d6ccd 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -67,7 +67,7 @@ jobs: docker pull ghcr.io/sc2-sys/nydus-snapshotter:$(grep -oP 'NYDUS_SNAPSHOTTER_VERSION\s*=\s*"\K[^"]+' ./tasks/util/versions.py) - name: "Install SC2" - run: ./bin/inv_wrapper.sh sc2.deploy --clean --debug + run: ./bin/inv_wrapper.sh sc2.deploy --clean - name: "Run python hello world (cold and warm starts)" run: | From 5222854d23d7a75a33e0eaa5db66fdffc4dfa00d Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Tue, 11 Feb 2025 10:11:26 +0000 Subject: [PATCH 29/34] ns: fix deploy without --debug --- tasks/nydus_snapshotter.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tasks/nydus_snapshotter.py b/tasks/nydus_snapshotter.py index d8f12f48..f62c6b25 100644 --- a/tasks/nydus_snapshotter.py +++ b/tasks/nydus_snapshotter.py @@ -90,7 +90,16 @@ def wait_for_snapshot_metadata_to_be_gced(snapshotter, debug=False): if debug: print(f"WARNING: bucket {snapshotter} not found in metadata") run(f"rm {tmp_db_path}", shell=True, check=True) - return + return + else: + print( + "ERROR: running bbolt command: stdout: {}, stderr: {}".format( + stdout, result.stderr.decode("utf-8").strip() + ) + ) + run(f"rm {tmp_db_path}", shell=True, check=True) + + raise RuntimeError("Error running bbolt command!") elif result.returncode == 0: if len(stdout) == 0: run(f"rm {tmp_db_path}", shell=True, check=True) From e012412a26bd488000fcb52920fa2768349dc2ed Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Tue, 11 Feb 2025 11:00:36 +0000 Subject: [PATCH 30/34] gha: fix knative tests --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f73d6ccd..5b2a1cb4 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -210,7 +210,7 @@ jobs: echo "Python test succesful!" # ----- Knative Test ---- - envsubst < ./demo-apps/helloworld-knative-nydus/service.yaml | ./bin/kubectl apply -f - + envsubst < ./demo-apps/helloworld-knative/service.yaml | ./bin/kubectl apply -f - sleep 1 # Get the service URL @@ -218,7 +218,7 @@ jobs: [ "$(curl --retry 3 ${service_url})" = "Hello World!" ] # Wait for pod to be deleted - envsubst < ./demo-apps/helloworld-knative-nydus/service.yaml | ./bin/kubectl delete -f - + envsubst < ./demo-apps/helloworld-knative/service.yaml | ./bin/kubectl delete -f - ./bin/kubectl wait --for=delete -l ${POD_LABEL} pod --timeout=60s # Extra cautionary sleep From 00431d42146749acdfd389a28bfb276aaeaf2e10 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Tue, 11 Feb 2025 12:09:10 +0000 Subject: [PATCH 31/34] gha: run knative chaining in host-share --- .github/workflows/tests.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5b2a1cb4..b669585e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -226,9 +226,10 @@ jobs: echo "Knative test succesful!" # Change the snapshotter mode back again (and purge) - ./bin/inv_wrapper.sh nydus-snapshotter.set-mode guest-pull - sleep 2 - ./bin/inv_wrapper.sh nydus-snapshotter.purge + # + # ./bin/inv_wrapper.sh nydus-snapshotter.set-mode guest-pull + # sleep 2 + # ./bin/inv_wrapper.sh nydus-snapshotter.purge - name: "Enable default-memory annotation" run: | @@ -245,6 +246,7 @@ jobs: - name: "Run knative chaining demo" run: | + # TODO: may have to fetch content if we want to run in guest pull for runtime_class in ${{ matrix.runtime_classes }}; do echo "Running test for ${runtime_class}..." export SC2_RUNTIME_CLASS=${runtime_class} From 80494f652241287154d52050de9719d4728d1785 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Tue, 11 Feb 2025 12:59:59 +0000 Subject: [PATCH 32/34] gha: fixes --- .github/workflows/tests.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b669585e..d7eac38e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -227,9 +227,9 @@ jobs: # Change the snapshotter mode back again (and purge) # - # ./bin/inv_wrapper.sh nydus-snapshotter.set-mode guest-pull - # sleep 2 - # ./bin/inv_wrapper.sh nydus-snapshotter.purge + ./bin/inv_wrapper.sh nydus-snapshotter.set-mode guest-pull + sleep 2 + ./bin/inv_wrapper.sh nydus-snapshotter.purge - name: "Enable default-memory annotation" run: | @@ -244,6 +244,9 @@ jobs: # need to restart the VM cache sudo -E ./vm-cache/target/release/vm-cache restart + - name: "Fetch content (see #130)" + run: sudo ctr -n k8s.io content fetch -k sc2cr.io/system/knative-sidecar@sha256:79d5f6031f308cee209c4c32eeab9113b29a1ed4096c5d657504096734ca3b1d + - name: "Run knative chaining demo" run: | # TODO: may have to fetch content if we want to run in guest pull From 4cbbf146e124d2e7a90426057973fc0a83a3cea6 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Tue, 11 Feb 2025 15:35:41 +0000 Subject: [PATCH 33/34] gha: more fetch --- .github/workflows/tests.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d7eac38e..6d8d91ee 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -245,7 +245,9 @@ jobs: sudo -E ./vm-cache/target/release/vm-cache restart - name: "Fetch content (see #130)" - run: sudo ctr -n k8s.io content fetch -k sc2cr.io/system/knative-sidecar@sha256:79d5f6031f308cee209c4c32eeab9113b29a1ed4096c5d657504096734ca3b1d + run: | + sudo ctr -n k8s.io content fetch -k sc2cr.io/system/knative-sidecar@sha256:79d5f6031f308cee209c4c32eeab9113b29a1ed4096c5d657504096734ca3b1d + sudo ctr -n k8s.io content fetch registry.k8s.io/pause:3.8 - name: "Run knative chaining demo" run: | From f8ab3de76660857a513b3908eca60b9a61441e30 Mon Sep 17 00:00:00 2001 From: Carlos Segarra Date: Tue, 11 Feb 2025 16:38:21 +0000 Subject: [PATCH 34/34] gha: tests passing --- .github/workflows/tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6d8d91ee..5fba7e21 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -251,7 +251,6 @@ jobs: - name: "Run knative chaining demo" run: | - # TODO: may have to fetch content if we want to run in guest pull for runtime_class in ${{ matrix.runtime_classes }}; do echo "Running test for ${runtime_class}..." export SC2_RUNTIME_CLASS=${runtime_class}