From 492f94980d78625fa4c4ee07e4f4ac72a486fad4 Mon Sep 17 00:00:00 2001 From: bjorn-amd Date: Tue, 13 Jan 2026 16:37:36 +0100 Subject: [PATCH 1/3] First version of hf/s3 downloader --- apis/aim/v1alpha1/aimmodelcache_types.go | 4 +- .../bases/aim.silogen.ai_aimmodelcaches.yaml | 2 +- crd/crds.yaml | 2 +- docs/docs/reference/crds/aim.silogen.ai.md | 2 +- .../aim/aimmodelcache_controller.go | 182 ++---------------- python/model-downloader/Dockerfile | 16 ++ python/model-downloader/entrypoint.sh | 101 ++++++++++ python/model-downloader/progress_monitor.sh | 107 ++++++++++ 8 files changed, 242 insertions(+), 174 deletions(-) create mode 100644 python/model-downloader/Dockerfile create mode 100644 python/model-downloader/entrypoint.sh create mode 100644 python/model-downloader/progress_monitor.sh diff --git a/apis/aim/v1alpha1/aimmodelcache_types.go b/apis/aim/v1alpha1/aimmodelcache_types.go index 584fa8e1f..aa00c4837 100644 --- a/apis/aim/v1alpha1/aimmodelcache_types.go +++ b/apis/aim/v1alpha1/aimmodelcache_types.go @@ -29,7 +29,7 @@ import ( ) const ( - DefaultDownloadImage = "kserve/storage-initializer:v0.16.0-rc0" + DefaultDownloadImage = "ghcr.io/silogen/kaiwo/model-downloader:0.1" ) // AIMResolvedModelCache contains reference info and status for a cached model. @@ -71,7 +71,7 @@ type AIMModelCacheSpec struct { Env []corev1.EnvVar `json:"env,omitempty"` // ModelDownloadImage is the image used to download the model - // +kubebuilder:default="kserve/storage-initializer:v0.16.0" + // +kubebuilder:default="ghcr.io/silogen/kaiwo/model-downloader:0.1" ModelDownloadImage string `json:"modelDownloadImage"` // ImagePullSecrets references secrets for pulling AIM container images. 
diff --git a/config/crd/bases/aim.silogen.ai_aimmodelcaches.yaml b/config/crd/bases/aim.silogen.ai_aimmodelcaches.yaml index e20a34262..e2397e964 100644 --- a/config/crd/bases/aim.silogen.ai_aimmodelcaches.yaml +++ b/config/crd/bases/aim.silogen.ai_aimmodelcaches.yaml @@ -201,7 +201,7 @@ spec: x-kubernetes-map-type: atomic type: array modelDownloadImage: - default: kserve/storage-initializer:v0.16.0 + default: ghcr.io/silogen/kaiwo/model-downloader:0.1 description: ModelDownloadImage is the image used to download the model type: string diff --git a/crd/crds.yaml b/crd/crds.yaml index 63ca2be11..64916cb8c 100644 --- a/crd/crds.yaml +++ b/crd/crds.yaml @@ -2349,7 +2349,7 @@ spec: x-kubernetes-map-type: atomic type: array modelDownloadImage: - default: kserve/storage-initializer:v0.16.0 + default: ghcr.io/silogen/kaiwo/model-downloader:0.1 description: ModelDownloadImage is the image used to download the model type: string diff --git a/docs/docs/reference/crds/aim.silogen.ai.md b/docs/docs/reference/crds/aim.silogen.ai.md index 9ce5d5d4c..f6b9bbb06 100644 --- a/docs/docs/reference/crds/aim.silogen.ai.md +++ b/docs/docs/reference/crds/aim.silogen.ai.md @@ -525,7 +525,7 @@ _Appears in:_ | `storageClassName` _string_ | StorageClassName specifies the storage class for the cache volume | | | | `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#quantity-resource-api)_ | Size specifies the size of the cache volume | | | | `env` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#envvar-v1-core) array_ | Env lists the environment variables to use for authentication when downloading models.
These variables are used for authentication with model registries (e.g., HuggingFace tokens). | | | -| `modelDownloadImage` _string_ | ModelDownloadImage is the image used to download the model | kserve/storage-initializer:v0.16.0 | | +| `modelDownloadImage` _string_ | ModelDownloadImage is the image used to download the model | ghcr.io/silogen/kaiwo/model-downloader:0.1 | | | `imagePullSecrets` _[LocalObjectReference](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#localobjectreference-v1-core) array_ | ImagePullSecrets references secrets for pulling AIM container images. | | | | `runtimeConfigName` _string_ | RuntimeConfigName references the AIM runtime configuration (by name) to use for this model cache.
This determines PVC headroom and other runtime settings. | default | | diff --git a/internal/controller/aim/aimmodelcache_controller.go b/internal/controller/aim/aimmodelcache_controller.go index 1fc63bbd0..c0d03ad07 100644 --- a/internal/controller/aim/aimmodelcache_controller.go +++ b/internal/controller/aim/aimmodelcache_controller.go @@ -553,18 +553,25 @@ func (r *AIMModelCacheReconciler) buildDownloadJob(mc *aimv1alpha1.AIMModelCache if len(mc.Spec.ModelDownloadImage) > 0 { downloadImage = mc.Spec.ModelDownloadImage } + + // Expected size in bytes for progress calculation + expectedSizeBytes := mc.Spec.Size.Value() + // Merge env vars with precedence: mc.Spec.Env > runtimeConfigSpec.Env > defaults newEnv := helpers.MergeEnvVars([]corev1.EnvVar{ {Name: "HF_XET_CHUNK_CACHE_SIZE_BYTES", Value: "0"}, {Name: "HF_XET_SHARD_CACHE_SIZE_BYTES", Value: "0"}, {Name: "HF_XET_HIGH_PERFORMANCE", Value: "1"}, - {Name: "HF_HOME", Value: mountPath + "/.hf"}, + {Name: "HF_HOME", Value: "/tmp/.hf"}, {Name: "UMASK", Value: "0022"}, + {Name: "EXPECTED_SIZE_BYTES", Value: fmt.Sprintf("%d", expectedSizeBytes)}, + {Name: "MOUNT_PATH", Value: mountPath}, + {Name: "CACHE_NAME", Value: mc.Name}, + {Name: "CACHE_NAMESPACE", Value: mc.Namespace}, + {Name: "STALL_TIMEOUT", Value: "120"}, + {Name: "TARGET_DIR", Value: mountPath}, }, helpers.MergeEnvVars(runtimeConfigSpec.Env, mc.Spec.Env)) - // Expected size in bytes for progress calculation - expectedSizeBytes := mc.Spec.Size.Value() - return &batchv1.Job{ TypeMeta: metav1.TypeMeta{ APIVersion: batchv1.SchemeGroupVersion.String(), @@ -598,54 +605,6 @@ func (r *AIMModelCacheReconciler) buildDownloadJob(mc *aimv1alpha1.AIMModelCache PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ClaimName: pvcName}, }, }, - { - Name: "tmp", - VolumeSource: corev1.VolumeSource{ - EmptyDir: &corev1.EmptyDirVolumeSource{ - SizeLimit: baseutils.Pointer(resource.MustParse("500Mi")), // Small temp space for system operations - }, - }, - }, - }, - 
// Native sidecar (Kubernetes 1.28+): init container with restartPolicy=Always - // runs alongside main containers and is automatically terminated by kubelet - // when all regular containers complete (success or failure) - InitContainers: []corev1.Container{ - { - Name: "progress-monitor", - Image: "busybox:1.36", - ImagePullPolicy: corev1.PullIfNotPresent, - // restartPolicy: Always makes this a native sidecar that runs alongside main containers - // Kubernetes automatically sends SIGTERM when all regular containers terminate - RestartPolicy: baseutils.Pointer(corev1.ContainerRestartPolicyAlways), - SecurityContext: &corev1.SecurityContext{ - RunAsUser: baseutils.Pointer(int64(1000)), - RunAsGroup: baseutils.Pointer(int64(1000)), - }, - Env: []corev1.EnvVar{ - {Name: "EXPECTED_SIZE_BYTES", Value: fmt.Sprintf("%d", expectedSizeBytes)}, - {Name: "MOUNT_PATH", Value: mountPath}, - }, - Command: []string{"/bin/sh"}, - Args: []string{ - "-c", - progressMonitorScript, - }, - VolumeMounts: []corev1.VolumeMount{ - {Name: "cache", MountPath: mountPath, ReadOnly: true}, - }, - // Minimal resources for the monitor - Resources: corev1.ResourceRequirements{ - Requests: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("10m"), - corev1.ResourceMemory: resource.MustParse("16Mi"), - }, - Limits: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("50m"), - corev1.ResourceMemory: resource.MustParse("32Mi"), - }, - }, - }, }, Containers: []corev1.Container{ { @@ -656,43 +615,10 @@ func (r *AIMModelCacheReconciler) buildDownloadJob(mc *aimv1alpha1.AIMModelCache RunAsUser: baseutils.Pointer(int64(1000)), RunAsGroup: baseutils.Pointer(int64(1000)), }, - Env: newEnv, - Command: []string{"/bin/sh"}, - Args: []string{ - "-c", - fmt.Sprintf(` -# Bail out if this AIM_DEBUG_CAUSE_FAILURE is set -if [ -n "$AIM_DEBUG_CAUSE_FAILURE" ]; then - echo "AIM_DEBUG_CAUSE_FAILURE is set, bailing out" - exit 1 -fi -# Set umask so downloaded files are readable by others -umask 0022 
- -# Create temp directories on the same filesystem as destination -mkdir -p %s/.tmp %s/.hf_home %s/.hf_cache %s/.xet_cache - -# Download the model -python /storage-initializer/scripts/initializer-entrypoint %s %s && -( -# Report sizes before cleanup -echo "Storage usage before cleanup:" -du -sh %s -du -sh %s/.cache 2>/dev/null || true - -# Clean up HF cache directories to save space (keeps only final model files) -echo "Cleaning up HF cache to save space..." -rm -rf %s/.cache %s/.tmp %s/.hf_home %s/.hf_cache %s/.xet_cache 2>/dev/null || true - -# Report final sizes -echo "Final storage usage:" -du -sh %s || true -) - `, mountPath, mountPath, mountPath, mountPath, mc.Spec.SourceURI, mountPath, mountPath, mountPath, mountPath, mountPath, mountPath, mountPath, mountPath, mountPath), - }, + Env: newEnv, + Args: []string{mc.Spec.SourceURI}, VolumeMounts: []corev1.VolumeMount{ {Name: "cache", MountPath: mountPath}, - {Name: "tmp", MountPath: "/tmp"}, }, }, }, @@ -702,88 +628,6 @@ du -sh %s || true } } -// progressMonitorScript is the shell script for the download progress monitor sidecar. -// It reports download progress every 10 seconds in JSON format. -// -// This runs as a native sidecar (init container with restartPolicy=Always). -// Kubernetes automatically sends SIGTERM when all regular containers terminate, -// so we just need to handle the signal gracefully. -// -// JSON output types: -// - "start": Initial message when monitor starts -// - "progress": Periodic progress update -// - "complete": Download finished successfully (detected via marker file) -// - "terminated": Received SIGTERM from kubelet (main container finished) -const progressMonitorScript = ` -# Handle SIGTERM gracefully - kubelet sends this when main container terminates -terminated=false -trap 'terminated=true' TERM - -# Output a JSON log message -# Usage: log_json [key=value ...] 
-log_json() { - type=$1 - shift - timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") - json="{\"timestamp\":\"$timestamp\",\"type\":\"$type\"" - for kv in "$@"; do - key="${kv%%=*}" - value="${kv#*=}" - # Check if value is numeric - case "$value" in - ''|*[!0-9]*) json="$json,\"$key\":\"$value\"" ;; # string - *) json="$json,\"$key\":$value" ;; # number - esac - done - echo "$json}" -} - -expected_size=${EXPECTED_SIZE_BYTES:-0} -mount_path=${MOUNT_PATH:-/cache} -interval=10 - -log_json "start" "expectedBytes=$expected_size" "intervalSeconds=$interval" - -while true; do - # Check if we received SIGTERM (main container terminated) - if [ "$terminated" = "true" ]; then - current_size=$(du -sb "$mount_path" 2>/dev/null | cut -f1 || echo 0) - log_json "terminated" "currentBytes=$current_size" "expectedBytes=$expected_size" "message=Main container terminated" - exit 0 - fi - - # Check if download completed successfully (marker file from main container) - if [ -f "$mount_path/.download-complete" ]; then - current_size=$(du -sb "$mount_path" 2>/dev/null | cut -f1 || echo 0) - log_json "complete" "currentBytes=$current_size" "expectedBytes=$expected_size" - exit 0 - fi - - current_size=$(du -sb "$mount_path" 2>/dev/null | cut -f1 || echo 0) - - if [ "$expected_size" -gt 0 ] && [ "$current_size" -gt 0 ]; then - percent=$((current_size * 100 / expected_size)) - # Cap at 100% (during download, temp files may exceed expected size) - if [ $percent -gt 100 ]; then - percent=100 - fi - log_json "progress" "percent=$percent" "currentBytes=$current_size" "expectedBytes=$expected_size" - elif [ "$current_size" -gt 0 ]; then - log_json "progress" "currentBytes=$current_size" "expectedBytes=0" "message=Expected size unknown" - else - log_json "progress" "currentBytes=0" "expectedBytes=$expected_size" "message=Waiting for download to start" - fi - - # Use a loop with short sleeps so we can check for SIGTERM more frequently - # sleep in busybox doesn't get interrupted by signals, so we poll - 
i=0 - while [ $i -lt $interval ] && [ "$terminated" = "false" ]; do - sleep 1 - i=$((i + 1)) - done -done -` - func (r *AIMModelCacheReconciler) pvcName(mc *aimv1alpha1.AIMModelCache) string { return baseutils.FormatNameWithPostfix(mc.Name, "cache") } diff --git a/python/model-downloader/Dockerfile b/python/model-downloader/Dockerfile new file mode 100644 index 000000000..29499cb23 --- /dev/null +++ b/python/model-downloader/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.14-alpine + +RUN apk add --no-cache s3cmd procps kubectl \ + && pip install --no-cache-dir -U huggingface_hub + +COPY entrypoint.sh /entrypoint.sh +COPY progress_monitor.sh /progress_monitor.sh +RUN chmod +x /entrypoint.sh /progress_monitor.sh + +RUN mkdir /cache && chown 1000:1000 /cache + + +WORKDIR /cache +USER 1000 + +ENTRYPOINT ["/entrypoint.sh"] \ No newline at end of file diff --git a/python/model-downloader/entrypoint.sh b/python/model-downloader/entrypoint.sh new file mode 100644 index 000000000..66217ede6 --- /dev/null +++ b/python/model-downloader/entrypoint.sh @@ -0,0 +1,101 @@ +#!/bin/sh +set -eu + +URL="${1:?Usage: $0 }" +TARGET_DIR="${TARGET_DIR:-/cache}" + + +# Fetch expected size if not already set +if [ -z "${EXPECTED_SIZE_BYTES:-}" ]; then + case "$URL" in + hf://*) + # Fetch expected size if not set + if [ -z "${EXPECTED_SIZE_BYTES:-}" ]; then + echo "Fetching model size from Hugging Face..." 
+ MODEL_PATH="${URL#hf://}" + EXPECTED_SIZE_BYTES=$(python -c " + from huggingface_hub import HfApi + info = HfApi().model_info('$MODEL_PATH', files_metadata=True) + print(sum(f.size or 0 for f in info.siblings)) + " 2>/dev/null || echo 0) + fi + ;; + s3://*) + # Get size from S3 (plain s3cmd du prints bytes in the first column; NOTE: this call runs without the S3_* endpoint/credential flags used by the sync below) + EXPECTED_SIZE_BYTES=$(s3cmd du "$URL" 2>/dev/null | awk '{print $1}' || echo 0) + ;; + esac + export EXPECTED_SIZE_BYTES +fi + +echo "Expected size: $EXPECTED_SIZE_BYTES bytes" + +# Start progress monitor in background +if [ -f /progress_monitor.sh ]; then + /progress_monitor.sh & + echo "Started progress monitor (PID: $!)" +fi + +### TESTING WHEN ENV VARS ARE SET ### +if [ -n "${AIM_DEBUG_CAUSE_HANG:-}" ]; then + echo "AIM_DEBUG_CAUSE_HANG is set, causing hang" + python -c "import time; time.sleep(1000000)" + exit 1 +fi + +if [ -n "${AIM_DEBUG_CAUSE_FAILURE:-}" ]; then + echo "AIM_DEBUG_CAUSE_FAILURE is set, causing failure" + exit 1 +fi +### END TESTING ### + +case "$URL" in + hf://*) + export HF_HOME="$TARGET_DIR/.hf" + mkdir -p "$HF_HOME" + + MODEL_PATH="${URL#hf://}" + echo "Downloading from Hugging Face: $MODEL_PATH to $TARGET_DIR" + hf download \ + --local-dir "$TARGET_DIR" \ + "$MODEL_PATH" + echo "Verifying download..." 
+ hf cache verify \ + --local-dir "$TARGET_DIR" \ + --fail-on-missing-files \ + "$MODEL_PATH" + echo "Download complete and verified" + echo "Size of HF_HOME: $(du -sh "$HF_HOME")" + rm -rf "$HF_HOME" + ;; + s3://*) + echo "Syncing from S3: $URL to $TARGET_DIR" + + S3CMD_ARGS="" + + if [ -n "${S3_ACCESS_KEY:-}" ]; then + S3CMD_ARGS="$S3CMD_ARGS --access_key=$S3_ACCESS_KEY" + fi + if [ -n "${S3_SECRET_KEY:-}" ]; then + S3CMD_ARGS="$S3CMD_ARGS --secret_key=$S3_SECRET_KEY" + fi + if [ -n "${S3_ENDPOINT:-}" ]; then + S3CMD_ARGS="$S3CMD_ARGS --host=$S3_ENDPOINT" + S3CMD_ARGS="$S3CMD_ARGS --host-bucket=$S3_ENDPOINT/%(bucket)s" + fi + if [ "${S3_NO_SSL:-}" = "true" ]; then + S3CMD_ARGS="$S3CMD_ARGS --no-ssl" + fi + if [ "${S3_SIGNATURE_V2:-}" = "true" ]; then + S3CMD_ARGS="$S3CMD_ARGS --signature-v2" + fi + + # shellcheck disable=SC2086 + s3cmd $S3CMD_ARGS sync --stop-on-error "$URL" "$TARGET_DIR/" + echo "Sync complete" + ;; + *) + echo "Error: Unknown protocol. URL must start with hf:// or s3://" >&2 + exit 1 + ;; +esac \ No newline at end of file diff --git a/python/model-downloader/progress_monitor.sh b/python/model-downloader/progress_monitor.sh new file mode 100644 index 000000000..a530cf975 --- /dev/null +++ b/python/model-downloader/progress_monitor.sh @@ -0,0 +1,107 @@ +#!/bin/sh +# Progress monitor for model downloads +# Outputs JSON progress logs and kills stalled downloads + +# Handle SIGTERM gracefully - the trap only sets a flag that the main loop polls before exiting +terminated=false +trap 'terminated=true' TERM + +# Output a JSON log message +# Usage: log_json TYPE [key=value ...] 
+log_json() { + type=$1 + shift + timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + json="{\"timestamp\":\"$timestamp\",\"type\":\"$type\"" + for kv in "$@"; do + key="${kv%%=*}" + value="${kv#*=}" + # Check if value is numeric + case "$value" in + ''|*[!0-9]*) json="$json,\"$key\":\"$value\"" ;; # string + *) json="$json,\"$key\":$value" ;; # number + esac + done + echo "$json}" >&2 +} + +SA_TOKEN="/var/run/secrets/kubernetes.io/serviceaccount/token" +can_update_status=false +if [ -f "$SA_TOKEN" ] && [ -n "${CACHE_NAME:-}" ] && [ -n "${CACHE_NAMESPACE:-}" ]; then + can_update_status=true +fi + +update_status() { + if [ "$can_update_status" = "true" ]; then + percent=$1 + kubectl patch aimmodelcache "$CACHE_NAME" -n "$CACHE_NAMESPACE" \ + --type=merge --subresource=status \ + -p "{\"status\":{\"downloadProgress\":$percent}}" 2>/dev/null || true + fi +} + +# Kill the download process (hf CLI / python, or s3cmd) +kill_downloader() { + # Kill python processes + pkill -9 -f "python" 2>/dev/null || true + # Kill s3cmd + pkill -9 -f "s3cmd" 2>/dev/null || true + +} + +expected_size=${EXPECTED_SIZE_BYTES:-0} +mount_path=${MOUNT_PATH:-/cache} +log_interval=${PROGRESS_INTERVAL:-5} # 5 seconds default +stall_timeout=${STALL_TIMEOUT:-60} # 1 minute default + +log_json "start" "expectedBytes=$expected_size" "intervalSeconds=$log_interval" "stallTimeoutSeconds=$stall_timeout" + +last_size=0 +last_change_time=$(date +%s) + +while true; do + # Check if we received SIGTERM (flag set by the TERM trap) + if [ "$terminated" = "true" ]; then + current_size=$(du -sb "$mount_path" 2>/dev/null | cut -f1 || echo 0) + log_json "terminated" "currentBytes=$current_size" "expectedBytes=$expected_size" + exit 0 + fi + + current_size=$(du -sb "$mount_path" 2>/dev/null | cut -f1 || echo 0) + now=$(date +%s) + + # Track progress for stall detection + if [ "$current_size" -gt "$last_size" ]; then + last_size=$current_size + last_change_time=$now + fi + + # Check for stall (no progress for stall_timeout 
seconds) + stall_duration=$((now - last_change_time)) + if [ "$stall_duration" -ge "$stall_timeout" ]; then + log_json "stall" "currentBytes=$current_size" "stallDurationSeconds=$stall_duration" + kill_downloader + exit 1 + fi + + if [ "$expected_size" -gt 0 ] && [ "$current_size" -gt 0 ]; then + percent=$((current_size * 100 / expected_size)) + # Cap at 100% (during download, temp files may exceed expected size) + [ "$percent" -gt 100 ] && percent=100 + log_json "progress" "percent=$percent" "currentBytes=$current_size" "expectedBytes=$expected_size" + elif [ "$current_size" -gt 0 ]; then + log_json "progress" "currentBytes=$current_size" "message=Expected size unknown" + else + log_json "waiting" "message=Waiting for download to start" + fi + # percent is unset until a percentage has been computed; report 0 rather than patch invalid JSON + update_status "${percent:-0}" + + # Use a loop with short sleeps so we can check for SIGTERM more frequently + i=0 + while [ $i -lt $log_interval ] && [ "$terminated" = "false" ]; do + sleep 1 + i=$((i + 1)) + done +done + From 398391fd5766cd56cb4441909b0372f7702e0354 Mon Sep 17 00:00:00 2001 From: bjorn-amd Date: Tue, 13 Jan 2026 19:55:07 +0100 Subject: [PATCH 2/3] Added tests for stall detection and s3 downloads --- .../cache-s3-download/chainsaw-test.yaml | 232 ++++++++++++++++++ .../caching/cache-s3-download/runtime.yaml | 7 + .../cache-s3-download/s3-deployment.yaml | 79 ++++++ .../cache-stall-detection/aimservice.yaml | 13 + .../cache-stall-detection/chainsaw-test.yaml | 41 ++++ .../cache-stall-detection/runtime.yaml | 7 + 6 files changed, 379 insertions(+) create mode 100644 test/chainsaw/tests/aim/service/caching/cache-s3-download/chainsaw-test.yaml create mode 100644 test/chainsaw/tests/aim/service/caching/cache-s3-download/runtime.yaml create mode 100644 test/chainsaw/tests/aim/service/caching/cache-s3-download/s3-deployment.yaml create mode 100644 test/chainsaw/tests/aim/service/caching/cache-stall-detection/aimservice.yaml create mode 100644 
test/chainsaw/tests/aim/service/caching/cache-stall-detection/chainsaw-test.yaml create mode 100644 test/chainsaw/tests/aim/service/caching/cache-stall-detection/runtime.yaml diff --git a/test/chainsaw/tests/aim/service/caching/cache-s3-download/chainsaw-test.yaml b/test/chainsaw/tests/aim/service/caching/cache-s3-download/chainsaw-test.yaml new file mode 100644 index 000000000..f3240c5cf --- /dev/null +++ b/test/chainsaw/tests/aim/service/caching/cache-s3-download/chainsaw-test.yaml @@ -0,0 +1,232 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: cache-s3-download +spec: + description: Test S3 model caching by comparing HF download with S3 round-trip + concurrent: false + timeouts: + assert: 600s + steps: + - name: Setup infrastructure + try: + - apply: + file: runtime.yaml + - apply: + file: s3-deployment.yaml + - wait: + apiVersion: apps/v1 + kind: Deployment + name: minio-deployment + timeout: 2m + for: + condition: + name: Available + value: 'true' + + - name: Create HuggingFace model cache + try: + - apply: + resource: + apiVersion: aim.silogen.ai/v1alpha1 + kind: AIMModelCache + metadata: + name: hf-model-cache + spec: + sourceUri: hf://HuggingFaceTB/SmolLM2-135M + size: 1Gi + + - name: Wait for HF cache to be available + try: + - assert: + timeout: 300s + resource: + apiVersion: aim.silogen.ai/v1alpha1 + kind: AIMModelCache + metadata: + name: hf-model-cache + status: + status: Available + + - name: Upload cached model to MinIO + try: + - apply: + resource: + apiVersion: batch/v1 + kind: Job + metadata: + name: upload-to-s3 + spec: + backoffLimit: 2 + template: + spec: + restartPolicy: Never + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + volumes: + - name: hf-cache + persistentVolumeClaim: + claimName: hf-model-cache-cache + containers: + - name: uploader + image: ghcr.io/silogen/kaiwo/model-downloader:0.1 + command: ["/bin/sh", "-c"] + args: + - | + set -eu + BUCKET="test-models" + 
S3_HOST="minio-service:9000" + + S3_OPTS="--host=$S3_HOST --host-bucket=$S3_HOST/%(bucket)s --access_key=$S3_ACCESS_KEY --secret_key=$S3_SECRET_KEY --no-ssl --signature-v2" + + echo "=== Creating bucket ===" + s3cmd $S3_OPTS mb "s3://$BUCKET" || true + + echo "=== Uploading from HF cache to S3 ===" + s3cmd $S3_OPTS --recursive put /cache/ "s3://$BUCKET/" + + echo "=== Verifying upload ===" + s3cmd $S3_OPTS ls -r "s3://$BUCKET/" + env: + - name: S3_ACCESS_KEY + valueFrom: + secretKeyRef: + name: minio-secret + key: access_key_id + - name: S3_SECRET_KEY + valueFrom: + secretKeyRef: + name: minio-secret + key: secret_key + volumeMounts: + - name: hf-cache + mountPath: /cache + readOnly: true + - wait: + apiVersion: batch/v1 + kind: Job + name: upload-to-s3 + timeout: 3m + for: + condition: + name: Complete + value: 'true' + + - name: Create S3 model cache + try: + - apply: + resource: + apiVersion: aim.silogen.ai/v1alpha1 + kind: AIMModelCache + metadata: + name: s3-model-cache + spec: + sourceUri: s3://test-models/ + size: 1Gi + env: + - name: S3_ENDPOINT + value: "minio-service:9000" + - name: S3_NO_SSL + value: "true" + - name: S3_SIGNATURE_V2 + value: "true" + - name: S3_ACCESS_KEY + valueFrom: + secretKeyRef: + name: minio-secret + key: access_key_id + - name: S3_SECRET_KEY + valueFrom: + secretKeyRef: + name: minio-secret + key: secret_key + + - name: Wait for S3 cache to be available + try: + - assert: + timeout: 300s + resource: + apiVersion: aim.silogen.ai/v1alpha1 + kind: AIMModelCache + metadata: + name: s3-model-cache + status: + status: Available + + - name: Verify checksums match + try: + - apply: + resource: + apiVersion: batch/v1 + kind: Job + metadata: + name: verify-checksums + spec: + backoffLimit: 0 + template: + spec: + restartPolicy: Never + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + volumes: + - name: hf-cache + persistentVolumeClaim: + claimName: hf-model-cache-cache + - name: s3-cache + persistentVolumeClaim: + 
claimName: s3-model-cache-cache + containers: + - name: verifier + image: alpine:latest + command: ["/bin/sh", "-c"] + args: + - | + set -eu + echo "=== Computing checksums for HF cache ===" + cd /hf-cache + find . -type f -exec md5sum {} \; | sort > /tmp/hf-checksums.txt + cat /tmp/hf-checksums.txt + + echo "=== Computing checksums for S3 cache ===" + cd /s3-cache + find . -type f -exec md5sum {} \; | sort > /tmp/s3-checksums.txt + cat /tmp/s3-checksums.txt + + echo "=== Comparing checksums ===" + if diff /tmp/hf-checksums.txt /tmp/s3-checksums.txt; then + echo "SUCCESS: All checksums match!" + exit 0 + else + echo "FAILURE: Checksums do not match!" + exit 1 + fi + volumeMounts: + - name: hf-cache + mountPath: /hf-cache + readOnly: true + - name: s3-cache + mountPath: /s3-cache + readOnly: true + - wait: + apiVersion: batch/v1 + kind: Job + name: verify-checksums + timeout: 2m + for: + condition: + name: Complete + value: 'true' + + catch: + - command: + entrypoint: kaiwo-dev + env: + - name: PRINT_LEVEL + value: ($values.print_level) + - name: NAMESPACE + value: ($namespace) + args: ["debug", "chainsaw", "--namespace=$NAMESPACE", "--print-level=$PRINT_LEVEL"] \ No newline at end of file diff --git a/test/chainsaw/tests/aim/service/caching/cache-s3-download/runtime.yaml b/test/chainsaw/tests/aim/service/caching/cache-s3-download/runtime.yaml new file mode 100644 index 000000000..652e0c0bf --- /dev/null +++ b/test/chainsaw/tests/aim/service/caching/cache-s3-download/runtime.yaml @@ -0,0 +1,7 @@ +apiVersion: aim.silogen.ai/v1alpha1 +kind: AIMRuntimeConfig +metadata: + name: default +spec: + defaultStorageClassName: "rwx-nfs" + pvcHeadroomPercent: 10 diff --git a/test/chainsaw/tests/aim/service/caching/cache-s3-download/s3-deployment.yaml b/test/chainsaw/tests/aim/service/caching/cache-s3-download/s3-deployment.yaml new file mode 100644 index 000000000..0d1e5244a --- /dev/null +++ b/test/chainsaw/tests/aim/service/caching/cache-s3-download/s3-deployment.yaml @@ -0,0 
+1,79 @@ +apiVersion: v1 +kind: Secret +metadata: + name: minio-secret +data: + access_key_id: bWluaW8= + secret_key: bWluaW8xMjM= +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: minio-deployment +spec: + replicas: 1 + selector: + matchLabels: + app: minio + template: + metadata: + labels: + app: minio + spec: + containers: + - name: minio + image: minio/minio + args: ["server", "/data"] + resources: + limits: + memory: "1Gi" + requests: + cpu: "1" + memory: "1Gi" + env: + - name: MINIO_ROOT_USER + valueFrom: + secretKeyRef: + name: minio-secret + key: access_key_id + - name: MINIO_ROOT_PASSWORD + valueFrom: + secretKeyRef: + name: minio-secret + key: secret_key + ports: + - containerPort: 9000 + livenessProbe: + httpGet: + path: /minio/health/live + port: 9000 + initialDelaySeconds: 120 + periodSeconds: 20 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /minio/health/ready + port: 9000 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 1 + failureThreshold: 3 +--- +apiVersion: v1 +kind: Service +metadata: + name: minio-service +spec: + selector: + app: minio + ports: + - protocol: TCP + port: 9000 + targetPort: 9000 + name: minio-endpoint + - protocol: TCP + port: 9001 + targetPort: 9001 + name: minio-console + type: ClusterIP diff --git a/test/chainsaw/tests/aim/service/caching/cache-stall-detection/aimservice.yaml b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/aimservice.yaml new file mode 100644 index 000000000..12dc8bb3c --- /dev/null +++ b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/aimservice.yaml @@ -0,0 +1,13 @@ +apiVersion: aim.silogen.ai/v1alpha1 +kind: AIMService +metadata: + name: test-cache-stall +spec: + model: + image: ghcr.io/silogen/aim-dummy:0.1.4 + cacheModel: true + template: + allowUnoptimized: true + env: + - name: AIM_DEBUG_CAUSE_HANG + value: "true" \ No newline at end of file diff --git 
a/test/chainsaw/tests/aim/service/caching/cache-stall-detection/chainsaw-test.yaml b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/chainsaw-test.yaml new file mode 100644 index 000000000..1e517e6e3 --- /dev/null +++ b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/chainsaw-test.yaml @@ -0,0 +1,41 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: cache-stall-detection +spec: + description: Test that download stall detection kills hung downloads + timeouts: + assert: 480s # 8 minutes - stall timeout (2min, controller sets STALL_TIMEOUT=120) + buffer + steps: + - name: Create runtime config + try: + - apply: + file: runtime.yaml + + - name: Create AIMService with cacheModel and hang trigger + try: + - apply: + file: aimservice.yaml + + - name: Verify AIMModelCache is created and download job starts + try: + - assert: + timeout: 120s + resource: + apiVersion: aim.silogen.ai/v1alpha1 + kind: AIMModelCache + metadata: + namespace: ($namespace) + status: + status: Progressing + + - name: Verify stall detection triggered (Job has at least 1 failed pod) + try: + - assert: + timeout: 420s + resource: + apiVersion: batch/v1 + kind: Job + metadata: + namespace: ($namespace) + (status.failed >= `1`): true \ No newline at end of file diff --git a/test/chainsaw/tests/aim/service/caching/cache-stall-detection/runtime.yaml b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/runtime.yaml new file mode 100644 index 000000000..c9504097e --- /dev/null +++ b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/runtime.yaml @@ -0,0 +1,7 @@ +apiVersion: aim.silogen.ai/v1alpha1 +kind: AIMRuntimeConfig +metadata: + name: default +spec: + defaultStorageClassName: "rwx-nfs" + pvcHeadroomPercent: 10 \ No newline at end of file From 41913a7f6eabc6d634205fee9919cd3624404b9d Mon Sep 17 00:00:00 2001 From: bjorn-amd Date: Tue, 13 Jan 2026 20:38:30 +0100 Subject: [PATCH 3/3] Add github action to build downloader image --- 
.github/workflows/build-model-downloader.yaml | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 .github/workflows/build-model-downloader.yaml diff --git a/.github/workflows/build-model-downloader.yaml b/.github/workflows/build-model-downloader.yaml new file mode 100644 index 000000000..d0ab9abc7 --- /dev/null +++ b/.github/workflows/build-model-downloader.yaml @@ -0,0 +1,60 @@ +name: build-model-downloader + +on: + push: + tags: + - "v*" + branches: + - main + paths: + - "python/model-downloader/**" + workflow_dispatch: + +permissions: + contents: read + packages: write + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GitHub Container Registry + run: | + echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u $GITHUB_ACTOR --password-stdin + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ghcr.io/${{ github.repository_owner }}/kaiwo/model-downloader + tags: | + type=ref,event=branch + type=ref,event=tag + type=raw,value=latest,enable=${{ startsWith(github.ref, 'refs/tags/v') }} + type=sha,prefix= + + - name: Build and push image + uses: docker/build-push-action@v6 + with: + context: python/model-downloader + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Summary + run: | + echo "### Model Downloader Image Published" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Tags pushed:**" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + echo "${{ steps.meta.outputs.tags }}" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY \ No newline at end of file