From 492f94980d78625fa4c4ee07e4f4ac72a486fad4 Mon Sep 17 00:00:00 2001
From: bjorn-amd <bjorn.ahlstrom@amd.com>
Date: Tue, 13 Jan 2026 16:37:36 +0100
Subject: [PATCH 1/3] First version of hf/s3 downloader

---
 apis/aim/v1alpha1/aimmodelcache_types.go      |   4 +-
 .../bases/aim.silogen.ai_aimmodelcaches.yaml  |   2 +-
 crd/crds.yaml                                 |   2 +-
 docs/docs/reference/crds/aim.silogen.ai.md    |   2 +-
 .../aim/aimmodelcache_controller.go           | 182 ++----------------
 python/model-downloader/Dockerfile            |  16 ++
 python/model-downloader/entrypoint.sh         | 101 ++++++++++
 python/model-downloader/progress_monitor.sh   | 107 ++++++++++
 8 files changed, 242 insertions(+), 174 deletions(-)
 create mode 100644 python/model-downloader/Dockerfile
 create mode 100644 python/model-downloader/entrypoint.sh
 create mode 100644 python/model-downloader/progress_monitor.sh
diff --git a/apis/aim/v1alpha1/aimmodelcache_types.go b/apis/aim/v1alpha1/aimmodelcache_types.go
index 584fa8e1f..aa00c4837 100644
--- a/apis/aim/v1alpha1/aimmodelcache_types.go
+++ b/apis/aim/v1alpha1/aimmodelcache_types.go
@@ -29,7 +29,7 @@ import (
 )
 
 const (
-	DefaultDownloadImage = "kserve/storage-initializer:v0.16.0-rc0"
+	DefaultDownloadImage = "ghcr.io/silogen/kaiwo/model-downloader:0.1"
 )
 
 // AIMResolvedModelCache contains reference info and status for a cached model.
@@ -71,7 +71,7 @@ type AIMModelCacheSpec struct {
 	Env []corev1.EnvVar `json:"env,omitempty"`
 
 	// ModelDownloadImage is the image used to download the model
-	// +kubebuilder:default="kserve/storage-initializer:v0.16.0"
+	// +kubebuilder:default="ghcr.io/silogen/kaiwo/model-downloader:0.1"
 	ModelDownloadImage string `json:"modelDownloadImage"`
 
 	// ImagePullSecrets references secrets for pulling AIM container images.
diff --git a/config/crd/bases/aim.silogen.ai_aimmodelcaches.yaml b/config/crd/bases/aim.silogen.ai_aimmodelcaches.yaml
index e20a34262..e2397e964 100644
--- a/config/crd/bases/aim.silogen.ai_aimmodelcaches.yaml
+++ b/config/crd/bases/aim.silogen.ai_aimmodelcaches.yaml
@@ -201,7 +201,7 @@ spec:
                   x-kubernetes-map-type: atomic
                 type: array
               modelDownloadImage:
-                default: kserve/storage-initializer:v0.16.0
+                default: ghcr.io/silogen/kaiwo/model-downloader:0.1
                 description: ModelDownloadImage is the image used to download the
                   model
                 type: string
diff --git a/crd/crds.yaml b/crd/crds.yaml
index 63ca2be11..64916cb8c 100644
--- a/crd/crds.yaml
+++ b/crd/crds.yaml
@@ -2349,7 +2349,7 @@ spec:
                   x-kubernetes-map-type: atomic
                 type: array
               modelDownloadImage:
-                default: kserve/storage-initializer:v0.16.0
+                default: ghcr.io/silogen/kaiwo/model-downloader:0.1
                 description: ModelDownloadImage is the image used to download the
                   model
                 type: string
diff --git a/docs/docs/reference/crds/aim.silogen.ai.md b/docs/docs/reference/crds/aim.silogen.ai.md
index 9ce5d5d4c..f6b9bbb06 100644
--- a/docs/docs/reference/crds/aim.silogen.ai.md
+++ b/docs/docs/reference/crds/aim.silogen.ai.md
@@ -525,7 +525,7 @@ _Appears in:_
 | `storageClassName` _string_ | StorageClassName specifies the storage class for the cache volume |  |  |
 | `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#quantity-resource-api)_ | Size specifies the size of the cache volume |  |  |
 | `env` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#envvar-v1-core) array_ | Env lists the environment variables to use for authentication when downloading models.<br />These variables are used for authentication with model registries (e.g., HuggingFace tokens). |  |  |
-| `modelDownloadImage` _string_ | ModelDownloadImage is the image used to download the model | kserve/storage-initializer:v0.16.0 |  |
+| `modelDownloadImage` _string_ | ModelDownloadImage is the image used to download the model | ghcr.io/silogen/kaiwo/model-downloader:0.1 |  |
 | `imagePullSecrets` _[LocalObjectReference](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#localobjectreference-v1-core) array_ | ImagePullSecrets references secrets for pulling AIM container images. |  |  |
 | `runtimeConfigName` _string_ | RuntimeConfigName references the AIM runtime configuration (by name) to use for this model cache.<br />This determines PVC headroom and other runtime settings. | default |  |
 
diff --git a/internal/controller/aim/aimmodelcache_controller.go b/internal/controller/aim/aimmodelcache_controller.go
index 1fc63bbd0..c0d03ad07 100644
--- a/internal/controller/aim/aimmodelcache_controller.go
+++ b/internal/controller/aim/aimmodelcache_controller.go
@@ -553,18 +553,25 @@ func (r *AIMModelCacheReconciler) buildDownloadJob(mc *aimv1alpha1.AIMModelCache
 	if len(mc.Spec.ModelDownloadImage) > 0 {
 		downloadImage = mc.Spec.ModelDownloadImage
 	}
+
+	// Expected size in bytes for progress calculation
+	expectedSizeBytes := mc.Spec.Size.Value()
+
 	// Merge env vars with precedence: mc.Spec.Env > runtimeConfigSpec.Env > defaults
 	newEnv := helpers.MergeEnvVars([]corev1.EnvVar{
 		{Name: "HF_XET_CHUNK_CACHE_SIZE_BYTES", Value: "0"},
 		{Name: "HF_XET_SHARD_CACHE_SIZE_BYTES", Value: "0"},
 		{Name: "HF_XET_HIGH_PERFORMANCE", Value: "1"},
-		{Name: "HF_HOME", Value: mountPath + "/.hf"},
+		{Name: "HF_HOME", Value: "/tmp/.hf"},
 		{Name: "UMASK", Value: "0022"},
+		{Name: "EXPECTED_SIZE_BYTES", Value: fmt.Sprintf("%d", expectedSizeBytes)},
+		{Name: "MOUNT_PATH", Value: mountPath},
+		{Name: "CACHE_NAME", Value: mc.Name},
+		{Name: "CACHE_NAMESPACE", Value: mc.Namespace},
+		{Name: "STALL_TIMEOUT", Value: "120"},
+		{Name: "TARGET_DIR", Value: mountPath},
 	}, helpers.MergeEnvVars(runtimeConfigSpec.Env, mc.Spec.Env))
 
-	// Expected size in bytes for progress calculation
-	expectedSizeBytes := mc.Spec.Size.Value()
-
 	return &batchv1.Job{
 		TypeMeta: metav1.TypeMeta{
 			APIVersion: batchv1.SchemeGroupVersion.String(),
@@ -598,54 +605,6 @@ func (r *AIMModelCacheReconciler) buildDownloadJob(mc *aimv1alpha1.AIMModelCache
 								PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ClaimName: pvcName},
 							},
 						},
-						{
-							Name: "tmp",
-							VolumeSource: corev1.VolumeSource{
-								EmptyDir: &corev1.EmptyDirVolumeSource{
-									SizeLimit: baseutils.Pointer(resource.MustParse("500Mi")), // Small temp space for system operations
-								},
-							},
-						},
-					},
-					// Native sidecar (Kubernetes 1.28+): init container with restartPolicy=Always
-					// runs alongside main containers and is automatically terminated by kubelet
-					// when all regular containers complete (success or failure)
-					InitContainers: []corev1.Container{
-						{
-							Name:            "progress-monitor",
-							Image:           "busybox:1.36",
-							ImagePullPolicy: corev1.PullIfNotPresent,
-							// restartPolicy: Always makes this a native sidecar that runs alongside main containers
-							// Kubernetes automatically sends SIGTERM when all regular containers terminate
-							RestartPolicy: baseutils.Pointer(corev1.ContainerRestartPolicyAlways),
-							SecurityContext: &corev1.SecurityContext{
-								RunAsUser:  baseutils.Pointer(int64(1000)),
-								RunAsGroup: baseutils.Pointer(int64(1000)),
-							},
-							Env: []corev1.EnvVar{
-								{Name: "EXPECTED_SIZE_BYTES", Value: fmt.Sprintf("%d", expectedSizeBytes)},
-								{Name: "MOUNT_PATH", Value: mountPath},
-							},
-							Command: []string{"/bin/sh"},
-							Args: []string{
-								"-c",
-								progressMonitorScript,
-							},
-							VolumeMounts: []corev1.VolumeMount{
-								{Name: "cache", MountPath: mountPath, ReadOnly: true},
-							},
-							// Minimal resources for the monitor
-							Resources: corev1.ResourceRequirements{
-								Requests: corev1.ResourceList{
-									corev1.ResourceCPU:    resource.MustParse("10m"),
-									corev1.ResourceMemory: resource.MustParse("16Mi"),
-								},
-								Limits: corev1.ResourceList{
-									corev1.ResourceCPU:    resource.MustParse("50m"),
-									corev1.ResourceMemory: resource.MustParse("32Mi"),
-								},
-							},
-						},
 					},
 					Containers: []corev1.Container{
 						{
@@ -656,43 +615,10 @@ func (r *AIMModelCacheReconciler) buildDownloadJob(mc *aimv1alpha1.AIMModelCache
 								RunAsUser:  baseutils.Pointer(int64(1000)),
 								RunAsGroup: baseutils.Pointer(int64(1000)),
 							},
-							Env:     newEnv,
-							Command: []string{"/bin/sh"},
-							Args: []string{
-								"-c",
-								fmt.Sprintf(`
-# Bail out if this AIM_DEBUG_CAUSE_FAILURE is set
-if [ -n "$AIM_DEBUG_CAUSE_FAILURE" ]; then
-	echo "AIM_DEBUG_CAUSE_FAILURE is set, bailing out"
-	exit 1
-fi
-# Set umask so downloaded files are readable by others
-umask 0022
-
-# Create temp directories on the same filesystem as destination
-mkdir -p %s/.tmp %s/.hf_home %s/.hf_cache %s/.xet_cache
-
-# Download the model
-python /storage-initializer/scripts/initializer-entrypoint %s %s &&
-(
-# Report sizes before cleanup
-echo "Storage usage before cleanup:"
-du -sh %s
-du -sh %s/.cache 2>/dev/null || true
-
-# Clean up HF cache directories to save space (keeps only final model files)
-echo "Cleaning up HF cache to save space..."
-rm -rf %s/.cache %s/.tmp %s/.hf_home %s/.hf_cache %s/.xet_cache 2>/dev/null || true
-
-# Report final sizes
-echo "Final storage usage:"
-du -sh %s || true
-)
-				`, mountPath, mountPath, mountPath, mountPath, mc.Spec.SourceURI, mountPath, mountPath, mountPath, mountPath, mountPath, mountPath, mountPath, mountPath, mountPath),
-							},
+							Env:  newEnv,
+							Args: []string{mc.Spec.SourceURI},
 							VolumeMounts: []corev1.VolumeMount{
 								{Name: "cache", MountPath: mountPath},
-								{Name: "tmp", MountPath: "/tmp"},
 							},
 						},
 					},
@@ -702,88 +628,6 @@ du -sh %s || true
 	}
 }
 
-// progressMonitorScript is the shell script for the download progress monitor sidecar.
-// It reports download progress every 10 seconds in JSON format.
-//
-// This runs as a native sidecar (init container with restartPolicy=Always).
-// Kubernetes automatically sends SIGTERM when all regular containers terminate,
-// so we just need to handle the signal gracefully.
-//
-// JSON output types:
-//   - "start": Initial message when monitor starts
-//   - "progress": Periodic progress update
-//   - "complete": Download finished successfully (detected via marker file)
-//   - "terminated": Received SIGTERM from kubelet (main container finished)
-const progressMonitorScript = `
-# Handle SIGTERM gracefully - kubelet sends this when main container terminates
-terminated=false
-trap 'terminated=true' TERM
-
-# Output a JSON log message
-# Usage: log_json <type> [key=value ...]
-log_json() {
-    type=$1
-    shift
-    timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
-    json="{\"timestamp\":\"$timestamp\",\"type\":\"$type\""
-    for kv in "$@"; do
-        key="${kv%%=*}"
-        value="${kv#*=}"
-        # Check if value is numeric
-        case "$value" in
-            ''|*[!0-9]*) json="$json,\"$key\":\"$value\"" ;;  # string
-            *) json="$json,\"$key\":$value" ;;                 # number
-        esac
-    done
-    echo "$json}"
-}
-
-expected_size=${EXPECTED_SIZE_BYTES:-0}
-mount_path=${MOUNT_PATH:-/cache}
-interval=10
-
-log_json "start" "expectedBytes=$expected_size" "intervalSeconds=$interval"
-
-while true; do
-    # Check if we received SIGTERM (main container terminated)
-    if [ "$terminated" = "true" ]; then
-        current_size=$(du -sb "$mount_path" 2>/dev/null | cut -f1 || echo 0)
-        log_json "terminated" "currentBytes=$current_size" "expectedBytes=$expected_size" "message=Main container terminated"
-        exit 0
-    fi
-
-    # Check if download completed successfully (marker file from main container)
-    if [ -f "$mount_path/.download-complete" ]; then
-        current_size=$(du -sb "$mount_path" 2>/dev/null | cut -f1 || echo 0)
-        log_json "complete" "currentBytes=$current_size" "expectedBytes=$expected_size"
-        exit 0
-    fi
-
-    current_size=$(du -sb "$mount_path" 2>/dev/null | cut -f1 || echo 0)
-
-    if [ "$expected_size" -gt 0 ] && [ "$current_size" -gt 0 ]; then
-        percent=$((current_size * 100 / expected_size))
-        # Cap at 100% (during download, temp files may exceed expected size)
-        if [ $percent -gt 100 ]; then
-            percent=100
-        fi
-        log_json "progress" "percent=$percent" "currentBytes=$current_size" "expectedBytes=$expected_size"
-    elif [ "$current_size" -gt 0 ]; then
-        log_json "progress" "currentBytes=$current_size" "expectedBytes=0" "message=Expected size unknown"
-    else
-        log_json "progress" "currentBytes=0" "expectedBytes=$expected_size" "message=Waiting for download to start"
-    fi
-
-    # Use a loop with short sleeps so we can check for SIGTERM more frequently
-    # sleep in busybox doesn't get interrupted by signals, so we poll
-    i=0
-    while [ $i -lt $interval ] && [ "$terminated" = "false" ]; do
-        sleep 1
-        i=$((i + 1))
-    done
-done
-`
-
 func (r *AIMModelCacheReconciler) pvcName(mc *aimv1alpha1.AIMModelCache) string {
 	return baseutils.FormatNameWithPostfix(mc.Name, "cache")
 }
diff --git a/python/model-downloader/Dockerfile b/python/model-downloader/Dockerfile
new file mode 100644
index 000000000..29499cb23
--- /dev/null
+++ b/python/model-downloader/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.14-alpine
+
+RUN apk add --no-cache s3cmd procps kubectl \
+    && pip install --no-cache-dir -U huggingface_hub
+
+COPY entrypoint.sh /entrypoint.sh
+COPY progress_monitor.sh /progress_monitor.sh
+RUN chmod +x /entrypoint.sh /progress_monitor.sh
+
+RUN mkdir /cache && chown 1000:1000 /cache
+
+
+WORKDIR /cache
+USER 1000
+
+ENTRYPOINT ["/entrypoint.sh"]
\ No newline at end of file
diff --git a/python/model-downloader/entrypoint.sh b/python/model-downloader/entrypoint.sh
new file mode 100644
index 000000000..66217ede6
--- /dev/null
+++ b/python/model-downloader/entrypoint.sh
@@ -0,0 +1,101 @@
+#!/bin/sh
+set -eu
+
+URL="${1:?Usage: $0 <hf://org/model or s3://bucket/path>}"
+TARGET_DIR="${TARGET_DIR:-/cache}"
+
+
+# Fetch expected size if not already set
+if [ -z "${EXPECTED_SIZE_BYTES:-}" ]; then
+    case "$URL" in
+        hf://*)
+            echo "Fetching model size from Hugging Face..."
+            MODEL_PATH="${URL#hf://}"
+            # The repo id is passed via argv rather than spliced into the Python source, and the
+            # snippet starts at column 0 so it does not depend on `python -c` dedenting its input.
+            EXPECTED_SIZE_BYTES=$(python -c '
+import sys
+from huggingface_hub import HfApi
+info = HfApi().model_info(sys.argv[1], files_metadata=True)
+print(sum(f.size or 0 for f in info.siblings))
+' "$MODEL_PATH" 2>/dev/null || echo 0)
+            ;;
+        s3://*)
+            # The first field of `s3cmd du` output is the total size; empty output falls back to 0 below
+            EXPECTED_SIZE_BYTES=$(s3cmd du "$URL" 2>/dev/null | awk '{print $1}' || echo 0)
+            ;;
+    esac
+    export EXPECTED_SIZE_BYTES="${EXPECTED_SIZE_BYTES:-0}"
+fi
+
+echo "Expected size: $EXPECTED_SIZE_BYTES bytes"
+
+# Start progress monitor in background
+if [ -f /progress_monitor.sh ]; then
+    /progress_monitor.sh &
+    echo "Started progress monitor (PID: $!)"
+fi
+
+### TESTING WHEN ENV VARS ARE SET ###
+if [ -n "${AIM_DEBUG_CAUSE_HANG:-}" ]; then
+    echo "AIM_DEBUG_CAUSE_HANG is set, causing hang"
+    python -c "import time; time.sleep(1000000)"
+    exit 1
+fi
+
+if [ -n "${AIM_DEBUG_CAUSE_FAILURE:-}" ]; then
+    echo "AIM_DEBUG_CAUSE_FAILURE is set, causing failure"
+    exit 1
+fi
+### END TESTING ###
+
+case "$URL" in
+    hf://*)
+        export HF_HOME="$TARGET_DIR/.hf"
+        mkdir -p "$HF_HOME"
+        
+        MODEL_PATH="${URL#hf://}"
+        echo "Downloading from Hugging Face: $MODEL_PATH to $TARGET_DIR"
+        hf download \
+            --local-dir "$TARGET_DIR" \
+            "$MODEL_PATH"
+        echo "Verifying download..."
+        hf cache verify \
+            --local-dir "$TARGET_DIR" \
+            --fail-on-missing-files \
+            "$MODEL_PATH"
+        echo "Download complete and verified"
+        echo "Size of HF_HOME: $(du -sh "$HF_HOME")"
+        rm -rf "$HF_HOME"
+        ;;
+    s3://*)
+        echo "Syncing from S3: $URL to $TARGET_DIR"
+        
+        S3CMD_ARGS=""
+        
+        if [ -n "${S3_ACCESS_KEY:-}" ]; then
+            S3CMD_ARGS="$S3CMD_ARGS --access_key=$S3_ACCESS_KEY"
+        fi
+        if [ -n "${S3_SECRET_KEY:-}" ]; then
+            S3CMD_ARGS="$S3CMD_ARGS --secret_key=$S3_SECRET_KEY"
+        fi
+        if [ -n "${S3_ENDPOINT:-}" ]; then
+            S3CMD_ARGS="$S3CMD_ARGS --host=$S3_ENDPOINT"
+            S3CMD_ARGS="$S3CMD_ARGS --host-bucket=$S3_ENDPOINT/%(bucket)s"
+        fi
+        if [ "${S3_NO_SSL:-}" = "true" ]; then
+            S3CMD_ARGS="$S3CMD_ARGS --no-ssl"
+        fi
+        if [ "${S3_SIGNATURE_V2:-}" = "true" ]; then
+            S3CMD_ARGS="$S3CMD_ARGS --signature-v2"
+        fi
+        
+        # shellcheck disable=SC2086
+        s3cmd $S3CMD_ARGS sync --stop-on-error "$URL" "$TARGET_DIR/"
+        echo "Sync complete"
+        ;;
+    *)
+        echo "Error: Unknown protocol. URL must start with hf:// or s3://" >&2
+        exit 1
+        ;;
+esac
\ No newline at end of file
diff --git a/python/model-downloader/progress_monitor.sh b/python/model-downloader/progress_monitor.sh
new file mode 100644
index 000000000..a530cf975
--- /dev/null
+++ b/python/model-downloader/progress_monitor.sh
@@ -0,0 +1,107 @@
+#!/bin/sh
+# Progress monitor for model downloads
+# Outputs JSON progress logs and kills stalled downloads
+
+# Handle SIGTERM gracefully: record it in a flag so the main loop can log a final "terminated" entry and exit
+terminated=false
+trap 'terminated=true' TERM
+
+# Output a JSON log message on stderr (values are interpolated verbatim, without JSON escaping)
+# Usage: log_json <type> [key=value ...]
+log_json() {
+    type=$1
+    shift
+    timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+    json="{\"timestamp\":\"$timestamp\",\"type\":\"$type\""
+    for kv in "$@"; do
+        key="${kv%%=*}"
+        value="${kv#*=}"
+        # All-digit values are emitted as bare numbers, everything else as a quoted string
+        case "$value" in
+            ''|*[!0-9]*) json="$json,\"$key\":\"$value\"" ;;  # string
+            *) json="$json,\"$key\":$value" ;;                 # number
+        esac
+    done
+    echo "$json}" >&2
+}
+
+SA_TOKEN="/var/run/secrets/kubernetes.io/serviceaccount/token"  # token file checked before attempting status patches
+can_update_status=false  # enabled only when the token file exists and CACHE_NAME/CACHE_NAMESPACE are set
+if [ -f "$SA_TOKEN" ] && [ -n "${CACHE_NAME:-}" ] && [ -n "${CACHE_NAMESPACE:-}" ]; then
+    can_update_status=true
+fi
+
+update_status() {  # Usage: update_status <percent> - best-effort patch of status.downloadProgress
+    percent=${1:-}  # may be empty before any percentage exists; that would render invalid JSON
+    if [ "$can_update_status" = "true" ] && [ -n "$percent" ]; then
+        kubectl patch aimmodelcache "$CACHE_NAME" -n "$CACHE_NAMESPACE" \
+            --type=merge --subresource=status \
+            -p "{\"status\":{\"downloadProgress\":$percent}}" 2>/dev/null || true
+    fi
+}
+
+# Force-kill (SIGKILL) the download process: anything matching "python" (presumably covers the pip-installed hf CLI) or "s3cmd"
+kill_downloader() {
+    # Kill python processes
+    pkill -9 -f "python" 2>/dev/null || true
+    # Kill s3cmd
+    pkill -9 -f "s3cmd" 2>/dev/null || true
+    
+}   
+
+expected_size=${EXPECTED_SIZE_BYTES:-0}
+mount_path=${MOUNT_PATH:-/cache}
+log_interval=${PROGRESS_INTERVAL:-5} # seconds between progress records (default 5)
+stall_timeout=${STALL_TIMEOUT:-60}  # seconds without size growth before the download is killed (default 60)
+
+log_json "start" "expectedBytes=$expected_size" "intervalSeconds=$log_interval" "stallTimeoutSeconds=$stall_timeout"
+
+last_size=0
+last_change_time=$(date +%s)
+
+while true; do
+    # `du | cut || echo 0` only guards cut; default explicitly so the integer tests below stay valid
+    current_size=$(du -sb "$mount_path" 2>/dev/null | cut -f1)
+    current_size=${current_size:-0}
+    now=$(date +%s)
+
+    # Check if we received SIGTERM; log a final record and stop monitoring
+    if [ "$terminated" = "true" ]; then
+        log_json "terminated" "currentBytes=$current_size" "expectedBytes=$expected_size"
+        exit 0
+    fi
+
+    # Track progress for stall detection
+    if [ "$current_size" -gt "$last_size" ]; then
+        last_size=$current_size
+        last_change_time=$now
+    fi
+
+    # Check for stall (no progress for stall_timeout seconds)
+    stall_duration=$((now - last_change_time))
+    if [ "$stall_duration" -ge "$stall_timeout" ]; then
+        log_json "stall" "currentBytes=$current_size" "stallDurationSeconds=$stall_duration"
+        kill_downloader
+        exit 1
+    fi
+
+    if [ "$expected_size" -gt 0 ] && [ "$current_size" -gt 0 ]; then
+        percent=$((current_size * 100 / expected_size))
+        # Cap at 100% (during download, temp files may exceed expected size)
+        [ "$percent" -gt 100 ] && percent=100
+        log_json "progress" "percent=$percent" "currentBytes=$current_size" "expectedBytes=$expected_size"
+        update_status "$percent"  # publish only a percentage computed in this iteration
+    elif [ "$current_size" -gt 0 ]; then
+        log_json "progress" "currentBytes=$current_size" "message=Expected size unknown"
+    else
+        log_json "waiting" "message=Waiting for download to start"
+    fi
+
+    # Sleep in 1s slices so a pending SIGTERM is noticed within about a second
+    i=0
+    while [ "$i" -lt "$log_interval" ] && [ "$terminated" = "false" ]; do
+        sleep 1
+        i=$((i + 1))
+    done
+done
+

From 398391fd5766cd56cb4441909b0372f7702e0354 Mon Sep 17 00:00:00 2001
From: bjorn-amd <bjorn.ahlstrom@amd.com>
Date: Tue, 13 Jan 2026 19:55:07 +0100
Subject: [PATCH 2/3] Added tests for stall detection and s3 downloads

---
 .../cache-s3-download/chainsaw-test.yaml      | 232 ++++++++++++++++++
 .../caching/cache-s3-download/runtime.yaml    |   7 +
 .../cache-s3-download/s3-deployment.yaml      |  79 ++++++
 .../cache-stall-detection/aimservice.yaml     |  13 +
 .../cache-stall-detection/chainsaw-test.yaml  |  41 ++++
 .../cache-stall-detection/runtime.yaml        |   7 +
 6 files changed, 379 insertions(+)
 create mode 100644 test/chainsaw/tests/aim/service/caching/cache-s3-download/chainsaw-test.yaml
 create mode 100644 test/chainsaw/tests/aim/service/caching/cache-s3-download/runtime.yaml
 create mode 100644 test/chainsaw/tests/aim/service/caching/cache-s3-download/s3-deployment.yaml
 create mode 100644 test/chainsaw/tests/aim/service/caching/cache-stall-detection/aimservice.yaml
 create mode 100644 test/chainsaw/tests/aim/service/caching/cache-stall-detection/chainsaw-test.yaml
 create mode 100644 test/chainsaw/tests/aim/service/caching/cache-stall-detection/runtime.yaml

diff --git a/test/chainsaw/tests/aim/service/caching/cache-s3-download/chainsaw-test.yaml b/test/chainsaw/tests/aim/service/caching/cache-s3-download/chainsaw-test.yaml
new file mode 100644
index 000000000..f3240c5cf
--- /dev/null
+++ b/test/chainsaw/tests/aim/service/caching/cache-s3-download/chainsaw-test.yaml
@@ -0,0 +1,232 @@
+apiVersion: chainsaw.kyverno.io/v1alpha1
+kind: Test
+metadata:
+  name: cache-s3-download
+spec:
+  description: Test S3 model caching by comparing HF download with S3 round-trip
+  concurrent: false
+  timeouts:
+    assert: 600s
+  steps:
+  - name: Setup infrastructure
+    try:
+    - apply:
+        file: runtime.yaml
+    - apply:
+        file: s3-deployment.yaml
+    - wait:
+        apiVersion: apps/v1
+        kind: Deployment
+        name: minio-deployment
+        timeout: 2m
+        for:
+          condition:
+            name: Available
+            value: 'true'
+
+  - name: Create HuggingFace model cache
+    try:
+    - apply:
+        resource:
+          apiVersion: aim.silogen.ai/v1alpha1
+          kind: AIMModelCache
+          metadata:
+            name: hf-model-cache
+          spec:
+            sourceUri: hf://HuggingFaceTB/SmolLM2-135M
+            size: 1Gi
+
+  - name: Wait for HF cache to be available
+    try:
+    - assert:
+        timeout: 300s
+        resource:
+          apiVersion: aim.silogen.ai/v1alpha1
+          kind: AIMModelCache
+          metadata:
+            name: hf-model-cache
+          status:
+            status: Available
+
+  - name: Upload cached model to MinIO
+    try:
+    - apply:
+        resource:
+          apiVersion: batch/v1
+          kind: Job
+          metadata:
+            name: upload-to-s3
+          spec:
+            backoffLimit: 2
+            template:
+              spec:
+                restartPolicy: Never
+                securityContext:
+                  runAsUser: 1000
+                  runAsGroup: 1000
+                  fsGroup: 1000
+                volumes:
+                - name: hf-cache
+                  persistentVolumeClaim:
+                    claimName: hf-model-cache-cache
+                containers:
+                - name: uploader
+                  image: ghcr.io/silogen/kaiwo/model-downloader:0.1
+                  command: ["/bin/sh", "-c"]
+                  args:
+                  - |
+                    set -eu
+                    BUCKET="test-models"
+                    S3_HOST="minio-service:9000"
+                    
+                    S3_OPTS="--host=$S3_HOST --host-bucket=$S3_HOST/%(bucket)s --access_key=$S3_ACCESS_KEY --secret_key=$S3_SECRET_KEY --no-ssl --signature-v2"
+                    
+                    echo "=== Creating bucket ==="
+                    s3cmd $S3_OPTS mb "s3://$BUCKET" || true
+                    
+                    echo "=== Uploading from HF cache to S3 ==="
+                    s3cmd $S3_OPTS --recursive put /cache/ "s3://$BUCKET/"
+                    
+                    echo "=== Verifying upload ==="
+                    s3cmd $S3_OPTS ls -r "s3://$BUCKET/"
+                  env:
+                  - name: S3_ACCESS_KEY
+                    valueFrom:
+                      secretKeyRef:
+                        name: minio-secret
+                        key: access_key_id
+                  - name: S3_SECRET_KEY
+                    valueFrom:
+                      secretKeyRef:
+                        name: minio-secret
+                        key: secret_key
+                  volumeMounts:
+                  - name: hf-cache
+                    mountPath: /cache
+                    readOnly: true
+    - wait:
+        apiVersion: batch/v1
+        kind: Job
+        name: upload-to-s3
+        timeout: 3m
+        for:
+          condition:
+            name: Complete
+            value: 'true'
+
+  - name: Create S3 model cache
+    try:
+    - apply:
+        resource:
+          apiVersion: aim.silogen.ai/v1alpha1
+          kind: AIMModelCache
+          metadata:
+            name: s3-model-cache
+          spec:
+            sourceUri: s3://test-models/
+            size: 1Gi
+            env:
+            - name: S3_ENDPOINT
+              value: "minio-service:9000"
+            - name: S3_NO_SSL
+              value: "true"
+            - name: S3_SIGNATURE_V2
+              value: "true"
+            - name: S3_ACCESS_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: minio-secret
+                  key: access_key_id
+            - name: S3_SECRET_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: minio-secret
+                  key: secret_key
+
+  - name: Wait for S3 cache to be available
+    try:
+    - assert:
+        timeout: 300s
+        resource:
+          apiVersion: aim.silogen.ai/v1alpha1
+          kind: AIMModelCache
+          metadata:
+            name: s3-model-cache
+          status:
+            status: Available
+
+  - name: Verify checksums match
+    try:
+    - apply:
+        resource:
+          apiVersion: batch/v1
+          kind: Job
+          metadata:
+            name: verify-checksums
+          spec:
+            backoffLimit: 0
+            template:
+              spec:
+                restartPolicy: Never
+                securityContext:
+                  runAsUser: 1000
+                  runAsGroup: 1000
+                  fsGroup: 1000
+                volumes:
+                - name: hf-cache
+                  persistentVolumeClaim:
+                    claimName: hf-model-cache-cache
+                - name: s3-cache
+                  persistentVolumeClaim:
+                    claimName: s3-model-cache-cache
+                containers:
+                - name: verifier
+                  image: alpine:latest
+                  command: ["/bin/sh", "-c"]
+                  args:
+                  - |
+                    set -eu
+                    echo "=== Computing checksums for HF cache ==="
+                    cd /hf-cache
+                    find . -type f -exec md5sum {} \; | sort > /tmp/hf-checksums.txt
+                    cat /tmp/hf-checksums.txt
+                    
+                    echo "=== Computing checksums for S3 cache ==="
+                    cd /s3-cache
+                    find . -type f -exec md5sum {} \; | sort > /tmp/s3-checksums.txt
+                    cat /tmp/s3-checksums.txt
+                    
+                    echo "=== Comparing checksums ==="
+                    if diff /tmp/hf-checksums.txt /tmp/s3-checksums.txt; then
+                      echo "SUCCESS: All checksums match!"
+                      exit 0
+                    else
+                      echo "FAILURE: Checksums do not match!"
+                      exit 1
+                    fi
+                  volumeMounts:
+                  - name: hf-cache
+                    mountPath: /hf-cache
+                    readOnly: true
+                  - name: s3-cache
+                    mountPath: /s3-cache
+                    readOnly: true
+    - wait:
+        apiVersion: batch/v1
+        kind: Job
+        name: verify-checksums
+        timeout: 2m
+        for:
+          condition:
+            name: Complete
+            value: 'true'
+
+  catch:
+  - command:
+      entrypoint: kaiwo-dev
+      env:
+      - name: PRINT_LEVEL
+        value: ($values.print_level)
+      - name: NAMESPACE
+        value: ($namespace)
+      args: ["debug", "chainsaw", "--namespace=$NAMESPACE", "--print-level=$PRINT_LEVEL"]
\ No newline at end of file
diff --git a/test/chainsaw/tests/aim/service/caching/cache-s3-download/runtime.yaml b/test/chainsaw/tests/aim/service/caching/cache-s3-download/runtime.yaml
new file mode 100644
index 000000000..652e0c0bf
--- /dev/null
+++ b/test/chainsaw/tests/aim/service/caching/cache-s3-download/runtime.yaml
@@ -0,0 +1,7 @@
+apiVersion: aim.silogen.ai/v1alpha1
+kind: AIMRuntimeConfig
+metadata:
+  name: default
+spec:
+  defaultStorageClassName: "rwx-nfs"
+  pvcHeadroomPercent: 10
diff --git a/test/chainsaw/tests/aim/service/caching/cache-s3-download/s3-deployment.yaml b/test/chainsaw/tests/aim/service/caching/cache-s3-download/s3-deployment.yaml
new file mode 100644
index 000000000..0d1e5244a
--- /dev/null
+++ b/test/chainsaw/tests/aim/service/caching/cache-s3-download/s3-deployment.yaml
@@ -0,0 +1,79 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: minio-secret
+data:
+  access_key_id: bWluaW8=
+  secret_key: bWluaW8xMjM=
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: minio-deployment
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: minio
+  template:
+    metadata:
+      labels:
+        app: minio
+    spec:
+      containers:
+      - name: minio
+        image: minio/minio
+        args: ["server", "/data"]
+        resources:
+          limits:
+            memory: "1Gi"
+          requests:
+            cpu: "1"
+            memory: "1Gi"
+        env:
+        - name: MINIO_ROOT_USER
+          valueFrom:
+            secretKeyRef:
+              name: minio-secret
+              key: access_key_id
+        - name: MINIO_ROOT_PASSWORD
+          valueFrom:
+            secretKeyRef:
+              name: minio-secret
+              key: secret_key
+        ports:
+        - containerPort: 9000
+        livenessProbe:
+          httpGet:
+            path: /minio/health/live
+            port: 9000
+          initialDelaySeconds: 120
+          periodSeconds: 20
+          timeoutSeconds: 5
+          failureThreshold: 3
+        readinessProbe:
+          httpGet:
+            path: /minio/health/ready
+            port: 9000
+          initialDelaySeconds: 5
+          periodSeconds: 10
+          timeoutSeconds: 1
+          failureThreshold: 3
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: minio-service
+spec:
+  selector:
+    app: minio
+  ports:
+  - protocol: TCP
+    port: 9000
+    targetPort: 9000
+    name: minio-endpoint
+  - protocol: TCP
+    port: 9001
+    targetPort: 9001
+    name: minio-console
+  type: ClusterIP
diff --git a/test/chainsaw/tests/aim/service/caching/cache-stall-detection/aimservice.yaml b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/aimservice.yaml
new file mode 100644
index 000000000..12dc8bb3c
--- /dev/null
+++ b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/aimservice.yaml
@@ -0,0 +1,13 @@
+apiVersion: aim.silogen.ai/v1alpha1
+kind: AIMService  # service fixture applied by the cache-stall-detection chainsaw test
+metadata:
+  name: test-cache-stall
+spec:
+  model:
+    image: ghcr.io/silogen/aim-dummy:0.1.4
+  cacheModel: true  # the test expects an AIMModelCache to appear as a result of this
+  template:
+    allowUnoptimized: true
+  env:
+    - name: AIM_DEBUG_CAUSE_HANG  # the "hang trigger" named in chainsaw-test.yaml; presumably stalls the download - TODO confirm
+      value: "true"
diff --git a/test/chainsaw/tests/aim/service/caching/cache-stall-detection/chainsaw-test.yaml b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/chainsaw-test.yaml
new file mode 100644
index 000000000..1e517e6e3
--- /dev/null
+++ b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/chainsaw-test.yaml
@@ -0,0 +1,41 @@
+apiVersion: chainsaw.kyverno.io/v1alpha1
+kind: Test  # applies a runtime config and a hanging AIMService, then waits for a failed download pod
+metadata:
+  name: cache-stall-detection
+spec:
+  description: Test that download stall detection kills hung downloads
+  timeouts:
+    assert: 480s  # 8 minutes - stall timeout (5min) + buffer; both asserts below set their own timeout
+  steps:
+  - name: Create runtime config
+    try:
+    - apply:
+        file: runtime.yaml  # AIMRuntimeConfig "default"
+
+  - name: Create AIMService with cacheModel and hang trigger
+    try:
+    - apply:
+        file: aimservice.yaml  # sets cacheModel: true and AIM_DEBUG_CAUSE_HANG=true
+
+  - name: Verify AIMModelCache is created and download job starts
+    try:
+    - assert:
+        timeout: 120s
+        resource:
+          apiVersion: aim.silogen.ai/v1alpha1
+          kind: AIMModelCache
+          metadata:
+            namespace: ($namespace)  # no name given: matched by kind within the test namespace
+          status:
+            status: Progressing
+
+  - name: Verify stall detection triggered (Job has at least 1 failed pod)
+    try:
+    - assert:
+        timeout: 420s  # 7 minutes: long enough to cover the 5 minute stall timeout noted above
+        resource:
+          apiVersion: batch/v1
+          kind: Job
+          metadata:
+            namespace: ($namespace)
+          (status.failed >= `1`): true  # expression check on the Job's failed-pod counter
diff --git a/test/chainsaw/tests/aim/service/caching/cache-stall-detection/runtime.yaml b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/runtime.yaml
new file mode 100644
index 000000000..c9504097e
--- /dev/null
+++ b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/runtime.yaml
@@ -0,0 +1,7 @@
+apiVersion: aim.silogen.ai/v1alpha1
+kind: AIMRuntimeConfig  # runtime settings applied as the first step of the cache-stall-detection test
+metadata:
+  name: default
+spec:
+  defaultStorageClassName: "rwx-nfs"  # NOTE(review): the target cluster must provide this storage class - confirm
+  pvcHeadroomPercent: 10

From 41913a7f6eabc6d634205fee9919cd3624404b9d Mon Sep 17 00:00:00 2001
From: bjorn-amd <bjorn.ahlstrom@amd.com>
Date: Tue, 13 Jan 2026 20:38:30 +0100
Subject: [PATCH 3/3] Add github action to build downloader image

---
 .github/workflows/build-model-downloader.yaml | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 .github/workflows/build-model-downloader.yaml

diff --git a/.github/workflows/build-model-downloader.yaml b/.github/workflows/build-model-downloader.yaml
new file mode 100644
index 000000000..d0ab9abc7
--- /dev/null
+++ b/.github/workflows/build-model-downloader.yaml
@@ -0,0 +1,60 @@
+name: build-model-downloader  # builds python/model-downloader and pushes the image to ghcr.io
+
+on:
+  push:
+    tags:
+      - "v*"
+    branches:
+      - main
+    paths:  # GitHub does not evaluate path filters for tag pushes, so every v* tag triggers a build
+      - "python/model-downloader/**"
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  packages: write  # needed to push to GitHub Container Registry with GITHUB_TOKEN
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3  # keeps the token out of shell script text
+        with: {registry: ghcr.io, username: "${{ github.actor }}", password: "${{ secrets.GITHUB_TOKEN }}"}
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ghcr.io/${{ github.repository_owner }}/kaiwo/model-downloader  # NOTE(review): no rule below emits the 0.1 tag used by DefaultDownloadImage - confirm how it is published
+          tags: |
+            type=ref,event=branch
+            type=ref,event=tag
+            type=raw,value=latest,enable=${{ startsWith(github.ref, 'refs/tags/v') }}
+            type=sha,prefix=
+
+      - name: Build and push image
+        uses: docker/build-push-action@v6
+        with:
+          context: python/model-downloader
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+      - name: Summary
+        run: |
+          echo "### Model Downloader Image Published" >> $GITHUB_STEP_SUMMARY
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "**Tags pushed:**" >> $GITHUB_STEP_SUMMARY
+          echo '```' >> $GITHUB_STEP_SUMMARY
+          echo "${{ steps.meta.outputs.tags }}" >> $GITHUB_STEP_SUMMARY
+          echo '```' >> $GITHUB_STEP_SUMMARY