diff --git a/.github/workflows/build-model-downloader.yaml b/.github/workflows/build-model-downloader.yaml
new file mode 100644
index 000000000..d0ab9abc7
--- /dev/null
+++ b/.github/workflows/build-model-downloader.yaml
@@ -0,0 +1,72 @@
+# Builds the model-downloader image and publishes it to GitHub Container Registry.
+name: build-model-downloader
+
+on:
+  push:
+    tags:
+      - "v*"
+    branches:
+      - main
+    paths:
+      - "python/model-downloader/**"
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  packages: write  # the job pushes to ghcr.io using GITHUB_TOKEN
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to GitHub Container Registry
+        # Hand the token over through the environment instead of expanding it
+        # into the script text.
+        env:
+          GHCR_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          echo "$GHCR_TOKEN" | docker login ghcr.io -u "$GITHUB_ACTOR" --password-stdin
+
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ghcr.io/${{ github.repository_owner }}/kaiwo/model-downloader
+          # Branch name, git tag, `latest` (v* tags only) and the commit SHA.
+          tags: |
+            type=ref,event=branch
+            type=ref,event=tag
+            type=raw,value=latest,enable=${{ startsWith(github.ref, 'refs/tags/v') }}
+            type=sha,prefix=
+
+      - name: Build and push image
+        uses: docker/build-push-action@v6
+        with:
+          context: python/model-downloader
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+      - name: Summary
+        # The tag list is multi-line action output; passing it via env keeps it
+        # from ever being interpreted as shell code.
+        env:
+          IMAGE_TAGS: ${{ steps.meta.outputs.tags }}
+        run: |
+          {
+            echo "### Model Downloader Image Published"
+            echo ""
+            echo "**Tags pushed:**"
+            echo '```'
+            echo "$IMAGE_TAGS"
+            echo '```'
+          } >> "$GITHUB_STEP_SUMMARY"
diff --git a/apis/aim/v1alpha1/aimmodelcache_types.go b/apis/aim/v1alpha1/aimmodelcache_types.go
index 584fa8e1f..aa00c4837 100644
--- a/apis/aim/v1alpha1/aimmodelcache_types.go
+++ b/apis/aim/v1alpha1/aimmodelcache_types.go
@@ -29,7 +29,7 @@ import (
)
const (
- DefaultDownloadImage = "kserve/storage-initializer:v0.16.0-rc0"
+ DefaultDownloadImage = "ghcr.io/silogen/kaiwo/model-downloader:0.1"
)
// AIMResolvedModelCache contains reference info and status for a cached model.
@@ -71,7 +71,7 @@ type AIMModelCacheSpec struct {
Env []corev1.EnvVar `json:"env,omitempty"`
// ModelDownloadImage is the image used to download the model
- // +kubebuilder:default="kserve/storage-initializer:v0.16.0"
+ // +kubebuilder:default="ghcr.io/silogen/kaiwo/model-downloader:0.1"
ModelDownloadImage string `json:"modelDownloadImage"`
// ImagePullSecrets references secrets for pulling AIM container images.
diff --git a/config/crd/bases/aim.silogen.ai_aimmodelcaches.yaml b/config/crd/bases/aim.silogen.ai_aimmodelcaches.yaml
index e20a34262..e2397e964 100644
--- a/config/crd/bases/aim.silogen.ai_aimmodelcaches.yaml
+++ b/config/crd/bases/aim.silogen.ai_aimmodelcaches.yaml
@@ -201,7 +201,7 @@ spec:
x-kubernetes-map-type: atomic
type: array
modelDownloadImage:
- default: kserve/storage-initializer:v0.16.0
+ default: ghcr.io/silogen/kaiwo/model-downloader:0.1
description: ModelDownloadImage is the image used to download the
model
type: string
diff --git a/crd/crds.yaml b/crd/crds.yaml
index 63ca2be11..64916cb8c 100644
--- a/crd/crds.yaml
+++ b/crd/crds.yaml
@@ -2349,7 +2349,7 @@ spec:
x-kubernetes-map-type: atomic
type: array
modelDownloadImage:
- default: kserve/storage-initializer:v0.16.0
+ default: ghcr.io/silogen/kaiwo/model-downloader:0.1
description: ModelDownloadImage is the image used to download the
model
type: string
diff --git a/docs/docs/reference/crds/aim.silogen.ai.md b/docs/docs/reference/crds/aim.silogen.ai.md
index 9ce5d5d4c..f6b9bbb06 100644
--- a/docs/docs/reference/crds/aim.silogen.ai.md
+++ b/docs/docs/reference/crds/aim.silogen.ai.md
@@ -525,7 +525,7 @@ _Appears in:_
| `storageClassName` _string_ | StorageClassName specifies the storage class for the cache volume | | |
| `size` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#quantity-resource-api)_ | Size specifies the size of the cache volume | | |
| `env` _[EnvVar](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#envvar-v1-core) array_ | Env lists the environment variables to use for authentication when downloading models.
These variables are used for authentication with model registries (e.g., HuggingFace tokens). | | |
-| `modelDownloadImage` _string_ | ModelDownloadImage is the image used to download the model | kserve/storage-initializer:v0.16.0 | |
+| `modelDownloadImage` _string_ | ModelDownloadImage is the image used to download the model | ghcr.io/silogen/kaiwo/model-downloader:0.1 | |
| `imagePullSecrets` _[LocalObjectReference](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#localobjectreference-v1-core) array_ | ImagePullSecrets references secrets for pulling AIM container images. | | |
| `runtimeConfigName` _string_ | RuntimeConfigName references the AIM runtime configuration (by name) to use for this model cache.
This determines PVC headroom and other runtime settings. | default | |
diff --git a/internal/controller/aim/aimmodelcache_controller.go b/internal/controller/aim/aimmodelcache_controller.go
index 1fc63bbd0..c0d03ad07 100644
--- a/internal/controller/aim/aimmodelcache_controller.go
+++ b/internal/controller/aim/aimmodelcache_controller.go
@@ -553,18 +553,25 @@ func (r *AIMModelCacheReconciler) buildDownloadJob(mc *aimv1alpha1.AIMModelCache
if len(mc.Spec.ModelDownloadImage) > 0 {
downloadImage = mc.Spec.ModelDownloadImage
}
+
+ // Expected size in bytes for progress calculation
+ expectedSizeBytes := mc.Spec.Size.Value()
+
// Merge env vars with precedence: mc.Spec.Env > runtimeConfigSpec.Env > defaults
newEnv := helpers.MergeEnvVars([]corev1.EnvVar{
{Name: "HF_XET_CHUNK_CACHE_SIZE_BYTES", Value: "0"},
{Name: "HF_XET_SHARD_CACHE_SIZE_BYTES", Value: "0"},
{Name: "HF_XET_HIGH_PERFORMANCE", Value: "1"},
- {Name: "HF_HOME", Value: mountPath + "/.hf"},
+ {Name: "HF_HOME", Value: "/tmp/.hf"},
{Name: "UMASK", Value: "0022"},
+ {Name: "EXPECTED_SIZE_BYTES", Value: fmt.Sprintf("%d", expectedSizeBytes)},
+ {Name: "MOUNT_PATH", Value: mountPath},
+ {Name: "CACHE_NAME", Value: mc.Name},
+ {Name: "CACHE_NAMESPACE", Value: mc.Namespace},
+ {Name: "STALL_TIMEOUT", Value: "120"},
+ {Name: "TARGET_DIR", Value: mountPath},
}, helpers.MergeEnvVars(runtimeConfigSpec.Env, mc.Spec.Env))
- // Expected size in bytes for progress calculation
- expectedSizeBytes := mc.Spec.Size.Value()
-
return &batchv1.Job{
TypeMeta: metav1.TypeMeta{
APIVersion: batchv1.SchemeGroupVersion.String(),
@@ -598,54 +605,6 @@ func (r *AIMModelCacheReconciler) buildDownloadJob(mc *aimv1alpha1.AIMModelCache
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ClaimName: pvcName},
},
},
- {
- Name: "tmp",
- VolumeSource: corev1.VolumeSource{
- EmptyDir: &corev1.EmptyDirVolumeSource{
- SizeLimit: baseutils.Pointer(resource.MustParse("500Mi")), // Small temp space for system operations
- },
- },
- },
- },
- // Native sidecar (Kubernetes 1.28+): init container with restartPolicy=Always
- // runs alongside main containers and is automatically terminated by kubelet
- // when all regular containers complete (success or failure)
- InitContainers: []corev1.Container{
- {
- Name: "progress-monitor",
- Image: "busybox:1.36",
- ImagePullPolicy: corev1.PullIfNotPresent,
- // restartPolicy: Always makes this a native sidecar that runs alongside main containers
- // Kubernetes automatically sends SIGTERM when all regular containers terminate
- RestartPolicy: baseutils.Pointer(corev1.ContainerRestartPolicyAlways),
- SecurityContext: &corev1.SecurityContext{
- RunAsUser: baseutils.Pointer(int64(1000)),
- RunAsGroup: baseutils.Pointer(int64(1000)),
- },
- Env: []corev1.EnvVar{
- {Name: "EXPECTED_SIZE_BYTES", Value: fmt.Sprintf("%d", expectedSizeBytes)},
- {Name: "MOUNT_PATH", Value: mountPath},
- },
- Command: []string{"/bin/sh"},
- Args: []string{
- "-c",
- progressMonitorScript,
- },
- VolumeMounts: []corev1.VolumeMount{
- {Name: "cache", MountPath: mountPath, ReadOnly: true},
- },
- // Minimal resources for the monitor
- Resources: corev1.ResourceRequirements{
- Requests: corev1.ResourceList{
- corev1.ResourceCPU: resource.MustParse("10m"),
- corev1.ResourceMemory: resource.MustParse("16Mi"),
- },
- Limits: corev1.ResourceList{
- corev1.ResourceCPU: resource.MustParse("50m"),
- corev1.ResourceMemory: resource.MustParse("32Mi"),
- },
- },
- },
},
Containers: []corev1.Container{
{
@@ -656,43 +615,10 @@ func (r *AIMModelCacheReconciler) buildDownloadJob(mc *aimv1alpha1.AIMModelCache
RunAsUser: baseutils.Pointer(int64(1000)),
RunAsGroup: baseutils.Pointer(int64(1000)),
},
- Env: newEnv,
- Command: []string{"/bin/sh"},
- Args: []string{
- "-c",
- fmt.Sprintf(`
-# Bail out if this AIM_DEBUG_CAUSE_FAILURE is set
-if [ -n "$AIM_DEBUG_CAUSE_FAILURE" ]; then
- echo "AIM_DEBUG_CAUSE_FAILURE is set, bailing out"
- exit 1
-fi
-# Set umask so downloaded files are readable by others
-umask 0022
-
-# Create temp directories on the same filesystem as destination
-mkdir -p %s/.tmp %s/.hf_home %s/.hf_cache %s/.xet_cache
-
-# Download the model
-python /storage-initializer/scripts/initializer-entrypoint %s %s &&
-(
-# Report sizes before cleanup
-echo "Storage usage before cleanup:"
-du -sh %s
-du -sh %s/.cache 2>/dev/null || true
-
-# Clean up HF cache directories to save space (keeps only final model files)
-echo "Cleaning up HF cache to save space..."
-rm -rf %s/.cache %s/.tmp %s/.hf_home %s/.hf_cache %s/.xet_cache 2>/dev/null || true
-
-# Report final sizes
-echo "Final storage usage:"
-du -sh %s || true
-)
- `, mountPath, mountPath, mountPath, mountPath, mc.Spec.SourceURI, mountPath, mountPath, mountPath, mountPath, mountPath, mountPath, mountPath, mountPath, mountPath),
- },
+ Env: newEnv,
+ Args: []string{mc.Spec.SourceURI},
VolumeMounts: []corev1.VolumeMount{
{Name: "cache", MountPath: mountPath},
- {Name: "tmp", MountPath: "/tmp"},
},
},
},
@@ -702,88 +628,6 @@ du -sh %s || true
}
}
-// progressMonitorScript is the shell script for the download progress monitor sidecar.
-// It reports download progress every 10 seconds in JSON format.
-//
-// This runs as a native sidecar (init container with restartPolicy=Always).
-// Kubernetes automatically sends SIGTERM when all regular containers terminate,
-// so we just need to handle the signal gracefully.
-//
-// JSON output types:
-// - "start": Initial message when monitor starts
-// - "progress": Periodic progress update
-// - "complete": Download finished successfully (detected via marker file)
-// - "terminated": Received SIGTERM from kubelet (main container finished)
-const progressMonitorScript = `
-# Handle SIGTERM gracefully - kubelet sends this when main container terminates
-terminated=false
-trap 'terminated=true' TERM
-
-# Output a JSON log message
-# Usage: log_json [key=value ...]
-log_json() {
- type=$1
- shift
- timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
- json="{\"timestamp\":\"$timestamp\",\"type\":\"$type\""
- for kv in "$@"; do
- key="${kv%%=*}"
- value="${kv#*=}"
- # Check if value is numeric
- case "$value" in
- ''|*[!0-9]*) json="$json,\"$key\":\"$value\"" ;; # string
- *) json="$json,\"$key\":$value" ;; # number
- esac
- done
- echo "$json}"
-}
-
-expected_size=${EXPECTED_SIZE_BYTES:-0}
-mount_path=${MOUNT_PATH:-/cache}
-interval=10
-
-log_json "start" "expectedBytes=$expected_size" "intervalSeconds=$interval"
-
-while true; do
- # Check if we received SIGTERM (main container terminated)
- if [ "$terminated" = "true" ]; then
- current_size=$(du -sb "$mount_path" 2>/dev/null | cut -f1 || echo 0)
- log_json "terminated" "currentBytes=$current_size" "expectedBytes=$expected_size" "message=Main container terminated"
- exit 0
- fi
-
- # Check if download completed successfully (marker file from main container)
- if [ -f "$mount_path/.download-complete" ]; then
- current_size=$(du -sb "$mount_path" 2>/dev/null | cut -f1 || echo 0)
- log_json "complete" "currentBytes=$current_size" "expectedBytes=$expected_size"
- exit 0
- fi
-
- current_size=$(du -sb "$mount_path" 2>/dev/null | cut -f1 || echo 0)
-
- if [ "$expected_size" -gt 0 ] && [ "$current_size" -gt 0 ]; then
- percent=$((current_size * 100 / expected_size))
- # Cap at 100% (during download, temp files may exceed expected size)
- if [ $percent -gt 100 ]; then
- percent=100
- fi
- log_json "progress" "percent=$percent" "currentBytes=$current_size" "expectedBytes=$expected_size"
- elif [ "$current_size" -gt 0 ]; then
- log_json "progress" "currentBytes=$current_size" "expectedBytes=0" "message=Expected size unknown"
- else
- log_json "progress" "currentBytes=0" "expectedBytes=$expected_size" "message=Waiting for download to start"
- fi
-
- # Use a loop with short sleeps so we can check for SIGTERM more frequently
- # sleep in busybox doesn't get interrupted by signals, so we poll
- i=0
- while [ $i -lt $interval ] && [ "$terminated" = "false" ]; do
- sleep 1
- i=$((i + 1))
- done
-done
-`
-
func (r *AIMModelCacheReconciler) pvcName(mc *aimv1alpha1.AIMModelCache) string {
return baseutils.FormatNameWithPostfix(mc.Name, "cache")
}
diff --git a/python/model-downloader/Dockerfile b/python/model-downloader/Dockerfile
new file mode 100644
index 000000000..29499cb23
--- /dev/null
+++ b/python/model-downloader/Dockerfile
@@ -0,0 +1,19 @@
+# Image that runs entrypoint.sh to download a model into /cache.
+FROM python:3.14-alpine
+
+# s3cmd: S3 sync in the entrypoint; procps: pkill used by the progress
+# monitor; kubectl: status patching from the progress monitor.
+RUN apk add --no-cache s3cmd procps kubectl \
+    && pip install --no-cache-dir -U huggingface_hub
+
+# Ship both scripts at the filesystem root and make them executable.
+COPY entrypoint.sh progress_monitor.sh /
+RUN chmod +x /entrypoint.sh /progress_monitor.sh
+
+# Default download target, owned by the unprivileged runtime user.
+RUN mkdir /cache && chown 1000:1000 /cache
+
+WORKDIR /cache
+USER 1000
+
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/python/model-downloader/entrypoint.sh b/python/model-downloader/entrypoint.sh
new file mode 100644
index 000000000..66217ede6
--- /dev/null
+++ b/python/model-downloader/entrypoint.sh
@@ -0,0 +1,101 @@
+#!/bin/sh
+set -eu
+
+URL="${1:?Usage: $0 }"
+TARGET_DIR="${TARGET_DIR:-/cache}"
+
+
+# Fetch expected size if not already set
+if [ -z "${EXPECTED_SIZE_BYTES:-}" ]; then
+ case "$URL" in
+ hf://*)
+ # Fetch expected size if not set
+ if [ -z "${EXPECTED_SIZE_BYTES:-}" ]; then
+ echo "Fetching model size from Hugging Face..."
+ MODEL_PATH="${URL#hf://}"
+ EXPECTED_SIZE_BYTES=$(python -c "
+ from huggingface_hub import HfApi
+ info = HfApi().model_info('$MODEL_PATH', files_metadata=True)
+ print(sum(f.size or 0 for f in info.siblings))
+ " 2>/dev/null || echo 0)
+ fi
+ ;;
+ s3://*)
+ # Get size from S3 (s3cmd du returns human-readable, need bytes)
+ EXPECTED_SIZE_BYTES=$(s3cmd du "$URL" 2>/dev/null | awk '{print $1}' || echo 0)
+ ;;
+ esac
+ export EXPECTED_SIZE_BYTES
+fi
+
+echo "Expected size: $EXPECTED_SIZE_BYTES bytes"
+
+# Start progress monitor in background
+if [ -f /progress_monitor.sh ]; then
+ /progress_monitor.sh &
+ echo "Started progress monitor (PID: $!)"
+fi
+
+### TESTING WHEN ENV VARS ARE SET ###
+if [ -n "${AIM_DEBUG_CAUSE_HANG:-}" ]; then
+ echo "AIM_DEBUG_CAUSE_HANG is set, causing hang"
+ python -c "import time; time.sleep(1000000)"
+ exit 1
+fi
+
+if [ -n "${AIM_DEBUG_CAUSE_FAILURE:-}" ]; then
+ echo "AIM_DEBUG_CAUSE_FAILURE is set, causing failure"
+ exit 1
+fi
+### END TESTING ###
+
+case "$URL" in
+ hf://*)
+ export HF_HOME="$TARGET_DIR/.hf"
+ mkdir -p "$HF_HOME"
+
+ MODEL_PATH="${URL#hf://}"
+ echo "Downloading from Hugging Face: $MODEL_PATH to $TARGET_DIR"
+ hf download \
+ --local-dir "$TARGET_DIR" \
+ "$MODEL_PATH"
+ echo "Verifying download..."
+ hf cache verify \
+ --local-dir "$TARGET_DIR" \
+ --fail-on-missing-files \
+ "$MODEL_PATH"
+ echo "Download complete and verified"
+ echo "Size of HF_HOME: $(du -sh "$HF_HOME")"
+ rm -rf "$HF_HOME"
+ ;;
+ s3://*)
+ echo "Syncing from S3: $URL to $TARGET_DIR"
+
+ S3CMD_ARGS=""
+
+ if [ -n "${S3_ACCESS_KEY:-}" ]; then
+ S3CMD_ARGS="$S3CMD_ARGS --access_key=$S3_ACCESS_KEY"
+ fi
+ if [ -n "${S3_SECRET_KEY:-}" ]; then
+ S3CMD_ARGS="$S3CMD_ARGS --secret_key=$S3_SECRET_KEY"
+ fi
+ if [ -n "${S3_ENDPOINT:-}" ]; then
+ S3CMD_ARGS="$S3CMD_ARGS --host=$S3_ENDPOINT"
+ S3CMD_ARGS="$S3CMD_ARGS --host-bucket=$S3_ENDPOINT/%(bucket)s"
+ fi
+ if [ "${S3_NO_SSL:-}" = "true" ]; then
+ S3CMD_ARGS="$S3CMD_ARGS --no-ssl"
+ fi
+ if [ "${S3_SIGNATURE_V2:-}" = "true" ]; then
+ S3CMD_ARGS="$S3CMD_ARGS --signature-v2"
+ fi
+
+ # shellcheck disable=SC2086
+ s3cmd $S3CMD_ARGS sync --stop-on-error "$URL" "$TARGET_DIR/"
+ echo "Sync complete"
+ ;;
+ *)
+ echo "Error: Unknown protocol. URL must start with hf:// or s3://" >&2
+ exit 1
+ ;;
+esac
\ No newline at end of file
diff --git a/python/model-downloader/progress_monitor.sh b/python/model-downloader/progress_monitor.sh
new file mode 100644
index 000000000..a530cf975
--- /dev/null
+++ b/python/model-downloader/progress_monitor.sh
@@ -0,0 +1,107 @@
+#!/bin/sh
+# Progress monitor for model downloads
+# Outputs JSON progress logs and kills stalled downloads
+
+# Handle SIGTERM gracefully - kubelet sends this when main container terminates
+terminated=false
+trap 'terminated=true' TERM
+
+# Output a JSON log message
+# Usage: log_json [key=value ...]
+log_json() {
+ type=$1
+ shift
+ timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+ json="{\"timestamp\":\"$timestamp\",\"type\":\"$type\""
+ for kv in "$@"; do
+ key="${kv%%=*}"
+ value="${kv#*=}"
+ # Check if value is numeric
+ case "$value" in
+ ''|*[!0-9]*) json="$json,\"$key\":\"$value\"" ;; # string
+ *) json="$json,\"$key\":$value" ;; # number
+ esac
+ done
+ echo "$json}" >&2
+}
+
+SA_TOKEN="/var/run/secrets/kubernetes.io/serviceaccount/token"
+can_update_status=false
+if [ -f "$SA_TOKEN" ] && [ -n "${CACHE_NAME:-}" ] && [ -n "${CACHE_NAMESPACE:-}" ]; then
+ can_update_status=true
+fi
+
+update_status() {
+ if [ "$can_update_status" = "true" ]; then
+ percent=$1
+ kubectl patch aimmodelcache "$CACHE_NAME" -n "$CACHE_NAMESPACE" \
+ --type=merge --subresource=status \
+ -p "{\"status\":{\"downloadProgress\":$percent}}" 2>/dev/null || true
+ fi
+}
+
+# Kill the download process (huggingface-cli or s3cmd)
+kill_downloader() {
+ # Kill python processes
+ pkill -9 -f "python" 2>/dev/null || true
+ # Kill s3cmd
+ pkill -9 -f "s3cmd" 2>/dev/null || true
+
+}
+
+expected_size=${EXPECTED_SIZE_BYTES:-0}
+mount_path=${MOUNT_PATH:-/cache}
+log_interval=${PROGRESS_INTERVAL:-5} # 5 seconds default
+stall_timeout=${STALL_TIMEOUT:-60} # 1 minutes default
+
+log_json "start" "expectedBytes=$expected_size" "intervalSeconds=$log_interval" "stallTimeoutSeconds=$stall_timeout"
+
+last_size=0
+last_change_time=$(date +%s)
+
+while true; do
+ # Check if we received SIGTERM (main container terminated)
+ if [ "$terminated" = "true" ]; then
+ current_size=$(du -sb "$mount_path" 2>/dev/null | cut -f1 || echo 0)
+ log_json "terminated" "currentBytes=$current_size" "expectedBytes=$expected_size"
+ exit 0
+ fi
+
+ current_size=$(du -sb "$mount_path" 2>/dev/null | cut -f1 || echo 0)
+ now=$(date +%s)
+
+ # Track progress for stall detection
+ if [ "$current_size" -gt "$last_size" ]; then
+ last_size=$current_size
+ last_change_time=$now
+ fi
+
+ # Check for stall (no progress for stall_timeout seconds)
+ stall_duration=$((now - last_change_time))
+ if [ "$stall_duration" -ge "$stall_timeout" ]; then
+ log_json "stall" "currentBytes=$current_size" "stallDurationSeconds=$stall_duration"
+ kill_downloader
+ exit 1
+ fi
+
+ if [ "$expected_size" -gt 0 ] && [ "$current_size" -gt 0 ]; then
+ percent=$((current_size * 100 / expected_size))
+ # Cap at 100% (during download, temp files may exceed expected size)
+ [ "$percent" -gt 100 ] && percent=100
+ log_json "progress" "percent=$percent" "currentBytes=$current_size" "expectedBytes=$expected_size"
+ elif [ "$current_size" -gt 0 ]; then
+ log_json "progress" "currentBytes=$current_size" "message=Expected size unknown"
+ else
+ log_json "waiting" "message=Waiting for download to start"
+ fi
+
+ update_status "$percent"
+
+ # Use a loop with short sleeps so we can check for SIGTERM more frequently
+ i=0
+ while [ $i -lt $log_interval ] && [ "$terminated" = "false" ]; do
+ sleep 1
+ i=$((i + 1))
+ done
+done
+
diff --git a/test/chainsaw/tests/aim/service/caching/cache-s3-download/chainsaw-test.yaml b/test/chainsaw/tests/aim/service/caching/cache-s3-download/chainsaw-test.yaml
new file mode 100644
index 000000000..f3240c5cf
--- /dev/null
+++ b/test/chainsaw/tests/aim/service/caching/cache-s3-download/chainsaw-test.yaml
@@ -0,0 +1,232 @@
+apiVersion: chainsaw.kyverno.io/v1alpha1
+kind: Test
+metadata:
+ name: cache-s3-download
+spec:
+ description: Test S3 model caching by comparing HF download with S3 round-trip
+ concurrent: false
+ timeouts:
+ assert: 600s
+ steps:
+ - name: Setup infrastructure
+ try:
+ - apply:
+ file: runtime.yaml
+ - apply:
+ file: s3-deployment.yaml
+ - wait:
+ apiVersion: apps/v1
+ kind: Deployment
+ name: minio-deployment
+ timeout: 2m
+ for:
+ condition:
+ name: Available
+ value: 'true'
+
+ - name: Create HuggingFace model cache
+ try:
+ - apply:
+ resource:
+ apiVersion: aim.silogen.ai/v1alpha1
+ kind: AIMModelCache
+ metadata:
+ name: hf-model-cache
+ spec:
+ sourceUri: hf://HuggingFaceTB/SmolLM2-135M
+ size: 1Gi
+
+ - name: Wait for HF cache to be available
+ try:
+ - assert:
+ timeout: 300s
+ resource:
+ apiVersion: aim.silogen.ai/v1alpha1
+ kind: AIMModelCache
+ metadata:
+ name: hf-model-cache
+ status:
+ status: Available
+
+ - name: Upload cached model to MinIO
+ try:
+ - apply:
+ resource:
+ apiVersion: batch/v1
+ kind: Job
+ metadata:
+ name: upload-to-s3
+ spec:
+ backoffLimit: 2
+ template:
+ spec:
+ restartPolicy: Never
+ securityContext:
+ runAsUser: 1000
+ runAsGroup: 1000
+ fsGroup: 1000
+ volumes:
+ - name: hf-cache
+ persistentVolumeClaim:
+ claimName: hf-model-cache-cache
+ containers:
+ - name: uploader
+ image: ghcr.io/silogen/kaiwo/model-downloader:0.1
+ command: ["/bin/sh", "-c"]
+ args:
+ - |
+ set -eu
+ BUCKET="test-models"
+ S3_HOST="minio-service:9000"
+
+ S3_OPTS="--host=$S3_HOST --host-bucket=$S3_HOST/%(bucket)s --access_key=$S3_ACCESS_KEY --secret_key=$S3_SECRET_KEY --no-ssl --signature-v2"
+
+ echo "=== Creating bucket ==="
+ s3cmd $S3_OPTS mb "s3://$BUCKET" || true
+
+ echo "=== Uploading from HF cache to S3 ==="
+ s3cmd $S3_OPTS --recursive put /cache/ "s3://$BUCKET/"
+
+ echo "=== Verifying upload ==="
+ s3cmd $S3_OPTS ls -r "s3://$BUCKET/"
+ env:
+ - name: S3_ACCESS_KEY
+ valueFrom:
+ secretKeyRef:
+ name: minio-secret
+ key: access_key_id
+ - name: S3_SECRET_KEY
+ valueFrom:
+ secretKeyRef:
+ name: minio-secret
+ key: secret_key
+ volumeMounts:
+ - name: hf-cache
+ mountPath: /cache
+ readOnly: true
+ - wait:
+ apiVersion: batch/v1
+ kind: Job
+ name: upload-to-s3
+ timeout: 3m
+ for:
+ condition:
+ name: Complete
+ value: 'true'
+
+ - name: Create S3 model cache
+ try:
+ - apply:
+ resource:
+ apiVersion: aim.silogen.ai/v1alpha1
+ kind: AIMModelCache
+ metadata:
+ name: s3-model-cache
+ spec:
+ sourceUri: s3://test-models/
+ size: 1Gi
+ env:
+ - name: S3_ENDPOINT
+ value: "minio-service:9000"
+ - name: S3_NO_SSL
+ value: "true"
+ - name: S3_SIGNATURE_V2
+ value: "true"
+ - name: S3_ACCESS_KEY
+ valueFrom:
+ secretKeyRef:
+ name: minio-secret
+ key: access_key_id
+ - name: S3_SECRET_KEY
+ valueFrom:
+ secretKeyRef:
+ name: minio-secret
+ key: secret_key
+
+ - name: Wait for S3 cache to be available
+ try:
+ - assert:
+ timeout: 300s
+ resource:
+ apiVersion: aim.silogen.ai/v1alpha1
+ kind: AIMModelCache
+ metadata:
+ name: s3-model-cache
+ status:
+ status: Available
+
+ - name: Verify checksums match
+ try:
+ - apply:
+ resource:
+ apiVersion: batch/v1
+ kind: Job
+ metadata:
+ name: verify-checksums
+ spec:
+ backoffLimit: 0
+ template:
+ spec:
+ restartPolicy: Never
+ securityContext:
+ runAsUser: 1000
+ runAsGroup: 1000
+ fsGroup: 1000
+ volumes:
+ - name: hf-cache
+ persistentVolumeClaim:
+ claimName: hf-model-cache-cache
+ - name: s3-cache
+ persistentVolumeClaim:
+ claimName: s3-model-cache-cache
+ containers:
+ - name: verifier
+ image: alpine:latest
+ command: ["/bin/sh", "-c"]
+ args:
+ - |
+ set -eu
+ echo "=== Computing checksums for HF cache ==="
+ cd /hf-cache
+ find . -type f -exec md5sum {} \; | sort > /tmp/hf-checksums.txt
+ cat /tmp/hf-checksums.txt
+
+ echo "=== Computing checksums for S3 cache ==="
+ cd /s3-cache
+ find . -type f -exec md5sum {} \; | sort > /tmp/s3-checksums.txt
+ cat /tmp/s3-checksums.txt
+
+ echo "=== Comparing checksums ==="
+ if diff /tmp/hf-checksums.txt /tmp/s3-checksums.txt; then
+ echo "SUCCESS: All checksums match!"
+ exit 0
+ else
+ echo "FAILURE: Checksums do not match!"
+ exit 1
+ fi
+ volumeMounts:
+ - name: hf-cache
+ mountPath: /hf-cache
+ readOnly: true
+ - name: s3-cache
+ mountPath: /s3-cache
+ readOnly: true
+ - wait:
+ apiVersion: batch/v1
+ kind: Job
+ name: verify-checksums
+ timeout: 2m
+ for:
+ condition:
+ name: Complete
+ value: 'true'
+
+ catch:
+ - command:
+ entrypoint: kaiwo-dev
+ env:
+ - name: PRINT_LEVEL
+ value: ($values.print_level)
+ - name: NAMESPACE
+ value: ($namespace)
+ args: ["debug", "chainsaw", "--namespace=$NAMESPACE", "--print-level=$PRINT_LEVEL"]
\ No newline at end of file
diff --git a/test/chainsaw/tests/aim/service/caching/cache-s3-download/runtime.yaml b/test/chainsaw/tests/aim/service/caching/cache-s3-download/runtime.yaml
new file mode 100644
index 000000000..652e0c0bf
--- /dev/null
+++ b/test/chainsaw/tests/aim/service/caching/cache-s3-download/runtime.yaml
@@ -0,0 +1,7 @@
+apiVersion: aim.silogen.ai/v1alpha1
+kind: AIMRuntimeConfig
+metadata:
+ name: default
+spec:
+ defaultStorageClassName: "rwx-nfs"
+ pvcHeadroomPercent: 10
diff --git a/test/chainsaw/tests/aim/service/caching/cache-s3-download/s3-deployment.yaml b/test/chainsaw/tests/aim/service/caching/cache-s3-download/s3-deployment.yaml
new file mode 100644
index 000000000..0d1e5244a
--- /dev/null
+++ b/test/chainsaw/tests/aim/service/caching/cache-s3-download/s3-deployment.yaml
@@ -0,0 +1,79 @@
+apiVersion: v1
+kind: Secret
+metadata:
+ name: minio-secret
+data:
+ access_key_id: bWluaW8=
+ secret_key: bWluaW8xMjM=
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: minio-deployment
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: minio
+ template:
+ metadata:
+ labels:
+ app: minio
+ spec:
+ containers:
+ - name: minio
+ image: minio/minio
+ args: ["server", "/data"]
+ resources:
+ limits:
+ memory: "1Gi"
+ requests:
+ cpu: "1"
+ memory: "1Gi"
+ env:
+ - name: MINIO_ROOT_USER
+ valueFrom:
+ secretKeyRef:
+ name: minio-secret
+ key: access_key_id
+ - name: MINIO_ROOT_PASSWORD
+ valueFrom:
+ secretKeyRef:
+ name: minio-secret
+ key: secret_key
+ ports:
+ - containerPort: 9000
+ livenessProbe:
+ httpGet:
+ path: /minio/health/live
+ port: 9000
+ initialDelaySeconds: 120
+ periodSeconds: 20
+ timeoutSeconds: 5
+ failureThreshold: 3
+ readinessProbe:
+ httpGet:
+ path: /minio/health/ready
+ port: 9000
+ initialDelaySeconds: 5
+ periodSeconds: 10
+ timeoutSeconds: 1
+ failureThreshold: 3
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: minio-service
+spec:
+ selector:
+ app: minio
+ ports:
+ - protocol: TCP
+ port: 9000
+ targetPort: 9000
+ name: minio-endpoint
+ - protocol: TCP
+ port: 9001
+ targetPort: 9001
+ name: minio-console
+ type: ClusterIP
diff --git a/test/chainsaw/tests/aim/service/caching/cache-stall-detection/aimservice.yaml b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/aimservice.yaml
new file mode 100644
index 000000000..12dc8bb3c
--- /dev/null
+++ b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/aimservice.yaml
@@ -0,0 +1,13 @@
+apiVersion: aim.silogen.ai/v1alpha1
+kind: AIMService
+metadata:
+ name: test-cache-stall
+spec:
+ model:
+ image: ghcr.io/silogen/aim-dummy:0.1.4
+ cacheModel: true
+ template:
+ allowUnoptimized: true
+ env:
+ - name: AIM_DEBUG_CAUSE_HANG
+ value: "true"
\ No newline at end of file
diff --git a/test/chainsaw/tests/aim/service/caching/cache-stall-detection/chainsaw-test.yaml b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/chainsaw-test.yaml
new file mode 100644
index 000000000..1e517e6e3
--- /dev/null
+++ b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/chainsaw-test.yaml
@@ -0,0 +1,41 @@
+apiVersion: chainsaw.kyverno.io/v1alpha1
+kind: Test
+metadata:
+ name: cache-stall-detection
+spec:
+ description: Test that download stall detection kills hung downloads
+ timeouts:
+ assert: 480s # 8 minutes - stall timeout (STALL_TIMEOUT=120s set by the controller) + buffer
+ steps:
+ - name: Create runtime config
+ try:
+ - apply:
+ file: runtime.yaml
+
+ - name: Create AIMService with cacheModel and hang trigger
+ try:
+ - apply:
+ file: aimservice.yaml
+
+ - name: Verify AIMModelCache is created and download job starts
+ try:
+ - assert:
+ timeout: 120s
+ resource:
+ apiVersion: aim.silogen.ai/v1alpha1
+ kind: AIMModelCache
+ metadata:
+ namespace: ($namespace)
+ status:
+ status: Progressing
+
+ - name: Verify stall detection triggered (Job has at least 1 failed pod)
+ try:
+ - assert:
+ timeout: 420s
+ resource:
+ apiVersion: batch/v1
+ kind: Job
+ metadata:
+ namespace: ($namespace)
+ (status.failed >= `1`): true
\ No newline at end of file
diff --git a/test/chainsaw/tests/aim/service/caching/cache-stall-detection/runtime.yaml b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/runtime.yaml
new file mode 100644
index 000000000..c9504097e
--- /dev/null
+++ b/test/chainsaw/tests/aim/service/caching/cache-stall-detection/runtime.yaml
@@ -0,0 +1,7 @@
+apiVersion: aim.silogen.ai/v1alpha1
+kind: AIMRuntimeConfig
+metadata:
+ name: default
+spec:
+ defaultStorageClassName: "rwx-nfs"
+ pvcHeadroomPercent: 10
\ No newline at end of file