diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 9bd0c4518..2a85c7d08 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -20,6 +20,8 @@ jobs: go-version-file: go.mod - name: Build redis-operator run: go build -v ./cmd/redisoperator + - name: Build redis-instance + run: go build -v ./cmd/instance check: name: Lint @@ -69,6 +71,8 @@ jobs: kubernetes-version: ${{ matrix.kubernetes }} minikube-version: 1.37.0 driver: none + - name: Build operator image for tests + run: docker build -t ghcr.io/buildio/redis-operator:test -f docker/app/Dockerfile . - name: Add redisfailover CRD run: kubectl create -f manifests/databases.spotahome.com_redisfailovers.yaml - run: make ci-integration-test diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml new file mode 100644 index 000000000..ed01948eb --- /dev/null +++ b/.github/workflows/e2e.yml @@ -0,0 +1,965 @@ +name: E2E Tests + +on: + pull_request: + branches: [main] + workflow_dispatch: + +jobs: + e2e-probe-behavior: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.24' + cache: true + + - name: Set up minikube + uses: medyagh/setup-minikube@v0.0.21 + with: + kubernetes-version: v1.30.0 + + - name: Build operator image + run: | + eval $(minikube docker-env) + docker build -t ghcr.io/buildio/redis-operator:test -f docker/app/Dockerfile . 
+ + - name: Install CRD + run: kubectl apply --server-side -f manifests/databases.spotahome.com_redisfailovers.yaml + + - name: Deploy operator + run: | + helm upgrade --install redis-operator ./charts/redisoperator \ + --set image.repository=ghcr.io/buildio/redis-operator \ + --set image.tag=test \ + --set image.pullPolicy=Never \ + --wait --timeout=120s + + - name: Wait for operator ready + run: | + kubectl rollout status deployment/redis-operator --timeout=60s + + - name: Create RedisFailover for probe tests + run: | + # Use custom probes with shorter timings for faster tests + # Liveness: 5s initial, 5s period, 3 failures = 20s to detect failure + # Readiness: 5s initial, 3s period, 3 failures = 14s to detect not-ready + kubectl apply -f - </dev/null; then + kubectl rollout status statefulset/rfr-test-probes --timeout=180s && break + fi + echo "Waiting for StatefulSet to be created... ($i/60)" + sleep 5 + done + + echo "Waiting for Sentinel Deployment..." + for i in {1..60}; do + if kubectl get deployment rfs-test-probes 2>/dev/null; then + kubectl rollout status deployment/rfs-test-probes --timeout=120s && break + fi + echo "Waiting for Deployment to be created... ($i/60)" + sleep 5 + done + + - name: Verify Redis pods are Ready + run: | + echo "Checking all Redis pods are Ready..." + for pod in rfr-test-probes-0 rfr-test-probes-1 rfr-test-probes-2; do + READY=$(kubectl get pod $pod -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}') + if [[ "$READY" != "True" ]]; then + echo "✗ $pod is not Ready" + kubectl describe pod $pod + exit 1 + fi + echo "✓ $pod is Ready" + done + + - name: Write test data to master + run: | + echo "Finding master and writing test data..." 
+ + # Find the master + MASTER_POD="" + for pod in rfr-test-probes-0 rfr-test-probes-1 rfr-test-probes-2; do + ROLE=$(kubectl exec $pod -- redis-cli INFO replication 2>/dev/null | grep "role:" | tr -d '\r') + if [[ "$ROLE" == "role:master" ]]; then + MASTER_POD=$pod + break + fi + done + + if [[ -z "$MASTER_POD" ]]; then + echo "✗ No master found" + exit 1 + fi + + echo "Master is: $MASTER_POD" + + # Write test data with timestamps + echo "Writing 1000 keys to master..." + for i in {1..1000}; do + kubectl exec $MASTER_POD -- redis-cli SET "key:$i" "value:$i:$(date +%s%N)" > /dev/null + done + + KEY_COUNT=$(kubectl exec $MASTER_POD -- redis-cli DBSIZE | grep -oE '[0-9]+') + echo "✓ Master has $KEY_COUNT keys" + + - name: Verify data replicates to replicas + run: | + echo "Verifying replication to replicas..." + sleep 3 # Give time for replication + + for pod in rfr-test-probes-0 rfr-test-probes-1 rfr-test-probes-2; do + ROLE=$(kubectl exec $pod -- redis-cli INFO replication 2>/dev/null | grep "role:" | tr -d '\r') + if [[ "$ROLE" == "role:slave" ]]; then + KEY_COUNT=$(kubectl exec $pod -- redis-cli DBSIZE | grep -oE '[0-9]+') + echo "$pod (replica): $KEY_COUNT keys" + if [[ "$KEY_COUNT" -ge 1000 ]]; then + echo "✓ Replica $pod has replicated data" + else + echo "⚠ Replica $pod only has $KEY_COUNT keys" + fi + fi + done + + - name: Test liveness probe detects Redis failure + run: | + echo "Testing liveness probe by killing Redis process..." + + # Get current restart count + RESTART_COUNT_BEFORE=$(kubectl get pod rfr-test-probes-0 -o jsonpath='{.status.containerStatuses[0].restartCount}') + echo "Restart count before: $RESTART_COUNT_BEFORE" + + # Kill the redis-server process (not the container) + echo "Killing redis-server process..." 
+ kubectl exec rfr-test-probes-0 -- /bin/sh -c "kill \$(cat /data/redis.pid 2>/dev/null || pgrep redis-server)" || true + + # Wait for liveness probe to detect failure and restart + # Custom probe: 5s initial, 5s period, 3 failures = ~20s max + echo "Waiting for liveness probe to detect failure (up to 45s)..." + for i in {1..15}; do + sleep 3 + RESTART_COUNT_AFTER=$(kubectl get pod rfr-test-probes-0 -o jsonpath='{.status.containerStatuses[0].restartCount}' 2>/dev/null || echo "0") + echo " Check $i: restart count = $RESTART_COUNT_AFTER" + if [[ "$RESTART_COUNT_AFTER" -gt "$RESTART_COUNT_BEFORE" ]]; then + echo "✓ Pod was restarted by liveness probe (restart count: $RESTART_COUNT_BEFORE -> $RESTART_COUNT_AFTER)" + exit 0 + fi + done + + echo "✗ Pod was not restarted within expected time" + kubectl describe pod rfr-test-probes-0 + exit 1 + + - name: Wait for pod recovery after liveness test + run: | + echo "Waiting for pod to recover..." + kubectl wait --for=condition=Ready pod/rfr-test-probes-0 --timeout=120s + echo "✓ Pod recovered and is Ready" + + - name: Verify all pods Ready after recovery + run: | + echo "Verifying all Redis pods are Ready..." + for pod in rfr-test-probes-0 rfr-test-probes-1 rfr-test-probes-2; do + kubectl wait --for=condition=Ready pod/$pod --timeout=60s + READY=$(kubectl get pod $pod -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}') + ROLE=$(kubectl exec $pod -- redis-cli INFO replication 2>/dev/null | grep "role:" | tr -d '\r' || echo "unknown") + echo "✓ $pod is Ready ($ROLE)" + done + + - name: Test replica resync behavior + run: | + echo "Testing that replica becomes Ready after resync..." 
+ + # Find a replica pod to delete + REPLICA_POD="" + for pod in rfr-test-probes-0 rfr-test-probes-1 rfr-test-probes-2; do + ROLE=$(kubectl exec $pod -- redis-cli INFO replication 2>/dev/null | grep "role:" | tr -d '\r') + if [[ "$ROLE" == "role:slave" ]]; then + REPLICA_POD=$pod + break + fi + done + + if [[ -z "$REPLICA_POD" ]]; then + echo "✗ No replica found to test" + exit 1 + fi + + echo "Deleting replica pod: $REPLICA_POD" + kubectl delete pod $REPLICA_POD + + # Wait for pod to be recreated + echo "Waiting for pod to be recreated..." + sleep 5 + + # The pod should eventually become Ready after sync completes + echo "Waiting for pod to become Ready after resync..." + if kubectl wait --for=condition=Ready pod/$REPLICA_POD --timeout=120s; then + echo "✓ Replica $REPLICA_POD is Ready after resync" + else + echo "✗ Replica did not become Ready" + kubectl describe pod $REPLICA_POD + exit 1 + fi + + # Verify it's actually a replica and has data + ROLE=$(kubectl exec $REPLICA_POD -- redis-cli INFO replication 2>/dev/null | grep "role:" | tr -d '\r') + KEY_COUNT=$(kubectl exec $REPLICA_POD -- redis-cli DBSIZE | grep -oE '[0-9]+') + echo "$REPLICA_POD: $ROLE with $KEY_COUNT keys" + + if [[ "$KEY_COUNT" -ge 1000 ]]; then + echo "✓ Replica has resynced all data" + else + echo "⚠ Replica only has $KEY_COUNT keys (expected 1000+)" + fi + + - name: Test data survives failover + run: | + echo "Verifying data survives pod restarts..." + + # Check test data still exists on all pods + for pod in rfr-test-probes-0 rfr-test-probes-1 rfr-test-probes-2; do + VALUE=$(kubectl exec $pod -- redis-cli GET "key:500" 2>/dev/null || echo "") + if [[ "$VALUE" == value:500:* ]]; then + echo "✓ $pod has test data (key:500)" + else + echo "⚠ $pod missing or has unexpected data for key:500: $VALUE" + fi + done + + - name: Verify Sentinels are Ready + run: | + echo "Verifying Sentinel pods are Ready..." 
+ SENTINEL_PODS=$(kubectl get pods -l app.kubernetes.io/component=sentinel -o name) + for pod in $SENTINEL_PODS; do + READY=$(kubectl get $pod -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}') + if [[ "$READY" == "True" ]]; then + echo "✓ $pod is Ready" + else + echo "✗ $pod is not Ready" + exit 1 + fi + done + + - name: Test Redis functionality after all probe tests + run: | + echo "Final verification: Redis cluster is functional..." + + # Find master and write new data + for pod in rfr-test-probes-0 rfr-test-probes-1 rfr-test-probes-2; do + ROLE=$(kubectl exec $pod -- redis-cli INFO replication 2>/dev/null | grep "role:" | tr -d '\r') + if [[ "$ROLE" == "role:master" ]]; then + echo "Writing to master ($pod)..." + kubectl exec $pod -- redis-cli SET final-test-key final-test-value + VALUE=$(kubectl exec $pod -- redis-cli GET final-test-key) + if [[ "$VALUE" == "final-test-value" ]]; then + echo "✓ Master read/write successful" + else + echo "✗ Master read/write failed" + exit 1 + fi + break + fi + done + + # Verify replication to replicas + sleep 2 + for pod in rfr-test-probes-0 rfr-test-probes-1 rfr-test-probes-2; do + ROLE=$(kubectl exec $pod -- redis-cli INFO replication 2>/dev/null | grep "role:" | tr -d '\r') + if [[ "$ROLE" == "role:slave" ]]; then + VALUE=$(kubectl exec $pod -- redis-cli GET final-test-key) + if [[ "$VALUE" == "final-test-value" ]]; then + echo "✓ Replica $pod has replicated new data" + else + echo "⚠ Replica $pod does not have new data yet" + fi + fi + done + + TOTAL_KEYS=$(kubectl exec rfr-test-probes-0 -- redis-cli DBSIZE | grep -oE '[0-9]+') + echo "✓ Final key count: $TOTAL_KEYS" + + - name: Collect logs on failure + if: failure() + run: | + echo "=== Operator logs ===" + kubectl logs -l app.kubernetes.io/name=redisoperator --tail=100 || true + echo "=== Redis pod 0 logs ===" + kubectl logs rfr-test-probes-0 --tail=100 || true + echo "=== Redis pod 1 logs ===" + kubectl logs rfr-test-probes-1 --tail=100 || true + echo "=== 
Redis pod 2 logs ===" + kubectl logs rfr-test-probes-2 --tail=100 || true + echo "=== Sentinel logs ===" + kubectl logs -l app.kubernetes.io/component=sentinel --tail=50 || true + echo "=== Pod descriptions ===" + kubectl describe pod -l redisfailovers.databases.spotahome.com/name=test-probes || true + echo "=== Events ===" + kubectl get events --sort-by='.lastTimestamp' | tail -50 + + e2e-sentinel-free: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.24' + cache: true + + - name: Set up minikube + uses: medyagh/setup-minikube@v0.0.21 + with: + kubernetes-version: v1.30.0 + + - name: Build operator image + run: | + eval $(minikube docker-env) + docker build -t ghcr.io/buildio/redis-operator:test -f docker/app/Dockerfile . + + - name: Install CRD + run: kubectl apply --server-side -f manifests/databases.spotahome.com_redisfailovers.yaml + + - name: Deploy operator + run: | + helm upgrade --install redis-operator ./charts/redisoperator \ + --set image.repository=ghcr.io/buildio/redis-operator \ + --set image.tag=test \ + --set image.pullPolicy=Never \ + --wait --timeout=120s + + - name: Wait for operator ready + run: | + kubectl rollout status deployment/redis-operator --timeout=60s + + - name: Create RedisFailover with sentinel disabled + run: | + kubectl apply -f - </dev/null; then + kubectl rollout status statefulset/rfr-test-no-sentinel --timeout=180s && break + fi + echo "Waiting for StatefulSet to be created... ($i/60)" + sleep 5 + done + + - name: Verify NO Sentinel resources exist + run: | + echo "Verifying Sentinel resources do NOT exist..." 
+ + # Check Sentinel Deployment does not exist + if kubectl get deployment rfs-test-no-sentinel 2>/dev/null; then + echo "✗ Sentinel Deployment exists but should not" + exit 1 + fi + echo "✓ No Sentinel Deployment" + + # Check Sentinel Service does not exist + if kubectl get service rfs-test-no-sentinel 2>/dev/null; then + echo "✗ Sentinel Service exists but should not" + exit 1 + fi + echo "✓ No Sentinel Service" + + # Check Sentinel ConfigMap does not exist + if kubectl get configmap rfs-test-no-sentinel 2>/dev/null; then + echo "✗ Sentinel ConfigMap exists but should not" + exit 1 + fi + echo "✓ No Sentinel ConfigMap" + + - name: Verify Redis resources exist + run: | + echo "Verifying Redis resources exist..." + + kubectl get statefulset rfr-test-no-sentinel + echo "✓ Redis StatefulSet exists" + + kubectl get service rfrm-test-no-sentinel + echo "✓ Master Service exists" + + kubectl get service rfrs-test-no-sentinel + echo "✓ Slave Service exists" + + - name: Verify master is elected + run: | + echo "Checking that exactly one master exists..." + MASTER_COUNT=0 + for pod in rfr-test-no-sentinel-0 rfr-test-no-sentinel-1; do + ROLE=$(kubectl exec $pod -- redis-cli INFO replication 2>/dev/null | grep "role:" | tr -d '\r') + echo "$pod: $ROLE" + if [[ "$ROLE" == "role:master" ]]; then + MASTER_COUNT=$((MASTER_COUNT + 1)) + MASTER_POD=$pod + fi + done + + if [[ "$MASTER_COUNT" -ne 1 ]]; then + echo "✗ Expected exactly 1 master, found $MASTER_COUNT" + exit 1 + fi + echo "✓ Exactly one master: $MASTER_POD" + + - name: Write test data + run: | + echo "Writing test data to master..." 
+ MASTER_POD="" + for pod in rfr-test-no-sentinel-0 rfr-test-no-sentinel-1; do + ROLE=$(kubectl exec $pod -- redis-cli INFO replication 2>/dev/null | grep "role:" | tr -d '\r') + if [[ "$ROLE" == "role:master" ]]; then + MASTER_POD=$pod + break + fi + done + + for i in {1..100}; do + kubectl exec $MASTER_POD -- redis-cli SET "key:$i" "value:$i" > /dev/null + done + echo "✓ Wrote 100 keys to $MASTER_POD" + + - name: Verify replication + run: | + echo "Verifying data replicated to replica..." + sleep 3 + + for pod in rfr-test-no-sentinel-0 rfr-test-no-sentinel-1; do + ROLE=$(kubectl exec $pod -- redis-cli INFO replication 2>/dev/null | grep "role:" | tr -d '\r') + KEY_COUNT=$(kubectl exec $pod -- redis-cli DBSIZE | grep -oE '[0-9]+') + echo "$pod ($ROLE): $KEY_COUNT keys" + done + + - name: Test operator-managed failover + run: | + echo "Testing operator-managed failover..." + + # Find the master + MASTER_POD="" + REPLICA_POD="" + for pod in rfr-test-no-sentinel-0 rfr-test-no-sentinel-1; do + ROLE=$(kubectl exec $pod -- redis-cli INFO replication 2>/dev/null | grep "role:" | tr -d '\r') + if [[ "$ROLE" == "role:master" ]]; then + MASTER_POD=$pod + else + REPLICA_POD=$pod + fi + done + + echo "Current master: $MASTER_POD" + echo "Current replica: $REPLICA_POD" + + # Delete the master pod to trigger failover + echo "Deleting master pod $MASTER_POD..." + kubectl delete pod $MASTER_POD + + # Wait for failover + echo "Waiting for failover (up to 60s)..." 
+ for i in {1..20}; do + sleep 3 + # Check if the replica became master + ROLE=$(kubectl exec $REPLICA_POD -- redis-cli INFO replication 2>/dev/null | grep "role:" | tr -d '\r' || echo "") + echo " Check $i: $REPLICA_POD is $ROLE" + if [[ "$ROLE" == "role:master" ]]; then + echo "✓ Failover complete: $REPLICA_POD is now master" + break + fi + done + + # Verify final state + ROLE=$(kubectl exec $REPLICA_POD -- redis-cli INFO replication 2>/dev/null | grep "role:" | tr -d '\r') + if [[ "$ROLE" != "role:master" ]]; then + echo "✗ Failover failed: $REPLICA_POD is still $ROLE" + kubectl logs -l app.kubernetes.io/name=redisoperator --tail=50 || true + exit 1 + fi + + - name: Wait for pod recovery + run: | + echo "Waiting for deleted pod to recover..." + kubectl wait --for=condition=Ready pod/rfr-test-no-sentinel-0 --timeout=120s || true + kubectl wait --for=condition=Ready pod/rfr-test-no-sentinel-1 --timeout=120s || true + + - name: Verify data survived failover + run: | + echo "Verifying data survived failover..." + + # Find current master + for pod in rfr-test-no-sentinel-0 rfr-test-no-sentinel-1; do + if kubectl get pod $pod 2>/dev/null | grep -q Running; then + VALUE=$(kubectl exec $pod -- redis-cli GET "key:50" 2>/dev/null || echo "") + if [[ "$VALUE" == "value:50" ]]; then + echo "✓ $pod has test data (key:50 = $VALUE)" + else + echo "⚠ $pod: key:50 = '$VALUE'" + fi + fi + done + + - name: Verify master service endpoints + run: | + echo "Checking master service endpoints..." 
+ + # Get the current master IP + MASTER_IP="" + for pod in rfr-test-no-sentinel-0 rfr-test-no-sentinel-1; do + if kubectl get pod $pod 2>/dev/null | grep -q Running; then + ROLE=$(kubectl exec $pod -- redis-cli INFO replication 2>/dev/null | grep "role:" | tr -d '\r' || echo "") + if [[ "$ROLE" == "role:master" ]]; then + MASTER_IP=$(kubectl get pod $pod -o jsonpath='{.status.podIP}') + echo "Current master: $pod ($MASTER_IP)" + break + fi + fi + done + + # Check master service endpoints + ENDPOINTS=$(kubectl get endpoints rfrm-test-no-sentinel -o jsonpath='{.subsets[0].addresses[0].ip}' 2>/dev/null || echo "") + echo "Master service endpoint: $ENDPOINTS" + + if [[ "$ENDPOINTS" == "$MASTER_IP" ]]; then + echo "✓ Master service points to correct master" + else + echo "⚠ Master service endpoint mismatch (may need more time to update)" + fi + + - name: Collect logs on failure + if: failure() + run: | + echo "=== Operator logs ===" + kubectl logs -l app.kubernetes.io/name=redisoperator --tail=100 || true + echo "=== Redis pod 0 logs ===" + kubectl logs rfr-test-no-sentinel-0 --tail=50 || true + echo "=== Redis pod 1 logs ===" + kubectl logs rfr-test-no-sentinel-1 --tail=50 || true + echo "=== Pod descriptions ===" + kubectl describe pod -l redisfailovers.databases.spotahome.com/name=test-no-sentinel || true + echo "=== Services ===" + kubectl get svc | grep test-no-sentinel || true + echo "=== Events ===" + kubectl get events --sort-by='.lastTimestamp' | tail -30 + + e2e-instance-manager: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.24' + cache: true + + - name: Set up minikube + uses: medyagh/setup-minikube@v0.0.21 + with: + kubernetes-version: v1.30.0 + + - name: Build operator image + run: | + eval $(minikube docker-env) + docker build -t ghcr.io/buildio/redis-operator:test -f docker/app/Dockerfile . 
+ + - name: Install CRD + run: kubectl apply --server-side -f manifests/databases.spotahome.com_redisfailovers.yaml + + - name: Deploy operator + run: | + helm upgrade --install redis-operator ./charts/redisoperator \ + --set image.repository=ghcr.io/buildio/redis-operator \ + --set image.tag=test \ + --set image.pullPolicy=Never \ + --wait --timeout=120s + + - name: Wait for operator ready + run: | + kubectl rollout status deployment/redis-operator --timeout=60s + + - name: Create RedisFailover with instance manager + run: | + kubectl apply -f - </dev/null; then + kubectl rollout status statefulset/rfr-test-im --timeout=120s && break + fi + echo "Waiting for StatefulSet to be created... ($i/60)" + sleep 5 + done + + - name: Verify httpGet liveness probe is configured + run: | + echo "Checking liveness probe configuration..." + PROBE_TYPE=$(kubectl get statefulset rfr-test-im -o jsonpath='{.spec.template.spec.containers[0].livenessProbe.httpGet.path}') + echo "Liveness probe path: $PROBE_TYPE" + if [[ "$PROBE_TYPE" == "/healthz" ]]; then + echo "✓ HTTP liveness probe is configured with /healthz path" + else + echo "✗ Expected httpGet probe with /healthz path" + kubectl get statefulset rfr-test-im -o yaml | grep -A 10 livenessProbe + exit 1 + fi + + PROBE_PORT=$(kubectl get statefulset rfr-test-im -o jsonpath='{.spec.template.spec.containers[0].livenessProbe.httpGet.port}') + echo "Liveness probe port: $PROBE_PORT" + if [[ "$PROBE_PORT" == "8080" ]]; then + echo "✓ HTTP liveness probe is configured on port 8080" + else + echo "✗ Expected probe on port 8080, got $PROBE_PORT" + exit 1 + fi + + - name: Verify httpGet readiness probe is configured + run: | + echo "Checking readiness probe configuration..." 
+ PROBE_TYPE=$(kubectl get statefulset rfr-test-im -o jsonpath='{.spec.template.spec.containers[0].readinessProbe.httpGet.path}') + echo "Readiness probe path: $PROBE_TYPE" + if [[ "$PROBE_TYPE" == "/readyz" ]]; then + echo "✓ HTTP readiness probe is configured with /readyz path" + else + echo "✗ Expected httpGet probe with /readyz path" + kubectl get statefulset rfr-test-im -o yaml | grep -A 10 readinessProbe + exit 1 + fi + + PROBE_PORT=$(kubectl get statefulset rfr-test-im -o jsonpath='{.spec.template.spec.containers[0].readinessProbe.httpGet.port}') + echo "Readiness probe port: $PROBE_PORT" + if [[ "$PROBE_PORT" == "8080" ]]; then + echo "✓ HTTP readiness probe is configured on port 8080" + else + echo "✗ Expected probe on port 8080, got $PROBE_PORT" + exit 1 + fi + + - name: Verify health port is exposed + run: | + echo "Checking container ports..." + HEALTH_PORT=$(kubectl get statefulset rfr-test-im -o jsonpath='{.spec.template.spec.containers[0].ports[?(@.name=="health")].containerPort}') + echo "Health port: $HEALTH_PORT" + if [[ "$HEALTH_PORT" == "8080" ]]; then + echo "✓ Health port 8080 is exposed" + else + echo "✗ Expected health port 8080" + kubectl get statefulset rfr-test-im -o jsonpath='{.spec.template.spec.containers[0].ports}' + exit 1 + fi + + - name: Verify instance manager is PID 1 + run: | + echo "Checking process tree..." + kubectl exec rfr-test-im-0 -- ps aux + PID1_CMD=$(kubectl exec rfr-test-im-0 -- cat /proc/1/cmdline | tr '\0' ' ') + echo "PID 1 command: $PID1_CMD" + if [[ "$PID1_CMD" == *"redis-instance"* ]]; then + echo "✓ Instance manager is running as PID 1" + else + echo "✗ Instance manager is NOT PID 1" + exit 1 + fi + + - name: Test RDB cleanup on restart + run: | + echo "Creating temp RDB files..." + kubectl exec rfr-test-im-0 -- touch /data/temp-1234.rdb /data/temp-5678.rdb + kubectl exec rfr-test-im-0 -- ls -la /data/ + + echo "Deleting pod to trigger restart..." 
+ kubectl delete pod rfr-test-im-0 + + echo "Waiting for pod to restart..." + kubectl wait --for=condition=Ready pod/rfr-test-im-0 --timeout=120s + + echo "Checking if temp files were cleaned..." + if kubectl exec rfr-test-im-0 -- ls /data/temp-1234.rdb 2>/dev/null; then + echo "✗ temp-1234.rdb still exists - cleanup failed" + exit 1 + fi + echo "✓ Temp RDB files were cleaned up on restart" + + - name: Test Redis is functional + run: | + kubectl exec rfr-test-im-0 -- redis-cli PING + kubectl exec rfr-test-im-0 -- redis-cli SET test-key test-value + VALUE=$(kubectl exec rfr-test-im-0 -- redis-cli GET test-key) + if [[ "$VALUE" == "test-value" ]]; then + echo "✓ Redis is functional" + else + echo "✗ Redis read/write failed" + exit 1 + fi + + - name: Test /healthz endpoint returns 200 when healthy + run: | + echo "Testing /healthz endpoint..." + # Use kubectl port-forward to access health endpoint + kubectl port-forward pod/rfr-test-im-0 8080:8080 & + PF_PID=$! + sleep 2 + + RESPONSE=$(curl -s -w "\n%{http_code}" http://localhost:8080/healthz) + HTTP_CODE=$(echo "$RESPONSE" | tail -1) + BODY=$(echo "$RESPONSE" | head -n -1) + + kill $PF_PID 2>/dev/null || true + + echo "Response: $BODY" + echo "HTTP Code: $HTTP_CODE" + + if [[ "$HTTP_CODE" == "200" ]]; then + echo "✓ /healthz returned 200" + else + echo "✗ /healthz returned $HTTP_CODE, expected 200" + exit 1 + fi + + # Verify response contains expected fields + if echo "$BODY" | grep -q '"status":"ok"'; then + echo "✓ /healthz response contains status:ok" + else + echo "✗ /healthz response missing status:ok" + exit 1 + fi + + - name: Test /readyz endpoint returns 200 when ready + run: | + echo "Testing /readyz endpoint..." + kubectl port-forward pod/rfr-test-im-0 8080:8080 & + PF_PID=$! 
+ sleep 2 + + RESPONSE=$(curl -s -w "\n%{http_code}" http://localhost:8080/readyz) + HTTP_CODE=$(echo "$RESPONSE" | tail -1) + BODY=$(echo "$RESPONSE" | head -n -1) + + kill $PF_PID 2>/dev/null || true + + echo "Response: $BODY" + echo "HTTP Code: $HTTP_CODE" + + if [[ "$HTTP_CODE" == "200" ]]; then + echo "✓ /readyz returned 200" + else + echo "✗ /readyz returned $HTTP_CODE, expected 200" + exit 1 + fi + + # Verify response contains role + if echo "$BODY" | grep -q '"role"'; then + echo "✓ /readyz response contains role" + else + echo "✗ /readyz response missing role" + exit 1 + fi + + - name: Test /status endpoint returns detailed info + run: | + echo "Testing /status endpoint..." + kubectl port-forward pod/rfr-test-im-0 8080:8080 & + PF_PID=$! + sleep 2 + + RESPONSE=$(curl -s -w "\n%{http_code}" http://localhost:8080/status) + HTTP_CODE=$(echo "$RESPONSE" | tail -1) + BODY=$(echo "$RESPONSE" | head -n -1) + + kill $PF_PID 2>/dev/null || true + + echo "Response: $BODY" + echo "HTTP Code: $HTTP_CODE" + + if [[ "$HTTP_CODE" == "200" ]]; then + echo "✓ /status returned 200" + else + echo "✗ /status returned $HTTP_CODE, expected 200" + exit 1 + fi + + # Verify response contains expected sections + if echo "$BODY" | grep -q '"redis"' && echo "$BODY" | grep -q '"instance_manager"'; then + echo "✓ /status response contains redis and instance_manager sections" + else + echo "✗ /status response missing expected sections" + exit 1 + fi + + - name: Test /healthz returns 503 when Redis is killed + run: | + echo "Testing /healthz returns 503 when Redis process dies..." + + # Start port-forward + kubectl port-forward pod/rfr-test-im-0 8080:8080 & + PF_PID=$! 
+ sleep 2 + + # Verify healthy first + HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8080/healthz) + if [[ "$HTTP_CODE" != "200" ]]; then + echo "✗ Initial /healthz check failed with $HTTP_CODE" + kill $PF_PID 2>/dev/null || true + exit 1 + fi + echo "✓ Initial /healthz is healthy" + + # Kill redis-server process (instance manager will detect this) + echo "Killing redis-server process..." + kubectl exec rfr-test-im-0 -- /bin/sh -c "kill \$(pgrep redis-server)" || true + + # Wait for health check to detect (checks every 1s) + echo "Waiting for health check to detect failure..." + sleep 3 + + # Check that /healthz now returns 503 + RESPONSE=$(curl -s -w "\n%{http_code}" http://localhost:8080/healthz) + HTTP_CODE=$(echo "$RESPONSE" | tail -1) + BODY=$(echo "$RESPONSE" | head -n -1) + + kill $PF_PID 2>/dev/null || true + + echo "Response after kill: $BODY" + echo "HTTP Code: $HTTP_CODE" + + if [[ "$HTTP_CODE" == "503" ]]; then + echo "✓ /healthz returned 503 after Redis killed" + else + echo "⚠ /healthz returned $HTTP_CODE (pod may have restarted already)" + fi + + - name: Wait for pod recovery after health test + run: | + echo "Waiting for pod to recover..." + sleep 5 + kubectl wait --for=condition=Ready pod/rfr-test-im-0 --timeout=120s + echo "✓ Pod recovered and is Ready" + + # Verify health endpoints work again + kubectl port-forward pod/rfr-test-im-0 8080:8080 & + PF_PID=$! 
+ sleep 2 + + HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8080/healthz) + kill $PF_PID 2>/dev/null || true + + if [[ "$HTTP_CODE" == "200" ]]; then + echo "✓ /healthz returns 200 after recovery" + else + echo "✗ /healthz returned $HTTP_CODE after recovery" + exit 1 + fi + + - name: Collect logs on failure + if: failure() + run: | + echo "=== Operator logs ===" + kubectl logs -l app.kubernetes.io/name=redisoperator --tail=100 || true + echo "=== Redis pod logs ===" + kubectl logs rfr-test-im-0 --tail=100 || true + echo "=== Pod describe ===" + kubectl describe pod rfr-test-im-0 || true + echo "=== Events ===" + kubectl get events --sort-by='.lastTimestamp' | tail -30 diff --git a/Makefile b/Makefile index 73f9ee39c..4b49558ae 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -VERSION := v1.5.0-rc0 +VERSION := v1.5.0 # Name of this service/application SERVICE_NAME := redis-operator diff --git a/README.md b/README.md index 10a3dc08c..8344b8bca 100644 --- a/README.md +++ b/README.md @@ -1,359 +1,448 @@ # redis-operator -This is a fork of the `spotahome/redis-operator` repository. - -[![Build Status](https://github.com/Saremox/redis-operator/actions/workflows/ci.yaml/badge.svg?branch=master)](https://github.com/Saremox/redis-operator) -[![Go Report Card](https://goreportcard.com/badge/github.com/Saremox/redis-operator)](https://goreportcard.com/report/github.com/Saremox/redis-operator) +[![CI](https://github.com/buildio/redis-operator/actions/workflows/ci.yml/badge.svg)](https://github.com/buildio/redis-operator/actions/workflows/ci.yml) +[![E2E Tests](https://github.com/buildio/redis-operator/actions/workflows/e2e.yml/badge.svg)](https://github.com/buildio/redis-operator/actions/workflows/e2e.yml) +[![Go Report Card](https://goreportcard.com/badge/github.com/buildio/redis-operator)](https://goreportcard.com/report/github.com/buildio/redis-operator) Redis Operator creates/configures/manages redis-failovers atop Kubernetes. 
-## Requirements - -Kubernetes version: 1.21 or higher -Redis version: 6 or higher - -Redis operator is being tested against kubernetes 1.29 1.30 1.31 1.32 and redis 6,7 ; Valkey 8 -All dependencies have been vendored, so there's no need to any additional download. +This is a fork of `spotahome/redis-operator` → `Saremox/redis-operator` → `buildio/redis-operator`. -## Operator deployment on Kubernetes +## What's New in v4.0.0 -In order to create Redis failovers inside a Kubernetes cluster, the operator has to be deployed. -It can be done with plain old [deployment](example/operator), using [Kustomize](manifests/kustomize) or with the provided [Helm chart](charts/redisoperator). +**Breaking Change: Instance Manager Required** -### Using the Helm chart +v4.0.0 makes the instance manager the default and only mode. Legacy exec probes are removed. -From the root folder of the project, execute the following: +**Key changes:** +- **Sentinel disabled by default** - operator-managed failover is now the default +- Instance manager is always enabled (no opt-out) +- HTTP health probes (`/healthz`, `/readyz`) are now the only probe type +- Chart version aligned with operator version (4.0.0) +**Minimal configuration (operator-managed failover, no sentinel):** +```yaml +apiVersion: databases.spotahome.com/v1 +kind: RedisFailover +metadata: + name: my-redis +spec: + redis: + replicas: 2 ``` -helm repo add redis-operator https://Saremox.github.io/redis-operator -helm repo update -helm install redis-operator redis-operator/redis-operator + +**With Redis Sentinel (opt-in):** +```yaml +apiVersion: databases.spotahome.com/v1 +kind: RedisFailover +metadata: + name: my-redis +spec: + redis: + replicas: 2 + sentinel: + enabled: true + replicas: 3 ``` -#### Update helm chart +## What's New in v1.7.0 -Helm chart only manage the creation of CRD in the first install. In order to update the CRD you will need to apply directly. 
+**Sentinel-Free Architecture** ([#9](https://github.com/buildio/redis-operator/issues/9)) -``` -REDIS_OPERATOR_VERSION=v1.4.0 -kubectl replace -f https://raw.githubusercontent.com/Saremox/redis-operator/${REDIS_OPERATOR_VERSION}/manifests/databases.spotahome.com_redisfailovers.yaml -``` +v1.7.0 introduced operator-managed failover as an alternative to Redis Sentinel, reducing pod overhead from 5 pods (2 Redis + 3 Sentinel) to just 2 pods (Redis only). -``` -helm upgrade redis-operator redis-operator/redis-operator -``` -### Using kubectl +**How it works:** +- Operator monitors Redis pods and detects master failures +- On failure, promotes the replica with highest replication offset (minimizes data loss) +- Automatically reconfigures remaining replicas to follow new master +- Master Service (`rfrm-`) endpoints update automatically via label selectors -To create the operator, you can directly create it with kubectl: +**When to use:** +- Development/testing environments where you want fewer pods +- Cost-sensitive deployments where 3 Sentinel pods are overhead +- Simple HA setups where operator-managed failover is sufficient -``` -REDIS_OPERATOR_VERSION=v1.4.0 -kubectl create -f https://raw.githubusercontent.com/Saremox/redis-operator/${REDIS_OPERATOR_VERSION}/manifests/databases.spotahome.com_redisfailovers.yaml -kubectl apply -f https://raw.githubusercontent.com/Saremox/redis-operator/${REDIS_OPERATOR_VERSION}/example/operator/all-redis-operator-resources.yaml -``` +**When to keep Sentinel:** +- Production environments requiring sub-second failover +- Complex topologies with multiple Redis clusters +- When you need Sentinel's pub/sub notifications -This will create a deployment named `redisoperator`. 
+## What's New in v1.6.1 -### Using kustomize +**Disable Service Links** ([#3](https://github.com/buildio/redis-operator/issues/3)) -The kustomize setup included in this repo is highly customizable using [components](https://kubectl.docs.kubernetes.io/guides/config_management/components/), -but it also comes with a few presets (in the form of overlays) supporting the most common use cases. +v1.6.1 sets `enableServiceLinks: false` on all pods to prevent startup failures in namespaces with many services. Kubernetes by default injects environment variables for every service in the namespace, which can exceed limits and cause pod failures. -To install the operator with default settings and every necessary resource (including RBAC, service account, default resource limits, etc), install the `default` overlay: +## What's New in v1.6.0 -```shell -kustomize build github.com/Saremox/redis-operator/manifests/kustomize/overlays/default -``` +**CNPG-style Instance Manager** ([#2](https://github.com/buildio/redis-operator/issues/2)) -If you would like to customize RBAC or the service account used, you can install the `minimal` overlay. +v1.6.0 introduces an optional instance manager that runs as PID 1 in Redis containers, following the [CloudNativePG model](https://cloudnative-pg.io/documentation/current/instance_manager/) which has proven reliable at scale. -Finally, you can install the `full` overlay if you want everything this operator has to offer, including Prometheus ServiceMonitor resources. 
+**Features:** +- **RDB tempfile cleanup** - Automatically removes stale `temp-*.rdb` files on startup, preventing disk exhaustion during crash loops +- **Zombie process reaper** - Properly handles SIGCHLD for BGSAVE/BGREWRITEAOF child processes +- **Graceful shutdown** - Timeout escalation (SIGTERM → SIGKILL) for reliable shutdown -It's always a good practice to pin the version of the operator in your configuration to make sure you are not surprised by changes on the latest development branch: +**Enabled by default in v4.0.0+** - no configuration needed. -```shell -kustomize build github.com/Saremox/redis-operator/manifests/kustomize/overlays/default?ref=v1.2.4 -``` +### Roadmap -You can easily create your own config by creating a `kustomization.yaml` file -(for example, to apply custom resource limits, to add custom labels or to customize the namespace): +| Version | Features | Notes | +|---------|----------|-------| +| v1.6.0 | Instance Manager opt-in | `instanceManagerImage` field | +| v1.6.1 | Disable service links | Prevents startup failures in busy namespaces | +| v1.7.0 | Sentinel-free mode | `sentinel.enabled: false` | +| v4.0.0 | Instance Manager required | Current release - legacy probes removed, chart/operator versions aligned | -```yaml -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization +See [Issue #2](https://github.com/buildio/redis-operator/issues/2) for instance manager details and [Issue #9](https://github.com/buildio/redis-operator/issues/9) for sentinel-free architecture. -namespace: redis-operator +## Requirements -commonLabels: - foo: bar +- Kubernetes: 1.21+ +- Redis: 6+ (also supports Valkey 8) -resources: - - github.com/Saremox/redis-operator/manifests/kustomize/overlays/full -``` +Tested against Kubernetes 1.29, 1.30, 1.31, 1.32, 1.33, 1.34 and Redis 6, 7. -Take a look at the manifests inside [manifests/kustomize](manifests/kustomize) for more details. 
+## Versioning -## Usage +**Starting with 4.0.0, we no longer use 'v' prefix anywhere:** -Once the operator is deployed inside a Kubernetes cluster, a new API will be accesible, so you'll be able to create, update and delete redisfailovers. +| Git Tag | Chart Version | Image Tag | +|---------|---------------|-----------| +| 4.0.0 | 4.0.0 | 4.0.0 | -In order to deploy a new redis-failover a [specification](example/redisfailover/basic.yaml) has to be created: +**Warning:** Previous releases used `v` prefix for git tags (e.g., `v1.7.0`). Starting with 4.0.0, git tags are bare version numbers (e.g., `4.0.0`). -``` -REDIS_OPERATOR_VERSION=v1.2.4 -kubectl create -f https://raw.githubusercontent.com/Saremox/redis-operator/${REDIS_OPERATOR_VERSION}/example/redisfailover/basic.yaml +If you don't specify `image.tag`, the chart automatically uses the appVersion. + +## Quick Start + +### Install from GitHub Container Registry (Recommended) + +```bash +# Install CRD +kubectl apply --server-side -f https://raw.githubusercontent.com/buildio/redis-operator/main/manifests/databases.spotahome.com_redisfailovers.yaml + +# Install operator (uses default image version matching chart) +helm upgrade --install redis-operator oci://ghcr.io/buildio/redis-operator/charts/redisoperator \ + --namespace redis-operator --create-namespace ``` -This redis-failover will be managed by the operator, resulting in the following elements created inside Kubernetes: +No additional parameters required - the chart defaults to the correct image version. -- `rfr-`: Redis configmap -- `rfr-`: Redis statefulset -- `rfr-`: Redis service (if redis-exporter is enabled) -- `rfs-`: Sentinel configmap -- `rfs-`: Sentinel deployment -- `rfs-`: Sentinel service +### Install with Helm Repository -**NOTE**: `NAME` is the named provided when creating the RedisFailover. 
-**IMPORTANT**: the name of the redis-failover to be created cannot be longer that 48 characters, due to prepend of redis/sentinel identification and statefulset limitation. +```bash +helm repo add redis-operator https://buildio.github.io/redis-operator +helm repo update +helm install redis-operator redis-operator/redis-operator +``` -### Persistence +### Install with kubectl -The operator has the ability of add persistence to Redis data. By default an `emptyDir` will be used, so the data is not saved. +```bash +REDIS_OPERATOR_VERSION=4.0.0 +kubectl apply --server-side -f https://raw.githubusercontent.com/buildio/redis-operator/${REDIS_OPERATOR_VERSION}/manifests/databases.spotahome.com_redisfailovers.yaml +kubectl apply -f https://raw.githubusercontent.com/buildio/redis-operator/${REDIS_OPERATOR_VERSION}/example/operator/all-redis-operator-resources.yaml +``` -In order to have persistence, a `PersistentVolumeClaim` usage is allowed. The full [PVC definition has to be added](example/redisfailover/persistent-storage.yaml) to the Redis Failover Spec under the `Storage` section. +### Install with Kustomize -**IMPORTANT**: By default, the persistent volume claims will be deleted when the Redis Failover is. If this is not the expected usage, a `keepAfterDeletion` flag can be added under the `storage` section of Redis. [An example is given](example/redisfailover/persistent-storage-no-pvc-deletion.yaml). +```bash +# Default installation with RBAC, service account, resource limits +kustomize build github.com/buildio/redis-operator/manifests/kustomize/overlays/default?ref=4.0.0 | kubectl apply -f - -### NodeAffinity and Tolerations +# Minimal installation +kustomize build github.com/buildio/redis-operator/manifests/kustomize/overlays/minimal?ref=4.0.0 | kubectl apply -f - -You can use NodeAffinity and Tolerations to deploy Pods to isolated groups of Nodes. 
Examples are given for [node affinity](example/redisfailover/node-affinity.yaml), [pod anti affinity](example/redisfailover/pod-anti-affinity.yaml) and [tolerations](example/redisfailover/tolerations.yaml). +# Full installation with Prometheus ServiceMonitor +kustomize build github.com/buildio/redis-operator/manifests/kustomize/overlays/full?ref=4.0.0 | kubectl apply -f - +``` -## Topology Spread Contraints +## Updating -You can use the `topologySpreadContraints` to ensure the pods of a type(redis or sentinel) are evenly distributed across zones/nodes. Examples are for using [topology spread constraints](example/redisfailover/topology-spread-contraints.yaml). Further document on how `topologySpreadConstraints` work could be found [here](https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/). +### Update CRD -### Custom configurations +Helm only manages CRD creation on first install. To update the CRD: -It is possible to configure both Redis and Sentinel. This is done with the `customConfig` option inside their spec. It is a list of configurations and their values. Example are given in the [custom config example file](example/redisfailover/custom-config.yaml). +```bash +REDIS_OPERATOR_VERSION=4.0.0 +kubectl apply --server-side -f https://raw.githubusercontent.com/buildio/redis-operator/${REDIS_OPERATOR_VERSION}/manifests/databases.spotahome.com_redisfailovers.yaml +``` -In order to have the ability of this configurations to be changed "on the fly", without the need of reload the redis/sentinel processes, the operator will apply them with calls to the redises/sentinels, using `config set` or `sentinel set mymaster` respectively. Because of this, **no changes on the configmaps** will appear regarding this custom configurations and the entries of `customConfig` from Redis spec will not be written on `redis.conf` file. To verify the actual Redis configuration use [`redis-cli CONFIG GET *`](https://redis.io/commands/config-get). 
+Then upgrade the operator: -**Important**: in the Sentinel options, there are some "conversions" to be made: +```bash +helm upgrade redis-operator redis-operator/redis-operator +``` -- Configuration on the `sentinel.conf`: `sentinel down-after-milliseconds mymaster 2000` -- Configuration on the `configOptions`: `down-after-milliseconds 2000` +## Usage -**Important 2**: do **NOT** change the options used for control the redis/sentinel such as `port`, `bind`, `dir`, etc. +### Create a Redis Failover -### Custom shutdown script +```bash +kubectl apply -f https://raw.githubusercontent.com/buildio/redis-operator/4.0.0/example/redisfailover/basic.yaml +``` -By default, a custom shutdown file is given. This file makes redis to `SAVE` it's data, and in the case that redis is master, it'll call sentinel to ask for a failover. +This creates the following resources: +- `rfr-`: Redis StatefulSet and ConfigMap +- `rfs-`: Sentinel Deployment, ConfigMap, and Service -This behavior is configurable, creating a configmap and indicating to use it. An example about how to use this option can be found on the [shutdown example file](example/redisfailover/custom-shutdown.yaml). +**Note:** The RedisFailover name must be ≤48 characters. -**Important**: the configmap has to be in the same namespace. The configmap has to have a `shutdown.sh` data, containing the script. +### Enable Instance Manager -### Custom SecurityContext +To use the CNPG-style instance manager for improved reliability: -By default Kubernetes will run containers as the user specified in the Dockerfile (or the root user if not specified), this is not always desirable. -If you need the containers to run as a specific user (or provide any other PodSecurityContext options) then you can specify a custom `securityContext` in the -`redisfailover` object. See the [SecurityContext example file](example/redisfailover/security-context.yaml) for an example. 
You can visit kubernetes documentation for detailed docs about [security context](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/) +```yaml +apiVersion: databases.spotahome.com/v1 +kind: RedisFailover +metadata: + name: my-redis +spec: + redis: + replicas: 2 + sentinel: + replicas: 3 +``` + +The instance manager is enabled by default: +1. An init container copies the `redis-instance` binary to a shared volume +2. The main container runs `redis-instance run` as PID 1 +3. The instance manager performs cleanup and manages Redis as a child process -### Custom containerSecurityContext at container level +### Sentinel-Free Mode -By default Kubernetes will run containers with default docker capabilities for exemple, this is not always desirable. -If you need the containers to run with specific capabilities or with read only root file system (or provide any other securityContext options) then you can specify a custom `containerSecurityContext` in the -`redisfailover` object. See the [ContainerSecurityContext example file](example/redisfailover/container-security-context.yaml) for an example. 
Keys available under containerSecurityContext are detailed [here](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.20/#securitycontext-v1-core) +For simpler deployments, disable Sentinel and let the operator manage failover: + +```yaml +apiVersion: databases.spotahome.com/v1 +kind: RedisFailover +metadata: + name: my-redis +spec: + redis: + replicas: 2 + sentinel: + enabled: false + failoverTimeout: "10s" # Optional, defaults to 10s +``` -### Custom command +**How failure detection works:** -By default, redis and sentinel will be called with the basic command, giving the configuration file: +The instance manager provides HTTP health endpoints (`/healthz`, `/readyz`) that enable: +- HTTP health probes (faster than exec probes) +- Immediate detection of Redis process crashes +- No process spawning overhead during health checks -- Redis: `redis-server /redis/redis.conf` -- Sentinel: `redis-server /redis/sentinel.conf --sentinel` +This creates only: +- `rfr-`: Redis StatefulSet (2 pods) +- `rfrm-`: Master Service (points to current master via label selector) +- `rfrs-`: Slave Service (points to replicas) -If necessary, this command can be changed with the `command` option inside redis/sentinel spec. An example can be found in the [custom command example file](example/redisfailover/custom-command.yaml). +**No Sentinel pods are created.** -### Custom Priority Class -In order to use a custom Kubernetes [Priority Class](https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/#priorityclass) for Redis and/or Sentinel pods, you can set the `priorityClassName` in the redis/sentinel spec, this attribute has no default and depends on the specific cluster configuration. **Note:** the operator doesn't create the referenced `Priority Class` resource. +The operator handles failover by: +1. Detecting master failure via health checks +2. Selecting the replica with highest replication offset +3. Promoting it to master (`SLAVEOF NO ONE`) +4. 
Reconfiguring other replicas to follow the new master +5. Updating pod labels so Services route correctly -### Custom Service Account -In order to use a custom Kubernetes [Service Account](https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/) for Redis and/or Sentinel pods, you can set the `serviceAccountName` in the redis/sentinel spec, if not specified the `default` Service Account will be used. **Note:** the operator doesn't create the referenced `Service Account` resource. +### Instance Manager CLI -### Custom Pod Annotations -By default, no pod annotations will be applied to Redis nor Sentinel pods. +The `redis-instance` binary provides the following commands: -In order to apply custom pod Annotations, you can provide the `podAnnotations` option inside redis/sentinel spec. An example can be found in the [custom annotations example file](example/redisfailover/custom-annotations.yaml). -### Custom Service Annotations -By default, no service annotations will be applied to the Redis nor Sentinel services. +```bash +# Run as instance manager (PID 1 mode) +redis-instance run --redis-conf /redis/redis.conf --data-dir /data --db-filename dump.rdb -In order to apply custom service Annotations, you can provide the `serviceAnnotations` option inside redis/sentinel spec. An example can be found in the [custom annotations example file](example/redisfailover/custom-annotations.yaml). +# Standalone cleanup (removes stale RDB files) +redis-instance cleanup --data-dir /data --db-filename dump.rdb -### Control of label propagation. -By default the operator will propagate all labels on the CRD down to the resources that it creates. This can be problematic if the -labels on the CRD are not fully under your own control (for example: being deployed by a gitops operator) -as a change to a labels value can fail on immutable resources such as PodDisruptionBudgets. 
To control what labels the operator propagates -to resource is creates you can modify the labelWhitelist option in the spec. +# Dry-run cleanup (show what would be removed) +redis-instance cleanup --data-dir /data --dry-run +``` -By default specifying no whitelist or an empty whitelist will cause all labels to still be copied as not to break backwards compatibility. +### Connection -Items in the array should be regular expressions, see [here](example/redisfailover/control-label-propagation.yaml) as an example of how they can be used and -[here](https://github.com/google/re2/wiki/Syntax) for a syntax reference. +**With Sentinel (default):** -The whitelist can also be used as a form of blacklist by specifying a regular expression that will not match any label. +Connect using a [Sentinel-ready client library](https://redis.io/topics/sentinel-clients): -NOTE: The operator will always add the labels it requires for operation to resources. These are the following: ``` -app.kubernetes.io/component -app.kubernetes.io/managed-by -app.kubernetes.io/name -app.kubernetes.io/part-of -redisfailovers.databases.spotahome.com/name +url: rfs- +port: 26379 +master-name: mymaster ``` +**Without Sentinel (`sentinel.enabled: false`):** + +Connect directly to the master service: + +``` +url: rfrm- +port: 6379 +``` -### ExtraVolumes and ExtraVolumeMounts +The master service automatically routes to the current master pod. -If the user choose to have extra volumes creates and mounted, he could use the `extraVolumes` and `extraVolumeMounts`, in `spec.redis` of the CRD. This allows users to mount the extra configurations, or secrets to be used. 
A typical use case for this might be -- Secrets that sidecars might use to backup of RDBs -- Extra users and their secrets and acls that could used the initContainers to create multiple users -- Extra Configurations that could merge on top the existing configurations -- To pass failover scripts for addition for additional operations +### Enable Authentication +```bash +kubectl create secret generic redis-auth --from-literal=password=your-password ``` ---- -apiVersion: v1 -kind: Secret -metadata: - name: foo - namespace: exm -type: Opaque -stringData: - password: MWYyZDFlMmU2N2Rm ---- + +```yaml apiVersion: databases.spotahome.com/v1 kind: RedisFailover metadata: - name: foo - namespace: exm + name: my-redis spec: - sentinel: - replicas: 3 - extraVolumes: - - name: foo - secret: - secretName: foo - optional: false - extraVolumeMounts: - - name: foo - mountPath: "/etc/foo" - readOnly: true redis: replicas: 3 - extraVolumes: - - name: foo - secret: - secretName: foo - optional: false - extraVolumeMounts: - - name: foo - mountPath: "/etc/foo" - readOnly: true + sentinel: + replicas: 3 + auth: + secretPath: redis-auth ``` +### Persistence +Enable persistent storage with a PVC: -## Connection to the created Redis Failovers +```yaml +spec: + redis: + storage: + persistentVolumeClaim: + metadata: + name: redis-data + spec: + accessModes: [ReadWriteOnce] + resources: + requests: + storage: 10Gi + keepAfterDeletion: true # Optional: retain PVCs when RedisFailover is deleted +``` -In order to connect to the redis-failover and use it, a [Sentinel-ready](https://redis.io/topics/sentinel-clients) library has to be used. This will connect through the Sentinel service to the Redis node working as a master. -The connection parameters are the following: +See [persistent-storage.yaml](example/redisfailover/persistent-storage.yaml) for a complete example. 
-``` -url: rfs- -port: 26379 -master-name: mymaster +### Custom Configuration + +Configure Redis and Sentinel via `customConfig`: + +```yaml +spec: + redis: + customConfig: + - maxmemory 2gb + - maxmemory-policy allkeys-lru + sentinel: + customConfig: + - down-after-milliseconds 5000 ``` -### Enabling redis auth +**Note:** Configuration is applied via `CONFIG SET` at runtime. Do not modify control options like `port`, `bind`, or `dir`. -To enable auth create a secret with a password field: +### Affinity and Tolerations -``` -echo -n "pass" > password -kubectl create secret generic redis-auth --from-file=password +- [Node Affinity](example/redisfailover/node-affinity.yaml) +- [Pod Anti-Affinity](example/redisfailover/pod-anti-affinity.yaml) +- [Tolerations](example/redisfailover/tolerations.yaml) +- [Topology Spread Constraints](example/redisfailover/topology-spread-contraints.yaml) -## example config -apiVersion: databases.spotahome.com/v1 -kind: RedisFailover -metadata: - name: redisfailover +### Security Context + +- [Pod Security Context](example/redisfailover/security-context.yaml) +- [Container Security Context](example/redisfailover/container-security-context.yaml) + +### Bootstrapping + +Migrate from an existing Redis instance: + +```yaml spec: - sentinel: - replicas: 3 - redis: - replicas: 1 - auth: - secretPath: redis-auth + bootstrapNode: + host: existing-redis.example.com + port: "6379" + allowSentinels: false # Set true to also create Sentinels pointing to bootstrap node ``` -You need to set secretPath as the secret name which is created before. -### Bootstrapping from pre-existing Redis Instance(s) -If you are wanting to migrate off of a pre-existing Redis instance, you can provide a `bootstrapNode` to your `RedisFailover` resource spec. +See [bootstrapping.yaml](example/redisfailover/bootstrapping.yaml) for details. 
-This `bootstrapNode` can be configured as follows: -| Key | Type | Description | Example File | -|:--------------:|--------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------| -| host | **required** | The IP of the target Redis address or the ClusterIP of a pre-existing Kubernetes Service targeting Redis pods | [bootstrapping.yaml](example/redisfailover/bootstrapping.yaml) | -| port | _optional_ | The Port that the target Redis address is listening to. Defaults to `6379`. | [bootstrapping-with-port.yaml](example/redisfailover/bootstrapping-with-port.yaml) | -| allowSentinels | _optional_ | Allow the Operator to also create the specified Sentinel resources and point them to the target Node/Port. By default, the Sentinel resources will **not** be created when bootstrapping. | [bootstrapping-with-sentinels.yaml](example/redisfailover/bootstrapping-with-sentinels.yaml) | +## CI/CD -#### What is Bootstrapping? -When a `bootstrapNode` is provided, the Operator will always set all of the defined Redis instances to replicate from the provided `bootstrapNode` host value. -This allows for defining a `RedisFailover` that replicates from an existing Redis instance to ease cutover from one instance to another. +This project includes comprehensive GitHub Actions workflows: -**Note: Redis instance will always be configured with `replica-priority 0`. 
This means that these Redis instances can _never_ be promoted to a `master`.** +| Workflow | Triggers | Description | +|----------|----------|-------------| +| CI | Push, PR | Build, lint, unit tests, integration tests, Docker build | +| E2E | PR | Full end-to-end tests in minikube cluster | +| Release | Tags | Multi-arch image build and push to GHCR | -Depending on the configuration provided, the Operator will launch the `RedisFailover` in two bootstrapping states: without sentinels and with sentinels. +### E2E Tests -#### Default Bootstrapping Mode (Without Sentinels) -By default, if the `RedisFailover` resource defines a valid `bootstrapNode`, **only the redis instances will be created**. -This allows for ease of bootstrapping from an existing `RedisFailover` instance without the Sentinels intermingling with each other. +The E2E workflow validates: +- Instance manager runs as PID 1 +- RDB cleanup works on pod restart +- Redis remains functional after restart +- Sentinel-free mode: no Sentinel resources created +- Sentinel-free mode: operator-managed failover works -#### Bootstrapping With Sentinels -When `allowSentinels` is provided, the Operator will also create the defined Sentinel resources. These sentinels will be configured to point to the provided -`bootstrapNode` as their monitored master. +## Development -### Default versions +### Generate CRD -The image versions deployed by the operator can be found on the [defaults file](api/redisfailover/v1/defaults.go). -## Cleanup +Requires [controller-gen](https://github.com/kubernetes-sigs/controller-tools) v0.20.0+ for Go 1.25+: + +```bash +go install sigs.k8s.io/controller-tools/cmd/controller-gen@latest +make generate-crd +``` -### Operator and CRD +### Run Tests -If you want to delete the operator from your Kubernetes cluster, the operator deployment should be deleted. +```bash +make ci-unit-test +make ci-integration-test +``` -Also, the CRD has to be deleted. 
Deleting CRD automatically wil delete all redis failover custom resources and their managed resources: +### Build Docker Image +```bash +make image ``` + +## Cleanup + +### Remove Operator + +```bash +helm uninstall redis-operator kubectl delete crd redisfailovers.databases.spotahome.com ``` -### Single Redis Failover +**Warning:** Deleting the CRD removes all RedisFailover resources and their managed objects. -Thanks to Kubernetes' `OwnerReference`, all the objects created from a redis-failover will be deleted after the custom resource is. +### Remove Single RedisFailover -``` +```bash kubectl delete redisfailover ``` +All managed resources are automatically cleaned up via OwnerReferences. + ## Docker Images -### Redis Operator +Images are published to GitHub Container Registry: -* [Redis Operator Image](https://github.com/Saremox/redis-operator/pkgs/container/redis-operator) +- **Operator & Instance Manager**: `ghcr.io/buildio/redis-operator` +- **Helm Chart**: `oci://ghcr.io/buildio/redis-operator/charts/redisoperator` ## Documentation -For the code documentation, you can lookup on the [GoDoc](https://godoc.org/github.com/Saremox/redis-operator). - -Also, you can check more deeply information on the [docs folder](docs). +- [API Reference](docs/) +- [Examples](example/) +- [GoDoc](https://godoc.org/github.com/buildio/redis-operator) diff --git a/api/redisfailover/v1/defaults.go b/api/redisfailover/v1/defaults.go index 18fd9699e..63e6773ac 100644 --- a/api/redisfailover/v1/defaults.go +++ b/api/redisfailover/v1/defaults.go @@ -15,6 +15,20 @@ const ( defaultRedisPort = 6379 HealthyState = "Healthy" NotHealthyState = "NotHealthy" + + // DefaultInstanceManagerImage is the default image used for the instance manager. + // This image contains the redis-instance binary that runs as PID 1. + // Users can override this per-RedisFailover via spec.redis.instanceManagerImage. 
+ DefaultInstanceManagerImage = "ghcr.io/buildio/redis-operator:4.0.0" +) + +var ( + // DefaultSentinelEnabled is the default value for sentinel.enabled + // Starting with 4.0.0, sentinel is DISABLED by default (operator-managed failover) + // Set sentinel.enabled: true to use Redis Sentinel for failover + DefaultSentinelEnabled = false + // DefaultFailoverTimeout is the default timeout for operator-managed failover + DefaultFailoverTimeout = metav1.Duration{Duration: 10 * time.Second} ) var ( diff --git a/api/redisfailover/v1/types.go b/api/redisfailover/v1/types.go index df8303f35..3c76490f7 100644 --- a/api/redisfailover/v1/types.go +++ b/api/redisfailover/v1/types.go @@ -40,6 +40,12 @@ type RedisCommandRename struct { type RedisSettings struct { Image string `json:"image,omitempty"` ImagePullPolicy corev1.PullPolicy `json:"imagePullPolicy,omitempty"` + // InstanceManagerImage is the image containing the redis-instance binary. + // When specified, the instance manager runs as PID 1 and manages Redis, + // following the CloudNativePG model. This enables RDB cleanup on startup + // and proper signal handling for graceful shutdown. 
+ // See: https://cloudnative-pg.io/documentation/current/instance_manager/ + InstanceManagerImage string `json:"instanceManagerImage,omitempty"` Replicas int32 `json:"replicas,omitempty"` Port int32 `json:"port,omitempty"` Resources corev1.ResourceRequirements `json:"resources,omitempty"` diff --git a/charts/redisoperator/Chart.yaml b/charts/redisoperator/Chart.yaml index c67ca3c33..e143fa3b5 100644 --- a/charts/redisoperator/Chart.yaml +++ b/charts/redisoperator/Chart.yaml @@ -1,7 +1,9 @@ apiVersion: v1 name: redis-operator -version: 4.0.0-rc0 -appVersion: "4.0.0-rc0" +version: 4.0.0 +# appVersion matches the Docker image tag (no leading "v") +# Chart version, appVersion, and image tag are all the same: 4.0.0 +appVersion: "4.0.0" description: A Helm chart for the Redis Operator home: https://github.com/saremox/redis-operator keywords: diff --git a/charts/redisoperator/crds/databases.spotahome.com_redisfailovers.yaml b/charts/redisoperator/crds/databases.spotahome.com_redisfailovers.yaml index ca358b1f6..2a1a06077 100644 --- a/charts/redisoperator/crds/databases.spotahome.com_redisfailovers.yaml +++ b/charts/redisoperator/crds/databases.spotahome.com_redisfailovers.yaml @@ -6820,6 +6820,14 @@ spec: - name type: object type: array + instanceManagerImage: + description: |- + InstanceManagerImage is the image containing the redis-instance binary. + When specified, the instance manager runs as PID 1 and manages Redis, + following the CloudNativePG model. This enables RDB cleanup on startup + and proper signal handling for graceful shutdown. + See: https://cloudnative-pg.io/documentation/current/instance_manager/ + type: string nodeSelector: additionalProperties: type: string diff --git a/charts/redisoperator/values.yaml b/charts/redisoperator/values.yaml index 2c233f9cf..363ad4a41 100644 --- a/charts/redisoperator/values.yaml +++ b/charts/redisoperator/values.yaml @@ -4,8 +4,10 @@ # Name of the image repository to pull the container image from. 
image: - repository: ghcr.io/saremox/redis-operator + repository: ghcr.io/buildio/redis-operator pullPolicy: IfNotPresent + # Image tag. If not specified, defaults to Chart appVersion (e.g., "4.0.0"). + # Note: Starting with 4.0.0, tags do NOT have a leading "v" (4.0.0, not v4.0.0). tag: "" cli_args: "" diff --git a/cmd/instance/cleanup/cmd.go b/cmd/instance/cleanup/cmd.go new file mode 100644 index 000000000..61a5d8d74 --- /dev/null +++ b/cmd/instance/cleanup/cmd.go @@ -0,0 +1,121 @@ +package cleanup + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/spf13/cobra" +) + +const ( + defaultDataDir = "/data" + defaultDBFilename = "dump.rdb" +) + +var ( + dataDir string + dbFilename string + dryRun bool +) + +// NewCmd creates the cleanup command +func NewCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "cleanup", + Short: "Clean up stale RDB tempfiles", + Long: `Clean up stale RDB tempfiles before Redis starts. + +During BGSAVE operations, Redis creates temporary files named temp-.rdb. +If Redis crashes during a BGSAVE, these files are left behind and can accumulate, +eventually filling the disk and causing further failures. 
+ +This command removes all .rdb files except the main database file (default: dump.rdb) +from the data directory.`, + RunE: runCleanup, + } + + cmd.Flags().StringVar(&dataDir, "data-dir", defaultDataDir, "Redis data directory") + cmd.Flags().StringVar(&dbFilename, "db-filename", defaultDBFilename, "Main RDB filename to preserve") + cmd.Flags().BoolVar(&dryRun, "dry-run", false, "Print files that would be deleted without deleting them") + + return cmd +} + +func runCleanup(cmd *cobra.Command, args []string) error { + // Validate data directory exists + info, err := os.Stat(dataDir) + if err != nil { + if os.IsNotExist(err) { + // Data directory doesn't exist yet, nothing to clean + fmt.Printf("Data directory %s does not exist, skipping cleanup\n", dataDir) + return nil + } + return fmt.Errorf("failed to stat data directory: %w", err) + } + if !info.IsDir() { + return fmt.Errorf("%s is not a directory", dataDir) + } + + // Find and remove stale RDB files + entries, err := os.ReadDir(dataDir) + if err != nil { + return fmt.Errorf("failed to read data directory: %w", err) + } + + var cleaned int + var totalSize int64 + + for _, entry := range entries { + if entry.IsDir() { + continue + } + + name := entry.Name() + + // Skip non-RDB files + if !strings.HasSuffix(name, ".rdb") { + continue + } + + // Preserve the main database file + if name == dbFilename { + continue + } + + filePath := filepath.Join(dataDir, name) + + // Get file size for reporting + fileInfo, err := entry.Info() + if err != nil { + fmt.Printf("Warning: failed to get info for %s: %v\n", name, err) + continue + } + + if dryRun { + fmt.Printf("Would delete: %s (%d bytes)\n", filePath, fileInfo.Size()) + } else { + if err := os.Remove(filePath); err != nil { + fmt.Printf("Warning: failed to remove %s: %v\n", filePath, err) + continue + } + fmt.Printf("Deleted: %s (%d bytes)\n", filePath, fileInfo.Size()) + } + + cleaned++ + totalSize += fileInfo.Size() + } + + if cleaned > 0 { + action := "Deleted" + if 
dryRun { + action = "Would delete" + } + fmt.Printf("%s %d stale RDB file(s), freed %d bytes\n", action, cleaned, totalSize) + } else { + fmt.Println("No stale RDB files found") + } + + return nil +} diff --git a/cmd/instance/cleanup/cmd_test.go b/cmd/instance/cleanup/cmd_test.go new file mode 100644 index 000000000..5e85cf61d --- /dev/null +++ b/cmd/instance/cleanup/cmd_test.go @@ -0,0 +1,173 @@ +package cleanup + +import ( + "os" + "path/filepath" + "testing" +) + +func TestRunCleanup(t *testing.T) { + tests := []struct { + name string + files []string + expectedKept []string + expectedRemove []string + dbFilename string + }{ + { + name: "removes temp rdb files", + files: []string{"dump.rdb", "temp-1234.rdb", "temp-5678.rdb"}, + expectedKept: []string{"dump.rdb"}, + expectedRemove: []string{"temp-1234.rdb", "temp-5678.rdb"}, + dbFilename: "dump.rdb", + }, + { + name: "preserves non-rdb files", + files: []string{"dump.rdb", "temp-1234.rdb", "appendonly.aof", "nodes.conf"}, + expectedKept: []string{"dump.rdb", "appendonly.aof", "nodes.conf"}, + expectedRemove: []string{"temp-1234.rdb"}, + dbFilename: "dump.rdb", + }, + { + name: "handles custom db filename", + files: []string{"custom.rdb", "dump.rdb", "temp-1234.rdb"}, + expectedKept: []string{"custom.rdb"}, + expectedRemove: []string{"dump.rdb", "temp-1234.rdb"}, + dbFilename: "custom.rdb", + }, + { + name: "handles empty directory", + files: []string{}, + expectedKept: []string{}, + expectedRemove: []string{}, + dbFilename: "dump.rdb", + }, + { + name: "handles only main db file", + files: []string{"dump.rdb"}, + expectedKept: []string{"dump.rdb"}, + expectedRemove: []string{}, + dbFilename: "dump.rdb", + }, + { + name: "removes all rdb variants except main", + files: []string{"dump.rdb", "backup.rdb", "old.rdb", "temp-123.rdb"}, + expectedKept: []string{"dump.rdb"}, + expectedRemove: []string{"backup.rdb", "old.rdb", "temp-123.rdb"}, + dbFilename: "dump.rdb", + }, + } + + for _, tt := range tests { + 
t.Run(tt.name, func(t *testing.T) { + // Create temp directory + tmpDir, err := os.MkdirTemp("", "redis-cleanup-test") + if err != nil { + t.Fatalf("failed to create temp dir: %v", err) + } + defer func() { _ = os.RemoveAll(tmpDir) }() + + // Create test files + for _, f := range tt.files { + filePath := filepath.Join(tmpDir, f) + if err := os.WriteFile(filePath, []byte("test content"), 0644); err != nil { + t.Fatalf("failed to create test file %s: %v", f, err) + } + } + + // Set package variables for the test + dataDir = tmpDir + dbFilename = tt.dbFilename + dryRun = false + + // Run cleanup + if err := runCleanup(nil, nil); err != nil { + t.Fatalf("runCleanup failed: %v", err) + } + + // Verify expected files are kept + for _, f := range tt.expectedKept { + filePath := filepath.Join(tmpDir, f) + if _, err := os.Stat(filePath); os.IsNotExist(err) { + t.Errorf("expected file %s to be kept, but it was removed", f) + } + } + + // Verify expected files are removed + for _, f := range tt.expectedRemove { + filePath := filepath.Join(tmpDir, f) + if _, err := os.Stat(filePath); !os.IsNotExist(err) { + t.Errorf("expected file %s to be removed, but it still exists", f) + } + } + }) + } +} + +func TestRunCleanupDryRun(t *testing.T) { + // Create temp directory + tmpDir, err := os.MkdirTemp("", "redis-cleanup-test") + if err != nil { + t.Fatalf("failed to create temp dir: %v", err) + } + defer func() { _ = os.RemoveAll(tmpDir) }() + + // Create test files + files := []string{"dump.rdb", "temp-1234.rdb"} + for _, f := range files { + filePath := filepath.Join(tmpDir, f) + if err := os.WriteFile(filePath, []byte("test content"), 0644); err != nil { + t.Fatalf("failed to create test file %s: %v", f, err) + } + } + + // Set package variables for the test + dataDir = tmpDir + dbFilename = "dump.rdb" + dryRun = true + + // Run cleanup + if err := runCleanup(nil, nil); err != nil { + t.Fatalf("runCleanup failed: %v", err) + } + + // In dry-run mode, all files should still exist + 
for _, f := range files { + filePath := filepath.Join(tmpDir, f) + if _, err := os.Stat(filePath); os.IsNotExist(err) { + t.Errorf("dry-run should not remove files, but %s was removed", f) + } + } +} + +func TestRunCleanupNonExistentDir(t *testing.T) { + // Set package variables for the test + dataDir = "/nonexistent/path/that/does/not/exist" + dbFilename = "dump.rdb" + dryRun = false + + // Run cleanup - should not error, just skip + if err := runCleanup(nil, nil); err != nil { + t.Fatalf("runCleanup should not fail for non-existent dir: %v", err) + } +} + +func TestRunCleanupNotADirectory(t *testing.T) { + // Create a temp file (not a directory) + tmpFile, err := os.CreateTemp("", "redis-cleanup-test") + if err != nil { + t.Fatalf("failed to create temp file: %v", err) + } + defer func() { _ = os.Remove(tmpFile.Name()) }() + _ = tmpFile.Close() + + // Set package variables for the test + dataDir = tmpFile.Name() + dbFilename = "dump.rdb" + dryRun = false + + // Run cleanup - should error because it's not a directory + if err := runCleanup(nil, nil); err == nil { + t.Fatal("runCleanup should fail when dataDir is not a directory") + } +} diff --git a/cmd/instance/main.go b/cmd/instance/main.go new file mode 100644 index 000000000..e25de61d1 --- /dev/null +++ b/cmd/instance/main.go @@ -0,0 +1,50 @@ +// Package main provides the redis-instance binary, which serves as the instance +// manager for Redis pods managed by redis-operator. +// +// This follows the CloudNativePG (CNPG) model where the instance manager runs as +// PID 1 and manages the database process. This architecture has proven reliable +// at scale in production Kubernetes environments. 
+// +// See: https://cloudnative-pg.io/documentation/current/instance_manager/ +package main + +import ( + "fmt" + "os" + + "github.com/spf13/cobra" + + "github.com/saremox/redis-operator/cmd/instance/cleanup" + "github.com/saremox/redis-operator/cmd/instance/run" +) + +var rootCmd = &cobra.Command{ + Use: "redis-instance", + Short: "Redis instance manager for redis-operator", + Long: `Redis instance manager handles lifecycle operations for Redis instances +managed by redis-operator. + +This tool follows the CloudNativePG (CNPG) model where the instance manager +runs as PID 1 in the container and manages the Redis process as a child. +This architecture provides: + + - Full lifecycle control over the Redis process + - Clean signal handling and graceful shutdown + - Startup tasks (RDB cleanup) before Redis starts + - Foundation for health checks, metrics, and monitoring + +See: https://cloudnative-pg.io/documentation/current/instance_manager/`, + SilenceUsage: true, +} + +func init() { + rootCmd.AddCommand(cleanup.NewCmd()) + rootCmd.AddCommand(run.NewCmd()) +} + +func main() { + if err := rootCmd.Execute(); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } +} diff --git a/cmd/instance/run/cmd.go b/cmd/instance/run/cmd.go new file mode 100644 index 000000000..cee1d7011 --- /dev/null +++ b/cmd/instance/run/cmd.go @@ -0,0 +1,344 @@ +// Package run implements the instance manager run command. +// +// This follows the CloudNativePG (CNPG) model where the instance manager runs as +// PID 1 and manages the database process as a child. This architecture has proven +// reliable at scale in production Kubernetes environments. 
+// +// Key features (learned from CNPG): +// - Full lifecycle control over the Redis process +// - Clean signal handling with graceful shutdown and timeout escalation +// - Zombie process reaper (required when running as PID 1) +// - Startup tasks (RDB cleanup) before Redis starts +// - Process restart capability for unexpected crashes +// +// See: https://cloudnative-pg.io/documentation/current/instance_manager/ +package run + +import ( + "context" + "fmt" + "os" + "os/exec" + "os/signal" + "path/filepath" + "strings" + "syscall" + "time" + + "github.com/spf13/cobra" +) + +// healthServer is the global health server instance +var healthServer *HealthServer + +const ( + defaultDataDir = "/data" + defaultDBFilename = "dump.rdb" + defaultRedisConf = "/redis/redis.conf" + defaultRedisCommand = "redis-server" + + // Shutdown timeouts (following CNPG pattern) + // These provide escalation from graceful to forced shutdown + gracefulShutdownTimeout = 25 * time.Second // Time for SIGTERM before SIGKILL + maxShutdownTimeout = 30 * time.Second // Total shutdown budget (matches K8s terminationGracePeriodSeconds) +) + +var ( + dataDir string + dbFilename string + redisConf string + healthPort int + redisPort string +) + +// NewCmd creates the run command +func NewCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "run", + Short: "Run Redis with instance management", + Long: `Run Redis server with full lifecycle management. + +This command implements the instance manager pattern (similar to CloudNativePG) +where the manager runs as PID 1 and manages Redis as a child process. + +On startup: + 1. Cleans up stale RDB tempfiles to prevent disk exhaustion + 2. Starts redis-server as a child process + 3. Reaps zombie processes (required for PID 1) + 4. 
Forwards signals to Redis for graceful shutdown + +Shutdown behavior (CNPG model): + - SIGTERM: Initiate graceful shutdown with timeout + - If Redis doesn't exit within timeout, escalate to SIGKILL + - Proper cleanup even under crash conditions + +This architecture provides: + - Clean signal handling and graceful shutdown + - Startup tasks before Redis begins accepting connections + - Foundation for health checks, metrics, and other lifecycle features`, + RunE: runInstance, + } + + cmd.Flags().StringVar(&dataDir, "data-dir", defaultDataDir, "Redis data directory") + cmd.Flags().StringVar(&dbFilename, "db-filename", defaultDBFilename, "Main RDB filename to preserve during cleanup") + cmd.Flags().StringVar(&redisConf, "redis-conf", defaultRedisConf, "Path to redis.conf") + cmd.Flags().IntVar(&healthPort, "health-port", defaultHealthPort, "Port for health check endpoints") + cmd.Flags().StringVar(&redisPort, "redis-port", "6379", "Redis port for health checks") + + return cmd +} + +func runInstance(cmd *cobra.Command, args []string) error { + fmt.Println("redis-instance: starting instance manager (CNPG-style)") + fmt.Printf("redis-instance: PID %d running as process manager\n", os.Getpid()) + + // Create a context that we can cancel on shutdown + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Step 1: Start zombie process reaper (CNPG pattern) + // As PID 1, we're responsible for reaping orphaned child processes + go runZombieReaper(ctx) + + // Step 2: Perform startup cleanup + cleanupErr := performStartupCleanup() + if cleanupErr != nil { + // Log but don't fail - Redis should still be able to start + fmt.Printf("redis-instance: warning: startup cleanup failed: %v\n", cleanupErr) + } + + // Step 3: Start health server (provides /healthz, /readyz, /status) + redisPassword := os.Getenv("REDIS_PASSWORD") + healthServer = NewHealthServer(healthPort, redisPort, redisPassword) + healthServer.SetCleanupDone(cleanupErr == nil) + if err := 
healthServer.Start(ctx); err != nil { + fmt.Printf("redis-instance: warning: failed to start health server: %v\n", err) + } + defer func() { + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second) + defer shutdownCancel() + if err := healthServer.Stop(shutdownCtx); err != nil { + fmt.Printf("redis-instance: warning: health server stop error: %v\n", err) + } + }() + + // Step 4: Main process loop (CNPG pattern) + // This loop allows for process restarts without manager exit + return runProcessLoop(ctx, cancel) +} + +// runProcessLoop manages the Redis process lifecycle with restart capability. +// Following CNPG pattern, this allows recovery from unexpected crashes. +func runProcessLoop(ctx context.Context, cancel context.CancelFunc) error { + // Set up signal handling + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGTERM, syscall.SIGINT, syscall.SIGQUIT) + defer signal.Stop(sigChan) + + for { + // Start Redis as a child process + redisCmd := exec.CommandContext(ctx, defaultRedisCommand, redisConf) + redisCmd.Stdout = os.Stdout + redisCmd.Stderr = os.Stderr + redisCmd.Stdin = os.Stdin + + fmt.Printf("redis-instance: starting redis-server with config %s\n", redisConf) + if err := redisCmd.Start(); err != nil { + return fmt.Errorf("failed to start redis-server: %w", err) + } + + redisPid := redisCmd.Process.Pid + fmt.Printf("redis-instance: redis-server started with PID %d\n", redisPid) + + // Notify health server of Redis PID + if healthServer != nil { + healthServer.SetRedisPID(redisPid) + } + + // Wait for either Redis to exit or a signal + doneChan := make(chan error, 1) + go func() { + doneChan <- redisCmd.Wait() + }() + + select { + case sig := <-sigChan: + fmt.Printf("redis-instance: received signal %v, initiating graceful shutdown\n", sig) + return shutdownRedis(redisCmd, doneChan) + + case <-ctx.Done(): + fmt.Println("redis-instance: context cancelled, initiating shutdown") + return 
shutdownRedis(redisCmd, doneChan) + + case err := <-doneChan: + if err != nil { + // Redis exited unexpectedly + fmt.Printf("redis-instance: redis-server (PID %d) exited unexpectedly: %v\n", redisPid, err) + + // Check if this is a context cancellation (we're shutting down) + if ctx.Err() != nil { + return nil + } + + // For now, exit on unexpected crash + // Future: could implement restart with backoff + return fmt.Errorf("redis-server exited unexpectedly: %w", err) + } + // Clean exit + fmt.Printf("redis-instance: redis-server (PID %d) exited cleanly\n", redisPid) + return nil + } + } +} + +// shutdownRedis handles graceful shutdown with timeout escalation (CNPG pattern). +// First sends SIGTERM, then escalates to SIGKILL if Redis doesn't exit in time. +func shutdownRedis(cmd *exec.Cmd, doneChan <-chan error) error { + if cmd.Process == nil { + return nil + } + + pid := cmd.Process.Pid + + // Step 1: Send SIGTERM for graceful shutdown + fmt.Printf("redis-instance: sending SIGTERM to redis-server (PID %d)\n", pid) + if err := cmd.Process.Signal(syscall.SIGTERM); err != nil { + fmt.Printf("redis-instance: warning: failed to send SIGTERM: %v\n", err) + } + + // Step 2: Wait for graceful shutdown with timeout + gracefulTimer := time.NewTimer(gracefulShutdownTimeout) + defer gracefulTimer.Stop() + + select { + case err := <-doneChan: + if err != nil { + fmt.Printf("redis-instance: redis-server exited with error during shutdown: %v\n", err) + } else { + fmt.Println("redis-instance: redis-server exited gracefully") + } + return nil + + case <-gracefulTimer.C: + // Graceful shutdown timeout - escalate to SIGKILL + fmt.Printf("redis-instance: graceful shutdown timeout (%v), sending SIGKILL\n", gracefulShutdownTimeout) + if err := cmd.Process.Kill(); err != nil { + fmt.Printf("redis-instance: warning: failed to send SIGKILL: %v\n", err) + } + + // Wait for process to actually exit + maxTimer := time.NewTimer(maxShutdownTimeout - gracefulShutdownTimeout) + defer 
maxTimer.Stop() + + select { + case <-doneChan: + fmt.Println("redis-instance: redis-server terminated after SIGKILL") + return nil + case <-maxTimer.C: + return fmt.Errorf("redis-server (PID %d) did not exit after SIGKILL", pid) + } + } +} + +// runZombieReaper handles SIGCHLD signals to reap orphaned child processes. +// This is essential when running as PID 1 in a container (CNPG pattern). +// +// When Redis forks (e.g., for BGSAVE or BGREWRITEAOF), those child processes +// become orphans when they exit. As PID 1, we must reap them to prevent +// zombie process accumulation. +func runZombieReaper(ctx context.Context) { + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGCHLD) + defer signal.Stop(sigChan) + + for { + select { + case <-ctx.Done(): + return + case <-sigChan: + // Reap all zombie children + for { + var status syscall.WaitStatus + pid, err := syscall.Wait4(-1, &status, syscall.WNOHANG, nil) + if pid <= 0 || err != nil { + break + } + // Log only if it's not the main Redis process (which we handle separately) + fmt.Printf("redis-instance: reaped zombie process PID %d (status: %d)\n", pid, status.ExitStatus()) + } + } + } +} + +// performStartupCleanup removes stale RDB tempfiles before Redis starts. +// During BGSAVE, Redis creates temp-.rdb files that can accumulate if +// Redis crashes repeatedly, eventually filling the disk. 
+func performStartupCleanup() error { + fmt.Printf("redis-instance: performing startup cleanup in %s\n", dataDir) + + // Check if data directory exists + info, err := os.Stat(dataDir) + if err != nil { + if os.IsNotExist(err) { + fmt.Printf("redis-instance: data directory %s does not exist yet, skipping cleanup\n", dataDir) + return nil + } + return fmt.Errorf("failed to stat data directory: %w", err) + } + if !info.IsDir() { + return fmt.Errorf("%s is not a directory", dataDir) + } + + // Find and remove stale RDB files + entries, err := os.ReadDir(dataDir) + if err != nil { + return fmt.Errorf("failed to read data directory: %w", err) + } + + var cleaned int + var totalSize int64 + + for _, entry := range entries { + if entry.IsDir() { + continue + } + + name := entry.Name() + + // Skip non-RDB files + if !strings.HasSuffix(name, ".rdb") { + continue + } + + // Preserve the main database file + if name == dbFilename { + continue + } + + filePath := filepath.Join(dataDir, name) + + // Get file size for reporting + fileInfo, err := entry.Info() + if err != nil { + fmt.Printf("redis-instance: warning: failed to get info for %s: %v\n", name, err) + continue + } + + if err := os.Remove(filePath); err != nil { + fmt.Printf("redis-instance: warning: failed to remove %s: %v\n", filePath, err) + continue + } + + fmt.Printf("redis-instance: removed stale RDB file %s (%d bytes)\n", name, fileInfo.Size()) + cleaned++ + totalSize += fileInfo.Size() + } + + if cleaned > 0 { + fmt.Printf("redis-instance: cleaned up %d stale RDB file(s), freed %d bytes\n", cleaned, totalSize) + } else { + fmt.Println("redis-instance: no stale RDB files found") + } + + return nil +} diff --git a/cmd/instance/run/health.go b/cmd/instance/run/health.go new file mode 100644 index 000000000..242349e9b --- /dev/null +++ b/cmd/instance/run/health.go @@ -0,0 +1,426 @@ +// Package run implements the instance manager run command. 
+package run + +import ( + "context" + "encoding/json" + "fmt" + "net" + "net/http" + "sync" + "sync/atomic" + "time" + + "github.com/go-redis/redis/v8" +) + +const ( + defaultHealthPort = 8080 + healthCheckInterval = time.Second + redisConnectTimeout = 2 * time.Second + redisCommandTimeout = time.Second +) + +// HealthServer provides HTTP health endpoints for the instance manager. +// It maintains a persistent connection to Redis and caches health status +// to minimize load on Redis. +type HealthServer struct { + port int + redisAddr string + redisPort string + redisPassword string + server *http.Server + client *redis.Client + + // Cached status (updated every healthCheckInterval) + mu sync.RWMutex + lastCheck time.Time + cachedInfo map[string]string + redisPid int + startTime time.Time + cleanupDone bool + + // Atomic flags + redisHealthy atomic.Bool + redisReady atomic.Bool +} + +// HealthResponse is the response for /healthz endpoint +type HealthResponse struct { + Status string `json:"status"` + RedisPID int `json:"redis_pid,omitempty"` + UptimeSeconds int64 `json:"uptime_seconds"` + Error string `json:"error,omitempty"` +} + +// ReadyResponse is the response for /readyz endpoint +type ReadyResponse struct { + Status string `json:"status"` + Role string `json:"role,omitempty"` + ConnectedClients int `json:"connected_clients,omitempty"` + Loading bool `json:"loading"` + MasterSyncInProgress bool `json:"master_sync_in_progress,omitempty"` + Error string `json:"error,omitempty"` +} + +// StatusResponse is the detailed response for /status endpoint +type StatusResponse struct { + Redis RedisStatus `json:"redis"` + Replication ReplicationStatus `json:"replication"` + InstanceManager InstanceManagerStatus `json:"instance_manager"` +} + +// RedisStatus contains Redis server status +type RedisStatus struct { + PID int `json:"pid"` + Role string `json:"role"` + ConnectedClients int `json:"connected_clients"` + UsedMemory string `json:"used_memory"` + UsedMemoryHuman 
string `json:"used_memory_human"` + Loading bool `json:"loading"` + RDBBgsaveInProgress bool `json:"rdb_bgsave_in_progress"` + AOFRewriteInProgress bool `json:"aof_rewrite_in_progress"` +} + +// ReplicationStatus contains replication information +type ReplicationStatus struct { + Role string `json:"role"` + ConnectedSlaves int `json:"connected_slaves,omitempty"` + MasterHost string `json:"master_host,omitempty"` + MasterPort int `json:"master_port,omitempty"` + MasterLinkStatus string `json:"master_link_status,omitempty"` + MasterSyncInProgress bool `json:"master_sync_in_progress,omitempty"` + SlaveReplOffset int64 `json:"slave_repl_offset,omitempty"` + MasterReplOffset int64 `json:"master_repl_offset,omitempty"` +} + +// InstanceManagerStatus contains instance manager status +type InstanceManagerStatus struct { + Version string `json:"version"` + UptimeSeconds int64 `json:"uptime_seconds"` + StartupCleanupDone bool `json:"startup_cleanup_done"` + HealthPort int `json:"health_port"` +} + +// NewHealthServer creates a new health server +func NewHealthServer(port int, redisPort string, redisPassword string) *HealthServer { + return &HealthServer{ + port: port, + redisAddr: "127.0.0.1", + redisPort: redisPort, + redisPassword: redisPassword, + startTime: time.Now(), + cachedInfo: make(map[string]string), + } +} + +// SetRedisPID updates the Redis process ID +func (h *HealthServer) SetRedisPID(pid int) { + h.mu.Lock() + defer h.mu.Unlock() + h.redisPid = pid +} + +// SetCleanupDone marks startup cleanup as complete +func (h *HealthServer) SetCleanupDone(done bool) { + h.mu.Lock() + defer h.mu.Unlock() + h.cleanupDone = done +} + +// Start begins the health server and background health checker +func (h *HealthServer) Start(ctx context.Context) error { + // Create Redis client + h.client = redis.NewClient(&redis.Options{ + Addr: net.JoinHostPort(h.redisAddr, h.redisPort), + Password: h.redisPassword, + DialTimeout: redisConnectTimeout, + ReadTimeout: redisCommandTimeout, 
+ WriteTimeout: redisCommandTimeout, + }) + + // Set up HTTP routes + mux := http.NewServeMux() + mux.HandleFunc("/healthz", h.handleHealthz) + mux.HandleFunc("/readyz", h.handleReadyz) + mux.HandleFunc("/status", h.handleStatus) + + h.server = &http.Server{ + Addr: fmt.Sprintf(":%d", h.port), + Handler: mux, + ReadTimeout: 5 * time.Second, + WriteTimeout: 5 * time.Second, + } + + // Start background health checker + go h.runHealthChecker(ctx) + + // Start HTTP server + fmt.Printf("redis-instance: starting health server on port %d\n", h.port) + go func() { + if err := h.server.ListenAndServe(); err != nil && err != http.ErrServerClosed { + fmt.Printf("redis-instance: health server error: %v\n", err) + } + }() + + return nil +} + +// Stop gracefully stops the health server +func (h *HealthServer) Stop(ctx context.Context) error { + if h.server != nil { + if err := h.server.Shutdown(ctx); err != nil { + return fmt.Errorf("health server shutdown error: %w", err) + } + } + if h.client != nil { + if err := h.client.Close(); err != nil { + return fmt.Errorf("redis client close error: %w", err) + } + } + return nil +} + +// runHealthChecker periodically checks Redis health and caches the results +func (h *HealthServer) runHealthChecker(ctx context.Context) { + ticker := time.NewTicker(healthCheckInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + h.updateHealthStatus(ctx) + } + } +} + +// updateHealthStatus checks Redis and updates cached status +func (h *HealthServer) updateHealthStatus(ctx context.Context) { + checkCtx, cancel := context.WithTimeout(ctx, redisCommandTimeout) + defer cancel() + + // Try to ping Redis + if err := h.client.Ping(checkCtx).Err(); err != nil { + h.redisHealthy.Store(false) + h.redisReady.Store(false) + return + } + h.redisHealthy.Store(true) + + // Get INFO for detailed status + info, err := h.client.Info(checkCtx).Result() + if err != nil { + h.redisReady.Store(false) + return + } + + // 
Parse INFO output + h.mu.Lock() + h.cachedInfo = parseRedisInfo(info) + h.lastCheck = time.Now() + h.mu.Unlock() + + // Determine readiness + // Not ready if: loading, syncing from master, master link down, or no real master + loading := h.cachedInfo["loading"] == "1" + syncInProgress := h.cachedInfo["master_sync_in_progress"] == "1" + masterLinkDown := h.cachedInfo["master_link_status"] == "down" + // master_host:127.0.0.1 means no real master configured (replica pointing to itself) + noMasterConfigured := h.cachedInfo["master_host"] == "127.0.0.1" + + // Replica is not ready if syncing, master link is down, or no real master + isReplica := h.cachedInfo["role"] == "slave" + + ready := !loading + if isReplica { + ready = ready && !syncInProgress && !masterLinkDown && !noMasterConfigured + } + + h.redisReady.Store(ready) +} + +// handleHealthz handles the /healthz liveness endpoint +func (h *HealthServer) handleHealthz(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + h.mu.RLock() + pid := h.redisPid + h.mu.RUnlock() + + uptime := int64(time.Since(h.startTime).Seconds()) + + resp := HealthResponse{ + RedisPID: pid, + UptimeSeconds: uptime, + } + + if h.redisHealthy.Load() { + resp.Status = "ok" + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + } else { + resp.Status = "unhealthy" + resp.Error = "redis not responding to PING" + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusServiceUnavailable) + } + + _ = json.NewEncoder(w).Encode(resp) +} + +// handleReadyz handles the /readyz readiness endpoint +func (h *HealthServer) handleReadyz(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + h.mu.RLock() + info := h.cachedInfo + h.mu.RUnlock() + + resp := ReadyResponse{ + Role: info["role"], + 
Loading: info["loading"] == "1", + MasterSyncInProgress: info["master_sync_in_progress"] == "1", + } + + if clients, ok := info["connected_clients"]; ok { + _, _ = fmt.Sscanf(clients, "%d", &resp.ConnectedClients) + } + + if h.redisReady.Load() { + resp.Status = "ok" + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + } else { + resp.Status = "not ready" + if resp.Loading { + resp.Error = "redis is loading data" + } else if resp.MasterSyncInProgress { + resp.Error = "replica sync in progress" + } else if info["master_link_status"] == "down" { + resp.Error = "master link is down" + } else if info["master_host"] == "127.0.0.1" { + resp.Error = "no master configured (replica pointing to self)" + } else if !h.redisHealthy.Load() { + resp.Error = "redis not responding" + } + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusServiceUnavailable) + } + + _ = json.NewEncoder(w).Encode(resp) +} + +// handleStatus handles the /status detailed status endpoint +func (h *HealthServer) handleStatus(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + h.mu.RLock() + info := h.cachedInfo + pid := h.redisPid + cleanupDone := h.cleanupDone + h.mu.RUnlock() + + resp := StatusResponse{ + Redis: RedisStatus{ + PID: pid, + Role: info["role"], + UsedMemory: info["used_memory"], + UsedMemoryHuman: info["used_memory_human"], + Loading: info["loading"] == "1", + RDBBgsaveInProgress: info["rdb_bgsave_in_progress"] == "1", + AOFRewriteInProgress: info["aof_rewrite_in_progress"] == "1", + }, + Replication: ReplicationStatus{ + Role: info["role"], + MasterHost: info["master_host"], + MasterLinkStatus: info["master_link_status"], + MasterSyncInProgress: info["master_sync_in_progress"] == "1", + }, + InstanceManager: InstanceManagerStatus{ + Version: "4.0.0", + UptimeSeconds: int64(time.Since(h.startTime).Seconds()), + 
StartupCleanupDone: cleanupDone, + HealthPort: h.port, + }, + } + + // Parse integer fields + _, _ = fmt.Sscanf(info["connected_clients"], "%d", &resp.Redis.ConnectedClients) + _, _ = fmt.Sscanf(info["connected_slaves"], "%d", &resp.Replication.ConnectedSlaves) + _, _ = fmt.Sscanf(info["master_port"], "%d", &resp.Replication.MasterPort) + _, _ = fmt.Sscanf(info["slave_repl_offset"], "%d", &resp.Replication.SlaveReplOffset) + _, _ = fmt.Sscanf(info["master_repl_offset"], "%d", &resp.Replication.MasterReplOffset) + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(resp) +} + +// parseRedisInfo parses Redis INFO command output into a map +func parseRedisInfo(info string) map[string]string { + result := make(map[string]string) + lines := splitLines(info) + + for _, line := range lines { + // Skip comments and empty lines + if len(line) == 0 || line[0] == '#' { + continue + } + + // Parse key:value + idx := indexByte(line, ':') + if idx > 0 { + key := line[:idx] + value := line[idx+1:] + result[key] = value + } + } + + return result +} + +// splitLines splits a string by newlines +func splitLines(s string) []string { + var lines []string + start := 0 + for i := 0; i < len(s); i++ { + if s[i] == '\n' { + line := s[start:i] + // Remove trailing \r if present + if len(line) > 0 && line[len(line)-1] == '\r' { + line = line[:len(line)-1] + } + lines = append(lines, line) + start = i + 1 + } + } + if start < len(s) { + line := s[start:] + if len(line) > 0 && line[len(line)-1] == '\r' { + line = line[:len(line)-1] + } + lines = append(lines, line) + } + return lines +} + +// indexByte returns the index of the first occurrence of c in s, or -1 +func indexByte(s string, c byte) int { + for i := 0; i < len(s); i++ { + if s[i] == c { + return i + } + } + return -1 +} diff --git a/cmd/instance/run/health_test.go b/cmd/instance/run/health_test.go new file mode 100644 index 000000000..463299555 --- /dev/null +++ 
b/cmd/instance/run/health_test.go @@ -0,0 +1,386 @@ +package run + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestParseRedisInfo(t *testing.T) { + info := `# Server +redis_version:7.0.0 +redis_git_sha1:00000000 +process_id:1234 + +# Clients +connected_clients:5 + +# Memory +used_memory:1024000 +used_memory_human:1000K + +# Replication +role:master +connected_slaves:2 + +# Persistence +loading:0 +rdb_bgsave_in_progress:0 +aof_rewrite_in_progress:0 +` + + result := parseRedisInfo(info) + + assert.Equal(t, "7.0.0", result["redis_version"]) + assert.Equal(t, "1234", result["process_id"]) + assert.Equal(t, "5", result["connected_clients"]) + assert.Equal(t, "1024000", result["used_memory"]) + assert.Equal(t, "1000K", result["used_memory_human"]) + assert.Equal(t, "master", result["role"]) + assert.Equal(t, "2", result["connected_slaves"]) + assert.Equal(t, "0", result["loading"]) + assert.Equal(t, "0", result["rdb_bgsave_in_progress"]) +} + +func TestParseRedisInfoReplica(t *testing.T) { + info := `# Replication +role:slave +master_host:10.0.0.1 +master_port:6379 +master_link_status:up +master_sync_in_progress:0 +slave_repl_offset:12345 +` + + result := parseRedisInfo(info) + + assert.Equal(t, "slave", result["role"]) + assert.Equal(t, "10.0.0.1", result["master_host"]) + assert.Equal(t, "6379", result["master_port"]) + assert.Equal(t, "up", result["master_link_status"]) + assert.Equal(t, "0", result["master_sync_in_progress"]) + assert.Equal(t, "12345", result["slave_repl_offset"]) +} + +func TestParseRedisInfoLoading(t *testing.T) { + info := `# Persistence +loading:1 +loading_total_bytes:1000000 +loading_loaded_bytes:500000 +` + + result := parseRedisInfo(info) + + assert.Equal(t, "1", result["loading"]) + assert.Equal(t, "1000000", result["loading_total_bytes"]) + assert.Equal(t, "500000", result["loading_loaded_bytes"]) +} 
+ +func TestHealthServerHealthzHealthy(t *testing.T) { + h := NewHealthServer(8080, "6379", "") + h.startTime = time.Now().Add(-60 * time.Second) // 60 seconds ago + h.SetRedisPID(1234) + h.redisHealthy.Store(true) + + req := httptest.NewRequest(http.MethodGet, "/healthz", nil) + w := httptest.NewRecorder() + + h.handleHealthz(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + + var resp HealthResponse + err := json.Unmarshal(w.Body.Bytes(), &resp) + require.NoError(t, err) + + assert.Equal(t, "ok", resp.Status) + assert.Equal(t, 1234, resp.RedisPID) + assert.GreaterOrEqual(t, resp.UptimeSeconds, int64(60)) +} + +func TestHealthServerHealthzUnhealthy(t *testing.T) { + h := NewHealthServer(8080, "6379", "") + h.redisHealthy.Store(false) + + req := httptest.NewRequest(http.MethodGet, "/healthz", nil) + w := httptest.NewRecorder() + + h.handleHealthz(w, req) + + assert.Equal(t, http.StatusServiceUnavailable, w.Code) + + var resp HealthResponse + err := json.Unmarshal(w.Body.Bytes(), &resp) + require.NoError(t, err) + + assert.Equal(t, "unhealthy", resp.Status) + assert.Contains(t, resp.Error, "not responding") +} + +func TestHealthServerReadyzReady(t *testing.T) { + h := NewHealthServer(8080, "6379", "") + h.redisReady.Store(true) + h.redisHealthy.Store(true) + + h.mu.Lock() + h.cachedInfo = map[string]string{ + "role": "master", + "connected_clients": "10", + "loading": "0", + } + h.mu.Unlock() + + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + w := httptest.NewRecorder() + + h.handleReadyz(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + + var resp ReadyResponse + err := json.Unmarshal(w.Body.Bytes(), &resp) + require.NoError(t, err) + + assert.Equal(t, "ok", resp.Status) + assert.Equal(t, "master", resp.Role) + assert.Equal(t, 10, resp.ConnectedClients) + assert.False(t, resp.Loading) +} + +func TestHealthServerReadyzNotReadyLoading(t *testing.T) { + h := NewHealthServer(8080, "6379", "") + h.redisReady.Store(false) + h.redisHealthy.Store(true) 
+ + h.mu.Lock() + h.cachedInfo = map[string]string{ + "role": "master", + "loading": "1", + } + h.mu.Unlock() + + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + w := httptest.NewRecorder() + + h.handleReadyz(w, req) + + assert.Equal(t, http.StatusServiceUnavailable, w.Code) + + var resp ReadyResponse + err := json.Unmarshal(w.Body.Bytes(), &resp) + require.NoError(t, err) + + assert.Equal(t, "not ready", resp.Status) + assert.True(t, resp.Loading) + assert.Contains(t, resp.Error, "loading") +} + +func TestHealthServerReadyzNotReadySyncing(t *testing.T) { + h := NewHealthServer(8080, "6379", "") + h.redisReady.Store(false) + h.redisHealthy.Store(true) + + h.mu.Lock() + h.cachedInfo = map[string]string{ + "role": "slave", + "loading": "0", + "master_sync_in_progress": "1", + "master_link_status": "up", + } + h.mu.Unlock() + + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + w := httptest.NewRecorder() + + h.handleReadyz(w, req) + + assert.Equal(t, http.StatusServiceUnavailable, w.Code) + + var resp ReadyResponse + err := json.Unmarshal(w.Body.Bytes(), &resp) + require.NoError(t, err) + + assert.Equal(t, "not ready", resp.Status) + assert.True(t, resp.MasterSyncInProgress) + assert.Contains(t, resp.Error, "sync") +} + +func TestHealthServerReadyzNotReadyMasterLinkDown(t *testing.T) { + h := NewHealthServer(8080, "6379", "") + h.redisReady.Store(false) + h.redisHealthy.Store(true) + + h.mu.Lock() + h.cachedInfo = map[string]string{ + "role": "slave", + "loading": "0", + "master_sync_in_progress": "0", + "master_link_status": "down", + } + h.mu.Unlock() + + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + w := httptest.NewRecorder() + + h.handleReadyz(w, req) + + assert.Equal(t, http.StatusServiceUnavailable, w.Code) + + var resp ReadyResponse + err := json.Unmarshal(w.Body.Bytes(), &resp) + require.NoError(t, err) + + assert.Contains(t, resp.Error, "master link") +} + +func TestHealthServerReadyzNotReadyNoMasterConfigured(t 
*testing.T) { + h := NewHealthServer(8080, "6379", "") + h.redisReady.Store(false) + h.redisHealthy.Store(true) + + h.mu.Lock() + h.cachedInfo = map[string]string{ + "role": "slave", + "loading": "0", + "master_sync_in_progress": "0", + "master_link_status": "up", + "master_host": "127.0.0.1", // No real master, pointing to self + } + h.mu.Unlock() + + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + w := httptest.NewRecorder() + + h.handleReadyz(w, req) + + assert.Equal(t, http.StatusServiceUnavailable, w.Code) + + var resp ReadyResponse + err := json.Unmarshal(w.Body.Bytes(), &resp) + require.NoError(t, err) + + assert.Contains(t, resp.Error, "no master configured") +} + +func TestHealthServerStatus(t *testing.T) { + h := NewHealthServer(8080, "6379", "") + h.startTime = time.Now().Add(-120 * time.Second) + h.SetRedisPID(5678) + h.SetCleanupDone(true) + + h.mu.Lock() + h.cachedInfo = map[string]string{ + "role": "master", + "connected_clients": "15", + "used_memory": "2048000", + "used_memory_human": "2M", + "loading": "0", + "rdb_bgsave_in_progress": "0", + "aof_rewrite_in_progress": "0", + "connected_slaves": "2", + "master_repl_offset": "99999", + } + h.mu.Unlock() + + req := httptest.NewRequest(http.MethodGet, "/status", nil) + w := httptest.NewRecorder() + + h.handleStatus(w, req) + + assert.Equal(t, http.StatusOK, w.Code) + + var resp StatusResponse + err := json.Unmarshal(w.Body.Bytes(), &resp) + require.NoError(t, err) + + // Redis status + assert.Equal(t, 5678, resp.Redis.PID) + assert.Equal(t, "master", resp.Redis.Role) + assert.Equal(t, 15, resp.Redis.ConnectedClients) + assert.Equal(t, "2M", resp.Redis.UsedMemoryHuman) + assert.False(t, resp.Redis.Loading) + assert.False(t, resp.Redis.RDBBgsaveInProgress) + + // Replication status + assert.Equal(t, "master", resp.Replication.Role) + assert.Equal(t, 2, resp.Replication.ConnectedSlaves) + assert.Equal(t, int64(99999), resp.Replication.MasterReplOffset) + + // Instance manager status + 
assert.Equal(t, "4.0.0", resp.InstanceManager.Version) + assert.GreaterOrEqual(t, resp.InstanceManager.UptimeSeconds, int64(120)) + assert.True(t, resp.InstanceManager.StartupCleanupDone) + assert.Equal(t, 8080, resp.InstanceManager.HealthPort) +} + +func TestHealthServerMethodNotAllowed(t *testing.T) { + h := NewHealthServer(8080, "6379", "") + + endpoints := []string{"/healthz", "/readyz", "/status"} + methods := []string{http.MethodPost, http.MethodPut, http.MethodDelete} + + for _, endpoint := range endpoints { + for _, method := range methods { + req := httptest.NewRequest(method, endpoint, nil) + w := httptest.NewRecorder() + + switch endpoint { + case "/healthz": + h.handleHealthz(w, req) + case "/readyz": + h.handleReadyz(w, req) + case "/status": + h.handleStatus(w, req) + } + + assert.Equal(t, http.StatusMethodNotAllowed, w.Code, + "expected 405 for %s %s", method, endpoint) + } + } +} + +func TestSplitLines(t *testing.T) { + tests := []struct { + input string + expected []string + }{ + {"a\nb\nc", []string{"a", "b", "c"}}, + {"a\r\nb\r\nc", []string{"a", "b", "c"}}, + {"single", []string{"single"}}, + {"a\nb", []string{"a", "b"}}, + } + + for _, tt := range tests { + result := splitLines(tt.input) + assert.Equal(t, tt.expected, result, "input: %q", tt.input) + } +} + +func TestIndexByte(t *testing.T) { + assert.Equal(t, 3, indexByte("foo:bar", ':')) + assert.Equal(t, -1, indexByte("foobar", ':')) + assert.Equal(t, 0, indexByte(":foo", ':')) +} + +func TestHealthServerStartStop(t *testing.T) { + h := NewHealthServer(0, "6379", "") // Port 0 = random available port + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Start should not error (even without Redis) + err := h.Start(ctx) + assert.NoError(t, err) + + // Stop should not error + stopCtx, stopCancel := context.WithTimeout(context.Background(), 5*time.Second) + defer stopCancel() + err = h.Stop(stopCtx) + assert.NoError(t, err) +} diff --git a/cmd/utils/flags.go 
b/cmd/utils/flags.go index be83c4fb9..de1ac681c 100644 --- a/cmd/utils/flags.go +++ b/cmd/utils/flags.go @@ -23,6 +23,7 @@ type CMDFlags struct { Concurrency int SyncInterval int LogLevel string + InstanceManagerImage string } // Init initializes and parse the flags @@ -41,6 +42,7 @@ func (c *CMDFlags) Init() { flag.IntVar(&c.Concurrency, "concurrency", 3, "Number of conccurent workers meant to process events") flag.IntVar(&c.SyncInterval, "sync-interval", 30, "Number of seconds between checks") flag.StringVar(&c.LogLevel, "log-level", "info", "set log level") + flag.StringVar(&c.InstanceManagerImage, "instance-manager-image", "", "Image containing the redis-instance binary for init containers (empty uses the operator's default image)") // Parse flags flag.Parse() @@ -57,5 +59,6 @@ func (c *CMDFlags) ToRedisOperatorConfig() redisfailover.Config { Concurrency: c.Concurrency, SyncInterval: c.SyncInterval, SupportedNamespacesRegex: c.SupportedNamespacesRegex, + InstanceManagerImage: c.InstanceManagerImage, } } diff --git a/docker/app/Dockerfile index da6c81332..05e29fd14 100644 --- a/docker/app/Dockerfile +++ b/docker/app/Dockerfile @@ -14,9 +14,10 @@ FROM alpine:latest RUN apk --no-cache add \ ca-certificates COPY --from=build /src/bin/redis-operator /usr/local/bin +COPY --from=build /src/bin/redis-instance /usr/local/bin RUN addgroup -g 1000 rf && \ adduser -D -u 1000 -G rf rf && \ - chown rf:rf /usr/local/bin/redis-operator + chown rf:rf /usr/local/bin/redis-operator /usr/local/bin/redis-instance USER rf ENTRYPOINT ["/usr/local/bin/redis-operator"] diff --git a/example/operator/all-redis-operator-resources.yaml index 27b71a7c2..55e38c98a 100644 --- a/example/operator/all-redis-operator-resources.yaml +++ b/example/operator/all-redis-operator-resources.yaml @@ -19,7 +19,7 @@ spec: serviceAccountName: redisoperator enableServiceLinks: false containers: - - image: 
ghcr.io/saremox/redis-operator:v1.4.0 + - image: ghcr.io/buildio/redis-operator:4.0.0 imagePullPolicy: IfNotPresent name: app securityContext: diff --git a/go.mod b/go.mod index 266057538..d4a639dac 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/go-redis/redis/v8 v8.11.5 github.com/prometheus/client_golang v1.23.2 github.com/sirupsen/logrus v1.9.4 + github.com/spf13/cobra v1.10.2 github.com/spotahome/kooper/v2 v2.9.0 github.com/stretchr/testify v1.11.1 k8s.io/api v0.33.1 @@ -29,6 +30,7 @@ require ( github.com/google/gnostic-models v0.6.9 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/uuid v1.6.0 // indirect + github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.9.0 // indirect @@ -40,7 +42,7 @@ require ( github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.16.1 // indirect - github.com/spf13/pflag v1.0.6 // indirect + github.com/spf13/pflag v1.0.9 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/x448/float16 v0.8.4 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect diff --git a/go.sum b/go.sum index c2d43750f..6eb5707b3 100644 --- a/go.sum +++ b/go.sum @@ -2,6 +2,7 @@ github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= @@ -38,6 +39,8 @@ github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db h1:097atOisP2aRj7vFgY github.com/google/pprof v0.0.0-20241029153458-d1b30febd7db/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= +github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= @@ -84,10 +87,13 @@ github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzM github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w= github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g= -github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= -github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU= +github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4= +github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= 
+github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spotahome/kooper/v2 v2.9.0 h1:Iwk2eAZbp0M0Z4OZYkt12c7ENhyYe8byB0mtDvVYjq4= github.com/spotahome/kooper/v2 v2.9.0/go.mod h1:im2PUUOGti/fXq0tUZaowG6cWJvgzS9BlX4ipz34c/E= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= @@ -104,6 +110,7 @@ go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= diff --git a/manifests/databases.spotahome.com_redisfailovers.yaml b/manifests/databases.spotahome.com_redisfailovers.yaml index ca358b1f6..2a1a06077 100644 --- a/manifests/databases.spotahome.com_redisfailovers.yaml +++ b/manifests/databases.spotahome.com_redisfailovers.yaml @@ -6820,6 +6820,14 @@ spec: - name type: object type: array + instanceManagerImage: + description: |- + InstanceManagerImage is the image containing the redis-instance binary. + When specified, the instance manager runs as PID 1 and manages Redis, + following the CloudNativePG model. This enables RDB cleanup on startup + and proper signal handling for graceful shutdown. 
+ See: https://cloudnative-pg.io/documentation/current/instance_manager/ + type: string nodeSelector: additionalProperties: type: string diff --git a/manifests/kustomize/base/databases.spotahome.com_redisfailovers.yaml b/manifests/kustomize/base/databases.spotahome.com_redisfailovers.yaml index ca358b1f6..2a1a06077 100644 --- a/manifests/kustomize/base/databases.spotahome.com_redisfailovers.yaml +++ b/manifests/kustomize/base/databases.spotahome.com_redisfailovers.yaml @@ -6820,6 +6820,14 @@ spec: - name type: object type: array + instanceManagerImage: + description: |- + InstanceManagerImage is the image containing the redis-instance binary. + When specified, the instance manager runs as PID 1 and manages Redis, + following the CloudNativePG model. This enables RDB cleanup on startup + and proper signal handling for graceful shutdown. + See: https://cloudnative-pg.io/documentation/current/instance_manager/ + type: string nodeSelector: additionalProperties: type: string diff --git a/manifests/kustomize/base/deployment.yaml b/manifests/kustomize/base/deployment.yaml index d2431db32..2b6e4f0f5 100644 --- a/manifests/kustomize/base/deployment.yaml +++ b/manifests/kustomize/base/deployment.yaml @@ -11,7 +11,7 @@ spec: enableServiceLinks: false containers: - name: redis-operator - image: ghcr.io/saremox/redis-operator:v1.4.0 + image: ghcr.io/buildio/redis-operator:4.0.0 imagePullPolicy: IfNotPresent securityContext: readOnlyRootFilesystem: true diff --git a/operator/redisfailover/config.go b/operator/redisfailover/config.go index d384f81ac..2e1fd0126 100644 --- a/operator/redisfailover/config.go +++ b/operator/redisfailover/config.go @@ -7,4 +7,7 @@ type Config struct { Concurrency int SyncInterval int SupportedNamespacesRegex string + // InstanceManagerImage is the image used for Redis instance management init containers. + // This should be the same image as the operator, which contains the redis-instance binary. 
+ InstanceManagerImage string } diff --git a/operator/redisfailover/service/generator.go b/operator/redisfailover/service/generator.go index fed34e52a..f01676892 100644 --- a/operator/redisfailover/service/generator.go +++ b/operator/redisfailover/service/generator.go @@ -42,6 +42,15 @@ sentinel parallel-syncs mymaster 2` redisStorageVolumeName = "redis-data" sentinelStartupConfigurationVolumeName = "sentinel-startup-config" + // Instance manager volume and paths (follows CNPG model) + // See: https://cloudnative-pg.io/documentation/current/instance_manager/ + instanceManagerVolumeName = "instance-manager" + instanceManagerMountPath = "/controller" + instanceManagerBinaryName = "redis-instance" + instanceManagerHealthPort = 8080 + instanceManagerHealthzPath = "/healthz" + instanceManagerReadyzPath = "/readyz" + graceTime = 30 ) @@ -248,7 +257,11 @@ func generateRedisShutdownConfigMap(rf *redisfailoverv1.RedisFailover, labels ma rfName := strings.ReplaceAll(strings.ToUpper(rf.Name), "-", "_") labels = util.MergeLabels(labels, generateSelectorLabels(redisRoleName, rf.Name)) - shutdownContent := fmt.Sprintf(`master=$(redis-cli -h ${RFS_%[1]v_SERVICE_HOST} -p ${RFS_%[1]v_SERVICE_PORT_SENTINEL} --csv SENTINEL get-master-addr-by-name mymaster | tr ',' ' ' | tr -d '\"' |cut -d' ' -f1) + + var shutdownContent string + if rf.SentinelEnabled() { + // Standard shutdown with Sentinel failover + shutdownContent = fmt.Sprintf(`master=$(redis-cli -h ${RFS_%[1]v_SERVICE_HOST} -p ${RFS_%[1]v_SERVICE_PORT_SENTINEL} --csv SENTINEL get-master-addr-by-name mymaster | tr ',' ' ' | tr -d '\"' |cut -d' ' -f1) if [ "$master" = "$(hostname -i)" ]; then redis-cli -h ${RFS_%[1]v_SERVICE_HOST} -p ${RFS_%[1]v_SERVICE_PORT_SENTINEL} SENTINEL failover mymaster sleep 31 @@ -259,6 +272,16 @@ if [ ! 
-z "${REDIS_PASSWORD}" ]; then fi save_command="${cmd} save" eval $save_command`, rfName, port) + } else { + // Operator-managed mode: just save data, no Sentinel failover + // The operator will handle master election during the next reconcile + shutdownContent = fmt.Sprintf(`cmd="redis-cli -p %[1]v" +if [ ! -z "${REDIS_PASSWORD}" ]; then + export REDISCLI_AUTH=${REDIS_PASSWORD} +fi +save_command="${cmd} save" +eval $save_command`, port) + } return &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ @@ -381,6 +404,7 @@ func generateRedisStatefulSet(rf *redisfailoverv1.RedisFailover, labels map[stri PriorityClassName: rf.Spec.Redis.PriorityClassName, ServiceAccountName: rf.Spec.Redis.ServiceAccountName, TerminationGracePeriodSeconds: &terminationGracePeriodSeconds, + EnableServiceLinks: ptrBool(false), Containers: []corev1.Container{ { Name: "redis", @@ -436,21 +460,35 @@ func generateRedisStatefulSet(rf *redisfailoverv1.RedisFailover, labels map[stri } } + // Add health port when instance manager is enabled + if instanceManagerEnabled(rf) { + ss.Spec.Template.Spec.Containers[0].Ports = append( + ss.Spec.Template.Spec.Containers[0].Ports, + corev1.ContainerPort{ + Name: "health", + ContainerPort: instanceManagerHealthPort, + Protocol: corev1.ProtocolTCP, + }, + ) + + // Note: REDIS_PASSWORD env var is added by getRedisEnv() when auth is configured. + // The instance manager health server reads this env var to authenticate to Redis. 
+ } + if rf.Spec.Redis.CustomLivenessProbe != nil { ss.Spec.Template.Spec.Containers[0].LivenessProbe = rf.Spec.Redis.CustomLivenessProbe } else { + // HTTP probe via instance manager (CNPG model) + // This avoids process spawning and works better under memory pressure ss.Spec.Template.Spec.Containers[0].LivenessProbe = &corev1.Probe{ InitialDelaySeconds: graceTime, TimeoutSeconds: 5, FailureThreshold: 6, PeriodSeconds: 15, ProbeHandler: corev1.ProbeHandler{ - Exec: &corev1.ExecAction{ - Command: []string{ - "sh", - "-c", - fmt.Sprintf("redis-cli -h $(hostname) -p %[1]v --user pinger --pass pingpass --no-auth-warning ping | grep PONG", rf.Spec.Redis.Port), - }, + HTTPGet: &corev1.HTTPGetAction{ + Path: instanceManagerHealthzPath, + Port: intstr.FromInt32(instanceManagerHealthPort), }, }, } @@ -459,12 +497,17 @@ func generateRedisStatefulSet(rf *redisfailoverv1.RedisFailover, labels map[stri if rf.Spec.Redis.CustomReadinessProbe != nil { ss.Spec.Template.Spec.Containers[0].ReadinessProbe = rf.Spec.Redis.CustomReadinessProbe } else { + // HTTP probe via instance manager (CNPG model) + // The /readyz endpoint checks: not loading, not syncing, master link up ss.Spec.Template.Spec.Containers[0].ReadinessProbe = &corev1.Probe{ InitialDelaySeconds: graceTime, TimeoutSeconds: 5, + PeriodSeconds: 10, + FailureThreshold: 3, ProbeHandler: corev1.ProbeHandler{ - Exec: &corev1.ExecAction{ - Command: []string{"/bin/sh", "/redis-readiness/ready.sh"}, + HTTPGet: &corev1.HTTPGetAction{ + Path: instanceManagerReadyzPath, + Port: intstr.FromInt32(instanceManagerHealthPort), }, }, } @@ -491,6 +534,14 @@ func generateRedisStatefulSet(rf *redisfailoverv1.RedisFailover, labels map[stri ss.Spec.Template.Spec.Containers = append(ss.Spec.Template.Spec.Containers, exporter) } + // Add instance manager init container if enabled (CNPG model) + // This must come before user-provided init containers so the binary is + // available for any custom init logic that might need it. 
+ if instanceManagerEnabled(rf) { + initContainer := createInstanceManagerInitContainer(rf) + ss.Spec.Template.Spec.InitContainers = append(ss.Spec.Template.Spec.InitContainers, initContainer) + } + if rf.Spec.Redis.InitContainers != nil { initContainers := getInitContainersWithRedisEnv(rf) ss.Spec.Template.Spec.InitContainers = append(ss.Spec.Template.Spec.InitContainers, initContainers...) @@ -547,6 +598,7 @@ func generateSentinelDeployment(rf *redisfailoverv1.RedisFailover, labels map[st ImagePullSecrets: rf.Spec.Sentinel.ImagePullSecrets, PriorityClassName: rf.Spec.Sentinel.PriorityClassName, ServiceAccountName: rf.Spec.Sentinel.ServiceAccountName, + EnableServiceLinks: ptrBool(false), InitContainers: []corev1.Container{ { Name: "sentinel-config-copy", @@ -875,6 +927,14 @@ func getRedisVolumeMounts(rf *redisfailoverv1.RedisFailover) []corev1.VolumeMoun }, } + // Add instance manager volume mount if enabled (CNPG model) + if instanceManagerEnabled(rf) { + volumeMounts = append(volumeMounts, corev1.VolumeMount{ + Name: instanceManagerVolumeName, + MountPath: instanceManagerMountPath, + }) + } + if rf.Spec.Redis.StartupConfigMap != "" { startupVolumeMount := corev1.VolumeMount{ Name: redisStartupConfigurationVolumeName, @@ -954,6 +1014,18 @@ func getRedisVolumes(rf *redisfailoverv1.RedisFailover) []corev1.Volume { }, } + // Add instance manager volume if enabled (CNPG model) + // This emptyDir volume is used to share the redis-instance binary + // between the init container and the main container. 
+ if instanceManagerEnabled(rf) { + volumes = append(volumes, corev1.Volume{ + Name: instanceManagerVolumeName, + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }) + } + if rf.Spec.Redis.StartupConfigMap != "" { startupVolumeName := rf.Spec.Redis.StartupConfigMap startupVolume := corev1.Volume{ @@ -1065,9 +1137,63 @@ func getRedisCommand(rf *redisfailoverv1.RedisFailover) []string { if len(rf.Spec.Redis.Command) > 0 { return rf.Spec.Redis.Command } + // Instance manager runs as PID 1 (CNPG model) + // This is required in v4.0.0+ - legacy mode is removed return []string{ - "redis-server", - fmt.Sprintf("/redis/%s", redisConfigFileName), + fmt.Sprintf("%s/%s", instanceManagerMountPath, instanceManagerBinaryName), + "run", + "--redis-conf", fmt.Sprintf("/redis/%s", redisConfigFileName), + } +} + +// instanceManagerEnabled returns true if the instance manager should be used. +// Since v4.0.0, instance manager is always enabled - legacy exec probes are removed. +// The instance manager follows the CloudNativePG model where it runs as PID 1 +// and manages the Redis process as a child. +// See: https://cloudnative-pg.io/documentation/current/instance_manager/ +func instanceManagerEnabled(rf *redisfailoverv1.RedisFailover) bool { + // Always enabled in v4.0.0+. Legacy exec probes are removed. + return true +} + +// getInstanceManagerImage returns the instance manager image to use. +// If not specified in the CRD, uses the default operator image. +func getInstanceManagerImage(rf *redisfailoverv1.RedisFailover) string { + if rf.Spec.Redis.InstanceManagerImage != "" { + return rf.Spec.Redis.InstanceManagerImage + } + return redisfailoverv1.DefaultInstanceManagerImage +} + +// createInstanceManagerInitContainer creates an init container that copies the +// redis-instance binary to a shared volume. This follows the CNPG model. 
+func createInstanceManagerInitContainer(rf *redisfailoverv1.RedisFailover) corev1.Container { + return corev1.Container{ + Name: "instance-manager-init", + Image: getInstanceManagerImage(rf), + ImagePullPolicy: pullPolicy(rf.Spec.Redis.ImagePullPolicy), + Command: []string{ + "cp", + fmt.Sprintf("/usr/local/bin/%s", instanceManagerBinaryName), + fmt.Sprintf("%s/%s", instanceManagerMountPath, instanceManagerBinaryName), + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: instanceManagerVolumeName, + MountPath: instanceManagerMountPath, + }, + }, + // Use minimal resources for the copy operation + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("10m"), + corev1.ResourceMemory: resource.MustParse("16Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("64Mi"), + }, + }, } } @@ -1089,6 +1215,12 @@ func pullPolicy(specPolicy corev1.PullPolicy) corev1.PullPolicy { return specPolicy } +// ptrBool returns a pointer to a bool value. +// Used for optional PodSpec fields like EnableServiceLinks. 
+func ptrBool(b bool) *bool { + return &b +} + func getTerminationGracePeriodSeconds(rf *redisfailoverv1.RedisFailover) int64 { if rf.Spec.Redis.TerminationGracePeriodSeconds > 0 { return rf.Spec.Redis.TerminationGracePeriodSeconds diff --git a/operator/redisfailover/service/generator_test.go b/operator/redisfailover/service/generator_test.go index 328091af5..5cd69fa07 100644 --- a/operator/redisfailover/service/generator_test.go +++ b/operator/redisfailover/service/generator_test.go @@ -55,6 +55,10 @@ func TestRedisStatefulSetStorageGeneration(t *testing.T) { Name: "redis-data", MountPath: "/data", }, + { + Name: "instance-manager", + MountPath: "/controller", + }, }, }, }, @@ -91,6 +95,12 @@ func TestRedisStatefulSetStorageGeneration(t *testing.T) { }, }, }, + { + Name: "instance-manager", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, { Name: "redis-data", VolumeSource: corev1.VolumeSource{ @@ -129,6 +139,10 @@ func TestRedisStatefulSetStorageGeneration(t *testing.T) { Name: "redis-data", MountPath: "/data", }, + { + Name: "instance-manager", + MountPath: "/controller", + }, }, }, }, @@ -165,6 +179,12 @@ func TestRedisStatefulSetStorageGeneration(t *testing.T) { }, }, }, + { + Name: "instance-manager", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, { Name: "redis-data", VolumeSource: corev1.VolumeSource{ @@ -209,6 +229,10 @@ func TestRedisStatefulSetStorageGeneration(t *testing.T) { Name: "pvc-data", MountPath: "/data", }, + { + Name: "instance-manager", + MountPath: "/controller", + }, }, }, }, @@ -245,6 +269,12 @@ func TestRedisStatefulSetStorageGeneration(t *testing.T) { }, }, }, + { + Name: "instance-manager", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, }, }, }, @@ -319,6 +349,10 @@ func TestRedisStatefulSetStorageGeneration(t *testing.T) { Name: "pvc-data", MountPath: "/data", }, + { + Name: "instance-manager", + MountPath: 
"/controller", + }, }, }, }, @@ -355,6 +389,12 @@ func TestRedisStatefulSetStorageGeneration(t *testing.T) { }, }, }, + { + Name: "instance-manager", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, }, }, }, @@ -434,6 +474,10 @@ func TestRedisStatefulSetStorageGeneration(t *testing.T) { Name: "pvc-data", MountPath: "/data", }, + { + Name: "instance-manager", + MountPath: "/controller", + }, }, }, }, @@ -470,6 +514,12 @@ func TestRedisStatefulSetStorageGeneration(t *testing.T) { }, }, }, + { + Name: "instance-manager", + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + }, }, }, }, @@ -554,7 +604,9 @@ func TestRedisStatefulSetCommands(t *testing.T) { name: "Default values", givenCommands: []string{}, expectedCommands: []string{ - "redis-server", + "/controller/redis-instance", + "run", + "--redis-conf", "/redis/redis.conf", }, }, @@ -2029,8 +2081,20 @@ func TestRedisExtraVolumeMounts(t *testing.T) { ms.On("CreateOrUpdatePodDisruptionBudget", namespace, mock.Anything).Once().Return(nil, nil) ms.On("CreateOrUpdateStatefulSet", namespace, mock.Anything).Once().Run(func(args mock.Arguments) { s := args.Get(1).(*appsv1.StatefulSet) - extraVolume = s.Spec.Template.Spec.Volumes[3] - extraVolumeMount = s.Spec.Template.Spec.Containers[0].VolumeMounts[4] + // Find the extra volume by name (order may vary with instance manager) + for _, v := range s.Spec.Template.Spec.Volumes { + if v.Name == test.expectedVolumes[0].Name { + extraVolume = v + break + } + } + // Find the extra volume mount by name + for _, vm := range s.Spec.Template.Spec.Containers[0].VolumeMounts { + if vm.Name == test.expectedVolumeMounts[0].Name { + extraVolumeMount = vm + break + } + } }).Return(nil) client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) @@ -2408,12 +2472,9 @@ func TestRedisCustomLivenessProbe(t *testing.T) { FailureThreshold: 6, PeriodSeconds: 15, ProbeHandler: corev1.ProbeHandler{ - Exec: 
&corev1.ExecAction{ - Command: []string{ - "sh", - "-c", - "redis-cli -h $(hostname) -p 6379 --user pinger --pass pingpass --no-auth-warning ping | grep PONG", - }, + HTTPGet: &corev1.HTTPGetAction{ + Path: "/healthz", + Port: intstr.FromInt32(8080), }, }, }, @@ -2442,6 +2503,82 @@ func TestRedisCustomLivenessProbe(t *testing.T) { } } +func TestRedisHTTPLivenessProbeWithInstanceManager(t *testing.T) { + assert := assert.New(t) + + var livenessProbe *corev1.Probe + var containerPorts []corev1.ContainerPort + + rf := generateRF() + rf.Spec.Redis.InstanceManagerImage = "ghcr.io/buildio/redis-operator:1.7.0" + rf.Spec.Redis.Port = 6379 + + ms := &mK8SService.Services{} + ms.On("CreateOrUpdatePodDisruptionBudget", namespace, mock.Anything).Once().Return(nil, nil) + ms.On("CreateOrUpdateStatefulSet", namespace, mock.Anything).Once().Run(func(args mock.Arguments) { + s := args.Get(1).(*appsv1.StatefulSet) + livenessProbe = s.Spec.Template.Spec.Containers[0].LivenessProbe + containerPorts = s.Spec.Template.Spec.Containers[0].Ports + }).Return(nil) + + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) + err := client.EnsureRedisStatefulset(rf, nil, []metav1.OwnerReference{}) + assert.NoError(err) + + // Verify HTTP liveness probe is used + assert.NotNil(livenessProbe) + assert.NotNil(livenessProbe.HTTPGet, "Expected HTTPGet probe when instance manager is enabled") + assert.Nil(livenessProbe.Exec, "Expected no Exec probe when instance manager is enabled") + assert.Equal("/healthz", livenessProbe.HTTPGet.Path) + assert.Equal(int32(8080), livenessProbe.HTTPGet.Port.IntVal) + assert.Equal(int32(30), livenessProbe.InitialDelaySeconds) + assert.Equal(int32(5), livenessProbe.TimeoutSeconds) + assert.Equal(int32(6), livenessProbe.FailureThreshold) + assert.Equal(int32(15), livenessProbe.PeriodSeconds) + + // Verify health port is added to container + var healthPortFound bool + for _, port := range containerPorts { + if port.Name == "health" && 
port.ContainerPort == 8080 { + healthPortFound = true + break + } + } + assert.True(healthPortFound, "Expected health port 8080 to be added when instance manager is enabled") +} + +func TestRedisDefaultInstanceManagerImage(t *testing.T) { + assert := assert.New(t) + + var initContainers []corev1.Container + + rf := generateRF() + rf.Spec.Redis.InstanceManagerImage = "" // No explicit image - should use default + rf.Spec.Redis.Port = 6379 + + ms := &mK8SService.Services{} + ms.On("CreateOrUpdatePodDisruptionBudget", namespace, mock.Anything).Once().Return(nil, nil) + ms.On("CreateOrUpdateStatefulSet", namespace, mock.Anything).Once().Run(func(args mock.Arguments) { + s := args.Get(1).(*appsv1.StatefulSet) + initContainers = s.Spec.Template.Spec.InitContainers + }).Return(nil) + + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) + err := client.EnsureRedisStatefulset(rf, nil, []metav1.OwnerReference{}) + assert.NoError(err) + + // Verify instance manager init container uses default image + // Note: Starting with 4.0.0, tags do not have leading 'v' + var instanceManagerFound bool + for _, c := range initContainers { + if c.Name == "instance-manager-init" { + instanceManagerFound = true + assert.Equal("ghcr.io/buildio/redis-operator:4.0.0", c.Image) + } + } + assert.True(instanceManagerFound, "Expected instance-manager-init container with default image") +} + func TestSentinelCustomLivenessProbe(t *testing.T) { tests := []struct { name string @@ -2558,9 +2695,12 @@ func TestRedisCustomReadinessProbe(t *testing.T) { expectedReadinessProbe: &corev1.Probe{ InitialDelaySeconds: 30, TimeoutSeconds: 5, + PeriodSeconds: 10, + FailureThreshold: 3, ProbeHandler: corev1.ProbeHandler{ - Exec: &corev1.ExecAction{ - Command: []string{"/bin/sh", "/redis-readiness/ready.sh"}, + HTTPGet: &corev1.HTTPGetAction{ + Path: "/readyz", + Port: intstr.FromInt32(8080), }, }, }, @@ -2792,3 +2932,62 @@ func TestSentinelCustomStartupProbe(t *testing.T) { 
assert.Equal(test.expectedStartupProbe, startupProbe) } } + +func TestRedisHTTPReadinessProbeWithInstanceManager(t *testing.T) { + assert := assert.New(t) + + var readinessProbe *corev1.Probe + + rf := generateRF() + rf.Spec.Redis.InstanceManagerImage = "ghcr.io/buildio/redis-operator:1.7.0" + rf.Spec.Redis.Port = 6379 + + ms := &mK8SService.Services{} + ms.On("CreateOrUpdatePodDisruptionBudget", namespace, mock.Anything).Once().Return(nil, nil) + ms.On("CreateOrUpdateStatefulSet", namespace, mock.Anything).Once().Run(func(args mock.Arguments) { + s := args.Get(1).(*appsv1.StatefulSet) + readinessProbe = s.Spec.Template.Spec.Containers[0].ReadinessProbe + }).Return(nil) + + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) + err := client.EnsureRedisStatefulset(rf, nil, []metav1.OwnerReference{}) + assert.NoError(err) + + // Verify HTTP readiness probe is used + assert.NotNil(readinessProbe) + assert.NotNil(readinessProbe.HTTPGet, "Expected HTTPGet probe when instance manager is enabled") + assert.Nil(readinessProbe.Exec, "Expected no Exec probe when instance manager is enabled") + assert.Equal("/readyz", readinessProbe.HTTPGet.Path) + assert.Equal(int32(8080), readinessProbe.HTTPGet.Port.IntVal) + assert.Equal(int32(30), readinessProbe.InitialDelaySeconds) + assert.Equal(int32(5), readinessProbe.TimeoutSeconds) + assert.Equal(int32(3), readinessProbe.FailureThreshold) + assert.Equal(int32(10), readinessProbe.PeriodSeconds) +} + +func TestRedisHTTPReadinessProbeAlwaysUsed(t *testing.T) { + assert := assert.New(t) + + var readinessProbe *corev1.Probe + + rf := generateRF() + rf.Spec.Redis.InstanceManagerImage = "" // No explicit image - uses default, HTTP probes required + rf.Spec.Redis.Port = 6379 + + ms := &mK8SService.Services{} + ms.On("CreateOrUpdatePodDisruptionBudget", namespace, mock.Anything).Once().Return(nil, nil) + ms.On("CreateOrUpdateStatefulSet", namespace, mock.Anything).Once().Run(func(args mock.Arguments) { + s := 
args.Get(1).(*appsv1.StatefulSet) + readinessProbe = s.Spec.Template.Spec.Containers[0].ReadinessProbe + }).Return(nil) + + client := rfservice.NewRedisFailoverKubeClient(ms, log.Dummy, metrics.Dummy) + err := client.EnsureRedisStatefulset(rf, nil, []metav1.OwnerReference{}) + assert.NoError(err) + + // Verify HTTP readiness probe is always used (v4.0.0+) + assert.NotNil(readinessProbe) + assert.NotNil(readinessProbe.HTTPGet, "Expected HTTPGet probe in v4.0.0+") + assert.Equal("/readyz", readinessProbe.HTTPGet.Path) + assert.Equal(int32(8080), readinessProbe.HTTPGet.Port.IntVal) +} diff --git a/operator/redisfailover/service/heal.go b/operator/redisfailover/service/heal.go index 9f9f72d66..f78f5e243 100644 --- a/operator/redisfailover/service/heal.go +++ b/operator/redisfailover/service/heal.go @@ -24,6 +24,7 @@ type RedisFailoverHeal interface { SetSentinelCustomConfig(ip string, rFailover *redisfailoverv1.RedisFailover) error SetRedisCustomConfig(ip string, rFailover *redisfailoverv1.RedisFailover) error DeletePod(podName string, rFailover *redisfailoverv1.RedisFailover) error + // Operator-managed failover methods PromoteBestReplica(newMasterIP string, rFailover *redisfailoverv1.RedisFailover) error } diff --git a/scripts/build.sh b/scripts/build.sh index db18a250a..3658443cc 100755 --- a/scripts/build.sh +++ b/scripts/build.sh @@ -18,4 +18,8 @@ f_ver="-X main.Version=${VERSION:-dev}" # Build the operator binary echo "Building redis-operator binary at ./bin/redis-operator" -CGO_ENABLED=0 go build -o ./bin/redis-operator --ldflags "${ldf_cmp} ${f_ver}" ./cmd/redisoperator \ No newline at end of file +CGO_ENABLED=0 go build -o ./bin/redis-operator --ldflags "${ldf_cmp} ${f_ver}" ./cmd/redisoperator + +# Build the instance manager binary +echo "Building redis-instance binary at ./bin/redis-instance" +CGO_ENABLED=0 go build -o ./bin/redis-instance --ldflags "${ldf_cmp} ${f_ver}" ./cmd/instance diff --git a/test/integration/redisfailover/creation_test.go 
b/test/integration/redisfailover/creation_test.go index 6b7e6dfad..91416881f 100644 --- a/test/integration/redisfailover/creation_test.go +++ b/test/integration/redisfailover/creation_test.go @@ -247,6 +247,8 @@ func TestRedisFailover(t *testing.T) { func (c *clients) testCRCreation(t *testing.T) { assert := assert.New(t) + // Get instance manager image from environment, fallback to test tag + instanceManagerImage := "ghcr.io/buildio/redis-operator:test" toCreate := &redisfailoverv1.RedisFailover{ ObjectMeta: metav1.ObjectMeta{ @@ -255,8 +257,9 @@ func (c *clients) testCRCreation(t *testing.T) { }, Spec: redisfailoverv1.RedisFailoverSpec{ Redis: redisfailoverv1.RedisSettings{ - Replicas: redisSize, - ImagePullPolicy: corev1.PullIfNotPresent, // Use locally built image + Replicas: redisSize, + InstanceManagerImage: instanceManagerImage, + ImagePullPolicy: corev1.PullIfNotPresent, // Allow pulling redis image from registry Exporter: redisfailoverv1.Exporter{ Enabled: true, }, @@ -264,7 +267,7 @@ func (c *clients) testCRCreation(t *testing.T) { }, Sentinel: redisfailoverv1.SentinelSettings{ Replicas: sentinelSize, - ImagePullPolicy: corev1.PullIfNotPresent, // Use locally built image + ImagePullPolicy: corev1.PullIfNotPresent, // Allow pulling redis image from registry // Sentinel must be explicitly enabled in v4.0.0+ (default is false) Enabled: boolPtr(true), },