From f64c791e5b89b6a2ee012b6388d8a1d7edbc4412 Mon Sep 17 00:00:00 2001 From: Ankit Pati Date: Mon, 5 Jan 2026 22:04:44 +0530 Subject: [PATCH 1/4] feat: support Valkey Cluster with sharding Signed-off-by: Ankit Pati --- .gitignore | 4 +- valkey/scripts/cluster-init-script.sh | 157 ++++++++++ valkey/templates/_helpers.tpl | 53 ++++ valkey/templates/cluster-script.yaml | 11 + valkey/templates/cluster-statefulset.yaml | 309 +++++++++++++++++++ valkey/templates/deploy_valkey.yaml | 2 +- valkey/templates/init_config.yaml | 56 ++++ valkey/templates/pvc.yaml | 2 +- valkey/templates/service-headless.yaml | 8 +- valkey/templates/service.yaml | 7 + valkey/tests/cluster_test.yaml | 347 ++++++++++++++++++++++ valkey/tests/deployment_test.yaml | 36 +++ valkey/tests/pvc_test.yaml | 165 ++++++++++ valkey/tests/service_test.yaml | 55 ++++ valkey/values.yaml | 44 +++ 15 files changed, 1251 insertions(+), 5 deletions(-) create mode 100644 valkey/scripts/cluster-init-script.sh create mode 100644 valkey/templates/cluster-script.yaml create mode 100644 valkey/templates/cluster-statefulset.yaml create mode 100644 valkey/tests/cluster_test.yaml create mode 100644 valkey/tests/pvc_test.yaml diff --git a/.gitignore b/.gitignore index 92b4047..8faae3a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ -*.sh *.lock dist/ .vscode -temp/ \ No newline at end of file +temp/ +*.tgz diff --git a/valkey/scripts/cluster-init-script.sh b/valkey/scripts/cluster-init-script.sh new file mode 100644 index 0000000..79e3514 --- /dev/null +++ b/valkey/scripts/cluster-init-script.sh @@ -0,0 +1,157 @@ +#!/bin/sh +set -e + +# --- Configuration & Initial Checks --- +if [ "${CLUSTER_NODE_COUNT}" -eq "1" ]; then + echo "Single node deployment. 
Skipping cluster initialization" + exit 0 +fi + +ORDINAL=$(echo "${POD_NAME}" | rev | cut -d'-' -f1 | rev) +REPLICAS_PER_SHARD=${CLUSTER_REPLICAS_PER_SHARD:-1} +PRIMARIES=$(( CLUSTER_NODE_COUNT / (1 + REPLICAS_PER_SHARD) )) + +{{- if and .Values.auth.enabled .Values.auth.aclUsers }} +AUTH_OPTION="-a $(cat /etc/valkey/users.acl | grep '^user {{ .Values.cluster.replicationUser }} ' | sed 's/.*#\([a-f0-9]*\).*/\1/' | head -1)" +# If we have the password from environment, use that instead +if [ -n "${VALKEY_AUTH_PASSWORD}" ]; then + AUTH_OPTION="-a ${VALKEY_AUTH_PASSWORD}" +fi +{{- else }} +AUTH_OPTION="" +{{- end }} + +{{- if .Values.tls.enabled }} +TLS_OPTION="--tls --cacert /tls/{{ .Values.tls.caPublicKey }}" +{{- else }} +TLS_OPTION="" +{{- end }} + +echo "Initializing as ordinal ${ORDINAL}. Total nodes: ${CLUSTER_NODE_COUNT}, Primaries: ${PRIMARIES}, Replicas per shard: ${REPLICAS_PER_SHARD}" + +HEADLESS_SVC="{{ include "valkey.headlessServiceName" . }}" +NAMESPACE="{{ .Release.Namespace }}" +CLUSTER_DOMAIN="{{ .Values.clusterDomain }}" +MY_IP=$(hostname -i) + +# Wait for the local Valkey server process to start +until valkey-cli ${AUTH_OPTION} ${TLS_OPTION} -h localhost -p {{ .Values.service.port }} ping 2>/dev/null | grep -q "PONG"; do + echo "Waiting for local Valkey to start..." + sleep 2 +done +echo "Local Valkey is ready at ${MY_IP}" + +# --- Discover Existing Cluster --- +HEALTHY_NODE="" +for i in $(seq 0 $((CLUSTER_NODE_COUNT - 1))); do + if [ "${i}" != "${ORDINAL}" ]; then + NODE_HOST="{{ include "valkey.fullname" . }}-${i}.${HEADLESS_SVC}.${NAMESPACE}.svc.${CLUSTER_DOMAIN}" + if valkey-cli ${AUTH_OPTION} ${TLS_OPTION} -h "${NODE_HOST}" -p {{ .Values.service.port }} cluster info 2>/dev/null | grep -q "cluster_state:ok"; then + HEALTHY_NODE="${NODE_HOST}" + echo "Found healthy cluster node: ${HEALTHY_NODE}" + break + fi + fi +done + +# --- Logic for Joining an Existing Cluster --- +if [ -n "${HEALTHY_NODE}" ]; then + echo "Healthy cluster found. 
Attempting to join..." + + # 1. Forget any old, failed instance of ourselves + FAILED_NODE_ID=$(valkey-cli ${AUTH_OPTION} ${TLS_OPTION} -h "${HEALTHY_NODE}" -p {{ .Values.service.port }} cluster nodes 2>/dev/null | grep "${MY_IP}:{{ .Values.service.port }}" | grep "fail" | awk '{print $1}' || echo "") + if [ -n "${FAILED_NODE_ID}" ]; then + echo "Found my IP (${MY_IP}) marked as failed with ID ${FAILED_NODE_ID}. Forgetting it..." + valkey-cli ${AUTH_OPTION} ${TLS_OPTION} --cluster call "${HEALTHY_NODE}:{{ .Values.service.port }}" cluster forget "${FAILED_NODE_ID}" > /dev/null 2>&1 || true + sleep 3 + fi + + # 2. Meet the cluster + HEALTHY_NODE_IP=$(getent hosts "${HEALTHY_NODE}" | awk '{print $1}') + echo "Sending CLUSTER MEET to ${HEALTHY_NODE} (${HEALTHY_NODE_IP})" + valkey-cli ${AUTH_OPTION} ${TLS_OPTION} -h localhost -p {{ .Values.service.port }} cluster meet "${HEALTHY_NODE_IP}" {{ .Values.service.port }} + sleep 5 + + # 3. Find an orphaned master and become its replica + echo "Searching for a master to replicate..." 
+ + MY_NODE_ID=$(valkey-cli ${AUTH_OPTION} ${TLS_OPTION} -h localhost -p {{ .Values.service.port }} cluster myid) + echo "My Node ID is ${MY_NODE_ID}" + + # This prevents race conditions from the order of 'cluster nodes' output + TARGET_MASTER_ID=$(valkey-cli ${AUTH_OPTION} ${TLS_OPTION} -h "${HEALTHY_NODE}" -p {{ .Values.service.port }} cluster nodes | awk -v replicas_needed="${REPLICAS_PER_SHARD}" -v my_id="${MY_NODE_ID}" ' + # Pass 1: Build maps of masters and replica counts + /master/ && !/fail/ { masters[$1] = 1 } + /slave/ && !/fail/ { master_replicas[$4]++ } + END { + # Pass 2: Iterate over the masters we found + for (master_id in masters) { + # Check if it needs a replica AND it is not ourself + if ( master_id != my_id && (master_replicas[master_id] < replicas_needed || master_replicas[master_id] == "") ) { + print master_id + exit # Found a suitable master + } + } + } + ') + + if [ -n "${TARGET_MASTER_ID}" ]; then + echo "Found target master ${TARGET_MASTER_ID} that needs a replica." + echo "Sending CLUSTER REPLICATE command..." + + if valkey-cli ${AUTH_OPTION} ${TLS_OPTION} -h localhost -p {{ .Values.service.port }} cluster replicate "${TARGET_MASTER_ID}"; then + echo "Successfully configured as a replica for ${TARGET_MASTER_ID}." + else + echo "ERROR: Failed to replicate master ${TARGET_MASTER_ID}. Manual intervention required." + exit 1 + fi + else + echo "WARNING: Could not find a master that needs a replica. Staying as a master with no slots. Attempting rebalance..." + + # Wait for cluster propagation before rebalancing + PROPAGATION_ATTEMPTS=0 + MAX_PROPAGATION_ATTEMPTS=60 + while [ ${PROPAGATION_ATTEMPTS} -lt ${MAX_PROPAGATION_ATTEMPTS} ]; do + CLUSTER_STATE=$(valkey-cli ${AUTH_OPTION} ${TLS_OPTION} -h localhost -p {{ .Values.service.port }} cluster info 2>/dev/null | grep "cluster_state:" | cut -d: -f2 | tr -d '\r\n') + if [ "${CLUSTER_STATE}" = "ok" ]; then + echo "Cluster state is OK. Proceeding with rebalance." 
+ break + fi + echo "Cluster state is ${CLUSTER_STATE}. Waiting for propagation... (${PROPAGATION_ATTEMPTS}/${MAX_PROPAGATION_ATTEMPTS})" + PROPAGATION_ATTEMPTS=$((PROPAGATION_ATTEMPTS + 1)) + sleep 5 + done + + valkey-cli ${AUTH_OPTION} ${TLS_OPTION} --cluster rebalance "${HEALTHY_NODE}:{{ .Values.service.port }}" --cluster-use-empty-masters --cluster-yes || true + fi + exit 0 +fi + +echo "No healthy cluster found. Proceeding with initial creation logic." +if [ "${ORDINAL}" = "0" ]; then + echo "This is the primary-0 node, creating a new cluster..." + NODES="" + for i in $(seq 0 $((CLUSTER_NODE_COUNT - 1))); do + NODE_HOST="{{ include "valkey.fullname" . }}-${i}.${HEADLESS_SVC}.${NAMESPACE}.svc.${CLUSTER_DOMAIN}" + until valkey-cli ${AUTH_OPTION} ${TLS_OPTION} -h "${NODE_HOST}" -p {{ .Values.service.port }} ping 2>/dev/null | grep -q "PONG"; do + echo "Waiting for ${NODE_HOST} to be ready..." + sleep 2 + done + NODES="${NODES} ${NODE_HOST}:{{ .Values.service.port }}" + done + sleep 10 + + echo "Creating cluster with nodes: ${NODES}" + echo "yes" | valkey-cli ${AUTH_OPTION} ${TLS_OPTION} --cluster create ${NODES} --cluster-replicas "${REPLICAS_PER_SHARD}" + echo "Cluster created successfully." +else + echo "Waiting for pod-0 to initialize the cluster..." + PRIMARY_HOST="{{ include "valkey.fullname" . }}-0.${HEADLESS_SVC}.${NAMESPACE}.svc.${CLUSTER_DOMAIN}" + until valkey-cli ${AUTH_OPTION} ${TLS_OPTION} -h "${PRIMARY_HOST}" -p {{ .Values.service.port }} cluster info 2>/dev/null | grep -q "cluster_state:ok"; do + echo "Waiting for cluster to be initialized by pod-0..." + sleep 5 + done + echo "Cluster is initialized. My role has been assigned by the creator." 
+fi + +exit 0 diff --git a/valkey/templates/_helpers.tpl b/valkey/templates/_helpers.tpl index 593cf77..abbfd4d 100644 --- a/valkey/templates/_helpers.tpl +++ b/valkey/templates/_helpers.tpl @@ -188,3 +188,56 @@ Validate replica authentication configuration {{- end }} {{- end -}} +{{/* +Validate cluster configuration +*/}} +{{- define "valkey.validateClusterConfig" -}} +{{- if .Values.cluster.enabled }} + {{- if .Values.replica.enabled }} + {{- fail "cluster.enabled and replica.enabled are mutually exclusive. Please enable only one mode." }} + {{- end }} + {{- if lt (int .Values.cluster.shards) 3 }} + {{- fail "Cluster mode requires at least 3 shards (cluster.shards >= 3) for proper cluster operation." }} + {{- end }} + {{- if not .Values.cluster.persistence.size }} + {{- fail "Cluster mode requires persistent storage. Please set cluster.persistence.size (e.g., '5Gi')" }} + {{- end }} +{{- end }} +{{- end -}} + +{{/* +Validate cluster authentication configuration +*/}} +{{- define "valkey.validateClusterAuth" -}} +{{- if and .Values.cluster.enabled .Values.auth.enabled }} + {{- if not (hasKey .Values.auth.aclUsers .Values.cluster.replicationUser) }} + {{- fail (printf "Cluster replication user '%s' (cluster.replicationUser) must be defined in auth.aclUsers. The chart requires this to retrieve the password for cluster authentication." .Values.cluster.replicationUser) }} + {{- end }} +{{- end }} +{{- end -}} + +{{/* +Calculate total number of nodes in the cluster +*/}} +{{- define "valkey.clusterNodeCount" -}} +{{- $shards := int .Values.cluster.shards -}} +{{- $replicasPerShard := int .Values.cluster.replicasPerShard -}} +{{- mul $shards (add 1 $replicasPerShard) -}} +{{- end -}} + +{{/* +Generate list of cluster nodes for VALKEY_NODES environment variable +*/}} +{{- define "valkey.clusterNodes" -}} +{{- $fullname := include "valkey.fullname" . -}} +{{- $headlessSvc := include "valkey.headlessServiceName" . 
-}} +{{- $namespace := .Release.Namespace -}} +{{- $clusterDomain := .Values.clusterDomain -}} +{{- $nodeCount := include "valkey.clusterNodeCount" . | int -}} +{{- $nodes := list -}} +{{- range $i := until $nodeCount -}} +{{- $nodes = append $nodes (printf "%s-%d.%s.%s.svc.%s" $fullname $i $headlessSvc $namespace $clusterDomain) -}} +{{- end -}} +{{- join " " $nodes -}} +{{- end -}} + diff --git a/valkey/templates/cluster-script.yaml b/valkey/templates/cluster-script.yaml new file mode 100644 index 0000000..6023fe7 --- /dev/null +++ b/valkey/templates/cluster-script.yaml @@ -0,0 +1,11 @@ +{{- if .Values.cluster.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "valkey.fullname" . }}-cluster-script + labels: + {{- include "valkey.labels" . | nindent 4 }} +data: + init-cluster.sh: |- +{{ tpl (.Files.Get "scripts/cluster-init-script.sh") . | indent 4 }} +{{- end }} diff --git a/valkey/templates/cluster-statefulset.yaml b/valkey/templates/cluster-statefulset.yaml new file mode 100644 index 0000000..30a9489 --- /dev/null +++ b/valkey/templates/cluster-statefulset.yaml @@ -0,0 +1,309 @@ +{{- if .Values.cluster.enabled }} +{{- include "valkey.validateAuthConfig" . }} +{{- include "valkey.validateClusterConfig" . }} +{{- include "valkey.validateClusterAuth" . }} +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: {{ include "valkey.fullname" . }} + labels: + {{- include "valkey.labels" . | nindent 4 }} +spec: + serviceName: {{ include "valkey.fullname" . }}-headless + replicas: {{ include "valkey.clusterNodeCount" . }} + podManagementPolicy: Parallel + selector: + matchLabels: + {{- include "valkey.selectorLabels" . 
| nindent 6 }} + volumeClaimTemplates: + - metadata: + name: valkey-data + spec: + accessModes: {{ toYaml .Values.cluster.persistence.accessModes | nindent 8 }} + {{- if .Values.cluster.persistence.storageClass }} + storageClassName: {{ .Values.cluster.persistence.storageClass | quote }} + {{- end }} + resources: + requests: + storage: {{ .Values.cluster.persistence.size | quote }} + template: + metadata: + labels: + {{- include "valkey.selectorLabels" . | nindent 8 }} + {{- with .Values.commonLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + annotations: + {{- with .Values.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + checksum/initconfig: {{ include (print $.Template.BasePath "/init_config.yaml") . | sha256sum | trunc 32 | quote }} + {{- if .Values.valkeyConfig }} + checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum | trunc 32 | quote }} + {{- end }} + spec: + {{- (include "valkey.imagePullSecrets" .) | nindent 6 }} + automountServiceAccountToken: {{ .Values.serviceAccount.automount }} + serviceAccountName: {{ include "valkey.serviceAccountName" . }} + {{- if .Values.priorityClassName }} + priorityClassName: {{ .Values.priorityClassName | quote }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + initContainers: + - name: {{ include "valkey.fullname" . }}-init + image: {{ include "valkey.image" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + {{- with .Values.securityContext }} + securityContext: + {{- toYaml . 
| nindent 12 }} + {{- end }} + command: [ "/scripts/init.sh" ] + env: + - name: POD_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['apps.kubernetes.io/pod-index'] + - name: CLUSTER_SHARDS + value: {{ .Values.cluster.shards | quote }} + - name: CLUSTER_REPLICAS_PER_SHARD + value: {{ .Values.cluster.replicasPerShard | quote }} + volumeMounts: + - name: valkey-data + mountPath: /data + - name: scripts + mountPath: /scripts + {{- if .Values.valkeyConfig }} + - name: valkey-config + mountPath: /usr/local/etc/valkey/valkey.conf + subPath: valkey.conf + {{- end }} + {{- if .Values.extraSecretValkeyConfigs }} + - name: extravalkeyconfigs-volume + mountPath: /extravalkeyconfigs + {{- end }} + {{- if .Values.auth.enabled }} + - name: valkey-acl + mountPath: /etc/valkey + {{- if .Values.auth.usersExistingSecret }} + - name: valkey-users-secret + mountPath: /valkey-users-secret + readOnly: true + {{- end }} + {{- if or (include "valkey.hasInlinePasswords" . | eq "true") .Values.auth.aclConfig }} + - name: valkey-auth-secret + mountPath: /valkey-auth-secret + readOnly: true + {{- end }} + {{- end }} + {{- with .Values.initResources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.extraInitContainers }} + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: {{ include "valkey.fullname" . }} + image: {{ include "valkey.image" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: [ "/bin/sh", "-c" ] + args: + - | + /cluster-script/init-cluster.sh & + valkey-server /data/conf/valkey.conf + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + env: + - name: POD_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['apps.kubernetes.io/pod-index'] + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: VALKEY_NODES + value: {{ include "valkey.clusterNodes" . | quote }} + - name: CLUSTER_NODE_COUNT + value: {{ include "valkey.clusterNodeCount" . 
| quote }} + - name: CLUSTER_REPLICAS_PER_SHARD + value: {{ .Values.cluster.replicasPerShard | quote }} + {{- range $key, $val := .Values.env }} + - name: {{ $key }} + value: "{{ $val }}" + {{- end }} + - name: VALKEY_LOGLEVEL + value: "{{ .Values.valkeyLogLevel }}" + ports: + - name: tcp + containerPort: {{ .Values.service.port }} + protocol: TCP + - name: tcp-bus + containerPort: {{ .Values.cluster.busPort }} + protocol: TCP + startupProbe: + exec: + {{- if .Values.tls.enabled }} + command: [ "sh", "-c", "valkey-cli --cacert /tls/{{ .Values.tls.caPublicKey }} --tls ping" ] + {{- else }} + command: [ "sh", "-c", "valkey-cli ping" ] + {{- end }} + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 5 + failureThreshold: 30 + livenessProbe: + exec: + {{- if .Values.tls.enabled }} + command: [ "sh", "-c", "valkey-cli --cacert /tls/{{ .Values.tls.caPublicKey }} --tls ping" ] + {{- else }} + command: [ "sh", "-c", "valkey-cli ping" ] + {{- end }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumeMounts: + - name: valkey-data + mountPath: /data + - name: cluster-script + mountPath: /cluster-script + {{- if .Values.tls.enabled }} + - name: {{ include "valkey.fullname" . }}-tls + mountPath: /tls + {{- end }} + {{- if .Values.auth.enabled }} + - name: valkey-acl + mountPath: /etc/valkey + {{- end }} + {{- range $secret := .Values.extraValkeySecrets }} + - name: {{ $secret.name }}-valkey + mountPath: {{ $secret.mountPath }} + {{- end }} + {{- range $config := .Values.extraValkeyConfigs }} + - name: {{ $config.name }}-valkey + mountPath: {{ $config.mountPath }} + {{- end }} + {{- if .Values.metrics.enabled }} + - name: metrics + image: {{ include "valkey.metrics.exporter.image" . }} + imagePullPolicy: {{ .Values.metrics.exporter.image.pullPolicy | quote }} + {{- with .Values.metrics.exporter.securityContext }} + securityContext: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.metrics.exporter.command }} + command: + {{- toYaml . 
| nindent 12 }} + {{- end }} + {{- with .Values.metrics.exporter.args }} + args: + {{- toYaml . | nindent 12 }} + {{- end }} + ports: + - name: metrics + containerPort: {{ .Values.metrics.exporter.port }} + startupProbe: + tcpSocket: + port: metrics + livenessProbe: + tcpSocket: + port: metrics + readinessProbe: + httpGet: + path: / + port: metrics + {{- with .Values.metrics.exporter.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.metrics.exporter.extraVolumeMounts }} + volumeMounts: + {{- toYaml . | nindent 12 }} + {{- end }} + env: + - name: REDIS_ALIAS + value: {{ include "valkey.fullname" . }} + {{- range $key, $val := .Values.metrics.exporter.extraEnvs }} + - name: {{ $key }} + value: "{{ $val }}" + {{- end }} + {{- end }} + volumes: + - name: scripts + configMap: + name: {{ include "valkey.fullname" . }}-init-scripts + defaultMode: 0555 + - name: cluster-script + configMap: + name: {{ include "valkey.fullname" . }}-cluster-script + defaultMode: 0555 + {{- if .Values.auth.enabled }} + - name: valkey-acl + emptyDir: + medium: Memory + {{- end }} + {{- if .Values.valkeyConfig }} + - name: valkey-config + configMap: + name: {{ include "valkey.fullname" . }}-config + {{- end }} + {{- range .Values.extraValkeySecrets }} + - name: {{ .name }}-valkey + secret: + secretName: {{ .name }} + defaultMode: {{ .defaultMode | default 0440 }} + {{- end }} + {{- if .Values.tls.enabled }} + - name: {{ include "valkey.fullname" . 
}}-tls + secret: + secretName: {{ required "An existing secret is required to enable TLS" .Values.tls.existingSecret }} + defaultMode: 0400 + {{- end }} + {{- range .Values.extraValkeyConfigs }} + - name: {{ .name }}-valkey + configMap: + name: {{ .name }} + defaultMode: {{ .defaultMode | default 0440 }} + {{- end }} + {{- if .Values.metrics.enabled }} + {{- range .Values.metrics.exporter.extraExporterSecrets }} + - name: {{ .name }}-exporter + secret: + secretName: {{ .name }} + defaultMode: {{ .defaultMode | default 0440 }} + {{- end }} + {{- end }} + {{- if .Values.auth.enabled }} + {{- if .Values.auth.usersExistingSecret }} + - name: valkey-users-secret + secret: + secretName: {{ .Values.auth.usersExistingSecret }} + defaultMode: 0400 + {{- end }} + {{- if or (include "valkey.hasInlinePasswords" . | eq "true") .Values.auth.aclConfig }} + - name: valkey-auth-secret + secret: + secretName: {{ include "valkey.fullname" . }}-auth + defaultMode: 0400 + {{- end }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.topologySpreadConstraints }} + topologySpreadConstraints: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/valkey/templates/deploy_valkey.yaml b/valkey/templates/deploy_valkey.yaml index da7cd71..ee5dbb7 100644 --- a/valkey/templates/deploy_valkey.yaml +++ b/valkey/templates/deploy_valkey.yaml @@ -1,4 +1,4 @@ -{{- if not .Values.replica.enabled }} +{{- if not (or .Values.replica.enabled .Values.cluster.enabled) }} {{- $fullname := include "valkey.fullname" . 
}} {{- $storage := .Values.dataStorage }} {{- $createPVC := and $storage.enabled (not (empty $storage.requestedSize)) (empty $storage.persistentVolumeClaimName) }} diff --git a/valkey/templates/init_config.yaml b/valkey/templates/init_config.yaml index 9b0337e..654e156 100644 --- a/valkey/templates/init_config.yaml +++ b/valkey/templates/init_config.yaml @@ -219,6 +219,62 @@ data: {{- end }} {{- end }} + {{- if .Values.cluster.enabled }} + # Cluster mode configuration + log "Configuring cluster mode" + + # Use POD_INDEX from Kubernetes metadata + POD_INDEX=${POD_INDEX:-0} + + # Configure cluster-enabled settings + { + echo "" + echo "# Cluster Configuration" + echo "cluster-enabled yes" + echo "cluster-config-file /data/nodes.conf" + echo "cluster-node-timeout {{ .Values.cluster.nodeTimeout }}" + {{- if not .Values.cluster.requireFullCoverage }} + echo "cluster-require-full-coverage no" + {{- end }} + {{- if .Values.cluster.allowReadsWhenDown }} + echo "cluster-allow-reads-when-down yes" + {{- end }} + echo "" + echo "# Cluster node announcement" + echo "cluster-announce-hostname {{ include "valkey.fullname" . }}-$POD_INDEX.{{ include "valkey.headlessServiceName" . 
}}.{{ .Release.Namespace }}.svc.{{ .Values.clusterDomain }}" + echo "cluster-announce-port {{ .Values.service.port }}" + echo "cluster-announce-bus-port {{ .Values.cluster.busPort }}" + echo "cluster-preferred-endpoint-type hostname" + } >>"$VALKEY_CONFIG" + + log "Cluster node $POD_INDEX configured with announce IP" + + {{- if .Values.auth.enabled }} + # Configure cluster authentication + {{- $replUsername := .Values.cluster.replicationUser }} + REPL_PASSWORD=$(get_user_password "{{ $replUsername }}") || exit 1 + + { + echo "" + echo "# Cluster authentication" + echo "masterauth $REPL_PASSWORD" + echo "masteruser {{ $replUsername }}" + } >>"$VALKEY_CONFIG" + log "Configured cluster authentication with user {{ $replUsername }}" + {{- end }} + + {{- if .Values.tls.enabled }} + # TLS for cluster + { + echo "" + echo "# TLS for cluster" + echo "tls-replication yes" + echo "tls-cluster yes" + } >>"$VALKEY_CONFIG" + log "Enabled TLS for cluster communication" + {{- end }} + {{- end }} + # Append extra configs if present if [ -f /usr/local/etc/valkey/valkey.conf ]; then log "Appending /usr/local/etc/valkey/valkey.conf" diff --git a/valkey/templates/pvc.yaml b/valkey/templates/pvc.yaml index aa20859..9f25edf 100644 --- a/valkey/templates/pvc.yaml +++ b/valkey/templates/pvc.yaml @@ -1,4 +1,4 @@ -{{- if and .Values.dataStorage.enabled (not .Values.replica.enabled) (not (empty .Values.dataStorage.requestedSize)) (empty .Values.dataStorage.persistentVolumeClaimName) }} +{{- if and .Values.dataStorage.enabled (not .Values.replica.enabled) (not .Values.cluster.enabled) (not (empty .Values.dataStorage.requestedSize)) (empty .Values.dataStorage.persistentVolumeClaimName) }} apiVersion: v1 kind: PersistentVolumeClaim metadata: diff --git a/valkey/templates/service-headless.yaml b/valkey/templates/service-headless.yaml index 733ca68..796ccd9 100644 --- a/valkey/templates/service-headless.yaml +++ b/valkey/templates/service-headless.yaml @@ -1,4 +1,4 @@ -{{- if 
.Values.replica.enabled }} +{{- if or .Values.replica.enabled .Values.cluster.enabled }} apiVersion: v1 kind: Service metadata: @@ -15,6 +15,12 @@ spec: port: {{ .Values.service.port }} targetPort: tcp protocol: TCP + {{- if .Values.cluster.enabled }} + - name: tcp-bus + port: {{ .Values.cluster.busPort }} + targetPort: tcp-bus + protocol: TCP + {{- end }} selector: {{- include "valkey.selectorLabels" . | nindent 4 }} {{- end }} diff --git a/valkey/templates/service.yaml b/valkey/templates/service.yaml index 4ffb302..6512a5f 100644 --- a/valkey/templates/service.yaml +++ b/valkey/templates/service.yaml @@ -28,8 +28,15 @@ spec: {{- if .Values.service.appProtocol }} appProtocol: {{ .Values.service.appProtocol }} {{- end }} + {{- if .Values.cluster.enabled }} + - port: {{ .Values.cluster.busPort }} + targetPort: tcp-bus + protocol: TCP + name: tcp-bus + {{- end }} selector: {{- include "valkey.selectorLabels" . | nindent 4 }} {{- if .Values.replica.enabled }} statefulset.kubernetes.io/pod-name: {{ include "valkey.fullname" . 
}}-0 {{- end }} + {{- /* In cluster mode, the service routes to all nodes; clients handle redirections */}} diff --git a/valkey/tests/cluster_test.yaml b/valkey/tests/cluster_test.yaml new file mode 100644 index 0000000..72e7bfd --- /dev/null +++ b/valkey/tests/cluster_test.yaml @@ -0,0 +1,347 @@ +suite: cluster configuration +templates: + - templates/cluster-statefulset.yaml + - templates/cluster-script.yaml + - templates/service-headless.yaml + - templates/service.yaml + - templates/init_config.yaml +tests: + # Validation tests + - it: should fail when cluster enabled but no persistence size provided + set: + cluster.enabled: true + cluster.persistence.size: "" + template: templates/cluster-statefulset.yaml + asserts: + - failedTemplate: + errorPattern: "Cluster mode requires persistent storage.*" + + - it: should fail when cluster enabled with less than 3 shards + set: + cluster.enabled: true + cluster.shards: 2 + cluster.persistence.size: "5Gi" + template: templates/cluster-statefulset.yaml + asserts: + - failedTemplate: + errorPattern: "Cluster mode requires at least 3 shards.*" + + - it: should fail when both cluster and replica are enabled + set: + cluster.enabled: true + replica.enabled: true + cluster.persistence.size: "5Gi" + template: templates/cluster-statefulset.yaml + asserts: + - failedTemplate: + errorPattern: "cluster.enabled and replica.enabled are mutually exclusive.*" + + # StatefulSet tests + - it: should create StatefulSet when cluster is enabled + set: + cluster.enabled: true + cluster.shards: 3 + cluster.replicasPerShard: 1 + cluster.persistence.size: "5Gi" + template: templates/cluster-statefulset.yaml + asserts: + - isKind: + of: StatefulSet + - equal: + path: spec.replicas + value: 6 # 3 shards * (1 + 1 replica) = 6 nodes + + - it: should create StatefulSet with 3 shards and 0 replicas (3 nodes total) + set: + cluster.enabled: true + cluster.shards: 3 + cluster.replicasPerShard: 0 + cluster.persistence.size: "5Gi" + template: 
templates/cluster-statefulset.yaml + asserts: + - isKind: + of: StatefulSet + - equal: + path: spec.replicas + value: 3 + + - it: should create StatefulSet with 5 shards and 2 replicas (15 nodes total) + set: + cluster.enabled: true + cluster.shards: 5 + cluster.replicasPerShard: 2 + cluster.persistence.size: "5Gi" + template: templates/cluster-statefulset.yaml + asserts: + - isKind: + of: StatefulSet + - equal: + path: spec.replicas + value: 15 # 5 shards * (1 + 2 replicas) = 15 nodes + + - it: should use Parallel pod management policy for cluster mode + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + template: templates/cluster-statefulset.yaml + asserts: + - equal: + path: spec.podManagementPolicy + value: Parallel + + - it: should configure PVC with correct storage settings + set: + cluster.enabled: true + cluster.persistence.size: "10Gi" + cluster.persistence.storageClass: "fast-ssd" + template: templates/cluster-statefulset.yaml + asserts: + - equal: + path: spec.volumeClaimTemplates[0].spec.resources.requests.storage + value: "10Gi" + - equal: + path: spec.volumeClaimTemplates[0].spec.storageClassName + value: "fast-ssd" + + - it: should expose both tcp and tcp-bus ports in cluster mode + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + cluster.busPort: 16379 + template: templates/cluster-statefulset.yaml + asserts: + - contains: + path: spec.template.spec.containers[0].ports + content: + name: tcp + containerPort: 6379 + protocol: TCP + - contains: + path: spec.template.spec.containers[0].ports + content: + name: tcp-bus + containerPort: 16379 + protocol: TCP + + # Init container tests + - it: should have init container with cluster environment variables + set: + cluster.enabled: true + cluster.shards: 4 + cluster.replicasPerShard: 2 + cluster.persistence.size: "5Gi" + template: templates/cluster-statefulset.yaml + asserts: + - contains: + path: spec.template.spec.initContainers[0].env + content: + name: CLUSTER_SHARDS + 
value: "4" + - contains: + path: spec.template.spec.initContainers[0].env + content: + name: CLUSTER_REPLICAS_PER_SHARD + value: "2" + + # Service headless tests + - it: should create headless service with bus port in cluster mode + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + cluster.busPort: 16379 + template: templates/service-headless.yaml + asserts: + - isKind: + of: Service + - equal: + path: spec.clusterIP + value: None + - contains: + path: spec.ports + content: + name: tcp + port: 6379 + targetPort: tcp + protocol: TCP + - contains: + path: spec.ports + content: + name: tcp-bus + port: 16379 + targetPort: tcp-bus + protocol: TCP + + # Main service tests + - it: should create service with bus port in cluster mode + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + cluster.busPort: 16379 + template: templates/service.yaml + asserts: + - isKind: + of: Service + - contains: + path: spec.ports + content: + name: tcp-bus + port: 16379 + targetPort: tcp-bus + protocol: TCP + + # Cluster init script tests + - it: should create cluster-script ConfigMap when cluster is enabled + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + template: templates/cluster-script.yaml + asserts: + - isKind: + of: ConfigMap + - equal: + path: metadata.name + value: RELEASE-NAME-valkey-cluster-script + + - it: cluster-script ConfigMap should contain init-cluster.sh + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + template: templates/cluster-script.yaml + asserts: + - isNotNull: + path: data["init-cluster.sh"] + - matchRegex: + path: data["init-cluster.sh"] + pattern: "CLUSTER MEET" + + - it: cluster-script should contain cluster create logic + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + template: templates/cluster-script.yaml + asserts: + - matchRegex: + path: data["init-cluster.sh"] + pattern: "--cluster create" + + - it: should run cluster init script as background process + set: + 
cluster.enabled: true + cluster.persistence.size: "5Gi" + template: templates/cluster-statefulset.yaml + asserts: + - matchRegex: + path: spec.template.spec.containers[0].args[0] + pattern: "/cluster-script/init-cluster.sh &" + + - it: should mount cluster-script volume in container + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + template: templates/cluster-statefulset.yaml + asserts: + - contains: + path: spec.template.spec.containers[0].volumeMounts + content: + name: cluster-script + mountPath: /cluster-script + + - it: should define cluster-script volume + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + template: templates/cluster-statefulset.yaml + asserts: + - contains: + path: spec.template.spec.volumes + content: + name: cluster-script + configMap: + name: RELEASE-NAME-valkey-cluster-script + defaultMode: 365 + + # Authentication tests + - it: should fail when cluster auth enabled but replication user not in aclUsers + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + auth.enabled: true + cluster.replicationUser: "clusteruser" + auth.aclUsers: + default: + password: "test" + permissions: "~* &* +@all" + template: templates/cluster-statefulset.yaml + asserts: + - failedTemplate: + errorPattern: "Cluster replication user 'clusteruser'.*must be defined in auth.aclUsers.*" + + - it: should succeed when cluster auth is properly configured + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + auth.enabled: true + cluster.replicationUser: "default" + auth.aclUsers: + default: + password: "testpass" + permissions: "~* &* +@all" + template: templates/cluster-statefulset.yaml + asserts: + - isKind: + of: StatefulSet + + # TLS tests + - it: should configure TLS volume mount in cluster mode + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + tls.enabled: true + tls.existingSecret: "valkey-tls-secret" + template: templates/cluster-statefulset.yaml + asserts: + - contains: + path: 
spec.template.spec.containers[0].volumeMounts + content: + name: RELEASE-NAME-valkey-tls + mountPath: /tls + + # Init config tests (cluster mode config generation) + - it: should generate cluster config in init script + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + cluster.nodeTimeout: 20000 + template: templates/init_config.yaml + asserts: + - matchRegex: + path: data["init.sh"] + pattern: "cluster-enabled yes" + - matchRegex: + path: data["init.sh"] + pattern: "cluster-config-file /data/nodes.conf" + - matchRegex: + path: data["init.sh"] + pattern: "cluster-node-timeout 20000" + + - it: should configure cluster-require-full-coverage when disabled + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + cluster.requireFullCoverage: false + template: templates/init_config.yaml + asserts: + - matchRegex: + path: data["init.sh"] + pattern: "cluster-require-full-coverage no" + + - it: should configure cluster-allow-reads-when-down when enabled + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + cluster.allowReadsWhenDown: true + template: templates/init_config.yaml + asserts: + - matchRegex: + path: data["init.sh"] + pattern: "cluster-allow-reads-when-down yes" diff --git a/valkey/tests/deployment_test.yaml b/valkey/tests/deployment_test.yaml index 5409986..bf91860 100644 --- a/valkey/tests/deployment_test.yaml +++ b/valkey/tests/deployment_test.yaml @@ -3,6 +3,42 @@ templates: - templates/deploy_valkey.yaml - templates/init_config.yaml tests: + - it: should not create Deployment when replica.enabled is true + set: + replica.enabled: true + template: templates/deploy_valkey.yaml + asserts: + - hasDocuments: + count: 0 + + - it: should not create Deployment when cluster.enabled is true + set: + cluster.enabled: true + template: templates/deploy_valkey.yaml + asserts: + - hasDocuments: + count: 0 + + - it: should not create Deployment when both replica.enabled and cluster.enabled are true + set: + replica.enabled: true + 
cluster.enabled: true + template: templates/deploy_valkey.yaml + asserts: + - hasDocuments: + count: 0 + + - it: should create Deployment when both replica.enabled and cluster.enabled are false + set: + replica.enabled: false + cluster.enabled: false + template: templates/deploy_valkey.yaml + asserts: + - hasDocuments: + count: 1 + - isKind: + of: Deployment + - it: should not have auth volumes when auth disabled set: auth.enabled: false diff --git a/valkey/tests/pvc_test.yaml b/valkey/tests/pvc_test.yaml new file mode 100644 index 0000000..003939f --- /dev/null +++ b/valkey/tests/pvc_test.yaml @@ -0,0 +1,165 @@ +suite: pvc configuration +templates: + - templates/pvc.yaml +tests: + - it: should not create PVC when replica.enabled is true + set: + dataStorage.enabled: true + dataStorage.requestedSize: "8Gi" + replica.enabled: true + asserts: + - hasDocuments: + count: 0 + + - it: should not create PVC when cluster.enabled is true + set: + dataStorage.enabled: true + dataStorage.requestedSize: "8Gi" + cluster.enabled: true + asserts: + - hasDocuments: + count: 0 + + - it: should not create PVC when both replica.enabled and cluster.enabled are true + set: + dataStorage.enabled: true + dataStorage.requestedSize: "8Gi" + replica.enabled: true + cluster.enabled: true + asserts: + - hasDocuments: + count: 0 + + - it: should create PVC when both replica.enabled and cluster.enabled are false and conditions are met + set: + replica.enabled: false + cluster.enabled: false + dataStorage.enabled: true + dataStorage.requestedSize: "8Gi" + dataStorage.persistentVolumeClaimName: "" + asserts: + - hasDocuments: + count: 1 + - isKind: + of: PersistentVolumeClaim + + - it: should not create PVC when dataStorage.enabled is false + set: + dataStorage.enabled: false + replica.enabled: false + cluster.enabled: false + asserts: + - hasDocuments: + count: 0 + + - it: should not create PVC when dataStorage.requestedSize is empty + set: + dataStorage.enabled: true + 
dataStorage.requestedSize: "" + replica.enabled: false + cluster.enabled: false + asserts: + - hasDocuments: + count: 0 + + - it: should not create PVC when dataStorage.persistentVolumeClaimName is set + set: + dataStorage.enabled: true + dataStorage.requestedSize: "8Gi" + dataStorage.persistentVolumeClaimName: "existing-pvc" + replica.enabled: false + cluster.enabled: false + asserts: + - hasDocuments: + count: 0 + + - it: should have correct storage size + set: + dataStorage.enabled: true + dataStorage.requestedSize: "16Gi" + replica.enabled: false + cluster.enabled: false + asserts: + - isKind: + of: PersistentVolumeClaim + - equal: + path: spec.resources.requests.storage + value: "16Gi" + + - it: should have keepPvc annotation when enabled + set: + dataStorage.enabled: true + dataStorage.requestedSize: "8Gi" + dataStorage.keepPvc: true + replica.enabled: false + cluster.enabled: false + asserts: + - isKind: + of: PersistentVolumeClaim + - equal: + path: metadata.annotations["helm.sh/resource-policy"] + value: keep + + - it: should have custom storage class when specified + set: + dataStorage.enabled: true + dataStorage.requestedSize: "8Gi" + dataStorage.className: "fast-ssd" + replica.enabled: false + cluster.enabled: false + asserts: + - isKind: + of: PersistentVolumeClaim + - equal: + path: spec.storageClassName + value: fast-ssd + + - it: should have custom labels when specified + set: + dataStorage.enabled: true + dataStorage.requestedSize: "8Gi" + dataStorage.labels: + custom.label: "value" + another.label: "test" + replica.enabled: false + cluster.enabled: false + asserts: + - isKind: + of: PersistentVolumeClaim + - equal: + path: metadata.labels["custom.label"] + value: value + - equal: + path: metadata.labels["another.label"] + value: test + + - it: should have custom annotations when specified + set: + dataStorage.enabled: true + dataStorage.requestedSize: "8Gi" + dataStorage.annotations: + custom.annotation: "value" + replica.enabled: false + 
cluster.enabled: false + asserts: + - isKind: + of: PersistentVolumeClaim + - equal: + path: metadata.annotations["custom.annotation"] + value: value + + - it: should have correct access modes + set: + dataStorage.enabled: true + dataStorage.requestedSize: "8Gi" + dataStorage.accessModes: + - ReadWriteOnce + replica.enabled: false + cluster.enabled: false + asserts: + - isKind: + of: PersistentVolumeClaim + - equal: + path: spec.accessModes + value: + - ReadWriteOnce diff --git a/valkey/tests/service_test.yaml b/valkey/tests/service_test.yaml index 7e4e919..a7a70a3 100644 --- a/valkey/tests/service_test.yaml +++ b/valkey/tests/service_test.yaml @@ -76,3 +76,58 @@ tests: content: app.kubernetes.io/instance: RELEASE-NAME app.kubernetes.io/name: valkey + - it: should pin to pod-0 when replica.enabled is true + set: + replica.enabled: true + template: templates/service.yaml + asserts: + - isKind: + of: Service + - equal: + path: spec.selector["statefulset.kubernetes.io/pod-name"] + value: RELEASE-NAME-valkey-0 + - it: should not pin to pod-0 when cluster.enabled is true + set: + cluster.enabled: true + template: templates/service.yaml + asserts: + - isKind: + of: Service + - notExists: + path: spec.selector["statefulset.kubernetes.io/pod-name"] + - it: should not pin to pod-0 when both replica.enabled and cluster.enabled are false + set: + replica.enabled: false + cluster.enabled: false + template: templates/service.yaml + asserts: + - isKind: + of: Service + - notExists: + path: spec.selector["statefulset.kubernetes.io/pod-name"] + - it: should have cluster bus port when cluster.enabled is true + set: + cluster.enabled: true + cluster.busPort: 16379 + template: templates/service.yaml + asserts: + - isKind: + of: Service + - contains: + path: spec.ports + content: + port: 16379 + targetPort: tcp-bus + protocol: TCP + name: tcp-bus + - it: should not have cluster bus port when cluster.enabled is false + set: + cluster.enabled: false + template: templates/service.yaml + 
+    asserts:
+      - isKind:
+          of: Service
+      - notContains:
+          path: spec.ports
+          content:
+            name: tcp-bus
diff --git a/valkey/values.yaml b/valkey/values.yaml
index 6770f51..c0cd59b 100644
--- a/valkey/values.yaml
+++ b/valkey/values.yaml
@@ -250,6 +250,50 @@ replica:
   # More info: https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#persistentvolumeclaim-retention
   persistentVolumeClaimRetentionPolicy: {}
+# Cluster mode configuration for Valkey Cluster (sharded deployment)
+# Note: cluster.enabled and replica.enabled are mutually exclusive
+cluster:
+  # Enable cluster mode (creates a sharded Valkey cluster)
+  enabled: false
+
+  # Number of shards (primary nodes). Minimum required is 3 for cluster mode.
+  # Each shard handles a portion of the hash slot range (16384 slots total).
+  shards: 3
+
+  # Number of replicas per shard (for high availability within each shard)
+  # Total nodes = shards × (1 + replicasPerShard)
+  # For example: 3 shards with 1 replica each = 6 nodes total
+  replicasPerShard: 1
+
+  # Username for cluster replication authentication, ignored if auth.enabled is false.
+  # IMPORTANT: When auth.enabled is true, this user MUST be defined in auth.aclUsers.
+ # The user must have appropriate replication permissions: +psync +replconf +ping + replicationUser: "default" + + # Cluster node timeout in milliseconds (how long before a node is considered failed) + nodeTimeout: 15000 + + # Require all hash slots to be covered for the cluster to accept writes + # Set to false to allow partial cluster operation + requireFullCoverage: true + + # Allow cluster to serve read requests when in down state + allowReadsWhenDown: false + + # Persistence configuration (required for cluster mode) + persistence: + # Size of the PVC for each node (required when cluster.enabled is true) + size: "" + # Storage class name (empty = use default storage class) + storageClass: "" + # Access modes for the PVC + accessModes: + - ReadWriteOnce + + # Bus port for cluster communication (default: service.port + 10000) + # This port is used for node-to-node communication in the cluster + busPort: 16379 + tls: # Enable TLS enabled: false From 3d98daafe178390329478084609e6c1b1813bbe3 Mon Sep 17 00:00:00 2001 From: Ankit Pati Date: Thu, 15 Jan 2026 17:26:52 +0530 Subject: [PATCH 2/4] fix: auth for Valkey cluster Signed-off-by: Ankit Pati --- valkey/scripts/cluster-init-script.sh | 25 +++- valkey/templates/cluster-statefulset.yaml | 10 ++ valkey/tests/cluster_test.yaml | 150 ++++++++++++++++++++++ 3 files changed, 181 insertions(+), 4 deletions(-) diff --git a/valkey/scripts/cluster-init-script.sh b/valkey/scripts/cluster-init-script.sh index 79e3514..5925f53 100644 --- a/valkey/scripts/cluster-init-script.sh +++ b/valkey/scripts/cluster-init-script.sh @@ -12,12 +12,29 @@ REPLICAS_PER_SHARD=${CLUSTER_REPLICAS_PER_SHARD:-1} PRIMARIES=$(( CLUSTER_NODE_COUNT / (1 + REPLICAS_PER_SHARD) )) {{- if and .Values.auth.enabled .Values.auth.aclUsers }} -AUTH_OPTION="-a $(cat /etc/valkey/users.acl | grep '^user {{ .Values.cluster.replicationUser }} ' | sed 's/.*#\([a-f0-9]*\).*/\1/' | head -1)" -# If we have the password from environment, use that instead -if [ -n 
"${VALKEY_AUTH_PASSWORD}" ]; then - AUTH_OPTION="-a ${VALKEY_AUTH_PASSWORD}" +# Get password for cluster replication user from mounted secret +{{- $replUsername := .Values.cluster.replicationUser }} +{{- $replUser := index .Values.auth.aclUsers $replUsername }} +{{- $replPasswordKey := $replUser.passwordKey | default $replUsername }} +{{- if .Values.auth.usersExistingSecret }} +if [ -f "/valkey-users-secret/{{ $replPasswordKey }}" ]; then + AUTH_PASSWORD=$(cat "/valkey-users-secret/{{ $replPasswordKey }}") +elif [ -f "/valkey-auth-secret/{{ $replUsername }}-password" ]; then + AUTH_PASSWORD=$(cat "/valkey-auth-secret/{{ $replUsername }}-password") +else + echo "ERROR: No password found for cluster replication user {{ $replUsername }}" + exit 1 fi {{- else }} +if [ -f "/valkey-auth-secret/{{ $replUsername }}-password" ]; then + AUTH_PASSWORD=$(cat "/valkey-auth-secret/{{ $replUsername }}-password") +else + echo "ERROR: No password found for cluster replication user {{ $replUsername }}" + exit 1 +fi +{{- end }} +AUTH_OPTION="-a ${AUTH_PASSWORD}" +{{- else }} AUTH_OPTION="" {{- end }} diff --git a/valkey/templates/cluster-statefulset.yaml b/valkey/templates/cluster-statefulset.yaml index 30a9489..b6ca393 100644 --- a/valkey/templates/cluster-statefulset.yaml +++ b/valkey/templates/cluster-statefulset.yaml @@ -177,6 +177,16 @@ spec: {{- if .Values.auth.enabled }} - name: valkey-acl mountPath: /etc/valkey + {{- if .Values.auth.usersExistingSecret }} + - name: valkey-users-secret + mountPath: /valkey-users-secret + readOnly: true + {{- end }} + {{- if or (include "valkey.hasInlinePasswords" . 
| eq "true") .Values.auth.aclConfig }} + - name: valkey-auth-secret + mountPath: /valkey-auth-secret + readOnly: true + {{- end }} {{- end }} {{- range $secret := .Values.extraValkeySecrets }} - name: {{ $secret.name }}-valkey diff --git a/valkey/tests/cluster_test.yaml b/valkey/tests/cluster_test.yaml index 72e7bfd..e2c70c2 100644 --- a/valkey/tests/cluster_test.yaml +++ b/valkey/tests/cluster_test.yaml @@ -345,3 +345,153 @@ tests: - matchRegex: path: data["init.sh"] pattern: "cluster-allow-reads-when-down yes" + + # Cluster auth secret mount tests (bug fix: ensure main container has access to plaintext password) + - it: should mount valkey-users-secret to main container when auth.usersExistingSecret is set + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + auth.enabled: true + auth.usersExistingSecret: "my-valkey-users" + auth.aclUsers: + default: + permissions: "~* &* +@all" + template: templates/cluster-statefulset.yaml + asserts: + - contains: + path: spec.template.spec.containers[0].volumeMounts + content: + name: valkey-users-secret + mountPath: /valkey-users-secret + readOnly: true + + - it: should mount valkey-auth-secret to main container when inline passwords are used + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + auth.enabled: true + auth.aclUsers: + default: + password: "testpass" + permissions: "~* &* +@all" + template: templates/cluster-statefulset.yaml + asserts: + - contains: + path: spec.template.spec.containers[0].volumeMounts + content: + name: valkey-auth-secret + mountPath: /valkey-auth-secret + readOnly: true + + - it: should mount both auth secrets to main container when both are configured + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + auth.enabled: true + auth.usersExistingSecret: "my-valkey-users" + auth.aclUsers: + default: + permissions: "~* &* +@all" + password: "fallback" + template: templates/cluster-statefulset.yaml + asserts: + - contains: + path: 
spec.template.spec.containers[0].volumeMounts + content: + name: valkey-users-secret + mountPath: /valkey-users-secret + readOnly: true + - contains: + path: spec.template.spec.containers[0].volumeMounts + content: + name: valkey-auth-secret + mountPath: /valkey-auth-secret + readOnly: true + + # Cluster init script password retrieval tests (bug fix: read from secret, not ACL hash) + - it: cluster-script should read password from valkey-users-secret when usersExistingSecret is set + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + auth.enabled: true + auth.usersExistingSecret: "my-valkey-users" + auth.aclUsers: + default: + permissions: "~* &* +@all" + template: templates/cluster-script.yaml + asserts: + - matchRegex: + path: data["init-cluster.sh"] + pattern: '/valkey-users-secret/' + - notMatchRegex: + path: data["init-cluster.sh"] + pattern: '/etc/valkey/users.acl' + + - it: cluster-script should read password from valkey-auth-secret when inline passwords are used + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + auth.enabled: true + auth.aclUsers: + default: + password: "testpass" + permissions: "~* &* +@all" + template: templates/cluster-script.yaml + asserts: + - matchRegex: + path: data["init-cluster.sh"] + pattern: '/valkey-auth-secret/default-password' + - notMatchRegex: + path: data["init-cluster.sh"] + pattern: '/etc/valkey/users.acl' + + - it: cluster-script should use custom passwordKey when configured + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + auth.enabled: true + auth.usersExistingSecret: "my-valkey-users" + auth.aclUsers: + default: + permissions: "~* &* +@all" + passwordKey: "default-pwd" + template: templates/cluster-script.yaml + asserts: + - matchRegex: + path: data["init-cluster.sh"] + pattern: '/valkey-users-secret/default-pwd' + + - it: cluster-script should use custom replicationUser for auth + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + cluster.replicationUser: 
"clusteruser" + auth.enabled: true + auth.aclUsers: + default: + password: "defaultpass" + permissions: "~* &* +@all" + clusteruser: + password: "clusterpass" + permissions: "~* &* +@all" + template: templates/cluster-script.yaml + asserts: + - matchRegex: + path: data["init-cluster.sh"] + pattern: '/valkey-auth-secret/clusteruser-password' + + - it: cluster-script should NOT parse password hash from ACL file + set: + cluster.enabled: true + cluster.persistence.size: "5Gi" + auth.enabled: true + auth.aclUsers: + default: + password: "testpass" + permissions: "~* &* +@all" + template: templates/cluster-script.yaml + asserts: + # Ensure we don't try to extract the hash from the ACL file + - notMatchRegex: + path: data["init-cluster.sh"] + pattern: 'grep.*users\.acl' From ae713b40add00e576fdbdd7860096f3df732146e Mon Sep 17 00:00:00 2001 From: Ankit Pati Date: Wed, 21 Jan 2026 12:18:12 +0530 Subject: [PATCH 3/4] docs: update `README.md`s & `NOTES.txt` for cluster mode Signed-off-by: Ankit Pati --- valkey/README.md | 108 +++++++++++++++++++++++++++++++++++++ valkey/templates/NOTES.txt | 69 +++++++++++++++++++++++- 2 files changed, 175 insertions(+), 2 deletions(-) diff --git a/valkey/README.md b/valkey/README.md index 0487321..1f6aee5 100644 --- a/valkey/README.md +++ b/valkey/README.md @@ -58,6 +58,60 @@ replica: If fewer than `minReplicasToWrite` replicas are available, the master will reject write operations. 
+### Cluster Mode + +Deploy a sharded Valkey cluster for horizontal scaling and high availability: + +```bash +helm install valkey valkey/valkey --set cluster.enabled=true --set cluster.persistence.size=5Gi +``` + +**Architecture:** + +* Data is automatically sharded across multiple primary nodes (16384 hash slots distributed across shards) +* Each shard can have replicas for high availability within the shard +* Total nodes = `shards` × (1 + `replicasPerShard`) + +**Default Configuration (6 nodes):** + +```yaml +cluster: + enabled: true + shards: 3 # Minimum 3 shards required + replicasPerShard: 1 # 1 replica per shard + persistence: + size: 5Gi # Required +``` + +This creates 6 nodes: 3 primary shards + 3 replicas. + +**High Availability Configuration (15 nodes):** + +```yaml +cluster: + enabled: true + shards: 5 # 5 primary shards + replicasPerShard: 2 # 2 replicas per shard for extra redundancy + persistence: + size: 10Gi + storageClass: "fast-ssd" +``` + +**Services:** + +* `valkey`: Main service for client connections (routes to all nodes) +* `valkey-headless`: Headless service for pod discovery and cluster communication + +**Cluster Configuration Options:** + +```yaml +cluster: + nodeTimeout: 15000 # Milliseconds before a node is considered failed + requireFullCoverage: true # Require all hash slots covered to accept writes + allowReadsWhenDown: false # Allow reads when cluster is in down state + busPort: 16379 # Port for inter-node cluster communication +``` + ## Storage ### Standalone Storage @@ -93,6 +147,20 @@ replica: storageClass: "fast-ssd" # Optional ``` +### Cluster Storage + +Persistent storage is **mandatory** in cluster mode. Each node in the cluster maintains its own data partition and cluster state configuration. + +```yaml +cluster: + enabled: true + persistence: + size: 10Gi # Required + storageClass: "fast-ssd" # Optional + accessModes: + - ReadWriteOnce +``` + ## Authentication This chart supports ACL-based authentication for Valkey. 
@@ -174,6 +242,35 @@ replica: * This user MUST be defined in `auth.aclUsers` with appropriate permissions * Minimum permissions: `+psync +replconf +ping` +### Cluster with Authentication + +When using ACL authentication in cluster mode, nodes need credentials to authenticate with each other for cluster operations: + +```yaml +auth: + enabled: true + usersExistingSecret: "my-valkey-users" + aclUsers: + default: + permissions: "~* &* +@all" + cluster-user: + permissions: "+psync +replconf +ping" + +cluster: + enabled: true + shards: 3 + replicasPerShard: 1 + replicationUser: "cluster-user" # Must be defined in auth.aclUsers + persistence: + size: 5Gi +``` + +**Important Notes:** + +* `cluster.replicationUser` specifies which ACL user cluster nodes use to authenticate +* This user MUST be defined in `auth.aclUsers` with appropriate permissions +* Minimum permissions: `+psync +replconf +ping` + ## Metrics This chart supports Prometheus metrics collection using the [Redis exporter](https://github.com/oliver006/redis_exporter). 
@@ -325,6 +422,17 @@ tls: | replica.persistence.size | string | `""` | Required if replica is enabled | | replica.persistence.storageClass | string | `""` | | | replica.persistence.accessModes | list | `""` | | +| cluster.enabled | bool | `false` | Enable cluster mode (mutually exclusive with replica.enabled) | +| cluster.shards | int | `3` | Number of primary shards (minimum 3) | +| cluster.replicasPerShard | int | `1` | Number of replicas per shard | +| cluster.replicationUser | string | `"default"` | ACL user for cluster authentication (must be in auth.aclUsers) | +| cluster.nodeTimeout | int | `15000` | Milliseconds before node is considered failed | +| cluster.requireFullCoverage | bool | `true` | Require all slots covered to accept writes | +| cluster.allowReadsWhenDown | bool | `false` | Allow reads when cluster is down | +| cluster.busPort | int | `16379` | Port for inter-node cluster communication | +| cluster.persistence.size | string | `""` | Required if cluster is enabled | +| cluster.persistence.storageClass | string | `""` | | +| cluster.persistence.accessModes | list | `["ReadWriteOnce"]` | | | resources | object | `{}` | | | securityContext.capabilities.drop[0] | string | `"ALL"` | | | securityContext.readOnlyRootFilesystem | bool | `true` | | diff --git a/valkey/templates/NOTES.txt b/valkey/templates/NOTES.txt index 07ddb6d..e59b325 100644 --- a/valkey/templates/NOTES.txt +++ b/valkey/templates/NOTES.txt @@ -10,7 +10,56 @@ Namespace: {{ .Release.Namespace }} Chart: {{ .Chart.Name }} {{ .Chart.Version }} App version: {{ .Chart.AppVersion }} -{{- if .Values.replica.enabled }} +{{- if .Values.cluster.enabled }} +================================================================================ +🌐 CLUSTER MODE (Sharded) +================================================================================ + +Your Valkey deployment is running in CLUSTER mode: +- {{ .Values.cluster.shards }} Shard(s) (primary nodes) +- {{ .Values.cluster.replicasPerShard }} 
Replica(s) per shard +- {{ include "valkey.clusterNodeCount" . }} Total node(s) + +Hash slots (16384 total) are distributed across the {{ .Values.cluster.shards }} shards. + +Service: {{ include "valkey.fullname" . }} +Type: {{ .Values.service.type }} +Port: {{ .Values.service.port }} +Bus Port: {{ .Values.cluster.busPort }} (for inter-node communication) + +1) In-cluster access + From another Pod: + $ valkey-cli -h {{ include "valkey.fullname" . }} -p {{ .Values.service.port }}{{ if .Values.tls.enabled }} --tls{{- end }} -c PING + + Note: Use the `-c` flag to enable cluster mode in valkey-cli. + +2) Local access via kubectl port-forward + $ kubectl -n {{ .Release.Namespace }} port-forward svc/{{ include "valkey.fullname" . }} 6379:{{ .Values.service.port }} + In another terminal: + $ valkey-cli -h 127.0.0.1 -p 6379{{ if .Values.tls.enabled }} --tls{{- end }} -c PING +{{ if eq .Values.service.type "LoadBalancer" }} +3) External access (LoadBalancer) + $ export SERVICE_IP=$(kubectl -n {{ .Release.Namespace }} get svc {{ include "valkey.fullname" . }} -o jsonpath='{.status.loadBalancer.ingress[0].ip}') + $ valkey-cli -h $SERVICE_IP -p {{ .Values.service.port }}{{ if .Values.tls.enabled }} --tls{{- end }} -c PING +{{ else if eq .Values.service.type "NodePort" }} +3) External access (NodePort) + $ export NODE_PORT=$(kubectl -n {{ .Release.Namespace }} get svc {{ include "valkey.fullname" . 
}} -o jsonpath='{.spec.ports[0].nodePort}')
+ $ export NODE_IP=$(kubectl get nodes -o jsonpath='{.items[0].status.addresses[?(@.type=="InternalIP")].address}')
+ $ valkey-cli -h $NODE_IP -p $NODE_PORT{{ if .Values.tls.enabled }} --tls{{- end }} -c PING
+{{ end }}
+Direct Pod Access (Headless Service):
+{{- $shards := int .Values.cluster.shards }}
+{{- $replicasPerShard := int .Values.cluster.replicasPerShard }}
+{{- $totalNodes := mul $shards (add 1 $replicasPerShard) }}
+{{- range $i := until (int $totalNodes) }}
+ {{ include "valkey.fullname" $ }}-{{ $i }}.{{ include "valkey.headlessServiceName" $ }}.{{ $.Release.Namespace }}.svc.{{ $.Values.clusterDomain }}
+{{- end }}
+
+Cluster Info:
+ $ valkey-cli -h {{ include "valkey.fullname" . }} -p {{ .Values.service.port }}{{ if .Values.tls.enabled }} --tls{{- end }}{{ if .Values.auth.enabled }} --user <username> -a <password>{{ end }} cluster info
+ $ valkey-cli -h {{ include "valkey.fullname" . }} -p {{ .Values.service.port }}{{ if .Values.tls.enabled }} --tls{{- end }}{{ if .Values.auth.enabled }} --user <username> -a <password>{{ end }} cluster nodes
+
+{{- else if .Values.replica.enabled }}
 ================================================================================
 🔄 REPLICATION MODE
 ================================================================================
@@ -99,13 +148,29 @@ Port: {{ .Values.service.port }}
 {{ end }}
 
 ✅ Quick test
+{{- if .Values.cluster.enabled }}
+$ valkey-cli -h {{ include "valkey.fullname" . }} -p {{ .Values.service.port }}{{ if .Values.tls.enabled }} --tls{{- end }}{{ if .Values.auth.enabled }} --user <username> -a <password>{{ end }} -c
+valkey> SET foo bar
+valkey> GET foo
+"bar"
+valkey> CLUSTER INFO
+{{- else }}
 $ valkey-cli -h {{ include "valkey.fullname" . 
}} -p {{ .Values.service.port }}{{ if .Values.tls.enabled }} --tls{{- end }}{{ if .Values.auth.enabled }} --user -a {{ end }} valkey> SET foo bar valkey> GET foo "bar" +{{- end }} 💾 Persistence -{{- if .Values.replica.enabled }} +{{- if .Values.cluster.enabled }} +- Persistence is ENABLED (required for cluster mode). Each node has its own volume. +- Size: {{ .Values.cluster.persistence.size }} +{{- if .Values.cluster.persistence.storageClass }} +- Storage class: {{ .Values.cluster.persistence.storageClass }} +{{- end }} +- To see PVCs: + $ kubectl -n {{ .Release.Namespace }} get pvc -l app.kubernetes.io/instance={{ .Release.Name }} +{{- else if .Values.replica.enabled }} - Persistence is ENABLED (required for replication mode). Each instance has its own volume. - Size: {{ .Values.replica.persistence.size }} {{- if .Values.replica.persistence.storageClass }} From 021f4f2dcff9ec0d03d962edcfb212a9ca7ab6e4 Mon Sep 17 00:00:00 2001 From: Ankit Pati Date: Sat, 24 Jan 2026 18:01:55 +0530 Subject: [PATCH 4/4] fix: schema updated with missing values Signed-off-by: Ankit Pati --- valkey/values.schema.json | 57 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/valkey/values.schema.json b/valkey/values.schema.json index f34233e..7370fe9 100644 --- a/valkey/values.schema.json +++ b/valkey/values.schema.json @@ -22,6 +22,52 @@ } } }, + "cluster": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + }, + "shards": { + "type": "integer" + }, + "replicasPerShard": { + "type": "integer" + }, + "replicationUser": { + "type": "string" + }, + "nodeTimeout": { + "type": "integer" + }, + "requireFullCoverage": { + "type": "boolean" + }, + "allowReadsWhenDown": { + "type": "boolean" + }, + "persistence": { + "type": "object", + "properties": { + "size": { + "type": "string" + }, + "storageClass": { + "type": "string" + }, + "accessModes": { + "type": "array", + "items": { + "type": "string" + } + } + } + }, + "busPort": { + 
"type": "integer" + } + } + }, "clusterDomain": { "type": "string" }, @@ -347,6 +393,14 @@ }, "runAsUser": { "type": "integer" + }, + "seccompProfile": { + "type": "object", + "properties": { + "type": { + "type": "string" + } + } } } }, @@ -441,6 +495,9 @@ "securityContext": { "type": "object", "properties": { + "allowPrivilegeEscalation": { + "type": "boolean" + }, "capabilities": { "type": "object", "properties": {