From a1dc56c7b8fa39dc223e1ecbe53010ffda5f94b8 Mon Sep 17 00:00:00 2001 From: Kris Howard Date: Tue, 27 Jan 2026 02:21:29 -0700 Subject: [PATCH] feat(cache): add Redis Cluster support for HA deployments Implements Redis Cluster client support in slurm-web agent to enable high-availability caching across distributed Redis clusters. ## Problem Slurm-web currently supports only standalone Redis instances for caching. In high-availability deployments with Redis Cluster (3+ node clustered Redis), slurm-web agents fail to connect because they use the standard redis.Redis() client instead of the cluster-aware redis.cluster.RedisCluster() client. ## Solution This commit adds optional Redis Cluster support while maintaining full backwards compatibility with standalone Redis deployments. ### Core Changes **slurmweb/cache.py**: - Import RedisCluster and ClusterNode from redis.cluster - Add cluster_mode and cluster_nodes optional parameters to CachingService - Implement cluster mode initialization with RedisCluster client - Parse cluster_nodes from "host:port" string format - Add connection validation (ping() at initialization) with fail-fast error handling in both standalone and cluster modes **slurmweb/apps/agent.py**: - Pass cluster_mode and cluster_nodes parameters to CachingService - Use getattr() with defaults for backwards compatibility **conf/vendor/agent.yml**: - Add cluster_mode boolean parameter (default: false) - Add cluster_nodes list parameter with string content type - Document configuration with examples ## Features - **Opt-in design**: Cluster mode disabled by default (cluster_mode=false) - **Automatic failover**: Cluster continues if a Redis node fails - **Load distribution**: Requests distributed across cluster nodes - **Backwards compatible**: Existing standalone configurations work unchanged - **Fail-fast validation**: Connection tested with ping() at initialization in both standalone and cluster modes; if Redis is unreachable the error is logged and the ConnectionError is re-raised ## Configuration Example ```ini [cache] enabled = yes cluster_mode = yes cluster_nodes = 10.0.0.1:6379 10.0.0.2:6379 10.0.0.3:6379 jobs = 30 nodes = 30 ``` ## Testing 
Tested on production environment: - Slurm-web 6.0.0 - Redis cluster: 3 nodes - Slurm controllers: 2 nodes - OS: Ubuntu 24.04 - Verified backward compatibility with standalone mode ## Implementation Notes - Uses "host:port" string format for RFL schema compatibility (list content type must be str, not dict) - skip_full_coverage_check=True allows partial cluster visibility - decode_responses=False maintains pickle serialization compatibility - Connection validated with ping() at initialization Closes: #[issue-number] --- conf/vendor/agent.yml | 20 ++++++++++++++ slurmweb/apps/agent.py | 2 ++ slurmweb/cache.py | 59 ++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 2 deletions(-) diff --git a/conf/vendor/agent.yml b/conf/vendor/agent.yml index 4969209f2..9b1997632 100644 --- a/conf/vendor/agent.yml +++ b/conf/vendor/agent.yml @@ -458,6 +458,26 @@ cache: Password to connect to protected Redis server. When this parameter is not defined, Redis server is accessed without password. ex: SECR3T + cluster_mode: + type: bool + default: false + doc: | + Enable Redis cluster mode for high-availability caching. + When enabled, the agent connects to a Redis cluster instead of + a standalone instance, providing automatic failover and load distribution. + Requires cluster_nodes to be specified. + ex: yes + cluster_nodes: + type: list + content: str + doc: | + List of Redis cluster node addresses in format host:port. + Only used when cluster_mode is enabled. + Minimum 3 nodes recommended for production HA clusters. 
+ ex: + - "10.0.0.1:6379" + - "10.0.0.2:6379" + - "10.0.0.3:6379" version: type: int default: 1800 diff --git a/slurmweb/apps/agent.py b/slurmweb/apps/agent.py index 10b2b98b7..9e0ffeffe 100644 --- a/slurmweb/apps/agent.py +++ b/slurmweb/apps/agent.py @@ -108,6 +108,8 @@ def __init__(self, seed): host=self.settings.cache.host, port=self.settings.cache.port, password=self.settings.cache.password, + cluster_mode=getattr(self.settings.cache, 'cluster_mode', False), + cluster_nodes=getattr(self.settings.cache, 'cluster_nodes', None), ) else: logger.warning("Caching is disabled") diff --git a/slurmweb/cache.py b/slurmweb/cache.py index f25fce4be..d52150c0e 100644 --- a/slurmweb/cache.py +++ b/slurmweb/cache.py @@ -8,6 +8,7 @@ import logging import redis +from redis.cluster import RedisCluster, ClusterNode import pickle from .errors import SlurmwebCacheError @@ -31,10 +32,64 @@ class CachingService: KEY_PREFIX_MISS = "cache-miss-" KEY_PREFIX_HIT = "cache-hit-" - def __init__(self, host: str, port: int, password: t.Union[str, None]): + def __init__( + self, + host: str, + port: int, + password: t.Union[str, None], + cluster_mode: bool = False, + cluster_nodes: t.Optional[t.List[str]] = None, + ): + """Initialize Redis connection (standalone or cluster mode). 
+ + Args: + host: Redis server hostname (used in standalone mode) + port: Redis server port (used in standalone mode) + password: Redis password (optional, used in both modes) + cluster_mode: Enable Redis cluster mode (default: False) + cluster_nodes: List of cluster nodes in "host:port" format + Example: ["10.0.0.1:6379", "10.0.0.2:6379"] + Required when cluster_mode=True + """ self.host = host self.port = port - self.connection = redis.Redis(host=host, port=port, password=password) + self.cluster_mode = cluster_mode + + if cluster_mode: + if not cluster_nodes: + raise ValueError( + "cluster_nodes must be provided when cluster_mode=True" + ) + + # Parse cluster_nodes from "host:port" string format to ClusterNode objects + startup_nodes = [ + ClusterNode(host, int(port)) + for node in cluster_nodes + for host, port in [node.split(":", 1)] + ] + + logger.info( + "Initializing Redis cluster connection with %d nodes", + len(startup_nodes), + ) + + self.connection = RedisCluster( + startup_nodes=startup_nodes, + password=password, + decode_responses=False, # Binary mode for pickle + skip_full_coverage_check=True, # Allow partial clusters + ) + else: + logger.info("Initializing Redis standalone connection to %s:%d", host, port) + self.connection = redis.Redis(host=host, port=port, password=password) + + # Validate connection at initialization (fail-fast) + try: + self.connection.ping() + logger.info("Redis connection established successfully") + except redis.exceptions.ConnectionError as error: + logger.error("Failed to connect to Redis: %s", error) + raise def put(self, key: CacheKey, value: t.Any, expiration: int): try: