Conversation
Signed-off-by: will <zzchun8@gmail.com>
Code Review
This pull request introduces a resource-based actor autoscaler and a pluggable actor removal strategy, which are significant enhancements for managing resources in Ray Data. The implementation is well-structured and includes comprehensive tests for the new functionality.
My review has identified a couple of areas for improvement:
- There is some duplicated code in the new ResourceBasedActorAutoscaler that could be refactored for better maintainability.
- A constant appears to be defined twice in DataContext, which should be corrected.
Overall, this is a solid contribution. Addressing these points will further improve the code quality.
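As background on the second feature, a pluggable actor removal strategy usually reduces to a small interface the pool consults when scaling down. The sketch below is purely illustrative; the names (ActorRemovalStrategy, ActorInfo, select_actors_to_remove) are assumptions, not the interface this PR actually adds:

```python
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List


@dataclass
class ActorInfo:
    """Hypothetical per-actor bookkeeping; field names are assumptions."""
    actor_id: str
    num_tasks_in_flight: int


class ActorRemovalStrategy(ABC):
    """Decides which actors a pool should remove when scaling down (sketch only)."""

    @abstractmethod
    def select_actors_to_remove(
        self, actors: List[ActorInfo], num_to_remove: int
    ) -> List[ActorInfo]:
        ...


class IdleFirstRemovalStrategy(ActorRemovalStrategy):
    """Example policy: prefer removing the actors with the fewest tasks in flight."""

    def select_actors_to_remove(self, actors, num_to_remove):
        ranked = sorted(actors, key=lambda a: a.num_tasks_in_flight)
        return ranked[:num_to_remove]


# Example usage: remove the two most idle actors from a pool of three.
pool = [ActorInfo("a", 3), ActorInfo("b", 0), ActorInfo("c", 1)]
print(IdleFirstRemovalStrategy().select_actors_to_remove(pool, 2))  # actors "b" and "c"
```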
DEFAULT_ACTOR_MAX_TASKS_IN_FLIGHT_TO_MAX_CONCURRENCY_FACTOR = env_integer(
    "RAY_DATA_ACTOR_DEFAULT_MAX_TASKS_IN_FLIGHT_TO_MAX_CONCURRENCY_FACTOR", 2
)
def _calculate_pool_min_resources(
    self,
    actor_pool: "AutoscalingActorPool",
    per_actor_resources: ExecutionResources,
    weight_ratio: float,
    all_pools: list,
    pool_weights: Dict,
    pool_resource_needs: Dict,
) -> ExecutionResources:
    """Calculate min resources for a single pool"""
    pool_min_cpu = (
        self._job_min_resources.cpu * weight_ratio
        if per_actor_resources.cpu > 0
        else 0
    )

    # GPU allocation: Only allocate to pools that need GPUs
    if per_actor_resources.gpu > 0:
        gpu_pools = [p for p in all_pools if pool_resource_needs[p].gpu > 0]
        if gpu_pools:
            gpu_total_weight = sum(pool_weights[p] for p in gpu_pools)
            gpu_weight_ratio = (
                pool_weights[actor_pool] / gpu_total_weight
                if gpu_total_weight > 0
                else 1.0 / len(gpu_pools)
            )
            pool_min_gpu = self._job_min_resources.gpu * gpu_weight_ratio
        else:
            pool_min_gpu = 0
    else:
        pool_min_gpu = 0

    # Memory allocation
    pool_min_memory = (
        self._job_min_resources.memory * weight_ratio
        if per_actor_resources.memory > 0
        else 0
    )

    return ExecutionResources(
        cpu=pool_min_cpu,
        gpu=pool_min_gpu,
        memory=pool_min_memory,
    )

def _calculate_pool_max_resources(
    self,
    actor_pool: "AutoscalingActorPool",
    per_actor_resources: ExecutionResources,
    weight_ratio: float,
    all_pools: list,
    pool_weights: Dict,
    pool_resource_needs: Dict,
) -> ExecutionResources:
    """Calculate max resources for a single pool"""
    # CPU allocation
    pool_max_cpu = (
        self._job_max_resources.cpu * weight_ratio
        if per_actor_resources.cpu > 0
        else 0
    )

    # GPU allocation: Only allocate to pools that need GPUs
    if per_actor_resources.gpu > 0:
        gpu_pools = [p for p in all_pools if pool_resource_needs[p].gpu > 0]
        if gpu_pools:
            gpu_total_weight = sum(pool_weights[p] for p in gpu_pools)
            gpu_weight_ratio = (
                pool_weights[actor_pool] / gpu_total_weight
                if gpu_total_weight > 0
                else 1.0 / len(gpu_pools)
            )
            pool_max_gpu = self._job_max_resources.gpu * gpu_weight_ratio
        else:
            pool_max_gpu = 0
    else:
        pool_max_gpu = 0

    # Memory allocation
    pool_max_memory = (
        self._job_max_resources.memory * weight_ratio
        if per_actor_resources.memory > 0
        else 0
    )

    return ExecutionResources(
        cpu=pool_max_cpu,
        gpu=pool_max_gpu,
        memory=pool_max_memory,
    )
The methods _calculate_pool_min_resources and _calculate_pool_max_resources contain a lot of duplicated logic. To improve maintainability and reduce redundancy, you can extract the common logic into a single helper method that takes the job-level resources (min or max) as an argument.
Here's a suggested refactoring:
def _calculate_pool_resources(
    self,
    job_resources: ExecutionResources,
    actor_pool: "AutoscalingActorPool",
    per_actor_resources: ExecutionResources,
    weight_ratio: float,
    all_pools: list,
    pool_weights: Dict,
    pool_resource_needs: Dict,
) -> ExecutionResources:
    """Calculate resources for a single pool based on job-level resources."""
    pool_cpu = (
        job_resources.cpu * weight_ratio
        if per_actor_resources.cpu > 0
        else 0
    )
    # GPU allocation: Only allocate to pools that need GPUs
    if per_actor_resources.gpu > 0:
        gpu_pools = [p for p in all_pools if pool_resource_needs[p].gpu > 0]
        if gpu_pools:
            gpu_total_weight = sum(pool_weights[p] for p in gpu_pools)
            gpu_weight_ratio = (
                pool_weights[actor_pool] / gpu_total_weight
                if gpu_total_weight > 0
                else 1.0 / len(gpu_pools)
            )
            pool_gpu = job_resources.gpu * gpu_weight_ratio
        else:
            pool_gpu = 0
    else:
        pool_gpu = 0
    # Memory allocation
    pool_memory = (
        job_resources.memory * weight_ratio
        if per_actor_resources.memory > 0
        else 0
    )
    return ExecutionResources(
        cpu=pool_cpu,
        gpu=pool_gpu,
        memory=pool_memory,
    )

def _calculate_pool_min_resources(
    self,
    actor_pool: "AutoscalingActorPool",
    per_actor_resources: ExecutionResources,
    weight_ratio: float,
    all_pools: list,
    pool_weights: Dict,
    pool_resource_needs: Dict,
) -> ExecutionResources:
    """Calculate min resources for a single pool"""
    return self._calculate_pool_resources(
        self._job_min_resources,
        actor_pool,
        per_actor_resources,
        weight_ratio,
        all_pools,
        pool_weights,
        pool_resource_needs,
    )

def _calculate_pool_max_resources(
    self,
    actor_pool: "AutoscalingActorPool",
    per_actor_resources: ExecutionResources,
    weight_ratio: float,
    all_pools: list,
    pool_weights: Dict,
    pool_resource_needs: Dict,
) -> ExecutionResources:
    """Calculate max resources for a single pool"""
    return self._calculate_pool_resources(
        self._job_max_resources,
        actor_pool,
        per_actor_resources,
        weight_ratio,
        all_pools,
        pool_weights,
        pool_resource_needs,
    )

logger = logging.getLogger(__name__)

# Default maximum pool size
DEFAULT_MAX_POOL_SIZE = 100

logger = logging.getLogger(__name__)

# Default maximum pool size
Suggested change:

# Default maximum pool size.
    config: AutoscalingConfig,
):
    super().__init__(topology, resource_manager, config=config)
    # job-level resource limits
Suggested change:

# Job-level resource limits.
| """ | ||
|
|
||
| if min_resources is not None and max_resources is not None: | ||
| # Check CPU |
|
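For orientation, the kind of check this snippet appears to begin might look roughly like the hedged sketch below; the function name and error messages are assumptions, not the PR's actual validation code:

```python
def _validate_resource_bounds(min_resources, max_resources):
    """Hypothetical sketch of min/max resource validation; the PR's real checks may differ."""
    if min_resources is None or max_resources is None:
        return
    # Check CPU: a lower bound above the upper bound is a configuration error.
    if min_resources.cpu > max_resources.cpu:
        raise ValueError("min CPU must not exceed max CPU")
    # Analogous checks for GPU and memory.
    if min_resources.gpu > max_resources.gpu:
        raise ValueError("min GPU must not exceed max GPU")
    if min_resources.memory > max_resources.memory:
        raise ValueError("min memory must not exceed max memory")
```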
This pull request has been automatically marked as stale because it has not had recent activity. You can always ask for help on our discussion forum or Ray's public Slack channel. If you'd like to keep this open, just leave any comment, and the stale label will be removed.
Description
This PR implements a resource-based actor autoscaler to improve resource management in Ray Data.
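As a rough mental model only (not the PR's actual code), a resource-based scale-up decision typically checks whether one more actor still fits within the pool's resource budget; all names in the sketch below are illustrative assumptions:

```python
from dataclasses import dataclass


@dataclass
class Resources:
    """Illustrative resource bundle; field names are assumptions."""
    cpu: float = 0.0
    gpu: float = 0.0
    memory: float = 0.0


def can_add_actor(current: Resources, per_actor: Resources, budget: Resources) -> bool:
    """Return True if one more actor fits within the pool's resource budget (sketch only)."""
    return (
        current.cpu + per_actor.cpu <= budget.cpu
        and current.gpu + per_actor.gpu <= budget.gpu
        and current.memory + per_actor.memory <= budget.memory
    )


# Example: with a budget of 8 CPUs and actors needing 2 CPUs each,
# a pool already using 6 CPUs can still add exactly one more actor.
print(can_add_actor(Resources(cpu=6), Resources(cpu=2), Resources(cpu=8)))  # True
```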
Features
Main Changes