From 8b6e172845dd6fffdce37ab62c38cb730c540047 Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Wed, 23 Apr 2025 10:39:14 -0500 Subject: [PATCH 01/38] add parallel scheduler --- .../schedulers/parallel_scheduler.jsonc | 12 ++ .../schemas/schedulers/scheduler-schema.json | 39 +++-- mlos_bench/mlos_bench/schedulers/__init__.py | 2 + .../schedulers/parallel_scheduler.py | 145 ++++++++++++++++++ .../mlos_bench/schedulers/trial_runner.py | 85 ++++++++-- .../invalid/parallel_sched-bad-repeat.jsonc | 6 + .../invalid/parallel_sched-empty-config.jsonc | 5 + .../bad/unhandled/parallel_sched-extra.jsonc | 6 + .../good/full/parallel_sched-full.jsonc | 12 ++ .../good/partial/parallel_sched-partial.jsonc | 7 + .../tests/launcher_parse_args_test.py | 64 +++++++- .../mlos_bench/tests/storage/conftest.py | 2 + .../mlos_bench/tests/storage/sql/fixtures.py | 133 +++++++++++++++- .../tests/storage/trial_schedule_test.py | 31 +++- 14 files changed, 517 insertions(+), 32 deletions(-) create mode 100644 mlos_bench/mlos_bench/config/schedulers/parallel_scheduler.jsonc create mode 100644 mlos_bench/mlos_bench/schedulers/parallel_scheduler.py create mode 100644 mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/bad/invalid/parallel_sched-bad-repeat.jsonc create mode 100644 mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/bad/invalid/parallel_sched-empty-config.jsonc create mode 100644 mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/bad/unhandled/parallel_sched-extra.jsonc create mode 100644 mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/good/full/parallel_sched-full.jsonc create mode 100644 mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/good/partial/parallel_sched-partial.jsonc diff --git a/mlos_bench/mlos_bench/config/schedulers/parallel_scheduler.jsonc b/mlos_bench/mlos_bench/config/schedulers/parallel_scheduler.jsonc new file mode 100644 index 00000000000..4d6b7a7e272 --- /dev/null +++ 
b/mlos_bench/mlos_bench/config/schedulers/parallel_scheduler.jsonc @@ -0,0 +1,12 @@ +// Parallel scheduler config for the benchmarking framework. +{ + "$schema": "https://raw.githubusercontent.com/microsoft/MLOS/main/mlos_bench/mlos_bench/config/schemas/schedulers/scheduler-schema.json", + + "class": "mlos_bench.schedulers.ParallelScheduler", + + "config": { + "trial_config_repeat_count": 3, + "max_trials": -1, // Limited only in the Optimizer logic/config. + "teardown": false + } +} diff --git a/mlos_bench/mlos_bench/config/schemas/schedulers/scheduler-schema.json b/mlos_bench/mlos_bench/config/schemas/schedulers/scheduler-schema.json index 81b2e797547..dedac1ed758 100644 --- a/mlos_bench/mlos_bench/config/schemas/schedulers/scheduler-schema.json +++ b/mlos_bench/mlos_bench/config/schemas/schedulers/scheduler-schema.json @@ -2,12 +2,10 @@ "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/microsoft/MLOS/main/mlos_bench/mlos_bench/config/schemas/schedulers/scheduler-schema.json", "title": "mlos_bench Scheduler config", - "$defs": { "comment": { "$comment": "This section contains reusable partial schema bits (or just split out for readability)" }, - "config_base_scheduler": { "$comment": "config properties common to all Scheduler types.", "type": "object", @@ -29,18 +27,23 @@ "description": "Max. number of trials to run. 
Use -1 or 0 for unlimited.", "type": "integer", "minimum": -1, - "examples": [50, -1] + "examples": [ + 50, + -1 + ] }, "trial_config_repeat_count": { "description": "Number of times to repeat a config.", "type": "integer", "minimum": 1, - "examples": [3, 5] + "examples": [ + 3, + 5 + ] } } } }, - "description": "config for the mlos_bench scheduler", "$comment": "top level schema document rules", "type": "object", @@ -51,21 +54,20 @@ "$comment": "This is optional, but if provided, should match the name of this file.", "pattern": "/schemas/schedulers/scheduler-schema.json$" }, - "description": { "description": "Optional description of the config.", "type": "string" }, - "class": { "description": "The name of the scheduler class to use.", "$comment": "required", "enum": [ "mlos_bench.schedulers.SyncScheduler", - "mlos_bench.schedulers.sync_scheduler.SyncScheduler" + "mlos_bench.schedulers.sync_scheduler.SyncScheduler", + "mlos_bench.schedulers.ParallelScheduler", + "mlos_bench.schedulers.parallel_scheduler.ParallelScheduler" ] }, - "config": { "description": "The scheduler-specific config.", "$comment": "Stub for scheduler-specific config appended with condition statements below", @@ -73,8 +75,9 @@ "minProperties": 1 } }, - "required": ["class"], - + "required": [ + "class" + ], "oneOf": [ { "$comment": "extensions to the 'config' object properties when synchronous scheduler is being used", @@ -83,17 +86,25 @@ "class": { "enum": [ "mlos_bench.schedulers.SyncScheduler", - "mlos_bench.schedulers.sync_scheduler.SyncScheduler" + "mlos_bench.schedulers.sync_scheduler.SyncScheduler", + "mlos_bench.schedulers.ParallelScheduler", + "mlos_bench.schedulers.parallel_scheduler.ParallelScheduler" ] } }, - "required": ["class"] + "required": [ + "class" + ] }, "then": { "properties": { "config": { "type": "object", - "allOf": [{ "$ref": "#/$defs/config_base_scheduler" }], + "allOf": [ + { + "$ref": "#/$defs/config_base_scheduler" + } + ], "$comment": "disallow other properties", 
"unevaluatedProperties": false } diff --git a/mlos_bench/mlos_bench/schedulers/__init__.py b/mlos_bench/mlos_bench/schedulers/__init__.py index 381261e53da..fd381612be7 100644 --- a/mlos_bench/mlos_bench/schedulers/__init__.py +++ b/mlos_bench/mlos_bench/schedulers/__init__.py @@ -5,9 +5,11 @@ """Interfaces and implementations of the optimization loop scheduling policies.""" from mlos_bench.schedulers.base_scheduler import Scheduler +from mlos_bench.schedulers.parallel_scheduler import ParallelScheduler from mlos_bench.schedulers.sync_scheduler import SyncScheduler __all__ = [ "Scheduler", "SyncScheduler", + "ParallelScheduler", ] diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py new file mode 100644 index 00000000000..5393cb1e933 --- /dev/null +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -0,0 +1,145 @@ +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +"""A simple single-threaded synchronous optimization loop implementation.""" + +import copy +from datetime import datetime +import logging +import threading +from typing import Callable, Optional + +from pytz import UTC + +import logging +from datetime import datetime + +from pytz import UTC + +from mlos_bench.storage.base_storage import Storage + +from mlos_bench.schedulers.base_scheduler import Scheduler +from mlos_bench.storage.base_storage import Storage +import time + +_LOG = logging.getLogger(__name__) + +class ParallelScheduler(Scheduler): + """A simple multi-threaded asynchronous optimization loop implementation.""" + def start(self) -> None: + """Start the optimization loop.""" + super().start() + + self._idle_runners: set[int] = set(self._trial_runners.keys()) + self._busy_runners: set[int] = set() + self._scheduled_trials: set[int] = set() + self._pending_updates: list[Callable[[], None]] = [] + self._pending_threads: list[threading.Thread] = [] + self._runner_lock: threading.Lock = 
threading.Lock() + + is_warm_up: bool = self.optimizer.supports_preload + if not is_warm_up: + _LOG.warning("Skip pending trials and warm-up: %s", self.optimizer) + + not_done: bool = True + while not_done: + _LOG.info("Optimization loop: Last trial ID: %d", self._last_trial_id) + self._run_schedule(is_warm_up) + not_done = self._schedule_new_optimizer_suggestions() + is_warm_up = False + + # Wait for all pending runners to finish + while len(self._busy_runners) > 0: + with self._runner_lock: + for update in self._pending_updates: + update() + time.sleep(1) + + def _run_schedule(self, running: bool = False) -> None: + """ + Scheduler part of the loop. + + Check for pending trials in the queue and run them. + """ + assert self.experiment is not None + # Collect all pending trials + # It is critical that we filter out trials that are already assigned to a trial runner + # If we do not filter out these trials, it will cause configurations to be double scheduled + # and will cause the storage backend to fail. + pending_trials: list[Storage.Trial] = [ + t + for t in self.experiment.pending_trials(datetime.now(UTC), running=running) + if t.trial_id not in self._scheduled_trials + ] + + for trial in pending_trials: + # Wait for an idle trial runner + trial_runner_id: Optional[int] = None + while trial_runner_id is None: + with self._runner_lock: + for update in self._pending_updates: + update() + self._pending_updates.clear() + if len(self._idle_runners) > 0: + # Schedule a Trial to a Trial Runner + trial_runner_id = self._idle_runners.pop() + self._busy_runners.add(trial_runner_id) + self._scheduled_trials.add(trial.trial_id) + + # Assign the trial to the trial runner. Note that this will be reset + # if pending_trials is queried again from the experiment + trial.set_trial_runner(trial_runner_id) + self.run_trial(trial) + + if trial_runner_id is None: + # Sleep for a short time if failed to find to prevent busy wait + _LOG.debug("No idle trial runners available. 
Waiting...") + time.sleep(1) + + def async_run_trial(self, trial: Storage.Trial) -> None: + """ + Run a single trial in the background. + + Parameters + ---------- + trial : Storage.Trial + A Storage class based Trial used to persist the experiment trial data. + """ + register_fn = self.get_trial_runner(trial)._run_trial( + trial, copy.copy(self.global_config) + ) + + def callback(trial: Storage.Trial = trial): + """ + Callback to pass to the main thread to register the results with the storage. + + Parameters + ---------- + trial : Storage.Trial + A Storage class based Trial used to persist the experiment trial data. + """ + assert trial.trial_runner_id is not None, "Trial runner ID should not be None" + + register_fn() + self._busy_runners.remove(trial.trial_runner_id) + self._idle_runners.add(trial.trial_runner_id) + + with self._runner_lock: + self._pending_updates.append(callback) + + def run_trial(self, trial: Storage.Trial) -> None: + """ + Schedule a single trial to be run in the background. + + Parameters + ---------- + trial : Storage.Trial + A Storage class based Trial used to persist the experiment trial data. 
+ """ + super().run_trial(trial) + trial_runner = self.get_trial_runner(trial) + + thread = threading.Thread(target=self.async_run_trial, args=(trial,)) + self._pending_threads.append(thread) + thread.start() diff --git a/mlos_bench/mlos_bench/schedulers/trial_runner.py b/mlos_bench/mlos_bench/schedulers/trial_runner.py index 80eb696bc6d..6c39279d434 100644 --- a/mlos_bench/mlos_bench/schedulers/trial_runner.py +++ b/mlos_bench/mlos_bench/schedulers/trial_runner.py @@ -5,6 +5,7 @@ """Simple class to run an individual Trial on a given Environment.""" import logging +from collections.abc import Callable from datetime import datetime from types import TracebackType from typing import Any, Literal @@ -164,14 +165,14 @@ def is_running(self) -> bool: """Get the running state of the current TrialRunner.""" return self._is_running - def run_trial( + def _run_trial( self, trial: Storage.Trial, global_config: dict[str, Any] | None = None, - ) -> None: + ) -> Callable[[], None]: """ - Run a single trial on this TrialRunner's Environment and stores the results in - the backend Trial Storage. + Run a single trial on this TrialRunner's Environment and return a callback to + store the results in the backend Trial Storage. Parameters ---------- @@ -182,8 +183,8 @@ def run_trial( Returns ------- - (trial_status, trial_score) : (Status, dict[str, float] | None) - Status and results of the trial. + callback : Callable[[], None] + Returns a callback to register the results with the storage backend. """ assert self._in_context @@ -199,8 +200,18 @@ def run_trial( _LOG.warning("Setup failed: %s :: %s", self.environment, trial.tunables) # FIXME: Use the actual timestamp from the environment. _LOG.info("TrialRunner: Update trial results: %s :: %s", trial, Status.FAILED) - trial.update(Status.FAILED, datetime.now(UTC)) - return + + def fail_callback() -> None: + """ + A callback to register the results with the storage backend. + + This must be called from the main thread. 
For a synchronous scheduler + this can just be called directly. For an asynchronous scheduler, this + will be passed to the main thread when the trial is finished. + """ + trial.update(Status.FAILED, datetime.now(UTC)) + + return fail_callback # TODO: start background status polling of the environments in the event loop. @@ -214,13 +225,63 @@ def run_trial( # Use the status and timestamp from `.run()` as it is the final status of the experiment. # TODO: Use the `.status()` output in async mode. - trial.update_telemetry(status, timestamp, telemetry) - - trial.update(status, timestamp, results) - _LOG.info("TrialRunner: Update trial results: %s :: %s %s", trial, status, results) + def success_callback() -> None: + """ + A callback to register the results with the storage backend. + + This must be called from the main thread. For a synchronous scheduler this + can just be called directly. For an asynchronous scheduler, this will be + passed to the main thread when the trial is finished. + """ + trial.update_telemetry(status, timestamp, telemetry) + trial.update(status, timestamp, results) + _LOG.info("TrialRunner: Update trial results: %s :: %s %s", trial, status, results) self._is_running = False + return success_callback + + def run_trial( + self, + trial: Storage.Trial, + global_config: dict[str, Any] | None = None, + ) -> None: + """ + Run a single trial on this TrialRunner's Environment and store the results in + the backend Trial Storage. + + Parameters + ---------- + trial : Storage.Trial + A Storage class based Trial used to persist the experiment trial data. + global_config : dict + Global configuration parameters. + """ + self._run_trial(trial, global_config)() + + async def async_run_trial( + self, + trial: Storage.Trial, + global_config: dict[str, Any] | None = None, + ) -> Callable[[], None]: + """ + Run a single trial on this TrialRunner's Environment and return a callback to + store the results in the backend Trial Storage. 
+ + Parameters + ---------- + trial : Storage.Trial + A Storage class based Trial used to persist the experiment trial data. + global_config : dict + Global configuration parameters. + + Returns + ------- + callback : Callable[[], None] + Returns a callback to register the results with the storage backend. + """ + return self._run_trial(trial, global_config) + def teardown(self) -> None: """ Tear down the Environment. diff --git a/mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/bad/invalid/parallel_sched-bad-repeat.jsonc b/mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/bad/invalid/parallel_sched-bad-repeat.jsonc new file mode 100644 index 00000000000..4ea6bdbf170 --- /dev/null +++ b/mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/bad/invalid/parallel_sched-bad-repeat.jsonc @@ -0,0 +1,6 @@ +{ + "class": "mlos_bench.schedulers.ParallelScheduler", + "config": { + "trial_config_repeat_count": 0 + } +} diff --git a/mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/bad/invalid/parallel_sched-empty-config.jsonc b/mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/bad/invalid/parallel_sched-empty-config.jsonc new file mode 100644 index 00000000000..06729a4f368 --- /dev/null +++ b/mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/bad/invalid/parallel_sched-empty-config.jsonc @@ -0,0 +1,5 @@ +{ + "class": "mlos_bench.schedulers.ParallelScheduler", + "config": { + } +} diff --git a/mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/bad/unhandled/parallel_sched-extra.jsonc b/mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/bad/unhandled/parallel_sched-extra.jsonc new file mode 100644 index 00000000000..68623ee611f --- /dev/null +++ b/mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/bad/unhandled/parallel_sched-extra.jsonc @@ -0,0 +1,6 @@ +{ + "class": "mlos_bench.schedulers.parallel_scheduler.ParallelScheduler", + "config": { + "extra": 
"unsupported" + } +} diff --git a/mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/good/full/parallel_sched-full.jsonc b/mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/good/full/parallel_sched-full.jsonc new file mode 100644 index 00000000000..90bac645032 --- /dev/null +++ b/mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/good/full/parallel_sched-full.jsonc @@ -0,0 +1,12 @@ +{ + "$schema": "https://raw.githubusercontent.com/microsoft/MLOS/main/mlos_bench/mlos_bench/config/schemas/schedulers/scheduler-schema.json", + "class": "mlos_bench.schedulers.parallel_scheduler.ParallelScheduler", + "config": { + "trial_config_repeat_count": 3, + "teardown": false, + "experiment_id": "MyExperimentName", + "config_id": 1, + "trial_id": 1, + "max_trials": 100 + } +} diff --git a/mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/good/partial/parallel_sched-partial.jsonc b/mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/good/partial/parallel_sched-partial.jsonc new file mode 100644 index 00000000000..1b0e39c3305 --- /dev/null +++ b/mlos_bench/mlos_bench/tests/config/schemas/schedulers/test-cases/good/partial/parallel_sched-partial.jsonc @@ -0,0 +1,7 @@ +{ + "class": "mlos_bench.schedulers.ParallelScheduler", + "config": { + "trial_config_repeat_count": 3, + "teardown": false + } +} diff --git a/mlos_bench/mlos_bench/tests/launcher_parse_args_test.py b/mlos_bench/mlos_bench/tests/launcher_parse_args_test.py index 6294ee8bf3b..b84245732f5 100644 --- a/mlos_bench/mlos_bench/tests/launcher_parse_args_test.py +++ b/mlos_bench/mlos_bench/tests/launcher_parse_args_test.py @@ -18,7 +18,7 @@ from mlos_bench.launcher import Launcher from mlos_bench.optimizers import MlosCoreOptimizer, OneShotOptimizer from mlos_bench.os_environ import environ -from mlos_bench.schedulers import SyncScheduler +from mlos_bench.schedulers import ParallelScheduler, SyncScheduler from mlos_bench.services.types import ( 
SupportsAuth, SupportsConfigLoading, @@ -307,5 +307,67 @@ def test_launcher_args_parse_3(config_paths: list[str]) -> None: assert launcher.scheduler.trial_config_repeat_count == 2 +def test_launcher_args_parse_4(config_paths: list[str]) -> None: + """ + Test that using multiple --globals arguments works and that multiple space separated + options to --config-paths works. + + Check $var expansion and Environment loading. + """ + # Here we have multiple paths following --config-paths and --service. + cli_args = ( + "--config-paths " + + " ".join(config_paths) + + " --num-trial-runners 5" + + " --service services/remote/mock/mock_auth_service.jsonc" + " services/remote/mock/mock_remote_exec_service.jsonc" + " --scheduler schedulers/parallel_scheduler.jsonc" + f" --environment {ENV_CONF_PATH}" + " --globals globals/global_test_config.jsonc" + " --globals globals/global_test_extra_config.jsonc" + " --test_global_value_2 from-args" + ) + launcher = _get_launcher(__name__, cli_args) + # Check some additional features of the parent service + assert isinstance(launcher.service, SupportsAuth) # from --service + assert isinstance(launcher.service, SupportsRemoteExec) # from --service + # Check that the first --globals file is loaded and $var expansion is handled. + assert launcher.global_config["experiment_id"] == "MockExperiment" + assert launcher.global_config["testVmName"] == "MockExperiment-vm" + # Check that secondary expansion also works. + assert launcher.global_config["testVnetName"] == "MockExperiment-vm-vnet" + # Check that the second --globals file is loaded. + assert launcher.global_config["test_global_value"] == "from-file" + # Check overriding values in a file from the command line. + assert launcher.global_config["test_global_value_2"] == "from-args" + # Check that we can expand a $var in a config file that references an environment variable. 
+ assert path_join(launcher.global_config["pathVarWithEnvVarRef"], abs_path=True) == path_join( + os.getcwd(), "foo", abs_path=True + ) + assert launcher.global_config["varWithEnvVarRef"] == f"user:{getuser()}" + assert launcher.teardown + # Make sure we have the right number of trial runners. + assert len(launcher.trial_runners) == 5 # from cli args + # Check that the environment that got loaded looks to be of the right type. + env_config = launcher.config_loader.load_config(ENV_CONF_PATH, ConfigSchema.ENVIRONMENT) + assert env_config["class"] == "mlos_bench.environments.mock_env.MockEnv" + # All TrialRunners should get the same Environment. + assert all( + check_class_name(trial_runner.environment, env_config["class"]) + for trial_runner in launcher.trial_runners + ) + # Check that the optimizer looks right. + assert isinstance(launcher.optimizer, OneShotOptimizer) + # Check that the optimizer got initialized with defaults. + assert launcher.optimizer.tunable_params.is_defaults() + assert launcher.optimizer.max_suggestions == 1 # value for OneShotOptimizer + # Check that we pick up the right scheduler config: + assert isinstance(launcher.scheduler, ParallelScheduler) + assert ( + launcher.scheduler.trial_config_repeat_count == 3 + ) # from the custom parallel_scheduler.jsonc config + assert launcher.scheduler.max_trials == -1 + + if __name__ == "__main__": pytest.main([__file__, "-n0"]) diff --git a/mlos_bench/mlos_bench/tests/storage/conftest.py b/mlos_bench/mlos_bench/tests/storage/conftest.py index a1437052823..212bf4acd4c 100644 --- a/mlos_bench/mlos_bench/tests/storage/conftest.py +++ b/mlos_bench/mlos_bench/tests/storage/conftest.py @@ -16,5 +16,7 @@ exp_no_tunables_storage = sql_storage_fixtures.exp_no_tunables_storage mixed_numerics_exp_storage = sql_storage_fixtures.mixed_numerics_exp_storage exp_data = sql_storage_fixtures.exp_data +parallel_exp_data = sql_storage_fixtures.parallel_exp_data + exp_no_tunables_data = sql_storage_fixtures.exp_no_tunables_data 
mixed_numerics_exp_data = sql_storage_fixtures.mixed_numerics_exp_data diff --git a/mlos_bench/mlos_bench/tests/storage/sql/fixtures.py b/mlos_bench/mlos_bench/tests/storage/sql/fixtures.py index cb83bffd4ff..3cec974fcf5 100644 --- a/mlos_bench/mlos_bench/tests/storage/sql/fixtures.py +++ b/mlos_bench/mlos_bench/tests/storage/sql/fixtures.py @@ -4,12 +4,13 @@ # """Test fixtures for mlos_bench storage.""" -from collections.abc import Generator +from collections.abc import Callable, Generator from random import seed as rand_seed import pytest from mlos_bench.optimizers.mock_optimizer import MockOptimizer +from mlos_bench.schedulers.parallel_scheduler import ParallelScheduler from mlos_bench.schedulers.sync_scheduler import SyncScheduler from mlos_bench.schedulers.trial_runner import TrialRunner from mlos_bench.services.config_persistence import ConfigPersistenceService @@ -109,6 +110,94 @@ def mixed_numerics_exp_storage( assert not exp._in_context +def _parallel_dummy_run_exp( + storage: SqlStorage, + exp: SqlStorage.Experiment, +) -> ExperimentData: + """ + Generates data by doing a simulated run of the given experiment. + + Parameters + ---------- + storage : SqlStorage + The storage object to use. + exp : SqlStorage.Experiment + The experiment to "run". + Note: this particular object won't be updated, but a new one will be created + from its metadata. + + Returns + ------- + ExperimentData + The data generated by the simulated run. 
+ """ + # pylint: disable=too-many-locals + + rand_seed(SEED) + + trial_runners: list[TrialRunner] = [] + global_config: dict = {} + config_loader = ConfigPersistenceService() + tunable_params = ",".join(f'"{name}"' for name in exp.tunables.get_covariant_group_names()) + mock_env_json = f""" + {{ + "class": "mlos_bench.environments.mock_env.MockEnv", + "name": "Test Env", + "config": {{ + "tunable_params": [{tunable_params}], + "mock_env_seed": {SEED}, + "mock_env_range": [60, 120], + "mock_env_metrics": ["score"] + }} + }} + """ + trial_runners = TrialRunner.create_from_json( + config_loader=config_loader, + global_config=global_config, + tunable_groups=exp.tunables, + env_json=mock_env_json, + svcs_json=None, + num_trial_runners=TRIAL_RUNNER_COUNT, + ) + + opt = MockOptimizer( + tunables=exp.tunables, + config={ + "optimization_targets": exp.opt_targets, + "seed": SEED, + # This should be the default, so we leave it omitted for now to test the default. + # But the test logic relies on this (e.g., trial 1 is config 1 is the + # default values for the tunable params) + # "start_with_defaults": True, + "max_suggestions": MAX_TRIALS, + }, + global_config=global_config, + ) + + scheduler = ParallelScheduler( + # All config values can be overridden from global config + config={ + "experiment_id": exp.experiment_id, + "trial_id": exp.trial_id, + "config_id": -1, + "trial_config_repeat_count": CONFIG_TRIAL_REPEAT_COUNT, + "max_trials": MAX_TRIALS, + }, + global_config=global_config, + trial_runners=trial_runners, + optimizer=opt, + storage=storage, + root_env_config=exp.root_env_config, + ) + + # Add some trial data to that experiment by "running" it. 
+ with scheduler: + scheduler.start() + scheduler.teardown() + + return storage.experiments[exp.experiment_id] + + def _dummy_run_exp( storage: SqlStorage, exp: SqlStorage.Experiment, @@ -197,13 +286,49 @@ def _dummy_run_exp( return storage.experiments[exp.experiment_id] +def _exp_data( + storage: SqlStorage, + exp_storage: SqlStorage.Experiment, + run_exp: Callable[[SqlStorage, SqlStorage.Experiment], ExperimentData] = _dummy_run_exp, +) -> ExperimentData: + """Test fixture for ExperimentData.""" + return run_exp(storage, exp_storage) + + +def _exp_no_tunables_data( + storage: SqlStorage, + exp_no_tunables_storage: SqlStorage.Experiment, + run_exp: Callable[[SqlStorage, SqlStorage.Experiment], ExperimentData] = _dummy_run_exp, +) -> ExperimentData: + """Test fixture for ExperimentData with no tunable configs.""" + return run_exp(storage, exp_no_tunables_storage) + + +def _mixed_numerics_exp_data( + storage: SqlStorage, + mixed_numerics_exp_storage: SqlStorage.Experiment, + run_exp: Callable[[SqlStorage, SqlStorage.Experiment], ExperimentData] = _dummy_run_exp, +) -> ExperimentData: + """Test fixture for ExperimentData with mixed numerical tunable types.""" + return run_exp(storage, mixed_numerics_exp_storage) + + @pytest.fixture def exp_data( storage: SqlStorage, exp_storage: SqlStorage.Experiment, ) -> ExperimentData: """Test fixture for ExperimentData.""" - return _dummy_run_exp(storage, exp_storage) + return _exp_data(storage, exp_storage) + + +@pytest.fixture +def parallel_exp_data( + storage: SqlStorage, + exp_storage: SqlStorage.Experiment, +) -> ExperimentData: + """Test fixture for ExperimentData with parallel scheduling.""" + return _exp_data(storage, exp_storage, run_exp=_parallel_dummy_run_exp) @pytest.fixture @@ -212,7 +337,7 @@ def exp_no_tunables_data( exp_no_tunables_storage: SqlStorage.Experiment, ) -> ExperimentData: """Test fixture for ExperimentData with no tunable configs.""" - return _dummy_run_exp(storage, exp_no_tunables_storage) + return 
_exp_no_tunables_data(storage, exp_no_tunables_storage) @pytest.fixture @@ -221,4 +346,4 @@ def mixed_numerics_exp_data( mixed_numerics_exp_storage: SqlStorage.Experiment, ) -> ExperimentData: """Test fixture for ExperimentData with mixed numerical tunable types.""" - return _dummy_run_exp(storage, mixed_numerics_exp_storage) + return _mixed_numerics_exp_data(storage, mixed_numerics_exp_storage) diff --git a/mlos_bench/mlos_bench/tests/storage/trial_schedule_test.py b/mlos_bench/mlos_bench/tests/storage/trial_schedule_test.py index a1ab74f9f5c..98d244160fc 100644 --- a/mlos_bench/mlos_bench/tests/storage/trial_schedule_test.py +++ b/mlos_bench/mlos_bench/tests/storage/trial_schedule_test.py @@ -3,14 +3,17 @@ # Licensed under the MIT License. # """Unit tests for scheduling trials for some future time.""" -from collections.abc import Iterator +from collections.abc import Callable, Iterator from datetime import datetime, timedelta +from typing import Any +import numpy as np from pytz import UTC from mlos_bench.environments.status import Status from mlos_bench.storage.base_experiment_data import ExperimentData from mlos_bench.storage.base_storage import Storage +from mlos_bench.storage.base_trial_data import TrialData from mlos_bench.tests.storage import ( CONFIG_COUNT, CONFIG_TRIAL_REPEAT_COUNT, @@ -156,3 +159,29 @@ def test_rr_scheduling(exp_data: ExperimentData) -> None: assert ( trial.trial_runner_id == expected_runner_id ), f"Expected trial_runner_id {expected_runner_id} for {trial}" + + +def test_parallel_scheduling(parallel_exp_data: ExperimentData) -> None: + """ + Checks that the scheduler schedules all of Trials across TrialRunners. + + Note that we can no longer assume the order of the trials, since they can complete + in any order. 
+ """ + extractor: Callable[[Callable[[TrialData], Any]], list[Any]] = lambda fn: [ + fn(parallel_exp_data.trials[id]) + for id in range(1, CONFIG_COUNT * CONFIG_TRIAL_REPEAT_COUNT + 1) + ] + + trial_ids = extractor(lambda trial: trial.trial_id) + assert set(trial_ids) == set(range(1, CONFIG_COUNT * CONFIG_TRIAL_REPEAT_COUNT + 1)) + + config_ids = extractor(lambda trial: trial.tunable_config_id) + unique_config_ids, config_counts = np.unique(config_ids, return_counts=True) + assert len(unique_config_ids) == CONFIG_COUNT + assert all(count == CONFIG_TRIAL_REPEAT_COUNT for count in config_counts) + + repeat_nums = extractor(lambda trial: trial.metadata_dict["repeat_i"]) + unique_repeat_nums, repeat_nums_counts = np.unique(repeat_nums, return_counts=True) + assert len(unique_repeat_nums) == CONFIG_TRIAL_REPEAT_COUNT + assert all(count == CONFIG_COUNT for count in repeat_nums_counts) From 1373ff19d7ea3e3b239e0afacb69ed88a3840c6a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 23 Apr 2025 15:46:07 +0000 Subject: [PATCH 02/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../schedulers/parallel_scheduler.py | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index 5393cb1e933..ecf9edb07e5 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -5,28 +5,24 @@ """A simple single-threaded synchronous optimization loop implementation.""" import copy -from datetime import datetime import logging import threading -from typing import Callable, Optional - -from pytz import UTC - -import logging +import time +from collections.abc import Callable from datetime import datetime +from typing import Optional from pytz import UTC -from 
mlos_bench.storage.base_storage import Storage - from mlos_bench.schedulers.base_scheduler import Scheduler from mlos_bench.storage.base_storage import Storage -import time _LOG = logging.getLogger(__name__) + class ParallelScheduler(Scheduler): """A simple multi-threaded asynchronous optimization loop implementation.""" + def start(self) -> None: """Start the optimization loop.""" super().start() @@ -75,7 +71,7 @@ def _run_schedule(self, running: bool = False) -> None: for trial in pending_trials: # Wait for an idle trial runner - trial_runner_id: Optional[int] = None + trial_runner_id: int | None = None while trial_runner_id is None: with self._runner_lock: for update in self._pending_updates: @@ -106,13 +102,12 @@ def async_run_trial(self, trial: Storage.Trial) -> None: trial : Storage.Trial A Storage class based Trial used to persist the experiment trial data. """ - register_fn = self.get_trial_runner(trial)._run_trial( - trial, copy.copy(self.global_config) - ) + register_fn = self.get_trial_runner(trial)._run_trial(trial, copy.copy(self.global_config)) def callback(trial: Storage.Trial = trial): """ - Callback to pass to the main thread to register the results with the storage. + Callback to pass to the main thread to register the results with the + storage. 
Parameters ---------- From b45c71eebbbfae6cc75748ba7679ca384bfe95e1 Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Thu, 24 Apr 2025 14:03:26 -0500 Subject: [PATCH 03/38] alternative implementation for threads --- mlos_bench/mlos_bench/environments/status.py | 8 + .../schedulers/parallel_scheduler.py | 164 +++++++++--------- .../mlos_bench/schedulers/trial_runner.py | 2 +- mlos_bench/mlos_bench/storage/base_storage.py | 23 +++ .../mlos_bench/storage/sql/experiment.py | 17 +- 5 files changed, 124 insertions(+), 90 deletions(-) diff --git a/mlos_bench/mlos_bench/environments/status.py b/mlos_bench/mlos_bench/environments/status.py index ca35b3473da..066b659f154 100644 --- a/mlos_bench/mlos_bench/environments/status.py +++ b/mlos_bench/mlos_bench/environments/status.py @@ -18,6 +18,7 @@ class Status(enum.Enum): CANCELED = 5 FAILED = 6 TIMED_OUT = 7 + SCHEDULED = 8 def is_good(self) -> bool: """Check if the status of the benchmark/environment is good.""" @@ -26,6 +27,7 @@ def is_good(self) -> bool: Status.READY, Status.RUNNING, Status.SUCCEEDED, + Status.SCHEDULED, } def is_completed(self) -> bool: @@ -74,3 +76,9 @@ def is_timed_out(self) -> bool: TIMED_OUT. """ return self == Status.FAILED + + def is_scheduled(self) -> bool: + """Check if the status of the benchmark/environment Trial or Experiment is + SCHEDULED. 
+ """ + return self == Status.SCHEDULED diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index ecf9edb07e5..2e56b068316 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -4,18 +4,19 @@ # """A simple single-threaded synchronous optimization loop implementation.""" -import copy +import asyncio import logging -import threading -import time from collections.abc import Callable +from concurrent.futures import Future, ThreadPoolExecutor from datetime import datetime -from typing import Optional +from typing import Any from pytz import UTC +from mlos_bench.environments.status import Status from mlos_bench.schedulers.base_scheduler import Scheduler from mlos_bench.storage.base_storage import Storage +from mlos_bench.tunables.tunable_groups import TunableGroups _LOG = logging.getLogger(__name__) @@ -23,17 +24,15 @@ class ParallelScheduler(Scheduler): """A simple multi-threaded asynchronous optimization loop implementation.""" + def __init__(self, *args: Any, **kwargs: Any) -> None: + + super().__init__(*args, **kwargs) + self.pool = ThreadPoolExecutor(max_workers=len(self._trial_runners)) + def start(self) -> None: """Start the optimization loop.""" super().start() - self._idle_runners: set[int] = set(self._trial_runners.keys()) - self._busy_runners: set[int] = set() - self._scheduled_trials: set[int] = set() - self._pending_updates: list[Callable[[], None]] = [] - self._pending_threads: list[threading.Thread] = [] - self._runner_lock: threading.Lock = threading.Lock() - is_warm_up: bool = self.optimizer.supports_preload if not is_warm_up: _LOG.warning("Skip pending trials and warm-up: %s", self.optimizer) @@ -41,16 +40,34 @@ def start(self) -> None: not_done: bool = True while not_done: _LOG.info("Optimization loop: Last trial ID: %d", self._last_trial_id) + self._run_callbacks() self._run_schedule(is_warm_up) 
not_done = self._schedule_new_optimizer_suggestions() is_warm_up = False - # Wait for all pending runners to finish - while len(self._busy_runners) > 0: - with self._runner_lock: - for update in self._pending_updates: - update() - time.sleep(1) + def teardown(self) -> None: + """Stop the optimization loop.""" + super().teardown() + self.pool.shutdown(wait=True) + self._run_callbacks() + + def schedule_trial(self, tunables: TunableGroups) -> None: + """Assign a trial to a trial runner.""" + assert self.experiment is not None + + super().schedule_trial(tunables) + + pending_trials: list[Storage.Trial] = list( + self.experiment.pending_trials(datetime.now(UTC), running=False) + ) + + idle_runner_ids = [ + id for id, runner in self.trial_runners.items() if not runner.is_running + ] + + for trial, runner_id in zip(pending_trials, idle_runner_ids): + trial.update(status=Status.SCHEDULED, timestamp=datetime.now(UTC)) + trial.set_trial_runner(runner_id) def _run_schedule(self, running: bool = False) -> None: """ @@ -59,82 +76,63 @@ def _run_schedule(self, running: bool = False) -> None: Check for pending trials in the queue and run them. """ assert self.experiment is not None - # Collect all pending trials - # It is critical that we filter out trials that are already assigned to a trial runner - # If we do not filter out these trials, it will cause configurations to be double scheduled - # and will cause the storage backend to fail. 
- pending_trials: list[Storage.Trial] = [ - t - for t in self.experiment.pending_trials(datetime.now(UTC), running=running) - if t.trial_id not in self._scheduled_trials - ] - for trial in pending_trials: - # Wait for an idle trial runner - trial_runner_id: int | None = None - while trial_runner_id is None: - with self._runner_lock: - for update in self._pending_updates: - update() - self._pending_updates.clear() - if len(self._idle_runners) > 0: - # Schedule a Trial to a Trial Runner - trial_runner_id = self._idle_runners.pop() - self._busy_runners.add(trial_runner_id) - self._scheduled_trials.add(trial.trial_id) - - # Assign the trial to the trial runner. Note that this will be reset - # if pending_trials is queried again from the experiment - trial.set_trial_runner(trial_runner_id) - self.run_trial(trial) - - if trial_runner_id is None: - # Sleep for a short time if failed to find to prevent busy wait - _LOG.debug("No idle trial runners available. Waiting...") - time.sleep(1) - - def async_run_trial(self, trial: Storage.Trial) -> None: - """ - Run a single trial in the background. + scheduled_trials: list[Storage.Trial] = list( + self.experiment.filter_trials_by_status(datetime.now(UTC), [Status.SCHEDULED]) + ) - Parameters - ---------- - trial : Storage.Trial - A Storage class based Trial used to persist the experiment trial data. - """ - register_fn = self.get_trial_runner(trial)._run_trial(trial, copy.copy(self.global_config)) - - def callback(trial: Storage.Trial = trial): - """ - Callback to pass to the main thread to register the results with the - storage. - - Parameters - ---------- - trial : Storage.Trial - A Storage class based Trial used to persist the experiment trial data. 
- """ - assert trial.trial_runner_id is not None, "Trial runner ID should not be None" + for trial in scheduled_trials: + trial.update(status=Status.READY, timestamp=datetime.now(UTC)) + task = self.pool.submit(self.async_run_trial, trial) + asyncio.get_event_loop().call_soon_threadsafe(self._on_trial_finished, task) - register_fn() - self._busy_runners.remove(trial.trial_runner_id) - self._idle_runners.add(trial.trial_runner_id) + @staticmethod + def _on_trial_finished(result: Future[Callable[[], None]]) -> None: + """ + Callback to be called when a trial is finished. - with self._runner_lock: - self._pending_updates.append(callback) + This must always be called from the main thread. Exceptions can also be handled + here + """ + try: + callback = result.result() + callback() + except Exception as exception: # pylint: disable=broad-except + _LOG.error("Trial failed: %s", exception) + + @staticmethod + def _run_callbacks() -> None: + """Run all pending callbacks in the main thread.""" + loop = asyncio.get_event_loop() + pending = asyncio.all_tasks(loop) + loop.run_until_complete(asyncio.gather(*pending)) def run_trial(self, trial: Storage.Trial) -> None: - """ - Schedule a single trial to be run in the background. + """Parallel Scheduler does not support run_trial. Use async_run_trial instead. Parameters ---------- trial : Storage.Trial - A Storage class based Trial used to persist the experiment trial data. + The trial to run. + + Raises + ------ + NotImplementedError + Error to indicate that this method is not supported in ParallelScheduler. + """ + raise NotImplementedError( + "ParallelScheduler does not support run_trial. Use async_run_trial instead." + ) + + def async_run_trial(self, trial: Storage.Trial) -> Callable[[], None]: + """ + Set up and run a single trial asynchronously. + + Returns a callback to save the results in the storage. """ super().run_trial(trial) + # In the sync scheduler we run each trial on its own TrialRunner in sequence. 
trial_runner = self.get_trial_runner(trial) - - thread = threading.Thread(target=self.async_run_trial, args=(trial,)) - self._pending_threads.append(thread) - thread.start() + result = trial_runner.deferred_run_trial(trial, self.global_config) + _LOG.info("QUEUE: Finished trial: %s on %s", trial, trial_runner) + return result diff --git a/mlos_bench/mlos_bench/schedulers/trial_runner.py b/mlos_bench/mlos_bench/schedulers/trial_runner.py index 6c39279d434..4dc5a558fb6 100644 --- a/mlos_bench/mlos_bench/schedulers/trial_runner.py +++ b/mlos_bench/mlos_bench/schedulers/trial_runner.py @@ -259,7 +259,7 @@ def run_trial( """ self._run_trial(trial, global_config)() - async def async_run_trial( + def deferred_run_trial( self, trial: Storage.Trial, global_config: dict[str, Any] | None = None, diff --git a/mlos_bench/mlos_bench/storage/base_storage.py b/mlos_bench/mlos_bench/storage/base_storage.py index f2d393994f7..8587f87b3b6 100644 --- a/mlos_bench/mlos_bench/storage/base_storage.py +++ b/mlos_bench/mlos_bench/storage/base_storage.py @@ -307,6 +307,29 @@ def load( Trial ids, Tunable values, benchmark scores, and status of the trials. """ + @abstractmethod + def filter_trials_by_status( + self, + timestamp: datetime, + status: list[Status], + ) -> Iterator["Storage.Trial"]: + """ + Return an iterator over the pending trials that are scheduled to run on or + before the specified timestamp matching one of status listed. + + Parameters + ---------- + timestamp : datetime.datetime + The time in UTC to check for scheduled trials. + status : list[Status] + Status of the trials to filter in. + + Returns + ------- + trials : Iterator[Storage.Trial] + An iterator over the matching trials. 
+ """ + @abstractmethod def pending_trials( self, diff --git a/mlos_bench/mlos_bench/storage/sql/experiment.py b/mlos_bench/mlos_bench/storage/sql/experiment.py index eb47de7d714..eb052f6d1fc 100644 --- a/mlos_bench/mlos_bench/storage/sql/experiment.py +++ b/mlos_bench/mlos_bench/storage/sql/experiment.py @@ -235,13 +235,11 @@ def _get_key_val(conn: Connection, table: Table, field: str, **kwargs: Any) -> d row._tuple() for row in cur_result.fetchall() # pylint: disable=protected-access ) - def pending_trials(self, timestamp: datetime, *, running: bool) -> Iterator[Storage.Trial]: + def filter_trials_by_status( + self, timestamp: datetime, status: list[Status] + ) -> Iterator[Storage.Trial]: timestamp = utcify_timestamp(timestamp, origin="local") _LOG.info("Retrieve pending trials for: %s @ %s", self._experiment_id, timestamp) - if running: - pending_status = [Status.PENDING.name, Status.READY.name, Status.RUNNING.name] - else: - pending_status = [Status.PENDING.name] with self._engine.connect() as conn: cur_trials = conn.execute( self._schema.trial.select().where( @@ -251,7 +249,7 @@ def pending_trials(self, timestamp: datetime, *, running: bool) -> Iterator[Stor | (self._schema.trial.c.ts_start <= timestamp) ), self._schema.trial.c.ts_end.is_(None), - self._schema.trial.c.status.in_(pending_status), + self._schema.trial.c.status.in_([s.name for s in status]), ) ) for trial in cur_trials.fetchall(): @@ -281,6 +279,13 @@ def pending_trials(self, timestamp: datetime, *, running: bool) -> Iterator[Stor config=config, ) + def pending_trials(self, timestamp: datetime, *, running: bool) -> Iterator[Storage.Trial]: + if running: + pending_status = [Status.PENDING, Status.READY, Status.RUNNING] + else: + pending_status = [Status.PENDING] + return self.filter_trials_by_status(timestamp=timestamp, status=pending_status) + def _get_config_id(self, conn: Connection, tunables: TunableGroups) -> int: """ Get the config ID for the given tunables. 
From a326cdfb144a0cd3939910af8999fbae1cf621f3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 24 Apr 2025 19:04:08 +0000 Subject: [PATCH 04/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mlos_bench/mlos_bench/schedulers/parallel_scheduler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index 2e56b068316..929f555f515 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -108,7 +108,8 @@ def _run_callbacks() -> None: loop.run_until_complete(asyncio.gather(*pending)) def run_trial(self, trial: Storage.Trial) -> None: - """Parallel Scheduler does not support run_trial. Use async_run_trial instead. + """ + Parallel Scheduler does not support run_trial. Use async_run_trial instead. 
Parameters ---------- From 8d346866179e973ec630a73a6ca4a533cba874cc Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Thu, 24 Apr 2025 14:05:33 -0500 Subject: [PATCH 05/38] add comments --- mlos_bench/mlos_bench/schedulers/parallel_scheduler.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index 2e56b068316..af1940ce08a 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -48,6 +48,8 @@ def start(self) -> None: def teardown(self) -> None: """Stop the optimization loop.""" super().teardown() + + # Shutdown the thread pool and wait for all tasks to finish self.pool.shutdown(wait=True) self._run_callbacks() @@ -65,6 +67,7 @@ def schedule_trial(self, tunables: TunableGroups) -> None: id for id, runner in self.trial_runners.items() if not runner.is_running ] + # Assign pending trials to idle runners for trial, runner_id in zip(pending_trials, idle_runner_ids): trial.update(status=Status.SCHEDULED, timestamp=datetime.now(UTC)) trial.set_trial_runner(runner_id) @@ -83,7 +86,9 @@ def _run_schedule(self, running: bool = False) -> None: for trial in scheduled_trials: trial.update(status=Status.READY, timestamp=datetime.now(UTC)) - task = self.pool.submit(self.async_run_trial, trial) + task = self.pool.submit(self.deferred_run_trial, trial) + + # This is required to ensure that the callback happens on the main thread asyncio.get_event_loop().call_soon_threadsafe(self._on_trial_finished, task) @staticmethod @@ -124,7 +129,7 @@ def run_trial(self, trial: Storage.Trial) -> None: "ParallelScheduler does not support run_trial. Use async_run_trial instead." ) - def async_run_trial(self, trial: Storage.Trial) -> Callable[[], None]: + def deferred_run_trial(self, trial: Storage.Trial) -> Callable[[], None]: """ Set up and run a single trial asynchronously. 
From 4bbb534c06b97c28110cde797f30675f1a08525e Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Fri, 25 Apr 2025 17:26:31 -0500 Subject: [PATCH 06/38] switch from threads to processes --- .../schedulers/parallel_scheduler.py | 59 ++++++----- .../mlos_bench/schedulers/trial_runner.py | 99 ++++--------------- 2 files changed, 50 insertions(+), 108 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index 31ca139f57e..03da415b8d0 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -2,12 +2,11 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. # -"""A simple single-threaded synchronous optimization loop implementation.""" +"""A simple multi-threaded asynchronous optimization loop implementation.""" import asyncio import logging -from collections.abc import Callable -from concurrent.futures import Future, ThreadPoolExecutor +from concurrent.futures import Future, ProcessPoolExecutor from datetime import datetime from typing import Any @@ -27,7 +26,7 @@ class ParallelScheduler(Scheduler): def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) - self.pool = ThreadPoolExecutor(max_workers=len(self._trial_runners)) + self.pool = ProcessPoolExecutor(max_workers=len(self._trial_runners)) def start(self) -> None: """Start the optimization loop.""" @@ -47,11 +46,10 @@ def start(self) -> None: def teardown(self) -> None: """Stop the optimization loop.""" - super().teardown() - # Shutdown the thread pool and wait for all tasks to finish self.pool.shutdown(wait=True) self._run_callbacks() + super().teardown() def schedule_trial(self, tunables: TunableGroups) -> None: """Assign a trial to a trial runner.""" @@ -86,24 +84,25 @@ def _run_schedule(self, running: bool = False) -> None: for trial in scheduled_trials: trial.update(status=Status.READY, 
timestamp=datetime.now(UTC)) - task = self.pool.submit(self.deferred_run_trial, trial) - - # This is required to ensure that the callback happens on the main thread - asyncio.get_event_loop().call_soon_threadsafe(self._on_trial_finished, task) - - @staticmethod - def _on_trial_finished(result: Future[Callable[[], None]]) -> None: - """ - Callback to be called when a trial is finished. - - This must always be called from the main thread. Exceptions can also be handled - here - """ - try: - callback = result.result() - callback() - except Exception as exception: # pylint: disable=broad-except - _LOG.error("Trial failed: %s", exception) + self.deferred_run_trial(trial) + + def _on_trial_finished_closure(self, trial: Storage.Trial): + def _on_trial_finished(self: ParallelScheduler, result: Future) -> None: + """ + Callback to be called when a trial is finished. + + This must always be called from the main thread. Exceptions can also be handled + here + """ + try: + (status, timestamp, results, telemetry) = result.result() + self.get_trial_runner(trial)._finalize_run_trial( + trial, status, timestamp, results, telemetry + ) + except Exception as exception: # pylint: disable=broad-except + _LOG.error("Trial failed: %s", exception) + + return _on_trial_finished @staticmethod def _run_callbacks() -> None: @@ -130,7 +129,7 @@ def run_trial(self, trial: Storage.Trial) -> None: "ParallelScheduler does not support run_trial. Use async_run_trial instead." ) - def deferred_run_trial(self, trial: Storage.Trial) -> Callable[[], None]: + def deferred_run_trial(self, trial: Storage.Trial) -> None: """ Set up and run a single trial asynchronously. @@ -139,6 +138,12 @@ def deferred_run_trial(self, trial: Storage.Trial) -> Callable[[], None]: super().run_trial(trial) # In the sync scheduler we run each trial on its own TrialRunner in sequence. 
trial_runner = self.get_trial_runner(trial) - result = trial_runner.deferred_run_trial(trial, self.global_config) + trial_runner._prepare_run_trial(trial, self.global_config) + + task = self.pool.submit(trial_runner._execute_run_trial, trial_runner.environment) + # This is required to ensure that the callback happens on the main thread + asyncio.get_event_loop().call_soon_threadsafe( + self._on_trial_finished_closure(trial), self, task + ) + _LOG.info("QUEUE: Finished trial: %s on %s", trial, trial_runner) - return result diff --git a/mlos_bench/mlos_bench/schedulers/trial_runner.py b/mlos_bench/mlos_bench/schedulers/trial_runner.py index 4dc5a558fb6..ed54ef10aec 100644 --- a/mlos_bench/mlos_bench/schedulers/trial_runner.py +++ b/mlos_bench/mlos_bench/schedulers/trial_runner.py @@ -5,7 +5,6 @@ """Simple class to run an individual Trial on a given Environment.""" import logging -from collections.abc import Callable from datetime import datetime from types import TracebackType from typing import Any, Literal @@ -14,7 +13,6 @@ from mlos_bench.environments.base_environment import Environment from mlos_bench.environments.status import Status -from mlos_bench.event_loop_context import EventLoopContext from mlos_bench.services.base_service import Service from mlos_bench.services.config_persistence import ConfigPersistenceService from mlos_bench.services.local.local_exec import LocalExecService @@ -118,7 +116,6 @@ def __init__(self, trial_runner_id: int, env: Environment) -> None: assert self._env.parameters["trial_runner_id"] == self._trial_runner_id self._in_context = False self._is_running = False - self._event_loop_context = EventLoopContext() def __repr__(self) -> str: return ( @@ -165,27 +162,11 @@ def is_running(self) -> bool: """Get the running state of the current TrialRunner.""" return self._is_running - def _run_trial( + def _prepare_run_trial( self, trial: Storage.Trial, global_config: dict[str, Any] | None = None, - ) -> Callable[[], None]: - """ - Run a 
single trial on this TrialRunner's Environment and return a callback to - store the results in the backend Trial Storage. - - Parameters - ---------- - trial : Storage.Trial - A Storage class based Trial used to persist the experiment trial data. - global_config : dict - Global configuration parameters. - - Returns - ------- - callback : Callable[[], None] - Returns a callback to register the results with the storage backend. - """ + ): assert self._in_context assert not self._is_running @@ -197,49 +178,26 @@ def _run_trial( ) if not self.environment.setup(trial.tunables, trial.config(global_config)): - _LOG.warning("Setup failed: %s :: %s", self.environment, trial.tunables) - # FIXME: Use the actual timestamp from the environment. - _LOG.info("TrialRunner: Update trial results: %s :: %s", trial, Status.FAILED) - - def fail_callback() -> None: - """ - A callback to register the results with the storage backend. - - This must be called from the main thread. For a synchronous scheduler - this can just be called directly. For an asynchronous scheduler, this - will be passed to the main thread when the trial is finished. - """ - trial.update(Status.FAILED, datetime.now(UTC)) - - return fail_callback - - # TODO: start background status polling of the environments in the event loop. + trial.update(Status.FAILED, datetime.now(UTC)) + @staticmethod + def _execute_run_trial( + environment: Environment, + ): # Block and wait for the final result. - (status, timestamp, results) = self.environment.run() - _LOG.info("TrialRunner Results: %s :: %s\n%s", trial.tunables, status, results) + (status, timestamp, results) = environment.run() # In async mode (TODO), poll the environment for status and telemetry # and update the storage with the intermediate results. - (_status, _timestamp, telemetry) = self.environment.status() - - # Use the status and timestamp from `.run()` as it is the final status of the experiment. - # TODO: Use the `.status()` output in async mode. 
- def success_callback() -> None: - """ - A callback to register the results with the storage backend. - - This must be called from the main thread. For a synchronous scheduler this - can just be called directly. For an asynchronous scheduler, this will be - passed to the main thread when the trial is finished. - """ - trial.update_telemetry(status, timestamp, telemetry) - trial.update(status, timestamp, results) - _LOG.info("TrialRunner: Update trial results: %s :: %s %s", trial, status, results) + (_status, _timestamp, telemetry) = environment.status() - self._is_running = False + return (status, timestamp, results, telemetry) - return success_callback + def _finalize_run_trial(self, trial, status, timestamp, results, telemetry): + trial.update_telemetry(status, timestamp, telemetry) + trial.update(status, timestamp, results) + _LOG.info("TrialRunner: Update trial results: %s :: %s %s", trial, status, results) + self._is_running = False def run_trial( self, @@ -257,30 +215,9 @@ def run_trial( global_config : dict Global configuration parameters. """ - self._run_trial(trial, global_config)() - - def deferred_run_trial( - self, - trial: Storage.Trial, - global_config: dict[str, Any] | None = None, - ) -> Callable[[], None]: - """ - Run a single trial on this TrialRunner's Environment and return a callback to - store the results in the backend Trial Storage. - - Parameters - ---------- - trial : Storage.Trial - A Storage class based Trial used to persist the experiment trial data. - global_config : dict - Global configuration parameters. - - Returns - ------- - callback : Callable[[], None] - Returns a callback to register the results with the storage backend. 
- """ - return self._run_trial(trial, global_config) + self._prepare_run_trial(trial, global_config) + (status, timestamp, results, telemetry) = self._execute_run_trial(self._env) + self._finalize_run_trial(trial, status, timestamp, results, telemetry) def teardown(self) -> None: """ From 123ecd3b7331fe177c4c59ae441d6445a8570938 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 25 Apr 2025 22:26:54 +0000 Subject: [PATCH 07/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mlos_bench/mlos_bench/schedulers/parallel_scheduler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index 03da415b8d0..bc1b7372c2e 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -91,8 +91,8 @@ def _on_trial_finished(self: ParallelScheduler, result: Future) -> None: """ Callback to be called when a trial is finished. - This must always be called from the main thread. Exceptions can also be handled - here + This must always be called from the main thread. 
Exceptions can also be + handled here """ try: (status, timestamp, results, telemetry) = result.result() From 3357fcfb2bc262c90315fbe1fdb1172a85186c34 Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Fri, 25 Apr 2025 17:30:18 -0500 Subject: [PATCH 08/38] Update mlos_bench/mlos_bench/schedulers/parallel_scheduler.py Co-authored-by: Brian Kroth --- mlos_bench/mlos_bench/schedulers/parallel_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index bc1b7372c2e..4be62c32347 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -21,7 +21,7 @@ class ParallelScheduler(Scheduler): - """A simple multi-threaded asynchronous optimization loop implementation.""" + """A simple multi-process asynchronous optimization loop implementation.""" def __init__(self, *args: Any, **kwargs: Any) -> None: From 7a656296c9603f1923f2ce57dde4f6a46fe1fa24 Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Fri, 25 Apr 2025 17:33:24 -0500 Subject: [PATCH 09/38] Update mlos_bench/mlos_bench/storage/sql/experiment.py Co-authored-by: Brian Kroth --- mlos_bench/mlos_bench/storage/sql/experiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlos_bench/mlos_bench/storage/sql/experiment.py b/mlos_bench/mlos_bench/storage/sql/experiment.py index eb052f6d1fc..06826245e1a 100644 --- a/mlos_bench/mlos_bench/storage/sql/experiment.py +++ b/mlos_bench/mlos_bench/storage/sql/experiment.py @@ -236,7 +236,7 @@ def _get_key_val(conn: Connection, table: Table, field: str, **kwargs: Any) -> d ) def filter_trials_by_status( - self, timestamp: datetime, status: list[Status] + self, timestamp: datetime, status: list[Status], ) -> Iterator[Storage.Trial]: timestamp = utcify_timestamp(timestamp, origin="local") _LOG.info("Retrieve pending trials for: %s @ %s", 
self._experiment_id, timestamp) From 2c9ecb8f63ab98419aa53c715d6e3135a88d7872 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 25 Apr 2025 22:33:43 +0000 Subject: [PATCH 10/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mlos_bench/mlos_bench/storage/sql/experiment.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mlos_bench/mlos_bench/storage/sql/experiment.py b/mlos_bench/mlos_bench/storage/sql/experiment.py index 06826245e1a..0581dd95897 100644 --- a/mlos_bench/mlos_bench/storage/sql/experiment.py +++ b/mlos_bench/mlos_bench/storage/sql/experiment.py @@ -236,7 +236,9 @@ def _get_key_val(conn: Connection, table: Table, field: str, **kwargs: Any) -> d ) def filter_trials_by_status( - self, timestamp: datetime, status: list[Status], + self, + timestamp: datetime, + status: list[Status], ) -> Iterator[Storage.Trial]: timestamp = utcify_timestamp(timestamp, origin="local") _LOG.info("Retrieve pending trials for: %s @ %s", self._experiment_id, timestamp) From 40c7f83feb3ecdc13a27013df9154cbbf2b64126 Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Fri, 25 Apr 2025 17:35:04 -0500 Subject: [PATCH 11/38] updates for comments --- mlos_bench/mlos_bench/schedulers/parallel_scheduler.py | 4 ++-- mlos_bench/mlos_bench/storage/base_storage.py | 2 +- mlos_bench/mlos_bench/storage/sql/experiment.py | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index 03da415b8d0..bc1b7372c2e 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -91,8 +91,8 @@ def _on_trial_finished(self: ParallelScheduler, result: Future) -> None: """ Callback to be called when a trial is finished. 
- This must always be called from the main thread. Exceptions can also be handled - here + This must always be called from the main thread. Exceptions can also be + handled here """ try: (status, timestamp, results, telemetry) = result.result() diff --git a/mlos_bench/mlos_bench/storage/base_storage.py b/mlos_bench/mlos_bench/storage/base_storage.py index 8587f87b3b6..5e3400661ab 100644 --- a/mlos_bench/mlos_bench/storage/base_storage.py +++ b/mlos_bench/mlos_bench/storage/base_storage.py @@ -311,7 +311,7 @@ def load( def filter_trials_by_status( self, timestamp: datetime, - status: list[Status], + statuses: list[Status], ) -> Iterator["Storage.Trial"]: """ Return an iterator over the pending trials that are scheduled to run on or diff --git a/mlos_bench/mlos_bench/storage/sql/experiment.py b/mlos_bench/mlos_bench/storage/sql/experiment.py index eb052f6d1fc..3bfd4c258c7 100644 --- a/mlos_bench/mlos_bench/storage/sql/experiment.py +++ b/mlos_bench/mlos_bench/storage/sql/experiment.py @@ -236,7 +236,7 @@ def _get_key_val(conn: Connection, table: Table, field: str, **kwargs: Any) -> d ) def filter_trials_by_status( - self, timestamp: datetime, status: list[Status] + self, timestamp: datetime, statuses: list[Status] ) -> Iterator[Storage.Trial]: timestamp = utcify_timestamp(timestamp, origin="local") _LOG.info("Retrieve pending trials for: %s @ %s", self._experiment_id, timestamp) @@ -249,7 +249,7 @@ def filter_trials_by_status( | (self._schema.trial.c.ts_start <= timestamp) ), self._schema.trial.c.ts_end.is_(None), - self._schema.trial.c.status.in_([s.name for s in status]), + self._schema.trial.c.status.in_([s.name for s in statuses]), ) ) for trial in cur_trials.fetchall(): @@ -284,7 +284,7 @@ def pending_trials(self, timestamp: datetime, *, running: bool) -> Iterator[Stor pending_status = [Status.PENDING, Status.READY, Status.RUNNING] else: pending_status = [Status.PENDING] - return self.filter_trials_by_status(timestamp=timestamp, status=pending_status) + return 
self.filter_trials_by_status(timestamp=timestamp, statuses=pending_status) def _get_config_id(self, conn: Connection, tunables: TunableGroups) -> int: """ From ee9e7e0904270167a29c37af4c068c7c9fcf1ab1 Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Fri, 25 Apr 2025 18:29:29 -0500 Subject: [PATCH 12/38] fix linting errors --- .../schedulers/parallel_scheduler.py | 19 ++++-- .../mlos_bench/schedulers/trial_runner.py | 60 ++++++++++++++++--- mlos_bench/mlos_bench/storage/base_storage.py | 4 +- 3 files changed, 69 insertions(+), 14 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index 4be62c32347..e1b74ec7b74 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -8,12 +8,14 @@ import logging from concurrent.futures import Future, ProcessPoolExecutor from datetime import datetime +from collections.abc import Callable from typing import Any from pytz import UTC from mlos_bench.environments.status import Status from mlos_bench.schedulers.base_scheduler import Scheduler +from mlos_bench.schedulers.trial_runner import TrialRunner from mlos_bench.storage.base_storage import Storage from mlos_bench.tunables.tunable_groups import TunableGroups @@ -86,7 +88,16 @@ def _run_schedule(self, running: bool = False) -> None: trial.update(status=Status.READY, timestamp=datetime.now(UTC)) self.deferred_run_trial(trial) - def _on_trial_finished_closure(self, trial: Storage.Trial): + def _on_trial_finished_closure( + self, trial: Storage.Trial + ) -> Callable[["ParallelScheduler", Future], None]: + """Generate a closure to handle the callback for when a trial is finished. + + Parameters + ---------- + trial : Storage.Trial + The trial to finish. + """ def _on_trial_finished(self: ParallelScheduler, result: Future) -> None: """ Callback to be called when a trial is finished. 
@@ -96,7 +107,7 @@ def _on_trial_finished(self: ParallelScheduler, result: Future) -> None: """ try: (status, timestamp, results, telemetry) = result.result() - self.get_trial_runner(trial)._finalize_run_trial( + self.get_trial_runner(trial).finalize_run_trial( trial, status, timestamp, results, telemetry ) except Exception as exception: # pylint: disable=broad-except @@ -138,9 +149,9 @@ def deferred_run_trial(self, trial: Storage.Trial) -> None: super().run_trial(trial) # In the sync scheduler we run each trial on its own TrialRunner in sequence. trial_runner = self.get_trial_runner(trial) - trial_runner._prepare_run_trial(trial, self.global_config) + trial_runner.prepare_run_trial(trial, self.global_config) - task = self.pool.submit(trial_runner._execute_run_trial, trial_runner.environment) + task = self.pool.submit(TrialRunner.execute_run_trial, trial_runner.environment) # This is required to ensure that the callback happens on the main thread asyncio.get_event_loop().call_soon_threadsafe( self._on_trial_finished_closure(trial), self, task diff --git a/mlos_bench/mlos_bench/schedulers/trial_runner.py b/mlos_bench/mlos_bench/schedulers/trial_runner.py index ed54ef10aec..3ff8b2c195e 100644 --- a/mlos_bench/mlos_bench/schedulers/trial_runner.py +++ b/mlos_bench/mlos_bench/schedulers/trial_runner.py @@ -19,6 +19,7 @@ from mlos_bench.services.types import SupportsConfigLoading from mlos_bench.storage.base_storage import Storage from mlos_bench.tunables.tunable_groups import TunableGroups +from mlos_bench.tunables.tunable_types import TunableValue _LOG = logging.getLogger(__name__) @@ -162,11 +163,20 @@ def is_running(self) -> bool: """Get the running state of the current TrialRunner.""" return self._is_running - def _prepare_run_trial( + def prepare_run_trial( self, trial: Storage.Trial, global_config: dict[str, Any] | None = None, - ): + ) -> None: + """Prepare the trial runner for running a trial. 
+ + Parameters + ---------- + trial : Storage.Trial + The trial to prepare. + global_config : dict[str, Any] | None, optional + Global configuration parameters, by default None + """ assert self._in_context assert not self._is_running @@ -181,9 +191,21 @@ def _prepare_run_trial( trial.update(Status.FAILED, datetime.now(UTC)) @staticmethod - def _execute_run_trial( + def execute_run_trial( environment: Environment, - ): + ) -> tuple[Status, datetime, dict[str, TunableValue] | None, list[tuple[datetime, str, Any]]]: + """Execute the trial run on the environment. + + Parameters + ---------- + environment : Environment + The environment to run the trial on. + + Returns + ------- + tuple[Status, datetime, Optional[dict[str, TunableValue]], list[tuple[datetime, str, Any]]] + The fill results of the trial run, including status, timestamp, results, and telemetry. + """ # Block and wait for the final result. (status, timestamp, results) = environment.run() @@ -193,7 +215,29 @@ def _execute_run_trial( return (status, timestamp, results, telemetry) - def _finalize_run_trial(self, trial, status, timestamp, results, telemetry): + def finalize_run_trial( # pylint: disable=too-many-arguments, too-many-positional-arguments + self, + trial: Storage.Trial, + status: Status, + timestamp: datetime, + results: dict[str, TunableValue] | None, + telemetry: list[tuple[datetime, str, Any]], + ) -> None: + """Finalize the trial run in the storage backend. + + Parameters + ---------- + trial : Storage.Trial + The trial to finalize. + status : Status + The status of the trial. + timestamp : datetime + The timestamp of the trial execution. + results : Optional[dict[str, TunableValue]] + The results of the trial + telemetry : list[tuple[datetime, str, Any]] + The telemetry data of the trial. 
+ """ trial.update_telemetry(status, timestamp, telemetry) trial.update(status, timestamp, results) _LOG.info("TrialRunner: Update trial results: %s :: %s %s", trial, status, results) @@ -215,9 +259,9 @@ def run_trial( global_config : dict Global configuration parameters. """ - self._prepare_run_trial(trial, global_config) - (status, timestamp, results, telemetry) = self._execute_run_trial(self._env) - self._finalize_run_trial(trial, status, timestamp, results, telemetry) + self.prepare_run_trial(trial, global_config) + (status, timestamp, results, telemetry) = self.execute_run_trial(self._env) + self.finalize_run_trial(trial, status, timestamp, results, telemetry) def teardown(self) -> None: """ diff --git a/mlos_bench/mlos_bench/storage/base_storage.py b/mlos_bench/mlos_bench/storage/base_storage.py index 5e3400661ab..87d61a6723e 100644 --- a/mlos_bench/mlos_bench/storage/base_storage.py +++ b/mlos_bench/mlos_bench/storage/base_storage.py @@ -315,13 +315,13 @@ def filter_trials_by_status( ) -> Iterator["Storage.Trial"]: """ Return an iterator over the pending trials that are scheduled to run on or - before the specified timestamp matching one of status listed. + before the specified timestamp matching one of statuses listed. Parameters ---------- timestamp : datetime.datetime The time in UTC to check for scheduled trials. - status : list[Status] + statuses : list[Status] Status of the trials to filter in. 
Returns From d6199991a67f71e87428a0594e26da4a67fc218f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 25 Apr 2025 23:29:52 +0000 Subject: [PATCH 13/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mlos_bench/mlos_bench/schedulers/parallel_scheduler.py | 6 ++++-- mlos_bench/mlos_bench/schedulers/trial_runner.py | 9 ++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index e1b74ec7b74..2092d5f5bb8 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -6,9 +6,9 @@ import asyncio import logging +from collections.abc import Callable from concurrent.futures import Future, ProcessPoolExecutor from datetime import datetime -from collections.abc import Callable from typing import Any from pytz import UTC @@ -91,13 +91,15 @@ def _run_schedule(self, running: bool = False) -> None: def _on_trial_finished_closure( self, trial: Storage.Trial ) -> Callable[["ParallelScheduler", Future], None]: - """Generate a closure to handle the callback for when a trial is finished. + """ + Generate a closure to handle the callback for when a trial is finished. Parameters ---------- trial : Storage.Trial The trial to finish. """ + def _on_trial_finished(self: ParallelScheduler, result: Future) -> None: """ Callback to be called when a trial is finished. 
diff --git a/mlos_bench/mlos_bench/schedulers/trial_runner.py b/mlos_bench/mlos_bench/schedulers/trial_runner.py index 3ff8b2c195e..9bed1049f74 100644 --- a/mlos_bench/mlos_bench/schedulers/trial_runner.py +++ b/mlos_bench/mlos_bench/schedulers/trial_runner.py @@ -168,7 +168,8 @@ def prepare_run_trial( trial: Storage.Trial, global_config: dict[str, Any] | None = None, ) -> None: - """Prepare the trial runner for running a trial. + """ + Prepare the trial runner for running a trial. Parameters ---------- @@ -194,7 +195,8 @@ def prepare_run_trial( def execute_run_trial( environment: Environment, ) -> tuple[Status, datetime, dict[str, TunableValue] | None, list[tuple[datetime, str, Any]]]: - """Execute the trial run on the environment. + """ + Execute the trial run on the environment. Parameters ---------- @@ -223,7 +225,8 @@ def finalize_run_trial( # pylint: disable=too-many-arguments, too-many-position results: dict[str, TunableValue] | None, telemetry: list[tuple[datetime, str, Any]], ) -> None: - """Finalize the trial run in the storage backend. + """ + Finalize the trial run in the storage backend. Parameters ---------- From 3dfb7deade284616568525537771909ba9accabf Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Fri, 25 Apr 2025 21:48:31 -0500 Subject: [PATCH 14/38] try to fix the docs --- .../mlos_bench/schedulers/trial_runner.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/trial_runner.py b/mlos_bench/mlos_bench/schedulers/trial_runner.py index 3ff8b2c195e..dad132423d2 100644 --- a/mlos_bench/mlos_bench/schedulers/trial_runner.py +++ b/mlos_bench/mlos_bench/schedulers/trial_runner.py @@ -174,7 +174,7 @@ def prepare_run_trial( ---------- trial : Storage.Trial The trial to prepare. 
- global_config : dict[str, Any] | None, optional + global_config : dict[str, Any] | None Global configuration parameters, by default None """ assert self._in_context @@ -203,8 +203,13 @@ def execute_run_trial( Returns ------- - tuple[Status, datetime, Optional[dict[str, TunableValue]], list[tuple[datetime, str, Any]]] - The fill results of the trial run, including status, timestamp, results, and telemetry. + tuple[ + Status, + datetime.datetime, + dict[str, TunableValue] | None, + list[tuple[datetime.datetime, str, Any]] + ] + The full results of the trial run, including status, timestamp, results, and telemetry. """ # Block and wait for the final result. (status, timestamp, results) = environment.run() @@ -231,11 +236,11 @@ def finalize_run_trial( # pylint: disable=too-many-arguments, too-many-position The trial to finalize. status : Status The status of the trial. - timestamp : datetime + timestamp : datetime.datetime The timestamp of the trial execution. - results : Optional[dict[str, TunableValue]] + results : dict[str, TunableValue] | None, The results of the trial - telemetry : list[tuple[datetime, str, Any]] + telemetry : list[tuple[datetime.datetime, str, Any]] The telemetry data of the trial. """ trial.update_telemetry(status, timestamp, telemetry) From 0b36f640bd27466ff1eddc806098d50323d394b5 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Mon, 28 Apr 2025 21:38:49 +0000 Subject: [PATCH 15/38] Delay enter trial runner context until the trial is actually being run. This is part of an attempt to try and see if can work around issues with `multiprocessing.Pool` needing to pickle certain objects when forking. For instance, if the Environment is using an SshServer, we need to start an EventLoopContext in the background to handle the SSH connections and threads are not picklable. Nor are file handles, DB connections, etc., so there may be other things we also need to adjust to make this work. 
See Also #967 --- mlos_bench/mlos_bench/schedulers/base_scheduler.py | 11 +++++++---- mlos_bench/mlos_bench/schedulers/sync_scheduler.py | 5 +++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/base_scheduler.py b/mlos_bench/mlos_bench/schedulers/base_scheduler.py index eaa5527c6d6..b711388609d 100644 --- a/mlos_bench/mlos_bench/schedulers/base_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/base_scheduler.py @@ -200,8 +200,9 @@ def __enter__(self) -> "Scheduler": _LOG.debug("Scheduler START :: %s", self) assert self.experiment is None assert not self._in_context - for trial_runner in self._trial_runners.values(): - trial_runner.__enter__() + # NOTE: We delay entering the context of trial_runners until it's time + # to run the trial in order to avoid incompatibilities with + # multiprocessing.Pool. self._optimizer.__enter__() # Start new or resume the existing experiment. Verify that the # experiment configuration is compatible with the previous runs. @@ -235,7 +236,8 @@ def __exit__( self._experiment.__exit__(ex_type, ex_val, ex_tb) self._optimizer.__exit__(ex_type, ex_val, ex_tb) for trial_runner in self._trial_runners.values(): - trial_runner.__exit__(ex_type, ex_val, ex_tb) + # TrialRunners should have already exited their context after running the Trial. 
+ assert not trial_runner._in_context # pylint: disable=protected-access self._experiment = None self._in_context = False return False # Do not suppress exceptions @@ -267,7 +269,8 @@ def teardown(self) -> None: if self._do_teardown: for trial_runner in self._trial_runners.values(): assert not trial_runner.is_running - trial_runner.teardown() + with trial_runner: + trial_runner.teardown() def get_best_observation(self) -> tuple[dict[str, float] | None, TunableGroups | None]: """Get the best observation from the optimizer.""" diff --git a/mlos_bench/mlos_bench/schedulers/sync_scheduler.py b/mlos_bench/mlos_bench/schedulers/sync_scheduler.py index 4b864942dce..f450b28b8f1 100644 --- a/mlos_bench/mlos_bench/schedulers/sync_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/sync_scheduler.py @@ -39,5 +39,6 @@ def run_trial(self, trial: Storage.Trial) -> None: super().run_trial(trial) # In the sync scheduler we run each trial on its own TrialRunner in sequence. trial_runner = self.get_trial_runner(trial) - trial_runner.run_trial(trial, self.global_config) - _LOG.info("QUEUE: Finished trial: %s on %s", trial, trial_runner) + with trial_runner: + trial_runner.run_trial(trial, self.global_config) + _LOG.info("QUEUE: Finished trial: %s on %s", trial, trial_runner) From 5ea30c9336fabbe47d36229e5d931b56442c4686 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Tue, 29 Apr 2025 19:57:43 +0000 Subject: [PATCH 16/38] restore original trial runner --- .../mlos_bench/schedulers/trial_runner.py | 100 +++++------------- 1 file changed, 25 insertions(+), 75 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/trial_runner.py b/mlos_bench/mlos_bench/schedulers/trial_runner.py index 3c5e62e690a..80eb696bc6d 100644 --- a/mlos_bench/mlos_bench/schedulers/trial_runner.py +++ b/mlos_bench/mlos_bench/schedulers/trial_runner.py @@ -13,13 +13,13 @@ from mlos_bench.environments.base_environment import Environment from mlos_bench.environments.status import Status +from 
mlos_bench.event_loop_context import EventLoopContext from mlos_bench.services.base_service import Service from mlos_bench.services.config_persistence import ConfigPersistenceService from mlos_bench.services.local.local_exec import LocalExecService from mlos_bench.services.types import SupportsConfigLoading from mlos_bench.storage.base_storage import Storage from mlos_bench.tunables.tunable_groups import TunableGroups -from mlos_bench.tunables.tunable_types import TunableValue _LOG = logging.getLogger(__name__) @@ -117,6 +117,7 @@ def __init__(self, trial_runner_id: int, env: Environment) -> None: assert self._env.parameters["trial_runner_id"] == self._trial_runner_id self._in_context = False self._is_running = False + self._event_loop_context = EventLoopContext() def __repr__(self) -> str: return ( @@ -163,20 +164,26 @@ def is_running(self) -> bool: """Get the running state of the current TrialRunner.""" return self._is_running - def prepare_run_trial( + def run_trial( self, trial: Storage.Trial, global_config: dict[str, Any] | None = None, ) -> None: """ - Prepare the trial runner for running a trial. + Run a single trial on this TrialRunner's Environment and stores the results in + the backend Trial Storage. Parameters ---------- trial : Storage.Trial - The trial to prepare. - global_config : dict[str, Any] | None - Global configuration parameters, by default None + A Storage class based Trial used to persist the experiment trial data. + global_config : dict + Global configuration parameters. + + Returns + ------- + (trial_status, trial_score) : (Status, dict[str, float] | None) + Status and results of the trial. """ assert self._in_context @@ -189,87 +196,30 @@ def prepare_run_trial( ) if not self.environment.setup(trial.tunables, trial.config(global_config)): + _LOG.warning("Setup failed: %s :: %s", self.environment, trial.tunables) + # FIXME: Use the actual timestamp from the environment. 
+ _LOG.info("TrialRunner: Update trial results: %s :: %s", trial, Status.FAILED) trial.update(Status.FAILED, datetime.now(UTC)) + return - @staticmethod - def execute_run_trial( - environment: Environment, - ) -> tuple[Status, datetime, dict[str, TunableValue] | None, list[tuple[datetime, str, Any]]]: - """ - Execute the trial run on the environment. - - Parameters - ---------- - environment : Environment - The environment to run the trial on. + # TODO: start background status polling of the environments in the event loop. - Returns - ------- - tuple[ - Status, - datetime.datetime, - dict[str, TunableValue] | None, - list[tuple[datetime.datetime, str, Any]] - ] - The full results of the trial run, including status, timestamp, results, and telemetry. - """ # Block and wait for the final result. - (status, timestamp, results) = environment.run() + (status, timestamp, results) = self.environment.run() + _LOG.info("TrialRunner Results: %s :: %s\n%s", trial.tunables, status, results) # In async mode (TODO), poll the environment for status and telemetry # and update the storage with the intermediate results. - (_status, _timestamp, telemetry) = environment.status() - - return (status, timestamp, results, telemetry) - - def finalize_run_trial( # pylint: disable=too-many-arguments, too-many-positional-arguments - self, - trial: Storage.Trial, - status: Status, - timestamp: datetime, - results: dict[str, TunableValue] | None, - telemetry: list[tuple[datetime, str, Any]], - ) -> None: - """ - Finalize the trial run in the storage backend. + (_status, _timestamp, telemetry) = self.environment.status() - Parameters - ---------- - trial : Storage.Trial - The trial to finalize. - status : Status - The status of the trial. - timestamp : datetime.datetime - The timestamp of the trial execution. - results : dict[str, TunableValue] | None, - The results of the trial - telemetry : list[tuple[datetime.datetime, str, Any]] - The telemetry data of the trial. 
- """ + # Use the status and timestamp from `.run()` as it is the final status of the experiment. + # TODO: Use the `.status()` output in async mode. trial.update_telemetry(status, timestamp, telemetry) + trial.update(status, timestamp, results) _LOG.info("TrialRunner: Update trial results: %s :: %s %s", trial, status, results) - self._is_running = False - - def run_trial( - self, - trial: Storage.Trial, - global_config: dict[str, Any] | None = None, - ) -> None: - """ - Run a single trial on this TrialRunner's Environment and store the results in - the backend Trial Storage. - Parameters - ---------- - trial : Storage.Trial - A Storage class based Trial used to persist the experiment trial data. - global_config : dict - Global configuration parameters. - """ - self.prepare_run_trial(trial, global_config) - (status, timestamp, results, telemetry) = self.execute_run_trial(self._env) - self.finalize_run_trial(trial, status, timestamp, results, telemetry) + self._is_running = False def teardown(self) -> None: """ From 4f56ec40f0fa484af3c8975b68d61a4261e02a53 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Wed, 30 Apr 2025 22:01:21 +0000 Subject: [PATCH 17/38] remove scheduled state --- mlos_bench/mlos_bench/environments/status.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mlos_bench/mlos_bench/environments/status.py b/mlos_bench/mlos_bench/environments/status.py index 066b659f154..ca35b3473da 100644 --- a/mlos_bench/mlos_bench/environments/status.py +++ b/mlos_bench/mlos_bench/environments/status.py @@ -18,7 +18,6 @@ class Status(enum.Enum): CANCELED = 5 FAILED = 6 TIMED_OUT = 7 - SCHEDULED = 8 def is_good(self) -> bool: """Check if the status of the benchmark/environment is good.""" @@ -27,7 +26,6 @@ def is_good(self) -> bool: Status.READY, Status.RUNNING, Status.SUCCEEDED, - Status.SCHEDULED, } def is_completed(self) -> bool: @@ -76,9 +74,3 @@ def is_timed_out(self) -> bool: TIMED_OUT. 
""" return self == Status.FAILED - - def is_scheduled(self) -> bool: - """Check if the status of the benchmark/environment Trial or Experiment is - SCHEDULED. - """ - return self == Status.SCHEDULED From 04b926677a63d6e1bafd317ca3d3103df956315a Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Wed, 30 Apr 2025 22:02:31 +0000 Subject: [PATCH 18/38] adding some todo and refactoring notes and comments --- mlos_bench/mlos_bench/event_loop_context.py | 1 + .../mlos_bench/schedulers/base_scheduler.py | 13 + .../schedulers/parallel_scheduler.py | 270 +++++++++++------- .../mlos_bench/schedulers/trial_runner.py | 2 + 4 files changed, 190 insertions(+), 96 deletions(-) diff --git a/mlos_bench/mlos_bench/event_loop_context.py b/mlos_bench/mlos_bench/event_loop_context.py index d071c629912..4aaa74e64ee 100644 --- a/mlos_bench/mlos_bench/event_loop_context.py +++ b/mlos_bench/mlos_bench/event_loop_context.py @@ -39,6 +39,7 @@ class EventLoopContext: def __init__(self) -> None: self._event_loop: AbstractEventLoop | None = None self._event_loop_thread: Thread | None = None + # TODO: Check if we can fork the ThreadLock or need to delay using that. self._event_loop_thread_lock = ThreadLock() self._event_loop_thread_refcnt: int = 0 diff --git a/mlos_bench/mlos_bench/schedulers/base_scheduler.py b/mlos_bench/mlos_bench/schedulers/base_scheduler.py index b711388609d..e6e89223602 100644 --- a/mlos_bench/mlos_bench/schedulers/base_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/base_scheduler.py @@ -290,6 +290,7 @@ def load_tunable_config(self, config_id: int) -> TunableGroups: _LOG.debug("Config %d ::\n%s", config_id, json.dumps(tunable_values, indent=2)) return tunables.copy() + # FIXME: Maybe rename this to "add_new_optimizer_suggestions" (see below). def _schedule_new_optimizer_suggestions(self) -> bool: """ Optimizer part of the loop. 
@@ -311,6 +312,9 @@ def _schedule_new_optimizer_suggestions(self) -> bool: return not_done + # FIXME: This might be better called: `enqueue_trial` or + # `add_trial_to_queue` since `assign_trial_runners` is a separate method + # that actually assigns Trials to TrialRunners. def schedule_trial(self, tunables: TunableGroups) -> None: """Add a configuration to the queue of trials.""" # TODO: Alternative scheduling policies may prefer to expand repeats over @@ -378,6 +382,11 @@ def assign_trial_runners( trial.set_trial_runner(trial_runner) ... + Notes + ----- + Subclasses are *not* required to assign a TrialRunner to the Trial + (e.g., if the Trial should be deferred to a later time). + Parameters ---------- trials : Iterable[Storage.Trial] @@ -425,7 +434,11 @@ def get_trial_runner(self, trial: Storage.Trial) -> TrialRunner: ------- TrialRunner """ + # FIXME: May need to improve handling here in the case of + # assign_trial_runners doesn't assign a TrialRunner to a particular + # Trial for some reason. if trial.trial_runner_id is None: + # TODO: Maybe we can force it here? 
self.assign_trial_runners([trial]) assert trial.trial_runner_id is not None trial_runner = self._trial_runners.get(trial.trial_runner_id) diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index 2092d5f5bb8..652a7f479e4 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -41,122 +41,200 @@ def start(self) -> None: not_done: bool = True while not_done: _LOG.info("Optimization loop: Last trial ID: %d", self._last_trial_id) - self._run_callbacks() self._run_schedule(is_warm_up) not_done = self._schedule_new_optimizer_suggestions() + pending_trials = self.experiment.pending_trials(datetime.now(UTC), running=False) + self.assign_trial_runners(pending_trials) is_warm_up = False - def teardown(self) -> None: - """Stop the optimization loop.""" - # Shutdown the thread pool and wait for all tasks to finish - self.pool.shutdown(wait=True) - self._run_callbacks() - super().teardown() - - def schedule_trial(self, tunables: TunableGroups) -> None: - """Assign a trial to a trial runner.""" - assert self.experiment is not None - - super().schedule_trial(tunables) - - pending_trials: list[Storage.Trial] = list( - self.experiment.pending_trials(datetime.now(UTC), running=False) - ) - - idle_runner_ids = [ - id for id, runner in self.trial_runners.items() if not runner.is_running - ] - - # Assign pending trials to idle runners - for trial, runner_id in zip(pending_trials, idle_runner_ids): - trial.update(status=Status.SCHEDULED, timestamp=datetime.now(UTC)) - trial.set_trial_runner(runner_id) + def _teardown_trial_runner( + self, + trial_runner_id: int, + ) -> TrialRunnerResult: + """Tear down a specific TrialRunner in a Pool worker.""" + assert self._pool is None, "This should only be called in a Pool worker." 
+ trial_runner = self._trial_runners[trial_runner_id] + with trial_runner: + return TrialRunnerResult( + trial_runner_id=trial_runner_id, + result=trial_runner.teardown(), + ) + + def _teardown_trial_runner_finished_callback( + self, + result: TrialRunnerResult, + ) -> None: + """Callback to be called when a TrialRunner is finished with teardown.""" + trial_runner_id = result.trial_runner_id + assert trial_runner_id in self._trial_runners_status + assert self._trial_runners_status[trial_runner_id] is not None + self._trial_runners_status[result.trial_runner_id] = None + + def _teardown_trial_runner_failed_closure( + self, + trial_runner_id: int, + ) -> Callable[[Any], None]: + # pylint: disable=no-self-use + """Callback to be called when a TrialRunner failed running teardown.""" + + def _teardown_trial_runner_failed(obj: Any) -> None: + """Callback to be called when a TrialRunner failed running teardown.""" + # TODO: improve error handling here + _LOG.error("TrialRunner %d failed to run teardown: %s", trial_runner_id, obj) + raise RuntimeError(f"TrialRunner {trial_runner_id} failed to run teardown: {obj}") + + return _teardown_trial_runner_failed - def _run_schedule(self, running: bool = False) -> None: + def run_trial(self, trial: Storage.Trial) -> None: """ - Scheduler part of the loop. + Set up and run a single Trial on a TrialRunner in a child process in the pool. - Check for pending trials in the queue and run them. + Save the results in the storage. """ - assert self.experiment is not None - - scheduled_trials: list[Storage.Trial] = list( - self.experiment.filter_trials_by_status(datetime.now(UTC), [Status.SCHEDULED]) - ) - - for trial in scheduled_trials: - trial.update(status=Status.READY, timestamp=datetime.now(UTC)) - self.deferred_run_trial(trial) - - def _on_trial_finished_closure( - self, trial: Storage.Trial - ) -> Callable[["ParallelScheduler", Future], None]: + assert self._pool is None, "This should only be called in a Pool worker." 
+ super().run_trial(trial) + # In the sync scheduler we run each trial on its own TrialRunner in sequence. + trial_runner = self.get_trial_runner(trial) + with trial_runner: + trial_runner.run_trial(trial, self.global_config) + _LOG.info("QUEUE: Finished trial: %s on %s", trial, trial_runner) + + def run_trial_on_trial_runner( + self, + trial_runner: TrialRunner, + storage: Storage, + experiment_id: str, + trial_id: int, + ): + """Run a single trial on a TrialRunner in a child process in the pool. + + Save the results in the storage. """ - Generate a closure to handle the callback for when a trial is finished. - Parameters - ---------- - trial : Storage.Trial - The trial to finish. - """ + def _run_schedule(self, running: bool = False) -> None: + assert self._pool is not None + + pending_trials = self.experiment.pending_trials(datetime.now(UTC), running=running) + scheduled_trials = [ + pending_trial + for pending_trial in pending_trials + if pending_trial.trial_runner_id is not None and pending_trial.trial_runner_id >= 0 + ] - def _on_trial_finished(self: ParallelScheduler, result: Future) -> None: - """ - Callback to be called when a trial is finished. - - This must always be called from the main thread. Exceptions can also be - handled here - """ - try: - (status, timestamp, results, telemetry) = result.result() - self.get_trial_runner(trial).finalize_run_trial( - trial, status, timestamp, results, telemetry + for trial in scheduled_trials: + trial_runner_id = trial.trial_runner_id + assert trial_runner_id is not None + trial_runner_status = self._trial_runners_status[trial_runner_id] + if trial_runner_status is not None: + _LOG.warning( + "Cannot start Trial %d - its assigned TrialRunner %d is already running: %s", + trial.trial_id, + trial_runner_id, + trial_runner_status, ) - except Exception as exception: # pylint: disable=broad-except - _LOG.error("Trial failed: %s", exception) + continue + + # Update our trial bookkeeping. 
+ super().run_trial(trial) + # Run the trial in the child process targeting a particular runner. + # TODO: + + # Wait for all trial runners to finish. + while self._has_running_trial_runners(): + sleep(self._polling_interval) + assert self._get_idle_trial_runners_count() == len(self._trial_runners) + + def _teardown_trial_runner( + self, + trial_runner_id: int, + ) -> TrialRunnerResult: + """Tear down a specific TrialRunner in a Pool worker.""" + assert self._pool is None, "This should only be called in a Pool worker." + trial_runner = self._trial_runners[trial_runner_id] + with trial_runner: + return TrialRunnerResult( + trial_runner_id=trial_runner_id, + result=trial_runner.teardown(), + ) + + def _teardown_trial_runner_finished_callback( + self, + result: TrialRunnerResult, + ) -> None: + """Callback to be called when a TrialRunner is finished with teardown.""" + trial_runner_id = result.trial_runner_id + assert trial_runner_id in self._trial_runners_status + assert self._trial_runners_status[trial_runner_id] is not None + self._trial_runners_status[result.trial_runner_id] = None + + def _teardown_trial_runner_failed_closure( + self, + trial_runner_id: int, + ) -> Callable[[Any], None]: + # pylint: disable=no-self-use + """Callback to be called when a TrialRunner failed running teardown.""" + + def _teardown_trial_runner_failed(obj: Any) -> None: + """Callback to be called when a TrialRunner failed running teardown.""" + # TODO: improve error handling here + _LOG.error("TrialRunner %d failed to run teardown: %s", trial_runner_id, obj) + raise RuntimeError(f"TrialRunner {trial_runner_id} failed to run teardown: {obj}") + + return _teardown_trial_runner_failed - return _on_trial_finished + def teardown(self) -> None: + assert self._pool is not None + if self._do_teardown: + # Call teardown on each TrialRunner in the pool in parallel. 
+ for trial_runner_id in self._trial_runners: + assert ( + self._trial_runners_status[trial_runner_id] is None + ), f"TrialRunner {trial_runner_id} is still active." + self._trial_runners_status[trial_runner_id] = self._pool.apply_async( + # Call the teardown function in the child process targeting + # a particular trial_runner_id. + self._teardown_trial_runner, + args=(trial_runner_id,), + callback=self._teardown_trial_runner_finished_callback, + error_callback=self._teardown_trial_runner_failed_closure(trial_runner_id), + ) - @staticmethod - def _run_callbacks() -> None: - """Run all pending callbacks in the main thread.""" - loop = asyncio.get_event_loop() - pending = asyncio.all_tasks(loop) - loop.run_until_complete(asyncio.gather(*pending)) + # Wait for all trial runners to finish. + while self._has_running_trial_runners(): + sleep(self._polling_interval) + assert self._get_idle_trial_runners_count() == len(self._trial_runners) - def run_trial(self, trial: Storage.Trial) -> None: + def assign_trial_runners(self, trials: Iterable[Storage.Trial]) -> None: """ - Parallel Scheduler does not support run_trial. Use async_run_trial instead. + Assign Trials to the first available and idle TrialRunner. Parameters ---------- - trial : Storage.Trial - The trial to run. - - Raises - ------ - NotImplementedError - Error to indicate that this method is not supported in ParallelScheduler. - """ - raise NotImplementedError( - "ParallelScheduler does not support run_trial. Use async_run_trial instead." - ) - - def deferred_run_trial(self, trial: Storage.Trial) -> None: - """ - Set up and run a single trial asynchronously. - - Returns a callback to save the results in the storage. + trials : Iterable[Storage.Trial] """ - super().run_trial(trial) - # In the sync scheduler we run each trial on its own TrialRunner in sequence. 
- trial_runner = self.get_trial_runner(trial) - trial_runner.prepare_run_trial(trial, self.global_config) + assert self._in_context + assert self.experiment is not None - task = self.pool.submit(TrialRunner.execute_run_trial, trial_runner.environment) - # This is required to ensure that the callback happens on the main thread - asyncio.get_event_loop().call_soon_threadsafe( - self._on_trial_finished_closure(trial), self, task + pending_trials: list[Storage.Trial] = list( + trial + for trial in trials + if trial.status.is_pending() and trial.trial_runner_id is None ) - _LOG.info("QUEUE: Finished trial: %s on %s", trial, trial_runner) + idle_runner_ids = [ + trial_runner_id + for trial_runner_id, status in self._trial_runners_status.items() + if status is None + ] + + # Assign pending trials to idle runners + for trial, runner_id in zip(pending_trials, idle_runner_ids): + # FIXME: This results in two separate non-transactional updates. + # Should either set Status=SCHEDULED when we set_trial_runner + # or remove SCHEDULED as a Status altogether and filter by + # "Status=PENDING AND trial_runner_id != NULL" + # Or ... even better, we could use a single transaction to update + # the status and trial_runner_id of all trials in the same batch at once. + trial.set_trial_runner(runner_id) + # Moreover this doesn't even update the status of the Trial - it only updates the telemetry. 
+ trial.update(status=Status.SCHEDULED, timestamp=datetime.now(UTC)) diff --git a/mlos_bench/mlos_bench/schedulers/trial_runner.py b/mlos_bench/mlos_bench/schedulers/trial_runner.py index 80eb696bc6d..cb1dffc8665 100644 --- a/mlos_bench/mlos_bench/schedulers/trial_runner.py +++ b/mlos_bench/mlos_bench/schedulers/trial_runner.py @@ -117,6 +117,8 @@ def __init__(self, trial_runner_id: int, env: Environment) -> None: assert self._env.parameters["trial_runner_id"] == self._trial_runner_id self._in_context = False self._is_running = False + # TODO: Check and see if we need to delay creating the event loop + # context until context entry. self._event_loop_context = EventLoopContext() def __repr__(self) -> str: From d5f59af42b80a4f7c0075033dbd4616e3866bea1 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Wed, 30 Apr 2025 22:03:39 +0000 Subject: [PATCH 19/38] prepping pickability of Storage --- .../mlos_bench/schedulers/base_scheduler.py | 3 +- mlos_bench/mlos_bench/storage/base_storage.py | 49 +++++++++++++-- .../mlos_bench/storage/sql/experiment.py | 40 +++++++++++++ mlos_bench/mlos_bench/storage/sql/storage.py | 60 +++++++++++++++++-- 4 files changed, 141 insertions(+), 11 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/base_scheduler.py b/mlos_bench/mlos_bench/schedulers/base_scheduler.py index e6e89223602..a8515555fd3 100644 --- a/mlos_bench/mlos_bench/schedulers/base_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/base_scheduler.py @@ -263,7 +263,8 @@ def teardown(self) -> None: """ Tear down the TrialRunners/Environment(s). - Call it after the completion of the `.start()` in the scheduler context. + Call it after the completion of the :py:meth:`Scheduler.start` in the + Scheduler context. 
""" assert self.experiment is not None if self._do_teardown: diff --git a/mlos_bench/mlos_bench/storage/base_storage.py b/mlos_bench/mlos_bench/storage/base_storage.py index 87d61a6723e..00dd3d68477 100644 --- a/mlos_bench/mlos_bench/storage/base_storage.py +++ b/mlos_bench/mlos_bench/storage/base_storage.py @@ -22,6 +22,8 @@ Base interface for accessing the stored benchmark trial data. """ +from __future__ import annotations + import logging from abc import ABCMeta, abstractmethod from collections.abc import Iterator, Mapping @@ -31,7 +33,6 @@ from typing import Any, Literal from mlos_bench.config.schemas import ConfigSchema -from mlos_bench.dict_templater import DictTemplater from mlos_bench.environments.status import Status from mlos_bench.services.base_service import Service from mlos_bench.storage.base_experiment_data import ExperimentData @@ -94,6 +95,25 @@ def experiments(self) -> dict[str, ExperimentData]: A dictionary of the experiments' data, keyed by experiment id. """ + @abstractmethod + def get_experiment_by_id( + self, + experiment_id: str, + ) -> Storage.Experiment | None: + """ + Gets an Experiment by its ID. + + Parameters + ---------- + experiment_id : str + ID of the Experiment to retrieve. + + Returns + ------- + experiment : Storage.Experiment | None + The Experiment object, or None if it doesn't exist. + """ + @abstractmethod def experiment( # pylint: disable=too-many-arguments self, @@ -307,12 +327,31 @@ def load( Trial ids, Tunable values, benchmark scores, and status of the trials. """ + @abstractmethod + def get_trial_by_id( + self, + trial_id: int, + ) -> Storage.Trial | None: + """ + Gets a Trial by its ID. + + Parameters + ---------- + trial_id : int + ID of the Trial to retrieve for this Experiment. + + Returns + ------- + trial : Storage.Trial | None + The Trial object, or None if it doesn't exist. 
+ """ + @abstractmethod def filter_trials_by_status( self, timestamp: datetime, statuses: list[Status], - ) -> Iterator["Storage.Trial"]: + ) -> Iterator[Storage.Trial]: """ Return an iterator over the pending trials that are scheduled to run on or before the specified timestamp matching one of statuses listed. @@ -336,7 +375,7 @@ def pending_trials( timestamp: datetime, *, running: bool, - ) -> Iterator["Storage.Trial"]: + ) -> Iterator[Storage.Trial]: """ Return an iterator over the pending trials that are scheduled to run on or before the specified timestamp. @@ -360,7 +399,7 @@ def new_trial( tunables: TunableGroups, ts_start: datetime | None = None, config: dict[str, Any] | None = None, - ) -> "Storage.Trial": + ) -> Storage.Trial: """ Create a new experiment run in the storage. @@ -397,7 +436,7 @@ def _new_trial( tunables: TunableGroups, ts_start: datetime | None = None, config: dict[str, Any] | None = None, - ) -> "Storage.Trial": + ) -> Storage.Trial: """ Create a new experiment run in the storage. diff --git a/mlos_bench/mlos_bench/storage/sql/experiment.py b/mlos_bench/mlos_bench/storage/sql/experiment.py index 97581b949cb..3a09c0edae8 100644 --- a/mlos_bench/mlos_bench/storage/sql/experiment.py +++ b/mlos_bench/mlos_bench/storage/sql/experiment.py @@ -235,6 +235,46 @@ def _get_key_val(conn: Connection, table: Table, field: str, **kwargs: Any) -> d row._tuple() for row in cur_result.fetchall() # pylint: disable=protected-access ) + # TODO: Needs tests. 
+ def get_trial_by_id( + self, + trial_id: int, + ) -> Storage.Trial | None: + with self._engine.connect() as conn: + trial = conn.execute( + self._schema.trial.select().where( + self._schema.trial.c.exp_id == self._experiment_id, + self._schema.trial.c.trial_id == trial_id, + ) + ).fetchone() + if trial is None: + return None + tunables = self._get_key_val( + conn, + self._schema.config_param, + "param", + config_id=trial.config_id, + ) + config = self._get_key_val( + conn, + self._schema.trial_param, + "param", + exp_id=self._experiment_id, + trial_id=trial_id, + ) + return Trial( + engine=self._engine, + schema=self._schema, + # Reset .is_updated flag after the assignment: + tunables=self._tunables.copy().assign(tunables).reset(), + experiment_id=self._experiment_id, + trial_id=trial_id, + config_id=trial.config_id, + trial_runner_id=trial.trial_runner_id, + opt_targets=self._opt_targets, + config=config, + ) + def filter_trials_by_status( self, timestamp: datetime, diff --git a/mlos_bench/mlos_bench/storage/sql/storage.py b/mlos_bench/mlos_bench/storage/sql/storage.py index b3bf63d0edb..a37a8f7ea0a 100644 --- a/mlos_bench/mlos_bench/storage/sql/storage.py +++ b/mlos_bench/mlos_bench/storage/sql/storage.py @@ -7,7 +7,7 @@ import logging from typing import Literal -from sqlalchemy import URL, create_engine +from sqlalchemy import Engine, URL, create_engine from mlos_bench.services.base_service import Service from mlos_bench.storage.base_experiment_data import ExperimentData @@ -32,21 +32,44 @@ def __init__( service: Service | None = None, ): super().__init__(config, global_config, service) - lazy_schema_create = self._config.pop("lazy_schema_create", False) + self._lazy_schema_create = self._config.pop("lazy_schema_create", False) self._log_sql = self._config.pop("log_sql", False) self._url = URL.create(**self._config) self._repr = f"{self._url.get_backend_name()}:{self._url.database}" + self._engine: Engine + self._db_schema: DbSchema + self._schema_created = 
False
+        self._schema_updated = False
+        self._init_engine()
+
+    def _init_engine(self) -> None:
+        """Initialize the SQLAlchemy engine."""
+        # NOTE: Also called from __setstate__ to recreate the engine after unpickling.
         _LOG.info("Connect to the database: %s", self)
         self._engine = create_engine(self._url, echo=self._log_sql)
         self._db_schema = DbSchema(self._engine)
-        self._schema_created = False
-        self._schema_updated = False
-        if not lazy_schema_create:
+        if not self._lazy_schema_create:
             assert self._schema
             self.update_schema()
         else:
             _LOG.info("Using lazy schema create for database: %s", self)
 
+    # Make the object picklable.
+
+    def __getstate__(self) -> dict:
+        """Return the state of the object for pickling."""
+        state = self.__dict__.copy()
+        # Don't pickle the engine, as it cannot be pickled.
+        state.pop("_engine", None)
+        state.pop("_db_schema", None)
+        return state
+
+    def __setstate__(self, state: dict) -> None:
+        """Restore the state of the object from pickling."""
+        self.__dict__.update(state)
+        # Recreate the engine and schema.
+        self._init_engine()
+
     @property
     def _schema(self) -> DbSchema:
         """Lazily create schema upon first access."""
@@ -66,6 +89,33 @@ def update_schema(self) -> None:
     def __repr__(self) -> str:
         return self._repr
 
+    # TODO: Finish implementing me (see TODOs below):
+    # TODO: Needs tests.
+    def get_experiment_by_id(self, experiment_id):
+        with self._engine.connect() as conn:
+            # TODO: need to join with the trial table to get the max trial_id,
+            # objectives to get the opt_targets, and tunable_params to get the
+            # tunables.
+ cur_exp = conn.execute( + self._schema.experiment.select().where( + self._schema.experiment.c.exp_id == experiment_id, + ) + ) + exp = cur_exp.fetchone() + if exp is None: + return None + return Experiment( + engine=self._engine, + schema=self._schema, + experiment_id=exp.exp_id, + trial_id=-1, + tunables=TunableGroups(), + description=exp.description, + root_env_config=exp.root_env_config, + git_repo=exp.git_repo, + git_commit=exp.git_commit, + ) + def experiment( # pylint: disable=too-many-arguments self, *, From 3cf25c9ef9d6e35cb2111fb47a822263d7d5a00c Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Wed, 30 Apr 2025 22:06:37 +0000 Subject: [PATCH 20/38] start some other consolidation of other functions --- mlos_bench/mlos_bench/schedulers/base_scheduler.py | 1 + mlos_bench/mlos_bench/schedulers/sync_scheduler.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/mlos_bench/mlos_bench/schedulers/base_scheduler.py b/mlos_bench/mlos_bench/schedulers/base_scheduler.py index a8515555fd3..61e142939c9 100644 --- a/mlos_bench/mlos_bench/schedulers/base_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/base_scheduler.py @@ -460,6 +460,7 @@ def _run_schedule(self, running: bool = False) -> None: assert self.experiment is not None # Make sure that any pending trials have a TrialRunner assigned. pending_trials = list(self.experiment.pending_trials(datetime.now(UTC), running=running)) + # TODO: Move this to main loop. 
self.assign_trial_runners(pending_trials) for trial in pending_trials: self.run_trial(trial) diff --git a/mlos_bench/mlos_bench/schedulers/sync_scheduler.py b/mlos_bench/mlos_bench/schedulers/sync_scheduler.py index f450b28b8f1..fd40898ab6d 100644 --- a/mlos_bench/mlos_bench/schedulers/sync_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/sync_scheduler.py @@ -15,9 +15,12 @@ class SyncScheduler(Scheduler): """A simple single-threaded synchronous optimization loop implementation.""" + # TODO: This is now general enough we could use it in the parallel scheduler too. + # Maybe move to the base class? def start(self) -> None: """Start the optimization loop.""" super().start() + assert self.experiment is not None is_warm_up = self.optimizer.supports_preload if not is_warm_up: @@ -28,6 +31,8 @@ def start(self) -> None: _LOG.info("Optimization loop: Last trial ID: %d", self._last_trial_id) self._run_schedule(is_warm_up) not_done = self._schedule_new_optimizer_suggestions() + pending_trial = self.experiment.pending_trials(datetime.now(UTC), running=False) + self.assign_trial_runners(pending_trial) is_warm_up = False def run_trial(self, trial: Storage.Trial) -> None: From 3a37e9bc216d07c3d9faa343388e2ed6f1a04bbc Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Wed, 30 Apr 2025 22:07:05 +0000 Subject: [PATCH 21/38] start consolidating some things toward the base class --- .../mlos_bench/schedulers/base_scheduler.py | 3 + .../schedulers/parallel_scheduler.py | 146 ++++++++++++++++-- 2 files changed, 139 insertions(+), 10 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/base_scheduler.py b/mlos_bench/mlos_bench/schedulers/base_scheduler.py index 61e142939c9..61b59a987d3 100644 --- a/mlos_bench/mlos_bench/schedulers/base_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/base_scheduler.py @@ -463,6 +463,9 @@ def _run_schedule(self, running: bool = False) -> None: # TODO: Move this to main loop. 
self.assign_trial_runners(pending_trials) for trial in pending_trials: + if trial.trial_runner_id is None: + logging.warning("Trial %s has no TrialRunner assigned yet.") + continue self.run_trial(trial) def not_done(self) -> bool: diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index 652a7f479e4..d1831db8e06 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -2,37 +2,163 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. # -"""A simple multi-threaded asynchronous optimization loop implementation.""" +""" +A simple multi-process asynchronous optimization loop implementation. + +TODO: Add more details about the design and constraints and gotchas here. + +Examples +-------- +TODO: Add config examples here. +""" -import asyncio import logging -from collections.abc import Callable -from concurrent.futures import Future, ProcessPoolExecutor +from collections.abc import Callable, Iterable +from multiprocessing import current_process as mp_proccess_name +from multiprocessing.pool import AsyncResult, Pool from datetime import datetime from typing import Any +from time import sleep +from attr import dataclass from pytz import UTC -from mlos_bench.environments.status import Status from mlos_bench.schedulers.base_scheduler import Scheduler from mlos_bench.schedulers.trial_runner import TrialRunner from mlos_bench.storage.base_storage import Storage -from mlos_bench.tunables.tunable_groups import TunableGroups +from mlos_bench.optimizers.base_optimizer import Optimizer +from mlos_bench.tunables.tunable_types import TunableValue _LOG = logging.getLogger(__name__) +def _is_child_process() -> bool: + """Check if the current process is a child process.""" + return mp_proccess_name() != "MainProcess" + + +@dataclass +class TrialRunnerResult: + """A simple data class to hold the :py:class:`AsyncResult` 
of a
+    :py:class:`TrialRunner` operation."""
+
+    trial_runner_id: int
+    result: dict[str, TunableValue] | None
+    trial_id: int | None = None
+
+
 class ParallelScheduler(Scheduler):
-    """A simple multi-process asynchronous optimization loop implementation."""
+    """A simple multi-process asynchronous optimization loop implementation.
+
+    See :py:mod:`mlos_bench.schedulers.parallel_scheduler` for more usage details.
+
+    Notes
+    -----
+    This schedule uses :ext:py:class:`multiprocessing.Pool` to run trials in parallel.
+
+    To avoid issues with Python's forking implementation, which relies on pickling
+    objects from the main process and sending them to the child process, we need
+    to avoid incompatible objects, which includes any additional threads (e.g.,
+    :py:mod:`asyncio` tasks such as
+    :py:class:`mlos_bench.event_loop_context.EventLoopContext`), database
+    connections (e.g., :py:mod:`mlos_bench.storage`), and file handles (e.g.,
+    :ext:py:mod:`logging`) that are pickle incompatible.
+
+    To accomplish this, we avoid entering the :py:class:`~.TrialRunner` context
+    until we are in the child process and allow each child to manage its own
+    incompatible resources via that context.
+
+    Hence, each child process in the pool actually starts in special
+    handler functions in the :py:class:`~.ParallelScheduler` class that
+    receive as inputs all the necessary (and picklable) info as arguments, then
+    enter the given :py:class:`~.TrialRunner` instance context and invoke that
+    procedure.
+ """ + + def __init__( # pylint: disable=too-many-arguments + self, + *, + config: dict[str, Any], + global_config: dict[str, Any], + trial_runners: Iterable[TrialRunner], + optimizer: Optimizer, + storage: Storage, + root_env_config: str, + ): + super().__init__( + config=config, + global_config=global_config, + trial_runners=trial_runners, + optimizer=optimizer, + storage=storage, + root_env_config=root_env_config, + ) + + self._polling_interval: float = config.get("polling_interval", 1.0) + + # TODO: Setup logging for the child processes via a logging queue. + + self._pool: Pool | None = None + """Parallel pool to run Trials in separate TrialRunner processes. - def __init__(self, *args: Any, **kwargs: Any) -> None: + Only initiated on context __enter__. + """ + + self._trial_runners_status: dict[int, AsyncResult[TrialRunnerResult] | None] = { + trial_runner.trial_runner_id: None for trial_runner in self._trial_runners.values() + } + """A dict to keep track of the status of each TrialRunner. + + Since TrialRunners enter their running context within each pool task, we + can't check :py:meth:`.TrialRunner.is_running` within the parent + generally. + + Instead, we use a dict to keep track of the status of each TrialRunner + as either None (idle) or AsyncResult (running). + + This also helps us to gather AsyncResults from each worker. 
+ """ + + def _get_idle_trial_runners_count(self) -> int: + return len( + [ + trial_runner_status + for trial_runner_status in self._trial_runners_status.values() + if trial_runner_status is None + ] + ) + + def _has_running_trial_runners(self) -> bool: + return any( + True + for trial_runner_status in self._trial_runners_status.values() + if trial_runner_status is not None + ) - super().__init__(*args, **kwargs) - self.pool = ProcessPoolExecutor(max_workers=len(self._trial_runners)) + def __enter__(self): + assert self._pool is None + self._pool = Pool(processes=len(self.trial_runners), maxtasksperchild=1) + self._pool.__enter__() + # Delay context entry in the parent process + return super().__enter__() + def __exit__(self, ex_type, ex_val, ex_tb): + assert self._pool is not None + # Shutdown the process pool and wait for all tasks to finish + # (everything should be done by now) + assert self._has_running_trial_runners() is False + assert self._get_idle_trial_runners_count() == len(self._trial_runners) + self._pool.close() + self._pool.join() + self._pool.__exit__(ex_type, ex_val, ex_tb) + self._pool = None + return super().__exit__(ex_type, ex_val, ex_tb) + + # TODO: Consolidate to base_scheduler? 
def start(self) -> None: """Start the optimization loop.""" super().start() + assert self.experiment is not None is_warm_up: bool = self.optimizer.supports_preload if not is_warm_up: From f7fcecf19227d3524cdde6867028b5eacaad681a Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Wed, 30 Apr 2025 22:07:18 +0000 Subject: [PATCH 22/38] support returning data from run_trial --- mlos_bench/mlos_bench/schedulers/sync_scheduler.py | 3 +++ mlos_bench/mlos_bench/schedulers/trial_runner.py | 12 ++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/sync_scheduler.py b/mlos_bench/mlos_bench/schedulers/sync_scheduler.py index fd40898ab6d..10aecaa8b34 100644 --- a/mlos_bench/mlos_bench/schedulers/sync_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/sync_scheduler.py @@ -5,6 +5,9 @@ """A simple single-threaded synchronous optimization loop implementation.""" import logging +from datetime import datetime + +from pytz import UTC from mlos_bench.schedulers.base_scheduler import Scheduler from mlos_bench.storage.base_storage import Storage diff --git a/mlos_bench/mlos_bench/schedulers/trial_runner.py b/mlos_bench/mlos_bench/schedulers/trial_runner.py index cb1dffc8665..2f5ccdfda14 100644 --- a/mlos_bench/mlos_bench/schedulers/trial_runner.py +++ b/mlos_bench/mlos_bench/schedulers/trial_runner.py @@ -20,6 +20,7 @@ from mlos_bench.services.types import SupportsConfigLoading from mlos_bench.storage.base_storage import Storage from mlos_bench.tunables.tunable_groups import TunableGroups +from mlos_bench.tunables.tunable_types import TunableValue _LOG = logging.getLogger(__name__) @@ -170,7 +171,7 @@ def run_trial( self, trial: Storage.Trial, global_config: dict[str, Any] | None = None, - ) -> None: + ) -> tuple[Status, datetime, dict[str, TunableValue] | None]: """ Run a single trial on this TrialRunner's Environment and stores the results in the backend Trial Storage. 
@@ -200,9 +201,10 @@ def run_trial( if not self.environment.setup(trial.tunables, trial.config(global_config)): _LOG.warning("Setup failed: %s :: %s", self.environment, trial.tunables) # FIXME: Use the actual timestamp from the environment. - _LOG.info("TrialRunner: Update trial results: %s :: %s", trial, Status.FAILED) - trial.update(Status.FAILED, datetime.now(UTC)) - return + (status, timestamp, results) = (Status.FAILED, datetime.now(UTC), None) + _LOG.info("TrialRunner: Update trial results: %s :: %s", trial, status) + trial.update(status, timestamp) + return (status, timestamp, results) # TODO: start background status polling of the environments in the event loop. @@ -223,6 +225,8 @@ def run_trial( self._is_running = False + return (status, timestamp, results) + def teardown(self) -> None: """ Tear down the Environment. From e8635d405476240cdcc6365a3b3c35a989a547b8 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Fri, 2 May 2025 18:17:14 -0500 Subject: [PATCH 23/38] wip: refactoring and implementation for parallel scheduler --- .../mlos_bench/schedulers/base_scheduler.py | 143 ++++--- .../schedulers/parallel_scheduler.py | 366 +++++++++++------- .../mlos_bench/schedulers/sync_scheduler.py | 28 +- mlos_bench/mlos_bench/storage/base_storage.py | 41 +- .../mlos_bench/storage/sql/experiment.py | 41 +- 5 files changed, 372 insertions(+), 247 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/base_scheduler.py b/mlos_bench/mlos_bench/schedulers/base_scheduler.py index 61b59a987d3..c9e6d8fa666 100644 --- a/mlos_bench/mlos_bench/schedulers/base_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/base_scheduler.py @@ -242,7 +242,6 @@ def __exit__( self._in_context = False return False # Do not suppress exceptions - @abstractmethod def start(self) -> None: """Start the scheduling loop.""" assert self.experiment is not None @@ -257,7 +256,26 @@ def start(self) -> None: if self._config_id > 0: tunables = self.load_tunable_config(self._config_id) - 
self.schedule_trial(tunables) + # If a config_id is provided, assume it is expected to be run immediately. + self.add_trial_to_queue(tunables, ts_start=datetime.now(UTC)) + + is_warm_up: bool = self.optimizer.supports_preload + if not is_warm_up: + _LOG.warning("Skip pending trials and warm-up: %s", self.optimizer) + + not_done: bool = True + while not_done: + _LOG.info("Optimization loop: Last trial ID: %d", self._last_trial_id) + self.run_schedule(is_warm_up) + not_done = self.add_new_optimizer_suggestions() + self.assign_trial_runners( + self.experiment.pending_trials( + datetime.now(UTC), + running=False, + trial_runner_assigned=False, + ) + ) + is_warm_up = False def teardown(self) -> None: """ @@ -291,16 +309,27 @@ def load_tunable_config(self, config_id: int) -> TunableGroups: _LOG.debug("Config %d ::\n%s", config_id, json.dumps(tunable_values, indent=2)) return tunables.copy() - # FIXME: Maybe rename this to "add_new_optimizer_suggestions" (see below). - def _schedule_new_optimizer_suggestions(self) -> bool: + def add_new_optimizer_suggestions(self) -> bool: """ Optimizer part of the loop. - Load the results of the executed trials into the optimizer, suggest new - configurations, and add them to the queue. Return True if optimization is not - over, False otherwise. + Load the results of the executed trials into the + :py:class:`~.Optimizer`, suggest new configurations, and add them to the + queue. + + Returns + ------- + bool + The return value indicates whether the optimization process should + continue to get suggestions from the Optimizer or not. + See Also: :py:meth:`~.Scheduler.not_done`. """ assert self.experiment is not None + # Load the results of the trials that have been run since the last time + # we queried the Optimizer. + # FIXME: This can miss some straggler results from parallel trial + # executions. + # Maybe just maintain a set? 
(trial_ids, configs, scores, status) = self.experiment.load(self._last_trial_id) _LOG.info("QUEUE: Update the optimizer with trial results: %s", trial_ids) self.optimizer.bulk_register(configs, scores, status) @@ -309,40 +338,63 @@ def _schedule_new_optimizer_suggestions(self) -> bool: not_done = self.not_done() if not_done: tunables = self.optimizer.suggest() - self.schedule_trial(tunables) - + self.add_trial_to_queue(tunables) return not_done - # FIXME: This might be better called: `enqueue_trial` or - # `add_trial_to_queue` since `assign_trial_runners` is a separate method - # that actually assigns Trials to TrialRunners. - def schedule_trial(self, tunables: TunableGroups) -> None: - """Add a configuration to the queue of trials.""" - # TODO: Alternative scheduling policies may prefer to expand repeats over - # time as well as space, or adjust the number of repeats (budget) of a given - # trial based on whether initial results are promising. + def add_trial_to_queue( + self, + tunables: TunableGroups, + ts_start: datetime | None = None, + ) -> None: + """ + Add a configuration to the queue of trials 1 or more times. + + (e.g., according to the :py:attr:`.trial_config_repeat_count`) + + Parameters + ---------- + tunables : TunableGroups + The tunable configuration to add to the queue. + + ts_start : datetime | None + Optional timestamp to use to start the trial. + + Notes + ----- + Alternative scheduling policies may prefer to expand repeats over + time as well as space, or adjust the number of repeats (budget) of a given + trial based on whether initial results are promising. + """ for repeat_i in range(1, self._trial_config_repeat_count + 1): self._add_trial_to_queue( tunables, - config={ - # Add some additional metadata to track for the trial such as the - # optimizer config used. - # Note: these values are unfortunately mutable at the moment. - # Consider them as hints of what the config was the trial *started*. 
- # It is possible that the experiment configs were changed - # between resuming the experiment (since that is not currently - # prevented). - "optimizer": self.optimizer.name, - "repeat_i": repeat_i, - "is_defaults": tunables.is_defaults(), - **{ - f"opt_{key}_{i}": val - for (i, opt_target) in enumerate(self.optimizer.targets.items()) - for (key, val) in zip(["target", "direction"], opt_target) - }, - }, + ts_start=ts_start, + config=self._augment_trial_config_metadata(tunables, repeat_i), ) + def _augment_trial_config_metadata( + self, + tunables: TunableGroups, + repeat_i: int, + ) -> dict[str, Any]: + return { + # Add some additional metadata to track for the trial such as the + # optimizer config used. + # Note: these values are unfortunately mutable at the moment. + # Consider them as hints of what the config was the trial *started*. + # It is possible that the experiment configs were changed + # between resuming the experiment (since that is not currently + # prevented). + "optimizer": self.optimizer.name, + "repeat_i": repeat_i, + "is_defaults": tunables.is_defaults(), + **{ + f"opt_{key}_{i}": val + for (i, opt_target) in enumerate(self.optimizer.targets.items()) + for (key, val) in zip(["target", "direction"], opt_target) + }, + } + def _add_trial_to_queue( self, tunables: TunableGroups, @@ -360,10 +412,10 @@ def _add_trial_to_queue( def assign_trial_runners(self, trials: Iterable[Storage.Trial]) -> None: """ - Assigns TrialRunners to the given Trial in batch. + Assigns :py:class:`~.TrialRunner`s to the given :py:class:`~.Trial`s in batch. - The base class implements a simple round-robin scheduling algorithm for each - Trial in sequence. + The base class implements a simple round-robin scheduling algorithm for + each Trial in sequence. Subclasses can override this method to implement a more sophisticated policy. 
For instance:: @@ -424,7 +476,8 @@ def assign_trial_runners( def get_trial_runner(self, trial: Storage.Trial) -> TrialRunner: """ - Gets the TrialRunner associated with the given Trial. + Gets the :py:class:`~.TrialRunner` associated with the given + :py:class:`~.Storage.Trial`. Parameters ---------- @@ -451,17 +504,16 @@ def get_trial_runner(self, trial: Storage.Trial) -> TrialRunner: assert trial_runner.trial_runner_id == trial.trial_runner_id return trial_runner - def _run_schedule(self, running: bool = False) -> None: + def run_schedule(self, running: bool = False) -> None: """ - Scheduler part of the loop. + Runs the current schedule of trials. - Check for pending trials in the queue and run them. + Check for :py:class:`.Trial`s with `:py:attr:`.Status.PENDING` and an + assigned :py:attr:`~.Trial.trial_runner_id` in the queue and run them + with :py:meth:`~.Scheduler.run_trial`. """ assert self.experiment is not None - # Make sure that any pending trials have a TrialRunner assigned. pending_trials = list(self.experiment.pending_trials(datetime.now(UTC), running=running)) - # TODO: Move this to main loop. - self.assign_trial_runners(pending_trials) for trial in pending_trials: if trial.trial_runner_id is None: logging.warning("Trial %s has no TrialRunner assigned yet.") @@ -472,7 +524,8 @@ def not_done(self) -> bool: """ Check the stopping conditions. - By default, stop when the optimizer converges or max limit of trials reached. + By default, stop when the :py:class:`.Optimizer` converges or the limit + of :py:attr:`~.Scheduler.max_trials` is reached. 
""" return self.optimizer.not_converged() and ( self._trial_count < self._max_trials or self._max_trials <= 0 diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index d1831db8e06..2922c834188 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -14,7 +14,7 @@ import logging from collections.abc import Callable, Iterable -from multiprocessing import current_process as mp_proccess_name +from multiprocessing import current_process from multiprocessing.pool import AsyncResult, Pool from datetime import datetime from typing import Any @@ -23,6 +23,7 @@ from attr import dataclass from pytz import UTC +from mlos_bench.environments.status import Status from mlos_bench.schedulers.base_scheduler import Scheduler from mlos_bench.schedulers.trial_runner import TrialRunner from mlos_bench.storage.base_storage import Storage @@ -32,18 +33,29 @@ _LOG = logging.getLogger(__name__) -def _is_child_process() -> bool: +MAIN_PROCESS_NAME = "MainProcess" +""" +Name of the main process in control of the +:external:py:class:`multiprocessing.Pool`. +""" + + +def is_child_process() -> bool: """Check if the current process is a child process.""" - return mp_proccess_name() != "MainProcess" + return current_process().name != MAIN_PROCESS_NAME @dataclass class TrialRunnerResult: - """A simple data class to hold the :py:class:`AsyncResult` of a - :py:class:`TrialRunner` operation.""" + """ + A simple data class to hold the :py:class:`AsyncResult` of a + :py:class:`TrialRunner` operation. + """ trial_runner_id: int - result: dict[str, TunableValue] | None + results: dict[str, TunableValue] | None + timestamp: datetime | None = None + status: Status | None = None trial_id: int | None = None @@ -54,12 +66,13 @@ class ParallelScheduler(Scheduler): Notes ----- - This schedule uses :ext:py:class:`multiprocessing.Pool` to run trials in parallel. 
+ This scheduler uses :ext:py:class:`multiprocessing.Pool` to run + :py:class:`~.Storage.Trial`s in parallel. To avoid issues with Python's forking implementation, which relies on pickling - objects from the main process and sending them to the child process, we need - to avoid incompatible objects, which includes any additional threads (e.g., + objects and functions from the main process and sending them to the child + process to invoke, we need to avoid incompatible objects, which includes any + additional threads (e.g., :py:mod:`asyncio` tasks such as :py:class:`mlos_bench.event_loop_context.EventLoopContext`), database connections (e.g., :py:mod:`mlos_bench.storage`), and file handles (e.g., :ext:py:mod:`logging`) that are pickle incompatible. @@ -73,6 +86,8 @@ class ParallelScheduler(Scheduler): receive as inputs all the necessary (and picklable) info as arguments, then enter the given :py:class:`~.TrialRunner` instance context and invoke that procedure. + + For instance :py:meth:`~.ParallelScheduler._teardown_trial_runner` is a function that tears down a given :py:class:`~.TrialRunner` inside a :py:class:`~.Pool` worker process. """ def __init__( # pylint: disable=too-many-arguments @@ -94,20 +109,22 @@ def __init__( # pylint: disable=too-many-arguments root_env_config=root_env_config, ) - self._polling_interval: float = config.get("polling_interval", 1.0) + self._polling_interval = float(config.get("polling_interval", 1.0)) # TODO: Setup logging for the child processes via a logging queue. self._pool: Pool | None = None - """Parallel pool to run Trials in separate TrialRunner processes. + """ + Parallel :external:py:class:`.Pool` to run :py:class:`~.Storage.Trial`s + in separate :py:class:`.TrialRunner` processes. - Only initiated on context __enter__. + Only initiated on context :py:meth:`.__enter__`. 
""" self._trial_runners_status: dict[int, AsyncResult[TrialRunnerResult] | None] = { trial_runner.trial_runner_id: None for trial_runner in self._trial_runners.values() } - """A dict to keep track of the status of each TrialRunner. + """A dict to keep track of the status of each :py:class:`.TrialRunner`. Since TrialRunners enter their running context within each pool task, we can't check :py:meth:`.TrialRunner.is_running` within the parent @@ -120,6 +137,11 @@ def __init__( # pylint: disable=too-many-arguments """ def _get_idle_trial_runners_count(self) -> int: + """Return a count of idle trial runners. + + Can be used as a hint for the number of new trials we can run when we + next get more suggestions from the Optimizer. + """ return len( [ trial_runner_status @@ -129,6 +151,7 @@ def _get_idle_trial_runners_count(self) -> int: ) def _has_running_trial_runners(self) -> bool: + """Check to see if any TrialRunners are currently busy.""" return any( True for trial_runner_status in self._trial_runners_status.values() @@ -136,7 +159,7 @@ def _has_running_trial_runners(self) -> bool: ) def __enter__(self): - assert self._pool is None + # Setup the process pool to run the trials in parallel. self._pool = Pool(processes=len(self.trial_runners), maxtasksperchild=1) self._pool.__enter__() # Delay context entry in the parent process @@ -145,163 +168,243 @@ def __enter__(self): def __exit__(self, ex_type, ex_val, ex_tb): assert self._pool is not None # Shutdown the process pool and wait for all tasks to finish - # (everything should be done by now) - assert self._has_running_trial_runners() is False + # (everything should be done by now anyways) + assert not self._has_running_trial_runners() assert self._get_idle_trial_runners_count() == len(self._trial_runners) self._pool.close() self._pool.join() self._pool.__exit__(ex_type, ex_val, ex_tb) - self._pool = None return super().__exit__(ex_type, ex_val, ex_tb) - # TODO: Consolidate to base_scheduler? 
- def start(self) -> None: - """Start the optimization loop.""" - super().start() - assert self.experiment is not None - - is_warm_up: bool = self.optimizer.supports_preload - if not is_warm_up: - _LOG.warning("Skip pending trials and warm-up: %s", self.optimizer) - - not_done: bool = True - while not_done: - _LOG.info("Optimization loop: Last trial ID: %d", self._last_trial_id) - self._run_schedule(is_warm_up) - not_done = self._schedule_new_optimizer_suggestions() - pending_trials = self.experiment.pending_trials(datetime.now(UTC), running=False) - self.assign_trial_runners(pending_trials) - is_warm_up = False - - def _teardown_trial_runner( - self, - trial_runner_id: int, + @staticmethod + def run_trial_on_trial_runner( + storage: Storage, + experiment_id: str, + trial_id: int, + trial_runner: TrialRunner, + global_config: dict[str, Any] | None, ) -> TrialRunnerResult: - """Tear down a specific TrialRunner in a Pool worker.""" - assert self._pool is None, "This should only be called in a Pool worker." - trial_runner = self._trial_runners[trial_runner_id] + """ + Retrieve and run a :py:class:`~.Storage.Trial` on a specific :py:class:`.TrialRunner` + in a :py:class:`~.Pool` background worker process. + + Parameters + ---------- + storage : Storage + The :py:class:`~.Storage` to use to retrieve the :py:class:`.Storage.Trial`. + experiment_id : str + The ID of the experiment the trial is a part of. + trial_id : int + The ID of the trial. + trial_runner : TrialRunner + The :py:class:`.TrialRunner` to run on. + global_config : dict[str, Any] | None + The global configuration to use for the trial. + + Returns + ------- + TrialRunnerResult + The result of the :py:meth:`.TrialRunner.run_trial` operation. + + Notes + ----- + This is called in the Pool worker process, so it must receive arguments + that are picklable and be able to construct all necessary state from that. 
+ Upon completion a callback is used to update the status of the + TrialRunner in the ParallelScheduler with the value in the + TrialRunnerResult. + """ + assert is_child_process(), "This should be called in a Pool worker." + exp = storage.get_experiment_by_id(experiment_id) + assert exp is not None, "Experiment not found." + trial = exp.get_trial_by_id(trial_id) + assert trial is not None, "Trial not found." + assert ( + trial.trial_runner_id == trial_runner.trial_runner_id + ), f"Unexpected Trial Runner {trial_runner} for Trial {trial}." with trial_runner: + (status, ts, results) = trial_runner.run_trial(trial, global_config) return TrialRunnerResult( - trial_runner_id=trial_runner_id, - result=trial_runner.teardown(), + trial_runner_id=trial_runner.trial_runner_id, + results=results, + timestamp=ts, + status=status, + trial_id=trial.trial_id, ) - def _teardown_trial_runner_finished_callback( + def _run_trial_on_trial_runner_finished_callback( self, result: TrialRunnerResult, ) -> None: - """Callback to be called when a TrialRunner is finished with teardown.""" + """Callback to be called when a TrialRunner is finished with run_trial.""" trial_runner_id = result.trial_runner_id - assert trial_runner_id in self._trial_runners_status - assert self._trial_runners_status[trial_runner_id] is not None + assert ( + trial_runner_id in self._trial_runners_status + ), f"Unexpected TrialRunner {trial_runner_id}." + assert ( + self._trial_runners_status[trial_runner_id] is not None + ), f"TrialRunner {trial_runner_id} should have been running." + # Mark the TrialRunner as finished. self._trial_runners_status[result.trial_runner_id] = None + # TODO: save the results? + # TODO: Allow scheduling of new trials here. 
- def _teardown_trial_runner_failed_closure( + def _run_trial_on_trial_runner_failed_closure( self, trial_runner_id: int, ) -> Callable[[Any], None]: # pylint: disable=no-self-use - """Callback to be called when a TrialRunner failed running teardown.""" + """Callback to be called when a TrialRunner failed running run_trial.""" - def _teardown_trial_runner_failed(obj: Any) -> None: - """Callback to be called when a TrialRunner failed running teardown.""" + def _run_trial_on_trial_runner_failed(obj: Any) -> None: + """Callback to be called when a TrialRunner failed running run_trial.""" # TODO: improve error handling here - _LOG.error("TrialRunner %d failed to run teardown: %s", trial_runner_id, obj) - raise RuntimeError(f"TrialRunner {trial_runner_id} failed to run teardown: {obj}") + _LOG.error("TrialRunner %d failed on run_trial: %s", trial_runner_id, obj) + raise RuntimeError(f"TrialRunner {trial_runner_id} failed on run_trial: {obj}") - return _teardown_trial_runner_failed + return _run_trial_on_trial_runner_failed def run_trial(self, trial: Storage.Trial) -> None: """ Set up and run a single Trial on a TrialRunner in a child process in the pool. - Save the results in the storage. + The TrialRunner saves the results in the Storage. """ - assert self._pool is None, "This should only be called in a Pool worker." + assert self._pool is not None + assert self._in_context + assert not is_child_process(), "This should be called in the parent process." + + # Run the given trial in the child process targeting a particular runner. + trial_runner_id = trial.trial_runner_id + assert trial_runner_id is not None, f"Trial {trial} has not been assigned a trial runner." + trial_runner = self._trial_runners[trial_runner_id] + + if self._trial_runners_status[trial_runner_id] is not None: + _LOG.info("TrialRunner %s is still active. Skipping trial %s.", trial_runner, trial) + + # Update our trial bookkeeping. 
super().run_trial(trial) - # In the sync scheduler we run each trial on its own TrialRunner in sequence. - trial_runner = self.get_trial_runner(trial) - with trial_runner: - trial_runner.run_trial(trial, self.global_config) - _LOG.info("QUEUE: Finished trial: %s on %s", trial, trial_runner) + # Start the trial in a child process. + self._trial_runners_status[trial_runner_id] = self._pool.apply_async( + # Call the teardown function in the child process targeting + # a particular trial_runner. + self.run_trial_on_trial_runner, + args=( + self.storage, + self._experiment_id, + trial.trial_id, + trial_runner, + self.global_config, + ), + callback=self._run_trial_on_trial_runner_finished_callback, + error_callback=self._run_trial_on_trial_runner_failed_closure(trial_runner_id), + ) - def run_trial_on_trial_runner( - self, - trial_runner: TrialRunner, - storage: Storage, - experiment_id: str, - trial_id: int, - ): - """Run a single trial on a TrialRunner in a child process in the pool. + def run_schedule(self, running: bool = False) -> None: + """ + Runs the current schedule of Trials on parallel background workers. - Save the results in the storage. + Check for :py:class:`.Trial`s with `:py:attr:`.Status.PENDING` and an + assigned :py:attr:`~.Trial.trial_runner_id` in the queue and run them + with :py:meth:`~.Scheduler.run_trial`. """ - def _run_schedule(self, running: bool = False) -> None: + assert not is_child_process(), "This should be called in the parent process." 
assert self._pool is not None + assert self._experiment is not None - pending_trials = self.experiment.pending_trials(datetime.now(UTC), running=running) + scheduled_trials = self._experiment.pending_trials( + datetime.now(UTC), + running=running, + trial_runner_assigned=True, + ) scheduled_trials = [ - pending_trial - for pending_trial in pending_trials - if pending_trial.trial_runner_id is not None and pending_trial.trial_runner_id >= 0 + trial + for trial in scheduled_trials + if trial.trial_runner_id is not None and trial.trial_runner_id >= 0 ] + # Start each of the scheduled trials in the background. for trial in scheduled_trials: - trial_runner_id = trial.trial_runner_id - assert trial_runner_id is not None - trial_runner_status = self._trial_runners_status[trial_runner_id] - if trial_runner_status is not None: - _LOG.warning( - "Cannot start Trial %d - its assigned TrialRunner %d is already running: %s", - trial.trial_id, - trial_runner_id, - trial_runner_status, - ) - continue - - # Update our trial bookkeeping. - super().run_trial(trial) - # Run the trial in the child process targeting a particular runner. - # TODO: + self.run_trial(trial) + # Now all available trial should be started in the background. # Wait for all trial runners to finish. while self._has_running_trial_runners(): sleep(self._polling_interval) + + # NOTE: This organization is blocking in that it will wait for *all* + # scheduled trials to finish running prior to scheduling more. + # TODO: This can be improved. + # For instance: + # 1. Allow eagerly scheduling new trials in the callback immediately + # after one finishes (maybe make this a configurable option). + # 2. Run the above in a while loop to continually evaluate for newly + # scheduled trials that are available to run. + # Alternatively: + # We can move the "has_running_trial_runners" check to the start() + # method and allow this return eagerly (so that it becomes more like + # "start_schedule"). 
+ assert self._get_idle_trial_runners_count() == len(self._trial_runners) - def _teardown_trial_runner( - self, - trial_runner_id: int, - ) -> TrialRunnerResult: - """Tear down a specific TrialRunner in a Pool worker.""" - assert self._pool is None, "This should only be called in a Pool worker." - trial_runner = self._trial_runners[trial_runner_id] + @staticmethod + def teardown_trial_runner(trial_runner: TrialRunner) -> TrialRunnerResult: + """ + Tear down a specific :py:class:`.TrialRunner` (and its + :py:class:`~mlos_bench.environments.base_environment.Environment`) in a + :py:class:`.Pool` worker. + + Parameters + ---------- + trial_runner : TrialRunner + The :py:class:`.TrialRunner` to tear down. + + Returns + ------- + TrialRunnerResult + The result of the teardown operation, including the trial_runner_id + and the result of the teardown operation. + + Notes + ----- + This is called in the Pool worker process, so it must receive arguments + that are picklable. + To keep life simple we pass the entire TrialRunner object, which should + **not** be have entered its context (else it may have non-picklable + state), and make this a static method of the class to avoid needing to + pass the :py:class:`~.ParallelScheduler` instance. + Upon completion a callback is used to update the status of the + TrialRunner in the ParallelScheduler with the value in the + TrialRunnerResult. + """ + assert is_child_process(), "This should be called in a Pool worker." 
with trial_runner: return TrialRunnerResult( - trial_runner_id=trial_runner_id, - result=trial_runner.teardown(), + trial_runner_id=trial_runner.trial_runner_id, + results=trial_runner.teardown(), ) - def _teardown_trial_runner_finished_callback( - self, - result: TrialRunnerResult, - ) -> None: + def _teardown_trial_runner_finished_callback(self, result: TrialRunnerResult) -> None: """Callback to be called when a TrialRunner is finished with teardown.""" + assert not is_child_process(), "This should be called in the parent process." trial_runner_id = result.trial_runner_id - assert trial_runner_id in self._trial_runners_status - assert self._trial_runners_status[trial_runner_id] is not None + assert ( + trial_runner_id in self._trial_runners_status + ), f"Unexpected TrialRunner {trial_runner_id}." + assert ( + self._trial_runners_status[trial_runner_id] is not None + ), f"TrialRunner {trial_runner_id} should have been running." self._trial_runners_status[result.trial_runner_id] = None + # Nothing to do with the result. - def _teardown_trial_runner_failed_closure( - self, - trial_runner_id: int, - ) -> Callable[[Any], None]: - # pylint: disable=no-self-use + @staticmethod + def _teardown_trial_runner_failed_closure(trial_runner_id: int) -> Callable[[Any], None]: """Callback to be called when a TrialRunner failed running teardown.""" def _teardown_trial_runner_failed(obj: Any) -> None: """Callback to be called when a TrialRunner failed running teardown.""" + assert not is_child_process(), "This should be called in the parent process." # TODO: improve error handling here _LOG.error("TrialRunner %d failed to run teardown: %s", trial_runner_id, obj) raise RuntimeError(f"TrialRunner {trial_runner_id} failed to run teardown: {obj}") @@ -309,30 +412,34 @@ def _teardown_trial_runner_failed(obj: Any) -> None: return _teardown_trial_runner_failed def teardown(self) -> None: + assert not is_child_process(), "This should be called in the parent process." 
assert self._pool is not None + assert self._in_context + assert not self._has_running_trial_runners(), "All trial runners should be idle." if self._do_teardown: # Call teardown on each TrialRunner in the pool in parallel. - for trial_runner_id in self._trial_runners: + for trial_runner_id, trial_runner in self._trial_runners.items(): assert ( self._trial_runners_status[trial_runner_id] is None - ), f"TrialRunner {trial_runner_id} is still active." + ), f"TrialRunner {trial_runner} is still active." self._trial_runners_status[trial_runner_id] = self._pool.apply_async( # Call the teardown function in the child process targeting - # a particular trial_runner_id. - self._teardown_trial_runner, - args=(trial_runner_id,), + # a particular trial_runner. + self.teardown_trial_runner, + args=(trial_runner,), callback=self._teardown_trial_runner_finished_callback, error_callback=self._teardown_trial_runner_failed_closure(trial_runner_id), ) - # Wait for all trial runners to finish. - while self._has_running_trial_runners(): - sleep(self._polling_interval) - assert self._get_idle_trial_runners_count() == len(self._trial_runners) + # Wait for all trial runners to finish. + while self._has_running_trial_runners(): + sleep(self._polling_interval) + assert not self._has_running_trial_runners(), "All trial runners should be idle." def assign_trial_runners(self, trials: Iterable[Storage.Trial]) -> None: """ - Assign Trials to the first available and idle TrialRunner. + Assign :py:class:`~.Storage.Trial`s to the first available and idle + :py:class:`.TrialRunner`. Parameters ---------- @@ -340,8 +447,9 @@ def assign_trial_runners(self, trials: Iterable[Storage.Trial]) -> None: """ assert self._in_context assert self.experiment is not None + assert not is_child_process(), "This should be called in the parent process." 
- pending_trials: list[Storage.Trial] = list( + scheduleable_trials: list[Storage.Trial] = list( trial for trial in trials if trial.status.is_pending() and trial.trial_runner_id is None @@ -354,13 +462,5 @@ def assign_trial_runners(self, trials: Iterable[Storage.Trial]) -> None: ] # Assign pending trials to idle runners - for trial, runner_id in zip(pending_trials, idle_runner_ids): - # FIXME: This results in two separate non-transactional updates. - # Should either set Status=SCHEDULED when we set_trial_runner - # or remove SCHEDULED as a Status altogether and filter by - # "Status=PENDING AND trial_runner_id != NULL" - # Or ... even better, we could use a single transaction to update - # the status and trial_runner_id of all trials in the same batch at once. + for trial, runner_id in zip(scheduleable_trials, idle_runner_ids): trial.set_trial_runner(runner_id) - # Moreover this doesn't even update the status of the Trial - it only updates the telemetry. - trial.update(status=Status.SCHEDULED, timestamp=datetime.now(UTC)) diff --git a/mlos_bench/mlos_bench/schedulers/sync_scheduler.py b/mlos_bench/mlos_bench/schedulers/sync_scheduler.py index 10aecaa8b34..ec81fae3733 100644 --- a/mlos_bench/mlos_bench/schedulers/sync_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/sync_scheduler.py @@ -5,9 +5,7 @@ """A simple single-threaded synchronous optimization loop implementation.""" import logging -from datetime import datetime -from pytz import UTC from mlos_bench.schedulers.base_scheduler import Scheduler from mlos_bench.storage.base_storage import Storage @@ -18,35 +16,19 @@ class SyncScheduler(Scheduler): """A simple single-threaded synchronous optimization loop implementation.""" - # TODO: This is now general enough we could use it in the parallel scheduler too. - # Maybe move to the base class? 
- def start(self) -> None: - """Start the optimization loop.""" - super().start() - assert self.experiment is not None - - is_warm_up = self.optimizer.supports_preload - if not is_warm_up: - _LOG.warning("Skip pending trials and warm-up: %s", self.optimizer) - - not_done = True - while not_done: - _LOG.info("Optimization loop: Last trial ID: %d", self._last_trial_id) - self._run_schedule(is_warm_up) - not_done = self._schedule_new_optimizer_suggestions() - pending_trial = self.experiment.pending_trials(datetime.now(UTC), running=False) - self.assign_trial_runners(pending_trial) - is_warm_up = False - def run_trial(self, trial: Storage.Trial) -> None: """ - Set up and run a single trial. + Set up and run a single :py:class:`~.Storage.Trial` on its + :py:class:`~.TrialRunner`. Save the results in the storage. """ super().run_trial(trial) # In the sync scheduler we run each trial on its own TrialRunner in sequence. trial_runner = self.get_trial_runner(trial) + if trial_runner is None: + _LOG.warning("No trial runner found for %s", trial) + return with trial_runner: trial_runner.run_trial(trial, self.global_config) _LOG.info("QUEUE: Finished trial: %s on %s", trial, trial_runner) diff --git a/mlos_bench/mlos_bench/storage/base_storage.py b/mlos_bench/mlos_bench/storage/base_storage.py index 00dd3d68477..51ebc9fbc35 100644 --- a/mlos_bench/mlos_bench/storage/base_storage.py +++ b/mlos_bench/mlos_bench/storage/base_storage.py @@ -33,6 +33,7 @@ from typing import Any, Literal from mlos_bench.config.schemas import ConfigSchema +from mlos_bench.dict_templater import DictTemplater from mlos_bench.environments.status import Status from mlos_bench.services.base_service import Service from mlos_bench.storage.base_experiment_data import ExperimentData @@ -346,47 +347,33 @@ def get_trial_by_id( The Trial object, or None if it doesn't exist. 
""" - @abstractmethod - def filter_trials_by_status( - self, - timestamp: datetime, - statuses: list[Status], - ) -> Iterator[Storage.Trial]: - """ - Return an iterator over the pending trials that are scheduled to run on or - before the specified timestamp matching one of statuses listed. - - Parameters - ---------- - timestamp : datetime.datetime - The time in UTC to check for scheduled trials. - statuses : list[Status] - Status of the trials to filter in. - - Returns - ------- - trials : Iterator[Storage.Trial] - An iterator over the matching trials. - """ - @abstractmethod def pending_trials( self, timestamp: datetime, *, running: bool, + trial_runner_assigned: bool | None = None, ) -> Iterator[Storage.Trial]: """ - Return an iterator over the pending trials that are scheduled to run on or - before the specified timestamp. + Return an iterator over the :py:class:`~.Storage.Trial`s that are + :py:attr:`~.Status.PENDING` and have a scheduled + :py:attr:`~.Storage.Trial.ts_start` time to run on or before the + specified timestamp. Parameters ---------- timestamp : datetime.datetime - The time in UTC to check for scheduled trials. + The time in UTC to check for scheduled Trials. running : bool - If True, include the trials that are already running. + If True, include the Trials that are also + :py:attr:`~.Status.RUNNING` or :py:attr:`~.Status.READY`. Otherwise, return only the scheduled trials. + trial_runner_assigned : bool | None + If True, include the Trials that are assigned to a + :py:class:`~.TrialRunner`. If False, return only the trials + that are not assigned to any :py:class:`~.TrialRunner`. + If None, return all trials regardless of their assignment. 
Returns ------- diff --git a/mlos_bench/mlos_bench/storage/sql/experiment.py b/mlos_bench/mlos_bench/storage/sql/experiment.py index 3a09c0edae8..1dd4d452527 100644 --- a/mlos_bench/mlos_bench/storage/sql/experiment.py +++ b/mlos_bench/mlos_bench/storage/sql/experiment.py @@ -275,25 +275,35 @@ def get_trial_by_id( config=config, ) - def filter_trials_by_status( + def pending_trials( self, timestamp: datetime, - statuses: list[Status], + *, + running: bool = False, + trial_runner_assigned: bool | None = None, ) -> Iterator[Storage.Trial]: + if running: + statuses = [Status.PENDING, Status.READY, Status.RUNNING] + else: + statuses = [Status.PENDING] timestamp = utcify_timestamp(timestamp, origin="local") _LOG.info("Retrieve pending trials for: %s @ %s", self._experiment_id, timestamp) with self._engine.connect() as conn: - cur_trials = conn.execute( - self._schema.trial.select().where( - self._schema.trial.c.exp_id == self._experiment_id, - ( - self._schema.trial.c.ts_start.is_(None) - | (self._schema.trial.c.ts_start <= timestamp) - ), - self._schema.trial.c.ts_end.is_(None), - self._schema.trial.c.status.in_([s.name for s in statuses]), - ) + stmt = self._schema.trial.select().where( + self._schema.trial.c.exp_id == self._experiment_id, + ( + self._schema.trial.c.ts_start.is_(None) + | (self._schema.trial.c.ts_start <= timestamp) + ), + self._schema.trial.c.ts_end.is_(None), + self._schema.trial.c.status.in_([s.name for s in statuses]), ) + if trial_runner_assigned: + stmt = stmt.where(self._schema.trial.c.trial_runner_id.isnot(None)) + elif trial_runner_assigned is False: + stmt = stmt.where(self._schema.trial.c.trial_runner_id.is_(None)) + # else: # No filtering by trial_runner_id + cur_trials = conn.execute(stmt) for trial in cur_trials.fetchall(): tunables = self._get_key_val( conn, @@ -321,13 +331,6 @@ def filter_trials_by_status( config=config, ) - def pending_trials(self, timestamp: datetime, *, running: bool) -> Iterator[Storage.Trial]: - if running: - pending_status = 
[Status.PENDING, Status.READY, Status.RUNNING] - else: - pending_status = [Status.PENDING] - return self.filter_trials_by_status(timestamp=timestamp, statuses=pending_status) - def _get_config_id(self, conn: Connection, tunables: TunableGroups) -> int: """ Get the config ID for the given tunables. From a8c3d61c5ee78719007476da18e97c318a4cc410 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 2 May 2025 23:24:23 +0000 Subject: [PATCH 24/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../schedulers/parallel_scheduler.py | 33 ++++++++++--------- .../mlos_bench/schedulers/sync_scheduler.py | 1 - mlos_bench/mlos_bench/storage/base_storage.py | 8 ++--- mlos_bench/mlos_bench/storage/sql/storage.py | 2 +- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index 2922c834188..c0e5e1f49cd 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -14,28 +14,27 @@ import logging from collections.abc import Callable, Iterable +from datetime import datetime from multiprocessing import current_process from multiprocessing.pool import AsyncResult, Pool -from datetime import datetime -from typing import Any from time import sleep +from typing import Any from attr import dataclass from pytz import UTC from mlos_bench.environments.status import Status +from mlos_bench.optimizers.base_optimizer import Optimizer from mlos_bench.schedulers.base_scheduler import Scheduler from mlos_bench.schedulers.trial_runner import TrialRunner from mlos_bench.storage.base_storage import Storage -from mlos_bench.optimizers.base_optimizer import Optimizer from mlos_bench.tunables.tunable_types import TunableValue _LOG = logging.getLogger(__name__) MAIN_PROCESS_NAME = "MainProcess" 
-""" -Name of the main process in control of the +"""Name of the main process in control of the :external:py:class:`multiprocessing.Pool`. """ @@ -47,8 +46,7 @@ def is_child_process() -> bool: @dataclass class TrialRunnerResult: - """ - A simple data class to hold the :py:class:`AsyncResult` of a + """A simple data class to hold the :py:class:`AsyncResult` of a :py:class:`TrialRunner` operation. """ @@ -60,7 +58,8 @@ class TrialRunnerResult: class ParallelScheduler(Scheduler): - """A simple multi-process asynchronous optimization loop implementation. + """ + A simple multi-process asynchronous optimization loop implementation. See :py:mod:`mlos_bench.schedulers.parallel_scheduler` for more usage details. @@ -115,8 +114,8 @@ def __init__( # pylint: disable=too-many-arguments self._pool: Pool | None = None """ - Parallel :external:py:class:`.Pool` to run :py:class:`~.Storage.Trial`s - in separate :py:class:`.TrialRunner` processes. + Parallel :external:py:class:`.Pool` to run :py:class:`~.Storage.Trial`s in + separate :py:class:`.TrialRunner` processes. Only initiated on context :py:meth:`.__enter__`. """ @@ -124,7 +123,8 @@ def __init__( # pylint: disable=too-many-arguments self._trial_runners_status: dict[int, AsyncResult[TrialRunnerResult] | None] = { trial_runner.trial_runner_id: None for trial_runner in self._trial_runners.values() } - """A dict to keep track of the status of each :py:class:`.TrialRunner`. + """ + A dict to keep track of the status of each :py:class:`.TrialRunner`. Since TrialRunners enter their running context within each pool task, we can't check :py:meth:`.TrialRunner.is_running` within the parent @@ -137,10 +137,11 @@ def __init__( # pylint: disable=too-many-arguments """ def _get_idle_trial_runners_count(self) -> int: - """Return a count of idle trial runners. + """ + Return a count of idle trial runners. - Can be used as a hint for the number of new trials we can run when we - next get more suggestions from the Optimizer. 
+ Can be used as a hint for the number of new trials we can run when we next get + more suggestions from the Optimizer. """ return len( [ @@ -185,8 +186,8 @@ def run_trial_on_trial_runner( global_config: dict[str, Any] | None, ) -> TrialRunnerResult: """ - Retrieve and run a :py:class:`~.Storage.Trial` on a specific :py:class:`.TrialRunner` - in a :py:class:`~.Pool` background worker process. + Retrieve and run a :py:class:`~.Storage.Trial` on a specific + :py:class:`.TrialRunner` in a :py:class:`~.Pool` background worker process. Parameters ---------- diff --git a/mlos_bench/mlos_bench/schedulers/sync_scheduler.py b/mlos_bench/mlos_bench/schedulers/sync_scheduler.py index ec81fae3733..a6e7ee002a1 100644 --- a/mlos_bench/mlos_bench/schedulers/sync_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/sync_scheduler.py @@ -6,7 +6,6 @@ import logging - from mlos_bench.schedulers.base_scheduler import Scheduler from mlos_bench.storage.base_storage import Storage diff --git a/mlos_bench/mlos_bench/storage/base_storage.py b/mlos_bench/mlos_bench/storage/base_storage.py index 51ebc9fbc35..428203b26e5 100644 --- a/mlos_bench/mlos_bench/storage/base_storage.py +++ b/mlos_bench/mlos_bench/storage/base_storage.py @@ -125,7 +125,7 @@ def experiment( # pylint: disable=too-many-arguments description: str, tunables: TunableGroups, opt_targets: dict[str, Literal["min", "max"]], - ) -> "Storage.Experiment": + ) -> Storage.Experiment: """ Create a new experiment in the storage. @@ -182,7 +182,7 @@ def __init__( # pylint: disable=too-many-arguments self._opt_targets = opt_targets self._in_context = False - def __enter__(self) -> "Storage.Experiment": + def __enter__(self) -> Storage.Experiment: """ Enter the context of the experiment. @@ -358,8 +358,8 @@ def pending_trials( """ Return an iterator over the :py:class:`~.Storage.Trial`s that are :py:attr:`~.Status.PENDING` and have a scheduled - :py:attr:`~.Storage.Trial.ts_start` time to run on or before the - specified timestamp. 
+ :py:attr:`~.Storage.Trial.ts_start` time to run on or before the specified + timestamp. Parameters ---------- diff --git a/mlos_bench/mlos_bench/storage/sql/storage.py b/mlos_bench/mlos_bench/storage/sql/storage.py index a37a8f7ea0a..4d606d624e4 100644 --- a/mlos_bench/mlos_bench/storage/sql/storage.py +++ b/mlos_bench/mlos_bench/storage/sql/storage.py @@ -7,7 +7,7 @@ import logging from typing import Literal -from sqlalchemy import Engine, URL, create_engine +from sqlalchemy import URL, Engine, create_engine from mlos_bench.services.base_service import Service from mlos_bench.storage.base_experiment_data import ExperimentData From c79df57cecfc0d9cc60a534fcfc314325df7efdc Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 8 May 2025 15:00:25 -0500 Subject: [PATCH 25/38] add copilot instructions customizations --- .github/instructions/README.md | 20 +++++ .github/instructions/bash.instructions.md | 21 +++++ .github/instructions/default.instructions.md | 16 ++++ .github/instructions/json.instructions.md | 12 +++ .github/instructions/markdown.instructions.md | 9 +++ .github/instructions/python.instructions.md | 76 +++++++++++++++++++ .github/instructions/rst.instructions.md | 10 +++ .vscode/settings.json | 10 ++- CONTRIBUTING.md | 4 +- 9 files changed, 175 insertions(+), 3 deletions(-) create mode 100644 .github/instructions/README.md create mode 100644 .github/instructions/bash.instructions.md create mode 100644 .github/instructions/default.instructions.md create mode 100644 .github/instructions/json.instructions.md create mode 100644 .github/instructions/markdown.instructions.md create mode 100644 .github/instructions/python.instructions.md create mode 100644 .github/instructions/rst.instructions.md diff --git a/.github/instructions/README.md b/.github/instructions/README.md new file mode 100644 index 00000000000..5b10d1b2df3 --- /dev/null +++ b/.github/instructions/README.md @@ -0,0 +1,20 @@ +# Custom Copilot Instructions + +This directory contains custom 
instructions for the Copilot AI assistant. +The instructions are designed to guide the AI in providing responses that align with specific project needs and preferences. + +## Organization + +- Language specific instructions go in their own file, with the cross task instructions in the root of this directory. + + (e.g., [python.instructions.md](python.instructions.md), [bash.instructions.md](bash.instructions.md), [markdown.instructions.md](markdown.instructions.md), [json.instructions.md](json.instructions.md)). + +- Instructions relevant to all languages go in the `default.instructions.md` file. + + (e.g., [default.instructions.md](default.instructions.md)). + +## See Also + +- +- [.vscode/settings.json](../.vscode/settings.json) + - Configures which of these instructions are used for each file type. diff --git a/.github/instructions/bash.instructions.md b/.github/instructions/bash.instructions.md new file mode 100644 index 00000000000..55fd976c2a0 --- /dev/null +++ b/.github/instructions/bash.instructions.md @@ -0,0 +1,21 @@ +--- +applyTo: "**/*.sh" +--- + +# Bash shell scripting language instructions + +- Include instructions from [default.instructions.md](default.instructions.md) for all languages. + +- Scripts schould use `set -e` or `set -o errexit` to exit on error. +- Scripts should use use `set -u` or `set -o nounset` to exit on unset variables. +- Scripts should use `set -o pipefail` to exit on errors in pipelines. +- Commands should be checked for non-zero exit codes and either handled or reported. +- Scripts should use portable syntax for MacOS vs. Linux +- Scripts should validate input. +- Scripts should include usage instructions. +- Scripts should be executable (e.g., `chmod +x`). +- Scripts should include a shebang line (e.g., `#!/usr/bin/env bash`). +- Scripts should be well commented. +- Scripts should include documentation updates if needed. +- Scripts should be well formatted. +- `if` `then` statements should be on the same line. 
diff --git a/.github/instructions/default.instructions.md b/.github/instructions/default.instructions.md new file mode 100644 index 00000000000..dd290695c73 --- /dev/null +++ b/.github/instructions/default.instructions.md @@ -0,0 +1,16 @@ +--- +applyTo: "**" +--- + +# Default language review selection instructions + +- PRs should be small and focused on a single change or feature. +- PRs should have a clear description of what the change is and why it is needed. +- PRs should have tests that cover the changes made. +- PRs should try not to include any commented-out code. +- PRs should not include any unnecessary files or changes to files that are not related to the change being made. +- PRs should not include any changes to the README or other documentation unless it is directly related to the change being made. +- PRs should be well commented. +- PRs should include documentation updates if needed. +- PRs should try to avoid unnecessary formatting changes or else keep them to their own PR that is just for formatting. +- PRs that change the CI pipeline or pre-commit hooks should generally be kept to their own PR. diff --git a/.github/instructions/json.instructions.md b/.github/instructions/json.instructions.md new file mode 100644 index 00000000000..979f81b0b39 --- /dev/null +++ b/.github/instructions/json.instructions.md @@ -0,0 +1,12 @@ +--- +applyTo: "**/*.json,**/*.jsonc,**.json5" +--- + +# JSON language instructions + +- Include instructions from [default.instructions.md](default.instructions.md) for all languages. + +- Files with a `.json` extension that are ARM templates or JSON schema files should be well formatted and valid JSON, without any comments, trailing commas, etc. +- Files with a `.json` extension that are VSCode settings files (e.g., inside the [.vscode](../../../.vscode)) or [.devcontainer](../../../.devcontainer) directories) should be well formatted and valid JSON, but may contain comments, trailing commas, etc. 
+- Files with a `.jsonc` or `.json5` extension should be well formatted and valid JSON5 or JSONC or JSON, and can include comments, trailing commas, etc. +- If a file is an `mlos_bench` config, it should have a `.mlos.jsonc` of `.mlos.json` or `.mlos.json5` extension, and should generally match the schemas defined in the [mlos_bench/configs/schemas/](../../../mlos_bench/mlos_bench/config/schemas/) directory (e.g., [mlos-bench-config-schema.json](../../../mlos_bench/mlos_bench/config/schemas/mlos-bench-config-schema.json)), unless it is a test config under the [tests/configs/schemas](../../../mlos_bench/mlos_bench/tests/configs/schemas/) directory. diff --git a/.github/instructions/markdown.instructions.md b/.github/instructions/markdown.instructions.md new file mode 100644 index 00000000000..2363d6129c7 --- /dev/null +++ b/.github/instructions/markdown.instructions.md @@ -0,0 +1,9 @@ +--- +applyTo: "**/*.md" +--- + +# Markdown language instructions + +- Include instructions from [default.instructions.md](default.instructions.md) for all languages. +- Documentation should include relative links to other documentation files whenever possible. +- Markdown files should be well formatted and valid Markdown conforming to markdownlint rules. diff --git a/.github/instructions/python.instructions.md b/.github/instructions/python.instructions.md new file mode 100644 index 00000000000..53b864ab827 --- /dev/null +++ b/.github/instructions/python.instructions.md @@ -0,0 +1,76 @@ +--- +applyTo: "**/*.py" +--- + +# Python language file instructions + +- Include instructions from [default.instructions.md](default.instructions.md) for all languages. +- All functions, methods, classes, and attributes should have docstrings. 
+- Docstrings should include Sphinx style crossref directives for functions, methods, classes, attributes, and data whenever possible using `:py:class:` or `:py:func:` or `:py:meth:` or `:py:attr:` or `:py:data` syntax, respectively, + + See Also + +- Docstrings for modules should include a summary of the module's purpose and any important details about its usage. + - Module docstrings should also include an executable example of how to use the module, including any important functions or classes or configuration options (especially those derived from a JSON config file) like any of those in `mlos_bench.environments`, `mlos_bench.services`, `mlos_bench.schedulers`, `mlos_bench.optimizers`, and `mlos_bench.storage`. + + For instance: + + ```python + ''' + This is an example module docstring for the mlos_bench.environments.my_special_env module. + + It should include some descriptive text about the module and its purpose. + + Example + ------- + It also includes some executable code examples. + + >>> import json5 as json + >>> # Load a JSON config string for a MySpecialEnvironment instance. + >>> json_string = """ + ... { + ... "class": "mlos_bench.environments.my_special_env.MySpecialEnvironment", + ... "name": "MySpecialEnvironment", + ... "config": { + ... "param1": 42, + ... "param2": "foo", + ... }, + ... } + ... """ + >>> config = json.loads(json_string) + + >>> from mlos_bench.environments.my_special_env import MySpecialEnvironment + >>> my_env = MySpecialEnvironment(config=config) + >>> print(my_env) + MySpecialEnvironment(param1=42, param2='foo') + ''' + ``` + + - Docstrings for classes can refer to their module docstring with `:py:mod:` cross-references for usage examples to allow easier browser navigation of generated documentation. + + For instance: + + ```python + class MySpecialEnv: + """ + This is class docstring for MySpecialEnv. + + It should include some descriptive text about the class and its purpose. 
+ + Example + ------- + Refer to to :py:mod:`mlos_bench.environments.my_special_env` for usage examples. + """ + ``` + +- If not all arguments to a function or method fit on the same line, then they should each be on their own line. + + Adding a trailing comma to the last argument is optional, but recommended for consistency whenever a single line is insufficient. + +- Code should be formatting using `black`. +- Code should be type checked using `mypy`. +- All function and method parameters should be type annotated. +- Code should be linted using `pylint`. + +- Tests should be included for all new code and should be run using `pytest`. +- Tests should be organized roughly the same way as the code they are testing (e.g., `tests/some/test_module.py` for `some/module.py`). diff --git a/.github/instructions/rst.instructions.md b/.github/instructions/rst.instructions.md new file mode 100644 index 00000000000..3237497501e --- /dev/null +++ b/.github/instructions/rst.instructions.md @@ -0,0 +1,10 @@ +--- +applyTo: "**/*.rst" +--- + +# RST markup language instructions + +- Include instructions from [default.instructions.md](default.instructions.md) for all languages. +- Documentation should include sphinx crossref directives for functions, methods, and classes whenever possible. 
+ + See Also: diff --git a/.vscode/settings.json b/.vscode/settings.json index 23e43fed683..8101f60b7ae 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,4 +1,4 @@ -// vim: set ft=jsonc: +// vim: set ft=json5: { "makefile.extensionOutputFolder": "./.vscode", "files.exclude": { @@ -170,5 +170,11 @@ "python.testing.unittestEnabled": false, "debugpy.debugJustMyCode": false, "python.analysis.autoImportCompletions": true, - "python.analysis.supportRestructuredText": true + "python.analysis.supportRestructuredText": true, + "makefile.configureOnOpen": false, + + "githubPullRequests.experimental.chat": true, + "github.copilot.chat.codesearch.enabled": true, + "github.copilot.chat.copilotDebugCommand.enabled": true, + "github.copilot.chat.reviewSelection.enabled": true } diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ef9ab4fdea6..27121a96ce5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -97,10 +97,12 @@ We expect development to follow a typical "forking" style workflow: Some notes on organizing changes to help reviewers: - 1. Please try to keep PRs small whenver possible and don't include unnecessaary formatting changes. + 1. Please try to keep PRs small whenever possible and don't include unnecessary formatting changes. 1. Larger changes can be planned in [Issues](https://github.com/microsoft/MLOS/issues), prototyped in a large draft PR for early feedback, and split into smaller PRs via discussion. 1. All changes should include test coverage (either new or existing). + > For additional advice on PR reviews, see [.github/instructions/](.github/instructions/) for Copilot instructions. + 1. PRs are associated with [Github Issues](https://github.com/microsoft/MLOS/issues) and need [MLOS-committers](https://github.com/orgs/microsoft/teams/MLOS-committers) to sign-off (in addition to other CI pipeline checks like tests and lint checks to pass). 1. Once approved, the PR can be completed using a squash merge in order to keep a nice linear history. 
From 3815ed117bd37db280a36967c2117031816dc245 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 8 May 2025 15:34:20 -0500 Subject: [PATCH 26/38] adding some custom prompts for github copilot to use --- .github/prompts/README.md | 29 +++++++++++ .github/prompts/TODO.md | 17 ++++++ ...fig-examples-to-module-docstring.prompt.md | 51 ++++++++++++++++++ ...dd-sphinx-crossref-to-docstrings.prompt.md | 52 +++++++++++++++++++ 4 files changed, 149 insertions(+) create mode 100644 .github/prompts/README.md create mode 100644 .github/prompts/TODO.md create mode 100644 .github/prompts/add-config-examples-to-module-docstring.prompt.md create mode 100644 .github/prompts/add-sphinx-crossref-to-docstrings.prompt.md diff --git a/.github/prompts/README.md b/.github/prompts/README.md new file mode 100644 index 00000000000..c9866e63142 --- /dev/null +++ b/.github/prompts/README.md @@ -0,0 +1,29 @@ +# Custom `.prompt.md` files for Github Copilot + +This directory contains custom `.prompt.md` files for Github Copilot. + +These files are used to customize the behavior of Github Copilot when generating code. + +The can be invoked with the `/custom-prompt-file-name-prefix` command in the Copilot Chat view (generally when in Agent mode). + +For instance: + +```txt +/add-sphinx-crossref-to-docstrings +``` + +will invoke the [`add-sphinx-crossref-to-docstrings.prompt.md`](./add-sphinx-crossref-to-docstrings.prompt.md) file. + +Some prompts take additional arguments to help Copilot understand the context of the code being generated or other action to take. + +## Types of Custom Prompts + +There are two types of custom prompts: + +1. Those for MLOS developers (e.g. `add-sphinx-crossref-to-docstrings.prompt.md`). +2. Those for MLOS users (e.g., `generate-mlos-configuration-file.prompt.md`). 
+ +## See Also + +- +- [TODO.md](./TODO.md) diff --git a/.github/prompts/TODO.md b/.github/prompts/TODO.md new file mode 100644 index 00000000000..7202d800289 --- /dev/null +++ b/.github/prompts/TODO.md @@ -0,0 +1,17 @@ +# Custom Prompt TODOs + +- MLOS + - [ ] Create experiment configs + - [ ] Create (or reuse) environments configs + - [ ] Use `include` directives with `CompositeEnv` to help structure and nest things. + - [ ] Create (or reuse) services configs + - [ ] Create (or reuse) ARM templates + - [ ] Create (or reuse) storage configs + - [ ] Create (or reuse) scheduler configs + +- mlos-autotuning-template + - [ ] Create new X config + - [ ] `include` config to do Y + - Add `globals` variables to configure X + - Create (or reuse) ARM template for X + - Make sure scripting commands are idempotent diff --git a/.github/prompts/add-config-examples-to-module-docstring.prompt.md b/.github/prompts/add-config-examples-to-module-docstring.prompt.md new file mode 100644 index 00000000000..532c893d148 --- /dev/null +++ b/.github/prompts/add-config-examples-to-module-docstring.prompt.md @@ -0,0 +1,51 @@ +# Custom Prompt: Add config examples to module docstrings + +Let's add config examples to module docstrings in this mlos_bench module. + +- Docstrings for modules should include a summary of the module's purpose and any important details about its usage. + - Module docstrings should also include an executable example of how to use the module, including any important functions or classes or configuration options (especially those derived from a JSON config file) like any of those in `mlos_bench.environments`, `mlos_bench.services`, `mlos_bench.schedulers`, `mlos_bench.optimizers`, and `mlos_bench.storage`. + + For instance: + + ```python + ''' + This is an example module docstring for the mlos_bench.environments.my_special_env module. + + It should include some descriptive text about the module and its purpose. 
+ + Example + ------- + It also includes some executable code examples. + + >>> import json5 as json + >>> # Load a JSON config string for a MySpecialEnvironment instance. + >>> json_string = """ + ... { + ... "class": "mlos_bench.environments.my_special_env.MySpecialEnvironment", + ... "name": "MySpecialEnvironment", + ... "config": { + ... "param1": 42, + ... "param2": "foo", + ... }, + ... } + ... """ + >>> config = json.loads(json_string) + + >>> from mlos_bench.environments.my_special_env import MySpecialEnvironment + >>> my_env = MySpecialEnvironment(config=config) + >>> print(my_env) + MySpecialEnvironment(param1=42, param2='foo') + ''' + ``` + + - Configuration options for these modules should be derived from a JSON, included as a string in the module docstring so users reading the documentation can easily copy/paste, but generally they are loaded from a separate `.mlos.jsonc` config file. + - The JSON config string should be formatted using `json5` to allow for comments and trailing commas. + - The JSON config options should conform to the relevant JSON schema for the module, usually defined in the [mlos_bench/configs/schemas](../../mlos_bench/mlos_bench/config/schemas/) directory. + For instance: + - For an `mlos_bench.environments` module, the JSON config options should conform to the [mlos_bench/configs/schemas/environments](../../mlos_bench/mlos_bench/config/schemas/environments/environment-schema.json) schema file. + - For an `mlos_bench.services` module, the JSON config options should conform to the [mlos_bench/configs/schemas/services](../../mlos_bench/mlos_bench/config/schemas/services/service-schema.json) schema file. + - For an `mlos_bench.schedulers` module, the JSON config options should conform to the [mlos_bench/configs/schemas/schedulers](../../mlos_bench/mlos_bench/config/schemas/schedulers/scheduler-schema.json) schema file. 
+ - For an `mlos_bench.storage` module, the JSON config options should conform to the [mlos_bench/configs/schemas/storage](../../mlos_bench/mlos_bench/config/schemas/storage/storage-schema.json) schema file. + - For an `mlos_bench.optimizers` module, the JSON config options should conform to the [mlos_bench/configs/schemas/optimizers](../../mlos_bench/mlos_bench/config/schemas/optimizers/optimizer-schema.json) schema file. + + - The other options that the config can take can often also be found in the parsing of the `config` argument in the `__init__` method body of the class, but they should be included in the module docstring as well. diff --git a/.github/prompts/add-sphinx-crossref-to-docstrings.prompt.md b/.github/prompts/add-sphinx-crossref-to-docstrings.prompt.md new file mode 100644 index 00000000000..e14c2d28be1 --- /dev/null +++ b/.github/prompts/add-sphinx-crossref-to-docstrings.prompt.md @@ -0,0 +1,52 @@ +# Custom Prompt: Add Sphinx Crossref Links to Python Docstrings + +Add Sphinx cross-references to python docstrings referencing classes or functions or methods or attributes or data in this file using `:py:class:` or `:py:func:` or `:py:meth:` or `:py:attr:` or `:py:data` syntax, respectively. + - See Also + +We don't need to do this for the parameter types listed in the Parameters or Returns sections of the docstring though. + +For example: + +```python +def example_function(param1: MyClass, param2: MyOtherClass) -> SomeOtherType: + """ + Example function working on an instance of MyClass and MyOtherClass. + + Parameters + ---------- + param1 : MyClass + An instance of MyClass. + param2 : MyOtherClass + An instance of MyOtherClass. + + Returns + ------- + SomeOtherType + An instance of SomeOtherType. + """ + pass +``` + +should be changed to: + +```python +def example_function(param1: MyClass, param2: MyOtherClass) -> SomeOtherType: + """ + Example function working on an instance of :py:class:`MyClass` and :py:class:`MyOtherClass`. 
+ + Uses the :py:meth:`MyClass.method_name` method and the :py:attr:`MyOtherClass.attribute_name` attribute. + + Parameters + ---------- + param1 : MyClass + An instance of :py:class:`MyClass`. + param2 : MyOtherClass + An instance of :py:class:`MyOtherClass`. + + Returns + ------- + SomeOtherType + An instance of :py:class:`SomeOtherType`. + """ + pass +``` From f177f56aba7d2bdf116aa0c6c03a4988c174c596 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 8 May 2025 20:34:50 +0000 Subject: [PATCH 27/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .github/instructions/README.md | 4 +- .github/instructions/bash.instructions.md | 14 ++- .github/instructions/default.instructions.md | 2 +- .github/instructions/json.instructions.md | 5 +- .github/instructions/markdown.instructions.md | 2 +- .github/instructions/python.instructions.md | 101 ++++++++++-------- .github/instructions/rst.instructions.md | 3 +- .github/prompts/README.md | 2 +- .github/prompts/TODO.md | 2 + ...fig-examples-to-module-docstring.prompt.md | 91 ++++++++-------- ...dd-sphinx-crossref-to-docstrings.prompt.md | 2 +- 11 files changed, 128 insertions(+), 100 deletions(-) diff --git a/.github/instructions/README.md b/.github/instructions/README.md index 5b10d1b2df3..9675885d684 100644 --- a/.github/instructions/README.md +++ b/.github/instructions/README.md @@ -7,11 +7,11 @@ The instructions are designed to guide the AI in providing responses that align - Language specific instructions go in their own file, with the cross task instructions in the root of this directory. - (e.g., [python.instructions.md](python.instructions.md), [bash.instructions.md](bash.instructions.md), [markdown.instructions.md](markdown.instructions.md), [json.instructions.md](json.instructions.md)). 
+ (e.g., [python.instructions.md](python.instructions.md), [bash.instructions.md](bash.instructions.md), [markdown.instructions.md](markdown.instructions.md), [json.instructions.md](json.instructions.md)). - Instructions relevant to all languages go in the `default.instructions.md` file. - (e.g., [default.instructions.md](default.instructions.md)). + (e.g., [default.instructions.md](default.instructions.md)). ## See Also diff --git a/.github/instructions/bash.instructions.md b/.github/instructions/bash.instructions.md index 55fd976c2a0..02b863f55bf 100644 --- a/.github/instructions/bash.instructions.md +++ b/.github/instructions/bash.instructions.md @@ -1,5 +1,5 @@ --- -applyTo: "**/*.sh" +applyTo: '**/*.sh' --- # Bash shell scripting language instructions @@ -7,15 +7,27 @@ applyTo: "**/*.sh" - Include instructions from [default.instructions.md](default.instructions.md) for all languages. - Scripts schould use `set -e` or `set -o errexit` to exit on error. + - Scripts should use use `set -u` or `set -o nounset` to exit on unset variables. + - Scripts should use `set -o pipefail` to exit on errors in pipelines. + - Commands should be checked for non-zero exit codes and either handled or reported. + - Scripts should use portable syntax for MacOS vs. Linux + - Scripts should validate input. + - Scripts should include usage instructions. + - Scripts should be executable (e.g., `chmod +x`). + - Scripts should include a shebang line (e.g., `#!/usr/bin/env bash`). + - Scripts should be well commented. + - Scripts should include documentation updates if needed. + - Scripts should be well formatted. + - `if` `then` statements should be on the same line. 
diff --git a/.github/instructions/default.instructions.md b/.github/instructions/default.instructions.md index dd290695c73..573ef7e2708 100644 --- a/.github/instructions/default.instructions.md +++ b/.github/instructions/default.instructions.md @@ -1,5 +1,5 @@ --- -applyTo: "**" +applyTo: '**' --- # Default language review selection instructions diff --git a/.github/instructions/json.instructions.md b/.github/instructions/json.instructions.md index 979f81b0b39..49ed88ab035 100644 --- a/.github/instructions/json.instructions.md +++ b/.github/instructions/json.instructions.md @@ -1,5 +1,5 @@ --- -applyTo: "**/*.json,**/*.jsonc,**.json5" +applyTo: '**/*.json,**/*.jsonc,**.json5' --- # JSON language instructions @@ -7,6 +7,9 @@ applyTo: "**/*.json,**/*.jsonc,**.json5" - Include instructions from [default.instructions.md](default.instructions.md) for all languages. - Files with a `.json` extension that are ARM templates or JSON schema files should be well formatted and valid JSON, without any comments, trailing commas, etc. + - Files with a `.json` extension that are VSCode settings files (e.g., inside the [.vscode](../../../.vscode)) or [.devcontainer](../../../.devcontainer) directories) should be well formatted and valid JSON, but may contain comments, trailing commas, etc. + - Files with a `.jsonc` or `.json5` extension should be well formatted and valid JSON5 or JSONC or JSON, and can include comments, trailing commas, etc. + - If a file is an `mlos_bench` config, it should have a `.mlos.jsonc` of `.mlos.json` or `.mlos.json5` extension, and should generally match the schemas defined in the [mlos_bench/configs/schemas/](../../../mlos_bench/mlos_bench/config/schemas/) directory (e.g., [mlos-bench-config-schema.json](../../../mlos_bench/mlos_bench/config/schemas/mlos-bench-config-schema.json)), unless it is a test config under the [tests/configs/schemas](../../../mlos_bench/mlos_bench/tests/configs/schemas/) directory. 
diff --git a/.github/instructions/markdown.instructions.md b/.github/instructions/markdown.instructions.md index 2363d6129c7..1b2a450a68f 100644 --- a/.github/instructions/markdown.instructions.md +++ b/.github/instructions/markdown.instructions.md @@ -1,5 +1,5 @@ --- -applyTo: "**/*.md" +applyTo: '**/*.md' --- # Markdown language instructions diff --git a/.github/instructions/python.instructions.md b/.github/instructions/python.instructions.md index 53b864ab827..5ace2693a94 100644 --- a/.github/instructions/python.instructions.md +++ b/.github/instructions/python.instructions.md @@ -1,76 +1,83 @@ --- -applyTo: "**/*.py" +applyTo: '**/*.py' --- # Python language file instructions - Include instructions from [default.instructions.md](default.instructions.md) for all languages. + - All functions, methods, classes, and attributes should have docstrings. + - Docstrings should include Sphinx style crossref directives for functions, methods, classes, attributes, and data whenever possible using `:py:class:` or `:py:func:` or `:py:meth:` or `:py:attr:` or `:py:data` syntax, respectively, See Also - Docstrings for modules should include a summary of the module's purpose and any important details about its usage. - - Module docstrings should also include an executable example of how to use the module, including any important functions or classes or configuration options (especially those derived from a JSON config file) like any of those in `mlos_bench.environments`, `mlos_bench.services`, `mlos_bench.schedulers`, `mlos_bench.optimizers`, and `mlos_bench.storage`. - For instance: + - Module docstrings should also include an executable example of how to use the module, including any important functions or classes or configuration options (especially those derived from a JSON config file) like any of those in `mlos_bench.environments`, `mlos_bench.services`, `mlos_bench.schedulers`, `mlos_bench.optimizers`, and `mlos_bench.storage`. 
+ + For instance: + + ```python + ''' + This is an example module docstring for the mlos_bench.environments.my_special_env module. + + It should include some descriptive text about the module and its purpose. + + Example + ------- + It also includes some executable code examples. + + >>> import json5 as json + >>> # Load a JSON config string for a MySpecialEnvironment instance. + >>> json_string = """ + ... { + ... "class": "mlos_bench.environments.my_special_env.MySpecialEnvironment", + ... "name": "MySpecialEnvironment", + ... "config": { + ... "param1": 42, + ... "param2": "foo", + ... }, + ... } + ... """ + >>> config = json.loads(json_string) - ```python - ''' - This is an example module docstring for the mlos_bench.environments.my_special_env module. + >>> from mlos_bench.environments.my_special_env import MySpecialEnvironment + >>> my_env = MySpecialEnvironment(config=config) + >>> print(my_env) + MySpecialEnvironment(param1=42, param2='foo') + ''' + ``` - It should include some descriptive text about the module and its purpose. + - Docstrings for classes can refer to their module docstring with `:py:mod:` cross-references for usage examples to allow easier browser navigation of generated documentation. + + For instance: + + ```python + class MySpecialEnv: + """ + This is class docstring for MySpecialEnv. + + It should include some descriptive text about the class and its purpose. Example ------- - It also includes some executable code examples. - - >>> import json5 as json - >>> # Load a JSON config string for a MySpecialEnvironment instance. - >>> json_string = """ - ... { - ... "class": "mlos_bench.environments.my_special_env.MySpecialEnvironment", - ... "name": "MySpecialEnvironment", - ... "config": { - ... "param1": 42, - ... "param2": "foo", - ... }, - ... } - ... 
""" - >>> config = json.loads(json_string) - - >>> from mlos_bench.environments.my_special_env import MySpecialEnvironment - >>> my_env = MySpecialEnvironment(config=config) - >>> print(my_env) - MySpecialEnvironment(param1=42, param2='foo') - ''' - ``` - - - Docstrings for classes can refer to their module docstring with `:py:mod:` cross-references for usage examples to allow easier browser navigation of generated documentation. - - For instance: - - ```python - class MySpecialEnv: - """ - This is class docstring for MySpecialEnv. - - It should include some descriptive text about the class and its purpose. - - Example - ------- - Refer to to :py:mod:`mlos_bench.environments.my_special_env` for usage examples. - """ - ``` + Refer to to :py:mod:`mlos_bench.environments.my_special_env` for usage examples. + """ + ``` - If not all arguments to a function or method fit on the same line, then they should each be on their own line. - Adding a trailing comma to the last argument is optional, but recommended for consistency whenever a single line is insufficient. + Adding a trailing comma to the last argument is optional, but recommended for consistency whenever a single line is insufficient. - Code should be formatting using `black`. + - Code should be type checked using `mypy`. + - All function and method parameters should be type annotated. + - Code should be linted using `pylint`. - Tests should be included for all new code and should be run using `pytest`. + - Tests should be organized roughly the same way as the code they are testing (e.g., `tests/some/test_module.py` for `some/module.py`). 
diff --git a/.github/instructions/rst.instructions.md b/.github/instructions/rst.instructions.md index 3237497501e..1096e8b7a28 100644 --- a/.github/instructions/rst.instructions.md +++ b/.github/instructions/rst.instructions.md @@ -1,10 +1,11 @@ --- -applyTo: "**/*.rst" +applyTo: '**/*.rst' --- # RST markup language instructions - Include instructions from [default.instructions.md](default.instructions.md) for all languages. + - Documentation should include sphinx crossref directives for functions, methods, and classes whenever possible. See Also: diff --git a/.github/prompts/README.md b/.github/prompts/README.md index c9866e63142..05d11399989 100644 --- a/.github/prompts/README.md +++ b/.github/prompts/README.md @@ -21,7 +21,7 @@ Some prompts take additional arguments to help Copilot understand the context of There are two types of custom prompts: 1. Those for MLOS developers (e.g. `add-sphinx-crossref-to-docstrings.prompt.md`). -2. Those for MLOS users (e.g., `generate-mlos-configuration-file.prompt.md`). +1. Those for MLOS users (e.g., `generate-mlos-configuration-file.prompt.md`). ## See Also diff --git a/.github/prompts/TODO.md b/.github/prompts/TODO.md index 7202d800289..4deda2addac 100644 --- a/.github/prompts/TODO.md +++ b/.github/prompts/TODO.md @@ -1,6 +1,7 @@ # Custom Prompt TODOs - MLOS + - [ ] Create experiment configs - [ ] Create (or reuse) environments configs - [ ] Use `include` directives with `CompositeEnv` to help structure and nest things. 
@@ -10,6 +11,7 @@ - [ ] Create (or reuse) scheduler configs - mlos-autotuning-template + - [ ] Create new X config - [ ] `include` config to do Y - Add `globals` variables to configure X diff --git a/.github/prompts/add-config-examples-to-module-docstring.prompt.md b/.github/prompts/add-config-examples-to-module-docstring.prompt.md index 532c893d148..e9007eff197 100644 --- a/.github/prompts/add-config-examples-to-module-docstring.prompt.md +++ b/.github/prompts/add-config-examples-to-module-docstring.prompt.md @@ -3,49 +3,52 @@ Let's add config examples to module docstrings in this mlos_bench module. - Docstrings for modules should include a summary of the module's purpose and any important details about its usage. - - Module docstrings should also include an executable example of how to use the module, including any important functions or classes or configuration options (especially those derived from a JSON config file) like any of those in `mlos_bench.environments`, `mlos_bench.services`, `mlos_bench.schedulers`, `mlos_bench.optimizers`, and `mlos_bench.storage`. - + - Module docstrings should also include an executable example of how to use the module, including any important functions or classes or configuration options (especially those derived from a JSON config file) like any of those in `mlos_bench.environments`, `mlos_bench.services`, `mlos_bench.schedulers`, `mlos_bench.optimizers`, and `mlos_bench.storage`. + + For instance: + + ```python + ''' + This is an example module docstring for the mlos_bench.environments.my_special_env module. + + It should include some descriptive text about the module and its purpose. + + Example + ------- + It also includes some executable code examples. + + >>> import json5 as json + >>> # Load a JSON config string for a MySpecialEnvironment instance. + >>> json_string = """ + ... { + ... "class": "mlos_bench.environments.my_special_env.MySpecialEnvironment", + ... "name": "MySpecialEnvironment", + ... "config": { + ... 
"param1": 42, + ... "param2": "foo", + ... }, + ... } + ... """ + >>> config = json.loads(json_string) + + >>> from mlos_bench.environments.my_special_env import MySpecialEnvironment + >>> my_env = MySpecialEnvironment(config=config) + >>> print(my_env) + MySpecialEnvironment(param1=42, param2='foo') + ''' + ``` + + - Configuration options for these modules should be derived from a JSON, included as a string in the module docstring so users reading the documentation can easily copy/paste, but generally they are loaded from a separate `.mlos.jsonc` config file. + + - The JSON config string should be formatted using `json5` to allow for comments and trailing commas. + + - The JSON config options should conform to the relevant JSON schema for the module, usually defined in the [mlos_bench/configs/schemas](../../mlos_bench/mlos_bench/config/schemas/) directory. For instance: - ```python - ''' - This is an example module docstring for the mlos_bench.environments.my_special_env module. - - It should include some descriptive text about the module and its purpose. - - Example - ------- - It also includes some executable code examples. - - >>> import json5 as json - >>> # Load a JSON config string for a MySpecialEnvironment instance. - >>> json_string = """ - ... { - ... "class": "mlos_bench.environments.my_special_env.MySpecialEnvironment", - ... "name": "MySpecialEnvironment", - ... "config": { - ... "param1": 42, - ... "param2": "foo", - ... }, - ... } - ... """ - >>> config = json.loads(json_string) - - >>> from mlos_bench.environments.my_special_env import MySpecialEnvironment - >>> my_env = MySpecialEnvironment(config=config) - >>> print(my_env) - MySpecialEnvironment(param1=42, param2='foo') - ''' - ``` - - - Configuration options for these modules should be derived from a JSON, included as a string in the module docstring so users reading the documentation can easily copy/paste, but generally they are loaded from a separate `.mlos.jsonc` config file. 
- - The JSON config string should be formatted using `json5` to allow for comments and trailing commas. - - The JSON config options should conform to the relevant JSON schema for the module, usually defined in the [mlos_bench/configs/schemas](../../mlos_bench/mlos_bench/config/schemas/) directory. - For instance: - - For an `mlos_bench.environments` module, the JSON config options should conform to the [mlos_bench/configs/schemas/environments](../../mlos_bench/mlos_bench/config/schemas/environments/environment-schema.json) schema file. - - For an `mlos_bench.services` module, the JSON config options should conform to the [mlos_bench/configs/schemas/services](../../mlos_bench/mlos_bench/config/schemas/services/service-schema.json) schema file. - - For an `mlos_bench.schedulers` module, the JSON config options should conform to the [mlos_bench/configs/schemas/schedulers](../../mlos_bench/mlos_bench/config/schemas/schedulers/scheduler-schema.json) schema file. - - For an `mlos_bench.storage` module, the JSON config options should conform to the [mlos_bench/configs/schemas/storage](../../mlos_bench/mlos_bench/config/schemas/storage/storage-schema.json) schema file. - - For an `mlos_bench.optimizers` module, the JSON config options should conform to the [mlos_bench/configs/schemas/optimizers](../../mlos_bench/mlos_bench/config/schemas/optimizers/optimizer-schema.json) schema file. - - - The other options that the config can take can often also be found in the parsing of the `config` argument in the `__init__` method body of the class, but they should be included in the module docstring as well. + - For an `mlos_bench.environments` module, the JSON config options should conform to the [mlos_bench/configs/schemas/environments](../../mlos_bench/mlos_bench/config/schemas/environments/environment-schema.json) schema file. 
+ - For an `mlos_bench.services` module, the JSON config options should conform to the [mlos_bench/configs/schemas/services](../../mlos_bench/mlos_bench/config/schemas/services/service-schema.json) schema file. + - For an `mlos_bench.schedulers` module, the JSON config options should conform to the [mlos_bench/configs/schemas/schedulers](../../mlos_bench/mlos_bench/config/schemas/schedulers/scheduler-schema.json) schema file. + - For an `mlos_bench.storage` module, the JSON config options should conform to the [mlos_bench/configs/schemas/storage](../../mlos_bench/mlos_bench/config/schemas/storage/storage-schema.json) schema file. + - For an `mlos_bench.optimizers` module, the JSON config options should conform to the [mlos_bench/configs/schemas/optimizers](../../mlos_bench/mlos_bench/config/schemas/optimizers/optimizer-schema.json) schema file. + + - The other options that the config can take can often also be found in the parsing of the `config` argument in the `__init__` method body of the class, but they should be included in the module docstring as well. diff --git a/.github/prompts/add-sphinx-crossref-to-docstrings.prompt.md b/.github/prompts/add-sphinx-crossref-to-docstrings.prompt.md index e14c2d28be1..a6e7203e5ac 100644 --- a/.github/prompts/add-sphinx-crossref-to-docstrings.prompt.md +++ b/.github/prompts/add-sphinx-crossref-to-docstrings.prompt.md @@ -1,7 +1,7 @@ # Custom Prompt: Add Sphinx Crossref Links to Python Docstrings Add Sphinx cross-references to python docstrings referencing classes or functions or methods or attributes or data in this file using `:py:class:` or `:py:func:` or `:py:meth:` or `:py:attr:` or `:py:data` syntax, respectively. - - See Also +\- See Also We don't need to do this for the parameter types listed in the Parameters or Returns sections of the docstring though. 
From b482564fe574fb3eee87756c5e1a1fbfc5ec81c2 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 8 May 2025 17:13:44 -0500 Subject: [PATCH 28/38] don't remove unused imports - it's annoying --- .vscode/settings.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 8101f60b7ae..f791711652c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -147,7 +147,7 @@ "[python]": { "editor.codeActionsOnSave": { "source.organizeImports": "explicit", - "source.unusedImports": "explicit" + //"source.unusedImports": "explicit" }, "editor.defaultFormatter": "ms-python.black-formatter", "editor.formatOnSave": true, From dd2faee5ff980f6ea22bd3df53a6a7206a9f9ff3 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 8 May 2025 17:15:24 -0500 Subject: [PATCH 29/38] Make sure to handle stragger trials when re-loading the storage --- .../mlos_bench/schedulers/base_scheduler.py | 38 +++++++++++++--- mlos_bench/mlos_bench/storage/base_storage.py | 22 ++++++++- .../mlos_bench/storage/sql/experiment.py | 45 +++++++++++++++++-- 3 files changed, 94 insertions(+), 11 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/base_scheduler.py b/mlos_bench/mlos_bench/schedulers/base_scheduler.py index c9e6d8fa666..84c2338f6f0 100644 --- a/mlos_bench/mlos_bench/schedulers/base_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/base_scheduler.py @@ -100,8 +100,9 @@ def __init__( # pylint: disable=too-many-arguments self._optimizer = optimizer self._storage = storage self._root_env_config = root_env_config - self._last_trial_id = -1 + self._longest_finished_trial_sequence_id = -1 self._ran_trials: list[Storage.Trial] = [] + self._registered_trial_ids: set[int] = set() _LOG.debug("Scheduler instantiated: %s :: %s", self, config) @@ -265,7 +266,10 @@ def start(self) -> None: not_done: bool = True while not_done: - _LOG.info("Optimization loop: Last trial ID: %d", self._last_trial_id) + _LOG.info( + "Optimization 
loop: Longest finished trial sequence ID: %d", + self._longest_finished_trial_sequence_id, + ) self.run_schedule(is_warm_up) not_done = self.add_new_optimizer_suggestions() self.assign_trial_runners( @@ -327,14 +331,33 @@ def add_new_optimizer_suggestions(self) -> bool: assert self.experiment is not None # Load the results of the trials that have been run since the last time # we queried the Optimizer. - # FIXME: This can miss some straggler results from parallel trial - # executions. - # Maybe just maintain a set? - (trial_ids, configs, scores, status) = self.experiment.load(self._last_trial_id) + # Note: We need to handle the case of straggler trials that finish out of order. + (trial_ids, configs, scores, status) = self.experiment.load( + last_trial_id=self._longest_finished_trial_sequence_id, + omit_registered_trial_ids=self._registered_trial_ids, + ) _LOG.info("QUEUE: Update the optimizer with trial results: %s", trial_ids) self.optimizer.bulk_register(configs, scores, status) - self._last_trial_id = max(trial_ids, default=self._last_trial_id) + # Mark those trials as registered so we don't load them again. + self._registered_trial_ids.update(trial_ids) + # Update the longest finished trial sequence ID. + self._longest_finished_trial_sequence_id = max( + [ + self.experiment.get_longest_prefix_finished_trial_id(), + self._longest_finished_trial_sequence_id, + ], + default=self._longest_finished_trial_sequence_id, + ) + # Remove trial ids that are older than the longest finished trial sequence ID. + # This is an optimization to avoid a long list of trial ids to omit from + # the load() operation or a long list of trial ids to maintain in memory. + self._registered_trial_ids = { + trial_id + for trial_id in self._registered_trial_ids + if trial_id > self._longest_finished_trial_sequence_id + } + # Check if the optimizer has converged or not. 
not_done = self.not_done() if not_done: tunables = self.optimizer.suggest() @@ -527,6 +550,7 @@ def not_done(self) -> bool: By default, stop when the :py:class:`.Optimizer` converges or the limit of :py:attr:`~.Scheduler.max_trials` is reached. """ + # TODO: Add more stopping conditions: https://github.com/microsoft/MLOS/issues/427 return self.optimizer.not_converged() and ( self._trial_count < self._max_trials or self._max_trials <= 0 ) diff --git a/mlos_bench/mlos_bench/storage/base_storage.py b/mlos_bench/mlos_bench/storage/base_storage.py index 428203b26e5..d2fde6e9f1f 100644 --- a/mlos_bench/mlos_bench/storage/base_storage.py +++ b/mlos_bench/mlos_bench/storage/base_storage.py @@ -26,7 +26,7 @@ import logging from abc import ABCMeta, abstractmethod -from collections.abc import Iterator, Mapping +from collections.abc import Iterator, Mapping, Iterable from contextlib import AbstractContextManager as ContextManager from datetime import datetime from types import TracebackType @@ -305,10 +305,20 @@ def load_telemetry(self, trial_id: int) -> list[tuple[datetime, str, Any]]: Telemetry data. """ + @abstractmethod + def get_longest_prefix_finished_trial_id(self) -> int: + """ + Calculate the last trial ID for the experiment. + + This is used to determine the last trial ID that finished (failed or + successful) such that all Trials before it are also finished. + """ + @abstractmethod def load( self, last_trial_id: int = -1, + omit_registered_trial_ids: Iterable[int] | None = None, ) -> tuple[list[int], list[dict], list[dict[str, Any] | None], list[Status]]: """ Load (tunable values, benchmark scores, status) to warm-up the optimizer. @@ -317,10 +327,20 @@ def load( that were scheduled *after* the given trial ID. Otherwise, return data from ALL merged-in experiments and attempt to impute the missing tunable values. + Additionally, if `omit_registered_trial_ids` is provided, omit the + trials matching those ids. 
+ + The parameters together allow us to efficiently load data from + finished trials that we haven't registered with the Optimizer yet + for bulk registering. + Parameters ---------- last_trial_id : int (Optional) Trial ID to start from. + omit_registered_trial_ids : Iterable[int] | None = None, + (Optional) List of trial IDs to omit. If None, load all trials. + Returns ------- diff --git a/mlos_bench/mlos_bench/storage/sql/experiment.py b/mlos_bench/mlos_bench/storage/sql/experiment.py index 1dd4d452527..6d85b8730eb 100644 --- a/mlos_bench/mlos_bench/storage/sql/experiment.py +++ b/mlos_bench/mlos_bench/storage/sql/experiment.py @@ -8,7 +8,7 @@ import hashlib import logging -from collections.abc import Iterator +from collections.abc import Iterator, Iterable from datetime import datetime from typing import Any, Literal @@ -153,13 +153,43 @@ def load_telemetry(self, trial_id: int) -> list[tuple[datetime, str, Any]]: for row in cur_telemetry.fetchall() ] + # TODO: Add a test for this method. + def get_longest_prefix_finished_trial_id(self) -> int: + with self._engine.connect() as conn: + # Get the first (minimum) trial ID with an unfinished status. + first_unfinished_trial_id_stmt = ( + self._schema.trial.select() + .with_only_columns( + func.min(self._schema.trial.c.trial_id), + ) + .where( + self._schema.trial.c.exp_id == self._experiment_id, + func.not_( + self._schema.trial.c.status.in_( + [ + Status.SUCCEEDED.name, + Status.FAILED.name, + Status.TIMED_OUT.name, + ] + ), + ), + ) + ) + + max_trial_id = conn.execute(first_unfinished_trial_id_stmt).scalar() + if max_trial_id is None: + return -1 + # Return one less than the first unfinished trial ID - it should be + # finished (or not exist, which is fine as a limit). 
+ return int(max_trial_id) - 1 + def load( self, last_trial_id: int = -1, + omit_registered_trial_ids: Iterable[int] | None = None, ) -> tuple[list[int], list[dict], list[dict[str, Any] | None], list[Status]]: - with self._engine.connect() as conn: - cur_trials = conn.execute( + stmt = ( self._schema.trial.select() .with_only_columns( self._schema.trial.c.trial_id, @@ -182,6 +212,15 @@ def load( ) ) + # TODO: Add a test for this parameter. + + # Note: if we have a very large number of trials, this may encounter + # SQL text length limits, so we may need to chunk this. + if omit_registered_trial_ids is not None: + stmt = stmt.where(self._schema.trial.c.trial_id.notin_(omit_registered_trial_ids)) + + cur_trials = conn.execute(stmt) + trial_ids: list[int] = [] configs: list[dict[str, Any]] = [] scores: list[dict[str, Any] | None] = [] From 24cdf0cad1b5734108503f1086551d391373d528 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 8 May 2025 17:15:50 -0500 Subject: [PATCH 30/38] stubbing out 'wait_for_trial_runners' support in base class --- .../mlos_bench/schedulers/base_scheduler.py | 12 ++++++++++++ .../schedulers/parallel_scheduler.py | 19 +++++++++++++++++++ .../mlos_bench/schedulers/sync_scheduler.py | 7 +++++++ 3 files changed, 38 insertions(+) diff --git a/mlos_bench/mlos_bench/schedulers/base_scheduler.py b/mlos_bench/mlos_bench/schedulers/base_scheduler.py index 84c2338f6f0..5229738f1fe 100644 --- a/mlos_bench/mlos_bench/schedulers/base_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/base_scheduler.py @@ -28,6 +28,7 @@ class Scheduler(ContextManager, metaclass=ABCMeta): # pylint: disable=too-many-instance-attributes + # pylint: disable=too-many-public-methods """Base class for the optimization loop scheduling policies.""" def __init__( # pylint: disable=too-many-arguments @@ -271,6 +272,7 @@ def start(self) -> None: self._longest_finished_trial_sequence_id, ) self.run_schedule(is_warm_up) + self.wait_for_trial_runners() not_done = 
self.add_new_optimizer_suggestions() self.assign_trial_runners( self.experiment.pending_trials( @@ -281,6 +283,16 @@ def start(self) -> None: ) is_warm_up = False + @abstractmethod + def wait_for_trial_runners(self) -> None: + """ + Wait for (enough) TrialRunners to finish. + + This is a blocking call that waits for enough of the the TrialRunners to finish. + The base class implementation waits for all of the TrialRunners to finish. + However this can be overridden in subclasses to implement a more asynchronous behavior. + """ + def teardown(self) -> None: """ Tear down the TrialRunners/Environment(s). diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index c0e5e1f49cd..7df4b3dea37 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -108,6 +108,25 @@ def __init__( # pylint: disable=too-many-arguments root_env_config=root_env_config, ) + # TODO: Add schema support for this config. + self._idle_worker_scheduling_batch_size = int( + # By default wait for 1 idle workers before scheduling new trials. + config.get("idle_worker_scheduling_batch_size", 1) + ) + # Never wait for more than the number of trial runners. + self._idle_worker_scheduling_batch_size = min( + self._idle_worker_scheduling_batch_size, + len(self._trial_runners), + ) + if self._idle_worker_scheduling_batch_size < 1: + _LOG.warning( + "Idle worker scheduling is set to %d, which is less than 1. " + f"Setting it to number of TrialRunners {len(self._trial_runners)}.", + self._idle_worker_scheduling_batch_size, + ) + self._idle_worker_scheduling_batch_size = len(self._trial_runners) + + # TODO: Add schema support for this config. self._polling_interval = float(config.get("polling_interval", 1.0)) # TODO: Setup logging for the child processes via a logging queue. 
diff --git a/mlos_bench/mlos_bench/schedulers/sync_scheduler.py b/mlos_bench/mlos_bench/schedulers/sync_scheduler.py index a6e7ee002a1..6ea2c5fe1ba 100644 --- a/mlos_bench/mlos_bench/schedulers/sync_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/sync_scheduler.py @@ -31,3 +31,10 @@ def run_trial(self, trial: Storage.Trial) -> None: with trial_runner: trial_runner.run_trial(trial, self.global_config) _LOG.info("QUEUE: Finished trial: %s on %s", trial, trial_runner) + + def wait_for_trial_runners(self) -> None: + # The default base implementation of wait_for_trial_runners() is a no-op + # because trial_runner.run_trial() is blocking so SyncScheduler only + # runs a single trial at a time. + # pylint: disable=useless-super-delegation + super().wait_for_trial_runners() From ff5fcc26ac2d200288fdab878cfe72ba69a1aa56 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 8 May 2025 17:30:23 -0500 Subject: [PATCH 31/38] make to wait for all trial runners to finish --- .../mlos_bench/schedulers/base_scheduler.py | 10 ++- .../schedulers/parallel_scheduler.py | 61 +++++++++++++------ 2 files changed, 53 insertions(+), 18 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/base_scheduler.py b/mlos_bench/mlos_bench/schedulers/base_scheduler.py index 5229738f1fe..5da39fe8974 100644 --- a/mlos_bench/mlos_bench/schedulers/base_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/base_scheduler.py @@ -282,15 +282,23 @@ def start(self) -> None: ) ) is_warm_up = False + self.wait_for_trial_runners(wait_all=True) @abstractmethod - def wait_for_trial_runners(self) -> None: + def wait_for_trial_runners(self, wait_all: bool = False) -> None: """ Wait for (enough) TrialRunners to finish. This is a blocking call that waits for enough of the the TrialRunners to finish. The base class implementation waits for all of the TrialRunners to finish. However this can be overridden in subclasses to implement a more asynchronous behavior. 
+ + Parameters + ---------- + wait_all : bool + If True, wait for all TrialRunners to finish. + If False, wait for "enough" TrialRunners to finish (which for the + base class is all of them). """ def teardown(self) -> None: diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index 7df4b3dea37..f4d1bfd83ee 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -155,6 +155,15 @@ def __init__( # pylint: disable=too-many-arguments This also helps us to gather AsyncResults from each worker. """ + @property + def idle_worker_scheduling_batch_size(self) -> int: + """ + Get the batch size for idle worker scheduling. + + This is the number of idle workers to wait for before scheduling new trials. + """ + return self._idle_worker_scheduling_batch_size + def _get_idle_trial_runners_count(self) -> int: """ Return a count of idle trial runners. @@ -348,25 +357,43 @@ def run_schedule(self, running: bool = False) -> None: for trial in scheduled_trials: self.run_trial(trial) # Now all available trial should be started in the background. + # We can move on to wait_trial_runners() to wait for some to finish. + + def wait_for_trial_runners(self, wait_all: bool = False) -> None: + """ + Wait for all :py:class:`.TrialRunner`s to finish running. - # Wait for all trial runners to finish. - while self._has_running_trial_runners(): - sleep(self._polling_interval) - - # NOTE: This organization is blocking in that it will wait for *all* - # scheduled trials to finish running prior to scheduling more. - # TODO: This can be improved. - # For instance: - # 1. Allow eagerly scheduling new trials in the callback immediately - # after one finishes (maybe make this a configurable option). - # 2. Run the above in a while loop to continually evaluate for newly - # scheduled trials that are available to run. 
- # Alternatively: - # We can move the "has_running_trial_runners" check to the start() - # method and allow this return eagerly (so that it becomes more like - # "start_schedule"). + This is a blocking call that will wait for all trial runners to finish + running before returning. - assert self._get_idle_trial_runners_count() == len(self._trial_runners) + Parameters + ---------- + wait_all : bool + If True, wait for all trial runners to finish. If False, wait for + :py:attr:`~.TrialRunner.idle_worker_scheduling_batch_size` number of + idle trial runners to finish. Default is False. + + Notes + ----- + This is called in the parent process, so it must not block the main + thread. + """ + assert not is_child_process(), "This should be called in the parent process." + if wait_all: + # Wait for all trial runners to finish. + _LOG.info("Waiting for all trial runners to finish.") + while self._has_running_trial_runners(): + sleep(self._polling_interval) + assert not self._has_running_trial_runners(), "All trial runners should be idle." + else: + # Wait for a batch of idle trial runners to finish. 
+ _LOG.info( + "Waiting for %d idle trial runners to finish.", + self._idle_worker_scheduling_batch_size, + ) + while self._get_idle_trial_runners_count() < self._idle_worker_scheduling_batch_size: + sleep(self._polling_interval) + assert self._get_idle_trial_runners_count() >= self._idle_worker_scheduling_batch_size @staticmethod def teardown_trial_runner(trial_runner: TrialRunner) -> TrialRunnerResult: From 30ee291793f6bbba71ee78fd79a43e469f544e27 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Thu, 8 May 2025 17:30:32 -0500 Subject: [PATCH 32/38] comments --- mlos_bench/mlos_bench/schedulers/base_scheduler.py | 3 ++- mlos_bench/mlos_bench/schedulers/parallel_scheduler.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlos_bench/mlos_bench/schedulers/base_scheduler.py b/mlos_bench/mlos_bench/schedulers/base_scheduler.py index 5da39fe8974..11db5a88bdb 100644 --- a/mlos_bench/mlos_bench/schedulers/base_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/base_scheduler.py @@ -380,6 +380,7 @@ def add_new_optimizer_suggestions(self) -> bool: # Check if the optimizer has converged or not. not_done = self.not_done() if not_done: + # TODO: Allow scheduling multiple configs at once (e.g., in the case of idle workers). tunables = self.optimizer.suggest() self.add_trial_to_queue(tunables) return not_done @@ -392,7 +393,7 @@ def add_trial_to_queue( """ Add a configuration to the queue of trials 1 or more times. 
- (e.g., according to the :py:attr:`.trial_config_repeat_count`) + (e.g., according to the :py:attr:`~.Scheduler.trial_config_repeat_count`) Parameters ---------- diff --git a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py index f4d1bfd83ee..1faf6fbb136 100644 --- a/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py +++ b/mlos_bench/mlos_bench/schedulers/parallel_scheduler.py @@ -276,7 +276,6 @@ def _run_trial_on_trial_runner_finished_callback( # Mark the TrialRunner as finished. self._trial_runners_status[result.trial_runner_id] = None # TODO: save the results? - # TODO: Allow scheduling of new trials here. def _run_trial_on_trial_runner_failed_closure( self, @@ -311,7 +310,7 @@ def run_trial(self, trial: Storage.Trial) -> None: if self._trial_runners_status[trial_runner_id] is not None: _LOG.info("TrialRunner %s is still active. Skipping trial %s.", trial_runner, trial) - # Update our trial bookkeeping. + # Update the scheduler's trial bookkeeping. super().run_trial(trial) # Start the trial in a child process. 
self._trial_runners_status[trial_runner_id] = self._pool.apply_async( From 2b21a0d6af13dd97d231802b94da392a1285cc56 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 8 May 2025 22:31:32 +0000 Subject: [PATCH 33/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mlos_bench/mlos_bench/storage/base_storage.py | 2 +- mlos_bench/mlos_bench/storage/sql/experiment.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mlos_bench/mlos_bench/storage/base_storage.py b/mlos_bench/mlos_bench/storage/base_storage.py index d2fde6e9f1f..da47d839634 100644 --- a/mlos_bench/mlos_bench/storage/base_storage.py +++ b/mlos_bench/mlos_bench/storage/base_storage.py @@ -26,7 +26,7 @@ import logging from abc import ABCMeta, abstractmethod -from collections.abc import Iterator, Mapping, Iterable +from collections.abc import Iterable, Iterator, Mapping from contextlib import AbstractContextManager as ContextManager from datetime import datetime from types import TracebackType diff --git a/mlos_bench/mlos_bench/storage/sql/experiment.py b/mlos_bench/mlos_bench/storage/sql/experiment.py index 6d85b8730eb..dfe13d397fc 100644 --- a/mlos_bench/mlos_bench/storage/sql/experiment.py +++ b/mlos_bench/mlos_bench/storage/sql/experiment.py @@ -8,7 +8,7 @@ import hashlib import logging -from collections.abc import Iterator, Iterable +from collections.abc import Iterable, Iterator from datetime import datetime from typing import Any, Literal From 50b970626a439fdd41d0b46acd3e4e3f75c9bb58 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Fri, 9 May 2025 12:44:07 -0500 Subject: [PATCH 34/38] rework instructions a bit --- .github/instructions/python.instructions.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/instructions/python.instructions.md b/.github/instructions/python.instructions.md index 5ace2693a94..13ed7a10f6d 100644 --- 
a/.github/instructions/python.instructions.md +++ b/.github/instructions/python.instructions.md @@ -71,13 +71,10 @@ applyTo: '**/*.py' Adding a trailing comma to the last argument is optional, but recommended for consistency whenever a single line is insufficient. - Code should be formatting using `black`. - - Code should be type checked using `mypy`. - -- All function and method parameters should be type annotated. - + - All function and method parameters should be type annotated. - Code should be linted using `pylint`. - Tests should be included for all new code and should be run using `pytest`. - - Tests should be organized roughly the same way as the code they are testing (e.g., `tests/some/test_module.py` for `some/module.py`). +- Test fixtures that setup the resources for the tests (e.g., Environments, Services, Storage, Optimizer, Scheduler, etc.) should be included in a `conftest.py` file in the same directory as the tests or else a `fixtures.py` file in the same directory as the tests. From 816ada1c63ddeafca6c58ecb7c7a337c6442a300 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 9 May 2025 17:45:29 +0000 Subject: [PATCH 35/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .github/instructions/python.instructions.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/instructions/python.instructions.md b/.github/instructions/python.instructions.md index 13ed7a10f6d..f81a76df843 100644 --- a/.github/instructions/python.instructions.md +++ b/.github/instructions/python.instructions.md @@ -71,10 +71,15 @@ applyTo: '**/*.py' Adding a trailing comma to the last argument is optional, but recommended for consistency whenever a single line is insufficient. - Code should be formatting using `black`. + - Code should be type checked using `mypy`. + - All function and method parameters should be type annotated. 
+ - Code should be linted using `pylint`. - Tests should be included for all new code and should be run using `pytest`. + - Tests should be organized roughly the same way as the code they are testing (e.g., `tests/some/test_module.py` for `some/module.py`). + - Test fixtures that setup the resources for the tests (e.g., Environments, Services, Storage, Optimizer, Scheduler, etc.) should be included in a `conftest.py` file in the same directory as the tests or else a `fixtures.py` file in the same directory as the tests. From 77424cbb0e752892ae8f2e749dbf924fd4cd8a92 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Fri, 9 May 2025 13:09:28 -0500 Subject: [PATCH 36/38] Adding new prompt --- ...rge-change-branch-to-separate-change-branches.prompt.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 .github/prompts/split-large-change-branch-to-separate-change-branches.prompt.md diff --git a/.github/prompts/split-large-change-branch-to-separate-change-branches.prompt.md b/.github/prompts/split-large-change-branch-to-separate-change-branches.prompt.md new file mode 100644 index 00000000000..cf7b6b63744 --- /dev/null +++ b/.github/prompts/split-large-change-branch-to-separate-change-branches.prompt.md @@ -0,0 +1,7 @@ +# Custom Prompt: Split A Large Change Branch into Separate Change Branches + +The branch I have has too many changes and I want to start splitting them out to separate PRs. Can you help me do the following: +1. Summarize the `git diff` patches between my current branch and the `main` branch into distinct related groups. You can use the PR description associated with this branch to see what else was done and needs to be split out. +2. Propose a sequence of changes to stage each of those sets of changes as different self contained and testable PRs. +3. Help me create those PRs by applying that group's changes to new and different branches. We don't need to retain the history of this branch, just the changes. 
For instance, we could use `git checkout -b new-branch && git diff -- some/list of/files | git apply -` to make a new branch with some selected changes applied and the use stage only the relevant changes to each file for that particular PR. +Show me the plan and pause to check in with me between each step. From 06ab1892345a567c7c902d0589aa429682db64d0 Mon Sep 17 00:00:00 2001 From: Brian Kroth Date: Fri, 9 May 2025 13:11:42 -0500 Subject: [PATCH 37/38] update the prompt --- ...it-large-change-branch-to-separate-change-branches.prompt.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/prompts/split-large-change-branch-to-separate-change-branches.prompt.md b/.github/prompts/split-large-change-branch-to-separate-change-branches.prompt.md index cf7b6b63744..45064fd25b7 100644 --- a/.github/prompts/split-large-change-branch-to-separate-change-branches.prompt.md +++ b/.github/prompts/split-large-change-branch-to-separate-change-branches.prompt.md @@ -1,7 +1,7 @@ # Custom Prompt: Split A Large Change Branch into Separate Change Branches The branch I have has too many changes and I want to start splitting them out to separate PRs. Can you help me do the following: -1. Summarize the `git diff` patches between my current branch and the `main` branch into distinct related groups. You can use the PR description associated with this branch to see what else was done and needs to be split out. +1. Run `git diff main > /tmp/git-diff-main.patch` to get the differences between the `main` branch and the current branch and then use `/tmp/git-diff-main.patch` to summarize the changes branch into distinct groups. You can use the description from the Github PR associated with this branch to see what else was done and needs to be split out. 2. Propose a sequence of changes to stage each of those sets of changes as different self contained and testable PRs. 3. Help me create those PRs by applying that group's changes to new and different branches. 
We don't need to retain the history of this branch, just the changes. For instance, we could use `git checkout -b new-branch && git diff -- some/list of/files | git apply -` to make a new branch with some selected changes applied and the use stage only the relevant changes to each file for that particular PR. Show me the plan and pause to check in with me between each step. From 073f9bfc0fe242adfad85855f718b151f8528ff4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 9 May 2025 18:12:18 +0000 Subject: [PATCH 38/38] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ...e-change-branch-to-separate-change-branches.prompt.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/prompts/split-large-change-branch-to-separate-change-branches.prompt.md b/.github/prompts/split-large-change-branch-to-separate-change-branches.prompt.md index 45064fd25b7..60d6adeaa28 100644 --- a/.github/prompts/split-large-change-branch-to-separate-change-branches.prompt.md +++ b/.github/prompts/split-large-change-branch-to-separate-change-branches.prompt.md @@ -1,7 +1,8 @@ # Custom Prompt: Split A Large Change Branch into Separate Change Branches The branch I have has too many changes and I want to start splitting them out to separate PRs. Can you help me do the following: -1. Run `git diff main > /tmp/git-diff-main.patch` to get the differences between the `main` branch and the current branch and then use `/tmp/git-diff-main.patch` to summarize the changes branch into distinct groups. You can use the description from the Github PR associated with this branch to see what else was done and needs to be split out. -2. Propose a sequence of changes to stage each of those sets of changes as different self contained and testable PRs. -3. Help me create those PRs by applying that group's changes to new and different branches. 
We don't need to retain the history of this branch, just the changes. For instance, we could use `git checkout -b new-branch && git diff -- some/list of/files | git apply -` to make a new branch with some selected changes applied and the use stage only the relevant changes to each file for that particular PR. -Show me the plan and pause to check in with me between each step. + +1. Run `git diff main > /tmp/git-diff-main.patch` to get the differences between the `main` branch and the current branch and then use `/tmp/git-diff-main.patch` to summarize the changes branch into distinct groups. You can use the description from the Github PR associated with this branch to see what else was done and needs to be split out. +1. Propose a sequence of changes to stage each of those sets of changes as different self contained and testable PRs. +1. Help me create those PRs by applying that group's changes to new and different branches. We don't need to retain the history of this branch, just the changes. For instance, we could use `git checkout -b new-branch && git diff -- some/list of/files | git apply -` to make a new branch with some selected changes applied and the use stage only the relevant changes to each file for that particular PR. + Show me the plan and pause to check in with me between each step.