Start cluster automatically; use update to wait until started

dandavison · dandavison · commit f336fccc2444 · 2024-12-07T13:49:01.000-05:00
diff --git a/message_passing/safe_message_handlers/README.md b/message_passing/safe_message_handlers/README.md
@@ -3,7 +3,6 @@
 This sample shows off important techniques for handling signals and updates, aka messages.  In particular, it illustrates how message handlers can interleave or not be completed before the workflow completes, and how you can manage that.
 
 * Here, using workflow.wait_condition, signal and update handlers will only operate when the workflow is within a certain state--between cluster_started and cluster_shutdown.
-* You can run start_workflow with an initializer signal that you want to run before anything else other than the workflow's constructor.  This pattern is known as "signal-with-start."
 * Message handlers can block and their actions can be interleaved with one another and with the main workflow.  This can easily cause bugs, so you can use a lock to protect shared state from interleaved access.
 * An "Entity" workflow, i.e. a long-lived workflow, periodically "continues as new".  It must do this to prevent its history from growing too large, and it passes its state to the next workflow.  You can check `workflow.info().is_continue_as_new_suggested()` to see when it's time. 
 * Most people want their message handlers to finish before the workflow run completes or continues as new.  Use `await workflow.wait_condition(lambda: workflow.all_handlers_finished())` to achieve this.
diff --git a/message_passing/safe_message_handlers/activities.py b/message_passing/safe_message_handlers/activities.py
@@ -11,6 +11,16 @@ class AssignNodesToJobInput:
     job_name: str
 
 
+@dataclass
+class ClusterState:  # Value returned by the start_cluster activity.
+    node_ids: List[str]  # IDs of the cluster's nodes, as strings.
+
+
+@activity.defn
+async def start_cluster() -> ClusterState:  # Returns a fixed ClusterState; takes no input.
+    return ClusterState(node_ids=[f"{i}" for i in range(25)])  # node IDs "0" through "24"
+
+
 @activity.defn
 async def assign_nodes_to_job(input: AssignNodesToJobInput) -> None:
     print(f"Assigning nodes {input.nodes} to job {input.job_name}")
diff --git a/message_passing/safe_message_handlers/starter.py b/message_passing/safe_message_handlers/starter.py
@@ -16,8 +16,10 @@
 
 
 async def do_cluster_lifecycle(wf: WorkflowHandle, delay_seconds: Optional[int] = None):
-
-    await wf.signal(ClusterManagerWorkflow.start_cluster)
+    cluster_status = await wf.execute_update(
+        ClusterManagerWorkflow.wait_until_cluster_started
+    )
+    print(f"Cluster started with {len(cluster_status.nodes)} nodes")
 
     print("Assigning jobs to nodes...")
     allocation_updates = []
diff --git a/message_passing/safe_message_handlers/worker.py b/message_passing/safe_message_handlers/worker.py
@@ -8,6 +8,7 @@
     ClusterManagerWorkflow,
     assign_nodes_to_job,
     find_bad_nodes,
+    start_cluster,
     unassign_nodes_for_job,
 )
 
@@ -21,7 +22,12 @@ async def main():
         client,
         task_queue="safe-message-handlers-task-queue",
         workflows=[ClusterManagerWorkflow],
-        activities=[assign_nodes_to_job, unassign_nodes_for_job, find_bad_nodes],
+        activities=[
+            assign_nodes_to_job,
+            unassign_nodes_for_job,
+            find_bad_nodes,
+            start_cluster,
+        ],
     ):
         logging.info("ClusterManagerWorkflow worker started, ctrl+c to exit")
         await interrupt_event.wait()
diff --git a/message_passing/safe_message_handlers/workflow.py b/message_passing/safe_message_handlers/workflow.py
@@ -14,6 +14,7 @@
     UnassignNodesForJobInput,
     assign_nodes_to_job,
     find_bad_nodes,
+    start_cluster,
     unassign_nodes_for_job,
 )
 
@@ -81,11 +82,10 @@ def __init__(self, input: ClusterManagerInput) -> None:
         self.max_history_length: Optional[int] = None
         self.sleep_interval_seconds: int = 600
 
-    @workflow.signal
-    async def start_cluster(self) -> None:
-        self.state.cluster_started = True
-        self.state.nodes = {str(k): None for k in range(25)}
-        workflow.logger.info("Cluster started")
+    @workflow.update
+    async def wait_until_cluster_started(self) -> ClusterManagerState:  # Update that completes once the cluster is started.
+        await workflow.wait_condition(lambda: self.state.cluster_started)  # flag is set by run() after the start_cluster activity returns
+        return self.state  # the workflow's current state; callers read .nodes from it
 
     @workflow.signal
     async def shutdown_cluster(self) -> None:
@@ -213,6 +213,13 @@ async def perform_health_checks(self) -> None:
 
     @workflow.run
     async def run(self, input: ClusterManagerInput) -> ClusterManagerResult:
+        cluster_state = await workflow.execute_activity(
+            start_cluster, schedule_to_close_timeout=timedelta(seconds=10)
+        )
+        self.state.nodes = {k: None for k in cluster_state.node_ids}
+        self.state.cluster_started = True
+        workflow.logger.info("Cluster started")
+
         await workflow.wait_condition(lambda: self.state.cluster_started)
         # Perform health checks at intervals.
         while True:
diff --git a/tests/message_passing/safe_message_handlers/workflow_test.py b/tests/message_passing/safe_message_handlers/workflow_test.py
@@ -10,6 +10,7 @@
 from message_passing.safe_message_handlers.activities import (
     assign_nodes_to_job,
     find_bad_nodes,
+    start_cluster,
     unassign_nodes_for_job,
 )
 from message_passing.safe_message_handlers.workflow import (
@@ -19,6 +20,13 @@
     ClusterManagerWorkflow,
 )
 
+ACTIVITIES = [  # Activity list passed as activities= to the Worker in the tests below.
+    assign_nodes_to_job,
+    unassign_nodes_for_job,
+    find_bad_nodes,
+    start_cluster,
+]
+
 
 async def test_safe_message_handlers(client: Client, env: WorkflowEnvironment):
     if env.supports_time_skipping:
@@ -30,15 +38,17 @@ async def test_safe_message_handlers(client: Client, env: WorkflowEnvironment):
         client,
         task_queue=task_queue,
         workflows=[ClusterManagerWorkflow],
-        activities=[assign_nodes_to_job, unassign_nodes_for_job, find_bad_nodes],
+        activities=ACTIVITIES,
     ):
         cluster_manager_handle = await client.start_workflow(
             ClusterManagerWorkflow.run,
             ClusterManagerInput(),
             id=f"ClusterManagerWorkflow-{uuid.uuid4()}",
             task_queue=task_queue,
         )
-        await cluster_manager_handle.signal(ClusterManagerWorkflow.start_cluster)
+        await cluster_manager_handle.execute_update(
+            ClusterManagerWorkflow.wait_until_cluster_started
+        )
 
         allocation_updates = []
         for i in range(6):
@@ -82,7 +92,7 @@ async def test_update_idempotency(client: Client, env: WorkflowEnvironment):
         client,
         task_queue=task_queue,
         workflows=[ClusterManagerWorkflow],
-        activities=[assign_nodes_to_job, unassign_nodes_for_job, find_bad_nodes],
+        activities=ACTIVITIES,
     ):
         cluster_manager_handle = await client.start_workflow(
             ClusterManagerWorkflow.run,
@@ -91,7 +101,9 @@ async def test_update_idempotency(client: Client, env: WorkflowEnvironment):
             task_queue=task_queue,
         )
 
-        await cluster_manager_handle.signal(ClusterManagerWorkflow.start_cluster)
+        await cluster_manager_handle.execute_update(
+            ClusterManagerWorkflow.wait_until_cluster_started
+        )
 
         result_1 = await cluster_manager_handle.execute_update(
             ClusterManagerWorkflow.assign_nodes_to_job,
@@ -121,7 +133,7 @@ async def test_update_failure(client: Client, env: WorkflowEnvironment):
         client,
         task_queue=task_queue,
         workflows=[ClusterManagerWorkflow],
-        activities=[assign_nodes_to_job, unassign_nodes_for_job, find_bad_nodes],
+        activities=ACTIVITIES,
     ):
         cluster_manager_handle = await client.start_workflow(
             ClusterManagerWorkflow.run,
@@ -130,7 +142,9 @@ async def test_update_failure(client: Client, env: WorkflowEnvironment):
             task_queue=task_queue,
         )
 
-        await cluster_manager_handle.signal(ClusterManagerWorkflow.start_cluster)
+        await cluster_manager_handle.execute_update(
+            ClusterManagerWorkflow.wait_until_cluster_started
+        )
 
         await cluster_manager_handle.execute_update(
             ClusterManagerWorkflow.assign_nodes_to_job,