From da99ecf0db35502c8db787ebf536377923b86c65 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 18:05:06 -0300 Subject: [PATCH 01/55] feat: rewrite types for orchestration engine --- src/types.zig | 85 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 20 deletions(-) diff --git a/src/types.zig b/src/types.zig index fbe28c2..ea52cd5 100644 --- a/src/types.zig +++ b/src/types.zig @@ -7,10 +7,11 @@ const std = @import("std"); pub const RunStatus = enum { pending, running, - paused, + interrupted, completed, failed, cancelled, + forked, pub fn toString(self: RunStatus) []const u8 { return @tagName(self); @@ -31,7 +32,7 @@ pub const StepStatus = enum { completed, failed, skipped, - waiting_approval, + interrupted, pub fn toString(self: StepStatus) []const u8 { return @tagName(self); @@ -47,19 +48,11 @@ pub const StepStatus = enum { pub const StepType = enum { task, - fan_out, - map, - condition, - approval, - reduce, - loop, - sub_workflow, - wait, - router, + route, + interrupt, + agent, + send, transform, - saga, - debate, - group_chat, pub fn toString(self: StepType) []const u8 { return @tagName(self); @@ -246,6 +239,58 @@ pub const SagaStateRow = struct { status: []const u8, }; +pub const WorkflowRow = struct { + id: []const u8, + name: []const u8, + definition_json: []const u8, + created_at_ms: i64, + updated_at_ms: i64, +}; + +pub const CheckpointRow = struct { + id: []const u8, + run_id: []const u8, + step_id: []const u8, + parent_id: ?[]const u8, + state_json: []const u8, + completed_nodes_json: []const u8, + version: i64, + metadata_json: ?[]const u8, + created_at_ms: i64, +}; + +pub const AgentEventRow = struct { + id: i64, + run_id: []const u8, + step_id: []const u8, + iteration: i64, + tool: ?[]const u8, + args_json: ?[]const u8, + result_text: ?[]const u8, + status: []const u8, + created_at_ms: i64, +}; + +pub const ReducerType = enum { + last_value, + append, + merge, + add, + min, + max, + + pub fn 
toString(self: ReducerType) []const u8 { + return @tagName(self); + } + + pub fn fromString(s: []const u8) ?ReducerType { + inline for (@typeInfo(ReducerType).@"enum".fields) |f| { + if (std.mem.eql(u8, s, f.name)) return @enumFromInt(f.value); + } + return null; + } +}; + // ── API Response Types ───────────────────────────────────────────────── pub const HealthResponse = struct { @@ -275,17 +320,17 @@ test "RunStatus round-trip" { } test "StepStatus round-trip" { - const s = StepStatus.waiting_approval; + const s = StepStatus.interrupted; const name = s.toString(); - try std.testing.expectEqualStrings("waiting_approval", name); + try std.testing.expectEqualStrings("interrupted", name); const parsed = StepStatus.fromString(name); - try std.testing.expectEqual(StepStatus.waiting_approval, parsed.?); + try std.testing.expectEqual(StepStatus.interrupted, parsed.?); } test "StepType round-trip" { - const s = StepType.fan_out; - try std.testing.expectEqualStrings("fan_out", s.toString()); - try std.testing.expectEqual(StepType.fan_out, StepType.fromString("fan_out").?); + const s = StepType.route; + try std.testing.expectEqualStrings("route", s.toString()); + try std.testing.expectEqual(StepType.route, StepType.fromString("route").?); } test "WorkerStatus round-trip" { From 531d9ea43d89e695ac27697dc6b240b15ef6441f Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 18:09:02 -0300 Subject: [PATCH 02/55] feat: add migration 004 for orchestration schema --- src/migrations/004_orchestration.sql | 64 ++++++++++++++++++++++++++++ src/store.zig | 11 +++++ 2 files changed, 75 insertions(+) create mode 100644 src/migrations/004_orchestration.sql diff --git a/src/migrations/004_orchestration.sql b/src/migrations/004_orchestration.sql new file mode 100644 index 0000000..4dc564f --- /dev/null +++ b/src/migrations/004_orchestration.sql @@ -0,0 +1,64 @@ +-- Drop deprecated tables +DROP TABLE IF EXISTS step_deps; +DROP TABLE IF EXISTS cycle_state; +DROP TABLE IF EXISTS 
saga_state; + +-- Saved workflow definitions +CREATE TABLE IF NOT EXISTS workflows ( + id TEXT PRIMARY KEY, + name TEXT NOT NULL, + definition_json TEXT NOT NULL, + created_at_ms INTEGER NOT NULL, + updated_at_ms INTEGER NOT NULL +); + +-- State checkpoints (snapshots after each step) +CREATE TABLE IF NOT EXISTS checkpoints ( + id TEXT PRIMARY KEY, + run_id TEXT NOT NULL REFERENCES runs(id), + step_id TEXT NOT NULL, + parent_id TEXT REFERENCES checkpoints(id), + state_json TEXT NOT NULL, + completed_nodes_json TEXT NOT NULL, + version INTEGER NOT NULL, + metadata_json TEXT, + created_at_ms INTEGER NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_checkpoints_run ON checkpoints(run_id, version); +CREATE INDEX IF NOT EXISTS idx_checkpoints_parent ON checkpoints(parent_id); + +-- Agent intermediate events (from nullclaw callback) +CREATE TABLE IF NOT EXISTS agent_events ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id TEXT NOT NULL REFERENCES runs(id), + step_id TEXT NOT NULL, + iteration INTEGER NOT NULL, + tool TEXT, + args_json TEXT, + result_text TEXT, + status TEXT NOT NULL, + created_at_ms INTEGER NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_agent_events_run_step ON agent_events(run_id, step_id); + +-- Pending state injections (thread-safe queue for POST /runs/{id}/state) +CREATE TABLE IF NOT EXISTS pending_state_injections ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id TEXT NOT NULL REFERENCES runs(id), + updates_json TEXT NOT NULL, + apply_after_step TEXT, + created_at_ms INTEGER NOT NULL +); + +-- Extend runs table +ALTER TABLE runs ADD COLUMN state_json TEXT; +ALTER TABLE runs ADD COLUMN workflow_id TEXT REFERENCES workflows(id); +ALTER TABLE runs ADD COLUMN forked_from_run_id TEXT REFERENCES runs(id); +ALTER TABLE runs ADD COLUMN forked_from_checkpoint_id TEXT REFERENCES checkpoints(id); +ALTER TABLE runs ADD COLUMN checkpoint_count INTEGER DEFAULT 0; + +-- Extend steps table +ALTER TABLE steps ADD COLUMN state_before_json TEXT; +ALTER TABLE steps ADD 
COLUMN state_after_json TEXT; +ALTER TABLE steps ADD COLUMN state_updates_json TEXT; +-- NOTE: parent_step_id already exists from 001_init.sql — do NOT add it again diff --git a/src/store.zig b/src/store.zig index 4eeb2dc..bda701e 100644 --- a/src/store.zig +++ b/src/store.zig @@ -140,6 +140,17 @@ pub const Store = struct { } return error.MigrationFailed; } + + // Migration 004 — orchestration schema (workflows, checkpoints, agent_events) + const sql_004 = @embedFile("migrations/004_orchestration.sql"); + prc = c.sqlite3_exec(self.db, sql_004.ptr, null, null, &err_msg); + if (prc != c.SQLITE_OK) { + if (err_msg) |msg| { + log.err("migration 004 failed (rc={d}): {s}", .{ prc, std.mem.span(msg) }); + c.sqlite3_free(msg); + } + return error.MigrationFailed; + } } pub fn beginTransaction(self: *Self) !void { From 046126f8936a97c3f50fe573afe1af839f689aec Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 18:16:14 -0300 Subject: [PATCH 03/55] feat: add store methods for workflows, checkpoints, agent events Add CRUD methods to Store for all new tables from migration 004: - Workflow methods: create, get, list, update, delete - Checkpoint methods: create, get, list, getLatest - Agent event methods: create, listByRunStep - Run state methods: updateRunState, incrementCheckpointCount, createRunWithState, createForkedRun - Pending injection methods: create, consume, discard Also adds PendingInjectionRow to types.zig and fixes migration 004 to preserve step_deps/cycle_state/saga_state tables until the engine rewrite (Task 8) removes their usage. Includes comprehensive tests for all new methods. 
--- src/migrations/004_orchestration.sql | 6 +- src/store.zig | 714 +++++++++++++++++++++++++++ src/types.zig | 8 + 3 files changed, 724 insertions(+), 4 deletions(-) diff --git a/src/migrations/004_orchestration.sql b/src/migrations/004_orchestration.sql index 4dc564f..30ffa68 100644 --- a/src/migrations/004_orchestration.sql +++ b/src/migrations/004_orchestration.sql @@ -1,7 +1,5 @@ --- Drop deprecated tables -DROP TABLE IF EXISTS step_deps; -DROP TABLE IF EXISTS cycle_state; -DROP TABLE IF EXISTS saga_state; +-- Note: step_deps, cycle_state, saga_state kept for backward compatibility +-- until engine.zig is rewritten (Task 8). They will be removed then. -- Saved workflow definitions CREATE TABLE IF NOT EXISTS workflows ( diff --git a/src/store.zig b/src/store.zig index bda701e..331f143 100644 --- a/src/store.zig +++ b/src/store.zig @@ -1278,6 +1278,400 @@ pub const Store = struct { return error.SqliteStepFailed; } } + + // ── Workflow CRUD ───────────────────────────────────────────────── + + pub fn createWorkflow(self: *Self, id: []const u8, name: []const u8, definition_json: []const u8) !void { + const sql = "INSERT INTO workflows (id, name, definition_json, created_at_ms, updated_at_ms) VALUES (?, ?, ?, ?, ?)"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + const now = ids.nowMs(); + _ = c.sqlite3_bind_text(stmt, 1, id.ptr, @intCast(id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 2, name.ptr, @intCast(name.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 3, definition_json.ptr, @intCast(definition_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_int64(stmt, 4, now); + _ = c.sqlite3_bind_int64(stmt, 5, now); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + pub fn getWorkflow(self: *Self, alloc: std.mem.Allocator, id: []const u8) !?types.WorkflowRow { + const 
sql = "SELECT id, name, definition_json, created_at_ms, updated_at_ms FROM workflows WHERE id = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, id.ptr, @intCast(id.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_ROW) return null; + + return types.WorkflowRow{ + .id = try allocStr(alloc, stmt, 0), + .name = try allocStr(alloc, stmt, 1), + .definition_json = try allocStr(alloc, stmt, 2), + .created_at_ms = colInt(stmt, 3), + .updated_at_ms = colInt(stmt, 4), + }; + } + + pub fn listWorkflows(self: *Self, alloc: std.mem.Allocator) ![]types.WorkflowRow { + const sql = "SELECT id, name, definition_json, created_at_ms, updated_at_ms FROM workflows ORDER BY created_at_ms DESC"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + var list: std.ArrayListUnmanaged(types.WorkflowRow) = .empty; + while (c.sqlite3_step(stmt) == c.SQLITE_ROW) { + try list.append(alloc, .{ + .id = try allocStr(alloc, stmt, 0), + .name = try allocStr(alloc, stmt, 1), + .definition_json = try allocStr(alloc, stmt, 2), + .created_at_ms = colInt(stmt, 3), + .updated_at_ms = colInt(stmt, 4), + }); + } + return list.toOwnedSlice(alloc); + } + + pub fn updateWorkflow(self: *Self, id: []const u8, name: []const u8, definition_json: []const u8) !void { + const sql = "UPDATE workflows SET name = ?, definition_json = ?, updated_at_ms = ? 
WHERE id = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, name.ptr, @intCast(name.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 2, definition_json.ptr, @intCast(definition_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_int64(stmt, 3, ids.nowMs()); + _ = c.sqlite3_bind_text(stmt, 4, id.ptr, @intCast(id.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + pub fn deleteWorkflow(self: *Self, id: []const u8) !void { + const sql = "DELETE FROM workflows WHERE id = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, id.ptr, @intCast(id.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + // ── Checkpoint CRUD ─────────────────────────────────────────────── + + pub fn createCheckpoint(self: *Self, id: []const u8, run_id: []const u8, step_id: []const u8, parent_id: ?[]const u8, state_json: []const u8, completed_nodes_json: []const u8, version: i64, metadata_json: ?[]const u8) !void { + const sql = "INSERT INTO checkpoints (id, run_id, step_id, parent_id, state_json, completed_nodes_json, version, metadata_json, created_at_ms) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, id.ptr, @intCast(id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 2, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 3, step_id.ptr, @intCast(step_id.len), 
SQLITE_STATIC); + bindTextOpt(stmt, 4, parent_id); + _ = c.sqlite3_bind_text(stmt, 5, state_json.ptr, @intCast(state_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 6, completed_nodes_json.ptr, @intCast(completed_nodes_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_int64(stmt, 7, version); + bindTextOpt(stmt, 8, metadata_json); + _ = c.sqlite3_bind_int64(stmt, 9, ids.nowMs()); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + pub fn getCheckpoint(self: *Self, alloc: std.mem.Allocator, id: []const u8) !?types.CheckpointRow { + const sql = "SELECT id, run_id, step_id, parent_id, state_json, completed_nodes_json, version, metadata_json, created_at_ms FROM checkpoints WHERE id = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, id.ptr, @intCast(id.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_ROW) return null; + + return try readCheckpointRow(alloc, stmt); + } + + pub fn listCheckpoints(self: *Self, alloc: std.mem.Allocator, run_id: []const u8) ![]types.CheckpointRow { + const sql = "SELECT id, run_id, step_id, parent_id, state_json, completed_nodes_json, version, metadata_json, created_at_ms FROM checkpoints WHERE run_id = ? 
ORDER BY version ASC"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + + var list: std.ArrayListUnmanaged(types.CheckpointRow) = .empty; + while (c.sqlite3_step(stmt) == c.SQLITE_ROW) { + try list.append(alloc, try readCheckpointRow(alloc, stmt)); + } + return list.toOwnedSlice(alloc); + } + + pub fn getLatestCheckpoint(self: *Self, alloc: std.mem.Allocator, run_id: []const u8) !?types.CheckpointRow { + const sql = "SELECT id, run_id, step_id, parent_id, state_json, completed_nodes_json, version, metadata_json, created_at_ms FROM checkpoints WHERE run_id = ? ORDER BY version DESC LIMIT 1"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_ROW) return null; + + return try readCheckpointRow(alloc, stmt); + } + + fn readCheckpointRow(alloc: std.mem.Allocator, stmt: ?*c.sqlite3_stmt) !types.CheckpointRow { + return .{ + .id = try allocStr(alloc, stmt, 0), + .run_id = try allocStr(alloc, stmt, 1), + .step_id = try allocStr(alloc, stmt, 2), + .parent_id = try allocStrOpt(alloc, stmt, 3), + .state_json = try allocStr(alloc, stmt, 4), + .completed_nodes_json = try allocStr(alloc, stmt, 5), + .version = colInt(stmt, 6), + .metadata_json = try allocStrOpt(alloc, stmt, 7), + .created_at_ms = colInt(stmt, 8), + }; + } + + // ── Agent Event CRUD ────────────────────────────────────────────── + + pub fn createAgentEvent(self: *Self, run_id: []const u8, step_id: []const u8, iteration: i64, tool: ?[]const u8, args_json: ?[]const u8, result_text: ?[]const u8, status: []const u8) !void { + 
const sql = "INSERT INTO agent_events (run_id, step_id, iteration, tool, args_json, result_text, status, created_at_ms) VALUES (?, ?, ?, ?, ?, ?, ?, ?)"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 2, step_id.ptr, @intCast(step_id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_int64(stmt, 3, iteration); + bindTextOpt(stmt, 4, tool); + bindTextOpt(stmt, 5, args_json); + bindTextOpt(stmt, 6, result_text); + _ = c.sqlite3_bind_text(stmt, 7, status.ptr, @intCast(status.len), SQLITE_STATIC); + _ = c.sqlite3_bind_int64(stmt, 8, ids.nowMs()); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + pub fn listAgentEvents(self: *Self, alloc: std.mem.Allocator, run_id: []const u8, step_id: []const u8) ![]types.AgentEventRow { + const sql = "SELECT id, run_id, step_id, iteration, tool, args_json, result_text, status, created_at_ms FROM agent_events WHERE run_id = ? AND step_id = ? 
ORDER BY id ASC"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 2, step_id.ptr, @intCast(step_id.len), SQLITE_STATIC); + + var list: std.ArrayListUnmanaged(types.AgentEventRow) = .empty; + while (c.sqlite3_step(stmt) == c.SQLITE_ROW) { + try list.append(alloc, .{ + .id = colInt(stmt, 0), + .run_id = try allocStr(alloc, stmt, 1), + .step_id = try allocStr(alloc, stmt, 2), + .iteration = colInt(stmt, 3), + .tool = try allocStrOpt(alloc, stmt, 4), + .args_json = try allocStrOpt(alloc, stmt, 5), + .result_text = try allocStrOpt(alloc, stmt, 6), + .status = try allocStr(alloc, stmt, 7), + .created_at_ms = colInt(stmt, 8), + }); + } + return list.toOwnedSlice(alloc); + } + + // ── Run State Management ────────────────────────────────────────── + + pub fn updateRunState(self: *Self, run_id: []const u8, state_json: []const u8) !void { + const sql = "UPDATE runs SET state_json = ?, updated_at_ms = ? WHERE id = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, state_json.ptr, @intCast(state_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_int64(stmt, 2, ids.nowMs()); + _ = c.sqlite3_bind_text(stmt, 3, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + pub fn incrementCheckpointCount(self: *Self, run_id: []const u8) !void { + const sql = "UPDATE runs SET checkpoint_count = COALESCE(checkpoint_count, 0) + 1, updated_at_ms = ? 
WHERE id = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_int64(stmt, 1, ids.nowMs()); + _ = c.sqlite3_bind_text(stmt, 2, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + pub fn createRunWithState(self: *Self, id: []const u8, workflow_id: ?[]const u8, workflow_json: []const u8, input_json: []const u8, state_json: []const u8) !void { + const sql = "INSERT INTO runs (id, status, workflow_id, workflow_json, input_json, callbacks_json, state_json, created_at_ms, updated_at_ms) VALUES (?, 'pending', ?, ?, ?, '[]', ?, ?, ?)"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + const now = ids.nowMs(); + _ = c.sqlite3_bind_text(stmt, 1, id.ptr, @intCast(id.len), SQLITE_STATIC); + bindTextOpt(stmt, 2, workflow_id); + _ = c.sqlite3_bind_text(stmt, 3, workflow_json.ptr, @intCast(workflow_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 4, input_json.ptr, @intCast(input_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 5, state_json.ptr, @intCast(state_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_int64(stmt, 6, now); + _ = c.sqlite3_bind_int64(stmt, 7, now); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + pub fn createForkedRun(self: *Self, id: []const u8, workflow_json: []const u8, state_json: []const u8, forked_from_run_id: []const u8, forked_from_checkpoint_id: []const u8) !void { + const sql = "INSERT INTO runs (id, status, workflow_json, input_json, callbacks_json, state_json, forked_from_run_id, forked_from_checkpoint_id, created_at_ms, updated_at_ms) VALUES (?, 'pending', ?, '{}', '[]', ?, ?, ?, ?, 
?)"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + const now = ids.nowMs(); + _ = c.sqlite3_bind_text(stmt, 1, id.ptr, @intCast(id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 2, workflow_json.ptr, @intCast(workflow_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 3, state_json.ptr, @intCast(state_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 4, forked_from_run_id.ptr, @intCast(forked_from_run_id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 5, forked_from_checkpoint_id.ptr, @intCast(forked_from_checkpoint_id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_int64(stmt, 6, now); + _ = c.sqlite3_bind_int64(stmt, 7, now); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + // ── Pending State Injection CRUD ────────────────────────────────── + + pub fn createPendingInjection(self: *Self, run_id: []const u8, updates_json: []const u8, apply_after_step: ?[]const u8) !void { + const sql = "INSERT INTO pending_state_injections (run_id, updates_json, apply_after_step, created_at_ms) VALUES (?, ?, ?, ?)"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 2, updates_json.ptr, @intCast(updates_json.len), SQLITE_STATIC); + bindTextOpt(stmt, 3, apply_after_step); + _ = c.sqlite3_bind_int64(stmt, 4, ids.nowMs()); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + pub fn consumePendingInjections(self: *Self, alloc: std.mem.Allocator, run_id: []const u8, completed_step: []const u8) ![]types.PendingInjectionRow { + // Select injections where apply_after_step 
matches the completed step or is NULL + const sql = "SELECT id, run_id, updates_json, apply_after_step, created_at_ms FROM pending_state_injections WHERE run_id = ? AND (apply_after_step IS NULL OR apply_after_step = ?) ORDER BY id ASC"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 2, completed_step.ptr, @intCast(completed_step.len), SQLITE_STATIC); + + var list: std.ArrayListUnmanaged(types.PendingInjectionRow) = .empty; + while (c.sqlite3_step(stmt) == c.SQLITE_ROW) { + try list.append(alloc, .{ + .id = colInt(stmt, 0), + .run_id = try allocStr(alloc, stmt, 1), + .updates_json = try allocStr(alloc, stmt, 2), + .apply_after_step = try allocStrOpt(alloc, stmt, 3), + .created_at_ms = colInt(stmt, 4), + }); + } + + const result = try list.toOwnedSlice(alloc); + + // Delete consumed injections + if (result.len > 0) { + const del_sql = "DELETE FROM pending_state_injections WHERE run_id = ? 
AND (apply_after_step IS NULL OR apply_after_step = ?)"; + var del_stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, del_sql, -1, &del_stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(del_stmt); + + _ = c.sqlite3_bind_text(del_stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(del_stmt, 2, completed_step.ptr, @intCast(completed_step.len), SQLITE_STATIC); + + if (c.sqlite3_step(del_stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + return result; + } + + pub fn discardPendingInjections(self: *Self, run_id: []const u8) !void { + const sql = "DELETE FROM pending_state_injections WHERE run_id = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } }; // ── Tests ───────────────────────────────────────────────────────────── @@ -1824,3 +2218,323 @@ test "updateStepChildRunId: sets child_run_id on step" { } try std.testing.expectEqualStrings("child_r1", step.child_run_id.?); } + +test "workflow CRUD" { + const allocator = std.testing.allocator; + var s = try Store.init(allocator, ":memory:"); + defer s.deinit(); + + // Create + try s.createWorkflow("wf1", "My Workflow", "{\"steps\":[]}"); + + // Get + const wf = (try s.getWorkflow(allocator, "wf1")).?; + defer { + allocator.free(wf.id); + allocator.free(wf.name); + allocator.free(wf.definition_json); + } + try std.testing.expectEqualStrings("wf1", wf.id); + try std.testing.expectEqualStrings("My Workflow", wf.name); + try std.testing.expectEqualStrings("{\"steps\":[]}", wf.definition_json); + try std.testing.expect(wf.created_at_ms > 0); + try std.testing.expect(wf.updated_at_ms > 0); + + 
// Update + try s.updateWorkflow("wf1", "Updated Workflow", "{\"steps\":[{\"id\":\"s1\"}]}"); + const wf2 = (try s.getWorkflow(allocator, "wf1")).?; + defer { + allocator.free(wf2.id); + allocator.free(wf2.name); + allocator.free(wf2.definition_json); + } + try std.testing.expectEqualStrings("Updated Workflow", wf2.name); + try std.testing.expectEqualStrings("{\"steps\":[{\"id\":\"s1\"}]}", wf2.definition_json); + + // List + try s.createWorkflow("wf2", "Second Workflow", "{}"); + const workflows = try s.listWorkflows(allocator); + defer { + for (workflows) |w| { + allocator.free(w.id); + allocator.free(w.name); + allocator.free(w.definition_json); + } + allocator.free(workflows); + } + try std.testing.expectEqual(@as(usize, 2), workflows.len); + + // Delete + try s.deleteWorkflow("wf1"); + const deleted = try s.getWorkflow(allocator, "wf1"); + try std.testing.expect(deleted == null); + + // Remaining list + const remaining = try s.listWorkflows(allocator); + defer { + for (remaining) |w| { + allocator.free(w.id); + allocator.free(w.name); + allocator.free(w.definition_json); + } + allocator.free(remaining); + } + try std.testing.expectEqual(@as(usize, 1), remaining.len); + try std.testing.expectEqualStrings("wf2", remaining[0].id); +} + +test "checkpoint lifecycle" { + const allocator = std.testing.allocator; + var s = try Store.init(allocator, ":memory:"); + defer s.deinit(); + + // Create a run + try s.insertRun("r1", null, "running", "{}", "{}", "[]"); + + // Create checkpoints with parent chain + try s.createCheckpoint("cp1", "r1", "step_a", null, "{\"x\":1}", "[\"step_a\"]", 1, null); + try s.createCheckpoint("cp2", "r1", "step_b", "cp1", "{\"x\":2}", "[\"step_a\",\"step_b\"]", 2, "{\"note\":\"test\"}"); + try s.createCheckpoint("cp3", "r1", "step_c", "cp2", "{\"x\":3}", "[\"step_a\",\"step_b\",\"step_c\"]", 3, null); + + // Get single checkpoint + const cp1 = (try s.getCheckpoint(allocator, "cp1")).?; + defer { + allocator.free(cp1.id); + 
allocator.free(cp1.run_id); + allocator.free(cp1.step_id); + if (cp1.parent_id) |pid| allocator.free(pid); + allocator.free(cp1.state_json); + allocator.free(cp1.completed_nodes_json); + if (cp1.metadata_json) |mj| allocator.free(mj); + } + try std.testing.expectEqualStrings("cp1", cp1.id); + try std.testing.expectEqualStrings("r1", cp1.run_id); + try std.testing.expectEqualStrings("step_a", cp1.step_id); + try std.testing.expect(cp1.parent_id == null); + try std.testing.expectEqualStrings("{\"x\":1}", cp1.state_json); + try std.testing.expectEqual(@as(i64, 1), cp1.version); + try std.testing.expect(cp1.metadata_json == null); + + // Get checkpoint with parent and metadata + const cp2 = (try s.getCheckpoint(allocator, "cp2")).?; + defer { + allocator.free(cp2.id); + allocator.free(cp2.run_id); + allocator.free(cp2.step_id); + if (cp2.parent_id) |pid| allocator.free(pid); + allocator.free(cp2.state_json); + allocator.free(cp2.completed_nodes_json); + if (cp2.metadata_json) |mj| allocator.free(mj); + } + try std.testing.expectEqualStrings("cp1", cp2.parent_id.?); + try std.testing.expectEqualStrings("{\"note\":\"test\"}", cp2.metadata_json.?); + + // List checkpoints (ordered by version ASC) + const cps = try s.listCheckpoints(allocator, "r1"); + defer { + for (cps) |cp| { + allocator.free(cp.id); + allocator.free(cp.run_id); + allocator.free(cp.step_id); + if (cp.parent_id) |pid| allocator.free(pid); + allocator.free(cp.state_json); + allocator.free(cp.completed_nodes_json); + if (cp.metadata_json) |mj| allocator.free(mj); + } + allocator.free(cps); + } + try std.testing.expectEqual(@as(usize, 3), cps.len); + try std.testing.expectEqualStrings("cp1", cps[0].id); + try std.testing.expectEqualStrings("cp3", cps[2].id); + + // Get latest checkpoint + const latest = (try s.getLatestCheckpoint(allocator, "r1")).?; + defer { + allocator.free(latest.id); + allocator.free(latest.run_id); + allocator.free(latest.step_id); + if (latest.parent_id) |pid| allocator.free(pid); + 
allocator.free(latest.state_json); + allocator.free(latest.completed_nodes_json); + if (latest.metadata_json) |mj| allocator.free(mj); + } + try std.testing.expectEqualStrings("cp3", latest.id); + try std.testing.expectEqual(@as(i64, 3), latest.version); + + // Get nonexistent checkpoint + const none = try s.getCheckpoint(allocator, "nonexistent"); + try std.testing.expect(none == null); + + // Get latest for run with no checkpoints + const no_latest = try s.getLatestCheckpoint(allocator, "no_run"); + try std.testing.expect(no_latest == null); +} + +test "agent events" { + const allocator = std.testing.allocator; + var s = try Store.init(allocator, ":memory:"); + defer s.deinit(); + + // Create a run + try s.insertRun("r1", null, "running", "{}", "{}", "[]"); + + // Create agent events + try s.createAgentEvent("r1", "step_a", 1, "read_file", "{\"path\":\"foo.txt\"}", "contents here", "completed"); + try s.createAgentEvent("r1", "step_a", 2, "write_file", "{\"path\":\"bar.txt\"}", null, "completed"); + try s.createAgentEvent("r1", "step_a", 3, null, null, null, "thinking"); + try s.createAgentEvent("r1", "step_b", 1, "search", "{}", "results", "completed"); + + // List by run+step + const events_a = try s.listAgentEvents(allocator, "r1", "step_a"); + defer { + for (events_a) |ev| { + allocator.free(ev.run_id); + allocator.free(ev.step_id); + if (ev.tool) |t| allocator.free(t); + if (ev.args_json) |a| allocator.free(a); + if (ev.result_text) |r| allocator.free(r); + allocator.free(ev.status); + } + allocator.free(events_a); + } + try std.testing.expectEqual(@as(usize, 3), events_a.len); + try std.testing.expectEqualStrings("read_file", events_a[0].tool.?); + try std.testing.expectEqual(@as(i64, 1), events_a[0].iteration); + try std.testing.expectEqualStrings("contents here", events_a[0].result_text.?); + try std.testing.expect(events_a[2].tool == null); + try std.testing.expectEqualStrings("thinking", events_a[2].status); + + // List different step + const events_b = 
try s.listAgentEvents(allocator, "r1", "step_b"); + defer { + for (events_b) |ev| { + allocator.free(ev.run_id); + allocator.free(ev.step_id); + if (ev.tool) |t| allocator.free(t); + if (ev.args_json) |a| allocator.free(a); + if (ev.result_text) |r| allocator.free(r); + allocator.free(ev.status); + } + allocator.free(events_b); + } + try std.testing.expectEqual(@as(usize, 1), events_b.len); + try std.testing.expectEqualStrings("search", events_b[0].tool.?); + + // Empty list for nonexistent + const empty = try s.listAgentEvents(allocator, "r1", "nonexistent"); + defer allocator.free(empty); + try std.testing.expectEqual(@as(usize, 0), empty.len); +} + +test "pending state injections" { + const allocator = std.testing.allocator; + var s = try Store.init(allocator, ":memory:"); + defer s.deinit(); + + // Create a run + try s.insertRun("r1", null, "running", "{}", "{}", "[]"); + + // Create pending injections + try s.createPendingInjection("r1", "{\"counter\":5}", "step_a"); + try s.createPendingInjection("r1", "{\"flag\":true}", "step_b"); + try s.createPendingInjection("r1", "{\"immediate\":1}", null); // apply immediately (NULL apply_after_step) + + // Consume by step_a -- should get the step_a injection and the NULL one + const consumed_a = try s.consumePendingInjections(allocator, "r1", "step_a"); + defer { + for (consumed_a) |inj| { + allocator.free(inj.run_id); + allocator.free(inj.updates_json); + if (inj.apply_after_step) |s_a| allocator.free(s_a); + } + allocator.free(consumed_a); + } + try std.testing.expectEqual(@as(usize, 2), consumed_a.len); + try std.testing.expectEqualStrings("{\"counter\":5}", consumed_a[0].updates_json); + try std.testing.expectEqualStrings("{\"immediate\":1}", consumed_a[1].updates_json); + + // Consuming again for step_a should return empty (already consumed) + const consumed_again = try s.consumePendingInjections(allocator, "r1", "step_a"); + defer allocator.free(consumed_again); + try std.testing.expectEqual(@as(usize, 0), 
consumed_again.len); + + // step_b injection should still be pending + const consumed_b = try s.consumePendingInjections(allocator, "r1", "step_b"); + defer { + for (consumed_b) |inj| { + allocator.free(inj.run_id); + allocator.free(inj.updates_json); + if (inj.apply_after_step) |s_a| allocator.free(s_a); + } + allocator.free(consumed_b); + } + try std.testing.expectEqual(@as(usize, 1), consumed_b.len); + try std.testing.expectEqualStrings("{\"flag\":true}", consumed_b[0].updates_json); + + // Test discard + try s.createPendingInjection("r1", "{\"discard_me\":true}", "step_c"); + try s.discardPendingInjections("r1"); + const after_discard = try s.consumePendingInjections(allocator, "r1", "step_c"); + defer allocator.free(after_discard); + try std.testing.expectEqual(@as(usize, 0), after_discard.len); +} + +test "run state management" { + const allocator = std.testing.allocator; + var s = try Store.init(allocator, ":memory:"); + defer s.deinit(); + + // Create run with state + try s.createRunWithState("r1", null, "{\"steps\":[]}", "{\"input\":1}", "{\"counter\":0}"); + const run = (try s.getRun(allocator, "r1")).?; + defer { + allocator.free(run.id); + if (run.idempotency_key) |ik| allocator.free(ik); + allocator.free(run.status); + allocator.free(run.workflow_json); + allocator.free(run.input_json); + allocator.free(run.callbacks_json); + if (run.error_text) |et| allocator.free(et); + } + try std.testing.expectEqualStrings("r1", run.id); + try std.testing.expectEqualStrings("pending", run.status); + try std.testing.expectEqualStrings("{\"steps\":[]}", run.workflow_json); + + // Create run with workflow_id + try s.createWorkflow("wf1", "Test WF", "{\"steps\":[]}"); + try s.createRunWithState("r2", "wf1", "{\"steps\":[]}", "{}", "{}"); + const run2 = (try s.getRun(allocator, "r2")).?; + defer { + allocator.free(run2.id); + if (run2.idempotency_key) |ik| allocator.free(ik); + allocator.free(run2.status); + allocator.free(run2.workflow_json); + 
allocator.free(run2.input_json); + allocator.free(run2.callbacks_json); + if (run2.error_text) |et| allocator.free(et); + } + try std.testing.expectEqualStrings("r2", run2.id); + + // Update run state + try s.updateRunState("r1", "{\"counter\":42}"); + + // Increment checkpoint count + try s.incrementCheckpointCount("r1"); + try s.incrementCheckpointCount("r1"); + + // Create forked run + try s.createCheckpoint("cp1", "r1", "step_a", null, "{}", "[]", 1, null); + try s.createForkedRun("r3", "{\"steps\":[]}", "{\"counter\":42}", "r1", "cp1"); + const forked = (try s.getRun(allocator, "r3")).?; + defer { + allocator.free(forked.id); + if (forked.idempotency_key) |ik| allocator.free(ik); + allocator.free(forked.status); + allocator.free(forked.workflow_json); + allocator.free(forked.input_json); + allocator.free(forked.callbacks_json); + if (forked.error_text) |et| allocator.free(et); + } + try std.testing.expectEqualStrings("r3", forked.id); + try std.testing.expectEqualStrings("pending", forked.status); +} diff --git a/src/types.zig b/src/types.zig index ea52cd5..b989295 100644 --- a/src/types.zig +++ b/src/types.zig @@ -271,6 +271,14 @@ pub const AgentEventRow = struct { created_at_ms: i64, }; +pub const PendingInjectionRow = struct { + id: i64, + run_id: []const u8, + updates_json: []const u8, + apply_after_step: ?[]const u8, + created_at_ms: i64, +}; + pub const ReducerType = enum { last_value, append, From 665d0b1e47288f439b6052f5d8d3c8d748ff7302 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 18:17:37 -0300 Subject: [PATCH 04/55] feat: add self_url config for callback URL construction --- config.example.json | 1 + src/config.zig | 1 + 2 files changed, 2 insertions(+) diff --git a/config.example.json b/config.example.json index d069183..fc64a5a 100644 --- a/config.example.json +++ b/config.example.json @@ -3,6 +3,7 @@ "port": 8080, "db": "nullboiler.db", "api_token": null, + "self_url": null, "workers": [ { "id": "nullclaw-1", diff --git 
a/src/config.zig b/src/config.zig index 430f5b2..0d3406e 100644 --- a/src/config.zig +++ b/src/config.zig @@ -77,6 +77,7 @@ pub const Config = struct { port: u16 = 8080, db: []const u8 = "nullboiler.db", api_token: ?[]const u8 = null, + self_url: ?[]const u8 = null, strategies_dir: []const u8 = "strategies", workers: []const WorkerConfig = &.{}, engine: EngineConfig = .{}, From ffb36fbc925362cbf71f7540e76d306eb32c365f Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 18:34:24 -0300 Subject: [PATCH 05/55] feat: state management module with reducers --- src/main.zig | 1 + src/state.zig | 674 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 675 insertions(+) create mode 100644 src/state.zig diff --git a/src/main.zig b/src/main.zig index 437a610..4a527a8 100644 --- a/src/main.zig +++ b/src/main.zig @@ -638,4 +638,5 @@ comptime { _ = @import("subprocess.zig"); _ = @import("tracker_client.zig"); _ = @import("tracker.zig"); + _ = @import("state.zig"); } diff --git a/src/state.zig b/src/state.zig new file mode 100644 index 0000000..fe66131 --- /dev/null +++ b/src/state.zig @@ -0,0 +1,674 @@ +/// State management module for NullBoiler orchestration. +/// Implements reducers and state operations for the unified state model. +/// Every node in the orchestration graph reads state, returns partial updates, +/// and the engine applies reducers to compute the new state. +const std = @import("std"); +const types = @import("types.zig"); +const ReducerType = types.ReducerType; +const Allocator = std.mem.Allocator; +const json = std.json; + +// ── Helpers ─────────────────────────────────────────────────────────── + +/// Serialize a std.json.Value to an allocated JSON string. 
/// Serialize a std.json.Value to an allocated JSON string.
/// Caller owns the returned slice.
fn serializeValue(alloc: Allocator, value: json.Value) ![]const u8 {
    var out: std.io.Writer.Allocating = .init(alloc);
    var jw: json.Stringify = .{ .writer = &out.writer };
    try jw.write(value);
    return try out.toOwnedSlice();
}

/// Extract f64 from a json.Value (handles both .integer and .float).
/// Returns null for any non-numeric value.
fn jsonToFloat(val: json.Value) ?f64 {
    return switch (val) {
        .float => |f| f,
        .integer => |i| @as(f64, @floatFromInt(i)),
        else => null,
    };
}

/// Format an f64 as a string. Renders integral values without a decimal
/// point. Caller owns the returned slice.
///
/// FIX: the previous version did an unconditional `@intFromFloat(f)`,
/// which is safety-checked illegal behavior (a crash in safe builds, UB in
/// ReleaseFast) for NaN, infinities, and any magnitude outside the i64
/// range — all reachable here because reducer inputs come from arbitrary
/// JSON numbers (e.g. an `add` update of 1e300). Only take the integer
/// path for finite values within the exactly-representable integer range.
fn formatFloat(alloc: Allocator, f: f64) ![]const u8 {
    // 2^53: beyond this an f64 cannot represent every integer exactly, so
    // the float formatting path is the honest representation anyway.
    const max_exact: f64 = 9007199254740992.0;
    if (std.math.isFinite(f) and @abs(f) <= max_exact and @floor(f) == f) {
        const i: i64 = @intFromFloat(f);
        return try std.fmt.allocPrint(alloc, "{d}", .{i});
    }
    return try std.fmt.allocPrint(alloc, "{d}", .{f});
}

// ── Public API ──────────────────────────────────────────────────────────

/// Apply a single reducer to merge old_value + update into new_value.
/// `old_value_json` may be null when the key has no prior value.
/// Returns a newly allocated JSON string owned by the caller.
pub fn applyReducer(alloc: Allocator, reducer: ReducerType, old_value_json: ?[]const u8, update_json: []const u8) ![]const u8 {
    return switch (reducer) {
        .last_value => try alloc.dupe(u8, update_json),
        .append => try applyAppend(alloc, old_value_json, update_json),
        .merge => try applyMerge(alloc, old_value_json, update_json),
        .add => try applyAdd(alloc, old_value_json, update_json),
        .min => try applyMin(alloc, old_value_json, update_json),
        .max => try applyMax(alloc, old_value_json, update_json),
    };
}

/// Apply partial state updates to full state using schema reducers.
/// For each key in updates_json:
/// 1. Look up reducer type from schema_json (format: {"key": {"type": "...", "reducer": "..."}})
/// 2. Get old value from state_json (may be null/missing)
/// 3. Apply reducer(old_value, new_value)
/// 4. 
/// Apply partial state updates to full state using schema reducers.
/// For each key in updates_json:
///   1. Look up the reducer type in schema_json
///      (format: {"key": {"type": "...", "reducer": "..."}}); defaults to last_value.
///   2. Fetch the old value from state_json (may be missing).
///   3. Apply reducer(old_value, new_value).
///   4. Write the result into the output state.
/// Keys present in state but absent from updates are carried over untouched.
/// Returns a newly allocated JSON string owned by the caller.
pub fn applyUpdates(alloc: Allocator, state_json: []const u8, updates_json: []const u8, schema_json: []const u8) ![]const u8 {
    var arena = std.heap.ArenaAllocator.init(alloc);
    defer arena.deinit();
    const arena_alloc = arena.allocator();

    const state_parsed = try json.parseFromSlice(json.Value, arena_alloc, state_json, .{});
    const state_obj = if (state_parsed.value == .object) state_parsed.value.object else json.ObjectMap.init(arena_alloc);

    const updates_parsed = try json.parseFromSlice(json.Value, arena_alloc, updates_json, .{});
    // A non-object update cannot be applied key-wise; state is unchanged.
    if (updates_parsed.value != .object) return try alloc.dupe(u8, state_json);

    const schema_parsed = try json.parseFromSlice(json.Value, arena_alloc, schema_json, .{});
    const schema_obj = if (schema_parsed.value == .object) schema_parsed.value.object else json.ObjectMap.init(arena_alloc);

    // Start with a copy of all existing state keys.
    var result_obj = json.ObjectMap.init(arena_alloc);
    var state_it = state_obj.iterator();
    while (state_it.next()) |entry| {
        try result_obj.put(entry.key_ptr.*, entry.value_ptr.*);
    }

    // For each update key, apply the configured reducer.
    var updates_it = updates_parsed.value.object.iterator();
    while (updates_it.next()) |entry| {
        const key = entry.key_ptr.*;
        const update_str = try serializeValue(arena_alloc, entry.value_ptr.*);

        const reducer_type = reducerForKey(schema_obj, key);

        // Old value as a JSON string, or null when the key is new.
        const old_str: ?[]const u8 = if (state_obj.get(key)) |old_val|
            try serializeValue(arena_alloc, old_val)
        else
            null;

        const new_str = try applyReducer(arena_alloc, reducer_type, old_str, update_str);

        // Parse back into a json.Value so it nests correctly in the result.
        const new_parsed = try json.parseFromSlice(json.Value, arena_alloc, new_str, .{});
        try result_obj.put(key, new_parsed.value);
    }

    const result_str = try serializeValue(arena_alloc, json.Value{ .object = result_obj });
    return try alloc.dupe(u8, result_str);
}

/// Look up the reducer configured for `key` in the schema object.
/// Falls back to .last_value when the key, the "reducer" field, or a
/// recognized reducer name is missing.
fn reducerForKey(schema_obj: json.ObjectMap, key: []const u8) ReducerType {
    const schema_entry = schema_obj.get(key) orelse return .last_value;
    if (schema_entry != .object) return .last_value;
    const reducer_val = schema_entry.object.get("reducer") orelse return .last_value;
    if (reducer_val != .string) return .last_value;
    return ReducerType.fromString(reducer_val.string) orelse .last_value;
}

/// Initialize state from input JSON and schema defaults.
/// For each key in schema:
///   - if the key exists in input -> use the input value
///   - else -> use the declared type's default (see defaultForType).
/// Input keys not present in the schema are dropped. A non-object schema
/// echoes the input unchanged.
/// Returns a newly allocated JSON string owned by the caller.
pub fn initState(alloc: Allocator, input_json: []const u8, schema_json: []const u8) ![]const u8 {
    var arena = std.heap.ArenaAllocator.init(alloc);
    defer arena.deinit();
    const arena_alloc = arena.allocator();

    const input_parsed = try json.parseFromSlice(json.Value, arena_alloc, input_json, .{});
    const input_obj = if (input_parsed.value == .object) input_parsed.value.object else json.ObjectMap.init(arena_alloc);

    const schema_parsed = try json.parseFromSlice(json.Value, arena_alloc, schema_json, .{});
    if (schema_parsed.value != .object) return try alloc.dupe(u8, input_json);

    var result_obj = json.ObjectMap.init(arena_alloc);

    var schema_it = schema_parsed.value.object.iterator();
    while (schema_it.next()) |entry| {
        const key = entry.key_ptr.*;
        if (input_obj.get(key)) |provided| {
            try result_obj.put(key, provided);
        } else {
            try result_obj.put(key, defaultForType(arena_alloc, entry.value_ptr.*));
        }
    }

    const result_str = try serializeValue(arena_alloc, json.Value{ .object = result_obj });
    return try alloc.dupe(u8, result_str);
}

/// Default value for a schema entry's declared "type":
/// "" / [] / 0 / false / {} for string/array/number/boolean/object,
/// JSON null for anything else (including malformed schema entries).
fn defaultForType(alloc: Allocator, schema_entry: json.Value) json.Value {
    const type_str = blk: {
        if (schema_entry == .object) {
            if (schema_entry.object.get("type")) |tv| {
                if (tv == .string) break :blk tv.string;
            }
        }
        break :blk "";
    };

    if (std.mem.eql(u8, type_str, "string")) return .{ .string = "" };
    if (std.mem.eql(u8, type_str, "array")) return .{ .array = json.Array.init(alloc) };
    if (std.mem.eql(u8, type_str, "number")) return .{ .integer = 0 };
    if (std.mem.eql(u8, type_str, "boolean")) return .{ .bool = false };
    if (std.mem.eql(u8, type_str, "object")) return .{ .object = json.ObjectMap.init(alloc) };
    return .null;
}

/// Extract a value from state JSON by dotted path.
/// Supports:
///   - "state.messages"      -> strips the "state." prefix, key lookup
///   - "state.plan.files"    -> nested object access
///   - "state.messages[0]"   -> array indexing
///   - "state.messages[-1]"  -> negative indices count back from the end
/// Returns the value serialized as JSON (caller owns the slice), or null
/// when the path does not resolve.
///
/// FIX: the positive-index path previously never verified the closing ']'
/// (so malformed segments like "xs[1x" were silently accepted); now the
/// bracket suffix must be well-formed. Negative indexing is generalized
/// from the hard-coded "[-1]" to any negative offset, backward-compatibly.
pub fn getStateValue(alloc: Allocator, state_json: []const u8, path: []const u8) !?[]const u8 {
    var arena = std.heap.ArenaAllocator.init(alloc);
    defer arena.deinit();
    const arena_alloc = arena.allocator();

    // Strip the "state." prefix if present.
    const effective_path = if (std.mem.startsWith(u8, path, "state."))
        path["state.".len..]
    else
        path;

    const parsed = try json.parseFromSlice(json.Value, arena_alloc, state_json, .{});
    var current = parsed.value;

    // Split by "." and walk the path.
    var segments = std.mem.splitScalar(u8, effective_path, '.');
    while (segments.next()) |segment| {
        if (std.mem.indexOfScalar(u8, segment, '[')) |bracket_pos| {
            // "key[idx]" — navigate to the key, then index into the array.
            const key = segment[0..bracket_pos];
            const suffix = segment[bracket_pos..];

            if (current != .object) return null;
            current = current.object.get(key) orelse return null;
            if (current != .array) return null;

            // The suffix must be at least "[N]" and end with ']'.
            if (suffix.len < 3 or suffix[suffix.len - 1] != ']') return null;
            const num_str = suffix[1 .. suffix.len - 1];
            const idx_signed = std.fmt.parseInt(i64, num_str, 10) catch return null;

            const items = current.array.items;
            const idx: usize = if (idx_signed < 0) blk: {
                const back = @abs(idx_signed); // u64; safe even at minInt(i64)
                if (back > items.len) return null;
                break :blk items.len - @as(usize, @intCast(back));
            } else (std.math.cast(usize, idx_signed) orelse return null);

            if (idx >= items.len) return null;
            current = items[idx];
        } else {
            if (current != .object) return null;
            current = current.object.get(segment) orelse return null;
        }
    }

    const result_str = try serializeValue(arena_alloc, current);
    return try alloc.dupe(u8, result_str);
}

/// Convert JSON value to string for route matching.
/// Convert JSON value to string for route matching.
/// - true/false      -> "true"/"false"
/// - numbers         -> decimal string representation
/// - "quoted string" -> the inner string with quotes stripped
/// - anything else   -> the raw JSON text, unchanged
/// Caller owns the returned slice.
pub fn stringifyForRoute(alloc: Allocator, value_json: []const u8) ![]const u8 {
    var scratch = std.heap.ArenaAllocator.init(alloc);
    defer scratch.deinit();
    const sa = scratch.allocator();

    const parsed = try json.parseFromSlice(json.Value, sa, value_json, .{});
    return switch (parsed.value) {
        .bool => |flag| try alloc.dupe(u8, if (flag) "true" else "false"),
        .integer => |n| try std.fmt.allocPrint(alloc, "{d}", .{n}),
        .float => |f| blk: {
            const rendered = try formatFloat(sa, f);
            break :blk try alloc.dupe(u8, rendered);
        },
        .string => |inner| try alloc.dupe(u8, inner),
        else => try alloc.dupe(u8, value_json),
    };
}

// ── Reducer implementations ─────────────────────────────────────────────

/// append: when old is null/empty, wrap the update in a one-element array.
/// When old is an array, append the update (spreading an array update into
/// individual elements). When old is a non-array value, produce
/// [old, update] with the update kept as a single element.
fn applyAppend(alloc: Allocator, old_json: ?[]const u8, update_json: []const u8) ![]const u8 {
    var scratch = std.heap.ArenaAllocator.init(alloc);
    defer scratch.deinit();
    const sa = scratch.allocator();

    const update = (try json.parseFromSlice(json.Value, sa, update_json, .{})).value;

    // Parse the prior value, treating null / empty string as "no prior".
    const prior: ?json.Value = blk: {
        const old = old_json orelse break :blk null;
        if (old.len == 0) break :blk null;
        break :blk (try json.parseFromSlice(json.Value, sa, old, .{})).value;
    };

    var combined = json.Array.init(sa);
    if (prior) |old_val| {
        if (old_val == .array) {
            // Existing array: keep its elements, then spread or append.
            for (old_val.array.items) |elem| {
                try combined.append(elem);
            }
            if (update == .array) {
                for (update.array.items) |elem| {
                    try combined.append(elem);
                }
            } else {
                try combined.append(update);
            }
        } else {
            // Old scalar: pair it with the update as-is.
            try combined.append(old_val);
            try combined.append(update);
        }
    } else {
        try combined.append(update);
    }

    const serialized = try serializeValue(sa, json.Value{ .array = combined });
    return try alloc.dupe(u8, serialized);
}

/// merge: deep merge two JSON objects, with update keys winning over old
/// keys; nested objects are merged recursively. When either operand is not
/// an object, the update replaces the old value wholesale.
fn applyMerge(alloc: Allocator, old_json: ?[]const u8, update_json: []const u8) ![]const u8 {
    var scratch = std.heap.ArenaAllocator.init(alloc);
    defer scratch.deinit();
    const sa = scratch.allocator();

    const update = (try json.parseFromSlice(json.Value, sa, update_json, .{})).value;
    if (update != .object) return try alloc.dupe(u8, update_json);

    const old = old_json orelse return try alloc.dupe(u8, update_json);
    if (old.len == 0) return try alloc.dupe(u8, update_json);

    const prior = (try json.parseFromSlice(json.Value, sa, old, .{})).value;
    if (prior != .object) return try alloc.dupe(u8, update_json);

    const merged = try deepMerge(sa, prior, update);
    const serialized = try serializeValue(sa, merged);
    return try alloc.dupe(u8, serialized);
}

/// Recursively deep-merge two JSON values. Object/object pairs merge key
/// by key; anything else resolves in favor of `overlay`.
fn deepMerge(alloc: Allocator, base: json.Value, overlay: json.Value) !json.Value {
    if (base != .object or overlay != .object) {
        return overlay;
    }

    var merged = json.ObjectMap.init(alloc);

    // Seed with every base key.
    var base_it = base.object.iterator();
    while (base_it.next()) |entry| {
        try merged.put(entry.key_ptr.*, entry.value_ptr.*);
    }

    // Overlay keys win; object/object collisions recurse.
    var over_it = overlay.object.iterator();
    while (over_it.next()) |entry| {
        const key = entry.key_ptr.*;
        const incoming = entry.value_ptr.*;
        const resolved: json.Value = blk: {
            if (merged.get(key)) |existing| {
                if (existing == .object and incoming == .object) {
                    break :blk try deepMerge(alloc, existing, incoming);
                }
            }
            break :blk incoming;
        };
        try merged.put(key, resolved);
    }

    return json.Value{ .object = merged };
}

/// add: numeric addition of old + update, both read as f64. A missing,
/// empty, unparsable, or non-numeric old counts as 0. Errors with
/// InvalidNumber when the update itself is not numeric.
fn applyAdd(alloc: Allocator, old_json: ?[]const u8, update_json: []const u8) ![]const u8 {
    var scratch = std.heap.ArenaAllocator.init(alloc);
    defer scratch.deinit();
    const sa = scratch.allocator();

    const update_parsed = try json.parseFromSlice(json.Value, sa, update_json, .{});
    const increment = jsonToFloat(update_parsed.value) orelse return error.InvalidNumber;

    var base: f64 = 0;
    if (old_json) |old| {
        if (old.len != 0) {
            if (json.parseFromSlice(json.Value, sa, old, .{})) |old_parsed| {
                base = jsonToFloat(old_parsed.value) orelse 0;
            } else |_| {}
        }
    }

    return try formatFloat(alloc, base + increment);
}

/// min: keep the smaller of old and update. A missing, empty, unparsable,
/// or non-numeric old yields the update. Errors with InvalidNumber when
/// the update itself is not numeric.
fn applyMin(alloc: Allocator, old_json: ?[]const u8, update_json: []const u8) ![]const u8 {
    var scratch = std.heap.ArenaAllocator.init(alloc);
    defer scratch.deinit();
    const sa = scratch.allocator();

    const update_parsed = try json.parseFromSlice(json.Value, sa, update_json, .{});
    const candidate = jsonToFloat(update_parsed.value) orelse return error.InvalidNumber;

    const prior: ?f64 = blk: {
        const old = old_json orelse break :blk null;
        if (old.len == 0) break :blk null;
        const old_parsed = json.parseFromSlice(json.Value, sa, old, .{}) catch break :blk null;
        break :blk jsonToFloat(old_parsed.value);
    };

    const chosen = if (prior) |p| @min(p, candidate) else candidate;
    return try formatFloat(alloc, chosen);
}

/// max: parse both as numbers, return the larger. If old is null, return update.
/// max: keep the larger of old and update. A missing, empty, unparsable,
/// or non-numeric old yields the update. Errors with InvalidNumber when
/// the update itself is not numeric.
fn applyMax(alloc: Allocator, old_json: ?[]const u8, update_json: []const u8) ![]const u8 {
    var scratch = std.heap.ArenaAllocator.init(alloc);
    defer scratch.deinit();
    const sa = scratch.allocator();

    const update_parsed = try json.parseFromSlice(json.Value, sa, update_json, .{});
    const candidate = jsonToFloat(update_parsed.value) orelse return error.InvalidNumber;

    const prior: ?f64 = blk: {
        const old = old_json orelse break :blk null;
        if (old.len == 0) break :blk null;
        const old_parsed = json.parseFromSlice(json.Value, sa, old, .{}) catch break :blk null;
        break :blk jsonToFloat(old_parsed.value);
    };

    const chosen = if (prior) |p| @max(p, candidate) else candidate;
    return try formatFloat(alloc, chosen);
}

// ── Custom errors ───────────────────────────────────────────────────────

// NOTE(review): this standalone error-set declaration is not referenced by
// the reducers above (they return `error.InvalidNumber` through inferred
// error sets). Presumably kept for future explicit signatures — confirm
// before removing.
const InvalidNumber = error{InvalidNumber};

// ── Tests ───────────────────────────────────────────────────────────────

/// Parse a JSON string with the given allocator; caller deinits the result.
fn parseTestJson(alloc: Allocator, json_str: []const u8) !json.Parsed(json.Value) {
    return try json.parseFromSlice(json.Value, alloc, json_str, .{});
}

test "last_value reducer" {
    const ta = std.testing.allocator;
    const out = try applyReducer(ta, .last_value, "\"old\"", "\"new\"");
    defer ta.free(out);
    try std.testing.expectEqualStrings("\"new\"", out);
}

test "add reducer" {
    const ta = std.testing.allocator;
    const out = try applyReducer(ta, .add, "10", "5");
    defer ta.free(out);
    try std.testing.expectEqualStrings("15", out);
}

test "add reducer with null old" {
    const ta = std.testing.allocator;
    const out = try applyReducer(ta, .add, null, "7");
    defer ta.free(out);
    try std.testing.expectEqualStrings("7", out);
}

test "append reducer" {
    const ta = std.testing.allocator;
    const out = try applyReducer(ta, .append, "[1,2]", "3");
    defer ta.free(out);
    try std.testing.expectEqualStrings("[1,2,3]", out);
}

test "append reducer with null old" {
    const ta = std.testing.allocator;
    const out = try applyReducer(ta, .append, null, "\"hello\"");
    defer ta.free(out);
    try std.testing.expectEqualStrings("[\"hello\"]", out);
}

test "merge reducer - flat objects" {
    const ta = std.testing.allocator;
    const out = try applyReducer(ta, .merge, "{\"a\":1,\"b\":2}", "{\"b\":3,\"c\":4}");
    defer ta.free(out);
    // Parse the result: JSON object key order is not guaranteed.
    const parsed = try parseTestJson(ta, out);
    defer parsed.deinit();
    try std.testing.expect(parsed.value == .object);

    const a = parsed.value.object.get("a") orelse return error.TestUnexpectedResult;
    try std.testing.expectEqual(@as(i64, 1), a.integer);

    const b = parsed.value.object.get("b") orelse return error.TestUnexpectedResult;
    try std.testing.expectEqual(@as(i64, 3), b.integer);

    const c = parsed.value.object.get("c") orelse return error.TestUnexpectedResult;
    try std.testing.expectEqual(@as(i64, 4), c.integer);
}

test "merge reducer - null old" {
    const ta = std.testing.allocator;
    const out = try applyReducer(ta, .merge, null, "{\"x\":1}");
    defer ta.free(out);
    try std.testing.expectEqualStrings("{\"x\":1}", out);
}

test "min reducer" {
    const ta = std.testing.allocator;
    const out = try applyReducer(ta, .min, "10", "3");
    defer ta.free(out);
    try std.testing.expectEqualStrings("3", out);
}

test "max reducer" {
    const ta = std.testing.allocator;
    const out = try applyReducer(ta, .max, "10", "3");
    defer ta.free(out);
    try std.testing.expectEqualStrings("10", out);
}

test "applyUpdates with mixed reducers" {
    const ta = std.testing.allocator;
    const state =
        \\{"count":5,"messages":["hello"],"config":{"a":1}}
    ;
    const updates =
        \\{"count":3,"messages":"world","config":{"b":2}}
    ;
    const schema =
        \\{"count":{"type":"number","reducer":"add"},"messages":{"type":"array","reducer":"append"},"config":{"type":"object","reducer":"merge"}}
    ;

    const out = try applyUpdates(ta, state, updates, schema);
    defer ta.free(out);

    const parsed = try parseTestJson(ta, out);
    defer parsed.deinit();
    try std.testing.expect(parsed.value == .object);

    // count: 5 + 3 = 8
    const count = parsed.value.object.get("count") orelse return error.TestUnexpectedResult;
    try std.testing.expectEqual(@as(i64, 8), count.integer);

    // messages: ["hello"] + "world" = ["hello","world"]
    const messages = parsed.value.object.get("messages") orelse return error.TestUnexpectedResult;
    try std.testing.expect(messages == .array);
    try std.testing.expectEqual(@as(usize, 2), messages.array.items.len);

    // config: merge {a:1} + {b:2} = {a:1, b:2}
    const config = parsed.value.object.get("config") orelse return error.TestUnexpectedResult;
    try std.testing.expect(config == .object);
    try std.testing.expect(config.object.get("a") != null);
    try std.testing.expect(config.object.get("b") != null);
}

test "initState with defaults" {
    const ta = std.testing.allocator;
    const input =
        \\{"prompt":"hi"}
    ;
    const schema =
        \\{"prompt":{"type":"string","reducer":"last_value"},"messages":{"type":"array","reducer":"append"},"count":{"type":"number","reducer":"add"},"done":{"type":"boolean","reducer":"last_value"},"meta":{"type":"object","reducer":"merge"}}
    ;

    const out = try initState(ta, input, schema);
    defer ta.free(out);

    const parsed = try parseTestJson(ta, out);
    defer parsed.deinit();
    try std.testing.expect(parsed.value == .object);

    // prompt should come from the input.
    const prompt = parsed.value.object.get("prompt") orelse return error.TestUnexpectedResult;
    try std.testing.expectEqualStrings("hi", prompt.string);

    // messages should default to [].
    const messages = parsed.value.object.get("messages") orelse return error.TestUnexpectedResult;
    try std.testing.expect(messages == .array);
    try std.testing.expectEqual(@as(usize, 0), messages.array.items.len);

    // count should default to 0.
    const count = parsed.value.object.get("count") orelse return error.TestUnexpectedResult;
    try std.testing.expectEqual(@as(i64, 0), count.integer);

    // done should default to false.
    const done = parsed.value.object.get("done") orelse return error.TestUnexpectedResult;
    try std.testing.expectEqual(false, done.bool);

    // meta should default to {}.
    const meta = parsed.value.object.get("meta") orelse return error.TestUnexpectedResult;
    try std.testing.expect(meta == .object);
    try std.testing.expectEqual(@as(usize, 0), meta.object.count());
}

test "getStateValue simple key" {
    const ta = std.testing.allocator;
    const state =
        \\{"prompt":"hello","count":42}
    ;
    const out = try getStateValue(ta, state, "state.prompt");
    defer if (out) |r| ta.free(r);
    try std.testing.expect(out != null);
    try std.testing.expectEqualStrings("\"hello\"", out.?);
}

test "getStateValue nested" {
    const ta = std.testing.allocator;
    const state =
        \\{"plan":{"files":["a.zig","b.zig"]}}
    ;
    const out = try getStateValue(ta, state, "state.plan.files");
    defer if (out) |r| ta.free(r);
    try std.testing.expect(out != null);
    try std.testing.expectEqualStrings("[\"a.zig\",\"b.zig\"]", out.?);
}

test "getStateValue array last element" {
    const ta = std.testing.allocator;
    const state =
        \\{"messages":["first","second","third"]}
    ;
    const out = try getStateValue(ta, state, "state.messages[-1]");
    defer if (out) |r| ta.free(r);
    try std.testing.expect(out != null);
    try std.testing.expectEqualStrings("\"third\"", out.?);
}
alloc.free(result_true); + try std.testing.expectEqualStrings("true", result_true); + + const result_false = try stringifyForRoute(alloc, "false"); + defer alloc.free(result_false); + try std.testing.expectEqualStrings("false", result_false); +} + +test "stringifyForRoute number" { + const alloc = std.testing.allocator; + const result = try stringifyForRoute(alloc, "42"); + defer alloc.free(result); + try std.testing.expectEqualStrings("42", result); +} + +test "stringifyForRoute string" { + const alloc = std.testing.allocator; + const result = try stringifyForRoute(alloc, "\"hello world\""); + defer alloc.free(result); + try std.testing.expectEqualStrings("hello world", result); +} From 689fd3e7bb7ff18be25d3d2cfa93893b015bf61a Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 18:48:54 -0300 Subject: [PATCH 06/55] feat: rewrite template engine for state-based interpolation --- src/templates.zig | 322 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 321 insertions(+), 1 deletion(-) diff --git a/src/templates.zig b/src/templates.zig index d928522..b2cd211 100644 --- a/src/templates.zig +++ b/src/templates.zig @@ -403,7 +403,327 @@ fn jsonValueToString(allocator: std.mem.Allocator, val: std.json.Value) RenderEr } } -// ── Tests ───────────────────────────────────────────────────────────── +// ── New state-based template engine ─────────────────────────────────── + +const state_mod = @import("state.zig"); +const Allocator = std.mem.Allocator; + +/// Strip surrounding double quotes from a JSON string value. +/// `"hello"` -> `hello`, `42` -> `42`, `[1,2]` -> `[1,2]` +fn stripJsonQuotes(s: []const u8) []const u8 { + if (s.len >= 2 and s[0] == '"' and s[s.len - 1] == '"') { + return s[1 .. s.len - 1]; + } + return s; +} + +/// Look up a value from a JSON blob by dotted path (no prefix stripping). +/// E.g. 
lookupJsonPath(alloc, '{"topic":"AI"}', "topic") -> "AI" +fn lookupJsonPath(alloc: Allocator, json_bytes: []const u8, path: []const u8) !?[]const u8 { + // Reuse state_mod.getStateValue but without "state." prefix. + // getStateValue strips "state." if present, otherwise uses path as-is. + return try state_mod.getStateValue(alloc, json_bytes, path); +} + +/// Resolve a template expression (the text inside `{{ }}`) to a string value. +/// Handles state.X, input.X, item, item.X expressions. +fn resolveNewExpression( + alloc: Allocator, + expr: []const u8, + state_json: []const u8, + input_json: ?[]const u8, + item_json: ?[]const u8, +) ![]const u8 { + if (std.mem.startsWith(u8, expr, "state.")) { + // Use getStateValue which handles "state." prefix, nested paths, [-1] indexing + const raw = try state_mod.getStateValue(alloc, state_json, expr); + if (raw) |r| { + // Strip quotes for strings; leave numbers/bools/arrays/objects as-is + const stripped = stripJsonQuotes(r); + if (stripped.ptr != r.ptr or stripped.len != r.len) { + // It was a quoted string — dupe the unquoted version and free the original + const result = alloc.dupe(u8, stripped) catch return error.OutOfMemory; + alloc.free(r); + return result; + } + return r; + } + return alloc.dupe(u8, "") catch return error.OutOfMemory; + } + + if (std.mem.startsWith(u8, expr, "input.")) { + const ij = input_json orelse { + return alloc.dupe(u8, "") catch return error.OutOfMemory; + }; + const field = expr["input.".len..]; + const raw = try lookupJsonPath(alloc, ij, field); + if (raw) |r| { + const stripped = stripJsonQuotes(r); + if (stripped.ptr != r.ptr or stripped.len != r.len) { + const result = alloc.dupe(u8, stripped) catch return error.OutOfMemory; + alloc.free(r); + return result; + } + return r; + } + return alloc.dupe(u8, "") catch return error.OutOfMemory; + } + + if (std.mem.eql(u8, expr, "item")) { + if (item_json) |ij| { + const stripped = stripJsonQuotes(ij); + return alloc.dupe(u8, stripped) catch 
return error.OutOfMemory; + } + return alloc.dupe(u8, "") catch return error.OutOfMemory; + } + + if (std.mem.startsWith(u8, expr, "item.")) { + const ij = item_json orelse { + return alloc.dupe(u8, "") catch return error.OutOfMemory; + }; + const field = expr["item.".len..]; + const raw = try lookupJsonPath(alloc, ij, field); + if (raw) |r| { + const stripped = stripJsonQuotes(r); + if (stripped.ptr != r.ptr or stripped.len != r.len) { + const result = alloc.dupe(u8, stripped) catch return error.OutOfMemory; + alloc.free(r); + return result; + } + return r; + } + return alloc.dupe(u8, "") catch return error.OutOfMemory; + } + + // Unknown expression — return empty + return alloc.dupe(u8, "") catch return error.OutOfMemory; +} + +/// Check if a condition expression is truthy for the new template engine. +/// Truthy: non-null, non-empty, not "false", not "0", not "null", not empty array "[]" +fn isNewTruthy( + alloc: Allocator, + expr: []const u8, + state_json: []const u8, + input_json: ?[]const u8, + item_json: ?[]const u8, +) bool { + const value = resolveNewExpression(alloc, expr, state_json, input_json, item_json) catch return false; + defer alloc.free(value); + + if (value.len == 0) return false; + if (std.mem.eql(u8, value, "false")) return false; + if (std.mem.eql(u8, value, "0")) return false; + if (std.mem.eql(u8, value, "null")) return false; + if (std.mem.eql(u8, value, "[]")) return false; + return true; +} + +/// Process `{% if expr %}...{% endif %}` conditional blocks for the new engine. 
+fn processNewConditionals( + alloc: Allocator, + template: []const u8, + state_json: []const u8, + input_json: ?[]const u8, + item_json: ?[]const u8, +) ![]const u8 { + var result: std.ArrayListUnmanaged(u8) = .empty; + errdefer result.deinit(alloc); + + var pos: usize = 0; + + while (pos < template.len) { + if (std.mem.indexOfPos(u8, template, pos, "{%")) |open| { + result.appendSlice(alloc, template[pos..open]) catch return error.OutOfMemory; + + const after_open = open + 2; + const close = std.mem.indexOfPos(u8, template, after_open, "%}") orelse + return error.OutOfMemory; + const tag_content = std.mem.trim(u8, template[after_open..close], " \t\n\r"); + const after_tag = close + 2; + + if (std.mem.startsWith(u8, tag_content, "if ")) { + const expr = std.mem.trim(u8, tag_content["if ".len..], " \t\n\r"); + + // Find matching {% endif %} at this nesting level + var depth: usize = 0; + var scan: usize = after_tag; + var else_start: ?usize = null; + var else_end: ?usize = null; + var endif_start: ?usize = null; + var endif_end: ?usize = null; + + while (scan < template.len) { + if (std.mem.indexOfPos(u8, template, scan, "{%")) |inner_open| { + const inner_after = inner_open + 2; + const inner_close = std.mem.indexOfPos(u8, template, inner_after, "%}") orelse + return error.OutOfMemory; + const inner_tag = std.mem.trim(u8, template[inner_after..inner_close], " \t\n\r"); + const inner_after_tag = inner_close + 2; + + if (std.mem.startsWith(u8, inner_tag, "if ")) { + depth += 1; + scan = inner_after_tag; + } else if (std.mem.eql(u8, inner_tag, "else") and depth == 0) { + else_start = inner_open; + else_end = inner_after_tag; + scan = inner_after_tag; + } else if (std.mem.eql(u8, inner_tag, "endif")) { + if (depth == 0) { + endif_start = inner_open; + endif_end = inner_after_tag; + break; + } + depth -= 1; + scan = inner_after_tag; + } else { + scan = inner_after_tag; + } + } else { + break; + } + } + + if (endif_end == null) { + return error.OutOfMemory; + } + + 
const truthy = isNewTruthy(alloc, expr, state_json, input_json, item_json); + + if (truthy) { + const branch_end = else_start orelse endif_start.?; + const branch = template[after_tag..branch_end]; + const processed = try processNewConditionals(alloc, branch, state_json, input_json, item_json); + defer alloc.free(processed); + result.appendSlice(alloc, processed) catch return error.OutOfMemory; + } else { + if (else_end) |ee| { + const branch = template[ee..endif_start.?]; + const processed = try processNewConditionals(alloc, branch, state_json, input_json, item_json); + defer alloc.free(processed); + result.appendSlice(alloc, processed) catch return error.OutOfMemory; + } + } + + pos = endif_end.?; + } else { + result.appendSlice(alloc, template[open..after_tag]) catch return error.OutOfMemory; + pos = after_tag; + } + } else { + result.appendSlice(alloc, template[pos..]) catch return error.OutOfMemory; + break; + } + } + + return result.toOwnedSlice(alloc) catch return error.OutOfMemory; +} + +/// Render a template using the new state-based interpolation syntax. +/// +/// Supported expressions: +/// - `{{state.X}}` — state key value +/// - `{{state.X.Y}}` — nested state access +/// - `{{state.X[-1]}}` — last array element from state +/// - `{{input.X}}` — original input (read-only) +/// - `{{item}}` — current item in send context +/// - `{{item.X}}` — nested access on item +/// - `{% if state.X %}...{% endif %}` — conditionals +/// +/// Processing order: +/// 1. Process `{% if ... %}...{% endif %}` blocks +/// 2. 
Process `{{...}}` interpolations +pub fn renderTemplate( + alloc: Allocator, + template: []const u8, + state_json: []const u8, + input_json: ?[]const u8, + item_json: ?[]const u8, +) ![]const u8 { + // Phase 1: Process conditional blocks + const preprocessed = try processNewConditionals(alloc, template, state_json, input_json, item_json); + defer alloc.free(preprocessed); + + // Phase 2: Resolve {{expression}} substitutions + var result: std.ArrayListUnmanaged(u8) = .empty; + errdefer result.deinit(alloc); + + var pos: usize = 0; + + while (pos < preprocessed.len) { + if (std.mem.indexOfPos(u8, preprocessed, pos, "{{")) |open| { + result.appendSlice(alloc, preprocessed[pos..open]) catch return error.OutOfMemory; + + const after_open = open + 2; + if (std.mem.indexOfPos(u8, preprocessed, after_open, "}}")) |close| { + const raw_expr = preprocessed[after_open..close]; + const expr = std.mem.trim(u8, raw_expr, " \t\n\r"); + + const value = try resolveNewExpression(alloc, expr, state_json, input_json, item_json); + defer alloc.free(value); + + result.appendSlice(alloc, value) catch return error.OutOfMemory; + pos = close + 2; + } else { + // Unterminated — just append the rest as literal + result.appendSlice(alloc, preprocessed[pos..]) catch return error.OutOfMemory; + break; + } + } else { + result.appendSlice(alloc, preprocessed[pos..]) catch return error.OutOfMemory; + break; + } + } + + return result.toOwnedSlice(alloc) catch return error.OutOfMemory; +} + +// ── New template engine tests ───────────────────────────────────────── + +test "template state interpolation" { + const alloc = std.testing.allocator; + const s = "{\"name\":\"test\",\"count\":42}"; + const result = try renderTemplate(alloc, "Hello {{state.name}}, count={{state.count}}", s, null, null); + defer alloc.free(result); + try std.testing.expectEqualStrings("Hello test, count=42", result); +} + +test "template input interpolation" { + const alloc = std.testing.allocator; + const result = try 
renderTemplate(alloc, "Topic: {{input.topic}}", "{}", "{\"topic\":\"AI\"}", null); + defer alloc.free(result); + try std.testing.expectEqualStrings("Topic: AI", result); +} + +test "template item interpolation" { + const alloc = std.testing.allocator; + const result = try renderTemplate(alloc, "File: {{item.path}}", "{}", null, "{\"path\":\"main.py\"}"); + defer alloc.free(result); + try std.testing.expectEqualStrings("File: main.py", result); +} + +test "template conditional true" { + const alloc = std.testing.allocator; + const result = try renderTemplate(alloc, "{% if state.name %}Hi {{state.name}}{% endif %}", "{\"name\":\"Bob\"}", null, null); + defer alloc.free(result); + try std.testing.expectEqualStrings("Hi Bob", result); +} + +test "template conditional false" { + const alloc = std.testing.allocator; + const result = try renderTemplate(alloc, "{% if state.missing %}hidden{% endif %}visible", "{}", null, null); + defer alloc.free(result); + try std.testing.expectEqualStrings("visible", result); +} + +test "template no interpolation" { + const alloc = std.testing.allocator; + const result = try renderTemplate(alloc, "plain text", "{}", null, null); + defer alloc.free(result); + try std.testing.expectEqualStrings("plain text", result); +} + +// ── Old template engine tests ───────────────────────────────────────── test "render literal text unchanged" { const allocator = std.testing.allocator; From 3f4124512ecf7e25ec83bbe7e0c84a99d4a21606 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 18:54:25 -0300 Subject: [PATCH 07/55] feat: rewrite workflow validation for new graph format --- src/workflow_validation.zig | 571 +++++++++++++++++++++++++++++++++++- 1 file changed, 570 insertions(+), 1 deletion(-) diff --git a/src/workflow_validation.zig b/src/workflow_validation.zig index 3374e94..f0ae69e 100644 --- a/src/workflow_validation.zig +++ b/src/workflow_validation.zig @@ -1,4 +1,7 @@ const std = @import("std"); +const Allocator = 
std.mem.Allocator;
+
+// ── Legacy validation (used by api.zig for POST /runs) ────────────────
 
 pub const ValidateError = error{
     StepMustBeObject,
@@ -140,7 +143,478 @@ fn validateExecutionControls(step_obj: std.json.ObjectMap) ValidateError!void {
     }
 }
 
-// ── Tests ─────────────────────────────────────────────────────────────
+// ── New graph-based workflow validation ───────────────────────────────
+
+pub const ValidationError = struct {
+    err_type: []const u8,
+    node: ?[]const u8,
+    key: ?[]const u8,
+    message: []const u8,
+};
+
+/// Validate a workflow definition JSON (new graph format).
+/// Returns a slice of ValidationError; caller owns both the slice and every
+/// string field inside each error (all strings are duped into alloc-owned
+/// memory by copyErrors), so free each field, then free the slice itself.
+pub fn validate(alloc: Allocator, definition_json: []const u8) ![]ValidationError {
+    var errors: std.ArrayListUnmanaged(ValidationError) = .empty;
+    defer errors.deinit(alloc);
+
+    const parsed = std.json.parseFromSlice(std.json.Value, alloc, definition_json, .{}) catch {
+        try errors.append(alloc, .{
+            .err_type = "parse_error",
+            .node = null,
+            .key = null,
+            .message = "failed to parse workflow JSON",
+        });
+        return errors.toOwnedSlice(alloc);
+    };
+    defer parsed.deinit();
+
+    if (parsed.value != .object) {
+        try errors.append(alloc, .{
+            .err_type = "parse_error",
+            .node = null,
+            .key = null,
+            .message = "workflow must be a JSON object",
+        });
+        return errors.toOwnedSlice(alloc);
+    }
+    const root = parsed.value.object;
+
+    // Extract nodes map
+    const nodes_val = root.get("nodes") orelse {
+        try errors.append(alloc, .{
+            .err_type = "missing_field",
+            .node = null,
+            .key = "nodes",
+            .message = "workflow must have a 'nodes' object",
+        });
+        return errors.toOwnedSlice(alloc);
+    };
+    if (nodes_val != .object) {
+        try errors.append(alloc, .{
+            .err_type = "missing_field",
+            .node = null,
+            .key = "nodes",
+            .message = "'nodes' must be an object",
}); + return errors.toOwnedSlice(alloc); + } + const nodes = nodes_val.object; + + // Extract edges array + const edges_val = root.get("edges") orelse { + try errors.append(alloc, .{ + .err_type = "missing_field", + .node = null, + .key = "edges", + .message = "workflow must have an 'edges' array", + }); + return errors.toOwnedSlice(alloc); + }; + if (edges_val != .array) { + try errors.append(alloc, .{ + .err_type = "missing_field", + .node = null, + .key = "edges", + .message = "'edges' must be an array", + }); + return errors.toOwnedSlice(alloc); + } + const edges = edges_val.array.items; + + // Extract state_schema (may be absent or empty object) + var state_schema: ?std.json.ObjectMap = null; + if (root.get("state_schema")) |ss_val| { + if (ss_val == .object) state_schema = ss_val.object; + } + + // --- Collect send target_nodes (exempt from reachability) --- + var send_targets = std.StringHashMap(void).init(alloc); + defer send_targets.deinit(); + var node_it = nodes.iterator(); + while (node_it.next()) |entry| { + const nobj = entry.value_ptr.*; + if (nobj != .object) continue; + const ntype = getJsonStringFromObj(nobj.object, "type") orelse continue; + if (std.mem.eql(u8, ntype, "send")) { + if (getJsonStringFromObj(nobj.object, "target_node")) |tn| { + try send_targets.put(tn, {}); + } + } + } + + // --- Check 1: nodes_in_edges_exist --- + // Build adjacency list while we're at it + // Edge source format: "node" or "node:route_value" + // We'll parse edge sources to get the actual node name + var edge_sources: std.ArrayListUnmanaged([]const u8) = .empty; + defer edge_sources.deinit(alloc); + var edge_targets: std.ArrayListUnmanaged([]const u8) = .empty; + defer edge_targets.deinit(alloc); + + for (edges) |edge_val| { + if (edge_val != .array or edge_val.array.items.len < 2) continue; + const src_raw = if (edge_val.array.items[0] == .string) edge_val.array.items[0].string else continue; + const tgt = if (edge_val.array.items[1] == .string) 
edge_val.array.items[1].string else continue; + + // Parse "node:route_value" -> node name + const src_node = edgeSourceNode(src_raw); + + try edge_sources.append(alloc, src_raw); + try edge_targets.append(alloc, tgt); + + // Check source node exists (skip __start__, __end__) + if (!isReserved(src_node)) { + if (!nodes.contains(src_node)) { + try errors.append(alloc, .{ + .err_type = "nodes_in_edges_exist", + .node = src_node, + .key = null, + .message = "edge source node does not exist in nodes map", + }); + } + } + // Check target node exists (skip __start__, __end__) + if (!isReserved(tgt)) { + if (!nodes.contains(tgt)) { + try errors.append(alloc, .{ + .err_type = "nodes_in_edges_exist", + .node = tgt, + .key = null, + .message = "edge target node does not exist in nodes map", + }); + } + } + } + + // --- Build reachability set from __start__ --- + // We do a BFS/DFS using static edges only (not send target_nodes). + var reachable = std.StringHashMap(void).init(alloc); + defer reachable.deinit(); + var queue: std.ArrayListUnmanaged([]const u8) = .empty; + defer queue.deinit(alloc); + + try reachable.put("__start__", {}); + try queue.append(alloc, "__start__"); + + var qi: usize = 0; + while (qi < queue.items.len) : (qi += 1) { + const current = queue.items[qi]; + for (edge_sources.items, edge_targets.items) |src_raw, tgt| { + const src_node = edgeSourceNode(src_raw); + if (std.mem.eql(u8, src_node, current) or std.mem.eql(u8, src_raw, current)) { + if (!reachable.contains(tgt)) { + try reachable.put(tgt, {}); + try queue.append(alloc, tgt); + } + } + } + } + + // --- Check 2: unreachable_node --- + node_it = nodes.iterator(); + while (node_it.next()) |entry| { + const nname = entry.key_ptr.*; + if (reachable.contains(nname)) continue; + // Exempt send target_nodes + if (send_targets.contains(nname)) continue; + try errors.append(alloc, .{ + .err_type = "unreachable_node", + .node = nname, + .key = null, + .message = "node is not reachable from __start__", + }); 
+ } + + // --- Check 3: end_unreachable --- + // __end__ must be reachable from __start__ (simple check: it appears in + // reachable set, or at least one edge targets __end__). + // For leaf nodes that are not send_targets, there should be a path to __end__. + // We do a simplified check: __end__ must be in the reachable set. + if (!reachable.contains("__end__")) { + try errors.append(alloc, .{ + .err_type = "end_unreachable", + .node = null, + .key = null, + .message = "__end__ is not reachable from __start__", + }); + } + + // --- Check 4: unintentional_cycle --- + // Detect cycles via DFS. Edges from route nodes (src contains ':') back to + // earlier nodes are intentional. Other back-edges are cycles (errors). + { + const CycleState = enum { unvisited, in_stack, done }; + var cycle_state = std.StringHashMap(CycleState).init(alloc); + defer cycle_state.deinit(); + + // Initialize all known nodes + node_it = nodes.iterator(); + while (node_it.next()) |entry| { + try cycle_state.put(entry.key_ptr.*, .unvisited); + } + try cycle_state.put("__start__", .unvisited); + try cycle_state.put("__end__", .unvisited); + + // We need to track which src_raw produced the edge to know if it's a route edge + // Build adjacency: node -> list of (tgt, src_raw_is_route) + const EdgeInfo = struct { tgt: []const u8, from_route: bool }; + var adj = std.StringHashMap(std.ArrayListUnmanaged(EdgeInfo)).init(alloc); + defer { + var adj_it = adj.iterator(); + while (adj_it.next()) |e| e.value_ptr.deinit(alloc); + adj.deinit(); + } + + for (edge_sources.items, edge_targets.items) |src_raw, tgt| { + const src_node = edgeSourceNode(src_raw); + const is_route_edge = std.mem.indexOfScalar(u8, src_raw, ':') != null; + const res = try adj.getOrPut(src_node); + if (!res.found_existing) { + res.value_ptr.* = .empty; + } + try res.value_ptr.append(alloc, .{ .tgt = tgt, .from_route = is_route_edge }); + } + + // Iterative DFS + var visited_for_dfs = std.StringHashMap(CycleState).init(alloc); + defer 
visited_for_dfs.deinit(); + + // Initialize + var cs_it = cycle_state.iterator(); + while (cs_it.next()) |e| { + try visited_for_dfs.put(e.key_ptr.*, .unvisited); + } + + var dfs_nodes: std.ArrayListUnmanaged([]const u8) = .empty; + defer dfs_nodes.deinit(alloc); + var cs_it2 = cycle_state.iterator(); + while (cs_it2.next()) |e| { + try dfs_nodes.append(alloc, e.key_ptr.*); + } + + for (dfs_nodes.items) |start_node| { + const s = visited_for_dfs.get(start_node) orelse .unvisited; + if (s != .unvisited) continue; + + // DFS iterative with path tracking + var path = std.StringHashMap(void).init(alloc); + defer path.deinit(); + + const DfsEntry = struct { node: []const u8, child_idx: usize }; + var stack: std.ArrayListUnmanaged(DfsEntry) = .empty; + defer stack.deinit(alloc); + + try stack.append(alloc, .{ .node = start_node, .child_idx = 0 }); + try path.put(start_node, {}); + visited_for_dfs.put(start_node, .in_stack) catch {}; + + while (stack.items.len > 0) { + const top = &stack.items[stack.items.len - 1]; + const neighbors = adj.get(top.node); + if (neighbors == null or top.child_idx >= neighbors.?.items.len) { + // Done with this node + _ = path.remove(top.node); + visited_for_dfs.put(top.node, .done) catch {}; + _ = stack.pop(); + continue; + } + const neighbor = neighbors.?.items[top.child_idx]; + top.child_idx += 1; + + const tgt = neighbor.tgt; + const from_route = neighbor.from_route; + + // Skip reserved endpoints for cycle detection + if (isReserved(tgt)) continue; + + const tgt_state = visited_for_dfs.get(tgt) orelse .unvisited; + if (tgt_state == .in_stack) { + // Back edge found — cycle + if (!from_route) { + // Report cycle error only once per target + var already_reported = false; + for (errors.items) |e| { + if (std.mem.eql(u8, e.err_type, "unintentional_cycle") and + e.node != null and std.mem.eql(u8, e.node.?, tgt)) + { + already_reported = true; + break; + } + } + if (!already_reported) { + try errors.append(alloc, .{ + .err_type = 
"unintentional_cycle", + .node = tgt, + .key = null, + .message = "cycle detected: non-route edge creates a cycle", + }); + } + } + // Intentional route cycle — skip + } else if (tgt_state == .unvisited) { + visited_for_dfs.put(tgt, .in_stack) catch {}; + try path.put(tgt, {}); + try stack.append(alloc, .{ .node = tgt, .child_idx = 0 }); + } + // .done: already processed, no cycle through this path + } + } + } + + // --- Check 5: undefined_state_key --- + if (state_schema) |schema| { + node_it = nodes.iterator(); + while (node_it.next()) |entry| { + const nname = entry.key_ptr.*; + const nval = entry.value_ptr.*; + if (nval != .object) continue; + const nobj = nval.object; + + // Check prompt field + if (getJsonStringFromObj(nobj, "prompt")) |prompt| { + try checkStateRefs(alloc, &errors, schema, nname, "prompt", prompt); + } + // Check message field (interrupt) + if (getJsonStringFromObj(nobj, "message")) |msg| { + try checkStateRefs(alloc, &errors, schema, nname, "message", msg); + } + } + } + + // --- Check 6: invalid_route_target --- + node_it = nodes.iterator(); + while (node_it.next()) |entry| { + const nname = entry.key_ptr.*; + const nval = entry.value_ptr.*; + if (nval != .object) continue; + const nobj = nval.object; + const ntype = getJsonStringFromObj(nobj, "type") orelse continue; + if (!std.mem.eql(u8, ntype, "route")) continue; + + const routes_val = nobj.get("routes") orelse continue; + if (routes_val != .object) continue; + var routes_it = routes_val.object.iterator(); + while (routes_it.next()) |re| { + const target = if (re.value_ptr.* == .string) re.value_ptr.*.string else continue; + if (!nodes.contains(target)) { + try errors.append(alloc, .{ + .err_type = "invalid_route_target", + .node = nname, + .key = re.key_ptr.*, + .message = "route target node does not exist", + }); + } + } + } + + // --- Check 7: invalid_send_target --- + node_it = nodes.iterator(); + while (node_it.next()) |entry| { + const nname = entry.key_ptr.*; + const nval = 
entry.value_ptr.*; + if (nval != .object) continue; + const nobj = nval.object; + const ntype = getJsonStringFromObj(nobj, "type") orelse continue; + if (!std.mem.eql(u8, ntype, "send")) continue; + + if (getJsonStringFromObj(nobj, "target_node")) |tn| { + if (!nodes.contains(tn)) { + try errors.append(alloc, .{ + .err_type = "invalid_send_target", + .node = nname, + .key = "target_node", + .message = "send target_node does not exist in nodes map", + }); + } + } + } + + // The errors list contains slices pointing into `parsed` which will be + // freed by `defer parsed.deinit()`. We need to copy all strings into + // alloc-owned memory before returning. + const result = try copyErrors(alloc, errors.items); + return result; +} + +// ── Helpers ─────────────────────────────────────────────────────────── + +fn isReserved(name: []const u8) bool { + return std.mem.eql(u8, name, "__start__") or std.mem.eql(u8, name, "__end__"); +} + +/// Given a raw edge source like "node:route_value", return "node". +/// If no colon, returns the whole string. +fn edgeSourceNode(src_raw: []const u8) []const u8 { + if (std.mem.indexOfScalar(u8, src_raw, ':')) |colon_pos| { + return src_raw[0..colon_pos]; + } + return src_raw; +} + +fn getJsonStringFromObj(obj: std.json.ObjectMap, key: []const u8) ?[]const u8 { + const val = obj.get(key) orelse return null; + if (val == .string) return val.string; + return null; +} + +/// Scan `text` for {{state.KEY}} references and check them against schema. +fn checkStateRefs( + alloc: Allocator, + errors: *std.ArrayListUnmanaged(ValidationError), + schema: std.json.ObjectMap, + node_name: []const u8, + _field_name: []const u8, + text: []const u8, +) !void { + _ = _field_name; + var pos: usize = 0; + while (pos < text.len) { + // Find "{{" + const open = std.mem.indexOfPos(u8, text, pos, "{{") orelse break; + const close = std.mem.indexOfPos(u8, text, open + 2, "}}") orelse break; + const expr = text[open + 2 .. 
close]; + pos = close + 2; + + // Check if it's "state.KEY" + if (std.mem.startsWith(u8, expr, "state.")) { + const key = expr["state.".len..]; + if (key.len > 0 and !schema.contains(key)) { + // Copy strings to avoid dangling references after parsed.deinit() + // (We'll do a bulk copy in copyErrors later, but here we need + // to store enough info. We store literals or slices into + // node_name/field_name which come from the parsed JSON tree; + // copyErrors will deep-copy them.) + try errors.append(alloc, .{ + .err_type = "undefined_state_key", + .node = node_name, + .key = key, + .message = "state key referenced in template is not defined in state_schema", + }); + } + } + } +} + +/// Deep-copy all strings in the error list into alloc-owned memory. +/// This is needed because the source strings point into a parsed JSON tree +/// that will be freed after validate() returns. +fn copyErrors(alloc: Allocator, src: []const ValidationError) ![]ValidationError { + const result = try alloc.alloc(ValidationError, src.len); + for (src, 0..) 
|e, i| { + result[i] = .{ + .err_type = try alloc.dupe(u8, e.err_type), + .node = if (e.node) |n| try alloc.dupe(u8, n) else null, + .key = if (e.key) |k| try alloc.dupe(u8, k) else null, + .message = try alloc.dupe(u8, e.message), + }; + } + return result; +} + +// ── Tests: legacy ───────────────────────────────────────────────────── test "validateStepsForCreateRun: valid workflow" { const allocator = std.testing.allocator; @@ -378,3 +852,98 @@ test "validateStepsForCreateRun: rejects invalid wait signal type" { defer parsed.deinit(); try std.testing.expectError(error.WaitSignalInvalid, validateStepsForCreateRun(allocator, parsed.value.array.items)); } + +// ── Tests: new graph validation ──────────────────────────────────────── + +test "validate valid simple workflow" { + const alloc = std.testing.allocator; + const wf = + \\{"state_schema":{"msg":{"type":"string","reducer":"last_value"}},"nodes":{"a":{"type":"task","prompt":"{{state.msg}}"}},"edges":[["__start__","a"],["a","__end__"]]} + ; + const errors = try validate(alloc, wf); + defer { + for (errors) |e| { + alloc.free(e.err_type); + if (e.node) |n| alloc.free(n); + if (e.key) |k| alloc.free(k); + alloc.free(e.message); + } + alloc.free(errors); + } + try std.testing.expectEqual(@as(usize, 0), errors.len); +} + +test "validate unreachable node" { + const alloc = std.testing.allocator; + const wf = + \\{"state_schema":{},"nodes":{"a":{"type":"task","prompt":"x"},"orphan":{"type":"task","prompt":"y"}},"edges":[["__start__","a"],["a","__end__"]]} + ; + const errors = try validate(alloc, wf); + defer { + for (errors) |e| { + alloc.free(e.err_type); + if (e.node) |n| alloc.free(n); + if (e.key) |k| alloc.free(k); + alloc.free(e.message); + } + alloc.free(errors); + } + try std.testing.expect(errors.len > 0); + try std.testing.expectEqualStrings("unreachable_node", errors[0].err_type); +} + +test "validate undefined state key" { + const alloc = std.testing.allocator; + const wf = + 
\\{"state_schema":{"msg":{"type":"string","reducer":"last_value"}},"nodes":{"a":{"type":"task","prompt":"{{state.typo}}"}},"edges":[["__start__","a"],["a","__end__"]]} + ; + const errors = try validate(alloc, wf); + defer { + for (errors) |e| { + alloc.free(e.err_type); + if (e.node) |n| alloc.free(n); + if (e.key) |k| alloc.free(k); + alloc.free(e.message); + } + alloc.free(errors); + } + try std.testing.expect(errors.len > 0); + try std.testing.expectEqualStrings("undefined_state_key", errors[0].err_type); +} + +test "validate send target exempt from reachability" { + const alloc = std.testing.allocator; + const wf = + \\{"state_schema":{"items":{"type":"array","reducer":"last_value"},"results":{"type":"array","reducer":"append"}},"nodes":{"s":{"type":"send","items_key":"state.items","target_node":"worker","output_key":"results"},"worker":{"type":"task","prompt":"do work"}},"edges":[["__start__","s"],["s","__end__"]]} + ; + const errors = try validate(alloc, wf); + defer { + for (errors) |e| { + alloc.free(e.err_type); + if (e.node) |n| alloc.free(n); + if (e.key) |k| alloc.free(k); + alloc.free(e.message); + } + alloc.free(errors); + } + try std.testing.expectEqual(@as(usize, 0), errors.len); +} + +test "validate invalid route target" { + const alloc = std.testing.allocator; + const wf = + \\{"state_schema":{"x":{"type":"string","reducer":"last_value"}},"nodes":{"r":{"type":"route","input":"state.x","routes":{"a":"nonexistent"}}},"edges":[["__start__","r"],["r:a","nonexistent"]]} + ; + const errors = try validate(alloc, wf); + defer { + for (errors) |e| { + alloc.free(e.err_type); + if (e.node) |n| alloc.free(n); + if (e.key) |k| alloc.free(k); + alloc.free(e.message); + } + alloc.free(errors); + } + // Should have error about nonexistent node (either in route target or edge target) + try std.testing.expect(errors.len > 0); +} From 28c7bd90d6ba73db26a354e587c14017d421710d Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 19:07:47 -0300 Subject: 
[PATCH 08/55] feat: rewrite engine with unified state model Replace the old 14-step-type engine with a graph-based state model using 6 node types (task, route, interrupt, agent, send, transform). The engine now processes workflows as a DAG with edges, applying state updates through reducers and saving checkpoints after each node. Key changes: - processRun loops finding ready nodes until no more progress - findReadyNodes with dead-node detection for conditional routing - State flows through applyUpdates/reducers instead of templates - Route nodes use conditional edges (source:value) for branching - Interrupt nodes pause the run with checkpoint preservation - Transform nodes apply static updates without worker dispatch - Send nodes dispatch target_node per item from state array Also adds state_json to RunRow and updates store queries to include it. --- src/engine.zig | 5797 +++++++++--------------------------------------- src/store.zig | 17 +- src/types.zig | 1 + 3 files changed, 1001 insertions(+), 4814 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index c3b3baa..19104a4 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -1,15 +1,22 @@ -/// DAG Engine — Scheduler Loop +/// DAG Engine — Unified State Model Scheduler /// /// The engine runs on its own thread, polling the database for active runs -/// and processing their steps according to the DAG dependencies. +/// and processing them using a graph-based state model with 6 node types: +/// task, route, interrupt, agent, send, transform /// /// Each tick: -/// 1. Get active runs -/// 2. For each run, promote pending steps to ready -/// 3. Process ready steps by type (task, fan_out, map, reduce, condition, approval) -/// 4. Check run completion +/// 1. Get active runs (status = running) +/// 2. For each run: +/// a. Load current state from run.state_json +/// b. Load workflow definition from run.workflow_json +/// c. Get completed nodes from latest checkpoint (or []) +/// d. 
Find ready nodes (all nodes whose inbound edges are satisfied) +/// e. Execute ready nodes in sequence +/// f. Apply state updates via reducers, save checkpoint +/// g. Check termination / deadlock const std = @import("std"); const log = std.log.scoped(.engine); +const json = std.json; const Store = @import("store.zig").Store; const types = @import("types.zig"); @@ -19,6 +26,9 @@ const dispatch = @import("dispatch.zig"); const callbacks = @import("callbacks.zig"); const metrics_mod = @import("metrics.zig"); const async_dispatch = @import("async_dispatch.zig"); +const state_mod = @import("state.zig"); + +// TODO: add SseHub integration in Task 12 // ── Engine ─────────────────────────────────────────────────────────── @@ -42,11 +52,6 @@ pub const Engine = struct { metrics: ?*metrics_mod.Metrics, response_queue: ?*async_dispatch.ResponseQueue, - const TaskPromptSource = union(enum) { - rendered: []const u8, - template: []const u8, - }; - pub fn init(store: *Store, allocator: std.mem.Allocator, poll_interval_ms: u64) Engine { return .{ .store = store, @@ -137,201 +142,381 @@ pub const Engine = struct { } } - // ── processRun ─────────────────────────────────────────────────── + // ── processRun — state-based graph execution ───────────────────── fn processRun(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow) !void { - // 1. Get all steps for this run - const steps = try self.store.getStepsByRun(alloc, run_row.id); - - // 2. Promote pending -> ready: for each pending step, check if - // all its deps are completed/skipped. 
- for (steps) |step| { - if (!std.mem.eql(u8, step.status, "pending")) continue; - - const dep_ids = try self.store.getStepDeps(alloc, step.id); - var all_deps_met = true; - - for (dep_ids) |dep_id| { - // Find the dep step status from our already-fetched steps - const dep_status = findStepStatus(steps, dep_id); - if (dep_status) |ds| { - if (!std.mem.eql(u8, ds, "completed") and !std.mem.eql(u8, ds, "skipped")) { - all_deps_met = false; - break; + // 1. Load current state + const current_state = run_row.state_json orelse "{}"; + + // 2. Load workflow definition + const workflow_json = run_row.workflow_json; + + // 3. Get completed nodes from latest checkpoint + var completed_nodes = std.StringHashMap(void).init(alloc); + var route_results = std.StringHashMap([]const u8).init(alloc); + + const latest_checkpoint = try self.store.getLatestCheckpoint(alloc, run_row.id); + if (latest_checkpoint) |cp| { + // Parse completed_nodes_json array + const cn_parsed = json.parseFromSlice(json.Value, alloc, cp.completed_nodes_json, .{}) catch null; + if (cn_parsed) |p| { + if (p.value == .array) { + for (p.value.array.items) |item| { + if (item == .string) { + try completed_nodes.put(item.string, {}); + } } - } else { - // Dep step not found — treat as unmet - all_deps_met = false; - break; } } - if (all_deps_met) { - try self.store.updateStepStatus(step.id, "ready", null, null, null, step.attempt); - log.info("promoted step {s} to ready", .{step.id}); + // Parse route results from checkpoint metadata + if (cp.metadata_json) |meta_str| { + const meta_parsed = json.parseFromSlice(json.Value, alloc, meta_str, .{}) catch null; + if (meta_parsed) |mp| { + if (mp.value == .object) { + if (mp.value.object.get("route_results")) |rr| { + if (rr == .object) { + var it = rr.object.iterator(); + while (it.next()) |entry| { + if (entry.value_ptr.* == .string) { + try route_results.put(entry.key_ptr.*, entry.value_ptr.string); + } + } + } + } + } + } } } - // 3. 
Re-fetch steps to get updated statuses - const updated_steps = try self.store.getStepsByRun(alloc, run_row.id); + var version: i64 = if (latest_checkpoint) |cp| cp.version else 0; + const initial_version = version; - // 4. Process ready steps based on their type - for (updated_steps) |step| { - if (!std.mem.eql(u8, step.status, "ready")) continue; + // 4. Main execution loop: find ready nodes, execute, repeat + var running_state: []const u8 = try alloc.dupe(u8, current_state); + var max_iterations: u32 = 1000; // safety guard against infinite loops - if (std.mem.eql(u8, step.type, "task")) { - self.executeTaskStep(alloc, run_row, step) catch |err| { - log.err("error executing task step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "fan_out")) { - const claimed = self.store.claimReadyStep(step.id, null, ids.nowMs()) catch false; - if (!claimed) continue; - self.executeFanOutStep(alloc, run_row, step) catch |err| { - log.err("error executing fan_out step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "map")) { - const claimed = self.store.claimReadyStep(step.id, null, ids.nowMs()) catch false; - if (!claimed) continue; - self.executeMapStep(alloc, run_row, step) catch |err| { - log.err("error executing map step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "reduce")) { - const claimed = self.store.claimReadyStep(step.id, null, ids.nowMs()) catch false; - if (!claimed) continue; - self.executeReduceStep(alloc, run_row, step, updated_steps) catch |err| { - log.err("error executing reduce step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "condition")) { - const claimed = self.store.claimReadyStep(step.id, null, ids.nowMs()) catch false; - if (!claimed) continue; - self.executeConditionStep(alloc, run_row, step, updated_steps) catch |err| { - log.err("error executing condition step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, 
"approval")) { - const claimed = self.store.claimReadyStep(step.id, null, ids.nowMs()) catch false; - if (!claimed) continue; - self.executeApprovalStep(alloc, run_row, step) catch |err| { - log.err("error executing approval step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "transform")) { - const claimed = self.store.claimReadyStep(step.id, null, ids.nowMs()) catch false; - if (!claimed) continue; - self.executeTransformStep(alloc, run_row, step) catch |err| { - log.err("error executing transform step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "wait")) { - const claimed = self.store.claimReadyStep(step.id, null, ids.nowMs()) catch false; - if (!claimed) continue; - self.executeWaitStep(alloc, run_row, step) catch |err| { - log.err("error executing wait step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "router")) { - const claimed = self.store.claimReadyStep(step.id, null, ids.nowMs()) catch false; - if (!claimed) continue; - self.executeRouterStep(alloc, run_row, step, updated_steps) catch |err| { - log.err("error executing router step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "loop")) { - const claimed = self.store.claimReadyStep(step.id, null, ids.nowMs()) catch false; - if (!claimed) continue; - self.executeLoopStep(alloc, run_row, step) catch |err| { - log.err("error executing loop step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "sub_workflow")) { - const claimed = self.store.claimReadyStep(step.id, null, ids.nowMs()) catch false; - if (!claimed) continue; - self.executeSubWorkflowStep(alloc, run_row, step) catch |err| { - log.err("error executing sub_workflow step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "debate")) { - const claimed = self.store.claimReadyStep(step.id, null, ids.nowMs()) catch false; - if (!claimed) continue; - self.executeDebateStep(alloc, run_row, step) 
catch |err| { - log.err("error executing debate step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "group_chat")) { - const claimed = self.store.claimReadyStep(step.id, null, ids.nowMs()) catch false; - if (!claimed) continue; - self.executeGroupChatStep(alloc, run_row, step) catch |err| { - log.err("error executing group_chat step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "saga")) { - const claimed = self.store.claimReadyStep(step.id, null, ids.nowMs()) catch false; - if (!claimed) continue; - self.executeSagaStep(alloc, run_row, step) catch |err| { - log.err("error executing saga step {s}: {}", .{ step.id, err }); - }; - } else { - log.warn("unknown step type {s} for step {s}", .{ step.type, step.id }); + while (max_iterations > 0) : (max_iterations -= 1) { + const ready_nodes = try findReadyNodes(alloc, workflow_json, &completed_nodes, &route_results); + if (ready_nodes.len == 0) { + // Check termination: if all paths reached __end__ + if (completed_nodes.get("__end__") != null) { + // Save final state if we made progress + if (version > initial_version) { + try self.store.updateRunState(run_row.id, running_state); + } + try self.store.updateRunStatus(run_row.id, "completed", null); + try self.store.insertEvent(run_row.id, null, "run.completed", "{}"); + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.completed", run_row.id, null, "{}", self.metrics); + log.info("run {s} completed", .{run_row.id}); + return; + } + // Deadlock: no ready nodes and not done + if (completed_nodes.count() > 0) { + // Check if any step is still running asynchronously + const steps = try self.store.getStepsByRun(alloc, run_row.id); + var has_running = false; + for (steps) |step| { + if (std.mem.eql(u8, step.status, "running")) { + has_running = true; + break; + } + } + if (has_running) { + for (steps) |step| { + if (std.mem.eql(u8, step.status, "running")) { + self.pollAsyncTaskStep(alloc, run_row, step) catch 
|err| { + log.err("error polling async step {s}: {}", .{ step.id, err }); + }; + } + } + return; + } + log.err("run {s} deadlocked: no ready nodes, not completed", .{run_row.id}); + try self.store.updateRunStatus(run_row.id, "failed", "deadlock: no ready nodes"); + try self.store.insertEvent(run_row.id, null, "run.failed", "{\"reason\":\"deadlock\"}"); + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.failed", run_row.id, null, "{}", self.metrics); + } + return; } - } - // 4b. Check running steps that need tick-based polling - for (updated_steps) |step| { - if (!std.mem.eql(u8, step.status, "running")) continue; - if (std.mem.eql(u8, step.type, "wait")) { - self.executeWaitStep(alloc, run_row, step) catch |err| { - log.err("error polling wait step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "loop")) { - self.pollRunningLoopStep(alloc, run_row, step) catch |err| { - log.err("error polling loop step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "sub_workflow")) { - self.pollRunningSubWorkflowStep(alloc, run_row, step) catch |err| { - log.err("error polling sub_workflow step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "debate")) { - self.pollRunningDebateStep(alloc, run_row, step) catch |err| { - log.err("error polling debate step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "group_chat")) { - self.pollRunningGroupChatStep(alloc, run_row, step) catch |err| { - log.err("error polling group_chat step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "saga")) { - self.pollRunningSagaStep(alloc, run_row, step) catch |err| { - log.err("error polling saga step {s}: {}", .{ step.id, err }); - }; - } else if (std.mem.eql(u8, step.type, "task")) { - self.pollAsyncTaskStep(alloc, run_row, step) catch |err| { - log.err("error polling async task step {s}: {}", .{ step.id, err }); + // 5. 
Execute ready nodes sequentially + var made_progress = false; + + for (ready_nodes) |node_name| { + if (std.mem.eql(u8, node_name, "__end__")) { + // Mark __end__ as completed + try completed_nodes.put("__end__", {}); + version += 1; + + // Save checkpoint + const cp_id_buf = ids.generateId(); + const cp_id = try alloc.dupe(u8, &cp_id_buf); + const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); + const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const meta_json = try serializeRouteResults(alloc, &route_results); + try self.store.createCheckpoint(cp_id, run_row.id, "__end__", parent_id, running_state, cn_json, version, meta_json); + try self.store.incrementCheckpointCount(run_row.id); + try self.store.updateRunState(run_row.id, running_state); + + // Run is completed + try self.store.updateRunStatus(run_row.id, "completed", null); + try self.store.insertEvent(run_row.id, null, "run.completed", "{}"); + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.completed", run_row.id, null, "{}", self.metrics); + log.info("run {s} completed", .{run_row.id}); + return; + } + + // Get node definition from workflow + const node_json = getNodeJson(alloc, workflow_json, node_name) orelse { + log.err("node {s} not found in workflow for run {s}", .{ node_name, run_row.id }); + try self.store.updateRunStatus(run_row.id, "failed", "node not found in workflow"); + return; + }; + + // Get node type + const node_type = getNodeField(alloc, node_json, "type") orelse "task"; + + // Execute based on type + if (std.mem.eql(u8, node_type, "route")) { + // Route: evaluate routing logic, no worker dispatch + const result = try self.executeRouteNode(alloc, node_name, node_json, running_state); + if (result.route_value) |rv| { + try route_results.put(try alloc.dupe(u8, node_name), rv); + } + try completed_nodes.put(try alloc.dupe(u8, node_name), {}); + + // Create step record + const step_id_buf = ids.generateId(); + const step_id = try 
alloc.dupe(u8, &step_id_buf); + try self.store.insertStep(step_id, run_row.id, node_name, "route", "completed", "{}", 1, null, null, null); + const route_output = try std.fmt.allocPrint(alloc, "{{\"route\":\"{s}\"}}", .{result.route_value orelse "default"}); + try self.store.updateStepStatus(step_id, "completed", null, route_output, null, 1); + try self.store.insertEvent(run_row.id, step_id, "step.completed", route_output); + + log.info("route node {s} -> {s}", .{ node_name, result.route_value orelse "default" }); + } else if (std.mem.eql(u8, node_type, "interrupt")) { + // Interrupt: save checkpoint, set run to interrupted + try completed_nodes.put(try alloc.dupe(u8, node_name), {}); + version += 1; + + const step_id_buf = ids.generateId(); + const step_id = try alloc.dupe(u8, &step_id_buf); + try self.store.insertStep(step_id, run_row.id, node_name, "interrupt", "completed", "{}", 1, null, null, null); + try self.store.updateStepStatus(step_id, "completed", null, "{\"interrupted\":true}", null, 1); + try self.store.insertEvent(run_row.id, step_id, "step.completed", "{}"); + + const cp_id_buf = ids.generateId(); + const cp_id = try alloc.dupe(u8, &cp_id_buf); + const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); + const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const meta_json = try serializeRouteResults(alloc, &route_results); + try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); + try self.store.incrementCheckpointCount(run_row.id); + try self.store.updateRunState(run_row.id, running_state); + + try self.store.updateRunStatus(run_row.id, "interrupted", null); + try self.store.insertEvent(run_row.id, null, "run.interrupted", "{}"); + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.interrupted", run_row.id, null, "{}", self.metrics); + log.info("run {s} interrupted at node {s}", .{ run_row.id, node_name }); + return; + } else if 
(std.mem.eql(u8, node_type, "transform")) { + // Transform: apply static updates, no worker dispatch + const state_updates = getNodeField(alloc, node_json, "updates") orelse "{}"; + + // Get schema from workflow + const schema_json = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + + // Apply updates via reducers + const new_state = state_mod.applyUpdates(alloc, running_state, state_updates, schema_json) catch |err| { + log.err("transform node {s} failed to apply updates: {}", .{ node_name, err }); + try self.store.updateRunStatus(run_row.id, "failed", "transform failed"); + return; }; + running_state = new_state; + + try completed_nodes.put(try alloc.dupe(u8, node_name), {}); + + // Create step record + const step_id_buf = ids.generateId(); + const step_id = try alloc.dupe(u8, &step_id_buf); + try self.store.insertStep(step_id, run_row.id, node_name, "transform", "completed", "{}", 1, null, null, null); + try self.store.updateStepStatus(step_id, "completed", null, state_updates, null, 1); + try self.store.insertEvent(run_row.id, step_id, "step.completed", "{}"); + + log.info("transform node {s} completed", .{node_name}); + } else if (std.mem.eql(u8, node_type, "task") or std.mem.eql(u8, node_type, "agent")) { + // Task/Agent: render prompt, dispatch to worker, apply state updates + const result = try self.executeTaskNode(alloc, run_row, node_name, node_json, running_state); + + switch (result) { + .completed => |cr| { + // Apply state updates + if (cr.state_updates) |updates| { + const schema_json = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { + log.err("task node {s} failed to apply updates: {}", .{ node_name, err }); + try self.store.updateRunStatus(run_row.id, "failed", "state update failed"); + return; + }; + running_state = new_state; + } + + // Consume pending injections + const injections = 
self.store.consumePendingInjections(alloc, run_row.id, node_name) catch &.{}; + for (injections) |injection| { + const schema_json = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + const new_state = state_mod.applyUpdates(alloc, running_state, injection.updates_json, schema_json) catch |err| { + log.warn("failed to apply injection for run {s}: {}", .{ run_row.id, err }); + continue; + }; + running_state = new_state; + } + + try completed_nodes.put(try alloc.dupe(u8, node_name), {}); + log.info("task node {s} completed for run {s}", .{ node_name, run_row.id }); + }, + .async_pending => { + // Step is dispatched async, don't mark as completed yet + // Will be polled on next tick + log.info("task node {s} dispatched async for run {s}", .{ node_name, run_row.id }); + // Save checkpoint with current progress before returning + version += 1; + const cp_id_buf = ids.generateId(); + const cp_id = try alloc.dupe(u8, &cp_id_buf); + const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); + const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const meta_json = try serializeRouteResults(alloc, &route_results); + try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); + try self.store.incrementCheckpointCount(run_row.id); + try self.store.updateRunState(run_row.id, running_state); + return; + }, + .no_worker => { + // No worker available, will retry next tick + log.debug("no worker for task node {s}, will retry", .{node_name}); + // Save progress so far + if (version > initial_version) { + const cp_id_buf = ids.generateId(); + const cp_id = try alloc.dupe(u8, &cp_id_buf); + const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); + const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const meta_json = try serializeRouteResults(alloc, &route_results); + try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, 
running_state, cn_json, version, meta_json); + try self.store.incrementCheckpointCount(run_row.id); + try self.store.updateRunState(run_row.id, running_state); + } + return; + }, + .failed => |err_text| { + log.err("task node {s} failed: {s}", .{ node_name, err_text }); + try self.store.updateRunStatus(run_row.id, "failed", err_text); + try self.store.insertEvent(run_row.id, null, "run.failed", "{}"); + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.failed", run_row.id, null, "{}", self.metrics); + return; + }, + } + } else if (std.mem.eql(u8, node_type, "send")) { + // Send: read items from state, dispatch target_node per item + const result = try self.executeSendNode(alloc, run_row, node_name, node_json, running_state); + if (result.state_updates) |updates| { + const schema_json = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { + log.err("send node {s} failed to apply updates: {}", .{ node_name, err }); + try self.store.updateRunStatus(run_row.id, "failed", "send state update failed"); + return; + }; + running_state = new_state; + } + try completed_nodes.put(try alloc.dupe(u8, node_name), {}); + log.info("send node {s} completed for run {s}", .{ node_name, run_row.id }); + } else { + log.warn("unknown node type {s} for node {s}", .{ node_type, node_name }); + try self.store.updateRunStatus(run_row.id, "failed", "unknown node type"); + return; } + + // Save checkpoint after each node + made_progress = true; + version += 1; + const cp_id_buf = ids.generateId(); + const cp_id = try alloc.dupe(u8, &cp_id_buf); + const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); + const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const meta_json = try serializeRouteResults(alloc, &route_results); + try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, 
meta_json); + try self.store.incrementCheckpointCount(run_row.id); + try self.store.updateRunState(run_row.id, running_state); } - // 5. Check run completion - try self.checkRunCompletion(run_row.id, alloc); + // If no progress was made in this iteration, break + if (!made_progress) break; + } // end while loop } - // ── executeTaskStep ────────────────────────────────────────────── + // ── Node Execution Results ─────────────────────────────────────── + + const TaskNodeResult = union(enum) { + completed: struct { + state_updates: ?[]const u8, + }, + async_pending: void, + no_worker: void, + failed: []const u8, + }; + + const SendNodeResult = struct { + state_updates: ?[]const u8, + }; + + const RouteNodeResult = struct { + route_value: ?[]const u8, + }; + + // ── executeRouteNode ───────────────────────────────────────────── + + fn executeRouteNode(self: *Engine, alloc: std.mem.Allocator, node_name: []const u8, node_json: []const u8, state_json: []const u8) !RouteNodeResult { + _ = self; + _ = node_name; - fn executeTaskStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - if (step.next_attempt_at_ms) |next_attempt| { - if (ids.nowMs() < next_attempt) return; + // Get the input path to read from state + const input_path = getNodeField(alloc, node_json, "input") orelse "state.route_input"; + + // Read value from state + const value_json = state_mod.getStateValue(alloc, state_json, input_path) catch null; + if (value_json == null) { + // No value at path, try default route + const default_route = getNodeField(alloc, node_json, "default"); + return RouteNodeResult{ .route_value = default_route }; } - // 1. Resolve prompt source for this task step. - const prompt_source = try self.resolveTaskPromptSource(alloc, run_row, step) orelse { - log.warn("no prompt_template for step {s}", .{step.def_step_id}); - return; + // Stringify value for route matching + const route_key = state_mod.stringifyForRoute(alloc, value_json.?) 
catch { + const default_route = getNodeField(alloc, node_json, "default"); + return RouteNodeResult{ .route_value = default_route }; }; - // 2. Build final prompt. - const rendered_prompt = switch (prompt_source) { - .rendered => |prompt| prompt, - .template => |prompt_template| blk: { - const ctx = try buildTemplateContext(alloc, run_row, step, self.store); - break :blk templates.render(alloc, prompt_template, ctx) catch |err| { - log.err("template render failed for step {s}: {}", .{ step.id, err }); - try self.store.updateStepStatus(step.id, "failed", null, null, "template render failed", step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.failed", "{}"); - return; - }; - }, + // Look up in routes map — but routes are encoded in edges, not in node + // The route value is used for conditional edge matching like "node:value" + return RouteNodeResult{ .route_value = route_key }; + } + + // ── executeTaskNode ────────────────────────────────────────────── + + fn executeTaskNode(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, node_name: []const u8, node_json: []const u8, state_json: []const u8) !TaskNodeResult { + // 1. Get prompt template from node definition + const prompt_template = getNodeField(alloc, node_json, "prompt_template") orelse { + // No prompt template — mark as completed with no state updates + return TaskNodeResult{ .completed = .{ .state_updates = null } }; + }; + + // 2. Render prompt using new templates.renderTemplate + const rendered_prompt = templates.renderTemplate(alloc, prompt_template, state_json, run_row.input_json, null) catch |err| { + log.err("template render failed for node {s}: {}", .{ node_name, err }); + return TaskNodeResult{ .failed = "template render failed" }; }; - // 4. Get all workers and build WorkerInfo list + // 3. 
Get workers and select one const workers = try self.store.listWorkers(alloc); var worker_infos: std.ArrayListUnmanaged(dispatch.WorkerInfo) = .empty; for (workers) |w| { @@ -349,189 +534,213 @@ pub const Engine = struct { }); } - // 5. Parse worker_tags from the step definition - const required_tags = try getStepTags(alloc, run_row.workflow_json, step.def_step_id); - - // 6. Select an available worker + const required_tags = getNodeTags(alloc, node_json); const selected_worker = try dispatch.selectWorker(alloc, worker_infos.items, required_tags); if (selected_worker == null) { - // No worker available — leave as "ready", will retry next tick - log.debug("no worker available for step {s}, will retry", .{step.id}); - return; + return TaskNodeResult{ .no_worker = {} }; } const worker = selected_worker.?; - // 7. Atomically claim the step to avoid duplicate dispatch across instances. - const claim_ts = ids.nowMs(); - const claimed = try self.store.claimReadyStep(step.id, worker.id, claim_ts); - if (!claimed) { - return; - } + // 4. Create step record + const step_id_buf = ids.generateId(); + const step_id = try alloc.dupe(u8, &step_id_buf); + const node_type = getNodeField(alloc, node_json, "type") orelse "task"; + try self.store.insertStep(step_id, run_row.id, node_name, node_type, "running", state_json, 1, null, null, null); + try self.store.insertEvent(run_row.id, step_id, "step.running", "{}"); + if (self.metrics) |m| { metrics_mod.Metrics.incr(&m.steps_claimed_total); } - try self.store.insertEvent(run_row.id, step.id, "step.running", "{}"); - - // 8. 
Dispatch to worker with handoff support - var current_worker = worker; - var current_prompt = rendered_prompt; - var handoff_count: u32 = 0; - const max_handoffs: u32 = 5; - - var final_result: dispatch.DispatchResult = undefined; - while (true) { - final_result = try dispatch.dispatchStep( - alloc, - current_worker.url, - current_worker.token, - current_worker.protocol, - current_worker.model, - run_row.id, - step.id, - current_prompt, - ); - - if (!final_result.success) break; - - // Check for handoff_to in the output - const handoff_target = extractHandoffTarget(alloc, final_result.output); - if (handoff_target == null) break; // Normal completion - - handoff_count += 1; - if (handoff_count >= max_handoffs) { - final_result = .{ - .output = "", - .success = false, - .error_text = "handoff chain limit exceeded (max 5)", - }; - break; - } - - // Log the handoff event - const handoff_event = try std.fmt.allocPrint(alloc, "{{\"handoff_from\":\"{s}\",\"handoff_to_tags\":\"{s}\"}}", .{ current_worker.id, handoff_target.?.tags_str }); - try self.store.insertEvent(run_row.id, step.id, "step.handoff", handoff_event); - log.info("step {s} handoff #{d} from worker {s}", .{ step.id, handoff_count, current_worker.id }); - - // Select new worker by handoff tags - const new_worker = try dispatch.selectWorker(alloc, worker_infos.items, handoff_target.?.tags); - if (new_worker == null) { - final_result = .{ - .output = "", - .success = false, - .error_text = "no worker available for handoff", - }; - break; - } - current_worker = new_worker.?; + // 5. Dispatch to worker + const result = try dispatch.dispatchStep( + alloc, + worker.url, + worker.token, + worker.protocol, + worker.model, + run_row.id, + step_id, + rendered_prompt, + ); - // Build handoff prompt with message - if (handoff_target.?.message) |msg| { - current_prompt = msg; - } - // Otherwise reuse current_prompt + // 6. 
Handle async dispatch + if (result.async_pending) { + const async_state = try mergeAsyncState(alloc, state_json, result.correlation_id orelse ""); + try self.store.updateStepInputJson(step_id, async_state); + log.info("step {s} dispatched async, correlation_id={s}", .{ step_id, result.correlation_id orelse "?" }); + return TaskNodeResult{ .async_pending = {} }; } - // 8.5. If async dispatch, save state and leave step running - if (final_result.async_pending) { - const async_state = try mergeAsyncState(alloc, step.input_json, final_result.correlation_id orelse ""); - try self.store.updateStepInputJson(step.id, async_state); - log.info("step {s} dispatched async, correlation_id={s}", .{ step.id, final_result.correlation_id orelse "?" }); - return; - } + // 7. Handle result + if (result.success) { + const output = result.output; + const output_json = try wrapOutput(alloc, output); + try self.store.updateStepStatus(step_id, "completed", worker.id, output_json, null, 1); + try self.store.insertEvent(run_row.id, step_id, "step.completed", "{}"); + try self.store.markWorkerSuccess(worker.id, ids.nowMs()); - // 9. 
Handle result - if (final_result.success) { - // Mark step as completed, save output_json - const output_json = try wrapOutput(alloc, final_result.output); - try self.store.updateStepStatus(step.id, "completed", current_worker.id, output_json, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", "{}"); - try self.store.markWorkerSuccess(current_worker.id, ids.nowMs()); if (self.metrics) |m| { metrics_mod.Metrics.incr(&m.worker_dispatch_success_total); } - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.completed", run_row.id, step.id, output_json, self.metrics); - log.info("step {s} completed", .{step.id}); + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.completed", run_row.id, step_id, output_json, self.metrics); + + // Build state_updates from output + // Try parsing as JSON with "state_updates" field, otherwise wrap output in "output" key + const state_updates = extractStateUpdates(alloc, output) orelse + try std.fmt.allocPrint(alloc, "{{\"output\":{s}}}", .{try jsonStringify(alloc, output)}); + + return TaskNodeResult{ .completed = .{ .state_updates = state_updates } }; } else { - // On failure: retry or fail - const err_text = final_result.error_text orelse "dispatch failed"; + const err_text = result.error_text orelse "dispatch failed"; + try self.store.updateStepStatus(step_id, "failed", worker.id, null, err_text, 1); + try self.store.insertEvent(run_row.id, step_id, "step.failed", "{}"); + const now_ms = ids.nowMs(); const circuit_until = now_ms + self.runtime_cfg.worker_circuit_breaker_ms; try self.store.markWorkerFailure( - current_worker.id, + worker.id, err_text, now_ms, self.runtime_cfg.worker_failure_threshold, circuit_until, ); + if (self.metrics) |m| { metrics_mod.Metrics.incr(&m.worker_dispatch_failure_total); } + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.failed", run_row.id, step_id, "{}", self.metrics); - if (step.attempt < step.max_attempts) { - const elapsed_ms = 
now_ms - step.created_at_ms; - if (elapsed_ms > self.runtime_cfg.retry_max_elapsed_ms) { - const elapsed_err = try std.fmt.allocPrint(alloc, "retry max elapsed exceeded ({d}ms)", .{self.runtime_cfg.retry_max_elapsed_ms}); - try self.store.updateStepStatus(step.id, "failed", current_worker.id, null, elapsed_err, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.failed", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.failed", run_row.id, step.id, "{}", self.metrics); - log.err("step {s} failed: {s}", .{ step.id, elapsed_err }); - return; - } - - const delay_ms = computeRetryDelayMs(self.runtime_cfg, step, now_ms); - const next_attempt_ms = now_ms + delay_ms; - try self.store.scheduleStepRetry(step.id, next_attempt_ms, step.attempt + 1, err_text); - const retry_event = try std.fmt.allocPrint(alloc, "{{\"next_attempt_at_ms\":{d},\"delay_ms\":{d}}}", .{ next_attempt_ms, delay_ms }); - try self.store.insertEvent(run_row.id, step.id, "step.retry", retry_event); - if (self.metrics) |m| { - metrics_mod.Metrics.incr(&m.steps_retry_scheduled_total); - } - log.info("step {s} will retry (attempt {d}/{d}, delay={d}ms)", .{ step.id, step.attempt + 1, step.max_attempts, delay_ms }); - } else { - try self.store.updateStepStatus(step.id, "failed", current_worker.id, null, err_text, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.failed", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.failed", run_row.id, step.id, "{}", self.metrics); - log.err("step {s} failed: {s}", .{ step.id, err_text }); - } + return TaskNodeResult{ .failed = err_text }; } } - // ── async helpers ────────────────────────────────────────────── + // ── executeSendNode ────────────────────────────────────────────── - /// Merge async_pending + correlation_id into existing input_json, - /// preserving any existing fields (e.g. rendered_prompt for retries). 
- fn mergeAsyncState(alloc: std.mem.Allocator, existing_input: []const u8, correlation_id: []const u8) ![]const u8 { - var obj = std.json.ObjectMap.init(alloc); + fn executeSendNode(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, node_name: []const u8, node_json: []const u8, state_json: []const u8) !SendNodeResult { + // Read items_from state path + const items_path = getNodeField(alloc, node_json, "items_from") orelse { + log.warn("send node {s} missing items_from", .{node_name}); + return SendNodeResult{ .state_updates = null }; + }; - // Parse and copy existing fields - if (existing_input.len > 0) { - const parsed = std.json.parseFromSlice(std.json.Value, alloc, existing_input, .{}) catch null; - if (parsed) |p| { - if (p.value == .object) { - var it = p.value.object.iterator(); - while (it.next()) |entry| { - try obj.put(entry.key_ptr.*, entry.value_ptr.*); - } - } + // Get the target_node + const target_node = getNodeField(alloc, node_json, "target_node") orelse { + log.warn("send node {s} missing target_node", .{node_name}); + return SendNodeResult{ .state_updates = null }; + }; + + // Get target node definition from workflow + const target_json = getNodeJson(alloc, run_row.workflow_json, target_node) orelse { + log.warn("send node {s} target {s} not found", .{ node_name, target_node }); + return SendNodeResult{ .state_updates = null }; + }; + + // Read items from state + const items_json = state_mod.getStateValue(alloc, state_json, items_path) catch null; + if (items_json == null) { + log.warn("send node {s}: no items at path {s}", .{ node_name, items_path }); + return SendNodeResult{ .state_updates = null }; + } + + // Parse items as array + const items_parsed = json.parseFromSlice(json.Value, alloc, items_json.?, .{}) catch { + log.warn("send node {s}: items not valid JSON", .{node_name}); + return SendNodeResult{ .state_updates = null }; + }; + if (items_parsed.value != .array) { + log.warn("send node {s}: items not an array", 
.{node_name}); + return SendNodeResult{ .state_updates = null }; + } + + // For each item, execute the target node + var results: std.ArrayListUnmanaged([]const u8) = .empty; + for (items_parsed.value.array.items, 0..) |item, idx| { + // Serialize item + const item_str = serializeJsonValue(alloc, item) catch continue; + + // Get prompt template from target node + const prompt_template = getNodeField(alloc, target_json, "prompt_template") orelse continue; + + // Render with item + const rendered = templates.renderTemplate(alloc, prompt_template, state_json, run_row.input_json, item_str) catch continue; + + // Select worker and dispatch + const workers = try self.store.listWorkers(alloc); + var worker_infos: std.ArrayListUnmanaged(dispatch.WorkerInfo) = .empty; + for (workers) |w| { + const current_tasks = self.store.countRunningStepsByWorker(w.id) catch 0; + try worker_infos.append(alloc, .{ + .id = w.id, + .url = w.url, + .token = w.token, + .protocol = w.protocol, + .model = w.model, + .tags_json = w.tags_json, + .max_concurrent = w.max_concurrent, + .status = w.status, + .current_tasks = current_tasks, + }); + } + + const required_tags = getNodeTags(alloc, target_json); + const selected_worker = try dispatch.selectWorker(alloc, worker_infos.items, required_tags); + if (selected_worker == null) { + try results.append(alloc, "null"); + continue; + } + const worker = selected_worker.?; + + // Create child step + const child_step_id_buf = ids.generateId(); + const child_step_id = try alloc.dupe(u8, &child_step_id_buf); + const child_def_id = try std.fmt.allocPrint(alloc, "{s}_{d}", .{ node_name, idx }); + try self.store.insertStep(child_step_id, run_row.id, child_def_id, "task", "running", item_str, 1, null, null, @as(?i64, @intCast(idx))); + try self.store.insertEvent(run_row.id, child_step_id, "step.running", "{}"); + + const dr = try dispatch.dispatchStep( + alloc, + worker.url, + worker.token, + worker.protocol, + worker.model, + run_row.id, + child_step_id, + 
rendered, + ); + + if (dr.success) { + const output_json = try wrapOutput(alloc, dr.output); + try self.store.updateStepStatus(child_step_id, "completed", worker.id, output_json, null, 1); + try self.store.insertEvent(run_row.id, child_step_id, "step.completed", "{}"); + try results.append(alloc, try jsonStringify(alloc, dr.output)); + } else { + try self.store.updateStepStatus(child_step_id, "failed", worker.id, null, dr.error_text, 1); + try results.append(alloc, "null"); } } - // Add async fields - try obj.put("async_pending", .{ .bool = true }); - try obj.put("correlation_id", .{ .string = correlation_id }); + // Build state_updates from collected results + const results_json = try serializeStringArray(alloc, results.items); + const state_updates = try std.fmt.allocPrint(alloc, "{{\"send_results\":{s}}}", .{results_json}); + + // Create parent step record + const step_id_buf = ids.generateId(); + const step_id = try alloc.dupe(u8, &step_id_buf); + try self.store.insertStep(step_id, run_row.id, node_name, "send", "completed", "{}", 1, null, null, null); + try self.store.updateStepStatus(step_id, "completed", null, state_updates, null, 1); + try self.store.insertEvent(run_row.id, step_id, "step.completed", "{}"); - return std.json.Stringify.valueAlloc(alloc, std.json.Value{ .object = obj }, .{}); + return SendNodeResult{ .state_updates = state_updates }; } + // ── Async polling ──────────────────────────────────────────────── + fn pollAsyncTaskStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - // Only handle steps that are async (have async_pending in input_json) const input_json = step.input_json; if (input_json.len == 0) return; - // Parse input_json to check for async_pending flag - const parsed = std.json.parseFromSlice(std.json.Value, alloc, input_json, .{}) catch return; - defer parsed.deinit(); + const parsed = json.parseFromSlice(json.Value, alloc, input_json, .{}) catch return; if (parsed.value != .object) 
return; const async_flag = parsed.value.object.get("async_pending") orelse return; @@ -541,10 +750,8 @@ pub const Engine = struct { if (corr_val != .string) return; const correlation_id = corr_val.string; - // Check response queue const queue = self.response_queue orelse return; const response = queue.take(correlation_id) orelse { - // Check timeout if (step.timeout_ms) |timeout_ms| { if (step.started_at_ms) |started_at| { const elapsed = ids.nowMs() - started_at; @@ -563,7 +770,6 @@ pub const Engine = struct { return; }; - // Got a response — complete or fail the step if (response.success) { const output_json = try wrapOutput(alloc, response.output); try self.store.updateStepStatus(step.id, "completed", step.worker_id, output_json, null, step.attempt); @@ -593,4477 +799,522 @@ pub const Engine = struct { } } - fn resolveTaskPromptSource(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !?TaskPromptSource { - // Explicit rendered_prompt is highest priority for generated children - // (for example debate judge prompts). - if (extractRenderedPromptFromInput(alloc, step.input_json)) |rendered_prompt| { - return .{ .rendered = rendered_prompt }; - } - - // Normal task step definition prompt. - if (try getStepField(alloc, run_row.workflow_json, step.def_step_id, "prompt_template")) |tpl| { - return .{ .template = tpl }; - } + /// Merge async_pending + correlation_id into existing input_json. + fn mergeAsyncState(alloc: std.mem.Allocator, existing_input: []const u8, correlation_id: []const u8) ![]const u8 { + var obj = json.ObjectMap.init(alloc); - // Fallback for generated child tasks that should reuse parent prompt template. 
- if (step.parent_step_id) |parent_id| { - if (try self.store.getStep(alloc, parent_id)) |parent_step| { - if (try getStepField(alloc, run_row.workflow_json, parent_step.def_step_id, "prompt_template")) |parent_tpl| { - return .{ .template = parent_tpl }; + if (existing_input.len > 0) { + const p = json.parseFromSlice(json.Value, alloc, existing_input, .{}) catch null; + if (p) |parsed| { + if (parsed.value == .object) { + var it = parsed.value.object.iterator(); + while (it.next()) |entry| { + try obj.put(entry.key_ptr.*, entry.value_ptr.*); + } } } } - return null; - } - - // ── executeFanOutStep ──────────────────────────────────────────── - - fn executeFanOutStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - // 1. Parse step definition from workflow_json, get "count" - const count_val = try getStepFieldInt(alloc, run_row.workflow_json, step.def_step_id, "count") orelse { - log.warn("no count for fan_out step {s}", .{step.def_step_id}); - try self.store.updateStepStatus(step.id, "failed", null, null, "missing count in fan_out definition", step.attempt); - return; - }; - const count: usize = @intCast(count_val); - - // 2. Create N child steps - for (0..count) |i| { - const child_id_buf = ids.generateId(); - const child_id = try alloc.dupe(u8, &child_id_buf); - const child_def_id = try std.fmt.allocPrint(alloc, "{s}_{d}", .{ step.def_step_id, i }); - const idx: i64 = @intCast(i); - - try self.store.insertStep( - child_id, - run_row.id, - child_def_id, - "task", - "ready", - step.input_json, - step.max_attempts, - step.timeout_ms, - step.id, // parent_step_id - idx, - ); - log.info("created fan_out child step {s} (index {d})", .{ child_id, i }); - } + try obj.put("async_pending", .{ .bool = true }); + try obj.put("correlation_id", .{ .string = correlation_id }); - // 3. 
Mark fan_out step as "completed" - try self.store.updateStepStatus(step.id, "completed", null, null, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", "{}"); - log.info("fan_out step {s} completed, created {d} children", .{ step.id, count }); + return json.Stringify.valueAlloc(alloc, json.Value{ .object = obj }, .{}); } +}; - // ── executeMapStep ─────────────────────────────────────────────── +// ── findReadyNodes ────────────────────────────────────────────────── + +/// Find nodes that are ready to execute. +/// A node is ready when ALL its inbound edges have their source in completed_nodes. +/// __start__ is always "completed" (synthetic). +/// For conditional edges "source:value", the source is just "source" (strip after `:`) +/// and the edge is only satisfied if route_results[source] == value. +pub fn findReadyNodes( + alloc: std.mem.Allocator, + workflow_json: []const u8, + completed_nodes: *std.StringHashMap(void), + route_results: *std.StringHashMap([]const u8), +) ![]const []const u8 { + const parsed = json.parseFromSlice(json.Value, alloc, workflow_json, .{}) catch { + return &.{}; + }; + const root = parsed.value; + if (root != .object) return &.{}; - fn executeMapStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - // 1. Parse step definition, get "items_from" (e.g. "$.topics") - const items_from = try getStepField(alloc, run_row.workflow_json, step.def_step_id, "items_from") orelse { - log.warn("no items_from for map step {s}", .{step.def_step_id}); - try self.store.updateStepStatus(step.id, "failed", null, null, "missing items_from in map definition", step.attempt); - return; - }; + // Get edges array + const edges_val = root.object.get("edges") orelse return &.{}; + if (edges_val != .array) return &.{}; - // 2. 
Resolve items_from against run.input_json — extract the array - // items_from format: "$.field_name" - const field_name = if (std.mem.startsWith(u8, items_from, "$.")) - items_from[2..] - else - items_from; + // Get all node names from "nodes" object + const nodes_val = root.object.get("nodes") orelse return &.{}; + if (nodes_val != .object) return &.{}; - const items = try extractJsonArray(alloc, run_row.input_json, field_name) orelse { - log.warn("items_from field '{s}' not found or not an array in input", .{field_name}); - try self.store.updateStepStatus(step.id, "failed", null, null, "items_from field not found or not an array", step.attempt); - return; - }; + // Build inbound edge map: target -> list of (source, condition_value?) + const EdgeInfo = struct { + source: []const u8, + condition: ?[]const u8, // null for unconditional, "value" for conditional + }; - // 3. For each item in the array, create a child step - for (items, 0..) |item, i| { - const child_id_buf = ids.generateId(); - const child_id = try alloc.dupe(u8, &child_id_buf); - const child_def_id = try std.fmt.allocPrint(alloc, "{s}_{d}", .{ step.def_step_id, i }); - const idx: i64 = @intCast(i); + var inbound = std.StringHashMap(std.ArrayListUnmanaged(EdgeInfo)).init(alloc); + + // Also collect all target nodes mentioned in edges + for (edges_val.array.items) |edge_item| { + if (edge_item != .array) continue; + if (edge_item.array.items.len < 2) continue; + + const source_raw = if (edge_item.array.items[0] == .string) edge_item.array.items[0].string else continue; + const target = if (edge_item.array.items[1] == .string) edge_item.array.items[1].string else continue; + + // Parse source: might be "node:value" for conditional edges + var source: []const u8 = source_raw; + var condition: ?[]const u8 = null; + if (std.mem.indexOfScalar(u8, source_raw, ':')) |colon_pos| { + source = source_raw[0..colon_pos]; + condition = source_raw[colon_pos + 1 ..]; + } + + var entry = inbound.getPtr(target); + if 
(entry == null) { + try inbound.put(target, std.ArrayListUnmanaged(EdgeInfo){}); + entry = inbound.getPtr(target); + } + try entry.?.append(alloc, .{ + .source = source, + .condition = condition, + }); + } + + // Detect dead nodes: nodes that are unreachable because a conditional + // edge was not taken. A node is dead if ALL its inbound edges are + // conditional and none match the route result. Dead nodes propagate: + // any node whose only inbound edges come from dead nodes is also dead. + var dead_nodes = std.StringHashMap(void).init(alloc); + + // Iterative dead node detection (propagate through the graph) + var changed = true; + while (changed) { + changed = false; + var dead_it = inbound.iterator(); + while (dead_it.next()) |kv| { + const target = kv.key_ptr.*; + const edges = kv.value_ptr.items; + + if (dead_nodes.get(target) != null) continue; + if (completed_nodes.get(target) != null) continue; + + var all_dead_or_unsat = true; + for (edges) |edge| { + if (std.mem.eql(u8, edge.source, "__start__")) { + // __start__ is never dead + all_dead_or_unsat = false; + break; + } - // Store the item as input_json for the child - const item_json = try wrapItemJson(alloc, item); + // If source is dead, this edge is dead + if (dead_nodes.get(edge.source) != null) continue; + + if (edge.condition) |cond| { + // Conditional edge: check if source completed and condition matched + if (completed_nodes.get(edge.source) != null) { + if (route_results.get(edge.source)) |actual| { + if (std.mem.eql(u8, actual, cond)) { + // This edge IS satisfied + all_dead_or_unsat = false; + break; + } + } + // Source completed but condition didn't match -> dead edge + } else { + // Source not completed yet and not dead -> not dead yet + all_dead_or_unsat = false; + break; + } + } else { + // Non-conditional edge from a live, non-dead source + all_dead_or_unsat = false; + break; + } + } - try self.store.insertStep( - child_id, - run_row.id, - child_def_id, - "task", - "ready", - item_json, - 
step.max_attempts, - step.timeout_ms, - step.id, // parent_step_id - idx, - ); - log.info("created map child step {s} for item {d}", .{ child_id, i }); + if (all_dead_or_unsat) { + try dead_nodes.put(target, {}); + changed = true; + } } - - // 4. Mark map step as "completed" - try self.store.updateStepStatus(step.id, "completed", null, null, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", "{}"); - log.info("map step {s} completed, created {d} children", .{ step.id, items.len }); } - // ── executeReduceStep ──────────────────────────────────────────── + // Find ready nodes: for each node, check if all inbound edges are satisfied + // (treating dead source nodes as satisfied) + var ready: std.ArrayListUnmanaged([]const u8) = .empty; - fn executeReduceStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow, all_steps: []const types.StepRow) !void { - // 1. Find the dependency step (the fan_out or map step this depends on) - const dep_ids = try self.store.getStepDeps(alloc, step.id); - if (dep_ids.len == 0) { - log.warn("reduce step {s} has no dependencies", .{step.id}); - try self.store.updateStepStatus(step.id, "failed", null, null, "reduce step has no dependencies", step.attempt); - return; - } + var inbound_it = inbound.iterator(); + while (inbound_it.next()) |kv| { + const target = kv.key_ptr.*; + const edges = kv.value_ptr.items; - // The reduce depends on a fan_out/map step; find it - const dep_step_id = dep_ids[0]; + // Skip if already completed or dead + if (completed_nodes.get(target) != null) continue; + if (dead_nodes.get(target) != null) continue; - // 2. 
Get all child steps of that dependency - const children = try self.store.getChildSteps(alloc, dep_step_id); + var all_satisfied = true; + var any_conditional_edge = false; + var any_conditional_satisfied = false; - if (children.len == 0) { - // If the dep is a fan_out/map that hasn't spawned children yet, wait - // Check if dep step itself is completed - const dep_status = findStepStatus(all_steps, dep_step_id); - if (dep_status == null or !std.mem.eql(u8, dep_status.?, "completed")) { - // Dep not completed yet, stay ready - return; - } - // Dep completed but no children? Odd, proceed with empty outputs - } + for (edges) |edge| { + // __start__ is always satisfied + if (std.mem.eql(u8, edge.source, "__start__")) continue; + + // Dead sources are considered satisfied (their branch was skipped) + if (dead_nodes.get(edge.source) != null) continue; + + const source_completed = completed_nodes.get(edge.source) != null; - // 3. Check if ALL children are completed - var all_done = true; - for (children) |child| { - if (!std.mem.eql(u8, child.status, "completed") and !std.mem.eql(u8, child.status, "skipped")) { - all_done = false; + if (!source_completed) { + all_satisfied = false; break; } - } - if (!all_done) { - // Not all children done, leave reduce as "ready", try next tick - return; - } - // 4. Collect all child outputs into an array - var child_outputs: std.ArrayListUnmanaged([]const u8) = .empty; - for (children) |child| { - if (child.output_json) |oj| { - // Extract "output" field from JSON, or use the raw JSON - const extracted = extractOutputField(alloc, oj) catch oj; - try child_outputs.append(alloc, extracted); - } else { - try child_outputs.append(alloc, ""); + if (edge.condition) |cond| { + any_conditional_edge = true; + if (route_results.get(edge.source)) |actual| { + if (std.mem.eql(u8, actual, cond)) { + any_conditional_satisfied = true; + } + } } } - // 5. 
Build template context with outputs array - // Find the dep step's def_step_id for template referencing - const dep_def_step_id = findStepDefId(all_steps, dep_step_id) orelse step.def_step_id; + if (!all_satisfied) continue; - const step_output = templates.Context.StepOutput{ - .step_id = dep_def_step_id, - .output = null, - .outputs = child_outputs.items, - }; + // If there are conditional edges, at least one must be satisfied + if (any_conditional_edge and !any_conditional_satisfied) continue; - const prompt_template = try getStepField(alloc, run_row.workflow_json, step.def_step_id, "prompt_template") orelse { - // No template — just collect outputs and mark completed - const outputs_json = try serializeStringArray(alloc, child_outputs.items); - try self.store.updateStepStatus(step.id, "completed", null, outputs_json, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", "{}"); - return; - }; + try ready.append(alloc, target); + } - const ctx = templates.Context{ - .input_json = run_row.input_json, - .step_outputs = &.{step_output}, - .item = null, - }; + return ready.toOwnedSlice(alloc); +} - // 6. Render template - const rendered_prompt = templates.render(alloc, prompt_template, ctx) catch |err| { - log.err("template render failed for reduce step {s}: {}", .{ step.id, err }); - try self.store.updateStepStatus(step.id, "failed", null, null, "template render failed", step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.failed", "{}"); - return; - }; +// ── Workflow JSON Helpers ──────────────────────────────────────────── - // 7. 
Get workers and dispatch - const workers = try self.store.listWorkers(alloc); - var worker_infos: std.ArrayListUnmanaged(dispatch.WorkerInfo) = .empty; - for (workers) |w| { - const current_tasks = self.store.countRunningStepsByWorker(w.id) catch 0; - try worker_infos.append(alloc, .{ - .id = w.id, - .url = w.url, - .token = w.token, - .protocol = w.protocol, - .model = w.model, - .tags_json = w.tags_json, - .max_concurrent = w.max_concurrent, - .status = w.status, - .current_tasks = current_tasks, - }); - } - - const required_tags = try getStepTags(alloc, run_row.workflow_json, step.def_step_id); - const selected_worker = try dispatch.selectWorker(alloc, worker_infos.items, required_tags); - if (selected_worker == null) { - log.debug("no worker available for reduce step {s}, will retry", .{step.id}); - return; - } - const worker = selected_worker.?; - - try self.store.updateStepStatus(step.id, "running", worker.id, null, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.running", "{}"); - - const result = try dispatch.dispatchStep( - alloc, - worker.url, - worker.token, - worker.protocol, - worker.model, - run_row.id, - step.id, - rendered_prompt, - ); - - if (result.success) { - const output_json = try wrapOutput(alloc, result.output); - try self.store.updateStepStatus(step.id, "completed", worker.id, output_json, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.completed", run_row.id, step.id, output_json, self.metrics); - log.info("reduce step {s} completed", .{step.id}); - } else { - const err_text = result.error_text orelse "dispatch failed"; - if (step.attempt < step.max_attempts) { - try self.store.updateStepStatus(step.id, "ready", null, null, err_text, step.attempt + 1); - try self.store.insertEvent(run_row.id, step.id, "step.retry", "{}"); - } else { - try self.store.updateStepStatus(step.id, "failed", worker.id, null, 
err_text, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.failed", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.failed", run_row.id, step.id, "{}", self.metrics); - } - } - } - - // ── executeConditionStep ───────────────────────────────────────── - - fn executeConditionStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow, all_steps: []const types.StepRow) !void { - // 1. Get the dependency step's output - const dep_ids = try self.store.getStepDeps(alloc, step.id); - if (dep_ids.len == 0) { - log.warn("condition step {s} has no dependencies", .{step.id}); - try self.store.updateStepStatus(step.id, "failed", null, null, "condition step has no dependencies", step.attempt); - return; - } - - const dep_step_id = dep_ids[0]; - const dep_output = findStepOutput(all_steps, dep_step_id) orelse ""; - - // 2. Parse the "expression" from step definition - const expression = try getStepField(alloc, run_row.workflow_json, step.def_step_id, "expression") orelse "true"; - - // 3. Evaluate: for MVP, support simple "contains" check - // Expression format: check if the dependency output contains a certain substring - // If expression is "true", always take true branch - // Otherwise, check if dep output contains the expression text - const condition_met = if (std.mem.eql(u8, expression, "true")) - true - else if (std.mem.eql(u8, expression, "false")) - false - else - std.mem.indexOf(u8, dep_output, expression) != null; - - // 4. Determine branch - const true_target = try getStepField(alloc, run_row.workflow_json, step.def_step_id, "true_target"); - const false_target = try getStepField(alloc, run_row.workflow_json, step.def_step_id, "false_target"); - - // 5. 
Determine the winning target and check for graph cycles - const winning_target: ?[]const u8 = if (condition_met) true_target else false_target; - - // Check if the winning target is a backward edge (cycle) - if (winning_target) |target| { - const cycle_handled = try self.handleCycleBack(alloc, run_row, step, target, all_steps); - if (cycle_handled) return; // Cycle was handled, step is already completed - } - - // 6. For the losing branch target: mark steps as "skipped" - if (condition_met) { - // Skip the false branch target - if (false_target) |target_def_id| { - try self.skipStepByDefId(alloc, all_steps, run_row.id, target_def_id); - } - } else { - // Skip the true branch target - if (true_target) |target_def_id| { - try self.skipStepByDefId(alloc, all_steps, run_row.id, target_def_id); - } - } - - // 7. Mark condition step as "completed" - const branch_result = if (condition_met) "true" else "false"; - const output_json = try std.fmt.allocPrint(alloc, "{{\"branch\":\"{s}\"}}", .{branch_result}); - try self.store.updateStepStatus(step.id, "completed", null, output_json, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", "{}"); - log.info("condition step {s} evaluated to {s}", .{ step.id, branch_result }); - } - - // ── executeApprovalStep ────────────────────────────────────────── - - fn executeApprovalStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - _ = alloc; - // 1. Mark step as "waiting_approval" - try self.store.updateStepStatus(step.id, "waiting_approval", null, null, null, step.attempt); - // 2. Insert event - try self.store.insertEvent(run_row.id, step.id, "step.waiting_approval", "{}"); - log.info("approval step {s} waiting for approval", .{step.id}); - } - - // ── executeTransformStep ──────────────────────────────────────── - - fn executeTransformStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - // 1. 
Get output_template from workflow_json - const output_template = try getStepField(alloc, run_row.workflow_json, step.def_step_id, "output_template") orelse { - log.warn("no output_template for transform step {s}", .{step.def_step_id}); - try self.store.updateStepStatus(step.id, "failed", null, null, "missing output_template", step.attempt); - return; - }; - - // 2. Build template context (same as task step) - const ctx = try buildTemplateContext(alloc, run_row, step, self.store); - - // 3. Render template - const rendered = templates.render(alloc, output_template, ctx) catch |err| { - const err_msg = std.fmt.allocPrint(alloc, "template render error: {}", .{err}) catch "template render error"; - try self.store.updateStepStatus(step.id, "failed", null, null, err_msg, step.attempt); - return; - }; - - // 4. Wrap as output and mark completed - const output = try wrapOutput(alloc, rendered); - try self.store.updateStepStatus(step.id, "completed", null, output, null, step.attempt); - - // 5. Fire callback + event - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.completed", run_row.id, step.id, output, self.metrics); - try self.store.insertEvent(run_row.id, step.id, "step.completed", output); - log.info("transform step {s} completed", .{step.id}); - } - - // ── executeWaitStep ────────────────────────────────────────────── - - fn executeWaitStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - const now = ids.nowMs(); - - // Check signal mode first - if (try getStepField(alloc, run_row.workflow_json, step.def_step_id, "signal")) |_| { - // Signal mode: set to waiting_approval and wait for external POST /signal - try self.store.updateStepStatus(step.id, "waiting_approval", null, null, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.waiting_signal", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.waiting_signal", run_row.id, step.id, "{}", self.metrics); - 
log.info("wait step {s} waiting for signal", .{step.id}); - return; - } - - // Duration mode - const duration_opt: ?i64 = blk: { - const duration_raw = try getStepFieldRaw(alloc, run_row.workflow_json, step.def_step_id, "duration_ms"); - if (duration_raw != null) { - const dur_int = (try getStepFieldInt(alloc, run_row.workflow_json, step.def_step_id, "duration_ms")) orelse { - try self.failStepWithError(alloc, run_row, step, "duration_ms must be an integer"); - return; - }; - if (dur_int < 0) { - try self.failStepWithError(alloc, run_row, step, "duration_ms must be >= 0"); - return; - } - break :blk dur_int; - } - break :blk null; - }; - if (duration_opt) |duration| { - if (step.started_at_ms) |started| { - // Already running -- check if duration elapsed - if (now - started >= duration) { - const waited = now - started; - const output = try std.fmt.allocPrint(alloc, "{{\"output\":\"waited\",\"waited_ms\":{d}}}", .{waited}); - try self.store.updateStepStatus(step.id, "completed", null, output, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", output); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.completed", run_row.id, step.id, output, self.metrics); - log.info("wait step {s} completed after {d}ms", .{ step.id, waited }); - return; - } - // Not yet -- stay running (do nothing, will be checked next tick) - return; - } - // First time -- mark running and set started_at_ms - try self.store.updateStepStatus(step.id, "running", null, null, null, step.attempt); - try self.store.setStepStartedAt(step.id, now); - return; - } - - // Until_ms mode (check integer field) - if (try getStepFieldInt(alloc, run_row.workflow_json, step.def_step_id, "until_ms")) |until| { - if (until < 0) { - try self.failStepWithError(alloc, run_row, step, "until_ms must be >= 0"); - return; - } - if (now >= until) { - const output = try std.fmt.allocPrint(alloc, "{{\"output\":\"waited\",\"waited_ms\":{d}}}", .{now - (step.started_at_ms orelse 
now)}); - try self.store.updateStepStatus(step.id, "completed", null, output, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", output); - log.info("wait step {s} completed (until_ms reached)", .{step.id}); - return; - } - if (step.started_at_ms == null) { - try self.store.updateStepStatus(step.id, "running", null, null, null, step.attempt); - try self.store.setStepStartedAt(step.id, now); - } - return; - } - - // No wait configuration -- fail - try self.failStepWithError(alloc, run_row, step, "wait step missing duration_ms, until_ms, or signal"); - } - - // ── executeRouterStep ──────────────────────────────────────────── - - fn executeRouterStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow, all_steps: []const types.StepRow) !void { - // 1. Get dependency output - const deps = try self.store.getStepDeps(alloc, step.id); - if (deps.len == 0) { - try self.store.updateStepStatus(step.id, "failed", null, null, "router has no dependencies", step.attempt); - return; - } - - const dep_step = (try self.store.getStep(alloc, deps[0])) orelse { - try self.store.updateStepStatus(step.id, "failed", null, null, "dependency step not found", step.attempt); - return; - }; - const dep_output = extractOutputField(alloc, dep_step.output_json orelse "") catch ""; - - // 2. Parse routes from workflow definition (routes is a JSON object, not a string) - const routes_str = try getStepFieldRaw(alloc, run_row.workflow_json, step.def_step_id, "routes") orelse { - try self.store.updateStepStatus(step.id, "failed", null, null, "router missing routes", step.attempt); - return; - }; - - const default_target = try getStepField(alloc, run_row.workflow_json, step.def_step_id, "default"); - - // 3. 
Parse routes JSON object and find match - var matched_target: ?[]const u8 = null; - var all_targets: std.ArrayListUnmanaged([]const u8) = .empty; - - const parsed = std.json.parseFromSlice(std.json.Value, alloc, routes_str, .{}) catch { - try self.store.updateStepStatus(step.id, "failed", null, null, "invalid routes JSON", step.attempt); - return; - }; - - if (parsed.value == .object) { - var it = parsed.value.object.iterator(); - while (it.next()) |entry| { - const target = switch (entry.value_ptr.*) { - .string => |s| s, - else => continue, - }; - try all_targets.append(alloc, target); - - if (matched_target == null) { - // Check if dep_output contains the route key - if (std.mem.indexOf(u8, dep_output, entry.key_ptr.*) != null) { - matched_target = target; - } - } - } - } - - // 4. Use default if no match - if (matched_target == null) { - matched_target = default_target; - } - - if (matched_target == null) { - try self.store.updateStepStatus(step.id, "failed", null, null, "no matching route and no default", step.attempt); - return; - } - - // 5. Check if matched target is a backward edge (cycle) - const cycle_handled = try self.handleCycleBack(alloc, run_row, step, matched_target.?, all_steps); - if (cycle_handled) return; // Cycle was handled, step is already completed - - // 6. Skip all non-matched targets - for (all_targets.items) |target| { - if (!std.mem.eql(u8, target, matched_target.?)) { - self.skipStepByDefId(alloc, all_steps, run_row.id, target) catch {}; - } - } - - // 7. 
Mark router completed - const output = try std.fmt.allocPrint(alloc, "{{\"output\":\"routed\",\"routed_to\":\"{s}\"}}", .{matched_target.?}); - try self.store.updateStepStatus(step.id, "completed", null, output, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", output); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.completed", run_row.id, step.id, output, self.metrics); - log.info("router step {s} routed to {s}", .{ step.id, matched_target.? }); - } - - // ── executeLoopStep ───────────────────────────────────────────── - // - // First tick (step is "ready", no children exist): - // - Parse body array from workflow definition - // - Create child step instances for iteration 0 - // - Chain body steps sequentially within the iteration - // - Mark loop step as "running" - - fn executeLoopStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - // Parse body array from step definition - const body_raw = try getStepFieldRaw(alloc, run_row.workflow_json, step.def_step_id, "body") orelse { - log.warn("no body for loop step {s}", .{step.def_step_id}); - try self.store.updateStepStatus(step.id, "failed", null, null, "missing body in loop definition", step.attempt); - return; - }; - - const body_parsed = std.json.parseFromSlice(std.json.Value, alloc, body_raw, .{}) catch { - try self.store.updateStepStatus(step.id, "failed", null, null, "invalid body JSON in loop definition", step.attempt); - return; - }; - - if (body_parsed.value != .array or body_parsed.value.array.items.len == 0) { - try self.store.updateStepStatus(step.id, "failed", null, null, "body must be a non-empty array", step.attempt); - return; - } - - const body_items = body_parsed.value.array.items; - - // Create child steps for iteration 0 - try self.createLoopIterationChildren(alloc, run_row, step, body_items, 0); - - // Mark loop step as "running" - try self.store.updateStepStatus(step.id, "running", null, 
null, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.running", "{}"); - log.info("loop step {s} started iteration 0", .{step.id}); - } - - // ── pollRunningLoopStep ───────────────────────────────────────── - // - // Checks progress of a running loop step each tick: - // - Find current iteration (max iteration_index) - // - Check if all children in current iteration are done - // - If any failed -> loop fails - // - If all done: evaluate exit_condition - // - If met -> loop completes - // - If max_iterations reached -> loop completes - // - Else -> create next iteration - - fn pollRunningLoopStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - // Get all children of this loop step - const children = try self.store.getChildSteps(alloc, step.id); - if (children.len == 0) return; // No children yet, wait - - // Find the current (max) iteration_index - var max_iter: i64 = 0; - for (children) |child| { - if (child.iteration_index > max_iter) { - max_iter = child.iteration_index; - } - } - - // Check if all children in the current iteration are in terminal states - var all_done = true; - var any_failed = false; - var last_child_output: ?[]const u8 = null; - - for (children) |child| { - if (child.iteration_index != max_iter) continue; - - if (std.mem.eql(u8, child.status, "failed")) { - any_failed = true; - continue; - } - if (std.mem.eql(u8, child.status, "completed") or std.mem.eql(u8, child.status, "skipped")) { - // Track the last completed child's output (by item_index order) - if (child.output_json != null) { - last_child_output = child.output_json; - } - continue; - } - // Still pending/ready/running - all_done = false; - } - - if (!all_done) return; // Not done yet, wait - - if (any_failed) { - // Loop fails if any child fails - try self.store.updateStepStatus(step.id, "failed", null, null, "loop child step failed", step.attempt); - try self.store.insertEvent(run_row.id, step.id, 
"step.failed", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.failed", run_row.id, step.id, "{}", self.metrics); - log.info("loop step {s} failed (child failed)", .{step.id}); - return; - } - - // All children in current iteration are done. Evaluate exit_condition. - const exit_condition = try getStepField(alloc, run_row.workflow_json, step.def_step_id, "exit_condition"); - const max_iterations = try getStepFieldInt(alloc, run_row.workflow_json, step.def_step_id, "max_iterations") orelse 10; - - // Extract output text from last child for condition matching - const last_output_text = if (last_child_output) |oj| - (extractOutputField(alloc, oj) catch oj) - else - ""; - - // Check exit condition (substring match, same as condition step) - const condition_met = if (exit_condition) |cond| - std.mem.indexOf(u8, last_output_text, cond) != null - else - false; - - if (condition_met) { - // Exit condition met -- loop completes with last child's output - const output = last_child_output orelse try wrapOutput(alloc, "loop completed"); - try self.store.updateStepStatus(step.id, "completed", null, output, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", output); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.completed", run_row.id, step.id, output, self.metrics); - log.info("loop step {s} completed (exit condition met at iteration {d})", .{ step.id, max_iter }); - return; - } - - // Check if max_iterations reached - if (max_iter + 1 >= max_iterations) { - // Max iterations reached -- loop completes with last child's output - const output = last_child_output orelse try wrapOutput(alloc, "loop completed (max iterations)"); - try self.store.updateStepStatus(step.id, "completed", null, output, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", output); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.completed", run_row.id, step.id, output, 
self.metrics); - log.info("loop step {s} completed (max iterations {d} reached)", .{ step.id, max_iterations }); - return; - } - - // Create next iteration - const next_iter = max_iter + 1; - - // Re-parse body to get the body step def IDs - const body_raw = try getStepFieldRaw(alloc, run_row.workflow_json, step.def_step_id, "body") orelse return; - const body_parsed = std.json.parseFromSlice(std.json.Value, alloc, body_raw, .{}) catch return; - if (body_parsed.value != .array) return; - const body_items = body_parsed.value.array.items; - - try self.createLoopIterationChildren(alloc, run_row, step, body_items, next_iter); - log.info("loop step {s} started iteration {d}", .{ step.id, next_iter }); - } - - /// Create child steps for one iteration of a loop. - fn createLoopIterationChildren(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, loop_step: types.StepRow, body_items: []const std.json.Value, iteration: i64) !void { - var prev_child_id: ?[]const u8 = null; - - for (body_items, 0..) 
|body_item, i| { - // Each body_item should be a string (step def ID) - const body_def_id = switch (body_item) { - .string => |s| s, - else => continue, - }; - - // Look up the body step's type from the workflow definition - const body_step_type = try getStepField(alloc, run_row.workflow_json, body_def_id, "type") orelse "task"; - - // Generate unique child step ID - const child_id_buf = ids.generateId(); - const child_id = try alloc.dupe(u8, &child_id_buf); - - // First step in chain is "ready", rest are "pending" - const initial_status: []const u8 = if (i == 0) "ready" else "pending"; - const idx: i64 = @intCast(i); - - try self.store.insertStepWithIteration( - child_id, - run_row.id, - body_def_id, // original def_step_id for template/tag lookup - body_step_type, - initial_status, - "{}", // input_json - 1, // max_attempts - null, // timeout_ms - loop_step.id, // parent_step_id - idx, // item_index (position in body) - iteration, // iteration_index - ); - - // Chain: this step depends on previous step in the body - if (prev_child_id) |prev_id| { - try self.store.insertStepDep(child_id, prev_id); - } - - prev_child_id = child_id; - } - } - - // ── executeSubWorkflowStep ────────────────────────────────────── - // - // First tick (step is "ready", child_run_id is null): - // - Get nested workflow definition - // - Create a child run with the nested workflow - // - Create child run's steps - // - Store child_run_id on the parent step - // - Mark step as "running" - - fn executeSubWorkflowStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - // 1. 
Get nested workflow definition from the step def - const workflow_raw = try getStepFieldRaw(alloc, run_row.workflow_json, step.def_step_id, "workflow") orelse { - log.warn("no workflow for sub_workflow step {s}", .{step.def_step_id}); - try self.store.updateStepStatus(step.id, "failed", null, null, "missing workflow in sub_workflow definition", step.attempt); - return; - }; - - // 2. Parse the nested workflow to extract steps - const nested_parsed = std.json.parseFromSlice(std.json.Value, alloc, workflow_raw, .{}) catch { - try self.store.updateStepStatus(step.id, "failed", null, null, "invalid workflow JSON in sub_workflow definition", step.attempt); - return; - }; - - if (nested_parsed.value != .object) { - try self.store.updateStepStatus(step.id, "failed", null, null, "workflow must be a JSON object", step.attempt); - return; - } - - const nested_steps_val = nested_parsed.value.object.get("steps") orelse { - try self.store.updateStepStatus(step.id, "failed", null, null, "workflow missing steps array", step.attempt); - return; - }; - if (nested_steps_val != .array or nested_steps_val.array.items.len == 0) { - try self.store.updateStepStatus(step.id, "failed", null, null, "workflow steps must be a non-empty array", step.attempt); - return; - } - - // 3. 
Build input for child run from input_mapping (optional) - var child_input_json: []const u8 = run_row.input_json; - if (try getStepFieldRaw(alloc, run_row.workflow_json, step.def_step_id, "input_mapping")) |mapping_raw| { - const mapping_parsed = std.json.parseFromSlice(std.json.Value, alloc, mapping_raw, .{}) catch null; - if (mapping_parsed) |mp| { - if (mp.value == .object) { - // Render each value in the mapping using template context - const ctx = try buildTemplateContext(alloc, run_row, step, self.store); - var result_buf: std.ArrayListUnmanaged(u8) = .empty; - try result_buf.append(alloc, '{'); - var first = true; - var it = mp.value.object.iterator(); - while (it.next()) |entry| { - if (!first) try result_buf.append(alloc, ','); - first = false; - // Write key - try result_buf.append(alloc, '"'); - try result_buf.appendSlice(alloc, entry.key_ptr.*); - try result_buf.appendSlice(alloc, "\":"); - // Render value as template if it's a string - if (entry.value_ptr.* == .string) { - const rendered = templates.render(alloc, entry.value_ptr.string, ctx) catch entry.value_ptr.string; - try result_buf.append(alloc, '"'); - for (rendered) |ch| { - switch (ch) { - '"' => try result_buf.appendSlice(alloc, "\\\""), - '\\' => try result_buf.appendSlice(alloc, "\\\\"), - '\n' => try result_buf.appendSlice(alloc, "\\n"), - '\r' => try result_buf.appendSlice(alloc, "\\r"), - '\t' => try result_buf.appendSlice(alloc, "\\t"), - else => try result_buf.append(alloc, ch), - } - } - try result_buf.append(alloc, '"'); - } else { - // Non-string values: serialize as-is - var out: std.io.Writer.Allocating = .init(alloc); - var jw: std.json.Stringify = .{ .writer = &out.writer }; - jw.write(entry.value_ptr.*) catch {}; - const serialized = out.toOwnedSlice() catch "null"; - try result_buf.appendSlice(alloc, serialized); - } - } - try result_buf.append(alloc, '}'); - child_input_json = try result_buf.toOwnedSlice(alloc); - } - } - } - - // 4. 
Create child run - const child_run_id_buf = ids.generateId(); - const child_run_id = try alloc.dupe(u8, &child_run_id_buf); - - // Build the child workflow_json: wrap the nested workflow with its steps - // The child run's workflow_json should be the workflow_raw itself - try self.store.insertRun(child_run_id, null, "running", workflow_raw, child_input_json, run_row.callbacks_json); - - // 5. Create child run's steps from the nested workflow definition - const nested_steps = nested_steps_val.array.items; - - // Build mapping from def_step_id -> generated step_id - var def_ids: std.ArrayListUnmanaged([]const u8) = .empty; - var gen_ids: std.ArrayListUnmanaged([]const u8) = .empty; - - // First pass: create all steps - for (nested_steps) |step_val| { - if (step_val != .object) continue; - const step_obj = step_val.object; - - const def_step_id = if (step_obj.get("id")) |id_val| blk: { - if (id_val == .string) break :blk id_val.string; - break :blk null; - } else null; - if (def_step_id == null) continue; - - const step_type_str = if (step_obj.get("type")) |t| blk: { - if (t == .string) break :blk t.string; - break :blk "task"; - } else "task"; - - const child_step_id_buf = ids.generateId(); - const child_step_id = try alloc.dupe(u8, &child_step_id_buf); - - // Determine initial status - const has_deps = if (step_obj.get("depends_on")) |deps| blk: { - if (deps == .array and deps.array.items.len > 0) break :blk true; - break :blk false; - } else false; - const initial_status: []const u8 = if (has_deps) "pending" else "ready"; - - try self.store.insertStep( - child_step_id, - child_run_id, - def_step_id.?, - step_type_str, - initial_status, - "{}", - 1, // max_attempts - null, // timeout_ms - null, // parent_step_id - null, // item_index - ); - - try def_ids.append(alloc, def_step_id.?); - try gen_ids.append(alloc, child_step_id); - } - - // Second pass: insert step dependencies - for (nested_steps) |step_val| { - if (step_val != .object) continue; - const step_obj = 
step_val.object; - - const def_step_id = if (step_obj.get("id")) |id_val| blk: { - if (id_val == .string) break :blk id_val.string; - break :blk null; - } else null; - if (def_step_id == null) continue; - - // Find generated step_id - const gen_step_id = lookupId(def_ids.items, gen_ids.items, def_step_id.?) orelse continue; - - const deps_val = step_obj.get("depends_on") orelse continue; - if (deps_val != .array) continue; - - for (deps_val.array.items) |dep_item| { - if (dep_item != .string) continue; - const dep_gen_id = lookupId(def_ids.items, gen_ids.items, dep_item.string) orelse continue; - try self.store.insertStepDep(gen_step_id, dep_gen_id); - } - } - - // 6. Store child_run_id on the parent step - try self.store.updateStepChildRunId(step.id, child_run_id); - - // 7. Mark sub_workflow step as "running" - try self.store.updateStepStatus(step.id, "running", null, null, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.running", "{}"); - log.info("sub_workflow step {s} created child run {s}", .{ step.id, child_run_id }); - } - - // ── pollRunningSubWorkflowStep ────────────────────────────────── - // - // Checks the child run's status each tick: - // - If completed -> mark parent step completed with child's output - // - If failed -> mark parent step failed - // - Otherwise -> wait - - fn pollRunningSubWorkflowStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - const child_run_id = step.child_run_id orelse return; // No child run yet - - // Get child run - const child_run = (try self.store.getRun(alloc, child_run_id)) orelse { - try self.store.updateStepStatus(step.id, "failed", null, null, "child run not found", step.attempt); - return; - }; - - if (std.mem.eql(u8, child_run.status, "completed")) { - // Get the child run's last completed step output - const child_steps = try self.store.getStepsByRun(alloc, child_run_id); - var last_output: ?[]const u8 = null; - for (child_steps) |cs| 
{ - if (std.mem.eql(u8, cs.status, "completed") and cs.output_json != null) { - last_output = cs.output_json; - } - } - const output = last_output orelse try wrapOutput(alloc, "sub_workflow completed"); - try self.store.updateStepStatus(step.id, "completed", null, output, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", output); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.completed", run_row.id, step.id, output, self.metrics); - log.info("sub_workflow step {s} completed (child run {s})", .{ step.id, child_run_id }); - } else if (std.mem.eql(u8, child_run.status, "failed")) { - const err_text = child_run.error_text orelse "child run failed"; - try self.store.updateStepStatus(step.id, "failed", null, null, err_text, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.failed", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.failed", run_row.id, step.id, "{}", self.metrics); - log.info("sub_workflow step {s} failed (child run {s})", .{ step.id, child_run_id }); - } - // Otherwise: child run still in progress, wait - } - - // ── executeDebateStep ────────────────────────────────────────── - // - // Phase 1 (step is "ready"): Create N participant child steps - // Phase 2 (step is "running"): polled by pollRunningDebateStep - - fn executeDebateStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - // 1. Parse count from workflow_json - const count_val = try getStepFieldInt(alloc, run_row.workflow_json, step.def_step_id, "count") orelse { - log.warn("no count for debate step {s}", .{step.def_step_id}); - try self.store.updateStepStatus(step.id, "failed", null, null, "missing count in debate definition", step.attempt); - return; - }; - const count: usize = @intCast(count_val); - - // 2. 
Get prompt_template and render it - const prompt_template = try getStepField(alloc, run_row.workflow_json, step.def_step_id, "prompt_template") orelse { - log.warn("no prompt_template for debate step {s}", .{step.def_step_id}); - try self.store.updateStepStatus(step.id, "failed", null, null, "missing prompt_template in debate definition", step.attempt); - return; - }; - - const ctx = try buildTemplateContext(alloc, run_row, step, self.store); - const rendered_prompt = templates.render(alloc, prompt_template, ctx) catch |err| { - log.err("template render failed for debate step {s}: {}", .{ step.id, err }); - try self.store.updateStepStatus(step.id, "failed", null, null, "template render failed", step.attempt); - return; - }; - - // 3. Create N participant child steps - for (0..count) |i| { - const child_id_buf = ids.generateId(); - const child_id = try alloc.dupe(u8, &child_id_buf); - const child_def_id = try std.fmt.allocPrint(alloc, "{s}_participant_{d}", .{ step.def_step_id, i }); - const idx: i64 = @intCast(i); - - // Store rendered prompt in input_json so participant children can be dispatched. - const input_json = try buildRenderedPromptInputJson(alloc, rendered_prompt); - - try self.store.insertStep( - child_id, - run_row.id, - child_def_id, - "task", - "ready", - input_json, - step.max_attempts, - step.timeout_ms, - step.id, // parent_step_id - idx, - ); - log.info("created debate participant child step {s} (index {d})", .{ child_id, i }); - } - - // 4. Mark debate step as "running" - try self.store.updateStepStatus(step.id, "running", null, null, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.running", "{}"); - log.info("debate step {s} started with {d} participants", .{ step.id, count }); - } - - // ── pollRunningDebateStep ──────────────────────────────────────── - // - // Checks if all participant children are done, then dispatches judge. 
- - fn pollRunningDebateStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - const children = try self.store.getChildSteps(alloc, step.id); - if (children.len == 0) return; - - // Separate participants from judge child - var participants: std.ArrayListUnmanaged(types.StepRow) = .empty; - var judge_child: ?types.StepRow = null; - - for (children) |child| { - if (std.mem.indexOf(u8, child.def_step_id, "_judge") != null) { - judge_child = child; - } else { - try participants.append(alloc, child); - } - } - - // Check if judge child exists and is terminal - if (judge_child) |judge| { - if (std.mem.eql(u8, judge.status, "completed")) { - // Debate completes with judge output - const output = judge.output_json orelse try wrapOutput(alloc, "debate completed"); - try self.store.updateStepStatus(step.id, "completed", null, output, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", output); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.completed", run_row.id, step.id, output, self.metrics); - log.info("debate step {s} completed (judge decided)", .{step.id}); - return; - } else if (std.mem.eql(u8, judge.status, "failed")) { - const err_text = judge.error_text orelse "judge failed"; - try self.store.updateStepStatus(step.id, "failed", null, null, err_text, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.failed", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.failed", run_row.id, step.id, "{}", self.metrics); - log.info("debate step {s} failed (judge failed)", .{step.id}); - return; - } - // Judge still in progress, wait - return; - } - - // No judge child yet — check if all participants are done - var all_done = true; - var any_failed = false; - for (participants.items) |child| { - if (std.mem.eql(u8, child.status, "failed")) { - any_failed = true; - continue; - } - if (!std.mem.eql(u8, child.status, "completed") and 
!std.mem.eql(u8, child.status, "skipped")) { - all_done = false; - } - } - - if (!all_done) return; // Still waiting for participants - - if (any_failed) { - try self.store.updateStepStatus(step.id, "failed", null, null, "debate participant failed", step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.failed", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.failed", run_row.id, step.id, "{}", self.metrics); - return; - } - - // All participants done — collect outputs and create judge child - var response_items: std.ArrayListUnmanaged([]const u8) = .empty; - for (participants.items) |child| { - if (child.output_json) |oj| { - const extracted = extractOutputField(alloc, oj) catch oj; - try response_items.append(alloc, extracted); - } else { - try response_items.append(alloc, ""); - } - } - - // Build debate_responses as JSON array - const debate_responses = try serializeStringArray(alloc, response_items.items); - - // Get judge_template - const judge_template = try getStepField(alloc, run_row.workflow_json, step.def_step_id, "judge_template") orelse { - // No judge template — complete with collected responses - const output = try wrapOutput(alloc, debate_responses); - try self.store.updateStepStatus(step.id, "completed", null, output, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", output); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.completed", run_row.id, step.id, output, self.metrics); - log.info("debate step {s} completed (no judge template, returning responses)", .{step.id}); - return; - }; - - // Render judge_template: replace {{debate_responses}} with actual responses - // Simple string replacement since it's a special variable - var rendered_judge_prompt: []const u8 = judge_template; - if (std.mem.indexOf(u8, judge_template, "{{debate_responses}}")) |_| { - rendered_judge_prompt = try std.mem.replaceOwned(u8, alloc, judge_template, "{{debate_responses}}", 
debate_responses); - } - - // Create judge child step with rendered prompt in input_json - const judge_id_buf = ids.generateId(); - const judge_id = try alloc.dupe(u8, &judge_id_buf); - const judge_def_id = try std.fmt.allocPrint(alloc, "{s}_judge", .{step.def_step_id}); - - const judge_input = try buildRenderedPromptInputJson(alloc, rendered_judge_prompt); - const judge_idx: i64 = @intCast(participants.items.len); - - try self.store.insertStep( - judge_id, - run_row.id, - judge_def_id, - "task", - "ready", - judge_input, - step.max_attempts, - step.timeout_ms, - step.id, // parent_step_id - judge_idx, - ); - - log.info("debate step {s} created judge child {s}", .{ step.id, judge_id }); - } - - // ── executeGroupChatStep ───────────────────────────────────────── - // - // First tick: parse participants, mark as running, start round 1. - // Dispatch is attempted but may fail (no workers in test). - - fn executeGroupChatStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - // 1. Parse participants from workflow_json - const participants_raw = try getStepFieldRaw(alloc, run_row.workflow_json, step.def_step_id, "participants") orelse { - log.warn("no participants for group_chat step {s}", .{step.def_step_id}); - try self.store.updateStepStatus(step.id, "failed", null, null, "missing participants in group_chat definition", step.attempt); - return; - }; - - const parsed_participants = std.json.parseFromSlice(std.json.Value, alloc, participants_raw, .{}) catch { - try self.store.updateStepStatus(step.id, "failed", null, null, "invalid participants JSON", step.attempt); - return; - }; - - if (parsed_participants.value != .array or parsed_participants.value.array.items.len == 0) { - try self.store.updateStepStatus(step.id, "failed", null, null, "participants must be a non-empty array", step.attempt); - return; - } - - // 2. 
Get prompt_template for round 1 - const prompt_template = try getStepField(alloc, run_row.workflow_json, step.def_step_id, "prompt_template") orelse { - try self.store.updateStepStatus(step.id, "failed", null, null, "missing prompt_template in group_chat definition", step.attempt); - return; - }; - - // 3. Render prompt template - const ctx = try buildTemplateContext(alloc, run_row, step, self.store); - const rendered_prompt = templates.render(alloc, prompt_template, ctx) catch |err| { - log.err("template render failed for group_chat step {s}: {}", .{ step.id, err }); - try self.store.updateStepStatus(step.id, "failed", null, null, "template render failed", step.attempt); - return; - }; - - // 4. Mark step as "running" - try self.store.updateStepStatus(step.id, "running", null, null, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.running", "{}"); - - // 5. Dispatch round 1 to each participant (best-effort, failures logged) - const participant_items = parsed_participants.value.array.items; - for (participant_items) |p_val| { - if (p_val != .object) continue; - const p_obj = p_val.object; - - const role = if (p_obj.get("role")) |r| blk: { - if (r == .string) break :blk r.string; - break :blk "participant"; - } else "participant"; - - // Try to dispatch to a worker matching participant tags - const tags_val = p_obj.get("tags"); - var tag_list: std.ArrayListUnmanaged([]const u8) = .empty; - if (tags_val) |tv| { - if (tv == .array) { - for (tv.array.items) |tag_item| { - if (tag_item == .string) { - try tag_list.append(alloc, tag_item.string); - } - } - } - } - - // Get workers - const workers = try self.store.listWorkers(alloc); - var worker_infos: std.ArrayListUnmanaged(dispatch.WorkerInfo) = .empty; - for (workers) |w| { - const current_tasks = self.store.countRunningStepsByWorker(w.id) catch 0; - try worker_infos.append(alloc, .{ - .id = w.id, - .url = w.url, - .token = w.token, - .protocol = w.protocol, - .model = w.model, - .tags_json 
= w.tags_json, - .max_concurrent = w.max_concurrent, - .status = w.status, - .current_tasks = current_tasks, - }); - } - - const selected = try dispatch.selectWorker(alloc, worker_infos.items, tag_list.items); - if (selected) |worker| { - const result = try dispatch.dispatchStep( - alloc, - worker.url, - worker.token, - worker.protocol, - worker.model, - run_row.id, - step.id, - rendered_prompt, - ); - if (result.success) { - try self.store.insertChatMessage(run_row.id, step.id, 1, role, worker.id, result.output); - } else { - log.warn("group_chat dispatch failed for role {s}: {s}", .{ role, result.error_text orelse "unknown" }); - } - } else { - log.debug("no worker available for group_chat participant role {s}", .{role}); - } - } - - log.info("group_chat step {s} started round 1 with {d} participants", .{ step.id, participant_items.len }); - } - - // ── pollRunningGroupChatStep ───────────────────────────────────── - // - // Each tick: check current round, dispatch next round or complete. - - fn pollRunningGroupChatStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - // 1. Get all chat messages for this step - const messages = try self.store.getChatMessages(alloc, step.id); - - // 2. Parse configuration - const max_rounds = try getStepFieldInt(alloc, run_row.workflow_json, step.def_step_id, "max_rounds") orelse 5; - const exit_condition = try getStepField(alloc, run_row.workflow_json, step.def_step_id, "exit_condition"); - - // 3. Parse participants to know expected count per round - const participants_raw = try getStepFieldRaw(alloc, run_row.workflow_json, step.def_step_id, "participants") orelse return; - const parsed_participants = std.json.parseFromSlice(std.json.Value, alloc, participants_raw, .{}) catch return; - if (parsed_participants.value != .array) return; - const num_participants: i64 = @intCast(parsed_participants.value.array.items.len); - - // 4. 
Determine current round from messages - var current_round: i64 = 0; - var current_round_count: i64 = 0; - for (messages) |msg| { - if (msg.round > current_round) { - current_round = msg.round; - current_round_count = 1; - } else if (msg.round == current_round) { - current_round_count += 1; - } - } - - if (current_round == 0) return; // No messages yet, wait for initial dispatch - - // 5. Check if current round is complete (all participants responded) - if (current_round_count < num_participants) { - // Round not complete, wait - return; - } - - // 6. Check exit condition in latest round's messages - if (exit_condition) |cond| { - for (messages) |msg| { - if (msg.round == current_round) { - if (std.mem.indexOf(u8, msg.message, cond) != null) { - // Exit condition met — complete with transcript - const transcript = try buildChatTranscript(alloc, messages); - const output = try wrapOutput(alloc, transcript); - try self.store.updateStepStatus(step.id, "completed", null, output, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", output); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.completed", run_row.id, step.id, output, self.metrics); - log.info("group_chat step {s} completed (exit condition met at round {d})", .{ step.id, current_round }); - return; - } - } - } - } - - // 7. Check if max rounds reached - if (current_round >= max_rounds) { - const transcript = try buildChatTranscript(alloc, messages); - const output = try wrapOutput(alloc, transcript); - try self.store.updateStepStatus(step.id, "completed", null, output, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", output); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.completed", run_row.id, step.id, output, self.metrics); - log.info("group_chat step {s} completed (max rounds {d} reached)", .{ step.id, max_rounds }); - return; - } - - // 8. 
Start next round — build chat history and dispatch - const next_round = current_round + 1; - const chat_history = try buildChatTranscript(alloc, messages); - - const round_template = try getStepField(alloc, run_row.workflow_json, step.def_step_id, "round_template") orelse { - // No round_template — complete with what we have - const output = try wrapOutput(alloc, chat_history); - try self.store.updateStepStatus(step.id, "completed", null, output, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", output); - return; - }; - - // Dispatch to each participant with round_template - const participant_items = parsed_participants.value.array.items; - for (participant_items) |p_val| { - if (p_val != .object) continue; - const p_obj = p_val.object; - - const role = if (p_obj.get("role")) |r| blk: { - if (r == .string) break :blk r.string; - break :blk "participant"; - } else "participant"; - - // Render round_template with {{chat_history}} and {{role}} - var rendered = try std.mem.replaceOwned(u8, alloc, round_template, "{{chat_history}}", chat_history); - rendered = try std.mem.replaceOwned(u8, alloc, rendered, "{{role}}", role); - - // Get participant tags - const tags_val = p_obj.get("tags"); - var tag_list: std.ArrayListUnmanaged([]const u8) = .empty; - if (tags_val) |tv| { - if (tv == .array) { - for (tv.array.items) |tag_item| { - if (tag_item == .string) { - try tag_list.append(alloc, tag_item.string); - } - } - } - } - - // Select worker and dispatch - const workers = try self.store.listWorkers(alloc); - var worker_infos: std.ArrayListUnmanaged(dispatch.WorkerInfo) = .empty; - for (workers) |w| { - const current_tasks = self.store.countRunningStepsByWorker(w.id) catch 0; - try worker_infos.append(alloc, .{ - .id = w.id, - .url = w.url, - .token = w.token, - .protocol = w.protocol, - .model = w.model, - .tags_json = w.tags_json, - .max_concurrent = w.max_concurrent, - .status = w.status, - .current_tasks = current_tasks, - }); - } 
- - const selected = try dispatch.selectWorker(alloc, worker_infos.items, tag_list.items); - if (selected) |worker| { - const result = try dispatch.dispatchStep( - alloc, - worker.url, - worker.token, - worker.protocol, - worker.model, - run_row.id, - step.id, - rendered, - ); - if (result.success) { - try self.store.insertChatMessage(run_row.id, step.id, next_round, role, worker.id, result.output); - } else { - log.warn("group_chat round {d} dispatch failed for role {s}", .{ next_round, role }); - } - } else { - log.debug("no worker for group_chat round {d} participant role {s}", .{ next_round, role }); - } - } - - log.info("group_chat step {s} dispatched round {d}", .{ step.id, next_round }); - } - - // ── executeSagaStep ───────────────────────────────────────────── - // - // First tick (step is "ready"): - // - Parse body array and compensations map from workflow definition - // - Create first body step as child (status="ready") - // - Initialize saga_state entries for all body steps - // - Mark saga step as "running" - - fn executeSagaStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - // 1. Parse body array from step definition - const body_raw = try getStepFieldRaw(alloc, run_row.workflow_json, step.def_step_id, "body") orelse { - log.warn("no body for saga step {s}", .{step.def_step_id}); - try self.store.updateStepStatus(step.id, "failed", null, null, "missing body in saga definition", step.attempt); - return; - }; - - const body_parsed = std.json.parseFromSlice(std.json.Value, alloc, body_raw, .{}) catch { - try self.store.updateStepStatus(step.id, "failed", null, null, "invalid body JSON in saga definition", step.attempt); - return; - }; - - if (body_parsed.value != .array or body_parsed.value.array.items.len == 0) { - try self.store.updateStepStatus(step.id, "failed", null, null, "body must be a non-empty array", step.attempt); - return; - } - - const body_items = body_parsed.value.array.items; - - // 2. 
Parse compensations map (optional) - const comp_raw = try getStepFieldRaw(alloc, run_row.workflow_json, step.def_step_id, "compensations"); - var comp_map: ?std.json.ObjectMap = null; - if (comp_raw) |cr| { - const comp_parsed = std.json.parseFromSlice(std.json.Value, alloc, cr, .{}) catch null; - if (comp_parsed) |cp| { - if (cp.value == .object) { - comp_map = cp.value.object; - } - } - } - - // 3. Initialize saga_state for all body steps and create first child - for (body_items, 0..) |body_item, i| { - const body_def_id = switch (body_item) { - .string => |s| s, - else => continue, - }; - - // Look up compensation for this body step - var comp_def_id: ?[]const u8 = null; - if (comp_map) |cm| { - if (cm.get(body_def_id)) |cv| { - if (cv == .string) { - comp_def_id = cv.string; - } - } - } - - // Insert saga_state entry - try self.store.insertSagaState(run_row.id, step.id, body_def_id, comp_def_id); - - // Create child step for first body step only (rest created sequentially) - if (i == 0) { - const body_step_type = try getStepField(alloc, run_row.workflow_json, body_def_id, "type") orelse "task"; - const child_id_buf = ids.generateId(); - const child_id = try alloc.dupe(u8, &child_id_buf); - - try self.store.insertStep( - child_id, - run_row.id, - body_def_id, - body_step_type, - "ready", - step.input_json, - step.max_attempts, - step.timeout_ms, - step.id, // parent_step_id - 0, // item_index - ); - log.info("saga step {s} created first body child {s} (def: {s})", .{ step.id, child_id, body_def_id }); - } - } - - // 4. 
Mark saga step as "running" - try self.store.updateStepStatus(step.id, "running", null, null, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.running", "{}"); - log.info("saga step {s} started with {d} body steps", .{ step.id, body_items.len }); - } - - // ── pollRunningSagaStep ────────────────────────────────────────── - // - // Each tick: - // - Get saga_state entries to understand progress - // - Find current body step child and check its status - // - If completed: update saga_state, create next body step - // - If all body steps completed: mark saga completed - // - If body step failed: enter compensation mode - // - Track compensation progress - - fn pollRunningSagaStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { - const children = try self.store.getChildSteps(alloc, step.id); - if (children.len == 0) return; - - const saga_states = try self.store.getSagaStates(alloc, run_row.id, step.id); - if (saga_states.len == 0) return; - - // Parse body array to know the order - const body_raw = try getStepFieldRaw(alloc, run_row.workflow_json, step.def_step_id, "body") orelse return; - const body_parsed = std.json.parseFromSlice(std.json.Value, alloc, body_raw, .{}) catch return; - if (body_parsed.value != .array) return; - const body_items = body_parsed.value.array.items; - - // Build body def IDs list in order - var body_def_ids: std.ArrayListUnmanaged([]const u8) = .empty; - for (body_items) |bi| { - if (bi == .string) { - try body_def_ids.append(alloc, bi.string); - } - } - - // Check if we're in compensation mode (any saga_state has status "compensating") - var in_compensation = false; - for (saga_states) |ss| { - if (std.mem.eql(u8, ss.status, "compensating")) { - in_compensation = true; - break; - } - } - - if (in_compensation) { - // In compensation mode: check if current compensation child is done - try self.pollSagaCompensation(alloc, run_row, step, children, saga_states, 
body_def_ids.items); - return; - } - - // Forward mode: check the current body step child - // Find which body step we're on by looking at saga_states - var current_body_idx: ?usize = null; - var failed_body_def_id: ?[]const u8 = null; - - for (saga_states, 0..) |ss, i| { - if (std.mem.eql(u8, ss.status, "pending")) { - // This is the next body step to process or the current one - // Check if there's a child for this body step - var has_child = false; - for (children) |child| { - if (std.mem.eql(u8, child.def_step_id, ss.body_step_id)) { - has_child = true; - if (std.mem.eql(u8, child.status, "completed")) { - // Body step completed — update saga_state - try self.store.updateSagaState(run_row.id, step.id, ss.body_step_id, "completed"); - log.info("saga body step {s} completed", .{ss.body_step_id}); - // Create next body step if there is one - if (i + 1 < saga_states.len) { - const next_def_id = saga_states[i + 1].body_step_id; - const next_type = try getStepField(alloc, run_row.workflow_json, next_def_id, "type") orelse "task"; - const next_id_buf = ids.generateId(); - const next_id = try alloc.dupe(u8, &next_id_buf); - const next_idx: i64 = @intCast(i + 1); - - try self.store.insertStep( - next_id, - run_row.id, - next_def_id, - next_type, - "ready", - step.input_json, - step.max_attempts, - step.timeout_ms, - step.id, - next_idx, - ); - log.info("saga step {s} created body child {s} (def: {s})", .{ step.id, next_id, next_def_id }); - } - // Don't process further this tick - return; - } else if (std.mem.eql(u8, child.status, "failed")) { - // Body step failed — enter compensation mode - failed_body_def_id = ss.body_step_id; - current_body_idx = i; - break; - } - // Still running/ready — wait - return; - } - } - if (!has_child) { - // First pending step without a child — this shouldn't happen normally - // since executeSagaStep creates the first and we create subsequent ones - return; - } - break; - } - } - - // Check if ALL body steps are completed - var 
all_completed = true; - for (saga_states) |ss| { - if (!std.mem.eql(u8, ss.status, "completed")) { - all_completed = false; - break; - } - } - - if (all_completed) { - // Saga completed successfully — output is last body step's output - var last_output: ?[]const u8 = null; - for (children) |child| { - if (std.mem.eql(u8, child.status, "completed") and child.output_json != null) { - // Check if this child is the last body step - if (body_def_ids.items.len > 0 and - std.mem.eql(u8, child.def_step_id, body_def_ids.items[body_def_ids.items.len - 1])) - { - last_output = child.output_json; - } - } - } - const output = last_output orelse try wrapOutput(alloc, "saga completed"); - try self.store.updateStepStatus(step.id, "completed", null, output, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.completed", output); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.completed", run_row.id, step.id, output, self.metrics); - log.info("saga step {s} completed successfully", .{step.id}); - return; - } - - // Check if compensation has fully completed (all compensating states - // have become "compensated" and at least one is "failed") - { - var has_failed_state = false; - var has_unfinished_compensation = false; - for (saga_states) |ss| { - if (std.mem.eql(u8, ss.status, "failed")) { - has_failed_state = true; - } else if (std.mem.eql(u8, ss.status, "compensating")) { - has_unfinished_compensation = true; - } - } - if (has_failed_state and !has_unfinished_compensation) { - try self.finishSagaCompensation(alloc, run_row, step, saga_states); - return; - } - } - - // If a body step failed, start compensation - if (failed_body_def_id) |failed_def| { - log.info("saga step {s} body step {s} failed, starting compensation", .{ step.id, failed_def }); - - // Mark the failed body step in saga_state - try self.store.updateSagaState(run_row.id, step.id, failed_def, "failed"); - - // Find completed body steps and start compensating in reverse - // 
Mark all completed body steps as "compensating" - var completed_steps: std.ArrayListUnmanaged([]const u8) = .empty; - for (saga_states) |ss| { - if (std.mem.eql(u8, ss.status, "completed")) { - try completed_steps.append(alloc, ss.body_step_id); - try self.store.updateSagaState(run_row.id, step.id, ss.body_step_id, "compensating"); - } - } - - if (completed_steps.items.len == 0) { - // No completed steps to compensate — saga fails immediately - const output = try std.fmt.allocPrint(alloc, "{{\"failed_at\":\"{s}\",\"compensated\":[]}}", .{failed_def}); - try self.store.updateStepStatus(step.id, "failed", null, output, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.failed", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.failed", run_row.id, step.id, "{}", self.metrics); - log.info("saga step {s} failed at {s}, no compensations needed", .{ step.id, failed_def }); - return; - } - - // Create the last completed step's compensation child (reverse order) - // Start from the last completed body step - const last_completed = completed_steps.items[completed_steps.items.len - 1]; - try self.createCompensationChild(alloc, run_row, step, saga_states, last_completed); - } - } - - /// Create a compensation child step for a given body step. 
- fn createCompensationChild(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, saga_step: types.StepRow, saga_states: []const types.SagaStateRow, body_def_id: []const u8) !void { - // Find the compensation def_id for this body step - var comp_def_id: ?[]const u8 = null; - for (saga_states) |ss| { - if (std.mem.eql(u8, ss.body_step_id, body_def_id)) { - comp_def_id = ss.compensation_step_id; - break; - } - } - - if (comp_def_id == null) { - // No compensation for this step — mark as compensated immediately - try self.store.updateSagaState(run_row.id, saga_step.id, body_def_id, "compensated"); - log.info("saga body step {s} has no compensation, marking compensated", .{body_def_id}); - return; - } - - const comp_type = try getStepField(alloc, run_row.workflow_json, comp_def_id.?, "type") orelse "task"; - const comp_child_id_buf = ids.generateId(); - const comp_child_id = try alloc.dupe(u8, &comp_child_id_buf); - - try self.store.insertStep( - comp_child_id, - run_row.id, - comp_def_id.?, - comp_type, - "ready", - "{}", - 1, // max_attempts - null, // timeout_ms - saga_step.id, // parent_step_id - null, // item_index - ); - log.info("saga step {s} created compensation child {s} for body {s}", .{ saga_step.id, comp_child_id, body_def_id }); - } - - /// Poll compensation progress in a saga step. 
- fn pollSagaCompensation(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow, children: []const types.StepRow, saga_states: []const types.SagaStateRow, body_def_ids: []const []const u8) !void { - // Find the body step currently being compensated (has a running/ready compensation child) - // Work backwards through body_def_ids to find the current compensating step - var compensating_body: ?[]const u8 = null; - var compensating_idx: ?usize = null; - - // Find compensating steps in reverse order (last completed first) - var i: usize = body_def_ids.len; - while (i > 0) { - i -= 1; - for (saga_states) |ss| { - if (std.mem.eql(u8, ss.body_step_id, body_def_ids[i]) and - std.mem.eql(u8, ss.status, "compensating")) - { - compensating_body = body_def_ids[i]; - compensating_idx = i; - break; - } - } - if (compensating_body != null) break; - } - - if (compensating_body == null) { - // All compensations done — build failure output and fail saga - try self.finishSagaCompensation(alloc, run_row, step, saga_states); - return; - } - - // Check if there's a compensation child for this body step - var comp_def_id: ?[]const u8 = null; - for (saga_states) |ss| { - if (std.mem.eql(u8, ss.body_step_id, compensating_body.?)) { - comp_def_id = ss.compensation_step_id; - break; - } - } - - if (comp_def_id == null) { - // No compensation defined — mark as compensated and move on - try self.store.updateSagaState(run_row.id, step.id, compensating_body.?, "compensated"); - return; - } - - // Find the compensation child step - var comp_child: ?types.StepRow = null; - for (children) |child| { - if (std.mem.eql(u8, child.def_step_id, comp_def_id.?)) { - comp_child = child; - } - } - - if (comp_child == null) { - // Compensation child not created yet — create it - try self.createCompensationChild(alloc, run_row, step, saga_states, compensating_body.?); - return; - } - - const comp = comp_child.?; - if (std.mem.eql(u8, comp.status, "completed")) { - // Compensation 
completed — mark this body step as compensated - try self.store.updateSagaState(run_row.id, step.id, compensating_body.?, "compensated"); - log.info("saga compensation for body step {s} completed", .{compensating_body.?}); - - // Find next compensating step (earlier in the list) - if (compensating_idx.? > 0) { - var next_idx: ?usize = null; - var j: usize = compensating_idx.?; - while (j > 0) { - j -= 1; - for (saga_states) |ss| { - if (std.mem.eql(u8, ss.body_step_id, body_def_ids[j]) and - std.mem.eql(u8, ss.status, "compensating")) - { - next_idx = j; - break; - } - } - if (next_idx != null) break; - } - - // Check if any compensating steps remain. We may have already - // updated some to compensated in previous iterations, so re-check. - // The next tick will pick them up via pollSagaCompensation. - } - } else if (std.mem.eql(u8, comp.status, "failed")) { - // Compensation itself failed — saga fails with compensation error - const err_msg = try std.fmt.allocPrint(alloc, "compensation step {s} failed", .{comp_def_id.?}); - try self.store.updateStepStatus(step.id, "failed", null, null, err_msg, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.failed", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.failed", run_row.id, step.id, "{}", self.metrics); - log.info("saga step {s} failed during compensation", .{step.id}); - } - // Otherwise compensation child still running/ready — wait - } - - /// Finish saga compensation and mark saga as failed with output. 
- fn finishSagaCompensation(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow, saga_states: []const types.SagaStateRow) !void { - // Build list of compensated steps and find failed_at step - var failed_at: []const u8 = "unknown"; - var compensated: std.ArrayListUnmanaged([]const u8) = .empty; - - for (saga_states) |ss| { - if (std.mem.eql(u8, ss.status, "failed")) { - failed_at = ss.body_step_id; - } else if (std.mem.eql(u8, ss.status, "compensated")) { - try compensated.append(alloc, ss.body_step_id); - } - } - - // Build output JSON - var comp_json: std.ArrayListUnmanaged(u8) = .empty; - try comp_json.append(alloc, '['); - for (compensated.items, 0..) |c, ci| { - if (ci > 0) try comp_json.append(alloc, ','); - try comp_json.append(alloc, '"'); - try comp_json.appendSlice(alloc, c); - try comp_json.append(alloc, '"'); - } - try comp_json.append(alloc, ']'); - const comp_str = try comp_json.toOwnedSlice(alloc); - - const output = try std.fmt.allocPrint(alloc, "{{\"failed_at\":\"{s}\",\"compensated\":{s}}}", .{ failed_at, comp_str }); - - try self.store.updateStepStatus(step.id, "failed", null, output, null, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.failed", output); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.failed", run_row.id, step.id, output, self.metrics); - log.info("saga step {s} failed at {s}, compensated {d} steps", .{ step.id, failed_at, compensated.items.len }); - } - - // ── handleCycleBack ───────────────────────────────────────────── - // - // When a condition/router routes to an already-completed step, - // detect the cycle and create new step instances for the cycle body. - - fn handleCycleBack(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, routing_step: types.StepRow, target_def_id: []const u8, all_steps: []const types.StepRow) !bool { - // 1. 
Check if target step is already completed/skipped - var target_completed = false; - for (all_steps) |s| { - if (std.mem.eql(u8, s.def_step_id, target_def_id) and - (std.mem.eql(u8, s.status, "completed") or std.mem.eql(u8, s.status, "skipped"))) - { - target_completed = true; - break; - } - } - - if (!target_completed) return false; // Not a backward edge - - // 2. Build cycle_key from routing step's def_step_id - const cycle_key = try std.fmt.allocPrint(alloc, "cycle_{s}", .{routing_step.def_step_id}); - - // 3. Get or initialize cycle state - const cycle_state = try self.store.getCycleState(run_row.id, cycle_key); - var iteration_count: i64 = 0; - var max_iterations: i64 = 10; - - if (cycle_state) |cs| { - iteration_count = cs.iteration_count; - max_iterations = cs.max_iterations; - } - - // Check max_cycle_iterations from workflow config - const wf_max = try getStepFieldInt(alloc, run_row.workflow_json, routing_step.def_step_id, "max_cycle_iterations"); - if (wf_max) |m| { - max_iterations = m; - } - - // 4. Check if limit exceeded - if (iteration_count >= max_iterations) { - const err_msg = try std.fmt.allocPrint(alloc, "cycle iteration limit ({d}) exceeded for {s}", .{ max_iterations, cycle_key }); - try self.store.updateStepStatus(routing_step.id, "failed", null, null, err_msg, routing_step.attempt); - try self.store.insertEvent(run_row.id, routing_step.id, "step.failed", "{}"); - try self.store.updateRunStatus(run_row.id, "failed", err_msg); - log.warn("cycle limit exceeded for {s}", .{cycle_key}); - return true; - } - - // 5. Increment cycle iteration - iteration_count += 1; - try self.store.upsertCycleState(run_row.id, cycle_key, iteration_count, max_iterations); - - // 6. 
Walk workflow_json steps to find the cycle body - // (from target_def_id through routing step's def_step_id) - const parsed = std.json.parseFromSlice(std.json.Value, alloc, run_row.workflow_json, .{}) catch return false; - if (parsed.value != .object) return false; - const steps_val = parsed.value.object.get("steps") orelse return false; - if (steps_val != .array) return false; - - // Build ordered list of step def IDs and their types + depends_on - const StepInfo = struct { - def_id: []const u8, - step_type: []const u8, - depends_on: []const []const u8, - }; - - var step_infos: std.ArrayListUnmanaged(StepInfo) = .empty; - for (steps_val.array.items) |step_val| { - if (step_val != .object) continue; - const step_obj = step_val.object; - const id_val = step_obj.get("id") orelse continue; - if (id_val != .string) continue; - - const stype = if (step_obj.get("type")) |t| blk: { - if (t == .string) break :blk t.string; - break :blk "task"; - } else "task"; - - var deps_list: std.ArrayListUnmanaged([]const u8) = .empty; - if (step_obj.get("depends_on")) |deps_val| { - if (deps_val == .array) { - for (deps_val.array.items) |dep_item| { - if (dep_item == .string) { - try deps_list.append(alloc, dep_item.string); - } - } - } - } - - try step_infos.append(alloc, .{ - .def_id = id_val.string, - .step_type = stype, - .depends_on = try deps_list.toOwnedSlice(alloc), - }); - } - - // Find indices of target and routing step in the workflow - var target_idx: ?usize = null; - var routing_idx: ?usize = null; - for (step_infos.items, 0..) |si, idx| { - if (std.mem.eql(u8, si.def_id, target_def_id)) target_idx = idx; - if (std.mem.eql(u8, si.def_id, routing_step.def_step_id)) routing_idx = idx; - } - - if (target_idx == null or routing_idx == null) return false; - if (target_idx.? >= routing_idx.?) return false; // Not a backward edge - - // 7. 
Create new step instances for target through routing step - var new_step_ids: std.ArrayListUnmanaged([]const u8) = .empty; - var new_def_ids: std.ArrayListUnmanaged([]const u8) = .empty; - - var idx: usize = target_idx.?; - while (idx <= routing_idx.?) : (idx += 1) { - const si = step_infos.items[idx]; - const new_id_buf = ids.generateId(); - const new_id = try alloc.dupe(u8, &new_id_buf); - - // First step in cycle is "ready", rest are "pending" - const initial_status: []const u8 = if (idx == target_idx.?) "ready" else "pending"; - - try self.store.insertStepWithIteration( - new_id, - run_row.id, - si.def_id, - si.step_type, - initial_status, - "{}", - 1, - null, - null, - null, - iteration_count, - ); - - try new_step_ids.append(alloc, new_id); - try new_def_ids.append(alloc, si.def_id); - } - - // 8. Chain new instances with deps among themselves - for (step_infos.items[target_idx.? .. routing_idx.? + 1], 0..) |si, si_idx| { - const new_id = new_step_ids.items[si_idx]; - for (si.depends_on) |dep_def_id| { - // Check if dep is within the cycle body - const dep_new_id = lookupId(new_def_ids.items, new_step_ids.items, dep_def_id); - if (dep_new_id) |did| { - try self.store.insertStepDep(new_id, did); - } - } - } - - // 9. For any step outside the cycle that depended on the routing step, - // add a dep to the new routing step instance - const new_routing_id = new_step_ids.items[new_step_ids.items.len - 1]; - for (all_steps) |s| { - // Skip steps inside the cycle body - var in_cycle = false; - for (new_def_ids.items) |cd| { - if (std.mem.eql(u8, s.def_step_id, cd)) { - in_cycle = true; - break; - } - } - if (in_cycle) continue; - - // Check if this step depends on the old routing step - const deps = try self.store.getStepDeps(alloc, s.id); - for (deps) |dep_id| { - if (std.mem.eql(u8, dep_id, routing_step.id)) { - // Add new dep to the new routing step instance - try self.store.insertStepDep(s.id, new_routing_id); - break; - } - } - } - - // 10. 
Mark the routing step as completed (the current instance) - const output = try std.fmt.allocPrint(alloc, "{{\"output\":\"cycle_back\",\"target\":\"{s}\",\"iteration\":{d}}}", .{ target_def_id, iteration_count }); - try self.store.updateStepStatus(routing_step.id, "completed", null, output, null, routing_step.attempt); - try self.store.insertEvent(run_row.id, routing_step.id, "step.completed", output); - log.info("cycle back from {s} to {s} (iteration {d})", .{ routing_step.def_step_id, target_def_id, iteration_count }); - - return true; - } - - // ── checkRunCompletion ─────────────────────────────────────────── - - fn checkRunCompletion(self: *Engine, run_id: []const u8, alloc: std.mem.Allocator) !void { - const steps = try self.store.getStepsByRun(alloc, run_id); - var all_terminal = true; - var any_failed = false; - for (steps) |step| { - if (std.mem.eql(u8, step.status, "completed") or std.mem.eql(u8, step.status, "skipped")) continue; - if (std.mem.eql(u8, step.status, "failed")) { - any_failed = true; - continue; - } - if (std.mem.eql(u8, step.status, "waiting_approval")) { - all_terminal = false; - continue; - } - all_terminal = false; // pending, ready, running - } - if (all_terminal and !any_failed) { - try self.store.updateRunStatus(run_id, "completed", null); - try self.store.insertEvent(run_id, null, "run.completed", "{}"); - // Fire run.completed callbacks - if (try self.store.getRun(alloc, run_id)) |run_row| { - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.completed", run_id, null, "{}", self.metrics); - } - log.info("run {s} completed", .{run_id}); - } else if (all_terminal and any_failed) { - try self.store.updateRunStatus(run_id, "failed", "one or more steps failed"); - try self.store.insertEvent(run_id, null, "run.failed", "{}"); - // Fire run.failed callbacks - if (try self.store.getRun(alloc, run_id)) |run_row| { - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.failed", run_id, null, "{}", self.metrics); - } - 
log.info("run {s} failed", .{run_id}); - } - } - - // ── Helpers ────────────────────────────────────────────────────── - - fn skipStepByDefId(self: *Engine, alloc: std.mem.Allocator, all_steps: []const types.StepRow, run_id: []const u8, target_def_id: []const u8) !void { - for (all_steps) |s| { - if (std.mem.eql(u8, s.def_step_id, target_def_id)) { - try self.store.updateStepStatus(s.id, "skipped", null, null, null, s.attempt); - try self.store.insertEvent(run_id, s.id, "step.skipped", "{}"); - log.info("skipped step {s} (def: {s})", .{ s.id, target_def_id }); - break; - } - } - _ = alloc; - } - - fn failStepWithError(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow, err_text: []const u8) !void { - try self.store.updateStepStatus(step.id, "failed", null, null, err_text, step.attempt); - try self.store.insertEvent(run_row.id, step.id, "step.failed", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.failed", run_row.id, step.id, "{}", self.metrics); - } -}; - -fn computeRetryDelayMs(cfg: RuntimeConfig, step: types.StepRow, now_ms: i64) i64 { - var delay = cfg.retry_base_delay_ms; - var remaining_exp = step.attempt - 1; - while (remaining_exp > 0) : (remaining_exp -= 1) { - if (delay >= cfg.retry_max_delay_ms) break; - const doubled = delay * 2; - delay = if (doubled > cfg.retry_max_delay_ms) cfg.retry_max_delay_ms else doubled; - } - - const jitter_cap = if (cfg.retry_jitter_ms > 0) cfg.retry_jitter_ms else 0; - var jitter: i64 = 0; - if (jitter_cap > 0) { - const seed = std.hash.Wyhash.hash(0, step.id); - const mixed = seed ^ @as(u64, @intCast(now_ms)); - jitter = @as(i64, @intCast(mixed % @as(u64, @intCast(jitter_cap + 1)))); - } - return delay + jitter; -} - -// ── Free functions (workflow JSON helpers) ──────────────────────────── - -/// Parse workflow_json to find a step definition by def_step_id and return a string field. 
-fn getStepField(alloc: std.mem.Allocator, workflow_json: []const u8, def_step_id: []const u8, field: []const u8) !?[]const u8 { - const parsed = std.json.parseFromSlice(std.json.Value, alloc, workflow_json, .{}) catch { - return null; - }; - // Note: do not deinit here — the alloc is an arena - - const root = parsed.value; - if (root != .object) return null; - - const steps_val = root.object.get("steps") orelse return null; - if (steps_val != .array) return null; - - for (steps_val.array.items) |step_val| { - if (step_val != .object) continue; - const step_obj = step_val.object; - - const id_val = step_obj.get("id") orelse continue; - if (id_val != .string) continue; - if (!std.mem.eql(u8, id_val.string, def_step_id)) continue; - - const field_val = step_obj.get(field) orelse return null; - if (field_val == .string) { - return try alloc.dupe(u8, field_val.string); - } - return null; - } - return null; -} - -/// Parse workflow_json to find a step definition by def_step_id and return a field as raw JSON. -/// Unlike getStepField which only returns strings, this serializes any JSON value type. 
-fn getStepFieldRaw(alloc: std.mem.Allocator, workflow_json: []const u8, def_step_id: []const u8, field: []const u8) !?[]const u8 { - const parsed = std.json.parseFromSlice(std.json.Value, alloc, workflow_json, .{}) catch { - return null; - }; - - const root = parsed.value; - if (root != .object) return null; - - const steps_val = root.object.get("steps") orelse return null; - if (steps_val != .array) return null; - - for (steps_val.array.items) |step_val| { - if (step_val != .object) continue; - const step_obj = step_val.object; - - const id_val = step_obj.get("id") orelse continue; - if (id_val != .string) continue; - if (!std.mem.eql(u8, id_val.string, def_step_id)) continue; - - const field_val = step_obj.get(field) orelse return null; - if (field_val == .string) { - return try alloc.dupe(u8, field_val.string); - } - // Serialize non-string values as JSON - var out: std.io.Writer.Allocating = .init(alloc); - var jw: std.json.Stringify = .{ .writer = &out.writer }; - jw.write(field_val) catch return null; - return out.toOwnedSlice() catch return null; - } - return null; -} - -/// Parse workflow_json to find a step definition by def_step_id and return an integer field. 
-fn getStepFieldInt(alloc: std.mem.Allocator, workflow_json: []const u8, def_step_id: []const u8, field: []const u8) !?i64 { - const parsed = std.json.parseFromSlice(std.json.Value, alloc, workflow_json, .{}) catch { - return null; - }; - - const root = parsed.value; - if (root != .object) return null; - - const steps_val = root.object.get("steps") orelse return null; - if (steps_val != .array) return null; - - for (steps_val.array.items) |step_val| { - if (step_val != .object) continue; - const step_obj = step_val.object; - - const id_val = step_obj.get("id") orelse continue; - if (id_val != .string) continue; - if (!std.mem.eql(u8, id_val.string, def_step_id)) continue; - - const field_val = step_obj.get(field) orelse return null; - if (field_val == .integer) return field_val.integer; - return null; - } - return null; -} - -/// Parse workflow_json to find a step definition and get its worker_tags. -fn getStepTags(alloc: std.mem.Allocator, workflow_json: []const u8, def_step_id: []const u8) ![]const []const u8 { - const parsed = std.json.parseFromSlice(std.json.Value, alloc, workflow_json, .{}) catch { - return &.{}; - }; - - const root = parsed.value; - if (root != .object) return &.{}; - - const steps_val = root.object.get("steps") orelse return &.{}; - if (steps_val != .array) return &.{}; - - for (steps_val.array.items) |step_val| { - if (step_val != .object) continue; - const step_obj = step_val.object; - - const id_val = step_obj.get("id") orelse continue; - if (id_val != .string) continue; - if (!std.mem.eql(u8, id_val.string, def_step_id)) continue; - - const tags_val = step_obj.get("worker_tags") orelse return &.{}; - if (tags_val != .array) return &.{}; - - var tags: std.ArrayListUnmanaged([]const u8) = .empty; - for (tags_val.array.items) |tag_item| { - if (tag_item == .string) { - try tags.append(alloc, try alloc.dupe(u8, tag_item.string)); - } - } - return tags.toOwnedSlice(alloc); - } - return &.{}; -} - -/// Build a template Context from a run's 
input and completed step outputs. -fn buildTemplateContext(alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow, store: *Store) !templates.Context { - // Get all steps for this run to collect outputs - const all_steps = try store.getStepsByRun(alloc, run_row.id); - - var step_outputs: std.ArrayListUnmanaged(templates.Context.StepOutput) = .empty; - for (all_steps) |s| { - if (std.mem.eql(u8, s.status, "completed")) { - // Check if this step has children (fan_out/map) - if (std.mem.eql(u8, s.type, "fan_out") or std.mem.eql(u8, s.type, "map")) { - // Collect child outputs - const children = try store.getChildSteps(alloc, s.id); - var child_outputs: std.ArrayListUnmanaged([]const u8) = .empty; - for (children) |child| { - if (child.output_json) |oj| { - const extracted = extractOutputField(alloc, oj) catch oj; - try child_outputs.append(alloc, extracted); - } - } - try step_outputs.append(alloc, .{ - .step_id = s.def_step_id, - .output = null, - .outputs = child_outputs.items, - }); - } else { - // Regular step — single output - const output = if (s.output_json) |oj| - (extractOutputField(alloc, oj) catch oj) - else - null; - try step_outputs.append(alloc, .{ - .step_id = s.def_step_id, - .output = output, - .outputs = null, - }); - } - } - } - - // Determine item context (for map child steps) - const item: ?[]const u8 = if (step.parent_step_id != null) blk: { - // This is a child step of a map/fan_out — extract item from input_json - break :blk extractItemFromInput(alloc, step.input_json) catch null; - } else null; - - return templates.Context{ - .input_json = run_row.input_json, - .step_outputs = step_outputs.items, - .item = item, - }; -} - -/// Look up a generated ID by definition ID from parallel arrays. -fn lookupId(def_ids: []const []const u8, gen_ids: []const []const u8, target: []const u8) ?[]const u8 { - for (def_ids, 0..) 
|did, i| { - if (std.mem.eql(u8, did, target)) return gen_ids[i]; - } - return null; -} - -/// Find a step's status by ID from a list of steps. -fn findStepStatus(steps: []const types.StepRow, step_id: []const u8) ?[]const u8 { - for (steps) |s| { - if (std.mem.eql(u8, s.id, step_id)) return s.status; - } - return null; -} - -/// Find a step's def_step_id by step ID from a list of steps. -fn findStepDefId(steps: []const types.StepRow, step_id: []const u8) ?[]const u8 { - for (steps) |s| { - if (std.mem.eql(u8, s.id, step_id)) return s.def_step_id; - } - return null; -} - -/// Find a step's output_json by step ID from a list of steps. -fn findStepOutput(steps: []const types.StepRow, step_id: []const u8) ?[]const u8 { - for (steps) |s| { - if (std.mem.eql(u8, s.id, step_id)) { - if (s.output_json) |oj| { - return oj; - } - return null; - } - } - return null; -} - -/// Wrap a raw output string in a JSON object: {"output": "..."} -fn wrapOutput(alloc: std.mem.Allocator, output: []const u8) ![]const u8 { - // Use JSON serializer for proper escaping - var out: std.ArrayListUnmanaged(u8) = .empty; - try out.appendSlice(alloc, "{\"output\":"); - - // JSON-encode the output string - try out.append(alloc, '"'); - for (output) |ch| { - switch (ch) { - '"' => try out.appendSlice(alloc, "\\\""), - '\\' => try out.appendSlice(alloc, "\\\\"), - '\n' => try out.appendSlice(alloc, "\\n"), - '\r' => try out.appendSlice(alloc, "\\r"), - '\t' => try out.appendSlice(alloc, "\\t"), - else => try out.append(alloc, ch), - } - } - try out.append(alloc, '"'); - try out.append(alloc, '}'); - return try out.toOwnedSlice(alloc); -} - -/// Wrap an item value in a JSON object: {"item": "..."} -fn wrapItemJson(alloc: std.mem.Allocator, item: []const u8) ![]const u8 { - var out: std.ArrayListUnmanaged(u8) = .empty; - try out.appendSlice(alloc, "{\"item\":"); - - try out.append(alloc, '"'); - for (item) |ch| { - switch (ch) { - '"' => try out.appendSlice(alloc, "\\\""), - '\\' => try 
out.appendSlice(alloc, "\\\\"), - '\n' => try out.appendSlice(alloc, "\\n"), - '\r' => try out.appendSlice(alloc, "\\r"), - '\t' => try out.appendSlice(alloc, "\\t"), - else => try out.append(alloc, ch), - } - } - try out.append(alloc, '"'); - try out.append(alloc, '}'); - return try out.toOwnedSlice(alloc); -} - -/// Extract the "output" field from a JSON string like {"output": "..."}. -fn extractOutputField(alloc: std.mem.Allocator, json_str: []const u8) ![]const u8 { - const parsed = try std.json.parseFromSlice(std.json.Value, alloc, json_str, .{}); - const root = parsed.value; - if (root != .object) return json_str; - const output_val = root.object.get("output") orelse return json_str; - if (output_val == .string) return try alloc.dupe(u8, output_val.string); - return json_str; -} - -/// Extract an array of strings from a JSON field. -fn extractJsonArray(alloc: std.mem.Allocator, json_str: []const u8, field_name: []const u8) !?[][]const u8 { - const parsed = std.json.parseFromSlice(std.json.Value, alloc, json_str, .{}) catch { - return null; - }; - const root = parsed.value; - if (root != .object) return null; - - const arr_val = root.object.get(field_name) orelse return null; - if (arr_val != .array) return null; - - var items: std.ArrayListUnmanaged([]const u8) = .empty; - for (arr_val.array.items) |item| { - switch (item) { - .string => |s| try items.append(alloc, try alloc.dupe(u8, s)), - else => { - // Serialize non-string values as JSON - var json_out: std.io.Writer.Allocating = .init(alloc); - var jw: std.json.Stringify = .{ .writer = &json_out.writer }; - jw.write(item) catch continue; - const slice = json_out.toOwnedSlice() catch continue; - try items.append(alloc, slice); - }, - } - } - const result = try items.toOwnedSlice(alloc); - return result; -} - -/// Serialize an array of strings to a JSON array string. 
-fn serializeStringArray(alloc: std.mem.Allocator, items: []const []const u8) ![]const u8 { - var buf: std.ArrayListUnmanaged(u8) = .empty; - try buf.append(alloc, '['); - for (items, 0..) |item, i| { - if (i > 0) try buf.append(alloc, ','); - try buf.append(alloc, '"'); - for (item) |ch| { - switch (ch) { - '"' => try buf.appendSlice(alloc, "\\\""), - '\\' => try buf.appendSlice(alloc, "\\\\"), - '\n' => try buf.appendSlice(alloc, "\\n"), - '\r' => try buf.appendSlice(alloc, "\\r"), - '\t' => try buf.appendSlice(alloc, "\\t"), - else => try buf.append(alloc, ch), - } - } - try buf.append(alloc, '"'); - } - try buf.append(alloc, ']'); - return try buf.toOwnedSlice(alloc); -} - -/// Parsed handoff target information. -const HandoffTarget = struct { - tags: []const []const u8, - tags_str: []const u8, - message: ?[]const u8, -}; - -/// Extract handoff_to target from a worker output string. -/// Worker output may be raw text or JSON like: {"output": "...", "handoff_to": {"tags": [...], "message": "..."}} -fn extractHandoffTarget(alloc: std.mem.Allocator, output: []const u8) ?HandoffTarget { - // Try to parse the output as JSON - const parsed = std.json.parseFromSlice(std.json.Value, alloc, output, .{}) catch return null; - const root = parsed.value; - if (root != .object) return null; - - const handoff_val = root.object.get("handoff_to") orelse return null; - if (handoff_val != .object) return null; - - // Extract tags - const tags_val = handoff_val.object.get("tags") orelse return null; - if (tags_val != .array) return null; - - var tag_list: std.ArrayListUnmanaged([]const u8) = .empty; - var tags_str_buf: std.ArrayListUnmanaged(u8) = .empty; - - for (tags_val.array.items, 0..) 
|tag_item, i| { - if (tag_item == .string) { - tag_list.append(alloc, alloc.dupe(u8, tag_item.string) catch return null) catch return null; - if (i > 0) tags_str_buf.append(alloc, ',') catch return null; - tags_str_buf.appendSlice(alloc, tag_item.string) catch return null; - } - } - - if (tag_list.items.len == 0) return null; - - // Extract message (optional) - var message: ?[]const u8 = null; - if (handoff_val.object.get("message")) |msg_val| { - if (msg_val == .string) { - message = alloc.dupe(u8, msg_val.string) catch null; - } - } - - return HandoffTarget{ - .tags = tag_list.toOwnedSlice(alloc) catch return null, - .tags_str = tags_str_buf.toOwnedSlice(alloc) catch return null, - .message = message, - }; -} - -/// Build a formatted chat transcript from chat messages. -fn buildChatTranscript(alloc: std.mem.Allocator, messages: []const types.ChatMessageRow) ![]const u8 { - var buf: std.ArrayListUnmanaged(u8) = .empty; - for (messages, 0..) |msg, i| { - if (i > 0) try buf.appendSlice(alloc, "\\n"); - const line = try std.fmt.allocPrint(alloc, "[Round {d}] {s}: {s}", .{ msg.round, msg.role, msg.message }); - try buf.appendSlice(alloc, line); - } - return try buf.toOwnedSlice(alloc); -} - -/// Build input_json payload that carries an already rendered prompt for child task steps. -fn buildRenderedPromptInputJson(alloc: std.mem.Allocator, rendered_prompt: []const u8) ![]const u8 { - return std.json.Stringify.valueAlloc(alloc, .{ - .rendered_prompt = rendered_prompt, - }, .{}); -} - -/// Extract optional input_json.rendered_prompt for dynamic child task execution. 
-fn extractRenderedPromptFromInput(alloc: std.mem.Allocator, input_json: []const u8) ?[]const u8 { - const parsed = std.json.parseFromSlice(std.json.Value, alloc, input_json, .{}) catch { - return null; - }; - const root = parsed.value; - if (root != .object) return null; - const rendered = root.object.get("rendered_prompt") orelse return null; - if (rendered != .string) return null; - return alloc.dupe(u8, rendered.string) catch null; -} - -/// Extract the "item" field from input_json, or return the whole input_json -/// as item text if it's a simple value. -fn extractItemFromInput(alloc: std.mem.Allocator, input_json: []const u8) ![]const u8 { - const parsed = std.json.parseFromSlice(std.json.Value, alloc, input_json, .{}) catch { - return input_json; - }; - const root = parsed.value; - if (root != .object) return input_json; - const item_val = root.object.get("item") orelse return input_json; - if (item_val == .string) return try alloc.dupe(u8, item_val.string); - return input_json; -} - -// ── Tests ───────────────────────────────────────────────────────────── - -test "Engine: init and stop" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - var engine = Engine.init(&store, allocator, 500); - try std.testing.expect(engine.running.load(.acquire)); - engine.stop(); - try std.testing.expect(!engine.running.load(.acquire)); -} - -test "Engine: tick with no active runs" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - var engine = Engine.init(&store, allocator, 500); - // Should not error — no active runs - try engine.tick(); -} - -test "Engine: checkRunCompletion marks run completed" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - // Insert a run - try store.insertRun("r1", null, "running", "{\"steps\":[]}", "{}", "[]"); - - // Insert a 
completed step - try store.insertStep("s1", "r1", "step1", "task", "completed", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - try engine.checkRunCompletion("r1", arena.allocator()); - - // Verify run status is "completed" - const run = (try store.getRun(arena.allocator(), "r1")).?; - try std.testing.expectEqualStrings("completed", run.status); -} - -test "Engine: checkRunCompletion marks run failed" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - try store.insertRun("r1", null, "running", "{\"steps\":[]}", "{}", "[]"); - try store.insertStep("s1", "r1", "step1", "task", "completed", "{}", 1, null, null, null); - try store.insertStep("s2", "r1", "step2", "task", "failed", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - try engine.checkRunCompletion("r1", arena.allocator()); - - const run = (try store.getRun(arena.allocator(), "r1")).?; - try std.testing.expectEqualStrings("failed", run.status); -} - -test "Engine: checkRunCompletion does not complete with pending steps" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - try store.insertRun("r1", null, "running", "{\"steps\":[]}", "{}", "[]"); - try store.insertStep("s1", "r1", "step1", "task", "completed", "{}", 1, null, null, null); - try store.insertStep("s2", "r1", "step2", "task", "pending", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - try engine.checkRunCompletion("r1", arena.allocator()); - - // Run should still be "running" - const run = (try store.getRun(arena.allocator(), "r1")).?; - try 
std.testing.expectEqualStrings("running", run.status); -} - -test "Engine: pending to ready promotion" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"s1","type":"task","prompt_template":"hello"},{"id":"s2","type":"task","prompt_template":"world","depends_on":["s1"]}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - - // s1 is completed, s2 is pending and depends on s1 - try store.insertStep("step1", "r1", "s1", "task", "completed", "{}", 1, null, null, null); - try store.insertStep("step2", "r1", "s2", "task", "pending", "{}", 1, null, null, null); - try store.insertStepDep("step2", "step1"); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - // Get run row - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - - // processRun should promote step2 from pending to ready - try engine.processRun(arena.allocator(), run_row); - - // Re-fetch step2 - const step2 = (try store.getStep(arena.allocator(), "step2")).?; - // It should be promoted to "ready" (not "pending") - // Note: since there are no workers, the task step won't actually execute, - // so it stays at "ready" - try std.testing.expectEqualStrings("ready", step2.status); -} - -test "Engine: approval step sets waiting_approval" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"approve1","type":"approval"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step1", "r1", "approve1", "approval", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try 
engine.processRun(arena.allocator(), run_row); - - const step = (try store.getStep(arena.allocator(), "step1")).?; - try std.testing.expectEqualStrings("waiting_approval", step.status); -} - -test "Engine: fan_out creates child steps" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"fan1","type":"fan_out","count":3}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step1", "r1", "fan1", "fan_out", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - // fan_out step should be completed - const step = (try store.getStep(arena.allocator(), "step1")).?; - try std.testing.expectEqualStrings("completed", step.status); - - // Should have created 3 child steps - const children = try store.getChildSteps(arena.allocator(), "step1"); - try std.testing.expectEqual(@as(usize, 3), children.len); - - // Each child should be "ready" and type "task" - for (children) |child| { - try std.testing.expectEqualStrings("ready", child.status); - try std.testing.expectEqualStrings("task", child.type); - } -} - -test "Engine: map creates child steps from input array" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"map1","type":"map","items_from":"$.topics"}]} - ; - const input = - \\{"topics":["AI","ML","DL"]} - ; - try store.insertRun("r1", null, "running", wf, input, "[]"); - try store.insertStep("step1", "r1", "map1", "map", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); 
- - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - // map step should be completed - const step = (try store.getStep(arena.allocator(), "step1")).?; - try std.testing.expectEqualStrings("completed", step.status); - - // Should have created 3 child steps - const children = try store.getChildSteps(arena.allocator(), "step1"); - try std.testing.expectEqual(@as(usize, 3), children.len); -} - -test "getStepField extracts prompt_template" { - const allocator = std.testing.allocator; - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const wf = - \\{"steps":[{"id":"research","type":"task","prompt_template":"Research {{input.topic}}"}]} - ; - const result = try getStepField(arena.allocator(), wf, "research", "prompt_template"); - try std.testing.expect(result != null); - try std.testing.expectEqualStrings("Research {{input.topic}}", result.?); -} - -test "getStepField returns null for missing step" { - const allocator = std.testing.allocator; - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const wf = - \\{"steps":[{"id":"research","type":"task"}]} - ; - const result = try getStepField(arena.allocator(), wf, "nonexistent", "prompt_template"); - try std.testing.expect(result == null); -} - -test "getStepFieldInt extracts count" { - const allocator = std.testing.allocator; - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const wf = - \\{"steps":[{"id":"fan1","type":"fan_out","count":5}]} - ; - const result = try getStepFieldInt(arena.allocator(), wf, "fan1", "count"); - try std.testing.expect(result != null); - try std.testing.expectEqual(@as(i64, 5), result.?); -} - -test "extractJsonArray extracts string array" { - const allocator = std.testing.allocator; - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const json = - \\{"topics":["AI","ML","DL"]} - ; - const result = 
try extractJsonArray(arena.allocator(), json, "topics"); - try std.testing.expect(result != null); - try std.testing.expectEqual(@as(usize, 3), result.?.len); - try std.testing.expectEqualStrings("AI", result.?[0]); - try std.testing.expectEqualStrings("ML", result.?[1]); - try std.testing.expectEqualStrings("DL", result.?[2]); -} - -test "wrapOutput creates valid JSON" { - const allocator = std.testing.allocator; - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const result = try wrapOutput(arena.allocator(), "hello world"); - try std.testing.expectEqualStrings("{\"output\":\"hello world\"}", result); -} - -test "wrapOutput escapes special characters" { - const allocator = std.testing.allocator; - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const result = try wrapOutput(arena.allocator(), "line1\nline2"); - try std.testing.expectEqualStrings("{\"output\":\"line1\\nline2\"}", result); -} - -test "build/extract rendered_prompt input JSON round-trip" { - const allocator = std.testing.allocator; - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const input_json = try buildRenderedPromptInputJson(arena.allocator(), "say \"hi\"\\nnext"); - const prompt = extractRenderedPromptFromInput(arena.allocator(), input_json); - try std.testing.expect(prompt != null); - try std.testing.expectEqualStrings("say \"hi\"\\nnext", prompt.?); -} - -test "Engine: task step fallback uses input_json.rendered_prompt" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - try store.insertRun("r-rendered", null, "running", "{\"steps\":[]}", "{}", "[]"); - try store.insertWorker("w-rendered", "http://127.0.0.1:1", "", "webhook", null, "[]", 1, "registered"); - try store.insertStep("parent-step", "r-rendered", "missing-parent-def", "task", "completed", "{}", 1, null, null, null); - - var arena = 
std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const rendered_input = try buildRenderedPromptInputJson(arena.allocator(), "child fallback prompt"); - try store.insertStep( - "child-step", - "r-rendered", - "missing-child-def", - "task", - "ready", - rendered_input, - 2, - null, - "parent-step", - 0, - ); - - var engine = Engine.init(&store, allocator, 500); - const run_row = (try store.getRun(arena.allocator(), "r-rendered")).?; - try engine.processRun(arena.allocator(), run_row); - - const child = (try store.getStep(arena.allocator(), "child-step")).?; - try std.testing.expectEqualStrings("ready", child.status); - try std.testing.expectEqual(@as(i64, 2), child.attempt); -} - -test "Engine: rendered_prompt has priority over parent prompt_template" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"parent","type":"debate","prompt_template":"parent template"},{"id":"child","type":"task","prompt_template":"child template"}]} - ; - try store.insertRun("r-priority", null, "running", wf, "{}", "[]"); - try store.insertStep("parent-step", "r-priority", "parent", "debate", "running", "{}", 1, null, null, null); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const rendered_input = try buildRenderedPromptInputJson(arena.allocator(), "rendered prompt"); - try store.insertStep( - "child-step", - "r-priority", - "child", - "task", - "ready", - rendered_input, - 1, - null, - "parent-step", - 0, - ); - - var engine = Engine.init(&store, allocator, 500); - const run_row = (try store.getRun(arena.allocator(), "r-priority")).?; - const child_step = (try store.getStep(arena.allocator(), "child-step")).?; - const source = (try engine.resolveTaskPromptSource(arena.allocator(), run_row, child_step)).?; - - switch (source) { - .rendered => |prompt| try std.testing.expectEqualStrings("rendered prompt", prompt), - .template => 
try std.testing.expect(false), - } -} - -test "findStepStatus finds matching step" { - const steps = [_]types.StepRow{ - makeTestStepRow("s1", "completed"), - makeTestStepRow("s2", "pending"), - }; - const status = findStepStatus(&steps, "s2"); - try std.testing.expect(status != null); - try std.testing.expectEqualStrings("pending", status.?); -} - -test "findStepStatus returns null for missing step" { - const steps = [_]types.StepRow{ - makeTestStepRow("s1", "completed"), - }; - const status = findStepStatus(&steps, "s999"); - try std.testing.expect(status == null); -} - -fn makeTestStepRow(id: []const u8, status: []const u8) types.StepRow { - return .{ - .id = id, - .run_id = "r1", - .def_step_id = id, - .type = "task", - .status = status, - .worker_id = null, - .input_json = "{}", - .output_json = null, - .error_text = null, - .attempt = 1, - .max_attempts = 1, - .timeout_ms = null, - .next_attempt_at_ms = null, - .parent_step_id = null, - .item_index = null, - .created_at_ms = 0, - .updated_at_ms = 0, - .started_at_ms = null, - .ended_at_ms = null, - .child_run_id = null, - .iteration_index = 0, - }; -} - -// ── Transform step tests ───────────────────────────────────────────── - -test "Engine: transform step renders output_template" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"t1","type":"task","prompt_template":"hello"},{"id":"tr1","type":"transform","output_template":"result: {{steps.t1.output}}"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - - // Insert task1 as completed with output - try store.insertStep("step_t1", "r1", "t1", "task", "completed", "{}", 1, null, null, null); - try store.updateStepStatus("step_t1", "completed", null, "{\"output\":\"hello\"}", null, 1); - - // Insert transform1 as ready with dependency on task1 - try store.insertStep("step_tr1", "r1", "tr1", "transform", "ready", "{}", 1, null, null, 
null); - try store.insertStepDep("step_tr1", "step_t1"); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - // Verify transform completed - const s = (try store.getStep(arena.allocator(), "step_tr1")).?; - try std.testing.expectEqualStrings("completed", s.status); - // Output should contain the rendered template - try std.testing.expect(s.output_json != null); - // The output should contain "hello" from the task step - try std.testing.expect(std.mem.indexOf(u8, s.output_json.?, "hello") != null); -} - -test "Engine: transform step fails without output_template" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"tr1","type":"transform"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_tr1", "r1", "tr1", "transform", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - const s = (try store.getStep(arena.allocator(), "step_tr1")).?; - try std.testing.expectEqualStrings("failed", s.status); - try std.testing.expect(s.error_text != null); -} - -// ── Wait step tests ────────────────────────────────────────────────── - -test "Engine: wait step with duration_ms=0 completes after two ticks" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"w1","type":"wait","duration_ms":0}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try 
store.insertStep("step_w1", "r1", "w1", "wait", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - // First tick: step becomes "running" with started_at_ms - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - const s1 = (try store.getStep(arena.allocator(), "step_w1")).?; - try std.testing.expectEqualStrings("running", s1.status); - try std.testing.expect(s1.started_at_ms != null); - - // Second tick: step should be "completed" since duration=0 - const run_row2 = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row2); - - const s2 = (try store.getStep(arena.allocator(), "step_w1")).?; - try std.testing.expectEqualStrings("completed", s2.status); - try std.testing.expect(s2.output_json != null); -} - -test "Engine: wait step with signal enters waiting_approval" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"w1","type":"wait","signal":"deploy"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_w1", "r1", "w1", "wait", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - const s = (try store.getStep(arena.allocator(), "step_w1")).?; - try std.testing.expectEqualStrings("waiting_approval", s.status); -} - -test "Engine: wait step without config fails" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"w1","type":"wait"}]} - ; - try 
store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_w1", "r1", "w1", "wait", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - const s = (try store.getStep(arena.allocator(), "step_w1")).?; - try std.testing.expectEqualStrings("failed", s.status); -} - -test "Engine: wait step with invalid duration string fails" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"w1","type":"wait","duration_ms":"abc"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_w1", "r1", "w1", "wait", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - const s = (try store.getStep(arena.allocator(), "step_w1")).?; - try std.testing.expectEqualStrings("failed", s.status); - try std.testing.expect(s.error_text != null); - try std.testing.expect(std.mem.indexOf(u8, s.error_text.?, "duration_ms must be an integer") != null); -} - -// ── Router step tests ──────────────────────────────────────────────── - -test "Engine: router step routes to matching target" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - 
\\{"steps":[{"id":"classify","type":"task","prompt_template":"classify"},{"id":"router1","type":"router","routes":{"bug":"fix_bug","feature":"add_feature"}},{"id":"fix_bug","type":"task","prompt_template":"fix"},{"id":"add_feature","type":"task","prompt_template":"add"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - - // classify step completed with "bug" in output - try store.insertStep("step_classify", "r1", "classify", "task", "completed", "{}", 1, null, null, null); - try store.updateStepStatus("step_classify", "completed", null, "{\"output\":\"this is a bug report\"}", null, 1); - - // router step is ready, depends on classify - try store.insertStep("step_router", "r1", "router1", "router", "ready", "{}", 1, null, null, null); - try store.insertStepDep("step_router", "step_classify"); - - // Target steps are pending - try store.insertStep("step_fix", "r1", "fix_bug", "task", "pending", "{}", 1, null, null, null); - try store.insertStepDep("step_fix", "step_router"); - try store.insertStep("step_add", "r1", "add_feature", "task", "pending", "{}", 1, null, null, null); - try store.insertStepDep("step_add", "step_router"); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - // Router should be completed - const router = (try store.getStep(arena.allocator(), "step_router")).?; - try std.testing.expectEqualStrings("completed", router.status); - try std.testing.expect(router.output_json != null); - try std.testing.expect(std.mem.indexOf(u8, router.output_json.?, "fix_bug") != null); - - // add_feature should be skipped - const add = (try store.getStep(arena.allocator(), "step_add")).?; - try std.testing.expectEqualStrings("skipped", add.status); - - // fix_bug should still be pending (not skipped) - const fix = (try 
store.getStep(arena.allocator(), "step_fix")).?; - try std.testing.expectEqualStrings("pending", fix.status); -} - -test "Engine: router step uses default when no match" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"classify","type":"task","prompt_template":"classify"},{"id":"router1","type":"router","routes":{"bug":"fix_bug"},"default":"fix_bug"},{"id":"fix_bug","type":"task","prompt_template":"fix"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - - // classify step completed with something that doesn't match any route - try store.insertStep("step_classify", "r1", "classify", "task", "completed", "{}", 1, null, null, null); - try store.updateStepStatus("step_classify", "completed", null, "{\"output\":\"unknown category\"}", null, 1); - - // router step is ready - try store.insertStep("step_router", "r1", "router1", "router", "ready", "{}", 1, null, null, null); - try store.insertStepDep("step_router", "step_classify"); - - // Target step - try store.insertStep("step_fix", "r1", "fix_bug", "task", "pending", "{}", 1, null, null, null); - try store.insertStepDep("step_fix", "step_router"); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - // Router should be completed with default target - const router = (try store.getStep(arena.allocator(), "step_router")).?; - try std.testing.expectEqualStrings("completed", router.status); - try std.testing.expect(router.output_json != null); - try std.testing.expect(std.mem.indexOf(u8, router.output_json.?, "fix_bug") != null); -} - -test "Engine: router step fails without routes" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer 
store.deinit(); - - const wf = - \\{"steps":[{"id":"classify","type":"task","prompt_template":"classify"},{"id":"router1","type":"router"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - - try store.insertStep("step_classify", "r1", "classify", "task", "completed", "{}", 1, null, null, null); - try store.updateStepStatus("step_classify", "completed", null, "{\"output\":\"test\"}", null, 1); - - try store.insertStep("step_router", "r1", "router1", "router", "ready", "{}", 1, null, null, null); - try store.insertStepDep("step_router", "step_classify"); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - const router = (try store.getStep(arena.allocator(), "step_router")).?; - try std.testing.expectEqualStrings("failed", router.status); -} - -// ── getStepFieldRaw tests ──────────────────────────────────────────── - -test "getStepFieldRaw returns JSON object as string" { - const allocator = std.testing.allocator; - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const wf = - \\{"steps":[{"id":"r1","type":"router","routes":{"bug":"fix_bug","feature":"add_feature"}}]} - ; - const result = try getStepFieldRaw(arena.allocator(), wf, "r1", "routes"); - try std.testing.expect(result != null); - // Should be a JSON string containing the routes object - try std.testing.expect(std.mem.indexOf(u8, result.?, "bug") != null); - try std.testing.expect(std.mem.indexOf(u8, result.?, "fix_bug") != null); -} - -test "getStepFieldRaw returns string values directly" { - const allocator = std.testing.allocator; - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const wf = - \\{"steps":[{"id":"r1","type":"router","default":"fallback"}]} - ; - const result = try getStepFieldRaw(arena.allocator(), wf, 
"r1", "default"); - try std.testing.expect(result != null); - try std.testing.expectEqualStrings("fallback", result.?); -} - -// ── Loop step tests ────────────────────────────────────────────────── - -test "Engine: loop step creates first iteration children" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - // Workflow: loop with body ["t1"] — single body step for simplicity - const wf = - \\{"steps":[{"id":"loop1","type":"loop","max_iterations":3,"exit_condition":"done","body":["t1"]},{"id":"t1","type":"task","prompt_template":"do work"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_loop", "r1", "loop1", "loop", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - // Loop step should be "running" - const loop_step = (try store.getStep(arena.allocator(), "step_loop")).?; - try std.testing.expectEqualStrings("running", loop_step.status); - - // Should have created 1 child step - const children = try store.getChildSteps(arena.allocator(), "step_loop"); - try std.testing.expectEqual(@as(usize, 1), children.len); - try std.testing.expectEqualStrings("ready", children[0].status); - try std.testing.expectEqualStrings("t1", children[0].def_step_id); - try std.testing.expectEqual(@as(i64, 0), children[0].iteration_index); -} - -test "Engine: loop step iterates until exit condition" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"loop1","type":"loop","max_iterations":5,"exit_condition":"done","body":["t1"]},{"id":"t1","type":"task","prompt_template":"do work"}]} - ; - try store.insertRun("r1", null, 
"running", wf, "{}", "[]"); - try store.insertStep("step_loop", "r1", "loop1", "loop", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - // Tick 1: creates iteration 0 children, marks loop as running - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Get the first child and mark it completed with "not done" - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_loop"); - try std.testing.expectEqual(@as(usize, 1), children.len); - try store.updateStepStatus(children[0].id, "completed", null, "{\"output\":\"not done\"}", null, 1); - } - - // Tick 2: exit condition "done" not in "not done"... wait, "not done" contains "done"! - // Let's use a different output that doesn't contain "done" - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_loop"); - // Fix: update to something that doesn't contain "done" - try store.updateStepStatus(children[0].id, "completed", null, "{\"output\":\"still working\"}", null, 1); - } - - // Tick 2: exit condition not met, creates iteration 1 - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Should now have 2 children (iteration 0 and 1) - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_loop"); - try std.testing.expectEqual(@as(usize, 2), children.len); - } - - // Mark iteration 1 child as completed with "done" in output - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer 
arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_loop"); - // Find iteration 1 child - for (children) |child| { - if (child.iteration_index == 1) { - try store.updateStepStatus(child.id, "completed", null, "{\"output\":\"done\"}", null, 1); - } - } - } - - // Tick 3: exit condition met, loop completes - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Loop should be completed - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const loop_step = (try store.getStep(arena.allocator(), "step_loop")).?; - try std.testing.expectEqualStrings("completed", loop_step.status); - try std.testing.expect(loop_step.output_json != null); - } -} - -test "Engine: loop step stops at max_iterations" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"loop1","type":"loop","max_iterations":2,"exit_condition":"never_match","body":["t1"]},{"id":"t1","type":"task","prompt_template":"do work"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_loop", "r1", "loop1", "loop", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - // Tick 1: creates iteration 0 - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Complete iteration 0 child - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_loop"); - try store.updateStepStatus(children[0].id, "completed", null, "{\"output\":\"result0\"}", null, 1); - } - - // Tick 2: creates 
iteration 1 - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Complete iteration 1 child - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_loop"); - for (children) |child| { - if (child.iteration_index == 1) { - try store.updateStepStatus(child.id, "completed", null, "{\"output\":\"result1\"}", null, 1); - } - } - } - - // Tick 3: max_iterations=2 reached (iterations 0,1), loop completes - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Loop should be completed - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const loop_step = (try store.getStep(arena.allocator(), "step_loop")).?; - try std.testing.expectEqualStrings("completed", loop_step.status); - } -} - -test "Engine: loop step fails when child fails" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"loop1","type":"loop","max_iterations":3,"exit_condition":"done","body":["t1"]},{"id":"t1","type":"task","prompt_template":"do work"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_loop", "r1", "loop1", "loop", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - // Tick 1: creates iteration 0 - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Mark child as failed - { - var arena = 
std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_loop"); - try store.updateStepStatus(children[0].id, "failed", null, null, "child error", 1); - } - - // Tick 2: loop should fail - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const loop_step = (try store.getStep(arena.allocator(), "step_loop")).?; - try std.testing.expectEqualStrings("failed", loop_step.status); - } -} - -test "Engine: loop step with multiple body steps chains them" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"loop1","type":"loop","max_iterations":1,"exit_condition":"done","body":["s1","s2"]},{"id":"s1","type":"task","prompt_template":"step1"},{"id":"s2","type":"task","prompt_template":"step2"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_loop", "r1", "loop1", "loop", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - // Tick 1: creates iteration 0 with 2 body steps chained - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_loop"); - try std.testing.expectEqual(@as(usize, 2), children.len); - - // First child (s1) should be "ready", second (s2) should be "pending" - // Children are ordered by item_index ASC - var ready_count: usize = 0; - var pending_count: usize = 0; - 
for (children) |child| { - if (std.mem.eql(u8, child.status, "ready")) ready_count += 1; - if (std.mem.eql(u8, child.status, "pending")) pending_count += 1; - } - try std.testing.expectEqual(@as(usize, 1), ready_count); - try std.testing.expectEqual(@as(usize, 1), pending_count); - } -} - -// ── Sub-workflow step tests ────────────────────────────────────────── - -test "Engine: sub_workflow step creates child run" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - // Parent workflow has a sub_workflow step with inline workflow - const wf = - \\{"steps":[{"id":"sub1","type":"sub_workflow","workflow":{"steps":[{"id":"inner1","type":"task","prompt_template":"inner work"}]}}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_sub", "r1", "sub1", "sub_workflow", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - // Tick 1: creates child run and marks sub_workflow as running - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Verify sub_workflow step is "running" and has child_run_id - var child_run_id: []const u8 = undefined; - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const sub_step = (try store.getStep(arena.allocator(), "step_sub")).?; - try std.testing.expectEqualStrings("running", sub_step.status); - try std.testing.expect(sub_step.child_run_id != null); - child_run_id = try allocator.dupe(u8, sub_step.child_run_id.?); - } - defer allocator.free(child_run_id); - - // Verify child run exists and has steps - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const child_run = (try store.getRun(arena.allocator(), child_run_id)).?; - try std.testing.expectEqualStrings("running", 
child_run.status); - - const child_steps = try store.getStepsByRun(arena.allocator(), child_run_id); - try std.testing.expectEqual(@as(usize, 1), child_steps.len); - try std.testing.expectEqualStrings("inner1", child_steps[0].def_step_id); - try std.testing.expectEqualStrings("ready", child_steps[0].status); - } -} - -test "Engine: sub_workflow step completes when child run completes" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"sub1","type":"sub_workflow","workflow":{"steps":[{"id":"inner1","type":"task","prompt_template":"inner work"}]}}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_sub", "r1", "sub1", "sub_workflow", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - // Tick 1: creates child run - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Get child run ID and manually complete its step + run - var child_run_id: []const u8 = undefined; - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const sub_step = (try store.getStep(arena.allocator(), "step_sub")).?; - child_run_id = try allocator.dupe(u8, sub_step.child_run_id.?); - } - defer allocator.free(child_run_id); - - // Complete the child run's step - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const child_steps = try store.getStepsByRun(arena.allocator(), child_run_id); - try store.updateStepStatus(child_steps[0].id, "completed", null, "{\"output\":\"inner result\"}", null, 1); - } - - // Mark child run as completed - try store.updateRunStatus(child_run_id, "completed", null); - - // Tick 2: sub_workflow should detect child run completed and complete itself - { - var arena 
= std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Verify sub_workflow step completed with child's output - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const sub_step = (try store.getStep(arena.allocator(), "step_sub")).?; - try std.testing.expectEqualStrings("completed", sub_step.status); - try std.testing.expect(sub_step.output_json != null); - try std.testing.expect(std.mem.indexOf(u8, sub_step.output_json.?, "inner result") != null); - } -} - -test "Engine: sub_workflow step fails when child run fails" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"sub1","type":"sub_workflow","workflow":{"steps":[{"id":"inner1","type":"task","prompt_template":"inner work"}]}}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_sub", "r1", "sub1", "sub_workflow", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - // Tick 1: creates child run - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Get child run ID - var child_run_id: []const u8 = undefined; - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const sub_step = (try store.getStep(arena.allocator(), "step_sub")).?; - child_run_id = try allocator.dupe(u8, sub_step.child_run_id.?); - } - defer allocator.free(child_run_id); - - // Mark child run as failed - try store.updateRunStatus(child_run_id, "failed", "inner step failed"); - - // Tick 2: sub_workflow should detect child run failed - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer 
arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Verify sub_workflow step failed - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const sub_step = (try store.getStep(arena.allocator(), "step_sub")).?; - try std.testing.expectEqualStrings("failed", sub_step.status); - try std.testing.expect(sub_step.error_text != null); - } -} - -test "Engine: sub_workflow step fails without workflow" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"sub1","type":"sub_workflow"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_sub", "r1", "sub1", "sub_workflow", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const sub_step = (try store.getStep(arena.allocator(), "step_sub")).?; - try std.testing.expectEqualStrings("failed", sub_step.status); - } -} - -test "Engine: loop step fails without body" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"loop1","type":"loop","max_iterations":3,"exit_condition":"done"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_loop", "r1", "loop1", "loop", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), 
"r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const loop_step = (try store.getStep(arena.allocator(), "step_loop")).?; - try std.testing.expectEqualStrings("failed", loop_step.status); - } -} - -// ── Debate step tests ──────────────────────────────────────────────── - -test "Engine: debate step creates participant children" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"review","type":"debate","count":2,"worker_tags":["reviewer"],"judge_tags":["senior"],"prompt_template":"Review this code","judge_template":"Pick the best:\n{{debate_responses}}"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_debate", "r1", "review", "debate", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - // Debate step should be "running" - const debate_step = (try store.getStep(arena.allocator(), "step_debate")).?; - try std.testing.expectEqualStrings("running", debate_step.status); - - // Should have 2 participant children - const children = try store.getChildSteps(arena.allocator(), "step_debate"); - try std.testing.expectEqual(@as(usize, 2), children.len); - - for (children) |child| { - try std.testing.expectEqualStrings("ready", child.status); - try std.testing.expectEqualStrings("task", child.type); - } -} - -test "Engine: debate step fails without count" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"review","type":"debate","prompt_template":"Review this"}]} - ; - try 
store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_debate", "r1", "review", "debate", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - const step = (try store.getStep(arena.allocator(), "step_debate")).?; - try std.testing.expectEqualStrings("failed", step.status); -} - -test "Engine: debate step fails without prompt_template" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"review","type":"debate","count":2}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_debate", "r1", "review", "debate", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - const step = (try store.getStep(arena.allocator(), "step_debate")).?; - try std.testing.expectEqualStrings("failed", step.status); -} - -test "Engine: debate step creates judge after participants complete" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"review","type":"debate","count":2,"worker_tags":["reviewer"],"judge_tags":["senior"],"prompt_template":"Review this code","judge_template":"Pick the best:\n{{debate_responses}}"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_debate", "r1", "review", "debate", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - 
- // Tick 1: creates participant children - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Complete both participant children - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_debate"); - try std.testing.expectEqual(@as(usize, 2), children.len); - try store.updateStepStatus(children[0].id, "completed", null, "{\"output\":\"review A\"}", null, 1); - try store.updateStepStatus(children[1].id, "completed", null, "{\"output\":\"review B\"}", null, 1); - } - - // Tick 2: should create judge child - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Should now have 3 children (2 participants + 1 judge) - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_debate"); - try std.testing.expectEqual(@as(usize, 3), children.len); - - // Find judge child - var found_judge = false; - for (children) |child| { - if (std.mem.indexOf(u8, child.def_step_id, "_judge") != null) { - found_judge = true; - try std.testing.expectEqualStrings("ready", child.status); - try std.testing.expectEqualStrings("task", child.type); - } - } - try std.testing.expect(found_judge); - } -} - -test "Engine: debate step completes when judge completes" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"review","type":"debate","count":2,"prompt_template":"Review this","judge_template":"Pick best: {{debate_responses}}"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try 
store.insertStep("step_debate", "r1", "review", "debate", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - // Tick 1: creates participant children - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Complete participants - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_debate"); - try store.updateStepStatus(children[0].id, "completed", null, "{\"output\":\"A\"}", null, 1); - try store.updateStepStatus(children[1].id, "completed", null, "{\"output\":\"B\"}", null, 1); - } - - // Tick 2: creates judge child - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Complete the judge child - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_debate"); - for (children) |child| { - if (std.mem.indexOf(u8, child.def_step_id, "_judge") != null) { - try store.updateStepStatus(child.id, "completed", null, "{\"output\":\"A is best\"}", null, 1); - } - } - } - - // Tick 3: debate should be completed - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const debate_step = (try store.getStep(arena.allocator(), "step_debate")).?; - try std.testing.expectEqualStrings("completed", debate_step.status); - try std.testing.expect(debate_step.output_json != null); - try 
std.testing.expect(std.mem.indexOf(u8, debate_step.output_json.?, "A is best") != null); - } -} - -test "Engine: debate step completes without judge_template" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - // No judge_template — should complete with collected responses when participants are done - const wf = - \\{"steps":[{"id":"review","type":"debate","count":2,"prompt_template":"Review this"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_debate", "r1", "review", "debate", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - // Tick 1: creates participant children - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Complete participants - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_debate"); - try store.updateStepStatus(children[0].id, "completed", null, "{\"output\":\"review 1\"}", null, 1); - try store.updateStepStatus(children[1].id, "completed", null, "{\"output\":\"review 2\"}", null, 1); - } - - // Tick 2: no judge_template, should complete with responses - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const debate_step = (try store.getStep(arena.allocator(), "step_debate")).?; - try std.testing.expectEqualStrings("completed", debate_step.status); - try std.testing.expect(debate_step.output_json != null); - } -} - -test "Engine: debate step fails when participant fails" { - const allocator 
= std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"review","type":"debate","count":2,"prompt_template":"Review this","judge_template":"Pick: {{debate_responses}}"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_debate", "r1", "review", "debate", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - // Tick 1: creates participant children - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Fail one participant - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_debate"); - try store.updateStepStatus(children[0].id, "completed", null, "{\"output\":\"review A\"}", null, 1); - try store.updateStepStatus(children[1].id, "failed", null, null, "worker error", 1); - } - - // Tick 2: debate should fail - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const debate_step = (try store.getStep(arena.allocator(), "step_debate")).?; - try std.testing.expectEqualStrings("failed", debate_step.status); - } -} - -// ── Group chat step tests ──────────────────────────────────────────── - -test "Engine: group_chat step parses participants and starts" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - 
\\{"steps":[{"id":"discuss","type":"group_chat","participants":[{"tags":["architect"],"role":"Architect"},{"tags":["security"],"role":"Security"}],"max_rounds":3,"exit_condition":"CONSENSUS","prompt_template":"Discuss: topic","round_template":"Previous:\n{{chat_history}}\nYour role: {{role}}. Respond."}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_gc", "r1", "discuss", "group_chat", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - // group_chat step should be "running" - const gc_step = (try store.getStep(arena.allocator(), "step_gc")).?; - try std.testing.expectEqualStrings("running", gc_step.status); -} - -test "Engine: group_chat step fails without participants" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"discuss","type":"group_chat","prompt_template":"Discuss"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_gc", "r1", "discuss", "group_chat", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); +/// Get the JSON string for a specific node from workflow_json. 
+/// Workflow format: {"nodes": {"node_name": {...}}, "edges": [...]} +fn getNodeJson(alloc: std.mem.Allocator, workflow_json: []const u8, node_name: []const u8) ?[]const u8 { + const parsed = json.parseFromSlice(json.Value, alloc, workflow_json, .{}) catch return null; + const root = parsed.value; + if (root != .object) return null; - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); + const nodes = root.object.get("nodes") orelse return null; + if (nodes != .object) return null; - const step = (try store.getStep(arena.allocator(), "step_gc")).?; - try std.testing.expectEqualStrings("failed", step.status); + const node = nodes.object.get(node_name) orelse return null; + return serializeJsonValue(alloc, node) catch null; } -test "Engine: group_chat step fails without prompt_template" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"discuss","type":"group_chat","participants":[{"tags":["a"],"role":"A"}]}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_gc", "r1", "discuss", "group_chat", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - const step = (try store.getStep(arena.allocator(), "step_gc")).?; - try std.testing.expectEqualStrings("failed", step.status); +/// Get a string field from a node's JSON. 
+fn getNodeField(alloc: std.mem.Allocator, node_json: []const u8, field: []const u8) ?[]const u8 { + const parsed = json.parseFromSlice(json.Value, alloc, node_json, .{}) catch return null; + if (parsed.value != .object) return null; + const val = parsed.value.object.get(field) orelse return null; + if (val == .string) return alloc.dupe(u8, val.string) catch null; + return serializeJsonValue(alloc, val) catch null; } -test "Engine: group_chat builds chat history across rounds" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - // Manually insert chat messages and test the poll logic - const wf = - \\{"steps":[{"id":"discuss","type":"group_chat","participants":[{"tags":["a"],"role":"Architect"},{"tags":["b"],"role":"Security"}],"max_rounds":2,"exit_condition":"CONSENSUS","prompt_template":"Discuss topic","round_template":"Previous:\n{{chat_history}}\nYour role: {{role}}. Respond."}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_gc", "r1", "discuss", "group_chat", "running", "{}", 1, null, null, null); - - // Insert round 1 messages (simulating what dispatch would produce) - try store.insertChatMessage("r1", "step_gc", 1, "Architect", null, "I suggest microservices"); - try store.insertChatMessage("r1", "step_gc", 1, "Security", null, "We need auth first"); - - var engine = Engine.init(&store, allocator, 500); +/// Get a top-level field from workflow_json. 
+fn getWorkflowField(alloc: std.mem.Allocator, workflow_json: []const u8, field: []const u8) ?[]const u8 { + const parsed = json.parseFromSlice(json.Value, alloc, workflow_json, .{}) catch return null; + if (parsed.value != .object) return null; + const val = parsed.value.object.get(field) orelse return null; + if (val == .string) return alloc.dupe(u8, val.string) catch null; + return serializeJsonValue(alloc, val) catch null; +} - // Poll: round 1 complete, no CONSENSUS, max_rounds=2, so it should try round 2 - // Since no workers, dispatch will fail silently. Then next poll round_count stays at 2 for round 1. - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } +/// Get worker tags from node definition. +fn getNodeTags(alloc: std.mem.Allocator, node_json: []const u8) []const []const u8 { + const parsed = json.parseFromSlice(json.Value, alloc, node_json, .{}) catch return &.{}; + if (parsed.value != .object) return &.{}; + const tags = parsed.value.object.get("worker_tags") orelse return &.{}; + if (tags != .array) return &.{}; - // Step should still be running (no workers to dispatch round 2) - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const step = (try store.getStep(arena.allocator(), "step_gc")).?; - try std.testing.expectEqualStrings("running", step.status); + var result: std.ArrayListUnmanaged([]const u8) = .empty; + for (tags.array.items) |item| { + if (item == .string) { + result.append(alloc, item.string) catch continue; + } } + return result.toOwnedSlice(alloc) catch &.{}; +} - // Simulate round 2 messages with CONSENSUS - try store.insertChatMessage("r1", "step_gc", 2, "Architect", null, "CONSENSUS reached"); - try store.insertChatMessage("r1", "step_gc", 2, "Security", null, "Agreed, CONSENSUS"); +// ── JSON / Serialization Helpers 
──────────────────────────────────── - // Poll: round 2 complete with CONSENSUS, should complete - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } +fn serializeJsonValue(alloc: std.mem.Allocator, value: json.Value) ![]const u8 { + var out: std.io.Writer.Allocating = .init(alloc); + var jw: json.Stringify = .{ .writer = &out.writer }; + try jw.write(value); + return try out.toOwnedSlice(); +} - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const step = (try store.getStep(arena.allocator(), "step_gc")).?; - try std.testing.expectEqualStrings("completed", step.status); - try std.testing.expect(step.output_json != null); - } +/// Wrap a raw output string as {"output": "..."} JSON. +fn wrapOutput(alloc: std.mem.Allocator, output: []const u8) ![]const u8 { + return json.Stringify.valueAlloc(alloc, .{ + .output = output, + }, .{}); } -test "Engine: group_chat completes at max_rounds" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); +/// Escape a string as a JSON string literal (with quotes). +fn jsonStringify(alloc: std.mem.Allocator, s: []const u8) ![]const u8 { + return json.Stringify.valueAlloc(alloc, s, .{}); +} - const wf = - \\{"steps":[{"id":"discuss","type":"group_chat","participants":[{"tags":["a"],"role":"A"},{"tags":["b"],"role":"B"}],"max_rounds":1,"exit_condition":"NEVER_MATCH","prompt_template":"Discuss","round_template":"{{chat_history}} {{role}}"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_gc", "r1", "discuss", "group_chat", "running", "{}", 1, null, null, null); +/// Serialize completed_nodes set to JSON array. 
+fn serializeCompletedNodes(alloc: std.mem.Allocator, completed_nodes: *std.StringHashMap(void)) ![]const u8 { + var arr: std.ArrayListUnmanaged([]const u8) = .empty; + var it = completed_nodes.iterator(); + while (it.next()) |entry| { + try arr.append(alloc, entry.key_ptr.*); + } + return json.Stringify.valueAlloc(alloc, arr.items, .{}); +} - // Insert round 1 messages (no exit condition match) - try store.insertChatMessage("r1", "step_gc", 1, "A", null, "hello"); - try store.insertChatMessage("r1", "step_gc", 1, "B", null, "world"); +/// Serialize route_results map to JSON for checkpoint metadata. +fn serializeRouteResults(alloc: std.mem.Allocator, route_results: *std.StringHashMap([]const u8)) !?[]const u8 { + if (route_results.count() == 0) return null; - var engine = Engine.init(&store, allocator, 500); + var obj = json.ObjectMap.init(alloc); + var rr_obj = json.ObjectMap.init(alloc); - // Poll: round 1 complete, no exit match, max_rounds=1, should complete - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); + var it = route_results.iterator(); + while (it.next()) |entry| { + try rr_obj.put(entry.key_ptr.*, .{ .string = entry.value_ptr.* }); } + try obj.put("route_results", .{ .object = rr_obj }); - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const step = (try store.getStep(arena.allocator(), "step_gc")).?; - try std.testing.expectEqualStrings("completed", step.status); - } + return try serializeJsonValue(alloc, .{ .object = obj }); } -test "buildChatTranscript formats messages" { - const allocator = std.testing.allocator; - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const messages = [_]types.ChatMessageRow{ - .{ .id = 1, .run_id = "r1", .step_id = "s1", .round = 1, .role = "Architect", .worker_id = null, .message = "hello", .ts_ms = 1000 
}, - .{ .id = 2, .run_id = "r1", .step_id = "s1", .round = 1, .role = "Security", .worker_id = null, .message = "world", .ts_ms = 1001 }, - }; +/// Serialize a string array as JSON. +fn serializeStringArray(alloc: std.mem.Allocator, items: []const []const u8) ![]const u8 { + return json.Stringify.valueAlloc(alloc, items, .{}); +} - const transcript = try buildChatTranscript(arena.allocator(), &messages); - try std.testing.expect(std.mem.indexOf(u8, transcript, "Architect") != null); - try std.testing.expect(std.mem.indexOf(u8, transcript, "Security") != null); - try std.testing.expect(std.mem.indexOf(u8, transcript, "hello") != null); - try std.testing.expect(std.mem.indexOf(u8, transcript, "world") != null); +/// Try to extract "state_updates" from worker output JSON. +/// Worker can return: {"state_updates": {"key": "value"}, ...} +fn extractStateUpdates(alloc: std.mem.Allocator, output: []const u8) ?[]const u8 { + const parsed = json.parseFromSlice(json.Value, alloc, output, .{}) catch return null; + if (parsed.value != .object) return null; + const su = parsed.value.object.get("state_updates") orelse return null; + return serializeJsonValue(alloc, su) catch null; } -// ── Saga step tests ────────────────────────────────────────────────── +// ── Tests ───────────────────────────────────────────────────────────── -test "Engine: saga step creates first body child and initializes state" { +test "Engine: init and stop" { const allocator = std.testing.allocator; var store = try Store.init(allocator, ":memory:"); defer store.deinit(); - const wf = - 
\\{"steps":[{"id":"deploy_saga","type":"saga","body":["provision","deploy","verify"],"compensations":{"provision":"deprovision","deploy":"rollback_deploy"}},{"id":"provision","type":"task","prompt_template":"provision"},{"id":"deploy","type":"task","prompt_template":"deploy"},{"id":"verify","type":"task","prompt_template":"verify"},{"id":"deprovision","type":"task","prompt_template":"deprovision"},{"id":"rollback_deploy","type":"task","prompt_template":"rollback"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_saga", "r1", "deploy_saga", "saga", "ready", "{}", 1, null, null, null); - var engine = Engine.init(&store, allocator, 500); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - // Saga step should be "running" - const saga_step = (try store.getStep(arena.allocator(), "step_saga")).?; - try std.testing.expectEqualStrings("running", saga_step.status); - - // Should have created 1 child step (first body step) - const children = try store.getChildSteps(arena.allocator(), "step_saga"); - try std.testing.expectEqual(@as(usize, 1), children.len); - try std.testing.expectEqualStrings("provision", children[0].def_step_id); - try std.testing.expectEqualStrings("ready", children[0].status); - - // Should have saga_state entries - const saga_states = try store.getSagaStates(arena.allocator(), "r1", "step_saga"); - try std.testing.expectEqual(@as(usize, 3), saga_states.len); - try std.testing.expectEqualStrings("pending", saga_states[0].status); - try std.testing.expectEqualStrings("pending", saga_states[1].status); - try std.testing.expectEqualStrings("pending", saga_states[2].status); + try std.testing.expect(engine.running.load(.acquire)); + engine.stop(); + try std.testing.expect(!engine.running.load(.acquire)); } -test "Engine: saga step executes body sequentially and 
completes" { +test "Engine: tick with no active runs" { const allocator = std.testing.allocator; var store = try Store.init(allocator, ":memory:"); defer store.deinit(); - const wf = - \\{"steps":[{"id":"saga1","type":"saga","body":["s1","s2"],"compensations":{"s1":"c1"}},{"id":"s1","type":"task","prompt_template":"step1"},{"id":"s2","type":"task","prompt_template":"step2"},{"id":"c1","type":"task","prompt_template":"comp1"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_saga", "r1", "saga1", "saga", "ready", "{}", 1, null, null, null); - var engine = Engine.init(&store, allocator, 500); + try engine.tick(); +} - // Tick 1: creates first body child (s1) - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Complete first body child (s1) - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_saga"); - try std.testing.expectEqual(@as(usize, 1), children.len); - try store.updateStepStatus(children[0].id, "completed", null, "{\"output\":\"provisioned\"}", null, 1); - } - - // Tick 2: detects s1 completed, creates s2 - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Should now have 2 children - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_saga"); - try std.testing.expectEqual(@as(usize, 2), children.len); - } +test "engine: find ready nodes - simple chain" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + const alloc = 
arena.allocator(); - // Complete second body child (s2) - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_saga"); - for (children) |child| { - if (std.mem.eql(u8, child.def_step_id, "s2")) { - try store.updateStepStatus(child.id, "completed", null, "{\"output\":\"deployed\"}", null, 1); - } - } - } + // Edges: __start__ -> a -> b -> __end__ + const wf = + \\{"nodes":{"a":{"type":"task"},"b":{"type":"task"}},"edges":[["__start__","a"],["a","b"],["b","__end__"]],"schema":{}} + ; - // Tick 3: detects s2 completed, all body steps done, saga completes + // Completed: [] -> ready: [a] { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); + var completed = std.StringHashMap(void).init(alloc); + var routes = std.StringHashMap([]const u8).init(alloc); + const ready = try findReadyNodes(alloc, wf, &completed, &routes); + try std.testing.expectEqual(@as(usize, 1), ready.len); + try std.testing.expectEqualStrings("a", ready[0]); } - // Tick 4: saga polls — should now detect all completed + // Completed: [a] -> ready: [b] { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); + var completed = std.StringHashMap(void).init(alloc); + try completed.put("a", {}); + var routes = std.StringHashMap([]const u8).init(alloc); + const ready = try findReadyNodes(alloc, wf, &completed, &routes); + try std.testing.expectEqual(@as(usize, 1), ready.len); + try std.testing.expectEqualStrings("b", ready[0]); } - // Saga should be completed + // Completed: [a, b] -> ready: [__end__] { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const saga_step = (try 
store.getStep(arena.allocator(), "step_saga")).?; - try std.testing.expectEqualStrings("completed", saga_step.status); - try std.testing.expect(saga_step.output_json != null); + var completed = std.StringHashMap(void).init(alloc); + try completed.put("a", {}); + try completed.put("b", {}); + var routes = std.StringHashMap([]const u8).init(alloc); + const ready = try findReadyNodes(alloc, wf, &completed, &routes); + try std.testing.expectEqual(@as(usize, 1), ready.len); + try std.testing.expectEqualStrings("__end__", ready[0]); } } -test "Engine: saga step runs compensation in reverse on failure" { +test "engine: find ready nodes - parallel" { const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + const alloc = arena.allocator(); + // Edges: __start__ -> a, __start__ -> b, a -> c, b -> c const wf = - \\{"steps":[{"id":"saga1","type":"saga","body":["s1","s2"],"compensations":{"s1":"c1","s2":"c2"}},{"id":"s1","type":"task","prompt_template":"step1"},{"id":"s2","type":"task","prompt_template":"step2"},{"id":"c1","type":"task","prompt_template":"comp1"},{"id":"c2","type":"task","prompt_template":"comp2"}]} + \\{"nodes":{"a":{"type":"task"},"b":{"type":"task"},"c":{"type":"task"}},"edges":[["__start__","a"],["__start__","b"],["a","c"],["b","c"]],"schema":{}} ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_saga", "r1", "saga1", "saga", "ready", "{}", 1, null, null, null); - - var engine = Engine.init(&store, allocator, 500); - - // Tick 1: creates first body child (s1) - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - - // Complete first body child (s1) - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer 
arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_saga"); - try store.updateStepStatus(children[0].id, "completed", null, "{\"output\":\"provisioned\"}", null, 1); - } - - // Tick 2: creates s2 - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } - // Fail second body child (s2) + // Completed: [] -> ready: [a, b] { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_saga"); - for (children) |child| { - if (std.mem.eql(u8, child.def_step_id, "s2")) { - try store.updateStepStatus(child.id, "failed", null, null, "deploy failed", 1); - } + var completed = std.StringHashMap(void).init(alloc); + var routes = std.StringHashMap([]const u8).init(alloc); + const ready = try findReadyNodes(alloc, wf, &completed, &routes); + try std.testing.expectEqual(@as(usize, 2), ready.len); + // Both a and b should be ready (order may vary) + var has_a = false; + var has_b = false; + for (ready) |name| { + if (std.mem.eql(u8, name, "a")) has_a = true; + if (std.mem.eql(u8, name, "b")) has_b = true; } + try std.testing.expect(has_a); + try std.testing.expect(has_b); } - // Tick 3: detects s2 failed, starts compensation (s1 was completed, so compensate s1) + // Completed: [a] -> ready: [] (c needs both a and b) { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); + var completed = std.StringHashMap(void).init(alloc); + try completed.put("a", {}); + var routes = std.StringHashMap([]const u8).init(alloc); + const ready = try findReadyNodes(alloc, wf, &completed, &routes); + // b is already in completed? No. 
So b should be ready + // Wait - b is from __start__ and __start__ is always completed + // b should be ready since its only inbound is __start__ + // But if we only put "a" as completed, b's inbound __start__ is always satisfied + // So b should be ready. And c should NOT be ready since b is not completed. + var has_c = false; + for (ready) |name| { + if (std.mem.eql(u8, name, "c")) has_c = true; + } + try std.testing.expect(!has_c); } - // Tick 4: compensation child creation may happen here + // Completed: [a, b] -> ready: [c] { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); + var completed = std.StringHashMap(void).init(alloc); + try completed.put("a", {}); + try completed.put("b", {}); + var routes = std.StringHashMap([]const u8).init(alloc); + const ready = try findReadyNodes(alloc, wf, &completed, &routes); + try std.testing.expectEqual(@as(usize, 1), ready.len); + try std.testing.expectEqualStrings("c", ready[0]); } +} - // Should have created compensation child for s1 - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_saga"); - var found_comp = false; - for (children) |child| { - if (std.mem.eql(u8, child.def_step_id, "c1")) { - found_comp = true; - } - } - try std.testing.expect(found_comp); - } +test "engine: find ready nodes - route edges" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + const alloc = arena.allocator(); - // Complete the compensation child - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_saga"); - for (children) |child| { - if (std.mem.eql(u8, child.def_step_id, "c1")) { - try store.updateStepStatus(child.id, 
"completed", null, "{\"output\":\"deprovisioned\"}", null, 1); - } - } - } + // Edges: __start__ -> r, r:yes -> a, r:no -> b + const wf = + \\{"nodes":{"r":{"type":"route"},"a":{"type":"task"},"b":{"type":"task"}},"edges":[["__start__","r"],["r:yes","a"],["r:no","b"]],"schema":{}} + ; - // Tick 5: compensation done + // Completed: [r] with route result "yes" -> ready: [a] { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); + var completed = std.StringHashMap(void).init(alloc); + try completed.put("r", {}); + var routes = std.StringHashMap([]const u8).init(alloc); + try routes.put("r", "yes"); + const ready = try findReadyNodes(alloc, wf, &completed, &routes); + try std.testing.expectEqual(@as(usize, 1), ready.len); + try std.testing.expectEqualStrings("a", ready[0]); } - // Tick 6: saga should finalize as failed + // Completed: [r] with route result "no" -> ready: [b] { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); + var completed = std.StringHashMap(void).init(alloc); + try completed.put("r", {}); + var routes = std.StringHashMap([]const u8).init(alloc); + try routes.put("r", "no"); + const ready = try findReadyNodes(alloc, wf, &completed, &routes); + try std.testing.expectEqual(@as(usize, 1), ready.len); + try std.testing.expectEqualStrings("b", ready[0]); } - // Saga should be failed with compensation output + // Completed: [r] with route result "yes" -> b should NOT be ready { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const saga_step = (try store.getStep(arena.allocator(), "step_saga")).?; - try std.testing.expectEqualStrings("failed", saga_step.status); - try std.testing.expect(saga_step.output_json != null); - // Output should 
contain failed_at and compensated - try std.testing.expect(std.mem.indexOf(u8, saga_step.output_json.?, "failed_at") != null); - try std.testing.expect(std.mem.indexOf(u8, saga_step.output_json.?, "compensated") != null); + var completed = std.StringHashMap(void).init(alloc); + try completed.put("r", {}); + var routes = std.StringHashMap([]const u8).init(alloc); + try routes.put("r", "yes"); + const ready = try findReadyNodes(alloc, wf, &completed, &routes); + for (ready) |name| { + try std.testing.expect(!std.mem.eql(u8, name, "b")); + } } } -test "Engine: saga step fails immediately with no completed steps to compensate" { +test "engine: processRun completes simple workflow" { const allocator = std.testing.allocator; var store = try Store.init(allocator, ":memory:"); defer store.deinit(); + // Create a workflow with just a transform node const wf = - \\{"steps":[{"id":"saga1","type":"saga","body":["s1"],"compensations":{"s1":"c1"}},{"id":"s1","type":"task","prompt_template":"step1"},{"id":"c1","type":"task","prompt_template":"comp1"}]} + \\{"nodes":{"t1":{"type":"transform","updates":"{\"result\":\"done\"}"}},"edges":[["__start__","t1"],["t1","__end__"]],"schema":{"result":{"type":"string","reducer":"last_value"}}} ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_saga", "r1", "saga1", "saga", "ready", "{}", 1, null, null, null); + + try store.createRunWithState("r1", null, wf, "{}", "{}"); + try store.updateRunStatus("r1", "running", null); var engine = Engine.init(&store, allocator, 500); - // Tick 1: creates first body child (s1) - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); - // Fail the first body child - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer 
arena.deinit(); - const children = try store.getChildSteps(arena.allocator(), "step_saga"); - try store.updateStepStatus(children[0].id, "failed", null, null, "provision failed", 1); - } + const run_row = (try store.getRun(arena.allocator(), "r1")).?; + try engine.processRun(arena.allocator(), run_row); - // Tick 2: detects s1 failed, no completed steps, saga fails immediately - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } + const updated_run = (try store.getRun(arena.allocator(), "r1")).?; + try std.testing.expectEqualStrings("completed", updated_run.status); - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const saga_step = (try store.getStep(arena.allocator(), "step_saga")).?; - try std.testing.expectEqualStrings("failed", saga_step.status); - try std.testing.expect(saga_step.output_json != null); - try std.testing.expect(std.mem.indexOf(u8, saga_step.output_json.?, "compensated\":[]") != null); + // Verify state was updated + if (updated_run.state_json) |sj| { + try std.testing.expect(std.mem.indexOf(u8, sj, "done") != null); } } -test "Engine: saga step fails without body" { +test "engine: interrupt node stops run" { const allocator = std.testing.allocator; var store = try Store.init(allocator, ":memory:"); defer store.deinit(); const wf = - \\{"steps":[{"id":"saga1","type":"saga"}]} + \\{"nodes":{"i1":{"type":"interrupt"}},"edges":[["__start__","i1"],["i1","__end__"]],"schema":{}} ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_saga", "r1", "saga1", "saga", "ready", "{}", 1, null, null, null); + + try store.createRunWithState("r1", null, wf, "{}", "{}"); + try store.updateRunStatus("r1", "running", null); var engine = Engine.init(&store, allocator, 500); @@ -5073,213 +1324,141 @@ test "Engine: saga step fails without body" { 
const run_row = (try store.getRun(arena.allocator(), "r1")).?; try engine.processRun(arena.allocator(), run_row); - const saga_step = (try store.getStep(arena.allocator(), "step_saga")).?; - try std.testing.expectEqualStrings("failed", saga_step.status); + const updated_run = (try store.getRun(arena.allocator(), "r1")).?; + try std.testing.expectEqualStrings("interrupted", updated_run.status); } -// ── Graph cycle tests ──────────────────────────────────────────────── - -test "Engine: condition routes back to earlier step creates new instances" { +test "engine: route node with conditional edges" { const allocator = std.testing.allocator; var store = try Store.init(allocator, ":memory:"); defer store.deinit(); - // Workflow: compute -> check -> (if true_target=compute, false_target=done) + // Workflow: start -> route -> (yes: t_yes, no: t_no) -> end const wf = - \\{"steps":[{"id":"compute","type":"task","prompt_template":"compute","depends_on":[]},{"id":"check","type":"condition","expression":"retry","true_target":"compute","false_target":"done","depends_on":["compute"]},{"id":"done","type":"task","prompt_template":"done","depends_on":["check"]}]} + \\{"nodes":{"r":{"type":"route","input":"state.decision"},"t_yes":{"type":"transform","updates":"{\"path\":\"yes\"}"},"t_no":{"type":"transform","updates":"{\"path\":\"no\"}"}},"edges":[["__start__","r"],["r:yes","t_yes"],["r:no","t_no"],["t_yes","__end__"],["t_no","__end__"]],"schema":{"decision":{"type":"string","reducer":"last_value"},"path":{"type":"string","reducer":"last_value"}}} ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - // Step "compute" completed - try store.insertStep("step_compute", "r1", "compute", "task", "completed", "{}", 1, null, null, null); - try store.updateStepStatus("step_compute", "completed", null, "{\"output\":\"retry this\"}", null, 1); - - // Step "check" is ready, depends on compute - try store.insertStep("step_check", "r1", "check", "condition", "ready", "{}", 1, null, 
null, null); - try store.insertStepDep("step_check", "step_compute"); + const init_state = + \\{"decision":"yes"} + ; - // Step "done" is pending - try store.insertStep("step_done", "r1", "done", "task", "pending", "{}", 1, null, null, null); - try store.insertStepDep("step_done", "step_check"); + try store.createRunWithState("r1", null, wf, "{}", init_state); + try store.updateRunStatus("r1", "running", null); var engine = Engine.init(&store, allocator, 500); - // Tick 1: condition evaluates to true, target "compute" is already completed - // Should detect cycle and create new step instances - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); - // Verify: condition step should be completed with cycle_back output - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const check_step = (try store.getStep(arena.allocator(), "step_check")).?; - try std.testing.expectEqualStrings("completed", check_step.status); - try std.testing.expect(check_step.output_json != null); - try std.testing.expect(std.mem.indexOf(u8, check_step.output_json.?, "cycle_back") != null); - } + // First tick: route node executes and completes + const run_row = (try store.getRun(arena.allocator(), "r1")).?; + try engine.processRun(arena.allocator(), run_row); - // Verify: new step instances were created (total steps > 3) - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const all_steps = try store.getStepsByRun(arena.allocator(), "r1"); - // Original: compute, check, done = 3 - // New: compute(iter1), check(iter1) = 2 more - try std.testing.expect(all_steps.len > 3); - - // Find new compute instance with iteration_index > 0 - var found_new_compute = false; - for (all_steps) |s| { - if (std.mem.eql(u8, 
s.def_step_id, "compute") and s.iteration_index > 0) { - found_new_compute = true; - try std.testing.expectEqualStrings("ready", s.status); - } - } - try std.testing.expect(found_new_compute); + // May need a second tick to process t_yes and __end__ + const run_row2 = (try store.getRun(arena.allocator(), "r1")).?; + if (std.mem.eql(u8, run_row2.status, "running")) { + try engine.processRun(arena.allocator(), run_row2); } - // Verify cycle_state was updated - { - const cycle_state = try store.getCycleState("r1", "cycle_check"); - try std.testing.expect(cycle_state != null); - try std.testing.expectEqual(@as(i64, 1), cycle_state.?.iteration_count); + const updated_run = (try store.getRun(arena.allocator(), "r1")).?; + try std.testing.expectEqualStrings("completed", updated_run.status); + + // Verify the "yes" path was taken + if (updated_run.state_json) |sj| { + try std.testing.expect(std.mem.indexOf(u8, sj, "yes") != null); } } -test "Engine: graph cycle respects max_cycle_iterations" { +test "wrapOutput creates valid JSON" { const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - // Workflow with max_cycle_iterations=1 - const wf = - \\{"steps":[{"id":"compute","type":"task","prompt_template":"compute"},{"id":"check","type":"condition","expression":"retry","true_target":"compute","false_target":"done","max_cycle_iterations":1,"depends_on":["compute"]},{"id":"done","type":"task","prompt_template":"done","depends_on":["check"]}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - - // Pre-set cycle state to max - try store.upsertCycleState("r1", "cycle_check", 1, 1); - - // compute completed - try store.insertStep("step_compute", "r1", "compute", "task", "completed", "{}", 1, null, null, null); - try store.updateStepStatus("step_compute", "completed", null, "{\"output\":\"retry\"}", null, 1); + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); - // check is 
ready - try store.insertStep("step_check", "r1", "check", "condition", "ready", "{}", 1, null, null, null); - try store.insertStepDep("step_check", "step_compute"); + const result = try wrapOutput(arena.allocator(), "hello world"); + try std.testing.expectEqualStrings("{\"output\":\"hello world\"}", result); +} - // done is pending - try store.insertStep("step_done", "r1", "done", "task", "pending", "{}", 1, null, null, null); - try store.insertStepDep("step_done", "step_check"); +test "wrapOutput escapes special characters" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); - var engine = Engine.init(&store, allocator, 500); + const result = try wrapOutput(arena.allocator(), "line1\nline2"); + try std.testing.expectEqualStrings("{\"output\":\"line1\\nline2\"}", result); +} - // Tick: condition should fail because cycle limit exceeded - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - } +test "serializeCompletedNodes" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + const alloc = arena.allocator(); - // Check step should be failed - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const check_step = (try store.getStep(arena.allocator(), "step_check")).?; - try std.testing.expectEqualStrings("failed", check_step.status); - try std.testing.expect(check_step.error_text != null); - try std.testing.expect(std.mem.indexOf(u8, check_step.error_text.?, "exceeded") != null); - } + var completed = std.StringHashMap(void).init(alloc); + try completed.put("a", {}); + try completed.put("b", {}); - // Run should be failed - { - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - const run = (try 
store.getRun(arena.allocator(), "r1")).?; - try std.testing.expectEqualStrings("failed", run.status); - } + const result = try serializeCompletedNodes(alloc, &completed); + // Should be a JSON array containing "a" and "b" + try std.testing.expect(std.mem.indexOf(u8, result, "\"a\"") != null); + try std.testing.expect(std.mem.indexOf(u8, result, "\"b\"") != null); } -// ── Worker handoff tests ───────────────────────────────────────────── - -test "extractHandoffTarget parses handoff_to from output" { +test "getNodeJson returns node definition" { const allocator = std.testing.allocator; var arena = std.heap.ArenaAllocator.init(allocator); defer arena.deinit(); - const output = - \\{"output":"cannot handle","handoff_to":{"tags":["security_expert"],"message":"needs security review"}} + const wf = + \\{"nodes":{"a":{"type":"task","prompt_template":"hello"}},"edges":[]} ; - const target = extractHandoffTarget(arena.allocator(), output); - try std.testing.expect(target != null); - try std.testing.expectEqual(@as(usize, 1), target.?.tags.len); - try std.testing.expectEqualStrings("security_expert", target.?.tags[0]); - try std.testing.expect(target.?.message != null); - try std.testing.expectEqualStrings("needs security review", target.?.message.?); + const result = getNodeJson(arena.allocator(), wf, "a"); + try std.testing.expect(result != null); + try std.testing.expect(std.mem.indexOf(u8, result.?, "task") != null); } -test "extractHandoffTarget returns null for normal output" { +test "getNodeJson returns null for missing node" { const allocator = std.testing.allocator; var arena = std.heap.ArenaAllocator.init(allocator); defer arena.deinit(); - const output = - \\{"output":"all good, no handoff needed"} + const wf = + \\{"nodes":{"a":{"type":"task"}},"edges":[]} ; - const target = extractHandoffTarget(arena.allocator(), output); - try std.testing.expect(target == null); + const result = getNodeJson(arena.allocator(), wf, "b"); + try std.testing.expect(result == null); 
} -test "extractHandoffTarget returns null for non-JSON output" { +test "getNodeField extracts string field" { const allocator = std.testing.allocator; var arena = std.heap.ArenaAllocator.init(allocator); defer arena.deinit(); - const target = extractHandoffTarget(arena.allocator(), "plain text output"); - try std.testing.expect(target == null); + const node = + \\{"type":"task","prompt_template":"hello {{state.name}}"} + ; + const result = getNodeField(arena.allocator(), node, "prompt_template"); + try std.testing.expect(result != null); + try std.testing.expectEqualStrings("hello {{state.name}}", result.?); } -test "extractHandoffTarget handles handoff without message" { +test "extractStateUpdates from worker response" { const allocator = std.testing.allocator; var arena = std.heap.ArenaAllocator.init(allocator); defer arena.deinit(); const output = - \\{"output":"redirect","handoff_to":{"tags":["expert"]}} + \\{"state_updates":{"result":"done","count":5},"other":"ignored"} ; - const target = extractHandoffTarget(arena.allocator(), output); - try std.testing.expect(target != null); - try std.testing.expectEqual(@as(usize, 1), target.?.tags.len); - try std.testing.expectEqualStrings("expert", target.?.tags[0]); - try std.testing.expect(target.?.message == null); + const result = extractStateUpdates(arena.allocator(), output); + try std.testing.expect(result != null); + try std.testing.expect(std.mem.indexOf(u8, result.?, "done") != null); } -test "Engine: task step stays ready when no workers available (handoff path)" { +test "extractStateUpdates returns null for plain text" { const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - const wf = - \\{"steps":[{"id":"t1","type":"task","prompt_template":"do work"}]} - ; - try store.insertRun("r1", null, "running", wf, "{}", "[]"); - try store.insertStep("step_t1", "r1", "t1", "task", "ready", "{}", 1, null, null, null); - - var engine = 
Engine.init(&store, allocator, 500); - var arena = std.heap.ArenaAllocator.init(allocator); defer arena.deinit(); - const run_row = (try store.getRun(arena.allocator(), "r1")).?; - try engine.processRun(arena.allocator(), run_row); - - // No workers available, step should remain "ready" - const step = (try store.getStep(arena.allocator(), "step_t1")).?; - try std.testing.expectEqualStrings("ready", step.status); + const result = extractStateUpdates(arena.allocator(), "just plain text"); + try std.testing.expect(result == null); } diff --git a/src/store.zig b/src/store.zig index 331f143..cb74ee8 100644 --- a/src/store.zig +++ b/src/store.zig @@ -403,7 +403,7 @@ pub const Store = struct { } pub fn getRun(self: *Self, allocator: std.mem.Allocator, id: []const u8) !?types.RunRow { - const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms FROM runs WHERE id = ?"; + const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json FROM runs WHERE id = ?"; var stmt: ?*c.sqlite3_stmt = null; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; @@ -426,11 +426,12 @@ pub const Store = struct { .updated_at_ms = colInt(stmt, 8), .started_at_ms = colIntOpt(stmt, 9), .ended_at_ms = colIntOpt(stmt, 10), + .state_json = try allocStrOpt(allocator, stmt, 11), }; } pub fn getRunByIdempotencyKey(self: *Self, allocator: std.mem.Allocator, key: []const u8) !?types.RunRow { - const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms FROM runs WHERE idempotency_key = ? 
ORDER BY created_at_ms DESC LIMIT 1"; + const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json FROM runs WHERE idempotency_key = ? ORDER BY created_at_ms DESC LIMIT 1"; var stmt: ?*c.sqlite3_stmt = null; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; @@ -452,13 +453,14 @@ pub const Store = struct { .updated_at_ms = colInt(stmt, 8), .started_at_ms = colIntOpt(stmt, 9), .ended_at_ms = colIntOpt(stmt, 10), + .state_json = try allocStrOpt(allocator, stmt, 11), }; } pub fn listRuns(self: *Self, allocator: std.mem.Allocator, status_filter: ?[]const u8, limit: i64, offset: i64) ![]types.RunRow { var stmt: ?*c.sqlite3_stmt = null; if (status_filter != null) { - const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms FROM runs WHERE status = ? ORDER BY created_at_ms DESC LIMIT ? OFFSET ?"; + const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json FROM runs WHERE status = ? ORDER BY created_at_ms DESC LIMIT ? OFFSET ?"; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; } @@ -466,7 +468,7 @@ pub const Store = struct { _ = c.sqlite3_bind_int64(stmt, 2, limit); _ = c.sqlite3_bind_int64(stmt, 3, offset); } else { - const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms FROM runs ORDER BY created_at_ms DESC LIMIT ? 
OFFSET ?"; + const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json FROM runs ORDER BY created_at_ms DESC LIMIT ? OFFSET ?"; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; } @@ -489,6 +491,7 @@ pub const Store = struct { .updated_at_ms = colInt(stmt, 8), .started_at_ms = colIntOpt(stmt, 9), .ended_at_ms = colIntOpt(stmt, 10), + .state_json = try allocStrOpt(allocator, stmt, 11), }); } return list.toOwnedSlice(allocator); @@ -513,7 +516,7 @@ pub const Store = struct { } pub fn getActiveRuns(self: *Self, allocator: std.mem.Allocator) ![]types.RunRow { - const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms FROM runs WHERE status IN ('running', 'paused') ORDER BY created_at_ms DESC"; + const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json FROM runs WHERE status IN ('running', 'paused') ORDER BY created_at_ms DESC"; var stmt: ?*c.sqlite3_stmt = null; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; @@ -534,6 +537,7 @@ pub const Store = struct { .updated_at_ms = colInt(stmt, 8), .started_at_ms = colIntOpt(stmt, 9), .ended_at_ms = colIntOpt(stmt, 10), + .state_json = try allocStrOpt(allocator, stmt, 11), }); } return list.toOwnedSlice(allocator); @@ -2495,6 +2499,7 @@ test "run state management" { allocator.free(run.input_json); allocator.free(run.callbacks_json); if (run.error_text) |et| allocator.free(et); + if (run.state_json) |sj| allocator.free(sj); } try std.testing.expectEqualStrings("r1", run.id); try std.testing.expectEqualStrings("pending", run.status); @@ -2512,6 +2517,7 @@ test "run state management" { 
allocator.free(run2.input_json); allocator.free(run2.callbacks_json); if (run2.error_text) |et| allocator.free(et); + if (run2.state_json) |sj| allocator.free(sj); } try std.testing.expectEqualStrings("r2", run2.id); @@ -2534,6 +2540,7 @@ test "run state management" { allocator.free(forked.input_json); allocator.free(forked.callbacks_json); if (forked.error_text) |et| allocator.free(et); + if (forked.state_json) |sj| allocator.free(sj); } try std.testing.expectEqualStrings("r3", forked.id); try std.testing.expectEqualStrings("pending", forked.status); diff --git a/src/types.zig b/src/types.zig index b989295..b334250 100644 --- a/src/types.zig +++ b/src/types.zig @@ -154,6 +154,7 @@ pub const RunRow = struct { updated_at_ms: i64, started_at_ms: ?i64, ended_at_ms: ?i64, + state_json: ?[]const u8 = null, }; pub const StepRow = struct { From 0c0bbf9afb8074ff59624c94e2797c33b7d2bd97 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 19:10:28 -0300 Subject: [PATCH 09/55] feat: SSE hub for streaming run events --- src/main.zig | 1 + src/sse.zig | 160 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+) create mode 100644 src/sse.zig diff --git a/src/main.zig b/src/main.zig index 4a527a8..361f253 100644 --- a/src/main.zig +++ b/src/main.zig @@ -639,4 +639,5 @@ comptime { _ = @import("tracker_client.zig"); _ = @import("tracker.zig"); _ = @import("state.zig"); + _ = @import("sse.zig"); } diff --git a/src/sse.zig b/src/sse.zig new file mode 100644 index 0000000..c189221 --- /dev/null +++ b/src/sse.zig @@ -0,0 +1,160 @@ +const std = @import("std"); +const Allocator = std.mem.Allocator; + +pub const SseEvent = struct { + event_type: []const u8, // "state_update", "step_started", etc. + data: []const u8, // JSON string +}; + +/// Per-run event queue. Thread-safe via mutex. 
+pub const RunEventQueue = struct { + events: std.ArrayListUnmanaged(SseEvent), + alloc: Allocator, + mutex: std.Thread.Mutex, + closed: std.atomic.Value(bool), + + pub fn init(alloc: Allocator) RunEventQueue { + return .{ + .events = .empty, + .alloc = alloc, + .mutex = .{}, + .closed = std.atomic.Value(bool).init(false), + }; + } + + pub fn deinit(self: *RunEventQueue) void { + self.events.deinit(self.alloc); + } + + /// Push an event to the queue. Thread-safe. + pub fn push(self: *RunEventQueue, event: SseEvent) void { + self.mutex.lock(); + defer self.mutex.unlock(); + self.events.append(self.alloc, event) catch {}; + } + + /// Drain all events from the queue. Returns owned slice. Thread-safe. + pub fn drain(self: *RunEventQueue, alloc: Allocator) []SseEvent { + self.mutex.lock(); + defer self.mutex.unlock(); + if (self.events.items.len == 0) return &.{}; + const items = alloc.dupe(SseEvent, self.events.items) catch return &.{}; + self.events.clearRetainingCapacity(); + return items; + } + + /// Mark queue as closed (run completed/cancelled). + pub fn close(self: *RunEventQueue) void { + self.closed.store(true, .release); + } + + pub fn isClosed(self: *RunEventQueue) bool { + return self.closed.load(.acquire); + } +}; + +/// Central hub managing per-run event queues. +pub const SseHub = struct { + queues: std.StringHashMap(*RunEventQueue), + mutex: std.Thread.Mutex, + alloc: Allocator, + + pub fn init(alloc: Allocator) SseHub { + return .{ + .queues = std.StringHashMap(*RunEventQueue).init(alloc), + .mutex = .{}, + .alloc = alloc, + }; + } + + pub fn deinit(self: *SseHub) void { + var it = self.queues.iterator(); + while (it.next()) |entry| { + entry.value_ptr.*.deinit(); + self.alloc.destroy(entry.value_ptr.*); + self.alloc.free(entry.key_ptr.*); + } + self.queues.deinit(); + } + + /// Get or create queue for a run. 
+ pub fn getOrCreateQueue(self: *SseHub, run_id: []const u8) *RunEventQueue { + self.mutex.lock(); + defer self.mutex.unlock(); + if (self.queues.get(run_id)) |q| return q; + const queue = self.alloc.create(RunEventQueue) catch @panic("OOM: failed to allocate RunEventQueue"); + queue.* = RunEventQueue.init(self.alloc); + const id_copy = self.alloc.dupe(u8, run_id) catch @panic("OOM: failed to duplicate run_id"); + self.queues.put(id_copy, queue) catch @panic("OOM: failed to insert queue into map"); + return queue; + } + + /// Broadcast event to a run's queue. + pub fn broadcast(self: *SseHub, run_id: []const u8, event: SseEvent) void { + self.mutex.lock(); + defer self.mutex.unlock(); + if (self.queues.get(run_id)) |queue| { + queue.push(event); + } + // If no queue exists, event is silently dropped (no listeners) + } + + /// Close and remove queue when run completes. + pub fn removeQueue(self: *SseHub, run_id: []const u8) void { + self.mutex.lock(); + defer self.mutex.unlock(); + if (self.queues.fetchRemove(run_id)) |entry| { + entry.value.close(); + entry.value.deinit(); + self.alloc.destroy(entry.value); + self.alloc.free(entry.key); + } + } +}; + +// ── Tests ───────────────────────────────────────────────────────────── + +test "sse hub broadcast and drain" { + const alloc = std.testing.allocator; + var hub = SseHub.init(alloc); + defer hub.deinit(); + + const queue = hub.getOrCreateQueue("run1"); + queue.push(.{ .event_type = "step_started", .data = "{}" }); + queue.push(.{ .event_type = "step_completed", .data = "{}" }); + + const events = queue.drain(alloc); + defer alloc.free(events); + try std.testing.expectEqual(@as(usize, 2), events.len); + try std.testing.expectEqualStrings("step_started", events[0].event_type); +} + +test "sse hub broadcast to non-existent queue is silent" { + const alloc = std.testing.allocator; + var hub = SseHub.init(alloc); + defer hub.deinit(); + + // Should not crash + hub.broadcast("nonexistent", .{ .event_type = "test", .data = 
"{}" }); +} + +test "sse hub remove queue" { + const alloc = std.testing.allocator; + var hub = SseHub.init(alloc); + defer hub.deinit(); + + _ = hub.getOrCreateQueue("run1"); + hub.removeQueue("run1"); + // Queue should be gone + try std.testing.expectEqual(@as(usize, 0), hub.queues.count()); +} + +test "sse queue close" { + const alloc = std.testing.allocator; + var queue = RunEventQueue.init(alloc); + defer queue.deinit(); + + try std.testing.expect(!queue.isClosed()); + queue.close(); + try std.testing.expect(queue.isClosed()); +} From 22ddb3c8f36be4347f646756ffe16ccd8e4593fe Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 19:16:43 -0300 Subject: [PATCH 10/55] feat: add orchestration API endpoints Add workflow CRUD (create/list/get/update/delete/validate/run), checkpoint endpoints (list/get), state control (resume/fork/inject), SSE stream snapshot, and agent events callback. Update handleGetRun with state_json and checkpoint_count. Add SSE hub cleanup to cancel. Remove old signal endpoint (replaced by state inject). 
--- src/api.zig | 732 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 725 insertions(+), 7 deletions(-) diff --git a/src/api.zig b/src/api.zig index 5f1dd59..6cf29c6 100644 --- a/src/api.zig +++ b/src/api.zig @@ -8,6 +8,8 @@ const metrics_mod = @import("metrics.zig"); const strategy_mod = @import("strategy.zig"); const tracker_mod = @import("tracker.zig"); const config_mod = @import("config.zig"); +const sse_mod = @import("sse.zig"); +const state_mod = @import("state.zig"); // ── Types ──────────────────────────────────────────────────────────── @@ -24,6 +26,7 @@ pub const Context = struct { strategies: ?*const strategy_mod.StrategyMap = null, tracker_state: ?*tracker_mod.TrackerState = null, tracker_cfg: ?*const config_mod.TrackerConfig = null, + sse_hub: ?*sse_mod.SseHub = null, }; pub const HttpResponse = struct { @@ -51,6 +54,7 @@ pub fn handleRequest(ctx: *Context, method: []const u8, target: []const u8, body const is_get = eql(method, "GET"); const is_post = eql(method, "POST"); const is_delete = eql(method, "DELETE"); + const is_put = eql(method, "PUT"); if (!isAuthorized(ctx, seg0, seg1)) { return jsonResponse(401, "{\"error\":{\"code\":\"unauthorized\",\"message\":\"missing or invalid bearer token\"}}"); @@ -111,11 +115,6 @@ pub fn handleRequest(ctx: *Context, method: []const u8, target: []const u8, body return handleRejectStep(ctx, seg1.?, seg3.?); } - // POST /runs/{id}/steps/{step_id}/signal - if (is_post and eql(seg0, "runs") and seg1 != null and eql(seg2, "steps") and seg3 != null and eql(seg4, "signal") and seg5 == null) { - return handleSignalStep(ctx, seg1.?, seg3.?, body); - } - // GET /runs/{id}/steps/{step_id}/chat if (is_get and eql(seg0, "runs") and seg1 != null and eql(seg2, "steps") and seg3 != null and eql(seg4, "chat") and seg5 == null) { return handleGetChatTranscript(ctx, seg1.?, seg3.?); @@ -171,6 +170,86 @@ pub fn handleRequest(ctx: *Context, method: []const u8, target: []const u8, body return 
handleTrackerRefresh(ctx); } + // ── Workflow CRUD ─────────────────────────────────────────────── + + // POST /workflows + if (is_post and eql(seg0, "workflows") and seg1 == null) { + return handleCreateWorkflow(ctx, body); + } + + // GET /workflows + if (is_get and eql(seg0, "workflows") and seg1 == null) { + return handleListWorkflows(ctx); + } + + // GET /workflows/{id} + if (is_get and eql(seg0, "workflows") and seg1 != null and seg2 == null) { + return handleGetWorkflow(ctx, seg1.?); + } + + // PUT /workflows/{id} + if (is_put and eql(seg0, "workflows") and seg1 != null and seg2 == null) { + return handleUpdateWorkflow(ctx, seg1.?, body); + } + + // DELETE /workflows/{id} + if (is_delete and eql(seg0, "workflows") and seg1 != null and seg2 == null) { + return handleDeleteWorkflow(ctx, seg1.?); + } + + // POST /workflows/{id}/validate + if (is_post and eql(seg0, "workflows") and seg1 != null and eql(seg2, "validate") and seg3 == null) { + return handleValidateWorkflow(ctx, seg1.?); + } + + // POST /workflows/{id}/run + if (is_post and eql(seg0, "workflows") and seg1 != null and eql(seg2, "run") and seg3 == null) { + return handleRunWorkflow(ctx, seg1.?, body); + } + + // ── Checkpoint endpoints ──────────────────────────────────────── + + // GET /runs/{id}/checkpoints + if (is_get and eql(seg0, "runs") and seg1 != null and eql(seg2, "checkpoints") and seg3 == null) { + return handleListCheckpoints(ctx, seg1.?); + } + + // GET /runs/{id}/checkpoints/{cpId} + if (is_get and eql(seg0, "runs") and seg1 != null and eql(seg2, "checkpoints") and seg3 != null and seg4 == null) { + return handleGetCheckpoint(ctx, seg1.?, seg3.?); + } + + // ── State control endpoints ───────────────────────────────────── + + // POST /runs/{id}/resume + if (is_post and eql(seg0, "runs") and seg1 != null and eql(seg2, "resume") and seg3 == null) { + return handleResumeRun(ctx, seg1.?, body); + } + + // POST /runs/fork + if (is_post and eql(seg0, "runs") and eql(seg1, "fork") and seg2 == 
null) { + return handleForkRun(ctx, body); + } + + // POST /runs/{id}/state + if (is_post and eql(seg0, "runs") and seg1 != null and eql(seg2, "state") and seg3 == null) { + return handleInjectState(ctx, seg1.?, body); + } + + // ── SSE stream endpoint ───────────────────────────────────────── + + // GET /runs/{id}/stream + if (is_get and eql(seg0, "runs") and seg1 != null and eql(seg2, "stream") and seg3 == null) { + return handleStream(ctx, seg1.?); + } + + // ── Agent events callback ─────────────────────────────────────── + + // POST /internal/agent-events/{run_id}/{step_id} + if (is_post and eql(seg0, "internal") and eql(seg1, "agent-events") and seg2 != null and seg3 != null and seg4 == null) { + return handleAgentEventCallback(ctx, seg2.?, seg3.?, body); + } + return jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"endpoint not found\"}}"); } @@ -616,10 +695,21 @@ fn handleGetRun(ctx: *Context, id: []const u8) HttpResponse { break :blk std.fmt.allocPrint(ctx.allocator, ",\"idempotency_key\":{s}", .{ik_json}) catch ""; } else ""; + // Include state_json if present + const state_field = if (run.state_json) |sj| + std.fmt.allocPrint(ctx.allocator, ",\"state_json\":{s}", .{sj}) catch "" + else + ""; + + // Count checkpoints + const checkpoints = ctx.store.listCheckpoints(ctx.allocator, id) catch &.{}; + const checkpoint_count: i64 = @intCast(checkpoints.len); + const checkpoint_field = std.fmt.allocPrint(ctx.allocator, ",\"checkpoint_count\":{d}", .{checkpoint_count}) catch ""; + const run_id_json = jsonQuoted(ctx.allocator, run.id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); const run_status_json = jsonQuoted(ctx.allocator, run.status) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); const resp = std.fmt.allocPrint(ctx.allocator, - \\{{"id":{s},"status":{s}{s},"created_at_ms":{d},"updated_at_ms":{d}{s}{s}{s},"steps":{s}}} + 
\\{{"id":{s},"status":{s}{s},"created_at_ms":{d},"updated_at_ms":{d}{s}{s}{s}{s}{s},"steps":{s}}} , .{ run_id_json, run_status_json, @@ -629,6 +719,8 @@ fn handleGetRun(ctx: *Context, id: []const u8) HttpResponse { error_field, started_field, ended_field, + state_field, + checkpoint_field, steps_json, }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); return jsonResponse(200, resp); @@ -765,7 +857,10 @@ fn handleCancelRun(ctx: *Context, run_id: []const u8) HttpResponse { // 5. Insert event ctx.store.insertEvent(run_id, null, "run.cancelled", "{}") catch {}; - // 6. Return 200 + // 6. Close SSE queue + if (ctx.sse_hub) |hub| hub.removeQueue(run_id); + + // 7. Return 200 const resp = std.fmt.allocPrint(ctx.allocator, \\{{"id":"{s}","status":"cancelled"}} , .{run_id}) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); @@ -973,6 +1068,629 @@ fn handleListEvents(ctx: *Context, run_id: []const u8) HttpResponse { return jsonResponse(200, json_body); } +// ── Workflow CRUD Handlers ─────────────────────────────────────────── + +fn handleCreateWorkflow(ctx: *Context, body: []const u8) HttpResponse { + const parsed = std.json.parseFromSlice(std.json.Value, ctx.allocator, body, .{}) catch { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"invalid JSON body\"}}"); + }; + defer parsed.deinit(); + + if (parsed.value != .object) { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"body must be a JSON object\"}}"); + } + const obj = parsed.value.object; + + const name = getJsonString(obj, "name") orelse "untitled"; + + // Use provided id or generate one + const wf_id = if (getJsonString(obj, "id")) |provided_id| + ctx.allocator.dupe(u8, provided_id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}") + else blk: { + const id_buf = ids.generateId(); + break :blk 
ctx.allocator.dupe(u8, &id_buf) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + }; + + // If definition_json is a sub-key, extract it; otherwise use the whole body + const definition_json = if (obj.get("definition_json")) |def_val| blk: { + break :blk serializeJsonValue(ctx.allocator, def_val) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to serialize definition\"}}"); + } else body; + + ctx.store.createWorkflow(wf_id, name, definition_json) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to create workflow\"}}"); + }; + + const id_json = jsonQuoted(ctx.allocator, wf_id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const name_json = jsonQuoted(ctx.allocator, name) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const resp = std.fmt.allocPrint(ctx.allocator, + \\{{"id":{s},"name":{s}}} + , .{ id_json, name_json }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + return jsonResponse(201, resp); +} + +fn handleListWorkflows(ctx: *Context) HttpResponse { + const workflows = ctx.store.listWorkflows(ctx.allocator) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to list workflows\"}}"); + }; + + var buf: std.ArrayListUnmanaged(u8) = .empty; + buf.append(ctx.allocator, '[') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + + for (workflows, 0..) 
|wf, i| { + if (i > 0) { + buf.append(ctx.allocator, ',') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + } + const id_json = jsonQuoted(ctx.allocator, wf.id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const name_json = jsonQuoted(ctx.allocator, wf.name) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const entry = std.fmt.allocPrint(ctx.allocator, + \\{{"id":{s},"name":{s},"definition":{s},"created_at_ms":{d},"updated_at_ms":{d}}} + , .{ + id_json, + name_json, + wf.definition_json, + wf.created_at_ms, + wf.updated_at_ms, + }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + buf.appendSlice(ctx.allocator, entry) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + } + + buf.append(ctx.allocator, ']') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const json_body = buf.toOwnedSlice(ctx.allocator) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + return jsonResponse(200, json_body); +} + +fn handleGetWorkflow(ctx: *Context, id: []const u8) HttpResponse { + const wf = ctx.store.getWorkflow(ctx.allocator, id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get workflow\"}}"); + } orelse { + return jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"workflow not found\"}}"); + }; + + const id_json = jsonQuoted(ctx.allocator, wf.id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const name_json = jsonQuoted(ctx.allocator, wf.name) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const resp = std.fmt.allocPrint(ctx.allocator, 
+ \\{{"id":{s},"name":{s},"definition":{s},"created_at_ms":{d},"updated_at_ms":{d}}} + , .{ + id_json, + name_json, + wf.definition_json, + wf.created_at_ms, + wf.updated_at_ms, + }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + return jsonResponse(200, resp); +} + +fn handleUpdateWorkflow(ctx: *Context, id: []const u8, body: []const u8) HttpResponse { + // Verify workflow exists + _ = ctx.store.getWorkflow(ctx.allocator, id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get workflow\"}}"); + } orelse { + return jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"workflow not found\"}}"); + }; + + const parsed = std.json.parseFromSlice(std.json.Value, ctx.allocator, body, .{}) catch { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"invalid JSON body\"}}"); + }; + defer parsed.deinit(); + + if (parsed.value != .object) { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"body must be a JSON object\"}}"); + } + const obj = parsed.value.object; + + const name = getJsonString(obj, "name") orelse "untitled"; + const definition_json = if (obj.get("definition_json")) |def_val| blk: { + break :blk serializeJsonValue(ctx.allocator, def_val) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to serialize definition\"}}"); + } else body; + + ctx.store.updateWorkflow(id, name, definition_json) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to update workflow\"}}"); + }; + + return jsonResponse(200, "{\"ok\":true}"); +} + +fn handleDeleteWorkflow(ctx: *Context, id: []const u8) HttpResponse { + // Verify workflow exists + _ = ctx.store.getWorkflow(ctx.allocator, id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get workflow\"}}"); + } orelse { + return jsonResponse(404, 
"{\"error\":{\"code\":\"not_found\",\"message\":\"workflow not found\"}}"); + }; + + ctx.store.deleteWorkflow(id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to delete workflow\"}}"); + }; + + return jsonResponse(200, "{\"ok\":true}"); +} + +fn handleValidateWorkflow(ctx: *Context, id: []const u8) HttpResponse { + const wf = ctx.store.getWorkflow(ctx.allocator, id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get workflow\"}}"); + } orelse { + return jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"workflow not found\"}}"); + }; + + const errors = workflow_validation.validate(ctx.allocator, wf.definition_json) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"validation failed\"}}"); + }; + + // Build validation result + var buf: std.ArrayListUnmanaged(u8) = .empty; + buf.appendSlice(ctx.allocator, "{\"valid\":") catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + buf.appendSlice(ctx.allocator, if (errors.len == 0) "true" else "false") catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + buf.appendSlice(ctx.allocator, ",\"errors\":[") catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + + for (errors, 0..) 
|ve, i| { + if (i > 0) { + buf.append(ctx.allocator, ',') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + } + const err_type_json = jsonQuoted(ctx.allocator, ve.err_type) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const node_field = if (ve.node) |n| blk: { + const n_json = jsonQuoted(ctx.allocator, n) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + break :blk std.fmt.allocPrint(ctx.allocator, ",\"node\":{s}", .{n_json}) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + } else ""; + const key_field = if (ve.key) |k| blk: { + const k_json = jsonQuoted(ctx.allocator, k) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + break :blk std.fmt.allocPrint(ctx.allocator, ",\"key\":{s}", .{k_json}) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + } else ""; + const msg_json = jsonQuoted(ctx.allocator, ve.message) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const entry = std.fmt.allocPrint(ctx.allocator, + \\{{"type":{s}{s}{s},"message":{s}}} + , .{ + err_type_json, + node_field, + key_field, + msg_json, + }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + buf.appendSlice(ctx.allocator, entry) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + } + + buf.appendSlice(ctx.allocator, "]}") catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const json_body = buf.toOwnedSlice(ctx.allocator) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + return jsonResponse(200, json_body); +} + +fn 
handleRunWorkflow(ctx: *Context, workflow_id: []const u8, body: []const u8) HttpResponse { + // Load workflow + const wf = ctx.store.getWorkflow(ctx.allocator, workflow_id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get workflow\"}}"); + } orelse { + return jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"workflow not found\"}}"); + }; + + // Validate + const errors = workflow_validation.validate(ctx.allocator, wf.definition_json) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"validation failed\"}}"); + }; + if (errors.len > 0) { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"workflow has validation errors\"}}"); + } + + // Parse definition to extract state_schema for initState + const def_parsed = std.json.parseFromSlice(std.json.Value, ctx.allocator, wf.definition_json, .{}) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to parse workflow definition\"}}"); + }; + defer def_parsed.deinit(); + + const schema_json = if (def_parsed.value == .object) blk: { + if (def_parsed.value.object.get("state_schema")) |ss| { + break :blk serializeJsonValue(ctx.allocator, ss) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to serialize schema\"}}"); + } + break :blk "{}"; + } else "{}"; + + // Parse input from request body (or default to {}) + const input_json = if (body.len > 0) blk: { + const bp = std.json.parseFromSlice(std.json.Value, ctx.allocator, body, .{}) catch break :blk "{}"; + defer bp.deinit(); + if (bp.value == .object) { + if (bp.value.object.get("input")) |input_val| { + break :blk serializeJsonValue(ctx.allocator, input_val) catch break :blk "{}"; + } + } + break :blk "{}"; + } else "{}"; + + // Init state + const initial_state = state_mod.initState(ctx.allocator, input_json, schema_json) catch { + return jsonResponse(500, 
"{\"error\":{\"code\":\"internal\",\"message\":\"failed to initialize state\"}}"); + }; + + // Generate run ID + const run_id_buf = ids.generateId(); + const run_id = ctx.allocator.dupe(u8, &run_id_buf) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + + // Create run with state + ctx.store.createRunWithState(run_id, workflow_id, wf.definition_json, input_json, initial_state) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to create run\"}}"); + }; + + // Create initial checkpoint (version 0, no completed nodes) + const cp_id_buf = ids.generateId(); + const cp_id = ctx.allocator.dupe(u8, &cp_id_buf) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + ctx.store.createCheckpoint(cp_id, run_id, "__init__", null, initial_state, "[]", 0, null) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to create checkpoint\"}}"); + }; + + // Set run status to running + ctx.store.updateRunStatus(run_id, "running", null) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to update run status\"}}"); + }; + + const run_id_json = jsonQuoted(ctx.allocator, run_id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const resp = std.fmt.allocPrint(ctx.allocator, + \\{{"id":{s},"status":"running"}} + , .{run_id_json}) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + return jsonResponse(201, resp); +} + +// ── Checkpoint Handlers ───────────────────────────────────────────── + +fn handleListCheckpoints(ctx: *Context, run_id: []const u8) HttpResponse { + // Verify run exists + _ = ctx.store.getRun(ctx.allocator, run_id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get run\"}}"); + } orelse { + return 
jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"run not found\"}}"); + }; + + const checkpoints = ctx.store.listCheckpoints(ctx.allocator, run_id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to list checkpoints\"}}"); + }; + + var buf: std.ArrayListUnmanaged(u8) = .empty; + buf.append(ctx.allocator, '[') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + + for (checkpoints, 0..) |cp, i| { + if (i > 0) { + buf.append(ctx.allocator, ',') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + } + const entry = buildCheckpointJson(ctx.allocator, cp) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + buf.appendSlice(ctx.allocator, entry) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + } + + buf.append(ctx.allocator, ']') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const json_body = buf.toOwnedSlice(ctx.allocator) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + return jsonResponse(200, json_body); +} + +fn handleGetCheckpoint(ctx: *Context, run_id: []const u8, cp_id: []const u8) HttpResponse { + const cp = ctx.store.getCheckpoint(ctx.allocator, cp_id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get checkpoint\"}}"); + } orelse { + return jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"checkpoint not found\"}}"); + }; + + // Verify checkpoint belongs to run + if (!std.mem.eql(u8, cp.run_id, run_id)) { + return jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"checkpoint not found\"}}"); + } + + const json_body = buildCheckpointJson(ctx.allocator, cp) catch return jsonResponse(500, 
"{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + return jsonResponse(200, json_body); +} + +fn buildCheckpointJson(allocator: std.mem.Allocator, cp: types.CheckpointRow) ![]const u8 { + const id_json = try jsonQuoted(allocator, cp.id); + const run_id_json = try jsonQuoted(allocator, cp.run_id); + const step_id_json = try jsonQuoted(allocator, cp.step_id); + const parent_field = if (cp.parent_id) |pid| blk: { + const pid_json = try jsonQuoted(allocator, pid); + break :blk try std.fmt.allocPrint(allocator, ",\"parent_id\":{s}", .{pid_json}); + } else ""; + const metadata_field = if (cp.metadata_json) |md| + try std.fmt.allocPrint(allocator, ",\"metadata\":{s}", .{md}) + else + ""; + + return try std.fmt.allocPrint(allocator, + \\{{"id":{s},"run_id":{s},"step_id":{s}{s},"state":{s},"completed_nodes":{s},"version":{d}{s},"created_at_ms":{d}}} + , .{ + id_json, + run_id_json, + step_id_json, + parent_field, + cp.state_json, + cp.completed_nodes_json, + cp.version, + metadata_field, + cp.created_at_ms, + }); +} + +// ── State Control Handlers ────────────────────────────────────────── + +fn handleResumeRun(ctx: *Context, run_id: []const u8, body: []const u8) HttpResponse { + // Load run — must be status=interrupted + const run = ctx.store.getRun(ctx.allocator, run_id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get run\"}}"); + } orelse { + return jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"run not found\"}}"); + }; + + if (!std.mem.eql(u8, run.status, "interrupted")) { + const resp = std.fmt.allocPrint(ctx.allocator, + \\{{"error":{{"code":"conflict","message":"run is not interrupted (current: {s})"}}}} + , .{run.status}) catch return jsonResponse(409, "{\"error\":{\"code\":\"conflict\",\"message\":\"run is not interrupted\"}}"); + return jsonResponse(409, resp); + } + + // Load latest checkpoint + const latest_cp = ctx.store.getLatestCheckpoint(ctx.allocator, run_id) catch 
{ + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get latest checkpoint\"}}"); + } orelse { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"no checkpoint found for run\"}}"); + }; + + // Get current state + var current_state = latest_cp.state_json; + + // Apply state_updates from body if provided + if (body.len > 0) { + const bp = std.json.parseFromSlice(std.json.Value, ctx.allocator, body, .{}); + if (bp) |body_parsed| { + defer body_parsed.deinit(); + + if (body_parsed.value == .object) { + if (body_parsed.value.object.get("state_updates")) |updates_val| { + const updates_json = serializeJsonValue(ctx.allocator, updates_val) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to serialize updates\"}}"); + }; + + // Get schema from workflow definition + const schema_json = getSchemaFromRun(ctx, run); + + current_state = state_mod.applyUpdates(ctx.allocator, latest_cp.state_json, updates_json, schema_json) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to apply state updates\"}}"); + }; + } + } + } else |_| { + // Body is not valid JSON — proceed without updates + } + } + + // Save new state + ctx.store.updateRunState(run_id, current_state) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to update run state\"}}"); + }; + + // Set status to running + ctx.store.updateRunStatus(run_id, "running", null) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to update run status\"}}"); + }; + + const run_id_json = jsonQuoted(ctx.allocator, run_id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const resp = std.fmt.allocPrint(ctx.allocator, + \\{{"id":{s},"status":"running"}} + , .{run_id_json}) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of 
memory\"}}"); + return jsonResponse(200, resp); +} + +fn handleForkRun(ctx: *Context, body: []const u8) HttpResponse { + const parsed = std.json.parseFromSlice(std.json.Value, ctx.allocator, body, .{}) catch { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"invalid JSON body\"}}"); + }; + defer parsed.deinit(); + + if (parsed.value != .object) { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"body must be a JSON object\"}}"); + } + const obj = parsed.value.object; + + // Get checkpoint_id from body + const checkpoint_id = getJsonString(obj, "checkpoint_id") orelse { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"missing required field: checkpoint_id\"}}"); + }; + + // Load checkpoint + const cp = ctx.store.getCheckpoint(ctx.allocator, checkpoint_id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get checkpoint\"}}"); + } orelse { + return jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"checkpoint not found\"}}"); + }; + + // Load the original run to get workflow_json + const orig_run = ctx.store.getRun(ctx.allocator, cp.run_id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get original run\"}}"); + } orelse { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"original run not found\"}}"); + }; + + // Apply state_overrides if provided + var fork_state = cp.state_json; + if (obj.get("state_overrides")) |overrides_val| { + const overrides_json = serializeJsonValue(ctx.allocator, overrides_val) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to serialize overrides\"}}"); + }; + const schema_json = getSchemaFromRun(ctx, orig_run); + fork_state = state_mod.applyUpdates(ctx.allocator, cp.state_json, overrides_json, schema_json) catch { + return jsonResponse(500, 
"{\"error\":{\"code\":\"internal\",\"message\":\"failed to apply state overrides\"}}"); + }; + } + + // Generate new run ID + const new_run_id_buf = ids.generateId(); + const new_run_id = ctx.allocator.dupe(u8, &new_run_id_buf) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + + // Create forked run + ctx.store.createForkedRun(new_run_id, orig_run.workflow_json, fork_state, cp.run_id, checkpoint_id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to create forked run\"}}"); + }; + + // Create initial checkpoint for forked run + const cp_id_buf = ids.generateId(); + const cp_id = ctx.allocator.dupe(u8, &cp_id_buf) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + ctx.store.createCheckpoint(cp_id, new_run_id, "__fork__", checkpoint_id, fork_state, cp.completed_nodes_json, 0, null) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to create checkpoint\"}}"); + }; + + // Set to running + ctx.store.updateRunStatus(new_run_id, "running", null) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to update run status\"}}"); + }; + + const run_id_json = jsonQuoted(ctx.allocator, new_run_id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const resp = std.fmt.allocPrint(ctx.allocator, + \\{{"id":{s},"status":"running","forked_from_checkpoint":{s}}} + , .{ run_id_json, checkpoint_id }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + return jsonResponse(201, resp); +} + +fn handleInjectState(ctx: *Context, run_id: []const u8, body: []const u8) HttpResponse { + // Verify run exists + const run = ctx.store.getRun(ctx.allocator, run_id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get run\"}}"); + } 
orelse { + return jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"run not found\"}}"); + }; + + const parsed = std.json.parseFromSlice(std.json.Value, ctx.allocator, body, .{}) catch { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"invalid JSON body\"}}"); + }; + defer parsed.deinit(); + + if (parsed.value != .object) { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"body must be a JSON object\"}}"); + } + const obj = parsed.value.object; + + // Get updates + const updates_val = obj.get("updates") orelse { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"missing required field: updates\"}}"); + }; + const updates_json = serializeJsonValue(ctx.allocator, updates_val) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to serialize updates\"}}"); + }; + + // Check apply_after_step + const apply_after_step = getJsonString(obj, "apply_after_step"); + + if (apply_after_step == null) { + // Apply immediately to run.state_json + const current_state = run.state_json orelse "{}"; + const schema_json = getSchemaFromRun(ctx, run); + const new_state = state_mod.applyUpdates(ctx.allocator, current_state, updates_json, schema_json) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to apply state updates\"}}"); + }; + ctx.store.updateRunState(run_id, new_state) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to update run state\"}}"); + }; + return jsonResponse(200, "{\"applied\":true}"); + } else { + // Insert into pending_state_injections + ctx.store.createPendingInjection(run_id, updates_json, apply_after_step) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to create pending injection\"}}"); + }; + return jsonResponse(200, "{\"applied\":false,\"pending\":true}"); + } +} + +// ── SSE Stream Handler 
────────────────────────────────────────────── + +fn handleStream(ctx: *Context, run_id: []const u8) HttpResponse { + // For now, return the current state and events as a regular JSON response. + // Full SSE streaming with held-open connections will be implemented + // when the threading model is wired in main.zig (Task 12). + const run = ctx.store.getRun(ctx.allocator, run_id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get run\"}}"); + } orelse { + return jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"run not found\"}}"); + }; + + const events = ctx.store.getEventsByRun(ctx.allocator, run_id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get events\"}}"); + }; + + // Build events JSON array + var events_buf: std.ArrayListUnmanaged(u8) = .empty; + events_buf.append(ctx.allocator, '[') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + for (events, 0..) 
|ev, i| { + if (i > 0) { + events_buf.append(ctx.allocator, ',') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + } + const kind_json = jsonQuoted(ctx.allocator, ev.kind) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const entry = std.fmt.allocPrint(ctx.allocator, + \\{{"kind":{s},"data":{s},"ts_ms":{d}}} + , .{ kind_json, ev.data_json, ev.ts_ms }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + events_buf.appendSlice(ctx.allocator, entry) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + } + events_buf.append(ctx.allocator, ']') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const events_json = events_buf.toOwnedSlice(ctx.allocator) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + + const status_json = jsonQuoted(ctx.allocator, run.status) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const state_field = if (run.state_json) |sj| + std.fmt.allocPrint(ctx.allocator, ",\"state\":{s}", .{sj}) catch "" + else + ""; + + const resp = std.fmt.allocPrint(ctx.allocator, + \\{{"status":{s}{s},"events":{s}}} + , .{ + status_json, + state_field, + events_json, + }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + return jsonResponse(200, resp); +} + +// ── Agent Events Callback Handler ─────────────────────────────────── + +fn handleAgentEventCallback(ctx: *Context, run_id: []const u8, step_id: []const u8, body: []const u8) HttpResponse { + const parsed = std.json.parseFromSlice(std.json.Value, ctx.allocator, body, .{}) catch { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"invalid JSON body\"}}"); + }; + defer 
parsed.deinit(); + + if (parsed.value != .object) { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"body must be a JSON object\"}}"); + } + const obj = parsed.value.object; + + const iteration: i64 = if (obj.get("iteration")) |it| blk: { + if (it == .integer) break :blk it.integer; + break :blk 0; + } else 0; + + const tool = getJsonString(obj, "tool"); + const args_json = if (obj.get("args")) |args_val| + serializeJsonValue(ctx.allocator, args_val) catch null + else + null; + const result_text = getJsonString(obj, "result"); + const status = getJsonString(obj, "status") orelse "running"; + + ctx.store.createAgentEvent(run_id, step_id, iteration, tool, args_json, result_text, status) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to create agent event\"}}"); + }; + + // If sse_hub is available, broadcast as agent_event + if (ctx.sse_hub) |hub| { + const event_data = std.fmt.allocPrint(ctx.allocator, + \\{{"run_id":"{s}","step_id":"{s}","iteration":{d},"status":"{s}"}} + , .{ run_id, step_id, iteration, status }) catch ""; + if (event_data.len > 0) { + hub.broadcast(run_id, .{ .event_type = "agent_event", .data = event_data }); + } + } + + return jsonResponse(200, "{\"ok\":true}"); +} + +// ── State Helper ──────────────────────────────────────────────────── + +fn getSchemaFromRun(ctx: *Context, run: types.RunRow) []const u8 { + const def_parsed = std.json.parseFromSlice(std.json.Value, ctx.allocator, run.workflow_json, .{}) catch return "{}"; + defer def_parsed.deinit(); + if (def_parsed.value != .object) return "{}"; + if (def_parsed.value.object.get("state_schema")) |ss| { + return serializeJsonValue(ctx.allocator, ss) catch "{}"; + } + return "{}"; +} + // ── Chat Transcript Handler ────────────────────────────────────────── fn handleGetChatTranscript(ctx: *Context, run_id: []const u8, step_id: []const u8) HttpResponse { From 934e367861d6e5653b14a9f2c9b6418580fd2cc5 Mon Sep 17 00:00:00 
2001 From: Igor Somov Date: Fri, 13 Mar 2026 19:21:07 -0300 Subject: [PATCH 11/55] feat: wire SSE hub and extend dispatch for agent steps - Add AgentOpts struct and dispatchStepWithOpts() in dispatch.zig so webhook bodies for agent steps can include mode, callback_url, max_iterations, tools, and state fields - Wire SseHub into main.zig: create instance on startup and set ctx.sse_hub for every request handler - Add TODO in tracker.zig pollAndClaim noting workflow format update needed when nulltickets schema changes land (Task 14) --- src/dispatch.zig | 163 ++++++++++++++++++++++++++++++++++++++++++++++- src/main.zig | 5 ++ src/tracker.zig | 3 + 3 files changed, 170 insertions(+), 1 deletion(-) diff --git a/src/dispatch.zig b/src/dispatch.zig index c1ccdbb..2b7e002 100644 --- a/src/dispatch.zig +++ b/src/dispatch.zig @@ -84,6 +84,23 @@ fn workerMatchesTags( return false; } +// ── Agent Step Options ──────────────────────────────────────────────── + +/// Extra fields included in the webhook body when step type is "agent". +pub const AgentOpts = struct { + /// "autonomous" or "managed" + mode: ?[]const u8 = null, + /// Full callback URL for agent events; if null, omitted from body. + /// Typically constructed as: self_url + "/internal/agent-events/{run_id}/{step_id}" + callback_url: ?[]const u8 = null, + /// Maximum agent iterations; if null, omitted from body. + max_iterations: ?i64 = null, + /// JSON array of tool names, e.g. "[\"search\",\"code\"]"; if null, omitted from body. + tools_json: ?[]const u8 = null, + /// Current state JSON to pass to the agent; if null, omitted from body. 
+ state_json: ?[]const u8 = null, +}; + // ── HTTP Dispatch ───────────────────────────────────────────────────── pub fn dispatchStep( @@ -95,6 +112,24 @@ pub fn dispatchStep( run_id: []const u8, step_id: []const u8, rendered_prompt: []const u8, +) !DispatchResult { + return dispatchStepWithOpts(allocator, worker_url, worker_token, worker_protocol_raw, worker_model, run_id, step_id, rendered_prompt, null); +} + +/// Like dispatchStep but also accepts optional agent-specific fields. +/// When agent_opts is non-null and the protocol is webhook, the additional +/// fields (mode, callback_url, max_iterations, tools, state) are merged +/// into the request body. +pub fn dispatchStepWithOpts( + allocator: std.mem.Allocator, + worker_url: []const u8, + worker_token: []const u8, + worker_protocol_raw: []const u8, + worker_model: ?[]const u8, + run_id: []const u8, + step_id: []const u8, + rendered_prompt: []const u8, + agent_opts: ?AgentOpts, ) !DispatchResult { const protocol = worker_protocol.parse(worker_protocol_raw) orelse { const err_msg = try std.fmt.allocPrint(allocator, "unsupported worker protocol: {s}", .{worker_protocol_raw}); @@ -131,6 +166,7 @@ pub fn dispatchStep( run_id, step_id, rendered_prompt, + agent_opts, ) catch |err| switch (err) { error.MissingWorkerModel => { return DispatchResult{ @@ -234,12 +270,18 @@ fn buildRequestBody( run_id: []const u8, step_id: []const u8, rendered_prompt: []const u8, + agent_opts: ?AgentOpts, ) ![]const u8 { const session_key = try std.fmt.allocPrint(allocator, "run_{s}_step_{s}", .{ run_id, step_id }); defer allocator.free(session_key); switch (protocol) { .webhook => { + // For agent steps with opts, build an extended body that includes + // agent-specific fields alongside the standard webhook fields. 
+ if (agent_opts) |opts| { + return buildWebhookAgentBody(allocator, session_key, rendered_prompt, opts); + } return std.json.Stringify.valueAlloc(allocator, .{ .message = rendered_prompt, .text = rendered_prompt, @@ -277,6 +319,79 @@ fn buildRequestBody( } } +/// Build the webhook JSON body for an agent step, merging standard fields with +/// agent-specific optional fields (mode, callback_url, max_iterations, tools, state). +/// Only non-null fields from agent_opts are included in the output. +fn buildWebhookAgentBody( + allocator: std.mem.Allocator, + session_key: []const u8, + rendered_prompt: []const u8, + opts: AgentOpts, +) ![]const u8 { + var buf: std.ArrayListUnmanaged(u8) = .empty; + errdefer buf.deinit(allocator); + + // Standard webhook fields + try buf.appendSlice(allocator, "{\"message\":"); + try appendJsonString(&buf, allocator, rendered_prompt); + try buf.appendSlice(allocator, ",\"text\":"); + try appendJsonString(&buf, allocator, rendered_prompt); + try buf.appendSlice(allocator, ",\"session_key\":"); + try appendJsonString(&buf, allocator, session_key); + try buf.appendSlice(allocator, ",\"session_id\":"); + try appendJsonString(&buf, allocator, session_key); + + // Optional agent fields + if (opts.mode) |mode| { + try buf.appendSlice(allocator, ",\"mode\":"); + try appendJsonString(&buf, allocator, mode); + } + if (opts.callback_url) |cb_url| { + try buf.appendSlice(allocator, ",\"callback_url\":"); + try appendJsonString(&buf, allocator, cb_url); + } + if (opts.max_iterations) |max_iter| { + const field = try std.fmt.allocPrint(allocator, ",\"max_iterations\":{d}", .{max_iter}); + defer allocator.free(field); + try buf.appendSlice(allocator, field); + } + if (opts.tools_json) |tools| { + // tools_json is already a JSON array string — embed it verbatim + try buf.appendSlice(allocator, ",\"tools\":"); + try buf.appendSlice(allocator, tools); + } + if (opts.state_json) |state| { + // state_json is already a JSON object/value — embed it verbatim + 
try buf.appendSlice(allocator, ",\"state\":"); + try buf.appendSlice(allocator, state); + } + + try buf.append(allocator, '}'); + + return buf.toOwnedSlice(allocator); +} + +/// Append a JSON-encoded string (with surrounding quotes and escapes) to buf. +fn appendJsonString(buf: *std.ArrayListUnmanaged(u8), allocator: std.mem.Allocator, s: []const u8) !void { + try buf.append(allocator, '"'); + for (s) |byte| { + switch (byte) { + '"' => try buf.appendSlice(allocator, "\\\""), + '\\' => try buf.appendSlice(allocator, "\\\\"), + '\n' => try buf.appendSlice(allocator, "\\n"), + '\r' => try buf.appendSlice(allocator, "\\r"), + '\t' => try buf.appendSlice(allocator, "\\t"), + 0x00...0x08, 0x0b, 0x0c, 0x0e...0x1f => { + const escaped = try std.fmt.allocPrint(allocator, "\\u{x:0>4}", .{byte}); + defer allocator.free(escaped); + try buf.appendSlice(allocator, escaped); + }, + else => try buf.append(allocator, byte), + } + } + try buf.append(allocator, '"'); +} + /// Build the wire-format JSON body for async (MQTT/Redis) dispatch. /// Includes correlation_id, reply_to topic/stream, timestamp, auth token, /// the rendered prompt, and a session_key matching the correlation_id. 
@@ -623,10 +738,56 @@ test "buildRequestBody: openai_chat requires model" { const allocator = std.testing.allocator; try std.testing.expectError( error.MissingWorkerModel, - buildRequestBody(allocator, .openai_chat, null, "run-1", "step-1", "hello"), + buildRequestBody(allocator, .openai_chat, null, "run-1", "step-1", "hello", null), ); } +test "buildWebhookAgentBody: includes all agent fields when present" { + const allocator = std.testing.allocator; + const opts = AgentOpts{ + .mode = "autonomous", + .callback_url = "http://localhost:8080/internal/agent-events/run-1/step-1", + .max_iterations = 25, + .tools_json = "[\"search\",\"code\"]", + .state_json = "{\"foo\":\"bar\"}", + }; + const body = try buildRequestBody(allocator, .webhook, null, "run-1", "step-1", "do something", opts); + defer allocator.free(body); + + const parsed = try std.json.parseFromSlice(std.json.Value, allocator, body, .{}); + defer parsed.deinit(); + const obj = parsed.value.object; + + try std.testing.expectEqualStrings("do something", obj.get("message").?.string); + try std.testing.expectEqualStrings("autonomous", obj.get("mode").?.string); + try std.testing.expectEqualStrings( + "http://localhost:8080/internal/agent-events/run-1/step-1", + obj.get("callback_url").?.string, + ); + try std.testing.expectEqual(@as(i64, 25), obj.get("max_iterations").?.integer); + // tools and state are embedded JSON — check they round-trip + const tools_arr = obj.get("tools").?.array; + try std.testing.expectEqual(@as(usize, 2), tools_arr.items.len); + try std.testing.expectEqualStrings("search", tools_arr.items[0].string); +} + +test "buildWebhookAgentBody: omits null agent fields" { + const allocator = std.testing.allocator; + const opts = AgentOpts{ .mode = "managed" }; + const body = try buildRequestBody(allocator, .webhook, null, "run-1", "step-1", "hello", opts); + defer allocator.free(body); + + const parsed = try std.json.parseFromSlice(std.json.Value, allocator, body, .{}); + defer parsed.deinit(); 
+ const obj = parsed.value.object; + + try std.testing.expectEqualStrings("managed", obj.get("mode").?.string); + try std.testing.expect(obj.get("callback_url") == null); + try std.testing.expect(obj.get("max_iterations") == null); + try std.testing.expect(obj.get("tools") == null); + try std.testing.expect(obj.get("state") == null); +} + test "buildAsyncRequestBody: produces valid wire-format JSON with all fields" { const allocator = std.testing.allocator; const before_ms = ids.nowMs(); diff --git a/src/main.zig b/src/main.zig index 361f253..8276113 100644 --- a/src/main.zig +++ b/src/main.zig @@ -12,6 +12,7 @@ const redis_client = @import("redis_client.zig"); const mqtt_client = @import("mqtt_client.zig"); const tracker_mod = @import("tracker.zig"); const workflow_loader = @import("workflow_loader.zig"); +const sse_mod = @import("sse.zig"); const c = @cImport({ @cInclude("signal.h"); }); @@ -148,6 +149,9 @@ pub fn main() !void { var metrics = metrics_mod.Metrics{}; var drain_mode = std.atomic.Value(bool).init(false); + var sse_hub = sse_mod.SseHub.init(allocator); + defer sse_hub.deinit(); + var response_queue = async_dispatch.ResponseQueue.init(allocator); defer response_queue.deinit(); @@ -387,6 +391,7 @@ pub fn main() !void { .strategies = &strategy_map, .tracker_state = if (tracker_instance) |*ti| &ti.state else null, .tracker_cfg = if (cfg.tracker) |*tc| tc else null, + .sse_hub = &sse_hub, }; const response = api.handleRequest(&ctx, request.method, request.target, request.body); diff --git a/src/tracker.zig b/src/tracker.zig index 6c1805d..7425e02 100644 --- a/src/tracker.zig +++ b/src/tracker.zig @@ -378,6 +378,9 @@ pub const Tracker = struct { } /// Poll NullTickets for each workflow's claim_roles and claim available tasks. + // TODO(task14): When nulltickets schema changes are integrated, update WorkflowDef + // and pollAndClaim to handle the new workflow format (e.g. 
new claim fields, task + // shape, or execution modes introduced in the orchestration milestone). fn pollAndClaim(self: *Tracker, tick_alloc: std.mem.Allocator) void { const base_url = self.cfg.url orelse return; From b5f5202793dee3a08c7e0e7107918c9f81350b38 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 19:59:24 -0300 Subject: [PATCH 12/55] feat: use A2A protocol for agent step instead of custom webhook Add native support for nullclaw's Agent-to-Agent protocol (JSON-RPC 2.0 over /a2a endpoint) as a new worker protocol. Agent nodes now prefer A2A-protocol workers for dispatch, falling back to other protocols when no A2A worker is available. - Add 'a2a' variant to worker_protocol.Protocol enum with URL builder that appends /a2a to the worker base URL - Add buildA2aRequestBody() producing tasks/send JSON-RPC requests with contextId for session persistence - Add parseA2aResponse() extracting text from result.artifacts with proper error handling for JSON-RPC errors and failed task status - Route A2A responses through dedicated parser instead of generic worker_response.parse - Engine's executeTaskNode prefers A2A workers for agent-type nodes - All existing webhook/api_chat/openai_chat paths remain unchanged --- config.example.json | 8 ++ src/dispatch.zig | 270 +++++++++++++++++++++++++++++++++++++++- src/engine.zig | 29 ++++- src/worker_protocol.zig | 30 ++++- 4 files changed, 327 insertions(+), 10 deletions(-) diff --git a/config.example.json b/config.example.json index fc64a5a..fca25c5 100644 --- a/config.example.json +++ b/config.example.json @@ -29,6 +29,14 @@ "model": "anthropic/claude-sonnet-4-6", "tags": ["writer", "editor"], "max_concurrent": 2 + }, + { + "id": "nullclaw-a2a", + "url": "http://localhost:3000", + "token": "set_same_value_as_nullclaw_gateway_paired_tokens", + "protocol": "a2a", + "tags": ["coder", "agent"], + "max_concurrent": 3 } ], "engine": { diff --git a/src/dispatch.zig b/src/dispatch.zig index 2b7e002..65fdadd 100644 --- 
a/src/dispatch.zig +++ b/src/dispatch.zig @@ -14,7 +14,7 @@ pub const WorkerInfo = struct { id: []const u8, url: []const u8, token: []const u8, - protocol: []const u8 = "webhook", // "webhook", "api_chat", "openai_chat" + protocol: []const u8 = "webhook", // "webhook", "api_chat", "openai_chat", "a2a" model: ?[]const u8 = null, tags_json: []const u8, // JSON array like ["coder","researcher"] max_concurrent: i64, @@ -231,6 +231,12 @@ pub fn dispatchStepWithOpts( } const response_data = response_body.written(); + + // A2A uses JSON-RPC 2.0 responses; parse them with the A2A-specific parser + if (protocol == .a2a) { + return try parseA2aResponse(allocator, response_data); + } + return try worker_response.parse(allocator, response_data); } @@ -241,7 +247,7 @@ pub fn probeWorker( ) bool { const protocol = worker_protocol.parse(worker_protocol_raw) orelse return false; - // Async protocols (mqtt/redis_stream) can't be probed via HTTP + // Async protocols (mqtt/redis_stream) can't be probed via HTTP; a2a is probed via its own endpoint if (protocol == .mqtt or protocol == .redis_stream) return true; const url = worker_protocol.buildRequestUrl(allocator, worker_url, protocol) catch return false; @@ -309,6 +315,9 @@ fn buildRequestBody( .messages = messages[0..], }, .{}); }, + .a2a => { + return buildA2aRequestBody(allocator, rendered_prompt, session_key); + }, .mqtt, .redis_stream => { // MQTT and Redis Stream use async dispatch; body built by their respective clients return std.json.Stringify.valueAlloc(allocator, .{ @@ -392,6 +401,174 @@ fn appendJsonString(buf: *std.ArrayListUnmanaged(u8), allocator: std.mem.Allocat try buf.append(allocator, '"'); } +// ── A2A Protocol Support ────────────────────────────────────────────── + +/// Build an A2A (Agent-to-Agent) JSON-RPC 2.0 request body using tasks/send. +/// The context_id provides session persistence — same context_id means same conversation. 
+fn buildA2aRequestBody( + allocator: std.mem.Allocator, + prompt: []const u8, + context_id: []const u8, +) ![]const u8 { + // Build the parts array + const parts = [_]struct { + type: []const u8, + text: []const u8, + }{ + .{ .type = "text", .text = prompt }, + }; + + // Build the message + const message = .{ + .role = "user", + .parts = parts[0..], + }; + + // Build the params + const params = .{ + .message = message, + .contextId = context_id, + }; + + // Build the full JSON-RPC request + return std.json.Stringify.valueAlloc(allocator, .{ + .jsonrpc = "2.0", + .id = context_id, + .method = "tasks/send", + .params = params, + }, .{}); +} + +/// Parse an A2A JSON-RPC 2.0 response and extract the text from the first artifact. +/// Expected structure: result.artifacts[0].parts[0].text (or .kind=="text") +/// Also checks for JSON-RPC error responses. +fn parseA2aResponse(allocator: std.mem.Allocator, response_body: []const u8) !DispatchResult { + const parsed = std.json.parseFromSlice(std.json.Value, allocator, response_body, .{}) catch { + return DispatchResult{ + .output = "", + .success = false, + .error_text = "A2A: invalid JSON response", + }; + }; + defer parsed.deinit(); + + if (parsed.value != .object) { + return DispatchResult{ + .output = "", + .success = false, + .error_text = "A2A: response is not a JSON object", + }; + } + const obj = parsed.value.object; + + // Check for JSON-RPC error + if (obj.get("error")) |err_val| { + if (err_val == .object) { + if (err_val.object.get("message")) |msg_val| { + if (msg_val == .string) { + return DispatchResult{ + .output = "", + .success = false, + .error_text = try allocator.dupe(u8, msg_val.string), + }; + } + } + } + return DispatchResult{ + .output = "", + .success = false, + .error_text = "A2A: JSON-RPC error", + }; + } + + // Extract result + const result_val = obj.get("result") orelse { + return DispatchResult{ + .output = "", + .success = false, + .error_text = "A2A: missing result field", + }; + }; + if 
(result_val != .object) { + return DispatchResult{ + .output = "", + .success = false, + .error_text = "A2A: result is not an object", + }; + } + const result_obj = result_val.object; + + // Check task status + if (result_obj.get("status")) |status_val| { + if (status_val == .object) { + if (status_val.object.get("state")) |state_val| { + if (state_val == .string) { + if (std.mem.eql(u8, state_val.string, "failed")) { + // Extract error message from status if available + if (status_val.object.get("message")) |msg| { + if (msg == .object) { + if (msg.object.get("parts")) |msg_parts| { + if (msg_parts == .array and msg_parts.array.items.len > 0) { + const first_part = msg_parts.array.items[0]; + if (first_part == .object) { + if (first_part.object.get("text")) |t| { + if (t == .string) { + return DispatchResult{ + .output = "", + .success = false, + .error_text = try allocator.dupe(u8, t.string), + }; + } + } + } + } + } + } + } + return DispatchResult{ + .output = "", + .success = false, + .error_text = "A2A: task failed", + }; + } + } + } + } + } + + // Extract text from artifacts[0].parts[0].text + if (result_obj.get("artifacts")) |artifacts_val| { + if (artifacts_val == .array and artifacts_val.array.items.len > 0) { + const first_artifact = artifacts_val.array.items[0]; + if (first_artifact == .object) { + if (first_artifact.object.get("parts")) |parts_val| { + if (parts_val == .array and parts_val.array.items.len > 0) { + const first_part = parts_val.array.items[0]; + if (first_part == .object) { + // Check for "text" field (A2A uses "text" key for text parts) + if (first_part.object.get("text")) |text_val| { + if (text_val == .string) { + return DispatchResult{ + .output = try allocator.dupe(u8, text_val.string), + .success = true, + .error_text = null, + }; + } + } + } + } + } + } + } + } + + return DispatchResult{ + .output = "", + .success = false, + .error_text = "A2A: no text found in artifacts", + }; +} + /// Build the wire-format JSON body for async 
(MQTT/Redis) dispatch. /// Includes correlation_id, reply_to topic/stream, timestamp, auth token, /// the rendered prompt, and a session_key matching the correlation_id. @@ -864,3 +1041,92 @@ test "dispatchRedis: invalid URL returns error" { try std.testing.expect(!result.success); try std.testing.expectEqualStrings("invalid redis:// URL", result.error_text.?); } + +test "buildA2aRequestBody: produces valid JSON-RPC 2.0 request" { + const allocator = std.testing.allocator; + const body = try buildA2aRequestBody(allocator, "Fix the bug in main.py", "run_abc_step_fix"); + defer allocator.free(body); + + const parsed = try std.json.parseFromSlice(std.json.Value, allocator, body, .{}); + defer parsed.deinit(); + const obj = parsed.value.object; + + try std.testing.expectEqualStrings("2.0", obj.get("jsonrpc").?.string); + try std.testing.expectEqualStrings("run_abc_step_fix", obj.get("id").?.string); + try std.testing.expectEqualStrings("tasks/send", obj.get("method").?.string); + + const params = obj.get("params").?.object; + try std.testing.expectEqualStrings("run_abc_step_fix", params.get("contextId").?.string); + + const message = params.get("message").?.object; + try std.testing.expectEqualStrings("user", message.get("role").?.string); + + const parts = message.get("parts").?.array; + try std.testing.expectEqual(@as(usize, 1), parts.items.len); + try std.testing.expectEqualStrings("text", parts.items[0].object.get("type").?.string); + try std.testing.expectEqualStrings("Fix the bug in main.py", parts.items[0].object.get("text").?.string); +} + +test "buildRequestBody: a2a protocol produces JSON-RPC body" { + const allocator = std.testing.allocator; + const body = try buildRequestBody(allocator, .a2a, null, "run-1", "step-1", "hello agent", null); + defer allocator.free(body); + + const parsed = try std.json.parseFromSlice(std.json.Value, allocator, body, .{}); + defer parsed.deinit(); + const obj = parsed.value.object; + + try std.testing.expectEqualStrings("2.0", 
obj.get("jsonrpc").?.string); + try std.testing.expectEqualStrings("tasks/send", obj.get("method").?.string); + // context_id is "run_{run_id}_step_{step_id}" + try std.testing.expectEqualStrings("run_run-1_step_step-1", obj.get("id").?.string); +} + +test "parseA2aResponse: extracts text from successful response" { + const allocator = std.testing.allocator; + const response = + \\{"jsonrpc":"2.0","id":"req-1","result":{"id":"task-1","contextId":"ctx-1","status":{"state":"completed","timestamp":"2025-01-01T00:00:00Z"},"artifacts":[{"artifactId":"a1","parts":[{"kind":"text","text":"The bug has been fixed."}]}]}} + ; + const result = try parseA2aResponse(allocator, response); + defer allocator.free(result.output); + try std.testing.expect(result.success); + try std.testing.expectEqualStrings("The bug has been fixed.", result.output); +} + +test "parseA2aResponse: handles JSON-RPC error" { + const allocator = std.testing.allocator; + const response = + \\{"jsonrpc":"2.0","id":"req-1","error":{"code":-32600,"message":"Invalid Request"}} + ; + const result = try parseA2aResponse(allocator, response); + defer allocator.free(result.error_text.?); + try std.testing.expect(!result.success); + try std.testing.expectEqualStrings("Invalid Request", result.error_text.?); +} + +test "parseA2aResponse: handles failed task status" { + const allocator = std.testing.allocator; + const response = + \\{"jsonrpc":"2.0","id":"req-1","result":{"id":"task-1","status":{"state":"failed"}}} + ; + const result = try parseA2aResponse(allocator, response); + try std.testing.expect(!result.success); + try std.testing.expectEqualStrings("A2A: task failed", result.error_text.?); +} + +test "parseA2aResponse: handles missing artifacts" { + const allocator = std.testing.allocator; + const response = + \\{"jsonrpc":"2.0","id":"req-1","result":{"id":"task-1","status":{"state":"completed"}}} + ; + const result = try parseA2aResponse(allocator, response); + try std.testing.expect(!result.success); + try 
std.testing.expectEqualStrings("A2A: no text found in artifacts", result.error_text.?); +} + +test "parseA2aResponse: handles invalid JSON" { + const allocator = std.testing.allocator; + const result = try parseA2aResponse(allocator, "not json"); + try std.testing.expect(!result.success); + try std.testing.expectEqualStrings("A2A: invalid JSON response", result.error_text.?); +} diff --git a/src/engine.zig b/src/engine.zig index 19104a4..4dddbfb 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -535,7 +535,27 @@ pub const Engine = struct { } const required_tags = getNodeTags(alloc, node_json); - const selected_worker = try dispatch.selectWorker(alloc, worker_infos.items, required_tags); + const node_type = getNodeField(alloc, node_json, "type") orelse "task"; + const is_agent_node = std.mem.eql(u8, node_type, "agent"); + + // For agent nodes, prefer A2A-protocol workers first, then fall back to any worker + var selected_worker: ?dispatch.WorkerInfo = null; + if (is_agent_node) { + // Filter to A2A workers only + var a2a_workers: std.ArrayListUnmanaged(dispatch.WorkerInfo) = .empty; + for (worker_infos.items) |w| { + if (std.mem.eql(u8, w.protocol, "a2a")) { + try a2a_workers.append(alloc, w); + } + } + if (a2a_workers.items.len > 0) { + selected_worker = try dispatch.selectWorker(alloc, a2a_workers.items, required_tags); + } + } + // Fall back to any protocol if no A2A worker found (or not an agent node) + if (selected_worker == null) { + selected_worker = try dispatch.selectWorker(alloc, worker_infos.items, required_tags); + } if (selected_worker == null) { return TaskNodeResult{ .no_worker = {} }; } @@ -544,7 +564,6 @@ pub const Engine = struct { // 4. 
Create step record const step_id_buf = ids.generateId(); const step_id = try alloc.dupe(u8, &step_id_buf); - const node_type = getNodeField(alloc, node_json, "type") orelse "task"; try self.store.insertStep(step_id, run_row.id, node_name, node_type, "running", state_json, 1, null, null, null); try self.store.insertEvent(run_row.id, step_id, "step.running", "{}"); @@ -552,7 +571,11 @@ pub const Engine = struct { metrics_mod.Metrics.incr(&m.steps_claimed_total); } - // 5. Dispatch to worker + // 5. Dispatch to worker (A2A protocol for agent nodes with A2A workers, + // or standard protocol dispatch for task nodes / fallback) + if (is_agent_node and std.mem.eql(u8, worker.protocol, "a2a")) { + log.info("agent node {s} dispatching via A2A to worker {s}", .{ node_name, worker.id }); + } const result = try dispatch.dispatchStep( alloc, worker.url, diff --git a/src/worker_protocol.zig b/src/worker_protocol.zig index d560d38..59be6a6 100644 --- a/src/worker_protocol.zig +++ b/src/worker_protocol.zig @@ -6,6 +6,7 @@ pub const Protocol = enum { openai_chat, mqtt, redis_stream, + a2a, }; pub fn parse(raw: []const u8) ?Protocol { @@ -14,26 +15,27 @@ pub fn parse(raw: []const u8) ?Protocol { if (std.mem.eql(u8, raw, "openai_chat")) return .openai_chat; if (std.mem.eql(u8, raw, "mqtt")) return .mqtt; if (std.mem.eql(u8, raw, "redis_stream")) return .redis_stream; + if (std.mem.eql(u8, raw, "a2a")) return .a2a; return null; } pub fn requiresModel(protocol: Protocol) bool { return switch (protocol) { .openai_chat => true, - .webhook, .api_chat, .mqtt, .redis_stream => false, + .webhook, .api_chat, .mqtt, .redis_stream, .a2a => false, }; } pub fn requiresExplicitPath(protocol: Protocol) bool { return switch (protocol) { .webhook => true, - .api_chat, .openai_chat, .mqtt, .redis_stream => false, + .api_chat, .openai_chat, .mqtt, .redis_stream, .a2a => false, }; } pub fn validateUrlForProtocol(url: []const u8, protocol: Protocol) bool { - // mqtt and redis_stream URLs are validated 
by their own parsers - if (protocol == .mqtt or protocol == .redis_stream) return true; + // mqtt, redis_stream, and a2a URLs are validated by their own parsers / have fixed paths + if (protocol == .mqtt or protocol == .redis_stream or protocol == .a2a) return true; if (!requiresExplicitPath(protocol)) return true; return hasExplicitPath(url); } @@ -47,6 +49,9 @@ pub fn buildRequestUrl( if (requiresExplicitPath(protocol) and !hasExplicitPath(trimmed)) { return error.WebhookUrlPathRequired; } + if (protocol == .a2a) { + return try std.fmt.allocPrint(allocator, "{s}/a2a", .{trimmed}); + } return try allocator.dupe(u8, trimmed); } @@ -140,6 +145,7 @@ test "parse protocol supports known values" { try std.testing.expectEqual(Protocol.webhook, parse("webhook").?); try std.testing.expectEqual(Protocol.api_chat, parse("api_chat").?); try std.testing.expectEqual(Protocol.openai_chat, parse("openai_chat").?); + try std.testing.expectEqual(Protocol.a2a, parse("a2a").?); try std.testing.expect(parse("unknown") == null); } @@ -173,11 +179,25 @@ test "validateUrlForProtocol enforces protocol-specific constraints" { try std.testing.expect(validateUrlForProtocol("http://localhost:42617/api/chat", .api_chat)); try std.testing.expect(validateUrlForProtocol("mqtt://broker:1883/topic", .mqtt)); try std.testing.expect(validateUrlForProtocol("redis://redis:6379/stream", .redis_stream)); + try std.testing.expect(validateUrlForProtocol("http://localhost:3000", .a2a)); +} + +test "buildRequestUrl appends /a2a for a2a protocol" { + const allocator = std.testing.allocator; + const url = try buildRequestUrl(allocator, "http://localhost:3000", .a2a); + defer allocator.free(url); + try std.testing.expectEqualStrings("http://localhost:3000/a2a", url); + + // Trailing slash is trimmed before appending /a2a + const url2 = try buildRequestUrl(allocator, "http://localhost:3000/", .a2a); + defer allocator.free(url2); + try std.testing.expectEqualStrings("http://localhost:3000/a2a", url2); } -test 
"parse supports mqtt and redis_stream" { +test "parse supports mqtt, redis_stream, and a2a" { try std.testing.expectEqual(Protocol.mqtt, parse("mqtt").?); try std.testing.expectEqual(Protocol.redis_stream, parse("redis_stream").?); + try std.testing.expectEqual(Protocol.a2a, parse("a2a").?); } test "parseMqttUrl extracts host, port, topic" { From 6654f51b246c8e377e3e696c0fe0e4dad7c7f930 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 20:25:08 -0300 Subject: [PATCH 13/55] feat: add subgraph type, add_messages reducer, run config/parent columns --- src/migrations/004_orchestration.sql | 4 + src/state.zig | 190 +++++++++++++++++++++++++++ src/types.zig | 2 + 3 files changed, 196 insertions(+) diff --git a/src/migrations/004_orchestration.sql b/src/migrations/004_orchestration.sql index 30ffa68..29f3332 100644 --- a/src/migrations/004_orchestration.sql +++ b/src/migrations/004_orchestration.sql @@ -60,3 +60,7 @@ ALTER TABLE steps ADD COLUMN state_before_json TEXT; ALTER TABLE steps ADD COLUMN state_after_json TEXT; ALTER TABLE steps ADD COLUMN state_updates_json TEXT; -- NOTE: parent_step_id already exists from 001_init.sql — do NOT add it again + +-- Subgraph support: parent run linkage and per-run config +ALTER TABLE runs ADD COLUMN parent_run_id TEXT REFERENCES runs(id); +ALTER TABLE runs ADD COLUMN config_json TEXT; diff --git a/src/state.zig b/src/state.zig index fe66131..3c6ab0b 100644 --- a/src/state.zig +++ b/src/state.zig @@ -60,6 +60,9 @@ pub fn applyReducer(alloc: Allocator, reducer: ReducerType, old_value_json: ?[]c .max => { return try applyMax(alloc, old_value_json, update_json); }, + .add_messages => { + return try applyAddMessages(alloc, old_value_json, update_json); + }, } } @@ -456,6 +459,115 @@ fn applyMax(alloc: Allocator, old_json: ?[]const u8, update_json: []const u8) ![ return try formatFloat(alloc, @max(old_val, update_val)); } +/// add_messages: merge message arrays by "id" field. 
+/// - If old is null → wrap update in array +/// - If update msg has "remove": true → remove matching id from old +/// - If update msg "id" matches existing → replace in-place +/// - If update msg "id" doesn't match → append +/// - If update msg has no "id" → generate one and append +fn applyAddMessages(alloc: Allocator, old_json: ?[]const u8, update_json: []const u8) ![]const u8 { + var arena = std.heap.ArenaAllocator.init(alloc); + defer arena.deinit(); + const arena_alloc = arena.allocator(); + + // Parse update: single object or array of objects + const update_parsed = try json.parseFromSlice(json.Value, arena_alloc, update_json, .{}); + var update_msgs = json.Array.init(arena_alloc); + if (update_parsed.value == .array) { + for (update_parsed.value.array.items) |item| { + try update_msgs.append(item); + } + } else if (update_parsed.value == .object) { + try update_msgs.append(update_parsed.value); + } else { + return try alloc.dupe(u8, update_json); + } + + // Parse old array or start empty + var result_msgs = json.Array.init(arena_alloc); + if (old_json) |old| { + if (old.len > 0) { + const old_parsed = try json.parseFromSlice(json.Value, arena_alloc, old, .{}); + if (old_parsed.value == .array) { + for (old_parsed.value.array.items) |item| { + try result_msgs.append(item); + } + } + } + } + + // Process each update message + for (update_msgs.items) |msg| { + if (msg != .object) continue; + + const msg_id: ?[]const u8 = blk: { + if (msg.object.get("id")) |id_val| { + if (id_val == .string) break :blk id_val.string; + } + break :blk null; + }; + + // Check for remove flag + const is_remove = blk: { + if (msg.object.get("remove")) |rm_val| { + if (rm_val == .bool) break :blk rm_val.bool; + } + break :blk false; + }; + + if (is_remove) { + if (msg_id) |id| { + // Filter out the message with matching id + var filtered = json.Array.init(arena_alloc); + for (result_msgs.items) |existing| { + if (existing == .object) { + if (existing.object.get("id")) |eid| { + if 
(eid == .string and std.mem.eql(u8, eid.string, id)) { + continue; // skip — removing this message + } + } + } + try filtered.append(existing); + } + result_msgs = filtered; + } + continue; + } + + if (msg_id) |id| { + // Try to find and replace existing message with same id + var replaced = false; + for (result_msgs.items, 0..) |existing, i| { + if (existing == .object) { + if (existing.object.get("id")) |eid| { + if (eid == .string and std.mem.eql(u8, eid.string, id)) { + result_msgs.items[i] = msg; + replaced = true; + break; + } + } + } + } + if (!replaced) { + try result_msgs.append(msg); + } + } else { + // No id — generate one and append + var msg_copy = json.ObjectMap.init(arena_alloc); + var it = msg.object.iterator(); + while (it.next()) |entry| { + try msg_copy.put(entry.key_ptr.*, entry.value_ptr.*); + } + const gen_id = try std.fmt.allocPrint(arena_alloc, "msg_{d}", .{result_msgs.items.len}); + try msg_copy.put("id", json.Value{ .string = gen_id }); + try result_msgs.append(json.Value{ .object = msg_copy }); + } + } + + const result = try serializeValue(arena_alloc, json.Value{ .array = result_msgs }); + return try alloc.dupe(u8, result); +} + // ── Custom errors ───────────────────────────────────────────────────── const InvalidNumber = error{InvalidNumber}; @@ -672,3 +784,81 @@ test "stringifyForRoute string" { defer alloc.free(result); try std.testing.expectEqualStrings("hello world", result); } + +test "add_messages reducer - append new" { + const alloc = std.testing.allocator; + const result = try applyReducer(alloc, .add_messages, + \\[{"id":"1","text":"hello"}] + , + \\{"id":"2","text":"world"} + ); + defer alloc.free(result); + // Parse and verify: should be array with 2 messages + const parsed = try parseTestJson(alloc, result); + defer parsed.deinit(); + try std.testing.expect(parsed.value == .array); + try std.testing.expectEqual(@as(usize, 2), parsed.value.array.items.len); + // First message id=1 + const m0 = parsed.value.array.items[0]; + 
try std.testing.expect(m0 == .object); + const id0 = m0.object.get("id") orelse return error.TestUnexpectedResult; + try std.testing.expectEqualStrings("1", id0.string); + // Second message id=2 + const m1 = parsed.value.array.items[1]; + try std.testing.expect(m1 == .object); + const id1 = m1.object.get("id") orelse return error.TestUnexpectedResult; + try std.testing.expectEqualStrings("2", id1.string); + const text1 = m1.object.get("text") orelse return error.TestUnexpectedResult; + try std.testing.expectEqualStrings("world", text1.string); +} + +test "add_messages reducer - replace by id" { + const alloc = std.testing.allocator; + const result = try applyReducer(alloc, .add_messages, + \\[{"id":"1","text":"old"}] + , + \\{"id":"1","text":"new"} + ); + defer alloc.free(result); + const parsed = try parseTestJson(alloc, result); + defer parsed.deinit(); + try std.testing.expect(parsed.value == .array); + try std.testing.expectEqual(@as(usize, 1), parsed.value.array.items.len); + const m0 = parsed.value.array.items[0]; + const text = m0.object.get("text") orelse return error.TestUnexpectedResult; + try std.testing.expectEqualStrings("new", text.string); +} + +test "add_messages reducer - remove by id" { + const alloc = std.testing.allocator; + const result = try applyReducer(alloc, .add_messages, + \\[{"id":"1","text":"hello"},{"id":"2","text":"world"}] + , + \\{"id":"1","remove":true} + ); + defer alloc.free(result); + const parsed = try parseTestJson(alloc, result); + defer parsed.deinit(); + try std.testing.expect(parsed.value == .array); + try std.testing.expectEqual(@as(usize, 1), parsed.value.array.items.len); + const m0 = parsed.value.array.items[0]; + const id0 = m0.object.get("id") orelse return error.TestUnexpectedResult; + try std.testing.expectEqualStrings("2", id0.string); +} + +test "add_messages reducer - null old" { + const alloc = std.testing.allocator; + const result = try applyReducer(alloc, .add_messages, null, + \\{"id":"1","text":"first"} + 
); + defer alloc.free(result); + const parsed = try parseTestJson(alloc, result); + defer parsed.deinit(); + try std.testing.expect(parsed.value == .array); + try std.testing.expectEqual(@as(usize, 1), parsed.value.array.items.len); + const m0 = parsed.value.array.items[0]; + const id0 = m0.object.get("id") orelse return error.TestUnexpectedResult; + try std.testing.expectEqualStrings("1", id0.string); + const text0 = m0.object.get("text") orelse return error.TestUnexpectedResult; + try std.testing.expectEqualStrings("first", text0.string); +} diff --git a/src/types.zig b/src/types.zig index b334250..93d3d1d 100644 --- a/src/types.zig +++ b/src/types.zig @@ -53,6 +53,7 @@ pub const StepType = enum { agent, send, transform, + subgraph, pub fn toString(self: StepType) []const u8 { return @tagName(self); @@ -287,6 +288,7 @@ pub const ReducerType = enum { add, min, max, + add_messages, pub fn toString(self: ReducerType) []const u8 { return @tagName(self); From 5a97bd1c1bfba48282f4cfcf2e83bd31243d47c2 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 20:36:07 -0300 Subject: [PATCH 14/55] feat: command goto, breakpoints, subgraph, multi-turn, configurable runs --- src/engine.zig | 642 ++++++++++++++++++++++++++++++++++++++++++++++++- src/store.zig | 52 +++- src/types.zig | 2 + 3 files changed, 682 insertions(+), 14 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index 4dddbfb..f123eb4 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -1,8 +1,8 @@ /// DAG Engine — Unified State Model Scheduler /// /// The engine runs on its own thread, polling the database for active runs -/// and processing them using a graph-based state model with 6 node types: -/// task, route, interrupt, agent, send, transform +/// and processing them using a graph-based state model with 7 node types: +/// task, route, interrupt, agent, send, transform, subgraph /// /// Each tick: /// 1. Get active runs (status = running) @@ -14,6 +14,14 @@ /// e. 
Execute ready nodes in sequence /// f. Apply state updates via reducers, save checkpoint /// g. Check termination / deadlock +/// +/// Features: +/// - Command primitive (goto): worker responses can contain "goto" to override routing +/// - Breakpoints: interrupt_before / interrupt_after arrays in workflow definition +/// - Subgraph: inline execution of child workflows with input/output mapping +/// - Multi-turn: agent nodes can loop with continuation_prompt up to max_turns +/// - Configurable runs: config stored as state.__config, accessible via templates +/// - Reconciliation: check nulltickets task status between steps const std = @import("std"); const log = std.log.scoped(.engine); const json = std.json; @@ -145,12 +153,46 @@ pub const Engine = struct { // ── processRun — state-based graph execution ───────────────────── fn processRun(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow) !void { + return self.processRunWithDepth(alloc, run_row, 0); + } + + /// Wrapper for inline subgraph execution. Uses anyerror to break + /// the recursive inferred-error-set cycle. + fn processRunInline(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, recursion_depth: u32) void { + self.processRunWithDepth(alloc, run_row, recursion_depth) catch |err| { + log.err("inline subgraph run {s} failed: {}", .{ run_row.id, err }); + }; + } + + fn processRunWithDepth(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, recursion_depth: u32) !void { // 1. Load current state - const current_state = run_row.state_json orelse "{}"; + var current_state = run_row.state_json orelse "{}"; + + // 1b. 
Inject __config into state (configurable runs) + if (run_row.config_json) |config_str| { + if (config_str.len > 0) { + const config_update = std.fmt.allocPrint(alloc, "{{\"__config\":{s}}}", .{config_str}) catch null; + if (config_update) |cu| { + // Simple merge: parse state, add __config key + const merged = state_mod.applyUpdates(alloc, current_state, cu, "{}") catch null; + if (merged) |m| { + current_state = m; + } + } + } + } // 2. Load workflow definition const workflow_json = run_row.workflow_json; + // 2b. Parse breakpoint lists from workflow definition + const interrupt_before = parseBreakpointList(alloc, workflow_json, "interrupt_before"); + const interrupt_after = parseBreakpointList(alloc, workflow_json, "interrupt_after"); + + // 2c. Get tracker URL for reconciliation + const tracker_url = getWorkflowField(alloc, workflow_json, "tracker_url"); + const task_id = getWorkflowField(alloc, workflow_json, "task_id"); + // 3. Get completed nodes from latest checkpoint var completed_nodes = std.StringHashMap(void).init(alloc); var route_results = std.StringHashMap([]const u8).init(alloc); @@ -195,9 +237,14 @@ pub const Engine = struct { // 4. 
Main execution loop: find ready nodes, execute, repeat var running_state: []const u8 = try alloc.dupe(u8, current_state); var max_iterations: u32 = 1000; // safety guard against infinite loops + var goto_ready: ?[]const []const u8 = null; // goto override from command primitive while (max_iterations > 0) : (max_iterations -= 1) { - const ready_nodes = try findReadyNodes(alloc, workflow_json, &completed_nodes, &route_results); + // Use goto override if set, otherwise find ready nodes normally + const ready_nodes = if (goto_ready) |gr| blk: { + goto_ready = null; + break :blk gr; + } else try findReadyNodes(alloc, workflow_json, &completed_nodes, &route_results); if (ready_nodes.len == 0) { // Check termination: if all paths reached __end__ if (completed_nodes.get("__end__") != null) { @@ -242,6 +289,7 @@ pub const Engine = struct { // 5. Execute ready nodes sequentially var made_progress = false; + var goto_override: ?[]const []const u8 = null; for (ready_nodes) |node_name| { if (std.mem.eql(u8, node_name, "__end__")) { @@ -267,6 +315,25 @@ pub const Engine = struct { return; } + // Breakpoint: interrupt_before check + if (isInBreakpointList(node_name, interrupt_before)) { + log.info("breakpoint interrupt_before at node {s} for run {s}", .{ node_name, run_row.id }); + version += 1; + const cp_id_buf = ids.generateId(); + const cp_id = try alloc.dupe(u8, &cp_id_buf); + const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); + const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const meta_json = try serializeRouteResults(alloc, &route_results); + try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); + try self.store.incrementCheckpointCount(run_row.id); + try self.store.updateRunState(run_row.id, running_state); + + try self.store.updateRunStatus(run_row.id, "interrupted", null); + try self.store.insertEvent(run_row.id, null, "run.interrupted", "{}"); + 
callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.interrupted", run_row.id, null, "{}", self.metrics); + return; + } + // Get node definition from workflow const node_json = getNodeJson(alloc, workflow_json, node_name) orelse { log.err("node {s} not found in workflow for run {s}", .{ node_name, run_row.id }); @@ -374,6 +441,24 @@ pub const Engine = struct { } try completed_nodes.put(try alloc.dupe(u8, node_name), {}); + + // Handle goto command: override next ready nodes + if (cr.goto_targets) |targets| { + // Validate goto targets exist in workflow + var valid_targets: std.ArrayListUnmanaged([]const u8) = .empty; + for (targets) |target| { + if (std.mem.eql(u8, target, "__end__") or getNodeJson(alloc, workflow_json, target) != null) { + try valid_targets.append(alloc, target); + } else { + log.warn("goto target {s} not found in workflow, skipping", .{target}); + } + } + if (valid_targets.items.len > 0) { + goto_override = try valid_targets.toOwnedSlice(alloc); + log.info("task node {s} goto: {d} targets", .{ node_name, goto_override.?.len }); + } + } + log.info("task node {s} completed for run {s}", .{ node_name, run_row.id }); }, .async_pending => { @@ -416,6 +501,33 @@ pub const Engine = struct { return; }, } + } else if (std.mem.eql(u8, node_type, "subgraph")) { + // Subgraph: execute child workflow inline + const result = try self.executeSubgraphNode(alloc, run_row, node_name, node_json, running_state, recursion_depth); + + switch (result) { + .completed => |cr| { + if (cr.state_updates) |updates| { + const schema_json = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { + log.err("subgraph node {s} failed to apply updates: {}", .{ node_name, err }); + try self.store.updateRunStatus(run_row.id, "failed", "subgraph state update failed"); + return; + }; + running_state = new_state; + } + try completed_nodes.put(try alloc.dupe(u8, node_name), 
{}); + log.info("subgraph node {s} completed for run {s}", .{ node_name, run_row.id }); + }, + .failed => |err_text| { + log.err("subgraph node {s} failed: {s}", .{ node_name, err_text }); + try self.store.updateRunStatus(run_row.id, "failed", err_text); + try self.store.insertEvent(run_row.id, null, "run.failed", "{}"); + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.failed", run_row.id, null, "{}", self.metrics); + return; + }, + else => {}, + } } else if (std.mem.eql(u8, node_type, "send")) { // Send: read items from state, dispatch target_node per item const result = try self.executeSendNode(alloc, run_row, node_name, node_json, running_state); @@ -436,6 +548,37 @@ pub const Engine = struct { return; } + // Breakpoint: interrupt_after check + if (isInBreakpointList(node_name, interrupt_after)) { + log.info("breakpoint interrupt_after at node {s} for run {s}", .{ node_name, run_row.id }); + // Save checkpoint with updated state first + version += 1; + const bp_cp_id_buf = ids.generateId(); + const bp_cp_id = try alloc.dupe(u8, &bp_cp_id_buf); + const bp_cn_json = try serializeCompletedNodes(alloc, &completed_nodes); + const bp_parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const bp_meta_json = try serializeRouteResults(alloc, &route_results); + try self.store.createCheckpoint(bp_cp_id, run_row.id, node_name, bp_parent_id, running_state, bp_cn_json, version, bp_meta_json); + try self.store.incrementCheckpointCount(run_row.id); + try self.store.updateRunState(run_row.id, running_state); + + try self.store.updateRunStatus(run_row.id, "interrupted", null); + try self.store.insertEvent(run_row.id, null, "run.interrupted", "{}"); + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.interrupted", run_row.id, null, "{}", self.metrics); + return; + } + + // Reconciliation: check tracker task status between steps + if (tracker_url != null and task_id != null) { + if (!reconcileWithTracker(alloc, tracker_url.?, task_id.?)) { 
+ log.info("run {s} cancelled by reconciliation", .{run_row.id}); + try self.store.updateRunStatus(run_row.id, "failed", "cancelled by tracker reconciliation"); + try self.store.insertEvent(run_row.id, null, "run.failed", "{\"reason\":\"tracker_cancelled\"}"); + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.failed", run_row.id, null, "{}", self.metrics); + return; + } + } + // Save checkpoint after each node made_progress = true; version += 1; @@ -449,6 +592,11 @@ pub const Engine = struct { try self.store.updateRunState(run_row.id, running_state); } + // If goto override is set, use it for next iteration instead of findReadyNodes + if (goto_override) |targets| { + goto_ready = targets; + } + // If no progress was made in this iteration, break if (!made_progress) break; } // end while loop @@ -459,6 +607,7 @@ pub const Engine = struct { const TaskNodeResult = union(enum) { completed: struct { state_updates: ?[]const u8, + goto_targets: ?[]const []const u8 = null, }, async_pending: void, no_worker: void, @@ -597,8 +746,46 @@ pub const Engine = struct { // 7. Handle result if (result.success) { - const output = result.output; - const output_json = try wrapOutput(alloc, output); + var final_output = result.output; + + // 7a. 
Multi-turn continuation for agent nodes + if (is_agent_node) { + const max_turns_val = getNodeFieldInt(alloc, node_json, "max_turns"); + const continuation_prompt = getNodeField(alloc, node_json, "continuation_prompt"); + + if (max_turns_val != null and continuation_prompt != null) { + const mt = max_turns_val.?; + const max_turns: u32 = @intCast(@min(@max(mt, 1), 100)); + if (max_turns > 1) { + var turn: u32 = 1; + while (turn < max_turns) : (turn += 1) { + // Consume pending injections between turns + const injections = self.store.consumePendingInjections(alloc, run_row.id, node_name) catch &.{}; + _ = injections; + + // Render continuation prompt + const cont_rendered = templates.renderTemplate(alloc, continuation_prompt.?, state_json, run_row.input_json, null) catch break; + + const cont_result = try dispatch.dispatchStep( + alloc, + worker.url, + worker.token, + worker.protocol, + worker.model, + run_row.id, + step_id, + cont_rendered, + ); + + if (!cont_result.success) break; + final_output = cont_result.output; + } + log.info("agent node {s} completed {d} turns", .{ node_name, turn }); + } + } + } + + const output_json = try wrapOutput(alloc, final_output); try self.store.updateStepStatus(step_id, "completed", worker.id, output_json, null, 1); try self.store.insertEvent(run_row.id, step_id, "step.completed", "{}"); try self.store.markWorkerSuccess(worker.id, ids.nowMs()); @@ -610,10 +797,13 @@ pub const Engine = struct { // Build state_updates from output // Try parsing as JSON with "state_updates" field, otherwise wrap output in "output" key - const state_updates = extractStateUpdates(alloc, output) orelse - try std.fmt.allocPrint(alloc, "{{\"output\":{s}}}", .{try jsonStringify(alloc, output)}); + const state_updates = extractStateUpdates(alloc, final_output) orelse + try std.fmt.allocPrint(alloc, "{{\"output\":{s}}}", .{try jsonStringify(alloc, final_output)}); - return TaskNodeResult{ .completed = .{ .state_updates = state_updates } }; + // Extract goto 
targets from output (command primitive) + const goto_targets = extractGotoTargets(alloc, final_output); + + return TaskNodeResult{ .completed = .{ .state_updates = state_updates, .goto_targets = goto_targets } }; } else { const err_text = result.error_text orelse "dispatch failed"; try self.store.updateStepStatus(step_id, "failed", worker.id, null, err_text, 1); @@ -638,6 +828,82 @@ pub const Engine = struct { } } + // ── executeSubgraphNode ───────────────────────────────────────── + + fn executeSubgraphNode(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, node_name: []const u8, node_json: []const u8, state_json: []const u8, recursion_depth: u32) !TaskNodeResult { + if (recursion_depth >= 10) { + log.err("subgraph node {s}: max recursion depth (10) exceeded", .{node_name}); + return TaskNodeResult{ .failed = "subgraph max recursion depth exceeded" }; + } + + // Get workflow_id + const workflow_id = getNodeField(alloc, node_json, "workflow_id") orelse { + log.err("subgraph node {s}: missing workflow_id", .{node_name}); + return TaskNodeResult{ .failed = "subgraph missing workflow_id" }; + }; + + // Load workflow definition from store + const workflow_row = try self.store.getWorkflow(alloc, workflow_id); + if (workflow_row == null) { + log.err("subgraph node {s}: workflow {s} not found", .{ node_name, workflow_id }); + return TaskNodeResult{ .failed = "subgraph workflow not found" }; + } + const definition = workflow_row.?.definition_json; + + // Build input state from parent state using input_mapping + const input_mapping_json = getNodeField(alloc, node_json, "input_mapping") orelse "{}"; + const child_input = buildSubgraphInput(alloc, state_json, input_mapping_json) catch "{}"; + + // Get schema from child workflow for initState + const child_schema = getWorkflowField(alloc, definition, "schema") orelse "{}"; + const child_state = state_mod.initState(alloc, child_input, child_schema) catch try alloc.dupe(u8, child_input); + + // Create child run + 
const child_id_buf = ids.generateId(); + const child_id = try alloc.dupe(u8, &child_id_buf); + try self.store.createRunWithState(child_id, workflow_id, definition, child_input, child_state); + try self.store.setParentRunId(child_id, run_row.id); + try self.store.updateRunStatus(child_id, "running", null); + + // Create step record for the subgraph node + const step_id_buf = ids.generateId(); + const step_id = try alloc.dupe(u8, &step_id_buf); + try self.store.insertStep(step_id, run_row.id, node_name, "subgraph", "running", "{}", 1, null, null, null); + try self.store.insertEvent(run_row.id, step_id, "step.running", "{}"); + + // Execute child run inline (recursive call to processRunWithDepth) + const child_run = (try self.store.getRun(alloc, child_id)).?; + self.processRunInline(alloc, child_run, recursion_depth + 1); + + // Check child run result + const completed_child = (try self.store.getRun(alloc, child_id)).?; + if (!std.mem.eql(u8, completed_child.status, "completed")) { + const child_error = completed_child.error_text orelse "subgraph did not complete"; + try self.store.updateStepStatus(step_id, "failed", null, null, child_error, 1); + return TaskNodeResult{ .failed = child_error }; + } + + // Extract output_key from child's final state + const output_key = getNodeField(alloc, node_json, "output_key") orelse "output"; + const child_final_state = completed_child.state_json orelse "{}"; + + // Get the value at output_key from child state + const output_path = try std.fmt.allocPrint(alloc, "state.{s}", .{output_key}); + const output_value = state_mod.getStateValue(alloc, child_final_state, output_path) catch null; + + // Build state_updates: {output_key: value} + const state_updates = if (output_value) |val| + try std.fmt.allocPrint(alloc, "{{\"{s}\":{s}}}", .{ output_key, val }) + else + try std.fmt.allocPrint(alloc, "{{\"{s}\":null}}", .{output_key}); + + try self.store.updateStepStatus(step_id, "completed", null, state_updates, null, 1); + try 
self.store.insertEvent(run_row.id, step_id, "step.completed", "{}"); + + log.info("subgraph node {s} completed (child run {s})", .{ node_name, child_id }); + return TaskNodeResult{ .completed = .{ .state_updates = state_updates } }; + } + // ── executeSendNode ────────────────────────────────────────────── fn executeSendNode(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, node_name: []const u8, node_json: []const u8, state_json: []const u8) !SendNodeResult { @@ -1129,6 +1395,127 @@ fn extractStateUpdates(alloc: std.mem.Allocator, output: []const u8) ?[]const u8 return serializeJsonValue(alloc, su) catch null; } +/// Extract "goto" field from worker output JSON. +/// Returns array of target node names. Supports: +/// - "goto": "node_name" -> ["node_name"] +/// - "goto": ["node_a", "node_b"] -> ["node_a", "node_b"] +fn extractGotoTargets(alloc: std.mem.Allocator, output: []const u8) ?[]const []const u8 { + const parsed = json.parseFromSlice(json.Value, alloc, output, .{}) catch return null; + if (parsed.value != .object) return null; + const goto_val = parsed.value.object.get("goto") orelse return null; + + var targets: std.ArrayListUnmanaged([]const u8) = .empty; + if (goto_val == .string) { + targets.append(alloc, goto_val.string) catch return null; + } else if (goto_val == .array) { + for (goto_val.array.items) |item| { + if (item == .string) { + targets.append(alloc, item.string) catch continue; + } + } + } else { + return null; + } + + if (targets.items.len == 0) return null; + return targets.toOwnedSlice(alloc) catch null; +} + +/// Parse interrupt_before / interrupt_after arrays from workflow definition. 
+fn parseBreakpointList(alloc: std.mem.Allocator, workflow_json: []const u8, field: []const u8) []const []const u8 { + const parsed = json.parseFromSlice(json.Value, alloc, workflow_json, .{}) catch return &.{}; + if (parsed.value != .object) return &.{}; + const arr_val = parsed.value.object.get(field) orelse return &.{}; + if (arr_val != .array) return &.{}; + + var result: std.ArrayListUnmanaged([]const u8) = .empty; + for (arr_val.array.items) |item| { + if (item == .string) { + result.append(alloc, item.string) catch continue; + } + } + return result.toOwnedSlice(alloc) catch &.{}; +} + +/// Check if a node name is in a breakpoint list. +fn isInBreakpointList(name: []const u8, list: []const []const u8) bool { + for (list) |item| { + if (std.mem.eql(u8, name, item)) return true; + } + return false; +} + +/// Get an integer field from a node's JSON. +fn getNodeFieldInt(alloc: std.mem.Allocator, node_json: []const u8, field: []const u8) ?i64 { + const parsed = json.parseFromSlice(json.Value, alloc, node_json, .{}) catch return null; + if (parsed.value != .object) return null; + const val = parsed.value.object.get(field) orelse return null; + if (val == .integer) return val.integer; + return null; +} + +/// Build subgraph input state from parent state using input_mapping. 
+/// input_mapping is {"child_key": "state.parent_key", ...} +fn buildSubgraphInput(alloc: std.mem.Allocator, parent_state: []const u8, input_mapping_json: []const u8) ![]const u8 { + const mapping_parsed = json.parseFromSlice(json.Value, alloc, input_mapping_json, .{}) catch return try alloc.dupe(u8, "{}"); + if (mapping_parsed.value != .object) return try alloc.dupe(u8, "{}"); + + var result = json.ObjectMap.init(alloc); + var it = mapping_parsed.value.object.iterator(); + while (it.next()) |entry| { + const child_key = entry.key_ptr.*; + const parent_path = if (entry.value_ptr.* == .string) entry.value_ptr.string else continue; + + // Resolve the value from parent state + if (state_mod.getStateValue(alloc, parent_state, parent_path) catch null) |value_str| { + const val_parsed = json.parseFromSlice(json.Value, alloc, value_str, .{}) catch continue; + try result.put(child_key, val_parsed.value); + } + } + + return serializeJsonValue(alloc, .{ .object = result }); +} + +/// Reconcile with nulltickets: check if associated task has been cancelled. +/// Returns true if the run should continue, false if it should be cancelled. 
+fn reconcileWithTracker(alloc: std.mem.Allocator, tracker_url: []const u8, task_id: []const u8) bool { + const url = std.fmt.allocPrint(alloc, "{s}/tasks/{s}", .{ tracker_url, task_id }) catch return true; + defer alloc.free(url); + + var client: std.http.Client = .{ .allocator = alloc }; + defer client.deinit(); + + var response_body: std.io.Writer.Allocating = .init(alloc); + defer response_body.deinit(); + + const result = client.fetch(.{ + .location = .{ .url = url }, + .method = .GET, + .response_writer = &response_body.writer, + }) catch return true; // network errors -> continue + + const status_code = @intFromEnum(result.status); + if (status_code < 200 or status_code >= 300) return true; + + const body = response_body.written(); + const parsed = json.parseFromSlice(json.Value, alloc, body, .{}) catch return true; + if (parsed.value != .object) return true; + + const stage = parsed.value.object.get("stage") orelse return true; + if (stage != .string) return true; + + // Terminal states -> cancel + if (std.mem.eql(u8, stage.string, "done") or + std.mem.eql(u8, stage.string, "cancelled") or + std.mem.eql(u8, stage.string, "canceled")) + { + log.info("reconciliation: task {s} is in terminal state '{s}', cancelling run", .{ task_id, stage.string }); + return false; + } + + return true; +} + // ── Tests ───────────────────────────────────────────────────────────── test "Engine: init and stop" { @@ -1485,3 +1872,240 @@ test "extractStateUpdates returns null for plain text" { const result = extractStateUpdates(arena.allocator(), "just plain text"); try std.testing.expect(result == null); } + +test "extractGotoTargets: string target" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const output = + \\{"state_updates":{"x":1},"goto":"merge_step"} + ; + const targets = extractGotoTargets(arena.allocator(), output); + try std.testing.expect(targets != null); + try 
std.testing.expectEqual(@as(usize, 1), targets.?.len); + try std.testing.expectEqualStrings("merge_step", targets.?[0]); +} + +test "extractGotoTargets: array targets" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const output = + \\{"goto":["step_a","step_b"]} + ; + const targets = extractGotoTargets(arena.allocator(), output); + try std.testing.expect(targets != null); + try std.testing.expectEqual(@as(usize, 2), targets.?.len); + try std.testing.expectEqualStrings("step_a", targets.?[0]); + try std.testing.expectEqualStrings("step_b", targets.?[1]); +} + +test "extractGotoTargets: no goto field" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const targets = extractGotoTargets(arena.allocator(), "{\"state_updates\":{}}"); + try std.testing.expect(targets == null); +} + +test "extractGotoTargets: not JSON" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const targets = extractGotoTargets(arena.allocator(), "plain text"); + try std.testing.expect(targets == null); +} + +test "parseBreakpointList: valid list" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const wf = + \\{"interrupt_before":["review","merge"],"interrupt_after":["generate"],"nodes":{},"edges":[]} + ; + const before = parseBreakpointList(arena.allocator(), wf, "interrupt_before"); + try std.testing.expectEqual(@as(usize, 2), before.len); + try std.testing.expectEqualStrings("review", before[0]); + try std.testing.expectEqualStrings("merge", before[1]); + + const after = parseBreakpointList(arena.allocator(), wf, "interrupt_after"); + try std.testing.expectEqual(@as(usize, 1), after.len); + try std.testing.expectEqualStrings("generate", after[0]); +} + +test "parseBreakpointList: 
missing field" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const wf = + \\{"nodes":{},"edges":[]} + ; + const result = parseBreakpointList(arena.allocator(), wf, "interrupt_before"); + try std.testing.expectEqual(@as(usize, 0), result.len); +} + +test "isInBreakpointList" { + const list = [_][]const u8{ "review", "merge" }; + try std.testing.expect(isInBreakpointList("review", &list)); + try std.testing.expect(isInBreakpointList("merge", &list)); + try std.testing.expect(!isInBreakpointList("build", &list)); +} + +test "getNodeFieldInt: valid integer" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const node = + \\{"type":"agent","max_turns":10} + ; + const result = getNodeFieldInt(arena.allocator(), node, "max_turns"); + try std.testing.expect(result != null); + try std.testing.expectEqual(@as(i64, 10), result.?); +} + +test "getNodeFieldInt: missing field" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const node = + \\{"type":"task"} + ; + const result = getNodeFieldInt(arena.allocator(), node, "max_turns"); + try std.testing.expect(result == null); +} + +test "getNodeFieldInt: string field returns null" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const node = + \\{"type":"task","max_turns":"five"} + ; + const result = getNodeFieldInt(arena.allocator(), node, "max_turns"); + try std.testing.expect(result == null); +} + +test "buildSubgraphInput: maps values from parent state" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + const alloc = arena.allocator(); + + const parent_state = + \\{"fix_result":"patched code","count":42} + ; + const mapping = + 
\\{"code":"state.fix_result"} + ; + + const result = try buildSubgraphInput(alloc, parent_state, mapping); + const parsed = try json.parseFromSlice(json.Value, alloc, result, .{}); + try std.testing.expect(parsed.value == .object); + const code = parsed.value.object.get("code") orelse return error.TestUnexpectedResult; + try std.testing.expectEqualStrings("patched code", code.string); +} + +test "buildSubgraphInput: empty mapping" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const result = try buildSubgraphInput(arena.allocator(), "{\"x\":1}", "{}"); + try std.testing.expectEqualStrings("{}", result); +} + +test "engine: breakpoint interrupt_before stops run" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + // Workflow with interrupt_before on t1 + const wf = + \\{"interrupt_before":["t1"],"nodes":{"t1":{"type":"transform","updates":"{\"result\":\"done\"}"}},"edges":[["__start__","t1"],["t1","__end__"]],"schema":{"result":{"type":"string","reducer":"last_value"}}} + ; + + try store.createRunWithState("r1", null, wf, "{}", "{}"); + try store.updateRunStatus("r1", "running", null); + + var engine = Engine.init(&store, allocator, 500); + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const run_row = (try store.getRun(arena.allocator(), "r1")).?; + try engine.processRun(arena.allocator(), run_row); + + const updated_run = (try store.getRun(arena.allocator(), "r1")).?; + // Should be interrupted, not completed, because interrupt_before fires before t1 + try std.testing.expectEqualStrings("interrupted", updated_run.status); +} + +test "engine: breakpoint interrupt_after stops run after node" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + // Workflow with interrupt_after on t1; there's a t2 after t1 + 
const wf = + \\{"interrupt_after":["t1"],"nodes":{"t1":{"type":"transform","updates":"{\"x\":\"done\"}"},"t2":{"type":"transform","updates":"{\"y\":\"also\"}"}},"edges":[["__start__","t1"],["t1","t2"],["t2","__end__"]],"schema":{"x":{"type":"string","reducer":"last_value"},"y":{"type":"string","reducer":"last_value"}}} + ; + + try store.createRunWithState("r1", null, wf, "{}", "{}"); + try store.updateRunStatus("r1", "running", null); + + var engine = Engine.init(&store, allocator, 500); + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const run_row = (try store.getRun(arena.allocator(), "r1")).?; + try engine.processRun(arena.allocator(), run_row); + + const updated_run = (try store.getRun(arena.allocator(), "r1")).?; + // t1 should have executed (state contains x), but run is interrupted + try std.testing.expectEqualStrings("interrupted", updated_run.status); + // Verify t1's state was saved + if (updated_run.state_json) |sj| { + try std.testing.expect(std.mem.indexOf(u8, sj, "done") != null); + } +} + +test "engine: configurable runs inject __config" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + // Workflow with a transform that sets result + const wf = + \\{"nodes":{"t1":{"type":"transform","updates":"{\"result\":\"ok\"}"}},"edges":[["__start__","t1"],["t1","__end__"]],"schema":{"result":{"type":"string","reducer":"last_value"},"__config":{"type":"object","reducer":"last_value"}}} + ; + + try store.createRunWithState("r1", null, wf, "{}", "{}"); + try store.setConfigJson("r1", "{\"model\":\"gpt-4\"}"); + try store.updateRunStatus("r1", "running", null); + + var engine = Engine.init(&store, allocator, 500); + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const run_row = (try store.getRun(arena.allocator(), "r1")).?; + try engine.processRun(arena.allocator(), run_row); + + const updated_run = (try 
store.getRun(arena.allocator(), "r1")).?; + try std.testing.expectEqualStrings("completed", updated_run.status); + // Verify __config was injected into state + if (updated_run.state_json) |sj| { + try std.testing.expect(std.mem.indexOf(u8, sj, "__config") != null); + try std.testing.expect(std.mem.indexOf(u8, sj, "gpt-4") != null); + } +} diff --git a/src/store.zig b/src/store.zig index cb74ee8..390691d 100644 --- a/src/store.zig +++ b/src/store.zig @@ -403,7 +403,7 @@ pub const Store = struct { } pub fn getRun(self: *Self, allocator: std.mem.Allocator, id: []const u8) !?types.RunRow { - const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json FROM runs WHERE id = ?"; + const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json, config_json, parent_run_id FROM runs WHERE id = ?"; var stmt: ?*c.sqlite3_stmt = null; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; @@ -427,11 +427,13 @@ pub const Store = struct { .started_at_ms = colIntOpt(stmt, 9), .ended_at_ms = colIntOpt(stmt, 10), .state_json = try allocStrOpt(allocator, stmt, 11), + .config_json = try allocStrOpt(allocator, stmt, 12), + .parent_run_id = try allocStrOpt(allocator, stmt, 13), }; } pub fn getRunByIdempotencyKey(self: *Self, allocator: std.mem.Allocator, key: []const u8) !?types.RunRow { - const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json FROM runs WHERE idempotency_key = ? 
ORDER BY created_at_ms DESC LIMIT 1"; + const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json, config_json, parent_run_id FROM runs WHERE idempotency_key = ? ORDER BY created_at_ms DESC LIMIT 1"; var stmt: ?*c.sqlite3_stmt = null; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; @@ -454,13 +456,15 @@ pub const Store = struct { .started_at_ms = colIntOpt(stmt, 9), .ended_at_ms = colIntOpt(stmt, 10), .state_json = try allocStrOpt(allocator, stmt, 11), + .config_json = try allocStrOpt(allocator, stmt, 12), + .parent_run_id = try allocStrOpt(allocator, stmt, 13), }; } pub fn listRuns(self: *Self, allocator: std.mem.Allocator, status_filter: ?[]const u8, limit: i64, offset: i64) ![]types.RunRow { var stmt: ?*c.sqlite3_stmt = null; if (status_filter != null) { - const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json FROM runs WHERE status = ? ORDER BY created_at_ms DESC LIMIT ? OFFSET ?"; + const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json, config_json, parent_run_id FROM runs WHERE status = ? ORDER BY created_at_ms DESC LIMIT ? OFFSET ?"; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; } @@ -468,7 +472,7 @@ pub const Store = struct { _ = c.sqlite3_bind_int64(stmt, 2, limit); _ = c.sqlite3_bind_int64(stmt, 3, offset); } else { - const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json FROM runs ORDER BY created_at_ms DESC LIMIT ? 
OFFSET ?"; + const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json, config_json, parent_run_id FROM runs ORDER BY created_at_ms DESC LIMIT ? OFFSET ?"; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; } @@ -492,6 +496,8 @@ pub const Store = struct { .started_at_ms = colIntOpt(stmt, 9), .ended_at_ms = colIntOpt(stmt, 10), .state_json = try allocStrOpt(allocator, stmt, 11), + .config_json = try allocStrOpt(allocator, stmt, 12), + .parent_run_id = try allocStrOpt(allocator, stmt, 13), }); } return list.toOwnedSlice(allocator); @@ -516,7 +522,7 @@ pub const Store = struct { } pub fn getActiveRuns(self: *Self, allocator: std.mem.Allocator) ![]types.RunRow { - const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json FROM runs WHERE status IN ('running', 'paused') ORDER BY created_at_ms DESC"; + const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json, config_json, parent_run_id FROM runs WHERE status IN ('running', 'paused') ORDER BY created_at_ms DESC"; var stmt: ?*c.sqlite3_stmt = null; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; @@ -538,6 +544,8 @@ pub const Store = struct { .started_at_ms = colIntOpt(stmt, 9), .ended_at_ms = colIntOpt(stmt, 10), .state_json = try allocStrOpt(allocator, stmt, 11), + .config_json = try allocStrOpt(allocator, stmt, 12), + .parent_run_id = try allocStrOpt(allocator, stmt, 13), }); } return list.toOwnedSlice(allocator); @@ -1575,6 +1583,40 @@ pub const Store = struct { } } + pub fn setParentRunId(self: *Self, run_id: []const u8, parent_run_id: []const u8) !void { + const 
sql = "UPDATE runs SET parent_run_id = ?, updated_at_ms = ? WHERE id = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, parent_run_id.ptr, @intCast(parent_run_id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_int64(stmt, 2, ids.nowMs()); + _ = c.sqlite3_bind_text(stmt, 3, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + pub fn setConfigJson(self: *Self, run_id: []const u8, config_json: []const u8) !void { + const sql = "UPDATE runs SET config_json = ?, updated_at_ms = ? WHERE id = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, config_json.ptr, @intCast(config_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_int64(stmt, 2, ids.nowMs()); + _ = c.sqlite3_bind_text(stmt, 3, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + pub fn createForkedRun(self: *Self, id: []const u8, workflow_json: []const u8, state_json: []const u8, forked_from_run_id: []const u8, forked_from_checkpoint_id: []const u8) !void { const sql = "INSERT INTO runs (id, status, workflow_json, input_json, callbacks_json, state_json, forked_from_run_id, forked_from_checkpoint_id, created_at_ms, updated_at_ms) VALUES (?, 'pending', ?, '{}', '[]', ?, ?, ?, ?, ?)"; var stmt: ?*c.sqlite3_stmt = null; diff --git a/src/types.zig b/src/types.zig index 93d3d1d..17f1551 100644 --- a/src/types.zig +++ b/src/types.zig @@ -156,6 +156,8 @@ pub const RunRow = struct { started_at_ms: ?i64, ended_at_ms: ?i64, state_json: ?[]const u8 = null, + config_json: ?[]const u8 = null, + 
parent_run_id: ?[]const u8 = null, }; pub const StepRow = struct { From 83f70d387feba91e8195357f584b2dc31e485613 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 20:37:45 -0300 Subject: [PATCH 15/55] feat: add config template alias, phase 2 gaps spec --- .../2026-03-13-orchestration-gaps-design.md | 325 ++++++++++++++++++ src/templates.zig | 17 + 2 files changed, 342 insertions(+) create mode 100644 docs/superpowers/specs/2026-03-13-orchestration-gaps-design.md diff --git a/docs/superpowers/specs/2026-03-13-orchestration-gaps-design.md b/docs/superpowers/specs/2026-03-13-orchestration-gaps-design.md new file mode 100644 index 0000000..7d94355 --- /dev/null +++ b/docs/superpowers/specs/2026-03-13-orchestration-gaps-design.md @@ -0,0 +1,325 @@ +# Orchestration Gaps Design — Phase 2 + +**Date:** 2026-03-13 +**Status:** Draft +**Scope:** NullBoiler, NullTickets, NullHub +**Branch:** feat/orchestration (extends Phase 1) + +--- + +## Overview + +Phase 2 closes remaining gaps vs LangGraph and Symphony. No backward compatibility needed. + +--- + +## 1. Command Primitive + +Nodes can return `goto` alongside `state_updates` to control routing: + +```json +{ + "state_updates": {"review_grade": "approve"}, + "goto": "merge_step" +} +``` + +Engine behavior: if response contains `goto`, skip normal edge evaluation and jump directly to the named node. The node must exist in the workflow. `goto` can be a string (single node) or array (fan-out to multiple nodes). + +Worker response JSON: +```json +{"response": "Approved", "goto": "merge_step"} +``` + +Engine parses `goto` from worker response alongside the text response. For `task` and `agent` nodes only. `route`, `transform`, `interrupt` nodes don't use `goto`. + +--- + +## 2. 
Subgraphs + +New node type `subgraph`: + +```json +{ + "review_flow": { + "type": "subgraph", + "workflow_id": "code-review-workflow", + "input_mapping": { + "code": "state.fix_result", + "description": "state.task_description" + }, + "output_key": "review_result" + } +} +``` + +Engine behavior: +1. Load workflow definition from `workflows` table by `workflow_id` +2. Build subgraph input from parent state via `input_mapping` (key = subgraph input key, value = parent state path) +3. Create a child run with `createRunWithState()`, linking to parent via a new `parent_run_id` column +4. Execute child run to completion (inline, not spawning a separate engine tick loop — just call `processRun` recursively) +5. On completion, extract child's final state and write to parent's `output_key` +6. On failure, propagate failure to parent run + +### Schema changes + +```sql +ALTER TABLE runs ADD COLUMN parent_run_id TEXT REFERENCES runs(id); +``` + +### StepType update + +Add `subgraph` to StepType enum in types.zig. + +--- + +## 3. Breakpoints on Any Node + +Workflow-level config: + +```json +{ + "interrupt_before": ["review", "merge"], + "interrupt_after": ["generate"], + ... +} +``` + +Engine behavior: before executing a node, check if it's in `interrupt_before`. If so, save checkpoint and set run to `interrupted`. After executing a node, check `interrupt_after`. Same behavior. + +Resume works exactly like interrupt node resume — `POST /runs/{id}/resume` with optional `state_updates`. + +This is purely engine logic — no schema changes, no new API endpoints. + +--- + +## 4. 
Store API in NullTickets + +New table: + +```sql +CREATE TABLE store ( + namespace TEXT NOT NULL, + key TEXT NOT NULL, + value_json TEXT NOT NULL, + created_at_ms INTEGER NOT NULL, + updated_at_ms INTEGER NOT NULL, + PRIMARY KEY (namespace, key) +); +CREATE INDEX idx_store_namespace ON store(namespace); +``` + +### API endpoints + +``` +PUT /store/{namespace}/{key} — put (upsert) +GET /store/{namespace}/{key} — get single +GET /store/{namespace} — list all in namespace +DELETE /store/{namespace}/{key} — delete +DELETE /store/{namespace} — delete namespace +``` + +Request body for PUT: +```json +{"value": {"any": "json"}} +``` + +Response for GET: +```json +{ + "namespace": "user_123", + "key": "preferences", + "value": {"theme": "dark"}, + "created_at_ms": 1710300000000, + "updated_at_ms": 1710300005000 +} +``` + +### Usage from NullBoiler workflows + +New template syntax: `{{store.namespace.key}}` — engine fetches from nulltickets Store API during prompt rendering. + +New node type isn't needed — `task` nodes can read via template, and `transform` nodes can write via a new `store_updates` field: + +```json +{ + "save_context": { + "type": "transform", + "updates": {}, + "store_updates": { + "namespace": "project_context", + "key": "latest_review", + "value": "state.review_result" + } + } +} +``` + +Engine calls nulltickets `PUT /store/{namespace}/{key}` when `store_updates` is present. + +--- + +## 5. Multi-Turn Continuation + +Extend `agent` node with multi-turn support: + +```json +{ + "fix_bug": { + "type": "agent", + "prompt": "Fix this: {{state.task_description}}", + "continuation_prompt": "Task is still active. Continue from current state.", + "max_turns": 10, + "tags": ["coder"], + "output_key": "fix_result" + } +} +``` + +Engine behavior: +1. Turn 1: A2A `tasks/send` with rendered `prompt`, `contextId = "run_{id}_step_{name}"` +2. Parse response — check if agent indicated completion (response contains final answer, no pending tool calls) +3. 
If not complete and turn < `max_turns`: send `continuation_prompt` via A2A with same `contextId` (session persistence) +4. Repeat until complete or `max_turns` exhausted +5. Final response text → state_updates via `output_key` + +Between turns, engine can: +- Check if nulltickets task state changed (reconciliation) +- Apply pending state injections +- Broadcast SSE `agent_turn` events + +No schema changes needed — this is engine logic using existing A2A infrastructure. + +--- + +## 6. Configurable Runs + +Workflow JSON gets optional `defaults` section: + +```json +{ + "defaults": { + "model": "claude-sonnet-4-6", + "temperature": 0.7, + "max_agent_turns": 10 + }, + ... +} +``` + +Run creation accepts `config` overrides: + +``` +POST /workflows/{id}/run +{ + "input": {"task": "fix bug"}, + "config": {"model": "claude-opus-4-6", "temperature": 0.3} +} +``` + +Merged config (run overrides > workflow defaults) stored in `run.config_json`. + +Template access: `{{config.model}}`, `{{config.temperature}}`. + +### Schema changes + +```sql +ALTER TABLE runs ADD COLUMN config_json TEXT; +``` + +--- + +## 7. Per-State Concurrency in NullTickets + +Extend nulltickets claim endpoint to support per-state limits. + +Claim request gets optional `concurrency` parameter: + +``` +POST /leases/claim +{ + "agent_id": "boiler-01", + "agent_role": "coder", + "concurrency": { + "per_state": {"in_progress": 5, "rework": 2} + } +} +``` + +Claim logic: before returning a task, count currently-leased tasks in the same state. If at limit, skip to next eligible task. + +This is a nulltickets store.zig change in the claim query. + +--- + +## 8. Reconciliation + +Engine tick adds a reconciliation step for runs linked to nulltickets tasks: + +After each step completes, if `run.task_id` is set (pull-mode run): +1. Fetch current task state from nulltickets: `GET /tasks/{task_id}` +2. If task state changed to a terminal state → cancel the run +3. 
If task state changed to a different active state → update run metadata, continue
+
+This prevents wasted agent execution on tasks that humans already resolved.
+
+Engine logic only — no schema changes.
+
+---
+
+## 9. Workspace Reuse Per Issue
+
+In NullBoiler's tracker/workspace system, workspaces should be reused for the same nulltickets task:
+
+- Workspace directory name based on `task_id` (not `run_id`)
+- On new run for same task: reuse existing workspace (skip `after_create` hook, still run `before_run`)
+- On task completion: run `after_run` hook, keep workspace
+- On task terminal state + configurable cleanup: run `before_remove`, delete workspace
+
+This is a tracker.zig + workspace.zig change.
+
+---
+
+## 10. Message-Native State (add_messages reducer)
+
+New reducer type `add_messages`:
+
+```json
+{
+  "schema": {
+    "messages": {"type": "array", "reducer": "add_messages"}
+  }
+}
+```
+
+Behavior:
+- Each message has an `id` field
+- On update: if message with same `id` exists, replace it. Otherwise append.
+- Special: if update contains `{"remove": true, "id": "msg_123"}`, remove that message.
+- If message has no `id`, auto-generate one and append.
+
+This enables chat-history-aware workflows where messages can be updated or removed by ID.
+
+Implementation: new case in `state.zig` `applyReducer()`.
+
+### ReducerType update
+
+Add `add_messages` to ReducerType enum in types.zig.
+ +--- + +## Summary of Changes + +| Repo | Changes | +|------|---------| +| NullBoiler types.zig | Add `subgraph` to StepType, `add_messages` to ReducerType | +| NullBoiler engine.zig | Command goto, subgraph execution, breakpoints, multi-turn, reconciliation, store_updates | +| NullBoiler state.zig | add_messages reducer | +| NullBoiler store.zig | `parent_run_id` + `config_json` columns | +| NullBoiler api.zig | config in run creation, template store access | +| NullBoiler templates.zig | `{{store.X.Y}}`, `{{config.X}}` access | +| NullBoiler tracker.zig | Workspace reuse, reconciliation | +| nulltickets store.zig | Store KV CRUD, per-state concurrency in claim | +| nulltickets api.zig | Store endpoints, claim concurrency param | +| nulltickets migrations | Store table | +| nullhub UI | Store viewer page (optional) | diff --git a/src/templates.zig b/src/templates.zig index b2cd211..955945c 100644 --- a/src/templates.zig +++ b/src/templates.zig @@ -495,6 +495,23 @@ fn resolveNewExpression( return alloc.dupe(u8, "") catch return error.OutOfMemory; } + // {{config.X}} — alias for {{state.__config.X}} + if (std.mem.startsWith(u8, expr, "config.")) { + const config_path = try std.fmt.allocPrint(alloc, "state.__config.{s}", .{expr["config.".len..]}); + defer alloc.free(config_path); + const raw = try state_mod.getStateValue(alloc, state_json, config_path); + if (raw) |r| { + const stripped = stripJsonQuotes(r); + if (stripped.ptr != r.ptr or stripped.len != r.len) { + const result = alloc.dupe(u8, stripped) catch return error.OutOfMemory; + alloc.free(r); + return result; + } + return r; + } + return alloc.dupe(u8, "") catch return error.OutOfMemory; + } + // Unknown expression — return empty return alloc.dupe(u8, "") catch return error.OutOfMemory; } From 997e2cbf58029e3613264c216f5d47181254a233 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 20:54:14 -0300 Subject: [PATCH 16/55] feat: per-node retry/cache, pending writes, overwrite, deferred nodes, 
managed values Gap 2: Per-node retry with exponential backoff (max_attempts, initial_interval_ms, backoff_factor, max_interval_ms) Gap 3: Per-node cache with TTL (node_cache table, FNV hash cache keys, skip dispatch on hit) Gap 4: Pending writes table for parallel execution resilience Gap 5: Overwrite bypass (__overwrite: true) skips reducer in applyUpdates Gap 6: Deferred nodes (defer: true) execute just before __end__ Gap 7: Managed values (__meta with step, is_last_step, remaining_steps, run_id, node_name) --- src/engine.zig | 454 +++++++++++++++++++++++++-- src/migrations/004_orchestration.sql | 21 ++ src/state.zig | 74 ++++- src/store.zig | 187 +++++++++-- src/types.zig | 10 + 5 files changed, 703 insertions(+), 43 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index f123eb4..9533ffa 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -35,8 +35,7 @@ const callbacks = @import("callbacks.zig"); const metrics_mod = @import("metrics.zig"); const async_dispatch = @import("async_dispatch.zig"); const state_mod = @import("state.zig"); - -// TODO: add SseHub integration in Task 12 +const sse_mod = @import("sse.zig"); // ── Engine ─────────────────────────────────────────────────────────── @@ -59,6 +58,7 @@ pub const Engine = struct { next_health_check_at_ms: i64, metrics: ?*metrics_mod.Metrics, response_queue: ?*async_dispatch.ResponseQueue, + sse_hub: ?*sse_mod.SseHub = null, pub fn init(store: *Store, allocator: std.mem.Allocator, poll_interval_ms: u64) Engine { return .{ @@ -70,6 +70,7 @@ pub const Engine = struct { .next_health_check_at_ms = 0, .metrics = null, .response_queue = null, + .sse_hub = null, }; } @@ -189,6 +190,9 @@ pub const Engine = struct { const interrupt_before = parseBreakpointList(alloc, workflow_json, "interrupt_before"); const interrupt_after = parseBreakpointList(alloc, workflow_json, "interrupt_after"); + // 2d. Collect deferred nodes (Gap 6) + const deferred_nodes = collectDeferredNodes(alloc, workflow_json); + // 2c. 
Get tracker URL for reconciliation const tracker_url = getWorkflowField(alloc, workflow_json, "tracker_url"); const task_id = getWorkflowField(alloc, workflow_json, "task_id"); @@ -234,6 +238,17 @@ pub const Engine = struct { var version: i64 = if (latest_checkpoint) |cp| cp.version else 0; const initial_version = version; + // 3b. Workflow version migration check + const wf_version = getWorkflowVersion(alloc, workflow_json); + if (latest_checkpoint) |cp| { + const cp_version = getCheckpointWorkflowVersion(alloc, cp.metadata_json); + if (cp_version != wf_version) { + log.warn("workflow version changed from {d} to {d}, attempting migration", .{ cp_version, wf_version }); + // Filter completed_nodes to only include nodes that still exist + _ = migrateCompletedNodes(alloc, &completed_nodes, workflow_json); + } + } + // 4. Main execution loop: find ready nodes, execute, repeat var running_state: []const u8 = try alloc.dupe(u8, current_state); var max_iterations: u32 = 1000; // safety guard against infinite loops @@ -241,10 +256,19 @@ pub const Engine = struct { while (max_iterations > 0) : (max_iterations -= 1) { // Use goto override if set, otherwise find ready nodes normally - const ready_nodes = if (goto_ready) |gr| blk: { + const all_ready_nodes = if (goto_ready) |gr| blk: { goto_ready = null; break :blk gr; } else try findReadyNodes(alloc, workflow_json, &completed_nodes, &route_results); + + // Gap 6: Filter out deferred nodes from ready list (execute them later) + var ready_list: std.ArrayListUnmanaged([]const u8) = .empty; + for (all_ready_nodes) |name| { + if (!isInBreakpointList(name, deferred_nodes)) { + try ready_list.append(alloc, name); + } + } + const ready_nodes = ready_list.items; if (ready_nodes.len == 0) { // Check termination: if all paths reached __end__ if (completed_nodes.get("__end__") != null) { @@ -293,6 +317,36 @@ pub const Engine = struct { for (ready_nodes) |node_name| { if (std.mem.eql(u8, node_name, "__end__")) { + // Gap 6: Execute 
deferred nodes before completing + for (deferred_nodes) |deferred_name| { + if (completed_nodes.get(deferred_name) != null) continue; + + const def_node_json = getNodeJson(alloc, workflow_json, deferred_name) orelse continue; + const def_node_type = getNodeField(alloc, def_node_json, "type") orelse "task"; + + if (std.mem.eql(u8, def_node_type, "transform")) { + const def_updates = getNodeField(alloc, def_node_json, "updates") orelse "{}"; + const def_schema = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + const def_new_state = state_mod.applyUpdates(alloc, running_state, def_updates, def_schema) catch running_state; + running_state = def_new_state; + } else if (std.mem.eql(u8, def_node_type, "task") or std.mem.eql(u8, def_node_type, "agent")) { + const def_result = self.executeTaskNode(alloc, run_row, deferred_name, def_node_json, running_state) catch continue; + switch (def_result) { + .completed => |cr| { + if (cr.state_updates) |updates| { + const def_schema = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + const def_new_state = state_mod.applyUpdates(alloc, running_state, updates, def_schema) catch running_state; + running_state = def_new_state; + } + }, + else => {}, + } + } + + try completed_nodes.put(try alloc.dupe(u8, deferred_name), {}); + log.info("deferred node {s} completed for run {s}", .{ deferred_name, run_row.id }); + } + // Mark __end__ as completed try completed_nodes.put("__end__", {}); version += 1; @@ -302,7 +356,7 @@ pub const Engine = struct { const cp_id = try alloc.dupe(u8, &cp_id_buf); const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; - const meta_json = try serializeRouteResults(alloc, &route_results); + const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); try self.store.createCheckpoint(cp_id, run_row.id, "__end__", parent_id, running_state, cn_json, version, meta_json); 
try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); @@ -323,7 +377,7 @@ pub const Engine = struct { const cp_id = try alloc.dupe(u8, &cp_id_buf); const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; - const meta_json = try serializeRouteResults(alloc, &route_results); + const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); @@ -377,7 +431,7 @@ pub const Engine = struct { const cp_id = try alloc.dupe(u8, &cp_id_buf); const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; - const meta_json = try serializeRouteResults(alloc, &route_results); + const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); @@ -413,12 +467,68 @@ pub const Engine = struct { log.info("transform node {s} completed", .{node_name}); } else if (std.mem.eql(u8, node_type, "task") or std.mem.eql(u8, node_type, "agent")) { - // Task/Agent: render prompt, dispatch to worker, apply state updates - const result = try self.executeTaskNode(alloc, run_row, node_name, node_json, running_state); + // Gap 7: Inject __meta managed values + const state_with_meta = injectMeta(alloc, running_state, run_row.id, node_name, version, @as(i64, @intCast(max_iterations))) catch running_state; + + // Gap 3: Check cache before executing + const cache_ttl = 
parseCacheTtlMs(alloc, node_json); + if (cache_ttl != null) cache_check: { + const pt_c = getNodeField(alloc, node_json, "prompt_template") orelse break :cache_check; + const rnd_c = templates.renderTemplate(alloc, pt_c, state_with_meta, run_row.input_json, null) catch break :cache_check; + const ck_c = computeCacheKey(alloc, node_name, rnd_c) catch break :cache_check; + const cached = self.store.getCachedResult(alloc, ck_c) catch break :cache_check; + if (cached) |cached_upd| { + const cs = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + running_state = state_mod.applyUpdates(alloc, running_state, cached_upd, cs) catch running_state; + try completed_nodes.put(try alloc.dupe(u8, node_name), {}); + log.info("task node {s} cache hit for run {s}", .{ node_name, run_row.id }); + made_progress = true; + version += 1; + const ccb = ids.generateId(); + const cci = try alloc.dupe(u8, &ccb); + const ccn = try serializeCompletedNodes(alloc, &completed_nodes); + const cpi: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const cmj = try serializeRouteResults(alloc, &route_results); + try self.store.createCheckpoint(cci, run_row.id, node_name, cpi, running_state, ccn, version, cmj); + try self.store.incrementCheckpointCount(run_row.id); + try self.store.updateRunState(run_row.id, running_state); + continue; + } + } + + // Gap 2: Retry loop + const max_attempts = parseRetryMaxAttempts(alloc, node_json) orelse 1; + const retry_init_ms = parseRetryInitialMs(alloc, node_json) orelse 500; + const retry_bf = parseRetryBackoff(alloc, node_json) orelse 2.0; + const retry_max_ms = parseRetryMaxMs(alloc, node_json) orelse 30000; + var result: TaskNodeResult = undefined; + var attempt: u32 = 0; + while (attempt < max_attempts) : (attempt += 1) { + result = try self.executeTaskNode(alloc, run_row, node_name, node_json, state_with_meta); + switch (result) { + .failed => { + if (attempt + 1 < max_attempts) { + var dms: u64 = retry_init_ms; + var ei: u32 = 0; + 
while (ei < attempt) : (ei += 1) { + const nd = @as(f64, @floatFromInt(dms)) * retry_bf; + dms = @intFromFloat(@min(nd, @as(f64, @floatFromInt(retry_max_ms)))); + } + if (dms > retry_max_ms) dms = retry_max_ms; + log.info("task node {s} attempt {d}/{d} failed, retrying in {d}ms", .{ node_name, attempt + 1, max_attempts, dms }); + std.Thread.sleep(dms * std.time.ns_per_ms); + continue; + } + }, + else => break, + } + } switch (result) { .completed => |cr| { - // Apply state updates + // Gap 7: Strip __meta (don't persist) + running_state = stripMeta(alloc, running_state) catch running_state; + if (cr.state_updates) |updates| { const schema_json = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { @@ -427,6 +537,21 @@ pub const Engine = struct { return; }; running_state = new_state; + + // Gap 3: Store result in cache + if (cache_ttl) |ttl| cache_store: { + const pt_s = getNodeField(alloc, node_json, "prompt_template") orelse break :cache_store; + const rnd_s = templates.renderTemplate(alloc, pt_s, state_with_meta, run_row.input_json, null) catch break :cache_store; + const ck_s = computeCacheKey(alloc, node_name, rnd_s) catch break :cache_store; + self.store.setCachedResult(ck_s, node_name, updates, ttl) catch |cerr| { + log.warn("failed to cache result for node {s}: {}", .{ node_name, cerr }); + }; + } + + // Gap 4: Save as pending write + self.store.savePendingWrite(run_row.id, node_name, node_name, updates) catch |perr| { + log.warn("failed to save pending write for node {s}: {}", .{ node_name, perr }); + }; } // Consume pending injections @@ -442,9 +567,7 @@ pub const Engine = struct { try completed_nodes.put(try alloc.dupe(u8, node_name), {}); - // Handle goto command: override next ready nodes if (cr.goto_targets) |targets| { - // Validate goto targets exist in workflow var valid_targets: std.ArrayListUnmanaged([]const u8) = .empty; for (targets) 
|target| { if (std.mem.eql(u8, target, "__end__") or getNodeJson(alloc, workflow_json, target) != null) { @@ -459,6 +582,9 @@ pub const Engine = struct { } } + // Gap 4: Clear pending writes + self.store.clearPendingWrites(run_row.id) catch {}; + log.info("task node {s} completed for run {s}", .{ node_name, run_row.id }); }, .async_pending => { @@ -471,7 +597,7 @@ pub const Engine = struct { const cp_id = try alloc.dupe(u8, &cp_id_buf); const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; - const meta_json = try serializeRouteResults(alloc, &route_results); + const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); @@ -486,7 +612,7 @@ pub const Engine = struct { const cp_id = try alloc.dupe(u8, &cp_id_buf); const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; - const meta_json = try serializeRouteResults(alloc, &route_results); + const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); @@ -557,7 +683,7 @@ pub const Engine = struct { const bp_cp_id = try alloc.dupe(u8, &bp_cp_id_buf); const bp_cn_json = try serializeCompletedNodes(alloc, &completed_nodes); const bp_parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; - const bp_meta_json = try serializeRouteResults(alloc, &route_results); + const bp_meta_json = try 
serializeRouteResultsWithVersion(alloc, &route_results, wf_version); try self.store.createCheckpoint(bp_cp_id, run_row.id, node_name, bp_parent_id, running_state, bp_cn_json, version, bp_meta_json); try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); @@ -586,10 +712,17 @@ pub const Engine = struct { const cp_id = try alloc.dupe(u8, &cp_id_buf); const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; - const meta_json = try serializeRouteResults(alloc, &route_results); + const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); + + // Broadcast rich SSE events for all modes + if (self.sse_hub) |hub| { + const node_json_for_sse = getNodeJson(alloc, workflow_json, node_name); + const nt = if (node_json_for_sse) |nj| (getNodeField(alloc, nj, "type") orelse "task") else "task"; + broadcastNodeEvents(hub, alloc, run_row.id, node_name, nt, running_state, null, version, 0); + } } // If goto override is set, use it for next iteration instead of findReadyNodes @@ -1365,18 +1498,28 @@ fn serializeCompletedNodes(alloc: std.mem.Allocator, completed_nodes: *std.Strin return json.Stringify.valueAlloc(alloc, arr.items, .{}); } -/// Serialize route_results map to JSON for checkpoint metadata. +/// Serialize route_results map + workflow_version to JSON for checkpoint metadata. 
fn serializeRouteResults(alloc: std.mem.Allocator, route_results: *std.StringHashMap([]const u8)) !?[]const u8 { - if (route_results.count() == 0) return null; + return serializeRouteResultsWithVersion(alloc, route_results, null); +} + +fn serializeRouteResultsWithVersion(alloc: std.mem.Allocator, route_results: *std.StringHashMap([]const u8), wf_version: ?i64) !?[]const u8 { + if (route_results.count() == 0 and wf_version == null) return null; var obj = json.ObjectMap.init(alloc); - var rr_obj = json.ObjectMap.init(alloc); - var it = route_results.iterator(); - while (it.next()) |entry| { - try rr_obj.put(entry.key_ptr.*, .{ .string = entry.value_ptr.* }); + if (route_results.count() > 0) { + var rr_obj = json.ObjectMap.init(alloc); + var it = route_results.iterator(); + while (it.next()) |entry| { + try rr_obj.put(entry.key_ptr.*, .{ .string = entry.value_ptr.* }); + } + try obj.put("route_results", .{ .object = rr_obj }); + } + + if (wf_version) |v| { + try obj.put("workflow_version", .{ .integer = v }); } - try obj.put("route_results", .{ .object = rr_obj }); return try serializeJsonValue(alloc, .{ .object = obj }); } @@ -1454,6 +1597,142 @@ fn getNodeFieldInt(alloc: std.mem.Allocator, node_json: []const u8, field: []con return null; } +/// Get a float field from a node's JSON. +fn getNodeFieldFloat(alloc: std.mem.Allocator, node_json: []const u8, field: []const u8) ?f64 { + const parsed = json.parseFromSlice(json.Value, alloc, node_json, .{}) catch return null; + if (parsed.value != .object) return null; + const val = parsed.value.object.get(field) orelse return null; + if (val == .float) return val.float; + if (val == .integer) return @as(f64, @floatFromInt(val.integer)); + return null; +} + +/// Get a boolean field from a node's JSON. 
+fn getNodeFieldBool(alloc: std.mem.Allocator, node_json: []const u8, field: []const u8) ?bool { + const parsed = json.parseFromSlice(json.Value, alloc, node_json, .{}) catch return null; + if (parsed.value != .object) return null; + const val = parsed.value.object.get(field) orelse return null; + if (val == .bool) return val.bool; + return null; +} + +/// Get a nested object field as JSON string from a node's JSON. +fn getNodeObjectField(alloc: std.mem.Allocator, node_json: []const u8, field: []const u8) ?[]const u8 { + const parsed = json.parseFromSlice(json.Value, alloc, node_json, .{}) catch return null; + if (parsed.value != .object) return null; + const val = parsed.value.object.get(field) orelse return null; + if (val != .object) return null; + return serializeJsonValue(alloc, val) catch null; +} + +// ── Retry Config Helpers (Gap 2) ──────────────────────────────────── + +/// Parse retry.max_attempts from node JSON. Returns null if no retry config. +fn parseRetryMaxAttempts(alloc: std.mem.Allocator, node_json: []const u8) ?u32 { + const retry_json = getNodeObjectField(alloc, node_json, "retry") orelse return null; + const val = getNodeFieldInt(alloc, retry_json, "max_attempts") orelse return null; + if (val < 1) return 1; + if (val > 100) return 100; + return @intCast(val); +} + +fn parseRetryInitialMs(alloc: std.mem.Allocator, node_json: []const u8) ?u64 { + const retry_json = getNodeObjectField(alloc, node_json, "retry") orelse return null; + const val = getNodeFieldInt(alloc, retry_json, "initial_interval_ms") orelse return null; + if (val < 0) return 0; + return @intCast(val); +} + +fn parseRetryBackoff(alloc: std.mem.Allocator, node_json: []const u8) ?f64 { + const retry_json = getNodeObjectField(alloc, node_json, "retry") orelse return null; + return getNodeFieldFloat(alloc, retry_json, "backoff_factor"); +} + +fn parseRetryMaxMs(alloc: std.mem.Allocator, node_json: []const u8) ?u64 { + const retry_json = getNodeObjectField(alloc, node_json, "retry") 
orelse return null; + const val = getNodeFieldInt(alloc, retry_json, "max_interval_ms") orelse return null; + if (val < 0) return 0; + return @intCast(val); +} + +// ── Cache Key Helpers (Gap 3) ─────────────────────────────────────── + +/// Parse cache.ttl_ms from node JSON. Returns null if no cache config. +fn parseCacheTtlMs(alloc: std.mem.Allocator, node_json: []const u8) ?i64 { + const cache_json = getNodeObjectField(alloc, node_json, "cache") orelse return null; + return getNodeFieldInt(alloc, cache_json, "ttl_ms"); +} + +/// Compute a cache key from node_name + rendered_prompt using FNV hash. +fn computeCacheKey(alloc: std.mem.Allocator, node_name: []const u8, rendered_prompt: []const u8) ![]const u8 { + var hasher = std.hash.Fnv1a_64.init(); + hasher.update(node_name); + hasher.update("|"); + hasher.update(rendered_prompt); + const hash = hasher.final(); + return try std.fmt.allocPrint(alloc, "{x:0>16}", .{hash}); +} + +// ── Deferred Node Helpers (Gap 6) ─────────────────────────────────── + +/// Check if a node has "defer": true in its definition. +fn isNodeDeferred(alloc: std.mem.Allocator, workflow_json: []const u8, node_name: []const u8) bool { + const node_json = getNodeJson(alloc, workflow_json, node_name) orelse return false; + return getNodeFieldBool(alloc, node_json, "defer") orelse false; +} + +/// Collect all deferred node names from workflow. 
+fn collectDeferredNodes(alloc: std.mem.Allocator, workflow_json: []const u8) []const []const u8 { + const parsed = json.parseFromSlice(json.Value, alloc, workflow_json, .{}) catch return &.{}; + if (parsed.value != .object) return &.{}; + const nodes_val = parsed.value.object.get("nodes") orelse return &.{}; + if (nodes_val != .object) return &.{}; + + var result: std.ArrayListUnmanaged([]const u8) = .empty; + var it = nodes_val.object.iterator(); + while (it.next()) |entry| { + const name = entry.key_ptr.*; + const node = entry.value_ptr.*; + if (node == .object) { + if (node.object.get("defer")) |d| { + if (d == .bool and d.bool) { + result.append(alloc, name) catch continue; + } + } + } + } + return result.toOwnedSlice(alloc) catch &.{}; +} + +// ── Managed Values Helpers (Gap 7) ────────────────────────────────── + +/// Inject __meta into state JSON before node execution. +fn injectMeta(alloc: std.mem.Allocator, state_json: []const u8, run_id: []const u8, node_name: []const u8, step_number: i64, max_steps: i64) ![]const u8 { + const remaining = max_steps - step_number; + const is_last = (step_number >= max_steps - 1); + const meta_json = try std.fmt.allocPrint(alloc, + \\{{"__meta":{{"step":{d},"is_last_step":{s},"remaining_steps":{d},"run_id":"{s}","node_name":"{s}"}}}} + , .{ step_number, if (is_last) "true" else "false", remaining, run_id, node_name }); + + // Merge __meta into state using simple applyUpdates with empty schema (last_value default) + return state_mod.applyUpdates(alloc, state_json, meta_json, "{}"); +} + +/// Remove __meta from state JSON after node execution (don't persist in checkpoints). 
+fn stripMeta(alloc: std.mem.Allocator, state_json: []const u8) ![]const u8 { + const parsed = json.parseFromSlice(json.Value, alloc, state_json, .{}) catch return try alloc.dupe(u8, state_json); + if (parsed.value != .object) return try alloc.dupe(u8, state_json); + + var result_obj = json.ObjectMap.init(alloc); + var it = parsed.value.object.iterator(); + while (it.next()) |entry| { + if (!std.mem.eql(u8, entry.key_ptr.*, "__meta")) { + try result_obj.put(entry.key_ptr.*, entry.value_ptr.*); + } + } + return serializeJsonValue(alloc, .{ .object = result_obj }); +} + /// Build subgraph input state from parent state using input_mapping. /// input_mapping is {"child_key": "state.parent_key", ...} fn buildSubgraphInput(alloc: std.mem.Allocator, parent_state: []const u8, input_mapping_json: []const u8) ![]const u8 { @@ -1516,6 +1795,137 @@ fn reconcileWithTracker(alloc: std.mem.Allocator, tracker_url: []const u8, task_ return true; } +// ── Rich Streaming Helpers ────────────────────────────────────────── + +/// Broadcast multi-mode SSE events for a node execution. +/// Emits events in values, updates, tasks, and debug modes. 
+fn broadcastNodeEvents( + hub: *sse_mod.SseHub, + alloc: std.mem.Allocator, + run_id: []const u8, + node_name: []const u8, + node_type: []const u8, + state_json: []const u8, + state_updates: ?[]const u8, + step_number: i64, + duration_ms: i64, +) void { + const step_id_buf = ids.generateId(); + const step_id = alloc.dupe(u8, &step_id_buf) catch return; + const now_ms = ids.nowMs(); + // ISO 8601 timestamp (approximate, using epoch ms) + const ts_str = std.fmt.allocPrint(alloc, "{d}", .{now_ms}) catch "0"; + + // values mode: full state after step + const values_data = std.fmt.allocPrint(alloc, + \\{{"event":"values","data":{{"step":"{s}","state":{s}}}}} + , .{ node_name, state_json }) catch null; + if (values_data) |vd| { + hub.broadcast(run_id, .{ .event_type = "values", .data = vd, .mode = .values }); + } + + // updates mode: node name + partial updates + const updates_payload = state_updates orelse "{}"; + const updates_data = std.fmt.allocPrint(alloc, + \\{{"event":"updates","data":{{"step":"{s}","updates":{s}}}}} + , .{ node_name, updates_payload }) catch null; + if (updates_data) |ud| { + hub.broadcast(run_id, .{ .event_type = "updates", .data = ud, .mode = .updates }); + } + + // tasks mode: task_start and task_result + const task_start_data = std.fmt.allocPrint(alloc, + \\{{"id":"{s}","name":"{s}","type":"{s}"}} + , .{ step_id, node_name, node_type }) catch null; + if (task_start_data) |tsd| { + hub.broadcast(run_id, .{ .event_type = "task_start", .data = tsd, .mode = .tasks }); + } + + const task_result_data = std.fmt.allocPrint(alloc, + \\{{"id":"{s}","name":"{s}","result":{s},"duration_ms":{d}}} + , .{ step_id, node_name, updates_payload, duration_ms }) catch null; + if (task_result_data) |trd| { + hub.broadcast(run_id, .{ .event_type = "task_result", .data = trd, .mode = .tasks }); + } + + // debug mode: wrapped with step number and timestamp + const debug_data = std.fmt.allocPrint(alloc, + 
\\{{"step_number":{d},"timestamp_ms":{s},"type":"task_result","payload":{{"name":"{s}","updates":{s},"duration_ms":{d}}}}} + , .{ step_number, ts_str, node_name, updates_payload, duration_ms }) catch null; + if (debug_data) |dd| { + hub.broadcast(run_id, .{ .event_type = "debug", .data = dd, .mode = .debug }); + } +} + +/// Get workflow version from workflow JSON definition. +fn getWorkflowVersion(alloc: std.mem.Allocator, workflow_json: []const u8) i64 { + const parsed = json.parseFromSlice(json.Value, alloc, workflow_json, .{}) catch return 1; + if (parsed.value != .object) return 1; + const val = parsed.value.object.get("version") orelse return 1; + if (val == .integer) return val.integer; + return 1; +} + +/// Get workflow version from checkpoint metadata. +fn getCheckpointWorkflowVersion(alloc: std.mem.Allocator, metadata_json: ?[]const u8) i64 { + const meta = metadata_json orelse return 1; + const parsed = json.parseFromSlice(json.Value, alloc, meta, .{}) catch return 1; + if (parsed.value != .object) return 1; + const val = parsed.value.object.get("workflow_version") orelse return 1; + if (val == .integer) return val.integer; + return 1; +} + +/// Merge workflow_version into existing checkpoint metadata JSON. 
+fn mergeWorkflowVersionIntoMeta(alloc: std.mem.Allocator, existing_meta: ?[]const u8, wf_version: i64) ?[]const u8 { + if (existing_meta) |em| { + // Parse existing, add workflow_version + const parsed = json.parseFromSlice(json.Value, alloc, em, .{}) catch { + return std.fmt.allocPrint(alloc, "{{\"workflow_version\":{d}}}", .{wf_version}) catch null; + }; + if (parsed.value == .object) { + var obj = json.ObjectMap.init(alloc); + var it = parsed.value.object.iterator(); + while (it.next()) |entry| { + obj.put(entry.key_ptr.*, entry.value_ptr.*) catch continue; + } + obj.put("workflow_version", .{ .integer = wf_version }) catch {}; + return serializeJsonValue(alloc, .{ .object = obj }) catch null; + } + return std.fmt.allocPrint(alloc, "{{\"workflow_version\":{d}}}", .{wf_version}) catch null; + } + return std.fmt.allocPrint(alloc, "{{\"workflow_version\":{d}}}", .{wf_version}) catch null; +} + +/// Filter completed nodes to only those still present in the workflow definition. +/// Returns true if any nodes were removed (migration happened). 
+fn migrateCompletedNodes(alloc: std.mem.Allocator, completed_nodes: *std.StringHashMap(void), workflow_json: []const u8) bool { + const parsed = json.parseFromSlice(json.Value, alloc, workflow_json, .{}) catch return false; + if (parsed.value != .object) return false; + const nodes_val = parsed.value.object.get("nodes") orelse return false; + if (nodes_val != .object) return false; + + var to_remove: std.ArrayListUnmanaged([]const u8) = .empty; + var it = completed_nodes.iterator(); + while (it.next()) |entry| { + const name = entry.key_ptr.*; + // Keep special nodes + if (std.mem.eql(u8, name, "__start__") or std.mem.eql(u8, name, "__end__")) continue; + // Remove if node no longer exists in workflow + if (nodes_val.object.get(name) == null) { + to_remove.append(alloc, name) catch continue; + } + } + + if (to_remove.items.len == 0) return false; + + for (to_remove.items) |name| { + _ = completed_nodes.remove(name); + log.warn("migration: removed completed node '{s}' (no longer in workflow)", .{name}); + } + return true; +} + // ── Tests ───────────────────────────────────────────────────────────── test "Engine: init and stop" { diff --git a/src/migrations/004_orchestration.sql b/src/migrations/004_orchestration.sql index 29f3332..bc39ec7 100644 --- a/src/migrations/004_orchestration.sql +++ b/src/migrations/004_orchestration.sql @@ -6,6 +6,7 @@ CREATE TABLE IF NOT EXISTS workflows ( id TEXT PRIMARY KEY, name TEXT NOT NULL, definition_json TEXT NOT NULL, + version INTEGER DEFAULT 1, created_at_ms INTEGER NOT NULL, updated_at_ms INTEGER NOT NULL ); @@ -64,3 +65,23 @@ ALTER TABLE steps ADD COLUMN state_updates_json TEXT; -- Subgraph support: parent run linkage and per-run config ALTER TABLE runs ADD COLUMN parent_run_id TEXT REFERENCES runs(id); ALTER TABLE runs ADD COLUMN config_json TEXT; + +-- Node-level cache (Gap 3) +CREATE TABLE IF NOT EXISTS node_cache ( + cache_key TEXT PRIMARY KEY, + node_name TEXT NOT NULL, + result_json TEXT NOT NULL, + created_at_ms 
INTEGER NOT NULL, + ttl_ms INTEGER +); + +-- Pending writes from parallel node execution (Gap 4) +CREATE TABLE IF NOT EXISTS pending_writes ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + run_id TEXT NOT NULL, + step_id TEXT NOT NULL, + channel TEXT NOT NULL, + value_json TEXT NOT NULL, + created_at_ms INTEGER NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_pending_writes_run ON pending_writes(run_id); diff --git a/src/state.zig b/src/state.zig index 3c6ab0b..779ed8a 100644 --- a/src/state.zig +++ b/src/state.zig @@ -36,6 +36,23 @@ fn formatFloat(alloc: Allocator, f: f64) ![]const u8 { return try std.fmt.allocPrint(alloc, "{d}", .{f}); } +// ── Overwrite Bypass (Gap 5) ────────────────────────────────────────── + +/// Check if a JSON value is wrapped in {"__overwrite": true, "value": ...}. +fn isOverwrite(value: json.Value) bool { + if (value != .object) return false; + const ow = value.object.get("__overwrite") orelse return false; + if (ow != .bool) return false; + return ow.bool; +} + +/// Extract the "value" field from an overwrite wrapper. +/// Returns the unwrapped json.Value, or .null if "value" key is missing. +fn extractOverwriteValue(value: json.Value) json.Value { + if (value != .object) return value; + return value.object.get("value") orelse .null; +} + // ── Public API ──────────────────────────────────────────────────────── /// Apply a single reducer to merge old_value + update into new_value. 
@@ -93,12 +110,19 @@ pub fn applyUpdates(alloc: Allocator, state_json: []const u8, updates_json: []co try result_obj.put(entry.key_ptr.*, entry.value_ptr.*); } - // For each update key, apply the reducer + // For each update key, apply the reducer (with overwrite bypass, Gap 5) var updates_it = updates_parsed.value.object.iterator(); while (updates_it.next()) |entry| { const key = entry.key_ptr.*; const update_value = entry.value_ptr.*; + // Gap 5: Check for overwrite bypass + if (isOverwrite(update_value)) { + const raw_val = extractOverwriteValue(update_value); + try result_obj.put(key, raw_val); + continue; + } + // Serialize the update value const update_str = try serializeValue(arena_alloc, update_value); @@ -862,3 +886,51 @@ test "add_messages reducer - null old" { const text0 = m0.object.get("text") orelse return error.TestUnexpectedResult; try std.testing.expectEqualStrings("first", text0.string); } + +test "overwrite bypasses reducer" { + const alloc = std.testing.allocator; + // count has "add" reducer, but __overwrite should bypass it + const state = + \\{"count":10} + ; + const updates = + \\{"count":{"__overwrite":true,"value":42}} + ; + const schema = + \\{"count":{"type":"number","reducer":"add"}} + ; + + const result = try applyUpdates(alloc, state, updates, schema); + defer alloc.free(result); + + const parsed = try parseTestJson(alloc, result); + defer parsed.deinit(); + try std.testing.expect(parsed.value == .object); + const count = parsed.value.object.get("count") orelse return error.TestUnexpectedResult; + // Should be 42 (overwritten), not 52 (10 + 42 via add reducer) + try std.testing.expectEqual(@as(i64, 42), count.integer); +} + +test "overwrite with array value" { + const alloc = std.testing.allocator; + const state = + \\{"items":[1,2,3]} + ; + const updates = + \\{"items":{"__overwrite":true,"value":[99]}} + ; + const schema = + \\{"items":{"type":"array","reducer":"append"}} + ; + + const result = try applyUpdates(alloc, state, 
updates, schema); + defer alloc.free(result); + + const parsed = try parseTestJson(alloc, result); + defer parsed.deinit(); + const items = parsed.value.object.get("items") orelse return error.TestUnexpectedResult; + try std.testing.expect(items == .array); + // Should be [99] (overwritten), not [1,2,3,99] (appended) + try std.testing.expectEqual(@as(usize, 1), items.array.items.len); + try std.testing.expectEqual(@as(i64, 99), items.array.items[0].integer); +} diff --git a/src/store.zig b/src/store.zig index 390691d..b524de6 100644 --- a/src/store.zig +++ b/src/store.zig @@ -1294,7 +1294,11 @@ pub const Store = struct { // ── Workflow CRUD ───────────────────────────────────────────────── pub fn createWorkflow(self: *Self, id: []const u8, name: []const u8, definition_json: []const u8) !void { - const sql = "INSERT INTO workflows (id, name, definition_json, created_at_ms, updated_at_ms) VALUES (?, ?, ?, ?, ?)"; + return self.createWorkflowWithVersion(id, name, definition_json, 1); + } + + pub fn createWorkflowWithVersion(self: *Self, id: []const u8, name: []const u8, definition_json: []const u8, version: i64) !void { + const sql = "INSERT INTO workflows (id, name, definition_json, version, created_at_ms, updated_at_ms) VALUES (?, ?, ?, ?, ?, ?)"; var stmt: ?*c.sqlite3_stmt = null; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; @@ -1305,8 +1309,9 @@ pub const Store = struct { _ = c.sqlite3_bind_text(stmt, 1, id.ptr, @intCast(id.len), SQLITE_STATIC); _ = c.sqlite3_bind_text(stmt, 2, name.ptr, @intCast(name.len), SQLITE_STATIC); _ = c.sqlite3_bind_text(stmt, 3, definition_json.ptr, @intCast(definition_json.len), SQLITE_STATIC); - _ = c.sqlite3_bind_int64(stmt, 4, now); + _ = c.sqlite3_bind_int64(stmt, 4, version); _ = c.sqlite3_bind_int64(stmt, 5, now); + _ = c.sqlite3_bind_int64(stmt, 6, now); if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { return error.SqliteStepFailed; @@ -1314,7 +1319,7 @@ pub const Store = 
struct { } pub fn getWorkflow(self: *Self, alloc: std.mem.Allocator, id: []const u8) !?types.WorkflowRow { - const sql = "SELECT id, name, definition_json, created_at_ms, updated_at_ms FROM workflows WHERE id = ?"; + const sql = "SELECT id, name, definition_json, version, created_at_ms, updated_at_ms FROM workflows WHERE id = ?"; var stmt: ?*c.sqlite3_stmt = null; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; @@ -1329,13 +1334,14 @@ pub const Store = struct { .id = try allocStr(alloc, stmt, 0), .name = try allocStr(alloc, stmt, 1), .definition_json = try allocStr(alloc, stmt, 2), - .created_at_ms = colInt(stmt, 3), - .updated_at_ms = colInt(stmt, 4), + .version = colInt(stmt, 3), + .created_at_ms = colInt(stmt, 4), + .updated_at_ms = colInt(stmt, 5), }; } pub fn listWorkflows(self: *Self, alloc: std.mem.Allocator) ![]types.WorkflowRow { - const sql = "SELECT id, name, definition_json, created_at_ms, updated_at_ms FROM workflows ORDER BY created_at_ms DESC"; + const sql = "SELECT id, name, definition_json, version, created_at_ms, updated_at_ms FROM workflows ORDER BY created_at_ms DESC"; var stmt: ?*c.sqlite3_stmt = null; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; @@ -1348,28 +1354,52 @@ pub const Store = struct { .id = try allocStr(alloc, stmt, 0), .name = try allocStr(alloc, stmt, 1), .definition_json = try allocStr(alloc, stmt, 2), - .created_at_ms = colInt(stmt, 3), - .updated_at_ms = colInt(stmt, 4), + .version = colInt(stmt, 3), + .created_at_ms = colInt(stmt, 4), + .updated_at_ms = colInt(stmt, 5), }); } return list.toOwnedSlice(alloc); } pub fn updateWorkflow(self: *Self, id: []const u8, name: []const u8, definition_json: []const u8) !void { - const sql = "UPDATE workflows SET name = ?, definition_json = ?, updated_at_ms = ? 
WHERE id = ?"; - var stmt: ?*c.sqlite3_stmt = null; - if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { - return error.SqlitePrepareFailed; - } - defer _ = c.sqlite3_finalize(stmt); + return self.updateWorkflowWithVersion(id, name, definition_json, null); + } - _ = c.sqlite3_bind_text(stmt, 1, name.ptr, @intCast(name.len), SQLITE_STATIC); - _ = c.sqlite3_bind_text(stmt, 2, definition_json.ptr, @intCast(definition_json.len), SQLITE_STATIC); - _ = c.sqlite3_bind_int64(stmt, 3, ids.nowMs()); - _ = c.sqlite3_bind_text(stmt, 4, id.ptr, @intCast(id.len), SQLITE_STATIC); + pub fn updateWorkflowWithVersion(self: *Self, id: []const u8, name: []const u8, definition_json: []const u8, version: ?i64) !void { + if (version) |v| { + const sql = "UPDATE workflows SET name = ?, definition_json = ?, version = ?, updated_at_ms = ? WHERE id = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); - if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { - return error.SqliteStepFailed; + _ = c.sqlite3_bind_text(stmt, 1, name.ptr, @intCast(name.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 2, definition_json.ptr, @intCast(definition_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_int64(stmt, 3, v); + _ = c.sqlite3_bind_int64(stmt, 4, ids.nowMs()); + _ = c.sqlite3_bind_text(stmt, 5, id.ptr, @intCast(id.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } else { + const sql = "UPDATE workflows SET name = ?, definition_json = ?, updated_at_ms = ? 
WHERE id = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, name.ptr, @intCast(name.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 2, definition_json.ptr, @intCast(definition_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_int64(stmt, 3, ids.nowMs()); + _ = c.sqlite3_bind_text(stmt, 4, id.ptr, @intCast(id.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } } } @@ -1718,6 +1748,123 @@ pub const Store = struct { return error.SqliteStepFailed; } } + + // ── Node Cache (Gap 3) ─────────────────────────────────────────── + + pub fn getCachedResult(self: *Self, alloc: std.mem.Allocator, cache_key: []const u8) !?[]const u8 { + const sql = "SELECT result_json, created_at_ms, ttl_ms FROM node_cache WHERE cache_key = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, cache_key.ptr, @intCast(cache_key.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_ROW) return null; + + const result_json = try allocStr(alloc, stmt, 0); + const created_at_ms = colInt(stmt, 1); + const ttl_ms = colIntOpt(stmt, 2); + + // Check expiration + if (ttl_ms) |ttl| { + const now_ms = ids.nowMs(); + if (now_ms - created_at_ms > ttl) { + // Expired — delete and return null + const del_sql = "DELETE FROM node_cache WHERE cache_key = ?"; + var del_stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, del_sql, -1, &del_stmt, null) == c.SQLITE_OK) { + _ = c.sqlite3_bind_text(del_stmt, 1, cache_key.ptr, @intCast(cache_key.len), SQLITE_STATIC); + _ = c.sqlite3_step(del_stmt); + _ = c.sqlite3_finalize(del_stmt); + } + alloc.free(result_json); + return null; + } + } 
+ + return result_json; + } + + pub fn setCachedResult(self: *Self, cache_key: []const u8, node_name: []const u8, result_json: []const u8, ttl_ms: ?i64) !void { + const sql = "INSERT OR REPLACE INTO node_cache (cache_key, node_name, result_json, created_at_ms, ttl_ms) VALUES (?, ?, ?, ?, ?)"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, cache_key.ptr, @intCast(cache_key.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 2, node_name.ptr, @intCast(node_name.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 3, result_json.ptr, @intCast(result_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_int64(stmt, 4, ids.nowMs()); + bindIntOpt(stmt, 5, ttl_ms); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + // ── Pending Writes (Gap 4) ─────────────────────────────────────── + + pub fn savePendingWrite(self: *Self, run_id: []const u8, step_id: []const u8, channel: []const u8, value_json: []const u8) !void { + const sql = "INSERT INTO pending_writes (run_id, step_id, channel, value_json, created_at_ms) VALUES (?, ?, ?, ?, ?)"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 2, step_id.ptr, @intCast(step_id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 3, channel.ptr, @intCast(channel.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 4, value_json.ptr, @intCast(value_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_int64(stmt, 5, ids.nowMs()); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + pub fn getPendingWrites(self: *Self, alloc: 
std.mem.Allocator, run_id: []const u8) ![]types.PendingWriteRow { + const sql = "SELECT id, run_id, step_id, channel, value_json, created_at_ms FROM pending_writes WHERE run_id = ? ORDER BY id ASC"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + + var list: std.ArrayListUnmanaged(types.PendingWriteRow) = .empty; + while (c.sqlite3_step(stmt) == c.SQLITE_ROW) { + try list.append(alloc, .{ + .id = colInt(stmt, 0), + .run_id = try allocStr(alloc, stmt, 1), + .step_id = try allocStr(alloc, stmt, 2), + .channel = try allocStr(alloc, stmt, 3), + .value_json = try allocStr(alloc, stmt, 4), + .created_at_ms = colInt(stmt, 5), + }); + } + return list.toOwnedSlice(alloc); + } + + pub fn clearPendingWrites(self: *Self, run_id: []const u8) !void { + const sql = "DELETE FROM pending_writes WHERE run_id = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } }; // ── Tests ───────────────────────────────────────────────────────────── diff --git a/src/types.zig b/src/types.zig index 17f1551..2f28790 100644 --- a/src/types.zig +++ b/src/types.zig @@ -247,6 +247,7 @@ pub const WorkflowRow = struct { id: []const u8, name: []const u8, definition_json: []const u8, + version: i64 = 1, created_at_ms: i64, updated_at_ms: i64, }; @@ -283,6 +284,15 @@ pub const PendingInjectionRow = struct { created_at_ms: i64, }; +pub const PendingWriteRow = struct { + id: i64, + run_id: []const u8, + step_id: []const u8, + channel: []const u8, + value_json: 
[]const u8, + created_at_ms: i64, +}; + pub const ReducerType = enum { last_value, append, From 8e273c29ca4973f1e3987553f94dd0c807748d77 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 20:56:34 -0300 Subject: [PATCH 17/55] feat: rich streaming modes, graph migrations, replay endpoint Gap 8: Extend SseEvent with StreamMode enum (values, updates, tasks, debug, custom). Engine broadcasts multi-mode events after each node execution. Stream endpoint accepts ?mode= query param to filter. Gap 9: Workflow version tracking in checkpoint metadata. On resume, detect version mismatch and migrate completed_nodes by filtering out nodes that no longer exist in the new workflow definition. Store functions support versioned workflow CRUD. Gap 10: POST /runs/{id}/replay endpoint resets run state to a checkpoint and marks it as running for the engine to pick up. Validates checkpoint belongs to the target run. --- src/api.zig | 278 +++++++++++++++++++++++++++++++++++++++++++++++-- src/engine.zig | 134 ++++++++++++++++++++++++ src/sse.zig | 61 +++++++++++ src/store.zig | 59 +++++++++++ 4 files changed, 523 insertions(+), 9 deletions(-) diff --git a/src/api.zig b/src/api.zig index 6cf29c6..44e5557 100644 --- a/src/api.zig +++ b/src/api.zig @@ -240,7 +240,14 @@ pub fn handleRequest(ctx: *Context, method: []const u8, target: []const u8, body // GET /runs/{id}/stream if (is_get and eql(seg0, "runs") and seg1 != null and eql(seg2, "stream") and seg3 == null) { - return handleStream(ctx, seg1.?); + return handleStream(ctx, seg1.?, target); + } + + // ── Replay endpoint ──────────────────────────────────────────── + + // POST /runs/{id}/replay + if (is_post and eql(seg0, "runs") and seg1 != null and eql(seg2, "replay") and seg3 == null) { + return handleReplayRun(ctx, seg1.?, body); } // ── Agent events callback ─────────────────────────────────────── @@ -1096,15 +1103,21 @@ fn handleCreateWorkflow(ctx: *Context, body: []const u8) HttpResponse { break :blk 
serializeJsonValue(ctx.allocator, def_val) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to serialize definition\"}}"); } else body; - ctx.store.createWorkflow(wf_id, name, definition_json) catch { + // Extract version from body (default 1) + const version: i64 = if (obj.get("version")) |v| blk: { + if (v == .integer) break :blk v.integer; + break :blk 1; + } else 1; + + ctx.store.createWorkflowWithVersion(wf_id, name, definition_json, version) catch { return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to create workflow\"}}"); }; const id_json = jsonQuoted(ctx.allocator, wf_id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); const name_json = jsonQuoted(ctx.allocator, name) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); const resp = std.fmt.allocPrint(ctx.allocator, - \\{{"id":{s},"name":{s}}} - , .{ id_json, name_json }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + \\{{"id":{s},"name":{s},"version":{d}}} + , .{ id_json, name_json, version }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); return jsonResponse(201, resp); } @@ -1123,10 +1136,11 @@ fn handleListWorkflows(ctx: *Context) HttpResponse { const id_json = jsonQuoted(ctx.allocator, wf.id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); const name_json = jsonQuoted(ctx.allocator, wf.name) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); const entry = std.fmt.allocPrint(ctx.allocator, - \\{{"id":{s},"name":{s},"definition":{s},"created_at_ms":{d},"updated_at_ms":{d}}} + \\{{"id":{s},"name":{s},"version":{d},"definition":{s},"created_at_ms":{d},"updated_at_ms":{d}}} , .{ id_json, name_json, + wf.version, 
wf.definition_json, wf.created_at_ms, wf.updated_at_ms, @@ -1149,10 +1163,11 @@ fn handleGetWorkflow(ctx: *Context, id: []const u8) HttpResponse { const id_json = jsonQuoted(ctx.allocator, wf.id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); const name_json = jsonQuoted(ctx.allocator, wf.name) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); const resp = std.fmt.allocPrint(ctx.allocator, - \\{{"id":{s},"name":{s},"definition":{s},"created_at_ms":{d},"updated_at_ms":{d}}} + \\{{"id":{s},"name":{s},"version":{d},"definition":{s},"created_at_ms":{d},"updated_at_ms":{d}}} , .{ id_json, name_json, + wf.version, wf.definition_json, wf.created_at_ms, wf.updated_at_ms, @@ -1183,7 +1198,13 @@ fn handleUpdateWorkflow(ctx: *Context, id: []const u8, body: []const u8) HttpRes break :blk serializeJsonValue(ctx.allocator, def_val) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to serialize definition\"}}"); } else body; - ctx.store.updateWorkflow(id, name, definition_json) catch { + // Extract version if provided + const version: ?i64 = if (obj.get("version")) |v| blk: { + if (v == .integer) break :blk v.integer; + break :blk null; + } else null; + + ctx.store.updateWorkflowWithVersion(id, name, definition_json, version) catch { return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to update workflow\"}}"); }; @@ -1539,6 +1560,64 @@ fn handleForkRun(ctx: *Context, body: []const u8) HttpResponse { return jsonResponse(201, resp); } +// ── Replay Handler ────────────────────────────────────────────────── + +fn handleReplayRun(ctx: *Context, run_id: []const u8, body: []const u8) HttpResponse { + // Parse from_checkpoint_id from body + const parsed = std.json.parseFromSlice(std.json.Value, ctx.allocator, body, .{}) catch { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"invalid JSON 
body\"}}"); + }; + defer parsed.deinit(); + + if (parsed.value != .object) { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"body must be a JSON object\"}}"); + } + const obj = parsed.value.object; + + const checkpoint_id = getJsonString(obj, "from_checkpoint_id") orelse { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"missing required field: from_checkpoint_id\"}}"); + }; + + // Load checkpoint + const cp = ctx.store.getCheckpoint(ctx.allocator, checkpoint_id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get checkpoint\"}}"); + } orelse { + return jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"checkpoint not found\"}}"); + }; + + // Verify checkpoint belongs to this run + if (!std.mem.eql(u8, cp.run_id, run_id)) { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"checkpoint does not belong to this run\"}}"); + } + + // Load run to verify it exists + _ = ctx.store.getRun(ctx.allocator, run_id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get run\"}}"); + } orelse { + return jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"run not found\"}}"); + }; + + // Reset run state to checkpoint's state + ctx.store.updateRunState(run_id, cp.state_json) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to update run state\"}}"); + }; + + // Set run status to running — engine will pick it up on next tick + // with the checkpoint's completed_nodes + ctx.store.updateRunStatus(run_id, "running", null) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to update run status\"}}"); + }; + + ctx.store.insertEvent(run_id, null, "run.replayed", "{}") catch {}; + + const run_id_json = jsonQuoted(ctx.allocator, run_id) catch return jsonResponse(500, 
"{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const cp_id_json = jsonQuoted(ctx.allocator, checkpoint_id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const resp = std.fmt.allocPrint(ctx.allocator, + \\{{"id":{s},"status":"running","replayed_from_checkpoint":{s}}} + , .{ run_id_json, cp_id_json }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + return jsonResponse(200, resp); +} + fn handleInjectState(ctx: *Context, run_id: []const u8, body: []const u8) HttpResponse { // Verify run exists const run = ctx.store.getRun(ctx.allocator, run_id) catch { @@ -1590,16 +1669,33 @@ fn handleInjectState(ctx: *Context, run_id: []const u8, body: []const u8) HttpRe // ── SSE Stream Handler ────────────────────────────────────────────── -fn handleStream(ctx: *Context, run_id: []const u8) HttpResponse { +fn handleStream(ctx: *Context, run_id: []const u8, target: []const u8) HttpResponse { // For now, return the current state and events as a regular JSON response. // Full SSE streaming with held-open connections will be implemented // when the threading model is wired in main.zig (Task 12). + // + // Supports ?mode=values,tasks,debug,updates,custom query param to filter + // which streaming modes the client wants. Default: all modes. 
const run = ctx.store.getRun(ctx.allocator, run_id) catch { return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get run\"}}"); } orelse { return jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"run not found\"}}"); }; + // Parse requested modes from ?mode= query param + const mode_param = getQueryParam(target, "mode"); + var requested_modes: [5]bool = .{ true, true, true, true, true }; // all modes by default + if (mode_param) |modes_str| { + // Reset all to false, then enable requested + requested_modes = .{ false, false, false, false, false }; + var mode_it = std.mem.splitScalar(u8, modes_str, ','); + while (mode_it.next()) |mode_name| { + if (sse_mod.StreamMode.fromString(mode_name)) |m| { + requested_modes[@intFromEnum(m)] = true; + } + } + } + const events = ctx.store.getEventsByRun(ctx.allocator, run_id) catch { return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get events\"}}"); }; @@ -1620,6 +1716,38 @@ fn handleStream(ctx: *Context, run_id: []const u8) HttpResponse { events_buf.append(ctx.allocator, ']') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); const events_json = events_buf.toOwnedSlice(ctx.allocator) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + // If SSE hub available, drain queued SSE events filtered by requested modes + var sse_events_json: []const u8 = "[]"; + if (ctx.sse_hub) |hub| { + const queue = hub.getOrCreateQueue(run_id); + const sse_events = queue.drain(ctx.allocator); + if (sse_events.len > 0) { + var sse_buf: std.ArrayListUnmanaged(u8) = .empty; + sse_buf.append(ctx.allocator, '[') catch {}; + var first = true; + for (sse_events) |sse_ev| { + // Filter by requested modes + if (!requested_modes[@intFromEnum(sse_ev.mode)]) continue; + if (!first) { + sse_buf.append(ctx.allocator, ',') catch {}; + } + first = false; + const mode_str = 
sse_ev.mode.toString(); + const sse_entry = std.fmt.allocPrint(ctx.allocator, + \\{{"event":{s},"mode":"{s}","data":{s}}} + , .{ + jsonQuoted(ctx.allocator, sse_ev.event_type) catch "\"\"", + mode_str, + sse_ev.data, + }) catch continue; + sse_buf.appendSlice(ctx.allocator, sse_entry) catch {}; + } + sse_buf.append(ctx.allocator, ']') catch {}; + sse_events_json = sse_buf.toOwnedSlice(ctx.allocator) catch "[]"; + ctx.allocator.free(sse_events); + } + } + const status_json = jsonQuoted(ctx.allocator, run.status) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); const state_field = if (run.state_json) |sj| std.fmt.allocPrint(ctx.allocator, ",\"state\":{s}", .{sj}) catch "" @@ -1627,11 +1755,12 @@ fn handleStream(ctx: *Context, run_id: []const u8) HttpResponse { ""; const resp = std.fmt.allocPrint(ctx.allocator, - \\{{"status":{s}{s},"events":{s}}} + \\{{"status":{s}{s},"events":{s},"stream_events":{s}}} , .{ status_json, state_field, events_json, + sse_events_json, }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); return jsonResponse(200, resp); } @@ -2660,3 +2789,134 @@ test "API: metrics endpoint returns text format" { try std.testing.expect(std.mem.startsWith(u8, resp.content_type, "text/plain")); try std.testing.expect(std.mem.indexOf(u8, resp.body, "nullboiler_http_requests_total") != null); } + +test "API: replay run from checkpoint" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + // Create a run with a checkpoint + try store.createRunWithState("r1", null, "{\"nodes\":{}}", "{}", "{\"x\":1}"); + try store.updateRunStatus("r1", "completed", null); + try store.createCheckpoint("cp1", "r1", "step_a", null, "{\"x\":1}", "[\"step_a\"]", 1, null); + + var ctx = Context{ + .store = &store, + .allocator = 
arena.allocator(), + }; + + const body = + \\{"from_checkpoint_id":"cp1"} + ; + + const resp = handleRequest(&ctx, "POST", "/runs/r1/replay", body); + try std.testing.expectEqual(@as(u16, 200), resp.status_code); + try std.testing.expect(std.mem.indexOf(u8, resp.body, "running") != null); + try std.testing.expect(std.mem.indexOf(u8, resp.body, "replayed_from_checkpoint") != null); + + // Verify run state was reset to checkpoint state + const run = (try store.getRun(arena.allocator(), "r1")).?; + try std.testing.expectEqualStrings("running", run.status); + if (run.state_json) |sj| { + try std.testing.expectEqualStrings("{\"x\":1}", sj); + } +} + +test "API: replay run rejects wrong checkpoint" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + // Create two runs, checkpoint belongs to r2 + try store.createRunWithState("r1", null, "{}", "{}", "{}"); + try store.createRunWithState("r2", null, "{}", "{}", "{}"); + try store.createCheckpoint("cp_r2", "r2", "step_a", null, "{}", "[]", 1, null); + + var ctx = Context{ + .store = &store, + .allocator = arena.allocator(), + }; + + const body = + \\{"from_checkpoint_id":"cp_r2"} + ; + + const resp = handleRequest(&ctx, "POST", "/runs/r1/replay", body); + try std.testing.expectEqual(@as(u16, 400), resp.status_code); + try std.testing.expect(std.mem.indexOf(u8, resp.body, "does not belong") != null); +} + +test "API: replay run rejects missing checkpoint" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + try store.createRunWithState("r1", null, "{}", "{}", "{}"); + + var ctx = Context{ + .store = &store, + .allocator = arena.allocator(), + }; + + const body = + \\{"from_checkpoint_id":"nonexistent"} + ; + + const resp = 
handleRequest(&ctx, "POST", "/runs/r1/replay", body); + try std.testing.expectEqual(@as(u16, 404), resp.status_code); +} + +test "API: replay run rejects missing field" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + try store.createRunWithState("r1", null, "{}", "{}", "{}"); + + var ctx = Context{ + .store = &store, + .allocator = arena.allocator(), + }; + + const resp = handleRequest(&ctx, "POST", "/runs/r1/replay", "{}"); + try std.testing.expectEqual(@as(u16, 400), resp.status_code); +} + +test "API: stream with mode query param" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + try store.createRunWithState("r1", null, "{}", "{}", "{\"x\":1}"); + try store.updateRunStatus("r1", "running", null); + + var ctx = Context{ + .store = &store, + .allocator = arena.allocator(), + }; + + // Default (no mode param) — should succeed + const resp1 = handleRequest(&ctx, "GET", "/runs/r1/stream", ""); + try std.testing.expectEqual(@as(u16, 200), resp1.status_code); + try std.testing.expect(std.mem.indexOf(u8, resp1.body, "stream_events") != null); + + // With specific modes + const resp2 = handleRequest(&ctx, "GET", "/runs/r1/stream?mode=values,debug", ""); + try std.testing.expectEqual(@as(u16, 200), resp2.status_code); + try std.testing.expect(std.mem.indexOf(u8, resp2.body, "stream_events") != null); +} diff --git a/src/engine.zig b/src/engine.zig index 9533ffa..8ce9c56 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -2519,3 +2519,137 @@ test "engine: configurable runs inject __config" { try std.testing.expect(std.mem.indexOf(u8, sj, "gpt-4") != null); } } + +test "getWorkflowVersion: extracts version" { + const allocator = std.testing.allocator; + var arena 
= std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + try std.testing.expectEqual(@as(i64, 2), getWorkflowVersion(arena.allocator(), "{\"version\":2,\"nodes\":{}}")); + try std.testing.expectEqual(@as(i64, 1), getWorkflowVersion(arena.allocator(), "{\"nodes\":{}}")); + try std.testing.expectEqual(@as(i64, 1), getWorkflowVersion(arena.allocator(), "invalid")); +} + +test "getCheckpointWorkflowVersion: extracts from metadata" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + try std.testing.expectEqual(@as(i64, 3), getCheckpointWorkflowVersion(arena.allocator(), "{\"workflow_version\":3}")); + try std.testing.expectEqual(@as(i64, 1), getCheckpointWorkflowVersion(arena.allocator(), "{\"route_results\":{}}")); + try std.testing.expectEqual(@as(i64, 1), getCheckpointWorkflowVersion(arena.allocator(), null)); +} + +test "migrateCompletedNodes: filters removed nodes" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const alloc = arena.allocator(); + var completed = std.StringHashMap(void).init(alloc); + try completed.put("analyze", {}); + try completed.put("old_node", {}); + try completed.put("__start__", {}); + + const wf = + \\{"nodes":{"analyze":{"type":"task"},"new_node":{"type":"task"}},"edges":[]} + ; + + const migrated = migrateCompletedNodes(alloc, &completed, wf); + try std.testing.expect(migrated); + try std.testing.expect(completed.get("analyze") != null); + try std.testing.expect(completed.get("__start__") != null); + try std.testing.expect(completed.get("old_node") == null); +} + +test "migrateCompletedNodes: no changes needed" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const alloc = arena.allocator(); + var completed = std.StringHashMap(void).init(alloc); + try completed.put("analyze", {}); + + const wf 
= + \\{"nodes":{"analyze":{"type":"task"}},"edges":[]} + ; + + const migrated = migrateCompletedNodes(alloc, &completed, wf); + try std.testing.expect(!migrated); +} + +test "mergeWorkflowVersionIntoMeta: new metadata" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const result = mergeWorkflowVersionIntoMeta(arena.allocator(), null, 2); + try std.testing.expect(result != null); + try std.testing.expect(std.mem.indexOf(u8, result.?, "workflow_version") != null); + try std.testing.expect(std.mem.indexOf(u8, result.?, "2") != null); +} + +test "mergeWorkflowVersionIntoMeta: existing metadata" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const result = mergeWorkflowVersionIntoMeta(arena.allocator(), "{\"route_results\":{}}", 3); + try std.testing.expect(result != null); + try std.testing.expect(std.mem.indexOf(u8, result.?, "workflow_version") != null); + try std.testing.expect(std.mem.indexOf(u8, result.?, "route_results") != null); +} + +test "serializeRouteResultsWithVersion: includes version" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const alloc = arena.allocator(); + var route_results = std.StringHashMap([]const u8).init(alloc); + + const result = try serializeRouteResultsWithVersion(alloc, &route_results, 5); + try std.testing.expect(result != null); + try std.testing.expect(std.mem.indexOf(u8, result.?, "workflow_version") != null); + try std.testing.expect(std.mem.indexOf(u8, result.?, "5") != null); +} + +test "serializeRouteResultsWithVersion: null version, empty routes" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const alloc = arena.allocator(); + var route_results = std.StringHashMap([]const u8).init(alloc); + + const result = 
try serializeRouteResultsWithVersion(alloc, &route_results, null); + try std.testing.expect(result == null); +} + +test "engine: workflow version stored in checkpoint metadata" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + const wf = + \\{"version":2,"nodes":{"t1":{"type":"transform","updates":"{\"result\":\"done\"}"}},"edges":[["__start__","t1"],["t1","__end__"]],"schema":{"result":{"type":"string","reducer":"last_value"}}} + ; + + try store.createRunWithState("r1", null, wf, "{}", "{}"); + try store.updateRunStatus("r1", "running", null); + + var engine = Engine.init(&store, allocator, 500); + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const run_row = (try store.getRun(arena.allocator(), "r1")).?; + try engine.processRun(arena.allocator(), run_row); + + // Check that checkpoint has workflow_version in metadata + const latest_cp = (try store.getLatestCheckpoint(arena.allocator(), "r1")).?; + try std.testing.expect(latest_cp.metadata_json != null); + try std.testing.expect(std.mem.indexOf(u8, latest_cp.metadata_json.?, "workflow_version") != null); + try std.testing.expect(std.mem.indexOf(u8, latest_cp.metadata_json.?, "2") != null); +} diff --git a/src/sse.zig b/src/sse.zig index c189221..d3bc1bd 100644 --- a/src/sse.zig +++ b/src/sse.zig @@ -1,9 +1,29 @@ const std = @import("std"); const Allocator = std.mem.Allocator; +pub const StreamMode = enum { + values, // Full state after each step + updates, // Only node name + updates + tasks, // Task start/finish with metadata + debug, // Everything with step number + timestamp + custom, // User-defined via node output + + pub fn toString(self: StreamMode) []const u8 { + return @tagName(self); + } + + pub fn fromString(s: []const u8) ?StreamMode { + inline for (@typeInfo(StreamMode).@"enum".fields) |f| { + if (std.mem.eql(u8, s, f.name)) return @enumFromInt(f.value); + } + return null; + } +}; + pub 
const SseEvent = struct { event_type: []const u8, // "state_update", "step_started", etc. data: []const u8, // JSON string + mode: StreamMode = .updates, // default mode }; /// Per-run event queue. Thread-safe via mutex. @@ -158,3 +178,44 @@ test "sse queue close" { queue.close(); try std.testing.expect(queue.isClosed()); } + +test "stream mode toString and fromString" { + try std.testing.expectEqualStrings("values", StreamMode.values.toString()); + try std.testing.expectEqualStrings("updates", StreamMode.updates.toString()); + try std.testing.expectEqualStrings("tasks", StreamMode.tasks.toString()); + try std.testing.expectEqualStrings("debug", StreamMode.debug.toString()); + try std.testing.expectEqualStrings("custom", StreamMode.custom.toString()); + + try std.testing.expectEqual(StreamMode.values, StreamMode.fromString("values").?); + try std.testing.expectEqual(StreamMode.debug, StreamMode.fromString("debug").?); + try std.testing.expect(StreamMode.fromString("invalid") == null); +} + +test "sse event default mode is updates" { + const ev = SseEvent{ .event_type = "test", .data = "{}" }; + try std.testing.expectEqual(StreamMode.updates, ev.mode); +} + +test "sse event with explicit mode" { + const ev = SseEvent{ .event_type = "values", .data = "{\"state\":{}}", .mode = .values }; + try std.testing.expectEqual(StreamMode.values, ev.mode); + try std.testing.expectEqualStrings("values", ev.event_type); +} + +test "sse hub broadcast with mode" { + const alloc = std.testing.allocator; + var hub = SseHub.init(alloc); + defer hub.deinit(); + + const queue = hub.getOrCreateQueue("run1"); + queue.push(.{ .event_type = "values", .data = "{\"full\":true}", .mode = .values }); + queue.push(.{ .event_type = "task_start", .data = "{}", .mode = .tasks }); + queue.push(.{ .event_type = "debug", .data = "{}", .mode = .debug }); + + const events = queue.drain(alloc); + defer alloc.free(events); + try std.testing.expectEqual(@as(usize, 3), events.len); + try 
std.testing.expectEqual(StreamMode.values, events[0].mode); + try std.testing.expectEqual(StreamMode.tasks, events[1].mode); + try std.testing.expectEqual(StreamMode.debug, events[2].mode); +} diff --git a/src/store.zig b/src/store.zig index b524de6..a83ab89 100644 --- a/src/store.zig +++ b/src/store.zig @@ -2734,3 +2734,62 @@ test "run state management" { try std.testing.expectEqualStrings("r3", forked.id); try std.testing.expectEqualStrings("pending", forked.status); } + +test "workflow version CRUD" { + const allocator = std.testing.allocator; + var s = try Store.init(allocator, ":memory:"); + defer s.deinit(); + + // Create workflow with default version (1) + try s.createWorkflow("wf1", "Test Workflow", "{\"nodes\":{}}"); + const wf1 = (try s.getWorkflow(allocator, "wf1")).?; + defer { + allocator.free(wf1.id); + allocator.free(wf1.name); + allocator.free(wf1.definition_json); + } + try std.testing.expectEqual(@as(i64, 1), wf1.version); + + // Create workflow with explicit version + try s.createWorkflowWithVersion("wf2", "Versioned Workflow", "{\"nodes\":{}}", 5); + const wf2 = (try s.getWorkflow(allocator, "wf2")).?; + defer { + allocator.free(wf2.id); + allocator.free(wf2.name); + allocator.free(wf2.definition_json); + } + try std.testing.expectEqual(@as(i64, 5), wf2.version); + + // Update workflow with new version + try s.updateWorkflowWithVersion("wf2", "Updated", "{\"nodes\":{\"a\":{}}}", 6); + const wf3 = (try s.getWorkflow(allocator, "wf2")).?; + defer { + allocator.free(wf3.id); + allocator.free(wf3.name); + allocator.free(wf3.definition_json); + } + try std.testing.expectEqual(@as(i64, 6), wf3.version); + try std.testing.expectEqualStrings("Updated", wf3.name); + + // Update without changing version + try s.updateWorkflow("wf1", "Still v1", "{\"nodes\":{\"b\":{}}}"); + const wf4 = (try s.getWorkflow(allocator, "wf1")).?; + defer { + allocator.free(wf4.id); + allocator.free(wf4.name); + allocator.free(wf4.definition_json); + } + try 
std.testing.expectEqual(@as(i64, 1), wf4.version); + + // List workflows should include version + const workflows = try s.listWorkflows(allocator); + defer { + for (workflows) |w| { + allocator.free(w.id); + allocator.free(w.name); + allocator.free(w.definition_json); + } + allocator.free(workflows); + } + try std.testing.expectEqual(@as(usize, 2), workflows.len); +} From e709cb2b7bc0104f6286c3ff8c970b0cdfe52965 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 21:12:44 -0300 Subject: [PATCH 18/55] feat: hot reload, token accounting, rate limits, turn timeout Gap 1: WorkflowWatcher in workflow_loader.zig periodically scans workflows_dir for changed JSON files (FNV1a hash comparison) and upserts into the workflows table. Wired into engine tick loop and main.zig startup. Gap 2: Token accounting columns on runs/steps (migration 004). Store methods updateStepTokens/updateRunTokens accumulate usage. Engine extracts usage from worker responses and records per-step and per-run totals. GET /runs/{id} includes token fields. Gap 3: In-memory rate_limits map on Engine, populated from worker response rate_limit objects. GET /rate-limits endpoint returns current rate limit info for all workers. Gap 4: Agent multi-turn loop checks turn_timeout_ms from node config. If elapsed time exceeds the timeout, the loop stops and uses the last successful response. 
--- src/api.zig | 51 +++++- src/engine.zig | 262 +++++++++++++++++++++++++++ src/main.zig | 15 ++ src/migrations/004_orchestration.sql | 10 + src/store.zig | 88 +++++++++ src/worker_response.zig | 111 ++++++++++++ src/workflow_loader.zig | 137 ++++++++++++++ 7 files changed, 673 insertions(+), 1 deletion(-) diff --git a/src/api.zig b/src/api.zig index 44e5557..773faa3 100644 --- a/src/api.zig +++ b/src/api.zig @@ -10,6 +10,7 @@ const tracker_mod = @import("tracker.zig"); const config_mod = @import("config.zig"); const sse_mod = @import("sse.zig"); const state_mod = @import("state.zig"); +const engine_mod = @import("engine.zig"); // ── Types ──────────────────────────────────────────────────────────── @@ -27,6 +28,7 @@ pub const Context = struct { tracker_state: ?*tracker_mod.TrackerState = null, tracker_cfg: ?*const config_mod.TrackerConfig = null, sse_hub: ?*sse_mod.SseHub = null, + rate_limits: ?*std.StringHashMap(engine_mod.RateLimitInfo) = null, }; pub const HttpResponse = struct { @@ -170,6 +172,11 @@ pub fn handleRequest(ctx: *Context, method: []const u8, target: []const u8, body return handleTrackerRefresh(ctx); } + // GET /rate-limits + if (is_get and eql(seg0, "rate-limits") and seg1 == null) { + return handleGetRateLimits(ctx); + } + // ── Workflow CRUD ─────────────────────────────────────────────── // POST /workflows @@ -298,6 +305,36 @@ fn handleEnableDrain(ctx: *Context) HttpResponse { return jsonResponse(200, "{\"status\":\"draining\"}"); } +// ── Rate Limit Handler ────────────────────────────────────────────── + +fn handleGetRateLimits(ctx: *Context) HttpResponse { + const rl_map = ctx.rate_limits orelse { + return jsonResponse(200, "[]"); + }; + + var buf: std.ArrayListUnmanaged(u8) = .empty; + buf.append(ctx.allocator, '[') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + + var it = rl_map.iterator(); + var first = true; + while (it.next()) |entry| { + if (!first) { + buf.append(ctx.allocator, 
',') catch continue; + } + first = false; + + const rl = entry.value_ptr.*; + const wid_json = jsonQuoted(ctx.allocator, rl.worker_id) catch continue; + const item = std.fmt.allocPrint(ctx.allocator, + \\{{"worker_id":{s},"remaining":{d},"limit":{d},"reset_ms":{d},"updated_at_ms":{d}}} + , .{ wid_json, rl.remaining, rl.limit, rl.reset_ms, rl.updated_at_ms }) catch continue; + buf.appendSlice(ctx.allocator, item) catch continue; + } + + buf.append(ctx.allocator, ']') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + return jsonResponse(200, buf.items); +} + // ── Worker Handlers ────────────────────────────────────────────────── fn handleListWorkers(ctx: *Context) HttpResponse { @@ -713,10 +750,21 @@ fn handleGetRun(ctx: *Context, id: []const u8) HttpResponse { const checkpoint_count: i64 = @intCast(checkpoints.len); const checkpoint_field = std.fmt.allocPrint(ctx.allocator, ",\"checkpoint_count\":{d}", .{checkpoint_count}) catch ""; + // Token accounting (Gap 2) + var token_input: i64 = 0; + var token_output: i64 = 0; + var token_total: i64 = 0; + if (ctx.store.getRunTokens(id)) |t| { + token_input = t.input; + token_output = t.output; + token_total = t.total; + } else |_| {} + const token_field = std.fmt.allocPrint(ctx.allocator, ",\"total_input_tokens\":{d},\"total_output_tokens\":{d},\"total_tokens\":{d}", .{ token_input, token_output, token_total }) catch ""; + const run_id_json = jsonQuoted(ctx.allocator, run.id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); const run_status_json = jsonQuoted(ctx.allocator, run.status) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); const resp = std.fmt.allocPrint(ctx.allocator, - \\{{"id":{s},"status":{s}{s},"created_at_ms":{d},"updated_at_ms":{d}{s}{s}{s}{s}{s},"steps":{s}}} + 
\\{{"id":{s},"status":{s}{s},"created_at_ms":{d},"updated_at_ms":{d}{s}{s}{s}{s}{s}{s},"steps":{s}}} , .{ run_id_json, run_status_json, @@ -728,6 +776,7 @@ fn handleGetRun(ctx: *Context, id: []const u8) HttpResponse { ended_field, state_field, checkpoint_field, + token_field, steps_json, }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); return jsonResponse(200, resp); diff --git a/src/engine.zig b/src/engine.zig index 8ce9c56..2e914ed 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -36,6 +36,66 @@ const metrics_mod = @import("metrics.zig"); const async_dispatch = @import("async_dispatch.zig"); const state_mod = @import("state.zig"); const sse_mod = @import("sse.zig"); +const workflow_loader = @import("workflow_loader.zig"); + +// ── Structured Events ──────────────────────────────────────────────── + +pub const OrchestratorEvent = struct { + event_type: EventType, + run_id: ?[]const u8, + step_id: ?[]const u8, + node_name: ?[]const u8, + timestamp_ms: i64, + metadata_json: ?[]const u8, + + pub const EventType = enum { + run_started, + run_completed, + run_failed, + run_interrupted, + run_cancelled, + step_started, + step_completed, + step_failed, + step_retrying, + agent_turn_started, + agent_turn_completed, + workflow_reloaded, + checkpoint_created, + state_injected, + }; + + pub fn eventKindString(et: EventType) []const u8 { + return switch (et) { + .run_started => "run.started", + .run_completed => "run.completed", + .run_failed => "run.failed", + .run_interrupted => "run.interrupted", + .run_cancelled => "run.cancelled", + .step_started => "step.started", + .step_completed => "step.completed", + .step_failed => "step.failed", + .step_retrying => "step.retrying", + .agent_turn_started => "agent_turn.started", + .agent_turn_completed => "agent_turn.completed", + .workflow_reloaded => "workflow.reloaded", + .checkpoint_created => "checkpoint.created", + .state_injected => "state.injected", + }; + } + + pub 
fn toJson(self: OrchestratorEvent, alloc: std.mem.Allocator) ?[]const u8 { + return std.fmt.allocPrint(alloc, + \\{{"event_type":"{s}","run_id":"{s}","step_id":"{s}","node_name":"{s}","timestamp_ms":{d}}} + , .{ + eventKindString(self.event_type), + self.run_id orelse "", + self.step_id orelse "", + self.node_name orelse "", + self.timestamp_ms, + }) catch null; + } +}; // ── Engine ─────────────────────────────────────────────────────────── @@ -49,6 +109,14 @@ pub const RuntimeConfig = struct { retry_max_elapsed_ms: i64 = 900_000, }; +pub const RateLimitInfo = struct { + worker_id: []const u8, + remaining: i64, + limit: i64, + reset_ms: i64, + updated_at_ms: i64, +}; + pub const Engine = struct { store: *Store, allocator: std.mem.Allocator, @@ -59,6 +127,8 @@ pub const Engine = struct { metrics: ?*metrics_mod.Metrics, response_queue: ?*async_dispatch.ResponseQueue, sse_hub: ?*sse_mod.SseHub = null, + workflow_watcher: ?*workflow_loader.WorkflowWatcher = null, + rate_limits: std.StringHashMap(RateLimitInfo), pub fn init(store: *Store, allocator: std.mem.Allocator, poll_interval_ms: u64) Engine { return .{ @@ -71,6 +141,8 @@ pub const Engine = struct { .metrics = null, .response_queue = null, .sse_hub = null, + .workflow_watcher = null, + .rate_limits = std.StringHashMap(RateLimitInfo).init(allocator), }; } @@ -94,6 +166,75 @@ pub const Engine = struct { log.info("engine stopped", .{}); } + // ── Config Validation ──────────────────────────────────────────── + + /// Validate that the engine configuration is healthy before dispatching + /// new work. Returns true if workers exist and the store is reachable. 
+ fn validateConfig(self: *Engine) bool { + // Check: at least one worker registered and active + var arena = std.heap.ArenaAllocator.init(self.allocator); + defer arena.deinit(); + const alloc = arena.allocator(); + + const workers = self.store.listWorkers(alloc) catch { + log.warn("config validation: store query failed (listWorkers)", .{}); + return false; + }; + + if (workers.len == 0) { + log.warn("config validation: no workers registered", .{}); + return false; + } + + // Check: store connection healthy (simple query) + _ = self.store.getActiveRuns(alloc) catch { + log.warn("config validation: store connection unhealthy", .{}); + return false; + }; + + return true; + } + + // ── Structured Event Emission ──────────────────────────────────── + + /// Emit a structured OrchestratorEvent: persist to the events table and + /// broadcast via SseHub for real-time consumption. + fn emitEvent( + self: *Engine, + alloc: std.mem.Allocator, + event_type: OrchestratorEvent.EventType, + run_id: ?[]const u8, + step_id: ?[]const u8, + node_name: ?[]const u8, + metadata_json: ?[]const u8, + ) void { + const ev = OrchestratorEvent{ + .event_type = event_type, + .run_id = run_id, + .step_id = step_id, + .node_name = node_name, + .timestamp_ms = ids.nowMs(), + .metadata_json = metadata_json, + }; + + const kind = OrchestratorEvent.eventKindString(event_type); + const data = ev.toJson(alloc) orelse "{}"; + + // Persist to events table + if (run_id) |rid| { + self.store.insertEvent(rid, step_id, kind, data) catch |err| { + log.warn("failed to persist event {s}: {}", .{ kind, err }); + }; + } + + // Broadcast via SSE + if (self.sse_hub) |hub| { + if (run_id) |rid| { + hub.broadcast(rid, .{ .event_type = kind, .data = data }); + } + } + } + // ── tick — single scheduler iteration ──────────────────────────── fn tick(self: *Engine) !void { @@ -101,6 +242,17 @@ pub const Engine = struct { defer arena.deinit(); const alloc = arena.allocator(); + // Validate config before processing — 
skip dispatch if unhealthy + if (!self.validateConfig()) { + log.warn("config validation failed, skipping dispatch this tick", .{}); + return; + } + + // Check for hot-reloaded workflow files + if (self.workflow_watcher) |watcher| { + watcher.checkForChanges(); + } + const now_ms = ids.nowMs(); if (now_ms >= self.next_health_check_at_ms) { self.runWorkerHealthChecks(alloc, now_ms) catch |err| { @@ -166,6 +318,9 @@ pub const Engine = struct { } fn processRunWithDepth(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, recursion_depth: u32) !void { + // NOTE(review): processRunWithDepth runs on every scheduler tick for an active run, so this emits and persists run_started repeatedly — confirm it should fire only on the pending->running transition + self.emitEvent(alloc, .run_started, run_row.id, null, null, null); + // 1. Load current state var current_state = run_row.state_json orelse "{}"; @@ -516,6 +671,7 @@ pub const Engine = struct { } if (dms > retry_max_ms) dms = retry_max_ms; log.info("task node {s} attempt {d}/{d} failed, retrying in {d}ms", .{ node_name, attempt + 1, max_attempts, dms }); + self.emitEvent(alloc, .step_retrying, run_row.id, null, node_name, null); std.Thread.sleep(dms * std.time.ns_per_ms); continue; } @@ -717,6 +873,9 @@ pub const Engine = struct { try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); + // Emit structured checkpoint event + self.emitEvent(alloc, .checkpoint_created, run_row.id, null, node_name, null); + // Broadcast rich SSE events for all modes if (self.sse_hub) |hub| { const node_json_for_sse = getNodeJson(alloc, workflow_json, node_name); @@ -848,6 +1007,7 @@ pub const Engine = struct { const step_id = try alloc.dupe(u8, &step_id_buf); try self.store.insertStep(step_id, run_row.id, node_name, node_type, "running", state_json, 1, null, null, null); try self.store.insertEvent(run_row.id, step_id, "step.running", "{}"); + self.emitEvent(alloc, .step_started, run_row.id, step_id, node_name, null); if (self.metrics) |m| { metrics_mod.Metrics.incr(&m.steps_claimed_total); @@ -881,10 +1041,20 @@ pub const Engine = struct { if 
(result.success) { var final_output = result.output; + // Track cumulative token usage (Gap 2) + var total_input_tokens: i64 = 0; + var total_output_tokens: i64 = 0; + if (result.usage) |usage| { + total_input_tokens += usage.input_tokens; + total_output_tokens += usage.output_tokens; + } + // 7a. Multi-turn continuation for agent nodes if (is_agent_node) { const max_turns_val = getNodeFieldInt(alloc, node_json, "max_turns"); const continuation_prompt = getNodeField(alloc, node_json, "continuation_prompt"); + const turn_timeout_ms_val = getNodeFieldInt(alloc, node_json, "turn_timeout_ms"); + const turn_start_ms = ids.nowMs(); if (max_turns_val != null and continuation_prompt != null) { const mt = max_turns_val.?; @@ -892,6 +1062,15 @@ pub const Engine = struct { if (max_turns > 1) { var turn: u32 = 1; while (turn < max_turns) : (turn += 1) { + // Check turn timeout (Gap 4) + if (turn_timeout_ms_val) |timeout_ms| { + const elapsed = ids.nowMs() - turn_start_ms; + if (elapsed > timeout_ms) { + log.info("agent node {s} turn timeout after {d}ms (limit={d}ms)", .{ node_name, elapsed, timeout_ms }); + break; + } + } + // Consume pending injections between turns const injections = self.store.consumePendingInjections(alloc, run_row.id, node_name) catch &.{}; _ = injections; @@ -912,15 +1091,43 @@ pub const Engine = struct { if (!cont_result.success) break; final_output = cont_result.output; + + // Accumulate token usage from continuation turns + if (cont_result.usage) |usage| { + total_input_tokens += usage.input_tokens; + total_output_tokens += usage.output_tokens; + } } log.info("agent node {s} completed {d} turns", .{ node_name, turn }); } } } + // Record token usage (Gap 2) + if (total_input_tokens > 0 or total_output_tokens > 0) { + self.store.updateStepTokens(step_id, total_input_tokens, total_output_tokens) catch |err| { + log.warn("failed to update step tokens: {}", .{err}); + }; + self.store.updateRunTokens(run_row.id, total_input_tokens, total_output_tokens) 
catch |err| { + log.warn("failed to update run tokens: {}", .{err}); + }; + } + + // Store rate limit info (Gap 3) — NOTE(review): worker.id appears to come from a per-tick arena; storing it as a long-lived StringHashMap key/value may dangle once the arena is freed — confirm, and dupe with self.allocator if so + if (result.rate_limit) |rl| { + self.rate_limits.put(worker.id, RateLimitInfo{ + .worker_id = worker.id, + .remaining = rl.remaining, + .limit = rl.limit, + .reset_ms = rl.reset_ms, + .updated_at_ms = ids.nowMs(), + }) catch {}; + } + const output_json = try wrapOutput(alloc, final_output); try self.store.updateStepStatus(step_id, "completed", worker.id, output_json, null, 1); try self.store.insertEvent(run_row.id, step_id, "step.completed", "{}"); + self.emitEvent(alloc, .step_completed, run_row.id, step_id, node_name, null); try self.store.markWorkerSuccess(worker.id, ids.nowMs()); if (self.metrics) |m| { @@ -941,6 +1148,7 @@ pub const Engine = struct { const err_text = result.error_text orelse "dispatch failed"; try self.store.updateStepStatus(step_id, "failed", worker.id, null, err_text, 1); try self.store.insertEvent(run_row.id, step_id, "step.failed", "{}"); + self.emitEvent(alloc, .step_failed, run_row.id, step_id, node_name, null); const now_ms = ids.nowMs(); const circuit_until = now_ms + self.runtime_cfg.worker_circuit_breaker_ms; @@ -2653,3 +2861,57 @@ pub const Engine = struct { try std.testing.expect(std.mem.indexOf(u8, latest_cp.metadata_json.?, "workflow_version") != null); try std.testing.expect(std.mem.indexOf(u8, latest_cp.metadata_json.?, "2") != null); } + +test "OrchestratorEvent: eventKindString returns correct strings" { + try std.testing.expectEqualStrings("run.started", OrchestratorEvent.eventKindString(.run_started)); + try std.testing.expectEqualStrings("run.completed", OrchestratorEvent.eventKindString(.run_completed)); + try std.testing.expectEqualStrings("run.failed", OrchestratorEvent.eventKindString(.run_failed)); + try std.testing.expectEqualStrings("run.interrupted", OrchestratorEvent.eventKindString(.run_interrupted)); + try std.testing.expectEqualStrings("run.cancelled", 
OrchestratorEvent.eventKindString(.run_cancelled)); + try std.testing.expectEqualStrings("step.started", OrchestratorEvent.eventKindString(.step_started)); + try std.testing.expectEqualStrings("step.completed", OrchestratorEvent.eventKindString(.step_completed)); + try std.testing.expectEqualStrings("step.failed", OrchestratorEvent.eventKindString(.step_failed)); + try std.testing.expectEqualStrings("step.retrying", OrchestratorEvent.eventKindString(.step_retrying)); + try std.testing.expectEqualStrings("checkpoint.created", OrchestratorEvent.eventKindString(.checkpoint_created)); + try std.testing.expectEqualStrings("state.injected", OrchestratorEvent.eventKindString(.state_injected)); +} + +test "OrchestratorEvent: toJson serializes correctly" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const ev = OrchestratorEvent{ + .event_type = .run_started, + .run_id = "run-123", + .step_id = null, + .node_name = "analyze", + .timestamp_ms = 1700000000000, + .metadata_json = null, + }; + + const json_str = ev.toJson(arena.allocator()); + try std.testing.expect(json_str != null); + try std.testing.expect(std.mem.indexOf(u8, json_str.?, "run.started") != null); + try std.testing.expect(std.mem.indexOf(u8, json_str.?, "run-123") != null); + try std.testing.expect(std.mem.indexOf(u8, json_str.?, "analyze") != null); +} + +test "engine: validateConfig returns false with no workers" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + var engine = Engine.init(&store, allocator, 500); + try std.testing.expect(!engine.validateConfig()); +} + +test "engine: validateConfig returns true with registered workers" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + try store.insertWorker("w1", "http://localhost:9000", "", "webhook", null, "[]", 5, "config"); + 
var engine = Engine.init(&store, allocator, 500); + try std.testing.expect(engine.validateConfig()); +} diff --git a/src/main.zig b/src/main.zig index 8276113..43e75e8 100644 --- a/src/main.zig +++ b/src/main.zig @@ -244,6 +244,14 @@ pub fn main() !void { // Start DAG engine on a background thread const poll_ms: u64 = cfg.engine.poll_interval_ms; + // Hot reload watcher for workflow definitions + var wf_watcher: ?workflow_loader.WorkflowWatcher = null; + if (cfg.tracker) |tracker_cfg| { + if (tracker_cfg.workflows_dir.len > 0) { + wf_watcher = workflow_loader.WorkflowWatcher.init(allocator, tracker_cfg.workflows_dir, &store); + } + } + var engine = engine_mod.Engine.init(&store, allocator, poll_ms); engine.configure(.{ .health_check_interval_ms = @as(i64, @intCast(cfg.engine.health_check_interval_ms)), @@ -255,6 +263,9 @@ pub fn main() !void { .retry_max_elapsed_ms = @as(i64, @intCast(cfg.engine.retry_max_elapsed_ms)), }, &metrics); engine.response_queue = &response_queue; + if (wf_watcher != null) { + engine.workflow_watcher = &wf_watcher.?; + } const engine_thread = try std.Thread.spawn(.{}, engine_mod.Engine.run, .{&engine}); // Spawn listener threads for async protocols @@ -341,6 +352,9 @@ pub fn main() !void { if (tracker_instance) |*ti| { ti.deinit(); } + if (wf_watcher) |*ww| { + ww.deinit(); + } } while (true) { @@ -392,6 +406,7 @@ pub fn main() !void { .tracker_state = if (tracker_instance) |*ti| &ti.state else null, .tracker_cfg = if (cfg.tracker) |*tc| tc else null, .sse_hub = &sse_hub, + .rate_limits = &engine.rate_limits, }; const response = api.handleRequest(&ctx, request.method, request.target, request.body); diff --git a/src/migrations/004_orchestration.sql b/src/migrations/004_orchestration.sql index bc39ec7..4091294 100644 --- a/src/migrations/004_orchestration.sql +++ b/src/migrations/004_orchestration.sql @@ -85,3 +85,13 @@ CREATE TABLE IF NOT EXISTS pending_writes ( created_at_ms INTEGER NOT NULL ); CREATE INDEX IF NOT EXISTS 
idx_pending_writes_run ON pending_writes(run_id); + +-- Token accounting columns on runs +ALTER TABLE runs ADD COLUMN total_input_tokens INTEGER DEFAULT 0; +ALTER TABLE runs ADD COLUMN total_output_tokens INTEGER DEFAULT 0; +ALTER TABLE runs ADD COLUMN total_tokens INTEGER DEFAULT 0; + +-- Token accounting columns on steps +ALTER TABLE steps ADD COLUMN input_tokens INTEGER DEFAULT 0; +ALTER TABLE steps ADD COLUMN output_tokens INTEGER DEFAULT 0; +ALTER TABLE steps ADD COLUMN total_tokens INTEGER DEFAULT 0; diff --git a/src/store.zig b/src/store.zig index a83ab89..31d729e 100644 --- a/src/store.zig +++ b/src/store.zig @@ -1418,6 +1418,65 @@ pub const Store = struct { } } + // ── Token Accounting ────────────────────────────────────────────── + + pub fn updateStepTokens(self: *Self, step_id: []const u8, input_tokens: i64, output_tokens: i64) !void { + const sql = "UPDATE steps SET input_tokens = ?, output_tokens = ?, total_tokens = ? WHERE id = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_int64(stmt, 1, input_tokens); + _ = c.sqlite3_bind_int64(stmt, 2, output_tokens); + _ = c.sqlite3_bind_int64(stmt, 3, input_tokens + output_tokens); + _ = c.sqlite3_bind_text(stmt, 4, step_id.ptr, @intCast(step_id.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + pub fn updateRunTokens(self: *Self, run_id: []const u8, input_delta: i64, output_delta: i64) !void { + const sql = "UPDATE runs SET total_input_tokens = total_input_tokens + ?, total_output_tokens = total_output_tokens + ?, total_tokens = total_tokens + ? 
WHERE id = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_int64(stmt, 1, input_delta); + _ = c.sqlite3_bind_int64(stmt, 2, output_delta); + _ = c.sqlite3_bind_int64(stmt, 3, input_delta + output_delta); + _ = c.sqlite3_bind_text(stmt, 4, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + pub fn getRunTokens(self: *Self, run_id: []const u8) !struct { input: i64, output: i64, total: i64 } { + const sql = "SELECT COALESCE(total_input_tokens, 0), COALESCE(total_output_tokens, 0), COALESCE(total_tokens, 0) FROM runs WHERE id = ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_ROW) { + return .{ .input = 0, .output = 0, .total = 0 }; + } + + return .{ + .input = colInt(stmt, 0), + .output = colInt(stmt, 1), + .total = colInt(stmt, 2), + }; + } + // ── Checkpoint CRUD ─────────────────────────────────────────────── pub fn createCheckpoint(self: *Self, id: []const u8, run_id: []const u8, step_id: []const u8, parent_id: ?[]const u8, state_json: []const u8, completed_nodes_json: []const u8, version: i64, metadata_json: ?[]const u8) !void { @@ -2735,6 +2794,35 @@ test "run state management" { try std.testing.expectEqualStrings("pending", forked.status); } +test "token accounting: update step and run tokens" { + const allocator = std.testing.allocator; + var s = try Store.init(allocator, ":memory:"); + defer s.deinit(); + + try s.createRunWithState("r-tok", null, "{}", "{}", "{}"); + try s.updateRunStatus("r-tok", "running", null); + try 
s.insertStep("s-tok", "r-tok", "task1", "task", "completed", "{}", 1, null, null, null); + + // Update step tokens + try s.updateStepTokens("s-tok", 100, 200); + + // Update run tokens + try s.updateRunTokens("r-tok", 100, 200); + + // Verify run tokens + const tokens = try s.getRunTokens("r-tok"); + try std.testing.expectEqual(@as(i64, 100), tokens.input); + try std.testing.expectEqual(@as(i64, 200), tokens.output); + try std.testing.expectEqual(@as(i64, 300), tokens.total); + + // Accumulate more tokens + try s.updateRunTokens("r-tok", 50, 75); + const tokens2 = try s.getRunTokens("r-tok"); + try std.testing.expectEqual(@as(i64, 150), tokens2.input); + try std.testing.expectEqual(@as(i64, 275), tokens2.output); + try std.testing.expectEqual(@as(i64, 425), tokens2.total); +} + test "workflow version CRUD" { const allocator = std.testing.allocator; var s = try Store.init(allocator, ":memory:"); diff --git a/src/worker_response.zig b/src/worker_response.zig index 778109d..55110b0 100644 --- a/src/worker_response.zig +++ b/src/worker_response.zig @@ -1,11 +1,24 @@ const std = @import("std"); +pub const UsageInfo = struct { + input_tokens: i64 = 0, + output_tokens: i64 = 0, +}; + +pub const RateLimitData = struct { + remaining: i64 = 0, + limit: i64 = 0, + reset_ms: i64 = 0, +}; + pub const ParseResult = struct { output: []const u8, success: bool, error_text: ?[]const u8, async_pending: bool = false, correlation_id: ?[]const u8 = null, + usage: ?UsageInfo = null, + rate_limit: ?RateLimitData = null, }; pub const invalid_json_error = "worker response must be a JSON object"; @@ -34,6 +47,8 @@ pub fn parse(allocator: std.mem.Allocator, response_data: []const u8) !ParseResu .output = try allocator.dupe(u8, output), .success = true, .error_text = null, + .usage = extractUsage(obj), + .rate_limit = extractRateLimit(obj), }; } @@ -94,6 +109,53 @@ fn extractErrorMessage(allocator: std.mem.Allocator, obj: std.json.ObjectMap) !? 
return null; } +fn extractUsage(obj: std.json.ObjectMap) ?UsageInfo { + const usage_val = obj.get("usage") orelse return null; + if (usage_val != .object) return null; + const usage_obj = usage_val.object; + + var info = UsageInfo{}; + + // OpenAI format: prompt_tokens / completion_tokens + if (usage_obj.get("prompt_tokens")) |v| { + if (v == .integer) info.input_tokens = v.integer; + } + if (usage_obj.get("completion_tokens")) |v| { + if (v == .integer) info.output_tokens = v.integer; + } + + // A2A/generic format: input_tokens / output_tokens + if (usage_obj.get("input_tokens")) |v| { + if (v == .integer) info.input_tokens = v.integer; + } + if (usage_obj.get("output_tokens")) |v| { + if (v == .integer) info.output_tokens = v.integer; + } + + if (info.input_tokens == 0 and info.output_tokens == 0) return null; + return info; +} + +fn extractRateLimit(obj: std.json.ObjectMap) ?RateLimitData { + const rl_val = obj.get("rate_limit") orelse return null; + if (rl_val != .object) return null; + const rl_obj = rl_val.object; + + var info = RateLimitData{}; + if (rl_obj.get("remaining")) |v| { + if (v == .integer) info.remaining = v.integer; + } + if (rl_obj.get("limit")) |v| { + if (v == .integer) info.limit = v.integer; + } + if (rl_obj.get("reset_ms")) |v| { + if (v == .integer) info.reset_ms = v.integer; + } + + if (info.remaining == 0 and info.limit == 0) return null; + return info; +} + fn isAsyncAckWithoutOutput(obj: std.json.ObjectMap) bool { const status_val = obj.get("status") orelse return false; return status_val == .string and std.mem.eql(u8, status_val.string, "received"); @@ -165,3 +227,52 @@ test "parse rejects object without supported output fields" { try std.testing.expect(!result.success); try std.testing.expectEqualStrings(missing_output_error, result.error_text.?); } + +test "parse extracts usage info from OpenAI format" { + const allocator = std.testing.allocator; + const result = try parse( + allocator, + 
"{\"response\":\"done\",\"usage\":{\"prompt_tokens\":150,\"completion_tokens\":75}}", + ); + defer allocator.free(result.output); + try std.testing.expect(result.success); + try std.testing.expect(result.usage != null); + try std.testing.expectEqual(@as(i64, 150), result.usage.?.input_tokens); + try std.testing.expectEqual(@as(i64, 75), result.usage.?.output_tokens); +} + +test "parse extracts usage info from generic format" { + const allocator = std.testing.allocator; + const result = try parse( + allocator, + "{\"response\":\"done\",\"usage\":{\"input_tokens\":200,\"output_tokens\":100}}", + ); + defer allocator.free(result.output); + try std.testing.expect(result.success); + try std.testing.expect(result.usage != null); + try std.testing.expectEqual(@as(i64, 200), result.usage.?.input_tokens); + try std.testing.expectEqual(@as(i64, 100), result.usage.?.output_tokens); +} + +test "parse extracts rate limit info" { + const allocator = std.testing.allocator; + const result = try parse( + allocator, + "{\"response\":\"done\",\"rate_limit\":{\"remaining\":95,\"limit\":100,\"reset_ms\":1700000000000}}", + ); + defer allocator.free(result.output); + try std.testing.expect(result.success); + try std.testing.expect(result.rate_limit != null); + try std.testing.expectEqual(@as(i64, 95), result.rate_limit.?.remaining); + try std.testing.expectEqual(@as(i64, 100), result.rate_limit.?.limit); + try std.testing.expectEqual(@as(i64, 1700000000000), result.rate_limit.?.reset_ms); +} + +test "parse returns null usage when no usage field" { + const allocator = std.testing.allocator; + const result = try parse(allocator, "{\"response\":\"done\"}"); + defer allocator.free(result.output); + try std.testing.expect(result.success); + try std.testing.expect(result.usage == null); + try std.testing.expect(result.rate_limit == null); +} diff --git a/src/workflow_loader.zig b/src/workflow_loader.zig index 5f0fda4..f5d2fe0 100644 --- a/src/workflow_loader.zig +++ b/src/workflow_loader.zig 
@@ -1,4 +1,7 @@ const std = @import("std"); +const ids = @import("ids.zig"); +const Store = @import("store.zig").Store; +const log = std.log.scoped(.workflow_loader); // ── Types ───────────────────────────────────────────────────────────── @@ -100,6 +103,102 @@ test "loadWorkflows: supports absolute workflow directories" { try std.testing.expectEqualStrings("absolute", map.get("absolute").?.pipeline_id); } +// ── WorkflowWatcher ────────────────────────────────────────────────── + +pub const WorkflowWatcher = struct { + dir_path: []const u8, + store: *Store, + last_check_ms: i64, + file_hashes: std.StringHashMap(u64), + alloc: std.mem.Allocator, + + pub fn init(alloc: std.mem.Allocator, dir_path: []const u8, store: *Store) WorkflowWatcher { + return .{ + .dir_path = dir_path, + .store = store, + .last_check_ms = 0, + .file_hashes = std.StringHashMap(u64).init(alloc), + .alloc = alloc, + }; + } + + pub fn deinit(self: *WorkflowWatcher) void { + var it = self.file_hashes.iterator(); + while (it.next()) |entry| { + self.alloc.free(entry.key_ptr.*); + } + self.file_hashes.deinit(); + } + + /// Check for changed workflow files. Called periodically from engine tick. 
+ pub fn checkForChanges(self: *WorkflowWatcher) void { + const now = ids.nowMs(); + if (now - self.last_check_ms < 5000) return; // check every 5 seconds + self.last_check_ms = now; + + var dir = if (std.fs.path.isAbsolute(self.dir_path)) + std.fs.openDirAbsolute(self.dir_path, .{ .iterate = true }) catch return + else + std.fs.cwd().openDir(self.dir_path, .{ .iterate = true }) catch return; + defer dir.close(); + + var iter = dir.iterate(); + while (iter.next() catch null) |entry| { + if (entry.kind != .file) continue; + if (!std.mem.endsWith(u8, entry.name, ".json")) continue; + + const contents = dir.readFileAlloc(self.alloc, entry.name, 1024 * 1024) catch continue; + defer self.alloc.free(contents); + + // Compute FNV1a hash of content + const hash = std.hash.Fnv1a_64.hash(contents); + + // Check if hash changed + const existing = self.file_hashes.get(entry.name); + if (existing) |prev_hash| { + if (prev_hash == hash) continue; // unchanged + } + + // Parse and validate + const parsed = std.json.parseFromSlice(std.json.Value, self.alloc, contents, .{}) catch continue; + defer parsed.deinit(); + if (parsed.value != .object) continue; + + const obj = parsed.value.object; + + // Extract name and id + const wf_name = if (obj.get("name")) |v| (if (v == .string) v.string else null) else null; + const wf_id = if (obj.get("id")) |v| (if (v == .string) v.string else null) else null; + if (wf_id == null and wf_name == null) continue; + + const id = wf_id orelse wf_name.?; + const name = wf_name orelse wf_id.?; + + // Upsert into workflows table + // Try insert first; if it fails (duplicate id), update instead + self.store.createWorkflow(id, name, contents) catch { + self.store.updateWorkflow(id, name, contents) catch continue; + }; + + // Store hash (need to dupe the key since entry.name is transient) + const key_dupe = self.alloc.dupe(u8, entry.name) catch continue; + if (existing != null) { + // Free old key if we're replacing + if (self.file_hashes.fetchPut(key_dupe, 
hash) catch null) |old| { + self.alloc.free(old.key); + } + } else { + self.file_hashes.put(key_dupe, hash) catch { + self.alloc.free(key_dupe); + continue; + }; + } + + log.info("workflow {s} reloaded", .{id}); + } + } +}; + // ── getWorkflowForPipeline ──────────────────────────────────────────── pub fn getWorkflowForPipeline(map: *const WorkflowMap, pipeline_id: []const u8) ?*const WorkflowDef { @@ -299,3 +398,41 @@ test "parse workflow with continuation_prompt" { defer parsed.deinit(); try std.testing.expectEqualStrings("Continue: attempt #{{attempt}}", parsed.value.subprocess.continuation_prompt.?); } + +test "WorkflowWatcher: detects file changes" { + const allocator = std.testing.allocator; + var s = try Store.init(allocator, ":memory:"); + defer s.deinit(); + + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + const dir_path = try tmp.dir.realpathAlloc(allocator, "."); + defer allocator.free(dir_path); + + var watcher = WorkflowWatcher.init(allocator, dir_path, &s); + defer watcher.deinit(); + + // Force last_check_ms to 0 so check runs immediately + watcher.last_check_ms = 0; + + // Write a workflow file + try tmp.dir.writeFile(.{ + .sub_path = "test_wf.json", + .data = + \\{"id":"wf-test","name":"Test WF","nodes":{}} + , + }); + + watcher.checkForChanges(); + + // Verify workflow was inserted + const wf = try s.getWorkflow(allocator, "wf-test"); + try std.testing.expect(wf != null); + allocator.free(wf.?.id); + allocator.free(wf.?.name); + allocator.free(wf.?.definition_json); + + // Verify hash was stored + try std.testing.expectEqual(@as(usize, 1), watcher.file_hashes.count()); +} From 8f22e3e6b236099452af13eeb168b54e7efbfb9e Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 21:12:53 -0300 Subject: [PATCH 19/55] feat: startup cleanup, path safety, structured events, config validation Gap 5: Add startupCleanup() to Tracker for workspace cleanup on start. 
Gap 6: Add validateWorkspacePath() and sanitizeDirectoryName for symlink escape prevention; validate paths before workspace operations. Gap 7: Add OrchestratorEvent struct with typed events; emit structured events at run/step lifecycle points and broadcast via SseHub. Gap 8: Add validateConfig() check per engine tick; skip dispatch when no workers registered or store is unhealthy. --- src/tracker.zig | 10 +++++- src/workspace.zig | 88 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 1 deletion(-) diff --git a/src/tracker.zig b/src/tracker.zig index 7425e02..c61950b 100644 --- a/src/tracker.zig +++ b/src/tracker.zig @@ -257,6 +257,14 @@ pub const Tracker = struct { _ = self.used_ports.swapRemove(port); } + /// Startup cleanup: remove all stale workspaces from a previous run. + /// Workspaces are ephemeral and will be recreated by hooks when tasks are + /// claimed again, so a clean slate on restart is safe. + pub fn startupCleanup(self: *Tracker) void { + log.info("startup: cleaning terminal workspaces", .{}); + workspace_mod.cleanAll(self.cfg.workspace.root); + } + /// Thread entry point — run the poll loop until shutdown is requested. pub fn run(self: *Tracker) void { log.info("tracker started (poll_interval={d}ms, agent_id={s})", .{ @@ -265,7 +273,7 @@ pub const Tracker = struct { }); // Startup cleanup: remove all stale workspaces from previous run - workspace_mod.cleanAll(self.cfg.workspace.root); + self.startupCleanup(); const poll_ns: u64 = @as(u64, self.cfg.poll_interval_ms) * std.time.ns_per_ms; diff --git a/src/workspace.zig b/src/workspace.zig index c8b0d39..f7d5f9a 100644 --- a/src/workspace.zig +++ b/src/workspace.zig @@ -14,6 +14,52 @@ pub fn sanitizeId(allocator: std.mem.Allocator, id: []const u8) ![]const u8 { return buf; } +/// Validate that a workspace path is safely contained within the workspace root. 
+/// Returns true if the canonical workspace_path starts with the canonical root +/// and contains no invalid characters. Returns false if a symlink escape or +/// directory traversal is detected. +pub fn validateWorkspacePath(allocator: std.mem.Allocator, workspace_root: []const u8, workspace_path: []const u8) bool { + // Check for invalid characters (\n, \r, \0) in the raw path + for (workspace_path) |ch| { + if (ch == '\n' or ch == '\r' or ch == 0) { + log.warn("workspace path contains invalid character: {s}", .{workspace_path}); + return false; + } + } + + // Canonicalize both paths (resolves symlinks) + const canon_root = std.fs.cwd().realpathAlloc(allocator, workspace_root) catch { + log.warn("workspace: cannot resolve root {s}", .{workspace_root}); + return false; + }; + defer allocator.free(canon_root); + + const canon_path = std.fs.cwd().realpathAlloc(allocator, workspace_path) catch { + log.warn("workspace: cannot resolve path {s}", .{workspace_path}); + return false; + }; + defer allocator.free(canon_path); + + // Check that canonical workspace_path starts with canonical workspace_root + if (!std.mem.startsWith(u8, canon_path, canon_root)) { + log.warn("workspace path escape detected: {s} is not under {s}", .{ canon_path, canon_root }); + return false; + } + + // Ensure there's a separator after the root (not just a prefix match on a longer name) + if (canon_path.len > canon_root.len and canon_path[canon_root.len] != std.fs.path.sep) { + log.warn("workspace path escape detected: {s} is not under {s}", .{ canon_path, canon_root }); + return false; + } + + return true; +} + +/// Sanitize a directory name by replacing any character not in [A-Za-z0-9._-] +/// with '_'. This prevents directory traversal via task identifiers. +/// Alias for sanitizeId — same logic, exported under the canonical name. +pub const sanitizeDirectoryName = sanitizeId; + /// An isolated workspace directory for a single task. 
pub const Workspace = struct { root: []const u8, @@ -45,6 +91,13 @@ pub const Workspace = struct { return err; }; + // Validate the created path is safely under the workspace root + if (!validateWorkspacePath(allocator, root, path)) { + log.warn("workspace: path validation failed for {s}, refusing to use", .{path}); + allocator.free(path); + return error.PathValidationFailed; + } + // If the directory already had contents it was not freshly created var dir = try std.fs.cwd().openDir(path, .{ .iterate = true }); defer dir.close(); @@ -262,3 +315,38 @@ test "cleanAll removes all subdirectories" { try std.testing.expectError(error.FileNotFound, tmp.dir.openDir("task-001", .{})); try std.testing.expectError(error.FileNotFound, tmp.dir.openDir("task-002", .{})); } + +test "validateWorkspacePath accepts safe path" { + const allocator = std.testing.allocator; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + const root = try tmp.dir.realpathAlloc(allocator, "."); + defer allocator.free(root); + + // Create a subdirectory + try tmp.dir.makeDir("safe-task"); + const sub_path = try std.fs.path.join(allocator, &.{ root, "safe-task" }); + defer allocator.free(sub_path); + + try std.testing.expect(validateWorkspacePath(allocator, root, sub_path)); +} + +test "validateWorkspacePath rejects path outside root" { + const allocator = std.testing.allocator; + var tmp = std.testing.tmpDir(.{}); + defer tmp.cleanup(); + + const root = try tmp.dir.realpathAlloc(allocator, "."); + defer allocator.free(root); + + // /tmp is definitely not under the test temp dir + try std.testing.expect(!validateWorkspacePath(allocator, root, "/tmp")); +} + +test "sanitizeDirectoryName replaces invalid chars" { + const allocator = std.testing.allocator; + const result = try sanitizeDirectoryName(allocator, "../../etc/passwd"); + defer allocator.free(result); + try std.testing.expectEqualStrings(".._.._etc_passwd", result); +} From b2d7a56e8750cd50d3407afa5b50baf85cab3dea Mon Sep 17 00:00:00 
2001 From: Igor Somov Date: Fri, 13 Mar 2026 21:34:50 -0300 Subject: [PATCH 20/55] feat: UI messages, Mermaid export, ephemeral state, push_message --- src/api.zig | 33 ++++- src/engine.zig | 339 ++++++++++++++++++++++++++++++++++++++++++++++++- src/state.zig | 93 ++++++++++++++ 3 files changed, 463 insertions(+), 2 deletions(-) diff --git a/src/api.zig b/src/api.zig index 773faa3..e737a7b 100644 --- a/src/api.zig +++ b/src/api.zig @@ -209,6 +209,11 @@ pub fn handleRequest(ctx: *Context, method: []const u8, target: []const u8, body return handleValidateWorkflow(ctx, seg1.?); } + // GET /workflows/{id}/mermaid + if (is_get and eql(seg0, "workflows") and seg1 != null and eql(seg2, "mermaid") and seg3 == null) { + return handleGetMermaid(ctx, seg1.?); + } + // POST /workflows/{id}/run if (is_post and eql(seg0, "workflows") and seg1 != null and eql(seg2, "run") and seg3 == null) { return handleRunWorkflow(ctx, seg1.?, body); @@ -1317,11 +1322,37 @@ fn handleValidateWorkflow(ctx: *Context, id: []const u8) HttpResponse { buf.appendSlice(ctx.allocator, entry) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); } - buf.appendSlice(ctx.allocator, "]}") catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + buf.appendSlice(ctx.allocator, "]") catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + + // Include Mermaid diagram in validation response + const mermaid_str = engine_mod.generateMermaid(ctx.allocator, wf.definition_json) catch null; + if (mermaid_str) |ms| { + const mermaid_json = jsonQuoted(ctx.allocator, ms) catch null; + if (mermaid_json) |mj| { + buf.appendSlice(ctx.allocator, ",\"mermaid\":") catch {}; + buf.appendSlice(ctx.allocator, mj) catch {}; + } + } + + buf.appendSlice(ctx.allocator, "}") catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); const json_body = 
buf.toOwnedSlice(ctx.allocator) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); return jsonResponse(200, json_body); } +fn handleGetMermaid(ctx: *Context, id: []const u8) HttpResponse { + const wf = ctx.store.getWorkflow(ctx.allocator, id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get workflow\"}}"); + } orelse { + return jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"workflow not found\"}}"); + }; + + const mermaid = engine_mod.generateMermaid(ctx.allocator, wf.definition_json) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to generate mermaid diagram\"}}"); + }; + + return plainResponse(200, mermaid); +} + fn handleRunWorkflow(ctx: *Context, workflow_id: []const u8, body: []const u8) HttpResponse { // Load workflow const wf = ctx.store.getWorkflow(ctx.allocator, workflow_id) catch { diff --git a/src/engine.zig b/src/engine.zig index 2e914ed..5727cb9 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -710,6 +710,11 @@ pub const Engine = struct { }; } + // Apply UI messages to state (__ui_messages key) + if (cr.raw_output) |raw_out| { + running_state = applyUiMessagesToState(alloc, running_state, raw_out) catch running_state; + } + // Consume pending injections const injections = self.store.consumePendingInjections(alloc, run_row.id, node_name) catch &.{}; for (injections) |injection| { @@ -861,6 +866,10 @@ pub const Engine = struct { } } + // Strip ephemeral keys before checkpoint persistence + const schema_for_eph = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + running_state = state_mod.stripEphemeralKeys(alloc, running_state, schema_for_eph) catch running_state; + // Save checkpoint after each node made_progress = true; version += 1; @@ -900,6 +909,7 @@ pub const Engine = struct { completed: struct { state_updates: ?[]const u8, goto_targets: ?[]const []const u8 = null, + 
raw_output: ?[]const u8 = null, }, async_pending: void, no_worker: void, @@ -1135,6 +1145,12 @@ pub const Engine = struct { } callbacks.fireCallbacks(alloc, run_row.callbacks_json, "step.completed", run_row.id, step_id, output_json, self.metrics); + // Process UI messages and stream messages from worker response + if (self.sse_hub) |hub| { + processUiMessages(hub, alloc, run_row.id, step_id, final_output); + processStreamMessages(hub, alloc, run_row.id, step_id, node_type, final_output); + } + // Build state_updates from output // Try parsing as JSON with "state_updates" field, otherwise wrap output in "output" key const state_updates = extractStateUpdates(alloc, final_output) orelse @@ -1143,7 +1159,7 @@ pub const Engine = struct { // Extract goto targets from output (command primitive) const goto_targets = extractGotoTargets(alloc, final_output); - return TaskNodeResult{ .completed = .{ .state_updates = state_updates, .goto_targets = goto_targets } }; + return TaskNodeResult{ .completed = .{ .state_updates = state_updates, .goto_targets = goto_targets, .raw_output = final_output } }; } else { const err_text = result.error_text orelse "dispatch failed"; try self.store.updateStepStatus(step_id, "failed", worker.id, null, err_text, 1); @@ -2134,6 +2150,217 @@ fn migrateCompletedNodes(alloc: std.mem.Allocator, completed_nodes: *std.StringH return true; } +// ── UI Messages ────────────────────────────────────────────────────── + +/// Process "ui_messages" from worker response JSON. +/// For each message: +/// - If it has "remove": true -> broadcast as "ui_message_delete" SSE event +/// - Otherwise -> broadcast as "ui_message" SSE event +/// Also applies to state.__ui_messages via add_messages reducer. 
+fn processUiMessages(hub: *sse_mod.SseHub, alloc: std.mem.Allocator, run_id: []const u8, step_id: []const u8, response_json: []const u8) void { + const parsed = json.parseFromSlice(json.Value, alloc, response_json, .{}) catch return; + if (parsed.value != .object) return; + const ui_msgs_val = parsed.value.object.get("ui_messages") orelse return; + if (ui_msgs_val != .array) return; + + for (ui_msgs_val.array.items) |msg| { + if (msg != .object) continue; + + // Check for remove flag + const is_remove = blk: { + if (msg.object.get("remove")) |rm_val| { + if (rm_val == .bool) break :blk rm_val.bool; + } + break :blk false; + }; + + // Add step_id to the event data + var event_obj = json.ObjectMap.init(alloc); + var it = msg.object.iterator(); + while (it.next()) |entry| { + event_obj.put(entry.key_ptr.*, entry.value_ptr.*) catch continue; + } + event_obj.put("step_id", .{ .string = step_id }) catch {}; + const event_data = serializeJsonValue(alloc, .{ .object = event_obj }) catch continue; + + if (is_remove) { + hub.broadcast(run_id, .{ .event_type = "ui_message_delete", .data = event_data, .mode = .custom }); + } else { + hub.broadcast(run_id, .{ .event_type = "ui_message", .data = event_data, .mode = .custom }); + } + } +} + +/// Apply ui_messages to run state's __ui_messages key using add_messages reducer. 
+fn applyUiMessagesToState(alloc: std.mem.Allocator, state_json: []const u8, response_json: []const u8) ![]const u8 { + var arena = std.heap.ArenaAllocator.init(alloc); + defer arena.deinit(); + const arena_alloc = arena.allocator(); + + const resp_parsed = json.parseFromSlice(json.Value, arena_alloc, response_json, .{}) catch return try alloc.dupe(u8, state_json); + if (resp_parsed.value != .object) return try alloc.dupe(u8, state_json); + const ui_msgs_val = resp_parsed.value.object.get("ui_messages") orelse return try alloc.dupe(u8, state_json); + if (ui_msgs_val != .array) return try alloc.dupe(u8, state_json); + + // Serialize the ui_messages array + const ui_msgs_json = serializeJsonValue(arena_alloc, ui_msgs_val) catch return try alloc.dupe(u8, state_json); + + // Build updates: {"__ui_messages": <serialized ui_messages array>} + const updates = std.fmt.allocPrint(arena_alloc, "{{\"__ui_messages\":{s}}}", .{ui_msgs_json}) catch return try alloc.dupe(u8, state_json); + + // Build a temporary schema that uses add_messages for __ui_messages + const schema = + \\{"__ui_messages":{"type":"array","reducer":"add_messages"}} + ; + + return state_mod.applyUpdates(alloc, state_json, updates, schema) catch try alloc.dupe(u8, state_json); +} + +// ── Stream Messages ────────────────────────────────────────────────── + +/// Process "stream_messages" from worker response JSON. +/// For each message: broadcast as a "message" SSE event with step context. 
+fn processStreamMessages(hub: *sse_mod.SseHub, alloc: std.mem.Allocator, run_id: []const u8, step_id: []const u8, node_type: []const u8, response_json: []const u8) void { + const parsed = json.parseFromSlice(json.Value, alloc, response_json, .{}) catch return; + if (parsed.value != .object) return; + const stream_msgs_val = parsed.value.object.get("stream_messages") orelse return; + if (stream_msgs_val != .array) return; + + for (stream_msgs_val.array.items) |msg| { + if (msg != .object) continue; + + // Build enriched message with step context + var event_obj = json.ObjectMap.init(alloc); + var it = msg.object.iterator(); + while (it.next()) |entry| { + event_obj.put(entry.key_ptr.*, entry.value_ptr.*) catch continue; + } + event_obj.put("step_id", .{ .string = step_id }) catch {}; + event_obj.put("node_type", .{ .string = node_type }) catch {}; + const event_data = serializeJsonValue(alloc, .{ .object = event_obj }) catch continue; + + hub.broadcast(run_id, .{ .event_type = "message", .data = event_data, .mode = .custom }); + } +} + +// ── Mermaid Graph Export ───────────────────────────────────────────── + +/// Generate Mermaid diagram syntax from a workflow JSON definition. +/// Returns a Mermaid flowchart string. 
+pub fn generateMermaid(alloc: std.mem.Allocator, definition_json: []const u8) ![]const u8 { + var arena = std.heap.ArenaAllocator.init(alloc); + defer arena.deinit(); + const arena_alloc = arena.allocator(); + + const parsed = try json.parseFromSlice(json.Value, arena_alloc, definition_json, .{}); + if (parsed.value != .object) return try alloc.dupe(u8, "graph TD\n"); + + const nodes_val = parsed.value.object.get("nodes") orelse return try alloc.dupe(u8, "graph TD\n"); + if (nodes_val != .object) return try alloc.dupe(u8, "graph TD\n"); + + const edges_val = parsed.value.object.get("edges") orelse return try alloc.dupe(u8, "graph TD\n"); + if (edges_val != .array) return try alloc.dupe(u8, "graph TD\n"); + + var buf: std.ArrayListUnmanaged(u8) = .empty; + + // Header + try buf.appendSlice(arena_alloc, "graph TD\n"); + + // __start__ and __end__ nodes + try buf.appendSlice(arena_alloc, " __start__((Start))\n"); + + // Node definitions + var nodes_it = nodes_val.object.iterator(); + while (nodes_it.next()) |entry| { + const name = entry.key_ptr.*; + const node = entry.value_ptr.*; + + const node_type_str = blk: { + if (node == .object) { + if (node.object.get("type")) |t| { + if (t == .string) break :blk t.string; + } + } + break :blk "task"; + }; + + // Choose Mermaid shape based on node type + if (std.mem.eql(u8, node_type_str, "route")) { + try buf.appendSlice(arena_alloc, " "); + try buf.appendSlice(arena_alloc, name); + try buf.appendSlice(arena_alloc, "{"); + try buf.appendSlice(arena_alloc, name); + try buf.appendSlice(arena_alloc, "\\nroute}\n"); + } else if (std.mem.eql(u8, node_type_str, "interrupt")) { + try buf.appendSlice(arena_alloc, " "); + try buf.appendSlice(arena_alloc, name); + try buf.appendSlice(arena_alloc, "[/"); + try buf.appendSlice(arena_alloc, name); + try buf.appendSlice(arena_alloc, "\\ninterrupt/]\n"); + } else if (std.mem.eql(u8, node_type_str, "send")) { + try buf.appendSlice(arena_alloc, " "); + try buf.appendSlice(arena_alloc, 
name); + try buf.appendSlice(arena_alloc, "[["); + try buf.appendSlice(arena_alloc, name); + try buf.appendSlice(arena_alloc, "\\nsend]]\n"); + } else if (std.mem.eql(u8, node_type_str, "transform")) { + try buf.appendSlice(arena_alloc, " "); + try buf.appendSlice(arena_alloc, name); + try buf.appendSlice(arena_alloc, "("); + try buf.appendSlice(arena_alloc, name); + try buf.appendSlice(arena_alloc, "\\ntransform)\n"); + } else if (std.mem.eql(u8, node_type_str, "subgraph")) { + try buf.appendSlice(arena_alloc, " "); + try buf.appendSlice(arena_alloc, name); + try buf.appendSlice(arena_alloc, "["); + try buf.appendSlice(arena_alloc, name); + try buf.appendSlice(arena_alloc, "\\nsubgraph]\n"); + } else { + // task, agent, and others: rectangle + try buf.appendSlice(arena_alloc, " "); + try buf.appendSlice(arena_alloc, name); + try buf.appendSlice(arena_alloc, "["); + try buf.appendSlice(arena_alloc, name); + try buf.appendSlice(arena_alloc, "\\n"); + try buf.appendSlice(arena_alloc, node_type_str); + try buf.appendSlice(arena_alloc, "]\n"); + } + } + + // __end__ node + try buf.appendSlice(arena_alloc, " __end__((End))\n"); + + // Edges + for (edges_val.array.items) |edge_item| { + if (edge_item != .array) continue; + if (edge_item.array.items.len < 2) continue; + + const source_raw = if (edge_item.array.items[0] == .string) edge_item.array.items[0].string else continue; + const target = if (edge_item.array.items[1] == .string) edge_item.array.items[1].string else continue; + + // Parse conditional edge "source:value" + if (std.mem.indexOfScalar(u8, source_raw, ':')) |colon_pos| { + const source = source_raw[0..colon_pos]; + const condition = source_raw[colon_pos + 1 ..]; + try buf.appendSlice(arena_alloc, " "); + try buf.appendSlice(arena_alloc, source); + try buf.appendSlice(arena_alloc, " -->|"); + try buf.appendSlice(arena_alloc, condition); + try buf.appendSlice(arena_alloc, "| "); + try buf.appendSlice(arena_alloc, target); + try buf.appendSlice(arena_alloc, 
"\n"); + } else { + try buf.appendSlice(arena_alloc, " "); + try buf.appendSlice(arena_alloc, source_raw); + try buf.appendSlice(arena_alloc, " --> "); + try buf.appendSlice(arena_alloc, target); + try buf.appendSlice(arena_alloc, "\n"); + } + } + + return try alloc.dupe(u8, buf.items); +} + // ── Tests ───────────────────────────────────────────────────────────── test "Engine: init and stop" { @@ -2915,3 +3142,113 @@ test "engine: validateConfig returns true with registered workers" { var engine = Engine.init(&store, allocator, 500); try std.testing.expect(engine.validateConfig()); } + +test "generateMermaid: simple chain" { + const allocator = std.testing.allocator; + const wf = + \\{"nodes":{"analyze":{"type":"task"},"review":{"type":"task"}},"edges":[["__start__","analyze"],["analyze","review"],["review","__end__"]]} + ; + const result = try generateMermaid(allocator, wf); + defer allocator.free(result); + + try std.testing.expect(std.mem.indexOf(u8, result, "graph TD") != null); + try std.testing.expect(std.mem.indexOf(u8, result, "__start__((Start))") != null); + try std.testing.expect(std.mem.indexOf(u8, result, "__end__((End))") != null); + try std.testing.expect(std.mem.indexOf(u8, result, "analyze[analyze") != null); + try std.testing.expect(std.mem.indexOf(u8, result, "__start__ --> analyze") != null); + try std.testing.expect(std.mem.indexOf(u8, result, "review --> __end__") != null); +} + +test "generateMermaid: route node with conditional edges" { + const allocator = std.testing.allocator; + const wf = + \\{"nodes":{"decide":{"type":"route"},"approve":{"type":"task"},"reject":{"type":"task"}},"edges":[["__start__","decide"],["decide:yes","approve"],["decide:no","reject"],["approve","__end__"],["reject","__end__"]]} + ; + const result = try generateMermaid(allocator, wf); + defer allocator.free(result); + + try std.testing.expect(std.mem.indexOf(u8, result, "decide{decide") != null); + try std.testing.expect(std.mem.indexOf(u8, result, "decide -->|yes| 
approve") != null); + try std.testing.expect(std.mem.indexOf(u8, result, "decide -->|no| reject") != null); +} + +test "generateMermaid: node type shapes" { + const allocator = std.testing.allocator; + const wf = + \\{"nodes":{"t":{"type":"transform"},"i":{"type":"interrupt"},"s":{"type":"send"},"sg":{"type":"subgraph"}},"edges":[["__start__","t"],["t","__end__"]]} + ; + const result = try generateMermaid(allocator, wf); + defer allocator.free(result); + + // transform uses rounded parens + try std.testing.expect(std.mem.indexOf(u8, result, "t(t\\ntransform)") != null); + // interrupt uses parallelogram + try std.testing.expect(std.mem.indexOf(u8, result, "i[/i\\ninterrupt/]") != null); + // send uses double brackets + try std.testing.expect(std.mem.indexOf(u8, result, "s[[s\\nsend]]") != null); + // subgraph uses rectangle + try std.testing.expect(std.mem.indexOf(u8, result, "sg[sg\\nsubgraph]") != null); +} + +test "processUiMessages: broadcasts events" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + const alloc = arena.allocator(); + + var hub = sse_mod.SseHub.init(alloc); + defer hub.deinit(); + + const queue = hub.getOrCreateQueue("run1"); + + const response = + \\{"response":"ok","ui_messages":[{"id":"p1","name":"ProgressBar","props":{"progress":75}},{"id":"old","remove":true}]} + ; + processUiMessages(&hub, alloc, "run1", "step1", response); + + const events = queue.drain(alloc); + try std.testing.expectEqual(@as(usize, 2), events.len); + try std.testing.expectEqualStrings("ui_message", events[0].event_type); + try std.testing.expectEqualStrings("ui_message_delete", events[1].event_type); + // First event should contain step_id + try std.testing.expect(std.mem.indexOf(u8, events[0].data, "step1") != null); +} + +test "processStreamMessages: broadcasts message events" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer 
arena.deinit(); + const alloc = arena.allocator(); + + var hub = sse_mod.SseHub.init(alloc); + defer hub.deinit(); + + const queue = hub.getOrCreateQueue("run1"); + + const response = + \\{"response":"done","stream_messages":[{"role":"assistant","content":"Starting..."},{"role":"tool","content":"Found 3 issues","tool":"lint"}]} + ; + processStreamMessages(&hub, alloc, "run1", "step1", "task", response); + + const events = queue.drain(alloc); + try std.testing.expectEqual(@as(usize, 2), events.len); + try std.testing.expectEqualStrings("message", events[0].event_type); + try std.testing.expectEqualStrings("message", events[1].event_type); + // Should contain step context + try std.testing.expect(std.mem.indexOf(u8, events[0].data, "step1") != null); + try std.testing.expect(std.mem.indexOf(u8, events[0].data, "task") != null); + try std.testing.expect(std.mem.indexOf(u8, events[1].data, "tool") != null); +} + +test "applyUiMessagesToState: creates __ui_messages" { + const allocator = std.testing.allocator; + const state = "{}"; + const response = + \\{"response":"ok","ui_messages":[{"id":"p1","name":"ProgressBar"}]} + ; + const result = try applyUiMessagesToState(allocator, state, response); + defer allocator.free(result); + + try std.testing.expect(std.mem.indexOf(u8, result, "__ui_messages") != null); + try std.testing.expect(std.mem.indexOf(u8, result, "ProgressBar") != null); +} diff --git a/src/state.zig b/src/state.zig index 779ed8a..330f9f4 100644 --- a/src/state.zig +++ b/src/state.zig @@ -592,6 +592,51 @@ fn applyAddMessages(alloc: Allocator, old_json: ?[]const u8, update_json: []cons return try alloc.dupe(u8, result); } +// ── Ephemeral State Keys ────────────────────────────────────────────── + +/// Strip ephemeral keys from state before checkpoint persistence. +/// Parses the schema for keys with `"ephemeral": true` and removes +/// those keys from the state JSON. Returns a new JSON string. 
+pub fn stripEphemeralKeys(alloc: Allocator, state_json: []const u8, schema_json: []const u8) ![]const u8 { + var arena = std.heap.ArenaAllocator.init(alloc); + defer arena.deinit(); + const arena_alloc = arena.allocator(); + + // Parse schema to find ephemeral keys + const schema_parsed = try json.parseFromSlice(json.Value, arena_alloc, schema_json, .{}); + if (schema_parsed.value != .object) return try alloc.dupe(u8, state_json); + + var ephemeral_keys = std.StringHashMap(void).init(arena_alloc); + var schema_it = schema_parsed.value.object.iterator(); + while (schema_it.next()) |entry| { + const schema_entry = entry.value_ptr.*; + if (schema_entry == .object) { + if (schema_entry.object.get("ephemeral")) |eph_val| { + if (eph_val == .bool and eph_val.bool) { + try ephemeral_keys.put(entry.key_ptr.*, {}); + } + } + } + } + + if (ephemeral_keys.count() == 0) return try alloc.dupe(u8, state_json); + + // Parse state and remove ephemeral keys + const state_parsed = try json.parseFromSlice(json.Value, arena_alloc, state_json, .{}); + if (state_parsed.value != .object) return try alloc.dupe(u8, state_json); + + var result_obj = json.ObjectMap.init(arena_alloc); + var state_it = state_parsed.value.object.iterator(); + while (state_it.next()) |entry| { + if (ephemeral_keys.get(entry.key_ptr.*) == null) { + try result_obj.put(entry.key_ptr.*, entry.value_ptr.*); + } + } + + const result_str = try serializeValue(arena_alloc, json.Value{ .object = result_obj }); + return try alloc.dupe(u8, result_str); +} + // ── Custom errors ───────────────────────────────────────────────────── const InvalidNumber = error{InvalidNumber}; @@ -934,3 +979,51 @@ test "overwrite with array value" { try std.testing.expectEqual(@as(usize, 1), items.array.items.len); try std.testing.expectEqual(@as(i64, 99), items.array.items[0].integer); } + +test "stripEphemeralKeys removes ephemeral keys" { + const alloc = std.testing.allocator; + const state = + 
\\{"messages":["hello"],"temp_data":"scratch","count":5} + ; + const schema = + \\{"messages":{"type":"array","reducer":"append"},"temp_data":{"type":"string","reducer":"last_value","ephemeral":true},"count":{"type":"number","reducer":"add"}} + ; + + const result = try stripEphemeralKeys(alloc, state, schema); + defer alloc.free(result); + + const parsed = try parseTestJson(alloc, result); + defer parsed.deinit(); + try std.testing.expect(parsed.value == .object); + // temp_data should be stripped + try std.testing.expect(parsed.value.object.get("temp_data") == null); + // messages and count should remain + try std.testing.expect(parsed.value.object.get("messages") != null); + try std.testing.expect(parsed.value.object.get("count") != null); +} + +test "stripEphemeralKeys no-op when no ephemeral keys" { + const alloc = std.testing.allocator; + const state = + \\{"messages":["hello"],"count":5} + ; + const schema = + \\{"messages":{"type":"array","reducer":"append"},"count":{"type":"number","reducer":"add"}} + ; + + const result = try stripEphemeralKeys(alloc, state, schema); + defer alloc.free(result); + + const parsed = try parseTestJson(alloc, result); + defer parsed.deinit(); + try std.testing.expect(parsed.value == .object); + try std.testing.expect(parsed.value.object.get("messages") != null); + try std.testing.expect(parsed.value.object.get("count") != null); +} + +test "stripEphemeralKeys with empty state" { + const alloc = std.testing.allocator; + const result = try stripEphemeralKeys(alloc, "{}", "{}"); + defer alloc.free(result); + try std.testing.expectEqualStrings("{}", result); +} From d2311e40f2e965d0bf18d97654ebeb3b4ac6dced Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 21:47:53 -0300 Subject: [PATCH 21/55] refactor: clean architecture, remove dead code, update docs Remove backward-compatibility cruft from the pre-orchestration architecture: Dead types removed: - ChatMessageRow, SagaStateRow from types.zig Dead store methods 
removed: - getCycleState, upsertCycleState (cycle_state table) - insertChatMessage, getChatMessages (chat_messages table) - insertSagaState, updateSagaState, getSagaStates (saga_state table) - getReadySteps, getStepDeps (step_deps-based DAG scheduling) - setStepStartedAt (wait step timer tracking) Dead template features removed: - debate_responses, chat_history, role context fields and resolvers - StepOutput.outputs field (fan_out/map multi-output) - serializeOutputs helper function Dead API handlers removed: - handleApproveStep, handleRejectStep (approval steps -> 410 Gone) - handleGetChatTranscript (group_chat -> removed) Dead validation rules removed: - loop, sub_workflow, wait, router, saga, debate, group_chat step type rules - All associated error variants and tests CLAUDE.md rewritten to reflect current architecture: - 7 step types (not 14) - 7 reducer types - 35+ API endpoints (was 19) - 4 migrations (was 2) - Full module map with all 27 source files - Unified state model, checkpoints, SSE streaming docs --- CLAUDE.md | 151 +++++++----- src/api.zig | 192 +-------------- src/migrations/004_orchestration.sql | 4 +- src/store.zig | 343 --------------------------- src/templates.zig | 134 +---------- src/types.zig | 19 -- src/workflow_validation.zig | 145 +---------- 7 files changed, 118 insertions(+), 870 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index a2f719c..f50946f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,76 +1,104 @@ # NullBoiler -DAG-based workflow orchestrator for NullClaw AI bot agents. Part of the Null ecosystem (NullTracker, NullClaw). +Graph-based workflow orchestrator with unified state model for NullClaw AI bot agents. Part of the Null ecosystem (NullTracker, NullClaw). 
## Tech Stack - **Language**: Zig 0.15.2 - **Database**: SQLite (vendored in `deps/sqlite/`), WAL mode - **Protocol**: HTTP/1.1 REST API with JSON payloads -- **Dispatch**: HTTP (webhook/api_chat/openai_chat), MQTT, Redis Streams +- **Dispatch**: HTTP (webhook/api_chat/openai_chat/a2a), MQTT, Redis Streams - **Vendored C libs**: SQLite (`deps/sqlite/`), hiredis (`deps/hiredis/`), libmosquitto (`deps/mosquitto/`) ## Module Map | File | Role | |------|------| -| `main.zig` | CLI args (`--port`, `--db`, `--config`, `--version`), HTTP accept loop, engine thread, tracker thread | -| `api.zig` | REST API routing and 19 endpoint handlers (incl. signal, chat, tracker status) | -| `store.zig` | SQLite layer, 30+ CRUD methods, schema migrations | -| `engine.zig` | DAG scheduler: tick loop, 14 step type handlers, graph cycles, worker handoff | -| `dispatch.zig` | Worker selection (tags, capacity), protocol-aware dispatch (`webhook`, `api_chat`, `openai_chat`, `mqtt`, `redis_stream`) | +| `main.zig` | CLI args (`--port`, `--db`, `--config`, `--version`, `--export-manifest`, `--from-json`), HTTP accept loop, engine thread, tracker thread | +| `api.zig` | REST API routing and 30+ endpoint handlers (runs, workers, workflows, checkpoints, state, SSE stream, tracker) | +| `store.zig` | SQLite layer, CRUD methods for all tables, schema migrations (4 migration files) | +| `engine.zig` | Graph-based state scheduler: tick loop, 7 node type handlers, checkpoints, reducers, goto, breakpoints, deferred nodes, reconciliation | +| `state.zig` | Unified state model: 7 reducer types (last_value, append, merge, add, min, max, add_messages), overwrite bypass, ephemeral keys, state path resolution | +| `sse.zig` | Server-Sent Events hub: per-run event queues, 5 stream modes (values, updates, tasks, debug, custom) | +| `dispatch.zig` | Worker selection (tags, capacity, A2A preference), protocol-aware dispatch | | `async_dispatch.zig` | Thread-safe response queue for async MQTT/Redis dispatch 
(keyed by correlation_id) | | `redis_client.zig` | Hiredis wrapper: connect, XADD, listener thread for response streams | | `mqtt_client.zig` | Libmosquitto wrapper: connect, publish, subscribe, listener thread for response topics | -| `templates.zig` | Prompt template rendering: `{{input.X}}`, `{{steps.ID.output}}`, `{{item}}`, `{{task.X}}`, `{{debate_responses}}`, `{{chat_history}}`, `{{role}}` | +| `templates.zig` | Prompt template rendering: state-based `{{state.X}}`, legacy `{{input.X}}`, `{{item}}`, `{{task.X}}`, `{{attempt}}`, conditional blocks | | `callbacks.zig` | Fire-and-forget webhook callbacks on step/run events | | `config.zig` | JSON config loader (`Config`, `WorkerConfig`, `EngineConfig`, `TrackerConfig`) | -| `types.zig` | `RunStatus`, `StepStatus`, `StepType` (14 types), `WorkerStatus`, `TrackerTaskState`, row types | +| `types.zig` | `RunStatus`, `StepStatus`, `StepType` (7 types), `WorkerStatus`, `ReducerType`, row types | | `tracker.zig` | Pull-mode tracker thread: poll NullTickets, claim tasks, heartbeat leases, stall detection | | `tracker_client.zig` | HTTP client for NullTickets API (claim, heartbeat, transition, fail, artifacts) | | `workspace.zig` | Workspace lifecycle: create, hook execution, cleanup, path sanitization | | `subprocess.zig` | NullClaw subprocess: spawn, health check, prompt sending, kill | -| `workflow_loader.zig` | Load JSON workflow definitions from `workflows/` directory | +| `workflow_loader.zig` | Load JSON workflow definitions from `workflows/` directory, hot-reload watcher | +| `workflow_validation.zig` | Graph-based workflow validation: reachability, cycles, state key refs, route/send targets | | `ids.zig` | UUID v4 generation, `nowMs()` | -| `migrations/001_init.sql` | 6 tables: workers, runs, steps, step_deps, events, artifacts | -| `migrations/002_advanced_steps.sql` | 3 tables: cycle_state, chat_messages, saga_state + ALTER TABLE | +| `metrics.zig` | Prometheus-style metrics counters | +| `strategy.zig` | 
Pluggable strategy map for workflow execution | +| `worker_protocol.zig` | Protocol-specific request body builders | +| `worker_response.zig` | Protocol-specific response parsers | +| `export_manifest.zig` | Export tool manifest for CLI integration | +| `from_json.zig` | Import workflow from JSON CLI command | ## Build / Test / Run ```sh zig build # build -zig build test # unit tests +zig build test # unit tests (324 tests) zig build && bash tests/test_e2e.sh # e2e tests (requires Python 3 for mock workers) ./zig-out/bin/nullboiler --port 8080 --db nullboiler.db --config config.json ``` +## Step Types (7) + +`task`, `route`, `interrupt`, `agent`, `send`, `transform`, `subgraph` + +## Reducers (7) + +`last_value`, `append`, `merge`, `add`, `min`, `max`, `add_messages` + ## API Endpoints | Method | Path | Description | |--------|------|-------------| | GET | `/health` | Health check | +| GET | `/metrics` | Prometheus metrics | | POST | `/workers` | Register worker | | GET | `/workers` | List workers | | DELETE | `/workers/{id}` | Remove worker | -| POST | `/runs` | Create workflow run | -| GET | `/runs` | List runs | +| POST | `/runs` | Create workflow run (legacy step-array or graph format) | +| GET | `/runs` | List runs (supports ?status= filter) | | GET | `/runs/{id}` | Get run details | | POST | `/runs/{id}/cancel` | Cancel run | | POST | `/runs/{id}/retry` | Retry failed run | +| POST | `/runs/{id}/resume` | Resume interrupted run (with optional state updates) | +| POST | `/runs/{id}/state` | Inject state into running run (pending injection) | +| POST | `/runs/{id}/replay` | Replay run from a checkpoint | +| POST | `/runs/fork` | Fork run from a checkpoint into a new run | | GET | `/runs/{id}/steps` | List steps for run | | GET | `/runs/{id}/steps/{step_id}` | Get step details | -| POST | `/runs/{id}/steps/{step_id}/approve` | Approve approval step | -| POST | `/runs/{id}/steps/{step_id}/reject` | Reject approval step | | GET | `/runs/{id}/events` | List run 
events | -| POST | `/runs/{id}/steps/{step_id}/signal` | Signal a waiting step | -| GET | `/runs/{id}/steps/{step_id}/chat` | Get group_chat transcript | -| GET | `/tracker/status` | Pull-mode tracker status (running tasks, concurrency, counters) | +| GET | `/runs/{id}/checkpoints` | List checkpoints for run | +| GET | `/runs/{id}/checkpoints/{cpId}` | Get checkpoint details | +| GET | `/runs/{id}/stream` | SSE stream (supports ?mode=values\|updates\|tasks\|debug) | +| POST | `/workflows` | Create workflow definition | +| GET | `/workflows` | List workflow definitions | +| GET | `/workflows/{id}` | Get workflow definition | +| PUT | `/workflows/{id}` | Update workflow definition | +| DELETE | `/workflows/{id}` | Delete workflow definition | +| POST | `/workflows/{id}/validate` | Validate workflow definition | +| GET | `/workflows/{id}/mermaid` | Export workflow as Mermaid diagram | +| POST | `/workflows/{id}/run` | Start a run from a stored workflow | +| GET | `/rate-limits` | Get current rate limit info per worker | +| POST | `/admin/drain` | Enable drain mode | +| GET | `/tracker/status` | Pull-mode tracker status | | GET | `/tracker/tasks` | List running pull-mode tasks | | GET | `/tracker/tasks/{task_id}` | Get single pull-mode task details | - -## Step Types - -`task`, `fan_out`, `map`, `condition`, `approval`, `reduce`, `loop`, `sub_workflow`, `wait`, `router`, `transform`, `saga`, `debate`, `group_chat` +| GET | `/tracker/stats` | Tracker statistics | +| POST | `/tracker/refresh` | Force tracker poll | +| POST | `/internal/agent-events/{run_id}/{step_id}` | Agent event callback (from NullClaw) | ## Coding Conventions @@ -83,16 +111,47 @@ zig build && bash tests/test_e2e.sh # e2e tests (requires Python 3 for mock wo ## Architecture -- Single-threaded HTTP accept loop on main thread -- Background engine thread polls DB for active runs (+ polls async response queue for MQTT/Redis steps) -- `std.atomic.Value(bool)` for coordinated shutdown -- Config workers 
seeded into DB on startup (source = "config") -- Schema in `migrations/001_init.sql` + `002_advanced_steps.sql`, applied on `Store.init` -- Graph cycles: condition/router can route back to completed steps, engine creates new step instances per iteration -- Worker handoff: dispatch result can include `handoff_to` for chained delegation (max 5) -- Async dispatch: MQTT/Redis workers use two-phase dispatch (publish → engine polls response queue) -- Background listener threads (MQTT/Redis) started conditionally when async workers are configured -- Pull-mode tracker thread (conditional): polls NullTickets for tasks, claims work, manages subprocess lifecycles +- **Unified state model**: Every node reads from state, returns partial updates, engine applies reducers +- **Graph-based execution**: Workflow = `{nodes: {}, edges: [], schema: {}}` with `__start__` and `__end__` synthetic nodes +- **Checkpoints**: State snapshot after every node, enabling fork/replay/resume +- **Conditional edges**: Route nodes produce values, edges like `["router:yes", "next"]` are taken when route result matches +- **Deferred nodes**: Nodes with `"defer": true` execute right before `__end__` +- **Command primitive**: Workers can return `{"goto": "node_name"}` to override normal graph traversal +- **Breakpoints**: `interrupt_before` / `interrupt_after` arrays pause execution +- **Subgraph**: Inline child workflow execution with input/output mapping (max recursion depth 10) +- **Multi-turn agents**: Agent nodes can loop with `continuation_prompt` up to `max_turns` +- **Configurable runs**: Per-run config stored as `state.__config` +- **Node-level cache**: FNV hash of (node_name, rendered_prompt) with configurable TTL +- **Token accounting**: Cumulative input/output token tracking per step and per run +- **Workflow hot-reload**: `WorkflowWatcher` polls `workflows/` directory for JSON changes, upserts into DB +- **Reconciliation**: Check NullTickets task status between steps, cancel if task is 
terminal + +### Thread Model + +``` +Main thread: HTTP accept loop (push API) +Engine thread: Graph tick loop (state-based scheduler) +Tracker thread: Poll NullTickets -> claim -> workspace -> subprocess/dispatch +MQTT listener: (conditional, for async MQTT workers) +Redis listener: (conditional, for async Redis workers) +``` + +### SSE Streaming + +5 modes for real-time consumption via `GET /runs/{id}/stream?mode=X`: +- `values` -- full state after each step +- `updates` -- node name + partial state updates +- `tasks` -- task start/finish with metadata +- `debug` -- everything with step number + timestamp +- `custom` -- user-defined events from worker output (`ui_messages`, `stream_messages`) + +## Database + +SQLite with WAL mode. Schema across 4 migrations: +- `001_init.sql`: workers, runs, steps, step_deps, events, artifacts +- `002_advanced_steps.sql`: cycle_state, chat_messages, saga_state (legacy, unused by current engine) +- `003_tracker.sql`: tracker_runs +- `004_orchestration.sql`: workflows, checkpoints, agent_events, pending_state_injections, node_cache, pending_writes + ALTER TABLE extensions for state_json, config_json, parent_run_id, token accounting ## Pull-Mode (NullTickets Integration) @@ -131,27 +190,3 @@ Optional pull-mode where NullBoiler acts as an agent polling NullTickets for wor ``` If `tracker` is absent or null, the tracker thread does not start and push-mode operates unchanged. - -### Workflow Definitions - -JSON files in `workflows/` directory. Two execution modes: -- `subprocess` — spawn NullClaw child process per task (isolated workspace) -- `dispatch` — use existing registered workers (no workspace) - -Three-axis concurrency: global (`max_concurrent_tasks`) + per-pipeline + per-role limits. 
- -### Thread Model - -``` -Main thread: HTTP accept loop (push API — unchanged) -Engine thread: DAG tick loop (unchanged) -Tracker thread: Poll NullTickets → claim → workspace → subprocess/dispatch -MQTT listener: (unchanged, conditional) -Redis listener: (unchanged, conditional) -``` - -## Database - -SQLite with WAL mode. Schema: 9 tables across 2 migrations. -- `001_init.sql`: workers, runs, steps, step_deps, events, artifacts -- `002_advanced_steps.sql`: cycle_state, chat_messages, saga_state + iteration_index/child_run_id columns on steps diff --git a/src/api.zig b/src/api.zig index e737a7b..83bb2d5 100644 --- a/src/api.zig +++ b/src/api.zig @@ -107,19 +107,14 @@ pub fn handleRequest(ctx: *Context, method: []const u8, target: []const u8, body return handleGetStep(ctx, seg1.?, seg3.?); } - // POST /runs/{id}/steps/{step_id}/approve + // POST /runs/{id}/steps/{step_id}/approve (legacy, removed) if (is_post and eql(seg0, "runs") and seg1 != null and eql(seg2, "steps") and seg3 != null and eql(seg4, "approve") and seg5 == null) { - return handleApproveStep(ctx, seg1.?, seg3.?); + return jsonResponse(410, "{\"error\":{\"code\":\"gone\",\"message\":\"approval steps have been removed; use interrupt + resume instead\"}}"); } - // POST /runs/{id}/steps/{step_id}/reject + // POST /runs/{id}/steps/{step_id}/reject (legacy, removed) if (is_post and eql(seg0, "runs") and seg1 != null and eql(seg2, "steps") and seg3 != null and eql(seg4, "reject") and seg5 == null) { - return handleRejectStep(ctx, seg1.?, seg3.?); - } - - // GET /runs/{id}/steps/{step_id}/chat - if (is_get and eql(seg0, "runs") and seg1 != null and eql(seg2, "steps") and seg3 != null and eql(seg4, "chat") and seg5 == null) { - return handleGetChatTranscript(ctx, seg1.?, seg3.?); + return jsonResponse(410, "{\"error\":{\"code\":\"gone\",\"message\":\"approval steps have been removed; use interrupt + resume instead\"}}"); } // GET /runs/{id}/events @@ -970,66 +965,6 @@ fn handleRetryRun(ctx: *Context, 
run_id: []const u8) HttpResponse { return jsonResponse(200, resp); } -fn handleApproveStep(ctx: *Context, run_id: []const u8, step_id: []const u8) HttpResponse { - // 1. Get step from store - const step = switch (lookupStepInRun(ctx, run_id, step_id)) { - .ok => |s| s, - .err => |resp| return resp, - }; - - // 2. Must be "waiting_approval" - if (!std.mem.eql(u8, step.status, "waiting_approval")) { - const resp = std.fmt.allocPrint(ctx.allocator, - \\{{"error":{{"code":"conflict","message":"step is not waiting_approval (current: {s})"}}}} - , .{step.status}) catch return jsonResponse(409, "{\"error\":{\"code\":\"conflict\",\"message\":\"step is not waiting_approval\"}}"); - return jsonResponse(409, resp); - } - - // 3. Update status to "completed" - ctx.store.updateStepStatus(step_id, "completed", null, null, null, step.attempt) catch { - return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to update step\"}}"); - }; - - // 4. Insert event - ctx.store.insertEvent(run_id, step_id, "step.approved", "{}") catch {}; - - // 5. Return 200 - const resp = std.fmt.allocPrint(ctx.allocator, - \\{{"step_id":"{s}","status":"completed"}} - , .{step_id}) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - return jsonResponse(200, resp); -} - -fn handleRejectStep(ctx: *Context, run_id: []const u8, step_id: []const u8) HttpResponse { - // 1. Get step from store - const step = switch (lookupStepInRun(ctx, run_id, step_id)) { - .ok => |s| s, - .err => |resp| return resp, - }; - - // 2. Must be "waiting_approval" - if (!std.mem.eql(u8, step.status, "waiting_approval")) { - const resp = std.fmt.allocPrint(ctx.allocator, - \\{{"error":{{"code":"conflict","message":"step is not waiting_approval (current: {s})"}}}} - , .{step.status}) catch return jsonResponse(409, "{\"error\":{\"code\":\"conflict\",\"message\":\"step is not waiting_approval\"}}"); - return jsonResponse(409, resp); - } - - // 3. 
Update status to "failed", set error_text - ctx.store.updateStepStatus(step_id, "failed", null, null, "rejected by user", step.attempt) catch { - return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to update step\"}}"); - }; - - // 4. Insert event - ctx.store.insertEvent(run_id, step_id, "step.rejected", "{}") catch {}; - - // 5. Return 200 - const resp = std.fmt.allocPrint(ctx.allocator, - \\{{"step_id":"{s}","status":"failed"}} - , .{step_id}) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - return jsonResponse(200, resp); -} - fn handleSignalStep(ctx: *Context, run_id: []const u8, step_id: []const u8, body: []const u8) HttpResponse { // 1. Get step from store const step = switch (lookupStepInRun(ctx, run_id, step_id)) { @@ -1900,56 +1835,6 @@ fn getSchemaFromRun(ctx: *Context, run: types.RunRow) []const u8 { return "{}"; } -// ── Chat Transcript Handler ────────────────────────────────────────── - -fn handleGetChatTranscript(ctx: *Context, run_id: []const u8, step_id: []const u8) HttpResponse { - _ = switch (lookupStepInRun(ctx, run_id, step_id)) { - .ok => |s| s, - .err => |resp| return resp, - }; - - const messages = ctx.store.getChatMessages(ctx.allocator, step_id) catch { - return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get chat messages\"}}"); - }; - - // Build JSON array of chat messages - var buf: std.ArrayListUnmanaged(u8) = .empty; - buf.append(ctx.allocator, '[') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - - for (messages, 0..) 
|msg, i| { - if (i > 0) { - buf.append(ctx.allocator, ',') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - } - - const worker_field = if (msg.worker_id) |wid| blk: { - const wid_json = jsonQuoted(ctx.allocator, wid) catch ""; - break :blk std.fmt.allocPrint(ctx.allocator, ",\"worker_id\":{s}", .{wid_json}) catch ""; - } else ""; - const msg_run_id_json = jsonQuoted(ctx.allocator, msg.run_id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - const msg_step_id_json = jsonQuoted(ctx.allocator, msg.step_id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - const role_json = jsonQuoted(ctx.allocator, msg.role) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - const message_json = jsonQuoted(ctx.allocator, msg.message) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - - const entry = std.fmt.allocPrint(ctx.allocator, - \\{{"id":{d},"run_id":{s},"step_id":{s},"round":{d},"role":{s}{s},"message":{s},"ts_ms":{d}}} - , .{ - msg.id, - msg_run_id_json, - msg_step_id_json, - msg.round, - role_json, - worker_field, - message_json, - msg.ts_ms, - }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - buf.appendSlice(ctx.allocator, entry) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - } - - buf.append(ctx.allocator, ']') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - const json_body = buf.toOwnedSlice(ctx.allocator) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - return jsonResponse(200, json_body); -} - // ── Tracker Handlers ───────────────────────────────────────────────── fn 
formatRunningTask(allocator: std.mem.Allocator, task: tracker_mod.RunningTask) ![]const u8 { @@ -2115,16 +2000,6 @@ fn validationErrorResponse(err: workflow_validation.ValidateError) HttpResponse error.DependsOnItemNotString => jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"depends_on items must be strings\"}}"), error.DependsOnDuplicate => jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"depends_on contains duplicate step id\"}}"), error.DependsOnUnknownStepId => jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"depends_on references unknown step id\"}}"), - error.LoopBodyRequired => jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"loop step requires 'body' field\"}}"), - error.SubWorkflowRequired => jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"sub_workflow step requires 'workflow' field\"}}"), - error.WaitConditionRequired => jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"wait step requires 'duration_ms', 'until_ms', or 'signal'\"}}"), - error.WaitDurationInvalid => jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"wait.duration_ms must be a non-negative integer\"}}"), - error.WaitUntilInvalid => jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"wait.until_ms must be a non-negative integer\"}}"), - error.WaitSignalInvalid => jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"wait.signal must be a non-empty string\"}}"), - error.RouterRoutesRequired => jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"router step requires 'routes' field\"}}"), - error.SagaBodyRequired => jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"saga step requires 'body' field\"}}"), - error.DebateCountRequired => jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"debate step requires 'count' field\"}}"), - error.GroupChatParticipantsRequired 
=> jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"group_chat step requires 'participants' field\"}}"), error.RetryMustBeObject => jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"retry must be an object\"}}"), error.MaxAttemptsMustBePositiveInteger => jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"retry.max_attempts must be a positive integer\"}}"), error.TimeoutMsMustBePositiveInteger => jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"timeout_ms must be a positive integer\"}}"), @@ -2508,48 +2383,6 @@ test "API: create run rejects non-positive timeout_ms" { try std.testing.expectEqual(@as(u16, 400), resp.status_code); } -test "API: create run rejects invalid wait duration string" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - var ctx = Context{ - .store = &store, - .allocator = arena.allocator(), - }; - - const body = - \\{"steps":[{"id":"w1","type":"wait","duration_ms":"abc"}]} - ; - - const resp = handleRequest(&ctx, "POST", "/runs", body); - try std.testing.expectEqual(@as(u16, 400), resp.status_code); -} - -test "API: create run rejects invalid wait signal type" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - var ctx = Context{ - .store = &store, - .allocator = arena.allocator(), - }; - - const body = - \\{"steps":[{"id":"w1","type":"wait","signal":1}]} - ; - - const resp = handleRequest(&ctx, "POST", "/runs", body); - try std.testing.expectEqual(@as(u16, 400), resp.status_code); -} - test "API: create run rejects duplicate depends_on items" { const allocator = std.testing.allocator; var store = try Store.init(allocator, ":memory:"); @@ -2591,15 +2424,11 @@ 
test "API: get step enforces run ownership" { try std.testing.expectEqual(@as(u16, 404), resp.status_code); } -test "API: chat transcript escapes message content" { +test "API: approve endpoint returns 410 gone" { const allocator = std.testing.allocator; var store = try Store.init(allocator, ":memory:"); defer store.deinit(); - try store.insertRun("run-chat", null, "running", "{\"steps\":[]}", "{}", "[]"); - try store.insertStep("step-chat-1", "run-chat", "chat", "group_chat", "completed", "{}", 1, null, null, null); - try store.insertChatMessage("run-chat", "step-chat-1", 1, "agent", null, "He said \"go\"\\nline"); - var arena = std.heap.ArenaAllocator.init(allocator); defer arena.deinit(); @@ -2608,15 +2437,8 @@ test "API: chat transcript escapes message content" { .allocator = arena.allocator(), }; - const resp = handleRequest(&ctx, "GET", "/runs/run-chat/steps/step-chat-1/chat", ""); - try std.testing.expectEqual(@as(u16, 200), resp.status_code); - - const parsed = try std.json.parseFromSlice(std.json.Value, allocator, resp.body, .{}); - defer parsed.deinit(); - - try std.testing.expectEqual(@as(usize, 1), parsed.value.array.items.len); - const msg = parsed.value.array.items[0].object.get("message").?; - try std.testing.expectEqualStrings("He said \"go\"\\nline", msg.string); + const resp = handleRequest(&ctx, "POST", "/runs/run-1/steps/step-1/approve", ""); + try std.testing.expectEqual(@as(u16, 410), resp.status_code); } test "API: register worker rejects non-array tags" { diff --git a/src/migrations/004_orchestration.sql b/src/migrations/004_orchestration.sql index 4091294..ce69d40 100644 --- a/src/migrations/004_orchestration.sql +++ b/src/migrations/004_orchestration.sql @@ -1,5 +1,5 @@ --- Note: step_deps, cycle_state, saga_state kept for backward compatibility --- until engine.zig is rewritten (Task 8). They will be removed then. +-- Note: step_deps table is kept for legacy POST /runs endpoint backward compatibility. 
+-- cycle_state, chat_messages, saga_state tables are legacy (unused by current engine). -- Saved workflow definitions CREATE TABLE IF NOT EXISTS workflows ( diff --git a/src/store.zig b/src/store.zig index 31d729e..92a3def 100644 --- a/src/store.zig +++ b/src/store.zig @@ -716,28 +716,6 @@ pub const Store = struct { } } - pub fn getReadySteps(self: *Self, allocator: std.mem.Allocator, run_id: []const u8) ![]types.StepRow { - const sql = - "SELECT s.id, s.run_id, s.def_step_id, s.type, s.status, s.worker_id, s.input_json, s.output_json, s.error_text, s.attempt, s.max_attempts, s.timeout_ms, s.next_attempt_at_ms, s.parent_step_id, s.item_index, s.created_at_ms, s.updated_at_ms, s.started_at_ms, s.ended_at_ms, s.child_run_id, s.iteration_index " ++ - "FROM steps s WHERE s.run_id = ? AND s.status = 'ready' " ++ - "AND NOT EXISTS (" ++ - "SELECT 1 FROM step_deps d JOIN steps dep ON dep.id = d.depends_on " ++ - "WHERE d.step_id = s.id AND dep.status NOT IN ('completed', 'skipped'))"; - var stmt: ?*c.sqlite3_stmt = null; - if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { - return error.SqlitePrepareFailed; - } - defer _ = c.sqlite3_finalize(stmt); - - _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); - - var list: std.ArrayListUnmanaged(types.StepRow) = .empty; - while (c.sqlite3_step(stmt) == c.SQLITE_ROW) { - try list.append(allocator, try readStepRow(allocator, stmt)); - } - return list.toOwnedSlice(allocator); - } - pub fn countStepsByStatus(self: *Self, run_id: []const u8, status: []const u8) !i64 { const sql = "SELECT COUNT(*) FROM steps WHERE run_id = ? AND status = ?"; var stmt: ?*c.sqlite3_stmt = null; @@ -770,24 +748,6 @@ pub const Store = struct { return list.toOwnedSlice(allocator); } - /// Get the IDs of steps that a given step depends on. 
- pub fn getStepDeps(self: *Self, allocator: std.mem.Allocator, step_id: []const u8) ![][]const u8 { - const sql = "SELECT depends_on FROM step_deps WHERE step_id = ?"; - var stmt: ?*c.sqlite3_stmt = null; - if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { - return error.SqlitePrepareFailed; - } - defer _ = c.sqlite3_finalize(stmt); - - _ = c.sqlite3_bind_text(stmt, 1, step_id.ptr, @intCast(step_id.len), SQLITE_STATIC); - - var list: std.ArrayListUnmanaged([]const u8) = .empty; - while (c.sqlite3_step(stmt) == c.SQLITE_ROW) { - try list.append(allocator, try allocStr(allocator, stmt, 0)); - } - return list.toOwnedSlice(allocator); - } - /// Count how many running tasks a worker currently has. pub fn countRunningStepsByWorker(self: *Self, worker_id: []const u8) !i64 { const sql = "SELECT COUNT(*) FROM steps WHERE worker_id = ? AND status = 'running'"; @@ -803,24 +763,6 @@ pub const Store = struct { return colInt(stmt, 0); } - /// Set started_at_ms for a step (used by wait steps to track timer start). - pub fn setStepStartedAt(self: *Self, step_id: []const u8, ts_ms: i64) !void { - const sql = "UPDATE steps SET started_at_ms = ?, updated_at_ms = ? 
WHERE id = ?"; - var stmt: ?*c.sqlite3_stmt = null; - if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { - return error.SqlitePrepareFailed; - } - defer _ = c.sqlite3_finalize(stmt); - - _ = c.sqlite3_bind_int64(stmt, 1, ts_ms); - _ = c.sqlite3_bind_int64(stmt, 2, ids.nowMs()); - _ = c.sqlite3_bind_text(stmt, 3, step_id.ptr, @intCast(step_id.len), SQLITE_STATIC); - - if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { - return error.SqliteStepFailed; - } - } - fn readStepRow(allocator: std.mem.Allocator, stmt: ?*c.sqlite3_stmt) !types.StepRow { return types.StepRow{ .id = try allocStr(allocator, stmt, 0), @@ -1107,156 +1049,6 @@ pub const Store = struct { }; } - // ── Cycle State CRUD ───────────────────────────────────────────── - - pub fn getCycleState(self: *Self, run_id: []const u8, cycle_key: []const u8) !?struct { iteration_count: i64, max_iterations: i64 } { - const sql = "SELECT iteration_count, max_iterations FROM cycle_state WHERE run_id = ? AND cycle_key = ?"; - var stmt: ?*c.sqlite3_stmt = null; - if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { - return error.SqlitePrepareFailed; - } - defer _ = c.sqlite3_finalize(stmt); - - _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); - _ = c.sqlite3_bind_text(stmt, 2, cycle_key.ptr, @intCast(cycle_key.len), SQLITE_STATIC); - - if (c.sqlite3_step(stmt) != c.SQLITE_ROW) return null; - - return .{ - .iteration_count = colInt(stmt, 0), - .max_iterations = colInt(stmt, 1), - }; - } - - pub fn upsertCycleState(self: *Self, run_id: []const u8, cycle_key: []const u8, iteration_count: i64, max_iterations: i64) !void { - const sql = "INSERT OR REPLACE INTO cycle_state (run_id, cycle_key, iteration_count, max_iterations) VALUES (?, ?, ?, ?)"; - var stmt: ?*c.sqlite3_stmt = null; - if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { - return error.SqlitePrepareFailed; - } - defer _ = c.sqlite3_finalize(stmt); - - _ = 
c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); - _ = c.sqlite3_bind_text(stmt, 2, cycle_key.ptr, @intCast(cycle_key.len), SQLITE_STATIC); - _ = c.sqlite3_bind_int64(stmt, 3, iteration_count); - _ = c.sqlite3_bind_int64(stmt, 4, max_iterations); - - if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { - return error.SqliteStepFailed; - } - } - - // ── Chat Message CRUD ──────────────────────────────────────────── - - pub fn insertChatMessage(self: *Self, run_id: []const u8, step_id: []const u8, round: i64, role: []const u8, worker_id: ?[]const u8, message: []const u8) !void { - const sql = "INSERT INTO chat_messages (run_id, step_id, round, role, worker_id, message, ts_ms) VALUES (?, ?, ?, ?, ?, ?, ?)"; - var stmt: ?*c.sqlite3_stmt = null; - if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { - return error.SqlitePrepareFailed; - } - defer _ = c.sqlite3_finalize(stmt); - - _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); - _ = c.sqlite3_bind_text(stmt, 2, step_id.ptr, @intCast(step_id.len), SQLITE_STATIC); - _ = c.sqlite3_bind_int64(stmt, 3, round); - _ = c.sqlite3_bind_text(stmt, 4, role.ptr, @intCast(role.len), SQLITE_STATIC); - bindTextOpt(stmt, 5, worker_id); - _ = c.sqlite3_bind_text(stmt, 6, message.ptr, @intCast(message.len), SQLITE_STATIC); - _ = c.sqlite3_bind_int64(stmt, 7, ids.nowMs()); - - if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { - return error.SqliteStepFailed; - } - } - - pub fn getChatMessages(self: *Self, allocator: std.mem.Allocator, step_id: []const u8) ![]types.ChatMessageRow { - const sql = "SELECT id, run_id, step_id, round, role, worker_id, message, ts_ms FROM chat_messages WHERE step_id = ? 
ORDER BY round, id"; - var stmt: ?*c.sqlite3_stmt = null; - if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { - return error.SqlitePrepareFailed; - } - defer _ = c.sqlite3_finalize(stmt); - - _ = c.sqlite3_bind_text(stmt, 1, step_id.ptr, @intCast(step_id.len), SQLITE_STATIC); - - var list: std.ArrayListUnmanaged(types.ChatMessageRow) = .empty; - while (c.sqlite3_step(stmt) == c.SQLITE_ROW) { - try list.append(allocator, .{ - .id = colInt(stmt, 0), - .run_id = try allocStr(allocator, stmt, 1), - .step_id = try allocStr(allocator, stmt, 2), - .round = colInt(stmt, 3), - .role = try allocStr(allocator, stmt, 4), - .worker_id = try allocStrOpt(allocator, stmt, 5), - .message = try allocStr(allocator, stmt, 6), - .ts_ms = colInt(stmt, 7), - }); - } - return list.toOwnedSlice(allocator); - } - - // ── Saga State CRUD ────────────────────────────────────────────── - - pub fn insertSagaState(self: *Self, run_id: []const u8, saga_step_id: []const u8, body_step_id: []const u8, compensation_step_id: ?[]const u8) !void { - const sql = "INSERT INTO saga_state (run_id, saga_step_id, body_step_id, compensation_step_id, status) VALUES (?, ?, ?, ?, 'pending')"; - var stmt: ?*c.sqlite3_stmt = null; - if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { - return error.SqlitePrepareFailed; - } - defer _ = c.sqlite3_finalize(stmt); - - _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); - _ = c.sqlite3_bind_text(stmt, 2, saga_step_id.ptr, @intCast(saga_step_id.len), SQLITE_STATIC); - _ = c.sqlite3_bind_text(stmt, 3, body_step_id.ptr, @intCast(body_step_id.len), SQLITE_STATIC); - bindTextOpt(stmt, 4, compensation_step_id); - - if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { - return error.SqliteStepFailed; - } - } - - pub fn updateSagaState(self: *Self, run_id: []const u8, saga_step_id: []const u8, body_step_id: []const u8, status: []const u8) !void { - const sql = "UPDATE saga_state SET status = ? WHERE run_id = ? 
AND saga_step_id = ? AND body_step_id = ?"; - var stmt: ?*c.sqlite3_stmt = null; - if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { - return error.SqlitePrepareFailed; - } - defer _ = c.sqlite3_finalize(stmt); - - _ = c.sqlite3_bind_text(stmt, 1, status.ptr, @intCast(status.len), SQLITE_STATIC); - _ = c.sqlite3_bind_text(stmt, 2, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); - _ = c.sqlite3_bind_text(stmt, 3, saga_step_id.ptr, @intCast(saga_step_id.len), SQLITE_STATIC); - _ = c.sqlite3_bind_text(stmt, 4, body_step_id.ptr, @intCast(body_step_id.len), SQLITE_STATIC); - - if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { - return error.SqliteStepFailed; - } - } - - pub fn getSagaStates(self: *Self, allocator: std.mem.Allocator, run_id: []const u8, saga_step_id: []const u8) ![]types.SagaStateRow { - const sql = "SELECT run_id, saga_step_id, body_step_id, compensation_step_id, status FROM saga_state WHERE run_id = ? AND saga_step_id = ? ORDER BY rowid"; - var stmt: ?*c.sqlite3_stmt = null; - if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { - return error.SqlitePrepareFailed; - } - defer _ = c.sqlite3_finalize(stmt); - - _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); - _ = c.sqlite3_bind_text(stmt, 2, saga_step_id.ptr, @intCast(saga_step_id.len), SQLITE_STATIC); - - var list: std.ArrayListUnmanaged(types.SagaStateRow) = .empty; - while (c.sqlite3_step(stmt) == c.SQLITE_ROW) { - try list.append(allocator, .{ - .run_id = try allocStr(allocator, stmt, 0), - .saga_step_id = try allocStr(allocator, stmt, 1), - .body_step_id = try allocStr(allocator, stmt, 2), - .compensation_step_id = try allocStrOpt(allocator, stmt, 3), - .status = try allocStr(allocator, stmt, 4), - }); - } - return list.toOwnedSlice(allocator); - } - // ── Sub-workflow Helper ────────────────────────────────────────── pub fn updateStepInputJson(self: *Self, step_id: []const u8, input_json: []const u8) !void { @@ -2166,33 
+1958,6 @@ test "Store: get active runs" { try std.testing.expectEqual(@as(usize, 2), active.len); } -test "Store: step deps and ready steps" { - const allocator = std.testing.allocator; - var s = try Store.init(allocator, ":memory:"); - defer s.deinit(); - - try s.insertRun("r1", null, "running", "{}", "{}", "[]"); - try s.insertStep("s1", "r1", "step1", "task", "ready", "{}", 1, null, null, null); - try s.insertStep("s2", "r1", "step2", "task", "ready", "{}", 1, null, null, null); - try s.insertStepDep("s2", "s1"); - - // s1 should be ready (no unsatisfied deps), s2 should NOT (depends on s1 which is 'ready' not 'completed') - const ready = try s.getReadySteps(allocator, "r1"); - defer { - for (ready) |step| { - allocator.free(step.id); - allocator.free(step.run_id); - allocator.free(step.def_step_id); - allocator.free(step.type); - allocator.free(step.status); - allocator.free(step.input_json); - } - allocator.free(ready); - } - try std.testing.expectEqual(@as(usize, 1), ready.len); - try std.testing.expectEqualStrings("s1", ready[0].id); -} - test "Store: count steps by status" { const allocator = std.testing.allocator; var s = try Store.init(allocator, ":memory:"); @@ -2336,114 +2101,6 @@ test "Store: get nonexistent step returns null" { try std.testing.expect(step == null); } -test "cycle state: upsert and get" { - const allocator = std.testing.allocator; - var s = try Store.init(allocator, ":memory:"); - defer s.deinit(); - - // Insert a run first (cycle_state references runs(id)) - try s.insertRun("r1", null, "running", "{}", "{}", "[]"); - - // Upsert cycle state - try s.upsertCycleState("r1", "loop_A", 1, 10); - - // Get and verify values - const cs = (try s.getCycleState("r1", "loop_A")).?; - try std.testing.expectEqual(@as(i64, 1), cs.iteration_count); - try std.testing.expectEqual(@as(i64, 10), cs.max_iterations); - - // Upsert again with new iteration_count - try s.upsertCycleState("r1", "loop_A", 5, 10); - - // Verify updated value - const cs2 = (try 
s.getCycleState("r1", "loop_A")).?; - try std.testing.expectEqual(@as(i64, 5), cs2.iteration_count); - try std.testing.expectEqual(@as(i64, 10), cs2.max_iterations); -} - -test "cycle state: get returns null for nonexistent" { - const allocator = std.testing.allocator; - var s = try Store.init(allocator, ":memory:"); - defer s.deinit(); - - const cs = try s.getCycleState("no_run", "no_key"); - try std.testing.expect(cs == null); -} - -test "chat messages: insert and get ordered by round" { - const allocator = std.testing.allocator; - var s = try Store.init(allocator, ":memory:"); - defer s.deinit(); - - try s.insertRun("r1", null, "running", "{}", "{}", "[]"); - try s.insertStep("s1", "r1", "chat_step", "group_chat", "running", "{}", 1, null, null, null); - - // Insert messages with different rounds (out of order) - try s.insertChatMessage("r1", "s1", 2, "assistant", "w1", "round 2 message"); - try s.insertChatMessage("r1", "s1", 1, "user", null, "round 1 message"); - try s.insertChatMessage("r1", "s1", 1, "assistant", "w1", "round 1 reply"); - - // Verify getChatMessages returns them ordered by round, id - const msgs = try s.getChatMessages(allocator, "s1"); - defer { - for (msgs) |m| { - allocator.free(m.run_id); - allocator.free(m.step_id); - allocator.free(m.role); - if (m.worker_id) |wid| allocator.free(wid); - allocator.free(m.message); - } - allocator.free(msgs); - } - try std.testing.expectEqual(@as(usize, 3), msgs.len); - // First two should be round 1 (ordered by id within round) - try std.testing.expectEqual(@as(i64, 1), msgs[0].round); - try std.testing.expectEqual(@as(i64, 1), msgs[1].round); - try std.testing.expectEqual(@as(i64, 2), msgs[2].round); - try std.testing.expectEqualStrings("round 1 message", msgs[0].message); - try std.testing.expectEqualStrings("round 1 reply", msgs[1].message); - try std.testing.expectEqualStrings("round 2 message", msgs[2].message); -} - -test "saga state: insert, update status, and get" { - const allocator = 
std.testing.allocator; - var s = try Store.init(allocator, ":memory:"); - defer s.deinit(); - - try s.insertRun("r1", null, "running", "{}", "{}", "[]"); - try s.insertStep("saga1", "r1", "saga_def", "saga", "running", "{}", 1, null, null, null); - try s.insertStep("body1", "r1", "body_def1", "task", "pending", "{}", 1, null, "saga1", null); - try s.insertStep("body2", "r1", "body_def2", "task", "pending", "{}", 1, null, "saga1", null); - try s.insertStep("comp1", "r1", "comp_def1", "task", "pending", "{}", 1, null, "saga1", null); - - // Insert saga states for body steps - try s.insertSagaState("r1", "saga1", "body1", "comp1"); - try s.insertSagaState("r1", "saga1", "body2", null); - - // Update one to 'completed' - try s.updateSagaState("r1", "saga1", "body1", "completed"); - - // Verify getSagaStates returns correct statuses - const states = try s.getSagaStates(allocator, "r1", "saga1"); - defer { - for (states) |st| { - allocator.free(st.run_id); - allocator.free(st.saga_step_id); - allocator.free(st.body_step_id); - if (st.compensation_step_id) |cid| allocator.free(cid); - allocator.free(st.status); - } - allocator.free(states); - } - try std.testing.expectEqual(@as(usize, 2), states.len); - try std.testing.expectEqualStrings("body1", states[0].body_step_id); - try std.testing.expectEqualStrings("completed", states[0].status); - try std.testing.expectEqualStrings("comp1", states[0].compensation_step_id.?); - try std.testing.expectEqualStrings("body2", states[1].body_step_id); - try std.testing.expectEqualStrings("pending", states[1].status); - try std.testing.expect(states[1].compensation_step_id == null); -} - test "updateStepChildRunId: sets child_run_id on step" { const allocator = std.testing.allocator; var s = try Store.init(allocator, ":memory:"); diff --git a/src/templates.zig b/src/templates.zig index 955945c..36d8205 100644 --- a/src/templates.zig +++ b/src/templates.zig @@ -1,13 +1,18 @@ /// Template engine for prompt rendering. 
/// Resolves `{{...}}` expressions against workflow context. /// -/// Supported expressions: +/// Legacy Context + render(): /// - `{{input.X}}` -- look up key X in the workflow input JSON /// - `{{input.X.Y}}` -- nested object lookups inside workflow input JSON -/// - `{{steps.ID.output}}` -- output of a single completed step -/// - `{{steps.ID.outputs}}` -- JSON array of outputs from map/fan_out child steps /// - `{{item}}` -- current item string for map iterations -/// - `{{task.X}}` -- look up field X in the NullTickets task JSON (supports nested paths like `task.metadata.repo_url`) +/// - `{{task.X}}` -- look up field X in the NullTickets task JSON +/// - `{{attempt}}` -- current retry attempt number +/// +/// State-based renderTemplate(): +/// - `{{state.X}}` -- look up key X in the unified state JSON +/// - `{{state.X.Y}}` -- nested paths with optional [-1] array indexing +/// - `{{input.X}}` -- look up key X in the workflow input JSON +/// - `{{item}}` -- current item string for send iterations /// /// Conditional blocks: /// - `{% if %}...{% endif %}` @@ -21,18 +26,14 @@ const std = @import("std"); pub const Context = struct { input_json: []const u8, // raw JSON string of workflow input - step_outputs: []const StepOutput, // completed step outputs - item: ?[]const u8, // current map item (null if not in map) - debate_responses: ?[]const u8 = null, // JSON array string for debate judge template - chat_history: ?[]const u8 = null, // formatted chat transcript for group_chat round_template - role: ?[]const u8 = null, // participant role for group_chat round_template + step_outputs: []const StepOutput, // completed step outputs (legacy, for tracker.zig) + item: ?[]const u8, // current item string (null if not in map/send) task_json: ?[]const u8 = null, // raw JSON string of NullTickets task data attempt: ?u32 = null, // current retry attempt number pub const StepOutput = struct { step_id: []const u8, output: ?[]const u8, // single output (for task steps) - 
outputs: ?[]const []const u8, // array of outputs (for fan_out/map parent) }; }; @@ -222,27 +223,6 @@ fn resolveExpression(allocator: std.mem.Allocator, expr: []const u8, ctx: Contex return error.ItemNotAvailable; } - if (std.mem.eql(u8, expr, "debate_responses")) { - if (ctx.debate_responses) |dr| { - return allocator.dupe(u8, dr) catch return error.OutOfMemory; - } - return allocator.dupe(u8, "[]") catch return error.OutOfMemory; - } - - if (std.mem.eql(u8, expr, "chat_history")) { - if (ctx.chat_history) |ch| { - return allocator.dupe(u8, ch) catch return error.OutOfMemory; - } - return allocator.dupe(u8, "") catch return error.OutOfMemory; - } - - if (std.mem.eql(u8, expr, "role")) { - if (ctx.role) |r| { - return allocator.dupe(u8, r) catch return error.OutOfMemory; - } - return allocator.dupe(u8, "") catch return error.OutOfMemory; - } - if (std.mem.eql(u8, expr, "attempt")) { if (ctx.attempt) |a| { return std.fmt.allocPrint(allocator, "{d}", .{a}) catch return error.OutOfMemory; @@ -292,7 +272,7 @@ fn resolveInputField(allocator: std.mem.Allocator, input_json: []const u8, field } fn resolveStepRef(allocator: std.mem.Allocator, rest: []const u8, step_outputs: []const Context.StepOutput) RenderError![]const u8 { - // rest is "ID.output" or "ID.outputs" + // rest is "ID.output" const dot_pos = std.mem.lastIndexOfScalar(u8, rest, '.') orelse return error.UnknownExpression; const step_id = rest[0..dot_pos]; const field = rest[dot_pos + 1 ..]; @@ -306,9 +286,6 @@ fn resolveStepRef(allocator: std.mem.Allocator, rest: []const u8, step_outputs: } return allocator.dupe(u8, "") catch return error.OutOfMemory; } - if (std.mem.eql(u8, field, "outputs")) { - return serializeOutputs(allocator, so.outputs); - } return error.UnknownExpression; } } @@ -336,38 +313,6 @@ fn resolveTaskField(allocator: std.mem.Allocator, task_json: []const u8, field_p return jsonValueToString(allocator, current); } -fn serializeOutputs(allocator: std.mem.Allocator, outputs: ?[]const []const u8) 
RenderError![]const u8 { - const items = outputs orelse { - return allocator.dupe(u8, "[]") catch return error.OutOfMemory; - }; - - var buf: std.ArrayListUnmanaged(u8) = .empty; - errdefer buf.deinit(allocator); - - buf.append(allocator, '[') catch return error.OutOfMemory; - for (items, 0..) |item, i| { - if (i > 0) { - buf.append(allocator, ',') catch return error.OutOfMemory; - } - // Write JSON-escaped string - buf.append(allocator, '"') catch return error.OutOfMemory; - for (item) |c| { - switch (c) { - '"' => buf.appendSlice(allocator, "\\\"") catch return error.OutOfMemory, - '\\' => buf.appendSlice(allocator, "\\\\") catch return error.OutOfMemory, - '\n' => buf.appendSlice(allocator, "\\n") catch return error.OutOfMemory, - '\r' => buf.appendSlice(allocator, "\\r") catch return error.OutOfMemory, - '\t' => buf.appendSlice(allocator, "\\t") catch return error.OutOfMemory, - else => buf.append(allocator, c) catch return error.OutOfMemory, - } - } - buf.append(allocator, '"') catch return error.OutOfMemory; - } - buf.append(allocator, ']') catch return error.OutOfMemory; - - return buf.toOwnedSlice(allocator) catch return error.OutOfMemory; -} - fn jsonValueToString(allocator: std.mem.Allocator, val: std.json.Value) RenderError![]const u8 { switch (val) { .string => |s| { @@ -780,7 +725,7 @@ test "render step output" { const result = try render(allocator, "Result: {{steps.s1.output}}", .{ .input_json = "{}", .step_outputs = &.{ - .{ .step_id = "s1", .output = "found data", .outputs = null }, + .{ .step_id = "s1", .output = "found data" }, }, .item = null, }); @@ -788,22 +733,6 @@ test "render step output" { try std.testing.expectEqualStrings("Result: found data", result); } -test "render step outputs array" { - const allocator = std.testing.allocator; - const outputs: []const []const u8 = &.{ "result1", "result2" }; - const result = try render(allocator, "All: {{steps.s1.outputs}}", .{ - .input_json = "{}", - .step_outputs = &.{ - .{ .step_id = "s1", .output 
= null, .outputs = outputs }, - }, - .item = null, - }); - defer allocator.free(result); - // Should produce a JSON array like: ["result1","result2"] - try std.testing.expect(std.mem.indexOf(u8, result, "result1") != null); - try std.testing.expect(std.mem.indexOf(u8, result, "result2") != null); -} - test "render item in map context" { const allocator = std.testing.allocator; const result = try render(allocator, "Research: {{item}}", .{ @@ -912,43 +841,6 @@ test "item without map context returns error" { try std.testing.expectError(error.ItemNotAvailable, err); } -test "render debate_responses expression" { - const allocator = std.testing.allocator; - const result = try render(allocator, "Pick best:\n{{debate_responses}}", .{ - .input_json = "{}", - .step_outputs = &.{}, - .item = null, - .debate_responses = "[\"resp1\",\"resp2\"]", - }); - defer allocator.free(result); - try std.testing.expect(std.mem.indexOf(u8, result, "resp1") != null); - try std.testing.expect(std.mem.indexOf(u8, result, "resp2") != null); -} - -test "render chat_history and role expressions" { - const allocator = std.testing.allocator; - const result = try render(allocator, "Previous:\n{{chat_history}}\nYour role: {{role}}", .{ - .input_json = "{}", - .step_outputs = &.{}, - .item = null, - .chat_history = "Architect: design first", - .role = "Frontend Dev", - }); - defer allocator.free(result); - try std.testing.expectEqualStrings("Previous:\nArchitect: design first\nYour role: Frontend Dev", result); -} - -test "debate_responses defaults to empty array when not set" { - const allocator = std.testing.allocator; - const result = try render(allocator, "{{debate_responses}}", .{ - .input_json = "{}", - .step_outputs = &.{}, - .item = null, - }); - defer allocator.free(result); - try std.testing.expectEqualStrings("[]", result); -} - test "render task.title variable" { const allocator = std.testing.allocator; const result = try render(allocator, "Work on: {{task.title}}", .{ diff --git 
a/src/types.zig b/src/types.zig index 2f28790..07dac57 100644 --- a/src/types.zig +++ b/src/types.zig @@ -203,17 +203,6 @@ pub const ArtifactRow = struct { created_at_ms: i64, }; -pub const ChatMessageRow = struct { - id: i64, - run_id: []const u8, - step_id: []const u8, - round: i64, - role: []const u8, - worker_id: ?[]const u8, - message: []const u8, - ts_ms: i64, -}; - pub const TrackerRunRow = struct { task_id: []const u8, tracker_run_id: []const u8, @@ -235,14 +224,6 @@ pub const TrackerRunRow = struct { last_error_text: ?[]const u8, }; -pub const SagaStateRow = struct { - run_id: []const u8, - saga_step_id: []const u8, - body_step_id: []const u8, - compensation_step_id: ?[]const u8, - status: []const u8, -}; - pub const WorkflowRow = struct { id: []const u8, name: []const u8, diff --git a/src/workflow_validation.zig b/src/workflow_validation.zig index f0ae69e..6c9e9f0 100644 --- a/src/workflow_validation.zig +++ b/src/workflow_validation.zig @@ -12,16 +12,6 @@ pub const ValidateError = error{ DependsOnItemNotString, DependsOnDuplicate, DependsOnUnknownStepId, - LoopBodyRequired, - SubWorkflowRequired, - WaitConditionRequired, - WaitDurationInvalid, - WaitUntilInvalid, - WaitSignalInvalid, - RouterRoutesRequired, - SagaBodyRequired, - DebateCountRequired, - GroupChatParticipantsRequired, RetryMustBeObject, MaxAttemptsMustBePositiveInteger, TimeoutMsMustBePositiveInteger, @@ -70,47 +60,9 @@ fn getJsonString(obj: std.json.ObjectMap, key: []const u8) ?[]const u8 { } fn validateStepTypeRules(step_type: []const u8, step_obj: std.json.ObjectMap) ValidateError!void { - if (std.mem.eql(u8, step_type, "loop") and step_obj.get("body") == null) { - return error.LoopBodyRequired; - } - if (std.mem.eql(u8, step_type, "sub_workflow") and step_obj.get("workflow") == null) { - return error.SubWorkflowRequired; - } - if (std.mem.eql(u8, step_type, "wait")) { - if (step_obj.get("duration_ms") == null and step_obj.get("until_ms") == null and step_obj.get("signal") == null) { - 
return error.WaitConditionRequired; - } - if (step_obj.get("duration_ms")) |duration_val| { - switch (duration_val) { - .integer => { - if (duration_val.integer < 0) return error.WaitDurationInvalid; - }, - else => return error.WaitDurationInvalid, - } - } - if (step_obj.get("until_ms")) |until_val| { - if (until_val != .integer or until_val.integer < 0) { - return error.WaitUntilInvalid; - } - } - if (step_obj.get("signal")) |signal_val| { - if (signal_val != .string or signal_val.string.len == 0) { - return error.WaitSignalInvalid; - } - } - } - if (std.mem.eql(u8, step_type, "router") and step_obj.get("routes") == null) { - return error.RouterRoutesRequired; - } - if (std.mem.eql(u8, step_type, "saga") and step_obj.get("body") == null) { - return error.SagaBodyRequired; - } - if (std.mem.eql(u8, step_type, "debate") and step_obj.get("count") == null) { - return error.DebateCountRequired; - } - if (std.mem.eql(u8, step_type, "group_chat") and step_obj.get("participants") == null) { - return error.GroupChatParticipantsRequired; - } + // No specific rules for current step types (task, route, interrupt, agent, send, transform, subgraph) + _ = step_type; + _ = step_obj; } fn validateDependsOnTypes(allocator: std.mem.Allocator, step_obj: std.json.ObjectMap) ValidateError!void { @@ -723,58 +675,6 @@ test "validateStepsForCreateRun: rejects duplicate depends_on item" { try std.testing.expectError(error.DependsOnDuplicate, validateStepsForCreateRun(allocator, parsed.value.array.items)); } -test "validateStepsForCreateRun: rejects missing sub_workflow workflow field" { - const allocator = std.testing.allocator; - const payload = - \\[ - \\ {"id":"sw","type":"sub_workflow"} - \\] - ; - - const parsed = try std.json.parseFromSlice(std.json.Value, allocator, payload, .{}); - defer parsed.deinit(); - try std.testing.expectError(error.SubWorkflowRequired, validateStepsForCreateRun(allocator, parsed.value.array.items)); -} - -test "validateStepsForCreateRun: rejects missing 
saga body field" { - const allocator = std.testing.allocator; - const payload = - \\[ - \\ {"id":"sg","type":"saga"} - \\] - ; - - const parsed = try std.json.parseFromSlice(std.json.Value, allocator, payload, .{}); - defer parsed.deinit(); - try std.testing.expectError(error.SagaBodyRequired, validateStepsForCreateRun(allocator, parsed.value.array.items)); -} - -test "validateStepsForCreateRun: rejects missing debate count field" { - const allocator = std.testing.allocator; - const payload = - \\[ - \\ {"id":"db","type":"debate","prompt_template":"x"} - \\] - ; - - const parsed = try std.json.parseFromSlice(std.json.Value, allocator, payload, .{}); - defer parsed.deinit(); - try std.testing.expectError(error.DebateCountRequired, validateStepsForCreateRun(allocator, parsed.value.array.items)); -} - -test "validateStepsForCreateRun: rejects missing group_chat participants field" { - const allocator = std.testing.allocator; - const payload = - \\[ - \\ {"id":"gc","type":"group_chat","prompt_template":"x"} - \\] - ; - - const parsed = try std.json.parseFromSlice(std.json.Value, allocator, payload, .{}); - defer parsed.deinit(); - try std.testing.expectError(error.GroupChatParticipantsRequired, validateStepsForCreateRun(allocator, parsed.value.array.items)); -} - test "validateStepsForCreateRun: rejects non-object retry field" { const allocator = std.testing.allocator; const payload = @@ -814,45 +714,6 @@ test "validateStepsForCreateRun: rejects non-positive timeout_ms" { try std.testing.expectError(error.TimeoutMsMustBePositiveInteger, validateStepsForCreateRun(allocator, parsed.value.array.items)); } -test "validateStepsForCreateRun: rejects invalid wait duration string" { - const allocator = std.testing.allocator; - const payload = - \\[ - \\ {"id":"w","type":"wait","duration_ms":"abc"} - \\] - ; - - const parsed = try std.json.parseFromSlice(std.json.Value, allocator, payload, .{}); - defer parsed.deinit(); - try std.testing.expectError(error.WaitDurationInvalid, 
validateStepsForCreateRun(allocator, parsed.value.array.items)); -} - -test "validateStepsForCreateRun: rejects negative wait duration" { - const allocator = std.testing.allocator; - const payload = - \\[ - \\ {"id":"w","type":"wait","duration_ms":-1} - \\] - ; - - const parsed = try std.json.parseFromSlice(std.json.Value, allocator, payload, .{}); - defer parsed.deinit(); - try std.testing.expectError(error.WaitDurationInvalid, validateStepsForCreateRun(allocator, parsed.value.array.items)); -} - -test "validateStepsForCreateRun: rejects invalid wait signal type" { - const allocator = std.testing.allocator; - const payload = - \\[ - \\ {"id":"w","type":"wait","signal":1} - \\] - ; - - const parsed = try std.json.parseFromSlice(std.json.Value, allocator, payload, .{}); - defer parsed.deinit(); - try std.testing.expectError(error.WaitSignalInvalid, validateStepsForCreateRun(allocator, parsed.value.array.items)); -} - // ── Tests: new graph validation ──────────────────────────────────────── test "validate valid simple workflow" { From c20ce7e7108436e261c665b6c1bc7e3dab82b122 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 22:02:20 -0300 Subject: [PATCH 22/55] Fix schema key mismatch and remove dead code Engine used "schema" key to look up state schema from workflow JSON, but the API and validation modules use "state_schema". This caused reducers to silently fall back to last_value for all API-created runs. Add getSchemaJson() helper that checks both "state_schema" (canonical) and "schema" (fallback for inline test workflows). 
Remove dead code: - isNodeDeferred (unused, collectDeferredNodes used instead) - getNodeFieldBool (only caller was isNodeDeferred) - mergeWorkflowVersionIntoMeta (unused, serializeRouteResultsWithVersion handles this) - InvalidNumber error set declaration (error.InvalidNumber works via implicit sets) - _field_name unused parameter in checkStateRefs --- src/engine.zig | 87 ++++++++----------------------------- src/state.zig | 4 -- src/workflow_validation.zig | 6 +-- 3 files changed, 21 insertions(+), 76 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index 5727cb9..ecf5790 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -481,7 +481,7 @@ pub const Engine = struct { if (std.mem.eql(u8, def_node_type, "transform")) { const def_updates = getNodeField(alloc, def_node_json, "updates") orelse "{}"; - const def_schema = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + const def_schema = getSchemaJson(alloc, workflow_json); const def_new_state = state_mod.applyUpdates(alloc, running_state, def_updates, def_schema) catch running_state; running_state = def_new_state; } else if (std.mem.eql(u8, def_node_type, "task") or std.mem.eql(u8, def_node_type, "agent")) { @@ -489,7 +489,7 @@ pub const Engine = struct { switch (def_result) { .completed => |cr| { if (cr.state_updates) |updates| { - const def_schema = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + const def_schema = getSchemaJson(alloc, workflow_json); const def_new_state = state_mod.applyUpdates(alloc, running_state, updates, def_schema) catch running_state; running_state = def_new_state; } @@ -601,7 +601,7 @@ pub const Engine = struct { const state_updates = getNodeField(alloc, node_json, "updates") orelse "{}"; // Get schema from workflow - const schema_json = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + const schema_json = getSchemaJson(alloc, workflow_json); // Apply updates via reducers const new_state = state_mod.applyUpdates(alloc, running_state, 
state_updates, schema_json) catch |err| { @@ -633,7 +633,7 @@ pub const Engine = struct { const ck_c = computeCacheKey(alloc, node_name, rnd_c) catch break :cache_check; const cached = self.store.getCachedResult(alloc, ck_c) catch break :cache_check; if (cached) |cached_upd| { - const cs = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + const cs = getSchemaJson(alloc, workflow_json); running_state = state_mod.applyUpdates(alloc, running_state, cached_upd, cs) catch running_state; try completed_nodes.put(try alloc.dupe(u8, node_name), {}); log.info("task node {s} cache hit for run {s}", .{ node_name, run_row.id }); @@ -686,7 +686,7 @@ pub const Engine = struct { running_state = stripMeta(alloc, running_state) catch running_state; if (cr.state_updates) |updates| { - const schema_json = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + const schema_json = getSchemaJson(alloc, workflow_json); const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { log.err("task node {s} failed to apply updates: {}", .{ node_name, err }); try self.store.updateRunStatus(run_row.id, "failed", "state update failed"); @@ -718,7 +718,7 @@ pub const Engine = struct { // Consume pending injections const injections = self.store.consumePendingInjections(alloc, run_row.id, node_name) catch &.{}; for (injections) |injection| { - const schema_json = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + const schema_json = getSchemaJson(alloc, workflow_json); const new_state = state_mod.applyUpdates(alloc, running_state, injection.updates_json, schema_json) catch |err| { log.warn("failed to apply injection for run {s}: {}", .{ run_row.id, err }); continue; @@ -795,7 +795,7 @@ pub const Engine = struct { switch (result) { .completed => |cr| { if (cr.state_updates) |updates| { - const schema_json = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + const schema_json = getSchemaJson(alloc, workflow_json); 
const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { log.err("subgraph node {s} failed to apply updates: {}", .{ node_name, err }); try self.store.updateRunStatus(run_row.id, "failed", "subgraph state update failed"); @@ -819,7 +819,7 @@ pub const Engine = struct { // Send: read items from state, dispatch target_node per item const result = try self.executeSendNode(alloc, run_row, node_name, node_json, running_state); if (result.state_updates) |updates| { - const schema_json = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + const schema_json = getSchemaJson(alloc, workflow_json); const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { log.err("send node {s} failed to apply updates: {}", .{ node_name, err }); try self.store.updateRunStatus(run_row.id, "failed", "send state update failed"); @@ -867,7 +867,7 @@ pub const Engine = struct { } // Strip ephemeral keys before checkpoint persistence - const schema_for_eph = getWorkflowField(alloc, workflow_json, "schema") orelse "{}"; + const schema_for_eph = getSchemaJson(alloc, workflow_json); running_state = state_mod.stripEphemeralKeys(alloc, running_state, schema_for_eph) catch running_state; // Save checkpoint after each node @@ -1212,7 +1212,7 @@ pub const Engine = struct { const child_input = buildSubgraphInput(alloc, state_json, input_mapping_json) catch "{}"; // Get schema from child workflow for initState - const child_schema = getWorkflowField(alloc, definition, "schema") orelse "{}"; + const child_schema = getSchemaJson(alloc, definition); const child_state = state_mod.initState(alloc, child_input, child_schema) catch try alloc.dupe(u8, child_input); // Create child run @@ -1666,6 +1666,15 @@ fn getNodeField(alloc: std.mem.Allocator, node_json: []const u8, field: []const return serializeJsonValue(alloc, val) catch null; } +/// Get the state schema JSON from a workflow definition. 
+/// Looks up "state_schema" first (canonical key used by API/validation), +/// then falls back to "schema" for inline workflow definitions in tests. +fn getSchemaJson(alloc: std.mem.Allocator, workflow_json: []const u8) []const u8 { + return getWorkflowField(alloc, workflow_json, "state_schema") orelse + getWorkflowField(alloc, workflow_json, "schema") orelse + "{}"; +} + /// Get a top-level field from workflow_json. fn getWorkflowField(alloc: std.mem.Allocator, workflow_json: []const u8, field: []const u8) ?[]const u8 { const parsed = json.parseFromSlice(json.Value, alloc, workflow_json, .{}) catch return null; @@ -1831,15 +1840,6 @@ fn getNodeFieldFloat(alloc: std.mem.Allocator, node_json: []const u8, field: []c return null; } -/// Get a boolean field from a node's JSON. -fn getNodeFieldBool(alloc: std.mem.Allocator, node_json: []const u8, field: []const u8) ?bool { - const parsed = json.parseFromSlice(json.Value, alloc, node_json, .{}) catch return null; - if (parsed.value != .object) return null; - const val = parsed.value.object.get(field) orelse return null; - if (val == .bool) return val.bool; - return null; -} - /// Get a nested object field as JSON string from a node's JSON. fn getNodeObjectField(alloc: std.mem.Allocator, node_json: []const u8, field: []const u8) ?[]const u8 { const parsed = json.parseFromSlice(json.Value, alloc, node_json, .{}) catch return null; @@ -1899,12 +1899,6 @@ fn computeCacheKey(alloc: std.mem.Allocator, node_name: []const u8, rendered_pro // ── Deferred Node Helpers (Gap 6) ─────────────────────────────────── -/// Check if a node has "defer": true in its definition. -fn isNodeDeferred(alloc: std.mem.Allocator, workflow_json: []const u8, node_name: []const u8) bool { - const node_json = getNodeJson(alloc, workflow_json, node_name) orelse return false; - return getNodeFieldBool(alloc, node_json, "defer") orelse false; -} - /// Collect all deferred node names from workflow. 
fn collectDeferredNodes(alloc: std.mem.Allocator, workflow_json: []const u8) []const []const u8 { const parsed = json.parseFromSlice(json.Value, alloc, workflow_json, .{}) catch return &.{}; @@ -2100,27 +2094,6 @@ fn getCheckpointWorkflowVersion(alloc: std.mem.Allocator, metadata_json: ?[]cons return 1; } -/// Merge workflow_version into existing checkpoint metadata JSON. -fn mergeWorkflowVersionIntoMeta(alloc: std.mem.Allocator, existing_meta: ?[]const u8, wf_version: i64) ?[]const u8 { - if (existing_meta) |em| { - // Parse existing, add workflow_version - const parsed = json.parseFromSlice(json.Value, alloc, em, .{}) catch { - return std.fmt.allocPrint(alloc, "{{\"workflow_version\":{d}}}", .{wf_version}) catch null; - }; - if (parsed.value == .object) { - var obj = json.ObjectMap.init(alloc); - var it = parsed.value.object.iterator(); - while (it.next()) |entry| { - obj.put(entry.key_ptr.*, entry.value_ptr.*) catch continue; - } - obj.put("workflow_version", .{ .integer = wf_version }) catch {}; - return serializeJsonValue(alloc, .{ .object = obj }) catch null; - } - return std.fmt.allocPrint(alloc, "{{\"workflow_version\":{d}}}", .{wf_version}) catch null; - } - return std.fmt.allocPrint(alloc, "{{\"workflow_version\":{d}}}", .{wf_version}) catch null; -} - /// Filter completed nodes to only those still present in the workflow definition. /// Returns true if any nodes were removed (migration happened). 
fn migrateCompletedNodes(alloc: std.mem.Allocator, completed_nodes: *std.StringHashMap(void), workflow_json: []const u8) bool { @@ -3014,28 +2987,6 @@ test "migrateCompletedNodes: no changes needed" { try std.testing.expect(!migrated); } -test "mergeWorkflowVersionIntoMeta: new metadata" { - const allocator = std.testing.allocator; - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const result = mergeWorkflowVersionIntoMeta(arena.allocator(), null, 2); - try std.testing.expect(result != null); - try std.testing.expect(std.mem.indexOf(u8, result.?, "workflow_version") != null); - try std.testing.expect(std.mem.indexOf(u8, result.?, "2") != null); -} - -test "mergeWorkflowVersionIntoMeta: existing metadata" { - const allocator = std.testing.allocator; - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - const result = mergeWorkflowVersionIntoMeta(arena.allocator(), "{\"route_results\":{}}", 3); - try std.testing.expect(result != null); - try std.testing.expect(std.mem.indexOf(u8, result.?, "workflow_version") != null); - try std.testing.expect(std.mem.indexOf(u8, result.?, "route_results") != null); -} - test "serializeRouteResultsWithVersion: includes version" { const allocator = std.testing.allocator; var arena = std.heap.ArenaAllocator.init(allocator); diff --git a/src/state.zig b/src/state.zig index 330f9f4..5c266a6 100644 --- a/src/state.zig +++ b/src/state.zig @@ -637,10 +637,6 @@ pub fn stripEphemeralKeys(alloc: Allocator, state_json: []const u8, schema_json: return try alloc.dupe(u8, result_str); } -// ── Custom errors ───────────────────────────────────────────────────── - -const InvalidNumber = error{InvalidNumber}; - // ── Tests ───────────────────────────────────────────────────────────── fn parseTestJson(alloc: Allocator, json_str: []const u8) !json.Parsed(json.Value) { diff --git a/src/workflow_validation.zig b/src/workflow_validation.zig index 6c9e9f0..1070b4d 100644 --- 
a/src/workflow_validation.zig +++ b/src/workflow_validation.zig @@ -427,11 +427,11 @@ pub fn validate(alloc: Allocator, definition_json: []const u8) ![]ValidationErro // Check prompt field if (getJsonStringFromObj(nobj, "prompt")) |prompt| { - try checkStateRefs(alloc, &errors, schema, nname, "prompt", prompt); + try checkStateRefs(alloc, &errors, schema, nname, prompt); } // Check message field (interrupt) if (getJsonStringFromObj(nobj, "message")) |msg| { - try checkStateRefs(alloc, &errors, schema, nname, "message", msg); + try checkStateRefs(alloc, &errors, schema, nname, msg); } } } @@ -518,10 +518,8 @@ fn checkStateRefs( errors: *std.ArrayListUnmanaged(ValidationError), schema: std.json.ObjectMap, node_name: []const u8, - _field_name: []const u8, text: []const u8, ) !void { - _ = _field_name; var pos: usize = 0; while (pos < text.len) { // Find "{{" From dd3ed674d69782d6f7b0c5df93c046e117d297c9 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 22:03:16 -0300 Subject: [PATCH 23/55] Update CLAUDE.md: fix test count and schema key name Test count is 322 after dead code removal. The canonical schema key in workflow definitions is "state_schema", not "schema". 
--- CLAUDE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index f50946f..ba2ea99 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -46,7 +46,7 @@ Graph-based workflow orchestrator with unified state model for NullClaw AI bot a ```sh zig build # build -zig build test # unit tests (324 tests) +zig build test # unit tests (322 tests) zig build && bash tests/test_e2e.sh # e2e tests (requires Python 3 for mock workers) ./zig-out/bin/nullboiler --port 8080 --db nullboiler.db --config config.json ``` @@ -112,7 +112,7 @@ zig build && bash tests/test_e2e.sh # e2e tests (requires Python 3 for mock wo ## Architecture - **Unified state model**: Every node reads from state, returns partial updates, engine applies reducers -- **Graph-based execution**: Workflow = `{nodes: {}, edges: [], schema: {}}` with `__start__` and `__end__` synthetic nodes +- **Graph-based execution**: Workflow = `{nodes: {}, edges: [], state_schema: {}}` with `__start__` and `__end__` synthetic nodes - **Checkpoints**: State snapshot after every node, enabling fork/replay/resume - **Conditional edges**: Route nodes produce values, edges like `["router:yes", "next"]` are taken when route result matches - **Deferred nodes**: Nodes with `"defer": true` execute right before `__end__` From 2381cc7ab34ac39c27fff82d6ccb5f31e76de549 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 22:04:44 -0300 Subject: [PATCH 24/55] Fix multi-turn agent injections silently dropped Pending state injections consumed between agent continuation turns were discarded (assigned to _ and never applied). This caused injections submitted via POST /runs/{id}/state during multi-turn agent execution to be silently lost. Re-save consumed injections so they are properly applied after the full agent node completes its multi-turn loop. 
--- src/engine.zig | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index ecf5790..66456ae 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -318,9 +318,6 @@ pub const Engine = struct { } fn processRunWithDepth(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, recursion_depth: u32) !void { - // Emit run_started event - self.emitEvent(alloc, .run_started, run_row.id, null, null, null); - // 1. Load current state var current_state = run_row.state_json orelse "{}"; @@ -393,6 +390,11 @@ pub const Engine = struct { var version: i64 = if (latest_checkpoint) |cp| cp.version else 0; const initial_version = version; + // Emit run_started only on the first tick (no prior checkpoints) + if (latest_checkpoint == null) { + self.emitEvent(alloc, .run_started, run_row.id, null, null, null); + } + // 3b. Workflow version migration check const wf_version = getWorkflowVersion(alloc, workflow_json); if (latest_checkpoint) |cp| { @@ -1081,9 +1083,13 @@ pub const Engine = struct { } } - // Consume pending injections between turns - const injections = self.store.consumePendingInjections(alloc, run_row.id, node_name) catch &.{}; - _ = injections; + // Consume pending injections between turns — these are + // queued but cannot be applied mid-node. Re-save them so + // they are applied after the full node completes. 
+ const mid_injections = self.store.consumePendingInjections(alloc, run_row.id, node_name) catch &.{}; + for (mid_injections) |inj| { + self.store.createPendingInjection(run_row.id, inj.updates_json, node_name) catch {}; + } // Render continuation prompt const cont_rendered = templates.renderTemplate(alloc, continuation_prompt.?, state_json, run_row.input_json, null) catch break; From 95a5c8dd35a766810920f02c13efced1be5ca44f Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 22:06:13 -0300 Subject: [PATCH 25/55] Remove dead handleSignalStep function This handler was defined but never wired to any API route. The approval/signal step concept was replaced by interrupt + resume. The legacy approve/reject routes already return 410 Gone. --- src/api.zig | 61 +---------------------------------------------------- 1 file changed, 1 insertion(+), 60 deletions(-) diff --git a/src/api.zig b/src/api.zig index 83bb2d5..5f0b4f8 100644 --- a/src/api.zig +++ b/src/api.zig @@ -417,7 +417,7 @@ fn handleRegisterWorker(ctx: *Context, body: []const u8) HttpResponse { const model = getJsonString(obj, "model"); const protocol = worker_protocol.parse(protocol_raw) orelse { - return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"invalid protocol (expected webhook|api_chat|openai_chat)\"}}"); + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"invalid protocol (expected webhook|api_chat|openai_chat|mqtt|redis_stream|a2a)\"}}"); }; if (!worker_protocol.validateUrlForProtocol(url, protocol)) { return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"webhook protocol requires explicit URL path (for example /webhook)\"}}"); @@ -965,65 +965,6 @@ fn handleRetryRun(ctx: *Context, run_id: []const u8) HttpResponse { return jsonResponse(200, resp); } -fn handleSignalStep(ctx: *Context, run_id: []const u8, step_id: []const u8, body: []const u8) HttpResponse { - // 1. 
Get step from store - const step = switch (lookupStepInRun(ctx, run_id, step_id)) { - .ok => |s| s, - .err => |resp| return resp, - }; - - // 2. Must be "waiting_approval" (signal mode uses this status) - if (!std.mem.eql(u8, step.status, "waiting_approval")) { - const resp = std.fmt.allocPrint(ctx.allocator, - \\{{"error":{{"code":"conflict","message":"step is not waiting for signal (current: {s})"}}}} - , .{step.status}) catch return jsonResponse(409, "{\"error\":{\"code\":\"conflict\",\"message\":\"step is not waiting for signal\"}}"); - return jsonResponse(409, resp); - } - - // 3. Parse optional signal data from body - var signal_data: []const u8 = "{}"; - if (body.len > 0) { - const parsed = std.json.parseFromSlice(std.json.Value, ctx.allocator, body, .{}) catch { - // Body is not valid JSON; use empty - signal_data = "{}"; - // Continue anyway - const output = std.fmt.allocPrint(ctx.allocator, - \\{{"output":"signaled","data":{{}}}} - , .{}) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - - ctx.store.updateStepStatus(step_id, "completed", null, output, null, step.attempt) catch { - return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to update step\"}}"); - }; - ctx.store.insertEvent(run_id, step_id, "step.signaled", output) catch {}; - const resp = std.fmt.allocPrint(ctx.allocator, - \\{{"step_id":"{s}","status":"completed"}} - , .{step_id}) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - return jsonResponse(200, resp); - }; - _ = parsed; - signal_data = body; - } - - // 4. Build output with signal data - const output = std.fmt.allocPrint(ctx.allocator, - \\{{"output":"signaled","data":{s}}} - , .{signal_data}) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - - // 5. 
Update step to "completed"
-    ctx.store.updateStepStatus(step_id, "completed", null, output, null, step.attempt) catch {
-        return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to update step\"}}");
-    };
-
-    // 6. Insert event
-    ctx.store.insertEvent(run_id, step_id, "step.signaled", output) catch {};
-
-    // 7. Return 200
-    const resp = std.fmt.allocPrint(ctx.allocator,
-        \\{{"step_id":"{s}","status":"completed"}}
-    , .{step_id}) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}");
-    return jsonResponse(200, resp);
-}
-
 fn handleListEvents(ctx: *Context, run_id: []const u8) HttpResponse {
     // 1. Get events from store
     const events = ctx.store.getEventsByRun(ctx.allocator, run_id) catch {

From 4012c8e3b8a2af9dbb4fe81477f63394364312d8 Mon Sep 17 00:00:00 2001
From: Igor Somov
Date: Fri, 13 Mar 2026 22:06:46 -0300
Subject: [PATCH 26/55] fix: remove vestigial paused status and legacy
 approval routes

- getActiveRuns no longer queries for 'paused' status which doesn't
  exist as a valid RunStatus enum value. Only 'running' is queried.

- Remove the legacy approve/reject 410 Gone routes and their test;
  interrupt + resume replaces the approval flow. (The worker protocol
  error message now listing all 6 supported protocols shipped with the
  previous commit, not this one.)
--- src/api.zig | 27 --------------------------- src/store.zig | 7 +++---- 2 files changed, 3 insertions(+), 31 deletions(-) diff --git a/src/api.zig b/src/api.zig index 5f0b4f8..4adb00e 100644 --- a/src/api.zig +++ b/src/api.zig @@ -107,16 +107,6 @@ pub fn handleRequest(ctx: *Context, method: []const u8, target: []const u8, body return handleGetStep(ctx, seg1.?, seg3.?); } - // POST /runs/{id}/steps/{step_id}/approve (legacy, removed) - if (is_post and eql(seg0, "runs") and seg1 != null and eql(seg2, "steps") and seg3 != null and eql(seg4, "approve") and seg5 == null) { - return jsonResponse(410, "{\"error\":{\"code\":\"gone\",\"message\":\"approval steps have been removed; use interrupt + resume instead\"}}"); - } - - // POST /runs/{id}/steps/{step_id}/reject (legacy, removed) - if (is_post and eql(seg0, "runs") and seg1 != null and eql(seg2, "steps") and seg3 != null and eql(seg4, "reject") and seg5 == null) { - return jsonResponse(410, "{\"error\":{\"code\":\"gone\",\"message\":\"approval steps have been removed; use interrupt + resume instead\"}}"); - } - // GET /runs/{id}/events if (is_get and eql(seg0, "runs") and seg1 != null and eql(seg2, "events") and seg3 == null) { return handleListEvents(ctx, seg1.?); @@ -2365,23 +2355,6 @@ test "API: get step enforces run ownership" { try std.testing.expectEqual(@as(u16, 404), resp.status_code); } -test "API: approve endpoint returns 410 gone" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - var ctx = Context{ - .store = &store, - .allocator = arena.allocator(), - }; - - const resp = handleRequest(&ctx, "POST", "/runs/run-1/steps/step-1/approve", ""); - try std.testing.expectEqual(@as(u16, 410), resp.status_code); -} - test "API: register worker rejects non-array tags" { const allocator = std.testing.allocator; var store = try Store.init(allocator, ":memory:"); 
diff --git a/src/store.zig b/src/store.zig
index 92a3def..f51cc3c 100644
--- a/src/store.zig
+++ b/src/store.zig
@@ -522,7 +522,7 @@ pub const Store = struct {
     }
 
     pub fn getActiveRuns(self: *Self, allocator: std.mem.Allocator) ![]types.RunRow {
-        const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json, config_json, parent_run_id FROM runs WHERE status IN ('running', 'paused') ORDER BY created_at_ms DESC";
+        const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json, config_json, parent_run_id FROM runs WHERE status = 'running' ORDER BY created_at_ms DESC";
         var stmt: ?*c.sqlite3_stmt = null;
         if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) {
             return error.SqlitePrepareFailed;
@@ -1940,8 +1940,7 @@ test "Store: get active runs" {
     defer s.deinit();
     try s.insertRun("r1", null, "running", "{}", "{}", "[]");
     try s.insertRun("r2", null, "pending", "{}", "{}", "[]");
-    try s.insertRun("r3", null, "paused", "{}", "{}", "[]");
-    try s.insertRun("r4", null, "completed", "{}", "{}", "[]");
+    try s.insertRun("r3", null, "completed", "{}", "{}", "[]");
 
     const active = try s.getActiveRuns(allocator);
     defer {
@@ -1955,7 +1954,7 @@
         }
         allocator.free(active);
     }
-    try std.testing.expectEqual(@as(usize, 2), active.len);
+    try std.testing.expectEqual(@as(usize, 1), active.len);
 }
 
 test "Store: count steps by status" {

From b4aeec32e6f8c9dbbc11cd69e3801c09018c7952 Mon Sep 17 00:00:00 2001
From: Igor Somov
Date: Fri, 13 Mar 2026 22:07:18 -0300
Subject: [PATCH 27/55] Remove now-unused seg5 path segment

The legacy approve/reject 410 routes were removed in the previous
commit; seg5 was only read by those route matches. Remove the
leftover declaration.
--- src/api.zig | 1 - 1 file changed, 1 deletion(-) diff --git a/src/api.zig b/src/api.zig index 4adb00e..5244755 100644 --- a/src/api.zig +++ b/src/api.zig @@ -51,7 +51,6 @@ pub fn handleRequest(ctx: *Context, method: []const u8, target: []const u8, body const seg2 = getPathSegment(path, 2); const seg3 = getPathSegment(path, 3); const seg4 = getPathSegment(path, 4); - const seg5 = getPathSegment(path, 5); const is_get = eql(method, "GET"); const is_post = eql(method, "POST"); From 6860ac7a96adfc5b8e1c8426401b97b5a1caba5d Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 22:07:29 -0300 Subject: [PATCH 28/55] docs: update test count in CLAUDE.md to 321 --- CLAUDE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index ba2ea99..e5aa452 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -46,7 +46,7 @@ Graph-based workflow orchestrator with unified state model for NullClaw AI bot a ```sh zig build # build -zig build test # unit tests (322 tests) +zig build test # unit tests (321 tests) zig build && bash tests/test_e2e.sh # e2e tests (requires Python 3 for mock workers) ./zig-out/bin/nullboiler --port 8080 --db nullboiler.db --config config.json ``` From 931b5c4646953c2c26af46522513d0a5a225917c Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 22:07:58 -0300 Subject: [PATCH 29/55] Remove legacy approve route test The approval step test was a leftover from the removed approve/reject endpoint. No backward compatibility needed. 
--- src/api.zig | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/src/api.zig b/src/api.zig index 5244755..519512f 100644 --- a/src/api.zig +++ b/src/api.zig @@ -2434,30 +2434,6 @@ test "API: register worker rejects non-positive max_concurrent" { try std.testing.expectEqual(@as(u16, 400), resp.status_code); } -test "API: approve route does not match extra path segment" { - const allocator = std.testing.allocator; - var store = try Store.init(allocator, ":memory:"); - defer store.deinit(); - - try store.insertRun("r1", null, "running", "{\"steps\":[]}", "{}", "[]"); - try store.insertStep("s1", "r1", "approve-1", "approval", "waiting_approval", "{}", 1, null, null, null); - - var arena = std.heap.ArenaAllocator.init(allocator); - defer arena.deinit(); - - var ctx = Context{ - .store = &store, - .allocator = arena.allocator(), - }; - - const resp = handleRequest(&ctx, "POST", "/runs/r1/steps/s1/approve/extra", ""); - try std.testing.expectEqual(@as(u16, 404), resp.status_code); - try std.testing.expect(std.mem.indexOf(u8, resp.body, "endpoint not found") != null); - - const step = (try store.getStep(arena.allocator(), "s1")).?; - try std.testing.expectEqualStrings("waiting_approval", step.status); -} - test "API: register openai_chat worker requires model" { const allocator = std.testing.allocator; var store = try Store.init(allocator, ":memory:"); From a0f56374f0cd7641686d6c74e00c0349605f35f6 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 22:08:15 -0300 Subject: [PATCH 30/55] Update CLAUDE.md test count to 320 --- CLAUDE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CLAUDE.md b/CLAUDE.md index e5aa452..8989137 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -46,7 +46,7 @@ Graph-based workflow orchestrator with unified state model for NullClaw AI bot a ```sh zig build # build -zig build test # unit tests (321 tests) +zig build test # unit tests (320 tests) zig build && bash tests/test_e2e.sh # e2e tests 
(requires Python 3 for mock workers) ./zig-out/bin/nullboiler --port 8080 --db nullboiler.db --config config.json ``` From f5feccd793bf1c4dc8fb8dbabe81ddccada7d937 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 22:08:19 -0300 Subject: [PATCH 31/55] style: fix misleading indentation in processRunWithDepth for loop The body of `for (ready_nodes) |node_name|` was at the same indent level as the `for` statement itself, making it look like the code was outside the loop. Re-indent the ~420 lines to proper nesting. --- src/engine.zig | 780 ++++++++++++++++++++++++------------------------- 1 file changed, 390 insertions(+), 390 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index 66456ae..ddb62b6 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -473,304 +473,289 @@ pub const Engine = struct { var goto_override: ?[]const []const u8 = null; for (ready_nodes) |node_name| { - if (std.mem.eql(u8, node_name, "__end__")) { - // Gap 6: Execute deferred nodes before completing - for (deferred_nodes) |deferred_name| { - if (completed_nodes.get(deferred_name) != null) continue; - - const def_node_json = getNodeJson(alloc, workflow_json, deferred_name) orelse continue; - const def_node_type = getNodeField(alloc, def_node_json, "type") orelse "task"; - - if (std.mem.eql(u8, def_node_type, "transform")) { - const def_updates = getNodeField(alloc, def_node_json, "updates") orelse "{}"; - const def_schema = getSchemaJson(alloc, workflow_json); - const def_new_state = state_mod.applyUpdates(alloc, running_state, def_updates, def_schema) catch running_state; - running_state = def_new_state; - } else if (std.mem.eql(u8, def_node_type, "task") or std.mem.eql(u8, def_node_type, "agent")) { - const def_result = self.executeTaskNode(alloc, run_row, deferred_name, def_node_json, running_state) catch continue; - switch (def_result) { - .completed => |cr| { - if (cr.state_updates) |updates| { - const def_schema = getSchemaJson(alloc, workflow_json); - const 
def_new_state = state_mod.applyUpdates(alloc, running_state, updates, def_schema) catch running_state; - running_state = def_new_state; - } - }, - else => {}, + if (std.mem.eql(u8, node_name, "__end__")) { + // Gap 6: Execute deferred nodes before completing + for (deferred_nodes) |deferred_name| { + if (completed_nodes.get(deferred_name) != null) continue; + + const def_node_json = getNodeJson(alloc, workflow_json, deferred_name) orelse continue; + const def_node_type = getNodeField(alloc, def_node_json, "type") orelse "task"; + + if (std.mem.eql(u8, def_node_type, "transform")) { + const def_updates = getNodeField(alloc, def_node_json, "updates") orelse "{}"; + const def_schema = getSchemaJson(alloc, workflow_json); + const def_new_state = state_mod.applyUpdates(alloc, running_state, def_updates, def_schema) catch running_state; + running_state = def_new_state; + } else if (std.mem.eql(u8, def_node_type, "task") or std.mem.eql(u8, def_node_type, "agent")) { + const def_result = self.executeTaskNode(alloc, run_row, deferred_name, def_node_json, running_state) catch continue; + switch (def_result) { + .completed => |cr| { + if (cr.state_updates) |updates| { + const def_schema = getSchemaJson(alloc, workflow_json); + const def_new_state = state_mod.applyUpdates(alloc, running_state, updates, def_schema) catch running_state; + running_state = def_new_state; + } + }, + else => {}, + } } + + try completed_nodes.put(try alloc.dupe(u8, deferred_name), {}); + log.info("deferred node {s} completed for run {s}", .{ deferred_name, run_row.id }); } - - try completed_nodes.put(try alloc.dupe(u8, deferred_name), {}); - log.info("deferred node {s} completed for run {s}", .{ deferred_name, run_row.id }); + + // Mark __end__ as completed + try completed_nodes.put("__end__", {}); + version += 1; + + // Save checkpoint + const cp_id_buf = ids.generateId(); + const cp_id = try alloc.dupe(u8, &cp_id_buf); + const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); + const 
parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); + try self.store.createCheckpoint(cp_id, run_row.id, "__end__", parent_id, running_state, cn_json, version, meta_json); + try self.store.incrementCheckpointCount(run_row.id); + try self.store.updateRunState(run_row.id, running_state); + + // Run is completed + try self.store.updateRunStatus(run_row.id, "completed", null); + try self.store.insertEvent(run_row.id, null, "run.completed", "{}"); + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.completed", run_row.id, null, "{}", self.metrics); + log.info("run {s} completed", .{run_row.id}); + return; } - - // Mark __end__ as completed - try completed_nodes.put("__end__", {}); - version += 1; - - // Save checkpoint - const cp_id_buf = ids.generateId(); - const cp_id = try alloc.dupe(u8, &cp_id_buf); - const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); - const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; - const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); - try self.store.createCheckpoint(cp_id, run_row.id, "__end__", parent_id, running_state, cn_json, version, meta_json); - try self.store.incrementCheckpointCount(run_row.id); - try self.store.updateRunState(run_row.id, running_state); - - // Run is completed - try self.store.updateRunStatus(run_row.id, "completed", null); - try self.store.insertEvent(run_row.id, null, "run.completed", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.completed", run_row.id, null, "{}", self.metrics); - log.info("run {s} completed", .{run_row.id}); - return; - } - - // Breakpoint: interrupt_before check - if (isInBreakpointList(node_name, interrupt_before)) { - log.info("breakpoint interrupt_before at node {s} for run {s}", .{ node_name, run_row.id }); - version += 1; - const cp_id_buf = ids.generateId(); - const 
cp_id = try alloc.dupe(u8, &cp_id_buf); - const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); - const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; - const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); - try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); - try self.store.incrementCheckpointCount(run_row.id); - try self.store.updateRunState(run_row.id, running_state); - - try self.store.updateRunStatus(run_row.id, "interrupted", null); - try self.store.insertEvent(run_row.id, null, "run.interrupted", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.interrupted", run_row.id, null, "{}", self.metrics); - return; - } - - // Get node definition from workflow - const node_json = getNodeJson(alloc, workflow_json, node_name) orelse { - log.err("node {s} not found in workflow for run {s}", .{ node_name, run_row.id }); - try self.store.updateRunStatus(run_row.id, "failed", "node not found in workflow"); - return; - }; - - // Get node type - const node_type = getNodeField(alloc, node_json, "type") orelse "task"; - - // Execute based on type - if (std.mem.eql(u8, node_type, "route")) { - // Route: evaluate routing logic, no worker dispatch - const result = try self.executeRouteNode(alloc, node_name, node_json, running_state); - if (result.route_value) |rv| { - try route_results.put(try alloc.dupe(u8, node_name), rv); + + // Breakpoint: interrupt_before check + if (isInBreakpointList(node_name, interrupt_before)) { + log.info("breakpoint interrupt_before at node {s} for run {s}", .{ node_name, run_row.id }); + version += 1; + const cp_id_buf = ids.generateId(); + const cp_id = try alloc.dupe(u8, &cp_id_buf); + const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); + const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const meta_json = try 
serializeRouteResultsWithVersion(alloc, &route_results, wf_version); + try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); + try self.store.incrementCheckpointCount(run_row.id); + try self.store.updateRunState(run_row.id, running_state); + + try self.store.updateRunStatus(run_row.id, "interrupted", null); + try self.store.insertEvent(run_row.id, null, "run.interrupted", "{}"); + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.interrupted", run_row.id, null, "{}", self.metrics); + return; } - try completed_nodes.put(try alloc.dupe(u8, node_name), {}); - - // Create step record - const step_id_buf = ids.generateId(); - const step_id = try alloc.dupe(u8, &step_id_buf); - try self.store.insertStep(step_id, run_row.id, node_name, "route", "completed", "{}", 1, null, null, null); - const route_output = try std.fmt.allocPrint(alloc, "{{\"route\":\"{s}\"}}", .{result.route_value orelse "default"}); - try self.store.updateStepStatus(step_id, "completed", null, route_output, null, 1); - try self.store.insertEvent(run_row.id, step_id, "step.completed", route_output); - - log.info("route node {s} -> {s}", .{ node_name, result.route_value orelse "default" }); - } else if (std.mem.eql(u8, node_type, "interrupt")) { - // Interrupt: save checkpoint, set run to interrupted - try completed_nodes.put(try alloc.dupe(u8, node_name), {}); - version += 1; - - const step_id_buf = ids.generateId(); - const step_id = try alloc.dupe(u8, &step_id_buf); - try self.store.insertStep(step_id, run_row.id, node_name, "interrupt", "completed", "{}", 1, null, null, null); - try self.store.updateStepStatus(step_id, "completed", null, "{\"interrupted\":true}", null, 1); - try self.store.insertEvent(run_row.id, step_id, "step.completed", "{}"); - - const cp_id_buf = ids.generateId(); - const cp_id = try alloc.dupe(u8, &cp_id_buf); - const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); - const parent_id: 
?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; - const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); - try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); - try self.store.incrementCheckpointCount(run_row.id); - try self.store.updateRunState(run_row.id, running_state); - - try self.store.updateRunStatus(run_row.id, "interrupted", null); - try self.store.insertEvent(run_row.id, null, "run.interrupted", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.interrupted", run_row.id, null, "{}", self.metrics); - log.info("run {s} interrupted at node {s}", .{ run_row.id, node_name }); - return; - } else if (std.mem.eql(u8, node_type, "transform")) { - // Transform: apply static updates, no worker dispatch - const state_updates = getNodeField(alloc, node_json, "updates") orelse "{}"; - - // Get schema from workflow - const schema_json = getSchemaJson(alloc, workflow_json); - - // Apply updates via reducers - const new_state = state_mod.applyUpdates(alloc, running_state, state_updates, schema_json) catch |err| { - log.err("transform node {s} failed to apply updates: {}", .{ node_name, err }); - try self.store.updateRunStatus(run_row.id, "failed", "transform failed"); + + // Get node definition from workflow + const node_json = getNodeJson(alloc, workflow_json, node_name) orelse { + log.err("node {s} not found in workflow for run {s}", .{ node_name, run_row.id }); + try self.store.updateRunStatus(run_row.id, "failed", "node not found in workflow"); return; }; - running_state = new_state; - - try completed_nodes.put(try alloc.dupe(u8, node_name), {}); - - // Create step record - const step_id_buf = ids.generateId(); - const step_id = try alloc.dupe(u8, &step_id_buf); - try self.store.insertStep(step_id, run_row.id, node_name, "transform", "completed", "{}", 1, null, null, null); - try self.store.updateStepStatus(step_id, "completed", null, 
state_updates, null, 1); - try self.store.insertEvent(run_row.id, step_id, "step.completed", "{}"); - - log.info("transform node {s} completed", .{node_name}); - } else if (std.mem.eql(u8, node_type, "task") or std.mem.eql(u8, node_type, "agent")) { - // Gap 7: Inject __meta managed values - const state_with_meta = injectMeta(alloc, running_state, run_row.id, node_name, version, @as(i64, @intCast(max_iterations))) catch running_state; - - // Gap 3: Check cache before executing - const cache_ttl = parseCacheTtlMs(alloc, node_json); - if (cache_ttl != null) cache_check: { - const pt_c = getNodeField(alloc, node_json, "prompt_template") orelse break :cache_check; - const rnd_c = templates.renderTemplate(alloc, pt_c, state_with_meta, run_row.input_json, null) catch break :cache_check; - const ck_c = computeCacheKey(alloc, node_name, rnd_c) catch break :cache_check; - const cached = self.store.getCachedResult(alloc, ck_c) catch break :cache_check; - if (cached) |cached_upd| { - const cs = getSchemaJson(alloc, workflow_json); - running_state = state_mod.applyUpdates(alloc, running_state, cached_upd, cs) catch running_state; - try completed_nodes.put(try alloc.dupe(u8, node_name), {}); - log.info("task node {s} cache hit for run {s}", .{ node_name, run_row.id }); - made_progress = true; - version += 1; - const ccb = ids.generateId(); - const cci = try alloc.dupe(u8, &ccb); - const ccn = try serializeCompletedNodes(alloc, &completed_nodes); - const cpi: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; - const cmj = try serializeRouteResults(alloc, &route_results); - try self.store.createCheckpoint(cci, run_row.id, node_name, cpi, running_state, ccn, version, cmj); - try self.store.incrementCheckpointCount(run_row.id); - try self.store.updateRunState(run_row.id, running_state); - continue; + + // Get node type + const node_type = getNodeField(alloc, node_json, "type") orelse "task"; + + // Execute based on type + if (std.mem.eql(u8, node_type, "route")) { + // 
Route: evaluate routing logic, no worker dispatch + const result = try self.executeRouteNode(alloc, node_name, node_json, running_state); + if (result.route_value) |rv| { + try route_results.put(try alloc.dupe(u8, node_name), rv); } - } - - // Gap 2: Retry loop - const max_attempts = parseRetryMaxAttempts(alloc, node_json) orelse 1; - const retry_init_ms = parseRetryInitialMs(alloc, node_json) orelse 500; - const retry_bf = parseRetryBackoff(alloc, node_json) orelse 2.0; - const retry_max_ms = parseRetryMaxMs(alloc, node_json) orelse 30000; - var result: TaskNodeResult = undefined; - var attempt: u32 = 0; - while (attempt < max_attempts) : (attempt += 1) { - result = try self.executeTaskNode(alloc, run_row, node_name, node_json, state_with_meta); + try completed_nodes.put(try alloc.dupe(u8, node_name), {}); + + // Create step record + const step_id_buf = ids.generateId(); + const step_id = try alloc.dupe(u8, &step_id_buf); + try self.store.insertStep(step_id, run_row.id, node_name, "route", "completed", "{}", 1, null, null, null); + const route_output = try std.fmt.allocPrint(alloc, "{{\"route\":\"{s}\"}}", .{result.route_value orelse "default"}); + try self.store.updateStepStatus(step_id, "completed", null, route_output, null, 1); + try self.store.insertEvent(run_row.id, step_id, "step.completed", route_output); + + log.info("route node {s} -> {s}", .{ node_name, result.route_value orelse "default" }); + } else if (std.mem.eql(u8, node_type, "interrupt")) { + // Interrupt: save checkpoint, set run to interrupted + try completed_nodes.put(try alloc.dupe(u8, node_name), {}); + version += 1; + + const step_id_buf = ids.generateId(); + const step_id = try alloc.dupe(u8, &step_id_buf); + try self.store.insertStep(step_id, run_row.id, node_name, "interrupt", "completed", "{}", 1, null, null, null); + try self.store.updateStepStatus(step_id, "completed", null, "{\"interrupted\":true}", null, 1); + try self.store.insertEvent(run_row.id, step_id, "step.completed", "{}"); + 
+ const cp_id_buf = ids.generateId(); + const cp_id = try alloc.dupe(u8, &cp_id_buf); + const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); + const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); + try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); + try self.store.incrementCheckpointCount(run_row.id); + try self.store.updateRunState(run_row.id, running_state); + + try self.store.updateRunStatus(run_row.id, "interrupted", null); + try self.store.insertEvent(run_row.id, null, "run.interrupted", "{}"); + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.interrupted", run_row.id, null, "{}", self.metrics); + log.info("run {s} interrupted at node {s}", .{ run_row.id, node_name }); + return; + } else if (std.mem.eql(u8, node_type, "transform")) { + // Transform: apply static updates, no worker dispatch + const state_updates = getNodeField(alloc, node_json, "updates") orelse "{}"; + + // Get schema from workflow + const schema_json = getSchemaJson(alloc, workflow_json); + + // Apply updates via reducers + const new_state = state_mod.applyUpdates(alloc, running_state, state_updates, schema_json) catch |err| { + log.err("transform node {s} failed to apply updates: {}", .{ node_name, err }); + try self.store.updateRunStatus(run_row.id, "failed", "transform failed"); + return; + }; + running_state = new_state; + + try completed_nodes.put(try alloc.dupe(u8, node_name), {}); + + // Create step record + const step_id_buf = ids.generateId(); + const step_id = try alloc.dupe(u8, &step_id_buf); + try self.store.insertStep(step_id, run_row.id, node_name, "transform", "completed", "{}", 1, null, null, null); + try self.store.updateStepStatus(step_id, "completed", null, state_updates, null, 1); + try self.store.insertEvent(run_row.id, step_id, "step.completed", "{}"); + + 
log.info("transform node {s} completed", .{node_name}); + } else if (std.mem.eql(u8, node_type, "task") or std.mem.eql(u8, node_type, "agent")) { + // Gap 7: Inject __meta managed values + const state_with_meta = injectMeta(alloc, running_state, run_row.id, node_name, version, @as(i64, @intCast(max_iterations))) catch running_state; + + // Gap 3: Check cache before executing + const cache_ttl = parseCacheTtlMs(alloc, node_json); + if (cache_ttl != null) cache_check: { + const pt_c = getNodeField(alloc, node_json, "prompt_template") orelse break :cache_check; + const rnd_c = templates.renderTemplate(alloc, pt_c, state_with_meta, run_row.input_json, null) catch break :cache_check; + const ck_c = computeCacheKey(alloc, node_name, rnd_c) catch break :cache_check; + const cached = self.store.getCachedResult(alloc, ck_c) catch break :cache_check; + if (cached) |cached_upd| { + const cs = getSchemaJson(alloc, workflow_json); + running_state = state_mod.applyUpdates(alloc, running_state, cached_upd, cs) catch running_state; + try completed_nodes.put(try alloc.dupe(u8, node_name), {}); + log.info("task node {s} cache hit for run {s}", .{ node_name, run_row.id }); + made_progress = true; + version += 1; + const ccb = ids.generateId(); + const cci = try alloc.dupe(u8, &ccb); + const ccn = try serializeCompletedNodes(alloc, &completed_nodes); + const cpi: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const cmj = try serializeRouteResults(alloc, &route_results); + try self.store.createCheckpoint(cci, run_row.id, node_name, cpi, running_state, ccn, version, cmj); + try self.store.incrementCheckpointCount(run_row.id); + try self.store.updateRunState(run_row.id, running_state); + continue; + } + } + + // Gap 2: Retry loop + const max_attempts = parseRetryMaxAttempts(alloc, node_json) orelse 1; + const retry_init_ms = parseRetryInitialMs(alloc, node_json) orelse 500; + const retry_bf = parseRetryBackoff(alloc, node_json) orelse 2.0; + const retry_max_ms = 
parseRetryMaxMs(alloc, node_json) orelse 30000; + var result: TaskNodeResult = undefined; + var attempt: u32 = 0; + while (attempt < max_attempts) : (attempt += 1) { + result = try self.executeTaskNode(alloc, run_row, node_name, node_json, state_with_meta); + switch (result) { + .failed => { + if (attempt + 1 < max_attempts) { + var dms: u64 = retry_init_ms; + var ei: u32 = 0; + while (ei < attempt) : (ei += 1) { + const nd = @as(f64, @floatFromInt(dms)) * retry_bf; + dms = @intFromFloat(@min(nd, @as(f64, @floatFromInt(retry_max_ms)))); + } + if (dms > retry_max_ms) dms = retry_max_ms; + log.info("task node {s} attempt {d}/{d} failed, retrying in {d}ms", .{ node_name, attempt + 1, max_attempts, dms }); + self.emitEvent(alloc, .step_retrying, run_row.id, null, node_name, null); + std.Thread.sleep(dms * std.time.ns_per_ms); + continue; + } + }, + else => break, + } + } + switch (result) { - .failed => { - if (attempt + 1 < max_attempts) { - var dms: u64 = retry_init_ms; - var ei: u32 = 0; - while (ei < attempt) : (ei += 1) { - const nd = @as(f64, @floatFromInt(dms)) * retry_bf; - dms = @intFromFloat(@min(nd, @as(f64, @floatFromInt(retry_max_ms)))); + .completed => |cr| { + // Gap 7: Strip __meta (don't persist) + running_state = stripMeta(alloc, running_state) catch running_state; + + if (cr.state_updates) |updates| { + const schema_json = getSchemaJson(alloc, workflow_json); + const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { + log.err("task node {s} failed to apply updates: {}", .{ node_name, err }); + try self.store.updateRunStatus(run_row.id, "failed", "state update failed"); + return; + }; + running_state = new_state; + + // Gap 3: Store result in cache + if (cache_ttl) |ttl| cache_store: { + const pt_s = getNodeField(alloc, node_json, "prompt_template") orelse break :cache_store; + const rnd_s = templates.renderTemplate(alloc, pt_s, state_with_meta, run_row.input_json, null) catch break :cache_store; + const ck_s 
= computeCacheKey(alloc, node_name, rnd_s) catch break :cache_store; + self.store.setCachedResult(ck_s, node_name, updates, ttl) catch |cerr| { + log.warn("failed to cache result for node {s}: {}", .{ node_name, cerr }); + }; } - if (dms > retry_max_ms) dms = retry_max_ms; - log.info("task node {s} attempt {d}/{d} failed, retrying in {d}ms", .{ node_name, attempt + 1, max_attempts, dms }); - self.emitEvent(alloc, .step_retrying, run_row.id, null, node_name, null); - std.Thread.sleep(dms * std.time.ns_per_ms); - continue; + + // Gap 4: Save as pending write + self.store.savePendingWrite(run_row.id, node_name, node_name, updates) catch |perr| { + log.warn("failed to save pending write for node {s}: {}", .{ node_name, perr }); + }; } - }, - else => break, - } - } - - switch (result) { - .completed => |cr| { - // Gap 7: Strip __meta (don't persist) - running_state = stripMeta(alloc, running_state) catch running_state; - - if (cr.state_updates) |updates| { - const schema_json = getSchemaJson(alloc, workflow_json); - const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { - log.err("task node {s} failed to apply updates: {}", .{ node_name, err }); - try self.store.updateRunStatus(run_row.id, "failed", "state update failed"); - return; - }; - running_state = new_state; - - // Gap 3: Store result in cache - if (cache_ttl) |ttl| cache_store: { - const pt_s = getNodeField(alloc, node_json, "prompt_template") orelse break :cache_store; - const rnd_s = templates.renderTemplate(alloc, pt_s, state_with_meta, run_row.input_json, null) catch break :cache_store; - const ck_s = computeCacheKey(alloc, node_name, rnd_s) catch break :cache_store; - self.store.setCachedResult(ck_s, node_name, updates, ttl) catch |cerr| { - log.warn("failed to cache result for node {s}: {}", .{ node_name, cerr }); + + // Apply UI messages to state (__ui_messages key) + if (cr.raw_output) |raw_out| { + running_state = applyUiMessagesToState(alloc, running_state, 
raw_out) catch running_state; + } + + // Consume pending injections + const injections = self.store.consumePendingInjections(alloc, run_row.id, node_name) catch &.{}; + for (injections) |injection| { + const schema_json = getSchemaJson(alloc, workflow_json); + const new_state = state_mod.applyUpdates(alloc, running_state, injection.updates_json, schema_json) catch |err| { + log.warn("failed to apply injection for run {s}: {}", .{ run_row.id, err }); + continue; }; + running_state = new_state; } - - // Gap 4: Save as pending write - self.store.savePendingWrite(run_row.id, node_name, node_name, updates) catch |perr| { - log.warn("failed to save pending write for node {s}: {}", .{ node_name, perr }); - }; - } - - // Apply UI messages to state (__ui_messages key) - if (cr.raw_output) |raw_out| { - running_state = applyUiMessagesToState(alloc, running_state, raw_out) catch running_state; - } - - // Consume pending injections - const injections = self.store.consumePendingInjections(alloc, run_row.id, node_name) catch &.{}; - for (injections) |injection| { - const schema_json = getSchemaJson(alloc, workflow_json); - const new_state = state_mod.applyUpdates(alloc, running_state, injection.updates_json, schema_json) catch |err| { - log.warn("failed to apply injection for run {s}: {}", .{ run_row.id, err }); - continue; - }; - running_state = new_state; - } - - try completed_nodes.put(try alloc.dupe(u8, node_name), {}); - - if (cr.goto_targets) |targets| { - var valid_targets: std.ArrayListUnmanaged([]const u8) = .empty; - for (targets) |target| { - if (std.mem.eql(u8, target, "__end__") or getNodeJson(alloc, workflow_json, target) != null) { - try valid_targets.append(alloc, target); - } else { - log.warn("goto target {s} not found in workflow, skipping", .{target}); + + try completed_nodes.put(try alloc.dupe(u8, node_name), {}); + + if (cr.goto_targets) |targets| { + var valid_targets: std.ArrayListUnmanaged([]const u8) = .empty; + for (targets) |target| { + if 
(std.mem.eql(u8, target, "__end__") or getNodeJson(alloc, workflow_json, target) != null) { + try valid_targets.append(alloc, target); + } else { + log.warn("goto target {s} not found in workflow, skipping", .{target}); + } + } + if (valid_targets.items.len > 0) { + goto_override = try valid_targets.toOwnedSlice(alloc); + log.info("task node {s} goto: {d} targets", .{ node_name, goto_override.?.len }); } } - if (valid_targets.items.len > 0) { - goto_override = try valid_targets.toOwnedSlice(alloc); - log.info("task node {s} goto: {d} targets", .{ node_name, goto_override.?.len }); - } - } - - // Gap 4: Clear pending writes - self.store.clearPendingWrites(run_row.id) catch {}; - - log.info("task node {s} completed for run {s}", .{ node_name, run_row.id }); - }, - .async_pending => { - // Step is dispatched async, don't mark as completed yet - // Will be polled on next tick - log.info("task node {s} dispatched async for run {s}", .{ node_name, run_row.id }); - // Save checkpoint with current progress before returning - version += 1; - const cp_id_buf = ids.generateId(); - const cp_id = try alloc.dupe(u8, &cp_id_buf); - const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); - const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; - const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); - try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); - try self.store.incrementCheckpointCount(run_row.id); - try self.store.updateRunState(run_row.id, running_state); - return; - }, - .no_worker => { - // No worker available, will retry next tick - log.debug("no worker for task node {s}, will retry", .{node_name}); - // Save progress so far - if (version > initial_version) { + + // Gap 4: Clear pending writes + self.store.clearPendingWrites(run_row.id) catch {}; + + log.info("task node {s} completed for run {s}", .{ node_name, run_row.id }); + }, + 
.async_pending => { + // Step is dispatched async, don't mark as completed yet + // Will be polled on next tick + log.info("task node {s} dispatched async for run {s}", .{ node_name, run_row.id }); + // Save checkpoint with current progress before returning + version += 1; const cp_id_buf = ids.generateId(); const cp_id = try alloc.dupe(u8, &cp_id_buf); const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); @@ -779,122 +764,137 @@ pub const Engine = struct { try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); - } - return; - }, - .failed => |err_text| { - log.err("task node {s} failed: {s}", .{ node_name, err_text }); - try self.store.updateRunStatus(run_row.id, "failed", err_text); - try self.store.insertEvent(run_row.id, null, "run.failed", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.failed", run_row.id, null, "{}", self.metrics); - return; - }, + return; + }, + .no_worker => { + // No worker available, will retry next tick + log.debug("no worker for task node {s}, will retry", .{node_name}); + // Save progress so far + if (version > initial_version) { + const cp_id_buf = ids.generateId(); + const cp_id = try alloc.dupe(u8, &cp_id_buf); + const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); + const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); + try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); + try self.store.incrementCheckpointCount(run_row.id); + try self.store.updateRunState(run_row.id, running_state); + } + return; + }, + .failed => |err_text| { + log.err("task node {s} failed: {s}", .{ node_name, err_text }); + try 
self.store.updateRunStatus(run_row.id, "failed", err_text); + try self.store.insertEvent(run_row.id, null, "run.failed", "{}"); + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.failed", run_row.id, null, "{}", self.metrics); + return; + }, + } + } else if (std.mem.eql(u8, node_type, "subgraph")) { + // Subgraph: execute child workflow inline + const result = try self.executeSubgraphNode(alloc, run_row, node_name, node_json, running_state, recursion_depth); + + switch (result) { + .completed => |cr| { + if (cr.state_updates) |updates| { + const schema_json = getSchemaJson(alloc, workflow_json); + const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { + log.err("subgraph node {s} failed to apply updates: {}", .{ node_name, err }); + try self.store.updateRunStatus(run_row.id, "failed", "subgraph state update failed"); + return; + }; + running_state = new_state; + } + try completed_nodes.put(try alloc.dupe(u8, node_name), {}); + log.info("subgraph node {s} completed for run {s}", .{ node_name, run_row.id }); + }, + .failed => |err_text| { + log.err("subgraph node {s} failed: {s}", .{ node_name, err_text }); + try self.store.updateRunStatus(run_row.id, "failed", err_text); + try self.store.insertEvent(run_row.id, null, "run.failed", "{}"); + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.failed", run_row.id, null, "{}", self.metrics); + return; + }, + else => {}, + } + } else if (std.mem.eql(u8, node_type, "send")) { + // Send: read items from state, dispatch target_node per item + const result = try self.executeSendNode(alloc, run_row, node_name, node_json, running_state); + if (result.state_updates) |updates| { + const schema_json = getSchemaJson(alloc, workflow_json); + const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { + log.err("send node {s} failed to apply updates: {}", .{ node_name, err }); + try self.store.updateRunStatus(run_row.id, "failed", 
"send state update failed"); + return; + }; + running_state = new_state; + } + try completed_nodes.put(try alloc.dupe(u8, node_name), {}); + log.info("send node {s} completed for run {s}", .{ node_name, run_row.id }); + } else { + log.warn("unknown node type {s} for node {s}", .{ node_type, node_name }); + try self.store.updateRunStatus(run_row.id, "failed", "unknown node type"); + return; } - } else if (std.mem.eql(u8, node_type, "subgraph")) { - // Subgraph: execute child workflow inline - const result = try self.executeSubgraphNode(alloc, run_row, node_name, node_json, running_state, recursion_depth); - - switch (result) { - .completed => |cr| { - if (cr.state_updates) |updates| { - const schema_json = getSchemaJson(alloc, workflow_json); - const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { - log.err("subgraph node {s} failed to apply updates: {}", .{ node_name, err }); - try self.store.updateRunStatus(run_row.id, "failed", "subgraph state update failed"); - return; - }; - running_state = new_state; - } - try completed_nodes.put(try alloc.dupe(u8, node_name), {}); - log.info("subgraph node {s} completed for run {s}", .{ node_name, run_row.id }); - }, - .failed => |err_text| { - log.err("subgraph node {s} failed: {s}", .{ node_name, err_text }); - try self.store.updateRunStatus(run_row.id, "failed", err_text); - try self.store.insertEvent(run_row.id, null, "run.failed", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.failed", run_row.id, null, "{}", self.metrics); - return; - }, - else => {}, + + // Breakpoint: interrupt_after check + if (isInBreakpointList(node_name, interrupt_after)) { + log.info("breakpoint interrupt_after at node {s} for run {s}", .{ node_name, run_row.id }); + // Save checkpoint with updated state first + version += 1; + const bp_cp_id_buf = ids.generateId(); + const bp_cp_id = try alloc.dupe(u8, &bp_cp_id_buf); + const bp_cn_json = try serializeCompletedNodes(alloc, 
&completed_nodes); + const bp_parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const bp_meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); + try self.store.createCheckpoint(bp_cp_id, run_row.id, node_name, bp_parent_id, running_state, bp_cn_json, version, bp_meta_json); + try self.store.incrementCheckpointCount(run_row.id); + try self.store.updateRunState(run_row.id, running_state); + + try self.store.updateRunStatus(run_row.id, "interrupted", null); + try self.store.insertEvent(run_row.id, null, "run.interrupted", "{}"); + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.interrupted", run_row.id, null, "{}", self.metrics); + return; } - } else if (std.mem.eql(u8, node_type, "send")) { - // Send: read items from state, dispatch target_node per item - const result = try self.executeSendNode(alloc, run_row, node_name, node_json, running_state); - if (result.state_updates) |updates| { - const schema_json = getSchemaJson(alloc, workflow_json); - const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { - log.err("send node {s} failed to apply updates: {}", .{ node_name, err }); - try self.store.updateRunStatus(run_row.id, "failed", "send state update failed"); + + // Reconciliation: check tracker task status between steps + if (tracker_url != null and task_id != null) { + if (!reconcileWithTracker(alloc, tracker_url.?, task_id.?)) { + log.info("run {s} cancelled by reconciliation", .{run_row.id}); + try self.store.updateRunStatus(run_row.id, "failed", "cancelled by tracker reconciliation"); + try self.store.insertEvent(run_row.id, null, "run.failed", "{\"reason\":\"tracker_cancelled\"}"); + callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.failed", run_row.id, null, "{}", self.metrics); return; - }; - running_state = new_state; + } } - try completed_nodes.put(try alloc.dupe(u8, node_name), {}); - log.info("send node {s} completed for run {s}", .{ 
node_name, run_row.id }); - } else { - log.warn("unknown node type {s} for node {s}", .{ node_type, node_name }); - try self.store.updateRunStatus(run_row.id, "failed", "unknown node type"); - return; - } - - // Breakpoint: interrupt_after check - if (isInBreakpointList(node_name, interrupt_after)) { - log.info("breakpoint interrupt_after at node {s} for run {s}", .{ node_name, run_row.id }); - // Save checkpoint with updated state first + + // Strip ephemeral keys before checkpoint persistence + const schema_for_eph = getSchemaJson(alloc, workflow_json); + running_state = state_mod.stripEphemeralKeys(alloc, running_state, schema_for_eph) catch running_state; + + // Save checkpoint after each node + made_progress = true; version += 1; - const bp_cp_id_buf = ids.generateId(); - const bp_cp_id = try alloc.dupe(u8, &bp_cp_id_buf); - const bp_cn_json = try serializeCompletedNodes(alloc, &completed_nodes); - const bp_parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; - const bp_meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); - try self.store.createCheckpoint(bp_cp_id, run_row.id, node_name, bp_parent_id, running_state, bp_cn_json, version, bp_meta_json); + const cp_id_buf = ids.generateId(); + const cp_id = try alloc.dupe(u8, &cp_id_buf); + const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); + const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); + try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); - - try self.store.updateRunStatus(run_row.id, "interrupted", null); - try self.store.insertEvent(run_row.id, null, "run.interrupted", "{}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, 
"run.interrupted", run_row.id, null, "{}", self.metrics); - return; - } - - // Reconciliation: check tracker task status between steps - if (tracker_url != null and task_id != null) { - if (!reconcileWithTracker(alloc, tracker_url.?, task_id.?)) { - log.info("run {s} cancelled by reconciliation", .{run_row.id}); - try self.store.updateRunStatus(run_row.id, "failed", "cancelled by tracker reconciliation"); - try self.store.insertEvent(run_row.id, null, "run.failed", "{\"reason\":\"tracker_cancelled\"}"); - callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.failed", run_row.id, null, "{}", self.metrics); - return; + + // Emit structured checkpoint event + self.emitEvent(alloc, .checkpoint_created, run_row.id, null, node_name, null); + + // Broadcast rich SSE events for all modes + if (self.sse_hub) |hub| { + const node_json_for_sse = getNodeJson(alloc, workflow_json, node_name); + const nt = if (node_json_for_sse) |nj| (getNodeField(alloc, nj, "type") orelse "task") else "task"; + broadcastNodeEvents(hub, alloc, run_row.id, node_name, nt, running_state, null, version, 0); } } - // Strip ephemeral keys before checkpoint persistence - const schema_for_eph = getSchemaJson(alloc, workflow_json); - running_state = state_mod.stripEphemeralKeys(alloc, running_state, schema_for_eph) catch running_state; - - // Save checkpoint after each node - made_progress = true; - version += 1; - const cp_id_buf = ids.generateId(); - const cp_id = try alloc.dupe(u8, &cp_id_buf); - const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); - const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; - const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); - try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); - try self.store.incrementCheckpointCount(run_row.id); - try self.store.updateRunState(run_row.id, running_state); - - // Emit structured 
checkpoint event - self.emitEvent(alloc, .checkpoint_created, run_row.id, null, node_name, null); - - // Broadcast rich SSE events for all modes - if (self.sse_hub) |hub| { - const node_json_for_sse = getNodeJson(alloc, workflow_json, node_name); - const nt = if (node_json_for_sse) |nj| (getNodeField(alloc, nj, "type") orelse "task") else "task"; - broadcastNodeEvents(hub, alloc, run_row.id, node_name, nt, running_state, null, version, 0); - } - } - // If goto override is set, use it for next iteration instead of findReadyNodes if (goto_override) |targets| { goto_ready = targets; From 3b8443d78d5407f4b3cd15fefa2dce5b60168329 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 22:09:26 -0300 Subject: [PATCH 32/55] Replace magic numbers with named constants Extract max_nodes_per_tick (1000) and max_subgraph_depth (10) into module-level constants for clarity and single-point-of-change. --- src/engine.zig | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index ddb62b6..c895a16 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -97,6 +97,14 @@ pub const OrchestratorEvent = struct { } }; +// ── Constants ──────────────────────────────────────────────────────── + +/// Maximum number of node executions per tick to prevent infinite loops. +const max_nodes_per_tick: u32 = 1000; + +/// Maximum inline subgraph recursion depth. +const max_subgraph_depth: u32 = 10; + // ── Engine ─────────────────────────────────────────────────────────── pub const RuntimeConfig = struct { @@ -408,7 +416,7 @@ pub const Engine = struct { // 4. 
Main execution loop: find ready nodes, execute, repeat var running_state: []const u8 = try alloc.dupe(u8, current_state); - var max_iterations: u32 = 1000; // safety guard against infinite loops + var max_iterations: u32 = max_nodes_per_tick; var goto_ready: ?[]const []const u8 = null; // goto override from command primitive while (max_iterations > 0) : (max_iterations -= 1) { @@ -1194,8 +1202,8 @@ pub const Engine = struct { // ── executeSubgraphNode ───────────────────────────────────────── fn executeSubgraphNode(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, node_name: []const u8, node_json: []const u8, state_json: []const u8, recursion_depth: u32) !TaskNodeResult { - if (recursion_depth >= 10) { - log.err("subgraph node {s}: max recursion depth (10) exceeded", .{node_name}); + if (recursion_depth >= max_subgraph_depth) { + log.err("subgraph node {s}: max recursion depth ({d}) exceeded", .{ node_name, max_subgraph_depth }); return TaskNodeResult{ .failed = "subgraph max recursion depth exceeded" }; } From 2882436f90667d675cdfad9fc1e73811c6cc21c5 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 22:54:02 -0300 Subject: [PATCH 33/55] Add MIT license --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..bc38dea --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 nullclaw contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all 
+copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. From f6eb0681ef57812f78019c5477e280234d08c0aa Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Fri, 13 Mar 2026 23:02:01 -0300 Subject: [PATCH 34/55] Fix workspace hook tests on Windows Use cmd.exe instead of /bin/sh on Windows. Skip shell-dependent tests on Windows since hook commands are Unix shell syntax. --- src/workspace.zig | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/workspace.zig b/src/workspace.zig index f7d5f9a..b7ecd9b 100644 --- a/src/workspace.zig +++ b/src/workspace.zig @@ -167,7 +167,11 @@ pub fn cleanAll(root: []const u8) void { /// Returns true when the command exits with code 0, false otherwise. /// Times out after `timeout_ms` milliseconds (the child is killed on timeout). 
pub fn runHook(allocator: std.mem.Allocator, command: []const u8, cwd: []const u8, timeout_ms: u64) !bool { - const argv = [_][]const u8{ "/bin/sh", "-lc", command }; + const native = @import("builtin").os.tag; + const argv = if (native == .windows) + [_][]const u8{ "cmd.exe", "/C", command } + else + [_][]const u8{ "/bin/sh", "-lc", command }; var child = std.process.Child.init(&argv, allocator); child.cwd = cwd; @@ -268,6 +272,9 @@ test "Workspace create and remove" { } test "runHook executes shell command" { + const native = @import("builtin").os.tag; + if (native == .windows) return error.SkipZigTest; + const allocator = std.testing.allocator; var tmp = std.testing.tmpDir(.{}); defer tmp.cleanup(); @@ -286,6 +293,9 @@ test "runHook executes shell command" { } test "runHook returns false for failing command" { + const native = @import("builtin").os.tag; + if (native == .windows) return error.SkipZigTest; + const allocator = std.testing.allocator; var tmp = std.testing.tmpDir(.{}); defer tmp.cleanup(); From 67964a2995fba91e679c266c7569aa8fcad8102a Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 07:12:15 -0300 Subject: [PATCH 35/55] Fix blocking retry loop and stale checkpoint parent_id Replace std.Thread.sleep in retry loop with non-blocking retry scheduling. Instead of blocking the engine thread, failed retry attempts now create a step record with next_attempt_at_ms and return control to the tick loop. Future ticks check the timestamp and re-execute when the delay has elapsed. Also fix stale checkpoint parent_id: latest_checkpoint was fetched once at processRunWithDepth start, causing all checkpoints within a tick to point to the same parent. Introduce latest_checkpoint_id that updates after each checkpoint creation for correct chaining. 
--- src/engine.zig | 136 ++++++++++++++++++++++++++++++++++++------------- src/store.zig | 18 +++++++ 2 files changed, 119 insertions(+), 35 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index c895a16..ff1eb0c 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -398,6 +398,11 @@ pub const Engine = struct { var version: i64 = if (latest_checkpoint) |cp| cp.version else 0; const initial_version = version; + // Track the latest checkpoint ID for correct parent chaining. + // Updated after each checkpoint creation so subsequent checkpoints + // within the same tick correctly chain to their predecessor. + var latest_checkpoint_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + // Emit run_started only on the first tick (no prior checkpoints) if (latest_checkpoint == null) { self.emitEvent(alloc, .run_started, run_row.id, null, null, null); @@ -520,12 +525,13 @@ pub const Engine = struct { const cp_id_buf = ids.generateId(); const cp_id = try alloc.dupe(u8, &cp_id_buf); const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); - const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const parent_id: ?[]const u8 = latest_checkpoint_id; const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); try self.store.createCheckpoint(cp_id, run_row.id, "__end__", parent_id, running_state, cn_json, version, meta_json); try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); - + latest_checkpoint_id = cp_id; + // Run is completed try self.store.updateRunStatus(run_row.id, "completed", null); try self.store.insertEvent(run_row.id, null, "run.completed", "{}"); @@ -541,12 +547,13 @@ pub const Engine = struct { const cp_id_buf = ids.generateId(); const cp_id = try alloc.dupe(u8, &cp_id_buf); const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); - const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + 
const parent_id: ?[]const u8 = latest_checkpoint_id; const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); - + latest_checkpoint_id = cp_id; + try self.store.updateRunStatus(run_row.id, "interrupted", null); try self.store.insertEvent(run_row.id, null, "run.interrupted", "{}"); callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.interrupted", run_row.id, null, "{}", self.metrics); @@ -595,12 +602,13 @@ pub const Engine = struct { const cp_id_buf = ids.generateId(); const cp_id = try alloc.dupe(u8, &cp_id_buf); const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); - const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const parent_id: ?[]const u8 = latest_checkpoint_id; const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); - + latest_checkpoint_id = cp_id; + try self.store.updateRunStatus(run_row.id, "interrupted", null); try self.store.insertEvent(run_row.id, null, "run.interrupted", "{}"); callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.interrupted", run_row.id, null, "{}", self.metrics); @@ -652,45 +660,99 @@ pub const Engine = struct { const ccb = ids.generateId(); const cci = try alloc.dupe(u8, &ccb); const ccn = try serializeCompletedNodes(alloc, &completed_nodes); - const cpi: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const cpi: ?[]const u8 = latest_checkpoint_id; const cmj = try serializeRouteResults(alloc, &route_results); try 
self.store.createCheckpoint(cci, run_row.id, node_name, cpi, running_state, ccn, version, cmj); try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); + latest_checkpoint_id = cci; continue; } } - // Gap 2: Retry loop + // Gap 2: Non-blocking retry — check for pending retry step const max_attempts = parseRetryMaxAttempts(alloc, node_json) orelse 1; const retry_init_ms = parseRetryInitialMs(alloc, node_json) orelse 500; const retry_bf = parseRetryBackoff(alloc, node_json) orelse 2.0; const retry_max_ms = parseRetryMaxMs(alloc, node_json) orelse 30000; - var result: TaskNodeResult = undefined; - var attempt: u32 = 0; - while (attempt < max_attempts) : (attempt += 1) { - result = try self.executeTaskNode(alloc, run_row, node_name, node_json, state_with_meta); - switch (result) { - .failed => { - if (attempt + 1 < max_attempts) { - var dms: u64 = retry_init_ms; - var ei: u32 = 0; - while (ei < attempt) : (ei += 1) { - const nd = @as(f64, @floatFromInt(dms)) * retry_bf; - dms = @intFromFloat(@min(nd, @as(f64, @floatFromInt(retry_max_ms)))); - } - if (dms > retry_max_ms) dms = retry_max_ms; - log.info("task node {s} attempt {d}/{d} failed, retrying in {d}ms", .{ node_name, attempt + 1, max_attempts, dms }); - self.emitEvent(alloc, .step_retrying, run_row.id, null, node_name, null); - std.Thread.sleep(dms * std.time.ns_per_ms); - continue; - } - }, - else => break, + + // Check if there's a pending retry step for this node + const retrying_step = self.store.getRetryingStepForNode(alloc, run_row.id, node_name) catch null; + if (retrying_step) |rs| { + const now_ms = ids.nowMs(); + if (rs.next_attempt_at_ms) |next_at| { + if (now_ms < next_at) { + // Retry delay not elapsed yet — skip this node, let other runs process + return; + } } + // Retry timer expired — clear the retrying step and re-execute below + // The attempt count is tracked on the step record } + + const current_attempt: u32 = if (retrying_step) |rs| 
@intCast(rs.attempt) else 0; + const result = try self.executeTaskNode(alloc, run_row, node_name, node_json, state_with_meta); + + // Handle retry scheduling for failed results (non-blocking) + const result_after_retry: TaskNodeResult = switch (result) { + .failed => |err_text| blk: { + if (current_attempt + 1 < max_attempts) { + // Calculate delay with exponential backoff + var dms: u64 = retry_init_ms; + var ei: u32 = 0; + while (ei < current_attempt) : (ei += 1) { + const nd = @as(f64, @floatFromInt(dms)) * retry_bf; + dms = @intFromFloat(@min(nd, @as(f64, @floatFromInt(retry_max_ms)))); + } + if (dms > retry_max_ms) dms = retry_max_ms; + log.info("task node {s} attempt {d}/{d} failed, scheduling retry in {d}ms", .{ node_name, current_attempt + 1, max_attempts, dms }); + self.emitEvent(alloc, .step_retrying, run_row.id, null, node_name, null); + + // Create or update step record with retry schedule + const next_retry_at = ids.nowMs() + @as(i64, @intCast(dms)); + if (retrying_step) |rs| { + // Update existing step with next retry time + self.store.scheduleStepRetry(rs.id, next_retry_at, @as(i64, @intCast(current_attempt + 1)), err_text) catch {}; + } else { + // Create new step record for retry tracking + const retry_step_id_buf = ids.generateId(); + const retry_step_id = alloc.dupe(u8, &retry_step_id_buf) catch { + break :blk result; + }; + self.store.insertStep(retry_step_id, run_row.id, node_name, node_type, "ready", "{}", @intCast(max_attempts), null, null, null) catch { + break :blk result; + }; + self.store.scheduleStepRetry(retry_step_id, next_retry_at, 1, err_text) catch {}; + } + + // Save progress checkpoint before returning + if (version > initial_version) { + const cp_id_buf = ids.generateId(); + const cp_id = alloc.dupe(u8, &cp_id_buf) catch { + break :blk result; + }; + const cn_json = serializeCompletedNodes(alloc, &completed_nodes) catch { + break :blk result; + }; + const parent_id: ?[]const u8 = if (latest_checkpoint_id) |pid| pid else null; + 
const meta_json = serializeRouteResultsWithVersion(alloc, &route_results, wf_version) catch { + break :blk result; + }; + self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json) catch {}; + self.store.incrementCheckpointCount(run_row.id) catch {}; + self.store.updateRunState(run_row.id, running_state) catch {}; + latest_checkpoint_id = cp_id; + } + + // Return without marking node as completed — next tick will retry + return; + } + break :blk result; + }, + else => result, + }; - switch (result) { + switch (result_after_retry) { .completed => |cr| { // Gap 7: Strip __meta (don't persist) running_state = stripMeta(alloc, running_state) catch running_state; @@ -767,11 +829,12 @@ pub const Engine = struct { const cp_id_buf = ids.generateId(); const cp_id = try alloc.dupe(u8, &cp_id_buf); const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); - const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const parent_id: ?[]const u8 = latest_checkpoint_id; const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); + latest_checkpoint_id = cp_id; return; }, .no_worker => { @@ -782,11 +845,12 @@ pub const Engine = struct { const cp_id_buf = ids.generateId(); const cp_id = try alloc.dupe(u8, &cp_id_buf); const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); - const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const parent_id: ?[]const u8 = latest_checkpoint_id; const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); try 
self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); + latest_checkpoint_id = cp_id; } return; }, @@ -853,12 +917,13 @@ pub const Engine = struct { const bp_cp_id_buf = ids.generateId(); const bp_cp_id = try alloc.dupe(u8, &bp_cp_id_buf); const bp_cn_json = try serializeCompletedNodes(alloc, &completed_nodes); - const bp_parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const bp_parent_id: ?[]const u8 = latest_checkpoint_id; const bp_meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); try self.store.createCheckpoint(bp_cp_id, run_row.id, node_name, bp_parent_id, running_state, bp_cn_json, version, bp_meta_json); try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); - + latest_checkpoint_id = bp_cp_id; + try self.store.updateRunStatus(run_row.id, "interrupted", null); try self.store.insertEvent(run_row.id, null, "run.interrupted", "{}"); callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.interrupted", run_row.id, null, "{}", self.metrics); @@ -886,11 +951,12 @@ pub const Engine = struct { const cp_id_buf = ids.generateId(); const cp_id = try alloc.dupe(u8, &cp_id_buf); const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); - const parent_id: ?[]const u8 = if (latest_checkpoint) |cp| cp.id else null; + const parent_id: ?[]const u8 = latest_checkpoint_id; const meta_json = try serializeRouteResultsWithVersion(alloc, &route_results, wf_version); try self.store.createCheckpoint(cp_id, run_row.id, node_name, parent_id, running_state, cn_json, version, meta_json); try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); + latest_checkpoint_id = cp_id; // Emit structured checkpoint event self.emitEvent(alloc, .checkpoint_created, run_row.id, null, node_name, null); diff --git a/src/store.zig b/src/store.zig index f51cc3c..2977f49 
100644 --- a/src/store.zig +++ b/src/store.zig @@ -716,6 +716,24 @@ pub const Store = struct { } } + /// Get a retrying step for a given run and node name (def_step_id). + /// Returns the step if it exists with status='ready' and next_attempt_at_ms set. + pub fn getRetryingStepForNode(self: *Self, allocator: std.mem.Allocator, run_id: []const u8, node_name: []const u8) !?types.StepRow { + const sql = "SELECT id, run_id, def_step_id, type, status, worker_id, input_json, output_json, error_text, attempt, max_attempts, timeout_ms, next_attempt_at_ms, parent_step_id, item_index, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, child_run_id, iteration_index FROM steps WHERE run_id = ? AND def_step_id = ? AND status = 'ready' AND next_attempt_at_ms IS NOT NULL ORDER BY created_at_ms DESC LIMIT 1"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 2, node_name.ptr, @intCast(node_name.len), SQLITE_STATIC); + + if (c.sqlite3_step(stmt) != c.SQLITE_ROW) return null; + + return try readStepRow(allocator, stmt); + } + pub fn countStepsByStatus(self: *Self, run_id: []const u8, status: []const u8) !i64 { const sql = "SELECT COUNT(*) FROM steps WHERE run_id = ? AND status = ?"; var stmt: ?*c.sqlite3_stmt = null; From 46af7ba7ceafe3efb6cc89ce21ce73e31df5d5f9 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 07:13:32 -0300 Subject: [PATCH 36/55] Fix misleading free in handleStream SSE event drain Remove the explicit ctx.allocator.free(sse_events) call which was misleading since ctx.allocator is a per-request arena where free is a no-op. Add comment explaining why inner strings (event_type, data) must not be freed here as they originate from the engine's per-tick arena, not the request arena. 
--- src/api.zig | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/api.zig b/src/api.zig index 519512f..e2032c7 100644 --- a/src/api.zig +++ b/src/api.zig @@ -1689,7 +1689,10 @@ fn handleStream(ctx: *Context, run_id: []const u8, target: []const u8) HttpRespo } sse_buf.append(ctx.allocator, ']') catch {}; sse_events_json = sse_buf.toOwnedSlice(ctx.allocator) catch "[]"; - ctx.allocator.free(sse_events); + // Note: sse_events slice is allocated via ctx.allocator which is a + // per-request arena — no explicit free needed. The inner strings + // (event_type, data) are not owned by this allocator either (they + // originate from the engine's per-tick arena), so we must not free them. } } From 05a46bb5a9d9d2425b56d84cf8a7312bff83bf88 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 07:14:17 -0300 Subject: [PATCH 37/55] Cache validateConfig result to avoid 2 DB queries per tick validateConfig runs listWorkers + getActiveRuns every 200ms tick. Cache the result with a timestamp, only re-validate every 30s. On validation failure, immediately invalidate the cache so the next tick re-checks. --- src/engine.zig | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/engine.zig b/src/engine.zig index ff1eb0c..746d738 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -137,6 +137,11 @@ pub const Engine = struct { sse_hub: ?*sse_mod.SseHub = null, workflow_watcher: ?*workflow_loader.WorkflowWatcher = null, rate_limits: std.StringHashMap(RateLimitInfo), + config_valid: bool = false, + last_config_check_ms: i64 = 0, + + /// How often to re-run config validation (default 30s). 
+ const config_check_interval_ms: i64 = 30_000; pub fn init(store: *Store, allocator: std.mem.Allocator, poll_interval_ms: u64) Engine { return .{ @@ -151,6 +156,8 @@ pub const Engine = struct { .sse_hub = null, .workflow_watcher = null, .rate_limits = std.StringHashMap(RateLimitInfo).init(allocator), + .config_valid = false, + .last_config_check_ms = 0, }; } @@ -178,7 +185,14 @@ pub const Engine = struct { /// Validate that the engine configuration is healthy before dispatching /// new work. Returns true if workers exist and the store is reachable. + /// Results are cached for config_check_interval_ms to avoid running + /// 2 DB queries (listWorkers + getActiveRuns) on every tick. fn validateConfig(self: *Engine) bool { + const now_ms = ids.nowMs(); + if (self.config_valid and (now_ms - self.last_config_check_ms) < config_check_interval_ms) { + return true; + } + // Check: at least one worker registered and active var arena = std.heap.ArenaAllocator.init(self.allocator); defer arena.deinit(); @@ -186,20 +200,25 @@ pub const Engine = struct { const workers = self.store.listWorkers(alloc) catch { log.warn("config validation: store query failed (listWorkers)", .{}); + self.config_valid = false; return false; }; if (workers.len == 0) { log.warn("config validation: no workers registered", .{}); + self.config_valid = false; return false; } // Check: store connection healthy (simple query) _ = self.store.getActiveRuns(alloc) catch { log.warn("config validation: store connection unhealthy", .{}); + self.config_valid = false; return false; }; + self.config_valid = true; + self.last_config_check_ms = now_ms; return true; } From 864cbdf987146d49edce71420883c3707020494a Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 07:14:57 -0300 Subject: [PATCH 38/55] Hoist worker list fetch out of send node per-item loop executeSendNode was calling store.listWorkers and building worker_infos for every item in the send array. 
Move both the worker list fetch and worker_infos construction before the loop since the worker list doesn't change between items. --- src/engine.zig | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index 746d738..a083325 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -1398,6 +1398,25 @@ pub const Engine = struct { return SendNodeResult{ .state_updates = null }; } + // Build worker list once before iterating items + const workers = try self.store.listWorkers(alloc); + var worker_infos: std.ArrayListUnmanaged(dispatch.WorkerInfo) = .empty; + for (workers) |w| { + const current_tasks = self.store.countRunningStepsByWorker(w.id) catch 0; + try worker_infos.append(alloc, .{ + .id = w.id, + .url = w.url, + .token = w.token, + .protocol = w.protocol, + .model = w.model, + .tags_json = w.tags_json, + .max_concurrent = w.max_concurrent, + .status = w.status, + .current_tasks = current_tasks, + }); + } + const required_tags = getNodeTags(alloc, target_json); + // For each item, execute the target node var results: std.ArrayListUnmanaged([]const u8) = .empty; for (items_parsed.value.array.items, 0..) 
|item, idx| { @@ -1410,25 +1429,6 @@ pub const Engine = struct { // Render with item const rendered = templates.renderTemplate(alloc, prompt_template, state_json, run_row.input_json, item_str) catch continue; - // Select worker and dispatch - const workers = try self.store.listWorkers(alloc); - var worker_infos: std.ArrayListUnmanaged(dispatch.WorkerInfo) = .empty; - for (workers) |w| { - const current_tasks = self.store.countRunningStepsByWorker(w.id) catch 0; - try worker_infos.append(alloc, .{ - .id = w.id, - .url = w.url, - .token = w.token, - .protocol = w.protocol, - .model = w.model, - .tags_json = w.tags_json, - .max_concurrent = w.max_concurrent, - .status = w.status, - .current_tasks = current_tasks, - }); - } - - const required_tags = getNodeTags(alloc, target_json); const selected_worker = try dispatch.selectWorker(alloc, worker_infos.items, required_tags); if (selected_worker == null) { try results.append(alloc, "null"); From 3b96e77bc4287859178207c8431665798916a7dd Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 07:16:00 -0300 Subject: [PATCH 39/55] Eliminate race in handleRunWorkflow by creating run as running Previously, run was created with 'pending' status then updated to 'running' in a separate operation. The engine could miss the run between these two DB operations. Add createRunWithStateAndStatus to store and use it in handleRunWorkflow to create the run directly with 'running' status in a single INSERT. 
--- src/api.zig | 10 +++------- src/store.zig | 19 +++++++++++++------ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/api.zig b/src/api.zig index e2032c7..43191e5 100644 --- a/src/api.zig +++ b/src/api.zig @@ -1268,8 +1268,9 @@ fn handleRunWorkflow(ctx: *Context, workflow_id: []const u8, body: []const u8) H const run_id_buf = ids.generateId(); const run_id = ctx.allocator.dupe(u8, &run_id_buf) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - // Create run with state - ctx.store.createRunWithState(run_id, workflow_id, wf.definition_json, input_json, initial_state) catch { + // Create run directly with "running" status to avoid race window where + // engine could miss a run created as "pending" then updated to "running". + ctx.store.createRunWithStateAndStatus(run_id, workflow_id, wf.definition_json, input_json, initial_state, "running") catch { return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to create run\"}}"); }; @@ -1280,11 +1281,6 @@ fn handleRunWorkflow(ctx: *Context, workflow_id: []const u8, body: []const u8) H return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to create checkpoint\"}}"); }; - // Set run status to running - ctx.store.updateRunStatus(run_id, "running", null) catch { - return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to update run status\"}}"); - }; - const run_id_json = jsonQuoted(ctx.allocator, run_id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); const resp = std.fmt.allocPrint(ctx.allocator, \\{{"id":{s},"status":"running"}} diff --git a/src/store.zig b/src/store.zig index 2977f49..d7da317 100644 --- a/src/store.zig +++ b/src/store.zig @@ -1461,7 +1461,13 @@ pub const Store = struct { } pub fn createRunWithState(self: *Self, id: []const u8, workflow_id: ?[]const u8, workflow_json: []const u8, input_json: []const u8, 
state_json: []const u8) !void { - const sql = "INSERT INTO runs (id, status, workflow_id, workflow_json, input_json, callbacks_json, state_json, created_at_ms, updated_at_ms) VALUES (?, 'pending', ?, ?, ?, '[]', ?, ?, ?)"; + return self.createRunWithStateAndStatus(id, workflow_id, workflow_json, input_json, state_json, "pending"); + } + + /// Create a run with explicit initial status. Use "running" to avoid the + /// race window between creating with "pending" and updating to "running". + pub fn createRunWithStateAndStatus(self: *Self, id: []const u8, workflow_id: ?[]const u8, workflow_json: []const u8, input_json: []const u8, state_json: []const u8, status: []const u8) !void { + const sql = "INSERT INTO runs (id, status, workflow_id, workflow_json, input_json, callbacks_json, state_json, created_at_ms, updated_at_ms) VALUES (?, ?, ?, ?, ?, '[]', ?, ?, ?)"; var stmt: ?*c.sqlite3_stmt = null; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; @@ -1470,12 +1476,13 @@ pub const Store = struct { const now = ids.nowMs(); _ = c.sqlite3_bind_text(stmt, 1, id.ptr, @intCast(id.len), SQLITE_STATIC); - bindTextOpt(stmt, 2, workflow_id); - _ = c.sqlite3_bind_text(stmt, 3, workflow_json.ptr, @intCast(workflow_json.len), SQLITE_STATIC); - _ = c.sqlite3_bind_text(stmt, 4, input_json.ptr, @intCast(input_json.len), SQLITE_STATIC); - _ = c.sqlite3_bind_text(stmt, 5, state_json.ptr, @intCast(state_json.len), SQLITE_STATIC); - _ = c.sqlite3_bind_int64(stmt, 6, now); + _ = c.sqlite3_bind_text(stmt, 2, status.ptr, @intCast(status.len), SQLITE_STATIC); + bindTextOpt(stmt, 3, workflow_id); + _ = c.sqlite3_bind_text(stmt, 4, workflow_json.ptr, @intCast(workflow_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 5, input_json.ptr, @intCast(input_json.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 6, state_json.ptr, @intCast(state_json.len), SQLITE_STATIC); _ = c.sqlite3_bind_int64(stmt, 7, now); + _ = 
c.sqlite3_bind_int64(stmt, 8, now); if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { return error.SqliteStepFailed; From 47d7be3d3845707cfe0df55b5f6c799b799ca789 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 07:17:03 -0300 Subject: [PATCH 40/55] Clear old steps and checkpoints on replay When replaying from a checkpoint, steps and checkpoints created after the replay point remained in the DB, causing stale data. Add deleteStepsAfterTimestamp and deleteCheckpointsAfterVersion to the store, and call both in handleReplayRun before resetting the run state. --- src/api.zig | 9 +++++++++ src/store.zig | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/src/api.zig b/src/api.zig index 43191e5..d69aa85 100644 --- a/src/api.zig +++ b/src/api.zig @@ -1538,6 +1538,15 @@ fn handleReplayRun(ctx: *Context, run_id: []const u8, body: []const u8) HttpResp return jsonResponse(404, "{\"error\":{\"code\":\"not_found\",\"message\":\"run not found\"}}"); }; + // Delete steps and checkpoints created after the replay checkpoint + // so the engine re-executes from a clean slate. + ctx.store.deleteStepsAfterTimestamp(run_id, cp.created_at_ms) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to clear old steps\"}}"); + }; + ctx.store.deleteCheckpointsAfterVersion(run_id, cp.version) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to clear old checkpoints\"}}"); + }; + // Reset run state to checkpoint's state ctx.store.updateRunState(run_id, cp.state_json) catch { return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to update run state\"}}"); diff --git a/src/store.zig b/src/store.zig index d7da317..b8a67f6 100644 --- a/src/store.zig +++ b/src/store.zig @@ -766,6 +766,42 @@ pub const Store = struct { return list.toOwnedSlice(allocator); } + /// Delete steps for a run that were created after a given timestamp. 
+ /// Used during replay to remove steps that will be re-executed. + pub fn deleteStepsAfterTimestamp(self: *Self, run_id: []const u8, after_ms: i64) !void { + const sql = "DELETE FROM steps WHERE run_id = ? AND created_at_ms > ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_int64(stmt, 2, after_ms); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + + /// Delete checkpoints for a run with version greater than a given version. + /// Used during replay to remove checkpoints that will be superseded. + pub fn deleteCheckpointsAfterVersion(self: *Self, run_id: []const u8, after_version: i64) !void { + const sql = "DELETE FROM checkpoints WHERE run_id = ? AND version > ?"; + var stmt: ?*c.sqlite3_stmt = null; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + defer _ = c.sqlite3_finalize(stmt); + + _ = c.sqlite3_bind_text(stmt, 1, run_id.ptr, @intCast(run_id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_int64(stmt, 2, after_version); + + if (c.sqlite3_step(stmt) != c.SQLITE_DONE) { + return error.SqliteStepFailed; + } + } + /// Count how many running tasks a worker currently has. pub fn countRunningStepsByWorker(self: *Self, worker_id: []const u8) !i64 { const sql = "SELECT COUNT(*) FROM steps WHERE worker_id = ? AND status = 'running'"; From 6e93a945d894b149ca62decfc5b0488390b9c376 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 07:18:13 -0300 Subject: [PATCH 41/55] Parse workflow_json once and cache schema in processRunWithDepth workflow_json was parsed independently by ~15 helper calls per tick. Parse it once at the top of processRunWithDepth and pre-extract the state schema. 
Replace all getSchemaJson(alloc, workflow_json) calls within the function with the cached value, eliminating ~9 redundant JSON parses per node execution. --- src/engine.zig | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index a083325..6dac20e 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -362,8 +362,27 @@ pub const Engine = struct { } } - // 2. Load workflow definition + // 2. Load and parse workflow definition once for the entire tick. + // Helper functions still accept raw JSON strings for external callers, + // but we pre-extract commonly used values here to avoid redundant parsing. const workflow_json = run_row.workflow_json; + const wf_parsed = json.parseFromSlice(json.Value, alloc, workflow_json, .{}) catch { + log.err("failed to parse workflow_json for run {s}", .{run_row.id}); + try self.store.updateRunStatus(run_row.id, "failed", "invalid workflow JSON"); + return; + }; + const wf_root = wf_parsed.value; + + // Pre-extract schema (used many times in the loop) + const cached_schema_json = if (wf_root == .object) blk: { + if (wf_root.object.get("state_schema")) |ss| { + break :blk serializeJsonValue(alloc, ss) catch "{}"; + } + if (wf_root.object.get("schema")) |ss| { + break :blk serializeJsonValue(alloc, ss) catch "{}"; + } + break :blk "{}"; + } else "{}"; // 2b. 
Parse breakpoint lists from workflow definition const interrupt_before = parseBreakpointList(alloc, workflow_json, "interrupt_before"); @@ -515,7 +534,7 @@ pub const Engine = struct { if (std.mem.eql(u8, def_node_type, "transform")) { const def_updates = getNodeField(alloc, def_node_json, "updates") orelse "{}"; - const def_schema = getSchemaJson(alloc, workflow_json); + const def_schema = cached_schema_json; const def_new_state = state_mod.applyUpdates(alloc, running_state, def_updates, def_schema) catch running_state; running_state = def_new_state; } else if (std.mem.eql(u8, def_node_type, "task") or std.mem.eql(u8, def_node_type, "agent")) { @@ -523,7 +542,7 @@ pub const Engine = struct { switch (def_result) { .completed => |cr| { if (cr.state_updates) |updates| { - const def_schema = getSchemaJson(alloc, workflow_json); + const def_schema = cached_schema_json; const def_new_state = state_mod.applyUpdates(alloc, running_state, updates, def_schema) catch running_state; running_state = def_new_state; } @@ -638,7 +657,7 @@ pub const Engine = struct { const state_updates = getNodeField(alloc, node_json, "updates") orelse "{}"; // Get schema from workflow - const schema_json = getSchemaJson(alloc, workflow_json); + const schema_json = cached_schema_json; // Apply updates via reducers const new_state = state_mod.applyUpdates(alloc, running_state, state_updates, schema_json) catch |err| { @@ -670,7 +689,7 @@ pub const Engine = struct { const ck_c = computeCacheKey(alloc, node_name, rnd_c) catch break :cache_check; const cached = self.store.getCachedResult(alloc, ck_c) catch break :cache_check; if (cached) |cached_upd| { - const cs = getSchemaJson(alloc, workflow_json); + const cs = cached_schema_json; running_state = state_mod.applyUpdates(alloc, running_state, cached_upd, cs) catch running_state; try completed_nodes.put(try alloc.dupe(u8, node_name), {}); log.info("task node {s} cache hit for run {s}", .{ node_name, run_row.id }); @@ -777,7 +796,7 @@ pub const Engine 
= struct { running_state = stripMeta(alloc, running_state) catch running_state; if (cr.state_updates) |updates| { - const schema_json = getSchemaJson(alloc, workflow_json); + const schema_json = cached_schema_json; const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { log.err("task node {s} failed to apply updates: {}", .{ node_name, err }); try self.store.updateRunStatus(run_row.id, "failed", "state update failed"); @@ -809,7 +828,7 @@ pub const Engine = struct { // Consume pending injections const injections = self.store.consumePendingInjections(alloc, run_row.id, node_name) catch &.{}; for (injections) |injection| { - const schema_json = getSchemaJson(alloc, workflow_json); + const schema_json = cached_schema_json; const new_state = state_mod.applyUpdates(alloc, running_state, injection.updates_json, schema_json) catch |err| { log.warn("failed to apply injection for run {s}: {}", .{ run_row.id, err }); continue; @@ -888,7 +907,7 @@ pub const Engine = struct { switch (result) { .completed => |cr| { if (cr.state_updates) |updates| { - const schema_json = getSchemaJson(alloc, workflow_json); + const schema_json = cached_schema_json; const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { log.err("subgraph node {s} failed to apply updates: {}", .{ node_name, err }); try self.store.updateRunStatus(run_row.id, "failed", "subgraph state update failed"); @@ -912,7 +931,7 @@ pub const Engine = struct { // Send: read items from state, dispatch target_node per item const result = try self.executeSendNode(alloc, run_row, node_name, node_json, running_state); if (result.state_updates) |updates| { - const schema_json = getSchemaJson(alloc, workflow_json); + const schema_json = cached_schema_json; const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { log.err("send node {s} failed to apply updates: {}", .{ node_name, err }); try 
self.store.updateRunStatus(run_row.id, "failed", "send state update failed"); @@ -961,7 +980,7 @@ pub const Engine = struct { } // Strip ephemeral keys before checkpoint persistence - const schema_for_eph = getSchemaJson(alloc, workflow_json); + const schema_for_eph = cached_schema_json; running_state = state_mod.stripEphemeralKeys(alloc, running_state, schema_for_eph) catch running_state; // Save checkpoint after each node From 5c9156f4f5bea6e03296c36054b861145db77e68 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 08:20:24 -0300 Subject: [PATCH 42/55] Decode encoded orchestration path ids --- src/api.zig | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/src/api.zig b/src/api.zig index d69aa85..00e5ce7 100644 --- a/src/api.zig +++ b/src/api.zig @@ -46,11 +46,11 @@ pub fn handleRequest(ctx: *Context, method: []const u8, target: []const u8, body } const path = parsePath(target); - const seg0 = getPathSegment(path, 0); - const seg1 = getPathSegment(path, 1); - const seg2 = getPathSegment(path, 2); - const seg3 = getPathSegment(path, 3); - const seg4 = getPathSegment(path, 4); + const seg0 = decodePathSegment(ctx.allocator, getPathSegment(path, 0)); + const seg1 = decodePathSegment(ctx.allocator, getPathSegment(path, 1)); + const seg2 = decodePathSegment(ctx.allocator, getPathSegment(path, 2)); + const seg3 = decodePathSegment(ctx.allocator, getPathSegment(path, 3)); + const seg4 = decodePathSegment(ctx.allocator, getPathSegment(path, 4)); const is_get = eql(method, "GET"); const is_post = eql(method, "POST"); @@ -2117,6 +2117,14 @@ fn getPathSegment(segments: [max_segments]?[]const u8, index: usize) ?[]const u8 return segments[index]; } +fn decodePathSegment(allocator: std.mem.Allocator, segment: ?[]const u8) ?[]const u8 { + const raw = segment orelse return null; + if (std.mem.indexOfScalar(u8, raw, '%') == null) return raw; + + const encoded = allocator.dupe(u8, raw) catch return raw; + 
return std.Uri.percentDecodeInPlace(encoded); +} + fn eql(a: ?[]const u8, b: []const u8) bool { if (a) |val| return std.mem.eql(u8, val, b); return false; @@ -2719,3 +2727,26 @@ test "API: stream with mode query param" { try std.testing.expectEqual(@as(u16, 200), resp2.status_code); try std.testing.expect(std.mem.indexOf(u8, resp2.body, "stream_events") != null); } + +test "API: workflow routes decode percent-encoded ids" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + try store.createWorkflowWithVersion("wf/alpha beta", "Encoded Workflow", "{\"nodes\":{},\"edges\":[]}", 1); + + var ctx = Context{ + .store = &store, + .allocator = arena.allocator(), + }; + + const get_resp = handleRequest(&ctx, "GET", "/workflows/wf%2Falpha%20beta", ""); + try std.testing.expectEqual(@as(u16, 200), get_resp.status_code); + try std.testing.expect(std.mem.indexOf(u8, get_resp.body, "\"id\":\"wf/alpha beta\"") != null); + + const validate_resp = handleRequest(&ctx, "POST", "/workflows/wf%2Falpha%20beta/validate", ""); + try std.testing.expectEqual(@as(u16, 200), validate_resp.status_code); +} From 56fa13fea996fe1744b43c906d3d4ed26a205e4c Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 08:26:17 -0300 Subject: [PATCH 43/55] Support workflow filtering in runs API --- src/api.zig | 41 ++++++++++- src/store.zig | 185 ++++++++++++++++++++++++++++++++------------------ src/types.zig | 1 + 3 files changed, 159 insertions(+), 68 deletions(-) diff --git a/src/api.zig b/src/api.zig index 00e5ce7..65343d7 100644 --- a/src/api.zig +++ b/src/api.zig @@ -727,6 +727,10 @@ fn handleGetRun(ctx: *Context, id: []const u8) HttpResponse { const ik_json = jsonQuoted(ctx.allocator, ik) catch ""; break :blk std.fmt.allocPrint(ctx.allocator, ",\"idempotency_key\":{s}", .{ik_json}) catch ""; } else ""; + const workflow_id_field = if 
(run.workflow_id) |wid| blk: { + const wid_json = jsonQuoted(ctx.allocator, wid) catch ""; + break :blk std.fmt.allocPrint(ctx.allocator, ",\"workflow_id\":{s}", .{wid_json}) catch ""; + } else ""; // Include state_json if present const state_field = if (run.state_json) |sj| @@ -753,13 +757,14 @@ fn handleGetRun(ctx: *Context, id: []const u8) HttpResponse { const run_id_json = jsonQuoted(ctx.allocator, run.id) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); const run_status_json = jsonQuoted(ctx.allocator, run.status) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); const resp = std.fmt.allocPrint(ctx.allocator, - \\{{"id":{s},"status":{s}{s},"created_at_ms":{d},"updated_at_ms":{d}{s}{s}{s}{s}{s}{s},"steps":{s}}} + \\{{"id":{s},"status":{s}{s},"created_at_ms":{d},"updated_at_ms":{d}{s}{s}{s}{s}{s}{s}{s},"steps":{s}}} , .{ run_id_json, run_status_json, idempotency_field, run.created_at_ms, run.updated_at_ms, + workflow_id_field, error_field, started_field, ended_field, @@ -773,11 +778,12 @@ fn handleGetRun(ctx: *Context, id: []const u8) HttpResponse { fn handleListRuns(ctx: *Context, target: []const u8) HttpResponse { const status_filter = getQueryParam(target, "status"); + const workflow_id_filter = getQueryParam(target, "workflow_id"); const limit = parseQueryInt(target, "limit", 100, 1, 1000); const offset = parseQueryInt(target, "offset", 0, 0, 1_000_000_000); // Fetch one extra row to compute has_more. 
- const runs = ctx.store.listRuns(ctx.allocator, status_filter, limit + 1, offset) catch { + const runs = ctx.store.listRuns(ctx.allocator, status_filter, workflow_id_filter, limit + 1, offset) catch { return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to list runs\"}}"); }; @@ -797,12 +803,17 @@ fn handleListRuns(ctx: *Context, target: []const u8) HttpResponse { const ik_json = jsonQuoted(ctx.allocator, ik) catch ""; break :blk std.fmt.allocPrint(ctx.allocator, ",\"idempotency_key\":{s}", .{ik_json}) catch ""; } else ""; + const workflow_id_field = if (r.workflow_id) |wid| blk: { + const wid_json = jsonQuoted(ctx.allocator, wid) catch ""; + break :blk std.fmt.allocPrint(ctx.allocator, ",\"workflow_id\":{s}", .{wid_json}) catch ""; + } else ""; const entry = std.fmt.allocPrint(ctx.allocator, - \\{{"id":{s},"status":{s}{s},"created_at_ms":{d},"updated_at_ms":{d}}} + \\{{"id":{s},"status":{s}{s}{s},"created_at_ms":{d},"updated_at_ms":{d}}} , .{ run_id_json, run_status_json, idempotency_field, + workflow_id_field, r.created_at_ms, r.updated_at_ms, }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); @@ -2597,6 +2608,30 @@ test "API: metrics endpoint returns text format" { try std.testing.expect(std.mem.indexOf(u8, resp.body, "nullboiler_http_requests_total") != null); } +test "API: list runs supports workflow_id filter" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + try store.createWorkflowWithVersion("wf_1", "WF 1", "{\"nodes\":{},\"edges\":[]}", 1); + try store.createWorkflowWithVersion("wf_2", "WF 2", "{\"nodes\":{},\"edges\":[]}", 1); + try store.createRunWithStateAndStatus("r1", "wf_1", "{\"nodes\":{},\"edges\":[]}", "{}", "{}", "running"); + try store.createRunWithStateAndStatus("r2", "wf_2", "{\"nodes\":{},\"edges\":[]}", 
"{}", "{}", "running"); + + var ctx = Context{ + .store = &store, + .allocator = arena.allocator(), + }; + + const resp = handleRequest(&ctx, "GET", "/runs?workflow_id=wf_1", ""); + try std.testing.expectEqual(@as(u16, 200), resp.status_code); + try std.testing.expect(std.mem.indexOf(u8, resp.body, "\"workflow_id\":\"wf_1\"") != null); + try std.testing.expect(std.mem.indexOf(u8, resp.body, "\"workflow_id\":\"wf_2\"") == null); +} + test "API: replay run from checkpoint" { const allocator = std.testing.allocator; var store = try Store.init(allocator, ":memory:"); diff --git a/src/store.zig b/src/store.zig index b8a67f6..ebf0ca5 100644 --- a/src/store.zig +++ b/src/store.zig @@ -403,7 +403,7 @@ pub const Store = struct { } pub fn getRun(self: *Self, allocator: std.mem.Allocator, id: []const u8) !?types.RunRow { - const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json, config_json, parent_run_id FROM runs WHERE id = ?"; + const sql = "SELECT id, idempotency_key, status, workflow_id, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json, config_json, parent_run_id FROM runs WHERE id = ?"; var stmt: ?*c.sqlite3_stmt = null; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; @@ -418,22 +418,23 @@ pub const Store = struct { .id = try allocStr(allocator, stmt, 0), .idempotency_key = try allocStrOpt(allocator, stmt, 1), .status = try allocStr(allocator, stmt, 2), - .workflow_json = try allocStr(allocator, stmt, 3), - .input_json = try allocStr(allocator, stmt, 4), - .callbacks_json = try allocStr(allocator, stmt, 5), - .error_text = try allocStrOpt(allocator, stmt, 6), - .created_at_ms = colInt(stmt, 7), - .updated_at_ms = colInt(stmt, 8), - .started_at_ms = colIntOpt(stmt, 9), - .ended_at_ms = colIntOpt(stmt, 10), - .state_json = 
try allocStrOpt(allocator, stmt, 11), - .config_json = try allocStrOpt(allocator, stmt, 12), - .parent_run_id = try allocStrOpt(allocator, stmt, 13), + .workflow_id = try allocStrOpt(allocator, stmt, 3), + .workflow_json = try allocStr(allocator, stmt, 4), + .input_json = try allocStr(allocator, stmt, 5), + .callbacks_json = try allocStr(allocator, stmt, 6), + .error_text = try allocStrOpt(allocator, stmt, 7), + .created_at_ms = colInt(stmt, 8), + .updated_at_ms = colInt(stmt, 9), + .started_at_ms = colIntOpt(stmt, 10), + .ended_at_ms = colIntOpt(stmt, 11), + .state_json = try allocStrOpt(allocator, stmt, 12), + .config_json = try allocStrOpt(allocator, stmt, 13), + .parent_run_id = try allocStrOpt(allocator, stmt, 14), }; } pub fn getRunByIdempotencyKey(self: *Self, allocator: std.mem.Allocator, key: []const u8) !?types.RunRow { - const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json, config_json, parent_run_id FROM runs WHERE idempotency_key = ? ORDER BY created_at_ms DESC LIMIT 1"; + const sql = "SELECT id, idempotency_key, status, workflow_id, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json, config_json, parent_run_id FROM runs WHERE idempotency_key = ? 
ORDER BY created_at_ms DESC LIMIT 1"; var stmt: ?*c.sqlite3_stmt = null; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; @@ -447,38 +448,45 @@ pub const Store = struct { .id = try allocStr(allocator, stmt, 0), .idempotency_key = try allocStrOpt(allocator, stmt, 1), .status = try allocStr(allocator, stmt, 2), - .workflow_json = try allocStr(allocator, stmt, 3), - .input_json = try allocStr(allocator, stmt, 4), - .callbacks_json = try allocStr(allocator, stmt, 5), - .error_text = try allocStrOpt(allocator, stmt, 6), - .created_at_ms = colInt(stmt, 7), - .updated_at_ms = colInt(stmt, 8), - .started_at_ms = colIntOpt(stmt, 9), - .ended_at_ms = colIntOpt(stmt, 10), - .state_json = try allocStrOpt(allocator, stmt, 11), - .config_json = try allocStrOpt(allocator, stmt, 12), - .parent_run_id = try allocStrOpt(allocator, stmt, 13), + .workflow_id = try allocStrOpt(allocator, stmt, 3), + .workflow_json = try allocStr(allocator, stmt, 4), + .input_json = try allocStr(allocator, stmt, 5), + .callbacks_json = try allocStr(allocator, stmt, 6), + .error_text = try allocStrOpt(allocator, stmt, 7), + .created_at_ms = colInt(stmt, 8), + .updated_at_ms = colInt(stmt, 9), + .started_at_ms = colIntOpt(stmt, 10), + .ended_at_ms = colIntOpt(stmt, 11), + .state_json = try allocStrOpt(allocator, stmt, 12), + .config_json = try allocStrOpt(allocator, stmt, 13), + .parent_run_id = try allocStrOpt(allocator, stmt, 14), }; } - pub fn listRuns(self: *Self, allocator: std.mem.Allocator, status_filter: ?[]const u8, limit: i64, offset: i64) ![]types.RunRow { + pub fn listRuns(self: *Self, allocator: std.mem.Allocator, status_filter: ?[]const u8, workflow_id_filter: ?[]const u8, limit: i64, offset: i64) ![]types.RunRow { var stmt: ?*c.sqlite3_stmt = null; - if (status_filter != null) { - const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, 
ended_at_ms, state_json, config_json, parent_run_id FROM runs WHERE status = ? ORDER BY created_at_ms DESC LIMIT ? OFFSET ?"; - if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { - return error.SqlitePrepareFailed; - } - _ = c.sqlite3_bind_text(stmt, 1, status_filter.?.ptr, @intCast(status_filter.?.len), SQLITE_STATIC); - _ = c.sqlite3_bind_int64(stmt, 2, limit); - _ = c.sqlite3_bind_int64(stmt, 3, offset); + const sql = + "SELECT id, idempotency_key, status, workflow_id, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json, config_json, parent_run_id " ++ + "FROM runs WHERE (? IS NULL OR status = ?) AND (? IS NULL OR workflow_id = ?) ORDER BY created_at_ms DESC LIMIT ? OFFSET ?"; + if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { + return error.SqlitePrepareFailed; + } + if (status_filter) |status| { + _ = c.sqlite3_bind_text(stmt, 1, status.ptr, @intCast(status.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 2, status.ptr, @intCast(status.len), SQLITE_STATIC); } else { - const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json, config_json, parent_run_id FROM runs ORDER BY created_at_ms DESC LIMIT ? 
OFFSET ?"; - if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { - return error.SqlitePrepareFailed; - } - _ = c.sqlite3_bind_int64(stmt, 1, limit); - _ = c.sqlite3_bind_int64(stmt, 2, offset); + _ = c.sqlite3_bind_null(stmt, 1); + _ = c.sqlite3_bind_null(stmt, 2); + } + if (workflow_id_filter) |workflow_id| { + _ = c.sqlite3_bind_text(stmt, 3, workflow_id.ptr, @intCast(workflow_id.len), SQLITE_STATIC); + _ = c.sqlite3_bind_text(stmt, 4, workflow_id.ptr, @intCast(workflow_id.len), SQLITE_STATIC); + } else { + _ = c.sqlite3_bind_null(stmt, 3); + _ = c.sqlite3_bind_null(stmt, 4); } + _ = c.sqlite3_bind_int64(stmt, 5, limit); + _ = c.sqlite3_bind_int64(stmt, 6, offset); defer _ = c.sqlite3_finalize(stmt); var list: std.ArrayListUnmanaged(types.RunRow) = .empty; @@ -487,17 +495,18 @@ pub const Store = struct { .id = try allocStr(allocator, stmt, 0), .idempotency_key = try allocStrOpt(allocator, stmt, 1), .status = try allocStr(allocator, stmt, 2), - .workflow_json = try allocStr(allocator, stmt, 3), - .input_json = try allocStr(allocator, stmt, 4), - .callbacks_json = try allocStr(allocator, stmt, 5), - .error_text = try allocStrOpt(allocator, stmt, 6), - .created_at_ms = colInt(stmt, 7), - .updated_at_ms = colInt(stmt, 8), - .started_at_ms = colIntOpt(stmt, 9), - .ended_at_ms = colIntOpt(stmt, 10), - .state_json = try allocStrOpt(allocator, stmt, 11), - .config_json = try allocStrOpt(allocator, stmt, 12), - .parent_run_id = try allocStrOpt(allocator, stmt, 13), + .workflow_id = try allocStrOpt(allocator, stmt, 3), + .workflow_json = try allocStr(allocator, stmt, 4), + .input_json = try allocStr(allocator, stmt, 5), + .callbacks_json = try allocStr(allocator, stmt, 6), + .error_text = try allocStrOpt(allocator, stmt, 7), + .created_at_ms = colInt(stmt, 8), + .updated_at_ms = colInt(stmt, 9), + .started_at_ms = colIntOpt(stmt, 10), + .ended_at_ms = colIntOpt(stmt, 11), + .state_json = try allocStrOpt(allocator, stmt, 12), + .config_json = try 
allocStrOpt(allocator, stmt, 13), + .parent_run_id = try allocStrOpt(allocator, stmt, 14), }); } return list.toOwnedSlice(allocator); @@ -522,7 +531,7 @@ pub const Store = struct { } pub fn getActiveRuns(self: *Self, allocator: std.mem.Allocator) ![]types.RunRow { - const sql = "SELECT id, idempotency_key, status, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json, config_json, parent_run_id FROM runs WHERE status = 'running' ORDER BY created_at_ms DESC"; + const sql = "SELECT id, idempotency_key, status, workflow_id, workflow_json, input_json, callbacks_json, error_text, created_at_ms, updated_at_ms, started_at_ms, ended_at_ms, state_json, config_json, parent_run_id FROM runs WHERE status = 'running' ORDER BY created_at_ms DESC"; var stmt: ?*c.sqlite3_stmt = null; if (c.sqlite3_prepare_v2(self.db, sql, -1, &stmt, null) != c.SQLITE_OK) { return error.SqlitePrepareFailed; @@ -535,17 +544,18 @@ pub const Store = struct { .id = try allocStr(allocator, stmt, 0), .idempotency_key = try allocStrOpt(allocator, stmt, 1), .status = try allocStr(allocator, stmt, 2), - .workflow_json = try allocStr(allocator, stmt, 3), - .input_json = try allocStr(allocator, stmt, 4), - .callbacks_json = try allocStr(allocator, stmt, 5), - .error_text = try allocStrOpt(allocator, stmt, 6), - .created_at_ms = colInt(stmt, 7), - .updated_at_ms = colInt(stmt, 8), - .started_at_ms = colIntOpt(stmt, 9), - .ended_at_ms = colIntOpt(stmt, 10), - .state_json = try allocStrOpt(allocator, stmt, 11), - .config_json = try allocStrOpt(allocator, stmt, 12), - .parent_run_id = try allocStrOpt(allocator, stmt, 13), + .workflow_id = try allocStrOpt(allocator, stmt, 3), + .workflow_json = try allocStr(allocator, stmt, 4), + .input_json = try allocStr(allocator, stmt, 5), + .callbacks_json = try allocStr(allocator, stmt, 6), + .error_text = try allocStrOpt(allocator, stmt, 7), + .created_at_ms = colInt(stmt, 8), + .updated_at_ms = 
colInt(stmt, 9), + .started_at_ms = colIntOpt(stmt, 10), + .ended_at_ms = colIntOpt(stmt, 11), + .state_json = try allocStrOpt(allocator, stmt, 12), + .config_json = try allocStrOpt(allocator, stmt, 13), + .parent_run_id = try allocStrOpt(allocator, stmt, 14), }); } return list.toOwnedSlice(allocator); @@ -1896,6 +1906,7 @@ test "Store: insert and get run" { allocator.free(run.id); if (run.idempotency_key) |ik| allocator.free(ik); allocator.free(run.status); + if (run.workflow_id) |wid| allocator.free(wid); allocator.free(run.workflow_json); allocator.free(run.input_json); allocator.free(run.callbacks_json); @@ -1931,6 +1942,7 @@ test "Store: transaction commit persists inserted run" { allocator.free(run.id); if (run.idempotency_key) |ik| allocator.free(ik); allocator.free(run.status); + if (run.workflow_id) |wid| allocator.free(wid); allocator.free(run.workflow_json); allocator.free(run.input_json); allocator.free(run.callbacks_json); @@ -1945,34 +1957,67 @@ test "Store: list runs with filter" { try s.insertRun("r1", null, "running", "{}", "{}", "[]"); try s.insertRun("r2", null, "pending", "{}", "{}", "[]"); try s.insertRun("r3", null, "running", "{}", "{}", "[]"); + try s.createWorkflow("wf_filter", "Filter WF", "{\"nodes\":{}}"); + try s.createRunWithState("r4", "wf_filter", "{\"nodes\":{}}", "{}", "{}"); - const running = try s.listRuns(allocator, "running", 100, 0); + const running = try s.listRuns(allocator, "running", null, 100, 0); defer { for (running) |r| { allocator.free(r.id); if (r.idempotency_key) |ik| allocator.free(ik); allocator.free(r.status); + if (r.workflow_id) |wid| allocator.free(wid); allocator.free(r.workflow_json); allocator.free(r.input_json); allocator.free(r.callbacks_json); + if (r.error_text) |et| allocator.free(et); + if (r.state_json) |sj| allocator.free(sj); + if (r.config_json) |cj| allocator.free(cj); + if (r.parent_run_id) |pid| allocator.free(pid); } allocator.free(running); } try std.testing.expectEqual(@as(usize, 2), 
running.len); - const all = try s.listRuns(allocator, null, 100, 0); + const all = try s.listRuns(allocator, null, null, 100, 0); defer { for (all) |r| { allocator.free(r.id); if (r.idempotency_key) |ik| allocator.free(ik); allocator.free(r.status); + if (r.workflow_id) |wid| allocator.free(wid); allocator.free(r.workflow_json); allocator.free(r.input_json); allocator.free(r.callbacks_json); + if (r.error_text) |et| allocator.free(et); + if (r.state_json) |sj| allocator.free(sj); + if (r.config_json) |cj| allocator.free(cj); + if (r.parent_run_id) |pid| allocator.free(pid); } allocator.free(all); } - try std.testing.expectEqual(@as(usize, 3), all.len); + try std.testing.expectEqual(@as(usize, 4), all.len); + + const filtered = try s.listRuns(allocator, null, "wf_filter", 100, 0); + defer { + for (filtered) |r| { + allocator.free(r.id); + if (r.idempotency_key) |ik| allocator.free(ik); + allocator.free(r.status); + if (r.workflow_id) |wid| allocator.free(wid); + allocator.free(r.workflow_json); + allocator.free(r.input_json); + allocator.free(r.callbacks_json); + if (r.error_text) |et| allocator.free(et); + if (r.state_json) |sj| allocator.free(sj); + if (r.config_json) |cj| allocator.free(cj); + if (r.parent_run_id) |pid| allocator.free(pid); + } + allocator.free(filtered); + } + try std.testing.expectEqual(@as(usize, 1), filtered.len); + try std.testing.expectEqualStrings("r4", filtered[0].id); + try std.testing.expectEqualStrings("wf_filter", filtered[0].workflow_id.?); } test "Store: update run status" { @@ -1986,6 +2031,7 @@ test "Store: update run status" { allocator.free(run.id); if (run.idempotency_key) |ik| allocator.free(ik); allocator.free(run.status); + if (run.workflow_id) |wid| allocator.free(wid); allocator.free(run.workflow_json); allocator.free(run.input_json); allocator.free(run.callbacks_json); @@ -2009,9 +2055,14 @@ test "Store: get active runs" { allocator.free(r.id); if (r.idempotency_key) |ik| allocator.free(ik); allocator.free(r.status); + if 
(r.workflow_id) |wid| allocator.free(wid); allocator.free(r.workflow_json); allocator.free(r.input_json); allocator.free(r.callbacks_json); + if (r.error_text) |et| allocator.free(et); + if (r.state_json) |sj| allocator.free(sj); + if (r.config_json) |cj| allocator.free(cj); + if (r.parent_run_id) |pid| allocator.free(pid); } allocator.free(active); } @@ -2460,6 +2511,7 @@ test "run state management" { allocator.free(run.id); if (run.idempotency_key) |ik| allocator.free(ik); allocator.free(run.status); + if (run.workflow_id) |wid| allocator.free(wid); allocator.free(run.workflow_json); allocator.free(run.input_json); allocator.free(run.callbacks_json); @@ -2478,6 +2530,7 @@ test "run state management" { allocator.free(run2.id); if (run2.idempotency_key) |ik| allocator.free(ik); allocator.free(run2.status); + if (run2.workflow_id) |wid| allocator.free(wid); allocator.free(run2.workflow_json); allocator.free(run2.input_json); allocator.free(run2.callbacks_json); @@ -2485,6 +2538,7 @@ test "run state management" { if (run2.state_json) |sj| allocator.free(sj); } try std.testing.expectEqualStrings("r2", run2.id); + try std.testing.expectEqualStrings("wf1", run2.workflow_id.?); // Update run state try s.updateRunState("r1", "{\"counter\":42}"); @@ -2501,6 +2555,7 @@ test "run state management" { allocator.free(forked.id); if (forked.idempotency_key) |ik| allocator.free(ik); allocator.free(forked.status); + if (forked.workflow_id) |wid| allocator.free(wid); allocator.free(forked.workflow_json); allocator.free(forked.input_json); allocator.free(forked.callbacks_json); diff --git a/src/types.zig b/src/types.zig index 07dac57..b4dd51a 100644 --- a/src/types.zig +++ b/src/types.zig @@ -147,6 +147,7 @@ pub const RunRow = struct { id: []const u8, idempotency_key: ?[]const u8, status: []const u8, + workflow_id: ?[]const u8 = null, workflow_json: []const u8, input_json: []const u8, callbacks_json: []const u8, From 0510d629b0b79f1a042842861f5fe24ef73bbf02 Mon Sep 17 00:00:00 2001 
From: Igor Somov Date: Sat, 14 Mar 2026 08:32:42 -0300 Subject: [PATCH 44/55] Accept replay checkpoint_id alias --- src/api.zig | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/src/api.zig b/src/api.zig index 65343d7..d88f3ab 100644 --- a/src/api.zig +++ b/src/api.zig @@ -1515,7 +1515,9 @@ fn handleForkRun(ctx: *Context, body: []const u8) HttpResponse { // ── Replay Handler ────────────────────────────────────────────────── fn handleReplayRun(ctx: *Context, run_id: []const u8, body: []const u8) HttpResponse { - // Parse from_checkpoint_id from body + // Parse replay checkpoint ID. Accept both the canonical + // `from_checkpoint_id` field and the older `checkpoint_id` alias so + // existing clients keep working. const parsed = std.json.parseFromSlice(std.json.Value, ctx.allocator, body, .{}) catch { return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"invalid JSON body\"}}"); }; @@ -1526,8 +1528,8 @@ fn handleReplayRun(ctx: *Context, run_id: []const u8, body: []const u8) HttpResp } const obj = parsed.value.object; - const checkpoint_id = getJsonString(obj, "from_checkpoint_id") orelse { - return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"missing required field: from_checkpoint_id\"}}"); + const checkpoint_id = getJsonString(obj, "from_checkpoint_id") orelse getJsonString(obj, "checkpoint_id") orelse { + return jsonResponse(400, "{\"error\":{\"code\":\"bad_request\",\"message\":\"missing required field: from_checkpoint_id or checkpoint_id\"}}"); }; // Load checkpoint @@ -2667,6 +2669,32 @@ test "API: replay run from checkpoint" { } } +test "API: replay run accepts checkpoint_id alias" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + try store.createRunWithState("r1", null, "{\"nodes\":{}}", "{}", "{\"x\":1}"); 
+ try store.updateRunStatus("r1", "completed", null); + try store.createCheckpoint("cp1", "r1", "step_a", null, "{\"x\":1}", "[\"step_a\"]", 1, null); + + var ctx = Context{ + .store = &store, + .allocator = arena.allocator(), + }; + + const body = + \\{"checkpoint_id":"cp1"} + ; + + const resp = handleRequest(&ctx, "POST", "/runs/r1/replay", body); + try std.testing.expectEqual(@as(u16, 200), resp.status_code); + try std.testing.expect(std.mem.indexOf(u8, resp.body, "replayed_from_checkpoint") != null); +} + test "API: replay run rejects wrong checkpoint" { const allocator = std.testing.allocator; var store = try Store.init(allocator, ":memory:"); From ff43f14f1df96557e5f6d91a7903d3e4928c35fe Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 08:32:51 -0300 Subject: [PATCH 45/55] Honor workflow output contracts in engine --- src/engine.zig | 132 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 124 insertions(+), 8 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index 6dac20e..9c879e0 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -1269,10 +1269,10 @@ pub const Engine = struct { processStreamMessages(hub, alloc, run_row.id, step_id, node_type, final_output); } - // Build state_updates from output - // Try parsing as JSON with "state_updates" field, otherwise wrap output in "output" key - const state_updates = extractStateUpdates(alloc, final_output) orelse - try std.fmt.allocPrint(alloc, "{{\"output\":{s}}}", .{try jsonStringify(alloc, final_output)}); + // Build state_updates from output. Prefer explicit state_updates + // from the worker, otherwise honor node-level output_key / + // output_mapping before falling back to the legacy "output" key. 
+ const state_updates = try buildTaskStateUpdates(alloc, node_json, final_output); // Extract goto targets from output (command primitive) const goto_targets = extractGotoTargets(alloc, final_output); @@ -1382,9 +1382,9 @@ pub const Engine = struct { // ── executeSendNode ────────────────────────────────────────────── fn executeSendNode(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, node_name: []const u8, node_json: []const u8, state_json: []const u8) !SendNodeResult { - // Read items_from state path - const items_path = getNodeField(alloc, node_json, "items_from") orelse { - log.warn("send node {s} missing items_from", .{node_name}); + // Read items_key state path, with items_from kept as a legacy alias. + const items_path = getSendItemsPath(alloc, node_json) orelse { + log.warn("send node {s} missing items_key/items_from", .{node_name}); return SendNodeResult{ .state_updates = null }; }; @@ -1486,7 +1486,8 @@ pub const Engine = struct { // Build state_updates from collected results const results_json = try serializeStringArray(alloc, results.items); - const state_updates = try std.fmt.allocPrint(alloc, "{{\"send_results\":{s}}}", .{results_json}); + const output_key = getNodeField(alloc, node_json, "output_key") orelse "send_results"; + const state_updates = try std.fmt.allocPrint(alloc, "{{\"{s}\":{s}}}", .{ output_key, results_json }); // Create parent step record const step_id_buf = ids.generateId(); @@ -1839,6 +1840,66 @@ fn jsonStringify(alloc: std.mem.Allocator, s: []const u8) ![]const u8 { return json.Stringify.valueAlloc(alloc, s, .{}); } +/// Resolve the state path used by a send node. `items_key` is the canonical +/// field; `items_from` is accepted as a compatibility alias. +fn getSendItemsPath(alloc: std.mem.Allocator, node_json: []const u8) ?[]const u8 { + return getNodeField(alloc, node_json, "items_key") orelse + getNodeField(alloc, node_json, "items_from"); +} + +/// Build the state update payload for a task/agent node result. 
+/// +/// Precedence: +/// 1. explicit worker-provided `state_updates` +/// 2. node `output_key` / `output_mapping` +/// 3. legacy fallback to `{"output": "..."}` +fn buildTaskStateUpdates(alloc: std.mem.Allocator, node_json: []const u8, output: []const u8) ![]const u8 { + if (extractStateUpdates(alloc, output)) |updates| { + return updates; + } + + const output_key = getNodeField(alloc, node_json, "output_key"); + const output_mapping_json = getNodeObjectField(alloc, node_json, "output_mapping"); + if (output_key == null and output_mapping_json == null) { + return std.fmt.allocPrint(alloc, "{{\"output\":{s}}}", .{try jsonStringify(alloc, output)}); + } + + var arena = std.heap.ArenaAllocator.init(alloc); + defer arena.deinit(); + const arena_alloc = arena.allocator(); + + var result = json.ObjectMap.init(arena_alloc); + const parsed_output = json.parseFromSlice(json.Value, arena_alloc, output, .{}) catch null; + + if (output_key) |key| { + if (parsed_output) |parsed| { + try result.put(key, parsed.value); + } else { + try result.put(key, .{ .string = output }); + } + } + + if (output_mapping_json) |mapping_json| { + const parsed_mapping = json.parseFromSlice(json.Value, arena_alloc, mapping_json, .{}) catch null; + if (parsed_mapping) |mapping| { + if (mapping.value == .object and parsed_output != null) { + var it = mapping.value.object.iterator(); + while (it.next()) |entry| { + if (entry.value_ptr.* != .string) continue; + const source_path = entry.value_ptr.string; + const raw_val = state_mod.getStateValue(arena_alloc, output, source_path) catch null; + if (raw_val) |value_json| { + const parsed_value = json.parseFromSlice(json.Value, arena_alloc, value_json, .{}) catch continue; + try result.put(entry.key_ptr.*, parsed_value.value); + } + } + } + } + } + + return serializeJsonValue(alloc, .{ .object = result }); +} + /// Serialize completed_nodes set to JSON array. 
fn serializeCompletedNodes(alloc: std.mem.Allocator, completed_nodes: *std.StringHashMap(void)) ![]const u8 { var arr: std.ArrayListUnmanaged([]const u8) = .empty; @@ -2809,6 +2870,61 @@ test "extractStateUpdates returns null for plain text" { try std.testing.expect(result == null); } +test "buildTaskStateUpdates uses output_key for plain text output" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const node = + \\{"type":"task","output_key":"plan"} + ; + const result = try buildTaskStateUpdates(arena.allocator(), node, "draft plan"); + try std.testing.expectEqualStrings("{\"plan\":\"draft plan\"}", result); +} + +test "buildTaskStateUpdates applies output_mapping from JSON output" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const node = + \\{"type":"task","output_key":"review_result","output_mapping":{"grade":"grade","feedback":"details.feedback"}} + ; + const output = + \\{"grade":"approve","details":{"feedback":"looks good"}} + ; + const result = try buildTaskStateUpdates(arena.allocator(), node, output); + try std.testing.expect(std.mem.indexOf(u8, result, "\"review_result\":{\"grade\":\"approve\"") != null); + try std.testing.expect(std.mem.indexOf(u8, result, "\"grade\":\"approve\"") != null); + try std.testing.expect(std.mem.indexOf(u8, result, "\"feedback\":\"looks good\"") != null); +} + +test "getSendItemsPath prefers canonical items_key" { + const allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const node = + \\{"type":"send","items_key":"state.files","items_from":"state.legacy"} + ; + const result = getSendItemsPath(arena.allocator(), node); + try std.testing.expect(result != null); + try std.testing.expectEqualStrings("state.files", result.?); +} + +test "getSendItemsPath accepts legacy items_from alias" { + const 
allocator = std.testing.allocator; + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const node = + \\{"type":"send","items_from":"state.files"} + ; + const result = getSendItemsPath(arena.allocator(), node); + try std.testing.expect(result != null); + try std.testing.expectEqualStrings("state.files", result.?); +} + test "extractGotoTargets: string target" { const allocator = std.testing.allocator; var arena = std.heap.ArenaAllocator.init(allocator); From 2b74774970ebdb4d24eddcff48709d4272cab333 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 08:43:41 -0300 Subject: [PATCH 46/55] Add graph store read/write support --- src/engine.zig | 274 ++++++++++++++++++++++++++++++++++------- src/templates.zig | 116 +++++++++++++++-- src/tracker_client.zig | 94 +++++++++++++- 3 files changed, 430 insertions(+), 54 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index 9c879e0..db2c342 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -36,6 +36,7 @@ const metrics_mod = @import("metrics.zig"); const async_dispatch = @import("async_dispatch.zig"); const state_mod = @import("state.zig"); const sse_mod = @import("sse.zig"); +const tracker_client = @import("tracker_client.zig"); const workflow_loader = @import("workflow_loader.zig"); // ── Structured Events ──────────────────────────────────────────────── @@ -105,6 +106,15 @@ const max_nodes_per_tick: u32 = 1000; /// Maximum inline subgraph recursion depth. 
const max_subgraph_depth: u32 = 10; +const StoreWriter = *const fn ( + alloc: std.mem.Allocator, + base_url: []const u8, + api_token: ?[]const u8, + namespace: []const u8, + key: []const u8, + value_json: []const u8, +) anyerror!void; + // ── Engine ─────────────────────────────────────────────────────────── pub const RuntimeConfig = struct { @@ -137,6 +147,8 @@ pub const Engine = struct { sse_hub: ?*sse_mod.SseHub = null, workflow_watcher: ?*workflow_loader.WorkflowWatcher = null, rate_limits: std.StringHashMap(RateLimitInfo), + store_fetcher: templates.StoreFetcher, + store_writer: StoreWriter, config_valid: bool = false, last_config_check_ms: i64 = 0, @@ -156,6 +168,8 @@ pub const Engine = struct { .sse_hub = null, .workflow_watcher = null, .rate_limits = std.StringHashMap(RateLimitInfo).init(allocator), + .store_fetcher = templates.fetchStoreValueHttp, + .store_writer = putStoreValueViaHttp, .config_valid = false, .last_config_check_ms = 0, }; @@ -391,9 +405,9 @@ pub const Engine = struct { // 2d. Collect deferred nodes (Gap 6) const deferred_nodes = collectDeferredNodes(alloc, workflow_json); - // 2c. Get tracker URL for reconciliation - const tracker_url = getWorkflowField(alloc, workflow_json, "tracker_url"); - const task_id = getWorkflowField(alloc, workflow_json, "task_id"); + // 2c. Get tracker URL / task id for reconciliation and store access. + const tracker_url = getRuntimeStringSetting(alloc, current_state, workflow_json, &.{ "tracker_url", "nulltickets_url" }); + const task_id = getRuntimeStringSetting(alloc, current_state, workflow_json, &.{"task_id"}); // 3. 
Get completed nodes from latest checkpoint var completed_nodes = std.StringHashMap(void).init(alloc); @@ -528,10 +542,10 @@ pub const Engine = struct { // Gap 6: Execute deferred nodes before completing for (deferred_nodes) |deferred_name| { if (completed_nodes.get(deferred_name) != null) continue; - + const def_node_json = getNodeJson(alloc, workflow_json, deferred_name) orelse continue; const def_node_type = getNodeField(alloc, def_node_json, "type") orelse "task"; - + if (std.mem.eql(u8, def_node_type, "transform")) { const def_updates = getNodeField(alloc, def_node_json, "updates") orelse "{}"; const def_schema = cached_schema_json; @@ -550,15 +564,15 @@ pub const Engine = struct { else => {}, } } - + try completed_nodes.put(try alloc.dupe(u8, deferred_name), {}); log.info("deferred node {s} completed for run {s}", .{ deferred_name, run_row.id }); } - + // Mark __end__ as completed try completed_nodes.put("__end__", {}); version += 1; - + // Save checkpoint const cp_id_buf = ids.generateId(); const cp_id = try alloc.dupe(u8, &cp_id_buf); @@ -577,7 +591,7 @@ pub const Engine = struct { log.info("run {s} completed", .{run_row.id}); return; } - + // Breakpoint: interrupt_before check if (isInBreakpointList(node_name, interrupt_before)) { log.info("breakpoint interrupt_before at node {s} for run {s}", .{ node_name, run_row.id }); @@ -597,17 +611,17 @@ pub const Engine = struct { callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.interrupted", run_row.id, null, "{}", self.metrics); return; } - + // Get node definition from workflow const node_json = getNodeJson(alloc, workflow_json, node_name) orelse { log.err("node {s} not found in workflow for run {s}", .{ node_name, run_row.id }); try self.store.updateRunStatus(run_row.id, "failed", "node not found in workflow"); return; }; - + // Get node type const node_type = getNodeField(alloc, node_json, "type") orelse "task"; - + // Execute based on type if (std.mem.eql(u8, node_type, "route")) { // Route: evaluate 
routing logic, no worker dispatch @@ -616,7 +630,7 @@ pub const Engine = struct { try route_results.put(try alloc.dupe(u8, node_name), rv); } try completed_nodes.put(try alloc.dupe(u8, node_name), {}); - + // Create step record const step_id_buf = ids.generateId(); const step_id = try alloc.dupe(u8, &step_id_buf); @@ -624,19 +638,19 @@ pub const Engine = struct { const route_output = try std.fmt.allocPrint(alloc, "{{\"route\":\"{s}\"}}", .{result.route_value orelse "default"}); try self.store.updateStepStatus(step_id, "completed", null, route_output, null, 1); try self.store.insertEvent(run_row.id, step_id, "step.completed", route_output); - + log.info("route node {s} -> {s}", .{ node_name, result.route_value orelse "default" }); } else if (std.mem.eql(u8, node_type, "interrupt")) { // Interrupt: save checkpoint, set run to interrupted try completed_nodes.put(try alloc.dupe(u8, node_name), {}); version += 1; - + const step_id_buf = ids.generateId(); const step_id = try alloc.dupe(u8, &step_id_buf); try self.store.insertStep(step_id, run_row.id, node_name, "interrupt", "completed", "{}", 1, null, null, null); try self.store.updateStepStatus(step_id, "completed", null, "{\"interrupted\":true}", null, 1); try self.store.insertEvent(run_row.id, step_id, "step.completed", "{}"); - + const cp_id_buf = ids.generateId(); const cp_id = try alloc.dupe(u8, &cp_id_buf); const cn_json = try serializeCompletedNodes(alloc, &completed_nodes); @@ -655,10 +669,10 @@ pub const Engine = struct { } else if (std.mem.eql(u8, node_type, "transform")) { // Transform: apply static updates, no worker dispatch const state_updates = getNodeField(alloc, node_json, "updates") orelse "{}"; - + // Get schema from workflow const schema_json = cached_schema_json; - + // Apply updates via reducers const new_state = state_mod.applyUpdates(alloc, running_state, state_updates, schema_json) catch |err| { log.err("transform node {s} failed to apply updates: {}", .{ node_name, err }); @@ -666,26 +680,34 @@ 
pub const Engine = struct { return; }; running_state = new_state; - + + if (getNodeField(alloc, node_json, "store_updates")) |store_updates_json| { + self.applyStoreUpdates(alloc, workflow_json, running_state, store_updates_json) catch |err| { + log.err("transform node {s} failed to write store updates: {}", .{ node_name, err }); + try self.store.updateRunStatus(run_row.id, "failed", "transform store update failed"); + return; + }; + } + try completed_nodes.put(try alloc.dupe(u8, node_name), {}); - + // Create step record const step_id_buf = ids.generateId(); const step_id = try alloc.dupe(u8, &step_id_buf); try self.store.insertStep(step_id, run_row.id, node_name, "transform", "completed", "{}", 1, null, null, null); try self.store.updateStepStatus(step_id, "completed", null, state_updates, null, 1); try self.store.insertEvent(run_row.id, step_id, "step.completed", "{}"); - + log.info("transform node {s} completed", .{node_name}); } else if (std.mem.eql(u8, node_type, "task") or std.mem.eql(u8, node_type, "agent")) { // Gap 7: Inject __meta managed values const state_with_meta = injectMeta(alloc, running_state, run_row.id, node_name, version, @as(i64, @intCast(max_iterations))) catch running_state; - + // Gap 3: Check cache before executing const cache_ttl = parseCacheTtlMs(alloc, node_json); if (cache_ttl != null) cache_check: { const pt_c = getNodeField(alloc, node_json, "prompt_template") orelse break :cache_check; - const rnd_c = templates.renderTemplate(alloc, pt_c, state_with_meta, run_row.input_json, null) catch break :cache_check; + const rnd_c = self.renderWorkflowTemplate(alloc, workflow_json, pt_c, state_with_meta, run_row.input_json, null) catch break :cache_check; const ck_c = computeCacheKey(alloc, node_name, rnd_c) catch break :cache_check; const cached = self.store.getCachedResult(alloc, ck_c) catch break :cache_check; if (cached) |cached_upd| { @@ -707,7 +729,7 @@ pub const Engine = struct { continue; } } - + // Gap 2: Non-blocking retry — check 
for pending retry step const max_attempts = parseRetryMaxAttempts(alloc, node_json) orelse 1; const retry_init_ms = parseRetryInitialMs(alloc, node_json) orelse 500; @@ -789,12 +811,12 @@ pub const Engine = struct { }, else => result, }; - + switch (result_after_retry) { .completed => |cr| { // Gap 7: Strip __meta (don't persist) running_state = stripMeta(alloc, running_state) catch running_state; - + if (cr.state_updates) |updates| { const schema_json = cached_schema_json; const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { @@ -803,28 +825,28 @@ pub const Engine = struct { return; }; running_state = new_state; - + // Gap 3: Store result in cache if (cache_ttl) |ttl| cache_store: { const pt_s = getNodeField(alloc, node_json, "prompt_template") orelse break :cache_store; - const rnd_s = templates.renderTemplate(alloc, pt_s, state_with_meta, run_row.input_json, null) catch break :cache_store; + const rnd_s = self.renderWorkflowTemplate(alloc, workflow_json, pt_s, state_with_meta, run_row.input_json, null) catch break :cache_store; const ck_s = computeCacheKey(alloc, node_name, rnd_s) catch break :cache_store; self.store.setCachedResult(ck_s, node_name, updates, ttl) catch |cerr| { log.warn("failed to cache result for node {s}: {}", .{ node_name, cerr }); }; } - + // Gap 4: Save as pending write self.store.savePendingWrite(run_row.id, node_name, node_name, updates) catch |perr| { log.warn("failed to save pending write for node {s}: {}", .{ node_name, perr }); }; } - + // Apply UI messages to state (__ui_messages key) if (cr.raw_output) |raw_out| { running_state = applyUiMessagesToState(alloc, running_state, raw_out) catch running_state; } - + // Consume pending injections const injections = self.store.consumePendingInjections(alloc, run_row.id, node_name) catch &.{}; for (injections) |injection| { @@ -835,9 +857,9 @@ pub const Engine = struct { }; running_state = new_state; } - + try completed_nodes.put(try alloc.dupe(u8, 
node_name), {}); - + if (cr.goto_targets) |targets| { var valid_targets: std.ArrayListUnmanaged([]const u8) = .empty; for (targets) |target| { @@ -852,10 +874,10 @@ pub const Engine = struct { log.info("task node {s} goto: {d} targets", .{ node_name, goto_override.?.len }); } } - + // Gap 4: Clear pending writes self.store.clearPendingWrites(run_row.id) catch {}; - + log.info("task node {s} completed for run {s}", .{ node_name, run_row.id }); }, .async_pending => { @@ -903,7 +925,7 @@ pub const Engine = struct { } else if (std.mem.eql(u8, node_type, "subgraph")) { // Subgraph: execute child workflow inline const result = try self.executeSubgraphNode(alloc, run_row, node_name, node_json, running_state, recursion_depth); - + switch (result) { .completed => |cr| { if (cr.state_updates) |updates| { @@ -946,7 +968,7 @@ pub const Engine = struct { try self.store.updateRunStatus(run_row.id, "failed", "unknown node type"); return; } - + // Breakpoint: interrupt_after check if (isInBreakpointList(node_name, interrupt_after)) { log.info("breakpoint interrupt_after at node {s} for run {s}", .{ node_name, run_row.id }); @@ -967,7 +989,7 @@ pub const Engine = struct { callbacks.fireCallbacks(alloc, run_row.callbacks_json, "run.interrupted", run_row.id, null, "{}", self.metrics); return; } - + // Reconciliation: check tracker task status between steps if (tracker_url != null and task_id != null) { if (!reconcileWithTracker(alloc, tracker_url.?, task_id.?)) { @@ -978,11 +1000,11 @@ pub const Engine = struct { return; } } - + // Strip ephemeral keys before checkpoint persistence const schema_for_eph = cached_schema_json; running_state = state_mod.stripEphemeralKeys(alloc, running_state, schema_for_eph) catch running_state; - + // Save checkpoint after each node made_progress = true; version += 1; @@ -995,10 +1017,10 @@ pub const Engine = struct { try self.store.incrementCheckpointCount(run_row.id); try self.store.updateRunState(run_row.id, running_state); latest_checkpoint_id = 
cp_id; - + // Emit structured checkpoint event self.emitEvent(alloc, .checkpoint_created, run_row.id, null, node_name, null); - + // Broadcast rich SSE events for all modes if (self.sse_hub) |hub| { const node_json_for_sse = getNodeJson(alloc, workflow_json, node_name); @@ -1075,8 +1097,8 @@ pub const Engine = struct { return TaskNodeResult{ .completed = .{ .state_updates = null } }; }; - // 2. Render prompt using new templates.renderTemplate - const rendered_prompt = templates.renderTemplate(alloc, prompt_template, state_json, run_row.input_json, null) catch |err| { + // 2. Render prompt with graph template interpolation and optional store access. + const rendered_prompt = self.renderWorkflowTemplate(alloc, run_row.workflow_json, prompt_template, state_json, run_row.input_json, null) catch |err| { log.err("template render failed for node {s}: {}", .{ node_name, err }); return TaskNodeResult{ .failed = "template render failed" }; }; @@ -1204,7 +1226,7 @@ pub const Engine = struct { } // Render continuation prompt - const cont_rendered = templates.renderTemplate(alloc, continuation_prompt.?, state_json, run_row.input_json, null) catch break; + const cont_rendered = self.renderWorkflowTemplate(alloc, run_row.workflow_json, continuation_prompt.?, state_json, run_row.input_json, null) catch break; const cont_result = try dispatch.dispatchStep( alloc, @@ -1446,7 +1468,7 @@ pub const Engine = struct { const prompt_template = getNodeField(alloc, target_json, "prompt_template") orelse continue; // Render with item - const rendered = templates.renderTemplate(alloc, prompt_template, state_json, run_row.input_json, item_str) catch continue; + const rendered = self.renderWorkflowTemplate(alloc, run_row.workflow_json, prompt_template, state_json, run_row.input_json, item_str) catch continue; const selected_worker = try dispatch.selectWorker(alloc, worker_infos.items, required_tags); if (selected_worker == null) { @@ -1499,6 +1521,56 @@ pub const Engine = struct { return 
SendNodeResult{ .state_updates = state_updates }; } + fn renderWorkflowTemplate( + self: *Engine, + alloc: std.mem.Allocator, + workflow_json: []const u8, + template: []const u8, + state_json: []const u8, + input_json: ?[]const u8, + item_json: ?[]const u8, + ) ![]const u8 { + const store_access = self.resolveRuntimeStoreAccess(alloc, workflow_json, state_json); + return templates.renderTemplateWithStore(alloc, template, state_json, input_json, item_json, store_access); + } + + fn resolveRuntimeStoreAccess(self: *Engine, alloc: std.mem.Allocator, workflow_json: []const u8, state_json: []const u8) ?templates.StoreAccess { + const base_url = getRuntimeStringSetting(alloc, state_json, workflow_json, &.{ "tracker_url", "nulltickets_url" }) orelse return null; + const api_token = getRuntimeStringSetting(alloc, state_json, workflow_json, &.{ "tracker_api_token", "nulltickets_api_token" }); + return .{ + .base_url = base_url, + .api_token = api_token, + .fetcher = self.store_fetcher, + }; + } + + fn applyStoreUpdates(self: *Engine, alloc: std.mem.Allocator, workflow_json: []const u8, state_json: []const u8, store_updates_json: []const u8) !void { + const access = self.resolveRuntimeStoreAccess(alloc, workflow_json, state_json) orelse return error.StoreNotConfigured; + const parsed = try json.parseFromSlice(json.Value, alloc, store_updates_json, .{}); + + switch (parsed.value) { + .object => try self.applySingleStoreUpdate(alloc, access, state_json, parsed.value.object), + .array => |arr| { + for (arr.items) |item| { + if (item != .object) return error.InvalidStoreUpdates; + try self.applySingleStoreUpdate(alloc, access, state_json, item.object); + } + }, + else => return error.InvalidStoreUpdates, + } + } + + fn applySingleStoreUpdate(self: *Engine, alloc: std.mem.Allocator, access: templates.StoreAccess, state_json: []const u8, obj: json.ObjectMap) !void { + const namespace_val = obj.get("namespace") orelse return error.InvalidStoreUpdates; + const key_val = 
obj.get("key") orelse return error.InvalidStoreUpdates; + const value_val = obj.get("value") orelse return error.InvalidStoreUpdates; + + if (namespace_val != .string or key_val != .string) return error.InvalidStoreUpdates; + + const value_json = try resolveStoreUpdateValue(alloc, state_json, value_val); + try self.store_writer(alloc, access.base_url, access.api_token, namespace_val.string, key_val.string, value_json); + } + // ── Async polling ──────────────────────────────────────────────── fn pollAsyncTaskStep(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, step: types.StepRow) !void { @@ -1803,6 +1875,78 @@ fn getWorkflowField(alloc: std.mem.Allocator, workflow_json: []const u8, field: return serializeJsonValue(alloc, val) catch null; } +fn getRuntimeStringSetting( + alloc: std.mem.Allocator, + state_json: []const u8, + workflow_json: []const u8, + field_names: []const []const u8, +) ?[]const u8 { + for (field_names) |field_name| { + if (getConfigString(alloc, state_json, field_name)) |value| return value; + } + for (field_names) |field_name| { + if (getWorkflowField(alloc, workflow_json, field_name)) |value| return value; + } + return null; +} + +fn getConfigString(alloc: std.mem.Allocator, state_json: []const u8, field_name: []const u8) ?[]const u8 { + const path = std.fmt.allocPrint(alloc, "state.__config.{s}", .{field_name}) catch return null; + defer alloc.free(path); + + const raw = state_mod.getStateValue(alloc, state_json, path) catch return null; + const raw_value = raw orelse return null; + defer alloc.free(raw_value); + + const parsed = json.parseFromSlice(json.Value, alloc, raw_value, .{}) catch return null; + defer parsed.deinit(); + if (parsed.value != .string) return null; + return alloc.dupe(u8, parsed.value.string) catch null; +} + +fn resolveStoreUpdateValue(alloc: std.mem.Allocator, state_json: []const u8, value: json.Value) ![]const u8 { + if (value == .string and std.mem.startsWith(u8, value.string, "state.")) { + const raw 
= try state_mod.getStateValue(alloc, state_json, value.string); + return raw orelse try alloc.dupe(u8, "null"); + } + return serializeJsonValue(alloc, value); +} + +fn putStoreValueViaHttp( + alloc: std.mem.Allocator, + base_url: []const u8, + api_token: ?[]const u8, + namespace: []const u8, + key: []const u8, + value_json: []const u8, +) !void { + var client = tracker_client.TrackerClient.init(alloc, base_url, api_token); + const ok = try client.storePutValue(namespace, key, value_json); + if (!ok) return error.StoreWriteFailed; +} + +var test_store_write_base_url: []const u8 = ""; +var test_store_write_api_token: ?[]const u8 = null; +var test_store_write_namespace: []const u8 = ""; +var test_store_write_key: []const u8 = ""; +var test_store_write_value_json: []const u8 = ""; + +fn mockStoreWriter( + alloc: std.mem.Allocator, + base_url: []const u8, + api_token: ?[]const u8, + namespace: []const u8, + key: []const u8, + value_json: []const u8, +) !void { + _ = alloc; + test_store_write_base_url = base_url; + test_store_write_api_token = api_token; + test_store_write_namespace = namespace; + test_store_write_key = key; + test_store_write_value_json = value_json; +} + /// Get worker tags from node definition. 
fn getNodeTags(alloc: std.mem.Allocator, node_json: []const u8) []const []const u8 { const parsed = json.parseFromSlice(json.Value, alloc, node_json, .{}) catch return &.{}; @@ -3162,6 +3306,44 @@ test "engine: configurable runs inject __config" { } } +test "engine: transform store_updates writes updated state value using config tracker settings" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + test_store_write_base_url = ""; + test_store_write_api_token = null; + test_store_write_namespace = ""; + test_store_write_key = ""; + test_store_write_value_json = ""; + + const wf = + \\{"nodes":{"save":{"type":"transform","updates":"{\"review_result\":{\"grade\":\"approved\"}}","store_updates":{"namespace":"project_context","key":"latest_review","value":"state.review_result"}}},"edges":[["__start__","save"],["save","__end__"]],"schema":{"review_result":{"type":"object","reducer":"last_value"},"__config":{"type":"object","reducer":"last_value"}}} + ; + + try store.createRunWithState("r1", null, wf, "{}", "{}"); + try store.setConfigJson("r1", "{\"tracker_url\":\"http://tickets.test\",\"tracker_api_token\":\"secret-token\"}"); + try store.updateRunStatus("r1", "running", null); + + var engine = Engine.init(&store, allocator, 500); + engine.store_writer = mockStoreWriter; + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const run_row = (try store.getRun(arena.allocator(), "r1")).?; + try engine.processRun(arena.allocator(), run_row); + + const updated_run = (try store.getRun(arena.allocator(), "r1")).?; + try std.testing.expectEqualStrings("completed", updated_run.status); + try std.testing.expectEqualStrings("http://tickets.test", test_store_write_base_url); + try std.testing.expect(test_store_write_api_token != null); + try std.testing.expectEqualStrings("secret-token", test_store_write_api_token.?); + try std.testing.expectEqualStrings("project_context", 
test_store_write_namespace); + try std.testing.expectEqualStrings("latest_review", test_store_write_key); + try std.testing.expectEqualStrings("{\"grade\":\"approved\"}", test_store_write_value_json); +} + test "getWorkflowVersion: extracts version" { const allocator = std.testing.allocator; var arena = std.heap.ArenaAllocator.init(allocator); diff --git a/src/templates.zig b/src/templates.zig index 36d8205..0332f02 100644 --- a/src/templates.zig +++ b/src/templates.zig @@ -13,13 +13,13 @@ /// - `{{state.X.Y}}` -- nested paths with optional [-1] array indexing /// - `{{input.X}}` -- look up key X in the workflow input JSON /// - `{{item}}` -- current item string for send iterations +/// - `{{store.ns.key}}` -- fetch NullTickets store entry value /// /// Conditional blocks: /// - `{% if %}...{% endif %}` /// - `{% if %}...{% else %}...{% endif %}` /// Conditionals are processed before expression substitution. /// Truthiness: non-null, non-empty, not "false", not "null" string values are truthy. 
- const std = @import("std"); // ── Context ─────────────────────────────────────────────────────────── @@ -351,8 +351,34 @@ fn jsonValueToString(allocator: std.mem.Allocator, val: std.json.Value) RenderEr // ── New state-based template engine ─────────────────────────────────── const state_mod = @import("state.zig"); +const tracker_client = @import("tracker_client.zig"); const Allocator = std.mem.Allocator; +pub const StoreFetcher = *const fn ( + alloc: Allocator, + base_url: []const u8, + api_token: ?[]const u8, + namespace: []const u8, + key: []const u8, +) anyerror!?[]const u8; + +pub const StoreAccess = struct { + base_url: []const u8, + api_token: ?[]const u8 = null, + fetcher: StoreFetcher, +}; + +pub fn fetchStoreValueHttp( + alloc: Allocator, + base_url: []const u8, + api_token: ?[]const u8, + namespace: []const u8, + key: []const u8, +) !?[]const u8 { + var client = tracker_client.TrackerClient.init(alloc, base_url, api_token); + return client.storeGetValue(namespace, key); +} + /// Strip surrounding double quotes from a JSON string value. /// `"hello"` -> `hello`, `42` -> `42`, `[1,2]` -> `[1,2]` fn stripJsonQuotes(s: []const u8) []const u8 { @@ -378,6 +404,7 @@ fn resolveNewExpression( state_json: []const u8, input_json: ?[]const u8, item_json: ?[]const u8, + store_access: ?StoreAccess, ) ![]const u8 { if (std.mem.startsWith(u8, expr, "state.")) { // Use getStateValue which handles "state." 
prefix, nested paths, [-1] indexing @@ -457,6 +484,27 @@ fn resolveNewExpression( return alloc.dupe(u8, "") catch return error.OutOfMemory; } + if (std.mem.startsWith(u8, expr, "store.")) { + const access = store_access orelse return error.StoreNotConfigured; + const store_expr = expr["store.".len..]; + const dot = std.mem.indexOfScalar(u8, store_expr, '.') orelse return error.InvalidStoreExpression; + const namespace = store_expr[0..dot]; + const key = store_expr[dot + 1 ..]; + if (namespace.len == 0 or key.len == 0) return error.InvalidStoreExpression; + + const raw = try access.fetcher(alloc, access.base_url, access.api_token, namespace, key); + if (raw) |r| { + const stripped = stripJsonQuotes(r); + if (stripped.ptr != r.ptr or stripped.len != r.len) { + const result = alloc.dupe(u8, stripped) catch return error.OutOfMemory; + alloc.free(r); + return result; + } + return r; + } + return alloc.dupe(u8, "") catch return error.OutOfMemory; + } + // Unknown expression — return empty return alloc.dupe(u8, "") catch return error.OutOfMemory; } @@ -469,8 +517,9 @@ fn isNewTruthy( state_json: []const u8, input_json: ?[]const u8, item_json: ?[]const u8, + store_access: ?StoreAccess, ) bool { - const value = resolveNewExpression(alloc, expr, state_json, input_json, item_json) catch return false; + const value = resolveNewExpression(alloc, expr, state_json, input_json, item_json, store_access) catch return false; defer alloc.free(value); if (value.len == 0) return false; @@ -488,6 +537,7 @@ fn processNewConditionals( state_json: []const u8, input_json: ?[]const u8, item_json: ?[]const u8, + store_access: ?StoreAccess, ) ![]const u8 { var result: std.ArrayListUnmanaged(u8) = .empty; errdefer result.deinit(alloc); @@ -550,18 +600,18 @@ fn processNewConditionals( return error.OutOfMemory; } - const truthy = isNewTruthy(alloc, expr, state_json, input_json, item_json); + const truthy = isNewTruthy(alloc, expr, state_json, input_json, item_json, store_access); if (truthy) { 
const branch_end = else_start orelse endif_start.?; const branch = template[after_tag..branch_end]; - const processed = try processNewConditionals(alloc, branch, state_json, input_json, item_json); + const processed = try processNewConditionals(alloc, branch, state_json, input_json, item_json, store_access); defer alloc.free(processed); result.appendSlice(alloc, processed) catch return error.OutOfMemory; } else { if (else_end) |ee| { const branch = template[ee..endif_start.?]; - const processed = try processNewConditionals(alloc, branch, state_json, input_json, item_json); + const processed = try processNewConditionals(alloc, branch, state_json, input_json, item_json, store_access); defer alloc.free(processed); result.appendSlice(alloc, processed) catch return error.OutOfMemory; } @@ -601,9 +651,20 @@ pub fn renderTemplate( state_json: []const u8, input_json: ?[]const u8, item_json: ?[]const u8, +) ![]const u8 { + return renderTemplateWithStore(alloc, template, state_json, input_json, item_json, null); +} + +pub fn renderTemplateWithStore( + alloc: Allocator, + template: []const u8, + state_json: []const u8, + input_json: ?[]const u8, + item_json: ?[]const u8, + store_access: ?StoreAccess, ) ![]const u8 { // Phase 1: Process conditional blocks - const preprocessed = try processNewConditionals(alloc, template, state_json, input_json, item_json); + const preprocessed = try processNewConditionals(alloc, template, state_json, input_json, item_json, store_access); defer alloc.free(preprocessed); // Phase 2: Resolve {{expression}} substitutions @@ -621,7 +682,7 @@ pub fn renderTemplate( const raw_expr = preprocessed[after_open..close]; const expr = std.mem.trim(u8, raw_expr, " \t\n\r"); - const value = try resolveNewExpression(alloc, expr, state_json, input_json, item_json); + const value = try resolveNewExpression(alloc, expr, state_json, input_json, item_json, store_access); defer alloc.free(value); result.appendSlice(alloc, value) catch return error.OutOfMemory; @@ 
-685,6 +746,47 @@ test "template no interpolation" { try std.testing.expectEqualStrings("plain text", result); } +fn mockStoreFetcher( + alloc: Allocator, + base_url: []const u8, + api_token: ?[]const u8, + namespace: []const u8, + key: []const u8, +) !?[]const u8 { + _ = base_url; + _ = api_token; + if (std.mem.eql(u8, namespace, "prefs") and std.mem.eql(u8, key, "theme")) { + return try alloc.dupe(u8, "\"dark\""); + } + return null; +} + +test "template store interpolation" { + const alloc = std.testing.allocator; + const result = try renderTemplateWithStore( + alloc, + "Theme: {{store.prefs.theme}}", + "{}", + null, + null, + .{ + .base_url = "http://example.test", + .fetcher = mockStoreFetcher, + }, + ); + defer alloc.free(result); + + try std.testing.expectEqualStrings("Theme: dark", result); +} + +test "template store interpolation errors without store access" { + const alloc = std.testing.allocator; + try std.testing.expectError( + error.StoreNotConfigured, + renderTemplateWithStore(alloc, "Theme: {{store.prefs.theme}}", "{}", null, null, null), + ); +} + // ── Old template engine tests ───────────────────────────────────────── test "render literal text unchanged" { diff --git a/src/tracker_client.zig b/src/tracker_client.zig index 57b5c65..212e926 100644 --- a/src/tracker_client.zig +++ b/src/tracker_client.zig @@ -183,7 +183,8 @@ pub const TrackerClient = struct { const url = try std.fmt.allocPrint(self.allocator, "{s}/artifacts", .{self.base_url}); defer self.allocator.free(url); - const body = try std.fmt.allocPrint(self.allocator, + const body = try std.fmt.allocPrint( + self.allocator, "{{\"task_id\":{f},\"run_id\":{f},\"kind\":{f},\"uri\":{f},\"meta\":{s}}}", .{ std.json.fmt(task_id, .{}), @@ -226,6 +227,59 @@ pub const TrackerClient = struct { return result.body; } + pub fn storeGetValue(self: *TrackerClient, namespace: []const u8, key: []const u8) !?[]const u8 { + const namespace_enc = try encodePathSegment(self.allocator, namespace); + defer 
self.allocator.free(namespace_enc); + const key_enc = try encodePathSegment(self.allocator, key); + defer self.allocator.free(key_enc); + + const url = try std.fmt.allocPrint( + self.allocator, + "{s}/store/{s}/{s}", + .{ trimTrailingSlash(self.base_url), namespace_enc, key_enc }, + ); + defer self.allocator.free(url); + + const result = try self.httpRequest(url, .GET, null, null); + defer self.allocator.free(result.body); + + if (result.status_code == 404) return null; + if (result.status_code < 200 or result.status_code >= 300) return null; + + const parsed = std.json.parseFromSlice(std.json.Value, self.allocator, result.body, .{ + .allocate = .alloc_always, + .ignore_unknown_fields = true, + }) catch return null; + defer parsed.deinit(); + if (parsed.value != .object) return null; + + const value = parsed.value.object.get("value") orelse return null; + const value_json = try std.json.Stringify.valueAlloc(self.allocator, value, .{}); + return value_json; + } + + pub fn storePutValue(self: *TrackerClient, namespace: []const u8, key: []const u8, value_json: []const u8) !bool { + const namespace_enc = try encodePathSegment(self.allocator, namespace); + defer self.allocator.free(namespace_enc); + const key_enc = try encodePathSegment(self.allocator, key); + defer self.allocator.free(key_enc); + + const url = try std.fmt.allocPrint( + self.allocator, + "{s}/store/{s}/{s}", + .{ trimTrailingSlash(self.base_url), namespace_enc, key_enc }, + ); + defer self.allocator.free(url); + + const body = try std.fmt.allocPrint(self.allocator, "{{\"value\":{s}}}", .{value_json}); + defer self.allocator.free(body); + + const result = try self.httpRequest(url, .PUT, body, null); + defer self.allocator.free(result.body); + + return result.status_code >= 200 and result.status_code < 300; + } + fn httpRequest( self: *TrackerClient, url: []const u8, @@ -272,6 +326,36 @@ pub const TrackerClient = struct { } }; +fn trimTrailingSlash(url: []const u8) []const u8 { + if (url.len > 0 and 
url[url.len - 1] == '/') return url[0 .. url.len - 1]; + return url; +} + +fn encodePathSegment(allocator: std.mem.Allocator, value: []const u8) ![]const u8 { + var buf: std.ArrayListUnmanaged(u8) = .empty; + errdefer buf.deinit(allocator); + + for (value) |ch| { + if (isUnreserved(ch)) { + try buf.append(allocator, ch); + continue; + } + try buf.writer(allocator).print("%{X:0>2}", .{ch}); + } + + return try buf.toOwnedSlice(allocator); +} + +fn isUnreserved(ch: u8) bool { + return (ch >= 'A' and ch <= 'Z') or + (ch >= 'a' and ch <= 'z') or + (ch >= '0' and ch <= '9') or + ch == '-' or + ch == '_' or + ch == '.' or + ch == '~'; +} + fn parseTaskInfo(allocator: std.mem.Allocator, task_value: std.json.Value) !TaskInfo { if (task_value != .object) return error.InvalidTaskPayload; const obj = task_value.object; @@ -365,3 +449,11 @@ test "TrackerClient exposes optimistic transition support" { try std.testing.expect(@hasDecl(TrackerClient, "transition")); try std.testing.expect(@hasDecl(TrackerClient, "postArtifact")); } + +test "encodePathSegment percent-encodes reserved characters" { + const allocator = std.testing.allocator; + const encoded = try encodePathSegment(allocator, "team alpha/key"); + defer allocator.free(encoded); + + try std.testing.expectEqualStrings("team%20alpha%2Fkey", encoded); +} From e4d129d078ab0a2a3807bb381f72bce127dda72e Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 08:47:55 -0300 Subject: [PATCH 47/55] Document graph store memory behavior --- README.md | 16 ++++++++++++++++ .../2026-03-13-orchestration-gaps-design.md | 12 ++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 10d466e..7fda7e1 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,22 @@ This keeps the architecture modular, simpler to reason about, and easier to evol See additional integration docs in [`docs/`](./docs). 
+## Workflow Graph Features + +The orchestration graph runtime supports: + +- `task`, `agent`, `route`, `interrupt`, `send`, `transform`, and `subgraph` nodes +- run replay, checkpoint forking, breakpoint interrupts, and post-start state injection +- `send` fan-out with canonical `items_key` and configurable `output_key` +- task/agent output shaping via `output_key` and `output_mapping` +- template access to `state.*`, `input.*`, `item.*`, `config.*`, and `store..` +- `transform.store_updates` for writing durable workflow memory back to NullTickets + +Store-backed templates and `store_updates` require a NullTickets base URL. The +runtime resolves it from workflow fields such as `tracker_url` or from run config +(`config.tracker_url` / `config.tracker_api_token`), which are injected into +state as `__config`. + ## Config Location - Default config path: `~/.nullboiler/config.json` diff --git a/docs/superpowers/specs/2026-03-13-orchestration-gaps-design.md b/docs/superpowers/specs/2026-03-13-orchestration-gaps-design.md index 7d94355..414ca81 100644 --- a/docs/superpowers/specs/2026-03-13-orchestration-gaps-design.md +++ b/docs/superpowers/specs/2026-03-13-orchestration-gaps-design.md @@ -139,7 +139,13 @@ Response for GET: New template syntax: `{{store.namespace.key}}` — engine fetches from nulltickets Store API during prompt rendering. -New node type isn't needed — `task` nodes can read via template, and `transform` nodes can write via a new `store_updates` field: +Runtime resolution: + +- NullTickets base URL comes from workflow-level `tracker_url` / `nulltickets_url`, or from run config (`config.tracker_url`, surfaced as `state.__config.tracker_url`). +- Optional auth token comes from `tracker_api_token` / `nulltickets_api_token` on the workflow or run config. +- Missing store keys render as empty strings in templates. 
+ +New node type isn't needed — `task` nodes can read via template, and `transform` nodes can write via a `store_updates` field (single object or array of objects): ```json { @@ -155,7 +161,9 @@ New node type isn't needed — `task` nodes can read via template, and `transfor } ``` -Engine calls nulltickets `PUT /store/{namespace}/{key}` when `store_updates` is present. +`store_updates.value` can point at a state path such as `state.review_result`, or it can be inline JSON that will be written as-is. + +Engine calls nulltickets `PUT /store/{namespace}/{key}` after `updates` are applied, so writes can reference the node's freshly updated state. --- From 25222e31a487b5863981dd439c10ba0b48962fa1 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 08:57:34 -0300 Subject: [PATCH 48/55] Restrict tracker access to trusted config --- src/engine.zig | 107 ++++++++++++++++++++++++++++++++++++++++++++----- src/main.zig | 3 ++ 2 files changed, 99 insertions(+), 11 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index db2c342..5273b6d 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -149,6 +149,8 @@ pub const Engine = struct { rate_limits: std.StringHashMap(RateLimitInfo), store_fetcher: templates.StoreFetcher, store_writer: StoreWriter, + trusted_tracker_url: ?[]const u8 = null, + trusted_tracker_api_token: ?[]const u8 = null, config_valid: bool = false, last_config_check_ms: i64 = 0, @@ -170,6 +172,8 @@ pub const Engine = struct { .rate_limits = std.StringHashMap(RateLimitInfo).init(allocator), .store_fetcher = templates.fetchStoreValueHttp, .store_writer = putStoreValueViaHttp, + .trusted_tracker_url = null, + .trusted_tracker_api_token = null, .config_valid = false, .last_config_check_ms = 0, }; @@ -180,6 +184,11 @@ pub const Engine = struct { self.metrics = metrics; } + pub fn setTrustedTrackerAccess(self: *Engine, base_url: ?[]const u8, api_token: ?[]const u8) void { + self.trusted_tracker_url = base_url; + self.trusted_tracker_api_token = 
api_token; + } + pub fn stop(self: *Engine) void { self.running.store(false, .release); } @@ -405,8 +414,7 @@ pub const Engine = struct { // 2d. Collect deferred nodes (Gap 6) const deferred_nodes = collectDeferredNodes(alloc, workflow_json); - // 2c. Get tracker URL / task id for reconciliation and store access. - const tracker_url = getRuntimeStringSetting(alloc, current_state, workflow_json, &.{ "tracker_url", "nulltickets_url" }); + // 2c. Get task id for reconciliation. const task_id = getRuntimeStringSetting(alloc, current_state, workflow_json, &.{"task_id"}); // 3. Get completed nodes from latest checkpoint @@ -991,8 +999,8 @@ pub const Engine = struct { } // Reconciliation: check tracker task status between steps - if (tracker_url != null and task_id != null) { - if (!reconcileWithTracker(alloc, tracker_url.?, task_id.?)) { + if (self.trusted_tracker_url != null and task_id != null) { + if (!reconcileWithTracker(alloc, self.trusted_tracker_url.?, self.trusted_tracker_api_token, task_id.?)) { log.info("run {s} cancelled by reconciliation", .{run_row.id}); try self.store.updateRunStatus(run_row.id, "failed", "cancelled by tracker reconciliation"); try self.store.insertEvent(run_row.id, null, "run.failed", "{\"reason\":\"tracker_cancelled\"}"); @@ -1535,11 +1543,13 @@ pub const Engine = struct { } fn resolveRuntimeStoreAccess(self: *Engine, alloc: std.mem.Allocator, workflow_json: []const u8, state_json: []const u8) ?templates.StoreAccess { - const base_url = getRuntimeStringSetting(alloc, state_json, workflow_json, &.{ "tracker_url", "nulltickets_url" }) orelse return null; - const api_token = getRuntimeStringSetting(alloc, state_json, workflow_json, &.{ "tracker_api_token", "nulltickets_api_token" }); + _ = alloc; + _ = workflow_json; + _ = state_json; + const base_url = self.trusted_tracker_url orelse return null; return .{ .base_url = base_url, - .api_token = api_token, + .api_token = self.trusted_tracker_api_token, .fetcher = self.store_fetcher, }; } @@ 
-1925,6 +1935,28 @@ fn putStoreValueViaHttp( if (!ok) return error.StoreWriteFailed; } +fn encodePathSegment(allocator: std.mem.Allocator, value: []const u8) ![]const u8 { + var buf: std.ArrayListUnmanaged(u8) = .empty; + errdefer buf.deinit(allocator); + + for (value) |byte| { + if ((byte >= 'A' and byte <= 'Z') or + (byte >= 'a' and byte <= 'z') or + (byte >= '0' and byte <= '9') or + byte == '-' or + byte == '_' or + byte == '.' or + byte == '~') + { + try buf.append(allocator, byte); + } else { + try buf.writer(allocator).print("%{X:0>2}", .{byte}); + } + } + + return buf.toOwnedSlice(allocator); +} + var test_store_write_base_url: []const u8 = ""; var test_store_write_api_token: ?[]const u8 = null; var test_store_write_namespace: []const u8 = ""; @@ -2298,8 +2330,11 @@ fn buildSubgraphInput(alloc: std.mem.Allocator, parent_state: []const u8, input_ /// Reconcile with nulltickets: check if associated task has been cancelled. /// Returns true if the run should continue, false if it should be cancelled. 
-fn reconcileWithTracker(alloc: std.mem.Allocator, tracker_url: []const u8, task_id: []const u8) bool { - const url = std.fmt.allocPrint(alloc, "{s}/tasks/{s}", .{ tracker_url, task_id }) catch return true; +fn reconcileWithTracker(alloc: std.mem.Allocator, tracker_url: []const u8, tracker_api_token: ?[]const u8, task_id: []const u8) bool { + const task_id_enc = encodePathSegment(alloc, task_id) catch return true; + defer alloc.free(task_id_enc); + + const url = std.fmt.allocPrint(alloc, "{s}/tasks/{s}", .{ tracker_url, task_id_enc }) catch return true; defer alloc.free(url); var client: std.http.Client = .{ .allocator = alloc }; @@ -2308,10 +2343,20 @@ fn reconcileWithTracker(alloc: std.mem.Allocator, tracker_url: []const u8, task_ var response_body: std.io.Writer.Allocating = .init(alloc); defer response_body.deinit(); + var auth_header: ?[]const u8 = null; + defer if (auth_header) |value| alloc.free(value); + var headers_buf: [1]std.http.Header = undefined; + const extra_headers: []const std.http.Header = if (tracker_api_token) |token| blk: { + auth_header = std.fmt.allocPrint(alloc, "Bearer {s}", .{token}) catch return true; + headers_buf[0] = .{ .name = "Authorization", .value = auth_header.? 
}; + break :blk headers_buf[0..1]; + } else &.{}; + const result = client.fetch(.{ .location = .{ .url = url }, .method = .GET, .response_writer = &response_body.writer, + .extra_headers = extra_headers, }) catch return true; // network errors -> continue const status_code = @intFromEnum(result.status); @@ -3306,7 +3351,7 @@ test "engine: configurable runs inject __config" { } } -test "engine: transform store_updates writes updated state value using config tracker settings" { +test "engine: transform store_updates uses trusted tracker settings" { const allocator = std.testing.allocator; var store = try Store.init(allocator, ":memory:"); defer store.deinit(); @@ -3322,11 +3367,11 @@ test "engine: transform store_updates writes updated state value using config tr ; try store.createRunWithState("r1", null, wf, "{}", "{}"); - try store.setConfigJson("r1", "{\"tracker_url\":\"http://tickets.test\",\"tracker_api_token\":\"secret-token\"}"); try store.updateRunStatus("r1", "running", null); var engine = Engine.init(&store, allocator, 500); engine.store_writer = mockStoreWriter; + engine.setTrustedTrackerAccess("http://tickets.test", "secret-token"); var arena = std.heap.ArenaAllocator.init(allocator); defer arena.deinit(); @@ -3344,6 +3389,46 @@ test "engine: transform store_updates writes updated state value using config tr try std.testing.expectEqualStrings("{\"grade\":\"approved\"}", test_store_write_value_json); } +test "engine: workflow cannot override trusted tracker settings" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + test_store_write_base_url = ""; + test_store_write_api_token = null; + test_store_write_namespace = ""; + test_store_write_key = ""; + test_store_write_value_json = ""; + + const wf = + 
\\{"tracker_url":"http://evil.test","tracker_api_token":"evil-token","nodes":{"save":{"type":"transform","updates":"{\"review_result\":{\"grade\":\"approved\"}}","store_updates":{"namespace":"project_context","key":"latest_review","value":"state.review_result"}}},"edges":[["__start__","save"],["save","__end__"]],"schema":{"review_result":{"type":"object","reducer":"last_value"}}} + ; + + try store.createRunWithState("r1", null, wf, "{}", "{}"); + try store.updateRunStatus("r1", "running", null); + + var engine = Engine.init(&store, allocator, 500); + engine.store_writer = mockStoreWriter; + engine.setTrustedTrackerAccess("http://tickets.test", "secret-token"); + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const run_row = (try store.getRun(arena.allocator(), "r1")).?; + try engine.processRun(arena.allocator(), run_row); + + try std.testing.expectEqualStrings("http://tickets.test", test_store_write_base_url); + try std.testing.expect(test_store_write_api_token != null); + try std.testing.expectEqualStrings("secret-token", test_store_write_api_token.?); +} + +test "encodePathSegment percent-encodes reserved characters" { + const encoded = try encodePathSegment(std.testing.allocator, "task/alpha beta"); + defer std.testing.allocator.free(encoded); + + try std.testing.expectEqualStrings("task%2Falpha%20beta", encoded); +} + test "getWorkflowVersion: extracts version" { const allocator = std.testing.allocator; var arena = std.heap.ArenaAllocator.init(allocator); diff --git a/src/main.zig b/src/main.zig index 43e75e8..45590f0 100644 --- a/src/main.zig +++ b/src/main.zig @@ -262,6 +262,9 @@ pub fn main() !void { .retry_jitter_ms = @as(i64, @intCast(cfg.engine.retry_jitter_ms)), .retry_max_elapsed_ms = @as(i64, @intCast(cfg.engine.retry_max_elapsed_ms)), }, &metrics); + if (cfg.tracker) |tracker_cfg| { + engine.setTrustedTrackerAccess(tracker_cfg.url, tracker_cfg.api_token); + } engine.response_queue = &response_queue; if (wf_watcher 
!= null) { engine.workflow_watcher = &wf_watcher.?; From b93ca55da4e4e0aae98358550d772bd5adeeedb1 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 09:03:57 -0300 Subject: [PATCH 49/55] Reuse parsed workflows in engine hot path --- src/engine.zig | 54 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index 5273b6d..be06379 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -408,11 +408,11 @@ pub const Engine = struct { } else "{}"; // 2b. Parse breakpoint lists from workflow definition - const interrupt_before = parseBreakpointList(alloc, workflow_json, "interrupt_before"); - const interrupt_after = parseBreakpointList(alloc, workflow_json, "interrupt_after"); + const interrupt_before = parseBreakpointListFromRoot(alloc, wf_root, "interrupt_before"); + const interrupt_after = parseBreakpointListFromRoot(alloc, wf_root, "interrupt_after"); // 2d. Collect deferred nodes (Gap 6) - const deferred_nodes = collectDeferredNodes(alloc, workflow_json); + const deferred_nodes = collectDeferredNodesFromRoot(alloc, wf_root); // 2c. Get task id for reconciliation. 
const task_id = getRuntimeStringSetting(alloc, current_state, workflow_json, &.{"task_id"}); @@ -489,7 +489,7 @@ pub const Engine = struct { const all_ready_nodes = if (goto_ready) |gr| blk: { goto_ready = null; break :blk gr; - } else try findReadyNodes(alloc, workflow_json, &completed_nodes, &route_results); + } else try findReadyNodesFromRoot(alloc, wf_root, &completed_nodes, &route_results); // Gap 6: Filter out deferred nodes from ready list (execute them later) var ready_list: std.ArrayListUnmanaged([]const u8) = .empty; @@ -551,7 +551,7 @@ pub const Engine = struct { for (deferred_nodes) |deferred_name| { if (completed_nodes.get(deferred_name) != null) continue; - const def_node_json = getNodeJson(alloc, workflow_json, deferred_name) orelse continue; + const def_node_json = getNodeJsonFromRoot(alloc, wf_root, deferred_name) orelse continue; const def_node_type = getNodeField(alloc, def_node_json, "type") orelse "task"; if (std.mem.eql(u8, def_node_type, "transform")) { @@ -621,7 +621,7 @@ pub const Engine = struct { } // Get node definition from workflow - const node_json = getNodeJson(alloc, workflow_json, node_name) orelse { + const node_json = getNodeJsonFromRoot(alloc, wf_root, node_name) orelse { log.err("node {s} not found in workflow for run {s}", .{ node_name, run_row.id }); try self.store.updateRunStatus(run_row.id, "failed", "node not found in workflow"); return; @@ -871,7 +871,7 @@ pub const Engine = struct { if (cr.goto_targets) |targets| { var valid_targets: std.ArrayListUnmanaged([]const u8) = .empty; for (targets) |target| { - if (std.mem.eql(u8, target, "__end__") or getNodeJson(alloc, workflow_json, target) != null) { + if (std.mem.eql(u8, target, "__end__") or workflowHasNode(wf_root, target)) { try valid_targets.append(alloc, target); } else { log.warn("goto target {s} not found in workflow, skipping", .{target}); @@ -1031,7 +1031,7 @@ pub const Engine = struct { // Broadcast rich SSE events for all modes if (self.sse_hub) |hub| { - const 
node_json_for_sse = getNodeJson(alloc, workflow_json, node_name); + const node_json_for_sse = getNodeJsonFromRoot(alloc, wf_root, node_name); const nt = if (node_json_for_sse) |nj| (getNodeField(alloc, nj, "type") orelse "task") else "task"; broadcastNodeEvents(hub, alloc, run_row.id, node_name, nt, running_state, null, version, 0); } @@ -1685,7 +1685,15 @@ pub fn findReadyNodes( const parsed = json.parseFromSlice(json.Value, alloc, workflow_json, .{}) catch { return &.{}; }; - const root = parsed.value; + return findReadyNodesFromRoot(alloc, parsed.value, completed_nodes, route_results); +} + +fn findReadyNodesFromRoot( + alloc: std.mem.Allocator, + root: json.Value, + completed_nodes: *std.StringHashMap(void), + route_results: *std.StringHashMap([]const u8), +) ![]const []const u8 { if (root != .object) return &.{}; // Get edges array @@ -1848,7 +1856,10 @@ pub fn findReadyNodes( /// Workflow format: {"nodes": {"node_name": {...}}, "edges": [...]} fn getNodeJson(alloc: std.mem.Allocator, workflow_json: []const u8, node_name: []const u8) ?[]const u8 { const parsed = json.parseFromSlice(json.Value, alloc, workflow_json, .{}) catch return null; - const root = parsed.value; + return getNodeJsonFromRoot(alloc, parsed.value, node_name); +} + +fn getNodeJsonFromRoot(alloc: std.mem.Allocator, root: json.Value, node_name: []const u8) ?[]const u8 { if (root != .object) return null; const nodes = root.object.get("nodes") orelse return null; @@ -1858,6 +1869,13 @@ fn getNodeJson(alloc: std.mem.Allocator, workflow_json: []const u8, node_name: [ return serializeJsonValue(alloc, node) catch null; } +fn workflowHasNode(root: json.Value, node_name: []const u8) bool { + if (root != .object) return false; + const nodes = root.object.get("nodes") orelse return false; + if (nodes != .object) return false; + return nodes.object.get(node_name) != null; +} + /// Get a string field from a node's JSON. 
fn getNodeField(alloc: std.mem.Allocator, node_json: []const u8, field: []const u8) ?[]const u8 { const parsed = json.parseFromSlice(json.Value, alloc, node_json, .{}) catch return null; @@ -2155,8 +2173,12 @@ fn extractGotoTargets(alloc: std.mem.Allocator, output: []const u8) ?[]const []c /// Parse interrupt_before / interrupt_after arrays from workflow definition. fn parseBreakpointList(alloc: std.mem.Allocator, workflow_json: []const u8, field: []const u8) []const []const u8 { const parsed = json.parseFromSlice(json.Value, alloc, workflow_json, .{}) catch return &.{}; - if (parsed.value != .object) return &.{}; - const arr_val = parsed.value.object.get(field) orelse return &.{}; + return parseBreakpointListFromRoot(alloc, parsed.value, field); +} + +fn parseBreakpointListFromRoot(alloc: std.mem.Allocator, root: json.Value, field: []const u8) []const []const u8 { + if (root != .object) return &.{}; + const arr_val = root.object.get(field) orelse return &.{}; if (arr_val != .array) return &.{}; var result: std.ArrayListUnmanaged([]const u8) = .empty; @@ -2257,8 +2279,12 @@ fn computeCacheKey(alloc: std.mem.Allocator, node_name: []const u8, rendered_pro /// Collect all deferred node names from workflow. 
fn collectDeferredNodes(alloc: std.mem.Allocator, workflow_json: []const u8) []const []const u8 { const parsed = json.parseFromSlice(json.Value, alloc, workflow_json, .{}) catch return &.{}; - if (parsed.value != .object) return &.{}; - const nodes_val = parsed.value.object.get("nodes") orelse return &.{}; + return collectDeferredNodesFromRoot(alloc, parsed.value); +} + +fn collectDeferredNodesFromRoot(alloc: std.mem.Allocator, root: json.Value) []const []const u8 { + if (root != .object) return &.{}; + const nodes_val = root.object.get("nodes") orelse return &.{}; if (nodes_val != .object) return &.{}; var result: std.ArrayListUnmanaged([]const u8) = .empty; From 3aa3dc76c1f562d72047d7cd28763c4272b52946 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 09:12:32 -0300 Subject: [PATCH 50/55] Align route contracts across validation and runtime --- src/engine.zig | 58 ++++++++++++++++++++++++++++++++----- src/workflow_validation.zig | 54 ++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 8 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index be06379..eef2165 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -1076,24 +1076,20 @@ pub const Engine = struct { // Get the input path to read from state const input_path = getNodeField(alloc, node_json, "input") orelse "state.route_input"; + const default_route = getNodeField(alloc, node_json, "default"); // Read value from state const value_json = state_mod.getStateValue(alloc, state_json, input_path) catch null; if (value_json == null) { - // No value at path, try default route - const default_route = getNodeField(alloc, node_json, "default"); - return RouteNodeResult{ .route_value = default_route }; + return RouteNodeResult{ .route_value = resolveDeclaredRouteValue(alloc, node_json, default_route) }; } // Stringify value for route matching const route_key = state_mod.stringifyForRoute(alloc, value_json.?) 
catch { - const default_route = getNodeField(alloc, node_json, "default"); - return RouteNodeResult{ .route_value = default_route }; + return RouteNodeResult{ .route_value = resolveDeclaredRouteValue(alloc, node_json, default_route) }; }; - // Look up in routes map — but routes are encoded in edges, not in node - // The route value is used for conditional edge matching like "node:value" - return RouteNodeResult{ .route_value = route_key }; + return RouteNodeResult{ .route_value = resolveDeclaredRouteValue(alloc, node_json, route_key) }; } // ── executeTaskNode ────────────────────────────────────────────── @@ -2226,6 +2222,20 @@ fn getNodeObjectField(alloc: std.mem.Allocator, node_json: []const u8, field: [] return serializeJsonValue(alloc, val) catch null; } +fn resolveDeclaredRouteValue(alloc: std.mem.Allocator, node_json: []const u8, candidate: ?[]const u8) ?[]const u8 { + const routes_json = getNodeObjectField(alloc, node_json, "routes") orelse return candidate; + const parsed = json.parseFromSlice(json.Value, alloc, routes_json, .{}) catch return candidate; + if (parsed.value != .object) return candidate; + + if (candidate) |route_value| { + if (parsed.value.object.get(route_value) != null) return route_value; + } + + const default_route = getNodeField(alloc, node_json, "default") orelse return candidate; + if (parsed.value.object.get(default_route) != null) return default_route; + return candidate; +} + // ── Retry Config Helpers (Gap 2) ──────────────────────────────────── /// Parse retry.max_attempts from node JSON. Returns null if no retry config. 
@@ -2991,6 +3001,38 @@ test "engine: route node with conditional edges" { } } +test "engine: route node falls back to declared default route" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + const wf = + \\{"nodes":{"r":{"type":"route","input":"state.decision","routes":{"yes":"t_yes","fallback":"t_fallback"},"default":"fallback"},"t_yes":{"type":"transform","updates":"{\"path\":\"yes\"}"},"t_fallback":{"type":"transform","updates":"{\"path\":\"fallback\"}"}},"edges":[["__start__","r"],["r:yes","t_yes"],["r:fallback","t_fallback"],["t_yes","__end__"],["t_fallback","__end__"]],"schema":{"decision":{"type":"string","reducer":"last_value"},"path":{"type":"string","reducer":"last_value"}}} + ; + + try store.createRunWithState("r1", null, wf, "{}", "{\"decision\":\"unknown\"}"); + try store.updateRunStatus("r1", "running", null); + + var engine = Engine.init(&store, allocator, 500); + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + const run_row = (try store.getRun(arena.allocator(), "r1")).?; + try engine.processRun(arena.allocator(), run_row); + + const run_row2 = (try store.getRun(arena.allocator(), "r1")).?; + if (std.mem.eql(u8, run_row2.status, "running")) { + try engine.processRun(arena.allocator(), run_row2); + } + + const updated_run = (try store.getRun(arena.allocator(), "r1")).?; + try std.testing.expectEqualStrings("completed", updated_run.status); + if (updated_run.state_json) |sj| { + try std.testing.expect(std.mem.indexOf(u8, sj, "fallback") != null); + } +} + test "wrapOutput creates valid JSON" { const allocator = std.testing.allocator; var arena = std.heap.ArenaAllocator.init(allocator); diff --git a/src/workflow_validation.zig b/src/workflow_validation.zig index 1070b4d..c5419fb 100644 --- a/src/workflow_validation.zig +++ b/src/workflow_validation.zig @@ -459,6 +459,25 @@ pub fn validate(alloc: Allocator, definition_json: []const u8) 
![]ValidationErro .message = "route target node does not exist", }); } + if (!hasRouteEdge(edge_sources.items, edge_targets.items, nname, re.key_ptr.*, target)) { + try errors.append(alloc, .{ + .err_type = "missing_route_edge", + .node = nname, + .key = re.key_ptr.*, + .message = "route key is declared in routes but has no matching conditional edge", + }); + } + } + + if (getJsonStringFromObj(nobj, "default")) |default_route| { + if (!routes_val.object.contains(default_route)) { + try errors.append(alloc, .{ + .err_type = "invalid_route_default", + .node = nname, + .key = "default", + .message = "route default must reference a declared routes key", + }); + } } } @@ -506,6 +525,16 @@ fn edgeSourceNode(src_raw: []const u8) []const u8 { return src_raw; } +fn hasRouteEdge(edge_sources: []const []const u8, edge_targets: []const []const u8, node_name: []const u8, route_key: []const u8, target: []const u8) bool { + for (edge_sources, edge_targets) |src_raw, edge_target| { + if (!std.mem.eql(u8, edge_target, target)) continue; + const colon_pos = std.mem.indexOfScalar(u8, src_raw, ':') orelse continue; + if (!std.mem.eql(u8, src_raw[0..colon_pos], node_name)) continue; + if (std.mem.eql(u8, src_raw[colon_pos + 1 ..], route_key)) return true; + } + return false; +} + fn getJsonStringFromObj(obj: std.json.ObjectMap, key: []const u8) ?[]const u8 { const val = obj.get(key) orelse return null; if (val == .string) return val.string; @@ -806,3 +835,28 @@ test "validate invalid route target" { // Should have error about nonexistent node (either in route target or edge target) try std.testing.expect(errors.len > 0); } + +test "validate route requires matching conditional edges for declared routes" { + const alloc = std.testing.allocator; + const wf = + 
\\{"state_schema":{"x":{"type":"string","reducer":"last_value"}},"nodes":{"r":{"type":"route","input":"state.x","routes":{"yes":"approved"},"default":"yes"},"approved":{"type":"task","prompt":"approve"}},"edges":[["__start__","r"],["r:no","approved"],["approved","__end__"]]} + ; + const errors = try validate(alloc, wf); + defer { + for (errors) |e| { + alloc.free(e.err_type); + if (e.node) |n| alloc.free(n); + if (e.key) |k| alloc.free(k); + alloc.free(e.message); + } + alloc.free(errors); + } + var found_missing_route_edge = false; + for (errors) |err| { + if (std.mem.eql(u8, err.err_type, "missing_route_edge")) { + found_missing_route_edge = true; + break; + } + } + try std.testing.expect(found_missing_route_edge); +} From 66de615859ee17f4dca4eba88f4987e45f487eba Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 09:17:28 -0300 Subject: [PATCH 51/55] Extract shared worker snapshot builder --- src/engine.zig | 67 ++++++++++++++++++++------------------------------ 1 file changed, 27 insertions(+), 40 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index eef2165..23e3aae 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -633,7 +633,7 @@ pub const Engine = struct { // Execute based on type if (std.mem.eql(u8, node_type, "route")) { // Route: evaluate routing logic, no worker dispatch - const result = try self.executeRouteNode(alloc, node_name, node_json, running_state); + const result = try executeRouteNode(alloc, node_json, running_state); if (result.route_value) |rv| { try route_results.put(try alloc.dupe(u8, node_name), rv); } @@ -1070,10 +1070,7 @@ pub const Engine = struct { // ── executeRouteNode ───────────────────────────────────────────── - fn executeRouteNode(self: *Engine, alloc: std.mem.Allocator, node_name: []const u8, node_json: []const u8, state_json: []const u8) !RouteNodeResult { - _ = self; - _ = node_name; - + fn executeRouteNode(alloc: std.mem.Allocator, node_json: []const u8, state_json: []const u8) !RouteNodeResult { // 
Get the input path to read from state const input_path = getNodeField(alloc, node_json, "input") orelse "state.route_input"; const default_route = getNodeField(alloc, node_json, "default"); @@ -1092,6 +1089,26 @@ pub const Engine = struct { return RouteNodeResult{ .route_value = resolveDeclaredRouteValue(alloc, node_json, route_key) }; } + fn buildWorkerInfos(self: *Engine, alloc: std.mem.Allocator) ![]dispatch.WorkerInfo { + const workers = try self.store.listWorkers(alloc); + var worker_infos: std.ArrayListUnmanaged(dispatch.WorkerInfo) = .empty; + for (workers) |worker| { + const current_tasks = self.store.countRunningStepsByWorker(worker.id) catch 0; + try worker_infos.append(alloc, .{ + .id = worker.id, + .url = worker.url, + .token = worker.token, + .protocol = worker.protocol, + .model = worker.model, + .tags_json = worker.tags_json, + .max_concurrent = worker.max_concurrent, + .status = worker.status, + .current_tasks = current_tasks, + }); + } + return worker_infos.toOwnedSlice(alloc); + } + // ── executeTaskNode ────────────────────────────────────────────── fn executeTaskNode(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, node_name: []const u8, node_json: []const u8, state_json: []const u8) !TaskNodeResult { @@ -1108,22 +1125,7 @@ pub const Engine = struct { }; // 3. 
Get workers and select one - const workers = try self.store.listWorkers(alloc); - var worker_infos: std.ArrayListUnmanaged(dispatch.WorkerInfo) = .empty; - for (workers) |w| { - const current_tasks = self.store.countRunningStepsByWorker(w.id) catch 0; - try worker_infos.append(alloc, .{ - .id = w.id, - .url = w.url, - .token = w.token, - .protocol = w.protocol, - .model = w.model, - .tags_json = w.tags_json, - .max_concurrent = w.max_concurrent, - .status = w.status, - .current_tasks = current_tasks, - }); - } + const worker_infos = try self.buildWorkerInfos(alloc); const required_tags = getNodeTags(alloc, node_json); const node_type = getNodeField(alloc, node_json, "type") orelse "task"; @@ -1134,7 +1136,7 @@ pub const Engine = struct { if (is_agent_node) { // Filter to A2A workers only var a2a_workers: std.ArrayListUnmanaged(dispatch.WorkerInfo) = .empty; - for (worker_infos.items) |w| { + for (worker_infos) |w| { if (std.mem.eql(u8, w.protocol, "a2a")) { try a2a_workers.append(alloc, w); } @@ -1145,7 +1147,7 @@ pub const Engine = struct { } // Fall back to any protocol if no A2A worker found (or not an agent node) if (selected_worker == null) { - selected_worker = try dispatch.selectWorker(alloc, worker_infos.items, required_tags); + selected_worker = try dispatch.selectWorker(alloc, worker_infos, required_tags); } if (selected_worker == null) { return TaskNodeResult{ .no_worker = {} }; @@ -1444,22 +1446,7 @@ pub const Engine = struct { } // Build worker list once before iterating items - const workers = try self.store.listWorkers(alloc); - var worker_infos: std.ArrayListUnmanaged(dispatch.WorkerInfo) = .empty; - for (workers) |w| { - const current_tasks = self.store.countRunningStepsByWorker(w.id) catch 0; - try worker_infos.append(alloc, .{ - .id = w.id, - .url = w.url, - .token = w.token, - .protocol = w.protocol, - .model = w.model, - .tags_json = w.tags_json, - .max_concurrent = w.max_concurrent, - .status = w.status, - .current_tasks = current_tasks, - }); 
- } + const worker_infos = try self.buildWorkerInfos(alloc); const required_tags = getNodeTags(alloc, target_json); // For each item, execute the target node @@ -1474,7 +1461,7 @@ pub const Engine = struct { // Render with item const rendered = self.renderWorkflowTemplate(alloc, run_row.workflow_json, prompt_template, state_json, run_row.input_json, item_str) catch continue; - const selected_worker = try dispatch.selectWorker(alloc, worker_infos.items, required_tags); + const selected_worker = try dispatch.selectWorker(alloc, worker_infos, required_tags); if (selected_worker == null) { try results.append(alloc, "null"); continue; From f61af528ba26ebdbe1addd22336dea745d86b0ef Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 09:25:16 -0300 Subject: [PATCH 52/55] Make runtime bindings explicit in engine --- src/engine.zig | 55 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index 23e3aae..b5281d1 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -115,6 +115,12 @@ const StoreWriter = *const fn ( value_json: []const u8, ) anyerror!void; +const RuntimeBindings = struct { + input_json: ?[]const u8, + task_id: ?[]const u8, + store_access: ?templates.StoreAccess, +}; + // ── Engine ─────────────────────────────────────────────────────────── pub const RuntimeConfig = struct { @@ -415,7 +421,8 @@ pub const Engine = struct { const deferred_nodes = collectDeferredNodesFromRoot(alloc, wf_root); // 2c. Get task id for reconciliation. - const task_id = getRuntimeStringSetting(alloc, current_state, workflow_json, &.{"task_id"}); + const runtime = self.buildRuntimeBindings(alloc, workflow_json, current_state, run_row.input_json); + const task_id = runtime.task_id; // 3. 
Get completed nodes from latest checkpoint var completed_nodes = std.StringHashMap(void).init(alloc); @@ -560,7 +567,7 @@ pub const Engine = struct { const def_new_state = state_mod.applyUpdates(alloc, running_state, def_updates, def_schema) catch running_state; running_state = def_new_state; } else if (std.mem.eql(u8, def_node_type, "task") or std.mem.eql(u8, def_node_type, "agent")) { - const def_result = self.executeTaskNode(alloc, run_row, deferred_name, def_node_json, running_state) catch continue; + const def_result = self.executeTaskNode(alloc, run_row, runtime, deferred_name, def_node_json, running_state) catch continue; switch (def_result) { .completed => |cr| { if (cr.state_updates) |updates| { @@ -690,7 +697,7 @@ pub const Engine = struct { running_state = new_state; if (getNodeField(alloc, node_json, "store_updates")) |store_updates_json| { - self.applyStoreUpdates(alloc, workflow_json, running_state, store_updates_json) catch |err| { + self.applyStoreUpdates(alloc, running_state, store_updates_json, runtime.store_access) catch |err| { log.err("transform node {s} failed to write store updates: {}", .{ node_name, err }); try self.store.updateRunStatus(run_row.id, "failed", "transform store update failed"); return; @@ -715,7 +722,7 @@ pub const Engine = struct { const cache_ttl = parseCacheTtlMs(alloc, node_json); if (cache_ttl != null) cache_check: { const pt_c = getNodeField(alloc, node_json, "prompt_template") orelse break :cache_check; - const rnd_c = self.renderWorkflowTemplate(alloc, workflow_json, pt_c, state_with_meta, run_row.input_json, null) catch break :cache_check; + const rnd_c = self.renderWorkflowTemplate(alloc, pt_c, state_with_meta, runtime, null) catch break :cache_check; const ck_c = computeCacheKey(alloc, node_name, rnd_c) catch break :cache_check; const cached = self.store.getCachedResult(alloc, ck_c) catch break :cache_check; if (cached) |cached_upd| { @@ -759,7 +766,7 @@ pub const Engine = struct { } const current_attempt: u32 = if 
(retrying_step) |rs| @intCast(rs.attempt) else 0; - const result = try self.executeTaskNode(alloc, run_row, node_name, node_json, state_with_meta); + const result = try self.executeTaskNode(alloc, run_row, runtime, node_name, node_json, state_with_meta); // Handle retry scheduling for failed results (non-blocking) const result_after_retry: TaskNodeResult = switch (result) { @@ -837,7 +844,7 @@ pub const Engine = struct { // Gap 3: Store result in cache if (cache_ttl) |ttl| cache_store: { const pt_s = getNodeField(alloc, node_json, "prompt_template") orelse break :cache_store; - const rnd_s = self.renderWorkflowTemplate(alloc, workflow_json, pt_s, state_with_meta, run_row.input_json, null) catch break :cache_store; + const rnd_s = self.renderWorkflowTemplate(alloc, pt_s, state_with_meta, runtime, null) catch break :cache_store; const ck_s = computeCacheKey(alloc, node_name, rnd_s) catch break :cache_store; self.store.setCachedResult(ck_s, node_name, updates, ttl) catch |cerr| { log.warn("failed to cache result for node {s}: {}", .{ node_name, cerr }); @@ -959,7 +966,7 @@ pub const Engine = struct { } } else if (std.mem.eql(u8, node_type, "send")) { // Send: read items from state, dispatch target_node per item - const result = try self.executeSendNode(alloc, run_row, node_name, node_json, running_state); + const result = try self.executeSendNode(alloc, run_row, runtime, node_name, node_json, running_state); if (result.state_updates) |updates| { const schema_json = cached_schema_json; const new_state = state_mod.applyUpdates(alloc, running_state, updates, schema_json) catch |err| { @@ -1111,7 +1118,7 @@ pub const Engine = struct { // ── executeTaskNode ────────────────────────────────────────────── - fn executeTaskNode(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, node_name: []const u8, node_json: []const u8, state_json: []const u8) !TaskNodeResult { + fn executeTaskNode(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, runtime: 
RuntimeBindings, node_name: []const u8, node_json: []const u8, state_json: []const u8) !TaskNodeResult { // 1. Get prompt template from node definition const prompt_template = getNodeField(alloc, node_json, "prompt_template") orelse { // No prompt template — mark as completed with no state updates @@ -1119,7 +1126,7 @@ pub const Engine = struct { }; // 2. Render prompt with graph template interpolation and optional store access. - const rendered_prompt = self.renderWorkflowTemplate(alloc, run_row.workflow_json, prompt_template, state_json, run_row.input_json, null) catch |err| { + const rendered_prompt = self.renderWorkflowTemplate(alloc, prompt_template, state_json, runtime, null) catch |err| { log.err("template render failed for node {s}: {}", .{ node_name, err }); return TaskNodeResult{ .failed = "template render failed" }; }; @@ -1232,7 +1239,7 @@ pub const Engine = struct { } // Render continuation prompt - const cont_rendered = self.renderWorkflowTemplate(alloc, run_row.workflow_json, continuation_prompt.?, state_json, run_row.input_json, null) catch break; + const cont_rendered = self.renderWorkflowTemplate(alloc, continuation_prompt.?, state_json, runtime, null) catch break; const cont_result = try dispatch.dispatchStep( alloc, @@ -1409,7 +1416,7 @@ pub const Engine = struct { // ── executeSendNode ────────────────────────────────────────────── - fn executeSendNode(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, node_name: []const u8, node_json: []const u8, state_json: []const u8) !SendNodeResult { + fn executeSendNode(self: *Engine, alloc: std.mem.Allocator, run_row: types.RunRow, runtime: RuntimeBindings, node_name: []const u8, node_json: []const u8, state_json: []const u8) !SendNodeResult { // Read items_key state path, with items_from kept as a legacy alias. 
const items_path = getSendItemsPath(alloc, node_json) orelse { log.warn("send node {s} missing items_key/items_from", .{node_name}); @@ -1459,7 +1466,7 @@ pub const Engine = struct { const prompt_template = getNodeField(alloc, target_json, "prompt_template") orelse continue; // Render with item - const rendered = self.renderWorkflowTemplate(alloc, run_row.workflow_json, prompt_template, state_json, run_row.input_json, item_str) catch continue; + const rendered = self.renderWorkflowTemplate(alloc, prompt_template, state_json, runtime, item_str) catch continue; const selected_worker = try dispatch.selectWorker(alloc, worker_infos, required_tags); if (selected_worker == null) { @@ -1515,20 +1522,16 @@ pub const Engine = struct { fn renderWorkflowTemplate( self: *Engine, alloc: std.mem.Allocator, - workflow_json: []const u8, template: []const u8, state_json: []const u8, - input_json: ?[]const u8, + runtime: RuntimeBindings, item_json: ?[]const u8, ) ![]const u8 { - const store_access = self.resolveRuntimeStoreAccess(alloc, workflow_json, state_json); - return templates.renderTemplateWithStore(alloc, template, state_json, input_json, item_json, store_access); + _ = self; + return templates.renderTemplateWithStore(alloc, template, state_json, runtime.input_json, item_json, runtime.store_access); } - fn resolveRuntimeStoreAccess(self: *Engine, alloc: std.mem.Allocator, workflow_json: []const u8, state_json: []const u8) ?templates.StoreAccess { - _ = alloc; - _ = workflow_json; - _ = state_json; + fn runtimeStoreAccess(self: *Engine) ?templates.StoreAccess { const base_url = self.trusted_tracker_url orelse return null; return .{ .base_url = base_url, @@ -1537,8 +1540,16 @@ pub const Engine = struct { }; } - fn applyStoreUpdates(self: *Engine, alloc: std.mem.Allocator, workflow_json: []const u8, state_json: []const u8, store_updates_json: []const u8) !void { - const access = self.resolveRuntimeStoreAccess(alloc, workflow_json, state_json) orelse return 
error.StoreNotConfigured; + fn buildRuntimeBindings(self: *Engine, alloc: std.mem.Allocator, workflow_json: []const u8, state_json: []const u8, input_json: ?[]const u8) RuntimeBindings { + return .{ + .input_json = input_json, + .task_id = getRuntimeStringSetting(alloc, state_json, workflow_json, &.{"task_id"}), + .store_access = self.runtimeStoreAccess(), + }; + } + + fn applyStoreUpdates(self: *Engine, alloc: std.mem.Allocator, state_json: []const u8, store_updates_json: []const u8, store_access: ?templates.StoreAccess) !void { + const access = store_access orelse return error.StoreNotConfigured; const parsed = try json.parseFromSlice(json.Value, alloc, store_updates_json, .{}); switch (parsed.value) { From dfa90350078e0a79369b8beb73cef52a41bd1e7a Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 09:30:09 -0300 Subject: [PATCH 53/55] Unify tracker runtime bindings --- src/engine.zig | 50 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/src/engine.zig b/src/engine.zig index b5281d1..0a0a596 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -115,10 +115,28 @@ const StoreWriter = *const fn ( value_json: []const u8, ) anyerror!void; +const TrackerRuntime = struct { + base_url: []const u8, + api_token: ?[]const u8, + + fn storeAccess(self: TrackerRuntime, fetcher: templates.StoreFetcher) templates.StoreAccess { + return .{ + .base_url = self.base_url, + .api_token = self.api_token, + .fetcher = fetcher, + }; + } +}; + const RuntimeBindings = struct { input_json: ?[]const u8, task_id: ?[]const u8, - store_access: ?templates.StoreAccess, + tracker: ?TrackerRuntime, + + fn storeAccess(self: RuntimeBindings, fetcher: templates.StoreFetcher) ?templates.StoreAccess { + const tracker = self.tracker orelse return null; + return tracker.storeAccess(fetcher); + } }; // ── Engine ─────────────────────────────────────────────────────────── @@ -697,7 +715,7 @@ pub const Engine = struct { 
running_state = new_state; if (getNodeField(alloc, node_json, "store_updates")) |store_updates_json| { - self.applyStoreUpdates(alloc, running_state, store_updates_json, runtime.store_access) catch |err| { + self.applyStoreUpdates(alloc, running_state, store_updates_json, runtime) catch |err| { log.err("transform node {s} failed to write store updates: {}", .{ node_name, err }); try self.store.updateRunStatus(run_row.id, "failed", "transform store update failed"); return; @@ -1006,8 +1024,8 @@ pub const Engine = struct { } // Reconciliation: check tracker task status between steps - if (self.trusted_tracker_url != null and task_id != null) { - if (!reconcileWithTracker(alloc, self.trusted_tracker_url.?, self.trusted_tracker_api_token, task_id.?)) { + if (runtime.tracker) |tracker| { + if (task_id != null and !reconcileWithTracker(alloc, tracker.base_url, tracker.api_token, task_id.?)) { log.info("run {s} cancelled by reconciliation", .{run_row.id}); try self.store.updateRunStatus(run_row.id, "failed", "cancelled by tracker reconciliation"); try self.store.insertEvent(run_row.id, null, "run.failed", "{\"reason\":\"tracker_cancelled\"}"); @@ -1527,29 +1545,25 @@ pub const Engine = struct { runtime: RuntimeBindings, item_json: ?[]const u8, ) ![]const u8 { - _ = self; - return templates.renderTemplateWithStore(alloc, template, state_json, runtime.input_json, item_json, runtime.store_access); - } - - fn runtimeStoreAccess(self: *Engine) ?templates.StoreAccess { - const base_url = self.trusted_tracker_url orelse return null; - return .{ - .base_url = base_url, - .api_token = self.trusted_tracker_api_token, - .fetcher = self.store_fetcher, - }; + return templates.renderTemplateWithStore(alloc, template, state_json, runtime.input_json, item_json, runtime.storeAccess(self.store_fetcher)); } fn buildRuntimeBindings(self: *Engine, alloc: std.mem.Allocator, workflow_json: []const u8, state_json: []const u8, input_json: ?[]const u8) RuntimeBindings { return .{ .input_json = 
input_json, .task_id = getRuntimeStringSetting(alloc, state_json, workflow_json, &.{"task_id"}), - .store_access = self.runtimeStoreAccess(), + .tracker = if (self.trusted_tracker_url) |base_url| + .{ + .base_url = base_url, + .api_token = self.trusted_tracker_api_token, + } + else + null, }; } - fn applyStoreUpdates(self: *Engine, alloc: std.mem.Allocator, state_json: []const u8, store_updates_json: []const u8, store_access: ?templates.StoreAccess) !void { - const access = store_access orelse return error.StoreNotConfigured; + fn applyStoreUpdates(self: *Engine, alloc: std.mem.Allocator, state_json: []const u8, store_updates_json: []const u8, runtime: RuntimeBindings) !void { + const access = runtime.storeAccess(self.store_fetcher) orelse return error.StoreNotConfigured; const parsed = try json.parseFromSlice(json.Value, alloc, store_updates_json, .{}); switch (parsed.value) { From 84e9bda988198816b03fd79ef276b19dee6700d8 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 09:39:32 -0300 Subject: [PATCH 54/55] Own SSE queue event payloads --- src/api.zig | 7 ++--- src/engine.zig | 6 +++-- src/sse.zig | 72 +++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 68 insertions(+), 17 deletions(-) diff --git a/src/api.zig b/src/api.zig index d88f3ab..5475987 100644 --- a/src/api.zig +++ b/src/api.zig @@ -1683,7 +1683,8 @@ fn handleStream(ctx: *Context, run_id: []const u8, target: []const u8) HttpRespo var sse_events_json: []const u8 = "[]"; if (ctx.sse_hub) |hub| { const queue = hub.getOrCreateQueue(run_id); - const sse_events = queue.drain(ctx.allocator); + const sse_events = queue.drain(); + defer queue.freeDrained(sse_events); if (sse_events.len > 0) { var sse_buf: std.ArrayListUnmanaged(u8) = .empty; sse_buf.append(ctx.allocator, '[') catch {}; @@ -1707,10 +1708,6 @@ fn handleStream(ctx: *Context, run_id: []const u8, target: []const u8) HttpRespo } sse_buf.append(ctx.allocator, ']') catch {}; sse_events_json = 
sse_buf.toOwnedSlice(ctx.allocator) catch "[]"; - // Note: sse_events slice is allocated via ctx.allocator which is a - // per-request arena — no explicit free needed. The inner strings - // (event_type, data) are not owned by this allocator either (they - // originate from the engine's per-tick arena), so we must not free them. } } diff --git a/src/engine.zig b/src/engine.zig index 0a0a596..c701805 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -3738,7 +3738,8 @@ test "processUiMessages: broadcasts events" { ; processUiMessages(&hub, alloc, "run1", "step1", response); - const events = queue.drain(alloc); + const events = queue.drain(); + defer queue.freeDrained(events); try std.testing.expectEqual(@as(usize, 2), events.len); try std.testing.expectEqualStrings("ui_message", events[0].event_type); try std.testing.expectEqualStrings("ui_message_delete", events[1].event_type); @@ -3762,7 +3763,8 @@ test "processStreamMessages: broadcasts message events" { ; processStreamMessages(&hub, alloc, "run1", "step1", "task", response); - const events = queue.drain(alloc); + const events = queue.drain(); + defer queue.freeDrained(events); try std.testing.expectEqual(@as(usize, 2), events.len); try std.testing.expectEqualStrings("message", events[0].event_type); try std.testing.expectEqualStrings("message", events[1].event_type); diff --git a/src/sse.zig b/src/sse.zig index d3bc1bd..7e6d0bc 100644 --- a/src/sse.zig +++ b/src/sse.zig @@ -33,6 +33,11 @@ pub const RunEventQueue = struct { mutex: std.Thread.Mutex, closed: std.atomic.Value(bool), + fn freeEvent(self: *RunEventQueue, event: SseEvent) void { + self.alloc.free(event.event_type); + self.alloc.free(event.data); + } + pub fn init(alloc: Allocator) RunEventQueue { return .{ .events = .empty, @@ -43,6 +48,9 @@ pub const RunEventQueue = struct { } pub fn deinit(self: *RunEventQueue) void { + for (self.events.items) |event| { + self.freeEvent(event); + } self.events.deinit(self.alloc); } @@ -50,17 +58,38 @@ pub const 
RunEventQueue = struct { pub fn push(self: *RunEventQueue, event: SseEvent) void { self.mutex.lock(); defer self.mutex.unlock(); - self.events.append(self.alloc, event) catch {}; + + const event_type = self.alloc.dupe(u8, event.event_type) catch return; + const data = self.alloc.dupe(u8, event.data) catch { + self.alloc.free(event_type); + return; + }; + + self.events.append(self.alloc, .{ + .event_type = event_type, + .data = data, + .mode = event.mode, + }) catch { + self.alloc.free(event_type); + self.alloc.free(data); + }; } - /// Drain all events from the queue. Returns owned slice. Thread-safe. - pub fn drain(self: *RunEventQueue, alloc: Allocator) []SseEvent { + /// Drain all events from the queue. Returns a queue-allocator-owned slice. + /// The caller must release it with `freeDrained`. + pub fn drain(self: *RunEventQueue) []SseEvent { self.mutex.lock(); defer self.mutex.unlock(); if (self.events.items.len == 0) return &.{}; - const items = alloc.dupe(SseEvent, self.events.items) catch return &.{}; - self.events.clearRetainingCapacity(); - return items; + return self.events.toOwnedSlice(self.alloc) catch &.{}; + } + + pub fn freeDrained(self: *RunEventQueue, events: []SseEvent) void { + if (events.len == 0) return; + for (events) |event| { + self.freeEvent(event); + } + self.alloc.free(events); } /// Mark queue as closed (run completed/cancelled). 
@@ -143,12 +172,35 @@ test "sse hub broadcast and drain" { queue.push(.{ .event_type = "step_started", .data = "{}" }); queue.push(.{ .event_type = "step_completed", .data = "{}" }); - const events = queue.drain(alloc); - defer alloc.free(events); + const events = queue.drain(); + defer queue.freeDrained(events); try std.testing.expectEqual(@as(usize, 2), events.len); try std.testing.expectEqualStrings("step_started", events[0].event_type); } +test "sse hub queue owns event payloads beyond source arena lifetime" { + const alloc = std.testing.allocator; + var hub = SseHub.init(alloc); + defer hub.deinit(); + + const queue = hub.getOrCreateQueue("run1"); + + var arena = std.heap.ArenaAllocator.init(alloc); + const arena_alloc = arena.allocator(); + + const event_type = try arena_alloc.dupe(u8, "step.completed"); + const payload = try arena_alloc.dupe(u8, "{\"ok\":true}"); + queue.push(.{ .event_type = event_type, .data = payload }); + arena.deinit(); + + const events = queue.drain(); + defer queue.freeDrained(events); + + try std.testing.expectEqual(@as(usize, 1), events.len); + try std.testing.expectEqualStrings("step.completed", events[0].event_type); + try std.testing.expectEqualStrings("{\"ok\":true}", events[0].data); +} + test "sse hub broadcast to non-existent queue is silent" { const alloc = std.testing.allocator; var hub = SseHub.init(alloc); @@ -212,8 +264,8 @@ test "sse hub broadcast with mode" { queue.push(.{ .event_type = "task_start", .data = "{}", .mode = .tasks }); queue.push(.{ .event_type = "debug", .data = "{}", .mode = .debug }); - const events = queue.drain(alloc); - defer alloc.free(events); + const events = queue.drain(); + defer queue.freeDrained(events); try std.testing.expectEqual(@as(usize, 3), events.len); try std.testing.expectEqual(StreamMode.values, events[0].mode); try std.testing.expectEqual(StreamMode.tasks, events[1].mode); From 2a6bf9f7dddc35ccf3ac528b4ea01507861e8946 Mon Sep 17 00:00:00 2001 From: Igor Somov Date: Sat, 14 Mar 2026 
14:13:48 -0300 Subject: [PATCH 55/55] Make run stream multi-consumer --- src/api.zig | 106 ++++++++++++++++++++------- src/engine.zig | 28 +++---- src/sse.zig | 193 +++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 255 insertions(+), 72 deletions(-) diff --git a/src/api.zig b/src/api.zig index 5475987..0ce34a9 100644 --- a/src/api.zig +++ b/src/api.zig @@ -913,8 +913,8 @@ fn handleCancelRun(ctx: *Context, run_id: []const u8) HttpResponse { // 5. Insert event ctx.store.insertEvent(run_id, null, "run.cancelled", "{}") catch {}; - // 6. Close SSE queue - if (ctx.sse_hub) |hub| hub.removeQueue(run_id); + // 6. Mark SSE queue closed but keep buffered events available for late subscribers. + if (ctx.sse_hub) |hub| hub.closeQueue(run_id); // 7. Return 200 const resp = std.fmt.allocPrint(ctx.allocator, @@ -1647,6 +1647,10 @@ fn handleStream(ctx: *Context, run_id: []const u8, target: []const u8) HttpRespo // Parse requested modes from ?mode= query param const mode_param = getQueryParam(target, "mode"); + const after_seq = if (getQueryParam(target, "after_seq")) |raw| + std.fmt.parseInt(u64, raw, 10) catch 0 + else + 0; var requested_modes: [5]bool = .{ true, true, true, true, true }; // all modes by default if (mode_param) |modes_str| { // Reset all to false, then enable requested @@ -1659,37 +1663,44 @@ fn handleStream(ctx: *Context, run_id: []const u8, target: []const u8) HttpRespo } } - const events = ctx.store.getEventsByRun(ctx.allocator, run_id) catch { - return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get events\"}}"); - }; + const events_json = if (after_seq == 0) blk: { + const events = ctx.store.getEventsByRun(ctx.allocator, run_id) catch { + return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"failed to get events\"}}"); + }; - // Build events JSON array - var events_buf: std.ArrayListUnmanaged(u8) = .empty; - events_buf.append(ctx.allocator, '[') catch return jsonResponse(500, 
"{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - for (events, 0..) |ev, i| { - if (i > 0) { - events_buf.append(ctx.allocator, ',') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + // Build events JSON array + var events_buf: std.ArrayListUnmanaged(u8) = .empty; + events_buf.append(ctx.allocator, '[') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + for (events, 0..) |ev, i| { + if (i > 0) { + events_buf.append(ctx.allocator, ',') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + } + const kind_json = jsonQuoted(ctx.allocator, ev.kind) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + const entry = std.fmt.allocPrint(ctx.allocator, + \\{{"kind":{s},"data":{s},"ts_ms":{d}}} + , .{ kind_json, ev.data_json, ev.ts_ms }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + events_buf.appendSlice(ctx.allocator, entry) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); } - const kind_json = jsonQuoted(ctx.allocator, ev.kind) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - const entry = std.fmt.allocPrint(ctx.allocator, - \\{{"kind":{s},"data":{s},"ts_ms":{d}}} - , .{ kind_json, ev.data_json, ev.ts_ms }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - events_buf.appendSlice(ctx.allocator, entry) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - } - events_buf.append(ctx.allocator, ']') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); - const events_json = events_buf.toOwnedSlice(ctx.allocator) catch return jsonResponse(500, 
"{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + events_buf.append(ctx.allocator, ']') catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + break :blk events_buf.toOwnedSlice(ctx.allocator) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); + } else "[]"; - // If SSE hub available, drain queued SSE events filtered by requested modes + // If SSE hub available, snapshot queued SSE events filtered by requested modes var sse_events_json: []const u8 = "[]"; + var latest_stream_seq: u64 = 0; + var oldest_stream_seq: u64 = 0; + var stream_gap = false; if (ctx.sse_hub) |hub| { const queue = hub.getOrCreateQueue(run_id); - const sse_events = queue.drain(); - defer queue.freeDrained(sse_events); - if (sse_events.len > 0) { + const snapshot = queue.snapshotSince(ctx.allocator, after_seq); + latest_stream_seq = snapshot.latest_seq; + oldest_stream_seq = snapshot.oldest_seq; + stream_gap = snapshot.gap_detected; + if (snapshot.events.len > 0) { var sse_buf: std.ArrayListUnmanaged(u8) = .empty; sse_buf.append(ctx.allocator, '[') catch {}; var first = true; - for (sse_events) |sse_ev| { + for (snapshot.events) |sse_ev| { // Filter by requested modes if (!requested_modes[@intFromEnum(sse_ev.mode)]) continue; if (!first) { @@ -1698,8 +1709,9 @@ fn handleStream(ctx: *Context, run_id: []const u8, target: []const u8) HttpRespo first = false; const mode_str = sse_ev.mode.toString(); const sse_entry = std.fmt.allocPrint(ctx.allocator, - \\{{"event":{s},"mode":"{s}","data":{s}}} + \\{{"seq":{d},"event":{s},"mode":"{s}","data":{s}}} , .{ + sse_ev.seq, jsonQuoted(ctx.allocator, sse_ev.event_type) catch "\"\"", mode_str, sse_ev.data, @@ -1718,12 +1730,15 @@ fn handleStream(ctx: *Context, run_id: []const u8, target: []const u8) HttpRespo ""; const resp = std.fmt.allocPrint(ctx.allocator, - \\{{"status":{s}{s},"events":{s},"stream_events":{s}}} + 
\\{{"status":{s}{s},"events":{s},"stream_events":{s},"next_stream_seq":{d},"stream_oldest_seq":{d},"stream_gap":{s}}} , .{ status_json, state_field, events_json, sse_events_json, + latest_stream_seq, + oldest_stream_seq, + if (stream_gap) "true" else "false", }) catch return jsonResponse(500, "{\"error\":{\"code\":\"internal\",\"message\":\"out of memory\"}}"); return jsonResponse(200, resp); } @@ -2788,6 +2803,45 @@ test "API: stream with mode query param" { try std.testing.expect(std.mem.indexOf(u8, resp2.body, "stream_events") != null); } +test "API: stream supports independent cursors for multiple consumers" { + const allocator = std.testing.allocator; + var store = try Store.init(allocator, ":memory:"); + defer store.deinit(); + + var arena = std.heap.ArenaAllocator.init(allocator); + defer arena.deinit(); + + var hub = sse_mod.SseHub.init(allocator); + defer hub.deinit(); + + try store.createRunWithState("r1", null, "{}", "{}", "{\"x\":1}"); + try store.updateRunStatus("r1", "running", null); + + const queue = hub.getOrCreateQueue("r1"); + queue.push(.{ .event_type = "values", .data = "{\"step\":\"n1\"}", .mode = .values }); + + var ctx = Context{ + .store = &store, + .allocator = arena.allocator(), + .sse_hub = &hub, + }; + + const consumer_a = handleRequest(&ctx, "GET", "/runs/r1/stream", ""); + try std.testing.expectEqual(@as(u16, 200), consumer_a.status_code); + try std.testing.expect(std.mem.indexOf(u8, consumer_a.body, "\"seq\":1") != null); + + const consumer_b = handleRequest(&ctx, "GET", "/runs/r1/stream", ""); + try std.testing.expectEqual(@as(u16, 200), consumer_b.status_code); + try std.testing.expect(std.mem.indexOf(u8, consumer_b.body, "\"seq\":1") != null); + + queue.push(.{ .event_type = "updates", .data = "{\"step\":\"n2\"}", .mode = .updates }); + const consumer_a_next = handleRequest(&ctx, "GET", "/runs/r1/stream?after_seq=1", ""); + try std.testing.expectEqual(@as(u16, 200), consumer_a_next.status_code); + try 
std.testing.expect(std.mem.indexOf(u8, consumer_a_next.body, "\"seq\":2") != null); + try std.testing.expect(std.mem.indexOf(u8, consumer_a_next.body, "\"events\":[]") != null); + try std.testing.expect(std.mem.indexOf(u8, consumer_a_next.body, "\"next_stream_seq\":2") != null); +} + test "API: workflow routes decode percent-encoded ids" { const allocator = std.testing.allocator; var store = try Store.init(allocator, ":memory:"); diff --git a/src/engine.zig b/src/engine.zig index c701805..e2727c4 100644 --- a/src/engine.zig +++ b/src/engine.zig @@ -3738,13 +3738,13 @@ test "processUiMessages: broadcasts events" { ; processUiMessages(&hub, alloc, "run1", "step1", response); - const events = queue.drain(); - defer queue.freeDrained(events); - try std.testing.expectEqual(@as(usize, 2), events.len); - try std.testing.expectEqualStrings("ui_message", events[0].event_type); - try std.testing.expectEqualStrings("ui_message_delete", events[1].event_type); + const snapshot = queue.snapshotSince(alloc, 0); + defer queue.freeSnapshot(alloc, snapshot); + try std.testing.expectEqual(@as(usize, 2), snapshot.events.len); + try std.testing.expectEqualStrings("ui_message", snapshot.events[0].event_type); + try std.testing.expectEqualStrings("ui_message_delete", snapshot.events[1].event_type); // First event should contain step_id - try std.testing.expect(std.mem.indexOf(u8, events[0].data, "step1") != null); + try std.testing.expect(std.mem.indexOf(u8, snapshot.events[0].data, "step1") != null); } test "processStreamMessages: broadcasts message events" { @@ -3763,15 +3763,15 @@ test "processStreamMessages: broadcasts message events" { ; processStreamMessages(&hub, alloc, "run1", "step1", "task", response); - const events = queue.drain(); - defer queue.freeDrained(events); - try std.testing.expectEqual(@as(usize, 2), events.len); - try std.testing.expectEqualStrings("message", events[0].event_type); - try std.testing.expectEqualStrings("message", events[1].event_type); + const 
snapshot = queue.snapshotSince(alloc, 0); + defer queue.freeSnapshot(alloc, snapshot); + try std.testing.expectEqual(@as(usize, 2), snapshot.events.len); + try std.testing.expectEqualStrings("message", snapshot.events[0].event_type); + try std.testing.expectEqualStrings("message", snapshot.events[1].event_type); // Should contain step context - try std.testing.expect(std.mem.indexOf(u8, events[0].data, "step1") != null); - try std.testing.expect(std.mem.indexOf(u8, events[0].data, "task") != null); - try std.testing.expect(std.mem.indexOf(u8, events[1].data, "tool") != null); + try std.testing.expect(std.mem.indexOf(u8, snapshot.events[0].data, "step1") != null); + try std.testing.expect(std.mem.indexOf(u8, snapshot.events[0].data, "task") != null); + try std.testing.expect(std.mem.indexOf(u8, snapshot.events[1].data, "tool") != null); } test "applyUiMessagesToState: creates __ui_messages" { diff --git a/src/sse.zig b/src/sse.zig index 7e6d0bc..0c64344 100644 --- a/src/sse.zig +++ b/src/sse.zig @@ -21,17 +21,28 @@ pub const StreamMode = enum { }; pub const SseEvent = struct { + seq: u64 = 0, event_type: []const u8, // "state_update", "step_started", etc. data: []const u8, // JSON string mode: StreamMode = .updates, // default mode }; +pub const EventSnapshot = struct { + events: []SseEvent, + latest_seq: u64, + oldest_seq: u64, + gap_detected: bool, +}; + /// Per-run event queue. Thread-safe via mutex. 
pub const RunEventQueue = struct { events: std.ArrayListUnmanaged(SseEvent), alloc: Allocator, mutex: std.Thread.Mutex, closed: std.atomic.Value(bool), + next_seq: u64, + + const max_retained_events: usize = 2048; fn freeEvent(self: *RunEventQueue, event: SseEvent) void { self.alloc.free(event.event_type); @@ -44,6 +55,7 @@ pub const RunEventQueue = struct { .alloc = alloc, .mutex = .{}, .closed = std.atomic.Value(bool).init(false), + .next_seq = 1, }; } @@ -66,30 +78,80 @@ pub const RunEventQueue = struct { }; self.events.append(self.alloc, .{ + .seq = self.next_seq, .event_type = event_type, .data = data, .mode = event.mode, }) catch { self.alloc.free(event_type); self.alloc.free(data); + return; }; + self.next_seq += 1; + + while (self.events.items.len > max_retained_events) { + const dropped = self.events.orderedRemove(0); + self.freeEvent(dropped); + } } - /// Drain all events from the queue. Returns a queue-allocator-owned slice. - /// The caller must release it with `freeDrained`. 
- pub fn drain(self: *RunEventQueue) []SseEvent { + pub fn snapshotSince(self: *RunEventQueue, alloc: Allocator, after_seq: u64) EventSnapshot { self.mutex.lock(); defer self.mutex.unlock(); - if (self.events.items.len == 0) return &.{}; - return self.events.toOwnedSlice(self.alloc) catch &.{}; + + const latest_seq = self.next_seq -| 1; + const oldest_seq = if (self.events.items.len > 0) self.events.items[0].seq else latest_seq; + const gap_detected = after_seq > 0 and self.events.items.len > 0 and after_seq < self.events.items[0].seq and self.events.items[0].seq - after_seq > 1; + + var snapshot_events: std.ArrayListUnmanaged(SseEvent) = .empty; + for (self.events.items) |event| { + if (event.seq <= after_seq) continue; + + const event_type = alloc.dupe(u8, event.event_type) catch continue; + const data = alloc.dupe(u8, event.data) catch { + alloc.free(event_type); + continue; + }; + + snapshot_events.append(alloc, .{ + .seq = event.seq, + .event_type = event_type, + .data = data, + .mode = event.mode, + }) catch { + alloc.free(event_type); + alloc.free(data); + }; + } + + const events = snapshot_events.toOwnedSlice(alloc) catch { + for (snapshot_events.items) |event| { + alloc.free(event.event_type); + alloc.free(event.data); + } + snapshot_events.deinit(alloc); + return .{ + .events = &.{}, + .latest_seq = latest_seq, + .oldest_seq = oldest_seq, + .gap_detected = gap_detected, + }; + }; + + return .{ + .events = events, + .latest_seq = latest_seq, + .oldest_seq = oldest_seq, + .gap_detected = gap_detected, + }; } - pub fn freeDrained(self: *RunEventQueue, events: []SseEvent) void { - if (events.len == 0) return; - for (events) |event| { - self.freeEvent(event); + pub fn freeSnapshot(_: *RunEventQueue, alloc: Allocator, snapshot: EventSnapshot) void { + for (snapshot.events) |event| { + alloc.free(event.event_type); + alloc.free(event.data); } - self.alloc.free(events); + if (snapshot.events.len > 0) alloc.free(snapshot.events); } /// Mark queue as closed (run 
completed/cancelled). @@ -138,14 +200,36 @@ pub const SseHub = struct { return queue; } - /// Broadcast event to a run's queue. + /// Broadcast event to a run's queue. Creates the queue on first write so + /// late subscribers can still read recent buffered events. pub fn broadcast(self: *SseHub, run_id: []const u8, event: SseEvent) void { + self.mutex.lock(); + defer self.mutex.unlock(); + const queue = if (self.queues.get(run_id)) |existing| + existing + else blk: { + const created = self.alloc.create(RunEventQueue) catch return; + created.* = RunEventQueue.init(self.alloc); + const id_copy = self.alloc.dupe(u8, run_id) catch { + self.alloc.destroy(created); + return; + }; + self.queues.put(id_copy, created) catch { + self.alloc.free(id_copy); + self.alloc.destroy(created); + return; + }; + break :blk created; + }; + queue.push(event); + } + + pub fn closeQueue(self: *SseHub, run_id: []const u8) void { self.mutex.lock(); defer self.mutex.unlock(); if (self.queues.get(run_id)) |queue| { - queue.push(event); + queue.close(); } - // If no queue exists, event is silently dropped (no listeners) } /// Close and remove queue when run completes. 
@@ -163,7 +247,7 @@ pub const SseHub = struct { // ── Tests ───────────────────────────────────────────────────────────── -test "sse hub broadcast and drain" { +test "sse hub snapshotSince supports multiple consumers" { const alloc = std.testing.allocator; var hub = SseHub.init(alloc); defer hub.deinit(); @@ -172,10 +256,15 @@ test "sse hub broadcast and drain" { queue.push(.{ .event_type = "step_started", .data = "{}" }); queue.push(.{ .event_type = "step_completed", .data = "{}" }); - const events = queue.drain(); - defer queue.freeDrained(events); - try std.testing.expectEqual(@as(usize, 2), events.len); - try std.testing.expectEqualStrings("step_started", events[0].event_type); + const first = queue.snapshotSince(alloc, 0); + defer queue.freeSnapshot(alloc, first); + const second = queue.snapshotSince(alloc, 0); + defer queue.freeSnapshot(alloc, second); + + try std.testing.expectEqual(@as(usize, 2), first.events.len); + try std.testing.expectEqual(@as(usize, 2), second.events.len); + try std.testing.expectEqualStrings("step_started", first.events[0].event_type); + try std.testing.expectEqualStrings("step_started", second.events[0].event_type); } test "sse hub queue owns event payloads beyond source arena lifetime" { @@ -193,21 +282,27 @@ test "sse hub queue owns event payloads beyond source arena lifetime" { queue.push(.{ .event_type = event_type, .data = payload }); arena.deinit(); - const events = queue.drain(); - defer queue.freeDrained(events); + const snapshot = queue.snapshotSince(alloc, 0); + defer queue.freeSnapshot(alloc, snapshot); - try std.testing.expectEqual(@as(usize, 1), events.len); - try std.testing.expectEqualStrings("step.completed", events[0].event_type); - try std.testing.expectEqualStrings("{\"ok\":true}", events[0].data); + try std.testing.expectEqual(@as(usize, 1), snapshot.events.len); + try std.testing.expectEqualStrings("step.completed", snapshot.events[0].event_type); + try std.testing.expectEqualStrings("{\"ok\":true}", 
snapshot.events[0].data); } -test "sse hub broadcast to non-existent queue is silent" { +test "sse hub broadcast creates queue for late subscribers" { const alloc = std.testing.allocator; var hub = SseHub.init(alloc); defer hub.deinit(); - // Should not crash - hub.broadcast("nonexistent", .{ .event_type = "test", .data = "{}" }); + hub.broadcast("run1", .{ .event_type = "test", .data = "{}" }); + + const queue = hub.getOrCreateQueue("run1"); + const snapshot = queue.snapshotSince(alloc, 0); + defer queue.freeSnapshot(alloc, snapshot); + + try std.testing.expectEqual(@as(usize, 1), snapshot.events.len); + try std.testing.expectEqualStrings("test", snapshot.events[0].event_type); } test "sse hub remove queue" { @@ -221,6 +316,22 @@ test "sse hub remove queue" { try std.testing.expectEqual(@as(usize, 0), hub.queues.count()); } +test "sse hub closeQueue preserves buffered events" { + const alloc = std.testing.allocator; + var hub = SseHub.init(alloc); + defer hub.deinit(); + + hub.broadcast("run1", .{ .event_type = "values", .data = "{}" }); + hub.closeQueue("run1"); + + const queue = hub.getOrCreateQueue("run1"); + try std.testing.expect(queue.isClosed()); + + const snapshot = queue.snapshotSince(alloc, 0); + defer queue.freeSnapshot(alloc, snapshot); + try std.testing.expectEqual(@as(usize, 1), snapshot.events.len); +} + test "sse queue close" { const alloc = std.testing.allocator; var queue = RunEventQueue.init(alloc); @@ -264,10 +375,28 @@ test "sse hub broadcast with mode" { queue.push(.{ .event_type = "task_start", .data = "{}", .mode = .tasks }); queue.push(.{ .event_type = "debug", .data = "{}", .mode = .debug }); - const events = queue.drain(); - defer queue.freeDrained(events); - try std.testing.expectEqual(@as(usize, 3), events.len); - try std.testing.expectEqual(StreamMode.values, events[0].mode); - try std.testing.expectEqual(StreamMode.tasks, events[1].mode); - try std.testing.expectEqual(StreamMode.debug, events[2].mode); + const snapshot = 
queue.snapshotSince(alloc, 0); + defer queue.freeSnapshot(alloc, snapshot); + try std.testing.expectEqual(@as(usize, 3), snapshot.events.len); + try std.testing.expectEqual(StreamMode.values, snapshot.events[0].mode); + try std.testing.expectEqual(StreamMode.tasks, snapshot.events[1].mode); + try std.testing.expectEqual(StreamMode.debug, snapshot.events[2].mode); +} + +test "sse hub snapshotSince returns only events after cursor" { + const alloc = std.testing.allocator; + var hub = SseHub.init(alloc); + defer hub.deinit(); + + const queue = hub.getOrCreateQueue("run1"); + queue.push(.{ .event_type = "one", .data = "{}" }); + queue.push(.{ .event_type = "two", .data = "{}" }); + queue.push(.{ .event_type = "three", .data = "{}" }); + + const snapshot = queue.snapshotSince(alloc, 2); + defer queue.freeSnapshot(alloc, snapshot); + + try std.testing.expectEqual(@as(usize, 1), snapshot.events.len); + try std.testing.expectEqual(@as(u64, 3), snapshot.events[0].seq); + try std.testing.expectEqualStrings("three", snapshot.events[0].event_type); }