diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/DatabaseResilienceTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/DatabaseResilienceTests.cs
new file mode 100644
index 00000000..aa722b4c
--- /dev/null
+++ b/backend/tests/Taskdeck.Api.Tests/Resilience/DatabaseResilienceTests.cs
@@ -0,0 +1,119 @@
+using System.Net;
+using System.Net.Http.Json;
+using System.Text.Json;
+using FluentAssertions;
+using Taskdeck.Api.Tests.Support;
+using Taskdeck.Application.DTOs;
+using Xunit;
+
+namespace Taskdeck.Api.Tests.Resilience;
+
+/// <summary>
+/// Tests that database operations produce appropriate error responses and
+/// that health endpoints report database status accurately.
+/// </summary>
+public class DatabaseResilienceTests : IClassFixture<TestWebApplicationFactory>
+{
+    private readonly HttpClient _client;
+
+    public DatabaseResilienceTests(TestWebApplicationFactory factory)
+    {
+        _client = factory.CreateClient();
+    }
+
+    // ── Health Endpoint Reports Database Status ───────────────────────
+
+    [Fact]
+    public async Task ReadyCheck_IncludesDatabaseCheck_WhenDatabaseIsReachable()
+    {
+        var response = await _client.GetAsync("/health/ready");
+
+        // With a working DB, ready check may be OK or 503 depending on worker state,
+        // but the database check itself should be Healthy.
+        var payload = await response.Content.ReadFromJsonAsync<JsonElement>();
+        payload.TryGetProperty("checks", out var checks).Should().BeTrue();
+
+        var database = checks.GetProperty("database");
+        database.GetProperty("status").GetString().Should().Be("Healthy",
+            "database check should report Healthy when database is reachable");
+    }
+
+    [Fact]
+    public async Task LiveCheck_AlwaysReturnsHealthy_RegardlessOfDatabaseState()
+    {
+        var response = await _client.GetAsync("/health/live");
+
+        response.StatusCode.Should().Be(HttpStatusCode.OK,
+            "liveness probe should always return 200");
+
+        var payload = await response.Content.ReadFromJsonAsync<JsonElement>();
+        payload.GetProperty("status").GetString().Should().Be("Healthy",
+            "live check is a simple heartbeat, independent of database state");
+    }
+
+    // ── Database Error Handling in API Operations ──────────────────────
+
+    [Fact]
+    public async Task Operations_OnNonExistentResource_ReturnNotFoundInsteadOfCrash()
+    {
+        await ApiTestHarness.AuthenticateAsync(_client, "db-resilience-notfound");
+
+        // Accessing a non-existent board should return 404, not 500.
+        var response = await _client.GetAsync($"/api/boards/{Guid.NewGuid()}");
+        response.StatusCode.Should().Be(HttpStatusCode.NotFound,
+            "non-existent resource should return 404, not a database crash");
+
+        var body = await response.Content.ReadFromJsonAsync<JsonElement>();
+        body.TryGetProperty("errorCode", out var errorCode).Should().BeTrue(
+            "404 response should follow error contract");
+        errorCode.GetString().Should().Be("NotFound");
+    }
+
+    [Fact]
+    public async Task ConcurrentWrites_HandleConflictsGracefully()
+    {
+        await ApiTestHarness.AuthenticateAsync(_client, "db-resilience-conflict");
+
+        // Create a board first.
+        var board = await ApiTestHarness.CreateBoardAsync(_client, "db-conflict-board");
+
+        // Try to delete the same board twice in quick succession.
+        var delete1 = _client.DeleteAsync($"/api/boards/{board.Id}");
+        var delete2 = _client.DeleteAsync($"/api/boards/{board.Id}");
+
+        var results = await Task.WhenAll(delete1, delete2);
+
+        // One should succeed (204/200), the other should get 404.
+        // Neither should be 500.
+        foreach (var result in results)
+        {
+            var statusCode = (int)result.StatusCode;
+            statusCode.Should().NotBe(500,
+                "concurrent operations should not cause unhandled 500 errors");
+        }
+
+        var statusCodes = results.Select(r => (int)r.StatusCode).OrderBy(s => s).ToArray();
+        statusCodes.Should().Contain(s => s >= 200 && s < 300,
+            "at least one delete should succeed");
+    }
+
+    // ── Database Write Validation ─────────────────────────────────────
+
+    [Fact]
+    public async Task CreateBoard_WithInvalidData_ReturnsValidationError()
+    {
+        await ApiTestHarness.AuthenticateAsync(_client, "db-resilience-validation");
+
+        // An empty board name should return a validation error, not a DB crash.
+        var response = await _client.PostAsJsonAsync(
+            "/api/boards",
+            new CreateBoardDto("", "Empty name board"));
+
+        response.StatusCode.Should().Be(HttpStatusCode.BadRequest,
+            "invalid data should return 400, not a database crash");
+
+        var body = await response.Content.ReadFromJsonAsync<JsonElement>();
+        body.TryGetProperty("errorCode", out _).Should().BeTrue(
+            "400 response should follow the error contract");
+    }
+}
diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/ExternalServiceFailureTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/ExternalServiceFailureTests.cs
new file mode 100644
index 00000000..ea04df7c
--- /dev/null
+++ b/backend/tests/Taskdeck.Api.Tests/Resilience/ExternalServiceFailureTests.cs
@@ -0,0 +1,114 @@
+using System.Net;
+using System.Net.Http.Json;
+using System.Text.Json;
+using FluentAssertions;
+using Taskdeck.Api.Tests.Support;
+using Taskdeck.Application.DTOs;
+using Xunit;
+
+namespace Taskdeck.Api.Tests.Resilience;
+
+/// <summary>
+/// Tests that external service failures (GitHub OAuth, etc.) produce appropriate
+/// error responses while keeping local functionality working.
+/// </summary>
+public class ExternalServiceFailureTests : IClassFixture<TestWebApplicationFactory>
+{
+    private readonly HttpClient _client;
+
+    public ExternalServiceFailureTests(TestWebApplicationFactory factory)
+    {
+        _client = factory.CreateClient();
+    }
+
+    // ── Local Auth Still Works When External Auth Is Unavailable ───────
+
+    [Fact]
+    public async Task LocalRegistration_ShouldWork_RegardlessOfExternalOAuthState()
+    {
+        // Local auth (register + login) should not depend on any external service.
+        var suffix = Guid.NewGuid().ToString("N")[..8];
+        var response = await _client.PostAsJsonAsync(
+            "/api/auth/register",
+            new CreateUserDto($"ext-resilience-{suffix}", $"ext-resilience-{suffix}@example.com", "password123"));
+
+        response.StatusCode.Should().Be(HttpStatusCode.OK,
+            "local registration should succeed regardless of external service state");
+
+        // NOTE(review): response type assumed to be AuthResponseDto (exposes Token) — confirm DTO name.
+        var payload = await response.Content.ReadFromJsonAsync<AuthResponseDto>();
+        payload.Should().NotBeNull();
+        payload!.Token.Should().NotBeNullOrWhiteSpace(
+            "local auth should issue a token without relying on external services");
+    }
+
+    [Fact]
+    public async Task LocalLogin_ShouldWork_RegardlessOfExternalOAuthState()
+    {
+        // Register first.
+        var suffix = Guid.NewGuid().ToString("N")[..8];
+        var username = $"ext-login-{suffix}";
+        var registerResponse = await _client.PostAsJsonAsync(
+            "/api/auth/register",
+            new CreateUserDto(username, $"ext-login-{suffix}@example.com", "password123"));
+        registerResponse.StatusCode.Should().Be(HttpStatusCode.OK);
+
+        // Login should work via local path regardless of external service availability.
+        var loginResponse = await _client.PostAsJsonAsync(
+            "/api/auth/login",
+            new LoginDto(username, "password123"));
+
+        loginResponse.StatusCode.Should().Be(HttpStatusCode.OK,
+            "local login should succeed regardless of external service state");
+
+        var loginPayload = await loginResponse.Content.ReadFromJsonAsync<AuthResponseDto>();
+        loginPayload.Should().NotBeNull();
+        loginPayload!.Token.Should().NotBeNullOrWhiteSpace();
+    }
+
+    // ── Invalid External Auth Callback → Appropriate Error ────────────
+
+    [Fact]
+    public async Task GithubCallback_WhenGithubNotConfigured_ReturnsNotFound()
+    {
+        // When GitHub OAuth is not configured, the callback should return
+        // a clean 404 error rather than crashing.
+        var response = await _client.GetAsync("/api/auth/github/callback");
+
+        response.StatusCode.Should().Be(HttpStatusCode.NotFound,
+            "GitHub callback should return 404 when OAuth is not configured");
+
+        var body = await response.Content.ReadFromJsonAsync<JsonElement>();
+        body.TryGetProperty("errorCode", out var errorCode).Should().BeTrue(
+            "404 response should follow the error contract");
+        errorCode.GetString().Should().Be("NotFound");
+    }
+
+    [Fact]
+    public async Task GithubLogin_WhenGithubNotConfigured_ReturnsNotFound()
+    {
+        // The GitHub login initiation endpoint should also return 404 when not configured.
+        var response = await _client.GetAsync("/api/auth/github/login");
+
+        response.StatusCode.Should().Be(HttpStatusCode.NotFound,
+            "GitHub login should return 404 when OAuth is not configured");
+    }
+
+    // ── API Endpoints Return Proper Error Codes on Invalid Input ──────
+
+    [Fact]
+    public async Task ApiEndpoints_ReturnProperErrorCodes_WhenUnauthenticated()
+    {
+        // Without auth, protected endpoints should return 401, not 500.
+        var boardsResponse = await _client.GetAsync("/api/boards");
+        boardsResponse.StatusCode.Should().Be(HttpStatusCode.Unauthorized,
+            "unauthenticated request to boards should get 401, not 500");
+
+        var captureResponse = await _client.GetAsync("/api/capture/items");
+        captureResponse.StatusCode.Should().Be(HttpStatusCode.Unauthorized,
+            "unauthenticated request to capture should get 401, not 500");
+
+        var chatResponse = await _client.GetAsync("/api/llm/chat/sessions");
+        chatResponse.StatusCode.Should().Be(HttpStatusCode.Unauthorized,
+            "unauthenticated request to chat sessions should get 401, not 500");
+    }
+}
diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/LlmProviderDegradationTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/LlmProviderDegradationTests.cs
new file mode 100644
index 00000000..445803d3
--- /dev/null
+++ b/backend/tests/Taskdeck.Api.Tests/Resilience/LlmProviderDegradationTests.cs
@@ -0,0 +1,313 @@
+using System.Net;
+using System.Net.Http.Json;
+using System.Runtime.CompilerServices;
+using System.Text.Json;
+using FluentAssertions;
+using Microsoft.AspNetCore.Hosting;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.DependencyInjection.Extensions;
+using Taskdeck.Api.Tests.Support;
+using Taskdeck.Application.DTOs;
+using Taskdeck.Application.Services;
+using Xunit;
+
+namespace Taskdeck.Api.Tests.Resilience;
+
+/// <summary>
+/// Tests that LLM provider failures (timeout, invalid response, total unavailability)
+/// are surfaced as degraded responses rather than 500 errors or infinite waits.
+/// </summary>
+public class LlmProviderDegradationTests : IClassFixture<TestWebApplicationFactory>
+{
+    private readonly TestWebApplicationFactory _baseFactory;
+
+    public LlmProviderDegradationTests(TestWebApplicationFactory baseFactory)
+    {
+        _baseFactory = baseFactory;
+    }
+
+    // ── Provider Timeout ───────────────────────────────────────────────
+
+    [Fact]
+    public async Task SendMessage_WhenProviderTimesOut_ReturnsDegradedResponseNotInfiniteWait()
+    {
+        using var factory = _baseFactory.WithWebHostBuilder(builder =>
+        {
+            builder.UseEnvironment("Development");
+            builder.ConfigureServices(services =>
+            {
+                services.RemoveAll<ILlmProvider>();
+                services.AddScoped<ILlmProvider>(_ => new TimeoutProviderStub());
+            });
+        });
+        using var client = factory.CreateClient();
+        client.Timeout = TimeSpan.FromSeconds(30);
+
+        await ApiTestHarness.AuthenticateAsync(client, "llm-timeout-resilience");
+
+        var createSessionResponse = await client.PostAsJsonAsync(
+            "/api/llm/chat/sessions",
+            new CreateChatSessionDto("Timeout provider test"));
+        createSessionResponse.StatusCode.Should().Be(HttpStatusCode.Created);
+        // NOTE(review): session payload type assumed to be ChatSessionDto (exposes Id) — confirm DTO name.
+        var session = await createSessionResponse.Content.ReadFromJsonAsync<ChatSessionDto>();
+        session.Should().NotBeNull();
+
+        var sendMessageResponse = await client.PostAsJsonAsync(
+            $"/api/llm/chat/sessions/{session!.Id}/messages",
+            new SendChatMessageDto("tell me something"));
+
+        // The request should not hang forever; it should return within the test timeout.
+        // The response may be degraded or an error -- the key assertion is no infinite wait.
+        sendMessageResponse.Should().NotBeNull(
+            "request should complete even when provider times out");
+
+        // Since the provider throws OperationCanceledException simulating timeout,
+        // the chat service should handle this and return a 500 error contract
+        // rather than an unhandled exception.
+        var statusCode = (int)sendMessageResponse.StatusCode;
+        statusCode.Should().BeOneOf(new[] { 200, 500 },
+            "should either return a degraded response or an error contract, not hang");
+    }
+
+    // ── Provider Throws Exception ──────────────────────────────────────
+
+    [Fact]
+    public async Task SendMessage_WhenProviderThrowsException_ReturnsErrorContract()
+    {
+        using var factory = _baseFactory.WithWebHostBuilder(builder =>
+        {
+            builder.UseEnvironment("Development");
+            builder.ConfigureServices(services =>
+            {
+                services.RemoveAll<ILlmProvider>();
+                services.AddScoped<ILlmProvider>(_ => new ThrowingProviderStub());
+            });
+        });
+        using var client = factory.CreateClient();
+
+        await ApiTestHarness.AuthenticateAsync(client, "llm-throw-resilience");
+
+        var createSessionResponse = await client.PostAsJsonAsync(
+            "/api/llm/chat/sessions",
+            new CreateChatSessionDto("Throwing provider test"));
+        createSessionResponse.StatusCode.Should().Be(HttpStatusCode.Created);
+        var session = await createSessionResponse.Content.ReadFromJsonAsync<ChatSessionDto>();
+        session.Should().NotBeNull();
+
+        var sendMessageResponse = await client.PostAsJsonAsync(
+            $"/api/llm/chat/sessions/{session!.Id}/messages",
+            new SendChatMessageDto("create card 'Test'"));
+
+        sendMessageResponse.Should().NotBeNull();
+        var statusCode = (int)sendMessageResponse.StatusCode;
+        statusCode.Should().BeOneOf(new[] { 200, 500 },
+            "should return an error contract or degraded response, not crash");
+
+        if (sendMessageResponse.StatusCode == HttpStatusCode.InternalServerError)
+        {
+            var body = await sendMessageResponse.Content.ReadFromJsonAsync<JsonElement>();
+            body.TryGetProperty("errorCode", out _).Should().BeTrue(
+                "500 response should follow error contract with errorCode");
+            body.TryGetProperty("message", out _).Should().BeTrue(
+                "500 response should follow error contract with message");
+        }
+    }
+
+    // ── Provider Unavailable but Non-LLM Features Still Work ──────────
+
+    [Fact]
+    public async Task BoardCrud_StillWorks_WhenAllProvidersUnavailable()
+    {
+        using var factory = _baseFactory.WithWebHostBuilder(builder =>
+        {
+            builder.UseEnvironment("Development");
+            builder.ConfigureServices(services =>
+            {
+                services.RemoveAll<ILlmProvider>();
+                services.AddScoped<ILlmProvider>(_ => new TotallyDeadProviderStub());
+            });
+        });
+        using var client = factory.CreateClient();
+
+        await ApiTestHarness.AuthenticateAsync(client, "llm-dead-board-crud");
+
+        // Board CRUD should work regardless of LLM provider state.
+        var board = await ApiTestHarness.CreateBoardAsync(client, "resilience-board");
+        board.Should().NotBeNull();
+        board.Name.Should().StartWith("resilience-board");
+
+        var getResponse = await client.GetAsync($"/api/boards/{board.Id}");
+        getResponse.StatusCode.Should().Be(HttpStatusCode.OK);
+
+        var listResponse = await client.GetAsync("/api/boards");
+        listResponse.StatusCode.Should().Be(HttpStatusCode.OK);
+    }
+
+    [Fact]
+    public async Task CaptureItems_StillWork_WhenProviderUnavailable()
+    {
+        using var factory = _baseFactory.WithWebHostBuilder(builder =>
+        {
+            builder.UseEnvironment("Development");
+            builder.ConfigureServices(services =>
+            {
+                services.RemoveAll<ILlmProvider>();
+                services.AddScoped<ILlmProvider>(_ => new TotallyDeadProviderStub());
+            });
+        });
+        using var client = factory.CreateClient();
+
+        await ApiTestHarness.AuthenticateAsync(client, "llm-dead-capture");
+
+        // Capture should still accept items even when the LLM is dead.
+        // The items queue up for later processing.
+        var captureResponse = await client.PostAsJsonAsync(
+            "/api/capture/items",
+            new CreateCaptureItemDto(null, "capture while LLM is down"));
+        captureResponse.StatusCode.Should().Be(HttpStatusCode.Created,
+            "capture should accept items even when LLM provider is unavailable");
+    }
+
+    // ── Provider Health Reports Unhealthy ──────────────────────────────
+
+    [Fact]
+    public async Task ProviderHealth_ReportsUnhealthy_WhenProviderIsDown()
+    {
+        using var factory = _baseFactory.WithWebHostBuilder(builder =>
+        {
+            builder.UseEnvironment("Development");
+            builder.ConfigureServices(services =>
+            {
+                services.RemoveAll<ILlmProvider>();
+                services.AddScoped<ILlmProvider>(_ => new TotallyDeadProviderStub());
+            });
+        });
+        using var client = factory.CreateClient();
+
+        await ApiTestHarness.AuthenticateAsync(client, "llm-dead-health");
+
+        var response = await client.GetAsync("/api/llm/chat/health");
+        response.StatusCode.Should().Be(HttpStatusCode.OK);
+
+        // NOTE(review): health payload assumed to deserialize as LlmHealthStatus — confirm the endpoint's DTO.
+        var payload = await response.Content.ReadFromJsonAsync<LlmHealthStatus>();
+        payload.Should().NotBeNull();
+        payload!.IsAvailable.Should().BeFalse(
+            "health check should report the provider as unavailable");
+        payload.ErrorMessage.Should().NotBeNullOrWhiteSpace(
+            "health check should include an error explanation");
+    }
+
+    [Fact]
+    public async Task ProviderHealth_WithProbe_ReportsUnhealthy_WhenProviderIsDown()
+    {
+        using var factory = _baseFactory.WithWebHostBuilder(builder =>
+        {
+            builder.UseEnvironment("Development");
+            builder.ConfigureServices(services =>
+            {
+                services.RemoveAll<ILlmProvider>();
+                services.AddScoped<ILlmProvider>(_ => new TotallyDeadProviderStub());
+            });
+        });
+        using var client = factory.CreateClient();
+
+        await ApiTestHarness.AuthenticateAsync(client, "llm-dead-probe");
+
+        var response = await client.GetAsync("/api/llm/chat/health?probe=true");
+        response.StatusCode.Should().Be(HttpStatusCode.OK);
+
+        var payload = await response.Content.ReadFromJsonAsync<LlmHealthStatus>();
+        payload.Should().NotBeNull();
+        payload!.IsAvailable.Should().BeFalse();
+        payload.IsProbed.Should().BeTrue();
+    }
+
+    // ── Stub Implementations ──────────────────────────────────────────
+
+    /// <summary>
+    /// Provider that simulates a timeout by delaying beyond cancellation.
+    /// </summary>
+    private sealed class TimeoutProviderStub : ILlmProvider
+    {
+        // NOTE(review): completion result type assumed to be ChatCompletionResponse
+        // (ILlmProvider's counterpart to ChatCompletionRequest) — confirm interface signature.
+        public async Task<ChatCompletionResponse> CompleteAsync(ChatCompletionRequest request, CancellationToken ct = default)
+        {
+            // Simulate a long wait that would be cancelled by the service's timeout.
+            using var internalCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
+            internalCts.CancelAfter(TimeSpan.FromMilliseconds(50));
+            await Task.Delay(TimeSpan.FromSeconds(60), internalCts.Token);
+            throw new InvalidOperationException("Should not reach here");
+        }
+
+        public async IAsyncEnumerable<LlmTokenEvent> StreamAsync(
+            ChatCompletionRequest request,
+            [EnumeratorCancellation] CancellationToken ct = default)
+        {
+            // Use a short internal timeout to avoid hanging for 60 seconds if a test
+            // hits the streaming endpoint. Cancels quickly like CompleteAsync does.
+            using var internalCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
+            internalCts.CancelAfter(TimeSpan.FromMilliseconds(50));
+            await Task.Delay(TimeSpan.FromSeconds(60), internalCts.Token);
+            yield return new LlmTokenEvent("timeout", true);
+        }
+
+        public Task<LlmHealthStatus> GetHealthAsync(CancellationToken ct = default)
+            => Task.FromResult(new LlmHealthStatus(false, "TimeoutStub", "Provider timed out"));
+
+        public Task<LlmHealthStatus> ProbeAsync(CancellationToken ct = default)
+            => Task.FromResult(new LlmHealthStatus(false, "TimeoutStub", "Provider timed out", IsProbed: true));
+    }
+
+    /// <summary>
+    /// Provider that throws an unhandled exception on every call.
+    /// </summary>
+    private sealed class ThrowingProviderStub : ILlmProvider
+    {
+        public Task<ChatCompletionResponse> CompleteAsync(ChatCompletionRequest request, CancellationToken ct = default)
+            => throw new InvalidOperationException("Simulated provider crash");
+
+        public async IAsyncEnumerable<LlmTokenEvent> StreamAsync(
+            ChatCompletionRequest request,
+            [EnumeratorCancellation] CancellationToken ct = default)
+        {
+            await Task.CompletedTask;
+            ThrowStreamCrash();
+            yield break;
+        }
+
+        public Task<LlmHealthStatus> GetHealthAsync(CancellationToken ct = default)
+            => Task.FromResult(new LlmHealthStatus(false, "ThrowingStub", "Provider threw exception"));
+
+        public Task<LlmHealthStatus> ProbeAsync(CancellationToken ct = default)
+            => Task.FromResult(new LlmHealthStatus(false, "ThrowingStub", "Provider threw exception", IsProbed: true));
+
+        private static void ThrowStreamCrash()
+            => throw new InvalidOperationException("Simulated stream crash");
+    }
+
+    /// <summary>
+    /// Provider where everything reports unavailable.
+    /// </summary>
+    private sealed class TotallyDeadProviderStub : ILlmProvider
+    {
+        public Task<ChatCompletionResponse> CompleteAsync(ChatCompletionRequest request, CancellationToken ct = default)
+            => throw new InvalidOperationException("All providers are down");
+
+        public async IAsyncEnumerable<LlmTokenEvent> StreamAsync(
+            ChatCompletionRequest request,
+            [EnumeratorCancellation] CancellationToken ct = default)
+        {
+            await Task.CompletedTask;
+            ThrowProvidersDown();
+            yield break;
+        }
+
+        public Task<LlmHealthStatus> GetHealthAsync(CancellationToken ct = default)
+            => Task.FromResult(new LlmHealthStatus(false, "Dead", "All providers are unavailable"));
+
+        public Task<LlmHealthStatus> ProbeAsync(CancellationToken ct = default)
+            => Task.FromResult(new LlmHealthStatus(false, "Dead", "All providers are unavailable", IsProbed: true));
+
+        private static void ThrowProvidersDown()
+            => throw new InvalidOperationException("All providers are down");
+    }
+}
diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/SignalRDegradationTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/SignalRDegradationTests.cs
new file mode 100644
index 00000000..ee4eb0db
--- /dev/null
+++ b/backend/tests/Taskdeck.Api.Tests/Resilience/SignalRDegradationTests.cs
@@ -0,0 +1,171 @@
+using System.Net.Http.Json;
+using FluentAssertions;
+using Microsoft.AspNetCore.SignalR;
+using Microsoft.AspNetCore.SignalR.Client;
+using Taskdeck.Api.Realtime;
+using Taskdeck.Api.Tests.Support;
+using Taskdeck.Application.DTOs;
+using Taskdeck.Domain.Enums;
+using Xunit;
+
+namespace Taskdeck.Api.Tests.Resilience;
+
+/// <summary>
+/// Tests that SignalR hub failures on one connection do not cascade to other
+/// connected clients, and that invalid operations produce HubException rather
+/// than killing the connection.
+/// </summary>
+public class SignalRDegradationTests : IClassFixture<TestWebApplicationFactory>
+{
+    private readonly TestWebApplicationFactory _factory;
+
+    public SignalRDegradationTests(TestWebApplicationFactory factory)
+    {
+        _factory = factory;
+    }
+
+    // ── Hub Exception Isolation ────────────────────────────────────────
+
+    [Fact]
+    public async Task JoinBoard_WithInvalidBoardId_ThrowsHubExceptionButConnectionSurvives()
+    {
+        using var client = _factory.CreateClient();
+        var user = await ApiTestHarness.AuthenticateAsync(client, "hub-resilience-bad-board");
+
+        await using var connection = SignalRTestHelper.CreateBoardsHubConnection(_factory, user.Token);
+        await connection.StartAsync();
+        connection.State.Should().Be(HubConnectionState.Connected);
+
+        // Try to join a non-existent board — should throw HubException.
+        var act = () => connection.InvokeAsync("JoinBoard", Guid.NewGuid());
+        await act.Should().ThrowAsync<HubException>(
+            "joining a non-existent board should throw a HubException");
+
+        // Connection should still be alive after the error.
+        connection.State.Should().Be(HubConnectionState.Connected,
+            "one failed hub invocation should not kill the connection");
+    }
+
+    [Fact]
+    public async Task SetEditingCard_WithoutJoining_ThrowsHubExceptionButConnectionSurvives()
+    {
+        using var client = _factory.CreateClient();
+        var user = await ApiTestHarness.AuthenticateAsync(client, "hub-resilience-no-join");
+        var board = await ApiTestHarness.CreateBoardAsync(client, "hub-resilience-board");
+
+        await using var connection = SignalRTestHelper.CreateBoardsHubConnection(_factory, user.Token);
+        await connection.StartAsync();
+
+        // Try editing a card without joining the board first.
+        var act = () => connection.InvokeAsync("SetEditingCard", board.Id, Guid.NewGuid());
+        await act.Should().ThrowAsync<HubException>(
+            "setting editing card without joining should throw a HubException");
+
+        // Connection should still be connected.
+        connection.State.Should().Be(HubConnectionState.Connected,
+            "hub error should not disconnect the client");
+    }
+
+    // ── One Client's Error Doesn't Affect Others ──────────────────────
+
+    [Fact]
+    public async Task ErrorOnOneClient_DoesNotDisconnectOtherClients()
+    {
+        using var client1 = _factory.CreateClient();
+        using var client2 = _factory.CreateClient();
+
+        var user1 = await ApiTestHarness.AuthenticateAsync(client1, "hub-resilience-user1");
+        var user2 = await ApiTestHarness.AuthenticateAsync(client2, "hub-resilience-user2");
+
+        var board = await ApiTestHarness.CreateBoardAsync(client1, "hub-resilience-multi");
+
+        // Share the board with user2.
+        await client1.PostAsJsonAsync(
+            $"/api/boards/{board.Id}/access",
+            new GrantAccessDto(board.Id, user2.UserId, UserRole.Editor));
+
+        // NOTE(review): presence payload type assumed to be BoardPresenceDto — confirm against the hub contract.
+        var presenceCollector = new EventCollector<BoardPresenceDto>();
+
+        await using var connection1 = SignalRTestHelper.CreateBoardsHubConnection(_factory, user1.Token);
+        await using var connection2 = SignalRTestHelper.CreateBoardsHubConnection(_factory, user2.Token);
+
+        connection1.On<BoardPresenceDto>("boardPresence", snapshot => presenceCollector.Add(snapshot));
+
+        await connection1.StartAsync();
+        await connection2.StartAsync();
+
+        // Both users join the board.
+        await connection1.InvokeAsync("JoinBoard", board.Id);
+        await connection2.InvokeAsync("JoinBoard", board.Id);
+
+        // Wait for presence events to confirm both clients joined (event-based, not timing-based).
+        await SignalRTestHelper.WaitForEventsAsync(presenceCollector, 2, TimeSpan.FromSeconds(3));
+
+        // Client 1 causes an error by trying to join a non-existent board.
+        var act = () => connection1.InvokeAsync("JoinBoard", Guid.NewGuid());
+        try { await act(); } catch (HubException) { /* expected */ }
+
+        // Client 2 should still be connected and functional.
+        connection2.State.Should().Be(HubConnectionState.Connected,
+            "client 2 should be unaffected by client 1's error");
+
+        // Client 1 should also still be connected (HubException doesn't kill connection).
+        connection1.State.Should().Be(HubConnectionState.Connected,
+            "client 1's connection should survive its own hub exception");
+
+        // Verify client 2 can still perform operations on the hub.
+        var postErrorAct = () => connection2.InvokeAsync("SetEditingCard", board.Id, (Guid?)null);
+        await postErrorAct.Should().NotThrowAsync(
+            "client 2 should be fully functional after client 1's error");
+    }
+
+    // ── Disconnection Handling ────────────────────────────────────────
+
+    [Fact]
+    public async Task DisconnectedClient_RemovedFromPresence_OtherClientsNotified()
+    {
+        using var client1 = _factory.CreateClient();
+        using var client2 = _factory.CreateClient();
+
+        var user1 = await ApiTestHarness.AuthenticateAsync(client1, "hub-disconnect-user1");
+        var user2 = await ApiTestHarness.AuthenticateAsync(client2, "hub-disconnect-user2");
+
+        var board = await ApiTestHarness.CreateBoardAsync(client1, "hub-disconnect-board");
+
+        await client1.PostAsJsonAsync(
+            $"/api/boards/{board.Id}/access",
+            new GrantAccessDto(board.Id, user2.UserId, UserRole.Editor));
+
+        var presenceCollector = new EventCollector<BoardPresenceDto>();
+
+        await using var connection1 = SignalRTestHelper.CreateBoardsHubConnection(_factory, user1.Token);
+        var connection2 = SignalRTestHelper.CreateBoardsHubConnection(_factory, user2.Token);
+
+        connection1.On<BoardPresenceDto>("boardPresence", snapshot => presenceCollector.Add(snapshot));
+
+        await connection1.StartAsync();
+        await connection2.StartAsync();
+
+        await connection1.InvokeAsync("JoinBoard", board.Id);
+        await connection2.InvokeAsync("JoinBoard", board.Id);
+
+        // Wait for join events.
+        await SignalRTestHelper.WaitForEventsAsync(presenceCollector, 2, TimeSpan.FromSeconds(3));
+        presenceCollector.Clear();
+
+        // Disconnect client 2 explicitly.
+        await connection2.DisposeAsync();
+
+        // Client 1 should receive a presence update showing client 2 left.
+        var disconnectEvents = await SignalRTestHelper.WaitForEventsAsync(
+            presenceCollector, 1, TimeSpan.FromSeconds(5));
+
+        disconnectEvents.Should().HaveCountGreaterThanOrEqualTo(1,
+            "client 1 should be notified when client 2 disconnects");
+
+        // The latest presence snapshot should no longer include client 2.
+        var latestPresence = disconnectEvents.Last();
+        latestPresence.Members.Should().NotContain(u => u.UserId == user2.UserId,
+            "disconnected user should be removed from presence");
+    }
+}
diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/WebhookDeliveryResilienceTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/WebhookDeliveryResilienceTests.cs
new file mode 100644
index 00000000..2f9782ed
--- /dev/null
+++ b/backend/tests/Taskdeck.Api.Tests/Resilience/WebhookDeliveryResilienceTests.cs
@@ -0,0 +1,255 @@
+using FluentAssertions;
+using Microsoft.Extensions.DependencyInjection;
+using Taskdeck.Application.Interfaces;
+using Taskdeck.Domain.Entities;
+using Taskdeck.Domain.Enums;
+using Taskdeck.Infrastructure.Persistence;
+using Xunit;
+
+namespace Taskdeck.Api.Tests.Resilience;
+
+/// <summary>
+/// Tests that webhook delivery failures are handled with retries, backoff, and
+/// dead-lettering rather than crashing or silently losing deliveries.
+/// </summary>
+public class WebhookDeliveryResilienceTests : IClassFixture // NOTE(review): generic argument (presumably <TestWebApplicationFactory>) appears stripped by diff/extraction tooling — restore before merge
+{
+    private readonly TestWebApplicationFactory _factory;
+
+    public WebhookDeliveryResilienceTests(TestWebApplicationFactory factory)
+    {
+        _factory = factory;
+    }
+
+    // ── Delivery to Unreachable Target → Retry Scheduling ─────────────
+
+    [Fact]
+    public async Task Delivery_ToUnreachableEndpoint_IsScheduledForRetry()
+    {
+        // Arrange: create entities directly in the DB to simulate a pending delivery.
+        using var scope = _factory.Services.CreateScope();
+        var dbContext = scope.ServiceProvider.GetRequiredService(); // NOTE(review): type argument stripped — presumably the application DbContext; confirm
+        var deliveryRepo = scope.ServiceProvider.GetRequiredService(); // NOTE(review): type argument stripped — presumably the outbound-webhook delivery repository; confirm
+
+        var user = new User("webhook-retry-user", "webhook-retry@example.com", "hash");
+        var board = new Board("webhook-retry-board", ownerId: user.Id);
+        var subscription = new OutboundWebhookSubscription(
+            board.Id,
+            user.Id,
+            "https://example.com/webhook",
+            "signing-secret-123",
+            new[] { "card.*" }); // event filter pattern — presumably matches all card.* events; verify against subscription matching logic
+        var delivery = new OutboundWebhookDelivery(
+            Guid.NewGuid(),
+            subscription.Id,
+            board.Id,
+            "card.created",
+            "{\"event\":\"card.created\",\"data\":{}}");
+
+        dbContext.Users.Add(user);
+        dbContext.Boards.Add(board);
+        dbContext.OutboundWebhookSubscriptions.Add(subscription);
+        dbContext.OutboundWebhookDeliveries.Add(delivery);
+        await dbContext.SaveChangesAsync();
+
+        // Verify the delivery starts as Pending.
+        delivery.Status.Should().Be(WebhookDeliveryStatus.Pending);
+
+        // Simulate a delivery failure by manually marking it.
+        var claimedAt = DateTimeOffset.UtcNow;
+        // UpdatedAt is passed alongside the id — presumably an optimistic-concurrency
+        // token so a claim fails if another worker touched the row first; confirm
+        // against the repository contract.
+        var claimed = await deliveryRepo.TryClaimPendingAsync(
+            delivery.Id,
+            delivery.UpdatedAt,
+            claimedAt,
+            CancellationToken.None);
+        claimed.Should().BeTrue();
+
+        await dbContext.Entry(delivery).ReloadAsync(); // pick up the status change the repository persisted out-of-band
+        delivery.Status.Should().Be(WebhookDeliveryStatus.Processing);
+
+        // Schedule retry (simulating what the worker does on HTTP failure).
+        var nextAttemptAt = DateTimeOffset.UtcNow.AddSeconds(10);
+        delivery.ScheduleRetry("Webhook endpoint returned HTTP 503.", nextAttemptAt, 503);
+        await dbContext.SaveChangesAsync();
+
+        // Assert: the delivery should be back to Pending with retry metadata.
+        await dbContext.Entry(delivery).ReloadAsync();
+        delivery.Status.Should().Be(WebhookDeliveryStatus.Pending,
+            "failed delivery should be rescheduled as Pending for retry");
+        delivery.AttemptCount.Should().Be(1,
+            "attempt count should be incremented after a failure");
+        delivery.LastErrorMessage.Should().Contain("503",
+            "error message should capture the failure reason");
+        delivery.NextAttemptAt.Should().BeAfter(DateTimeOffset.MinValue,
+            "retry should have a scheduled next attempt time");
+    }
+
+    // ── Dead-Lettering After Max Retries ──────────────────────────────
+
+    [Fact]
+    public async Task Delivery_AfterMaxRetries_IsDeadLettered()
+    {
+        using var scope = _factory.Services.CreateScope();
+        var dbContext = scope.ServiceProvider.GetRequiredService(); // NOTE(review): type argument stripped by diff tooling — restore
+        var deliveryRepo = scope.ServiceProvider.GetRequiredService(); // NOTE(review): type argument stripped by diff tooling — restore
+
+        var user = new User("webhook-deadletter-user", "webhook-deadletter@example.com", "hash");
+        var board = new Board("webhook-deadletter-board", ownerId: user.Id);
+        var subscription = new OutboundWebhookSubscription(
+            board.Id,
+            user.Id,
+            "https://example.com/webhook",
+            "signing-secret-456",
+            new[] { "card.*" });
+        var delivery = new OutboundWebhookDelivery(
+            Guid.NewGuid(),
+            subscription.Id,
+            board.Id,
+            "card.updated",
+            "{\"event\":\"card.updated\",\"data\":{}}");
+
+        dbContext.Users.Add(user);
+        dbContext.Boards.Add(board);
+        dbContext.OutboundWebhookSubscriptions.Add(subscription);
+        dbContext.OutboundWebhookDeliveries.Add(delivery);
+        await dbContext.SaveChangesAsync();
+
+        // Simulate multiple failed attempts until max retries is reached.
+        // Worker settings default: MaxRetries = 3
+        for (var attempt = 1; attempt <= 2; attempt++)
+        {
+            var updatedAt = delivery.UpdatedAt; // capture the concurrency token before the claim mutates the row
+            var claimed = await deliveryRepo.TryClaimPendingAsync(
+                delivery.Id, updatedAt, DateTimeOffset.UtcNow, CancellationToken.None);
+            claimed.Should().BeTrue($"attempt {attempt} claim should succeed");
+
+            await dbContext.Entry(delivery).ReloadAsync();
+            delivery.ScheduleRetry(
+                $"HTTP 500 on attempt {attempt}",
+                DateTimeOffset.UtcNow.AddSeconds(-1), // Make immediately retryable
+                500);
+            await dbContext.SaveChangesAsync();
+            await dbContext.Entry(delivery).ReloadAsync(); // refresh UpdatedAt so the next loop iteration claims with a current token
+        }
+
+        // Third attempt (attempt index = 3 which equals MaxRetries) → dead letter.
+        var finalUpdatedAt = delivery.UpdatedAt;
+        var finalClaimed = await deliveryRepo.TryClaimPendingAsync(
+            delivery.Id, finalUpdatedAt, DateTimeOffset.UtcNow, CancellationToken.None);
+        finalClaimed.Should().BeTrue();
+
+        await dbContext.Entry(delivery).ReloadAsync();
+        delivery.MarkDeadLetter("HTTP 500 on final attempt", 500);
+        await dbContext.SaveChangesAsync();
+
+        await dbContext.Entry(delivery).ReloadAsync();
+        delivery.Status.Should().Be(WebhookDeliveryStatus.DeadLetter,
+            "delivery should be dead-lettered after exceeding max retries");
+        delivery.LastErrorMessage.Should().Contain("final attempt",
+            "dead-letter should preserve the failure reason");
+    }
+
+    // ── Inactive Subscription → Dead Letter ───────────────────────────
+
+    [Fact]
+    public async Task Delivery_ForInactiveSubscription_CanBeDeadLettered()
+    {
+        using var scope = _factory.Services.CreateScope();
+        var dbContext = scope.ServiceProvider.GetRequiredService(); // NOTE(review): type argument stripped by diff tooling — restore
+
+        var user = new User("webhook-inactive-user", "webhook-inactive@example.com", "hash");
+        var board = new Board("webhook-inactive-board", ownerId: user.Id);
+        var subscription = new OutboundWebhookSubscription(
+            board.Id,
+            user.Id,
+            "https://example.com/webhook",
+            "signing-secret-789",
+            new[] { "card.*" });
+
+        // Revoke the subscription before the delivery is processed.
+        subscription.Revoke(user.Id);
+
+        var delivery = new OutboundWebhookDelivery(
+            Guid.NewGuid(),
+            subscription.Id,
+            board.Id,
+            "card.deleted",
+            "{\"event\":\"card.deleted\",\"data\":{}}");
+
+        dbContext.Users.Add(user);
+        dbContext.Boards.Add(board);
+        dbContext.OutboundWebhookSubscriptions.Add(subscription);
+        dbContext.OutboundWebhookDeliveries.Add(delivery);
+        await dbContext.SaveChangesAsync();
+
+        // The worker would first claim the delivery (move to Processing),
+        // then check subscription.IsActive and dead-letter.
+        var deliveryRepo = scope.ServiceProvider.GetRequiredService(); // NOTE(review): type argument stripped by diff tooling — restore
+        var claimed = await deliveryRepo.TryClaimPendingAsync(
+            delivery.Id, delivery.UpdatedAt, DateTimeOffset.UtcNow, CancellationToken.None);
+        claimed.Should().BeTrue("delivery should be claimable");
+
+        await dbContext.Entry(delivery).ReloadAsync();
+        delivery.Status.Should().Be(WebhookDeliveryStatus.Processing);
+
+        // No HTTP status argument here — no request was ever dispatched.
+        delivery.MarkDeadLetter("Webhook subscription is inactive before delivery dispatch.");
+        await dbContext.SaveChangesAsync();
+
+        await dbContext.Entry(delivery).ReloadAsync();
+        delivery.Status.Should().Be(WebhookDeliveryStatus.DeadLetter,
+            "delivery for inactive subscription should be dead-lettered");
+        delivery.LastErrorMessage.Should().Contain("inactive",
+            "dead-letter message should explain why delivery was abandoned");
+    }
+
+    // ── Stuck Processing Recovery ────────────────────────────────────
+
+    [Fact]
+    public async Task StuckProcessingDelivery_CanBeReturnedToPending()
+    {
+        using var scope = _factory.Services.CreateScope();
+        var dbContext = scope.ServiceProvider.GetRequiredService(); // NOTE(review): type argument stripped by diff tooling — restore
+        var deliveryRepo = scope.ServiceProvider.GetRequiredService(); // NOTE(review): type argument stripped by diff tooling — restore
+
+        var user = new User("webhook-stuck-user", "webhook-stuck@example.com", "hash");
+        var board = new Board("webhook-stuck-board", ownerId: user.Id);
+        var subscription = new OutboundWebhookSubscription(
+            board.Id,
+            user.Id,
+            "https://example.com/webhook",
+            "signing-secret-stuck",
+            new[] { "card.*" });
+        var delivery = new OutboundWebhookDelivery(
+            Guid.NewGuid(),
+            subscription.Id,
+            board.Id,
+            "card.moved",
+            "{\"event\":\"card.moved\",\"data\":{}}");
+
+        dbContext.Users.Add(user);
+        dbContext.Boards.Add(board);
+        dbContext.OutboundWebhookSubscriptions.Add(subscription);
+        dbContext.OutboundWebhookDeliveries.Add(delivery);
+        await dbContext.SaveChangesAsync();
+
+        // Claim the delivery (move to Processing).
+        var claimed = await deliveryRepo.TryClaimPendingAsync(
+            delivery.Id, delivery.UpdatedAt, DateTimeOffset.UtcNow, CancellationToken.None);
+        claimed.Should().BeTrue();
+
+        await dbContext.Entry(delivery).ReloadAsync();
+        delivery.Status.Should().Be(WebhookDeliveryStatus.Processing);
+
+        // Simulate worker recovery: return the stuck delivery to Pending.
+        delivery.ReturnToPending(
+            DateTimeOffset.UtcNow,
+            "Recovered stale processing webhook delivery for retry.");
+        await dbContext.SaveChangesAsync();
+
+        await dbContext.Entry(delivery).ReloadAsync();
+        delivery.Status.Should().Be(WebhookDeliveryStatus.Pending,
+            "stuck processing delivery should be recoverable to Pending");
+        delivery.LastErrorMessage.Should().Contain("Recovered",
+            "recovery message should explain why the delivery was returned to Pending");
+    }
+}
diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/WorkerResilienceTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/WorkerResilienceTests.cs
new file mode 100644
index 00000000..50404d48
--- /dev/null
+++ b/backend/tests/Taskdeck.Api.Tests/Resilience/WorkerResilienceTests.cs
@@ -0,0 +1,251 @@
+using FluentAssertions;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+using Moq;
+using Taskdeck.Api.Workers;
+using Taskdeck.Application.DTOs;
+using Taskdeck.Application.Interfaces;
+using Taskdeck.Application.Services;
+using Taskdeck.Domain.Common;
+using Taskdeck.Domain.Entities;
+using Taskdeck.Domain.Enums;
+using Taskdeck.Tests.Support;
+using Xunit;
+
+namespace Taskdeck.Api.Tests.Resilience;
+
+/// <summary>
+/// Tests that background workers handle exceptions, DB failures, cancellation, and
+/// repeated errors without crashing or leaving items in corrupted states.
+/// </summary>
+public class WorkerResilienceTests
+{
+    // ── Worker Exception in Main Loop ──────────────────────────────────
+
+    [Fact]
+    public async Task LlmWorker_WhenProcessBatchThrows_LogsErrorAndContinuesToNextPoll()
+    {
+        // Arrange: set up a scope factory whose IUnitOfWork always throws.
+        var callCount = 0;
+        var scopeFactory = CreateScopeFactoryThatThrowsOnUnitOfWork(() =>
+        {
+            callCount++;
+            throw new InvalidOperationException("Simulated DB blowup");
+        });
+
+        var logger = new InMemoryLogger(); // NOTE(review): possible stripped generic (InMemoryLogger<T>?) — confirm against Taskdeck.Tests.Support
+        var settings = new WorkerSettings
+        {
+            QueuePollIntervalSeconds = 1,
+            EnableAutoQueueProcessing = true,
+            MaxBatchSize = 5,
+            MaxConcurrency = 1,
+            RetryBackoffSeconds = new[] { 0 } // no backoff so failures surface within the test window
+        };
+        var heartbeat = new WorkerHeartbeatRegistry();
+
+        var worker = new LlmQueueToProposalWorker(scopeFactory, settings, heartbeat, logger);
+
+        using var cts = new CancellationTokenSource();
+
+        // Act: run the worker for long enough to complete at least one iteration, then cancel.
+        var runTask = worker.StartAsync(cts.Token);
+        await Task.Delay(1500); // NOTE(review): wall-clock wait makes this slow/timing-sensitive; consider a completion signal or injected TimeProvider
+        cts.Cancel();
+
+        // StartAsync of a hosted service typically completes promptly; the catch is defensive
+        // in case the returned task observes the cancellation.
+        try { await runTask; } catch (OperationCanceledException) { }
+        await worker.StopAsync(CancellationToken.None);
+
+        // Assert: the worker should have logged the error but NOT crashed;
+        // it should have executed at least one iteration.
+        callCount.Should().BeGreaterThanOrEqualTo(1,
+            "worker should have attempted at least one batch despite DB throwing");
+
+        logger.Entries.Should().Contain(e =>
+            e.Level == LogLevel.Error &&
+            e.Message.Contains("Error in LlmQueueToProposalWorker iteration"),
+            "worker should log the exception and continue looping");
+
+        // Heartbeat should still have been reported.
+        heartbeat.GetLastHeartbeat(nameof(LlmQueueToProposalWorker)).Should().NotBeNull(
+            "worker should report heartbeats even when processing fails");
+    }
+
+    [Fact]
+    public async Task ProposalHousekeepingWorker_WhenDbThrows_LogsErrorAndContinuesPolling()
+    {
+        var callCount = 0;
+        var scopeFactory = CreateScopeFactoryThatThrowsOnUnitOfWork(() =>
+        {
+            callCount++;
+            throw new InvalidOperationException("Simulated housekeeping DB failure");
+        });
+
+        var logger = new InMemoryLogger();
+        var settings = new WorkerSettings(); // all defaults — presumably the housekeeping poll fires within the 300 ms window below; confirm
+        var heartbeat = new WorkerHeartbeatRegistry();
+
+        var worker = new ProposalHousekeepingWorker(scopeFactory, settings, heartbeat, logger);
+
+        using var cts = new CancellationTokenSource();
+        var runTask = worker.StartAsync(cts.Token);
+        await Task.Delay(300); // NOTE(review): wall-clock wait — timing-sensitive
+        cts.Cancel();
+
+        try { await runTask; } catch (OperationCanceledException) { }
+        await worker.StopAsync(CancellationToken.None);
+
+        callCount.Should().BeGreaterThan(0);
+        logger.Entries.Should().Contain(e =>
+            e.Level == LogLevel.Error &&
+            e.Message.Contains("Error in ProposalHousekeepingWorker iteration"));
+        heartbeat.GetLastHeartbeat(nameof(ProposalHousekeepingWorker)).Should().NotBeNull();
+    }
+
+    // ── Worker Cancellation → Clean Shutdown ───────────────────────────
+
+    [Fact]
+    public async Task LlmWorker_WhenCancelled_ExitsWithoutCrashing()
+    {
+        // Arrange: the worker has nothing to process; we test clean cancellation.
+        var mockLlmQueue = new Mock(); // NOTE(review): stripped generic — presumably Mock<ILlmQueueRepository> (matches IUnitOfWork.LlmQueue below); restore
+        mockLlmQueue
+            .Setup(q => q.GetByStatusAsync(It.IsAny(), It.IsAny())) // NOTE(review): It.IsAny type arguments stripped — restore
+            .ReturnsAsync(Enumerable.Empty()); // NOTE(review): Enumerable.Empty type argument stripped — restore
+
+        var mockUnitOfWork = new Mock(); // NOTE(review): stripped generic — presumably Mock<IUnitOfWork>; restore
+        mockUnitOfWork.Setup(u => u.LlmQueue).Returns(mockLlmQueue.Object);
+
+        var scopeFactory = CreateScopeFactoryWithUnitOfWork(mockUnitOfWork.Object);
+        var logger = new InMemoryLogger();
+        var settings = new WorkerSettings
+        {
+            QueuePollIntervalSeconds = 1,
+            EnableAutoQueueProcessing = true,
+            MaxBatchSize = 5,
+            MaxConcurrency = 1,
+            RetryBackoffSeconds = new[] { 0 }
+        };
+        var heartbeat = new WorkerHeartbeatRegistry();
+
+        var worker = new LlmQueueToProposalWorker(scopeFactory, settings, heartbeat, logger);
+
+        using var cts = new CancellationTokenSource();
+        await worker.StartAsync(cts.Token);
+
+        // Let it run at least one cycle.
+        await Task.Delay(1500); // NOTE(review): wall-clock wait — timing-sensitive
+
+        // StopAsync triggers cancellation and waits for ExecuteAsync to complete.
+        // This should NOT throw -- the BackgroundService infrastructure handles OperationCanceledException.
+        var stopAct = () => worker.StopAsync(CancellationToken.None);
+        await stopAct.Should().NotThrowAsync(
+            "worker should handle cancellation cleanly without throwing");
+
+        // Assert: startup log should be present (proving the worker actually ran).
+        logger.Entries.Should().Contain(e =>
+            e.Level == LogLevel.Information &&
+            e.Message.Contains("LlmQueueToProposalWorker starting"),
+            "worker should have logged startup before cancellation");
+
+        // The worker should not have logged any errors during normal operation.
+        logger.Entries.Should().NotContain(e =>
+            e.Level == LogLevel.Error,
+            "worker should not log errors during normal processing and cancellation");
+    }
+
+    [Fact]
+    public async Task LlmWorker_WhenAutoQueueProcessingDisabled_SkipsProcessingButStillReportsHeartbeat()
+    {
+        var mockLlmQueue = new Mock(); // NOTE(review): stripped generic — restore
+        var processCallCount = 0;
+        mockLlmQueue
+            .Setup(q => q.GetByStatusAsync(It.IsAny(), It.IsAny())) // NOTE(review): It.IsAny type arguments stripped — restore
+            .Callback(() => processCallCount++) // count queue queries to prove the batch path was never entered
+            .ReturnsAsync(Enumerable.Empty()); // NOTE(review): Enumerable.Empty type argument stripped — restore
+
+        var mockUnitOfWork = new Mock(); // NOTE(review): stripped generic — restore
+        mockUnitOfWork.Setup(u => u.LlmQueue).Returns(mockLlmQueue.Object);
+
+        var scopeFactory = CreateScopeFactoryWithUnitOfWork(mockUnitOfWork.Object);
+        var logger = new InMemoryLogger();
+        var settings = new WorkerSettings
+        {
+            QueuePollIntervalSeconds = 1,
+            EnableAutoQueueProcessing = false, // Disabled
+            MaxBatchSize = 5,
+            MaxConcurrency = 1,
+            RetryBackoffSeconds = new[] { 0 }
+        };
+        var heartbeat = new WorkerHeartbeatRegistry();
+
+        var worker = new LlmQueueToProposalWorker(scopeFactory, settings, heartbeat, logger);
+
+        using var cts = new CancellationTokenSource();
+        var runTask = worker.StartAsync(cts.Token);
+        await Task.Delay(1500); // NOTE(review): wall-clock wait — timing-sensitive
+        cts.Cancel();
+
+        try { await runTask; } catch (OperationCanceledException) { }
+        await worker.StopAsync(CancellationToken.None);
+
+        // Queue should never have been queried because processing is disabled.
+        processCallCount.Should().Be(0,
+            "worker should skip batch processing when EnableAutoQueueProcessing=false");
+
+        // But heartbeats should still be reported.
+        heartbeat.GetLastHeartbeat(nameof(LlmQueueToProposalWorker)).Should().NotBeNull(
+            "heartbeats should be reported even when processing is disabled");
+    }
+
+    // ── Helpers ────────────────────────────────────────────────────────
+
+    /// <summary>
+    /// Creates an IServiceScopeFactory where resolving IUnitOfWork invokes
+    /// the provided action (which is expected to throw).
+    /// </summary>
+    private static IServiceScopeFactory CreateScopeFactoryThatThrowsOnUnitOfWork(Action onResolve)
+    {
+        var mockScope = new Mock(); // NOTE(review): stripped generic — presumably Mock<IServiceScope>; restore
+        var mockServiceProvider = new Mock(); // NOTE(review): stripped generic — presumably Mock<IServiceProvider>; restore
+
+        // Setup is on the non-generic GetService so Moq can intercept it
+        // (GetRequiredService is a static extension that calls through to it).
+        mockServiceProvider
+            .Setup(sp => sp.GetService(typeof(IUnitOfWork)))
+            .Returns(() =>
+            {
+                onResolve();
+                return null!; // unreachable when onResolve throws, as callers expect
+            });
+
+        mockScope.Setup(s => s.ServiceProvider).Returns(mockServiceProvider.Object);
+
+        var mockScopeFactory = new Mock(); // NOTE(review): stripped generic — presumably Mock<IServiceScopeFactory>; restore
+        mockScopeFactory
+            .Setup(f => f.CreateScope())
+            .Returns(mockScope.Object);
+
+        return mockScopeFactory.Object;
+    }
+
+    /// <summary>
+    /// Creates an IServiceScopeFactory that resolves a real IUnitOfWork mock.
+    /// </summary>
+    private static IServiceScopeFactory CreateScopeFactoryWithUnitOfWork(IUnitOfWork unitOfWork)
+    {
+        var mockScope = new Mock(); // NOTE(review): stripped generic — restore
+        var mockServiceProvider = new Mock(); // NOTE(review): stripped generic — restore
+
+        mockServiceProvider
+            .Setup(sp => sp.GetService(typeof(IUnitOfWork)))
+            .Returns(unitOfWork);
+
+        mockScope.Setup(s => s.ServiceProvider).Returns(mockServiceProvider.Object);
+
+        var mockScopeFactory = new Mock(); // NOTE(review): stripped generic — restore
+        mockScopeFactory
+            .Setup(f => f.CreateScope())
+            .Returns(mockScope.Object);
+
+        return mockScopeFactory.Object;
+    }
+}