// ===== backend/tests/Taskdeck.Api.Tests/Resilience/DatabaseResilienceTests.cs (new file) =====
using System.Net;
using System.Net.Http.Json;
using System.Text.Json;
using FluentAssertions;
using Taskdeck.Api.Tests.Support;
using Taskdeck.Application.DTOs;
using Xunit;

namespace Taskdeck.Api.Tests.Resilience;

/// <summary>
/// Tests that database operations produce appropriate error responses and
/// that health endpoints report database status accurately.
/// </summary>
public class DatabaseResilienceTests : IClassFixture<TestWebApplicationFactory>
{
    private readonly HttpClient _client;

    public DatabaseResilienceTests(TestWebApplicationFactory factory)
    {
        _client = factory.CreateClient();
    }

    // ── Health Endpoint Reports Database Status ───────────────────────

    [Fact]
    public async Task ReadyCheck_IncludesDatabaseCheck_WhenDatabaseIsReachable()
    {
        var response = await _client.GetAsync("/health/ready");

        // With a working DB, ready check may be OK or 503 depending on worker state,
        // but the database check itself should be Healthy.
        var payload = await response.Content.ReadFromJsonAsync<JsonElement>();
        payload.TryGetProperty("checks", out var checks).Should().BeTrue();

        // Assert the database entry is present explicitly so a missing check fails
        // with a clear message rather than a KeyNotFoundException from GetProperty.
        checks.TryGetProperty("database", out var database).Should().BeTrue(
            "ready payload should contain a database check entry");
        database.GetProperty("status").GetString().Should().Be("Healthy",
            "database check should report Healthy when database is reachable");
    }

    [Fact]
    public async Task LiveCheck_AlwaysReturnsHealthy_RegardlessOfDatabaseState()
    {
        var response = await _client.GetAsync("/health/live");

        response.StatusCode.Should().Be(HttpStatusCode.OK,
            "liveness probe should always return 200");

        var payload = await response.Content.ReadFromJsonAsync<JsonElement>();
        payload.GetProperty("status").GetString().Should().Be("Healthy",
            "live check is a simple heartbeat, independent of database state");
    }

    // ── Database Error Handling in API Operations ──────────────────────

    [Fact]
    public async Task Operations_OnNonExistentResource_ReturnNotFoundInsteadOfCrash()
    {
        await ApiTestHarness.AuthenticateAsync(_client, "db-resilience-notfound");

        // Accessing a non-existent board should return 404, not 500.
        var response = await _client.GetAsync($"/api/boards/{Guid.NewGuid()}");
        response.StatusCode.Should().Be(HttpStatusCode.NotFound,
            "non-existent resource should return 404, not a database crash");

        var body = await response.Content.ReadFromJsonAsync<JsonElement>();
        body.TryGetProperty("errorCode", out var errorCode).Should().BeTrue(
            "404 response should follow error contract");
        errorCode.GetString().Should().Be("NotFound");
    }

    [Fact]
    public async Task ConcurrentWrites_HandleConflictsGracefully()
    {
        await ApiTestHarness.AuthenticateAsync(_client, "db-resilience-conflict");

        // Create a board first.
        var board = await ApiTestHarness.CreateBoardAsync(_client, "db-conflict-board");

        // Try to delete the same board twice in quick succession.
        var delete1 = _client.DeleteAsync($"/api/boards/{board.Id}");
        var delete2 = _client.DeleteAsync($"/api/boards/{board.Id}");

        var results = await Task.WhenAll(delete1, delete2);

        // One should succeed (204/200), the other should get 404.
        // Neither should be 500. (Single pass over the codes; the previous
        // version asserted "not 500" twice via a foreach AND a LINQ array.)
        var statusCodes = results.Select(r => (int)r.StatusCode).OrderBy(s => s).ToArray();
        statusCodes.Should().NotContain(500,
            "concurrent operations should not cause unhandled 500 errors");
        statusCodes.Should().Contain(s => s >= 200 && s < 300,
            "at least one delete should succeed");
    }

    // ── Database Write Validation ─────────────────────────────────────

    [Fact]
    public async Task CreateBoard_WithInvalidData_ReturnsValidationError()
    {
        await ApiTestHarness.AuthenticateAsync(_client, "db-resilience-validation");

        // An empty board name should return a validation error, not a DB crash.
        var response = await _client.PostAsJsonAsync(
            "/api/boards",
            new CreateBoardDto("", "Empty name board"));

        response.StatusCode.Should().Be(HttpStatusCode.BadRequest,
            "invalid data should return 400, not a database crash");

        var body = await response.Content.ReadFromJsonAsync<JsonElement>();
        body.TryGetProperty("errorCode", out _).Should().BeTrue(
            "400 response should follow the error contract");
    }
}

// ===== backend/tests/Taskdeck.Api.Tests/Resilience/ExternalServiceFailureTests.cs (new file) =====
using System.Net;
using System.Net.Http.Json;
using System.Text.Json;
using FluentAssertions;
using Taskdeck.Api.Tests.Support;
using Taskdeck.Application.DTOs;
using Xunit;

namespace Taskdeck.Api.Tests.Resilience;

/// <summary>
/// Tests that external service failures (GitHub OAuth, etc.)
/// produce appropriate
/// error responses while keeping local functionality working.
/// </summary>
public class ExternalServiceFailureTests : IClassFixture<TestWebApplicationFactory>
{
    private readonly HttpClient _client;

    public ExternalServiceFailureTests(TestWebApplicationFactory factory)
    {
        _client = factory.CreateClient();
    }

    // ── Local Auth Still Works When External Auth Is Unavailable ───────

    [Fact]
    public async Task LocalRegistration_ShouldWork_RegardlessOfExternalOAuthState()
    {
        // Local auth (register + login) should not depend on any external service.
        var unique = Guid.NewGuid().ToString("N")[..8];
        var registration = new CreateUserDto(
            $"ext-resilience-{unique}",
            $"ext-resilience-{unique}@example.com",
            "password123");

        var response = await _client.PostAsJsonAsync("/api/auth/register", registration);

        response.StatusCode.Should().Be(HttpStatusCode.OK,
            "local registration should succeed regardless of external service state");

        // TODO(review): auth response DTO name was lost in formatting; confirm.
        var payload = await response.Content.ReadFromJsonAsync<AuthResponseDto>();
        payload.Should().NotBeNull();
        payload!.Token.Should().NotBeNullOrWhiteSpace(
            "local auth should issue a token without relying on external services");
    }

    [Fact]
    public async Task LocalLogin_ShouldWork_RegardlessOfExternalOAuthState()
    {
        // Register first.
        var unique = Guid.NewGuid().ToString("N")[..8];
        var username = $"ext-login-{unique}";
        var registration = new CreateUserDto(username, $"ext-login-{unique}@example.com", "password123");

        var registerResponse = await _client.PostAsJsonAsync("/api/auth/register", registration);
        registerResponse.StatusCode.Should().Be(HttpStatusCode.OK);

        // Login should work via local path regardless of external service availability.
        var credentials = new LoginDto(username, "password123");
        var loginResponse = await _client.PostAsJsonAsync("/api/auth/login", credentials);

        loginResponse.StatusCode.Should().Be(HttpStatusCode.OK,
            "local login should succeed regardless of external service state");

        // TODO(review): auth response DTO name was lost in formatting; confirm.
        var loginPayload = await loginResponse.Content.ReadFromJsonAsync<AuthResponseDto>();
        loginPayload.Should().NotBeNull();
        loginPayload!.Token.Should().NotBeNullOrWhiteSpace();
    }

    // ── Invalid External Auth Callback → Appropriate Error ────────────

    [Fact]
    public async Task GithubCallback_WhenGithubNotConfigured_ReturnsNotFound()
    {
        // When GitHub OAuth is not configured, the callback should return
        // a clean 404 error rather than crashing.
        var callbackResponse = await _client.GetAsync("/api/auth/github/callback");

        callbackResponse.StatusCode.Should().Be(HttpStatusCode.NotFound,
            "GitHub callback should return 404 when OAuth is not configured");

        var errorBody = await callbackResponse.Content.ReadFromJsonAsync<JsonElement>();
        errorBody.TryGetProperty("errorCode", out var errorCode).Should().BeTrue(
            "404 response should follow the error contract");
        errorCode.GetString().Should().Be("NotFound");
    }

    [Fact]
    public async Task GithubLogin_WhenGithubNotConfigured_ReturnsNotFound()
    {
        // The GitHub login initiation endpoint should also return 404 when not configured.
        var loginResponse = await _client.GetAsync("/api/auth/github/login");

        loginResponse.StatusCode.Should().Be(HttpStatusCode.NotFound,
            "GitHub login should return 404 when OAuth is not configured");
    }

    // ── API Endpoints Return Proper Error Codes on Invalid Input ──────

    [Fact]
    public async Task ApiEndpoints_ReturnProperErrorCodes_WhenUnauthenticated()
    {
        // Without auth, protected endpoints should return 401, not 500.
        var protectedEndpoints = new (string Url, string Because)[]
        {
            ("/api/boards", "unauthenticated request to boards should get 401, not 500"),
            ("/api/capture/items", "unauthenticated request to capture should get 401, not 500"),
            ("/api/llm/chat/sessions", "unauthenticated request to chat sessions should get 401, not 500"),
        };

        foreach (var (url, because) in protectedEndpoints)
        {
            var response = await _client.GetAsync(url);
            response.StatusCode.Should().Be(HttpStatusCode.Unauthorized, because);
        }
    }
}

// ===== backend/tests/Taskdeck.Api.Tests/Resilience/LlmProviderDegradationTests.cs (new file) =====
using System.Net;
using System.Net.Http.Json;
using System.Runtime.CompilerServices;
using System.Text.Json;
using FluentAssertions;
using Microsoft.AspNetCore.Hosting;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.DependencyInjection.Extensions;
using Taskdeck.Api.Tests.Support;
using Taskdeck.Application.DTOs;
using Taskdeck.Application.Services;
using Xunit;

namespace Taskdeck.Api.Tests.Resilience;

/// <summary>
/// Tests that LLM provider failures (timeout, invalid response, total unavailability)
/// are surfaced as degraded responses rather than 500 errors or infinite waits.
/// </summary>
public class LlmProviderDegradationTests : IClassFixture<TestWebApplicationFactory>
{
    private readonly TestWebApplicationFactory _baseFactory;

    public LlmProviderDegradationTests(TestWebApplicationFactory baseFactory)
    {
        _baseFactory = baseFactory;
    }

    /// <summary>
    /// Builds the host configuration that swaps the real ILlmProvider for a test
    /// stub. Extracted because every test repeated this block verbatim.
    /// </summary>
    /// <param name="createStub">Factory for the stub; invoked once per DI scope,
    /// matching the original per-test AddScoped registration.</param>
    private static Action<IWebHostBuilder> UseProviderStub(Func<ILlmProvider> createStub) => builder =>
    {
        builder.UseEnvironment("Development");
        builder.ConfigureServices(services =>
        {
            services.RemoveAll<ILlmProvider>();
            services.AddScoped<ILlmProvider>(_ => createStub());
        });
    };

    // ── Provider Timeout ───────────────────────────────────────────────

    [Fact]
    public async Task SendMessage_WhenProviderTimesOut_ReturnsDegradedResponseNotInfiniteWait()
    {
        using var factory = _baseFactory.WithWebHostBuilder(UseProviderStub(() => new TimeoutProviderStub()));
        using var client = factory.CreateClient();
        client.Timeout = TimeSpan.FromSeconds(30);

        await ApiTestHarness.AuthenticateAsync(client, "llm-timeout-resilience");

        var createSessionResponse = await client.PostAsJsonAsync(
            "/api/llm/chat/sessions",
            new CreateChatSessionDto("Timeout provider test"));
        createSessionResponse.StatusCode.Should().Be(HttpStatusCode.Created);
        // NOTE(review): session DTO name was lost in formatting; assumed ChatSessionDto — confirm.
        var session = await createSessionResponse.Content.ReadFromJsonAsync<ChatSessionDto>();
        session.Should().NotBeNull();

        var sendMessageResponse = await client.PostAsJsonAsync(
            $"/api/llm/chat/sessions/{session!.Id}/messages",
            new SendChatMessageDto("tell me something"));

        // The request should not hang forever; it should return within the test timeout.
        // The response may be degraded or an error -- the key assertion is no infinite wait.
        sendMessageResponse.Should().NotBeNull(
            "request should complete even when provider times out");

        // Since the provider throws OperationCanceledException simulating timeout,
        // the chat service should handle this and return a 500 error contract
        // rather than an unhandled exception.
        var statusCode = (int)sendMessageResponse.StatusCode;
        statusCode.Should().BeOneOf(new[] { 200, 500 },
            "should either return a degraded response or an error contract, not hang");
    }

    // ── Provider Throws Exception ──────────────────────────────────────

    [Fact]
    public async Task SendMessage_WhenProviderThrowsException_ReturnsErrorContract()
    {
        using var factory = _baseFactory.WithWebHostBuilder(UseProviderStub(() => new ThrowingProviderStub()));
        using var client = factory.CreateClient();

        await ApiTestHarness.AuthenticateAsync(client, "llm-throw-resilience");

        var createSessionResponse = await client.PostAsJsonAsync(
            "/api/llm/chat/sessions",
            new CreateChatSessionDto("Throwing provider test"));
        createSessionResponse.StatusCode.Should().Be(HttpStatusCode.Created);
        // NOTE(review): session DTO name was lost in formatting; assumed ChatSessionDto — confirm.
        var session = await createSessionResponse.Content.ReadFromJsonAsync<ChatSessionDto>();
        session.Should().NotBeNull();

        var sendMessageResponse = await client.PostAsJsonAsync(
            $"/api/llm/chat/sessions/{session!.Id}/messages",
            new SendChatMessageDto("create card 'Test'"));

        sendMessageResponse.Should().NotBeNull();
        var statusCode = (int)sendMessageResponse.StatusCode;
        statusCode.Should().BeOneOf(new[] { 200, 500 },
            "should return an error contract or degraded response, not crash");

        if (sendMessageResponse.StatusCode == HttpStatusCode.InternalServerError)
        {
            var body = await sendMessageResponse.Content.ReadFromJsonAsync<JsonElement>();
            body.TryGetProperty("errorCode", out _).Should().BeTrue(
                "500 response should follow error contract with errorCode");
            body.TryGetProperty("message", out _).Should().BeTrue(
                "500 response should follow error contract with message");
        }
    }

    // ── Provider Unavailable but Non-LLM Features Still Work ──────────

    [Fact]
    public async Task BoardCrud_StillWorks_WhenAllProvidersUnavailable()
    {
        using var factory = _baseFactory.WithWebHostBuilder(UseProviderStub(() => new TotallyDeadProviderStub()));
        using var client = factory.CreateClient();

        await ApiTestHarness.AuthenticateAsync(client, "llm-dead-board-crud");

        // Board CRUD should work regardless of LLM provider state.
        var board = await ApiTestHarness.CreateBoardAsync(client, "resilience-board");
        board.Should().NotBeNull();
        board.Name.Should().StartWith("resilience-board");

        var getResponse = await client.GetAsync($"/api/boards/{board.Id}");
        getResponse.StatusCode.Should().Be(HttpStatusCode.OK);

        var listResponse = await client.GetAsync("/api/boards");
        listResponse.StatusCode.Should().Be(HttpStatusCode.OK);
    }

    [Fact]
    public async Task CaptureItems_StillWork_WhenProviderUnavailable()
    {
        using var factory = _baseFactory.WithWebHostBuilder(UseProviderStub(() => new TotallyDeadProviderStub()));
        using var client = factory.CreateClient();

        await ApiTestHarness.AuthenticateAsync(client, "llm-dead-capture");

        // Capture should still accept items even when the LLM is dead.
        // The items queue up for later processing.
        var captureResponse = await client.PostAsJsonAsync(
            "/api/capture/items",
            new CreateCaptureItemDto(null, "capture while LLM is down"));
        captureResponse.StatusCode.Should().Be(HttpStatusCode.Created,
            "capture should accept items even when LLM provider is unavailable");
    }

    // ── Provider Health Reports Unhealthy ──────────────────────────────

    [Fact]
    public async Task ProviderHealth_ReportsUnhealthy_WhenProviderIsDown()
    {
        using var factory = _baseFactory.WithWebHostBuilder(UseProviderStub(() => new TotallyDeadProviderStub()));
        using var client = factory.CreateClient();

        await ApiTestHarness.AuthenticateAsync(client, "llm-dead-health");

        var response = await client.GetAsync("/api/llm/chat/health");
        response.StatusCode.Should().Be(HttpStatusCode.OK);

        // NOTE(review): payload type was lost in formatting; assumed the endpoint
        // serializes LlmHealthStatus — confirm against the controller.
        var payload = await response.Content.ReadFromJsonAsync<LlmHealthStatus>();
        payload.Should().NotBeNull();
        payload!.IsAvailable.Should().BeFalse(
            "health check should report the provider as unavailable");
        payload.ErrorMessage.Should().NotBeNullOrWhiteSpace(
            "health check should include an error explanation");
    }

    [Fact]
    public async Task ProviderHealth_WithProbe_ReportsUnhealthy_WhenProviderIsDown()
    {
        using var factory = _baseFactory.WithWebHostBuilder(UseProviderStub(() => new TotallyDeadProviderStub()));
        using var client = factory.CreateClient();

        await ApiTestHarness.AuthenticateAsync(client, "llm-dead-probe");

        var response = await client.GetAsync("/api/llm/chat/health?probe=true");
        response.StatusCode.Should().Be(HttpStatusCode.OK);

        // NOTE(review): payload type was lost in formatting; assumed LlmHealthStatus — confirm.
        var payload = await response.Content.ReadFromJsonAsync<LlmHealthStatus>();
        payload.Should().NotBeNull();
        payload!.IsAvailable.Should().BeFalse();
        payload.IsProbed.Should().BeTrue();
    }

    // ── Stub Implementations ──────────────────────────────────────────

    /// <summary>
    /// Provider that simulates a timeout by delaying beyond cancellation.
    /// </summary>
    private sealed class TimeoutProviderStub : ILlmProvider
    {
        // NOTE(review): CompleteAsync's result type argument appears to have been
        // stripped by formatting; restore to match ILlmProvider if it returns Task<T>.
        public async Task CompleteAsync(ChatCompletionRequest request, CancellationToken ct = default)
        {
            // Simulate a long wait that would be cancelled by the service's timeout.
            using var internalCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
            internalCts.CancelAfter(TimeSpan.FromMilliseconds(50));
            await Task.Delay(TimeSpan.FromSeconds(60), internalCts.Token);
            throw new InvalidOperationException("Should not reach here");
        }

        public async IAsyncEnumerable<LlmTokenEvent> StreamAsync(
            ChatCompletionRequest request,
            [EnumeratorCancellation] CancellationToken ct = default)
        {
            // Use a short internal timeout to avoid hanging for 60 seconds if a test
            // hits the streaming endpoint. Cancels quickly like CompleteAsync does.
            using var internalCts = CancellationTokenSource.CreateLinkedTokenSource(ct);
            internalCts.CancelAfter(TimeSpan.FromMilliseconds(50));
            await Task.Delay(TimeSpan.FromSeconds(60), internalCts.Token);
            yield return new LlmTokenEvent("timeout", true);
        }

        public Task<LlmHealthStatus> GetHealthAsync(CancellationToken ct = default)
            => Task.FromResult(new LlmHealthStatus(false, "TimeoutStub", "Provider timed out"));

        public Task<LlmHealthStatus> ProbeAsync(CancellationToken ct = default)
            => Task.FromResult(new LlmHealthStatus(false, "TimeoutStub", "Provider timed out", IsProbed: true));
    }

    /// <summary>
    /// Provider that throws an unhandled exception on every call.
    /// </summary>
    private sealed class ThrowingProviderStub : ILlmProvider
    {
        // NOTE(review): result type argument stripped by formatting — confirm against ILlmProvider.
        public Task CompleteAsync(ChatCompletionRequest request, CancellationToken ct = default)
            => throw new InvalidOperationException("Simulated provider crash");

        public async IAsyncEnumerable<LlmTokenEvent> StreamAsync(
            ChatCompletionRequest request,
            [EnumeratorCancellation] CancellationToken ct = default)
        {
            await Task.CompletedTask;
            ThrowStreamCrash();
            yield break;
        }

        public Task<LlmHealthStatus> GetHealthAsync(CancellationToken ct = default)
            => Task.FromResult(new LlmHealthStatus(false, "ThrowingStub", "Provider threw exception"));

        public Task<LlmHealthStatus> ProbeAsync(CancellationToken ct = default)
            => Task.FromResult(new LlmHealthStatus(false, "ThrowingStub", "Provider threw exception", IsProbed: true));

        // Helper keeps the iterator block valid: an iterator cannot contain a bare
        // throw before its first yield without this indirection.
        private static void ThrowStreamCrash()
            => throw new InvalidOperationException("Simulated stream crash");
    }

    /// <summary>
    /// Provider where everything reports unavailable.
    /// </summary>
    private sealed class TotallyDeadProviderStub : ILlmProvider
    {
        // NOTE(review): result type argument stripped by formatting — confirm against ILlmProvider.
        public Task CompleteAsync(ChatCompletionRequest request, CancellationToken ct = default)
            => throw new InvalidOperationException("All providers are down");

        public async IAsyncEnumerable<LlmTokenEvent> StreamAsync(
            ChatCompletionRequest request,
            [EnumeratorCancellation] CancellationToken ct = default)
        {
            await Task.CompletedTask;
            ThrowProvidersDown();
            yield break;
        }

        public Task<LlmHealthStatus> GetHealthAsync(CancellationToken ct = default)
            => Task.FromResult(new LlmHealthStatus(false, "Dead", "All providers are unavailable"));

        public Task<LlmHealthStatus> ProbeAsync(CancellationToken ct = default)
            => Task.FromResult(new LlmHealthStatus(false, "Dead", "All providers are unavailable", IsProbed: true));

        private static void ThrowProvidersDown()
            => throw new InvalidOperationException("All providers are down");
    }
}

// ===== backend/tests/Taskdeck.Api.Tests/Resilience/SignalRDegradationTests.cs (new file) =====
using System.Net.Http.Json;
using FluentAssertions;
using Microsoft.AspNetCore.SignalR;
using Microsoft.AspNetCore.SignalR.Client;
using Taskdeck.Api.Realtime;
using Taskdeck.Api.Tests.Support;
using Taskdeck.Application.DTOs;
using Taskdeck.Domain.Enums;
using Xunit;

namespace Taskdeck.Api.Tests.Resilience;

/// <summary>
/// Tests that SignalR hub failures on one connection do not cascade to other
/// connected clients, and that invalid operations produce HubException rather
/// than killing the connection.
/// </summary>
public class SignalRDegradationTests : IClassFixture<TestWebApplicationFactory>
{
    private readonly TestWebApplicationFactory _factory;

    public SignalRDegradationTests(TestWebApplicationFactory factory)
    {
        _factory = factory;
    }

    // ── Hub Exception Isolation ────────────────────────────────────────

    [Fact]
    public async Task JoinBoard_WithInvalidBoardId_ThrowsHubExceptionButConnectionSurvives()
    {
        using var client = _factory.CreateClient();
        var user = await ApiTestHarness.AuthenticateAsync(client, "hub-resilience-bad-board");

        await using var connection = SignalRTestHelper.CreateBoardsHubConnection(_factory, user.Token);
        await connection.StartAsync();
        connection.State.Should().Be(HubConnectionState.Connected);

        // Try to join a non-existent board — should throw HubException.
        var act = () => connection.InvokeAsync("JoinBoard", Guid.NewGuid());
        await act.Should().ThrowAsync<HubException>(
            "joining a non-existent board should throw a HubException");

        // Connection should still be alive after the error.
        connection.State.Should().Be(HubConnectionState.Connected,
            "one failed hub invocation should not kill the connection");
    }

    [Fact]
    public async Task SetEditingCard_WithoutJoining_ThrowsHubExceptionButConnectionSurvives()
    {
        using var client = _factory.CreateClient();
        var user = await ApiTestHarness.AuthenticateAsync(client, "hub-resilience-no-join");
        var board = await ApiTestHarness.CreateBoardAsync(client, "hub-resilience-board");

        await using var connection = SignalRTestHelper.CreateBoardsHubConnection(_factory, user.Token);
        await connection.StartAsync();

        // Try editing a card without joining the board first.
        var act = () => connection.InvokeAsync("SetEditingCard", board.Id, Guid.NewGuid());
        await act.Should().ThrowAsync<HubException>(
            "setting editing card without joining should throw a HubException");

        // Connection should still be connected.
        connection.State.Should().Be(HubConnectionState.Connected,
            "hub error should not disconnect the client");
    }

    // ── One Client's Error Doesn't Affect Others ──────────────────────

    [Fact]
    public async Task ErrorOnOneClient_DoesNotDisconnectOtherClients()
    {
        using var client1 = _factory.CreateClient();
        using var client2 = _factory.CreateClient();

        var user1 = await ApiTestHarness.AuthenticateAsync(client1, "hub-resilience-user1");
        var user2 = await ApiTestHarness.AuthenticateAsync(client2, "hub-resilience-user2");

        var board = await ApiTestHarness.CreateBoardAsync(client1, "hub-resilience-multi");

        // Share the board with user2. Fail fast if the grant itself fails —
        // otherwise user2's join below would produce a confusing downstream error.
        var grantResponse = await client1.PostAsJsonAsync(
            $"/api/boards/{board.Id}/access",
            new GrantAccessDto(board.Id, user2.UserId, UserRole.Editor));
        grantResponse.EnsureSuccessStatusCode();

        // TODO(review): presence payload type was lost in formatting; assumed BoardPresenceDto — confirm.
        var presenceCollector = new EventCollector<BoardPresenceDto>();

        await using var connection1 = SignalRTestHelper.CreateBoardsHubConnection(_factory, user1.Token);
        await using var connection2 = SignalRTestHelper.CreateBoardsHubConnection(_factory, user2.Token);

        connection1.On<BoardPresenceDto>("boardPresence", snapshot => presenceCollector.Add(snapshot));

        await connection1.StartAsync();
        await connection2.StartAsync();

        // Both users join the board.
        await connection1.InvokeAsync("JoinBoard", board.Id);
        await connection2.InvokeAsync("JoinBoard", board.Id);

        // Wait for presence events to confirm both clients joined (event-based, not timing-based).
        await SignalRTestHelper.WaitForEventsAsync(presenceCollector, 2, TimeSpan.FromSeconds(3));

        // Client 1 causes an error by trying to join a non-existent board.
        try
        {
            await connection1.InvokeAsync("JoinBoard", Guid.NewGuid());
        }
        catch (HubException)
        {
            // Expected: the invalid join fails server-side.
        }

        // Client 2 should still be connected and functional.
        connection2.State.Should().Be(HubConnectionState.Connected,
            "client 2 should be unaffected by client 1's error");

        // Client 1 should also still be connected (HubException doesn't kill connection).
        connection1.State.Should().Be(HubConnectionState.Connected,
            "client 1's connection should survive its own hub exception");

        // Verify client 2 can still perform operations on the hub.
        var postErrorAct = () => connection2.InvokeAsync("SetEditingCard", board.Id, (Guid?)null);
        await postErrorAct.Should().NotThrowAsync(
            "client 2 should be fully functional after client 1's error");
    }

    // ── Disconnection Handling ────────────────────────────────────────

    [Fact]
    public async Task DisconnectedClient_RemovedFromPresence_OtherClientsNotified()
    {
        using var client1 = _factory.CreateClient();
        using var client2 = _factory.CreateClient();

        var user1 = await ApiTestHarness.AuthenticateAsync(client1, "hub-disconnect-user1");
        var user2 = await ApiTestHarness.AuthenticateAsync(client2, "hub-disconnect-user2");

        var board = await ApiTestHarness.CreateBoardAsync(client1, "hub-disconnect-board");

        var grantResponse = await client1.PostAsJsonAsync(
            $"/api/boards/{board.Id}/access",
            new GrantAccessDto(board.Id, user2.UserId, UserRole.Editor));
        grantResponse.EnsureSuccessStatusCode();

        // TODO(review): presence payload type was lost in formatting; assumed BoardPresenceDto — confirm.
        var presenceCollector = new EventCollector<BoardPresenceDto>();

        await using var connection1 = SignalRTestHelper.CreateBoardsHubConnection(_factory, user1.Token);

        // connection2 is disposed mid-test on purpose; the finally block guarantees
        // cleanup even if an assertion before the explicit dispose fails.
        var connection2 = SignalRTestHelper.CreateBoardsHubConnection(_factory, user2.Token);
        try
        {
            connection1.On<BoardPresenceDto>("boardPresence", snapshot => presenceCollector.Add(snapshot));

            await connection1.StartAsync();
            await connection2.StartAsync();

            await connection1.InvokeAsync("JoinBoard", board.Id);
            await connection2.InvokeAsync("JoinBoard", board.Id);

            // Wait for join events.
            await SignalRTestHelper.WaitForEventsAsync(presenceCollector, 2, TimeSpan.FromSeconds(3));
            presenceCollector.Clear();

            // Disconnect client 2 explicitly.
            await connection2.DisposeAsync();

            // Client 1 should receive a presence update showing client 2 left.
            var disconnectEvents = await SignalRTestHelper.WaitForEventsAsync(
                presenceCollector, 1, TimeSpan.FromSeconds(5));

            disconnectEvents.Should().HaveCountGreaterThanOrEqualTo(1,
                "client 1 should be notified when client 2 disconnects");

            // The latest presence snapshot should no longer include client 2.
            var latestPresence = disconnectEvents.Last();
            latestPresence.Members.Should().NotContain(u => u.UserId == user2.UserId,
                "disconnected user should be removed from presence");
        }
        finally
        {
            // Safe even after the explicit dispose above: DisposeAsync is idempotent.
            await connection2.DisposeAsync();
        }
    }
}

// ===== backend/tests/Taskdeck.Api.Tests/Resilience/WebhookDeliveryResilienceTests.cs (new file) =====
using FluentAssertions;
using Microsoft.Extensions.DependencyInjection;
using Taskdeck.Application.Interfaces;
using Taskdeck.Domain.Entities;
using Taskdeck.Domain.Enums;
using Taskdeck.Infrastructure.Persistence;
using Xunit;

namespace Taskdeck.Api.Tests.Resilience;

/// <summary>
/// Tests that webhook delivery failures are handled with retries, backoff, and
/// dead-lettering rather than crashing or silently losing deliveries.
/// </summary>
public class WebhookDeliveryResilienceTests : IClassFixture<TestWebApplicationFactory>
{
    private readonly TestWebApplicationFactory _factory;

    public WebhookDeliveryResilienceTests(TestWebApplicationFactory factory)
    {
        _factory = factory;
    }

    // ── Delivery to Unreachable Target → Retry Scheduling ─────────────

    [Fact]
    public async Task Delivery_ToUnreachableEndpoint_IsScheduledForRetry()
    {
        // Arrange: create entities directly in the DB to simulate a pending delivery.
+ using var scope = _factory.Services.CreateScope(); + var dbContext = scope.ServiceProvider.GetRequiredService(); + var deliveryRepo = scope.ServiceProvider.GetRequiredService(); + + var user = new User("webhook-retry-user", "webhook-retry@example.com", "hash"); + var board = new Board("webhook-retry-board", ownerId: user.Id); + var subscription = new OutboundWebhookSubscription( + board.Id, + user.Id, + "https://example.com/webhook", + "signing-secret-123", + new[] { "card.*" }); + var delivery = new OutboundWebhookDelivery( + Guid.NewGuid(), + subscription.Id, + board.Id, + "card.created", + "{\"event\":\"card.created\",\"data\":{}}"); + + dbContext.Users.Add(user); + dbContext.Boards.Add(board); + dbContext.OutboundWebhookSubscriptions.Add(subscription); + dbContext.OutboundWebhookDeliveries.Add(delivery); + await dbContext.SaveChangesAsync(); + + // Verify the delivery starts as Pending. + delivery.Status.Should().Be(WebhookDeliveryStatus.Pending); + + // Simulate a delivery failure by manually marking it. + var claimedAt = DateTimeOffset.UtcNow; + var claimed = await deliveryRepo.TryClaimPendingAsync( + delivery.Id, + delivery.UpdatedAt, + claimedAt, + CancellationToken.None); + claimed.Should().BeTrue(); + + await dbContext.Entry(delivery).ReloadAsync(); + delivery.Status.Should().Be(WebhookDeliveryStatus.Processing); + + // Schedule retry (simulating what the worker does on HTTP failure). + var nextAttemptAt = DateTimeOffset.UtcNow.AddSeconds(10); + delivery.ScheduleRetry("Webhook endpoint returned HTTP 503.", nextAttemptAt, 503); + await dbContext.SaveChangesAsync(); + + // Assert: the delivery should be back to Pending with retry metadata. 
+ await dbContext.Entry(delivery).ReloadAsync(); + delivery.Status.Should().Be(WebhookDeliveryStatus.Pending, + "failed delivery should be rescheduled as Pending for retry"); + delivery.AttemptCount.Should().Be(1, + "attempt count should be incremented after a failure"); + delivery.LastErrorMessage.Should().Contain("503", + "error message should capture the failure reason"); + delivery.NextAttemptAt.Should().BeAfter(DateTimeOffset.MinValue, + "retry should have a scheduled next attempt time"); + } + + // ── Dead-Lettering After Max Retries ────────────────────────────── + + [Fact] + public async Task Delivery_AfterMaxRetries_IsDeadLettered() + { + using var scope = _factory.Services.CreateScope(); + var dbContext = scope.ServiceProvider.GetRequiredService(); + var deliveryRepo = scope.ServiceProvider.GetRequiredService(); + + var user = new User("webhook-deadletter-user", "webhook-deadletter@example.com", "hash"); + var board = new Board("webhook-deadletter-board", ownerId: user.Id); + var subscription = new OutboundWebhookSubscription( + board.Id, + user.Id, + "https://example.com/webhook", + "signing-secret-456", + new[] { "card.*" }); + var delivery = new OutboundWebhookDelivery( + Guid.NewGuid(), + subscription.Id, + board.Id, + "card.updated", + "{\"event\":\"card.updated\",\"data\":{}}"); + + dbContext.Users.Add(user); + dbContext.Boards.Add(board); + dbContext.OutboundWebhookSubscriptions.Add(subscription); + dbContext.OutboundWebhookDeliveries.Add(delivery); + await dbContext.SaveChangesAsync(); + + // Simulate multiple failed attempts until max retries is reached. 
+ // Worker settings default: MaxRetries = 3 + for (var attempt = 1; attempt <= 2; attempt++) + { + var updatedAt = delivery.UpdatedAt; + var claimed = await deliveryRepo.TryClaimPendingAsync( + delivery.Id, updatedAt, DateTimeOffset.UtcNow, CancellationToken.None); + claimed.Should().BeTrue($"attempt {attempt} claim should succeed"); + + await dbContext.Entry(delivery).ReloadAsync(); + delivery.ScheduleRetry( + $"HTTP 500 on attempt {attempt}", + DateTimeOffset.UtcNow.AddSeconds(-1), // Make immediately retryable + 500); + await dbContext.SaveChangesAsync(); + await dbContext.Entry(delivery).ReloadAsync(); + } + + // Third attempt (attempt index = 3 which equals MaxRetries) → dead letter. + var finalUpdatedAt = delivery.UpdatedAt; + var finalClaimed = await deliveryRepo.TryClaimPendingAsync( + delivery.Id, finalUpdatedAt, DateTimeOffset.UtcNow, CancellationToken.None); + finalClaimed.Should().BeTrue(); + + await dbContext.Entry(delivery).ReloadAsync(); + delivery.MarkDeadLetter("HTTP 500 on final attempt", 500); + await dbContext.SaveChangesAsync(); + + await dbContext.Entry(delivery).ReloadAsync(); + delivery.Status.Should().Be(WebhookDeliveryStatus.DeadLetter, + "delivery should be dead-lettered after exceeding max retries"); + delivery.LastErrorMessage.Should().Contain("final attempt", + "dead-letter should preserve the failure reason"); + } + + // ── Inactive Subscription → Dead Letter ─────────────────────────── + + [Fact] + public async Task Delivery_ForInactiveSubscription_CanBeDeadLettered() + { + using var scope = _factory.Services.CreateScope(); + var dbContext = scope.ServiceProvider.GetRequiredService(); + + var user = new User("webhook-inactive-user", "webhook-inactive@example.com", "hash"); + var board = new Board("webhook-inactive-board", ownerId: user.Id); + var subscription = new OutboundWebhookSubscription( + board.Id, + user.Id, + "https://example.com/webhook", + "signing-secret-789", + new[] { "card.*" }); + + // Revoke the subscription before 
the delivery is processed. + subscription.Revoke(user.Id); + + var delivery = new OutboundWebhookDelivery( + Guid.NewGuid(), + subscription.Id, + board.Id, + "card.deleted", + "{\"event\":\"card.deleted\",\"data\":{}}"); + + dbContext.Users.Add(user); + dbContext.Boards.Add(board); + dbContext.OutboundWebhookSubscriptions.Add(subscription); + dbContext.OutboundWebhookDeliveries.Add(delivery); + await dbContext.SaveChangesAsync(); + + // The worker would first claim the delivery (move to Processing), + // then check subscription.IsActive and dead-letter. + var deliveryRepo = scope.ServiceProvider.GetRequiredService(); + var claimed = await deliveryRepo.TryClaimPendingAsync( + delivery.Id, delivery.UpdatedAt, DateTimeOffset.UtcNow, CancellationToken.None); + claimed.Should().BeTrue("delivery should be claimable"); + + await dbContext.Entry(delivery).ReloadAsync(); + delivery.Status.Should().Be(WebhookDeliveryStatus.Processing); + + delivery.MarkDeadLetter("Webhook subscription is inactive before delivery dispatch."); + await dbContext.SaveChangesAsync(); + + await dbContext.Entry(delivery).ReloadAsync(); + delivery.Status.Should().Be(WebhookDeliveryStatus.DeadLetter, + "delivery for inactive subscription should be dead-lettered"); + delivery.LastErrorMessage.Should().Contain("inactive", + "dead-letter message should explain why delivery was abandoned"); + } + + // ── Stuck Processing Recovery ──────────────────────────────────── + + [Fact] + public async Task StuckProcessingDelivery_CanBeReturnedToPending() + { + using var scope = _factory.Services.CreateScope(); + var dbContext = scope.ServiceProvider.GetRequiredService(); + var deliveryRepo = scope.ServiceProvider.GetRequiredService(); + + var user = new User("webhook-stuck-user", "webhook-stuck@example.com", "hash"); + var board = new Board("webhook-stuck-board", ownerId: user.Id); + var subscription = new OutboundWebhookSubscription( + board.Id, + user.Id, + "https://example.com/webhook", + 
"signing-secret-stuck", + new[] { "card.*" }); + var delivery = new OutboundWebhookDelivery( + Guid.NewGuid(), + subscription.Id, + board.Id, + "card.moved", + "{\"event\":\"card.moved\",\"data\":{}}"); + + dbContext.Users.Add(user); + dbContext.Boards.Add(board); + dbContext.OutboundWebhookSubscriptions.Add(subscription); + dbContext.OutboundWebhookDeliveries.Add(delivery); + await dbContext.SaveChangesAsync(); + + // Claim the delivery (move to Processing). + var claimed = await deliveryRepo.TryClaimPendingAsync( + delivery.Id, delivery.UpdatedAt, DateTimeOffset.UtcNow, CancellationToken.None); + claimed.Should().BeTrue(); + + await dbContext.Entry(delivery).ReloadAsync(); + delivery.Status.Should().Be(WebhookDeliveryStatus.Processing); + + // Simulate worker recovery: return the stuck delivery to Pending. + delivery.ReturnToPending( + DateTimeOffset.UtcNow, + "Recovered stale processing webhook delivery for retry."); + await dbContext.SaveChangesAsync(); + + await dbContext.Entry(delivery).ReloadAsync(); + delivery.Status.Should().Be(WebhookDeliveryStatus.Pending, + "stuck processing delivery should be recoverable to Pending"); + delivery.LastErrorMessage.Should().Contain("Recovered", + "recovery message should explain why the delivery was returned to Pending"); + } +} diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/WorkerResilienceTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/WorkerResilienceTests.cs new file mode 100644 index 00000000..50404d48 --- /dev/null +++ b/backend/tests/Taskdeck.Api.Tests/Resilience/WorkerResilienceTests.cs @@ -0,0 +1,251 @@ +using FluentAssertions; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Moq; +using Taskdeck.Api.Workers; +using Taskdeck.Application.DTOs; +using Taskdeck.Application.Interfaces; +using Taskdeck.Application.Services; +using Taskdeck.Domain.Common; +using Taskdeck.Domain.Entities; +using Taskdeck.Domain.Enums; +using Taskdeck.Tests.Support; +using 
Xunit;

namespace Taskdeck.Api.Tests.Resilience;

/// <summary>
/// Tests that background workers handle exceptions, DB failures, cancellation, and
/// repeated errors without crashing or leaving items in corrupted states.
/// </summary>
public class WorkerResilienceTests
{
    // NOTE(review): these tests use real Task.Delay waits to let workers iterate.
    // That makes them timing-sensitive on slow CI agents; consider injecting a
    // TimeProvider into the workers so the polls can be driven deterministically.

    // ── Worker Exception in Main Loop ──────────────────────────────────

    /// <summary>
    /// The worker's main loop must survive an exception thrown while resolving
    /// its unit of work: log it, keep heartbeating, and poll again.
    /// </summary>
    [Fact]
    public async Task LlmWorker_WhenProcessBatchThrows_LogsErrorAndContinuesToNextPoll()
    {
        // Arrange: set up a scope factory whose IUnitOfWork always throws.
        var callCount = 0;
        var scopeFactory = CreateScopeFactoryThatThrowsOnUnitOfWork(() =>
        {
            callCount++;
            throw new InvalidOperationException("Simulated DB blowup");
        });

        var logger = new InMemoryLogger();
        var settings = new WorkerSettings
        {
            QueuePollIntervalSeconds = 1,
            EnableAutoQueueProcessing = true,
            MaxBatchSize = 5,
            MaxConcurrency = 1,
            RetryBackoffSeconds = new[] { 0 }
        };
        var heartbeat = new WorkerHeartbeatRegistry();

        var worker = new LlmQueueToProposalWorker(scopeFactory, settings, heartbeat, logger);

        using var cts = new CancellationTokenSource();

        // Act: run the worker for long enough to complete at least one iteration, then cancel.
        var runTask = worker.StartAsync(cts.Token);
        await Task.Delay(1500);
        cts.Cancel();

        try { await runTask; } catch (OperationCanceledException) { }
        await worker.StopAsync(CancellationToken.None);

        // Assert: the worker should have logged the error but NOT crashed;
        // it should have executed at least one iteration.
        callCount.Should().BeGreaterThanOrEqualTo(1,
            "worker should have attempted at least one batch despite DB throwing");

        logger.Entries.Should().Contain(e =>
            e.Level == LogLevel.Error &&
            e.Message.Contains("Error in LlmQueueToProposalWorker iteration"),
            "worker should log the exception and continue looping");

        // Heartbeat should still have been reported.
        heartbeat.GetLastHeartbeat(nameof(LlmQueueToProposalWorker)).Should().NotBeNull(
            "worker should report heartbeats even when processing fails");
    }

    /// <summary>
    /// Same resilience contract for the housekeeping worker: a DB failure is
    /// logged, the heartbeat continues, and polling resumes.
    /// </summary>
    [Fact]
    public async Task ProposalHousekeepingWorker_WhenDbThrows_LogsErrorAndContinuesPolling()
    {
        var callCount = 0;
        var scopeFactory = CreateScopeFactoryThatThrowsOnUnitOfWork(() =>
        {
            callCount++;
            throw new InvalidOperationException("Simulated housekeeping DB failure");
        });

        var logger = new InMemoryLogger();
        var settings = new WorkerSettings();
        var heartbeat = new WorkerHeartbeatRegistry();

        var worker = new ProposalHousekeepingWorker(scopeFactory, settings, heartbeat, logger);

        using var cts = new CancellationTokenSource();
        var runTask = worker.StartAsync(cts.Token);
        await Task.Delay(300);
        cts.Cancel();

        try { await runTask; } catch (OperationCanceledException) { }
        await worker.StopAsync(CancellationToken.None);

        callCount.Should().BeGreaterThan(0);
        logger.Entries.Should().Contain(e =>
            e.Level == LogLevel.Error &&
            e.Message.Contains("Error in ProposalHousekeepingWorker iteration"));
        heartbeat.GetLastHeartbeat(nameof(ProposalHousekeepingWorker)).Should().NotBeNull();
    }

    // ── Worker Cancellation → Clean Shutdown ───────────────────────────

    /// <summary>
    /// With an empty queue, cancelling/stopping the worker must not throw and
    /// must not produce any error-level log entries.
    /// </summary>
    [Fact]
    public async Task LlmWorker_WhenCancelled_ExitsWithoutCrashing()
    {
        // Arrange: the worker has nothing to process; we test clean cancellation.
        // TODO(review): the generic type arguments below were lost in transit;
        // confirm the repository interface / item / status type names.
        var mockLlmQueue = new Mock<ILlmQueueRepository>();
        mockLlmQueue
            .Setup(q => q.GetByStatusAsync(It.IsAny<LlmQueueStatus>(), It.IsAny<CancellationToken>()))
            .ReturnsAsync(Enumerable.Empty<LlmQueueItem>());

        var mockUnitOfWork = new Mock<IUnitOfWork>();
        mockUnitOfWork.Setup(u => u.LlmQueue).Returns(mockLlmQueue.Object);

        var scopeFactory = CreateScopeFactoryWithUnitOfWork(mockUnitOfWork.Object);
        var logger = new InMemoryLogger();
        var settings = new WorkerSettings
        {
            QueuePollIntervalSeconds = 1,
            EnableAutoQueueProcessing = true,
            MaxBatchSize = 5,
            MaxConcurrency = 1,
            RetryBackoffSeconds = new[] { 0 }
        };
        var heartbeat = new WorkerHeartbeatRegistry();

        var worker = new LlmQueueToProposalWorker(scopeFactory, settings, heartbeat, logger);

        using var cts = new CancellationTokenSource();
        await worker.StartAsync(cts.Token);

        // Let it run at least one cycle.
        await Task.Delay(1500);

        // StopAsync triggers cancellation and waits for ExecuteAsync to complete.
        // This should NOT throw -- the BackgroundService infrastructure handles OperationCanceledException.
        var stopAct = () => worker.StopAsync(CancellationToken.None);
        await stopAct.Should().NotThrowAsync(
            "worker should handle cancellation cleanly without throwing");

        // Assert: startup log should be present (proving the worker actually ran).
        logger.Entries.Should().Contain(e =>
            e.Level == LogLevel.Information &&
            e.Message.Contains("LlmQueueToProposalWorker starting"),
            "worker should have logged startup before cancellation");

        // The worker should not have logged any errors during normal operation.
        logger.Entries.Should().NotContain(e =>
            e.Level == LogLevel.Error,
            "worker should not log errors during normal processing and cancellation");
    }

    /// <summary>
    /// With auto queue processing disabled, the queue must never be queried,
    /// but the worker still reports heartbeats.
    /// </summary>
    [Fact]
    public async Task LlmWorker_WhenAutoQueueProcessingDisabled_SkipsProcessingButStillReportsHeartbeat()
    {
        // TODO(review): generic type arguments lost in transit — confirm names.
        var mockLlmQueue = new Mock<ILlmQueueRepository>();
        var processCallCount = 0;
        mockLlmQueue
            .Setup(q => q.GetByStatusAsync(It.IsAny<LlmQueueStatus>(), It.IsAny<CancellationToken>()))
            .Callback(() => processCallCount++)
            .ReturnsAsync(Enumerable.Empty<LlmQueueItem>());

        var mockUnitOfWork = new Mock<IUnitOfWork>();
        mockUnitOfWork.Setup(u => u.LlmQueue).Returns(mockLlmQueue.Object);

        var scopeFactory = CreateScopeFactoryWithUnitOfWork(mockUnitOfWork.Object);
        var logger = new InMemoryLogger();
        var settings = new WorkerSettings
        {
            QueuePollIntervalSeconds = 1,
            EnableAutoQueueProcessing = false, // Disabled
            MaxBatchSize = 5,
            MaxConcurrency = 1,
            RetryBackoffSeconds = new[] { 0 }
        };
        var heartbeat = new WorkerHeartbeatRegistry();

        var worker = new LlmQueueToProposalWorker(scopeFactory, settings, heartbeat, logger);

        using var cts = new CancellationTokenSource();
        var runTask = worker.StartAsync(cts.Token);
        await Task.Delay(1500);
        cts.Cancel();

        try { await runTask; } catch (OperationCanceledException) { }
        await worker.StopAsync(CancellationToken.None);

        // Queue should never have been queried because processing is disabled.
        processCallCount.Should().Be(0,
            "worker should skip batch processing when EnableAutoQueueProcessing=false");

        // But heartbeats should still be reported.
        heartbeat.GetLastHeartbeat(nameof(LlmQueueToProposalWorker)).Should().NotBeNull(
            "heartbeats should be reported even when processing is disabled");
    }

    // ── Helpers ────────────────────────────────────────────────────────

    /// <summary>
    /// Creates an IServiceScopeFactory where resolving IUnitOfWork invokes
    /// the provided action (which is expected to throw).
    /// </summary>
    private static IServiceScopeFactory CreateScopeFactoryThatThrowsOnUnitOfWork(Action onResolve)
    {
        var mockScope = new Mock<IServiceScope>();
        var mockServiceProvider = new Mock<IServiceProvider>();

        mockServiceProvider
            .Setup(sp => sp.GetService(typeof(IUnitOfWork)))
            .Returns(() =>
            {
                onResolve();
                return null!;
            });

        mockScope.Setup(s => s.ServiceProvider).Returns(mockServiceProvider.Object);

        var mockScopeFactory = new Mock<IServiceScopeFactory>();
        mockScopeFactory
            .Setup(f => f.CreateScope())
            .Returns(mockScope.Object);

        return mockScopeFactory.Object;
    }

    /// <summary>
    /// Creates an IServiceScopeFactory that resolves a real IUnitOfWork mock.
    /// </summary>
    private static IServiceScopeFactory CreateScopeFactoryWithUnitOfWork(IUnitOfWork unitOfWork)
    {
        var mockScope = new Mock<IServiceScope>();
        var mockServiceProvider = new Mock<IServiceProvider>();

        mockServiceProvider
            .Setup(sp => sp.GetService(typeof(IUnitOfWork)))
            .Returns(unitOfWork);

        mockScope.Setup(s => s.ServiceProvider).Returns(mockServiceProvider.Object);

        var mockScopeFactory = new Mock<IServiceScopeFactory>();
        mockScopeFactory
            .Setup(f => f.CreateScope())
            .Returns(mockScope.Object);

        return mockScopeFactory.Object;
    }
}