From 6e2949406fd862788235d740e92eb7e449985dda Mon Sep 17 00:00:00 2001 From: Chris0Jeky Date: Fri, 10 Apr 2026 01:11:44 +0100 Subject: [PATCH 01/11] Add worker resilience tests for exception handling and clean shutdown Tests that LlmQueueToProposalWorker and ProposalHousekeepingWorker handle database failures, exceptions in the main loop, cancellation, and disabled processing without crashing or losing heartbeats. --- .../Resilience/WorkerResilienceTests.cs | 251 ++++++++++++++++++ 1 file changed, 251 insertions(+) create mode 100644 backend/tests/Taskdeck.Api.Tests/Resilience/WorkerResilienceTests.cs diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/WorkerResilienceTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/WorkerResilienceTests.cs new file mode 100644 index 00000000..9ed8d4c1 --- /dev/null +++ b/backend/tests/Taskdeck.Api.Tests/Resilience/WorkerResilienceTests.cs @@ -0,0 +1,251 @@ +using FluentAssertions; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using Moq; +using Taskdeck.Api.Workers; +using Taskdeck.Application.DTOs; +using Taskdeck.Application.Interfaces; +using Taskdeck.Application.Services; +using Taskdeck.Domain.Common; +using Taskdeck.Domain.Entities; +using Taskdeck.Domain.Enums; +using Taskdeck.Tests.Support; +using Xunit; + +namespace Taskdeck.Api.Tests.Resilience; + +/// +/// Tests that background workers handle exceptions, DB failures, cancellation, and +/// repeated errors without crashing or leaving items in corrupted states. +/// +public class WorkerResilienceTests +{ + // ── Worker Exception in Main Loop ────────────────────────────────── + + [Fact] + public async Task LlmWorker_WhenProcessBatchThrows_LogsErrorAndContinuesToNextPoll() + { + // Arrange: set up a scope factory whose IUnitOfWork always throws. 
+ var callCount = 0; + var scopeFactory = CreateScopeFactoryThatThrowsOnUnitOfWork(() => + { + callCount++; + throw new InvalidOperationException("Simulated DB blowup"); + }); + + var logger = new InMemoryLogger(); + var settings = new WorkerSettings + { + QueuePollIntervalSeconds = 0, + EnableAutoQueueProcessing = true, + MaxBatchSize = 5, + MaxConcurrency = 1, + RetryBackoffSeconds = new[] { 0 } + }; + var heartbeat = new WorkerHeartbeatRegistry(); + + var worker = new LlmQueueToProposalWorker(scopeFactory, settings, heartbeat, logger); + + using var cts = new CancellationTokenSource(); + + // Act: run the worker for a brief window then cancel. + var runTask = worker.StartAsync(cts.Token); + await Task.Delay(300); + cts.Cancel(); + + try { await runTask; } catch (OperationCanceledException) { } + await worker.StopAsync(CancellationToken.None); + + // Assert: the worker should have logged the error but NOT crashed; + // it should have attempted at least one iteration. + callCount.Should().BeGreaterThan(0, + "worker should have attempted at least one batch despite DB throwing"); + + logger.Entries.Should().Contain(e => + e.Level == LogLevel.Error && + e.Message.Contains("Error in LlmQueueToProposalWorker iteration"), + "worker should log the exception and continue looping"); + + // Heartbeat should still have been reported. 
+ heartbeat.GetLastHeartbeat(nameof(LlmQueueToProposalWorker)).Should().NotBeNull( + "worker should report heartbeats even when processing fails"); + } + + [Fact] + public async Task ProposalHousekeepingWorker_WhenDbThrows_LogsErrorAndContinuesPolling() + { + var callCount = 0; + var scopeFactory = CreateScopeFactoryThatThrowsOnUnitOfWork(() => + { + callCount++; + throw new InvalidOperationException("Simulated housekeeping DB failure"); + }); + + var logger = new InMemoryLogger(); + var settings = new WorkerSettings(); + var heartbeat = new WorkerHeartbeatRegistry(); + + var worker = new ProposalHousekeepingWorker(scopeFactory, settings, heartbeat, logger); + + using var cts = new CancellationTokenSource(); + var runTask = worker.StartAsync(cts.Token); + await Task.Delay(300); + cts.Cancel(); + + try { await runTask; } catch (OperationCanceledException) { } + await worker.StopAsync(CancellationToken.None); + + callCount.Should().BeGreaterThan(0); + logger.Entries.Should().Contain(e => + e.Level == LogLevel.Error && + e.Message.Contains("Error in ProposalHousekeepingWorker iteration")); + heartbeat.GetLastHeartbeat(nameof(ProposalHousekeepingWorker)).Should().NotBeNull(); + } + + // ── Worker Cancellation → Clean Shutdown ─────────────────────────── + + [Fact] + public async Task LlmWorker_WhenCancelled_ExitsWithoutCrashing() + { + // Arrange: the worker has nothing to process; we test clean cancellation. 
+ var mockLlmQueue = new Mock(); + mockLlmQueue + .Setup(q => q.GetByStatusAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(Enumerable.Empty()); + + var mockUnitOfWork = new Mock(); + mockUnitOfWork.Setup(u => u.LlmQueue).Returns(mockLlmQueue.Object); + + var scopeFactory = CreateScopeFactoryWithUnitOfWork(mockUnitOfWork.Object); + var logger = new InMemoryLogger(); + var settings = new WorkerSettings + { + QueuePollIntervalSeconds = 0, + EnableAutoQueueProcessing = true, + MaxBatchSize = 5, + MaxConcurrency = 1, + RetryBackoffSeconds = new[] { 0 } + }; + var heartbeat = new WorkerHeartbeatRegistry(); + + var worker = new LlmQueueToProposalWorker(scopeFactory, settings, heartbeat, logger); + + using var cts = new CancellationTokenSource(); + await worker.StartAsync(cts.Token); + + // Let it run at least one cycle. + await Task.Delay(150); + + // StopAsync triggers cancellation and waits for ExecuteAsync to complete. + // This should NOT throw -- the BackgroundService infrastructure handles OperationCanceledException. + var stopAct = () => worker.StopAsync(CancellationToken.None); + await stopAct.Should().NotThrowAsync( + "worker should handle cancellation cleanly without throwing"); + + // Assert: startup log should be present (proving the worker actually ran). + logger.Entries.Should().Contain(e => + e.Level == LogLevel.Information && + e.Message.Contains("LlmQueueToProposalWorker starting"), + "worker should have logged startup before cancellation"); + + // The worker should not have logged any errors during normal operation. 
+ logger.Entries.Should().NotContain(e => + e.Level == LogLevel.Error, + "worker should not log errors during normal processing and cancellation"); + } + + [Fact] + public async Task LlmWorker_WhenAutoQueueProcessingDisabled_SkipsProcessingButStillReportsHeartbeat() + { + var mockLlmQueue = new Mock(); + var processCallCount = 0; + mockLlmQueue + .Setup(q => q.GetByStatusAsync(It.IsAny(), It.IsAny())) + .Callback(() => processCallCount++) + .ReturnsAsync(Enumerable.Empty()); + + var mockUnitOfWork = new Mock(); + mockUnitOfWork.Setup(u => u.LlmQueue).Returns(mockLlmQueue.Object); + + var scopeFactory = CreateScopeFactoryWithUnitOfWork(mockUnitOfWork.Object); + var logger = new InMemoryLogger(); + var settings = new WorkerSettings + { + QueuePollIntervalSeconds = 0, + EnableAutoQueueProcessing = false, // Disabled + MaxBatchSize = 5, + MaxConcurrency = 1, + RetryBackoffSeconds = new[] { 0 } + }; + var heartbeat = new WorkerHeartbeatRegistry(); + + var worker = new LlmQueueToProposalWorker(scopeFactory, settings, heartbeat, logger); + + using var cts = new CancellationTokenSource(); + var runTask = worker.StartAsync(cts.Token); + await Task.Delay(200); + cts.Cancel(); + + try { await runTask; } catch (OperationCanceledException) { } + await worker.StopAsync(CancellationToken.None); + + // Queue should never have been queried because processing is disabled. + processCallCount.Should().Be(0, + "worker should skip batch processing when EnableAutoQueueProcessing=false"); + + // But heartbeats should still be reported. + heartbeat.GetLastHeartbeat(nameof(LlmQueueToProposalWorker)).Should().NotBeNull( + "heartbeats should be reported even when processing is disabled"); + } + + // ── Helpers ──────────────────────────────────────────────────────── + + /// + /// Creates an IServiceScopeFactory where resolving IUnitOfWork invokes + /// the provided action (which is expected to throw). 
+ /// + private static IServiceScopeFactory CreateScopeFactoryThatThrowsOnUnitOfWork(Action onResolve) + { + var mockScope = new Mock(); + var mockServiceProvider = new Mock(); + + mockServiceProvider + .Setup(sp => sp.GetService(typeof(IUnitOfWork))) + .Returns(() => + { + onResolve(); + return null!; + }); + + mockScope.Setup(s => s.ServiceProvider).Returns(mockServiceProvider.Object); + + var mockScopeFactory = new Mock(); + mockScopeFactory + .Setup(f => f.CreateScope()) + .Returns(mockScope.Object); + + return mockScopeFactory.Object; + } + + /// + /// Creates an IServiceScopeFactory whose scopes resolve the supplied IUnitOfWork instance. + /// + private static IServiceScopeFactory CreateScopeFactoryWithUnitOfWork(IUnitOfWork unitOfWork) + { + var mockScope = new Mock(); + var mockServiceProvider = new Mock(); + + mockServiceProvider + .Setup(sp => sp.GetService(typeof(IUnitOfWork))) + .Returns(unitOfWork); + + mockScope.Setup(s => s.ServiceProvider).Returns(mockServiceProvider.Object); + + var mockScopeFactory = new Mock(); + mockScopeFactory + .Setup(f => f.CreateScope()) + .Returns(mockScope.Object); + + return mockScopeFactory.Object; + } +} From c2905ca5de6560be8d39aa4056ac514821ef7788 Mon Sep 17 00:00:00 2001 From: Chris0Jeky Date: Fri, 10 Apr 2026 01:11:54 +0100 Subject: [PATCH 02/11] Add LLM provider degradation tests for timeout, exception, and unavailability Tests that provider timeouts, exceptions, and total unavailability produce degraded responses or error contracts rather than infinite waits or crashes. Verifies non-LLM features (board CRUD, capture) still work when all providers are down. 
--- .../Resilience/LlmProviderDegradationTests.cs | 307 ++++++++++++++++++ 1 file changed, 307 insertions(+) create mode 100644 backend/tests/Taskdeck.Api.Tests/Resilience/LlmProviderDegradationTests.cs diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/LlmProviderDegradationTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/LlmProviderDegradationTests.cs new file mode 100644 index 00000000..428d290c --- /dev/null +++ b/backend/tests/Taskdeck.Api.Tests/Resilience/LlmProviderDegradationTests.cs @@ -0,0 +1,307 @@ +using System.Net; +using System.Net.Http.Json; +using System.Runtime.CompilerServices; +using System.Text.Json; +using FluentAssertions; +using Microsoft.AspNetCore.Hosting; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.DependencyInjection.Extensions; +using Taskdeck.Api.Tests.Support; +using Taskdeck.Application.DTOs; +using Taskdeck.Application.Services; +using Xunit; + +namespace Taskdeck.Api.Tests.Resilience; + +/// +/// Tests that LLM provider failures (timeout, thrown exception, total unavailability) +/// are surfaced as degraded responses or error contracts rather than crashes or infinite waits. 
+/// +public class LlmProviderDegradationTests : IClassFixture +{ + private readonly TestWebApplicationFactory _baseFactory; + + public LlmProviderDegradationTests(TestWebApplicationFactory baseFactory) + { + _baseFactory = baseFactory; + } + + // ── Provider Timeout ─────────────────────────────────────────────── + + [Fact] + public async Task SendMessage_WhenProviderTimesOut_ReturnsDegradedResponseNotInfiniteWait() + { + using var factory = _baseFactory.WithWebHostBuilder(builder => + { + builder.UseEnvironment("Development"); + builder.ConfigureServices(services => + { + services.RemoveAll(); + services.AddScoped(_ => new TimeoutProviderStub()); + }); + }); + using var client = factory.CreateClient(); + client.Timeout = TimeSpan.FromSeconds(30); + + await ApiTestHarness.AuthenticateAsync(client, "llm-timeout-resilience"); + + var createSessionResponse = await client.PostAsJsonAsync( + "/api/llm/chat/sessions", + new CreateChatSessionDto("Timeout provider test")); + createSessionResponse.StatusCode.Should().Be(HttpStatusCode.Created); + var session = await createSessionResponse.Content.ReadFromJsonAsync(); + session.Should().NotBeNull(); + + var sendMessageResponse = await client.PostAsJsonAsync( + $"/api/llm/chat/sessions/{session!.Id}/messages", + new SendChatMessageDto("tell me something")); + + // The request should not hang forever; it should return within the test timeout. + // The response may be degraded or an error -- the key assertion is no infinite wait. + sendMessageResponse.Should().NotBeNull( + "request should complete even when provider times out"); + + // Since the provider throws OperationCanceledException simulating timeout, + // the chat service should handle this and return a 500 error contract + // rather than an unhandled exception. 
+ var statusCode = (int)sendMessageResponse.StatusCode; + statusCode.Should().BeOneOf(new[] { 200, 500 }, + "should either return a degraded response or an error contract, not hang"); + } + + // ── Provider Throws Exception ────────────────────────────────────── + + [Fact] + public async Task SendMessage_WhenProviderThrowsException_ReturnsErrorContract() + { + using var factory = _baseFactory.WithWebHostBuilder(builder => + { + builder.UseEnvironment("Development"); + builder.ConfigureServices(services => + { + services.RemoveAll(); + services.AddScoped(_ => new ThrowingProviderStub()); + }); + }); + using var client = factory.CreateClient(); + + await ApiTestHarness.AuthenticateAsync(client, "llm-throw-resilience"); + + var createSessionResponse = await client.PostAsJsonAsync( + "/api/llm/chat/sessions", + new CreateChatSessionDto("Throwing provider test")); + createSessionResponse.StatusCode.Should().Be(HttpStatusCode.Created); + var session = await createSessionResponse.Content.ReadFromJsonAsync(); + session.Should().NotBeNull(); + + var sendMessageResponse = await client.PostAsJsonAsync( + $"/api/llm/chat/sessions/{session!.Id}/messages", + new SendChatMessageDto("create card 'Test'")); + + sendMessageResponse.Should().NotBeNull(); + var statusCode = (int)sendMessageResponse.StatusCode; + statusCode.Should().BeOneOf(new[] { 200, 500 }, + "should return an error contract or degraded response, not crash"); + + if (sendMessageResponse.StatusCode == HttpStatusCode.InternalServerError) + { + var body = await sendMessageResponse.Content.ReadFromJsonAsync(); + body.TryGetProperty("errorCode", out _).Should().BeTrue( + "500 response should follow error contract with errorCode"); + body.TryGetProperty("message", out _).Should().BeTrue( + "500 response should follow error contract with message"); + } + } + + // ── Provider Unavailable but Non-LLM Features Still Work ────────── + + [Fact] + public async Task BoardCrud_StillWorks_WhenAllProvidersUnavailable() + { + using 
var factory = _baseFactory.WithWebHostBuilder(builder => + { + builder.UseEnvironment("Development"); + builder.ConfigureServices(services => + { + services.RemoveAll(); + services.AddScoped(_ => new TotallyDeadProviderStub()); + }); + }); + using var client = factory.CreateClient(); + + await ApiTestHarness.AuthenticateAsync(client, "llm-dead-board-crud"); + + // Board CRUD should work regardless of LLM provider state. + var board = await ApiTestHarness.CreateBoardAsync(client, "resilience-board"); + board.Should().NotBeNull(); + board.Name.Should().StartWith("resilience-board"); + + var getResponse = await client.GetAsync($"/api/boards/{board.Id}"); + getResponse.StatusCode.Should().Be(HttpStatusCode.OK); + + var listResponse = await client.GetAsync("/api/boards"); + listResponse.StatusCode.Should().Be(HttpStatusCode.OK); + } + + [Fact] + public async Task CaptureItems_StillWork_WhenProviderUnavailable() + { + using var factory = _baseFactory.WithWebHostBuilder(builder => + { + builder.UseEnvironment("Development"); + builder.ConfigureServices(services => + { + services.RemoveAll(); + services.AddScoped(_ => new TotallyDeadProviderStub()); + }); + }); + using var client = factory.CreateClient(); + + await ApiTestHarness.AuthenticateAsync(client, "llm-dead-capture"); + + // Capture should still accept items even when the LLM is dead. + // The items queue up for later processing. 
+ var captureResponse = await client.PostAsJsonAsync( + "/api/capture/items", + new CreateCaptureItemDto(null, "capture while LLM is down")); + captureResponse.StatusCode.Should().Be(HttpStatusCode.Created, + "capture should accept items even when LLM provider is unavailable"); + } + + // ── Provider Health Reports Unhealthy ────────────────────────────── + + [Fact] + public async Task ProviderHealth_ReportsUnhealthy_WhenProviderIsDown() + { + using var factory = _baseFactory.WithWebHostBuilder(builder => + { + builder.UseEnvironment("Development"); + builder.ConfigureServices(services => + { + services.RemoveAll(); + services.AddScoped(_ => new TotallyDeadProviderStub()); + }); + }); + using var client = factory.CreateClient(); + + await ApiTestHarness.AuthenticateAsync(client, "llm-dead-health"); + + var response = await client.GetAsync("/api/llm/chat/health"); + response.StatusCode.Should().Be(HttpStatusCode.OK); + + var payload = await response.Content.ReadFromJsonAsync(); + payload.Should().NotBeNull(); + payload!.IsAvailable.Should().BeFalse( + "health check should report the provider as unavailable"); + payload.ErrorMessage.Should().NotBeNullOrWhiteSpace( + "health check should include an error explanation"); + } + + [Fact] + public async Task ProviderHealth_WithProbe_ReportsUnhealthy_WhenProviderIsDown() + { + using var factory = _baseFactory.WithWebHostBuilder(builder => + { + builder.UseEnvironment("Development"); + builder.ConfigureServices(services => + { + services.RemoveAll(); + services.AddScoped(_ => new TotallyDeadProviderStub()); + }); + }); + using var client = factory.CreateClient(); + + await ApiTestHarness.AuthenticateAsync(client, "llm-dead-probe"); + + var response = await client.GetAsync("/api/llm/chat/health?probe=true"); + response.StatusCode.Should().Be(HttpStatusCode.OK); + + var payload = await response.Content.ReadFromJsonAsync(); + payload.Should().NotBeNull(); + payload!.IsAvailable.Should().BeFalse(); + 
payload.IsProbed.Should().BeTrue(); + } + + // ── Stub Implementations ────────────────────────────────────────── + + /// + /// Provider that simulates a timeout by delaying beyond cancellation. + /// + private sealed class TimeoutProviderStub : ILlmProvider + { + public async Task CompleteAsync(ChatCompletionRequest request, CancellationToken ct = default) + { + // Simulate a long wait that would be cancelled by the service's timeout. + using var internalCts = CancellationTokenSource.CreateLinkedTokenSource(ct); + internalCts.CancelAfter(TimeSpan.FromMilliseconds(50)); + await Task.Delay(TimeSpan.FromSeconds(60), internalCts.Token); + throw new InvalidOperationException("Should not reach here"); + } + + public async IAsyncEnumerable StreamAsync( + ChatCompletionRequest request, + [EnumeratorCancellation] CancellationToken ct = default) + { + await Task.Delay(TimeSpan.FromSeconds(60), ct); + yield return new LlmTokenEvent("timeout", true); + } + + public Task GetHealthAsync(CancellationToken ct = default) + => Task.FromResult(new LlmHealthStatus(false, "TimeoutStub", "Provider timed out")); + + public Task ProbeAsync(CancellationToken ct = default) + => Task.FromResult(new LlmHealthStatus(false, "TimeoutStub", "Provider timed out", IsProbed: true)); + } + + /// + /// Provider that throws an unhandled exception on every call. 
+ /// + private sealed class ThrowingProviderStub : ILlmProvider + { + public Task CompleteAsync(ChatCompletionRequest request, CancellationToken ct = default) + => throw new InvalidOperationException("Simulated provider crash"); + + public async IAsyncEnumerable StreamAsync( + ChatCompletionRequest request, + [EnumeratorCancellation] CancellationToken ct = default) + { + await Task.CompletedTask; + throw new InvalidOperationException("Simulated stream crash"); +#pragma warning disable CS0162 + yield break; +#pragma warning restore CS0162 + } + + public Task GetHealthAsync(CancellationToken ct = default) + => Task.FromResult(new LlmHealthStatus(false, "ThrowingStub", "Provider threw exception")); + + public Task ProbeAsync(CancellationToken ct = default) + => Task.FromResult(new LlmHealthStatus(false, "ThrowingStub", "Provider threw exception", IsProbed: true)); + } + + /// + /// Provider where everything reports unavailable. + /// + private sealed class TotallyDeadProviderStub : ILlmProvider + { + public Task CompleteAsync(ChatCompletionRequest request, CancellationToken ct = default) + => throw new InvalidOperationException("All providers are down"); + + public async IAsyncEnumerable StreamAsync( + ChatCompletionRequest request, + [EnumeratorCancellation] CancellationToken ct = default) + { + await Task.CompletedTask; + throw new InvalidOperationException("All providers are down"); +#pragma warning disable CS0162 + yield break; +#pragma warning restore CS0162 + } + + public Task GetHealthAsync(CancellationToken ct = default) + => Task.FromResult(new LlmHealthStatus(false, "Dead", "All providers are unavailable")); + + public Task ProbeAsync(CancellationToken ct = default) + => Task.FromResult(new LlmHealthStatus(false, "Dead", "All providers are unavailable", IsProbed: true)); + } +} From f58c5b6aff1fec1deb2ddab318bc69be169276d5 Mon Sep 17 00:00:00 2001 From: Chris0Jeky Date: Fri, 10 Apr 2026 01:12:00 +0100 Subject: [PATCH 03/11] Add database resilience tests for 
health checks, 404 handling, and conflicts Tests that health endpoints report database status accurately, non-existent resources return 404 error contracts instead of 500, concurrent writes handle conflicts gracefully, and invalid data returns validation errors. --- .../Resilience/DatabaseResilienceTests.cs | 119 ++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 backend/tests/Taskdeck.Api.Tests/Resilience/DatabaseResilienceTests.cs diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/DatabaseResilienceTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/DatabaseResilienceTests.cs new file mode 100644 index 00000000..aa722b4c --- /dev/null +++ b/backend/tests/Taskdeck.Api.Tests/Resilience/DatabaseResilienceTests.cs @@ -0,0 +1,119 @@ +using System.Net; +using System.Net.Http.Json; +using System.Text.Json; +using FluentAssertions; +using Taskdeck.Api.Tests.Support; +using Taskdeck.Application.DTOs; +using Xunit; + +namespace Taskdeck.Api.Tests.Resilience; + +/// +/// Tests that database operations produce appropriate error responses and +/// that health endpoints report database status accurately. +/// +public class DatabaseResilienceTests : IClassFixture +{ + private readonly HttpClient _client; + + public DatabaseResilienceTests(TestWebApplicationFactory factory) + { + _client = factory.CreateClient(); + } + + // ── Health Endpoint Reports Database Status ─────────────────────── + + [Fact] + public async Task ReadyCheck_IncludesDatabaseCheck_WhenDatabaseIsReachable() + { + var response = await _client.GetAsync("/health/ready"); + + // With a working DB, ready check may be OK or 503 depending on worker state, + // but the database check itself should be Healthy. 
+ var payload = await response.Content.ReadFromJsonAsync(); + payload.TryGetProperty("checks", out var checks).Should().BeTrue(); + + var database = checks.GetProperty("database"); + database.GetProperty("status").GetString().Should().Be("Healthy", + "database check should report Healthy when database is reachable"); + } + + [Fact] + public async Task LiveCheck_AlwaysReturnsHealthy_RegardlessOfDatabaseState() + { + var response = await _client.GetAsync("/health/live"); + + response.StatusCode.Should().Be(HttpStatusCode.OK, + "liveness probe should always return 200"); + + var payload = await response.Content.ReadFromJsonAsync(); + payload.GetProperty("status").GetString().Should().Be("Healthy", + "live check is a simple heartbeat, independent of database state"); + } + + // ── Database Error Handling in API Operations ────────────────────── + + [Fact] + public async Task Operations_OnNonExistentResource_ReturnNotFoundInsteadOfCrash() + { + await ApiTestHarness.AuthenticateAsync(_client, "db-resilience-notfound"); + + // Accessing a non-existent board should return 404, not 500. + var response = await _client.GetAsync($"/api/boards/{Guid.NewGuid()}"); + response.StatusCode.Should().Be(HttpStatusCode.NotFound, + "non-existent resource should return 404, not a database crash"); + + var body = await response.Content.ReadFromJsonAsync(); + body.TryGetProperty("errorCode", out var errorCode).Should().BeTrue( + "404 response should follow error contract"); + errorCode.GetString().Should().Be("NotFound"); + } + + [Fact] + public async Task ConcurrentWrites_HandleConflictsGracefully() + { + await ApiTestHarness.AuthenticateAsync(_client, "db-resilience-conflict"); + + // Create a board first. + var board = await ApiTestHarness.CreateBoardAsync(_client, "db-conflict-board"); + + // Try to delete the same board twice in quick succession. 
+ var delete1 = _client.DeleteAsync($"/api/boards/{board.Id}"); + var delete2 = _client.DeleteAsync($"/api/boards/{board.Id}"); + + var results = await Task.WhenAll(delete1, delete2); + + // One should succeed (204/200), the other should get 404. + // Neither should be 500. + foreach (var result in results) + { + var statusCode = (int)result.StatusCode; + statusCode.Should().NotBe(500, + "concurrent operations should not cause unhandled 500 errors"); + } + + var statusCodes = results.Select(r => (int)r.StatusCode).OrderBy(s => s).ToArray(); + statusCodes.Should().Contain(s => s >= 200 && s < 300, + "at least one delete should succeed"); + } + + // ── Database Write Validation ───────────────────────────────────── + + [Fact] + public async Task CreateBoard_WithInvalidData_ReturnsValidationError() + { + await ApiTestHarness.AuthenticateAsync(_client, "db-resilience-validation"); + + // An empty board name should return a validation error, not a DB crash. + var response = await _client.PostAsJsonAsync( + "/api/boards", + new CreateBoardDto("", "Empty name board")); + + response.StatusCode.Should().Be(HttpStatusCode.BadRequest, + "invalid data should return 400, not a database crash"); + + var body = await response.Content.ReadFromJsonAsync(); + body.TryGetProperty("errorCode", out _).Should().BeTrue( + "400 response should follow the error contract"); + } +} From 15bc19b7007b067bd13645e5d459d6c307d0d3fa Mon Sep 17 00:00:00 2001 From: Chris0Jeky Date: Fri, 10 Apr 2026 01:12:07 +0100 Subject: [PATCH 04/11] Add SignalR degradation tests for hub error isolation and disconnection Tests that HubException on one client does not disconnect other clients, that invalid operations (joining non-existent boards, editing without joining) produce HubException without killing the connection, and that disconnected clients are properly removed from presence tracking. 
--- .../Resilience/SignalRDegradationTests.cs | 168 ++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 backend/tests/Taskdeck.Api.Tests/Resilience/SignalRDegradationTests.cs diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/SignalRDegradationTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/SignalRDegradationTests.cs new file mode 100644 index 00000000..a69da990 --- /dev/null +++ b/backend/tests/Taskdeck.Api.Tests/Resilience/SignalRDegradationTests.cs @@ -0,0 +1,168 @@ +using System.Net; +using System.Net.Http.Json; +using FluentAssertions; +using Microsoft.AspNetCore.SignalR; +using Microsoft.AspNetCore.SignalR.Client; +using Taskdeck.Api.Realtime; +using Taskdeck.Api.Tests.Support; +using Taskdeck.Application.DTOs; +using Taskdeck.Domain.Enums; +using Xunit; + +namespace Taskdeck.Api.Tests.Resilience; + +/// +/// Tests that SignalR hub failures on one connection do not cascade to other +/// connected clients, and that invalid operations produce HubException rather +/// than killing the connection. +/// +public class SignalRDegradationTests : IClassFixture +{ + private readonly TestWebApplicationFactory _factory; + + public SignalRDegradationTests(TestWebApplicationFactory factory) + { + _factory = factory; + } + + // ── Hub Exception Isolation ──────────────────────────────────────── + + [Fact] + public async Task JoinBoard_WithInvalidBoardId_ThrowsHubExceptionButConnectionSurvives() + { + using var client = _factory.CreateClient(); + var user = await ApiTestHarness.AuthenticateAsync(client, "hub-resilience-bad-board"); + + await using var connection = SignalRTestHelper.CreateBoardsHubConnection(_factory, user.Token); + await connection.StartAsync(); + connection.State.Should().Be(HubConnectionState.Connected); + + // Try to join a non-existent board — should throw HubException. 
+ var act = () => connection.InvokeAsync("JoinBoard", Guid.NewGuid()); + await act.Should().ThrowAsync( + "joining a non-existent board should throw a HubException"); + + // Connection should still be alive after the error. + connection.State.Should().Be(HubConnectionState.Connected, + "one failed hub invocation should not kill the connection"); + } + + [Fact] + public async Task SetEditingCard_WithoutJoining_ThrowsHubExceptionButConnectionSurvives() + { + using var client = _factory.CreateClient(); + var user = await ApiTestHarness.AuthenticateAsync(client, "hub-resilience-no-join"); + var board = await ApiTestHarness.CreateBoardAsync(client, "hub-resilience-board"); + + await using var connection = SignalRTestHelper.CreateBoardsHubConnection(_factory, user.Token); + await connection.StartAsync(); + + // Try editing a card without joining the board first. + var act = () => connection.InvokeAsync("SetEditingCard", board.Id, Guid.NewGuid()); + await act.Should().ThrowAsync( + "setting editing card without joining should throw a HubException"); + + // Connection should still be connected. + connection.State.Should().Be(HubConnectionState.Connected, + "hub error should not disconnect the client"); + } + + // ── One Client's Error Doesn't Affect Others ────────────────────── + + [Fact] + public async Task ErrorOnOneClient_DoesNotDisconnectOtherClients() + { + using var client1 = _factory.CreateClient(); + using var client2 = _factory.CreateClient(); + + var user1 = await ApiTestHarness.AuthenticateAsync(client1, "hub-resilience-user1"); + var user2 = await ApiTestHarness.AuthenticateAsync(client2, "hub-resilience-user2"); + + var board = await ApiTestHarness.CreateBoardAsync(client1, "hub-resilience-multi"); + + // Share the board with user2. 
+ await client1.PostAsJsonAsync( + $"/api/boards/{board.Id}/access", + new GrantAccessDto(board.Id, user2.UserId, UserRole.Editor)); + + await using var connection1 = SignalRTestHelper.CreateBoardsHubConnection(_factory, user1.Token); + await using var connection2 = SignalRTestHelper.CreateBoardsHubConnection(_factory, user2.Token); + + await connection1.StartAsync(); + await connection2.StartAsync(); + + // Both users join the board. + await connection1.InvokeAsync("JoinBoard", board.Id); + await connection2.InvokeAsync("JoinBoard", board.Id); + + // Give a moment for presence events to propagate. + await Task.Delay(500); + + // Client 1 causes an error by trying to join a non-existent board. + var act = () => connection1.InvokeAsync("JoinBoard", Guid.NewGuid()); + try { await act(); } catch (HubException) { /* expected */ } + + // Client 2 should still be connected and functional. + connection2.State.Should().Be(HubConnectionState.Connected, + "client 2 should be unaffected by client 1's error"); + + // Client 1 should also still be connected (HubException doesn't kill connection). + connection1.State.Should().Be(HubConnectionState.Connected, + "client 1's connection should survive its own hub exception"); + + // Verify client 2 can still perform operations on the hub. 
+ var postErrorAct = () => connection2.InvokeAsync("SetEditingCard", board.Id, (Guid?)null); + await postErrorAct.Should().NotThrowAsync( + "client 2 should be fully functional after client 1's error"); + } + + // ── Disconnection Handling ──────────────────────────────────────── + + [Fact] + public async Task DisconnectedClient_RemovedFromPresence_OtherClientsNotified() + { + using var client1 = _factory.CreateClient(); + using var client2 = _factory.CreateClient(); + + var user1 = await ApiTestHarness.AuthenticateAsync(client1, "hub-disconnect-user1"); + var user2 = await ApiTestHarness.AuthenticateAsync(client2, "hub-disconnect-user2"); + + var board = await ApiTestHarness.CreateBoardAsync(client1, "hub-disconnect-board"); + + await client1.PostAsJsonAsync( + $"/api/boards/{board.Id}/access", + new GrantAccessDto(board.Id, user2.UserId, UserRole.Editor)); + + var presenceCollector = new EventCollector(); + + await using var connection1 = SignalRTestHelper.CreateBoardsHubConnection(_factory, user1.Token); + var connection2 = SignalRTestHelper.CreateBoardsHubConnection(_factory, user2.Token); + + connection1.On("boardPresence", snapshot => presenceCollector.Add(snapshot)); + + await connection1.StartAsync(); + await connection2.StartAsync(); + + await connection1.InvokeAsync("JoinBoard", board.Id); + await connection2.InvokeAsync("JoinBoard", board.Id); + + // Wait for join events. + await SignalRTestHelper.WaitForEventsAsync(presenceCollector, 2, TimeSpan.FromSeconds(3)); + presenceCollector.Clear(); + + // Disconnect client 2 explicitly. + await connection2.DisposeAsync(); + + // Client 1 should receive a presence update showing client 2 left. + var disconnectEvents = await SignalRTestHelper.WaitForEventsAsync( + presenceCollector, 1, TimeSpan.FromSeconds(5)); + + disconnectEvents.Should().HaveCountGreaterThanOrEqualTo(1, + "client 1 should be notified when client 2 disconnects"); + + // The latest presence snapshot should no longer include client 2. 
+ var latestPresence = disconnectEvents.Last(); + latestPresence.Members.Should().NotContain(u => u.UserId == user2.UserId, + "disconnected user should be removed from presence"); + } +} From 741fad358c5fd852add3f3167998002e3dba5efd Mon Sep 17 00:00:00 2001 From: Chris0Jeky Date: Fri, 10 Apr 2026 01:12:13 +0100 Subject: [PATCH 05/11] Add webhook delivery resilience tests for retry, dead-letter, and recovery Tests that webhook delivery failures trigger retry scheduling with backoff, exceed max retries leading to dead-lettering, handle inactive subscriptions by dead-lettering, and recover stuck processing deliveries back to Pending status. --- .../WebhookDeliveryResilienceTests.cs | 256 ++++++++++++++++++ 1 file changed, 256 insertions(+) create mode 100644 backend/tests/Taskdeck.Api.Tests/Resilience/WebhookDeliveryResilienceTests.cs diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/WebhookDeliveryResilienceTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/WebhookDeliveryResilienceTests.cs new file mode 100644 index 00000000..487dfbad --- /dev/null +++ b/backend/tests/Taskdeck.Api.Tests/Resilience/WebhookDeliveryResilienceTests.cs @@ -0,0 +1,256 @@ +using FluentAssertions; +using Microsoft.Extensions.DependencyInjection; +using Taskdeck.Application.Interfaces; +using Taskdeck.Application.Services; +using Taskdeck.Domain.Entities; +using Taskdeck.Domain.Enums; +using Taskdeck.Infrastructure.Persistence; +using Xunit; + +namespace Taskdeck.Api.Tests.Resilience; + +/// <summary> +/// Tests that webhook delivery failures are handled with retries, backoff, and +/// dead-lettering rather than crashing or silently losing deliveries.
+/// </summary> +public class WebhookDeliveryResilienceTests : IClassFixture<TestWebApplicationFactory> +{ + private readonly TestWebApplicationFactory _factory; + + public WebhookDeliveryResilienceTests(TestWebApplicationFactory factory) + { + _factory = factory; + } + + // ── Delivery to Unreachable Target → Retry Scheduling ───────────── + + [Fact] + public async Task Delivery_ToUnreachableEndpoint_IsScheduledForRetry() + { + // Arrange: create entities directly in the DB to simulate a pending delivery. + using var scope = _factory.Services.CreateScope(); + var dbContext = scope.ServiceProvider.GetRequiredService(); + var deliveryRepo = scope.ServiceProvider.GetRequiredService(); + + var user = new User("webhook-retry-user", "webhook-retry@example.com", "hash"); + var board = new Board("webhook-retry-board", ownerId: user.Id); + var subscription = new OutboundWebhookSubscription( + board.Id, + user.Id, + "https://example.com/webhook", + "signing-secret-123", + new[] { "card.*" }); + var delivery = new OutboundWebhookDelivery( + Guid.NewGuid(), + subscription.Id, + board.Id, + "card.created", + "{\"event\":\"card.created\",\"data\":{}}"); + + dbContext.Users.Add(user); + dbContext.Boards.Add(board); + dbContext.OutboundWebhookSubscriptions.Add(subscription); + dbContext.OutboundWebhookDeliveries.Add(delivery); + await dbContext.SaveChangesAsync(); + + // Verify the delivery starts as Pending. + delivery.Status.Should().Be(WebhookDeliveryStatus.Pending); + + // Simulate a delivery failure by manually marking it. + var claimedAt = DateTimeOffset.UtcNow; + var claimed = await deliveryRepo.TryClaimPendingAsync( + delivery.Id, + delivery.UpdatedAt, + claimedAt, + CancellationToken.None); + claimed.Should().BeTrue(); + + await dbContext.Entry(delivery).ReloadAsync(); + delivery.Status.Should().Be(WebhookDeliveryStatus.Processing); + + // Schedule retry (simulating what the worker does on HTTP failure).
+ var nextAttemptAt = DateTimeOffset.UtcNow.AddSeconds(10); + delivery.ScheduleRetry("Webhook endpoint returned HTTP 503.", nextAttemptAt, 503); + await dbContext.SaveChangesAsync(); + + // Assert: the delivery should be back to Pending with retry metadata. + await dbContext.Entry(delivery).ReloadAsync(); + delivery.Status.Should().Be(WebhookDeliveryStatus.Pending, + "failed delivery should be rescheduled as Pending for retry"); + delivery.AttemptCount.Should().Be(1, + "attempt count should be incremented after a failure"); + delivery.LastErrorMessage.Should().Contain("503", + "error message should capture the failure reason"); + delivery.NextAttemptAt.Should().BeAfter(DateTimeOffset.MinValue, + "retry should have a scheduled next attempt time"); + } + + // ── Dead-Lettering After Max Retries ────────────────────────────── + + [Fact] + public async Task Delivery_AfterMaxRetries_IsDeadLettered() + { + using var scope = _factory.Services.CreateScope(); + var dbContext = scope.ServiceProvider.GetRequiredService(); + var deliveryRepo = scope.ServiceProvider.GetRequiredService(); + + var user = new User("webhook-deadletter-user", "webhook-deadletter@example.com", "hash"); + var board = new Board("webhook-deadletter-board", ownerId: user.Id); + var subscription = new OutboundWebhookSubscription( + board.Id, + user.Id, + "https://example.com/webhook", + "signing-secret-456", + new[] { "card.*" }); + var delivery = new OutboundWebhookDelivery( + Guid.NewGuid(), + subscription.Id, + board.Id, + "card.updated", + "{\"event\":\"card.updated\",\"data\":{}}"); + + dbContext.Users.Add(user); + dbContext.Boards.Add(board); + dbContext.OutboundWebhookSubscriptions.Add(subscription); + dbContext.OutboundWebhookDeliveries.Add(delivery); + await dbContext.SaveChangesAsync(); + + // Simulate multiple failed attempts until max retries is reached. 
+ // Worker settings default: MaxRetries = 3 + for (var attempt = 1; attempt <= 2; attempt++) + { + var updatedAt = delivery.UpdatedAt; + var claimed = await deliveryRepo.TryClaimPendingAsync( + delivery.Id, updatedAt, DateTimeOffset.UtcNow, CancellationToken.None); + claimed.Should().BeTrue($"attempt {attempt} claim should succeed"); + + await dbContext.Entry(delivery).ReloadAsync(); + delivery.ScheduleRetry( + $"HTTP 500 on attempt {attempt}", + DateTimeOffset.UtcNow.AddSeconds(-1), // Make immediately retryable + 500); + await dbContext.SaveChangesAsync(); + await dbContext.Entry(delivery).ReloadAsync(); + } + + // Third attempt (attempt index = 3 which equals MaxRetries) → dead letter. + var finalUpdatedAt = delivery.UpdatedAt; + var finalClaimed = await deliveryRepo.TryClaimPendingAsync( + delivery.Id, finalUpdatedAt, DateTimeOffset.UtcNow, CancellationToken.None); + finalClaimed.Should().BeTrue(); + + await dbContext.Entry(delivery).ReloadAsync(); + delivery.MarkDeadLetter("HTTP 500 on final attempt", 500); + await dbContext.SaveChangesAsync(); + + await dbContext.Entry(delivery).ReloadAsync(); + delivery.Status.Should().Be(WebhookDeliveryStatus.DeadLetter, + "delivery should be dead-lettered after exceeding max retries"); + delivery.LastErrorMessage.Should().Contain("final attempt", + "dead-letter should preserve the failure reason"); + } + + // ── Inactive Subscription → Dead Letter ─────────────────────────── + + [Fact] + public async Task Delivery_ForInactiveSubscription_CanBeDeadLettered() + { + using var scope = _factory.Services.CreateScope(); + var dbContext = scope.ServiceProvider.GetRequiredService(); + + var user = new User("webhook-inactive-user", "webhook-inactive@example.com", "hash"); + var board = new Board("webhook-inactive-board", ownerId: user.Id); + var subscription = new OutboundWebhookSubscription( + board.Id, + user.Id, + "https://example.com/webhook", + "signing-secret-789", + new[] { "card.*" }); + + // Revoke the subscription before 
the delivery is processed. + subscription.Revoke(user.Id); + + var delivery = new OutboundWebhookDelivery( + Guid.NewGuid(), + subscription.Id, + board.Id, + "card.deleted", + "{\"event\":\"card.deleted\",\"data\":{}}"); + + dbContext.Users.Add(user); + dbContext.Boards.Add(board); + dbContext.OutboundWebhookSubscriptions.Add(subscription); + dbContext.OutboundWebhookDeliveries.Add(delivery); + await dbContext.SaveChangesAsync(); + + // The worker would first claim the delivery (move to Processing), + // then check subscription.IsActive and dead-letter. + var deliveryRepo = scope.ServiceProvider.GetRequiredService(); + var claimed = await deliveryRepo.TryClaimPendingAsync( + delivery.Id, delivery.UpdatedAt, DateTimeOffset.UtcNow, CancellationToken.None); + claimed.Should().BeTrue("delivery should be claimable"); + + await dbContext.Entry(delivery).ReloadAsync(); + delivery.Status.Should().Be(WebhookDeliveryStatus.Processing); + + delivery.MarkDeadLetter("Webhook subscription is inactive before delivery dispatch."); + await dbContext.SaveChangesAsync(); + + await dbContext.Entry(delivery).ReloadAsync(); + delivery.Status.Should().Be(WebhookDeliveryStatus.DeadLetter, + "delivery for inactive subscription should be dead-lettered"); + delivery.LastErrorMessage.Should().Contain("inactive", + "dead-letter message should explain why delivery was abandoned"); + } + + // ── Stuck Processing Recovery ──────────────────────────────────── + + [Fact] + public async Task StuckProcessingDelivery_CanBeReturnedToPending() + { + using var scope = _factory.Services.CreateScope(); + var dbContext = scope.ServiceProvider.GetRequiredService(); + var deliveryRepo = scope.ServiceProvider.GetRequiredService(); + + var user = new User("webhook-stuck-user", "webhook-stuck@example.com", "hash"); + var board = new Board("webhook-stuck-board", ownerId: user.Id); + var subscription = new OutboundWebhookSubscription( + board.Id, + user.Id, + "https://example.com/webhook", + 
"signing-secret-stuck", + new[] { "card.*" }); + var delivery = new OutboundWebhookDelivery( + Guid.NewGuid(), + subscription.Id, + board.Id, + "card.moved", + "{\"event\":\"card.moved\",\"data\":{}}"); + + dbContext.Users.Add(user); + dbContext.Boards.Add(board); + dbContext.OutboundWebhookSubscriptions.Add(subscription); + dbContext.OutboundWebhookDeliveries.Add(delivery); + await dbContext.SaveChangesAsync(); + + // Claim the delivery (move to Processing). + var claimed = await deliveryRepo.TryClaimPendingAsync( + delivery.Id, delivery.UpdatedAt, DateTimeOffset.UtcNow, CancellationToken.None); + claimed.Should().BeTrue(); + + await dbContext.Entry(delivery).ReloadAsync(); + delivery.Status.Should().Be(WebhookDeliveryStatus.Processing); + + // Simulate worker recovery: return the stuck delivery to Pending. + delivery.ReturnToPending( + DateTimeOffset.UtcNow, + "Recovered stale processing webhook delivery for retry."); + await dbContext.SaveChangesAsync(); + + await dbContext.Entry(delivery).ReloadAsync(); + delivery.Status.Should().Be(WebhookDeliveryStatus.Pending, + "stuck processing delivery should be recoverable to Pending"); + delivery.LastErrorMessage.Should().Contain("Recovered", + "recovery message should explain why the delivery was returned to Pending"); + } +} From 4d54ab75dec4bbf77f15c709712920403a3ade6a Mon Sep 17 00:00:00 2001 From: Chris0Jeky Date: Fri, 10 Apr 2026 01:12:20 +0100 Subject: [PATCH 06/11] Add external service failure tests for auth independence and error codes Tests that local authentication works regardless of external OAuth state, GitHub OAuth endpoints return proper 404 when not configured, and unauthenticated requests return 401 error contracts. 
--- .../Resilience/ExternalServiceFailureTests.cs | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 backend/tests/Taskdeck.Api.Tests/Resilience/ExternalServiceFailureTests.cs diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/ExternalServiceFailureTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/ExternalServiceFailureTests.cs new file mode 100644 index 00000000..92d290b8 --- /dev/null +++ b/backend/tests/Taskdeck.Api.Tests/Resilience/ExternalServiceFailureTests.cs @@ -0,0 +1,111 @@ +using System.Net; +using System.Net.Http.Json; +using System.Text.Json; +using FluentAssertions; +using Taskdeck.Api.Tests.Support; +using Taskdeck.Application.DTOs; +using Xunit; + +namespace Taskdeck.Api.Tests.Resilience; + +/// <summary> +/// Tests that external service failures (GitHub OAuth, etc.) produce appropriate +/// error responses while keeping local functionality working. +/// </summary> +public class ExternalServiceFailureTests : IClassFixture<TestWebApplicationFactory> +{ + private readonly HttpClient _client; + + public ExternalServiceFailureTests(TestWebApplicationFactory factory) + { + _client = factory.CreateClient(); + } + + // ── Local Auth Still Works When External Auth Is Unavailable ─────── + + [Fact] + public async Task LocalRegistration_ShouldWork_RegardlessOfExternalOAuthState() + { + // Local auth (register + login) should not depend on any external service.
+ var response = await _client.PostAsJsonAsync( + "/api/auth/register", + new CreateUserDto("ext-resilience-user", "ext-resilience@example.com", "password123")); + + response.StatusCode.Should().Be(HttpStatusCode.OK, + "local registration should succeed regardless of external service state"); + + var payload = await response.Content.ReadFromJsonAsync(); + payload.Should().NotBeNull(); + payload!.Token.Should().NotBeNullOrWhiteSpace( + "local auth should issue a token without relying on external services"); + } + + [Fact] + public async Task LocalLogin_ShouldWork_RegardlessOfExternalOAuthState() + { + // Register first. + var registerResponse = await _client.PostAsJsonAsync( + "/api/auth/register", + new CreateUserDto("ext-login-resilience", "ext-login-resilience@example.com", "password123")); + registerResponse.StatusCode.Should().Be(HttpStatusCode.OK); + + // Login should work via local path regardless of external service availability. + var loginResponse = await _client.PostAsJsonAsync( + "/api/auth/login", + new LoginDto("ext-login-resilience", "password123")); + + loginResponse.StatusCode.Should().Be(HttpStatusCode.OK, + "local login should succeed regardless of external service state"); + + var loginPayload = await loginResponse.Content.ReadFromJsonAsync(); + loginPayload.Should().NotBeNull(); + loginPayload!.Token.Should().NotBeNullOrWhiteSpace(); + } + + // ── Invalid External Auth Callback → Appropriate Error ──────────── + + [Fact] + public async Task GithubCallback_WhenGithubNotConfigured_ReturnsNotFound() + { + // When GitHub OAuth is not configured, the callback should return + // a clean 404 error rather than crashing. 
+ var response = await _client.GetAsync("/api/auth/github/callback"); + + response.StatusCode.Should().Be(HttpStatusCode.NotFound, + "GitHub callback should return 404 when OAuth is not configured"); + + var body = await response.Content.ReadFromJsonAsync<JsonElement>(); + body.TryGetProperty("errorCode", out var errorCode).Should().BeTrue( + "404 response should follow the error contract"); + errorCode.GetString().Should().Be("NotFound"); + } + + [Fact] + public async Task GithubLogin_WhenGithubNotConfigured_ReturnsNotFound() + { + // The GitHub login initiation endpoint should also return 404 when not configured. + var response = await _client.GetAsync("/api/auth/github/login"); + + response.StatusCode.Should().Be(HttpStatusCode.NotFound, + "GitHub login should return 404 when OAuth is not configured"); + } + + // ── API Endpoints Return Proper Error Codes on Invalid Input ────── + + [Fact] + public async Task ApiEndpoints_ReturnProperErrorCodes_WhenUnauthenticated() + { + // Without auth, protected endpoints should return 401, not 500.
+ var boardsResponse = await _client.GetAsync("/api/boards"); + boardsResponse.StatusCode.Should().Be(HttpStatusCode.Unauthorized, + "unauthenticated request to boards should get 401, not 500"); + + var captureResponse = await _client.GetAsync("/api/capture/items"); + captureResponse.StatusCode.Should().Be(HttpStatusCode.Unauthorized, + "unauthenticated request to capture should get 401, not 500"); + + var chatResponse = await _client.GetAsync("/api/llm/chat/sessions"); + chatResponse.StatusCode.Should().Be(HttpStatusCode.Unauthorized, + "unauthenticated request to chat sessions should get 401, not 500"); + } +} From 33f8b7c76c844621267a3f54c86836ccc7f6e399 Mon Sep 17 00:00:00 2001 From: Chris0Jeky Date: Fri, 10 Apr 2026 01:25:17 +0100 Subject: [PATCH 07/11] Fix test isolation: use unique usernames to prevent collisions ExternalServiceFailureTests used fixed usernames for registration and login which could collide across test runs sharing the same database via IClassFixture. Add GUID suffixes to ensure uniqueness. --- .../Resilience/ExternalServiceFailureTests.cs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/ExternalServiceFailureTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/ExternalServiceFailureTests.cs index 92d290b8..ea04df7c 100644 --- a/backend/tests/Taskdeck.Api.Tests/Resilience/ExternalServiceFailureTests.cs +++ b/backend/tests/Taskdeck.Api.Tests/Resilience/ExternalServiceFailureTests.cs @@ -27,9 +27,10 @@ public ExternalServiceFailureTests(TestWebApplicationFactory factory) public async Task LocalRegistration_ShouldWork_RegardlessOfExternalOAuthState() { // Local auth (register + login) should not depend on any external service. 
+ var suffix = Guid.NewGuid().ToString("N")[..8]; var response = await _client.PostAsJsonAsync( "/api/auth/register", - new CreateUserDto("ext-resilience-user", "ext-resilience@example.com", "password123")); + new CreateUserDto($"ext-resilience-{suffix}", $"ext-resilience-{suffix}@example.com", "password123")); response.StatusCode.Should().Be(HttpStatusCode.OK, "local registration should succeed regardless of external service state"); @@ -44,15 +45,17 @@ public async Task LocalRegistration_ShouldWork_RegardlessOfExternalOAuthState() public async Task LocalLogin_ShouldWork_RegardlessOfExternalOAuthState() { // Register first. + var suffix = Guid.NewGuid().ToString("N")[..8]; + var username = $"ext-login-{suffix}"; var registerResponse = await _client.PostAsJsonAsync( "/api/auth/register", - new CreateUserDto("ext-login-resilience", "ext-login-resilience@example.com", "password123")); + new CreateUserDto(username, $"ext-login-{suffix}@example.com", "password123")); registerResponse.StatusCode.Should().Be(HttpStatusCode.OK); // Login should work via local path regardless of external service availability. var loginResponse = await _client.PostAsJsonAsync( "/api/auth/login", - new LoginDto("ext-login-resilience", "password123")); + new LoginDto(username, "password123")); loginResponse.StatusCode.Should().Be(HttpStatusCode.OK, "local login should succeed regardless of external service state"); From fc36d4d8f6e9d1734eceb48d3807b2755687a1b6 Mon Sep 17 00:00:00 2001 From: Chris0Jeky Date: Sat, 11 Apr 2026 23:48:02 +0100 Subject: [PATCH 08/11] Fix tight-loop CPU spin: use non-zero QueuePollIntervalSeconds in worker tests QueuePollIntervalSeconds = 0 caused Task.Delay(0) in the worker loop, spinning thousands of iterations during the test window. Change to 1 second and extend delay windows to 1500ms to ensure at least one iteration completes. Also reword the callCount assertion as BeGreaterThanOrEqualTo(1) (equivalent to the previous BeGreaterThan(0)) and correct its comment to say "at least one iteration".
--- .../Resilience/WorkerResilienceTests.cs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/WorkerResilienceTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/WorkerResilienceTests.cs index 9ed8d4c1..50404d48 100644 --- a/backend/tests/Taskdeck.Api.Tests/Resilience/WorkerResilienceTests.cs +++ b/backend/tests/Taskdeck.Api.Tests/Resilience/WorkerResilienceTests.cs @@ -36,7 +36,7 @@ public async Task LlmWorker_WhenProcessBatchThrows_LogsErrorAndContinuesToNextPo var logger = new InMemoryLogger(); var settings = new WorkerSettings { - QueuePollIntervalSeconds = 0, + QueuePollIntervalSeconds = 1, EnableAutoQueueProcessing = true, MaxBatchSize = 5, MaxConcurrency = 1, @@ -48,17 +48,17 @@ public async Task LlmWorker_WhenProcessBatchThrows_LogsErrorAndContinuesToNextPo using var cts = new CancellationTokenSource(); - // Act: run the worker for a brief window then cancel. + // Act: run the worker for long enough to complete at least one iteration, then cancel. var runTask = worker.StartAsync(cts.Token); - await Task.Delay(300); + await Task.Delay(1500); cts.Cancel(); try { await runTask; } catch (OperationCanceledException) { } await worker.StopAsync(CancellationToken.None); // Assert: the worker should have logged the error but NOT crashed; - // it should have executed more than one iteration. - callCount.Should().BeGreaterThan(0, + // it should have executed at least one iteration. 
+ callCount.Should().BeGreaterThanOrEqualTo(1, "worker should have attempted at least one batch despite DB throwing"); logger.Entries.Should().Contain(e => @@ -120,7 +120,7 @@ public async Task LlmWorker_WhenCancelled_ExitsWithoutCrashing() var logger = new InMemoryLogger(); var settings = new WorkerSettings { - QueuePollIntervalSeconds = 0, + QueuePollIntervalSeconds = 1, EnableAutoQueueProcessing = true, MaxBatchSize = 5, MaxConcurrency = 1, @@ -134,7 +134,7 @@ public async Task LlmWorker_WhenCancelled_ExitsWithoutCrashing() await worker.StartAsync(cts.Token); // Let it run at least one cycle. - await Task.Delay(150); + await Task.Delay(1500); // StopAsync triggers cancellation and waits for ExecuteAsync to complete. // This should NOT throw -- the BackgroundService infrastructure handles OperationCanceledException. @@ -171,7 +171,7 @@ public async Task LlmWorker_WhenAutoQueueProcessingDisabled_SkipsProcessingButSt var logger = new InMemoryLogger(); var settings = new WorkerSettings { - QueuePollIntervalSeconds = 0, + QueuePollIntervalSeconds = 1, EnableAutoQueueProcessing = false, // Disabled MaxBatchSize = 5, MaxConcurrency = 1, @@ -183,7 +183,7 @@ public async Task LlmWorker_WhenAutoQueueProcessingDisabled_SkipsProcessingButSt using var cts = new CancellationTokenSource(); var runTask = worker.StartAsync(cts.Token); - await Task.Delay(200); + await Task.Delay(1500); cts.Cancel(); try { await runTask; } catch (OperationCanceledException) { } From cb7eba146d18733b3c299bf39539a32ea81564ac Mon Sep 17 00:00:00 2001 From: Chris0Jeky Date: Sat, 11 Apr 2026 23:49:19 +0100 Subject: [PATCH 09/11] Replace timing-based Task.Delay with event-based waiting in SignalR tests Use SignalRTestHelper.WaitForEventsAsync with a presence collector instead of Task.Delay(500) to avoid flaky timing on slow CI. Also remove the unused System.Net using while keeping System.Net.Http.Json which is needed for PostAsJsonAsync. 
--- .../Resilience/SignalRDegradationTests.cs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/SignalRDegradationTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/SignalRDegradationTests.cs index a69da990..ee4eb0db 100644 --- a/backend/tests/Taskdeck.Api.Tests/Resilience/SignalRDegradationTests.cs +++ b/backend/tests/Taskdeck.Api.Tests/Resilience/SignalRDegradationTests.cs @@ -1,4 +1,3 @@ -using System.Net; using System.Net.Http.Json; using FluentAssertions; using Microsoft.AspNetCore.SignalR; @@ -85,9 +84,13 @@ await client1.PostAsJsonAsync( $"/api/boards/{board.Id}/access", new GrantAccessDto(board.Id, user2.UserId, UserRole.Editor)); + var presenceCollector = new EventCollector(); + await using var connection1 = SignalRTestHelper.CreateBoardsHubConnection(_factory, user1.Token); await using var connection2 = SignalRTestHelper.CreateBoardsHubConnection(_factory, user2.Token); + connection1.On("boardPresence", snapshot => presenceCollector.Add(snapshot)); + await connection1.StartAsync(); await connection2.StartAsync(); @@ -95,8 +98,8 @@ await client1.PostAsJsonAsync( await connection1.InvokeAsync("JoinBoard", board.Id); await connection2.InvokeAsync("JoinBoard", board.Id); - // Give a moment for presence events to propagate. - await Task.Delay(500); + // Wait for presence events to confirm both clients joined (event-based, not timing-based). + await SignalRTestHelper.WaitForEventsAsync(presenceCollector, 2, TimeSpan.FromSeconds(3)); // Client 1 causes an error by trying to join a non-existent board. 
var act = () => connection1.InvokeAsync("JoinBoard", Guid.NewGuid()); From f69cf0bc7cf97e7ee8ef1f44e86b54ba4b5507b6 Mon Sep 17 00:00:00 2001 From: Chris0Jeky Date: Sat, 11 Apr 2026 23:49:43 +0100 Subject: [PATCH 10/11] Remove unused Taskdeck.Application.Services using from webhook tests --- .../Resilience/WebhookDeliveryResilienceTests.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/WebhookDeliveryResilienceTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/WebhookDeliveryResilienceTests.cs index 487dfbad..2f9782ed 100644 --- a/backend/tests/Taskdeck.Api.Tests/Resilience/WebhookDeliveryResilienceTests.cs +++ b/backend/tests/Taskdeck.Api.Tests/Resilience/WebhookDeliveryResilienceTests.cs @@ -1,7 +1,6 @@ using FluentAssertions; using Microsoft.Extensions.DependencyInjection; using Taskdeck.Application.Interfaces; -using Taskdeck.Application.Services; using Taskdeck.Domain.Entities; using Taskdeck.Domain.Enums; using Taskdeck.Infrastructure.Persistence; From 0ac892fa2a50b522aec0f3040995e186eddc206d Mon Sep 17 00:00:00 2001 From: Chris0Jeky Date: Sat, 11 Apr 2026 23:50:29 +0100 Subject: [PATCH 11/11] Refactor LLM test stubs: remove pragma warnings and fix 60s StreamAsync hang Replace #pragma warning disable CS0162 with helper methods that throw, making yield break reachable without suppressing warnings. Fix TimeoutProviderStub.StreamAsync to use a short internal cancellation (50ms) instead of blocking for 60 seconds. 
--- .../Resilience/LlmProviderDegradationTests.cs | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/backend/tests/Taskdeck.Api.Tests/Resilience/LlmProviderDegradationTests.cs b/backend/tests/Taskdeck.Api.Tests/Resilience/LlmProviderDegradationTests.cs index 428d290c..445803d3 100644 --- a/backend/tests/Taskdeck.Api.Tests/Resilience/LlmProviderDegradationTests.cs +++ b/backend/tests/Taskdeck.Api.Tests/Resilience/LlmProviderDegradationTests.cs @@ -242,7 +242,11 @@ public async IAsyncEnumerable StreamAsync( ChatCompletionRequest request, [EnumeratorCancellation] CancellationToken ct = default) { - await Task.Delay(TimeSpan.FromSeconds(60), ct); + // Use a short internal timeout to avoid hanging for 60 seconds if a test + // hits the streaming endpoint. Cancels quickly like CompleteAsync does. + using var internalCts = CancellationTokenSource.CreateLinkedTokenSource(ct); + internalCts.CancelAfter(TimeSpan.FromMilliseconds(50)); + await Task.Delay(TimeSpan.FromSeconds(60), internalCts.Token); yield return new LlmTokenEvent("timeout", true); } @@ -266,10 +270,8 @@ public async IAsyncEnumerable StreamAsync( [EnumeratorCancellation] CancellationToken ct = default) { await Task.CompletedTask; - throw new InvalidOperationException("Simulated stream crash"); -#pragma warning disable CS0162 + ThrowStreamCrash(); yield break; -#pragma warning restore CS0162 } public Task GetHealthAsync(CancellationToken ct = default) @@ -277,6 +279,9 @@ public Task GetHealthAsync(CancellationToken ct = default) public Task ProbeAsync(CancellationToken ct = default) => Task.FromResult(new LlmHealthStatus(false, "ThrowingStub", "Provider threw exception", IsProbed: true)); + + private static void ThrowStreamCrash() + => throw new InvalidOperationException("Simulated stream crash"); } /// @@ -292,10 +297,8 @@ public async IAsyncEnumerable StreamAsync( [EnumeratorCancellation] CancellationToken ct = default) { await Task.CompletedTask; - throw new 
InvalidOperationException("All providers are down"); -#pragma warning disable CS0162 + ThrowProvidersDown(); yield break; -#pragma warning restore CS0162 } public Task GetHealthAsync(CancellationToken ct = default) @@ -303,5 +306,8 @@ public Task GetHealthAsync(CancellationToken ct = default) public Task ProbeAsync(CancellationToken ct = default) => Task.FromResult(new LlmHealthStatus(false, "Dead", "All providers are unavailable", IsProbed: true)); + + private static void ThrowProvidersDown() + => throw new InvalidOperationException("All providers are down"); } }