-
Notifications
You must be signed in to change notification settings - Fork 0
Test: LLM provider abstraction and tool-calling edge cases (#709) #747
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
cd6d206
Test: add orchestrator edge case tests for tool-calling boundaries
Chris0Jeky bc7ddfd
Test: add LLM provider abstraction edge case tests
Chris0Jeky 293d209
Test: add intent classifier edge case tests for adversarial inputs
Chris0Jeky 4a67e18
Test: add tool executor registry edge case tests
Chris0Jeky d96f9a0
Fix: address adversarial self-review findings
Chris0Jeky b20d2fe
Fix adversarial review findings: false-positive tests, missing covera…
Chris0Jeky File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
249 changes: 249 additions & 0 deletions
249
backend/tests/Taskdeck.Application.Tests/Services/LlmIntentClassifierEdgeCaseTests.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,249 @@ | ||
| using FluentAssertions; | ||
| using Xunit; | ||
| using Taskdeck.Application.Services; | ||
|
|
||
| namespace Taskdeck.Application.Tests.Services; | ||
|
|
||
| /// <summary> | ||
| /// Edge case tests for LlmIntentClassifier expanding on the existing fuzz tests. | ||
| /// Covers: negation filtering, other-tool questions, ambiguous inputs, | ||
| /// very long inputs, prompt injection patterns, mixed casing, | ||
| /// and multi-intent detection gaps. | ||
| /// </summary> | ||
| public class LlmIntentClassifierEdgeCaseTests | ||
| { | ||
| // ── Negation filtering ─────────────────────────────────────── | ||
|
|
||
| /// <summary> | ||
| /// Verifies that requests phrased as a negation are reported as not actionable. | ||
| /// </summary> | ||
| [Theory] | ||
| [InlineData("Don't add a card")] | ||
| [InlineData("do not create a new task")] | ||
| [InlineData("never move the card to done")] | ||
| [InlineData("stop create new tasks")] | ||
| [InlineData("cancel the delete of card 5")] | ||
| [InlineData("don't remove that task")] | ||
| [InlineData("avoid create a task please")] // negation regex: "avoid" followed by verbs within word distance | ||
| public void Classify_NegatedInput_IsNotActionable(string input) | ||
| { | ||
| // Act: only the actionable flag matters here; the detected intent is discarded. | ||
| var (actionable, _) = LlmIntentClassifier.Classify(input); | ||
|
|
||
| // Assert | ||
| var because = $"negated input '{input}' should not be classified as actionable"; | ||
| actionable.Should().BeFalse(because); | ||
| } | ||
|
|
||
| // ── Other-tool questions ───────────────────────────────────── | ||
|
|
||
| /// <summary> | ||
| /// Verifies that how-to questions naming another product are reported as not actionable. | ||
| /// </summary> | ||
| [Theory] | ||
| [InlineData("How do I add a card in Trello?")] | ||
| [InlineData("How do I create a task in Jira?")] | ||
| [InlineData("Where do I move cards in Asana?")] | ||
| [InlineData("Can I create boards in Notion?")] | ||
| public void Classify_OtherToolQuestion_IsNotActionable(string input) | ||
| { | ||
| // Act: the intent half of the result is irrelevant for this check. | ||
| var (actionable, _) = LlmIntentClassifier.Classify(input); | ||
|
|
||
| // Assert | ||
| var because = $"question about another tool '{input}' should not be actionable"; | ||
| actionable.Should().BeFalse(because); | ||
| } | ||
|
|
||
| // ── Positive detection ─────────────────────────────────────── | ||
|
|
||
| /// <summary> | ||
| /// Verifies that each sample request is flagged as actionable and mapped to the expected intent name. | ||
| /// </summary> | ||
| [Theory] | ||
| [InlineData("create a new card called Test", "card.create")] | ||
| [InlineData("add a task for the meeting", "card.create")] | ||
| [InlineData("make a new task for sprint review", "card.create")] | ||
| [InlineData("move card to done column", "card.move")] | ||
| [InlineData("archive the old task", "card.archive")] | ||
| [InlineData("delete card number 5", "card.archive")] | ||
| [InlineData("remove the finished task", "card.archive")] | ||
| [InlineData("update card title to new name", "card.update")] | ||
| [InlineData("rename task to better name", "card.update")] | ||
| [InlineData("edit card description", "card.update")] | ||
| [InlineData("create a new board for the project", "board.create")] | ||
| [InlineData("rename board to Sprint 42", "board.update")] | ||
| [InlineData("reorder columns on the board", "column.reorder")] | ||
| public void Classify_ActionableInput_DetectsCorrectIntent(string input, string expectedIntent) | ||
| { | ||
| // Act | ||
| var (actionable, detectedIntent) = LlmIntentClassifier.Classify(input); | ||
|
|
||
| // Assert: the flag is checked first, then the intent name. | ||
| var because = $"'{input}' should be detected as actionable"; | ||
| actionable.Should().BeTrue(because); | ||
| detectedIntent.Should().Be(expectedIntent); | ||
| } | ||
|
|
||
| // ── Non-actionable inputs ──────────────────────────────────── | ||
|
|
||
| /// <summary> | ||
| /// Verifies that greetings and status questions yield a false flag and a null intent. | ||
| /// </summary> | ||
| [Theory] | ||
| [InlineData("hello")] | ||
| [InlineData("what is the weather?")] | ||
| [InlineData("tell me about the project")] | ||
| [InlineData("how are my tasks doing?")] | ||
| [InlineData("show me a summary")] | ||
| [InlineData("what's the status?")] | ||
| public void Classify_NonActionableInput_ReturnsFalse(string input) | ||
| { | ||
| // Act | ||
| var (actionable, detectedIntent) = LlmIntentClassifier.Classify(input); | ||
|
|
||
| // Assert: no action is expected, so no intent should be reported either. | ||
| var because = $"non-actionable input '{input}' should not be classified as actionable"; | ||
| actionable.Should().BeFalse(because); | ||
| detectedIntent.Should().BeNull(); | ||
| } | ||
|
|
||
| // ── Edge cases ─────────────────────────────────────────────── | ||
|
|
||
| /// <summary> | ||
| /// Verifies that a null message is reported as not actionable. | ||
| /// </summary> | ||
| [Fact] | ||
| public void Classify_NullInput_ReturnsFalse() | ||
| { | ||
| // The null-forgiving operator lets the test pass null on purpose. | ||
| var (actionable, _) = LlmIntentClassifier.Classify(null!); | ||
|
|
||
| actionable.Should().BeFalse(); | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Verifies that an empty message is reported as not actionable. | ||
| /// </summary> | ||
| [Fact] | ||
| public void Classify_EmptyString_ReturnsFalse() | ||
| { | ||
| // Act | ||
| var (actionable, _) = LlmIntentClassifier.Classify(string.Empty); | ||
|
|
||
| // Assert | ||
| actionable.Should().BeFalse(); | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Verifies that a message made only of spaces, a tab and a newline is reported as not actionable. | ||
| /// </summary> | ||
| [Fact] | ||
| public void Classify_WhitespaceOnly_ReturnsFalse() | ||
| { | ||
| // Arrange | ||
| const string whitespaceOnly = " \t\n "; | ||
|
|
||
| // Act | ||
| var (actionable, _) = LlmIntentClassifier.Classify(whitespaceOnly); | ||
|
|
||
| // Assert | ||
| actionable.Should().BeFalse(); | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Verifies that classifying a message of exactly 10,000 characters does not throw. | ||
| /// </summary> | ||
| [Fact] | ||
| public void Classify_VeryLongInput_DoesNotThrow() | ||
| { | ||
| // Derive the filler length from the target so the whole message, including | ||
| // the trailing command, is exactly 10,000 characters (a fixed 9,990-character | ||
| // filler plus the 14-character command would produce 10,004). | ||
| const int totalLength = 10_000; | ||
| const string command = " create a card"; | ||
| var longInput = new string('a', totalLength - command.Length) + command; | ||
|
|
||
| var act = () => LlmIntentClassifier.Classify(longInput); | ||
|
|
||
| act.Should().NotThrow(); | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Verifies that a request at the start of a long message is still classified as card.create. | ||
| /// </summary> | ||
| [Fact] | ||
| public void Classify_VeryLongInput_WithActionableContent_StillDetects() | ||
| { | ||
| // Arrange: the request comes first, followed by 5,000 filler characters. | ||
| const string request = "create a new task called test "; | ||
| var filler = new string('x', 5000); | ||
| var longInput = request + filler; | ||
|
|
||
| // Act | ||
| var (actionable, detectedIntent) = LlmIntentClassifier.Classify(longInput); | ||
|
|
||
| // Assert | ||
| actionable.Should().BeTrue(); | ||
| detectedIntent.Should().Be("card.create"); | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Verifies that the same request is flagged as actionable regardless of letter casing. | ||
| /// </summary> | ||
| [Theory] | ||
| [InlineData("CREATE A NEW CARD")] | ||
| [InlineData("Create A New Card")] | ||
| [InlineData("cReAtE a NeW cArD")] | ||
| public void Classify_MixedCase_StillDetects(string input) | ||
| { | ||
| // Act: only the actionable flag is inspected; the intent is discarded. | ||
| var (actionable, _) = LlmIntentClassifier.Classify(input); | ||
|
|
||
| // Assert | ||
| var because = $"mixed case input '{input}' should still be detected"; | ||
| actionable.Should().BeTrue(because); | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Verifies that messages containing line breaks can be classified without an exception. | ||
| /// </summary> | ||
| [Theory] | ||
| [InlineData("create a card\nand some other text")] | ||
| [InlineData("create\na\ncard")] | ||
| public void Classify_NewlinesInInput_DoesNotThrow(string input) | ||
| { | ||
| // Only crash-safety is asserted here. Whether an intent is detected when | ||
| // the phrase is split across lines is deliberately left unchecked. | ||
| var classify = () => LlmIntentClassifier.Classify(input); | ||
|
|
||
| classify.Should().NotThrow("newlines in input must not cause exceptions"); | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Verifies that a request kept on the first line is classified as card.create | ||
| /// even when a newline and further text follow it. | ||
| /// </summary> | ||
| [Fact] | ||
| public void Classify_NewlinesSeparatingActionablePhrase_DetectsWhenOnOneLine() | ||
| { | ||
| // Arrange: "create a card" fills the first line; unrelated text sits on the second. | ||
| const string multiLineInput = "create a card\nsome other text after"; | ||
|
|
||
| // Act | ||
| var (actionable, detectedIntent) = LlmIntentClassifier.Classify(multiLineInput); | ||
|
|
||
| // Assert | ||
| actionable.Should().BeTrue("actionable phrase on first line should be detected"); | ||
| detectedIntent.Should().Be("card.create"); | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Verifies that hostile-looking payloads containing "create a card" neither throw | ||
| /// nor stop the message from being classified as card.create. | ||
| /// </summary> | ||
| [Fact] | ||
| public void Classify_PromptInjection_DoesNotCrashAndStillClassifies() | ||
| { | ||
| // Each payload starts with the phrase "create a card" and then appends | ||
| // SQL, markup, a null character, or a literal backslash-n sequence. | ||
| // Sanitizing that text is not what this test checks; it expects no | ||
| // exception and an unchanged classification. | ||
| string[] payloads = | ||
| { | ||
| "create a card'; DROP TABLE cards;--", | ||
| "create a card with <script>alert('xss')</script>", | ||
| "create a card\0with null bytes", | ||
| "create a card\\nwith escaped newlines" | ||
| }; | ||
|
|
||
| foreach (var payload in payloads) | ||
| { | ||
| // First pass: the call itself must not throw. | ||
| var classify = () => LlmIntentClassifier.Classify(payload); | ||
| classify.Should().NotThrow($"input '{payload}' should not cause an exception"); | ||
|
|
||
| // Second pass: inspect the classification result. | ||
| var (actionable, detectedIntent) = LlmIntentClassifier.Classify(payload); | ||
| var because = $"injection input '{payload}' still contains 'create a card' and should be actionable"; | ||
| actionable.Should().BeTrue(because); | ||
| detectedIntent.Should().Be("card.create"); | ||
| } | ||
| } | ||
|
|
||
| // ── Archive vs Move disambiguation ─────────────────────────── | ||
|
|
||
| /// <summary> | ||
| /// Verifies that a "remove" request is classified as card.archive. | ||
| /// </summary> | ||
| [Fact] | ||
| public void Classify_RemoveCard_ClassifiesAsArchive_NotMove() | ||
| { | ||
| // Arrange: the word "remove" embeds "move", so the expected outcome is | ||
| // the archive intent rather than the move intent. | ||
| const string request = "remove the task from backlog"; | ||
|
|
||
| // Act | ||
| var (actionable, detectedIntent) = LlmIntentClassifier.Classify(request); | ||
|
|
||
| // Assert | ||
| actionable.Should().BeTrue(); | ||
| detectedIntent.Should().Be("card.archive"); | ||
| } | ||
|
|
||
| // ── Stemming/plural variations ─────────────────────────────── | ||
|
|
||
| /// <summary> | ||
| /// Verifies that requests using plural nouns are flagged as actionable with the expected intent. | ||
| /// </summary> | ||
| [Theory] | ||
| [InlineData("create new cards", "card.create")] | ||
| [InlineData("add tasks for the team", "card.create")] | ||
| [InlineData("move tasks to done", "card.move")] | ||
| [InlineData("archive cards", "card.archive")] | ||
| [InlineData("update tasks", "card.update")] | ||
| public void Classify_PluralNouns_StillDetects(string input, string expectedIntent) | ||
| { | ||
| // Act | ||
| var (actionable, detectedIntent) = LlmIntentClassifier.Classify(input); | ||
|
|
||
| // Assert | ||
| var because = $"plural input '{input}' should be detected"; | ||
| actionable.Should().BeTrue(because); | ||
| detectedIntent.Should().Be(expectedIntent); | ||
| } | ||
|
|
||
| // ── Verb coverage ──────────────────────────────────────────── | ||
|
|
||
| /// <summary> | ||
| /// Verifies that less common verbs are flagged as actionable and mapped to the expected intent. | ||
| /// </summary> | ||
| [Theory] | ||
| [InlineData("generate a card for testing", "card.create")] | ||
| [InlineData("build a task list", "card.create")] | ||
| [InlineData("prepare a new task", "card.create")] | ||
| [InlineData("set up a new board", "board.create")] | ||
| [InlineData("modify the card title", "card.update")] | ||
| [InlineData("change task priority", "card.update")] | ||
| [InlineData("sort the columns", "column.reorder")] | ||
| [InlineData("rearrange the columns", "column.reorder")] | ||
| [InlineData("reorganize the board columns", "column.reorder")] | ||
| public void Classify_AlternateVerbs_DetectedCorrectly(string input, string expectedIntent) | ||
| { | ||
| // Act | ||
| var (actionable, detectedIntent) = LlmIntentClassifier.Classify(input); | ||
|
|
||
| // Assert | ||
| var because = $"verb in '{input}' should be recognized"; | ||
| actionable.Should().BeTrue(because); | ||
| detectedIntent.Should().Be(expectedIntent); | ||
| } | ||
| } | ||
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This test currently has no explicit assertion (it passes as long as `Classify` does not throw).
To make the intent clear and avoid accidental no-op coverage, add an explicit assertion such as `act.Should().NotThrow()` and/or assert the expected classification for these newline cases.