diff --git a/skills/pm/SKILL.md b/skills/pm/SKILL.md index 8a039ae4..b1de284e 100644 --- a/skills/pm/SKILL.md +++ b/skills/pm/SKILL.md @@ -66,41 +66,53 @@ Then check: does `meta.ask_user_question` exist? ``` Do NOT modify it. Do NOT add options. Do NOT rephrase the question. -- **NO** → This is an interview question. Use `AskUserQuestion` with `meta.question` and generate 2-3 suggested answers. +- **NO** → This is an interview question. Use `AskUserQuestion` with `meta.question`. + - If `meta.skip_eligible == true`: add a skip option based on `meta.classification`: + - `classification == "decide_later"` → add option `{"label": "Decide later", "description": "Skip — will be recorded as an open item in the PRD"}` + - `classification == "deferred"` → add option `{"label": "Defer to dev", "description": "Skip — this technical decision will be deferred to the development phase"}` + - Generate 2-3 suggested answers as the other options. **C. Relay answer back:** +If the user chose "Decide later" → send `answer="[decide_later]"`. +If the user chose "Defer to dev" → send `answer="[deferred]"`. +Otherwise → send the user's answer normally. + ``` Tool: ouroboros_pm_interview Arguments: session_id: - : + : ``` **D. Check completion:** -If `meta.is_complete == true` → go to Step 4. -Otherwise → repeat Step 3. +Completion is determined ONLY by `meta.is_complete` — NEVER by the response text. +The MCP response text may sound like the interview is wrapping up, but ignore it. -### Step 4: Generate +If `meta.is_complete == true`: +- If `meta.generation_failed == true` → retry generation: + ``` + Tool: ouroboros_pm_interview + Arguments: + session_id: + action: "generate" + cwd: + ``` +- Otherwise → go to Step 4. The MCP auto-generated the PM document. + `meta.pm_path` and `meta.seed_path` contain the file paths. -``` -Tool: ouroboros_pm_interview -Arguments: - session_id: - action: "generate" - cwd: -``` +Otherwise → repeat Step 3, regardless of what the response text says. 
-### Step 5: Copy to Clipboard +### Step 4: Copy to Clipboard -After generation, read the pm.md file from `meta.pm_path` and copy its contents to the clipboard: +Read the pm.md file from `meta.pm_path` and copy its contents to the clipboard: ```bash cat | pbcopy ``` -### Step 6: Show Result & Next Step +### Step 5: Show Result & Next Step Show the following to the user: diff --git a/src/ouroboros/bigbang/pm_interview.py b/src/ouroboros/bigbang/pm_interview.py index ab8e9810..470fa91a 100644 --- a/src/ouroboros/bigbang/pm_interview.py +++ b/src/ouroboros/bigbang/pm_interview.py @@ -55,18 +55,7 @@ _PM_SYSTEM_PROMPT_PREFIX = """\ You are a Product Requirements interviewer helping a PM define their product. -Focus on PRODUCT-LEVEL questions: -- What problem does this solve and for whom? -- What are the business goals and success metrics? -- What are the user stories and workflows? -- What constraints exist (timeline, budget, compliance)? -- What is in scope vs out of scope? -- What are the acceptance criteria? - -Do NOT ask about: -- Implementation details (databases, frameworks, APIs) -- Architecture decisions (microservices, deployment) -- Code-level patterns or testing strategies +Focus on: goal, user stories, constraints, success criteria, assumptions. """ @@ -158,7 +147,8 @@ class PMInterviewEngine: """Original question text for questions classified as DECIDE_LATER. These are questions that are premature or unknowable at the PM stage. - They are auto-answered with a placeholder and stored here so the PMSeed + The main session presents the question to the user with a "decide later" + option; when chosen, the caller records the item here so the PMSeed and PM document can surface them as explicit "decide later" decisions. 
""" classifications: list[ClassificationResult] = field(default_factory=list) @@ -484,47 +474,39 @@ async def ask_next_question( output_type = classification.output_type if output_type == ClassifierOutputType.DEFERRED: - # Track as deferred item and generate a new question - self.deferred_items.append(classification.original_question) + # Return the question to the caller (main session) so the user + # can choose to defer it themselves. The main session detects + # classification == "deferred" via response_meta and offers + # a "skip / defer to dev" option. If the user picks it, the + # caller calls skip_as_deferred() which records the deferral + # and appends to deferred_items. + # + # Previously this branch auto-answered and recursed, which could + # trigger MCP 120s timeouts on consecutive DEFERRED runs. log.info( - "pm.question_deferred", + "pm.question_deferred_candidate", question=classification.original_question[:100], reasoning=classification.reasoning, output_type=output_type, ) - # Feed an automatic response back to the inner InterviewEngine - # so the round is properly recorded and the engine advances. - # This prevents the inner engine from re-generating similar - # technical questions it doesn't know were already handled. - await self.record_response( - state, - user_response="[Deferred to development phase] " - "This technical decision will be addressed during the " - "development interview.", - question=classification.original_question, - ) - # Recursively ask for the next real question - return await self.ask_next_question(state) + return Result.ok(classification.original_question) if output_type == ClassifierOutputType.DECIDE_LATER: - # Auto-answer with placeholder — no PM interaction needed - placeholder = classification.placeholder_response - self.decide_later_items.append(classification.original_question) + # Return the question to the caller (main session) so the user + # can choose "decide later" themselves. 
The main session detects + # classification == "decide_later" via response_meta and offers + # the option. If the user picks it, the caller calls + # skip_as_decide_later() which records the placeholder and + # appends to decide_later_items. + # + # Previously this branch auto-answered and recursed, which could + # trigger MCP 120s timeouts on consecutive DECIDE_LATER runs. log.info( "pm.question_decide_later", question=classification.original_question[:100], - placeholder=placeholder[:100], reasoning=classification.reasoning, ) - # Record the placeholder as the response so the interview - # engine advances its round count - await self.record_response( - state, - user_response=f"[Decide later] {placeholder}", - question=classification.original_question, - ) - # Recursively ask for the next real question - return await self.ask_next_question(state) + return Result.ok(classification.original_question) if output_type == ClassifierOutputType.REFRAMED: # Use the reframed version and track the mapping @@ -595,6 +577,76 @@ async def record_response( return await self.inner.record_response(state, user_response, question) + async def skip_as_decide_later( + self, + state: InterviewState, + question: str, + ) -> Result[InterviewState, ValidationError]: + """Skip a question as "decide later" at the user's explicit request. + + Records the question in ``decide_later_items`` and feeds a placeholder + response to the inner InterviewEngine so the round is properly recorded + and the engine advances. + + This is called when the main session detects that the user chose the + "decide later" option for a DECIDE_LATER-classified question, instead + of the old auto-skip behavior inside ``ask_next_question``. + + Args: + state: Current interview state. + question: The question the user chose to decide later. + + Returns: + Result containing updated state or ValidationError. 
+ """ + if question not in self.decide_later_items: + self.decide_later_items.append(question) + + log.info( + "pm.question_decide_later_by_user", + question=question[:100], + ) + + return await self.record_response( + state, + user_response="[Decide later] To be determined — user chose to decide later.", + question=question, + ) + + async def skip_as_deferred( + self, + state: InterviewState, + question: str, + ) -> Result[InterviewState, ValidationError]: + """Skip a question as "deferred to dev" at the user's explicit request. + + Records the question in ``deferred_items`` and feeds a deferral + response to the inner InterviewEngine so the round is properly recorded + and the engine advances. + + Args: + state: Current interview state. + question: The question the user chose to defer. + + Returns: + Result containing updated state or ValidationError. + """ + if question not in self.deferred_items: + self.deferred_items.append(question) + + log.info( + "pm.question_deferred_by_user", + question=question[:100], + ) + + return await self.record_response( + state, + user_response="[Deferred to development phase] " + "This technical decision will be addressed during the " + "development interview.", + question=question, + ) + async def complete_interview( self, state: InterviewState, @@ -1129,13 +1181,14 @@ def _parse_pm_seed( for s in data.get("user_stories", []) ) - # Merge deferred items from classifier with extraction + # Merge LLM-extracted items with engine-tracked items, deduplicating. + # The extraction prompt includes raw items as context so the LLM may + # already emit them, but engine-tracked items are authoritative and + # must survive even if the extractor omits them. 
all_deferred = list(data.get("deferred_items", [])) for item in self.deferred_items: if item not in all_deferred: all_deferred.append(item) - - # Merge decide-later items — stored as original question text all_decide_later = list(data.get("decide_later_items", [])) for item in self.decide_later_items: if item not in all_decide_later: diff --git a/src/ouroboros/bigbang/question_classifier.py b/src/ouroboros/bigbang/question_classifier.py index 202ba7bf..1e43c428 100644 --- a/src/ouroboros/bigbang/question_classifier.py +++ b/src/ouroboros/bigbang/question_classifier.py @@ -3,9 +3,10 @@ Classifies generated questions as PM-answerable (planning), DEV-only (development/technical), or decide-later (premature/unknowable). -DEV-only questions are reframed into PM-friendly language or deferred to -the development interview phase. Decide-later questions get an automatic -placeholder response without PM interaction. +DEV-only questions are reframed into PM-friendly language or returned to +the user with the option to defer to the development interview phase. +Decide-later questions are returned to the user with the option to defer +rather than being automatically skipped. Uses a Sonnet-grade model for accurate judgment and reframing. """ @@ -51,8 +52,8 @@ class ClassifierOutputType(StrEnum): PASSTHROUGH: Planning question forwarded unchanged to the PM. REFRAMED: Development question rewritten in PM-friendly language. - DEFERRED: Deeply technical question deferred to the dev interview phase. - DECIDE_LATER: Premature question auto-answered with a placeholder. + DEFERRED: Deeply technical question returned to user with defer option. + DECIDE_LATER: Premature question returned to user with decide-later option. """ PASSTHROUGH = "passthrough" @@ -73,8 +74,9 @@ class ClassificationResult: reasoning: Why the classifier chose this category. defer_to_dev: Whether this question should be deferred entirely. 
decide_later: Whether this question is premature and should be - auto-answered with a placeholder. - placeholder_response: Automatic response for decide-later questions. + returned to the user with the option to defer. + placeholder_response: Placeholder response for decide-later questions + (used if the user chooses to defer). """ original_question: str @@ -90,8 +92,10 @@ def output_type(self) -> ClassifierOutputType: """Determine the classifier output type for this result. Returns: - DECIDE_LATER if premature/unknowable question (auto-answered), - DEFERRED if deeply technical (skipped entirely), + DECIDE_LATER if premature/unknowable question (returned to user + with option to defer), + DEFERRED if deeply technical (returned to user with option to + defer to dev phase), PASSTHROUGH if planning question (forwarded unchanged), REFRAMED if development question rewritten for PM. """ @@ -109,13 +113,20 @@ def question_for_pm(self) -> str: For PASSTHROUGH: returns original_question unchanged. For REFRAMED: returns the reframed_question. - For DEFERRED: returns empty string (should not be shown). - For DECIDE_LATER: returns empty string (auto-answered). + For DEFERRED: returns original_question (shown to user + so they can choose to answer or defer to dev). + For DECIDE_LATER: returns original_question (shown to user + so they can choose to answer or defer). """ if self.output_type == ClassifierOutputType.PASSTHROUGH: return self.original_question if self.output_type == ClassifierOutputType.REFRAMED: return self.reframed_question + if self.output_type in ( + ClassifierOutputType.DECIDE_LATER, + ClassifierOutputType.DEFERRED, + ): + return self.original_question return "" @@ -181,7 +192,8 @@ class QuestionClassifier: Uses a Sonnet-grade LLM to judge whether questions are appropriate for a PM audience, reframes technical questions when possible, and - identifies premature questions that should be auto-answered. 
+ identifies premature questions that should be returned to the user + with the option to defer. Attributes: llm_adapter: LLM adapter for classification calls. diff --git a/src/ouroboros/cli/commands/pm.py b/src/ouroboros/cli/commands/pm.py index 4d2f8ec2..2d6565b7 100644 --- a/src/ouroboros/cli/commands/pm.py +++ b/src/ouroboros/cli/commands/pm.py @@ -410,6 +410,20 @@ async def _run_pm_interview( console.print(f"\n[bold yellow]?[/] {question}\n") + # Check if this question was classified as skippable — + # if so, show a hint that the user can defer it. + classification = engine.get_last_classification() + if classification == "decide_later": + console.print( + "[dim] 💡 This question can be deferred. " + 'Type "decide later" or "skip" to defer it.[/]\n' + ) + elif classification == "deferred": + console.print( + "[dim] 💡 This is a technical question that can be deferred to the dev phase. " + 'Type "defer" or "skip" to defer it.[/]\n' + ) + # Persist state + meta AFTER displaying the question but BEFORE # waiting for input so that an interruption preserves the pending # question and --resume shows the same question. 
@@ -442,6 +456,39 @@ async def _run_pm_interview( if state.rounds and state.rounds[-1].user_response is None: state.rounds.pop() + # Handle user-initiated skip (decide later / defer to dev) + _lower = user_response.strip().lower() + if classification == "decide_later" and _lower in ( + "decide later", + "skip", + "[decide_later]", + ): + record_result = await engine.skip_as_decide_later(state, question) + if isinstance(record_result, Result) and record_result.is_err: + print_error(f"Failed to skip question: {record_result.error}") + break + save_result = await engine.save_state(state) + if isinstance(save_result, Result) and save_result.is_err: + print_error(f"Failed to save state: {save_result.error}") + break + _save_cli_pm_meta(state.interview_id, engine) + continue + if classification == "deferred" and _lower in ( + "defer", + "skip", + "[deferred]", + ): + record_result = await engine.skip_as_deferred(state, question) + if isinstance(record_result, Result) and record_result.is_err: + print_error(f"Failed to defer question: {record_result.error}") + break + save_result = await engine.save_state(state) + if isinstance(save_result, Result) and save_result.is_err: + print_error(f"Failed to save state: {save_result.error}") + break + _save_cli_pm_meta(state.interview_id, engine) + continue + record_result = await engine.record_response(state, user_response, question) if isinstance(record_result, Result) and record_result.is_err: print_error(f"Failed to record response: {record_result.error}") diff --git a/src/ouroboros/mcp/tools/pm_handler.py b/src/ouroboros/mcp/tools/pm_handler.py index 9910fd60..3d2683af 100644 --- a/src/ouroboros/mcp/tools/pm_handler.py +++ b/src/ouroboros/mcp/tools/pm_handler.py @@ -338,12 +338,22 @@ async def handle( initial_context = arguments.get("initial_context") session_id = arguments.get("session_id") answer = arguments.get("answer") - cwd = arguments.get("cwd") or os.getcwd() + cwd_arg = arguments.get("cwd") selected_repos: list[str] | 
None = arguments.get("selected_repos") # Auto-detect action from parameter presence (AC 13) action = _detect_action(arguments) + # For resume/generate, prefer persisted session cwd over os.getcwd() + # so artifacts land in the workspace where the interview started. + if cwd_arg: + cwd = cwd_arg + elif session_id and action in ("resume", "generate"): + meta = _load_pm_meta(session_id, self.data_dir) + cwd = (meta.get("cwd") if meta else None) or os.getcwd() + else: + cwd = os.getcwd() + engine = self._get_engine() try: @@ -503,6 +513,12 @@ async def _handle_start( # Include pending_reframe in response meta if a reframe occurred pending_reframe = engine.get_pending_reframe() + # Check classification to signal skip eligibility + classification = _last_classification(engine) + is_decide_later = classification == "decide_later" + is_deferred = classification == "deferred" + skip_eligible = is_decide_later or is_deferred + meta = { "session_id": state.interview_id, "status": "interview_started", @@ -510,6 +526,8 @@ async def _handle_start( "response_param": "answer", "question": question, "is_brownfield": state.is_brownfield, + "classification": classification, + "skip_eligible": skip_eligible, "pending_reframe": pending_reframe, **diff, } @@ -518,18 +536,35 @@ async def _handle_start( "pm_handler.started", session_id=state.interview_id, is_brownfield=state.is_brownfield, + classification=classification, + skip_eligible=skip_eligible, has_pending_reframe=pending_reframe is not None, **diff, ) + # Build response text — include skip hint when applicable + start_text = f"PM interview started. Session ID: {state.interview_id}\n\n{question}" + if is_decide_later: + start_text += ( + "\n\n💡 This question can be deferred. " + 'The user may answer now, or choose "decide later" to skip it. ' + "If they choose to decide later, pass " + f'answer="[decide_later]" with session_id="{state.interview_id}".' 
+ ) + elif is_deferred: + start_text += ( + "\n\n💡 This is a technical question that can be deferred to the dev phase. " + "The user may answer now, or choose to defer it. " + "If they choose to defer, pass " + f'answer="[deferred]" with session_id="{state.interview_id}".' + ) + return Result.ok( MCPToolResult( content=( MCPContentItem( type=ContentType.TEXT, - text=( - f"PM interview started. Session ID: {state.interview_id}\n\n{question}" - ), + text=start_text, ), ), is_error=False, @@ -725,6 +760,12 @@ async def _idempotent_select_repos( else (state.rounds[-1].question if state.rounds else "No question available.") ) + engine.restore_meta(meta) + classification = _last_classification(engine) + is_decide_later = classification == "decide_later" + is_deferred = classification == "deferred" + skip_eligible = is_decide_later or is_deferred + return Result.ok( MCPToolResult( content=( @@ -742,6 +783,8 @@ async def _idempotent_select_repos( "question": first_question, "is_brownfield": state.is_brownfield, "idempotent": True, + "classification": classification, + "skip_eligible": skip_eligible, }, ) ) @@ -780,15 +823,35 @@ async def _handle_answer( if not answer and state.rounds and state.rounds[-1].user_response is None: pending_question = state.rounds[-1].question classification = _last_classification(engine) + is_decide_later = classification == "decide_later" + is_deferred = classification == "deferred" + skip_eligible = is_decide_later or is_deferred pending_reframe = engine.get_pending_reframe() + # Include skip hint in re-displayed question + pending_text = f"Session {session_id}\n\n{pending_question}" + if is_decide_later: + pending_text += ( + "\n\n💡 This question can be deferred. " + 'The user may answer now, or choose "decide later" to skip it. ' + "If they choose to decide later, pass " + f'answer="[decide_later]" with session_id="{session_id}".' 
+ ) + elif is_deferred: + pending_text += ( + "\n\n💡 This is a technical question that can be deferred to the dev phase. " + "The user may answer now, or choose to defer it. " + "If they choose to defer, pass " + f'answer="[deferred]" with session_id="{session_id}".' + ) + return Result.ok( MCPToolResult( content=( MCPContentItem( type=ContentType.TEXT, - text=f"Session {session_id}\n\n{pending_question}", + text=pending_text, ), ), is_error=False, @@ -799,6 +862,7 @@ async def _handle_answer( "question": pending_question, "is_complete": False, "classification": classification, + "skip_eligible": skip_eligible, "deferred_this_round": [], "decide_later_this_round": [], "interview_complete": False, @@ -811,6 +875,13 @@ async def _handle_answer( ) ) + # ── Per-round diff snapshot — must be BEFORE any skip/record call ── + # Snapshot list lengths here so that items appended inside + # skip_as_decide_later() / skip_as_deferred() are captured in the + # per-round diff returned at the end of this call. + deferred_before = len(engine.deferred_items) + decide_later_before = len(engine.decide_later_items) + # Record answer if provided if answer and not state.rounds: return Result.err( @@ -824,20 +895,55 @@ async def _handle_answer( if state.rounds[-1].user_response is None: state.rounds.pop() - record_result = await engine.record_response(state, answer, last_question) - if record_result.is_err: - return Result.err( - MCPToolError( - str(record_result.error), - tool_name="ouroboros_pm_interview", + # ── User chose to skip (decide later / defer to dev) ─── + # The main session detects classification via response_meta + # and offers skip options. The user's choice arrives as: + # answer="[decide_later]" → skip_as_decide_later() + # answer="[deferred]" → skip_as_deferred() + # Guard: only honour the sentinel when the last question was + # actually classified as that type. 
If a client sends + # "[decide_later]" for a passthrough/reframed question, treat + # it as a normal answer so no data is silently discarded. + stripped = answer.strip() + last_classification = _last_classification(engine) + if stripped == "[decide_later]" and last_classification == "decide_later": + skip_result = await engine.skip_as_decide_later(state, last_question) + if skip_result.is_err: + return Result.err( + MCPToolError( + str(skip_result.error), + tool_name="ouroboros_pm_interview", + ) ) - ) - state = record_result.value - state.clear_stored_ambiguity() + state = skip_result.value + state.clear_stored_ambiguity() + elif stripped == "[deferred]" and last_classification == "deferred": + skip_result = await engine.skip_as_deferred(state, last_question) + if skip_result.is_err: + return Result.err( + MCPToolError( + str(skip_result.error), + tool_name="ouroboros_pm_interview", + ) + ) + state = skip_result.value + state.clear_stored_ambiguity() + else: + record_result = await engine.record_response(state, answer, last_question) + if record_result.is_err: + return Result.err( + MCPToolError( + str(record_result.error), + tool_name="ouroboros_pm_interview", + ) + ) + state = record_result.value + state.clear_stored_ambiguity() # ── Completion check (AC 12) ───────────────────────────── # Completion is determined by engine ambiguity scoring. - # User controls when to stop. + # When complete, auto-generate the PM document immediately + # (no separate "generate" call needed from the skill). 
completion = await _check_completion(state, engine) if completion is not None: # Mark interview as complete @@ -852,21 +958,76 @@ async def _handle_answer( ) _save_pm_meta(session_id, engine, cwd=cwd, data_dir=self.data_dir) + log.info( + "pm_handler.interview_complete", + session_id=session_id, + **completion, + ) + + # Auto-generate PM document on completion + seed_result = await engine.generate_pm_seed(state) + if seed_result.is_err: + # Generation failed — still report completion but without document + summary_text = ( + f"Interview complete but PM generation failed: {seed_result.error}\n" + f"Session ID: {session_id}\n" + f'Retry with: action="generate", session_id="{session_id}"' + ) + return Result.ok( + MCPToolResult( + content=(MCPContentItem(type=ContentType.TEXT, text=summary_text),), + is_error=False, + meta={ + "session_id": session_id, + "is_complete": True, + "generation_failed": True, + **completion, + }, + ) + ) + + seed = seed_result.value + try: + seed_path = engine.save_pm_seed(seed) + pm_output_dir = Path(cwd) / ".ouroboros" + pm_path = save_pm_document(seed, output_dir=pm_output_dir) + except Exception as e: + log.error("pm_handler.save_failed", error=str(e), session_id=session_id) + summary_text = ( + f"Interview complete but saving PM artifacts failed: {e}\n" + f"Session ID: {session_id}\n" + f'Retry with: action="generate", session_id="{session_id}"' + ) + return Result.ok( + MCPToolResult( + content=(MCPContentItem(type=ContentType.TEXT, text=summary_text),), + is_error=False, + meta={ + "session_id": session_id, + "is_complete": True, + "generation_failed": True, + **completion, + }, + ) + ) + decide_later_summary = engine.format_decide_later_summary() summary_text = ( - f"Interview complete. Session ID: {session_id}\n" + f"Interview complete. 
PM document generated.\n\n" + f"Session ID: {session_id}\n" f"Rounds completed: {completion['rounds_completed']}\n" f"Completion reason: {completion['completion_reason']}\n" ) if completion.get("ambiguity_score") is not None: summary_text += f"Ambiguity score: {completion['ambiguity_score']:.2f}\n" summary_text += ( - f"\nDeferred items: {len(engine.deferred_items)}\n" - f"Decide-later items: {len(engine.decide_later_items)}\n" + f"\nPM document: {pm_path}\n" + f"Seed: {seed_path}\n" + f"\nDeferred items: {len(seed.deferred_items)}\n" + f"Decide-later items: {len(seed.decide_later_items)}\n" ) if decide_later_summary: summary_text += f"\n{decide_later_summary}\n" - summary_text += f'\nGenerate PM with: action="generate", session_id="{session_id}"' response_meta = { "session_id": session_id, @@ -878,14 +1039,10 @@ async def _handle_answer( **completion, "deferred_count": len(engine.deferred_items), "decide_later_count": len(engine.decide_later_items), + "seed_path": str(seed_path), + "pm_path": str(pm_path), } - log.info( - "pm_handler.interview_complete", - session_id=session_id, - **completion, - ) - return Result.ok( MCPToolResult( content=( @@ -899,11 +1056,6 @@ async def _handle_answer( ) ) - # ── Core diff computation (AC 8) ────────────────────────── - # Snapshot list lengths BEFORE ask_next_question - deferred_before = len(engine.deferred_items) - decide_later_before = len(engine.decide_later_items) - question_result = await engine.ask_next_question(state) if question_result.is_err: error_msg = str(question_result.error) @@ -958,6 +1110,11 @@ async def _handle_answer( # Extract classification from the last classify call classification = _last_classification(engine) + # Signal to the caller that the user can skip this question + is_decide_later = classification == "decide_later" + is_deferred = classification == "deferred" + skip_eligible = is_decide_later or is_deferred + response_meta = { "session_id": session_id, "input_type": "freeText", @@ -965,6 +1122,7 
@@ async def _handle_answer( "question": question, "is_complete": False, "classification": classification, + "skip_eligible": skip_eligible, "deferred_this_round": diff["new_deferred"], "decide_later_this_round": diff["new_decide_later"], # Keep backward-compat fields from AC 8 @@ -977,16 +1135,34 @@ async def _handle_answer( "pm_handler.question_asked", session_id=session_id, classification=classification, + skip_eligible=skip_eligible, has_pending_reframe=pending_reframe is not None, **diff, ) + # Build response text — include skip hint when applicable + response_text = f"Session {session_id}\n\n{question}" + if is_decide_later: + response_text += ( + "\n\n💡 This question can be deferred. " + 'The user may answer now, or choose "decide later" to skip it. ' + "If they choose to decide later, pass " + f'answer="[decide_later]" with session_id="{session_id}".' + ) + elif is_deferred: + response_text += ( + "\n\n💡 This is a technical question that can be deferred to the dev phase. " + "The user may answer now, or choose to defer it. " + "If they choose to defer, pass " + f'answer="[deferred]" with session_id="{session_id}".' 
+ ) + return Result.ok( MCPToolResult( content=( MCPContentItem( type=ContentType.TEXT, - text=f"Session {session_id}\n\n{question}", + text=response_text, ), ), is_error=False, @@ -1050,12 +1226,33 @@ async def _handle_generate( seed = seed_result.value - # Save seed to ~/.ouroboros/seeds/ (idempotent — overwrites on retry) - seed_path = engine.save_pm_seed(seed) - - # Save human-readable pm.md to {cwd}/.ouroboros/ (consistent with CLI path) - pm_output_dir = Path(cwd) / ".ouroboros" - pm_path = save_pm_document(seed, output_dir=pm_output_dir) + # Save seed and PM document with recovery contract + try: + seed_path = engine.save_pm_seed(seed) + pm_output_dir = Path(cwd) / ".ouroboros" + pm_path = save_pm_document(seed, output_dir=pm_output_dir) + except Exception as e: + log.error("pm_handler.generate_save_failed", error=str(e), session_id=session_id) + return Result.ok( + MCPToolResult( + content=( + MCPContentItem( + type=ContentType.TEXT, + text=( + f"PM generation succeeded but saving artifacts failed: {e}\n" + f"Session ID: {session_id}\n" + f'Retry with: action="generate", session_id="{session_id}"' + ), + ), + ), + is_error=False, + meta={ + "session_id": session_id, + "is_complete": True, + "generation_failed": True, + }, + ) + ) return Result.ok( MCPToolResult( @@ -1066,7 +1263,8 @@ async def _handle_generate( f"PM seed generated: {seed.product_name}\n" f"Seed: {seed_path}\n" f"PM document: {pm_path}\n\n" - f"Decide-later items: {len(seed.deferred_items) + len(seed.decide_later_items)}" + f"Deferred items: {len(seed.deferred_items)}\n" + f"Decide-later items: {len(seed.decide_later_items)}" ), ), ), diff --git a/tests/unit/bigbang/test_decide_later_items.py b/tests/unit/bigbang/test_decide_later_items.py index 1c2920cf..2f37776b 100644 --- a/tests/unit/bigbang/test_decide_later_items.py +++ b/tests/unit/bigbang/test_decide_later_items.py @@ -88,8 +88,12 @@ class TestDecideLaterItemsList: """Tests for decide-later items stored as original question text in 
list.""" @pytest.mark.asyncio - async def test_decide_later_stores_original_question_text(self): - """DECIDE_LATER stores original question text in decide_later_items list.""" + async def test_decide_later_returns_question_to_caller(self): + """DECIDE_LATER returns the question to the caller without auto-answering. + + The caller (main session) is responsible for presenting the decide-later + option and recording the item in decide_later_items when chosen. + """ engine = _make_engine() state = _make_state() @@ -97,7 +101,6 @@ async def test_decide_later_stores_original_question_text(self): # Inner engine generates the question engine.inner.ask_next_question.return_value = Result.ok(original_q) - engine.inner.record_response.return_value = Result.ok(state) # Classifier says DECIDE_LATER classification = ClassificationResult( @@ -110,40 +113,32 @@ async def test_decide_later_stores_original_question_text(self): ) assert classification.output_type == ClassifierOutputType.DECIDE_LATER - # First call returns DECIDE_LATER, second call returns a PASSTHROUGH - passthrough_q = "Who are the primary users?" 
- passthrough_classification = ClassificationResult( - original_question=passthrough_q, - category=QuestionCategory.PLANNING, - reframed_question=passthrough_q, - reasoning="Planning question.", - ) - - engine.classifier.classify = AsyncMock( - side_effect=[Result.ok(classification), Result.ok(passthrough_classification)] - ) - engine.inner.ask_next_question = AsyncMock( - side_effect=[Result.ok(original_q), Result.ok(passthrough_q)] - ) + engine.classifier.classify = AsyncMock(return_value=Result.ok(classification)) result = await engine.ask_next_question(state) assert result.is_ok - # The decide-later question is stored in decide_later_items - assert original_q in engine.decide_later_items - # It should NOT be in deferred_items - assert original_q not in engine.deferred_items + # The question is returned to the caller (not auto-answered) + assert result.value == original_q + # decide_later_items is NOT populated here — caller handles that + assert engine.decide_later_items == [] + # No recursive call — inner engine asked only once + assert engine.inner.ask_next_question.call_count == 1 + # No auto-response recorded + engine.inner.record_response.assert_not_called() @pytest.mark.asyncio - async def test_decide_later_items_list_is_separate_from_deferred(self): - """decide_later_items and deferred_items are independent lists.""" + async def test_decide_later_does_not_recurse(self): + """DECIDE_LATER returns immediately — no recursive ask_next_question call. + + Unlike DEFERRED (which auto-skips and recurses), DECIDE_LATER returns + the question to the caller so the user can choose decide-later. + """ engine = _make_engine() state = _make_state() decide_later_q = "How should we handle data migration?" - deferred_q = "Which ORM should we use for the database layer?" 
- # Set up two rounds: first DECIDE_LATER, then DEFERRED, then PASSTHROUGH decide_later_class = ClassificationResult( original_question=decide_later_q, category=QuestionCategory.DECIDE_LATER, @@ -152,103 +147,50 @@ async def test_decide_later_items_list_is_separate_from_deferred(self): decide_later=True, placeholder_response="TBD after schema design.", ) - deferred_class = ClassificationResult( - original_question=deferred_q, - category=QuestionCategory.DEVELOPMENT, - reframed_question=deferred_q, - reasoning="Deeply technical.", - defer_to_dev=True, - ) - passthrough_q = "What are the success metrics?" - passthrough_class = ClassificationResult( - original_question=passthrough_q, - category=QuestionCategory.PLANNING, - reframed_question=passthrough_q, - reasoning="Planning.", - ) - engine.classifier.classify = AsyncMock( - side_effect=[ - Result.ok(decide_later_class), - Result.ok(deferred_class), - Result.ok(passthrough_class), - ] - ) - engine.inner.ask_next_question = AsyncMock( - side_effect=[ - Result.ok(decide_later_q), - Result.ok(deferred_q), - Result.ok(passthrough_q), - ] - ) - engine.inner.record_response = AsyncMock(return_value=Result.ok(state)) + engine.classifier.classify = AsyncMock(return_value=Result.ok(decide_later_class)) + engine.inner.ask_next_question = AsyncMock(return_value=Result.ok(decide_later_q)) result = await engine.ask_next_question(state) assert result.is_ok - assert result.value == passthrough_q - - # Verify separation - assert decide_later_q in engine.decide_later_items - assert decide_later_q not in engine.deferred_items - assert deferred_q in engine.deferred_items - assert deferred_q not in engine.decide_later_items + # Returns the decide-later question directly + assert result.value == decide_later_q + # No auto-recording — caller handles decide-later flow + engine.inner.record_response.assert_not_called() + # Only one call to inner engine (no recursion) + assert engine.inner.ask_next_question.call_count == 1 + # 
decide_later_items not populated by engine (caller responsibility) + assert engine.decide_later_items == [] @pytest.mark.asyncio - async def test_multiple_decide_later_items_accumulate(self): - """Multiple DECIDE_LATER questions all accumulate in the list.""" + async def test_decide_later_classification_recorded(self): + """DECIDE_LATER classification is recorded so caller can detect it.""" engine = _make_engine() state = _make_state() q1 = "What rate limiting strategy should we use?" - q2 = "How should we handle GDPR data retention?" - q3 = "What's the target latency for API responses?" - passthrough_q = "What is the core problem?" - - classifications = [ - ClassificationResult( - original_question=q1, - category=QuestionCategory.DECIDE_LATER, - reframed_question=q1, - reasoning="Premature.", - decide_later=True, - placeholder_response="TBD.", - ), - ClassificationResult( - original_question=q2, - category=QuestionCategory.DECIDE_LATER, - reframed_question=q2, - reasoning="Needs legal review.", - decide_later=True, - placeholder_response="Needs legal input.", - ), - ClassificationResult( - original_question=q3, - category=QuestionCategory.DECIDE_LATER, - reframed_question=q3, - reasoning="Depends on infra.", - decide_later=True, - placeholder_response="After infra decision.", - ), - ClassificationResult( - original_question=passthrough_q, - category=QuestionCategory.PLANNING, - reframed_question=passthrough_q, - reasoning="Planning.", - ), - ] - engine.classifier.classify = AsyncMock(side_effect=[Result.ok(c) for c in classifications]) - engine.inner.ask_next_question = AsyncMock( - side_effect=[Result.ok(q1), Result.ok(q2), Result.ok(q3), Result.ok(passthrough_q)] + classification = ClassificationResult( + original_question=q1, + category=QuestionCategory.DECIDE_LATER, + reframed_question=q1, + reasoning="Premature.", + decide_later=True, + placeholder_response="TBD.", ) - engine.inner.record_response = AsyncMock(return_value=Result.ok(state)) + + 
engine.classifier.classify = AsyncMock(return_value=Result.ok(classification)) + engine.inner.ask_next_question = AsyncMock(return_value=Result.ok(q1)) result = await engine.ask_next_question(state) assert result.is_ok - assert len(engine.decide_later_items) == 3 - assert engine.decide_later_items == [q1, q2, q3] + assert result.value == q1 + # Classification is recorded so get_last_classification works + assert len(engine.classifications) == 1 + assert engine.classifications[0].output_type == ClassifierOutputType.DECIDE_LATER + assert engine.get_last_classification() == "decide_later" def test_decide_later_items_empty_by_default(self): """decide_later_items starts as empty list.""" @@ -531,3 +473,110 @@ def test_format_decide_later_summary_preserves_original_question_text(self): summary = engine.format_decide_later_summary() assert original_text in summary + + +class TestSkipAsDecideLater: + """Tests for user-initiated decide-later skip via skip_as_decide_later().""" + + @pytest.mark.asyncio + async def test_skip_records_question_in_decide_later_items(self): + """skip_as_decide_later records the question in decide_later_items.""" + engine = _make_engine() + state = _make_state() + engine.inner.record_response.return_value = Result.ok(state) + + question = "What caching strategy should we use?" + result = await engine.skip_as_decide_later(state, question) + + assert result.is_ok + assert question in engine.decide_later_items + + @pytest.mark.asyncio + async def test_skip_does_not_duplicate_existing_item(self): + """skip_as_decide_later does not duplicate if question already in list.""" + engine = _make_engine() + state = _make_state() + engine.inner.record_response.return_value = Result.ok(state) + + question = "What caching strategy should we use?" 
+ engine.decide_later_items = [question] + + result = await engine.skip_as_decide_later(state, question) + + assert result.is_ok + assert engine.decide_later_items.count(question) == 1 + + @pytest.mark.asyncio + async def test_skip_records_placeholder_response_in_inner_engine(self): + """skip_as_decide_later feeds a placeholder response to inner engine.""" + engine = _make_engine() + state = _make_state() + engine.inner.record_response.return_value = Result.ok(state) + + question = "What deployment model should we use?" + await engine.skip_as_decide_later(state, question) + + engine.inner.record_response.assert_called_once() + call_args = engine.inner.record_response.call_args + recorded_response = call_args[0][1] + assert "[Decide later]" in recorded_response + + @pytest.mark.asyncio + async def test_skip_advances_interview_state(self): + """skip_as_decide_later returns the updated state from record_response.""" + engine = _make_engine() + state = _make_state(current_round=3) + updated_state = _make_state(current_round=4) + engine.inner.record_response.return_value = Result.ok(updated_state) + + question = "What rate limiting approach?" + result = await engine.skip_as_decide_later(state, question) + + assert result.is_ok + assert result.value == updated_state + + @pytest.mark.asyncio + async def test_skip_does_not_add_to_deferred_items(self): + """skip_as_decide_later does not affect deferred_items.""" + engine = _make_engine() + state = _make_state() + engine.inner.record_response.return_value = Result.ok(state) + + question = "What caching strategy?" 
+ await engine.skip_as_decide_later(state, question) + + assert engine.deferred_items == [] + assert question in engine.decide_later_items + + @pytest.mark.asyncio + async def test_skip_multiple_questions_accumulate(self): + """Multiple skip_as_decide_later calls accumulate items.""" + engine = _make_engine() + state = _make_state() + engine.inner.record_response.return_value = Result.ok(state) + + q1 = "What caching strategy?" + q2 = "What rate limiting?" + q3 = "What deployment model?" + + await engine.skip_as_decide_later(state, q1) + await engine.skip_as_decide_later(state, q2) + await engine.skip_as_decide_later(state, q3) + + assert engine.decide_later_items == [q1, q2, q3] + + @pytest.mark.asyncio + async def test_skip_propagates_record_response_error(self): + """skip_as_decide_later propagates errors from record_response.""" + engine = _make_engine() + state = _make_state() + from ouroboros.core.errors import ValidationError as VE + + engine.inner.record_response.return_value = Result.err( + VE("State save failed", field="state") + ) + + question = "What caching strategy?" 
+ result = await engine.skip_as_decide_later(state, question) + + assert result.is_err diff --git a/tests/unit/bigbang/test_pm_interview.py b/tests/unit/bigbang/test_pm_interview.py index 7565d596..b54bb2ac 100644 --- a/tests/unit/bigbang/test_pm_interview.py +++ b/tests/unit/bigbang/test_pm_interview.py @@ -431,17 +431,16 @@ async def test_dev_question_gets_reframed(self, tmp_path: Path) -> None: assert engine.classifications[0].category == QuestionCategory.DEVELOPMENT @pytest.mark.asyncio - async def test_deferred_question_skipped(self, tmp_path: Path) -> None: - """DEV-only questions marked as defer_to_dev are skipped and tracked.""" + async def test_deferred_question_returned_to_user(self, tmp_path: Path) -> None: + """DEV-only questions marked as defer_to_dev are returned to the user.""" adapter = _make_adapter() engine = _make_engine(adapter, tmp_path) dev_q = "Should we use gRPC or REST for inter-service communication?" - planning_q = "Who are the end users?" adapter.complete = AsyncMock( side_effect=[ - # First question generation: dev question + # Question generation: dev question Result.ok(_mock_completion(dev_q)), # Classification: defer to dev Result.ok( @@ -456,21 +455,6 @@ async def test_deferred_question_skipped(self, tmp_path: Path) -> None: ) ) ), - # Second question generation (recursive call): planning question - Result.ok(_mock_completion(planning_q)), - # Classification: planning - Result.ok( - _mock_completion( - json.dumps( - { - "category": "planning", - "reframed_question": planning_q, - "reasoning": "User question", - "defer_to_dev": False, - } - ) - ) - ), ] ) @@ -482,85 +466,46 @@ async def test_deferred_question_skipped(self, tmp_path: Path) -> None: result = await engine.ask_next_question(state) assert result.is_ok - assert result.value == planning_q - assert dev_q in engine.deferred_items - # The deferred question gets an automatic response fed back to the - # inner InterviewEngine so it knows the question was handled - assert 
len(state.rounds) == 1 - assert "[Deferred to development phase]" in state.rounds[0].user_response - assert state.rounds[0].question == dev_q + # The user sees the deferred question directly + assert result.value == dev_q + # deferred_items NOT populated yet (user hasn't chosen to skip) + assert dev_q not in engine.deferred_items + # No rounds auto-recorded + assert len(state.rounds) == 0 @pytest.mark.asyncio - async def test_deferred_auto_response_content(self, tmp_path: Path) -> None: - """DEV-classified deferred questions get an automatic response fed back - to the InterviewEngine with appropriate marker text.""" + async def test_user_can_skip_as_deferred(self, tmp_path: Path) -> None: + """User can defer a technical question via skip_as_deferred().""" adapter = _make_adapter() engine = _make_engine(adapter, tmp_path) dev_q = "What container orchestration platform should we use — Kubernetes or ECS?" - planning_q = "What are the key business goals?" - - adapter.complete = AsyncMock( - side_effect=[ - Result.ok(_mock_completion(dev_q)), - Result.ok( - _mock_completion( - json.dumps( - { - "category": "development", - "reframed_question": dev_q, - "reasoning": "Infrastructure choice is purely technical", - "defer_to_dev": True, - } - ) - ) - ), - Result.ok(_mock_completion(planning_q)), - Result.ok( - _mock_completion( - json.dumps( - { - "category": "planning", - "reframed_question": planning_q, - "reasoning": "Business question", - "defer_to_dev": False, - } - ) - ) - ), - ] - ) state = InterviewState( interview_id="test_auto_response", initial_context="Build a SaaS platform", ) - result = await engine.ask_next_question(state) + # User chooses to defer + adapter.complete = AsyncMock(return_value=Result.ok(_mock_completion("ok"))) + result = await engine.skip_as_deferred(state, dev_q) assert result.is_ok - assert result.value == planning_q - - # Verify the auto-response was properly recorded - deferred_round = state.rounds[0] - assert deferred_round.question == 
dev_q - assert "[Deferred to development phase]" in deferred_round.user_response - assert "development interview" in deferred_round.user_response - assert deferred_round.round_number == 1 + assert dev_q in engine.deferred_items + # Verify the deferral response was properly recorded + assert len(state.rounds) == 1 + assert "[Deferred to development phase]" in state.rounds[0].user_response @pytest.mark.asyncio - async def test_multiple_deferred_questions_all_auto_responded(self, tmp_path: Path) -> None: - """Multiple consecutive deferred questions each get auto-responses.""" + async def test_deferred_question_returned_not_auto_skipped(self, tmp_path: Path) -> None: + """DEFERRED questions are returned to user, not auto-skipped.""" adapter = _make_adapter() engine = _make_engine(adapter, tmp_path) dev_q1 = "Should we use gRPC or REST?" - dev_q2 = "Which CI/CD pipeline tool should we use?" - planning_q = "Who are the target users?" adapter.complete = AsyncMock( side_effect=[ - # First dev question Result.ok(_mock_completion(dev_q1)), Result.ok( _mock_completion( @@ -574,34 +519,6 @@ async def test_multiple_deferred_questions_all_auto_responded(self, tmp_path: Pa ) ) ), - # Second dev question - Result.ok(_mock_completion(dev_q2)), - Result.ok( - _mock_completion( - json.dumps( - { - "category": "development", - "reframed_question": dev_q2, - "reasoning": "CI/CD is infra", - "defer_to_dev": True, - } - ) - ) - ), - # Finally a planning question - Result.ok(_mock_completion(planning_q)), - Result.ok( - _mock_completion( - json.dumps( - { - "category": "planning", - "reframed_question": planning_q, - "reasoning": "User question", - "defer_to_dev": False, - } - ) - ) - ), ] ) @@ -613,18 +530,11 @@ async def test_multiple_deferred_questions_all_auto_responded(self, tmp_path: Pa result = await engine.ask_next_question(state) assert result.is_ok - assert result.value == planning_q - - # Both dev questions should be tracked as deferred - assert dev_q1 in engine.deferred_items 
- assert dev_q2 in engine.deferred_items - - # Both should have auto-responses recorded in the inner engine - assert len(state.rounds) == 2 - assert "[Deferred to development phase]" in state.rounds[0].user_response - assert "[Deferred to development phase]" in state.rounds[1].user_response - assert state.rounds[0].question == dev_q1 - assert state.rounds[1].question == dev_q2 + # First deferred question returned directly — no recursion + assert result.value == dev_q1 + # Not yet in deferred_items (user hasn't chosen to skip) + assert dev_q1 not in engine.deferred_items + assert len(state.rounds) == 0 @pytest.mark.asyncio async def test_classification_failure_returns_original(self, tmp_path: Path) -> None: @@ -652,18 +562,24 @@ async def test_classification_failure_returns_original(self, tmp_path: Path) -> assert result.value == question @pytest.mark.asyncio - async def test_decide_later_auto_responds_with_placeholder(self, tmp_path: Path) -> None: - """Decide-later questions are auto-answered with a placeholder and skipped.""" + async def test_decide_later_returns_question_without_auto_answering( + self, tmp_path: Path + ) -> None: + """Decide-later questions are returned to the caller for user decision. + + The engine no longer auto-answers with a placeholder or recurses. + The caller (main session) detects classification == "decide_later" + and presents the user with a decide-later option. + """ adapter = _make_adapter() engine = _make_engine(adapter, tmp_path) decide_later_q = "How should we handle scaling when we reach 1M users?" placeholder = "This will be determined after MVP launch and initial user metrics. Marking as a decision point for later." - planning_q = "Who are the target users?" 
adapter.complete = AsyncMock( side_effect=[ - # First question generation: decide-later question + # Question generation: decide-later question Result.ok(_mock_completion(decide_later_q)), # Classification: decide_later Result.ok( @@ -680,22 +596,7 @@ async def test_decide_later_auto_responds_with_placeholder(self, tmp_path: Path) ) ) ), - # Second question generation (recursive call): planning question - Result.ok(_mock_completion(planning_q)), - # Classification: planning - Result.ok( - _mock_completion( - json.dumps( - { - "category": "planning", - "reframed_question": planning_q, - "reasoning": "User question", - "defer_to_dev": False, - "decide_later": False, - } - ) - ) - ), + # No second question generation — no recursion ] ) @@ -707,14 +608,14 @@ async def test_decide_later_auto_responds_with_placeholder(self, tmp_path: Path) result = await engine.ask_next_question(state) assert result.is_ok - # The PM sees the next real question, not the decide-later one - assert result.value == planning_q - # The decide-later question is tracked in decide_later_items - assert decide_later_q in engine.decide_later_items - # A placeholder response was auto-recorded in the interview state - assert len(state.rounds) == 1 - assert "[Decide later]" in state.rounds[0].user_response - assert placeholder in state.rounds[0].user_response + # The decide-later question is returned to the caller + assert result.value == decide_later_q + # decide_later_items is NOT populated — caller handles that + assert engine.decide_later_items == [] + # No auto-response recorded — state has no rounds + assert len(state.rounds) == 0 + # Classification is recorded for caller to detect + assert engine.get_last_classification() == "decide_later" @pytest.mark.asyncio async def test_decide_later_classification_result_properties(self) -> None: @@ -729,7 +630,8 @@ async def test_decide_later_classification_result_properties(self) -> None: ) assert result.output_type == ClassifierOutputType.DECIDE_LATER - 
assert result.question_for_pm == "" # Should not be shown to PM + # Returned to user so they can choose to answer or defer + assert result.question_for_pm == "How should we handle scaling?" class TestRecordResponse: @@ -960,9 +862,13 @@ async def test_generates_seed_from_interview(self, tmp_path: Path) -> None: @pytest.mark.asyncio async def test_includes_deferred_items(self, tmp_path: Path) -> None: - """Deferred items from classifier are included in PMSeed.""" + """LLM-extracted deferred items are merged with engine-tracked items in PMSeed.""" adapter = _make_adapter() engine = _make_engine(adapter, tmp_path) + # Raw engine items are passed to the extraction prompt as context, + # so the LLM should summarise them. Engine-tracked items are then + # merged with the LLM-extracted output and deduplicated to ensure + # no user-selected skip is lost from the final seed. engine.deferred_items = ["Should we use gRPC or REST?"] extraction_response = json.dumps( @@ -993,6 +899,7 @@ async def test_includes_deferred_items(self, tmp_path: Path) -> None: assert result.is_ok seed = result.value assert "Database selection" in seed.deferred_items + # Engine-tracked item is merged back to prevent data loss assert "Should we use gRPC or REST?" in seed.deferred_items @pytest.mark.asyncio diff --git a/tests/unit/bigbang/test_question_classifier.py b/tests/unit/bigbang/test_question_classifier.py index 0619ee3a..0c95e3a6 100644 --- a/tests/unit/bigbang/test_question_classifier.py +++ b/tests/unit/bigbang/test_question_classifier.py @@ -181,8 +181,8 @@ def test_reframed_returns_reframed_question(self) -> None: ) assert result.question_for_pm == "How do you expect users to interact with data?" 
- def test_deferred_returns_empty_string(self) -> None: - """DEFERRED returns empty string — question should not be shown to PM.""" + def test_deferred_returns_original_question(self) -> None: + """DEFERRED returns original question — shown to user for defer/answer choice.""" result = ClassificationResult( original_question="Should we use gRPC or REST?", category=QuestionCategory.DEVELOPMENT, @@ -190,10 +190,10 @@ def test_deferred_returns_empty_string(self) -> None: reasoning="Purely technical", defer_to_dev=True, ) - assert result.question_for_pm == "" + assert result.question_for_pm == "Should we use gRPC or REST?" - def test_decide_later_returns_empty_string(self) -> None: - """DECIDE_LATER returns empty string — auto-answered, not shown to PM.""" + def test_decide_later_returns_original_question(self) -> None: + """DECIDE_LATER returns original question — shown to user for decision.""" result = ClassificationResult( original_question="How will this scale to millions of users?", category=QuestionCategory.DECIDE_LATER, @@ -202,7 +202,7 @@ def test_decide_later_returns_empty_string(self) -> None: decide_later=True, placeholder_response="To be determined post-launch.", ) - assert result.question_for_pm == "" + assert result.question_for_pm == "How will this scale to millions of users?" 
# ────────────────────────────────────────────────────────────── @@ -418,5 +418,5 @@ async def test_classify_decide_later_question(self) -> None: assert cr.category == QuestionCategory.DECIDE_LATER assert cr.decide_later is True assert cr.output_type == ClassifierOutputType.DECIDE_LATER - assert cr.question_for_pm == "" + assert cr.question_for_pm == question # returned to user for decision assert "initial deployment" in cr.placeholder_response diff --git a/tests/unit/mcp/tools/test_pm_handler.py b/tests/unit/mcp/tools/test_pm_handler.py index 639432b4..6a80bb4b 100644 --- a/tests/unit/mcp/tools/test_pm_handler.py +++ b/tests/unit/mcp/tools/test_pm_handler.py @@ -10,6 +10,11 @@ from ouroboros.bigbang.interview import InterviewRound, InterviewState from ouroboros.bigbang.pm_interview import PMInterviewEngine +from ouroboros.bigbang.question_classifier import ( + ClassificationResult, + ClassifierOutputType, + QuestionCategory, +) from ouroboros.core.types import Result from ouroboros.mcp.tools.pm_handler import ( _DATA_DIR, @@ -27,6 +32,24 @@ # ── Helpers ────────────────────────────────────────────────────── +def _make_classification(output_type: ClassifierOutputType) -> ClassificationResult: + """Create a minimal ClassificationResult stub for a given output type.""" + category_map = { + ClassifierOutputType.PASSTHROUGH: QuestionCategory.PLANNING, + ClassifierOutputType.REFRAMED: QuestionCategory.DEVELOPMENT, + ClassifierOutputType.DEFERRED: QuestionCategory.DEVELOPMENT, + ClassifierOutputType.DECIDE_LATER: QuestionCategory.DECIDE_LATER, + } + return ClassificationResult( + original_question="stub question", + category=category_map[output_type], + reframed_question="stub question", + reasoning="stub", + defer_to_dev=(output_type == ClassifierOutputType.DEFERRED), + decide_later=(output_type == ClassifierOutputType.DECIDE_LATER), + ) + + def _make_engine_stub( deferred: list[str] | None = None, decide_later: list[str] | None = None, @@ -452,6 +475,221 @@ async def 
test_handle_answer_includes_interview_complete_false(self, tmp_path: P assert result.is_ok assert result.value.meta["interview_complete"] is False + @pytest.mark.asyncio + async def test_handle_answer_decide_later_skips_and_records(self, tmp_path: Path) -> None: + """When answer='[decide_later]', skip_as_decide_later is called instead of record_response.""" + engine = make_pm_engine_mock(deferred_items=[], decide_later_items=[]) + # Set up classification so the guard accepts the sentinel + engine.classifications = [_make_classification(ClassifierOutputType.DECIDE_LATER)] + + question = "What caching strategy should we use?" + state = _make_state( + rounds=[ + InterviewRound(round_number=1, question=question, user_response=None), + ], + ) + + engine.load_state = AsyncMock(return_value=Result.ok(state)) + + # skip_as_decide_later should be called (not record_response) + async def mock_skip(s: Any, q: str) -> Result: + engine.decide_later_items.append(q) + return Result.ok(s) + + engine.skip_as_decide_later = AsyncMock(side_effect=mock_skip) + engine.record_response = AsyncMock(return_value=Result.ok(state)) + + engine.ask_next_question = AsyncMock(return_value=Result.ok("Next question?")) + engine.save_state = AsyncMock(return_value=Result.ok(tmp_path / "state.json")) + + _save_pm_meta("sess-dl", engine, cwd="/tmp/proj", data_dir=tmp_path) + + handler = PMInterviewHandler(pm_engine=engine, data_dir=tmp_path) + + result = await handler.handle( + { + "session_id": "sess-dl", + "answer": "[decide_later]", + } + ) + + assert result.is_ok + # skip_as_decide_later was called + engine.skip_as_decide_later.assert_called_once_with(state, question) + # record_response was NOT called (skip_as_decide_later handles it internally) + engine.record_response.assert_not_called() + # The question was recorded in decide_later_items + assert question in engine.decide_later_items + + @pytest.mark.asyncio + async def test_handle_answer_decide_later_with_whitespace(self, tmp_path: Path) -> 
None: + """Whitespace-padded '[decide_later]' is still recognized.""" + engine = make_pm_engine_mock(deferred_items=[], decide_later_items=[]) + # Set up classification so the guard accepts the sentinel + engine.classifications = [_make_classification(ClassifierOutputType.DECIDE_LATER)] + + question = "What rate limiting approach?" + state = _make_state( + rounds=[ + InterviewRound(round_number=1, question=question, user_response=None), + ], + ) + + engine.load_state = AsyncMock(return_value=Result.ok(state)) + engine.skip_as_decide_later = AsyncMock(return_value=Result.ok(state)) + engine.ask_next_question = AsyncMock(return_value=Result.ok("Next question?")) + engine.save_state = AsyncMock(return_value=Result.ok(tmp_path / "state.json")) + + _save_pm_meta("sess-dl2", engine, cwd="/tmp/proj", data_dir=tmp_path) + + handler = PMInterviewHandler(pm_engine=engine, data_dir=tmp_path) + + result = await handler.handle( + { + "session_id": "sess-dl2", + "answer": " [decide_later] ", + } + ) + + assert result.is_ok + engine.skip_as_decide_later.assert_called_once() + + @pytest.mark.asyncio + async def test_handle_answer_deferred_skips_and_records(self, tmp_path: Path) -> None: + """When answer='[deferred]', skip_as_deferred is called instead of record_response.""" + engine = make_pm_engine_mock(deferred_items=[], decide_later_items=[]) + # Set up classification so the guard accepts the sentinel + engine.classifications = [_make_classification(ClassifierOutputType.DEFERRED)] + + question = "Should we use gRPC or REST for inter-service communication?" 
+ state = _make_state( + rounds=[ + InterviewRound(round_number=1, question=question, user_response=None), + ], + ) + + engine.load_state = AsyncMock(return_value=Result.ok(state)) + + async def mock_skip(s: Any, q: str) -> Result: + engine.deferred_items.append(q) + return Result.ok(s) + + engine.skip_as_deferred = AsyncMock(side_effect=mock_skip) + engine.record_response = AsyncMock(return_value=Result.ok(state)) + + engine.ask_next_question = AsyncMock(return_value=Result.ok("Next question?")) + engine.save_state = AsyncMock(return_value=Result.ok(tmp_path / "state.json")) + + _save_pm_meta("sess-def", engine, cwd="/tmp/proj", data_dir=tmp_path) + + handler = PMInterviewHandler(pm_engine=engine, data_dir=tmp_path) + + result = await handler.handle( + { + "session_id": "sess-def", + "answer": "[deferred]", + } + ) + + assert result.is_ok + engine.skip_as_deferred.assert_called_once_with(state, question) + engine.record_response.assert_not_called() + assert question in engine.deferred_items + + + @pytest.mark.asyncio + async def test_decide_later_sentinel_for_passthrough_treated_as_normal_answer( + self, tmp_path: Path + ) -> None: + """[decide_later] sent for a passthrough question is recorded as a normal answer. + + Guards against clients blindly sending sentinels regardless of question type. + When the last classification is not 'decide_later', the sentinel must not + trigger skip_as_decide_later — it should fall through to record_response. + """ + engine = make_pm_engine_mock(deferred_items=[], decide_later_items=[]) + # Last classification is PASSTHROUGH, not DECIDE_LATER + engine.classifications = [_make_classification(ClassifierOutputType.PASSTHROUGH)] + + question = "What is the primary goal of the product?" 
+ state = _make_state( + rounds=[ + InterviewRound(round_number=1, question=question, user_response=None), + ], + ) + + engine.load_state = AsyncMock(return_value=Result.ok(state)) + engine.skip_as_decide_later = AsyncMock(return_value=Result.ok(state)) + engine.record_response = AsyncMock(return_value=Result.ok(state)) + engine.ask_next_question = AsyncMock(return_value=Result.ok("Next question?")) + engine.save_state = AsyncMock(return_value=Result.ok(tmp_path / "state.json")) + + _save_pm_meta("sess-passthrough-guard", engine, cwd="/tmp/proj", data_dir=tmp_path) + + handler = PMInterviewHandler(pm_engine=engine, data_dir=tmp_path) + + result = await handler.handle( + { + "session_id": "sess-passthrough-guard", + "answer": "[decide_later]", + } + ) + + assert result.is_ok + # Must NOT call skip_as_decide_later — the sentinel is treated as a normal answer + engine.skip_as_decide_later.assert_not_called() + # Must record the literal string as a normal response + engine.record_response.assert_called_once_with(state, "[decide_later]", question) + + @pytest.mark.asyncio + async def test_decide_later_skip_populates_per_round_diff(self, tmp_path: Path) -> None: + """Skipping via [decide_later] populates new_decide_later in response metadata. + + Verifies that the per-round diff snapshot is taken BEFORE the skip call + so that items added inside skip_as_decide_later() appear in the diff. + """ + engine = make_pm_engine_mock(deferred_items=[], decide_later_items=[]) + # Set up classification so the guard accepts the sentinel + engine.classifications = [_make_classification(ClassifierOutputType.DECIDE_LATER)] + + question = "What scaling approach suits future traffic?" 
+ state = _make_state( + rounds=[ + InterviewRound(round_number=1, question=question, user_response=None), + ], + ) + + engine.load_state = AsyncMock(return_value=Result.ok(state)) + + async def mock_skip(s: Any, q: str) -> Result: + engine.decide_later_items.append(q) + return Result.ok(s) + + engine.skip_as_decide_later = AsyncMock(side_effect=mock_skip) + engine.record_response = AsyncMock(return_value=Result.ok(state)) + engine.ask_next_question = AsyncMock(return_value=Result.ok("Next question?")) + engine.save_state = AsyncMock(return_value=Result.ok(tmp_path / "state.json")) + + _save_pm_meta("sess-diff-skip", engine, cwd="/tmp/proj", data_dir=tmp_path) + + handler = PMInterviewHandler(pm_engine=engine, data_dir=tmp_path) + + result = await handler.handle( + { + "session_id": "sess-diff-skip", + "answer": "[decide_later]", + } + ) + + assert result.is_ok + meta = result.value.meta + # The skipped question must appear in new_decide_later (per-round diff) + assert question in meta["new_decide_later"], ( + f"Expected '{question}' in new_decide_later, got {meta['new_decide_later']}" + ) + assert meta["decide_later_this_round"] == meta["new_decide_later"] + assert meta["decide_later_count"] == 1 + # ── Unit tests for _check_completion (AC 12) ───────────────────── @@ -652,6 +890,14 @@ async def test_handle_answer_completion_via_ambiguity(self, tmp_path: Path) -> N _save_pm_meta("sess-complete", engine, cwd="/tmp/proj", data_dir=tmp_path) + # Mock auto-generate on completion + seed_mock = MagicMock() + seed_mock.product_name = "Test Product" + seed_mock.deferred_items = ["d1"] + seed_mock.decide_later_items = ["dl1"] + engine.generate_pm_seed = AsyncMock(return_value=Result.ok(seed_mock)) + engine.save_pm_seed = MagicMock(return_value=tmp_path / "seed.json") + handler = PMInterviewHandler(pm_engine=engine, data_dir=tmp_path) # Mock _check_completion to return completion @@ -666,6 +912,9 @@ async def test_handle_answer_completion_via_ambiguity(self, tmp_path: Path) 
-> N "ouroboros.mcp.tools.pm_handler._check_completion", new_callable=AsyncMock, return_value=completion_meta, + ), patch( + "ouroboros.mcp.tools.pm_handler.save_pm_document", + return_value=tmp_path / "pm.md", ): result = await handler.handle( { @@ -684,10 +933,10 @@ async def test_handle_answer_completion_via_ambiguity(self, tmp_path: Path) -> N # Verify complete_interview was called engine.complete_interview.assert_called_once() - # Verify response text includes generate instructions + # Verify response text includes completion and PM document info text = result.value.content[0].text assert "Interview complete" in text - assert "generate" in text + assert "PM document" in text @pytest.mark.asyncio async def test_no_done_signal_processing(self, tmp_path: Path) -> None: @@ -1504,17 +1753,28 @@ async def test_resume_completion_meta_has_required_fields(self, tmp_path: Path) "ambiguity_score": 0.15, } ) + # Mock auto-generate on completion + seed_mock = MagicMock() + seed_mock.product_name = "Test Product" + seed_mock.deferred_items = ["d1"] + seed_mock.decide_later_items = ["dl1"] + engine.generate_pm_seed = AsyncMock(return_value=Result.ok(seed_mock)) + engine.save_pm_seed = MagicMock(return_value=tmp_path / "seed.json") _save_pm_meta("resume-complete", engine, cwd="/tmp", data_dir=tmp_path) handler = PMInterviewHandler(pm_engine=engine, data_dir=tmp_path) - result = await handler.handle( - { - "session_id": "resume-complete", - "answer": "Final answer", - } - ) + with patch( + "ouroboros.mcp.tools.pm_handler.save_pm_document", + return_value=tmp_path / "pm.md", + ): + result = await handler.handle( + { + "session_id": "resume-complete", + "answer": "Final answer", + } + ) assert result.is_ok meta = result.value.meta @@ -1528,6 +1788,9 @@ async def test_resume_completion_meta_has_required_fields(self, tmp_path: Path) assert meta["decide_later_this_round"] == [] # Also has completion details assert meta["interview_complete"] is True + # Auto-generated PM document + 
assert "pm_path" in meta + assert "seed_path" in meta @pytest.mark.asyncio async def test_resume_loads_state_and_meta(self, tmp_path: Path) -> None: