Skip to content

Commit c5f1568

Browse files
kochj23claude
and committed
fix: Resolve inferenceInProgress error in agentic tool calling loop
Root cause: chatCompletion() kept consuming the AsyncStream until maxTokens even after a tool call was detected, because PythonService .terminate() was a no-op with the native MLX backend. The follow-up generateResponse() after tool execution hit isInferenceRunning = true. Fix: - MLXService.chatCompletion() now breaks out of the generation stream loop as soon as </tool> or </tool_call> appears in the response, returning immediately instead of running to maxTokens - Removed all PythonService.shared.terminate() calls from ChatViewModel (no-ops since Python was eliminated in v6.2.0) - Simplified stopGeneration() to directly set flags without a Task Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
1 parent 3994bc8 commit c5f1568

2 files changed

Lines changed: 15 additions & 19 deletions

File tree

MLX Code/Services/MLXService.swift

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,12 +233,22 @@ actor MLXService {
233233
if let chunk = generation.chunk {
234234
fullResponse += chunk
235235
await MainActor.run { box.handler(chunk) }
236+
// Break as soon as a complete tool call is in the response.
237+
// This lets chatCompletion() return immediately rather than
238+
// running to maxTokens, which prevents inferenceInProgress
239+
// errors on the follow-up generation after tool execution.
240+
if fullResponse.contains("</tool>") || fullResponse.contains("</tool_call>") {
241+
break
242+
}
236243
}
237244
}
238245
} else {
239246
for await generation in stream {
240247
if let chunk = generation.chunk {
241248
fullResponse += chunk
249+
if fullResponse.contains("</tool>") || fullResponse.contains("</tool_call>") {
250+
break
251+
}
242252
}
243253
}
244254
}

MLX Code/ViewModels/ChatViewModel.swift

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -246,15 +246,9 @@ class ChatViewModel: ObservableObject {
246246
/// Stops the current generation
247247
func stopGeneration() {
248248
guard isGenerating else { return }
249-
250-
Task {
251-
await PythonService.shared.terminate()
252-
isGenerating = false
253-
isWaitingForFirstToken = false
254-
statusMessage = "Generation stopped"
255-
256-
logInfo("Generation stopped by user", category: "ChatViewModel")
257-
}
249+
isGenerating = false
250+
isWaitingForFirstToken = false
251+
statusMessage = "Generation stopped"
258252
}
259253

260254
/// Regenerates the last assistant response
@@ -389,12 +383,8 @@ class ChatViewModel: ObservableObject {
389383
}
390384
}
391385

392-
// Stop if a complete tool call has been received
393-
if accumulatedResponse.contains("</tool>") || accumulatedResponse.contains("</tool_call>") {
394-
shouldStopGeneration = true
395-
await PythonService.shared.terminate()
396-
return
397-
}
386+
// Tool call detection is handled in MLXService — it breaks the stream
387+
// loop as soon as </tool> appears, so we just update UI here.
398388

399389
// Check for repetition
400390
if let detector = self.repetitionDetector {
@@ -410,23 +400,19 @@ class ChatViewModel: ObservableObject {
410400
accumulatedResponse = String(accumulatedResponse[..<truncateIndex])
411401
accumulatedResponse += "\n\n[Response truncated due to repetition detection]"
412402
}
413-
414-
await PythonService.shared.terminate()
415403
}
416404
}
417405

418406
// Check for maximum length
419407
if accumulatedResponse.count > ChatViewModel.maxResponseLength {
420408
shouldStopGeneration = true
421409
accumulatedResponse += "\n\n[Response truncated: maximum length reached]"
422-
await PythonService.shared.terminate()
423410
}
424411

425412
// Check for maximum token count
426413
if self.tokenCount > ChatViewModel.maxResponseTokens {
427414
shouldStopGeneration = true
428415
accumulatedResponse += "\n\n[Response truncated: maximum tokens reached]"
429-
await PythonService.shared.terminate()
430416
}
431417

432418
// Update the message content

0 commit comments

Comments (0)