Skip to content

Commit c5f1568

Browse files
kochj23claude
and committed
fix: Resolve inferenceInProgress error in agentic tool calling loop
Root cause: chatCompletion() kept consuming the AsyncStream until maxTokens even after a tool call was detected, because PythonService .terminate() was a no-op with the native MLX backend. The follow-up generateResponse() after tool execution hit isInferenceRunning = true. Fix: - MLXService.chatCompletion() now breaks out of the generation stream loop as soon as </tool> or </tool_call> appears in the response, returning immediately instead of running to maxTokens - Removed all PythonService.shared.terminate() calls from ChatViewModel (no-ops since Python was eliminated in v6.2.0) - Simplified stopGeneration() to directly set flags without a Task Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
1 parent 3994bc8 commit c5f1568

2 files changed

Lines changed: 15 additions & 19 deletions

File tree

MLX Code/Services/MLXService.swift

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,12 +233,22 @@ actor MLXService {
233233
if let chunk = generation.chunk {
234234
fullResponse += chunk
235235
await MainActor.run { box.handler(chunk) }
236+
// Break as soon as a complete tool call is in the response.
237+
// This lets chatCompletion() return immediately rather than
238+
// running to maxTokens, which prevents inferenceInProgress
239+
// errors on the follow-up generation after tool execution.
240+
if fullResponse.contains("</tool>") || fullResponse.contains("</tool_call>") {
241+
break
242+
}
236243
}
237244
}
238245
} else {
239246
for await generation in stream {
240247
if let chunk = generation.chunk {
241248
fullResponse += chunk
249+
if fullResponse.contains("</tool>") || fullResponse.contains("</tool_call>") {
250+
break
251+
}
242252
}
243253
}
244254
}

MLX Code/ViewModels/ChatViewModel.swift

Lines changed: 5 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -246,15 +246,9 @@ class ChatViewModel: ObservableObject {
246246
/// Stops the current generation
247247
func stopGeneration() {
248248
guard isGenerating else { return }
249-
250-
Task {
251-
await PythonService.shared.terminate()
252-
isGenerating = false
253-
isWaitingForFirstToken = false
254-
statusMessage = "Generation stopped"
255-
256-
logInfo("Generation stopped by user", category: "ChatViewModel")
257-
}
249+
isGenerating = false
250+
isWaitingForFirstToken = false
251+
statusMessage = "Generation stopped"
258252
}
259253

260254
/// Regenerates the last assistant response
@@ -389,12 +383,8 @@ class ChatViewModel: ObservableObject {
389383
}
390384
}
391385

392-
// Stop if a complete tool call has been received
393-
if accumulatedResponse.contains("</tool>") || accumulatedResponse.contains("</tool_call>") {
394-
shouldStopGeneration = true
395-
await PythonService.shared.terminate()
396-
return
397-
}
386+
// Tool call detection is handled in MLXService — it breaks the stream
387+
// loop as soon as </tool> appears, so we just update UI here.
398388

399389
// Check for repetition
400390
if let detector = self.repetitionDetector {
@@ -410,23 +400,19 @@ class ChatViewModel: ObservableObject {
410400
accumulatedResponse = String(accumulatedResponse[..<truncateIndex])
411401
accumulatedResponse += "\n\n[Response truncated due to repetition detection]"
412402
}
413-
414-
await PythonService.shared.terminate()
415403
}
416404
}
417405

418406
// Check for maximum length
419407
if accumulatedResponse.count > ChatViewModel.maxResponseLength {
420408
shouldStopGeneration = true
421409
accumulatedResponse += "\n\n[Response truncated: maximum length reached]"
422-
await PythonService.shared.terminate()
423410
}
424411

425412
// Check for maximum token count
426413
if self.tokenCount > ChatViewModel.maxResponseTokens {
427414
shouldStopGeneration = true
428415
accumulatedResponse += "\n\n[Response truncated: maximum tokens reached]"
429-
await PythonService.shared.terminate()
430416
}
431417

432418
// Update the message content

0 commit comments

Comments (0)