From 5fd27305e05621b0e52e9118b3199e40fbf3e616 Mon Sep 17 00:00:00 2001
From: Emilio Garcia <i.am.emilio@gmail.com>
Date: Wed, 25 Mar 2026 13:03:50 -0400
Subject: [PATCH] fix: make tool call assertions flexible to prevent CI flakes

Some models return multiple parallel tool calls for a single-tool prompt,
which is a valid API response. The previous assertions required exactly one
function call, causing intermittent CI failures when models produced
logically correct but duplicated tool invocations. This change relaxes the
assertions to accept one or more function calls and responds to all of
them in follow-up turns, preventing the "tool_call_id not responded to"
error that occurs when only the first call is acknowledged. Output items
are now filtered by type, so the follow-up response is likewise only
required to contain at least one message item.

Signed-off-by: Emilio Garcia <i.am.emilio@gmail.com>
Made-with: Cursor
---
 .../responses/test_tool_responses.py          | 37 ++++++++++---------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/tests/integration/responses/test_tool_responses.py b/tests/integration/responses/test_tool_responses.py
index 427281c251..298b125059 100644
--- a/tests/integration/responses/test_tool_responses.py
+++ b/tests/integration/responses/test_tool_responses.py
@@ -462,10 +462,11 @@ def test_response_non_streaming_custom_tool(responses_client, text_model_id, cas
         tools=case.tools,
         stream=False,
     )
-    assert len(response.output) == 1
-    assert response.output[0].type == "function_call"
-    assert response.output[0].status == "completed"
-    assert response.output[0].name == "get_weather"
+    assert len(response.output) >= 1
+    function_calls = [o for o in response.output if o.type == "function_call"]
+    assert len(function_calls) >= 1
+    assert function_calls[0].status == "completed"
+    assert function_calls[0].name == "get_weather"
 
 
 @pytest.mark.parametrize("case", custom_tool_test_cases)
@@ -582,16 +583,16 @@ def test_function_call_output_list_text(responses_client, text_model_id):
         tools=tools,
         stream=False,
     )
-    assert len(response.output) == 1
-    assert response.output[0].type == "function_call"
-    call_id = response.output[0].call_id
+    function_calls = [o for o in response.output if o.type == "function_call"]
+    assert len(function_calls) >= 1
 
     inputs = [
         {
             "type": "function_call_output",
-            "call_id": call_id,
+            "call_id": fc.call_id,
             "output": [{"type": "input_text", "text": "It is sunny and 22 degrees Celsius in Paris."}],
-        },
+        }
+        for fc in function_calls
     ]
     response2 = responses_client.responses.create(
         model=text_model_id,
@@ -600,8 +601,8 @@ def test_function_call_output_list_text(responses_client, text_model_id):
         stream=False,
         previous_response_id=response.id,
     )
-    assert len(response2.output) == 1
-    assert response2.output[0].type == "message"
+    messages = [o for o in response2.output if o.type == "message"]
+    assert len(messages) >= 1
     assert response2.output_text
 
 
@@ -631,19 +632,19 @@ def test_function_call_output_list_text_multi_block(responses_client, text_model
         tools=tools,
         stream=False,
     )
-    assert len(response.output) == 1
-    assert response.output[0].type == "function_call"
-    call_id = response.output[0].call_id
+    function_calls = [o for o in response.output if o.type == "function_call"]
+    assert len(function_calls) >= 1
 
     inputs = [
         {
             "type": "function_call_output",
-            "call_id": call_id,
+            "call_id": fc.call_id,
             "output": [
                 {"type": "input_text", "text": "Current conditions: overcast skies."},
                 {"type": "input_text", "text": "Temperature: 15 degrees Celsius."},
             ],
-        },
+        }
+        for fc in function_calls
     ]
     response2 = responses_client.responses.create(
         model=text_model_id,
@@ -652,8 +653,8 @@ def test_function_call_output_list_text_multi_block(responses_client, text_model
         stream=False,
         previous_response_id=response.id,
     )
-    assert len(response2.output) == 1
-    assert response2.output[0].type == "message"
+    messages = [o for o in response2.output if o.type == "message"]
+    assert len(messages) >= 1
     assert response2.output_text