
Commit a9bdf8e

Fix #1629 Empty tool call arguments in streaming events (#1636)
1 parent 6904dcb commit a9bdf8e

File tree

2 files changed: +375 -2 lines changed

src/agents/run.py

Lines changed: 2 additions & 2 deletions

@@ -8,7 +8,7 @@
 
 from openai.types.responses import (
     ResponseCompletedEvent,
-    ResponseOutputItemAddedEvent,
+    ResponseOutputItemDoneEvent,
 )
 from openai.types.responses.response_prompt_param import (
     ResponsePromptParam,

@@ -1040,7 +1040,7 @@ async def _run_single_turn_streamed(
                 )
                 context_wrapper.usage.add(usage)
 
-            if isinstance(event, ResponseOutputItemAddedEvent):
+            if isinstance(event, ResponseOutputItemDoneEvent):
                 output_item = event.item
 
                 if isinstance(output_item, _TOOL_CALL_TYPES):
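The change is deliberately small: tool calls are now captured from ResponseOutputItemDoneEvent rather than ResponseOutputItemAddedEvent. A minimal sketch of the event ordering that motivates it, assuming the behavior the new test below simulates (the handle_stream_event helper is illustrative, not code from run.py):

from openai.types.responses import (
    ResponseOutputItemAddedEvent,
    ResponseOutputItemDoneEvent,
)


def handle_stream_event(event) -> None:
    # Illustrative only: shows why the capture point moved.
    if isinstance(event, ResponseOutputItemAddedEvent):
        # Fired when an output item first appears. For function tool calls the
        # argument deltas have not streamed in yet, so event.item.arguments is
        # still "" -- capturing tool calls here is what caused #1629.
        ...
    if isinstance(event, ResponseOutputItemDoneEvent):
        # Fired once the item is complete: event.item.arguments now holds the
        # fully accumulated JSON string, e.g. '{"a": 5, "b": 3}'.
        ...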
Lines changed: 373 additions & 0 deletions (new test file)
@@ -0,0 +1,373 @@
"""
Tests to ensure that tool call arguments are properly populated in streaming events.

This test specifically guards against the regression where tool_called events
were emitted with empty arguments during streaming (Issue #1629).
"""

import json
from collections.abc import AsyncIterator
from typing import Any, Optional, Union, cast

import pytest
from openai.types.responses import (
    ResponseCompletedEvent,
    ResponseFunctionToolCall,
    ResponseOutputItemAddedEvent,
    ResponseOutputItemDoneEvent,
)

from agents import Agent, Runner, function_tool
from agents.agent_output import AgentOutputSchemaBase
from agents.handoffs import Handoff
from agents.items import TResponseInputItem, TResponseOutputItem, TResponseStreamEvent
from agents.model_settings import ModelSettings
from agents.models.interface import Model, ModelTracing
from agents.stream_events import RunItemStreamEvent
from agents.tool import Tool
from agents.tracing import generation_span

from .fake_model import get_response_obj
from .test_responses import get_function_tool_call


class StreamingFakeModel(Model):
    """A fake model that actually emits streaming events to test our streaming fix."""

    def __init__(self):
        self.turn_outputs: list[list[TResponseOutputItem]] = []
        self.last_turn_args: dict[str, Any] = {}

    def set_next_output(self, output: list[TResponseOutputItem]):
        self.turn_outputs.append(output)

    def get_next_output(self) -> list[TResponseOutputItem]:
        if not self.turn_outputs:
            return []
        return self.turn_outputs.pop(0)

    async def get_response(
        self,
        system_instructions: Optional[str],
        input: Union[str, list[TResponseInputItem]],
        model_settings: ModelSettings,
        tools: list[Tool],
        output_schema: Optional[AgentOutputSchemaBase],
        handoffs: list[Handoff],
        tracing: ModelTracing,
        *,
        previous_response_id: Optional[str],
        conversation_id: Optional[str],
        prompt: Optional[Any],
    ):
        raise NotImplementedError("Use stream_response instead")

    async def stream_response(
        self,
        system_instructions: Optional[str],
        input: Union[str, list[TResponseInputItem]],
        model_settings: ModelSettings,
        tools: list[Tool],
        output_schema: Optional[AgentOutputSchemaBase],
        handoffs: list[Handoff],
        tracing: ModelTracing,
        *,
        previous_response_id: Optional[str] = None,
        conversation_id: Optional[str] = None,
        prompt: Optional[Any] = None,
    ) -> AsyncIterator[TResponseStreamEvent]:
        """Stream events that simulate real OpenAI streaming behavior for tool calls."""
        self.last_turn_args = {
            "system_instructions": system_instructions,
            "input": input,
            "model_settings": model_settings,
            "tools": tools,
            "output_schema": output_schema,
            "previous_response_id": previous_response_id,
            "conversation_id": conversation_id,
        }

        with generation_span(disabled=True) as _:
            output = self.get_next_output()

            sequence_number = 0

            # Emit each output item with proper streaming events
            for item in output:
                if isinstance(item, ResponseFunctionToolCall):
                    # First: emit ResponseOutputItemAddedEvent with EMPTY arguments
                    # (this simulates the real streaming behavior that was causing the bug)
                    empty_args_item = ResponseFunctionToolCall(
                        id=item.id,
                        call_id=item.call_id,
                        type=item.type,
                        name=item.name,
                        arguments="",  # EMPTY - this is the bug condition!
                    )

                    yield ResponseOutputItemAddedEvent(
                        item=empty_args_item,
                        output_index=0,
                        type="response.output_item.added",
                        sequence_number=sequence_number,
                    )
                    sequence_number += 1

                # Then: emit ResponseOutputItemDoneEvent with COMPLETE arguments
                yield ResponseOutputItemDoneEvent(
                    item=item,  # This has the complete arguments
                    output_index=0,
                    type="response.output_item.done",
                    sequence_number=sequence_number,
                )
                sequence_number += 1

            # Finally: emit completion
            yield ResponseCompletedEvent(
                type="response.completed",
                response=get_response_obj(output),
                sequence_number=sequence_number,
            )


@function_tool
def calculate_sum(a: int, b: int) -> str:
    """Add two numbers together."""
    return str(a + b)


@function_tool
def format_message(name: str, message: str, urgent: bool = False) -> str:
    """Format a message with name and urgency."""
    prefix = "URGENT: " if urgent else ""
    return f"{prefix}Hello {name}, {message}"


@pytest.mark.asyncio
async def test_streaming_tool_call_arguments_not_empty():
    """Test that tool_called events contain non-empty arguments during streaming."""
    model = StreamingFakeModel()
    agent = Agent(
        name="TestAgent",
        model=model,
        tools=[calculate_sum],
    )

    # Set up a tool call with arguments
    expected_arguments = '{"a": 5, "b": 3}'
    model.set_next_output(
        [
            get_function_tool_call("calculate_sum", expected_arguments, "call_123"),
        ]
    )

    result = Runner.run_streamed(agent, input="Add 5 and 3")

    tool_called_events = []
    async for event in result.stream_events():
        if (
            event.type == "run_item_stream_event"
            and isinstance(event, RunItemStreamEvent)
            and event.name == "tool_called"
        ):
            tool_called_events.append(event)

    # Verify we got exactly one tool_called event
    assert len(tool_called_events) == 1, (
        f"Expected 1 tool_called event, got {len(tool_called_events)}"
    )

    tool_event = tool_called_events[0]

    # Verify the event has the expected structure
    assert hasattr(tool_event.item, "raw_item"), "tool_called event should have raw_item"
    assert hasattr(tool_event.item.raw_item, "arguments"), "raw_item should have arguments field"

    # The critical test: arguments should NOT be empty
    # Cast to ResponseFunctionToolCall since we know that's what it is in our test
    raw_item = cast(ResponseFunctionToolCall, tool_event.item.raw_item)
    actual_arguments = raw_item.arguments
    assert actual_arguments != "", (
        f"Tool call arguments should not be empty, got: '{actual_arguments}'"
    )
    assert actual_arguments is not None, "Tool call arguments should not be None"

    # Verify arguments contain the expected data
    assert actual_arguments == expected_arguments, (
        f"Expected arguments '{expected_arguments}', got '{actual_arguments}'"
    )

    # Verify arguments are valid JSON that can be parsed
    try:
        parsed_args = json.loads(actual_arguments)
        assert parsed_args == {"a": 5, "b": 3}, (
            f"Parsed arguments should match expected values, got {parsed_args}"
        )
    except json.JSONDecodeError as e:
        pytest.fail(
            f"Tool call arguments should be valid JSON, but got: '{actual_arguments}' with error: {e}"  # noqa: E501
        )


@pytest.mark.asyncio
async def test_streaming_tool_call_arguments_complex():
    """Test streaming tool calls with complex arguments including strings and booleans."""
    model = StreamingFakeModel()
    agent = Agent(
        name="TestAgent",
        model=model,
        tools=[format_message],
    )

    # Set up a tool call with complex arguments
    expected_arguments = (
        '{"name": "Alice", "message": "Your meeting is starting soon", "urgent": true}'
    )
    model.set_next_output(
        [
            get_function_tool_call("format_message", expected_arguments, "call_456"),
        ]
    )

    result = Runner.run_streamed(agent, input="Format a message for Alice")

    tool_called_events = []
    async for event in result.stream_events():
        if (
            event.type == "run_item_stream_event"
            and isinstance(event, RunItemStreamEvent)
            and event.name == "tool_called"
        ):
            tool_called_events.append(event)

    assert len(tool_called_events) == 1, (
        f"Expected 1 tool_called event, got {len(tool_called_events)}"
    )

    tool_event = tool_called_events[0]
    # Cast to ResponseFunctionToolCall since we know that's what it is in our test
    raw_item = cast(ResponseFunctionToolCall, tool_event.item.raw_item)
    actual_arguments = raw_item.arguments

    # Critical checks for the regression
    assert actual_arguments != "", "Tool call arguments should not be empty"
    assert actual_arguments is not None, "Tool call arguments should not be None"
    assert actual_arguments == expected_arguments, (
        f"Expected '{expected_arguments}', got '{actual_arguments}'"
    )

    # Verify the complex arguments parse correctly
    parsed_args = json.loads(actual_arguments)
    expected_parsed = {"name": "Alice", "message": "Your meeting is starting soon", "urgent": True}
    assert parsed_args == expected_parsed, f"Parsed arguments should match, got {parsed_args}"


@pytest.mark.asyncio
async def test_streaming_multiple_tool_calls_arguments():
    """Test that multiple tool calls in streaming all have proper arguments."""
    model = StreamingFakeModel()
    agent = Agent(
        name="TestAgent",
        model=model,
        tools=[calculate_sum, format_message],
    )

    # Set up multiple tool calls
    model.set_next_output(
        [
            get_function_tool_call("calculate_sum", '{"a": 10, "b": 20}', "call_1"),
            get_function_tool_call(
                "format_message", '{"name": "Bob", "message": "Test"}', "call_2"
            ),
        ]
    )

    result = Runner.run_streamed(agent, input="Do some calculations")

    tool_called_events = []
    async for event in result.stream_events():
        if (
            event.type == "run_item_stream_event"
            and isinstance(event, RunItemStreamEvent)
            and event.name == "tool_called"
        ):
            tool_called_events.append(event)

    # Should have exactly 2 tool_called events
    assert len(tool_called_events) == 2, (
        f"Expected 2 tool_called events, got {len(tool_called_events)}"
    )

    # Check first tool call
    event1 = tool_called_events[0]
    # Cast to ResponseFunctionToolCall since we know that's what it is in our test
    raw_item1 = cast(ResponseFunctionToolCall, event1.item.raw_item)
    args1 = raw_item1.arguments
    assert args1 != "", "First tool call arguments should not be empty"
    expected_args1 = '{"a": 10, "b": 20}'
    assert args1 == expected_args1, (
        f"First tool call args: expected '{expected_args1}', got '{args1}'"
    )

    # Check second tool call
    event2 = tool_called_events[1]
    # Cast to ResponseFunctionToolCall since we know that's what it is in our test
    raw_item2 = cast(ResponseFunctionToolCall, event2.item.raw_item)
    args2 = raw_item2.arguments
    assert args2 != "", "Second tool call arguments should not be empty"
    expected_args2 = '{"name": "Bob", "message": "Test"}'
    assert args2 == expected_args2, (
        f"Second tool call args: expected '{expected_args2}', got '{args2}'"
    )


@pytest.mark.asyncio
async def test_streaming_tool_call_with_empty_arguments():
    """Test that tool calls with legitimately empty arguments still work correctly."""
    model = StreamingFakeModel()

    @function_tool
    def get_current_time() -> str:
        """Get the current time (no arguments needed)."""
        return "2024-01-15 10:30:00"

    agent = Agent(
        name="TestAgent",
        model=model,
        tools=[get_current_time],
    )

    # Tool call with empty arguments (legitimate case)
    model.set_next_output(
        [
            get_function_tool_call("get_current_time", "{}", "call_time"),
        ]
    )

    result = Runner.run_streamed(agent, input="What time is it?")

    tool_called_events = []
    async for event in result.stream_events():
        if (
            event.type == "run_item_stream_event"
            and isinstance(event, RunItemStreamEvent)
            and event.name == "tool_called"
        ):
            tool_called_events.append(event)

    assert len(tool_called_events) == 1, (
        f"Expected 1 tool_called event, got {len(tool_called_events)}"
    )

    tool_event = tool_called_events[0]
    # Cast to ResponseFunctionToolCall since we know that's what it is in our test
    raw_item = cast(ResponseFunctionToolCall, tool_event.item.raw_item)
    actual_arguments = raw_item.arguments

    # Even "empty" arguments should be "{}", not literally empty string
    assert actual_arguments is not None, "Arguments should not be None"
    assert actual_arguments == "{}", f"Expected empty JSON object '{{}}', got '{actual_arguments}'"

    # Should parse as valid empty JSON
    parsed_args = json.loads(actual_arguments)
    assert parsed_args == {}, f"Should parse to empty dict, got {parsed_args}"
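
Together these tests pin down the consumer-visible contract: by the time a tool_called run item streams out, raw_item.arguments is the complete JSON string (at minimum "{}"). A minimal consumer-side sketch under that contract; the collect_tool_arguments helper and its untyped agent parameter are illustrative, not part of the SDK:

import json
from typing import cast

from openai.types.responses import ResponseFunctionToolCall

from agents import Runner
from agents.stream_events import RunItemStreamEvent


async def collect_tool_arguments(agent, user_input: str) -> list[dict]:
    """Collect parsed tool-call arguments from a streamed run (illustrative)."""
    result = Runner.run_streamed(agent, input=user_input)
    parsed: list[dict] = []
    async for event in result.stream_events():
        if isinstance(event, RunItemStreamEvent) and event.name == "tool_called":
            raw_item = cast(ResponseFunctionToolCall, event.item.raw_item)
            # With the fix in run.py, arguments is the complete JSON string
            # (possibly "{}"), never the empty string from the "added" event.
            parsed.append(json.loads(raw_item.arguments))
    return parsed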
