Skip to content

Commit c319539

Browse files
committed
Introduce tool_use_behavior on agents
1 parent b09a5bf commit c319539

9 files changed

+355
-24
lines changed

docs/agents.md

+13
Original file line numberDiff line numberDiff line change
@@ -130,3 +130,16 @@ robot_agent = pirate_agent.clone(
130130
instructions="Write like a robot",
131131
)
132132
```
133+
134+
## Forcing tool use
135+
136+
Supplying a list of tools doesn't always mean the LLM will use a tool. You can force tool use by setting [`ModelSettings.tool_choice`][agents.model_settings.ModelSettings.tool_choice]. Valid values are:
137+
138+
1. `auto`, which allows the LLM to decide whether or not to use a tool.
139+
2. `required`, which requires the LLM to use a tool (but it can intelligently decide which tool).
140+
3. `none`, which requires the LLM to _not_ use a tool.
141+
4. Setting a specific string e.g. `my_tool`, which requires the LLM to use that specific tool.
142+
143+
!!! note
144+
145+
If requiring tool use, you should consider setting [`Agent.tool_use_behavior`] to stop the Agent from running when a tool output is produced. Otherwise, the Agent might run in an infinite loop, where the LLM produces a tool call, the tool result is sent back to the LLM, and the loop repeats because the LLM is always forced to use a tool.

src/agents/__init__.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from openai import AsyncOpenAI
66

77
from . import _config
8-
from .agent import Agent
8+
from .agent import Agent, ToolsToFinalOutputFunction, ToolsToFinalOutputResult
99
from .agent_output import AgentOutputSchema
1010
from .computer import AsyncComputer, Button, Computer, Environment
1111
from .exceptions import (
@@ -57,6 +57,7 @@
5757
ComputerTool,
5858
FileSearchTool,
5959
FunctionTool,
60+
FunctionToolResult,
6061
Tool,
6162
WebSearchTool,
6263
default_tool_error_function,
@@ -136,6 +137,8 @@ def enable_verbose_stdout_logging():
136137

137138
__all__ = [
138139
"Agent",
140+
"ToolsToFinalOutputFunction",
141+
"ToolsToFinalOutputResult",
139142
"Runner",
140143
"Model",
141144
"ModelProvider",
@@ -189,6 +192,7 @@ def enable_verbose_stdout_logging():
189192
"AgentUpdatedStreamEvent",
190193
"StreamEvent",
191194
"FunctionTool",
195+
"FunctionToolResult",
192196
"ComputerTool",
193197
"FileSearchTool",
194198
"Tool",

src/agents/_run_impl.py

+75-10
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
from __future__ import annotations
22

33
import asyncio
4+
import inspect
5+
from collections.abc import Awaitable
46
from dataclasses import dataclass
5-
from typing import TYPE_CHECKING, Any
7+
from typing import TYPE_CHECKING, Any, cast
68

79
from openai.types.responses import (
810
ResponseComputerToolCall,
@@ -25,7 +27,7 @@
2527
from openai.types.responses.response_input_param import ComputerCallOutput
2628
from openai.types.responses.response_reasoning_item import ResponseReasoningItem
2729

28-
from .agent import Agent
30+
from .agent import Agent, ToolsToFinalOutputResult
2931
from .agent_output import AgentOutputSchema
3032
from .computer import AsyncComputer, Computer
3133
from .exceptions import AgentsException, ModelBehaviorError, UserError
@@ -48,7 +50,7 @@
4850
from .models.interface import ModelTracing
4951
from .run_context import RunContextWrapper, TContext
5052
from .stream_events import RunItemStreamEvent, StreamEvent
51-
from .tool import ComputerTool, FunctionTool
53+
from .tool import ComputerTool, FunctionTool, FunctionToolResult
5254
from .tracing import (
5355
SpanError,
5456
Trace,
@@ -70,6 +72,8 @@ class QueueCompleteSentinel:
7072

7173
QUEUE_COMPLETE_SENTINEL = QueueCompleteSentinel()
7274

75+
_NOT_FINAL_OUTPUT = ToolsToFinalOutputResult(is_final_output=False, final_output=None)
76+
7377

7478
@dataclass
7579
class ToolRunHandoff:
@@ -199,7 +203,7 @@ async def execute_tools_and_side_effects(
199203
config=run_config,
200204
),
201205
)
202-
new_step_items.extend(function_results)
206+
new_step_items.extend([result.run_item for result in function_results])
203207
new_step_items.extend(computer_results)
204208

205209
# Second, check if there are any handoffs
@@ -216,6 +220,30 @@ async def execute_tools_and_side_effects(
216220
run_config=run_config,
217221
)
218222

223+
# Third, we'll check if the tool use should result in a final output
224+
check_tool_use = await cls._check_for_final_output_from_tools(
225+
agent=agent,
226+
tool_results=function_results,
227+
context_wrapper=context_wrapper,
228+
config=run_config,
229+
)
230+
231+
if check_tool_use.is_final_output:
232+
# If the output type is str, then let's just stringify it
233+
if not agent.output_type or agent.output_type is str:
234+
check_tool_use.final_output = str(check_tool_use.final_output)
235+
236+
return await cls.execute_final_output(
237+
agent=agent,
238+
original_input=original_input,
239+
new_response=new_response,
240+
pre_step_items=pre_step_items,
241+
new_step_items=new_step_items,
242+
final_output=check_tool_use.final_output,
243+
hooks=hooks,
244+
context_wrapper=context_wrapper,
245+
)
246+
219247
# Now we can check if the model also produced a final output
220248
message_items = [item for item in new_step_items if isinstance(item, MessageOutputItem)]
221249

@@ -355,10 +383,10 @@ async def execute_function_tool_calls(
355383
hooks: RunHooks[TContext],
356384
context_wrapper: RunContextWrapper[TContext],
357385
config: RunConfig,
358-
) -> list[RunItem]:
386+
) -> list[FunctionToolResult]:
359387
async def run_single_tool(
360388
func_tool: FunctionTool, tool_call: ResponseFunctionToolCall
361-
) -> str:
389+
) -> Any:
362390
with function_span(func_tool.name) as span_fn:
363391
if config.trace_include_sensitive_data:
364392
span_fn.span_data.input = tool_call.arguments
@@ -404,10 +432,14 @@ async def run_single_tool(
404432
results = await asyncio.gather(*tasks)
405433

406434
return [
407-
ToolCallOutputItem(
408-
output=str(result),
409-
raw_item=ItemHelpers.tool_call_output_item(tool_run.tool_call, str(result)),
410-
agent=agent,
435+
FunctionToolResult(
436+
tool=tool_run.function_tool,
437+
output=result,
438+
run_item=ToolCallOutputItem(
439+
output=result,
440+
raw_item=ItemHelpers.tool_call_output_item(tool_run.tool_call, str(result)),
441+
agent=agent,
442+
),
411443
)
412444
for tool_run, result in zip(tool_runs, results)
413445
]
@@ -646,6 +678,39 @@ def stream_step_result_to_queue(
646678
if event:
647679
queue.put_nowait(event)
648680

681+
@classmethod
682+
async def _check_for_final_output_from_tools(
683+
cls,
684+
*,
685+
agent: Agent[TContext],
686+
tool_results: list[FunctionToolResult],
687+
context_wrapper: RunContextWrapper[TContext],
688+
config: RunConfig,
689+
) -> ToolsToFinalOutputResult:
690+
"""Returns (i, final_output)."""
691+
if not tool_results:
692+
return _NOT_FINAL_OUTPUT
693+
694+
if agent.tool_use_behavior == "run_llm_again":
695+
return _NOT_FINAL_OUTPUT
696+
elif agent.tool_use_behavior == "stop_on_first_tool":
697+
return ToolsToFinalOutputResult(
698+
is_final_output=True, final_output=tool_results[0].output
699+
)
700+
elif callable(agent.tool_use_behavior):
701+
if inspect.iscoroutinefunction(agent.tool_use_behavior):
702+
return await cast(
703+
Awaitable[ToolsToFinalOutputResult],
704+
agent.tool_use_behavior(context_wrapper, tool_results),
705+
)
706+
else:
707+
return cast(
708+
ToolsToFinalOutputResult, agent.tool_use_behavior(context_wrapper, tool_results)
709+
)
710+
else:
711+
logger.error(f"Invalid tool_use_behavior: {agent.tool_use_behavior}")
712+
raise UserError(f"Invalid tool_use_behavior: {agent.tool_use_behavior}")
713+
649714

650715
class TraceCtxManager:
651716
"""Creates a trace only if there is no current trace, and manages the trace lifecycle."""

src/agents/items.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,10 @@ class ToolCallOutputItem(RunItemBase[Union[FunctionCallOutput, ComputerCallOutpu
129129
raw_item: FunctionCallOutput | ComputerCallOutput
130130
"""The raw item from the model."""
131131

132-
output: str
133-
"""The output of the tool call."""
132+
output: Any
133+
"""The output of the tool call. This is whatever the tool call returned; the `raw_item`
134+
contains a string representation of the output.
135+
"""
134136

135137
type: Literal["tool_call_output_item"] = "tool_call_output_item"
136138

src/agents/tool.py

+20-7
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from .computer import AsyncComputer, Computer
1616
from .exceptions import ModelBehaviorError
1717
from .function_schema import DocstringStyle, function_schema
18+
from .items import RunItem
1819
from .logger import logger
1920
from .run_context import RunContextWrapper
2021
from .tracing import SpanError
@@ -29,6 +30,18 @@
2930
ToolFunction = Union[ToolFunctionWithoutContext[ToolParams], ToolFunctionWithContext[ToolParams]]
3031

3132

33+
@dataclass
34+
class FunctionToolResult:
35+
tool: FunctionTool
36+
"""The tool that was run."""
37+
38+
output: Any
39+
"""The output of the tool."""
40+
41+
run_item: RunItem
42+
"""The run item that was produced as a result of the tool call."""
43+
44+
3245
@dataclass
3346
class FunctionTool:
3447
"""A tool that wraps a function. In most cases, you should use the `function_tool` helpers to
@@ -44,15 +57,15 @@ class FunctionTool:
4457
params_json_schema: dict[str, Any]
4558
"""The JSON schema for the tool's parameters."""
4659

47-
on_invoke_tool: Callable[[RunContextWrapper[Any], str], Awaitable[str]]
60+
on_invoke_tool: Callable[[RunContextWrapper[Any], str], Awaitable[Any]]
4861
"""A function that invokes the tool with the given context and parameters. The params passed
4962
are:
5063
1. The tool run context.
5164
2. The arguments from the LLM, as a JSON string.
5265
53-
You must return a string representation of the tool output. In case of errors, you can either
54-
raise an Exception (which will cause the run to fail) or return a string error message (which
55-
will be sent back to the LLM).
66+
You must return a string representation of the tool output, or something we can call `str()` on.
67+
In case of errors, you can either raise an Exception (which will cause the run to fail) or
68+
return a string error message (which will be sent back to the LLM).
5669
"""
5770

5871
strict_json_schema: bool = True
@@ -204,7 +217,7 @@ def _create_function_tool(the_func: ToolFunction[...]) -> FunctionTool:
204217
strict_json_schema=strict_mode,
205218
)
206219

207-
async def _on_invoke_tool_impl(ctx: RunContextWrapper[Any], input: str) -> str:
220+
async def _on_invoke_tool_impl(ctx: RunContextWrapper[Any], input: str) -> Any:
208221
try:
209222
json_data: dict[str, Any] = json.loads(input) if input else {}
210223
except Exception as e:
@@ -251,9 +264,9 @@ async def _on_invoke_tool_impl(ctx: RunContextWrapper[Any], input: str) -> str:
251264
else:
252265
logger.debug(f"Tool {schema.name} returned {result}")
253266

254-
return str(result)
267+
return result
255268

256-
async def _on_invoke_tool(ctx: RunContextWrapper[Any], input: str) -> str:
269+
async def _on_invoke_tool(ctx: RunContextWrapper[Any], input: str) -> Any:
257270
try:
258271
return await _on_invoke_tool_impl(ctx, input)
259272
except Exception as e:

src/agents/tracing/span_data.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def export(self) -> dict[str, Any]:
5151
class FunctionSpanData(SpanData):
5252
__slots__ = ("name", "input", "output")
5353

54-
def __init__(self, name: str, input: str | None, output: str | None):
54+
def __init__(self, name: str, input: str | None, output: Any | None):
5555
self.name = name
5656
self.input = input
5757
self.output = output
@@ -65,7 +65,7 @@ def export(self) -> dict[str, Any]:
6565
"type": self.type,
6666
"name": self.name,
6767
"input": self.input,
68-
"output": self.output,
68+
"output": str(self.output) if self.output else None,
6969
}
7070

7171

tests/test_agent_runner.py

+82
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
UserError,
2222
handoff,
2323
)
24+
from agents.agent import ToolsToFinalOutputResult
25+
from agents.tool import FunctionToolResult, function_tool
2426

2527
from .fake_model import FakeModel
2628
from .test_responses import (
@@ -552,3 +554,83 @@ def guardrail_function(
552554

553555
with pytest.raises(OutputGuardrailTripwireTriggered):
554556
await Runner.run(agent, input="user_message")
557+
558+
559+
@function_tool
560+
def test_tool_one():
561+
return Foo(bar="tool_one_result")
562+
563+
564+
@function_tool
565+
def test_tool_two():
566+
return "tool_two_result"
567+
568+
569+
@pytest.mark.asyncio
570+
async def test_tool_use_behavior_first_output():
571+
model = FakeModel()
572+
agent = Agent(
573+
name="test",
574+
model=model,
575+
tools=[get_function_tool("foo", "tool_result"), test_tool_one, test_tool_two],
576+
tool_use_behavior="stop_on_first_tool",
577+
output_type=Foo,
578+
)
579+
580+
model.add_multiple_turn_outputs(
581+
[
582+
# First turn: a message and tool call
583+
[
584+
get_text_message("a_message"),
585+
get_function_tool_call("test_tool_one", None),
586+
get_function_tool_call("test_tool_two", None),
587+
],
588+
]
589+
)
590+
591+
result = await Runner.run(agent, input="user_message")
592+
593+
assert result.final_output == Foo(bar="tool_one_result"), (
594+
"should have used the first tool result"
595+
)
596+
597+
598+
def custom_tool_use_behavior(
599+
context: RunContextWrapper[Any], results: list[FunctionToolResult]
600+
) -> ToolsToFinalOutputResult:
601+
if "test_tool_one" in [result.tool.name for result in results]:
602+
return ToolsToFinalOutputResult(is_final_output=True, final_output="the_final_output")
603+
else:
604+
return ToolsToFinalOutputResult(is_final_output=False, final_output=None)
605+
606+
607+
@pytest.mark.asyncio
608+
async def test_tool_use_behavior_custom_function():
609+
model = FakeModel()
610+
agent = Agent(
611+
name="test",
612+
model=model,
613+
tools=[get_function_tool("foo", "tool_result"), test_tool_one, test_tool_two],
614+
tool_use_behavior=custom_tool_use_behavior,
615+
)
616+
617+
model.add_multiple_turn_outputs(
618+
[
619+
# First turn: a message and tool call
620+
[
621+
get_text_message("a_message"),
622+
get_function_tool_call("test_tool_two", None),
623+
],
624+
# Second turn: a message and tool call
625+
[
626+
get_text_message("a_message"),
627+
get_function_tool_call("test_tool_one", None),
628+
get_function_tool_call("test_tool_two", None),
629+
],
630+
]
631+
)
632+
633+
result = await Runner.run(agent, input="user_message")
634+
635+
assert len(result.raw_responses) == 2, "should have two model responses"
636+
assert result.final_output == "the_final_output", "should have used the custom function"

0 commit comments

Comments
 (0)