fix: address failing HITL (human-in-the-loop) error scenarios

mjschock · mjschock · commit cad04648139a · 2025-12-09T15:58:42.000-08:00
diff --git a/src/agents/_run_impl.py b/src/agents/_run_impl.py
@@ -695,6 +695,33 @@ def get_approval_identity(approval: ToolApprovalItem) -> str | None:
             config=run_config,
         )
 
+        # Execute shell calls that were approved
+        shell_results = await cls.execute_shell_calls(
+            agent=agent,
+            calls=processed_response.shell_calls,
+            hooks=hooks,
+            context_wrapper=context_wrapper,
+            config=run_config,
+        )
+
+        # Execute local shell calls that were approved
+        local_shell_results = await cls.execute_local_shell_calls(
+            agent=agent,
+            calls=processed_response.local_shell_calls,
+            hooks=hooks,
+            context_wrapper=context_wrapper,
+            config=run_config,
+        )
+
+        # Execute apply_patch calls that were approved
+        apply_patch_results = await cls.execute_apply_patch_calls(
+            agent=agent,
+            calls=processed_response.apply_patch_calls,
+            hooks=hooks,
+            context_wrapper=context_wrapper,
+            config=run_config,
+        )
+
         # When resuming we receive the original RunItem references; suppress duplicates
         # so history and streaming do not double-emit the same items.
         # Use object IDs since RunItem objects are not hashable
@@ -715,6 +742,15 @@ def append_if_new(item: RunItem) -> None:
         for computer_result in computer_results:
             append_if_new(computer_result)
 
+        for shell_result in shell_results:
+            append_if_new(shell_result)
+
+        for local_shell_result in local_shell_results:
+            append_if_new(local_shell_result)
+
+        for apply_patch_result in apply_patch_results:
+            append_if_new(apply_patch_result)
+
         # Run MCP tools that require approval after they get their approval results
         # Find MCP approval requests that have corresponding ToolApprovalItems in interruptions
         mcp_approval_runs = []
@@ -1043,23 +1079,24 @@ def process_model_response(
                 tools_used.append("code_interpreter")
             elif isinstance(output, LocalShellCall):
                 items.append(ToolCallItem(raw_item=output, agent=agent))
-                if shell_tool:
+                if local_shell_tool:
+                    tools_used.append("local_shell")
+                    local_shell_calls.append(
+                        ToolRunLocalShellCall(tool_call=output, local_shell_tool=local_shell_tool)
+                    )
+                elif shell_tool:
                     tools_used.append(shell_tool.name)
                     shell_calls.append(ToolRunShellCall(tool_call=output, shell_tool=shell_tool))
                 else:
                     tools_used.append("local_shell")
-                    if not local_shell_tool:
-                        _error_tracing.attach_error_to_current_span(
-                            SpanError(
-                                message="Local shell tool not found",
-                                data={},
-                            )
-                        )
-                        raise ModelBehaviorError(
-                            "Model produced local shell call without a local shell tool."
+                    _error_tracing.attach_error_to_current_span(
+                        SpanError(
+                            message="Local shell tool not found",
+                            data={},
                         )
-                    local_shell_calls.append(
-                        ToolRunLocalShellCall(tool_call=output, local_shell_tool=local_shell_tool)
+                    )
+                    raise ModelBehaviorError(
+                        "Model produced local shell call without a local shell tool."
                     )
             elif isinstance(output, ResponseCustomToolCall) and _is_apply_patch_name(
                 output.name, apply_patch_tool
diff --git a/src/agents/items.py b/src/agents/items.py
@@ -441,6 +441,43 @@ def __post_init__(self) -> None:
             else:
                 self.tool_name = None
 
+    def __hash__(self) -> int:
+        """Make ToolApprovalItem hashable so it can be added to sets.
+
+        This is required by _run_impl.py, where pending_hosted_mcp_approvals.add()
+        is called with a ToolApprovalItem.
+        """
+        # Extract call_id or id from raw_item for hashing
+        if isinstance(self.raw_item, dict):
+            call_id = self.raw_item.get("call_id") or self.raw_item.get("id")
+        else:
+            call_id = getattr(self.raw_item, "call_id", None) or getattr(self.raw_item, "id", None)
+
+        # Hash using call_id and tool_name for uniqueness
+        return hash((call_id, self.tool_name))
+
+    def __eq__(self, other: object) -> bool:
+        """Check equality based on call_id and tool_name."""
+        if not isinstance(other, ToolApprovalItem):
+            return False
+
+        # Extract call_id from both items
+        if isinstance(self.raw_item, dict):
+            self_call_id = self.raw_item.get("call_id") or self.raw_item.get("id")
+        else:
+            self_call_id = getattr(self.raw_item, "call_id", None) or getattr(
+                self.raw_item, "id", None
+            )
+
+        if isinstance(other.raw_item, dict):
+            other_call_id = other.raw_item.get("call_id") or other.raw_item.get("id")
+        else:
+            other_call_id = getattr(other.raw_item, "call_id", None) or getattr(
+                other.raw_item, "id", None
+            )
+
+        return self_call_id == other_call_id and self.tool_name == other.tool_name
+
     @property
     def name(self) -> str | None:
         """Returns the tool name if available on the raw item or provided explicitly.
diff --git a/src/agents/result.py b/src/agents/result.py
@@ -162,6 +162,8 @@ class RunResult(RunResultBase):
     _original_input: str | list[TResponseInputItem] | None = field(default=None, repr=False)
     """The original input from the first turn. Unlike `input`, this is never updated during the run.
     Used by to_state() to preserve the correct originalInput when serializing state."""
+    max_turns: int = 10
+    """The maximum number of turns allowed for this run."""
 
     def __post_init__(self) -> None:
         self._last_agent_ref = weakref.ref(self._last_agent)
@@ -218,7 +220,7 @@ def to_state(self) -> Any:
             if original_input_for_state is not None
             else self.input,
             starting_agent=self.last_agent,
-            max_turns=10,  # This will be overridden by the runner
+            max_turns=self.max_turns,
         )
 
         # Populate the state with data from the result
diff --git a/src/agents/run.py b/src/agents/run.py
@@ -919,6 +919,9 @@ async def run(
             # Override context with the state's context if not provided
             if context is None and run_state._context is not None:
                 context = run_state._context.context
+
+            # Override max_turns with the state's max_turns to preserve it across resumption
+            max_turns = run_state._max_turns
         else:
             # Keep original user input separate from session-prepared input
             raw_input = cast(Union[str, list[TResponseInputItem]], input)
@@ -1240,6 +1243,7 @@ def _get_approval_identity(
                                     _tool_use_tracker_snapshot=self._serialize_tool_use_tracker(
                                         tool_use_tracker
                                     ),
+                                    max_turns=max_turns,
                                 )
                                 result._original_input = _copy_str_or_list(original_input)
                                 return result
@@ -1284,6 +1288,7 @@ def _get_approval_identity(
                                     _tool_use_tracker_snapshot=self._serialize_tool_use_tracker(
                                         tool_use_tracker
                                     ),
+                                    max_turns=max_turns,
                                 )
                                 if server_conversation_tracker is None:
                                     # Save both input and output items together at the end.
@@ -1648,6 +1653,7 @@ def _get_approval_identity(
                                 _tool_use_tracker_snapshot=self._serialize_tool_use_tracker(
                                     tool_use_tracker
                                 ),
+                                max_turns=max_turns,
                             )
                             if run_state is not None:
                                 result._current_turn_persisted_item_count = (
@@ -1702,6 +1708,7 @@ def _get_approval_identity(
                                 _tool_use_tracker_snapshot=self._serialize_tool_use_tracker(
                                     tool_use_tracker
                                 ),
+                                max_turns=max_turns,
                             )
                             if run_state is not None:
                                 result._current_turn_persisted_item_count = (
@@ -1940,6 +1947,10 @@ def run_streamed(
             # Use context from RunState if not provided
             if context is None and run_state._context is not None:
                 context = run_state._context.context
+
+            # Override max_turns with the state's max_turns to preserve it across resumption
+            max_turns = run_state._max_turns
+
             # Use context wrapper from RunState
             context_wrapper = cast(RunContextWrapper[TContext], run_state._context)
         else:
diff --git a/src/agents/run_state.py b/src/agents/run_state.py
@@ -681,22 +681,27 @@ def _serialize_current_step(self) -> dict[str, Any] | None:
             return None
 
         # Interruptions are wrapped in a "data" field
+        interruptions_data = []
+        for item in self._current_step.interruptions:
+            if isinstance(item, ToolApprovalItem):
+                interruption_dict = {
+                    "type": "tool_approval_item",
+                    "rawItem": self._camelize_field_names(
+                        item.raw_item.model_dump(exclude_unset=True)
+                        if hasattr(item.raw_item, "model_dump")
+                        else item.raw_item
+                    ),
+                    "agent": {"name": item.agent.name},
+                }
+                # Include tool_name if present
+                if item.tool_name is not None:
+                    interruption_dict["toolName"] = item.tool_name
+                interruptions_data.append(interruption_dict)
+
         return {
             "type": "next_step_interruption",
             "data": {
-                "interruptions": [
-                    {
-                        "type": "tool_approval_item",
-                        "rawItem": self._camelize_field_names(
-                            item.raw_item.model_dump(exclude_unset=True)
-                            if hasattr(item.raw_item, "model_dump")
-                            else item.raw_item
-                        ),
-                        "agent": {"name": item.agent.name},
-                    }
-                    for item in self._current_step.interruptions
-                    if isinstance(item, ToolApprovalItem)
-                ],
+                "interruptions": interruptions_data,
             },
         }
 
@@ -994,8 +999,44 @@ async def from_string(
                     # Normalize field names from JSON format (camelCase)
                     # to Python format (snake_case)
                     normalized_raw_item = _normalize_field_names(item_data["rawItem"])
-                    raw_item = ResponseFunctionToolCall(**normalized_raw_item)
-                    approval_item = ToolApprovalItem(agent=agent, raw_item=raw_item)
+
+                    # Extract tool_name if present (for backwards compatibility)
+                    tool_name = item_data.get("toolName")
+
+                    # Tool approval items can wrap function calls, shell calls, apply_patch calls,
+                    # MCP calls, etc. The raw item's "type" field selects how it is deserialized.
+                    tool_type = normalized_raw_item.get("type")
+
+                    # Try to deserialize based on the type field
+                    try:
+                        if tool_type == "function_call":
+                            raw_item = ResponseFunctionToolCall(**normalized_raw_item)
+                        elif tool_type == "shell_call":
+                            # Shell calls use dict format, not a specific type
+                            raw_item = normalized_raw_item  # type: ignore[assignment]
+                        elif tool_type == "apply_patch_call":
+                            # Apply patch calls use dict format
+                            raw_item = normalized_raw_item  # type: ignore[assignment]
+                        elif tool_type == "hosted_tool_call":
+                            # MCP/hosted tool calls use dict format
+                            raw_item = normalized_raw_item  # type: ignore[assignment]
+                        elif tool_type == "local_shell_call":
+                            # Local shell calls use dict format
+                            raw_item = normalized_raw_item  # type: ignore[assignment]
+                        else:
+                            # Default to trying ResponseFunctionToolCall for backwards compatibility
+                            try:
+                                raw_item = ResponseFunctionToolCall(**normalized_raw_item)
+                            except Exception:
+                                # If that fails, use dict as-is
+                                raw_item = normalized_raw_item  # type: ignore[assignment]
+                    except Exception:
+                        # If deserialization fails, use dict for flexibility
+                        raw_item = normalized_raw_item  # type: ignore[assignment]
+
+                    approval_item = ToolApprovalItem(
+                        agent=agent, raw_item=raw_item, tool_name=tool_name
+                    )
                     interruptions.append(approval_item)
 
             # Import at runtime to avoid circular import
@@ -1172,8 +1213,44 @@ async def from_json(
                     # Normalize field names from JSON format (camelCase)
                     # to Python format (snake_case)
                     normalized_raw_item = _normalize_field_names(item_data["rawItem"])
-                    raw_item = ResponseFunctionToolCall(**normalized_raw_item)
-                    approval_item = ToolApprovalItem(agent=agent, raw_item=raw_item)
+
+                    # Extract tool_name if present (for backwards compatibility)
+                    tool_name = item_data.get("toolName")
+
+                    # Tool approval items can wrap function calls, shell calls, apply_patch calls,
+                    # MCP calls, etc. The raw item's "type" field selects how it is deserialized.
+                    tool_type = normalized_raw_item.get("type")
+
+                    # Try to deserialize based on the type field
+                    try:
+                        if tool_type == "function_call":
+                            raw_item = ResponseFunctionToolCall(**normalized_raw_item)
+                        elif tool_type == "shell_call":
+                            # Shell calls use dict format, not a specific type
+                            raw_item = normalized_raw_item  # type: ignore[assignment]
+                        elif tool_type == "apply_patch_call":
+                            # Apply patch calls use dict format
+                            raw_item = normalized_raw_item  # type: ignore[assignment]
+                        elif tool_type == "hosted_tool_call":
+                            # MCP/hosted tool calls use dict format
+                            raw_item = normalized_raw_item  # type: ignore[assignment]
+                        elif tool_type == "local_shell_call":
+                            # Local shell calls use dict format
+                            raw_item = normalized_raw_item  # type: ignore[assignment]
+                        else:
+                            # Default to trying ResponseFunctionToolCall for backwards compatibility
+                            try:
+                                raw_item = ResponseFunctionToolCall(**normalized_raw_item)
+                            except Exception:
+                                # If that fails, use dict as-is
+                                raw_item = normalized_raw_item  # type: ignore[assignment]
+                    except Exception:
+                        # If deserialization fails, use dict for flexibility
+                        raw_item = normalized_raw_item  # type: ignore[assignment]
+
+                    approval_item = ToolApprovalItem(
+                        agent=agent, raw_item=raw_item, tool_name=tool_name
+                    )
                     interruptions.append(approval_item)
 
             # Import at runtime to avoid circular import
@@ -1575,8 +1652,40 @@ def _deserialize_items(
                 result.append(MessageOutputItem(agent=agent, raw_item=raw_item_msg))
 
             elif item_type == "tool_call_item":
-                raw_item_tool = ResponseFunctionToolCall(**normalized_raw_item)
-                result.append(ToolCallItem(agent=agent, raw_item=raw_item_tool))
+                # Tool call items can be function calls, shell calls, apply_patch calls,
+                # MCP calls, etc. Check the type field to determine which type to deserialize as
+                tool_type = normalized_raw_item.get("type")
+
+                # Try to deserialize based on the type field
+                # If deserialization fails, fall back to using the dict as-is
+                try:
+                    if tool_type == "function_call":
+                        raw_item_tool = ResponseFunctionToolCall(**normalized_raw_item)
+                    elif tool_type == "shell_call":
+                        # Shell calls use dict format, not a specific type
+                        raw_item_tool = normalized_raw_item  # type: ignore[assignment]
+                    elif tool_type == "apply_patch_call":
+                        # Apply patch calls use dict format
+                        raw_item_tool = normalized_raw_item  # type: ignore[assignment]
+                    elif tool_type == "hosted_tool_call":
+                        # MCP/hosted tool calls use dict format
+                        raw_item_tool = normalized_raw_item  # type: ignore[assignment]
+                    elif tool_type == "local_shell_call":
+                        # Local shell calls use dict format
+                        raw_item_tool = normalized_raw_item  # type: ignore[assignment]
+                    else:
+                        # Default to trying ResponseFunctionToolCall for backwards compatibility
+                        try:
+                            raw_item_tool = ResponseFunctionToolCall(**normalized_raw_item)
+                        except Exception:
+                            # If that fails, use dict as-is
+                            raw_item_tool = normalized_raw_item  # type: ignore[assignment]
+
+                    result.append(ToolCallItem(agent=agent, raw_item=raw_item_tool))
+                except Exception:
+                    # If deserialization fails, use dict for flexibility
+                    raw_item_tool = normalized_raw_item  # type: ignore[assignment]
+                    result.append(ToolCallItem(agent=agent, raw_item=raw_item_tool))
 
             elif item_type == "tool_call_output_item":
                 # For tool call outputs, validate and convert the raw dict