livekit · hudson-worden · Mar 3, 2026 · Mar 4, 2026 · Mar 4, 2026 · Mar 5, 2026
diff --git a/livekit-agents/livekit/agents/voice/agent_activity.py b/livekit-agents/livekit/agents/voice/agent_activity.py
@@ -1453,8 +1453,8 @@ def on_end_of_turn(self, info: _EndOfTurnInfo) -> bool:
                 extra={"user_input": info.new_transcript},
             )
 
-            if self._session._closing:
-                # add user input to chat context
+            if self._session._closing and info.new_transcript != "":
+                # add user input to chat context and skip blank messages
                 user_message = llm.ChatMessage(
                     role="user",
                     content=[info.new_transcript],

diff --git a/livekit-agents/livekit/agents/voice/audio_recognition.py b/livekit-agents/livekit/agents/voice/audio_recognition.py
@@ -69,7 +69,8 @@ async def predict_end_of_turn(
     ) -> float: ...
 
 
-TurnDetectionMode = Literal["stt", "vad", "realtime_llm", "manual"] | _TurnDetector
+TurnDetectionType = Literal["stt", "vad", "realtime_llm", "manual"]
+TurnDetectionMode = TurnDetectionType | _TurnDetector
 """
 The mode of turn detection to use.
 
@@ -121,7 +122,9 @@ def __init__(
         self._turn_detector = turn_detection if not isinstance(turn_detection, str) else None
         self._stt = stt
         self._vad = vad
-        self._turn_detection_mode = turn_detection if isinstance(turn_detection, str) else None
+        self._turn_detection_mode: TurnDetectionType | None = (
+            turn_detection if isinstance(turn_detection, str) else None
+        )
         self._vad_base_turn_detection = self._turn_detection_mode in ("vad", None)
         self._user_turn_committed = False  # true if user turn ended but EOU task not done
 
@@ -521,13 +524,30 @@ async def _on_vad_event(self, ev: vad.VADEvent) -> None:
                 chat_ctx = self._hooks.retrieve_chat_ctx().copy()
                 self._run_eou_detection(chat_ctx)
 
+    def _eou_requires_transcript(self) -> bool:
+        """Return True if EOU detection must wait for a transcript (STT set, mode not "manual"/"vad")."""
+        if self._stt:
+            # _turn_detector is not checked here: it and _turn_detection_mode are mutually exclusive,
+            # so when a _turn_detector is provided the mode is None (and vice versa), handled below.
+            match self._turn_detection_mode:
+                case "stt" | "realtime_llm" | None:
+                    return True
+                case "manual" | "vad":
+                    return False
+                case _:
+                    # defensive fallback for an unrecognized mode: assume a transcript is required
+                    return True
+        else:
+            return False
+
     def _run_eou_detection(self, chat_ctx: llm.ChatContext, skip_reply: bool = False) -> None:
-        if self._stt and not self._audio_transcript and self._turn_detection_mode != "manual":
-            # stt enabled but no transcript yet
+        if not self._audio_transcript and self._eou_requires_transcript():
             return
 
         chat_ctx = chat_ctx.copy()
-        chat_ctx.add_message(role="user", content=self._audio_transcript)
+        if self._audio_transcript != "":
+            # only append when we have a transcript so we don't inject blank user messages
+            chat_ctx.add_message(role="user", content=self._audio_transcript)
         turn_detector = (
             self._turn_detector
             if self._audio_transcript and self._turn_detection_mode != "manual"