livekit · longcw · Mar 9, 2026 · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026
diff --git a/examples/voice_agents/instructions_per_modality.py b/examples/voice_agents/instructions_per_modality.py
@@ -0,0 +1,105 @@
+import logging
+from datetime import datetime
+
+from dotenv import load_dotenv
+
+from livekit.agents import (
+    Agent,
+    AgentServer,
+    AgentSession,
+    JobContext,
+    JobProcess,
+    cli,
+    function_tool,
+    inference,
+)
+from livekit.agents.llm import Instructions
+from livekit.plugins import silero
+
+# Module-level logger shared by the agent and its tools.
+logger = logging.getLogger("instructions-per-modality")
+
+# presumably loads provider credentials from a local .env file — confirm against python-dotenv docs
+load_dotenv()
+
+# Shared prompt skeleton for both modalities. `{modality_specific}` and
+# `{current_date}` are filled in with `str.format` when the agent is constructed.
+# The single backslash right after the opening quotes is a line continuation, so
+# the prompt starts directly with "You are ..." rather than with a stray
+# backslash character or a blank line.
+BASE_INSTRUCTIONS = """\
+You are a scheduling assistant named Alex that helps users book appointments.
+{modality_specific}
+Call `book_appointment` to finalise the booking.
+Never invent or assume details the user did not provide — ask for them instead.
+The current date is {current_date}.
+"""
+
+# Voice users speak in approximate, self-correcting natural language.
+# The LLM needs guidance on how to parse what was said, not how to say things back.
+# Substituted for `{modality_specific}` in BASE_INSTRUCTIONS to build the audio prompt.
+# The literal starts and ends with a newline, so it is surrounded by blank lines
+# once formatted into the base prompt.
+AUDIO_SPECIFIC = """
+The user is speaking — their input arrives as voice transcription and may be imperfect.
+When interpreting what the user said:
+- Resolve relative spoken expressions to a concrete date/time: 'next Tuesday', 'tomorrow afternoon', 'the week after next around 3'.
+- Spoken numbers may be ambiguous: 'three thirty' could mean 3:30 PM or the 30th of March — ask for clarification when context does not make it obvious.
+- Honor verbal self-corrections: if the user says 'wait, I meant Thursday not Tuesday', update your understanding to Thursday and discard Tuesday.
+- Ignore filler words and hesitations ('um', 'uh', 'like', 'I guess').
+- Always confirm the resolved date and time out loud before booking, since spoken input is inherently ambiguous.
+"""
+
+# Text users type precise values — no need to normalise spoken patterns.
+# Substituted for `{modality_specific}` in BASE_INSTRUCTIONS to build the text prompt.
+TEXT_SPECIFIC = """
+The user is typing — take their input literally.
+When interpreting what the user wrote:
+- Accept exact dates and times in any common format (ISO, natural language, 12-hour or 24-hour clock).
+- If the user provides a complete and unambiguous date and time, you may book immediately without asking for confirmation.
+- Only ask follow-up questions for genuinely missing information.
+"""
+
+
+class SchedulingAgent(Agent):
+    """Appointment-booking agent that carries separate prompts for voice and text input."""
+
+    def __init__(self) -> None:
+        # Format the date once so the audio and text prompts agree on "today".
+        current_date = datetime.now().strftime("%Y-%m-%d %A")
+        super().__init__(
+            instructions=Instructions(
+                audio=BASE_INSTRUCTIONS.format(
+                    modality_specific=AUDIO_SPECIFIC, current_date=current_date
+                ),
+                text=BASE_INSTRUCTIONS.format(
+                    modality_specific=TEXT_SPECIFIC, current_date=current_date
+                ),
+            )
+        )
+
+    async def on_enter(self) -> None:
+        # NOTE(review): presumably makes the agent speak first when it becomes
+        # active — confirm against the AgentSession.generate_reply documentation.
+        self.session.generate_reply()
+
+    # The tool is defined exactly once and annotated `-> str` because it returns
+    # the confirmation message. The docstring is left untouched on purpose.
+    # NOTE(review): presumably function_tool exposes it as the tool description — confirm.
+    @function_tool
+    async def book_appointment(self, date: str, time: str) -> str:
+        """Book an appointment.
+
+        Args:
+            date: The date of the appointment in the format YYYY-MM-DD
+            time: The time of the appointment in the format HH:MM
+        """
+        logger.info(f"booking appointment for {date} at {time}")
+        return f"Appointment booked for {date} at {time}"
+
+
+server = AgentServer()
+
+
+def prewarm(proc: JobProcess) -> None:
+    """Load the Silero VAD model and store it in the process userdata under "vad"."""
+    vad_model = silero.VAD.load()
+    proc.userdata["vad"] = vad_model
+
+
+# Register prewarm as the server's setup function.
+server.setup_fnc = prewarm
+
+
+@server.rtc_session()
+async def entrypoint(ctx: JobContext) -> None:
+    """Build the STT/LLM/TTS/VAD session and start it with a SchedulingAgent in the job's room."""
+    speech_to_text = inference.STT("deepgram/nova-3")
+    language_model = inference.LLM("openai/gpt-4.1-mini")
+    text_to_speech = inference.TTS("cartesia/sonic-3")
+    # Expected to have been stored under "vad" by prewarm.
+    voice_activity = ctx.proc.userdata["vad"]
+
+    session = AgentSession(
+        stt=speech_to_text,
+        llm=language_model,
+        tts=text_to_speech,
+        vad=voice_activity,
+    )
+
+    scheduling_agent = SchedulingAgent()
+    await session.start(agent=scheduling_agent, room=ctx.room)
+
+
+# Script entry point: hand the configured server to the agents CLI runner.
+if __name__ == "__main__":
+    cli.run_app(server)
diff --git a/livekit-agents/livekit/agents/beta/workflows/address.py b/livekit-agents/livekit/agents/beta/workflows/address.py
@@ -4,6 +4,7 @@
 from typing import TYPE_CHECKING
 
 from ... import llm, stt, tts, vad
+from ...llm import Instructions
 from ...llm.tool_context import ToolError, ToolFlag, function_tool
 from ...types import NOT_GIVEN, NotGivenOr
 from ...utils import is_given
@@ -15,6 +16,47 @@
     from ...voice.audio_recognition import TurnDetectionMode
 
 
+# Prompt skeleton shared by the audio and text variants. Placeholders:
+#   {modality_specific}         - _AUDIO_SPECIFIC or _TEXT_SPECIFIC
+#   {confirmation_instructions} - the `confirm_address` hint, or "" when it is omitted
+#   {extra_instructions}        - caller-supplied text; the trailing backslash on the
+#                                 line before it is a line continuation, so the extra
+#                                 text is appended directly after the last sentence
+_BASE_INSTRUCTIONS = """
+You are only a single step in a broader system, responsible solely for capturing an address.
+You will be handling addresses from any country.
+{modality_specific}
+Call `update_address` at the first opportunity whenever you form a new hypothesis about the address. (before asking any questions or providing any answers.)
+Don't invent new addresses, stick strictly to what the user said.
+{confirmation_instructions}
+If the address is unclear or invalid, or it takes too much back-and-forth, prompt for it in parts in this order: street address, unit number if applicable, locality, and country.
+Ignore unrelated input and avoid going off-topic. Do not generate markdown, greetings, or unnecessary commentary.
+Always explicitly invoke a tool when applicable. Do not simulate tool usage, no real action is taken unless the tool is explicitly called.\
+{extra_instructions}
+"""
+
+# Voice-only guidance: how to interpret spoken addresses and how to read them
+# back (ordinals, postal codes, spelling). Substituted for `{modality_specific}`
+# in _BASE_INSTRUCTIONS for the audio prompt.
+_AUDIO_SPECIFIC = """
+Expect that users will say address in different formats with fields filled like:
+- 'street_address': '450 SOUTH MAIN ST', 'unit_number': 'FLOOR 2', 'locality': 'SALT LAKE CITY UT 84101', 'country': 'UNITED STATES',
+- 'street_address': '123 MAPLE STREET', 'unit_number': 'APARTMENT 10', 'locality': 'OTTAWA ON K1A 0B1', 'country': 'CANADA',
+- 'street_address': 'GUOMAO JIE 3 HAO, CHAOYANG QU', 'unit_number': 'GUOMAO DA SHA 18 LOU 101 SHI', 'locality': 'BEIJING SHI 100000', 'country': 'CHINA',
+- 'street_address': '5 RUE DE L'ANCIENNE COMÉDIE', 'unit_number': 'APP C4', 'locality': '75006 PARIS', 'country': 'FRANCE',
+- 'street_address': 'PLOT 10, NEHRU ROAD', 'unit_number': 'OFFICE 403, 4TH FLOOR', 'locality': 'VILE PARLE (E), MUMBAI MAHARASHTRA 400099', 'country': 'INDIA',
+Normalize common spoken patterns silently:
+- Convert words like 'dash' and 'apostrophe' into symbols: `-`, `'`.
+- Convert spelled out numbers like 'six' and 'seven' into numerals: `6`, `7`.
+- Recognize patterns where users speak their address field followed by spelling: e.g., 'guomao g u o m a o'.
+- Filter out filler words or hesitations.
+- Recognize when there may be accents on certain letters if explicitly said or common in the location specified. Be sure to verify the correct accents if existent.
+Don't mention corrections. Treat inputs as possibly imperfect but fix them silently.
+When reading a numerical ordinal suffix (st, nd, rd, th), the number must be verbally expanded into its full, correctly pronounced word form.
+Do not read the number and the suffix letters separately.
+Confirm postal codes by reading them out digit-by-digit as a sequence of single numbers. Do not read them as cardinal numbers.
+For example, read 90210 as 'nine zero two one zero.'
+Avoid using bullet points and parentheses in any responses.
+Spell out the address letter-by-letter when applicable, such as street names and provinces, especially when the user spells it out initially.
+"""
+
+# Text-only guidance: typed addresses need no spoken-pattern normalisation.
+# Substituted for `{modality_specific}` in _BASE_INSTRUCTIONS for the text prompt.
+_TEXT_SPECIFIC = """
+Expect users to type their address directly.
+If the address looks almost correct but has minor issues (e.g. missing country or postal code), prompt for clarification.
+"""
+
+
 @dataclass
 class GetAddressResult:
     address: str
@@ -34,40 +76,27 @@ def __init__(
         allow_interruptions: NotGivenOr[bool] = NOT_GIVEN,
         require_confirmation: NotGivenOr[bool] = NOT_GIVEN,
     ) -> None:
+        confirmation_instructions = (
+            "Call `confirm_address` after the user confirmed the address is correct."
+        )
+        extra = extra_instructions if extra_instructions else ""
+
         super().__init__(
-            instructions=(
-                "You are only a single step in a broader system, responsible solely for capturing an address.\n"
-                "You will be handling addresses from any country. Expect that users will say address in different formats with fields filled like:\n"
-                "- 'street_address': '450 SOUTH MAIN ST', 'unit_number': 'FLOOR 2', 'locality': 'SALT LAKE CITY UT 84101', 'country': 'UNITED STATES',\n"
-                "- 'street_address': '123 MAPLE STREET', 'unit_number': 'APARTMENT 10', 'locality': 'OTTAWA ON K1A 0B1', 'country': 'CANADA',\n"
-                "- 'street_address': 'GUOMAO JIE 3 HAO, CHAOYANG QU', 'unit_number': 'GUOMAO DA SHA 18 LOU 101 SHI', 'locality': 'BEIJING SHI 100000', 'country': 'CHINA',\n"
-                "- 'street_address': '5 RUE DE L'ANCIENNE COMÉDIE', 'unit_number': 'APP C4', 'locality': '75006 PARIS', 'country': 'FRANCE',\n"
-                "- 'street_address': 'PLOT 10, NEHRU ROAD', 'unit_number': 'OFFICE 403, 4TH FLOOR', 'locality': 'VILE PARLE (E), MUMBAI MAHARASHTRA 400099', 'country': 'INDIA',\n"
-                "Normalize common spoken patterns silently:\n"
-                "- Convert words like 'dash' and 'apostrophe' into symbols: `-`, `'`.\n"
-                "- Convert spelled out numbers like 'six' and 'seven' into numerals: `6`, `7`.\n"
-                "- Recognize patterns where users speak their address field followed by spelling: e.g., 'guomao g u o m a o'.\n"
-                "- Filter out filler words or hesitations.\n"
-                "- Recognize when there may be accents on certain letters if explicitly said or common in the location specified. Be sure to verify the correct accents if existent.\n"
-                "Don't mention corrections. Treat inputs as possibly imperfect but fix them silently.\n"
-                "Call `update_address` at the first opportunity whenever you form a new hypothesis about the address. "
-                "(before asking any questions or providing any answers.) \n"
-                "Don't invent new addresses, stick strictly to what the user said. \n"
-                + (
-                    "Call `confirm_address` after the user confirmed the address is correct. \n"
-                    if require_confirmation is not False
-                    else ""
-                )
-                + "When reading a numerical ordinal suffix (st, nd, rd, th), the number must be verbally expanded into its full, correctly pronounced word form.\n"
-                "Do not read the number and the suffix letters separately.\n"
-                "Confirm postal codes by reading them out digit-by-digit as a sequence of single numbers. Do not read them as cardinal numbers.\n"
-                "For example, read 90210 as 'nine zero two one zero.'\n"
-                "Avoid using bullet points and parenthese in any responses.\n"
-                "Spell out the address letter-by-letter when applicable, such as street names and provinces, especially when the user spells it out initially. \n"
-                "If the address is unclear or invalid, or it takes too much back-and-forth, prompt for it in parts in this order: street address, unit number if applicable, locality, and country. \n"
-                "Ignore unrelated input and avoid going off-topic. Do not generate markdown, greetings, or unnecessary commentary. \n"
-                "Always explicitly invoke a tool when applicable. Do not simulate tool usage, no real action is taken unless the tool is explicitly called."
-                + extra_instructions
+            instructions=Instructions(
+                _BASE_INSTRUCTIONS.format(
+                    modality_specific=_AUDIO_SPECIFIC,
+                    confirmation_instructions=(
+                        confirmation_instructions if require_confirmation is not False else ""
+                    ),
+                    extra_instructions=extra,
+                ),
+                text=_BASE_INSTRUCTIONS.format(
+                    modality_specific=_TEXT_SPECIFIC,
+                    confirmation_instructions=(
+                        confirmation_instructions if require_confirmation is True else ""
+                    ),
+                    extra_instructions=extra,
+                ),
             ),
             chat_ctx=chat_ctx,
             turn_detection=turn_detection,

diff --git a/livekit-agents/livekit/agents/beta/workflows/email_address.py b/livekit-agents/livekit/agents/beta/workflows/email_address.py
@@ -5,6 +5,7 @@
 from typing import TYPE_CHECKING
 
 from ... import llm, stt, tts, vad
+from ...llm import Instructions
 from ...llm.tool_context import ToolError, ToolFlag, function_tool
 from ...types import NOT_GIVEN, NotGivenOr
 from ...utils import is_given
@@ -19,6 +20,39 @@
     r"^[A-Za-z0-9][A-Za-z0-9._%+\-]*@(?:[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?\.)+[A-Za-z]{2,}$"
 )
 
+# Prompt skeleton shared by the audio and text variants. `{modality_specific}`
+# takes _AUDIO_SPECIFIC or _TEXT_SPECIFIC, `{confirmation_instructions}` takes the
+# `confirm_email_address` hint or "", and `{extra_instructions}` is appended
+# directly after the last sentence (the trailing backslash is a line continuation).
+_BASE_INSTRUCTIONS = """
+You are only a single step in a broader system, responsible solely for capturing an email address.
+{modality_specific}
+Call `update_email_address` at the first opportunity whenever you form a new hypothesis about the email. (before asking any questions or providing any answers.)
+Don't invent new email addresses, stick strictly to what the user said.
+{confirmation_instructions}
+If the email is unclear or invalid, or it takes too much back-and-forth, prompt for it in parts: first the part before the '@', then the domain—only if needed.
+Ignore unrelated input and avoid going off-topic. Do not generate markdown, greetings, or unnecessary commentary.
+Always explicitly invoke a tool when applicable. Do not simulate tool usage, no real action is taken unless the tool is explicitly called.\
+{extra_instructions}
+"""
+
+# Voice-only guidance: how to turn spoken email patterns ("dot", "at", spelled
+# letters) into a literal address.
+_AUDIO_SPECIFIC = """
+Handle input as noisy voice transcription. Expect that users will say emails aloud with formats like:
+- 'john dot doe at gmail dot com'
+- 'susan underscore smith at yahoo dot co dot uk'
+- 'dave dash b at protonmail dot com'
+- 'jane at example' (partial—prompt for the domain)
+- 'theo t h e o at livekit dot io' (name followed by spelling)
+Normalize common spoken patterns silently:
+- Convert words like 'dot', 'underscore', 'dash', 'plus' into symbols: `.`, `_`, `-`, `+`.
+- Convert 'at' to `@`.
+- Recognize patterns where users speak their name or a word, followed by spelling: e.g., 'john j o h n'.
+- Filter out filler words or hesitations.
+- Assume some spelling if contextually obvious (e.g. 'mike b two two' → mikeb22).
+Don't mention corrections. Treat inputs as possibly imperfect but fix them silently.
+"""
+
+# Text-only guidance: typed addresses are taken as written; only near-misses
+# trigger a clarifying question.
+_TEXT_SPECIFIC = """
+Handle input as typed text. Expect users to type their email address directly in standard format.
+If the address looks almost correct but has minor typos (e.g. missing '@' or domain), prompt for clarification.
+"""
+
 
 @dataclass
 class GetEmailResult:
@@ -39,34 +73,27 @@ def __init__(
         allow_interruptions: NotGivenOr[bool] = NOT_GIVEN,
         require_confirmation: NotGivenOr[bool] = NOT_GIVEN,
     ) -> None:
+        confirmation_instructions = (
+            "Call `confirm_email_address` after the user confirmed the email address is correct."
+        )
+        extra = extra_instructions if extra_instructions else ""
+
         super().__init__(
-            instructions=(
-                "You are only a single step in a broader system, responsible solely for capturing an email address.\n"
-                "Handle input as noisy voice transcription. Expect that users will say emails aloud with formats like:\n"
-                "- 'john dot doe at gmail dot com'\n"
-                "- 'susan underscore smith at yahoo dot co dot uk'\n"
-                "- 'dave dash b at protonmail dot com'\n"
-                "- 'jane at example' (partial—prompt for the domain)\n"
-                "- 'theo t h e o at livekit dot io' (name followed by spelling)\n"
-                "Normalize common spoken patterns silently:\n"
-                "- Convert words like 'dot', 'underscore', 'dash', 'plus' into symbols: `.`, `_`, `-`, `+`.\n"
-                "- Convert 'at' to `@`.\n"
-                "- Recognize patterns where users speak their name or a word, followed by spelling: e.g., 'john j o h n'.\n"
-                "- Filter out filler words or hesitations.\n"
-                "- Assume some spelling if contextually obvious (e.g. 'mike b two two' → mikeb22).\n"
-                "Don't mention corrections. Treat inputs as possibly imperfect but fix them silently.\n"
-                "Call `update_email_address` at the first opportunity whenever you form a new hypothesis about the email. "
-                "(before asking any questions or providing any answers.) \n"
-                "Don't invent new email addresses, stick strictly to what the user said. \n"
-                + (
-                    "Call `confirm_email_address` after the user confirmed the email address is correct. \n"
-                    if require_confirmation is not False
-                    else ""
-                )
-                + "If the email is unclear or invalid, or it takes too much back-and-forth, prompt for it in parts: first the part before the '@', then the domain—only if needed. \n"
-                "Ignore unrelated input and avoid going off-topic. Do not generate markdown, greetings, or unnecessary commentary. \n"
-                "Always explicitly invoke a tool when applicable. Do not simulate tool usage, no real action is taken unless the tool is explicitly called."
-                + extra_instructions
+            instructions=Instructions(
+                _BASE_INSTRUCTIONS.format(
+                    modality_specific=_AUDIO_SPECIFIC,
+                    confirmation_instructions=(
+                        confirmation_instructions if require_confirmation is not False else ""
+                    ),
+                    extra_instructions=extra,
+                ),
+                text=_BASE_INSTRUCTIONS.format(
+                    modality_specific=_TEXT_SPECIFIC,
+                    confirmation_instructions=(
+                        confirmation_instructions if require_confirmation is True else ""
+                    ),
+                    extra_instructions=extra,
+                ),
             ),
             chat_ctx=chat_ctx,
             turn_detection=turn_detection,

diff --git a/livekit-agents/livekit/agents/llm/__init__.py b/livekit-agents/livekit/agents/llm/__init__.py
@@ -11,6 +11,7 @@
     FunctionCall,
     FunctionCallOutput,
     ImageContent,
+    Instructions,
     MetricsReport,
 )
 from .fallback_adapter import AvailabilityChangedEvent, FallbackAdapter
@@ -71,6 +72,7 @@
     "AgentConfigUpdate",
     "AgentHandoff",
     "MetricsReport",
+    "Instructions",
     "ChatItem",
     "ChoiceDelta",
     "ChatChunk",