Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 105 additions & 0 deletions examples/voice_agents/instructions_per_modality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import logging
from datetime import datetime

from dotenv import load_dotenv

from livekit.agents import (
Agent,
AgentServer,
AgentSession,
JobContext,
JobProcess,
cli,
function_tool,
inference,
)
from livekit.agents.llm import Instructions
from livekit.plugins import silero

# Named logger for this example; used by the booking tool to record each booking.
logger = logging.getLogger("instructions-per-modality")

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the LiveKit / inference credentials — confirm.
load_dotenv()

# Prompt template shared by both modalities. It is rendered with str.format and
# expects two fields: `modality_specific` (the audio or text guidance block) and
# `current_date`.
# The backslash right after the opening quotes is a line continuation, so the
# prompt starts directly with "You are ..." (no stray backslash or leading newline).
BASE_INSTRUCTIONS = """\
You are a scheduling assistant named Alex that helps users book appointments.
{modality_specific}
Call `book_appointment` to finalise the booking.
Never invent or assume details the user did not provide — ask for them instead.
The current date is {current_date}.
"""

# Guidance block for the audio (voice) variant of the prompt.
# Voice users speak in approximate, self-correcting natural language, so this
# block tells the LLM how to parse what was said, not how to say things back.
# It is substituted for `{modality_specific}` in BASE_INSTRUCTIONS; the literal
# begins and ends with a newline, so it renders surrounded by blank lines.
AUDIO_SPECIFIC = """
The user is speaking — their input arrives as voice transcription and may be imperfect.
When interpreting what the user said:
- Resolve relative spoken expressions to a concrete date/time: 'next Tuesday', 'tomorrow afternoon', 'the week after next around 3'.
- Spoken numbers may be ambiguous: 'three thirty' could mean 3:30 PM or the 30th of March — ask for clarification when context does not make it obvious.
- Honor verbal self-corrections: if the user says 'wait, I meant Thursday not Tuesday', update your understanding to Thursday and discard Tuesday.
- Ignore filler words and hesitations ('um', 'uh', 'like', 'I guess').
- Always confirm the resolved date and time out loud before booking, since spoken input is inherently ambiguous.
"""

# Guidance block for the text (typed) variant of the prompt.
# Text users type precise values, so there is no need to normalise spoken
# patterns; unlike the audio block, it allows booking without a confirmation
# step when the typed date and time are complete and unambiguous.
# It is substituted for `{modality_specific}` in BASE_INSTRUCTIONS.
TEXT_SPECIFIC = """
The user is typing — take their input literally.
When interpreting what the user wrote:
- Accept exact dates and times in any common format (ISO, natural language, 12-hour or 24-hour clock).
- If the user provides a complete and unambiguous date and time, you may book immediately without asking for confirmation.
- Only ask follow-up questions for genuinely missing information.
"""


class SchedulingAgent(Agent):
    """Scheduling assistant whose prompt differs per input modality.

    The shared ``BASE_INSTRUCTIONS`` template is rendered twice, once with the
    voice guidance and once with the typed-text guidance, and both variants are
    passed to the base ``Agent`` wrapped in a single ``Instructions`` object.
    """

    def __init__(self) -> None:
        # "%A" appends the weekday name (e.g. "2025-01-07 Tuesday"), giving the
        # model an anchor for relative expressions such as "next Tuesday".
        current_date = datetime.now().strftime("%Y-%m-%d %A")
        super().__init__(
            instructions=Instructions(
                audio=BASE_INSTRUCTIONS.format(
                    modality_specific=AUDIO_SPECIFIC, current_date=current_date
                ),
                text=BASE_INSTRUCTIONS.format(
                    modality_specific=TEXT_SPECIFIC, current_date=current_date
                ),
            )
        )

    async def on_enter(self) -> None:
        # Asks the session to produce a reply without any user input.
        # NOTE(review): the hook name suggests this runs when the agent becomes
        # active — confirm against the Agent base class.
        self.session.generate_reply()

    # The docstring below is kept verbatim: the tool decorator may use it as the
    # tool description shown to the LLM. The return annotation is `str` because
    # the confirmation message is returned as the tool result.
    @function_tool
    async def book_appointment(self, date: str, time: str) -> str:
        """Book an appointment.

        Args:
            date: The date of the appointment in the format YYYY-MM-DD
            time: The time of the appointment in the format HH:MM
        """
        logger.info(f"booking appointment for {date} at {time}")
        return f"Appointment booked for {date} at {time}"
Comment on lines +71 to +79
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 book_appointment return type annotation is None but function returns a string

The book_appointment method at line 71 declares -> None but actually returns f"Appointment booked for {date} at {time}" at line 79. Because the function_tool decorator inspects the return type annotation to decide how to handle tool output, a -> None annotation may cause the framework to discard the return value, meaning the LLM never receives the booking confirmation string. This would make the agent unable to confirm to the user that the booking succeeded.

Suggested change
async def book_appointment(self, date: str, time: str) -> None:
"""Book an appointment.
Args:
date: The date of the appointment in the format YYYY-MM-DD
time: The time of the appointment in the format HH:MM
"""
logger.info(f"booking appointment for {date} at {time}")
return f"Appointment booked for {date} at {time}"
@function_tool
async def book_appointment(self, date: str, time: str) -> str:
"""Book an appointment.
Args:
date: The date of the appointment in the format YYYY-MM-DD
time: The time of the appointment in the format HH:MM
"""
logger.info(f"booking appointment for {date} at {time}")
return f"Appointment booked for {date} at {time}"
Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.



# Module-level server instance: `prewarm` is assigned as its setup function and
# `entrypoint` is registered on it through the `rtc_session()` decorator.
server = AgentServer()


def prewarm(proc: JobProcess) -> None:
    """Load the Silero VAD model and keep it in the process's ``userdata``.

    The model is stored under the ``"vad"`` key, which is the key ``entrypoint``
    reads when it builds the session.
    """
    vad_model = silero.VAD.load()
    proc.userdata["vad"] = vad_model


# Register `prewarm` as the server's setup function.
# NOTE(review): presumably invoked once per job process before any session
# starts, so the VAD model is already loaded — confirm against AgentServer.
server.setup_fnc = prewarm


@server.rtc_session()
async def entrypoint(ctx: JobContext) -> None:
    """Build the voice pipeline for a job and start the scheduling agent in its room.

    Args:
        ctx: Job context; provides the room to join and the process whose
            ``userdata`` holds the VAD model stored by ``prewarm``.
    """
    # Build each pipeline stage on its own line so the chosen models are easy to spot.
    speech_to_text = inference.STT("deepgram/nova-3")
    language_model = inference.LLM("openai/gpt-4.1-mini")
    text_to_speech = inference.TTS("cartesia/sonic-3")
    preloaded_vad = ctx.proc.userdata["vad"]

    session = AgentSession(
        stt=speech_to_text,
        llm=language_model,
        tts=text_to_speech,
        vad=preloaded_vad,
    )

    await session.start(agent=SchedulingAgent(), room=ctx.room)


# Hand the server to the CLI runner only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    cli.run_app(server)
95 changes: 62 additions & 33 deletions livekit-agents/livekit/agents/beta/workflows/address.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import TYPE_CHECKING

from ... import llm, stt, tts, vad
from ...llm import Instructions
from ...llm.tool_context import ToolError, ToolFlag, function_tool
from ...types import NOT_GIVEN, NotGivenOr
from ...utils import is_given
Expand All @@ -15,6 +16,47 @@
from ...voice.audio_recognition import TurnDetectionMode


# Prompt template shared by the audio and text variants of the address task.
# Rendered with str.format; fields:
#   modality_specific          - the audio or text guidance block
#   confirmation_instructions  - the `confirm_address` sentence, or "" when omitted
#   extra_instructions         - caller-supplied text, or ""
# The trailing backslash after "explicitly called." is a line continuation, so
# `extra_instructions` is appended directly after that sentence with no newline
# or space in between.
_BASE_INSTRUCTIONS = """
You are only a single step in a broader system, responsible solely for capturing an address.
You will be handling addresses from any country.
{modality_specific}
Call `update_address` at the first opportunity whenever you form a new hypothesis about the address. (before asking any questions or providing any answers.)
Don't invent new addresses, stick strictly to what the user said.
{confirmation_instructions}
If the address is unclear or invalid, or it takes too much back-and-forth, prompt for it in parts in this order: street address, unit number if applicable, locality, and country.
Ignore unrelated input and avoid going off-topic. Do not generate markdown, greetings, or unnecessary commentary.
Always explicitly invoke a tool when applicable. Do not simulate tool usage, no real action is taken unless the tool is explicitly called.\
{extra_instructions}
"""

# Guidance block for spoken input, substituted for `{modality_specific}` in
# _BASE_INSTRUCTIONS when building the audio instructions. It covers two things:
# how to normalise spoken address patterns, and how to read addresses back
# (ordinals expanded, postal codes digit-by-digit, letter-by-letter spelling).
_AUDIO_SPECIFIC = """
Expect that users will say address in different formats with fields filled like:
- 'street_address': '450 SOUTH MAIN ST', 'unit_number': 'FLOOR 2', 'locality': 'SALT LAKE CITY UT 84101', 'country': 'UNITED STATES',
- 'street_address': '123 MAPLE STREET', 'unit_number': 'APARTMENT 10', 'locality': 'OTTAWA ON K1A 0B1', 'country': 'CANADA',
- 'street_address': 'GUOMAO JIE 3 HAO, CHAOYANG QU', 'unit_number': 'GUOMAO DA SHA 18 LOU 101 SHI', 'locality': 'BEIJING SHI 100000', 'country': 'CHINA',
- 'street_address': '5 RUE DE L'ANCIENNE COMÉDIE', 'unit_number': 'APP C4', 'locality': '75006 PARIS', 'country': 'FRANCE',
- 'street_address': 'PLOT 10, NEHRU ROAD', 'unit_number': 'OFFICE 403, 4TH FLOOR', 'locality': 'VILE PARLE (E), MUMBAI MAHARASHTRA 400099', 'country': 'INDIA',
Normalize common spoken patterns silently:
- Convert words like 'dash' and 'apostrophe' into symbols: `-`, `'`.
- Convert spelled out numbers like 'six' and 'seven' into numerals: `6`, `7`.
- Recognize patterns where users speak their address field followed by spelling: e.g., 'guomao g u o m a o'.
- Filter out filler words or hesitations.
- Recognize when there may be accents on certain letters if explicitly said or common in the location specified. Be sure to verify the correct accents if existent.
Don't mention corrections. Treat inputs as possibly imperfect but fix them silently.
When reading a numerical ordinal suffix (st, nd, rd, th), the number must be verbally expanded into its full, correctly pronounced word form.
Do not read the number and the suffix letters separately.
Confirm postal codes by reading them out digit-by-digit as a sequence of single numbers. Do not read them as cardinal numbers.
For example, read 90210 as 'nine zero two one zero.'
Avoid using bullet points and parenthese in any responses.
Spell out the address letter-by-letter when applicable, such as street names and provinces, especially when the user spells it out initially.
"""

# Guidance block for typed input, substituted for `{modality_specific}` in
# _BASE_INSTRUCTIONS when building the text instructions. It carries none of the
# spoken-pattern normalisation or read-back rules from the audio block.
_TEXT_SPECIFIC = """
Expect users to type their address directly.
If the address looks almost correct but has minor issues (e.g. missing country or postal code), prompt for clarification.
"""


@dataclass
class GetAddressResult:
address: str
Expand All @@ -34,40 +76,27 @@ def __init__(
allow_interruptions: NotGivenOr[bool] = NOT_GIVEN,
require_confirmation: NotGivenOr[bool] = NOT_GIVEN,
) -> None:
confirmation_instructions = (
"Call `confirm_address` after the user confirmed the address is correct."
)
extra = extra_instructions if extra_instructions else ""

super().__init__(
instructions=(
"You are only a single step in a broader system, responsible solely for capturing an address.\n"
"You will be handling addresses from any country. Expect that users will say address in different formats with fields filled like:\n"
"- 'street_address': '450 SOUTH MAIN ST', 'unit_number': 'FLOOR 2', 'locality': 'SALT LAKE CITY UT 84101', 'country': 'UNITED STATES',\n"
"- 'street_address': '123 MAPLE STREET', 'unit_number': 'APARTMENT 10', 'locality': 'OTTAWA ON K1A 0B1', 'country': 'CANADA',\n"
"- 'street_address': 'GUOMAO JIE 3 HAO, CHAOYANG QU', 'unit_number': 'GUOMAO DA SHA 18 LOU 101 SHI', 'locality': 'BEIJING SHI 100000', 'country': 'CHINA',\n"
"- 'street_address': '5 RUE DE L'ANCIENNE COMÉDIE', 'unit_number': 'APP C4', 'locality': '75006 PARIS', 'country': 'FRANCE',\n"
"- 'street_address': 'PLOT 10, NEHRU ROAD', 'unit_number': 'OFFICE 403, 4TH FLOOR', 'locality': 'VILE PARLE (E), MUMBAI MAHARASHTRA 400099', 'country': 'INDIA',\n"
"Normalize common spoken patterns silently:\n"
"- Convert words like 'dash' and 'apostrophe' into symbols: `-`, `'`.\n"
"- Convert spelled out numbers like 'six' and 'seven' into numerals: `6`, `7`.\n"
"- Recognize patterns where users speak their address field followed by spelling: e.g., 'guomao g u o m a o'.\n"
"- Filter out filler words or hesitations.\n"
"- Recognize when there may be accents on certain letters if explicitly said or common in the location specified. Be sure to verify the correct accents if existent.\n"
"Don't mention corrections. Treat inputs as possibly imperfect but fix them silently.\n"
"Call `update_address` at the first opportunity whenever you form a new hypothesis about the address. "
"(before asking any questions or providing any answers.) \n"
"Don't invent new addresses, stick strictly to what the user said. \n"
+ (
"Call `confirm_address` after the user confirmed the address is correct. \n"
if require_confirmation is not False
else ""
)
+ "When reading a numerical ordinal suffix (st, nd, rd, th), the number must be verbally expanded into its full, correctly pronounced word form.\n"
"Do not read the number and the suffix letters separately.\n"
"Confirm postal codes by reading them out digit-by-digit as a sequence of single numbers. Do not read them as cardinal numbers.\n"
"For example, read 90210 as 'nine zero two one zero.'\n"
"Avoid using bullet points and parenthese in any responses.\n"
"Spell out the address letter-by-letter when applicable, such as street names and provinces, especially when the user spells it out initially. \n"
"If the address is unclear or invalid, or it takes too much back-and-forth, prompt for it in parts in this order: street address, unit number if applicable, locality, and country. \n"
"Ignore unrelated input and avoid going off-topic. Do not generate markdown, greetings, or unnecessary commentary. \n"
"Always explicitly invoke a tool when applicable. Do not simulate tool usage, no real action is taken unless the tool is explicitly called."
+ extra_instructions
instructions=Instructions(
_BASE_INSTRUCTIONS.format(
modality_specific=_AUDIO_SPECIFIC,
confirmation_instructions=(
confirmation_instructions if require_confirmation is not False else ""
),
extra_instructions=extra,
),
text=_BASE_INSTRUCTIONS.format(
modality_specific=_TEXT_SPECIFIC,
confirmation_instructions=(
confirmation_instructions if require_confirmation is True else ""
),
extra_instructions=extra,
),
),
chat_ctx=chat_ctx,
turn_detection=turn_detection,
Expand Down
81 changes: 54 additions & 27 deletions livekit-agents/livekit/agents/beta/workflows/email_address.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import TYPE_CHECKING

from ... import llm, stt, tts, vad
from ...llm import Instructions
from ...llm.tool_context import ToolError, ToolFlag, function_tool
from ...types import NOT_GIVEN, NotGivenOr
from ...utils import is_given
Expand All @@ -19,6 +20,39 @@
r"^[A-Za-z0-9][A-Za-z0-9._%+\-]*@(?:[A-Za-z0-9](?:[A-Za-z0-9\-]*[A-Za-z0-9])?\.)+[A-Za-z]{2,}$"
)

# Prompt template shared by the audio and text variants of the email task.
# Rendered with str.format; fields:
#   modality_specific          - the audio or text guidance block
#   confirmation_instructions  - the `confirm_email_address` sentence, or "" when omitted
#   extra_instructions         - caller-supplied text, or ""
# The trailing backslash after "explicitly called." is a line continuation, so
# `extra_instructions` is appended directly after that sentence with no newline
# or space in between.
_BASE_INSTRUCTIONS = """
You are only a single step in a broader system, responsible solely for capturing an email address.
{modality_specific}
Call `update_email_address` at the first opportunity whenever you form a new hypothesis about the email. (before asking any questions or providing any answers.)
Don't invent new email addresses, stick strictly to what the user said.
{confirmation_instructions}
If the email is unclear or invalid, or it takes too much back-and-forth, prompt for it in parts: first the part before the '@', then the domain—only if needed.
Ignore unrelated input and avoid going off-topic. Do not generate markdown, greetings, or unnecessary commentary.
Always explicitly invoke a tool when applicable. Do not simulate tool usage, no real action is taken unless the tool is explicitly called.\
{extra_instructions}
"""

# Guidance block for spoken input, substituted for `{modality_specific}` in
# _BASE_INSTRUCTIONS when building the audio instructions. It lists example
# spoken email forms and the silent normalisations to apply (word-to-symbol
# conversion, spelled-out names, filler removal).
_AUDIO_SPECIFIC = """
Handle input as noisy voice transcription. Expect that users will say emails aloud with formats like:
- 'john dot doe at gmail dot com'
- 'susan underscore smith at yahoo dot co dot uk'
- 'dave dash b at protonmail dot com'
- 'jane at example' (partial—prompt for the domain)
- 'theo t h e o at livekit dot io' (name followed by spelling)
Normalize common spoken patterns silently:
- Convert words like 'dot', 'underscore', 'dash', 'plus' into symbols: `.`, `_`, `-`, `+`.
- Convert 'at' to `@`.
- Recognize patterns where users speak their name or a word, followed by spelling: e.g., 'john j o h n'.
- Filter out filler words or hesitations.
- Assume some spelling if contextually obvious (e.g. 'mike b two two' → mikeb22).
Don't mention corrections. Treat inputs as possibly imperfect but fix them silently.
"""

# Guidance block for typed input, substituted for `{modality_specific}` in
# _BASE_INSTRUCTIONS when building the text instructions. Typed addresses are
# expected in standard format, so no spoken-pattern normalisation is described.
_TEXT_SPECIFIC = """
Handle input as typed text. Expect users to type their email address directly in standard format.
If the address looks almost correct but has minor typos (e.g. missing '@' or domain), prompt for clarification.
"""


@dataclass
class GetEmailResult:
Expand All @@ -39,34 +73,27 @@ def __init__(
allow_interruptions: NotGivenOr[bool] = NOT_GIVEN,
require_confirmation: NotGivenOr[bool] = NOT_GIVEN,
) -> None:
confirmation_instructions = (
"Call `confirm_email_address` after the user confirmed the email address is correct."
)
extra = extra_instructions if extra_instructions else ""

super().__init__(
instructions=(
"You are only a single step in a broader system, responsible solely for capturing an email address.\n"
"Handle input as noisy voice transcription. Expect that users will say emails aloud with formats like:\n"
"- 'john dot doe at gmail dot com'\n"
"- 'susan underscore smith at yahoo dot co dot uk'\n"
"- 'dave dash b at protonmail dot com'\n"
"- 'jane at example' (partial—prompt for the domain)\n"
"- 'theo t h e o at livekit dot io' (name followed by spelling)\n"
"Normalize common spoken patterns silently:\n"
"- Convert words like 'dot', 'underscore', 'dash', 'plus' into symbols: `.`, `_`, `-`, `+`.\n"
"- Convert 'at' to `@`.\n"
"- Recognize patterns where users speak their name or a word, followed by spelling: e.g., 'john j o h n'.\n"
"- Filter out filler words or hesitations.\n"
"- Assume some spelling if contextually obvious (e.g. 'mike b two two' → mikeb22).\n"
"Don't mention corrections. Treat inputs as possibly imperfect but fix them silently.\n"
"Call `update_email_address` at the first opportunity whenever you form a new hypothesis about the email. "
"(before asking any questions or providing any answers.) \n"
"Don't invent new email addresses, stick strictly to what the user said. \n"
+ (
"Call `confirm_email_address` after the user confirmed the email address is correct. \n"
if require_confirmation is not False
else ""
)
+ "If the email is unclear or invalid, or it takes too much back-and-forth, prompt for it in parts: first the part before the '@', then the domain—only if needed. \n"
"Ignore unrelated input and avoid going off-topic. Do not generate markdown, greetings, or unnecessary commentary. \n"
"Always explicitly invoke a tool when applicable. Do not simulate tool usage, no real action is taken unless the tool is explicitly called."
+ extra_instructions
instructions=Instructions(
_BASE_INSTRUCTIONS.format(
modality_specific=_AUDIO_SPECIFIC,
confirmation_instructions=(
confirmation_instructions if require_confirmation is not False else ""
),
extra_instructions=extra,
),
text=_BASE_INSTRUCTIONS.format(
modality_specific=_TEXT_SPECIFIC,
confirmation_instructions=(
confirmation_instructions if require_confirmation is True else ""
),
extra_instructions=extra,
),
),
chat_ctx=chat_ctx,
turn_detection=turn_detection,
Expand Down
2 changes: 2 additions & 0 deletions livekit-agents/livekit/agents/llm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
FunctionCall,
FunctionCallOutput,
ImageContent,
Instructions,
MetricsReport,
)
from .fallback_adapter import AvailabilityChangedEvent, FallbackAdapter
Expand Down Expand Up @@ -71,6 +72,7 @@
"AgentConfigUpdate",
"AgentHandoff",
"MetricsReport",
"Instructions",
"ChatItem",
"ChoiceDelta",
"ChatChunk",
Expand Down
Loading