Add audio utils to handle model audio input

Damian Fastowiec · Damian Fastowiec · commit 611734c0413b · 2025-02-24T22:09:41.000+01:00
diff --git a/dspy/__init__.py b/dspy/__init__.py
@@ -8,7 +8,7 @@
 
 from dspy.evaluate import Evaluate  # isort: skip
 from dspy.clients import *  # isort: skip
-from dspy.adapters import Adapter, ChatAdapter, JSONAdapter, Image  # isort: skip
+from dspy.adapters import Adapter, ChatAdapter, JSONAdapter, Image, Audio  # isort: skip
 from dspy.utils.logging_utils import configure_dspy_loggers, disable_logging, enable_logging
 from dspy.utils.asyncify import asyncify
 from dspy.utils.saving import load
diff --git a/dspy/adapters/__init__.py b/dspy/adapters/__init__.py
@@ -1,11 +1,19 @@
 from dspy.adapters.base import Adapter
 from dspy.adapters.chat_adapter import ChatAdapter
 from dspy.adapters.json_adapter import JSONAdapter
-from dspy.adapters.image_utils import Image
+from dspy.adapters.image_utils import Image, encode_image, is_image
+from dspy.adapters.audio_utils import Audio, encode_audio, is_audio
+from dspy.adapters.media_utils import try_expand_media_tags
 
 __all__ = [
-    "Adapter",
-    "ChatAdapter",
-    "JSONAdapter",
-    "Image",
+    'Adapter',
+    'ChatAdapter',
+    'JSONAdapter',
+    'Image',
+    'Audio',
+    'encode_image',
+    'encode_audio',
+    'is_image',
+    'is_audio',
+    'try_expand_media_tags',
 ]
diff --git a/dspy/adapters/chat_adapter.py b/dspy/adapters/chat_adapter.py
@@ -11,11 +11,11 @@
 from pydantic.fields import FieldInfo
 
 from dspy.adapters.base import Adapter
-from dspy.adapters.image_utils import try_expand_image_tags
 from dspy.adapters.utils import format_field_value, get_annotation_name, parse_value
 from dspy.signatures.field import OutputField
 from dspy.signatures.signature import Signature, SignatureMeta
 from dspy.signatures.utils import get_dspy_field_type
+from dspy.adapters.media_utils import try_expand_media_tags
 
 field_header_pattern = re.compile(r"\[\[ ## (\w+) ## \]\]")
 
@@ -54,7 +54,7 @@ def format(self, signature: Signature, demos: list[dict[str, Any]], inputs: dict
             messages.append(format_turn(signature, demo, role="assistant", incomplete=demo in incomplete_demos))
 
         messages.append(format_turn(signature, inputs, role="user"))
-        messages = try_expand_image_tags(messages)
+        messages = try_expand_media_tags(messages)
         return messages
 
     def parse(self, signature, completion):
diff --git a/dspy/adapters/image_utils.py b/dspy/adapters/image_utils.py
@@ -18,14 +18,14 @@
 
 class Image(pydantic.BaseModel):
     url: str
-    
+
     model_config = {
-        'frozen': True,
-        'str_strip_whitespace': True,
-        'validate_assignment': True,
-        'extra': 'forbid',
+        "frozen": True,
+        "str_strip_whitespace": True,
+        "validate_assignment": True,
+        "extra": "forbid",
     }
-        
+
     @pydantic.model_validator(mode="before")
     @classmethod
     def validate_input(cls, values):
@@ -68,6 +68,7 @@ def __repr__(self):
             return f"Image(url=data:image/{image_type};base64,<IMAGE_BASE_64_ENCODED({str(len_base64)})>)"
         return f"Image(url='{self.url}')"
 
+
 def is_url(string: str) -> bool:
     """Check if a string is a valid URL."""
     try:
@@ -77,7 +78,9 @@ def is_url(string: str) -> bool:
         return False
 
 
-def encode_image(image: Union[str, bytes, "PILImage.Image", dict], download_images: bool = False) -> str:
+def encode_image(
+    image: Union[str, bytes, "PILImage.Image", dict], download_images: bool = False
+) -> str:
     """
     Encode an image to a base64 data URI.
 
@@ -150,7 +153,8 @@ def _encode_image_from_url(image_url: str) -> str:
     encoded_image = base64.b64encode(response.content).decode("utf-8")
     return f"data:image/{file_extension};base64,{encoded_image}"
 
-def _encode_pil_image(image: 'PILImage') -> str:
+
+def _encode_pil_image(image: "PILImage") -> str:
     """Encode a PIL Image object to a base64 data URI."""
     buffered = io.BytesIO()
     file_extension = (image.format or "PNG").lower()
@@ -177,52 +181,3 @@ def is_image(obj) -> bool:
         elif is_url(obj):
             return True
     return False
-
-def try_expand_image_tags(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    """Try to expand image tags in the messages."""
-    for message in messages:
-        # NOTE: Assumption that content is a string
-        if "content" in message and "<DSPY_IMAGE_START>" in message["content"]:
-            message["content"] = expand_image_tags(message["content"])
-    return messages
-
-def expand_image_tags(text: str) -> Union[str, List[Dict[str, Any]]]:
-    """Expand image tags in the text. If there are any image tags, 
-    turn it from a content string into a content list of texts and image urls.
-    
-    Args:
-        text: The text content that may contain image tags
-        
-    Returns:
-        Either the original string if no image tags, or a list of content dicts
-        with text and image_url entries
-    """
-    image_tag_regex = r'"?<DSPY_IMAGE_START>(.*?)<DSPY_IMAGE_END>"?'
-    
-    # If no image tags, return original text
-    if not re.search(image_tag_regex, text):
-        return text
-        
-    final_list = []
-    remaining_text = text
-    
-    while remaining_text:
-        match = re.search(image_tag_regex, remaining_text)
-        if not match:
-            if remaining_text.strip():
-                final_list.append({"type": "text", "text": remaining_text.strip()})
-            break
-            
-        # Get text before the image tag
-        prefix = remaining_text[:match.start()].strip()
-        if prefix:
-            final_list.append({"type": "text", "text": prefix})
-            
-        # Add the image
-        image_url = match.group(1)
-        final_list.append({"type": "image_url", "image_url": {"url": image_url}})
-        
-        # Update remaining text
-        remaining_text = remaining_text[match.end():].strip()
-    
-    return final_list
diff --git a/dspy/adapters/json_adapter.py b/dspy/adapters/json_adapter.py
@@ -14,6 +14,7 @@
 
 from dspy.adapters.base import Adapter
 from dspy.adapters.image_utils import Image
+from dspy.adapters.audio_utils import Audio
 from dspy.adapters.utils import parse_value, format_field_value, get_annotation_name, serialize_for_json
 from dspy.signatures.signature import SignatureMeta
 from dspy.signatures.utils import get_dspy_field_type
@@ -131,7 +132,7 @@ def _format_field_value(field_info: FieldInfo, value: Any) -> str:
       The formatted value of the field, represented as a string.
     """
     # TODO: Wasnt this easy to fix?
-    if field_info.annotation is Image:
+    if field_info.annotation is Image or field_info.annotation is Audio:
         raise NotImplementedError("Images are not yet supported in JSON mode.")
 
     return format_field_value(field_info=field_info, value=value)
diff --git a/dspy/signatures/signature.py b/dspy/signatures/signature.py
@@ -28,6 +28,7 @@ class MySignature(dspy.Signature):
 from pydantic.fields import FieldInfo
 
 from dspy.adapters.image_utils import Image  # noqa: F401
+from dspy.adapters.audio_utils import Audio  # noqa: F401
 from dspy.signatures.field import InputField, OutputField