
Fix LLMJudge input handling to preserve BinaryContent as separate message part instead of stringifying #2173

Open · wants to merge 21 commits into main
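This PR routes every judge prompt through a new _build_prompt helper so that multimodal inputs such as BinaryContent are forwarded to the judge agent as separate user-content parts instead of being stringified into the <Input> text. As a hypothetical usage sketch, not part of the PR, assuming BinaryContent(data=..., media_type=...) from pydantic_ai.messages and the module path implied by the file location below:

from pydantic_ai.messages import BinaryContent
from pydantic_evals.evaluators.llm_as_a_judge import judge_input_output


async def grade_image(png_bytes: bytes) -> None:
    # Before this change the BinaryContent would be stringified into the <Input> section;
    # with this change the instance is passed through to the judge as its own message part.
    image = BinaryContent(data=png_bytes, media_type='image/png')
    grading = await judge_input_output(image, 'A cat sitting on a mat', 'Output describes the image')
    print(grading.pass_, grading.score, grading.reason)


# run with: asyncio.run(grade_image(open('cat.png', 'rb').read()))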
97 changes: 45 additions & 52 deletions pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py
@@ -1,12 +1,14 @@
from __future__ import annotations

from collections.abc import Sequence
from textwrap import dedent
from typing import Any

from pydantic import BaseModel, Field
from pydantic_core import to_json

from pydantic_ai import Agent, models
from pydantic_ai.messages import MultiModalContentTypes, UserContent
from pydantic_ai.settings import ModelSettings

__all__ = (
@@ -62,16 +64,7 @@ async def judge_output(
If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
but this can be changed using the `set_default_judge_model` function.
"""
user_prompt = dedent(
f"""
<Output>
{_stringify(output)}
</Output>
<Rubric>
{rubric}
</Rubric>
"""
)
user_prompt = _build_prompt(inputs=None, output=output, rubric=rubric)
return (
await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
).output
@@ -112,19 +105,8 @@ async def judge_input_output(
If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
but this can be changed using the `set_default_judge_model` function.
"""
user_prompt = dedent(
f"""
<Input>
{_stringify(inputs)}
</Input>
<Output>
{_stringify(output)}
</Output>
<Rubric>
{rubric}
</Rubric>
"""
)
user_prompt = _build_prompt(inputs=inputs, output=output, rubric=rubric)

return (
await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
).output
@@ -168,22 +150,7 @@ async def judge_input_output_expected(
If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
but this can be changed using the `set_default_judge_model` function.
"""
user_prompt = dedent(
f"""
<Input>
{_stringify(inputs)}
</Input>
<ExpectedOutput>
{_stringify(expected_output)}
</ExpectedOutput>
<Output>
{_stringify(output)}
</Output>
<Rubric>
{rubric}
</Rubric>
"""
)
user_prompt = _build_prompt(inputs=inputs, output=output, rubric=rubric, expected_output=expected_output)

return (
await _judge_input_output_expected_agent.run(
@@ -227,19 +194,7 @@ async def judge_output_expected(
If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
but this can be changed using the `set_default_judge_model` function.
"""
user_prompt = dedent(
f"""
<ExpectedOutput>
{_stringify(expected_output)}
</ExpectedOutput>
<Output>
{_stringify(output)}
</Output>
<Rubric>
{rubric}
</Rubric>
"""
)
user_prompt = _build_prompt(inputs=None, output=output, rubric=rubric, expected_output=expected_output)
return (
await _judge_output_expected_agent.run(
user_prompt, model=model or _default_model, model_settings=model_settings
@@ -265,3 +220,41 @@ def _stringify(value: Any) -> str:
return to_json(value).decode()
except Exception:
return repr(value)


def _build_prompt(
inputs: Any,
output: Any,
rubric: str,
expected_output: Any | None = None,
) -> str | Sequence[str | UserContent]:
"""Build a prompt that includes input, output, and rubric."""
sections: list[str | UserContent] = []

if inputs is not None:
if isinstance(inputs, str):
sections.append(f'<Input>\n{inputs}\n</Input>')
else:
sections.append('<Input>\n')
if isinstance(inputs, Sequence):
for item in inputs: # type: ignore
if isinstance(item, (str, MultiModalContentTypes)):
sections.append(item)
else:
sections.append(_stringify(item))
elif isinstance(inputs, MultiModalContentTypes):
sections.append(inputs)
else:
sections.append(_stringify(inputs))
sections.append('</Input>')

sections.append(f'<Output>\n{_stringify(output)}\n</Output>')
sections.append(f'<Rubric>\n{rubric}\n</Rubric>')

if expected_output is not None:
sections.append(f'<ExpectedOutput>\n{_stringify(expected_output)}\n</ExpectedOutput>')

if inputs is None or isinstance(inputs, str):
return '\n\n'.join(sections) # type: ignore[arg-type]
else:
return sections
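For reference, a small sketch, not part of the diff, of the two return shapes _build_prompt produces under the logic above, assuming the import path matches the file location and using a placeholder BinaryContent:

from pydantic_ai.messages import BinaryContent
from pydantic_evals.evaluators.llm_as_a_judge import _build_prompt

image_content = BinaryContent(data=b'\x89PNG\r\n\x1a\n', media_type='image/png')  # placeholder bytes

# String input: the sections are joined into a single prompt string.
text_prompt = _build_prompt(inputs='Hello', output='Hello world', rubric='Output contains input')
assert isinstance(text_prompt, str) and '<Input>\nHello\n</Input>' in text_prompt

# Multimodal input: a list is returned and the BinaryContent instance is preserved as its own part.
list_prompt = _build_prompt(inputs=image_content, output='Hello world', rubric='Output contains input')
assert isinstance(list_prompt, list) and image_content in list_prompt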
156 changes: 149 additions & 7 deletions tests/evals/test_llm_as_a_judge.py
@@ -1,9 +1,10 @@
from __future__ import annotations as _annotations

import pytest
from inline_snapshot import snapshot
from pytest_mock import MockerFixture

from ..conftest import try_import
from ..conftest import BinaryContent, try_import

with try_import() as imports_successful:
from pydantic_ai.settings import ModelSettings
@@ -141,6 +142,54 @@ async def test_judge_input_output_mock(mocker: MockerFixture):
assert '<Rubric>\nOutput contains input\n</Rubric>' in call_args[0]


async def test_judge_input_output_binary_content_list_mock(mocker: MockerFixture, image_content: BinaryContent):
"""Test judge_input_output function with mocked agent."""
# Mock the agent run method
mock_result = mocker.MagicMock()
mock_result.output = GradingOutput(reason='Test passed', pass_=True, score=1.0)
mock_run = mocker.patch('pydantic_ai.Agent.run', return_value=mock_result)

result = await judge_input_output([image_content, image_content], 'Hello world', 'Output contains input')
assert isinstance(result, GradingOutput)
assert result.reason == 'Test passed'
assert result.pass_ is True
assert result.score == 1.0

# Verify the agent was called with correct prompt
mock_run.assert_called_once()
raw_prompt = mock_run.call_args[0][0]

# 1) It must be a list
assert isinstance(raw_prompt, list), 'Expected prompt to be a list when passing binary'

# 2) The BinaryContent you passed in should be one of the elements
assert image_content in raw_prompt, 'Expected the exact BinaryContent instance to be in the prompt list'


async def test_judge_input_output_binary_content_mock(mocker: MockerFixture, image_content: BinaryContent):
"""Test judge_input_output function with mocked agent."""
# Mock the agent run method
mock_result = mocker.MagicMock()
mock_result.output = GradingOutput(reason='Test passed', pass_=True, score=1.0)
mock_run = mocker.patch('pydantic_ai.Agent.run', return_value=mock_result)

result = await judge_input_output(image_content, 'Hello world', 'Output contains input')
assert isinstance(result, GradingOutput)
assert result.reason == 'Test passed'
assert result.pass_ is True
assert result.score == 1.0

# Verify the agent was called with correct prompt
mock_run.assert_called_once()
raw_prompt = mock_run.call_args[0][0]

# 1) It must be a list
assert isinstance(raw_prompt, list), 'Expected prompt to be a list when passing binary'

# 2) The BinaryContent you passed in should be one of the elements
assert image_content in raw_prompt, 'Expected the exact BinaryContent instance to be in the prompt list'


@pytest.mark.anyio
async def test_judge_input_output_with_model_settings_mock(mocker: MockerFixture):
"""Test judge_input_output function with model_settings and mocked agent."""
@@ -172,7 +221,7 @@ async def test_judge_input_output_with_model_settings_mock(mocker: MockerFixture


@pytest.mark.anyio
async def test_judge_input_output_expected_mock(mocker: MockerFixture):
async def test_judge_input_output_expected_mock(mocker: MockerFixture, image_content: BinaryContent):
"""Test judge_input_output_expected function with mocked agent."""
# Mock the agent run method
mock_result = mocker.MagicMock()
@@ -187,16 +236,29 @@ async def test_judge_input_output_expected_mock(mocker: MockerFixture):
assert result.score == 1.0

# Verify the agent was called with correct prompt
mock_run.assert_called_once()
call_args = mock_run.call_args[0]
assert '<Input>\nHello\n</Input>' in call_args[0]
assert '<ExpectedOutput>\nHello\n</ExpectedOutput>' in call_args[0]
assert '<Output>\nHello world\n</Output>' in call_args[0]
assert '<Rubric>\nOutput contains input\n</Rubric>' in call_args[0]

result = await judge_input_output_expected(image_content, 'Hello world', 'Hello', 'Output contains input')
assert isinstance(result, GradingOutput)
assert result.reason == 'Test passed'
assert result.pass_ is True
assert result.score == 1.0

call_args = mock_run.call_args[0]
assert image_content in call_args[0]
assert '<ExpectedOutput>\nHello\n</ExpectedOutput>' in call_args[0]
assert '<Output>\nHello world\n</Output>' in call_args[0]
assert '<Rubric>\nOutput contains input\n</Rubric>' in call_args[0]


@pytest.mark.anyio
async def test_judge_input_output_expected_with_model_settings_mock(mocker: MockerFixture):
async def test_judge_input_output_expected_with_model_settings_mock(
mocker: MockerFixture, image_content: BinaryContent
):
"""Test judge_input_output_expected function with model_settings and mocked agent."""
mock_result = mocker.MagicMock()
mock_result.output = GradingOutput(reason='Test passed with settings', pass_=True, score=1.0)
@@ -216,7 +278,6 @@ async def test_judge_input_output_expected_with_model_settings_mock(mocker: Mock
assert result.pass_ is True
assert result.score == 1.0

mock_run.assert_called_once()
call_args, call_kwargs = mock_run.call_args
assert '<Input>\nHello settings\n</Input>' in call_args[0]
assert '<ExpectedOutput>\nHello\n</ExpectedOutput>' in call_args[0]
@@ -226,6 +287,68 @@ async def test_judge_input_output_expected_with_model_settings_mock(mocker: Mock
# Check if 'model' kwarg is passed, its value will be the default model or None
assert 'model' in call_kwargs

result = await judge_input_output_expected(
image_content,
'Hello world with settings',
'Hello',
'Output contains input with settings',
model_settings=test_model_settings,
)

assert isinstance(result, GradingOutput)
assert result.reason == 'Test passed with settings'
assert result.pass_ is True
assert result.score == 1.0

call_args, call_kwargs = mock_run.call_args
assert image_content in call_args[0]
assert '<ExpectedOutput>\nHello\n</ExpectedOutput>' in call_args[0]
assert '<Output>\nHello world with settings\n</Output>' in call_args[0]
assert '<Rubric>\nOutput contains input with settings\n</Rubric>' in call_args[0]
assert call_kwargs['model_settings'] == test_model_settings
# Check if 'model' kwarg is passed, its value will be the default model or None
assert 'model' in call_kwargs

result = await judge_input_output_expected(
123,
'Hello world with settings',
'Hello',
'Output contains input with settings',
model_settings=test_model_settings,
)

assert isinstance(result, GradingOutput)
assert result.reason == 'Test passed with settings'
assert result.pass_ is True
assert result.score == 1.0

call_args, call_kwargs = mock_run.call_args

assert call_args == snapshot(
(
[
'<Input>\n',
'123',
'</Input>',
"""\
<Output>
Hello world with settings
</Output>\
""",
"""\
<Rubric>
Output contains input with settings
</Rubric>\
""",
"""\
<ExpectedOutput>
Hello
</ExpectedOutput>\
""",
],
)
)


@pytest.mark.anyio
async def test_judge_output_expected_mock(mocker: MockerFixture):
@@ -243,7 +366,6 @@ async def test_judge_output_expected_mock(mocker: MockerFixture):
assert result.score == 1.0

# Verify the agent was called with correct prompt
mock_run.assert_called_once()
call_args = mock_run.call_args[0]
assert '<Input>' not in call_args[0]
assert '<ExpectedOutput>\nHello\n</ExpectedOutput>' in call_args[0]
Expand All @@ -252,7 +374,7 @@ async def test_judge_output_expected_mock(mocker: MockerFixture):


@pytest.mark.anyio
async def test_judge_output_expected_with_model_settings_mock(mocker: MockerFixture):
async def test_judge_output_expected_with_model_settings_mock(mocker: MockerFixture, image_content: BinaryContent):
"""Test judge_output_expected function with model_settings and mocked agent."""
mock_result = mocker.MagicMock()
mock_result.output = GradingOutput(reason='Test passed with settings', pass_=True, score=1.0)
@@ -280,3 +402,23 @@ async def test_judge_output_expected_with_model_settings_mock(mocker: MockerFixt
assert call_kwargs['model_settings'] == test_model_settings
# Check if 'model' kwarg is passed, its value will be the default model or None
assert 'model' in call_kwargs

result = await judge_output_expected(
image_content,
'Hello',
'Output contains input with settings',
model_settings=test_model_settings,
)
assert isinstance(result, GradingOutput)
assert result.reason == 'Test passed with settings'
assert result.pass_ is True
assert result.score == 1.0

call_args, call_kwargs = mock_run.call_args
assert '<Input>' not in call_args[0]
assert '<ExpectedOutput>\nHello\n</ExpectedOutput>' in call_args[0]
assert '<Output>' in call_args[0]
assert '<Rubric>\nOutput contains input with settings\n</Rubric>' in call_args[0]
assert call_kwargs['model_settings'] == test_model_settings
# Check if 'model' kwarg is passed, its value will be the default model or None
assert 'model' in call_kwargs
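The new tests take an image_content fixture from tests/conftest.py (which also re-exports BinaryContent). The fixture itself is not shown in this diff; a minimal sketch of what it could look like, purely as an assumption:

import pytest

from pydantic_ai.messages import BinaryContent


@pytest.fixture
def image_content() -> BinaryContent:
    # Hypothetical fixture; the real one lives in tests/conftest.py and may differ.
    # Any image bytes work here because the judge agent is mocked in these tests.
    return BinaryContent(data=b'\x89PNG\r\n\x1a\n', media_type='image/png')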