
Fix LLMJudge input handling to preserve BinaryContent as separate message part instead of stringifying #2173

Open · wants to merge 21 commits into main
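This PR routes every judge prompt through a new _build_prompt helper so that multimodal inputs such as BinaryContent are forwarded to the judge agent as separate user-content parts instead of being stringified into the <Input> text. As a hypothetical usage sketch, not part of the PR, assuming BinaryContent(data=..., media_type=...) from pydantic_ai.messages and the module path implied by the file location below:

from pydantic_ai.messages import BinaryContent
from pydantic_evals.evaluators.llm_as_a_judge import judge_input_output


async def grade_image(png_bytes: bytes) -> None:
    # Before this change the BinaryContent would be stringified into the <Input> section;
    # with this change the instance is passed through to the judge as its own message part.
    image = BinaryContent(data=png_bytes, media_type='image/png')
    grading = await judge_input_output(image, 'A cat sitting on a mat', 'Output describes the image')
    print(grading.pass_, grading.score, grading.reason)


# run with: asyncio.run(grade_image(open('cat.png', 'rb').read()))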
97 changes: 45 additions & 52 deletions pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py
@@ -1,12 +1,14 @@
from __future__ import annotations

from collections.abc import Sequence
from textwrap import dedent
from typing import Any

from pydantic import BaseModel, Field
from pydantic_core import to_json

from pydantic_ai import Agent, models
from pydantic_ai.messages import MultiModalContentTypes, UserContent
from pydantic_ai.settings import ModelSettings

__all__ = (
@@ -62,16 +64,7 @@ async def judge_output(
If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
but this can be changed using the `set_default_judge_model` function.
"""
user_prompt = dedent(
f"""
<Output>
{_stringify(output)}
</Output>
<Rubric>
{rubric}
</Rubric>
"""
)
user_prompt = _build_prompt(inputs=None, output=output, rubric=rubric)
return (
await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
).output
@@ -112,19 +105,8 @@ async def judge_input_output(
If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
but this can be changed using the `set_default_judge_model` function.
"""
user_prompt = dedent(
f"""
<Input>
{_stringify(inputs)}
</Input>
<Output>
{_stringify(output)}
</Output>
<Rubric>
{rubric}
</Rubric>
"""
)
user_prompt = _build_prompt(inputs=inputs, output=output, rubric=rubric)

return (
await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
).output
@@ -168,22 +150,7 @@ async def judge_input_output_expected(
If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
but this can be changed using the `set_default_judge_model` function.
"""
user_prompt = dedent(
f"""
<Input>
{_stringify(inputs)}
</Input>
<ExpectedOutput>
{_stringify(expected_output)}
</ExpectedOutput>
<Output>
{_stringify(output)}
</Output>
<Rubric>
{rubric}
</Rubric>
"""
)
user_prompt = _build_prompt(inputs=inputs, output=output, rubric=rubric, expected_output=expected_output)

return (
await _judge_input_output_expected_agent.run(
@@ -227,19 +194,7 @@ async def judge_output_expected(
If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
but this can be changed using the `set_default_judge_model` function.
"""
user_prompt = dedent(
f"""
<ExpectedOutput>
{_stringify(expected_output)}
</ExpectedOutput>
<Output>
{_stringify(output)}
</Output>
<Rubric>
{rubric}
</Rubric>
"""
)
user_prompt = _build_prompt(inputs=None, output=output, rubric=rubric, expected_output=expected_output)
return (
await _judge_output_expected_agent.run(
user_prompt, model=model or _default_model, model_settings=model_settings
@@ -265,3 +220,41 @@ def _stringify(value: Any) -> str:
return to_json(value).decode()
except Exception:
return repr(value)


def _build_prompt(
inputs: Any,
output: Any,
rubric: str,
expected_output: Any | None = None,
) -> str | Sequence[str | UserContent]:
"""Build a prompt that includes input, output, and rubric."""
sections: list[str | UserContent] = []

if inputs is not None:
if isinstance(inputs, str):
sections.append(f'<Input>\n{inputs}\n</Input>')
else:
sections.append('<Input>\n')
if isinstance(inputs, Sequence):
for item in inputs: # type: ignore
if isinstance(item, (str, MultiModalContentTypes)):
sections.append(item)
else:
sections.append(_stringify(item))
elif isinstance(inputs, MultiModalContentTypes):
sections.append(inputs)
else:
sections.append(_stringify(inputs))
sections.append('</Input>')

sections.append(f'<Output>\n{_stringify(output)}\n</Output>')
sections.append(f'<Rubric>\n{rubric}\n</Rubric>')

if expected_output is not None:
sections.append(f'<ExpectedOutput>\n{_stringify(expected_output)}\n</ExpectedOutput>')

if inputs is None or isinstance(inputs, str):
return '\n\n'.join(sections) # type: ignore[arg-type]
else:
return sections
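For reference, a small sketch, not part of the diff, of the two return shapes _build_prompt produces under the logic above, assuming the import path matches the file location and using a placeholder BinaryContent:

from pydantic_ai.messages import BinaryContent
from pydantic_evals.evaluators.llm_as_a_judge import _build_prompt

image_content = BinaryContent(data=b'\x89PNG\r\n\x1a\n', media_type='image/png')  # placeholder bytes

# String input: the sections are joined into a single prompt string.
text_prompt = _build_prompt(inputs='Hello', output='Hello world', rubric='Output contains input')
assert isinstance(text_prompt, str) and '<Input>\nHello\n</Input>' in text_prompt

# Multimodal input: a list is returned and the BinaryContent instance is preserved as its own part.
list_prompt = _build_prompt(inputs=image_content, output='Hello world', rubric='Output contains input')
assert isinstance(list_prompt, list) and image_content in list_prompt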
156 changes: 149 additions & 7 deletions tests/evals/test_llm_as_a_judge.py
@@ -1,9 +1,10 @@
from __future__ import annotations as _annotations

import pytest
from inline_snapshot import snapshot
from pytest_mock import MockerFixture

from ..conftest import try_import
from ..conftest import BinaryContent, try_import

with try_import() as imports_successful:
from pydantic_ai.settings import ModelSettings
@@ -141,6 +142,54 @@ async def test_judge_input_output_mock(mocker: MockerFixture):
assert '<Rubric>\nOutput contains input\n</Rubric>' in call_args[0]


async def test_judge_input_output_binary_content_list_mock(mocker: MockerFixture, image_content: BinaryContent):
"""Test judge_input_output function with mocked agent."""
# Mock the agent run method
mock_result = mocker.MagicMock()
mock_result.output = GradingOutput(reason='Test passed', pass_=True, score=1.0)
mock_run = mocker.patch('pydantic_ai.Agent.run', return_value=mock_result)

result = await judge_input_output([image_content, image_content], 'Hello world', 'Output contains input')
assert isinstance(result, GradingOutput)
assert result.reason == 'Test passed'
assert result.pass_ is True
assert result.score == 1.0

# Verify the agent was called with correct prompt
mock_run.assert_called_once()
raw_prompt = mock_run.call_args[0][0]

# 1) It must be a list
assert isinstance(raw_prompt, list), 'Expected prompt to be a list when passing binary'

# 2) The BinaryContent you passed in should be one of the elements
assert image_content in raw_prompt, 'Expected the exact BinaryContent instance to be in the prompt list'


async def test_judge_input_output_binary_content_mock(mocker: MockerFixture, image_content: BinaryContent):
"""Test judge_input_output function with mocked agent."""
# Mock the agent run method
mock_result = mocker.MagicMock()
mock_result.output = GradingOutput(reason='Test passed', pass_=True, score=1.0)
mock_run = mocker.patch('pydantic_ai.Agent.run', return_value=mock_result)

result = await judge_input_output(image_content, 'Hello world', 'Output contains input')
assert isinstance(result, GradingOutput)
assert result.reason == 'Test passed'
assert result.pass_ is True
assert result.score == 1.0

# Verify the agent was called with correct prompt
mock_run.assert_called_once()
raw_prompt = mock_run.call_args[0][0]

# 1) It must be a list
assert isinstance(raw_prompt, list), 'Expected prompt to be a list when passing binary'

# 2) The BinaryContent you passed in should be one of the elements
assert image_content in raw_prompt, 'Expected the exact BinaryContent instance to be in the prompt list'


@pytest.mark.anyio
async def test_judge_input_output_with_model_settings_mock(mocker: MockerFixture):
"""Test judge_input_output function with model_settings and mocked agent."""
@@ -172,7 +221,7 @@ async def test_judge_input_output_with_model_settings_mock(mocker: MockerFixture


@pytest.mark.anyio
async def test_judge_input_output_expected_mock(mocker: MockerFixture):
async def test_judge_input_output_expected_mock(mocker: MockerFixture, image_content: BinaryContent):
"""Test judge_input_output_expected function with mocked agent."""
# Mock the agent run method
mock_result = mocker.MagicMock()
@@ -187,16 +236,29 @@ async def test_judge_input_output_expected_mock(mocker: MockerFixture):
assert result.score == 1.0

# Verify the agent was called with correct prompt
mock_run.assert_called_once()
call_args = mock_run.call_args[0]
assert '<Input>\nHello\n</Input>' in call_args[0]
assert '<ExpectedOutput>\nHello\n</ExpectedOutput>' in call_args[0]
assert '<Output>\nHello world\n</Output>' in call_args[0]
assert '<Rubric>\nOutput contains input\n</Rubric>' in call_args[0]

result = await judge_input_output_expected(image_content, 'Hello world', 'Hello', 'Output contains input')
assert isinstance(result, GradingOutput)
assert result.reason == 'Test passed'
assert result.pass_ is True
assert result.score == 1.0

call_args = mock_run.call_args[0]
assert image_content in call_args[0]
assert '<ExpectedOutput>\nHello\n</ExpectedOutput>' in call_args[0]
assert '<Output>\nHello world\n</Output>' in call_args[0]
assert '<Rubric>\nOutput contains input\n</Rubric>' in call_args[0]


@pytest.mark.anyio
async def test_judge_input_output_expected_with_model_settings_mock(mocker: MockerFixture):
async def test_judge_input_output_expected_with_model_settings_mock(
mocker: MockerFixture, image_content: BinaryContent
):
"""Test judge_input_output_expected function with model_settings and mocked agent."""
mock_result = mocker.MagicMock()
mock_result.output = GradingOutput(reason='Test passed with settings', pass_=True, score=1.0)
@@ -216,7 +278,6 @@ async def test_judge_input_output_expected_with_model_settings_mock(mocker: Mock
assert result.pass_ is True
assert result.score == 1.0

mock_run.assert_called_once()
call_args, call_kwargs = mock_run.call_args
assert '<Input>\nHello settings\n</Input>' in call_args[0]
assert '<ExpectedOutput>\nHello\n</ExpectedOutput>' in call_args[0]
@@ -226,6 +287,68 @@ async def test_judge_input_output_expected_with_model_settings_mock(mocker: Mock
# Check if 'model' kwarg is passed, its value will be the default model or None
assert 'model' in call_kwargs

result = await judge_input_output_expected(
image_content,
'Hello world with settings',
'Hello',
'Output contains input with settings',
model_settings=test_model_settings,
)

assert isinstance(result, GradingOutput)
assert result.reason == 'Test passed with settings'
assert result.pass_ is True
assert result.score == 1.0

call_args, call_kwargs = mock_run.call_args
assert image_content in call_args[0]
assert '<ExpectedOutput>\nHello\n</ExpectedOutput>' in call_args[0]
assert '<Output>\nHello world with settings\n</Output>' in call_args[0]
assert '<Rubric>\nOutput contains input with settings\n</Rubric>' in call_args[0]
assert call_kwargs['model_settings'] == test_model_settings
# Check if 'model' kwarg is passed, its value will be the default model or None
assert 'model' in call_kwargs

result = await judge_input_output_expected(
123,
'Hello world with settings',
'Hello',
'Output contains input with settings',
model_settings=test_model_settings,
)

assert isinstance(result, GradingOutput)
assert result.reason == 'Test passed with settings'
assert result.pass_ is True
assert result.score == 1.0

call_args, call_kwargs = mock_run.call_args

assert call_args == snapshot(
(
[
'<Input>\n',
'123',
'</Input>',
"""\
<Output>
Hello world with settings
</Output>\
""",
"""\
<Rubric>
Output contains input with settings
</Rubric>\
""",
"""\
<ExpectedOutput>
Hello
</ExpectedOutput>\
""",
],
)
)


@pytest.mark.anyio
async def test_judge_output_expected_mock(mocker: MockerFixture):
@@ -243,7 +366,6 @@ async def test_judge_output_expected_mock(mocker: MockerFixture):
assert result.score == 1.0

# Verify the agent was called with correct prompt
mock_run.assert_called_once()
call_args = mock_run.call_args[0]
assert '<Input>' not in call_args[0]
assert '<ExpectedOutput>\nHello\n</ExpectedOutput>' in call_args[0]
Expand All @@ -252,7 +374,7 @@ async def test_judge_output_expected_mock(mocker: MockerFixture):


@pytest.mark.anyio
async def test_judge_output_expected_with_model_settings_mock(mocker: MockerFixture):
async def test_judge_output_expected_with_model_settings_mock(mocker: MockerFixture, image_content: BinaryContent):
"""Test judge_output_expected function with model_settings and mocked agent."""
mock_result = mocker.MagicMock()
mock_result.output = GradingOutput(reason='Test passed with settings', pass_=True, score=1.0)
@@ -280,3 +402,23 @@ async def test_judge_output_expected_with_model_settings_mock(mocker: MockerFixt
assert call_kwargs['model_settings'] == test_model_settings
# Check if 'model' kwarg is passed, its value will be the default model or None
assert 'model' in call_kwargs

result = await judge_output_expected(
image_content,
'Hello',
'Output contains input with settings',
model_settings=test_model_settings,
)
assert isinstance(result, GradingOutput)
assert result.reason == 'Test passed with settings'
assert result.pass_ is True
assert result.score == 1.0

call_args, call_kwargs = mock_run.call_args
assert '<Input>' not in call_args[0]
assert '<ExpectedOutput>\nHello\n</ExpectedOutput>' in call_args[0]
assert '<Output>' in call_args[0]
assert '<Rubric>\nOutput contains input with settings\n</Rubric>' in call_args[0]
assert call_kwargs['model_settings'] == test_model_settings
# Check if 'model' kwarg is passed, its value will be the default model or None
assert 'model' in call_kwargs
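The new tests take an image_content fixture from tests/conftest.py (which also re-exports BinaryContent). The fixture itself is not shown in this diff; a minimal sketch of what it could look like, purely as an assumption:

import pytest

from pydantic_ai.messages import BinaryContent


@pytest.fixture
def image_content() -> BinaryContent:
    # Hypothetical fixture; the real one lives in tests/conftest.py and may differ.
    # Any image bytes work here because the judge agent is mocked in these tests.
    return BinaryContent(data=b'\x89PNG\r\n\x1a\n', media_type='image/png')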