Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions openjudge/graders/agent/tool/tool_call_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,11 @@ def __init__(
language=language,
)

# Pattern to match tool calls in JSON format
self._tool_call_pattern = re.compile(
r'\{\s*"name"\s*:\s*"[^"]*"\s*,\s*"arguments"\s*:\s*\{.*?\}\s*\}', flags=re.DOTALL
)

def _parse_tools_from_response(
self,
response: str,
Expand All @@ -233,10 +238,7 @@ def _parse_tools_from_response(
List of parsed tool calls.
"""
tool_calls = []

# Pattern to match tool calls in JSON format
tool_call_pattern = r'\{\s*"name"\s*:\s*"[^"]*"\s*,\s*"arguments"\s*:\s*\{.*?\}\s*\}'
matches = re.findall(tool_call_pattern, response, re.DOTALL)
matches = self._tool_call_pattern.findall(response)

for match in matches:
try:
Expand Down
29 changes: 0 additions & 29 deletions openjudge/graders/agent/tool/tool_call_success.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
"""

import json
import re
import textwrap
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -244,34 +243,6 @@ def __init__(
)
self.template = template or DEFAULT_TOOL_CALL_SUCCESS_TEMPLATE

def _parse_tools_from_response(
self,
response: str,
) -> List[Dict[str, Any]]:
"""Extract tool calls from the response.

Args:
response: The response string to extract tool calls from.

Returns:
List of parsed tool calls.
"""
tool_calls = []

# Pattern to match tool calls in JSON format
tool_call_pattern = r'\{\s*"name"\s*:\s*"[^"]*"\s*,\s*"arguments"\s*:\s*\{.*?\}\s*\}'
matches = re.findall(tool_call_pattern, response, re.DOTALL)

for match in matches:
try:
tool_call = json.loads(match)
tool_calls.append(tool_call)
except json.JSONDecodeError:
# Skip invalid JSON
continue

return tool_calls

async def aevaluate(
self,
tool_definitions: Union[Dict[str, Any], List[Dict[str, Any]]],
Expand Down
9 changes: 7 additions & 2 deletions openjudge/graders/code/code_excution.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ def __init__(
)
self.test_framework_available = False

# Python code pattern in various formats
self._python_code_pattern = re.compile(r"```python\n(.*?)\n```", flags=re.DOTALL)
# generic code formats
self._generic_code_pattern = re.compile(r"```\n(.*?)\n```", flags=re.DOTALL)

def _extract_code(self, content: str) -> str:
"""
Extract code from content
Expand All @@ -71,12 +76,12 @@ def _extract_code(self, content: str) -> str:
Extracted code
"""
# Try to find Python code in various formats
code_match = re.search(r"```python\n(.*?)\n```", content, re.DOTALL)
code_match = self._python_code_pattern.search(content)
if code_match:
return code_match.group(1)

# Try other formats
code_match = re.search(r"```\n(.*?)\n```", content, re.DOTALL)
code_match = self._generic_code_pattern.search(content)
if code_match:
return code_match.group(1)

Expand Down
19 changes: 10 additions & 9 deletions openjudge/graders/code/code_style.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ def __init__(self):
description="Basic code style checking including indentation consistency and naming conventions.",
)

self._function_pattern = re.compile(r"def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(")
self._variable_pattern = re.compile(r"([a-zA-Z_][a-zA-Z0-9_]*)\s*=")
self._snake_case_pattern = re.compile(r"^[a-z_][a-z0-9_]*$")
self._code_pattern = re.compile(r"```(?:python)?\s*\n(.*?)\n\s*```", re.DOTALL)

def _check_indentation(self, code: str) -> tuple[bool, str]:
"""Check indentation consistency"""
lines = code.split("\n")
Expand Down Expand Up @@ -58,11 +63,8 @@ def _check_indentation(self, code: str) -> tuple[bool, str]:
def _check_naming(self, code: str) -> tuple[float, str]:
"""Check naming conventions"""
# Simple naming check
function_pattern = r"def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\("
variable_pattern = r"([a-zA-Z_][a-zA-Z0-9_]*)\s*="

functions = re.findall(function_pattern, code)
variables = re.findall(variable_pattern, code)
functions = self._function_pattern.findall(code)
variables = self._variable_pattern.findall(code)

total_names = len(functions) + len(variables)
if total_names == 0:
Expand All @@ -72,12 +74,12 @@ def _check_naming(self, code: str) -> tuple[float, str]:

# Check function names (should be snake_case)
for func in functions:
if re.match(r"^[a-z_][a-z0-9_]*$", func):
if self._snake_case_pattern.match(func):
good_names += 1

# Check variable names (should be snake_case)
for var in variables:
if re.match(r"^[a-z_][a-z0-9_]*$", var):
if self._snake_case_pattern.match(var):
good_names += 1

score = good_names / total_names
Expand Down Expand Up @@ -122,8 +124,7 @@ async def aevaluate(self, response: str) -> GraderScore:
0.5 Code style score: 0.500; Consistent indentation; Naming convention: 1/2 names follow snake_case
"""
# Extract code blocks
code_pattern = r"```(?:python)?\s*\n(.*?)\n\s*```"
code_blocks = re.findall(code_pattern, response, re.DOTALL)
code_blocks = self._code_pattern.findall(response)

if not code_blocks:
return GraderScore(
Expand Down
5 changes: 3 additions & 2 deletions openjudge/graders/code/syntax_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def __init__(self):
description="Check code syntax using Abstract Syntax Tree to validate Python code blocks.",
)

self._code_pattern = re.compile(r"```(?:python)?\s*\n(.*?)\n\s*```", re.DOTALL)

async def aevaluate(self, response: str) -> GraderScore:
"""Check code syntax in the provided response.

Expand Down Expand Up @@ -68,8 +70,7 @@ async def aevaluate(self, response: str) -> GraderScore:
"""

# Extract code blocks
code_pattern = r"```(?:python)?\s*\n(.*?)\n\s*```"
code_blocks = re.findall(code_pattern, response, re.DOTALL)
code_blocks = self._code_pattern.findall(response)

if not code_blocks:
# No code blocks, return neutral score
Expand Down
5 changes: 3 additions & 2 deletions openjudge/graders/format/ngram_repetition_penalty.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,11 @@ def __init__(
chinese_only=chinese_only,
)

self._think_pattern = re.compile(r"<think>(.*?)</think>", flags=re.DOTALL)

def _extract_thought_process(self, content: str) -> str:
"""Extract thought process"""
think_pattern = r"<think>(.*?)</think>"
matches = re.findall(think_pattern, content, re.DOTALL)
matches = self._think_pattern.findall(content)
return " ".join(matches) if matches else ""

def _generate_ngrams(self, tokens: List[str]) -> List[tuple]:
Expand Down
9 changes: 5 additions & 4 deletions openjudge/graders/format/reasoning_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,10 @@ def __init__(self, think_token: str = "think", answer_token: str = "answer"):
description="Check format reward for thinking format and answer format with proper tags.",
)
self.think_token = think_token
self.think_pattern = re.compile(f"<{self.think_token}>.*?</{self.think_token}>", flags=re.DOTALL)

self.answer_token = answer_token
self.answer_pattern = re.compile(f"<{self.answer_token}>.*?</{self.answer_token}>", flags=re.DOTALL)

# pylint: disable=unused-argument
async def aevaluate(self, response: str, *args: Any, **kwargs: Any) -> GraderScore:
Expand Down Expand Up @@ -73,12 +76,10 @@ async def aevaluate(self, response: str, *args: Any, **kwargs: Any) -> GraderSco
"""

# Check thinking format tags
think_pattern = f"<{self.think_token}>.*?</{self.think_token}>"
has_think_tag = bool(re.search(think_pattern, response, re.DOTALL))
has_think_tag = bool(self.think_pattern.search(response))

# Check answer format tags
answer_pattern = f"<{self.answer_token}>.*?</{self.answer_token}>"
has_answer_tag = bool(re.search(answer_pattern, response, re.DOTALL))
has_answer_tag = bool(self.answer_pattern.search(response))

# Calculate reward
reward = 1.0 if has_think_tag and has_answer_tag else 0.0
Expand Down
35 changes: 20 additions & 15 deletions openjudge/graders/format/reasoning_tool_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,19 @@ def __init__(self) -> None:
description="Check tool call format including think, answer and tool_call tags with JSON validation.",
)

# patterns for identifying tags
self._think_pattern = re.compile(r"<think>(.*?)</think>", re.DOTALL)
self._answer_pattern = re.compile(r"<answer>(.*?)</answer>", re.DOTALL)
self._tool_call_pattern = re.compile(r"<tool_call>(.*?)</tool_call>", re.DOTALL)

self._think_answer_pattern = re.compile(r"^\s*<think>.*?</think>\s*<answer>.*?</answer>\s*$", re.DOTALL)
self._think_tool_call_pattern = re.compile(
r"^\s*<think>.*?</think>\s*(?:<tool_call>.*?</tool_call>\s*)+$", re.DOTALL
)

self._consecutive_start_tool_call_tag_pattern = re.compile(r"<tool_call>\s*<tool_call>")
self._consecutive_end_tool_call_tag_pattern = re.compile(r"</tool_call>\s*</tool_call>")

# pylint: disable=too-many-statements
async def aevaluate(self, response: str, **kwargs: Any) -> GraderScore:
"""
Expand Down Expand Up @@ -69,13 +82,9 @@ async def aevaluate(self, response: str, **kwargs: Any) -> GraderScore:
"""

# Extract tag contents
think_pattern = r"<think>(.*?)</think>"
answer_pattern = r"<answer>(.*?)</answer>"
tool_call_pattern = r"<tool_call>(.*?)</tool_call>"

think_matches = re.search(think_pattern, response, re.DOTALL)
answer_matches = re.search(answer_pattern, response, re.DOTALL)
tool_call_matches = re.findall(tool_call_pattern, response, re.DOTALL)
think_matches = self._think_pattern.search(response)
answer_matches = self._answer_pattern.search(response)
tool_call_matches = self._tool_call_pattern.findall(response)

has_think_tag = think_matches is not None
has_answer_tag = answer_matches is not None
Expand All @@ -89,9 +98,8 @@ async def aevaluate(self, response: str, **kwargs: Any) -> GraderScore:
# Case 1: <think></think> + <answer></answer>
if has_answer_tag and not has_tool_call_tag:
# Check overall format
format_pattern = r"^\s*<think>.*?</think>\s*<answer>.*?</answer>\s*$"
valid_format = bool(
re.match(format_pattern, response, re.DOTALL),
self._think_answer_pattern.match(response),
)

# Check tag occurrence count
Expand All @@ -115,9 +123,8 @@ async def aevaluate(self, response: str, **kwargs: Any) -> GraderScore:
# Case 2: <think></think> + <tool_call></tool_call>
elif has_tool_call_tag and not has_answer_tag:
# Check overall format
format_pattern = r"^\s*<think>.*?</think>\s*(?:<tool_call>.*?</tool_call>\s*)+$"
valid_format = bool(
re.match(format_pattern, response, re.DOTALL),
self._think_tool_call_pattern.match(response),
)

# Check <think> tag occurrence count
Expand All @@ -133,11 +140,9 @@ async def aevaluate(self, response: str, **kwargs: Any) -> GraderScore:

# Check for consecutive duplicate tags
if valid_format:
if re.search(
r"</tool_call>\s*</tool_call>",
if self._consecutive_end_tool_call_tag_pattern.search(
response,
) or re.search(
r"<tool_call>\s*<tool_call>",
) or self._consecutive_start_tool_call_tag_pattern.search(
response,
):
valid_format = False
Expand Down
4 changes: 2 additions & 2 deletions openjudge/graders/text/number_accuracy.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,12 @@ def __init__(self, tolerance: float = 1e-6, **kwargs: Any) -> None:
**kwargs,
)
self.tolerance = tolerance
self._number_pattern = re.compile(r"-?\d+\.?\d*")

def _extract_numbers(self, text: str) -> List[float]:
"""Extract numbers from text"""
# Match integers and floating point numbers
number_pattern = r"-?\d+\.?\d*"
numbers = re.findall(number_pattern, text)
numbers = self._number_pattern.findall(text)
return [float(n) for n in numbers if n]

async def aevaluate(self, response: str, reference_response: str) -> GraderScore:
Expand Down