Skip to content
Merged
11 changes: 8 additions & 3 deletions benchmarks/commit0/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
)
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
from benchmarks.utils.conversation import build_event_persistence_callback
from benchmarks.utils.critics import create_critic
from benchmarks.utils.dataset import prepare_dataset
from benchmarks.utils.evaluation import Evaluation
Expand Down Expand Up @@ -301,13 +302,16 @@ def evaluate_instance(

assert isinstance(workspace, RemoteWorkspace)

def _log_event(ev):
logger.debug("Event: %s", ev)
persist_callback = build_event_persistence_callback(
run_id=self.metadata.eval_output_dir,
instance_id=instance.id,
attempt=self.current_attempt,
)

conversation = Conversation(
agent=agent,
workspace=workspace,
callbacks=[_log_event],
callbacks=[persist_callback],
max_iteration_per_run=self.metadata.max_iterations,
)

Expand Down Expand Up @@ -556,6 +560,7 @@ def _log_event(ev):

out = EvalOutput(
instance_id=instance.id,
attempt=self.current_attempt,
test_result=test_result,
instruction=instruction,
error=None,
Expand Down
11 changes: 10 additions & 1 deletion benchmarks/gaia/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from benchmarks.gaia.utils import image_to_jpg_base64_url, image_to_png_base64_url
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
from benchmarks.utils.conversation import build_event_persistence_callback
from benchmarks.utils.critics import create_critic
from benchmarks.utils.evaluation import Evaluation
from benchmarks.utils.evaluation_utils import (
Expand Down Expand Up @@ -314,10 +315,17 @@ def evaluate_instance(
)

# Create conversation

persist_callback = build_event_persistence_callback(
run_id=self.metadata.eval_output_dir,
instance_id=instance.id,
attempt=self.current_attempt,
)

conversation = Conversation(
agent=agent,
workspace=workspace,
callbacks=[lambda ev: logger.debug("Event: %s", ev)],
callbacks=[persist_callback],
max_iteration_per_run=self.metadata.max_iterations,
)

Expand Down Expand Up @@ -353,6 +361,7 @@ def evaluate_instance(
# Return evaluation output
return EvalOutput(
instance_id=instance.id,
attempt=self.current_attempt,
test_result={
"score": score,
"model_answer_raw": model_answer_raw,
Expand Down
13 changes: 9 additions & 4 deletions benchmarks/multiswebench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.build_utils import build_image
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
from benchmarks.utils.conversation import build_event_persistence_callback
from benchmarks.utils.critics import create_critic
from benchmarks.utils.dataset import prepare_dataset
from benchmarks.utils.evaluation import Evaluation
Expand Down Expand Up @@ -312,16 +313,19 @@ def evaluate_instance(

assert isinstance(workspace, RemoteWorkspace)

def _log_event(ev): # keep it simple
logger.debug("Event: %s", ev)

repo_path = f"/workspace/{instance.data['repo'].split('/')[-1]}/"
instance.data["repo_path"] = repo_path

persist_callback = build_event_persistence_callback(
run_id=self.metadata.eval_output_dir,
instance_id=instance.id,
attempt=self.current_attempt,
)

conversation = Conversation(
agent=agent,
workspace=workspace,
callbacks=[_log_event],
callbacks=[persist_callback],
max_iteration_per_run=self.metadata.max_iterations,
)

Expand Down Expand Up @@ -391,6 +395,7 @@ def _log_event(ev): # keep it simple
# EvalOutput is your model; keep fields consistent with prior JSONL
out = EvalOutput(
instance_id=instance.id,
attempt=self.current_attempt,
test_result={
"git_patch": git_patch,
},
Expand Down
20 changes: 17 additions & 3 deletions benchmarks/openagentsafety/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

from benchmarks.openagentsafety.build_images import build_workspace_image
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.conversation import build_event_persistence_callback
from benchmarks.utils.critics import create_critic
from benchmarks.utils.dataset import get_dataset
from benchmarks.utils.evaluation import Evaluation
Expand Down Expand Up @@ -263,10 +264,14 @@ def generate_instruction(instance_data: dict, template_path: str | None = None)


def run_evaluation_in_container(
workspace, evaluator_code: str, trajectory: str, instance_id: str
workspace,
evaluator_code: str,
trajectory: str,
instance_id: str,
attempt: int = 1,
) -> dict:
"""Execute evaluator code in the Docker container and return results."""
logger.info(f"Running evaluation for {instance_id}")
logger.info(f"Running evaluation for {instance_id} (attempt {attempt})")

# Write evaluator code
evaluator_path = "/workspace/evaluator_temp.py"
Expand Down Expand Up @@ -432,11 +437,17 @@ def event_callback(event) -> None:
if not isinstance(event, ConversationStateUpdateEvent):
received_events.append(event)

persist_callback = build_event_persistence_callback(
run_id=self.metadata.eval_output_dir,
instance_id=instance.id,
attempt=self.current_attempt,
)

# Create conversation
conversation = Conversation(
agent=agent,
workspace=workspace,
callbacks=[event_callback],
callbacks=[persist_callback, event_callback],
max_iteration_per_run=self.metadata.max_iterations,
stuck_detection=True,
)
Expand All @@ -461,6 +472,7 @@ def event_callback(event) -> None:
metrics = self.metadata.llm.metrics
return EvalOutput(
instance_id=instance.id,
attempt=self.current_attempt,
test_result={"error": str(e)},
instruction=instruction,
error=str(e),
Expand Down Expand Up @@ -490,6 +502,7 @@ def event_callback(event) -> None:
evaluator_code=instance.data["evaluator_code"],
trajectory=trajectory,
instance_id=instance.id,
attempt=self.current_attempt,
)
except Exception as e:
logger.error(f"Evaluation failed: {e}")
Expand All @@ -504,6 +517,7 @@ def event_callback(event) -> None:
metrics = self.metadata.llm.metrics
return EvalOutput(
instance_id=instance.id,
attempt=self.current_attempt,
test_result=eval_result,
instruction=instruction,
error=None if not eval_result.get("error") else eval_result["error"],
Expand Down
13 changes: 9 additions & 4 deletions benchmarks/swebench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.build_utils import build_image
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
from benchmarks.utils.conversation import build_event_persistence_callback
from benchmarks.utils.critics import create_critic
from benchmarks.utils.dataset import get_dataset
from benchmarks.utils.evaluation import Evaluation
Expand Down Expand Up @@ -231,16 +232,19 @@ def evaluate_instance(

assert isinstance(workspace, RemoteWorkspace)

def _log_event(ev): # keep it simple
logger.debug("Event: %s", ev)

repo_path = f"/workspace/{instance.data['repo'].split('/')[-1]}/"
instance.data["repo_path"] = repo_path

persist_callback = build_event_persistence_callback(
run_id=self.metadata.eval_output_dir,
instance_id=instance.id,
attempt=self.current_attempt,
)

conversation = Conversation(
agent=agent,
workspace=workspace,
callbacks=[_log_event],
callbacks=[persist_callback],
max_iteration_per_run=self.metadata.max_iterations,
)

Expand Down Expand Up @@ -288,6 +292,7 @@ def _log_event(ev): # keep it simple
# EvalOutput is your model; keep fields consistent with prior JSONL
out = EvalOutput(
instance_id=instance.id,
attempt=self.current_attempt,
test_result={
"git_patch": git_patch,
},
Expand Down
13 changes: 9 additions & 4 deletions benchmarks/swebenchmultimodal/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.build_utils import build_image
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
from benchmarks.utils.conversation import build_event_persistence_callback
from benchmarks.utils.critics import create_critic
from benchmarks.utils.dataset import get_dataset
from benchmarks.utils.evaluation import Evaluation
Expand Down Expand Up @@ -232,16 +233,19 @@ def evaluate_instance(

assert isinstance(workspace, RemoteWorkspace)

def _log_event(ev): # keep it simple
logger.debug("Event: %s", ev)

repo_path = f"/workspace/{instance.data['repo'].split('/')[-1]}/"
instance.data["repo_path"] = repo_path

persist_callback = build_event_persistence_callback(
run_id=self.metadata.eval_output_dir,
instance_id=instance.id,
attempt=self.current_attempt,
)

conversation = Conversation(
agent=agent,
workspace=workspace,
callbacks=[_log_event],
callbacks=[persist_callback],
max_iteration_per_run=self.metadata.max_iterations,
)

Expand Down Expand Up @@ -351,6 +355,7 @@ def _log_event(ev): # keep it simple
# EvalOutput is your model; keep fields consistent with prior JSONL
out = EvalOutput(
instance_id=instance.id,
attempt=self.current_attempt,
test_result={
"git_patch": git_patch,
},
Expand Down
13 changes: 9 additions & 4 deletions benchmarks/swtbench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
from benchmarks.utils.conversation import build_event_persistence_callback
from benchmarks.utils.critics import create_critic
from benchmarks.utils.dataset import get_dataset
from benchmarks.utils.evaluation import Evaluation
Expand Down Expand Up @@ -260,16 +261,19 @@ def evaluate_instance(

assert isinstance(workspace, RemoteWorkspace)

def _log_event(ev): # keep it simple
logger.debug("Event: %s", ev)

repo_path = f"/workspace/{instance.data['repo'].split('/')[-1]}/"
instance.data["repo_path"] = repo_path

persist_callback = build_event_persistence_callback(
run_id=self.metadata.eval_output_dir,
instance_id=instance.id,
attempt=self.current_attempt,
)

conversation = Conversation(
agent=agent,
workspace=workspace,
callbacks=[_log_event],
callbacks=[persist_callback],
max_iteration_per_run=self.metadata.max_iterations,
)

Expand Down Expand Up @@ -316,6 +320,7 @@ def _log_event(ev): # keep it simple

out = EvalOutput(
instance_id=instance.id,
attempt=self.current_attempt,
test_result={
"git_patch": git_patch,
},
Expand Down
Loading