OpenHands · simonrosenberg · Jan 12, 2026 · Jan 8, 2026 · Jan 9, 2026 · Jan 9, 2026
diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py
@@ -14,6 +14,7 @@
 )
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
+from benchmarks.utils.conversation import build_event_persistence_callback
 from benchmarks.utils.critics import create_critic
 from benchmarks.utils.dataset import prepare_dataset
 from benchmarks.utils.evaluation import Evaluation
@@ -301,13 +302,16 @@ def evaluate_instance(
 
         assert isinstance(workspace, RemoteWorkspace)
 
-        def _log_event(ev):
-            logger.debug("Event: %s", ev)
+        persist_callback = build_event_persistence_callback(
+            run_id=self.metadata.eval_output_dir,
+            instance_id=instance.id,
+            attempt=self.current_attempt,
+        )
 
         conversation = Conversation(
             agent=agent,
             workspace=workspace,
-            callbacks=[_log_event],
+            callbacks=[persist_callback],
             max_iteration_per_run=self.metadata.max_iterations,
         )
 
@@ -556,6 +560,7 @@ def _log_event(ev):
 
         out = EvalOutput(
             instance_id=instance.id,
+            attempt=self.current_attempt,
             test_result=test_result,
             instruction=instruction,
             error=None,

diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py
@@ -15,6 +15,7 @@
 from benchmarks.gaia.utils import image_to_jpg_base64_url, image_to_png_base64_url
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
+from benchmarks.utils.conversation import build_event_persistence_callback
 from benchmarks.utils.critics import create_critic
 from benchmarks.utils.evaluation import Evaluation
 from benchmarks.utils.evaluation_utils import (
@@ -314,10 +315,17 @@ def evaluate_instance(
         )
 
         # Create conversation
+
+        persist_callback = build_event_persistence_callback(
+            run_id=self.metadata.eval_output_dir,
+            instance_id=instance.id,
+            attempt=self.current_attempt,
+        )
+
         conversation = Conversation(
             agent=agent,
             workspace=workspace,
-            callbacks=[lambda ev: logger.debug("Event: %s", ev)],
+            callbacks=[persist_callback],
             max_iteration_per_run=self.metadata.max_iterations,
         )
 
@@ -353,6 +361,7 @@ def evaluate_instance(
         # Return evaluation output
         return EvalOutput(
             instance_id=instance.id,
+            attempt=self.current_attempt,
             test_result={
                 "score": score,
                 "model_answer_raw": model_answer_raw,

diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py
@@ -16,6 +16,7 @@
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.build_utils import build_image
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
+from benchmarks.utils.conversation import build_event_persistence_callback
 from benchmarks.utils.critics import create_critic
 from benchmarks.utils.dataset import prepare_dataset
 from benchmarks.utils.evaluation import Evaluation
@@ -312,16 +313,19 @@ def evaluate_instance(
 
         assert isinstance(workspace, RemoteWorkspace)
 
-        def _log_event(ev):  # keep it simple
-            logger.debug("Event: %s", ev)
-
         repo_path = f"/workspace/{instance.data['repo'].split('/')[-1]}/"
         instance.data["repo_path"] = repo_path
 
+        persist_callback = build_event_persistence_callback(
+            run_id=self.metadata.eval_output_dir,
+            instance_id=instance.id,
+            attempt=self.current_attempt,
+        )
+
         conversation = Conversation(
             agent=agent,
             workspace=workspace,
-            callbacks=[_log_event],
+            callbacks=[persist_callback],
             max_iteration_per_run=self.metadata.max_iterations,
         )
 
@@ -391,6 +395,7 @@ def _log_event(ev):  # keep it simple
         # EvalOutput is your model; keep fields consistent with prior JSONL
         out = EvalOutput(
             instance_id=instance.id,
+            attempt=self.current_attempt,
             test_result={
                 "git_patch": git_patch,
             },

diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py
@@ -14,6 +14,7 @@
 
 from benchmarks.openagentsafety.build_images import build_workspace_image
 from benchmarks.utils.args_parser import get_parser
+from benchmarks.utils.conversation import build_event_persistence_callback
 from benchmarks.utils.critics import create_critic
 from benchmarks.utils.dataset import get_dataset
 from benchmarks.utils.evaluation import Evaluation
@@ -263,10 +264,14 @@ def generate_instruction(instance_data: dict, template_path: str | None = None)
 
 
 def run_evaluation_in_container(
-    workspace, evaluator_code: str, trajectory: str, instance_id: str
+    workspace,
+    evaluator_code: str,
+    trajectory: str,
+    instance_id: str,
+    attempt: int = 1,
 ) -> dict:
     """Execute evaluator code in the Docker container and return results."""
-    logger.info(f"Running evaluation for {instance_id}")
+    logger.info(f"Running evaluation for {instance_id} (attempt {attempt})")
 
     # Write evaluator code
     evaluator_path = "/workspace/evaluator_temp.py"
@@ -432,11 +437,17 @@ def event_callback(event) -> None:
             if not isinstance(event, ConversationStateUpdateEvent):
                 received_events.append(event)
 
+        persist_callback = build_event_persistence_callback(
+            run_id=self.metadata.eval_output_dir,
+            instance_id=instance.id,
+            attempt=self.current_attempt,
+        )
+
         # Create conversation
         conversation = Conversation(
             agent=agent,
             workspace=workspace,
-            callbacks=[event_callback],
+            callbacks=[persist_callback, event_callback],
             max_iteration_per_run=self.metadata.max_iterations,
             stuck_detection=True,
         )
@@ -461,6 +472,7 @@ def event_callback(event) -> None:
                 metrics = self.metadata.llm.metrics
             return EvalOutput(
                 instance_id=instance.id,
+                attempt=self.current_attempt,
                 test_result={"error": str(e)},
                 instruction=instruction,
                 error=str(e),
@@ -490,6 +502,7 @@ def event_callback(event) -> None:
                     evaluator_code=instance.data["evaluator_code"],
                     trajectory=trajectory,
                     instance_id=instance.id,
+                    attempt=self.current_attempt,
                 )
             except Exception as e:
                 logger.error(f"Evaluation failed: {e}")
@@ -504,6 +517,7 @@ def event_callback(event) -> None:
             metrics = self.metadata.llm.metrics
         return EvalOutput(
             instance_id=instance.id,
+            attempt=self.current_attempt,
             test_result=eval_result,
             instruction=instruction,
             error=None if not eval_result.get("error") else eval_result["error"],

diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
@@ -13,6 +13,7 @@
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.build_utils import build_image
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
+from benchmarks.utils.conversation import build_event_persistence_callback
 from benchmarks.utils.critics import create_critic
 from benchmarks.utils.dataset import get_dataset
 from benchmarks.utils.evaluation import Evaluation
@@ -231,16 +232,19 @@ def evaluate_instance(
 
         assert isinstance(workspace, RemoteWorkspace)
 
-        def _log_event(ev):  # keep it simple
-            logger.debug("Event: %s", ev)
-
         repo_path = f"/workspace/{instance.data['repo'].split('/')[-1]}/"
         instance.data["repo_path"] = repo_path
 
+        persist_callback = build_event_persistence_callback(
+            run_id=self.metadata.eval_output_dir,
+            instance_id=instance.id,
+            attempt=self.current_attempt,
+        )
+
         conversation = Conversation(
             agent=agent,
             workspace=workspace,
-            callbacks=[_log_event],
+            callbacks=[persist_callback],
             max_iteration_per_run=self.metadata.max_iterations,
         )
 
@@ -288,6 +292,7 @@ def _log_event(ev):  # keep it simple
         # EvalOutput is your model; keep fields consistent with prior JSONL
         out = EvalOutput(
             instance_id=instance.id,
+            attempt=self.current_attempt,
             test_result={
                 "git_patch": git_patch,
             },

diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py
@@ -12,6 +12,7 @@
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.build_utils import build_image
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
+from benchmarks.utils.conversation import build_event_persistence_callback
 from benchmarks.utils.critics import create_critic
 from benchmarks.utils.dataset import get_dataset
 from benchmarks.utils.evaluation import Evaluation
@@ -232,16 +233,19 @@ def evaluate_instance(
 
         assert isinstance(workspace, RemoteWorkspace)
 
-        def _log_event(ev):  # keep it simple
-            logger.debug("Event: %s", ev)
-
         repo_path = f"/workspace/{instance.data['repo'].split('/')[-1]}/"
         instance.data["repo_path"] = repo_path
 
+        persist_callback = build_event_persistence_callback(
+            run_id=self.metadata.eval_output_dir,
+            instance_id=instance.id,
+            attempt=self.current_attempt,
+        )
+
         conversation = Conversation(
             agent=agent,
             workspace=workspace,
-            callbacks=[_log_event],
+            callbacks=[persist_callback],
             max_iteration_per_run=self.metadata.max_iterations,
         )
 
@@ -351,6 +355,7 @@ def _log_event(ev):  # keep it simple
         # EvalOutput is your model; keep fields consistent with prior JSONL
         out = EvalOutput(
             instance_id=instance.id,
+            attempt=self.current_attempt,
             test_result={
                 "git_patch": git_patch,
             },

diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
@@ -6,6 +6,7 @@
 
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
+from benchmarks.utils.conversation import build_event_persistence_callback
 from benchmarks.utils.critics import create_critic
 from benchmarks.utils.dataset import get_dataset
 from benchmarks.utils.evaluation import Evaluation
@@ -260,16 +261,19 @@ def evaluate_instance(
 
         assert isinstance(workspace, RemoteWorkspace)
 
-        def _log_event(ev):  # keep it simple
-            logger.debug("Event: %s", ev)
-
         repo_path = f"/workspace/{instance.data['repo'].split('/')[-1]}/"
         instance.data["repo_path"] = repo_path
 
+        persist_callback = build_event_persistence_callback(
+            run_id=self.metadata.eval_output_dir,
+            instance_id=instance.id,
+            attempt=self.current_attempt,
+        )
+
         conversation = Conversation(
             agent=agent,
             workspace=workspace,
-            callbacks=[_log_event],
+            callbacks=[persist_callback],
             max_iteration_per_run=self.metadata.max_iterations,
         )
 
@@ -316,6 +320,7 @@ def _log_event(ev):  # keep it simple
 
         out = EvalOutput(
             instance_id=instance.id,
+            attempt=self.current_attempt,
             test_result={
                 "git_patch": git_patch,
             },