refactor: reset values before every validate call in validators

jmatejcz · jmatejcz · commit 160ec4ee6c7f · 2025-04-30T12:50:51.000+02:00
added unit test for that case
diff --git a/src/rai_bench/rai_bench/tool_calling_agent/interfaces.py b/src/rai_bench/rai_bench/tool_calling_agent/interfaces.py
@@ -410,15 +410,26 @@ def add_subtask_errors(self, idx: int, msgs: List[str]):
 
     def reset(self):
         """
-        reset all values refering previous validation
-        before next validation
+        Reset all values referring to the previous validation.
+        Use it before the next validation.
         """
         self.subtasks_errors = [[] for _ in range(len(self.subtasks))]
-        self.subtasks_passed: List[bool] = [False for _ in range(len(self.subtasks))]
+        self.subtasks_passed = [False for _ in range(len(self.subtasks))]
         self.extra_calls_used = 0
         self.passed = None
 
     def dump_results(self) -> ValidatorResult:
+        """Get results for last validate() call
+
+        Returns
+        -------
+        ValidatorResult
+
+        Raises
+        ------
+        ValueError
+            When called before validate()
+        """
         if self.passed is None:
             raise ValueError("Run validator validation before dumping results")
         subtasks_results: List[SubTaskResult] = []
@@ -436,7 +447,6 @@ def dump_results(self) -> ValidatorResult:
             extra_tool_calls_used=self.extra_calls_used,
             passed=self.passed,
         )
-        self.reset()
         return result
 
     @abstractmethod
diff --git a/src/rai_bench/rai_bench/tool_calling_agent/validators.py b/src/rai_bench/rai_bench/tool_calling_agent/validators.py
@@ -44,6 +44,7 @@ def type(self) -> str:
         return "ordered"
 
     def validate(self, tool_calls: List[ToolCall]) -> Tuple[bool, List[ToolCall]]:
+        self.reset()
         # Before validation create new iterator, in case validator
         # was used before in other task
         subtask_iter = iter(enumerate(self.subtasks))
@@ -93,6 +94,7 @@ def type(self) -> str:
         return "not ordered"
 
     def validate(self, tool_calls: List[ToolCall]) -> Tuple[bool, List[ToolCall]]:
+        self.reset()
         if len(tool_calls) < 1:
             self.logger.error("Not a single tool call to validate")
             self.passed = False
diff --git a/tests/rai_bench/tool_calling_agent/test_validators.py b/tests/rai_bench/tool_calling_agent/test_validators.py
@@ -443,6 +443,43 @@ def test_validate_extra_calls_when_subtask_eventually_passes(self):
             expected_errors_counts=[0, 5],
         )
 
+    def test_validate_reset(self):
+        subtasks = [
+            DummySubTask("task1"),
+            DummySubTask("task2", outcomes=10 * [False]),
+        ]
+        validator = OrderedCallsValidator(subtasks=subtasks)
+
+        tool_calls = [
+            ToolCall(name="tool1"),
+            ToolCall(name="tool2"),
+            ToolCall(name="tool2"),
+            ToolCall(name="tool2"),
+            ToolCall(name="tool2"),
+            ToolCall(name="tool2"),
+        ]
+        # validate twice: the second call must start from a reset state
+        validator.validate(tool_calls=tool_calls)
+        success, remaining = validator.validate(tool_calls=tool_calls)
+
+        assert not success
+        assert remaining == []
+        assert validator.subtasks_passed[0] is True
+        assert validator.subtasks_passed[1] is False
+        assert len(validator.subtasks_errors[1]) == 5
+        assert "error in task2" in validator.subtasks_errors[1][0]
+        assert validator.passed is False
+        assert validator.extra_calls_used == 4
+
+        assert_dumped(
+            validator,
+            expected_type="ordered",
+            expected_passed=False,
+            expected_extra_calls=4,
+            expected_subtasks_passed=[True, False],
+            expected_errors_counts=[0, 5],
+        )
+
 
 class TestNotOrderedCallsValidator:
     def test_init_with_empty_subtasks(self):
@@ -608,3 +645,32 @@ def test_validate_all_subtasks_fail(self):
             expected_subtasks_passed=[False, False],
             expected_errors_counts=[2, 2],
         )
+
+    def test_validate_reset(self):
+        subtasks = [
+            DummySubTask("task1", outcomes=4 * [False]),
+            DummySubTask("task2", outcomes=4 * [False]),
+        ]
+        validator = NotOrderedCallsValidator(subtasks=subtasks)
+        tool_calls = [ToolCall(), ToolCall()]
+
+        # validate twice: the second call must start from a reset state
+        validator.validate(tool_calls=tool_calls)
+        success, remaining = validator.validate(tool_calls=tool_calls)
+
+        assert not success
+        assert remaining == []
+        assert all(not passed for passed in validator.subtasks_passed)
+        assert len(validator.subtasks_errors[0]) == 2
+        assert len(validator.subtasks_errors[1]) == 2
+        assert validator.passed is False
+        assert validator.extra_calls_used == 0
+
+        assert_dumped(
+            validator,
+            expected_type="not ordered",
+            expected_passed=False,
+            expected_extra_calls=0,
+            expected_subtasks_passed=[False, False],
+            expected_errors_counts=[2, 2],
+        )