RobotecAI
diff --git a/‎src/rai_bench/rai_bench/base_benchmark.py
Lines changed: 125 additions & 0 deletions b/‎src/rai_bench/rai_bench/base_benchmark.py
Lines changed: 125 additions & 0 deletions
diff --git a/‎src/rai_bench/rai_bench/examples/manipulation_o3de/main.py
Lines changed: 19 additions & 32 deletions b/‎src/rai_bench/rai_bench/examples/manipulation_o3de/main.py
Lines changed: 19 additions & 32 deletions
diff --git a/‎src/rai_bench/rai_bench/examples/manipulation_o3de/scenarios.py
Lines changed: 16 additions & 16 deletions b/‎src/rai_bench/rai_bench/examples/manipulation_o3de/scenarios.py
Lines changed: 16 additions & 16 deletions
diff --git a/‎src/rai_bench/rai_bench/examples/test_models.py
Lines changed: 1 addition & 1 deletion b/‎src/rai_bench/rai_bench/examples/test_models.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/rai_bench/rai_bench/examples/tool_calling_agent/main.py
Lines changed: 5 additions & 3 deletions b/‎src/rai_bench/rai_bench/examples/tool_calling_agent/main.py
Lines changed: 5 additions & 3 deletions
@@ -0,0 +1,125 @@
+# Copyright (C) 2025 Robotec.AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import logging
+from abc import ABC, abstractmethod
+from pathlib import Path
+
+from langgraph.graph.state import CompiledStateGraph
+from pydantic import BaseModel, Field
+
+
+class BenchmarkSummary(BaseModel):
+    """Aggregated results of a benchmark run for a single model.
+
+    All fields are required. The declaration order is kept unchanged because
+    the CSV helpers in this module take their column order from the model's
+    field order.
+    """
+
+    model_name: str = Field(default=..., description="Name of the LLM.")
+    success_rate: float = Field(
+        default=...,
+        description="Percentage of successfully completed tasks.",
+    )
+    avg_time: float = Field(
+        default=...,
+        description="Average time taken across all tasks.",
+    )
+    # NOTE(review): the description below says "in this Task" although this is
+    # a summary model - presumably the total spans all tasks; confirm.
+    total_extra_tool_calls_used: int = Field(
+        default=...,
+        description="Total number of extra tool calls used in this Task",
+    )
+    total_tasks: int = Field(
+        default=...,
+        description="Total number of executed tasks.",
+    )
+
+
+class BaseBenchmark(ABC):
+    """Abstract base class shared by all benchmarks.
+
+    Stores the model name, the locations of the result files and a logger,
+    and provides static helpers for writing Pydantic models to CSV files.
+    Subclasses must implement ``run_next`` and ``compute_and_save_summary``.
+    """
+
+    def __init__(
+        self,
+        model_name: str,
+        results_dir: Path,
+        logger: logging.Logger | None = None,
+    ) -> None:
+        """Initialize the base benchmark.
+
+        Parameters
+        ----------
+        model_name : str
+            Name of the LLM model.
+        results_dir : Path
+            Directory holding the output files. Detailed results go to
+            ``results.csv`` and the summary to ``results_summary.csv``
+            inside this directory.
+        logger : logging.Logger | None
+            Logger instance. When omitted, ``logging.getLogger(__name__)``
+            is used.
+        """
+        self.model_name = model_name
+        self.results_filename = results_dir / "results.csv"
+        self.summary_filename = results_dir / "results_summary.csv"
+        # Test against None explicitly so that a supplied logger is never
+        # discarded just because the object happens to evaluate as falsy.
+        self.logger = logger if logger is not None else logging.getLogger(__name__)
+
+    @staticmethod
+    def csv_initialize(filename: Path, base_model_cls: type[BaseModel]) -> None:
+        """Create (or truncate) a CSV file and write its header row.
+
+        The columns are the fields of ``base_model_cls`` in declaration
+        order, including fields inherited from parent models.
+
+        Parameters
+        ----------
+        filename : Path
+            Filename of the CSV file.
+        base_model_cls : type[BaseModel]
+            Pydantic model class to be used for creating the columns in the CSV file.
+        """
+        # Create the parent directory if it is missing so that a fresh results
+        # directory does not make open() fail.
+        Path(filename).parent.mkdir(parents=True, exist_ok=True)
+
+        # ``model_fields`` is used instead of ``__annotations__`` because the
+        # latter only lists annotations declared directly on the class and
+        # therefore misses inherited fields.
+        fieldnames = list(base_model_cls.model_fields)
+        with open(filename, mode="w", newline="", encoding="utf-8") as file:
+            csv.DictWriter(file, fieldnames=fieldnames).writeheader()
+
+    @staticmethod
+    def csv_writerow(filename: Path, base_model_instance: BaseModel) -> None:
+        """Append a single row to a CSV file from a Pydantic model instance.
+
+        String values are normalized to one line: every run of whitespace
+        (newlines included) is collapsed into a single space and leading and
+        trailing whitespace is removed.
+
+        Parameters
+        ----------
+        filename : Path
+            Filename of the CSV file.
+        base_model_instance : BaseModel
+            Pydantic model instance which contains the data to be written to the CSV file.
+        """
+        row = {
+            key: " ".join(value.split()) if isinstance(value, str) else value
+            for key, value in base_model_instance.model_dump().items()
+        }
+
+        # Take the columns from the model class (not ``__annotations__``) so
+        # that inherited fields returned by ``model_dump`` have a matching
+        # column; otherwise ``DictWriter.writerow`` raises ``ValueError``.
+        fieldnames = list(type(base_model_instance).model_fields)
+        with open(filename, mode="a", newline="", encoding="utf-8") as file:
+            csv.DictWriter(file, fieldnames=fieldnames).writerow(row)
+
+    @abstractmethod
+    def run_next(self, agent: CompiledStateGraph) -> None:
+        """Run the next task/scenario of the benchmark.
+
+        Parameters
+        ----------
+        agent : CompiledStateGraph
+            LangChain tool calling agent.
+        """
+
+    @abstractmethod
+    def compute_and_save_summary(self) -> None:
+        """Compute summary statistics and save them to the summary file."""
+
+    # TODO (jm): this can probably be the same for all benchmarks in the future
@@ -33,13 +33,9 @@
 from rai_open_set_vision.tools import GetGrabbingPointTool
 
 from rai_bench.examples.manipulation_o3de.scenarios import (
-    easy_scenarios,
-    hard_scenarios,
-    medium_scenarios,
     trivial_scenarios,
-    very_hard_scenarios,
 )
-from rai_bench.manipulation_o3de.benchmark import Benchmark
+from rai_bench.manipulation_o3de.benchmark import ManipulationO3DEBenchmark
 from rai_sim.o3de.o3de_bridge import (
     O3DEngineArmManipulationBridge,
 )
@@ -72,17 +68,8 @@ def run_benchmark(model_name: str, vendor: str, out_dir: str):
     node.declare_parameter("conversion_ratio", 1.0)
 
     # define model
-
     llm = get_llm_model_direct(model_name=model_name, vendor=vendor)
 
-    system_prompt = """
-    You are a robotic arm with interfaces to detect and manipulate objects.
-    Here are the coordinates information:
-    x - front to back (positive is forward)
-    y - left to right (positive is right)
-    z - up to down (positive is up)
-    Before starting the task, make sure to grab the camera image to understand the environment.
-    """
     # define tools
     tools: List[BaseTool] = [
         GetObjectPositionsTool(
@@ -165,33 +152,33 @@ def run_benchmark(model_name: str, vendor: str, out_dir: str):
     t_scenarios = trivial_scenarios(
         configs_dir=configs_dir, connector_path=connector_path, logger=bench_logger
     )
-    e_scenarios = easy_scenarios(
-        configs_dir=configs_dir, connector_path=connector_path, logger=bench_logger
-    )
-    m_scenarios = medium_scenarios(
-        configs_dir=configs_dir, connector_path=connector_path, logger=bench_logger
-    )
-    h_scenarios = hard_scenarios(
-        configs_dir=configs_dir, connector_path=connector_path, logger=bench_logger
-    )
-    vh_scenarios = very_hard_scenarios(
-        configs_dir=configs_dir, connector_path=connector_path, logger=bench_logger
-    )
+    # e_scenarios = easy_scenarios(
+    #     configs_dir=configs_dir, connector_path=connector_path, logger=bench_logger
+    # )
+    # m_scenarios = medium_scenarios(
+    #     configs_dir=configs_dir, connector_path=connector_path, logger=bench_logger
+    # )
+    # h_scenarios = hard_scenarios(
+    #     configs_dir=configs_dir, connector_path=connector_path, logger=bench_logger
+    # )
+    # vh_scenarios = very_hard_scenarios(
+    #     configs_dir=configs_dir, connector_path=connector_path, logger=bench_logger
+    # )
 
-    all_scenarios = t_scenarios + e_scenarios + m_scenarios + h_scenarios + vh_scenarios
+    all_scenarios = t_scenarios
     o3de = O3DEngineArmManipulationBridge(connector, logger=agent_logger)
     try:
         # define benchamrk
-        results_filename = f"{out_dir}/results.csv"
-        benchmark = Benchmark(
+        benchmark = ManipulationO3DEBenchmark(
+            model_name=model_name,
             simulation_bridge=o3de,
             scenarios=all_scenarios,
             logger=bench_logger,
-            results_filename=results_filename,
+            results_dir=Path(out_dir),
         )
-        for _ in range(len(all_scenarios)):
+        for scenario in all_scenarios:
             agent = create_conversational_agent(
-                llm, tools, system_prompt, logger=agent_logger
+                llm, tools, scenario.task.system_prompt, logger=agent_logger
             )
             benchmark.run_next(agent=agent)
             o3de.reset_arm()
 
@@ -18,7 +18,7 @@
 
 from rclpy.impl.rcutils_logger import RcutilsLogger
 
-from rai_bench.manipulation_o3de.benchmark import Benchmark, Scenario
+from rai_bench.manipulation_o3de.benchmark import ManipulationO3DEBenchmark, Scenario
 from rai_bench.manipulation_o3de.interfaces import Task
 from rai_bench.manipulation_o3de.tasks import (
     BuildCubeTowerTask,
@@ -86,7 +86,7 @@ def trivial_scenarios(
                 place_object_tasks.append(
                     PlaceObjectAtCoordTask(obj, coord, disp, logger=logger)
                 )
-    easy_place_objects_scenarios = Benchmark.create_scenarios(
+    easy_place_objects_scenarios = ManipulationO3DEBenchmark.create_scenarios(
         tasks=place_object_tasks,
         simulation_configs=simulations_configs,
         simulation_configs_paths=simulation_configs_paths,
@@ -99,7 +99,7 @@ def trivial_scenarios(
         for objects in object_groups
     ]
 
-    easy_move_to_left_scenarios = Benchmark.create_scenarios(
+    easy_move_to_left_scenarios = ManipulationO3DEBenchmark.create_scenarios(
         tasks=move_to_left_tasks,
         simulation_configs=simulations_configs,
         simulation_configs_paths=simulation_configs_paths,
@@ -168,7 +168,7 @@ def easy_scenarios(
                 place_object_tasks.append(
                     PlaceObjectAtCoordTask(obj, coord, disp, logger=logger)
                 )
-    easy_place_objects_scenarios = Benchmark.create_scenarios(
+    easy_place_objects_scenarios = ManipulationO3DEBenchmark.create_scenarios(
         tasks=place_object_tasks,
         simulation_configs=simulations_configs,
         simulation_configs_paths=simulation_configs_paths,
@@ -188,15 +188,15 @@ def easy_scenarios(
         for objects in object_groups
     ]
 
-    easy_move_to_left_scenarios = Benchmark.create_scenarios(
+    easy_move_to_left_scenarios = ManipulationO3DEBenchmark.create_scenarios(
         tasks=move_to_left_tasks,
         simulation_configs=simulations_configs,
         simulation_configs_paths=simulation_configs_paths,
     )
 
     # place cubes
     task = PlaceCubesTask(threshold_distance=0.2, logger=logger)
-    easy_place_cubes_scenarios = Benchmark.create_scenarios(
+    easy_place_cubes_scenarios = ManipulationO3DEBenchmark.create_scenarios(
         tasks=[task],
         simulation_configs=simulations_configs,
         simulation_configs_paths=simulation_configs_paths,
@@ -284,7 +284,7 @@ def medium_scenarios(
         for objects in object_groups
     ]
 
-    move_to_left_scenarios = Benchmark.create_scenarios(
+    move_to_left_scenarios = ManipulationO3DEBenchmark.create_scenarios(
         tasks=move_to_left_tasks,
         simulation_configs=medium_simulations_configs,
         simulation_configs_paths=medium_simulation_configs_paths,
@@ -293,7 +293,7 @@ def medium_scenarios(
 
     # place cubes
     task = PlaceCubesTask(threshold_distance=0.1, logger=logger)
-    easy_place_cubes_scenarios = Benchmark.create_scenarios(
+    easy_place_cubes_scenarios = ManipulationO3DEBenchmark.create_scenarios(
         tasks=[task],
         simulation_configs=medium_simulations_configs,
         simulation_configs_paths=medium_simulation_configs_paths,
@@ -310,7 +310,7 @@ def medium_scenarios(
         for objects in object_groups
     ]
 
-    build_tower_scenarios = Benchmark.create_scenarios(
+    build_tower_scenarios = ManipulationO3DEBenchmark.create_scenarios(
         tasks=build_tower_tasks,
         simulation_configs=easy_simulations_configs,
         simulation_configs_paths=easy_simulation_configs_paths,
@@ -330,7 +330,7 @@ def medium_scenarios(
         GroupObjectsTask(obj_types=objects, logger=logger) for objects in object_groups
     ]
 
-    group_object_scenarios = Benchmark.create_scenarios(
+    group_object_scenarios = ManipulationO3DEBenchmark.create_scenarios(
         tasks=group_object_tasks,
         simulation_configs=easy_simulations_configs,
         simulation_configs_paths=easy_simulation_configs_paths,
@@ -418,15 +418,15 @@ def hard_scenarios(
         for objects in object_groups
     ]
 
-    move_to_left_scenarios = Benchmark.create_scenarios(
+    move_to_left_scenarios = ManipulationO3DEBenchmark.create_scenarios(
         tasks=move_to_left_tasks,
         simulation_configs=hard_simulations_configs,
         simulation_configs_paths=hard_simulation_configs_paths,
     )
 
     # place cubes
     task = PlaceCubesTask(threshold_distance=0.1, logger=logger)
-    easy_place_cubes_scenarios = Benchmark.create_scenarios(
+    easy_place_cubes_scenarios = ManipulationO3DEBenchmark.create_scenarios(
         tasks=[task],
         simulation_configs=hard_simulations_configs,
         simulation_configs_paths=hard_simulation_configs_paths,
@@ -442,7 +442,7 @@ def hard_scenarios(
         for objects in object_groups
     ]
 
-    build_tower_scenarios = Benchmark.create_scenarios(
+    build_tower_scenarios = ManipulationO3DEBenchmark.create_scenarios(
         tasks=build_tower_tasks,
         simulation_configs=medium_simulations_configs,
         simulation_configs_paths=medium_simulation_configs_paths,
@@ -464,7 +464,7 @@ def hard_scenarios(
         GroupObjectsTask(obj_types=objects, logger=logger) for objects in object_groups
     ]
 
-    group_object_scenarios = Benchmark.create_scenarios(
+    group_object_scenarios = ManipulationO3DEBenchmark.create_scenarios(
         tasks=group_object_tasks,
         simulation_configs=medium_simulations_configs,
         simulation_configs_paths=medium_simulation_configs_paths,
@@ -534,7 +534,7 @@ def very_hard_scenarios(
         for objects in object_groups
     ]
 
-    build_tower_scenarios = Benchmark.create_scenarios(
+    build_tower_scenarios = ManipulationO3DEBenchmark.create_scenarios(
         tasks=build_tower_tasks,
         simulation_configs=hard_simulations_configs,
         simulation_configs_paths=hard_simulation_configs_paths,
@@ -555,7 +555,7 @@ def very_hard_scenarios(
         GroupObjectsTask(obj_types=objects, logger=logger) for objects in object_groups
     ]
 
-    group_object_scenarios = Benchmark.create_scenarios(
+    group_object_scenarios = ManipulationO3DEBenchmark.create_scenarios(
         tasks=group_object_tasks,
         simulation_configs=hard_simulations_configs,
         simulation_configs_paths=hard_simulation_configs_paths,
 
@@ -19,7 +19,7 @@
 if __name__ == "__main__":
     models_name = ["llama3.2", "qwen2.5:7b"]
     vendors = ["ollama", "ollama"]
-    benchmarks = ["manipulation_o3de"]
+    benchmarks = ["tool_calling_agent", "manipulation_o3de"]
     extra_tool_calls = [5]
     repeats = 1
 
 
@@ -54,7 +54,6 @@ def run_benchmark(model_name: str, vendor: str, out_dir: str, extra_tool_calls:
     experiment_dir = Path(out_dir)
     experiment_dir.mkdir(parents=True, exist_ok=True)
     log_filename = experiment_dir / "benchmark.log"
-    results_filename = experiment_dir / "results.csv"
 
     file_handler = logging.FileHandler(log_filename)
     file_handler.setLevel(logging.DEBUG)
@@ -76,7 +75,10 @@ def run_benchmark(model_name: str, vendor: str, out_dir: str, extra_tool_calls:
         task.set_logger(bench_logger)
 
     benchmark = ToolCallingAgentBenchmark(
-        tasks=all_tasks, logger=bench_logger, results_filename=results_filename
+        tasks=all_tasks,
+        logger=bench_logger,
+        model_name=model_name,
+        results_dir=experiment_dir,
     )
 
     llm = get_llm_model_direct(model_name=model_name, vendor=vendor)
@@ -87,7 +89,7 @@ def run_benchmark(model_name: str, vendor: str, out_dir: str, extra_tool_calls:
             system_prompt=task.get_system_prompt(),
             logger=agent_logger,
         )
-        benchmark.run_next(agent=agent, model_name=model_name)
+        benchmark.run_next(agent=agent)
 
 
 if __name__ == "__main__":