feat: add conversation id to HRI message (#480)

rachwalk · web-flow · commit 2a2c613c7920 · 2025-04-01T15:11:58.000+02:00
diff --git a/examples/s2s/conversational.py b/examples/s2s/conversational.py
@@ -35,15 +35,18 @@
 
 
 class LLMTextHandler(BaseCallbackHandler):
-    def __init__(self, connector: ROS2HRIConnector):
+    def __init__(self, connector: ROS2HRIConnector, speech_id: str = ""):
         self.connector = connector
         self.token_buffer = ""
+        self.speech_id = speech_id
 
     def on_llm_new_token(self, token: str, **kwargs):
         self.token_buffer += token
         if len(self.token_buffer) > 100 or token in [".", "?", "!", ",", ";", ":"]:
             logging.info(f"Sending token buffer: {self.token_buffer}")
-            self.connector.send_all_targets(AIMessage(content=self.token_buffer))
+            self.connector.send_all_targets(
+                AIMessage(content=self.token_buffer), self.speech_id
+            )
             self.token_buffer = ""
 
     def on_llm_end(
@@ -74,6 +77,7 @@ def __init__(self, connectors: Dict[str, BaseConnector]):  # type: ignore
         self._setup_ros_connector()
         self.main_thread = None
         self.stop_thread = Event()
+        self.current_speech_id = ""
 
     def run(self):
         logging.info("Running S2SConversationalAgent")
@@ -85,14 +89,24 @@ def _main_loop(self):
             time.sleep(0.01)
             speech = ""
             while not self.speech_queue.empty():
-                speech += "".join(self.speech_queue.get().text)
+                speech_message = self.speech_queue.get()
+                speech += "".join(speech_message.text)
                 logging.info(f"Received human speech {speech}!")
+                self.current_speech_id = speech_message.conversation_id
             if speech != "":
-                self.message_history.append(HumanMessage(content=speech))
+                self.message_history.append(
+                    HumanMessage(content=speech, conversation_id=self.current_speech_id)
+                )
                 assert isinstance(self.connectors["ros2"], ROS2HRIConnector)
                 ai_answer = self.llm.invoke(
                     self.message_history,
-                    config={"callbacks": [LLMTextHandler(self.connectors["ros2"])]},
+                    config={
+                        "callbacks": [
+                            LLMTextHandler(
+                                self.connectors["ros2"], self.current_speech_id
+                            )
+                        ]
+                    },
                 )
                 self.message_history.append(ai_answer)  # type: ignore
 
diff --git a/src/rai_asr/rai_asr/agents/asr_agent.py b/src/rai_asr/rai_asr/agents/asr_agent.py
@@ -274,5 +274,9 @@ def _send_ros2_message(self, data: str, topic: str):
             except Exception as e:
                 self.logger.error(f"Error sending message to {topic}: {e}")
         else:
-            msg = ROS2HRIMessage(HRIPayload(text=data), "human")
+            msg = ROS2HRIMessage(
+                HRIPayload(text=data),
+                "human",
+                ROS2HRIMessage.generate_conversation_id(),
+            )
             self.connectors["ros2_hri"].send_message(msg, topic)
diff --git a/src/rai_core/rai/agents/langchain/callback.py b/src/rai_core/rai/agents/langchain/callback.py
@@ -15,6 +15,7 @@
 import logging
 import threading
 from typing import List, Optional
+from uuid import UUID
 
 from langchain_core.callbacks import BaseCallbackHandler
 from langchain_core.messages import AIMessage
@@ -39,45 +40,61 @@ def __init__(
         self.max_buffer_size = max_buffer_size
         self._buffer_lock = threading.Lock()
         self.logger = logger or logging.getLogger(__name__)
+        self.current_conversation_id = None
+        self.current_chunk_id = 0
 
     def _should_split(self, token: str) -> bool:
         return token in self.splitting_chars
 
-    def _send_all_targets(self, tokens: str):
+    def _send_all_targets(self, tokens: str, done: bool = False):
         self.logger.info(
             f"Sending {len(tokens)} tokens to {len(self.connectors)} connectors"
         )
         for connector_name, connector in self.connectors.items():
             try:
-                connector.send_all_targets(AIMessage(content=tokens))
+                connector.send_all_targets(
+                    AIMessage(content=tokens),
+                    self.current_conversation_id,
+                    self.current_chunk_id,
+                    done,
+                )
                 self.logger.debug(f"Sent {len(tokens)} tokens to {connector_name}")
             except Exception as e:
                 self.logger.error(
                     f"Failed to send {len(tokens)} tokens to {connector_name}: {e}"
                 )
 
-    def on_llm_new_token(self, token: str, **kwargs):
+    def on_llm_new_token(self, token: str, *, run_id: UUID, **kwargs):
         if token == "":
             return
+        if self.current_conversation_id != str(run_id):
+            self.current_conversation_id = str(run_id)
+            self.current_chunk_id = 0
         if self.aggregate_chunks:
             with self._buffer_lock:
                 self.chunks_buffer += token
                 if len(self.chunks_buffer) < self.max_buffer_size:
                     if self._should_split(token):
                         self._send_all_targets(self.chunks_buffer)
                         self.chunks_buffer = ""
+                        self.current_chunk_id += 1
                 else:
                     self._send_all_targets(self.chunks_buffer)
                     self.chunks_buffer = ""
+                    self.current_chunk_id += 1
         else:
             self._send_all_targets(token)
+            self.current_chunk_id += 1
 
     def on_llm_end(
         self,
         response: LLMResult,
+        *,
+        run_id: UUID,
         **kwargs,
     ):
+        self.current_conversation_id = str(run_id)
         if self.aggregate_chunks and self.chunks_buffer:
             with self._buffer_lock:
-                self._send_all_targets(self.chunks_buffer)
+                self._send_all_targets(self.chunks_buffer, done=True)
                 self.chunks_buffer = ""
diff --git a/src/rai_core/rai/communication/hri_connector.py b/src/rai_core/rai/communication/hri_connector.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import base64
+import uuid
 from dataclasses import dataclass, field
 from io import BytesIO
 from typing import Any, Dict, Generic, Literal, Optional, Sequence, TypeVar, get_args
@@ -55,19 +56,25 @@ def __init__(
         payload: HRIPayload,
         metadata: Optional[Dict[str, Any]] = None,
         message_author: Literal["ai", "human"] = "ai",
+        communication_id: Optional[str] = None,
+        seq_no: int = 0,
+        seq_end: bool = False,
         **kwargs,
     ):
         super().__init__(payload, metadata)
         self.message_author = message_author
         self.text = payload.text
         self.images = payload.images
         self.audios = payload.audios
+        self.communication_id = communication_id
+        self.seq_no = seq_no
+        self.seq_end = seq_end
 
     def __bool__(self) -> bool:
         return bool(self.text or self.images or self.audios)
 
     def __repr__(self):
-        return f"HRIMessage(type={self.message_author}, text={self.text}, images={self.images}, audios={self.audios})"
+        return f"HRIMessage(type={self.message_author}, text={self.text}, images={self.images}, audios={self.audios}, communication_id={self.communication_id}, seq_no={self.seq_no}, seq_end={self.seq_end})"
 
     def _image_to_base64(self, image: ImageType) -> str:
         buffered = BytesIO()
@@ -115,6 +122,7 @@ def to_langchain(self) -> LangchainBaseMessage:
     def from_langchain(
         cls,
         message: LangchainBaseMessage | RAIMultimodalMessage,
+        communication_id: Optional[str] = None,
     ) -> "HRIMessage":
         if isinstance(message, RAIMultimodalMessage):
             text = message.text
@@ -137,8 +145,14 @@ def from_langchain(
                 ),
             ),
             message_author=message.type,  # type: ignore
+            communication_id=communication_id,
         )
 
+    @classmethod
+    def generate_communication_id(cls) -> str:
+        """Generate a unique communication ID."""
+        return str(uuid.uuid1())
+
 
 T = TypeVar("T", bound=HRIMessage)
 
@@ -167,12 +181,21 @@ def __init__(
     def _build_message(
         self,
         message: LangchainBaseMessage | RAIMultimodalMessage,
+        communication_id: Optional[str] = None,
+        seq_no: int = 0,
+        seq_end: bool = False,
     ) -> T:
-        return self.T_class.from_langchain(message)
+        return self.T_class.from_langchain(message, communication_id, seq_no, seq_end)
 
-    def send_all_targets(self, message: LangchainBaseMessage | RAIMultimodalMessage):
+    def send_all_targets(
+        self,
+        message: LangchainBaseMessage | RAIMultimodalMessage,
+        communication_id: Optional[str] = None,
+        seq_no: int = 0,
+        seq_end: bool = False,
+    ):
         for target in self.configured_targets:
-            to_send = self._build_message(message)
+            to_send = self._build_message(message, communication_id, seq_no, seq_end)
             self.send_message(to_send, target)
 
     def receive_all_sources(self, timeout_sec: float = 1.0) -> dict[str, T]:
diff --git a/src/rai_core/rai/communication/ros2/messages.py b/src/rai_core/rai/communication/ros2/messages.py
@@ -42,8 +42,15 @@ def __init__(self, payload: Any, metadata: Optional[Dict[str, Any]] = None):
 
 
 class ROS2HRIMessage(HRIMessage):
-    def __init__(self, payload: HRIPayload, message_author: Literal["ai", "human"]):
-        super().__init__(payload, {}, message_author)
+    def __init__(
+        self,
+        payload: HRIPayload,
+        message_author: Literal["ai", "human"],
+        communication_id: Optional[str] = None,
+        seq_no: int = 0,
+        seq_end: bool = False,
+    ):
+        super().__init__(payload, {}, message_author, communication_id, seq_no, seq_end)
 
     @classmethod
     def from_ros2(
@@ -66,9 +73,13 @@ def from_ros2(
             )
             for audio_msg in cast(List[ROS2HRIMessage__Audio], msg.audios)
         ]
+        communication_id = msg.communication_id if msg.communication_id != "" else None
         return ROS2HRIMessage(
             payload=HRIPayload(text=msg.text, images=pil_images, audios=audio_segments),
             message_author=message_author,
+            communication_id=communication_id,
+            seq_no=msg.seq_no,
+            seq_end=msg.seq_end,
         )
 
     def to_ros2_dict(self) -> OrderedDict[str, Any]:
@@ -94,6 +105,9 @@ def to_ros2_dict(self) -> OrderedDict[str, Any]:
                     text=self.payload.text,
                     images=img_msgs,
                     audios=audio_msgs,
+                    communication_id=self.communication_id or "",
+                    seq_no=self.seq_no,
+                    seq_end=self.seq_end,
                 )
             ),
         )
diff --git a/src/rai_interfaces/msg/HRIMessage.msg b/src/rai_interfaces/msg/HRIMessage.msg
@@ -18,3 +18,6 @@ std_msgs/Header header
 string text
 sensor_msgs/Image[] images
 rai_interfaces/AudioMessage[] audios
+string communication_id
+int64 seq_no
+bool seq_end
diff --git a/src/rai_tts/rai_tts/agents/tts_agent.py b/src/rai_tts/rai_tts/agents/tts_agent.py
@@ -74,6 +74,8 @@ class TextToSpeechAgent(BaseAgent):
         Text-to-speech model used for generating audio.
     logger : Optional[logging.Logger], optional
         Logger instance for logging messages, by default None.
+    max_speech_history : int, optional
+        Maximum number of speech ids to remember, by default 64.
     """
 
     def __init__(
@@ -82,6 +84,7 @@ def __init__(
         ros2_name: str,
         tts: TTSModel,
         logger: Optional[logging.Logger] = None,
+        max_speech_history=64,
     ):
         if logger is None:
             self.logger = logging.getLogger(__name__)
@@ -101,8 +104,10 @@ def __init__(
         super().__init__(connectors={"ros2": ros2_connector, "speaker": speaker})
 
         self.current_transcription_id = str(uuid4())[0:8]
+        self.current_speech_id = None
         self.text_queues: dict[str, Queue] = {self.current_transcription_id: Queue()}
         self.audio_queues: dict[str, Queue] = {self.current_transcription_id: Queue()}
+        self.remembered_speech_ids: list[str] = []
 
         self.tog_play_event = Event()
         self.stop_event = Event()
@@ -224,7 +229,17 @@ def _on_to_human_message(self, message: IROS2Message):
         self.logger.warning(
             f"Starting playback, current id: {self.current_transcription_id}"
         )
-        self.text_queues[self.current_transcription_id].put(msg.text)
+        if (
+            self.current_speech_id is None
+            and msg.conversation_id is not None
+            and msg.conversation_id not in self.remembered_speech_ids
+        ):
+            self.current_speech_id = msg.conversation_id
+            self.remembered_speech_ids.append(self.current_speech_id)
+            if len(self.remembered_speech_ids) > 64:
+                self.remembered_speech_ids.pop(0)
+        if self.current_speech_id == msg.conversation_id:
+            self.text_queues[self.current_transcription_id].put(msg.text)
         self.playback_data.playing = True
 
     def _on_command_message(self, message: IROS2Message):
@@ -237,6 +252,7 @@ def _on_command_message(self, message: IROS2Message):
         elif message.data == "pause":
             self.playback_data.playing = False
         elif message.data == "stop":
+            self.current_speech_id = None
             self.playback_data.playing = False
             previous_id = self.current_transcription_id
             self.logger.warning(f"Stopping playback, previous id: {previous_id}")
diff --git a/tests/communication/ros2/helpers.py b/tests/communication/ros2/helpers.py
@@ -180,12 +180,11 @@ def goal_response_callback(self, future):
 
     def get_result_callback(self, future):
         result = future.result().result
-        self.get_logger().info(f"Result: {result.sequence}")
-        rclpy.shutdown()
+        self.get_logger().info(f"Result: {result}")
 
     def feedback_callback(self, feedback_msg):
         feedback = feedback_msg.feedback
-        self.get_logger().info(f"Received feedback: {feedback.partial_sequence}")
+        self.get_logger().info(f"Received feedback: {feedback}")
 
 
 class TestServiceClient(Node):
diff --git a/tests/communication/ros2/test_connectors.py b/tests/communication/ros2/test_connectors.py
@@ -184,7 +184,9 @@ def test_ros2hri_default_message_publish(
         audios = [AudioSegment.silent(duration=1000)]
         text = "Hello, HRI!"
         payload = HRIPayload(images=images, audios=audios, text=text)
-        message = ROS2HRIMessage(payload=payload, message_author="ai")
+        message = ROS2HRIMessage(
+            payload=payload, message_author="ai", communication_id=""
+        )
         connector.send_message(message, target=topic_name)
         time.sleep(1)  # wait for the message to be received
 
@@ -231,13 +233,11 @@ def test_ros2ari_connector_create_service(
         service_client = TestServiceClient()
         executors, threads = multi_threaded_spinner([service_client])
         service_client.send_request()
-        time.sleep(0.01)
+        time.sleep(0.02)
         assert mock_callback.called
-    except Exception as e:
-        raise e
-
-    connector.shutdown()
-    shutdown_executors_and_threads(executors, threads)
+    finally:
+        connector.shutdown()
+        shutdown_executors_and_threads(executors, threads)
 
 
 def test_ros2ari_connector_action_call(ros_setup: None, request: pytest.FixtureRequest):
@@ -256,7 +256,7 @@ def test_ros2ari_connector_action_call(ros_setup: None, request: pytest.FixtureR
         action_client = TestActionClient()
         executors, threads = multi_threaded_spinner([action_client])
         action_client.send_goal()
-        time.sleep(0.01)
+        time.sleep(0.02)
+        assert mock_callback.called
     finally:
         shutdown_executors_and_threads(executors, threads)
-        assert mock_callback.called
diff --git a/tests/communication/test_hri_message.py b/tests/communication/test_hri_message.py