feat: add configurable voice agent basic version

rachwalk · rachwalk · commit 9d089885ceda · 2025-01-09T16:36:01.000+01:00
diff --git a/src/rai/rai/agents/__init__.py b/src/rai/rai/agents/__init__.py
@@ -15,9 +15,11 @@
 from rai.agents.conversational_agent import create_conversational_agent
 from rai.agents.state_based import create_state_based_agent
 from rai.agents.tool_runner import ToolRunner
+from rai.agents.voice_agent import VoiceRecognitionAgent
 
 __all__ = [
     "ToolRunner",
     "create_conversational_agent",
     "create_state_based_agent",
+    "VoiceRecognitionAgent",
 ]
diff --git a/src/rai/rai/agents/base.py b/src/rai/rai/agents/base.py
@@ -0,0 +1,46 @@
+# Copyright (C) 2024 Robotec.AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from abc import ABC, abstractmethod
+from typing import Optional
+
+from rai.communication import BaseConnector
+
+
+class BaseAgent(ABC):
+    """Abstract base for agents that hold a mapping of named connectors."""
+
+    def __init__(
+        self, connectors: Optional[dict[str, BaseConnector]] = None, *args, **kwargs
+    ):
+        """Store the connectors, keyed by name.
+
+        Args:
+            connectors: Connectors available to the agent; ``None`` is
+                replaced by a new empty dict.
+            *args: Accepted but not used here.
+            **kwargs: Accepted but not used here.
+        """
+        self.connectors: dict[str, BaseConnector] = (
+            {} if connectors is None else connectors
+        )
+
+    @abstractmethod
+    def setup(self, *args, **kwargs):
+        """Prepare the agent for running; implemented by subclasses."""
+
+    @abstractmethod
+    def run(self, *args, **kwargs):
+        """Run the agent; implemented by subclasses."""
diff --git a/src/rai/rai/agents/voice_agent.py b/src/rai/rai/agents/voice_agent.py
@@ -0,0 +1,159 @@
+# Copyright (C) 2024 Robotec.AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+from threading import Lock, Thread
+from typing import Any, List, Tuple
+
+import numpy as np
+from numpy.typing import NDArray
+
+from rai.agents.base import BaseAgent
+from rai.communication import AudioInputDeviceConfig, StreamingAudioInputDevice
+from rai_asr.models.base import BaseVoiceDetectionModel
+
+logger = logging.getLogger(__name__)
+
+
+class VoiceRecognitionAgent(BaseAgent):
+    """Agent that buffers microphone audio selected by voice-detection models.
+
+    Audio chunks arrive through the ``"microphone"`` connector. Buffering
+    starts when every model in ``should_record_pipeline`` reports a
+    detection and continues until no model in ``should_stop_pipeline``
+    reports one.
+    """
+
+    def __init__(self):
+        super().__init__(connectors={"microphone": StreamingAudioInputDevice()})
+        # Every model here must report a detection for recording to start.
+        self.should_record_pipeline: List[BaseVoiceDetectionModel] = []
+        # A detection by any model here keeps an ongoing recording alive.
+        self.should_stop_pipeline: List[BaseVoiceDetectionModel] = []
+        # Guards shared_samples, which on_new_sample() fills and
+        # _transcription_function() drains on transcription_thread.
+        self.transcription_lock = Lock()
+        self.shared_samples = []
+        self.recording_started = False
+        self.ran_setup = False
+
+    def __call__(self):
+        """Start the agent; same as calling run()."""
+        self.run()
+
+    def setup(
+        self, microphone_device_id: int, microphone_config: AudioInputDeviceConfig
+    ):
+        """Configure the microphone connector; call this before run().
+
+        Args:
+            microphone_device_id: Device identifier; converted to ``str``
+                and used as the connector target.
+            microphone_config: Configuration passed to the connector's
+                ``configure_device``.
+        """
+        assert isinstance(self.connectors["microphone"], StreamingAudioInputDevice)
+        self.microphone_device_id = str(microphone_device_id)
+        self.connectors["microphone"].configure_device(
+            target=self.microphone_device_id, config=microphone_config
+        )
+        self.ran_setup = True
+
+    def run(self):
+        """Start listening on the microphone and launch the transcription thread.
+
+        Raises:
+            RuntimeError: If setup() has not been called.
+        """
+        if not self.ran_setup:
+            # Without setup() there is no microphone_device_id to listen on.
+            raise RuntimeError(
+                "VoiceRecognitionAgent.setup() must be called before run()"
+            )
+        self.listener_handle = self.connectors["microphone"].start_action(
+            self.microphone_device_id, self.on_new_sample
+        )
+        self.transcription_thread = Thread(target=self._transcription_function)
+        self.transcription_thread.start()
+
+    def stop(self):
+        """Terminate the microphone listener and join the transcription thread."""
+        self.connectors["microphone"].terminate_action(self.listener_handle)
+        self.transcription_thread.join()
+
+    def on_new_sample(self, indata: np.ndarray, status_flags: dict[str, Any]):
+        """Handle one audio chunk delivered by the microphone connector.
+
+        Args:
+            indata: Samples of the chunk.
+            status_flags: Passed by the connector; not used here.
+        """
+        should_stop, should_cancel = self.should_stop_recording(indata)
+        if should_cancel:
+            # NOTE(review): cancel_task() is not defined in this class, and
+            # should_stop_recording() never requests a cancel yet, so this
+            # branch is currently unreachable.
+            self.cancel_task()
+        if self.recording_started and should_stop:
+            self.recording_started = False
+        # While recording, keep buffering without re-running the start
+        # pipeline; otherwise buffer only when the start pipeline fires.
+        if self.recording_started or self.should_start_recording(indata):
+            self.recording_started = True
+            with self.transcription_lock:
+                self.shared_samples.extend(indata)
+
+    def should_start_recording(self, audio_data: NDArray[np.int16]) -> bool:
+        """Return True when every model in should_record_pipeline detects.
+
+        Each model receives the output parameters of the previous one. An
+        empty pipeline returns True.
+        """
+        output_parameters = {}
+        for model in self.should_record_pipeline:
+            should_listen, output_parameters = model.detected(
+                audio_data, output_parameters
+            )
+            if not should_listen:
+                return False
+        return True
+
+    def should_stop_recording(self, audio_data: NDArray[np.int16]) -> Tuple[bool, bool]:
+        """Return ``(should_stop, should_cancel)`` for an audio chunk.
+
+        ``should_stop`` is False as soon as one model in should_stop_pipeline
+        detects, and True otherwise (including for an empty pipeline).
+        ``should_cancel`` is always False for now.
+        """
+        output_parameters = {}
+        for model in self.should_stop_pipeline:
+            should_listen, output_parameters = model.detected(
+                audio_data, output_parameters
+            )
+            # TODO: Add handling output parameters for checking if should cancel
+            if should_listen:
+                return False, False
+        return True, False
+
+    def _transcription_function(self):
+        """Take the buffered samples once and clear the buffer.
+
+        Runs on transcription_thread.
+        """
+        with self.transcription_lock:
+            samples = np.array(self.shared_samples)
+            self.shared_samples = []
+        # Log after releasing the lock so on_new_sample() is not blocked.
+        logger.debug("Buffered samples: %s", samples)
diff --git a/src/rai/rai/communication/__init__.py b/src/rai/rai/communication/__init__.py
@@ -13,11 +13,16 @@
 # limitations under the License.
 
 from .base_connector import BaseConnector, BaseMessage
-from .sound_device_connector import SoundDeviceError, StreamingAudioInputDevice
+from .sound_device_connector import (
+    AudioInputDeviceConfig,
+    SoundDeviceError,
+    StreamingAudioInputDevice,
+)
 
 __all__ = [
     "BaseMessage",
     "BaseConnector",
     "StreamingAudioInputDevice",
     "SoundDeviceError",
+    "AudioInputDeviceConfig",
 ]
diff --git a/src/rai/rai/communication/sound_device_connector.py b/src/rai/rai/communication/sound_device_connector.py
@@ -99,7 +99,7 @@ def start_action(
         self,
         target: str,
         on_feedback: Callable[[np.ndarray, dict[str, Any]], None],
-        on_finish: Callable = lambda _: None,
+        on_finish: Callable = lambda: None,
     ) -> str:
 
         target_device = self.configred_devices.get(target)
diff --git a/src/rai_asr/rai_asr/models/__init__.py b/src/rai_asr/rai_asr/models/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2024 Robotec.AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .base import BaseVoiceDetectionModel
+
+__all__ = ["BaseVoiceDetectionModel"]
diff --git a/src/rai_asr/rai_asr/models/base.py b/src/rai_asr/rai_asr/models/base.py
@@ -0,0 +1,40 @@
+# Copyright (C) 2024 Robotec.AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from abc import ABC, abstractmethod
+from typing import Any, Tuple
+
+from numpy.typing import NDArray
+
+
+class BaseVoiceDetectionModel(ABC):
+    """Interface for models used in the voice agent's detection pipelines."""
+
+    @abstractmethod
+    def detected(
+        self, audio_data: NDArray, input_parameters: dict[str, Any]
+    ) -> Tuple[bool, dict[str, Any]]:
+        """Report whether this model detects its target in an audio chunk.
+
+        Args:
+            audio_data: Audio samples to analyse.
+            input_parameters: Parameters produced by the models that ran
+                earlier in the same pipeline.
+
+        Returns:
+            A ``(detected, output_parameters)`` pair; the pipeline passes
+            ``output_parameters`` to the next model as ``input_parameters``.
+        """
+        pass