feat: add open wake word model

rachwalk · rachwalk · commit 47fb148f1bf2 · 2025-01-09T16:36:01.000+01:00
diff --git a/src/rai_asr/rai_asr/models/__init__.py b/src/rai_asr/rai_asr/models/__init__.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .base import BaseVoiceDetectionModel
+from rai_asr.models.base import BaseVoiceDetectionModel
+from rai_asr.models.open_wake_word import OpenWakeWord
+from rai_asr.models.silero_vad import SileroVAD
 
-__all__ = ["BaseVoiceDetectionModel"]
+__all__ = ["BaseVoiceDetectionModel", "SileroVAD", "OpenWakeWord"]
diff --git a/src/rai_asr/rai_asr/models/open_wake_word.py b/src/rai_asr/rai_asr/models/open_wake_word.py
@@ -0,0 +1,48 @@
+# Copyright (C) 2024 Robotec.AI
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Tuple
+
+from numpy.typing import NDArray
+from openwakeword.model import Model as OWWModel
+from openwakeword.utils import download_models
+
+from rai_asr.models import BaseVoiceDetectionModel
+
+
+class OpenWakeWord(BaseVoiceDetectionModel):
+    """Wake-word detector that wraps an openWakeWord model run through ONNX.
+
+    Args:
+        wake_word_model_path: Value passed to openWakeWord as the single
+            entry of its ``wakeword_models`` list.
+        threshold: A prediction score strictly greater than this value is
+            treated as a detection. Defaults to 0.5.
+    """
+
+    def __init__(self, wake_word_model_path: str, threshold: float = 0.5):
+        super().__init__()
+        # Key under which this model's results are stored in the dict
+        # returned by ``detected``.
+        self.model_name = "open_wake_word"
+        # Invoked before the model is built; presumably fetches the model
+        # files openWakeWord needs -- confirm against the openwakeword docs.
+        download_models()
+        self.model = OWWModel(
+            wakeword_models=[wake_word_model_path],
+            inference_framework="onnx",
+        )
+        self.threshold = threshold
+
+    def detected(
+        self, audio_data: NDArray, input_parameters: dict[str, Any]
+    ) -> Tuple[bool, dict[str, Any]]:
+        """Run the wake-word model on ``audio_data`` and report a detection.
+
+        Args:
+            audio_data: Audio samples forwarded unchanged to the model's
+                ``predict`` call.
+            input_parameters: Parameters gathered so far; this dict is not
+                modified, a shallow copy of it is extended and returned.
+
+        Returns:
+            A ``(detected, parameters)`` tuple. ``detected`` is True when at
+            least one prediction score exceeds ``self.threshold``.
+            ``parameters`` is the copy of ``input_parameters`` with the raw
+            predictions stored under ``self.model_name`` -> ``"predictions"``.
+        """
+        predictions = self.model.predict(audio_data)
+        ret = input_parameters.copy()
+        ret.update({self.model_name: {"predictions": predictions}})
+        if any(score > self.threshold for score in predictions.values()):
+            # The model is reset only on a hit; presumably this stops state
+            # kept inside the model from re-triggering on the next chunk.
+            self.model.reset()
+            return True, ret
+        return False, ret
diff --git a/src/rai_asr/rai_asr/models/silero_vad.py b/src/rai_asr/rai_asr/models/silero_vad.py
@@ -48,7 +48,7 @@ def int2float(self, sound: NDArray[np.int16]):
         converted_sound = converted_sound.squeeze()
         return converted_sound
 
-    def detect(
+    def detected(
         self, audio_data: NDArray, input_parameters: dict[str, Any]
     ) -> Tuple[bool, dict[str, Any]]:
         vad_confidence = self.model(
@@ -57,5 +57,6 @@ def detect(
         ).item()
         ret = input_parameters.copy()
         ret.update({self.model_name: {"vad_confidence": vad_confidence}})
+        self.model.reset_states()  # NOTE: see streaming example at the bottom https://github.com/snakers4/silero-vad/wiki/Examples-and-Dependencies#dependencies
 
         return vad_confidence > self.threshold, ret