feat: working streaming ASR

rachwalk · rachwalk · commit 928a9ffa071d · 2025-01-23T16:03:54.000+01:00
diff --git a/src/rai/rai/agents/voice_agent.py b/src/rai/rai/agents/voice_agent.py
@@ -36,6 +36,7 @@ class ThreadData(TypedDict):
     thread: Thread
     event: Event
     transcription: str
+    joined: bool
 
 
 class VoiceRecognitionAgent(BaseAgent):
@@ -78,7 +79,7 @@ def __init__(
         self.sample_buffer_lock = Lock()
         self.active_thread = ""
         self.transcription_threads: dict[str, ThreadData] = {}
-        self.buffer_reminders: dict[str, list[NDArray]] = {}
+        self.transcription_buffers: dict[str, list[NDArray]] = {}
 
     def __call__(self):
         self.run()
@@ -106,12 +107,13 @@ def stop(self):
         self.logger.info("Stopping voice agent")
         self.running = False
         self.connectors["microphone"].terminate_action(self.listener_handle)
-        to_finish = len(list(self.transcription_threads.keys()))
-        while to_finish > 0:
+        while not all(
+            [thread["joined"] for thread in self.transcription_threads.values()]
+        ):
             for thread_id in self.transcription_threads:
                 if self.transcription_threads[thread_id]["event"].is_set():
                     self.transcription_threads[thread_id]["thread"].join()
-                    to_finish -= 1
+                    self.transcription_threads[thread_id]["joined"] = True
                 else:
                     self.logger.info(
                         f"Waiting for transcription of {thread_id} to finish..."
@@ -125,6 +127,12 @@ def on_new_sample(self, indata: np.ndarray, status_flags: dict[str, Any]):
             if not self.recording_started and len(self.sample_buffer) > 5:
                 self.sample_buffer = self.sample_buffer[-5:]
 
+        # attempt to join finished threads:
+        for thread_id in self.transcription_threads:
+            if self.transcription_threads[thread_id]["event"].is_set():
+                self.transcription_threads[thread_id]["thread"].join()
+                self.transcription_threads[thread_id]["joined"] = True
+
         voice_detected, output_parameters = self.vad.detected(indata, {})
         should_record = False
         # TODO: second condition is temporary
@@ -141,11 +149,11 @@ def on_new_sample(self, indata: np.ndarray, status_flags: dict[str, Any]):
             )
             transcription_finished = Event()
             self.active_thread = thread_id
-            transcription_thread.start()
             self.transcription_threads[thread_id] = {
                 "thread": transcription_thread,
                 "event": transcription_finished,
                 "transcription": "",
+                "joined": False,
             }
 
         if voice_detected:
@@ -156,12 +164,15 @@ def on_new_sample(self, indata: np.ndarray, status_flags: dict[str, Any]):
             self.recording_started
             and sample_time - self.grace_period_start > self.grace_period
         ):
-            self.logger.info("Grace period ended... stopping recording")
+            self.logger.info(
+                "Grace period ended... stopping recording, starting transcription"
+            )
             self.recording_started = False
             self.grace_period_start = 0
             with self.sample_buffer_lock:
-                self.buffer_reminders[self.active_thread] = self.sample_buffer
+                self.transcription_buffers[self.active_thread] = self.sample_buffer
                 self.sample_buffer = []
+            self.transcription_threads[self.active_thread]["thread"].start()
             self.active_thread = ""
 
     def should_record(
@@ -175,31 +186,46 @@ def should_record(
 
     def transcription_thread(self, identifier: str):
         self.logger.info(f"transcription thread {identifier} started")
-        with self.transcription_lock:
-            while self.active_thread == identifier:
-                with self.sample_buffer_lock:
-                    if len(self.sample_buffer) == 0:
-                        continue
-                    audio_data = self.sample_buffer.copy()
-                    self.sample_buffer = []
-                audio_data = np.concatenate(audio_data)
-                self.transcription_model.transcribe(audio_data)
-
-            # transciption of the reminder of the buffer
-            with self.sample_buffer_lock:
-                if identifier in self.buffer_reminders:
-                    audio_data = self.buffer_reminders[identifier]
-                    audio_data = np.concatenate(audio_data)
-                    self.transcription_model.transcribe(audio_data)
-                    del self.buffer_reminders[identifier]
-            # self.transcription_model.save_wav(f"{identifier}.wav")
-            transcription = self.transcription_model.consume_transcription()
-            print("Transcription: ", transcription)
-            self.connectors["ros2"].send_message(
-                ROS2ARIMessage(
-                    {"data": transcription}, {"msg_type": "std_msgs/msg/String"}
-                ),
-                "/from_human",
-            )
-            self.transcription_threads[identifier]["transcription"] = transcription
-            self.transcription_threads[identifier]["event"].set()
+        audio_data = np.concatenate(self.transcription_buffers[identifier])
+        with self.transcription_lock:  # this is only necessary for the local model... TODO: fix this somehow
+            transcription = self.transcription_model.transcribe(audio_data)
+        self.connectors["ros2"].send_message(
+            ROS2ARIMessage(
+                {"data": transcription}, {"msg_type": "std_msgs/msg/String"}
+            ),
+            "/from_human",
+        )
+        self.transcription_threads[identifier]["transcription"] = transcription
+        self.transcription_threads[identifier]["event"].set()
+
+        # with self.transcription_lock:
+        # while self.active_thread == identifier:
+        #     with self.sample_buffer_lock:
+        #         if len(self.sample_buffer) == 0:
+        #             continue
+        #         audio_data = self.sample_buffer.copy()
+        #         self.sample_buffer = []
+        #     audio_data = np.concatenate(audio_data)
+        #     with self.transcription_lock:
+        #         self.transcription_model.transcribe(audio_data)
+
+        # # transcription of the remainder of the buffer
+        # with self.sample_buffer_lock:
+        #     if identifier in self.transcription_buffers:
+        #         audio_data = self.transcription_buffers[identifier]
+        #         audio_data = np.concatenate(audio_data)
+        #         with self.transcription_lock:
+        #             self.transcription_model.transcribe(audio_data)
+        #         del self.transcription_buffers[identifier]
+        # # self.transcription_model.save_wav(f"{identifier}.wav")
+        # with self.transcription_lock:
+        #     transcription = self.transcription_model.consume_transcription()
+        # self.logger.info(f"Transcription: {transcription}")
+        # self.connectors["ros2"].send_message(
+        #     ROS2ARIMessage(
+        #         {"data": transcription}, {"msg_type": "std_msgs/msg/String"}
+        #     ),
+        #     "/from_human",
+        # )
+        # self.transcription_threads[identifier]["transcription"] = transcription
+        # self.transcription_threads[identifier]["event"].set()
diff --git a/src/rai_asr/rai_asr/models/base.py b/src/rai_asr/rai_asr/models/base.py
@@ -37,11 +37,6 @@ def __init__(self, model_name: str, sample_rate: int, language: str = "en"):
 
         self.latest_transcription = ""
 
-    def consume_transcription(self) -> str:
-        ret = self.latest_transcription
-        self.latest_transcription = ""
-        return ret
-
     @abstractmethod
-    def transcribe(self, data: NDArray[np.int16]):
+    def transcribe(self, data: NDArray[np.int16]) -> str:
         pass
diff --git a/src/rai_asr/rai_asr/models/local_whisper.py b/src/rai_asr/rai_asr/models/local_whisper.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 from typing import cast
 
 import numpy as np
@@ -30,14 +31,10 @@ def __init__(self, model_name: str, sample_rate: int, language: str = "en"):
         else:
             self.whisper = whisper.load_model(self.model_name)
 
+        self.logger = logging.getLogger(__name__)
         # TODO: remove sample storage before PR is merged, this is just to enable saving wav files for debugging
         # self.samples = None
 
-    def consume_transcription(self) -> str:
-        ret = super().consume_transcription()
-        # self.samples = None
-        return ret
-
     # def save_wav(self, output_filename: str):
     #     assert self.samples is not None, "No samples to save"
     #     combined_samples = self.samples
@@ -55,14 +52,13 @@ def consume_transcription(self) -> str:
     #         wav_file.setframerate(self.sample_rate)
     #         wav_file.writeframes(combined_samples.tobytes())
 
-    def transcribe(self, data: NDArray[np.int16]):
-        # self.samples = (
-        #     np.concatenate((self.samples, data)) if self.samples is not None else data
-        # )
+    def transcribe(self, data: NDArray[np.int16]) -> str:
         normalized_data = data.astype(np.float32) / 32768.0
         result = whisper.transcribe(
             self.whisper, normalized_data
         )  # TODO: handling of additional transcribe arguments (perhaps in model init)
         transcription = result["text"]
+        self.logger.info("transcription: %s", transcription)
         transcription = cast(str, transcription)
-        self.latest_transcription += transcription
+        self.latest_transcription = transcription
+        return transcription
diff --git a/src/rai_asr/rai_asr/models/open_ai_whisper.py b/src/rai_asr/rai_asr/models/open_ai_whisper.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import io
+import logging
 import os
 from functools import partial
 
@@ -36,21 +37,16 @@ def __init__(self, model_name: str, sample_rate: int, language: str = "en"):
             self.openai_client.audio.transcriptions.create,
             model=self.model_name,
         )
+        self.logger = logging.getLogger(__name__)
         self.samples = []
 
-    def add_samples(self, data: NDArray[np.int16]):
+    def transcribe(self, data: NDArray[np.int16]) -> str:
         normalized_data = data.astype(np.float32) / 32768.0
-        self.samples = (
-            np.concatenate([self.samples, normalized_data])
-            if self.samples is not None
-            else data
-        )
-
-    def transcribe(self) -> str:
         with io.BytesIO() as temp_wav_buffer:
-            wavfile.write(temp_wav_buffer, self.sample_rate, self.samples)
+            wavfile.write(temp_wav_buffer, self.sample_rate, normalized_data)
             temp_wav_buffer.seek(0)
             temp_wav_buffer.name = "temp.wav"
             response = self.model(file=temp_wav_buffer, language=self.language)
         transcription = response.text
+        self.logger.info("transcription: %s", transcription)
         return transcription