 #
 # SPDX-License-Identifier: MIT

+from queue import Queue
 import time

 import speech_recognition as sr


 class Listener:
-    def __init__(self, api_key, energy_threshold=300, record_timeout=30):
+    def __init__(
+        self, api_key, energy_threshold=300, phrase_timeout=3.0, record_timeout=30
+    ):
         self.listener_handle = None
         self.microphone = sr.Microphone()
         self.recognizer = sr.Recognizer()
         self.recognizer.energy_threshold = energy_threshold
+        self.recognizer.dynamic_energy_threshold = False
+        self.recognizer.pause_threshold = 1
+        self.last_sample = bytes()
+        self.phrase_time = time.monotonic()
+        self.phrase_timeout = phrase_timeout
         with self.microphone as source:
             self.recognizer.adjust_for_ambient_noise(
                 source
             )  # we only need to calibrate once, before we start listening
         self.record_timeout = record_timeout
+        self.phrase_complete = False
+        self.data_queue = Queue()
         self.listener_handle = None
-        self.audio = None
         self.api_key = api_key

     def listen(self, ready_callback=None):
+        print("Start listening...")
+        self.phrase_complete = False
+        start = time.monotonic()
         self._start_listening()
         if ready_callback:
             ready_callback()
-        while self.listener_handle and self.audio is None:
-            time.sleep(0.1)
+        while (
+            self.listener_handle and not self.speech_waiting()
+        ) or not self.phrase_complete:
+            if self.phrase_time and time.monotonic() > start + self.phrase_timeout:
+                self.last_sample = bytes()
+                self.phrase_complete = True
+            self.phrase_time = time.monotonic() - start
         self.stop_listening()

-    def _save_audio_callback(self, _recognizer, audio):
-        self.audio = audio
+    def _save_audio_callback(self, _, audio):
+        print("Saving audio")
+        data = audio.get_raw_data()
+        self.data_queue.put(data)
+
+    def _get_audio(self):
+        """Concatenate and convert the queued raw data back to audio and return it"""
+        start = time.monotonic()
+        if self.speech_waiting():
+            self.phrase_complete = False
+            if self.phrase_time and time.monotonic() > start + self.phrase_timeout:
+                self.last_sample = bytes()
+                self.phrase_complete = True
+            self.phrase_time = time.monotonic() - start
+
+            # Concatenate our current audio data with the latest audio data.
+            while self.speech_waiting():
+                data = self.data_queue.get()
+                self.last_sample += data
+
+            # Use AudioData to convert the raw data to wav data.
+            return sr.AudioData(
+                self.last_sample,
+                self.microphone.SAMPLE_RATE,
+                self.microphone.SAMPLE_WIDTH,
+            )
+        return None

     def _start_listening(self):
-        self.listener_handle = self.recognizer.listen_in_background(
-            self.microphone, self._save_audio_callback
-        )
+        if not self.listener_handle:
+            self.listener_handle = self.recognizer.listen_in_background(
+                self.microphone,
+                self._save_audio_callback,
+                phrase_time_limit=self.record_timeout,
+            )

     def stop_listening(self, wait_for_stop=False):
         if self.listener_handle:
             self.listener_handle(wait_for_stop=wait_for_stop)
             self.listener_handle = None
+        print("Stop listening...")

     def is_listening(self):
         return self.listener_handle is not None

     def speech_waiting(self):
-        return self.audio is not None
+        return not self.data_queue.empty()

     def recognize(self):
-        if self.audio:
+        audio = self._get_audio()
+        if audio:
             # Transcribe the audio data to text using Whisper
             print("Recognizing...")
             attempts = 0
             while attempts < 3:
                 try:
                     result = self.recognizer.recognize_whisper_api(
-                        self.audio, api_key=self.api_key
+                        audio, api_key=self.api_key
                     )

                     return result.strip()
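
The pattern this commit moves to is worth seeing in isolation: Recognizer.listen_in_background() runs a worker thread that hands each captured phrase to a callback as an sr.AudioData chunk; the callback stores only the raw bytes on a thread-safe Queue, and the consumer later joins those bytes and wraps them back into a single sr.AudioData for recognition. Below is a minimal sketch of that flow, assuming the SpeechRecognition package with a working microphone backend (PyAudio); the five-second capture window and the on_audio name are illustrative, not part of the commit.

import time
from queue import Queue

import speech_recognition as sr

data_queue = Queue()

def on_audio(_recognizer, audio):
    # Runs on the background listener thread; keep it cheap by
    # queueing raw bytes instead of the AudioData wrapper.
    data_queue.put(audio.get_raw_data())

recognizer = sr.Recognizer()
recognizer.dynamic_energy_threshold = False  # keep the calibrated threshold fixed
microphone = sr.Microphone()
with microphone as source:
    recognizer.adjust_for_ambient_noise(source)  # calibrate once, up front

# phrase_time_limit caps the length of each chunk handed to the callback.
stop = recognizer.listen_in_background(microphone, on_audio, phrase_time_limit=30)
time.sleep(5)  # capture for a few seconds (illustrative)
stop(wait_for_stop=True)

# Rebuild one AudioData from the queued raw bytes; every chunk came from
# the same microphone, so the sample rate and width match and plain
# byte concatenation is safe.
raw = b"".join(data_queue.get() for _ in range(data_queue.qsize()))
if raw:
    audio = sr.AudioData(raw, microphone.SAMPLE_RATE, microphone.SAMPLE_WIDTH)
    print(len(raw), "bytes captured")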
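
And a hypothetical driver loop for the revised class itself. The constructor arguments mirror the defaults the diff defines; the environment-variable lookup and the ready callback are placeholders (recognize_whisper_api needs an OpenAI API key and the openai package installed).

import os

# Hypothetical entry point; assumes OPENAI_API_KEY is set in the environment.
listener = Listener(
    api_key=os.environ["OPENAI_API_KEY"],
    energy_threshold=300,
    phrase_timeout=3.0,
    record_timeout=30,
)

# listen() blocks until the phrase timeout marks the phrase complete,
# then stops the background listener.
listener.listen(ready_callback=lambda: print("Speak now..."))

if listener.speech_waiting():
    # recognize() drains the queue and transcribes via the Whisper API.
    text = listener.recognize()
    if text:
        print("Heard:", text)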