Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(helpers): add helpers for local audio testing #2215

Merged
merged 2 commits into from
Mar 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 0 additions & 26 deletions examples/audio.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env rye run python

import time
from pathlib import Path

from openai import OpenAI
Expand All @@ -12,8 +11,6 @@


def main() -> None:
stream_to_speakers()

# Create text-to-speech audio file
with openai.audio.speech.with_streaming_response.create(
model="tts-1",
Expand All @@ -37,28 +34,5 @@ def main() -> None:
print(translation.text)


def stream_to_speakers() -> None:
import pyaudio

player_stream = pyaudio.PyAudio().open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)

start_time = time.time()

with openai.audio.speech.with_streaming_response.create(
model="tts-1",
voice="alloy",
response_format="pcm", # similar to WAV, but without a header chunk at the start.
input="""I see skies of blue and clouds of white
The bright blessed days, the dark sacred nights
And I think to myself
What a wonderful world""",
) as response:
print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
for chunk in response.iter_bytes(chunk_size=1024):
player_stream.write(chunk)

print(f"Done in {int((time.time() - start_time) * 1000)}ms.")


if __name__ == "__main__":
main()
25 changes: 25 additions & 0 deletions examples/speech_to_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env rye run python

import asyncio

from openai import AsyncOpenAI
from openai.helpers import Microphone

# gets OPENAI_API_KEY from your environment variables
openai = AsyncOpenAI()


async def main() -> None:
    """Record 10 seconds of microphone audio, transcribe it, and print the text."""
    print("Recording for the next 10 seconds...")
    mic = Microphone(timeout=10)
    recording = await mic.record()
    print("Recording complete")

    # Send the captured audio to the transcription endpoint.
    result = await openai.audio.transcriptions.create(
        model="whisper-1",
        file=recording,
    )
    print(result.text)


if __name__ == "__main__":
    asyncio.run(main())
31 changes: 31 additions & 0 deletions examples/text_to_speech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env rye run python

import time
import asyncio

from openai import AsyncOpenAI
from openai.helpers import LocalAudioPlayer

# gets OPENAI_API_KEY from your environment variables
openai = AsyncOpenAI()


async def main() -> None:
    """Stream a text-to-speech response and play it on the local speakers."""
    start_time = time.time()

    request = openai.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice="alloy",
        response_format="pcm",  # similar to WAV, but without a header chunk at the start.
        input="""I see skies of blue and clouds of white
The bright blessed days, the dark sacred nights
And I think to myself
What a wonderful world""",
    )
    async with request as response:
        print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
        player = LocalAudioPlayer()
        await player.play(response)
        print(f"Time to play: {int((time.time() - start_time) * 1000)}ms")


if __name__ == "__main__":
    asyncio.run(main())
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ dependencies = [
"sniffio",
"tqdm > 4",
"jiter>=0.4.0, <1",
"sounddevice>=0.5.1",
"numpy>=2.0.2",
]
requires-python = ">= 3.8"
classifiers = [
Expand Down
8 changes: 6 additions & 2 deletions requirements-dev.lock
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# all-features: true
# with-sources: false
# generate-hashes: false
# universal: false

-e file:.
annotated-types==0.6.0
Expand All @@ -32,6 +33,7 @@ certifi==2023.7.22
# via requests
cffi==1.16.0
# via cryptography
# via sounddevice
charset-normalizer==3.3.2
# via requests
click==8.1.7
Expand Down Expand Up @@ -91,7 +93,7 @@ nest-asyncio==1.6.0
nodeenv==1.8.0
# via pyright
nox==2023.4.22
numpy==1.26.3
numpy==2.0.2
# via openai
# via pandas
# via pandas-stubs
Expand All @@ -101,7 +103,7 @@ packaging==23.2
# via black
# via nox
# via pytest
pandas==2.1.4
pandas==2.2.3
# via openai
pandas-stubs==2.1.4.231227
# via openai
Expand Down Expand Up @@ -153,6 +155,8 @@ sniffio==1.3.0
# via trio
sortedcontainers==2.4.0
# via trio
sounddevice==0.5.1
# via openai
time-machine==2.9.0
toml==0.10.2
# via inline-snapshot
Expand Down
7 changes: 7 additions & 0 deletions requirements.lock
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# all-features: true
# with-sources: false
# generate-hashes: false
# universal: false

-e file:.
annotated-types==0.6.0
Expand All @@ -17,6 +18,8 @@ anyio==4.1.0
certifi==2023.7.22
# via httpcore
# via httpx
cffi==1.17.1
# via sounddevice
distro==1.8.0
# via openai
exceptiongroup==1.2.2
Expand All @@ -40,6 +43,8 @@ pandas==2.2.3
# via openai
pandas-stubs==2.2.2.240807
# via openai
pycparser==2.22
# via cffi
pydantic==2.10.3
# via openai
pydantic-core==2.27.1
Expand All @@ -53,6 +58,8 @@ six==1.16.0
sniffio==1.3.0
# via anyio
# via openai
sounddevice==0.5.1
# via openai
tqdm==4.66.5
# via openai
types-pytz==2024.2.0.20241003
Expand Down
3 changes: 3 additions & 0 deletions src/openai/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Public re-export surface for the local audio testing helpers.
from .lib.helpers import LocalAudioPlayer, Microphone

__all__ = ["LocalAudioPlayer", "Microphone"]
4 changes: 4 additions & 0 deletions src/openai/lib/helpers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Aggregate the audio helper classes into one importable namespace.
from .local_audio_player import LocalAudioPlayer
from .microphone import Microphone

__all__ = ["Microphone", "LocalAudioPlayer"]
161 changes: 161 additions & 0 deletions src/openai/lib/helpers/local_audio_player.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import queue
import asyncio
from typing import Any, Union, Callable, AsyncGenerator, cast

import numpy as np
import sounddevice as sd
import numpy.typing as npt

from ... import _legacy_response
from ..._response import StreamedBinaryAPIResponse, AsyncStreamedBinaryAPIResponse

SAMPLE_RATE = 24000


class LocalAudioPlayer:
    """Play PCM audio (numpy buffers or streamed TTS responses) through the
    default output device via ``sounddevice``.

    Audio is treated as mono, ``SAMPLE_RATE`` Hz, int16 or float32 PCM.
    """

    def __init__(
        self,
        should_stop: Union[Callable[[], bool], None] = None,
    ):
        """
        Args:
            should_stop: optional callback polled from the audio thread during
                ``play()``; returning ``True`` stops playback early.
        """
        self.channels = 1
        self.dtype = np.float32
        self.should_stop = should_stop

    async def _tts_response_to_buffer(
        self,
        response: Union[
            _legacy_response.HttpxBinaryResponseContent,
            AsyncStreamedBinaryAPIResponse,
            StreamedBinaryAPIResponse,
        ],
    ) -> npt.NDArray[np.float32]:
        """Drain a (sync or async) binary TTS response into one float32
        buffer of shape ``(frames, 1)``, normalized to ``[-1.0, 1.0]``."""
        chunks: list[bytes] = []
        if isinstance(
            response, (_legacy_response.HttpxBinaryResponseContent, StreamedBinaryAPIResponse)
        ):
            for chunk in response.iter_bytes(chunk_size=1024):
                if chunk:
                    chunks.append(chunk)
        else:
            async for chunk in response.iter_bytes(chunk_size=1024):
                if chunk:
                    chunks.append(chunk)

        audio_bytes = b"".join(chunks)
        # The response body is raw 16-bit PCM; normalize to float32 in [-1, 1].
        audio_np = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32767.0
        audio_np = audio_np.reshape(-1, 1)
        return audio_np

    async def play(
        self,
        input: Union[
            npt.NDArray[np.int16],
            npt.NDArray[np.float32],
            _legacy_response.HttpxBinaryResponseContent,
            AsyncStreamedBinaryAPIResponse,
            StreamedBinaryAPIResponse,
        ],
    ) -> None:
        """Fully buffer ``input`` and play it, returning once playback ends.

        Args:
            input: an int16/float32 numpy array, or a binary TTS response.

        Raises:
            ValueError: for an ndarray of any other dtype.
        """
        audio_content: npt.NDArray[np.float32]
        if isinstance(input, np.ndarray):
            if input.dtype == np.int16 and self.dtype == np.float32:
                audio_content = (input.astype(np.float32) / 32767.0).reshape(-1, self.channels)
            elif input.dtype == np.float32:
                # Fix: a 1-D float32 array previously reached
                # ``audio_content.shape[1]`` below un-reshaped and raised
                # IndexError; normalize its shape like the int16 path.
                audio_content = cast(npt.NDArray[np.float32], input).reshape(-1, self.channels)
            else:
                raise ValueError(f"Unsupported dtype: {input.dtype}")
        else:
            audio_content = await self._tts_response_to_buffer(input)

        # get_event_loop() is deprecated inside coroutines; we are always
        # running in a loop here, so get_running_loop() is the correct call.
        loop = asyncio.get_running_loop()
        event = asyncio.Event()
        idx = 0

        def callback(
            outdata: npt.NDArray[np.float32],
            frame_count: int,
            _time_info: Any,
            _status: Any,
        ):
            # Runs on the PortAudio thread: copy the next slice of audio
            # into ``outdata``, zero-padding the final partial block.
            nonlocal idx

            remainder = len(audio_content) - idx
            if remainder == 0 or (callable(self.should_stop) and self.should_stop()):
                # Wake the coroutine awaiting in play() and end the stream.
                loop.call_soon_threadsafe(event.set)
                raise sd.CallbackStop
            valid_frames = frame_count if remainder >= frame_count else remainder
            outdata[:valid_frames] = audio_content[idx : idx + valid_frames]
            outdata[valid_frames:] = 0
            idx += valid_frames

        stream = sd.OutputStream(
            samplerate=SAMPLE_RATE,
            callback=callback,
            dtype=audio_content.dtype,
            channels=audio_content.shape[1],
        )
        with stream:
            await event.wait()

    async def play_stream(
        self,
        buffer_stream: AsyncGenerator[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None], None],
    ) -> None:
        """Play buffers as they are yielded by ``buffer_stream``.

        A ``None`` item (or generator exhaustion) ends playback. If the
        producer falls behind, the output is padded with silence rather than
        stopping the stream.
        """
        loop = asyncio.get_running_loop()
        event = asyncio.Event()
        buffer_queue: queue.Queue[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None]] = queue.Queue(maxsize=50)

        # Playback cursor shared with the audio callback. Initialized before
        # the callback is defined/started (previously these were assigned
        # after the callback body, which worked only by ordering accident).
        current_buffer: Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None] = None
        buffer_pos = 0

        async def buffer_producer():
            # Feed the bounded queue from the async generator via an executor
            # so a full queue never blocks the event loop.
            async for buffer in buffer_stream:
                if buffer is None:
                    break
                await loop.run_in_executor(None, buffer_queue.put, buffer)
            await loop.run_in_executor(None, buffer_queue.put, None)  # Signal completion

        def callback(
            outdata: npt.NDArray[np.float32],
            frame_count: int,
            _time_info: Any,
            _status: Any,
        ):
            # Runs on the PortAudio thread: fill ``outdata`` from queued
            # buffers, converting int16 -> float32 as buffers arrive.
            nonlocal current_buffer, buffer_pos

            frames_written = 0
            while frames_written < frame_count:
                if current_buffer is None or buffer_pos >= len(current_buffer):
                    try:
                        current_buffer = buffer_queue.get(timeout=0.1)
                        if current_buffer is None:
                            # End-of-stream sentinel: wake play_stream().
                            loop.call_soon_threadsafe(event.set)
                            raise sd.CallbackStop
                        buffer_pos = 0

                        if current_buffer.dtype == np.int16 and self.dtype == np.float32:
                            current_buffer = (current_buffer.astype(np.float32) / 32767.0).reshape(-1, self.channels)

                    except queue.Empty:
                        # Producer is behind: pad with silence and retry on
                        # the next callback instead of aborting playback.
                        outdata[frames_written:] = 0
                        return

                remaining_frames = len(current_buffer) - buffer_pos
                frames_to_write = min(frame_count - frames_written, remaining_frames)
                outdata[frames_written : frames_written + frames_to_write] = current_buffer[
                    buffer_pos : buffer_pos + frames_to_write
                ]
                buffer_pos += frames_to_write
                frames_written += frames_to_write

        producer_task = asyncio.create_task(buffer_producer())

        with sd.OutputStream(
            samplerate=SAMPLE_RATE,
            channels=self.channels,
            dtype=self.dtype,
            callback=callback,
        ):
            await event.wait()

        await producer_task
Loading