Skip to content

Commit 552a967

Browse files
feat(helpers): add helpers for local audio testing (#2215)
* feat(helpers): add helpers for local audio testing * fix linting
1 parent c71d4c9 commit 552a967

10 files changed

+334
-28
lines changed

examples/audio.py

-26
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
#!/usr/bin/env rye run python
22

3-
import time
43
from pathlib import Path
54

65
from openai import OpenAI
@@ -12,8 +11,6 @@
1211

1312

1413
def main() -> None:
15-
stream_to_speakers()
16-
1714
# Create text-to-speech audio file
1815
with openai.audio.speech.with_streaming_response.create(
1916
model="tts-1",
@@ -37,28 +34,5 @@ def main() -> None:
3734
print(translation.text)
3835

3936

40-
def stream_to_speakers() -> None:
41-
import pyaudio
42-
43-
player_stream = pyaudio.PyAudio().open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
44-
45-
start_time = time.time()
46-
47-
with openai.audio.speech.with_streaming_response.create(
48-
model="tts-1",
49-
voice="alloy",
50-
response_format="pcm", # similar to WAV, but without a header chunk at the start.
51-
input="""I see skies of blue and clouds of white
52-
The bright blessed days, the dark sacred nights
53-
And I think to myself
54-
What a wonderful world""",
55-
) as response:
56-
print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
57-
for chunk in response.iter_bytes(chunk_size=1024):
58-
player_stream.write(chunk)
59-
60-
print(f"Done in {int((time.time() - start_time) * 1000)}ms.")
61-
62-
6337
if __name__ == "__main__":
6438
main()

examples/speech_to_text.py

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/usr/bin/env rye run python
2+
3+
import asyncio
4+
5+
from openai import AsyncOpenAI
6+
from openai.helpers import Microphone
7+
8+
# gets OPENAI_API_KEY from your environment variables
openai = AsyncOpenAI()


async def main() -> None:
    """Capture ten seconds of microphone audio, then print its Whisper transcription."""
    print("Recording for the next 10 seconds...")
    audio = await Microphone(timeout=10).record()
    print("Recording complete")

    result = await openai.audio.transcriptions.create(
        model="whisper-1",
        file=audio,
    )
    print(result.text)


if __name__ == "__main__":
    asyncio.run(main())

examples/text_to_speech.py

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/usr/bin/env rye run python
2+
3+
import time
4+
import asyncio
5+
6+
from openai import AsyncOpenAI
7+
from openai.helpers import LocalAudioPlayer
8+
9+
# gets OPENAI_API_KEY from your environment variables
openai = AsyncOpenAI()


async def main() -> None:
    """Stream a text-to-speech response and play it on the default audio output device."""
    start_time = time.time()

    def elapsed_ms() -> int:
        # Milliseconds elapsed since the request was started.
        return int((time.time() - start_time) * 1000)

    async with openai.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice="alloy",
        response_format="pcm",  # similar to WAV, but without a header chunk at the start.
        input="""I see skies of blue and clouds of white
The bright blessed days, the dark sacred nights
And I think to myself
What a wonderful world""",
    ) as response:
        print(f"Time to first byte: {elapsed_ms()}ms")
        await LocalAudioPlayer().play(response)
        print(f"Time to play: {elapsed_ms()}ms")


if __name__ == "__main__":
    asyncio.run(main())

pyproject.toml

+2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ dependencies = [
1616
"sniffio",
1717
"tqdm > 4",
1818
"jiter>=0.4.0, <1",
19+
"sounddevice>=0.5.1",
20+
"numpy>=2.0.2",
1921
]
2022
requires-python = ">= 3.8"
2123
classifiers = [

requirements-dev.lock

+5-2
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ certifi==2023.7.22
3333
# via requests
3434
cffi==1.16.0
3535
# via cryptography
36+
# via sounddevice
3637
charset-normalizer==3.3.2
3738
# via requests
3839
click==8.1.7
@@ -92,7 +93,7 @@ nest-asyncio==1.6.0
9293
nodeenv==1.8.0
9394
# via pyright
9495
nox==2023.4.22
95-
numpy==1.26.3
96+
numpy==2.0.2
9697
# via openai
9798
# via pandas
9899
# via pandas-stubs
@@ -102,7 +103,7 @@ packaging==23.2
102103
# via black
103104
# via nox
104105
# via pytest
105-
pandas==2.1.4
106+
pandas==2.2.3
106107
# via openai
107108
pandas-stubs==2.1.4.231227
108109
# via openai
@@ -154,6 +155,8 @@ sniffio==1.3.0
154155
# via trio
155156
sortedcontainers==2.4.0
156157
# via trio
158+
sounddevice==0.5.1
159+
# via openai
157160
time-machine==2.9.0
158161
toml==0.10.2
159162
# via inline-snapshot

requirements.lock

+6
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ anyio==4.1.0
1818
certifi==2023.7.22
1919
# via httpcore
2020
# via httpx
21+
cffi==1.17.1
22+
# via sounddevice
2123
distro==1.8.0
2224
# via openai
2325
exceptiongroup==1.2.2
@@ -41,6 +43,8 @@ pandas==2.2.3
4143
# via openai
4244
pandas-stubs==2.2.2.240807
4345
# via openai
46+
pycparser==2.22
47+
# via cffi
4448
pydantic==2.10.3
4549
# via openai
4650
pydantic-core==2.27.1
@@ -54,6 +58,8 @@ six==1.16.0
5458
sniffio==1.3.0
5559
# via anyio
5660
# via openai
61+
sounddevice==0.5.1
62+
# via openai
5763
tqdm==4.66.5
5864
# via openai
5965
types-pytz==2024.2.0.20241003

src/openai/helpers.py

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .lib.helpers import Microphone, LocalAudioPlayer
2+
3+
__all__ = ["LocalAudioPlayer", "Microphone"]

src/openai/lib/helpers/__init__.py

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from .microphone import Microphone
2+
from .local_audio_player import LocalAudioPlayer
3+
4+
__all__ = ["Microphone", "LocalAudioPlayer"]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
import queue
2+
import asyncio
3+
from typing import Any, Union, Callable, AsyncGenerator, cast
4+
5+
import numpy as np
6+
import sounddevice as sd
7+
import numpy.typing as npt
8+
9+
from ... import _legacy_response
10+
from ..._response import StreamedBinaryAPIResponse, AsyncStreamedBinaryAPIResponse
11+
12+
# Output sample rate in Hz; presumably matches the 24kHz "pcm" TTS response format
# used by the examples — TODO confirm against the API docs.
SAMPLE_RATE = 24000


class LocalAudioPlayer:
    """Play audio on the local default output device via ``sounddevice``.

    Accepts either raw numpy sample buffers or (streamed) binary TTS API
    responses; everything is normalized to float32 samples before playback.
    """

    def __init__(
        self,
        should_stop: Union[Callable[[], bool], None] = None,
    ):
        # Playback is mono float32; int16 inputs are rescaled to [-1, 1] on the fly.
        self.channels = 1
        self.dtype = np.float32
        # Optional callable polled from the audio callback; returning True stops playback early.
        self.should_stop = should_stop

    async def _tts_response_to_buffer(
        self,
        response: Union[
            _legacy_response.HttpxBinaryResponseContent,
            AsyncStreamedBinaryAPIResponse,
            StreamedBinaryAPIResponse,
        ],
    ) -> npt.NDArray[np.float32]:
        """Drain a binary TTS response into a ``(n_samples, 1)`` float32 array.

        The response bytes are interpreted as 16-bit signed PCM and scaled
        into the [-1.0, 1.0] range expected by a float32 output stream.
        """
        # NOTE(review): `list[bytes]` as a runtime annotation requires Python 3.9+ —
        # verify against the package's minimum supported version.
        chunks: list[bytes] = []
        # Sync responses expose a plain iterator; the async variant must be awaited per chunk.
        if isinstance(response, _legacy_response.HttpxBinaryResponseContent) or isinstance(
            response, StreamedBinaryAPIResponse
        ):
            for chunk in response.iter_bytes(chunk_size=1024):
                if chunk:
                    chunks.append(chunk)
        else:
            async for chunk in response.iter_bytes(chunk_size=1024):
                if chunk:
                    chunks.append(chunk)

        audio_bytes = b"".join(chunks)
        # int16 -> float32 in [-1, 1]; 32767 is the int16 positive full-scale value.
        audio_np = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32767.0
        # Column layout (n_samples, 1) so the array maps directly onto a mono OutputStream.
        audio_np = audio_np.reshape(-1, 1)
        return audio_np

    async def play(
        self,
        input: Union[
            npt.NDArray[np.int16],
            npt.NDArray[np.float32],
            _legacy_response.HttpxBinaryResponseContent,
            AsyncStreamedBinaryAPIResponse,
            StreamedBinaryAPIResponse,
        ],
    ) -> None:
        """Play a complete buffer or TTS response, returning once playback finishes.

        Raises:
            ValueError: if a numpy input has a dtype other than int16 or float32.
        """
        audio_content: npt.NDArray[np.float32]
        if isinstance(input, np.ndarray):
            if input.dtype == np.int16 and self.dtype == np.float32:
                # Rescale 16-bit PCM to [-1, 1] and force the mono column layout.
                audio_content = (input.astype(np.float32) / 32767.0).reshape(-1, self.channels)
            elif input.dtype == np.float32:
                # Used as-is; presumably already shaped (n_samples, channels) — the
                # OutputStream below reads shape[1], so a 1-D array would fail. TODO confirm.
                audio_content = cast(npt.NDArray[np.float32], input)
            else:
                raise ValueError(f"Unsupported dtype: {input.dtype}")
        else:
            audio_content = await self._tts_response_to_buffer(input)

        # NOTE(review): asyncio.get_running_loop() is the preferred spelling inside a coroutine.
        loop = asyncio.get_event_loop()
        event = asyncio.Event()
        idx = 0  # next sample index to hand to the audio device

        def callback(
            outdata: npt.NDArray[np.float32],
            frame_count: int,
            _time_info: Any,
            _status: Any,
        ):
            # Runs on the sounddevice audio thread, so the asyncio event must be
            # set via call_soon_threadsafe rather than directly.
            nonlocal idx

            remainder = len(audio_content) - idx
            if remainder == 0 or (callable(self.should_stop) and self.should_stop()):
                loop.call_soon_threadsafe(event.set)
                # CallbackStop tells sounddevice to finish the stream after this block.
                raise sd.CallbackStop
            valid_frames = frame_count if remainder >= frame_count else remainder
            outdata[:valid_frames] = audio_content[idx : idx + valid_frames]
            outdata[valid_frames:] = 0  # zero-fill the tail of the final, partial block
            idx += valid_frames

        stream = sd.OutputStream(
            samplerate=SAMPLE_RATE,
            callback=callback,
            dtype=audio_content.dtype,
            channels=audio_content.shape[1],
        )
        # The context manager starts the stream; we simply wait until the callback
        # signals completion (or early stop).
        with stream:
            await event.wait()

    async def play_stream(
        self,
        buffer_stream: AsyncGenerator[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None], None],
    ) -> None:
        """Play buffers as they arrive from an async generator.

        A ``None`` item from the generator (or its exhaustion) marks the end of
        the stream. Buffers are handed from the event loop to the audio thread
        through a bounded queue; ``None`` is reused as the completion sentinel.
        """
        loop = asyncio.get_event_loop()
        event = asyncio.Event()
        # maxsize bounds memory and applies backpressure to the producer.
        buffer_queue: queue.Queue[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None]] = queue.Queue(maxsize=50)

        async def buffer_producer():
            # Blocking queue.put runs in an executor so a full queue never stalls the event loop.
            async for buffer in buffer_stream:
                if buffer is None:
                    break
                await loop.run_in_executor(None, buffer_queue.put, buffer)
            await loop.run_in_executor(None, buffer_queue.put, None)  # Signal completion

        def callback(
            outdata: npt.NDArray[np.float32],
            frame_count: int,
            _time_info: Any,
            _status: Any,
        ):
            # Audio-thread consumer: refills outdata from queued buffers, pulling new
            # ones as the current buffer is exhausted.
            nonlocal current_buffer, buffer_pos

            frames_written = 0
            while frames_written < frame_count:
                if current_buffer is None or buffer_pos >= len(current_buffer):
                    try:
                        # Short timeout: if the producer is momentarily behind, emit
                        # silence instead of blocking the audio thread indefinitely.
                        current_buffer = buffer_queue.get(timeout=0.1)
                        if current_buffer is None:
                            # Completion sentinel — notify the waiting coroutine and stop.
                            loop.call_soon_threadsafe(event.set)
                            raise sd.CallbackStop
                        buffer_pos = 0

                        if current_buffer.dtype == np.int16 and self.dtype == np.float32:
                            # Rescale int16 PCM to [-1, 1] and force mono column layout.
                            current_buffer = (current_buffer.astype(np.float32) / 32767.0).reshape(-1, self.channels)

                    except queue.Empty:
                        # Underrun: pad the rest of this block with silence and try again next callback.
                        outdata[frames_written:] = 0
                        return

                remaining_frames = len(current_buffer) - buffer_pos
                frames_to_write = min(frame_count - frames_written, remaining_frames)
                outdata[frames_written : frames_written + frames_to_write] = current_buffer[
                    buffer_pos : buffer_pos + frames_to_write
                ]
                buffer_pos += frames_to_write
                frames_written += frames_to_write

        # Bound the callback's nonlocal state before the stream starts.
        current_buffer = None
        buffer_pos = 0

        producer_task = asyncio.create_task(buffer_producer())

        with sd.OutputStream(
            samplerate=SAMPLE_RATE,
            channels=self.channels,
            dtype=self.dtype,
            callback=callback,
        ):
            await event.wait()

        # Ensure the producer has finished (it may still be parked on the final put).
        await producer_task

0 commit comments

Comments
 (0)