Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(helpers): add helpers for local audio testing #2215

Merged
merged 2 commits into from
Mar 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 0 additions & 26 deletions examples/audio.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env rye run python

import time
from pathlib import Path

from openai import OpenAI
Expand All @@ -12,8 +11,6 @@


def main() -> None:
stream_to_speakers()

# Create text-to-speech audio file
with openai.audio.speech.with_streaming_response.create(
model="tts-1",
Expand All @@ -37,28 +34,5 @@ def main() -> None:
print(translation.text)


def stream_to_speakers() -> None:
import pyaudio

player_stream = pyaudio.PyAudio().open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)

start_time = time.time()

with openai.audio.speech.with_streaming_response.create(
model="tts-1",
voice="alloy",
response_format="pcm", # similar to WAV, but without a header chunk at the start.
input="""I see skies of blue and clouds of white
The bright blessed days, the dark sacred nights
And I think to myself
What a wonderful world""",
) as response:
print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
for chunk in response.iter_bytes(chunk_size=1024):
player_stream.write(chunk)

print(f"Done in {int((time.time() - start_time) * 1000)}ms.")


if __name__ == "__main__":
main()
25 changes: 25 additions & 0 deletions examples/speech_to_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env rye run python

import asyncio

from openai import AsyncOpenAI
from openai.helpers import Microphone

# gets OPENAI_API_KEY from your environment variables
openai = AsyncOpenAI()


async def main() -> None:
    """Record 10 seconds of microphone audio, transcribe it, and print the text."""
    print("Recording for the next 10 seconds...")
    mic = Microphone(timeout=10)
    recording = await mic.record()
    print("Recording complete")

    # Send the captured audio to the transcription endpoint.
    result = await openai.audio.transcriptions.create(
        model="whisper-1",
        file=recording,
    )
    print(result.text)


if __name__ == "__main__":
    asyncio.run(main())
31 changes: 31 additions & 0 deletions examples/text_to_speech.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env rye run python

import time
import asyncio

from openai import AsyncOpenAI
from openai.helpers import LocalAudioPlayer

# gets OPENAI_API_KEY from your environment variables
openai = AsyncOpenAI()


async def main() -> None:
    """Stream a text-to-speech response and play it on the local speakers."""
    start_time = time.time()

    request = openai.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice="alloy",
        response_format="pcm",  # similar to WAV, but without a header chunk at the start.
        input="""I see skies of blue and clouds of white
The bright blessed days, the dark sacred nights
And I think to myself
What a wonderful world""",
    )
    async with request as response:
        print(f"Time to first byte: {int((time.time() - start_time) * 1000)}ms")
        player = LocalAudioPlayer()
        await player.play(response)
        print(f"Time to play: {int((time.time() - start_time) * 1000)}ms")


if __name__ == "__main__":
    asyncio.run(main())
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ dependencies = [
"sniffio",
"tqdm > 4",
"jiter>=0.4.0, <1",
"sounddevice>=0.5.1",
"numpy>=2.0.2",
]
requires-python = ">= 3.8"
classifiers = [
Expand Down
8 changes: 6 additions & 2 deletions requirements-dev.lock
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# all-features: true
# with-sources: false
# generate-hashes: false
# universal: false

-e file:.
annotated-types==0.6.0
Expand All @@ -32,6 +33,7 @@ certifi==2023.7.22
# via requests
cffi==1.16.0
# via cryptography
# via sounddevice
charset-normalizer==3.3.2
# via requests
click==8.1.7
Expand Down Expand Up @@ -91,7 +93,7 @@ nest-asyncio==1.6.0
nodeenv==1.8.0
# via pyright
nox==2023.4.22
numpy==1.26.3
numpy==2.0.2
# via openai
# via pandas
# via pandas-stubs
Expand All @@ -101,7 +103,7 @@ packaging==23.2
# via black
# via nox
# via pytest
pandas==2.1.4
pandas==2.2.3
# via openai
pandas-stubs==2.1.4.231227
# via openai
Expand Down Expand Up @@ -153,6 +155,8 @@ sniffio==1.3.0
# via trio
sortedcontainers==2.4.0
# via trio
sounddevice==0.5.1
# via openai
time-machine==2.9.0
toml==0.10.2
# via inline-snapshot
Expand Down
7 changes: 7 additions & 0 deletions requirements.lock
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# all-features: true
# with-sources: false
# generate-hashes: false
# universal: false

-e file:.
annotated-types==0.6.0
Expand All @@ -17,6 +18,8 @@ anyio==4.1.0
certifi==2023.7.22
# via httpcore
# via httpx
cffi==1.17.1
# via sounddevice
distro==1.8.0
# via openai
exceptiongroup==1.2.2
Expand All @@ -40,6 +43,8 @@ pandas==2.2.3
# via openai
pandas-stubs==2.2.2.240807
# via openai
pycparser==2.22
# via cffi
pydantic==2.10.3
# via openai
pydantic-core==2.27.1
Expand All @@ -53,6 +58,8 @@ six==1.16.0
sniffio==1.3.0
# via anyio
# via openai
sounddevice==0.5.1
# via openai
tqdm==4.66.5
# via openai
types-pytz==2024.2.0.20241003
Expand Down
3 changes: 3 additions & 0 deletions src/openai/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Public re-export surface for the local audio testing helpers.
from .lib.helpers import LocalAudioPlayer, Microphone

__all__ = ["LocalAudioPlayer", "Microphone"]
4 changes: 4 additions & 0 deletions src/openai/lib/helpers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Aggregate the audio helper classes into one importable namespace.
from .local_audio_player import LocalAudioPlayer
from .microphone import Microphone

__all__ = ["Microphone", "LocalAudioPlayer"]
161 changes: 161 additions & 0 deletions src/openai/lib/helpers/local_audio_player.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import queue
import asyncio
from typing import Any, Union, Callable, AsyncGenerator, cast

import numpy as np
import sounddevice as sd
import numpy.typing as npt

from ... import _legacy_response
from ..._response import StreamedBinaryAPIResponse, AsyncStreamedBinaryAPIResponse

SAMPLE_RATE = 24000


class LocalAudioPlayer:
    """Play PCM audio (numpy buffers or streamed TTS responses) through the
    default output device via ``sounddevice``.

    Audio is treated as mono, ``SAMPLE_RATE`` Hz, int16 or float32 PCM.
    """

    def __init__(
        self,
        should_stop: Union[Callable[[], bool], None] = None,
    ):
        """
        Args:
            should_stop: optional callback polled from the audio thread during
                ``play()``; returning ``True`` stops playback early.
        """
        self.channels = 1
        self.dtype = np.float32
        self.should_stop = should_stop

    async def _tts_response_to_buffer(
        self,
        response: Union[
            _legacy_response.HttpxBinaryResponseContent,
            AsyncStreamedBinaryAPIResponse,
            StreamedBinaryAPIResponse,
        ],
    ) -> npt.NDArray[np.float32]:
        """Drain a (sync or async) binary TTS response into one float32
        buffer of shape ``(frames, 1)``, normalized to ``[-1.0, 1.0]``."""
        chunks: list[bytes] = []
        if isinstance(
            response, (_legacy_response.HttpxBinaryResponseContent, StreamedBinaryAPIResponse)
        ):
            for chunk in response.iter_bytes(chunk_size=1024):
                if chunk:
                    chunks.append(chunk)
        else:
            async for chunk in response.iter_bytes(chunk_size=1024):
                if chunk:
                    chunks.append(chunk)

        audio_bytes = b"".join(chunks)
        # The response body is raw 16-bit PCM; normalize to float32 in [-1, 1].
        audio_np = np.frombuffer(audio_bytes, dtype=np.int16).astype(np.float32) / 32767.0
        audio_np = audio_np.reshape(-1, 1)
        return audio_np

    async def play(
        self,
        input: Union[
            npt.NDArray[np.int16],
            npt.NDArray[np.float32],
            _legacy_response.HttpxBinaryResponseContent,
            AsyncStreamedBinaryAPIResponse,
            StreamedBinaryAPIResponse,
        ],
    ) -> None:
        """Fully buffer ``input`` and play it, returning once playback ends.

        Args:
            input: an int16/float32 numpy array, or a binary TTS response.

        Raises:
            ValueError: for an ndarray of any other dtype.
        """
        audio_content: npt.NDArray[np.float32]
        if isinstance(input, np.ndarray):
            if input.dtype == np.int16 and self.dtype == np.float32:
                audio_content = (input.astype(np.float32) / 32767.0).reshape(-1, self.channels)
            elif input.dtype == np.float32:
                # Fix: a 1-D float32 array previously reached
                # ``audio_content.shape[1]`` below un-reshaped and raised
                # IndexError; normalize its shape like the int16 path.
                audio_content = cast(npt.NDArray[np.float32], input).reshape(-1, self.channels)
            else:
                raise ValueError(f"Unsupported dtype: {input.dtype}")
        else:
            audio_content = await self._tts_response_to_buffer(input)

        # get_event_loop() is deprecated inside coroutines; we are always
        # running in a loop here, so get_running_loop() is the correct call.
        loop = asyncio.get_running_loop()
        event = asyncio.Event()
        idx = 0

        def callback(
            outdata: npt.NDArray[np.float32],
            frame_count: int,
            _time_info: Any,
            _status: Any,
        ):
            # Runs on the PortAudio thread: copy the next slice of audio
            # into ``outdata``, zero-padding the final partial block.
            nonlocal idx

            remainder = len(audio_content) - idx
            if remainder == 0 or (callable(self.should_stop) and self.should_stop()):
                # Wake the coroutine awaiting in play() and end the stream.
                loop.call_soon_threadsafe(event.set)
                raise sd.CallbackStop
            valid_frames = frame_count if remainder >= frame_count else remainder
            outdata[:valid_frames] = audio_content[idx : idx + valid_frames]
            outdata[valid_frames:] = 0
            idx += valid_frames

        stream = sd.OutputStream(
            samplerate=SAMPLE_RATE,
            callback=callback,
            dtype=audio_content.dtype,
            channels=audio_content.shape[1],
        )
        with stream:
            await event.wait()

    async def play_stream(
        self,
        buffer_stream: AsyncGenerator[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None], None],
    ) -> None:
        """Play buffers as they are yielded by ``buffer_stream``.

        A ``None`` item (or generator exhaustion) ends playback. If the
        producer falls behind, the output is padded with silence rather than
        stopping the stream.
        """
        loop = asyncio.get_running_loop()
        event = asyncio.Event()
        buffer_queue: queue.Queue[Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None]] = queue.Queue(maxsize=50)

        # Playback cursor shared with the audio callback. Initialized before
        # the callback is defined/started (previously these were assigned
        # after the callback body, which worked only by ordering accident).
        current_buffer: Union[npt.NDArray[np.float32], npt.NDArray[np.int16], None] = None
        buffer_pos = 0

        async def buffer_producer():
            # Feed the bounded queue from the async generator via an executor
            # so a full queue never blocks the event loop.
            async for buffer in buffer_stream:
                if buffer is None:
                    break
                await loop.run_in_executor(None, buffer_queue.put, buffer)
            await loop.run_in_executor(None, buffer_queue.put, None)  # Signal completion

        def callback(
            outdata: npt.NDArray[np.float32],
            frame_count: int,
            _time_info: Any,
            _status: Any,
        ):
            # Runs on the PortAudio thread: fill ``outdata`` from queued
            # buffers, converting int16 -> float32 as buffers arrive.
            nonlocal current_buffer, buffer_pos

            frames_written = 0
            while frames_written < frame_count:
                if current_buffer is None or buffer_pos >= len(current_buffer):
                    try:
                        current_buffer = buffer_queue.get(timeout=0.1)
                        if current_buffer is None:
                            # End-of-stream sentinel: wake play_stream().
                            loop.call_soon_threadsafe(event.set)
                            raise sd.CallbackStop
                        buffer_pos = 0

                        if current_buffer.dtype == np.int16 and self.dtype == np.float32:
                            current_buffer = (current_buffer.astype(np.float32) / 32767.0).reshape(-1, self.channels)

                    except queue.Empty:
                        # Producer is behind: pad with silence and retry on
                        # the next callback instead of aborting playback.
                        outdata[frames_written:] = 0
                        return

                remaining_frames = len(current_buffer) - buffer_pos
                frames_to_write = min(frame_count - frames_written, remaining_frames)
                outdata[frames_written : frames_written + frames_to_write] = current_buffer[
                    buffer_pos : buffer_pos + frames_to_write
                ]
                buffer_pos += frames_to_write
                frames_written += frames_to_write

        producer_task = asyncio.create_task(buffer_producer())

        with sd.OutputStream(
            samplerate=SAMPLE_RATE,
            channels=self.channels,
            dtype=self.dtype,
            callback=callback,
        ):
            await event.wait()

        await producer_task
Loading