Merge pull request #223 from sashirestela/196-support-open-ai-realtime-api

Add Support for Realtime API
sashirestela · Nov 28, 2024 · 8c46ce7 · 8c46ce7
2 parents b127751 + 4f007cc
commit 8c46ce7
Show file tree

Hide file tree

Showing 22 changed files with 2,390 additions and 21 deletions.
diff --git a/README.md b/README.md
@@ -20,10 +20,11 @@ A Java library to use the OpenAI Api in the simplest possible way.
   - [Chat Completion with Streaming Example](#chat-completion-with-streaming-example)
   - [Chat Completion with Functions Example](#chat-completion-with-functions-example)
   - [Chat Completion with Vision Example](#chat-completion-with-vision-example)
-  - [Chat Completion with Audio Example](#chat-completion-with-audio-example) **NEW**
+  - [Chat Completion with Audio Example](#chat-completion-with-audio-example)
   - [Chat Completion with Structured Outputs](#chat-completion-with-structured-outputs)
   - [Chat Completion Conversation Example](#chat-completion-conversation-example)
   - [Assistant v2 Conversation Example](#assistant-v2-conversation-example)
+  - [Realtime Conversation Example](#realtime-conversation-example) **NEW**
 - [Support for Additional OpenAI Providers](#-support-for-additional-openai-providers)
   - [Azure OpenAI](#azure-openai)
   - [Anyscale](#anyscale)
@@ -43,18 +44,19 @@ Simple-OpenAI uses the [CleverClient](https://github.com/sashirestela/cleverclie
 ## ✅ Supported Services
 Simple-OpenAI seeks to stay up to date with the most recent changes in OpenAI. Currently, it supports most of the existing features and will continue to update with future changes.
 
-Full support for all of the OpenAI services:
+Full support for most of the OpenAI services:
 
 * Audio (Speech, Transcription, Translation)
 * Batch (Batches of Chat Completion)
-* Chat Completion (Text Generation, Streaming, Function Calling, Vision, Structured Outputs, **Audio**)
+* Chat Completion (Text Generation, Streaming, Function Calling, Vision, Structured Outputs, Audio)
 * Completion (Legacy Text Generation)
 * Embedding  (Vectoring Text)
 * Files (Upload Files)
 * Fine Tuning (Customize Models)
 * Image (Generate, Edit, Variation)
 * Models (List)
 * Moderation (Check Harmful Text)
+* Realtime Beta (Speech-to-Speech Conversation, Multimodality, Function Calling) **NEW**
 * Upload (Upload Large Files in Parts)
 * Assistants Beta v2 (Assistants, Threads, Messages, Runs, Steps, Vector Stores, Streaming, Function Calling, Vision, Structured Outputs)
 
@@ -911,6 +913,10 @@ Thread was deleted: true
 ```
 </details>
 
+### Realtime Conversation Example
+In this example you can see the code to establish a speech-to-speech conversation between you and the model using your microphone and your speaker. See the full code in:
+
+[RealtimeDemo.java](src/demo/java/io/github/sashirestela/openai/demo/RealtimeDemo.java)
 
 ## ✴ Support for Additional OpenAI Providers
 Simple-OpenAI can be used with additional providers that are compatible with the OpenAI API. At this moment, there is support for the following additional providers:
@@ -980,6 +986,7 @@ Examples for each OpenAI service have been created in the folder [demo](https://
     * Image
     * Model
     * Moderation
+    * Realtime
     * Upload
     * Conversation
     * AssistantV2
@@ -1035,7 +1042,7 @@ List of the main users of our library:
 - [SuperTurtyBot](https://github.com/DaRealTurtyWurty/SuperTurtyBot): A multi-purpose discord bot.
 - [Woolly](https://github.com/da-z/woolly): A code generation IntelliJ plugin.
 - [Vinopener](https://github.com/thevinopener/vinopener): A wine recommender app.
-- [Cryptik.ai GPT Chatbot](https://social.wubits.io/wubits/home/66b3969ec4b880134fe96886): A Telegram chatbot factory.
+- [ScalerX.ai](https://scalerX.ai): A Telegram chatbot factory.
 
 
 ## ❤ Show Us Your Love

diff --git a/pom.xml b/pom.xml
@@ -6,7 +6,7 @@
 
   <groupId>io.github.sashirestela</groupId>
   <artifactId>simple-openai</artifactId>
-  <version>3.9.3</version>
+  <version>3.10.0</version>
   <packaging>jar</packaging>
 
   <name>simple-openai</name>
@@ -56,7 +56,7 @@
     <cleverclient.version>1.5.0</cleverclient.version>
     <slimvalidator.version>1.2.2</slimvalidator.version>
     <lombok.version>1.18.36</lombok.version>
-    <jackson.version>2.18.1</jackson.version>
+    <jackson.version>2.18.2</jackson.version>
     <json.schema.version>4.37.0</json.schema.version>
     <junit.version>5.11.3</junit.version>
     <mockito.version>5.14.2</mockito.version>

diff --git a/src/demo/java/io/github/sashirestela/openai/demo/RealtimeDemo.java b/src/demo/java/io/github/sashirestela/openai/demo/RealtimeDemo.java
@@ -0,0 +1,174 @@
+package io.github.sashirestela.openai.demo;
+
+import io.github.sashirestela.openai.SimpleOpenAI;
+import io.github.sashirestela.openai.SimpleOpenAI.RealtimeConfig;
+import io.github.sashirestela.openai.domain.chat.ChatRequest.Modality;
+import io.github.sashirestela.openai.domain.realtime.ClientEvent;
+import io.github.sashirestela.openai.domain.realtime.Configuration;
+import io.github.sashirestela.openai.domain.realtime.ServerEvent;
+
+import javax.sound.sampled.AudioFormat;
+import javax.sound.sampled.AudioSystem;
+import javax.sound.sampled.DataLine;
+import javax.sound.sampled.LineUnavailableException;
+import javax.sound.sampled.SourceDataLine;
+import javax.sound.sampled.TargetDataLine;
+
+import java.util.Base64;
+import java.util.Scanner;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+/**
+ * Demo of a speech-to-speech conversation with the Realtime API using the local microphone and
+ * speaker.
+ * <p>
+ * Each turn records microphone audio until Return is pressed on an empty line, streams it to the
+ * model, requests a response and plays the returned audio. Entering any non-empty line (or closing
+ * standard input) terminates the demo.
+ */
+public class RealtimeDemo {
+
+    private static final int BUFFER_SIZE = 8192;
+
+    /**
+     * Runs the interactive conversation loop.
+     *
+     * @param args not used
+     * @throws LineUnavailableException if the microphone or the speaker line cannot be opened
+     * @throws InterruptedException     declared for compatibility with existing callers
+     */
+    public static void main(String[] args) throws LineUnavailableException, InterruptedException {
+        var sound = new Sound();
+
+        var openAI = SimpleOpenAI.builder()
+                .apiKey(System.getenv("OPENAI_API_KEY"))
+                .realtimeConfig(RealtimeConfig.of("gpt-4o-realtime-preview-2024-10-01"))
+                .build();
+
+        var configuration = Configuration.builder()
+                .modality(Modality.AUDIO)
+                .instructions("Respond with short, direct sentences.")
+                .voice(Configuration.VoiceRealtime.ECHO)
+                .outputAudioFormat(Configuration.AudioFormatRealtime.PCM16)
+                .inputAudioTranscription(null)
+                .turnDetection(null)
+                .temperature(0.9)
+                .build();
+
+        var realtime = openAI.realtime();
+
+        // Play every audio delta as soon as it arrives
+        realtime.onEvent(ServerEvent.ResponseAudioDelta.class, event -> {
+            var audioBytes = Base64.getDecoder().decode(event.getDelta());
+            sound.speaker.write(audioBytes, 0, audioBytes.length);
+        });
+
+        realtime.onEvent(ServerEvent.ResponseAudioDone.class, event -> {
+            delay(1000); // Some delay to receive trailing audio deltas
+            // Drain while the line is still running so the queued audio is actually played;
+            // DataLine.drain() may block on a stopped line that still holds data.
+            sound.speaker.drain();
+            sound.speaker.stop();
+        });
+
+        realtime.onEvent(ServerEvent.ResponseAudioTranscriptDone.class, event -> {
+            System.out.println(event.getTranscript());
+            askForSpeaking();
+        });
+
+        // Connect synchronously and wait for the connection to complete
+        realtime.connect().thenRun(() -> {
+            System.out.println("Connection established!");
+            System.out.println("(Press any key and Return to terminate)");
+            realtime.send(ClientEvent.SessionUpdate.of(configuration)).join();
+        }).join();
+
+        Scanner scanner = new Scanner(System.in);
+        try {
+            askForSpeaking();
+            while (true) {
+                sound.microphone.start();
+                AtomicBoolean isRecording = new AtomicBoolean(true);
+                CompletableFuture<Void> recordingFuture = CompletableFuture.runAsync(() -> {
+                    byte[] data = new byte[BUFFER_SIZE];
+                    try {
+                        while (isRecording.get()) {
+                            int bytesRead = sound.microphone.read(data, 0, data.length);
+                            if (bytesRead > 0) {
+                                // Encode only the bytes actually read, not stale buffer contents
+                                byte[] chunk = new byte[bytesRead];
+                                System.arraycopy(data, 0, chunk, 0, bytesRead);
+                                var dataBase64 = Base64.getEncoder().encodeToString(chunk);
+                                // Send from this thread so chunks keep their capture order and
+                                // are all sent before the recording future completes
+                                realtime.send(ClientEvent.InputAudioBufferAppend.of(dataBase64)).join();
+                            }
+                        }
+                    } catch (Exception e) {
+                        e.printStackTrace();
+                    }
+                });
+
+                // An empty line ends the question; anything else (or end of input) terminates
+                var keyPressed = scanner.hasNextLine() ? scanner.nextLine() : "exit";
+
+                // Always stop the recorder first so it never reads from a line being closed
+                isRecording.set(false);
+                sound.microphone.stop();
+
+                // Wait for recording to finish; every captured chunk has been sent by then
+                recordingFuture.join();
+
+                // Discard leftover samples so they do not leak into the next question
+                sound.microphone.flush();
+
+                if (!keyPressed.isEmpty()) {
+                    break;
+                }
+
+                // Start playback before requesting the response so early deltas are not
+                // written to a stopped line
+                sound.speaker.start();
+
+                // Send ResponseCreate and wait for it to complete
+                realtime.send(ClientEvent.ResponseCreate.of(null)).join();
+
+                System.out.println("Waiting for AI response...\n");
+            }
+        } finally {
+            // Release the console, the audio lines and the connection even if a send failed
+            scanner.close();
+            sound.cleanup();
+            realtime.disconnect();
+        }
+    }
+
+    /** Prompts the user to speak the next question. */
+    private static void askForSpeaking() {
+        System.out.println("\nSpeak your question (press Return when done):");
+    }
+
+    /**
+     * Sleeps for the given time; if interrupted, restores the thread's interrupt flag and returns.
+     *
+     * @param milliseconds time to sleep
+     */
+    private static void delay(int milliseconds) {
+        try {
+            Thread.sleep(milliseconds);
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+        }
+    }
+
+    /**
+     * Holds the microphone and speaker lines, both opened with the same format: 24 kHz, 16-bit,
+     * mono, signed, little-endian PCM.
+     */
+    public static class Sound {
+
+        private static final float SAMPLE_RATE = 24000f;
+        private static final int SAMPLE_SIZE_BITS = 16;
+        private static final int CHANNELS = 1;
+        private static final boolean SIGNED = true;
+        private static final boolean BIG_ENDIAN = false;
+
+        private final TargetDataLine microphone;
+        private final SourceDataLine speaker;
+
+        /**
+         * Opens the microphone and the speaker lines.
+         *
+         * @throws LineUnavailableException if either line is unsupported or cannot be opened
+         */
+        public Sound() throws LineUnavailableException {
+            AudioFormat format = new AudioFormat(
+                    SAMPLE_RATE,
+                    SAMPLE_SIZE_BITS,
+                    CHANNELS,
+                    SIGNED,
+                    BIG_ENDIAN);
+
+            DataLine.Info micInfo = new DataLine.Info(TargetDataLine.class, format);
+            if (!AudioSystem.isLineSupported(micInfo)) {
+                throw new LineUnavailableException("Microphone not supported");
+            }
+            microphone = (TargetDataLine) AudioSystem.getLine(micInfo);
+            microphone.open(format);
+
+            DataLine.Info speakerInfo = new DataLine.Info(SourceDataLine.class, format);
+            if (!AudioSystem.isLineSupported(speakerInfo)) {
+                throw new LineUnavailableException("Speakers not supported");
+            }
+            speaker = (SourceDataLine) AudioSystem.getLine(speakerInfo);
+            speaker.open(format);
+        }
+
+        /**
+         * Stops and closes both lines. Queued data is discarded with flush() rather than drained,
+         * because DataLine.drain() may block on a stopped line that still holds data.
+         */
+        public void cleanup() {
+            microphone.stop();
+            microphone.flush();
+            microphone.close();
+
+            speaker.stop();
+            speaker.flush();
+            speaker.close();
+        }
+
+    }
+
+}
diff --git a/src/main/java/io/github/sashirestela/openai/BaseSimpleOpenAI.java b/src/main/java/io/github/sashirestela/openai/BaseSimpleOpenAI.java
@@ -24,6 +24,8 @@ public abstract class BaseSimpleOpenAI {
     @Setter
     protected CleverClient cleverClient;
 
+    protected OpenAIRealtime realtime;
+
     protected OpenAI.Audios audioService;
     protected OpenAI.Batches batchService;
     protected OpenAI.ChatCompletions chatCompletionService;
@@ -63,6 +65,13 @@ public abstract class BaseSimpleOpenAI {
                 .endOfStream(END_OF_STREAM)
                 .objectMapper(objectMapper)
                 .build();
+        var baseRealtimeConfig = args.getBaseRealtimeConfig();
+        if (baseRealtimeConfig != null) {
+            this.realtime = OpenAIRealtime.builder()
+                    .httpClient(httpClient)
+                    .baseRealtimeConfig(baseRealtimeConfig)
+                    .build();
+        }
     }
 
     /**
@@ -208,4 +217,11 @@ public OpenAIBeta2.VectorStoreFileBatches vectorStoreFileBatches() {
         throw new UnsupportedOperationException(NOT_IMPLEMENTED);
     }
 
+    /**
+     * Default Realtime accessor: always throws UnsupportedOperationException with NOT_IMPLEMENTED.
+     */
+    public OpenAIRealtime realtime() {
+        throw new UnsupportedOperationException(NOT_IMPLEMENTED);
+    }
+
 }
diff --git a/src/main/java/io/github/sashirestela/openai/BaseSimpleOpenAIArgs.java b/src/main/java/io/github/sashirestela/openai/BaseSimpleOpenAIArgs.java
@@ -2,6 +2,7 @@
 
 import com.fasterxml.jackson.databind.ObjectMapper;
 import io.github.sashirestela.cleverclient.http.HttpRequestData;
+import io.github.sashirestela.openai.OpenAIRealtime.BaseRealtimeConfig;
 import lombok.Builder;
 import lombok.Getter;
 import lombok.NonNull;
@@ -20,5 +21,6 @@ public class BaseSimpleOpenAIArgs {
     private final HttpClient httpClient;
     private final UnaryOperator<HttpRequestData> requestInterceptor;
     private final ObjectMapper objectMapper;
+    private final BaseRealtimeConfig baseRealtimeConfig;
 
 }
diff --git a/src/main/java/io/github/sashirestela/openai/OpenAIBeta2.java b/src/main/java/io/github/sashirestela/openai/OpenAIBeta2.java
@@ -39,6 +39,7 @@
 import io.github.sashirestela.openai.domain.assistant.VectorStoreModifyRequest;
 import io.github.sashirestela.openai.domain.assistant.VectorStoreRequest;
 import io.github.sashirestela.openai.domain.assistant.events.AssistantStreamEvent;
+import io.github.sashirestela.openai.support.Constant;
 import io.github.sashirestela.openai.support.Poller;
 
 import java.util.List;
@@ -60,7 +61,7 @@ public interface OpenAIBeta2 {
      * @see <a href="https://platform.openai.com/docs/api-reference/assistants">Assistants</a>
      */
     @Resource("/v1/assistants")
-    @Header(name = "OpenAI-Beta", value = "assistants=v2")
+    @Header(name = Constant.OPENAI_BETA_HEADER, value = Constant.OPENAI_ASSISTANT_VERSION)
     interface Assistants {
 
         /**
@@ -137,7 +138,7 @@ CompletableFuture<Assistant> modify(@Path("assistantId") String assistantId,
      * @see <a href="https://platform.openai.com/docs/api-reference/threads">Threads</a>
      */
     @Resource("/v1/threads")
-    @Header(name = "OpenAI-Beta", value = "assistants=v2")
+    @Header(name = Constant.OPENAI_BETA_HEADER, value = Constant.OPENAI_ASSISTANT_VERSION)
     interface Threads {
 
         /**
@@ -194,7 +195,7 @@ default CompletableFuture<Thread> create() {
      * @see <a href="https://platform.openai.com/docs/api-reference/messages">Thread Messages</a>
      */
     @Resource("/v1/threads/{threadId}/messages")
-    @Header(name = "OpenAI-Beta", value = "assistants=v2")
+    @Header(name = Constant.OPENAI_BETA_HEADER, value = Constant.OPENAI_ASSISTANT_VERSION)
     interface ThreadMessages {
 
         /**
@@ -269,7 +270,7 @@ CompletableFuture<ThreadMessage> modify(@Path("threadId") String threadId, @Path
      * @see <a href="https://platform.openai.com/docs/api-reference/runs">Thread Runs</a>
      */
     @Resource("/v1/threads")
-    @Header(name = "OpenAI-Beta", value = "assistants=v2")
+    @Header(name = Constant.OPENAI_BETA_HEADER, value = Constant.OPENAI_ASSISTANT_VERSION)
     interface ThreadRuns {
 
         /**
@@ -528,7 +529,7 @@ default CompletableFuture<Stream<Event>> submitToolOutputStream(String threadId,
      * @see <a href="https://platform.openai.com/docs/api-reference/run-steps">Thread Run Steps</a>
      */
     @Resource("/v1/threads/{threadId}/runs/{runId}/steps")
-    @Header(name = "OpenAI-Beta", value = "assistants=v2")
+    @Header(name = Constant.OPENAI_BETA_HEADER, value = Constant.OPENAI_ASSISTANT_VERSION)
     interface ThreadRunSteps {
 
         /**
@@ -603,7 +604,7 @@ default CompletableFuture<ThreadRunStep> getOneWithFileSearchResult(@Path("threa
      * @see <a href="https://platform.openai.com/docs/api-reference/vector-stores">Vector Stores</a>
      */
     @Resource("/v1/vector_stores")
-    @Header(name = "OpenAI-Beta", value = "assistants=v2")
+    @Header(name = Constant.OPENAI_BETA_HEADER, value = Constant.OPENAI_ASSISTANT_VERSION)
     interface VectorStores {
 
         /**
@@ -693,7 +694,7 @@ CompletableFuture<VectorStore> modify(@Path("vectorStoreId") String vectorStoreI
      *      Files</a>
      */
     @Resource("/v1/vector_stores/{vectorStoreId}/files")
-    @Header(name = "OpenAI-Beta", value = "assistants=v2")
+    @Header(name = Constant.OPENAI_BETA_HEADER, value = Constant.OPENAI_ASSISTANT_VERSION)
     interface VectorStoreFiles {
 
         /**
@@ -790,7 +791,7 @@ CompletableFuture<DeletedObject> delete(@Path("vectorStoreId") String vectorStor
      *      Store File Batches</a>
      */
     @Resource("/v1/vector_stores/{vectorStoreId}/file_batches")
-    @Header(name = "OpenAI-Beta", value = "assistants=v2")
+    @Header(name = Constant.OPENAI_BETA_HEADER, value = Constant.OPENAI_ASSISTANT_VERSION)
     interface VectorStoreFileBatches {
 
         /**