Skip to content

New Services to Provide Speech-To-Text and Text-To-Speech Functionality from Aristech #35

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Jun 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7eb51de
Initial commit. Set up credentials and create initial client.
andrew-golledge Apr 28, 2025
c328904
Use aristech_stt_client examples/live.rs to implement initial recogn…
andrew-golledge Apr 28, 2025
b9b783d
Move aristech stt code out of lib.rs to reside in separate transcribe…
andrew-golledge Apr 29, 2025
ba6ca1b
Add Aristech speech recognition to Context Switch registry.
andrew-golledge May 5, 2025
7522b00
Convert to `anyhow` errors to avoid using `unwrap`.
andrew-golledge May 8, 2025
0da2b04
Add code for Aristech TTS.
andrew-golledge May 9, 2025
af80025
Make `auth_config` field and `AuthConfig` enum public, to enable use …
andrew-golledge May 13, 2025
5273e56
Initial checkin of example app for aristech stt.
andrew-golledge May 13, 2025
656b798
Read token and secret from environment for tts client credentials. Ov…
andrew-golledge May 16, 2025
c1cd736
New aristech TTS example, loosely based on the azure example.
andrew-golledge May 16, 2025
a07723e
Use `end_of_utterance` flag instead of `is_final` when transcribing.
andrew-golledge May 20, 2025
59933eb
Force microphone to use 16_000 Hz sample rate instead of the default …
andrew-golledge May 20, 2025
1003b80
Add the aristech synthesize service.
andrew-golledge May 20, 2025
ac815ba
Fix problems with authentication parameter serialization.
andrew-golledge May 26, 2025
28a388b
Rebase on master.
andrew-golledge May 26, 2025
3f89e62
Use async_stream instead of starting a separate task to read the audi…
andrew-golledge May 26, 2025
540e2b7
Changes to synthesize example after rebase.
andrew-golledge May 27, 2025
4c5c018
Make use of `RecognitionSpec::default()` and a spot of comment polish…
andrew-golledge Jun 3, 2025
1f536cf
Comment polishing.
andrew-golledge Jun 3, 2025
2fc1607
Changed some parameter names in Aristech transcribe and synthesize se…
andrew-golledge Jun 4, 2025
a41b83a
Install protobuf-compiler in CI build and keep `cargo fmt` happy.
andrew-golledge Jun 4, 2025
8c88c9e
aristech:synthesize: Minor changes
pragmatrix Jun 11, 2025
e273aad
Extend README.md with Aristech prerequisites
pragmatrix Jun 11, 2025
8249620
aristech:transcribe: Convert model and prompt to an option
pragmatrix Jun 11, 2025
5b9a625
Remove separated Registry impl block
pragmatrix Jun 11, 2025
edd71cb
Review aristech examples
pragmatrix Jun 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions .github/workflows/context-switch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,11 @@ jobs:
- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y libasound2-dev # Required for ALSA audio support
# libasound2-dev is required for ALSA audio support; protobuf-compiler is needed for Aristech
sudo apt-get install -y libasound2-dev protobuf-compiler

- name: Verify protoc installation
run: protoc --version

- name: Install Rust toolchain
uses: actions-rs/toolchain@v1
Expand Down Expand Up @@ -49,7 +53,11 @@ jobs:
- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y libasound2-dev # Required for ALSA audio support
# libasound2-dev is required for ALSA audio support; protobuf-compiler is needed for Aristech
sudo apt-get install -y libasound2-dev protobuf-compiler

- name: Verify protoc installation
run: protoc --version

- name: Install Rust toolchain
uses: actions-rs/toolchain@v1
Expand All @@ -73,7 +81,11 @@ jobs:
- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y libasound2-dev # Required for ALSA audio support
# libasound2-dev is required for ALSA audio support; protobuf-compiler is needed for Aristech
sudo apt-get install -y libasound2-dev protobuf-compiler

- name: Verify protoc installation
run: protoc --version

- name: Install Rust toolchain
uses: actions-rs/toolchain@v1
Expand Down
8 changes: 6 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ members = [
"audio-knife",
"audio-test",
"core",
"services/aristech",
"services/azure",
"services/google-transcribe",
"services/openai-dialog",
Expand All @@ -23,6 +24,7 @@ context-switch-core = { workspace = true }
openai-dialog = { path = "services/openai-dialog" }
azure = { workspace = true }
azure-speech = { workspace = true }
aristech = { workspace = true }

# basic

Expand All @@ -35,8 +37,8 @@ derive_more = { workspace = true }

# serialization / async runtime

serde = { workspace = true }
serde_json = "1.0.133"
serde = { workspace = true, features = ["derive"] }
serde_json = { workspace = true }
tokio = { workspace = true }
async-trait = { workspace = true }

Expand All @@ -54,6 +56,7 @@ cpal = "0.15.3"
rodio = { workspace = true, features = ["playback"] }

azure = { workspace = true }
aristech = { workspace = true }
google-transcribe = { path = "services/google-transcribe" }

tokio = { workspace = true, features = ["rt-multi-thread"] }
Expand All @@ -70,6 +73,7 @@ tracing-subscriber = { version = "0.3.19" }
context-switch-core = { path = "core" }
azure = { path = "services/azure" }
playback = { path = "services/playback" }
aristech = { path = "services/aristech" }

anyhow = "1.0.93"
derive_more = { version = "1.0.0", features = ["full"] }
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ Context Switch is a Rust-based framework for building real-time conversational a
- OpenAI API key
- Azure Speech Services subscription key
- Google Cloud API key (for Google transcription)
- For Aristech services:
- Install protoc
- macOS: `brew install protobuf`
- Linux: `apt-get install protobuf-compiler`

### Installation

Expand Down
235 changes: 235 additions & 0 deletions examples/aristech-synthesize.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
use std::{env, thread, time::Duration};

use anyhow::{Context as AnyhowContext, Result};
use rodio::{OutputStreamBuilder, Sink, Source};
use tokio::{select, sync::mpsc::channel};

use aristech::synthesize::{AristechSynthesize, Params as AristechParams};
use context_switch::{InputModality, OutputModality};
use context_switch_core::{
AudioFormat, AudioFrame, AudioProducer, audio,
conversation::{Conversation, Input, Output},
service::Service,
};

/// German demo text that is sent to the Aristech TTS service for synthesis.
const SAMPLE_TEXT: &str = "Hallo! Dies ist eine Demonstration des Aristech Text-zu-Sprache-Dienstes. \
Er wandelt geschriebenen Text in gesprochene Worte um, unter Verwendung fortschrittlicher neuronaler Netzwerke. \
Vielen Dank, dass Sie dieses Beispiel ausprobiert haben.";

#[tokio::main]
async fn main() -> Result<()> {
    // Load environment variables from .env file
    dotenvy::dotenv_override()?;

    println!("Starting Aristech Text-to-Speech example...");

    // Define audio format
    let output_format = AudioFormat {
        channels: 1,
        sample_rate: 22050,
    };

    // Create params for Aristech synthesize service
    let params = get_aristech_params()?;

    // Set up channels for the conversation
    let (output_producer, mut output_consumer) = channel(32);
    let (conv_input_producer, conv_input_consumer) = channel(32);

    // Create the service and conversation
    let aristech = AristechSynthesize;
    let mut conversation = aristech.conversation(
        params,
        Conversation::new(
            InputModality::Text,
            [OutputModality::Audio {
                format: output_format,
            }],
            conv_input_consumer,
            output_producer,
        ),
    );

    // Send the text to be synthesized
    println!("Sending text to be synthesized: \"{SAMPLE_TEXT}\"");
    conv_input_producer
        .send(Input::Text {
            request_id: None,
            text: SAMPLE_TEXT.to_string(),
            text_type: None,
        })
        .await
        .context("Failed to send text input")?;

    // Set up audio playback
    let (audio_producer, playback_task) = setup_audio_playback(output_format).await;

    // Spawn audio playback task
    let playback_handle = tokio::spawn(playback_task);

    // Listen for output and forward audio frames to the audio player
    println!("Waiting for synthesized audio...");
    let mut audio_frames = 0;

    loop {
        select! {
            // Primary conversation: if it ends, no more output will arrive.
            r = &mut conversation => {
                r.context("Conversation stopped")?;
                break;
            }

            // Forward output to audio playback
            output = output_consumer.recv() => {
                match output {
                    Some(Output::Audio { frame }) => {
                        audio_producer.produce(frame)?;
                        audio_frames += 1;
                        if audio_frames % 10 == 0 {
                            println!("Received {audio_frames} audio frames...");
                        }
                    }
                    Some(Output::RequestCompleted {..}) => {
                        println!("Text-to-speech conversion completed! Received {audio_frames} audio frames.");
                        break;
                    }
                    None => {
                        println!("End of output");
                        break;
                    }
                    _ => {}
                }
            }
        }
    }

    // Close the audio producer on EVERY exit path so the playback task's
    // consumer observes end-of-stream. Previously the producer was only
    // dropped in the `RequestCompleted` branch; on the other exits it stayed
    // alive and `playback_handle.await` below could never complete.
    drop(audio_producer);

    // Wait for audio playback to complete
    println!("Waiting for audio playback to complete...");
    if let Err(e) = playback_handle.await {
        println!("Error waiting for playback: {e:?}");
    }

    println!("Example completed successfully!");
    Ok(())
}

// Audio command for communication between async and audio threads
enum AudioCommand {
    // Queue one frame of samples for playback on the dedicated audio thread.
    PlayFrame(AudioFrame),
    // Stop accepting frames; the audio thread then drains the sink and exits.
    Stop,
}

// Helper struct for converting AudioFrame to a rodio Source
struct FrameSource {
    // Sample data to be played back.
    frames: Vec<f32>,
    // Read cursor into `frames`; advanced by the Iterator impl.
    position: usize,
    // Sample rate reported to rodio via the Source impl.
    sample_rate: u32,
    // Channel count reported to rodio via the Source impl.
    channels: u16,
}

impl Iterator for FrameSource {
    type Item = f32;

    /// Yields the sample at the current read position, then advances it.
    /// Returns `None` once all samples have been consumed.
    fn next(&mut self) -> Option<f32> {
        let sample = self.frames.get(self.position).copied()?;
        self.position += 1;
        Some(sample)
    }
}

/// rodio `Source` metadata for `FrameSource`.
impl Source for FrameSource {
    fn current_span_len(&self) -> Option<usize> {
        // All remaining samples belong to one homogeneous span.
        let remaining = self.frames.len() - self.position;
        Some(remaining)
    }

    fn channels(&self) -> u16 {
        self.channels
    }

    fn sample_rate(&self) -> u32 {
        self.sample_rate
    }

    fn total_duration(&self) -> Option<Duration> {
        // Total samples divided by samples-per-second across all channels.
        let seconds = self.frames.len() as f32 / (self.sample_rate as f32 * self.channels as f32);
        Some(Duration::from_secs_f32(seconds))
    }
}

// Set up audio playback system with channel-based communication.
//
// Returns an `AudioProducer` for feeding frames plus a future that forwards
// produced frames to a dedicated audio thread. The future completes only
// after the producer is dropped and the audio thread has drained its sink,
// so awaiting it waits for playback to finish.
async fn setup_audio_playback(
    format: AudioFormat,
) -> (AudioProducer, impl std::future::Future<Output = ()>) {
    let (producer, mut consumer) = format.new_channel();

    let (cmd_tx, cmd_rx) = std::sync::mpsc::channel();

    // Spawn a dedicated audio thread; the output stream is created and used
    // entirely on this thread.
    let handle = thread::spawn(move || {
        // Create output stream in the audio thread. A missing or failing
        // audio device is an environment problem, so panic with a clear
        // message instead of a bare unwrap.
        let stream = OutputStreamBuilder::from_default_device()
            .expect("no default audio output device available")
            .open_stream()
            .expect("failed to open audio output stream");

        let sink = Sink::connect_new(stream.mixer());

        while let Ok(cmd) = cmd_rx.recv() {
            match cmd {
                AudioCommand::PlayFrame(frame) => {
                    let source = FrameSource {
                        frames: audio::from_i16(frame.samples),
                        position: 0,
                        // NOTE(review): sample rate is taken from the frame
                        // but the channel count from the `format` parameter —
                        // confirm these always agree for Aristech output.
                        sample_rate: frame.format.sample_rate,
                        channels: format.channels,
                    };
                    sink.append(source);
                }
                AudioCommand::Stop => break,
            }
        }

        println!("Audio playback finished, waiting for sink to empty");
        sink.sleep_until_end();
    });

    // Create async task to forward frames to the audio thread.
    let forward_task = async move {
        while let Some(frame) = consumer.consume().await {
            if cmd_tx.send(AudioCommand::PlayFrame(frame)).is_err() {
                break;
            }
        }
        // Producer side closed: tell the audio thread to finish and wait
        // (blocking join is acceptable here at the end of the example).
        let _ = cmd_tx.send(AudioCommand::Stop);
        let _ = handle.join();
    };

    (producer, forward_task)
}

// Helper function to get Aristech parameters from environment variables
fn get_aristech_params() -> Result<AristechParams> {
let endpoint =
env::var("ARISTECH_ENDPOINT").context("ARISTECH_ENDPOINT environment variable not set")?;

let voice_id = env::var("ARISTECH_VOICE_ID").unwrap_or_else(|_| "anne_de_DE".to_string());

let token =
env::var("ARISTECH_TOKEN").context("ARISTECH_TOKEN environment variable not set")?;

let secret =
env::var("ARISTECH_SECRET").context("ARISTECH_SECRET environment variable not set")?;

Ok(AristechParams {
endpoint,
voice: Some(voice_id),
token,
secret,
})
}
Loading
Loading