-
Notifications
You must be signed in to change notification settings - Fork 410
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
1bc93d4
commit f7ba918
Showing
31 changed files
with
1,748 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
from .api import generate_audio, text_to_semantic, semantic_to_waveform, save_as_prompt | ||
from .generation import SAMPLE_RATE, preload_models |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
from typing import Dict, Optional, Union | ||
|
||
import numpy as np | ||
|
||
from .generation import codec_decode, generate_coarse, generate_fine, generate_text_semantic | ||
|
||
|
||
def generate_with_settings(text_prompt, semantic_temp=0.6, eos_p=0.2, coarse_temp=0.7, fine_temp=0.5, voice_name=None, output_full=False):
    """Run the semantic -> coarse -> fine -> decode chain with per-stage settings.

    Args:
        text_prompt: text handed to `generate_text_semantic`
        semantic_temp: `temp` used for the semantic stage
        eos_p: forwarded to the semantic stage as `min_eos_p`
        coarse_temp: `temp` used for the coarse stage
        fine_temp: `temp` used for the fine stage
        voice_name: forwarded to every stage as `history_prompt`
        output_full: also return the intermediate tokens of all three stages
    Returns:
        the result of `codec_decode` on the fine tokens, or
        `(full_generation, decoded)` when `output_full` is set, where
        `full_generation` has the keys "semantic_prompt", "coarse_prompt"
        and "fine_prompt"
    """
    # stage 1: text -> semantic tokens
    semantic_tokens = generate_text_semantic(
        text_prompt,
        history_prompt=voice_name,
        temp=semantic_temp,
        min_eos_p=eos_p,
        use_kv_caching=True,
    )

    # stage 2: semantic -> coarse tokens
    coarse_tokens = generate_coarse(
        semantic_tokens,
        history_prompt=voice_name,
        temp=coarse_temp,
        use_kv_caching=True,
    )

    # stage 3: coarse -> fine tokens
    fine_tokens = generate_fine(
        coarse_tokens,
        history_prompt=voice_name,
        temp=fine_temp,
    )

    decoded = codec_decode(fine_tokens)
    if not output_full:
        return decoded

    return {
        'semantic_prompt': semantic_tokens,
        'coarse_prompt': coarse_tokens,
        'fine_prompt': fine_tokens,
    }, decoded
|
||
|
||
def text_to_semantic(
    text: str,
    history_prompt: Optional[Union[Dict, str]] = None,
    temp: float = 0.7,
    silent: bool = False,
):
    """Turn text into a semantic token array.

    Thin wrapper around `generate_text_semantic` that always enables
    key/value caching (`use_kv_caching=True`).

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        temp: generation temperature (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar
    Returns:
        numpy semantic array to be fed into `semantic_to_waveform`
    """
    return generate_text_semantic(
        text,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
    )
|
||
|
||
def semantic_to_waveform(
    semantic_tokens: np.ndarray,
    history_prompt: Optional[Union[Dict, str]] = None,
    temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
    fine_temp: float = 0.5,
):
    """Generate audio array from semantic input.

    Runs the coarse stage, then the fine stage, then decodes the fine
    tokens with `codec_decode`.

    Args:
        semantic_tokens: semantic token output from `text_to_semantic`
        history_prompt: history choice for audio cloning
        temp: coarse-stage generation temperature
            (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar (forwarded to the coarse stage)
        output_full: return full generation to be used as a history prompt
        fine_temp: fine-stage generation temperature; defaults to 0.5, the
            value that was previously hard-coded, so existing callers are
            unaffected
    Returns:
        numpy audio array at sample frequency 24khz, or
        `(full_generation, audio_arr)` when `output_full` is set
    """
    coarse_tokens = generate_coarse(
        semantic_tokens,
        history_prompt=history_prompt,
        temp=temp,
        silent=silent,
        use_kv_caching=True,
    )
    fine_tokens = generate_fine(
        coarse_tokens,
        history_prompt=history_prompt,
        temp=fine_temp,
    )
    audio_arr = codec_decode(fine_tokens)
    if output_full:
        # Same three keys that `save_as_prompt` requires.
        full_generation = {
            "semantic_prompt": semantic_tokens,
            "coarse_prompt": coarse_tokens,
            "fine_prompt": fine_tokens,
        }
        return full_generation, audio_arr
    return audio_arr
|
||
|
||
def save_as_prompt(filepath, full_generation):
    """Save a full generation to an `.npz` file.

    Args:
        filepath: destination path (`str`, `bytes` or `os.PathLike`);
            must end with ".npz"
        full_generation: dict containing at least the keys
            "semantic_prompt", "coarse_prompt" and "fine_prompt"; every
            entry of the dict is written to the archive
    Raises:
        AssertionError: if the path does not end with ".npz", if
            `full_generation` is not a dict, or if a required key is
            missing. The exception type is kept from the earlier
            `assert`-based checks, but it is now raised explicitly so the
            validation still runs under `python -O`.
    """
    import os  # local import: the module header does not import os

    required_keys = ("semantic_prompt", "coarse_prompt", "fine_prompt")

    # Normalise str / bytes / PathLike to str so the suffix check works for
    # pathlib.Path objects as well as plain strings.
    path = os.fsdecode(filepath)
    if not path.endswith(".npz"):
        raise AssertionError(f"filepath must end with '.npz', got {path!r}")
    if not isinstance(full_generation, dict):
        raise AssertionError(
            f"full_generation must be a dict, got {type(full_generation).__name__}"
        )
    missing = [key for key in required_keys if key not in full_generation]
    if missing:
        raise AssertionError(f"full_generation is missing keys: {missing}")

    np.savez(path, **full_generation)
|
||
|
||
def generate_audio(
    text: str,
    history_prompt: Optional[Union[Dict, str]] = None,
    text_temp: float = 0.7,
    waveform_temp: float = 0.7,
    silent: bool = False,
    output_full: bool = False,
):
    """Generate audio array from input text.

    Chains `text_to_semantic` and `semantic_to_waveform`, using the same
    `history_prompt` and `silent` setting for both steps.

    Args:
        text: text to be turned into audio
        history_prompt: history choice for audio cloning
        text_temp: temperature passed to `text_to_semantic`
            (1.0 more diverse, 0.0 more conservative)
        waveform_temp: temperature passed to `semantic_to_waveform`
            (1.0 more diverse, 0.0 more conservative)
        silent: disable progress bar
        output_full: return full generation to be used as a history prompt
    Returns:
        numpy audio array at sample frequency 24khz, or
        `(full_generation, audio_arr)` when `output_full` is set
    """
    tokens = text_to_semantic(
        text,
        history_prompt=history_prompt,
        temp=text_temp,
        silent=silent,
    )
    # `semantic_to_waveform` already returns either the bare audio array or
    # the (full_generation, audio_arr) pair depending on `output_full`, so
    # its result is handed back unchanged.
    return semantic_to_waveform(
        tokens,
        history_prompt=history_prompt,
        temp=waveform_temp,
        silent=silent,
        output_full=output_full,
    )
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Oops, something went wrong.