-
Notifications
You must be signed in to change notification settings - Fork 410
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b55e81a
commit 1bc93d4
Showing
4 changed files
with
205 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import os | ||
from datetime import datetime | ||
from mutagen.wave import WAVE | ||
from mutagen.id3._frames import * | ||
|
||
def create_filename(path, seed, name, extension):
    """Build a timestamped output file path, creating its folders on demand.

    The file is placed in ``<cwd>/<path>/<MM-DD-YYYY>/`` and is named
    ``<name>_<HH-MM-SS><extension>``, or
    ``<name>_<HH-MM-SS>_s<seed><extension>`` when a seed is given.

    Args:
        path: Output folder, joined onto the current working directory
            (an absolute path therefore replaces it).
        seed: Seed to embed in the file name, or None to leave it out.
        name: File name prefix.
        extension: Suffix appended verbatim to the file name (e.g. ".wav").

    Returns:
        The full path of the file to write. The file itself is not created,
        only the folders leading to it.
    """
    now = datetime.now()
    sub_folder = os.path.join(os.getcwd(), path, now.strftime("%m-%d-%Y"))
    # A single makedirs creates both the output folder and the dated
    # sub-folder; exist_ok avoids the check-then-create race when several
    # generations start at the same time.
    os.makedirs(sub_folder, exist_ok=True)

    time_str = now.strftime("%H-%M-%S")
    # Identity check on purpose: a seed of 0 is a real seed and is recorded.
    seed_part = "" if seed is None else f"_s{seed}"
    return os.path.join(sub_folder, f"{name}_{time_str}{seed_part}{extension}")
|
||
|
||
def add_id3_tag(filename, text, speakername, seed):
    """Write ID3 metadata describing a generated clip into a WAVE file.

    Args:
        filename: Path of the existing WAVE file to tag.
        text: Prompt text; its first 60 characters become the title (TIT2).
        speakername: Voice used for the clip, or None, in which case
            "Unconditional" is recorded instead.
        seed: Seed value, written verbatim into the artist (TPE1) frame.
    """
    audio = WAVE(filename)
    if speakername is None:
        speakername = "Unconditional"

    # encoding=3 is the ID3v2 text-encoding byte for UTF-8.
    # write id3 tag with text truncated to 60 chars, as a precaution...
    audio["TIT2"] = TIT2(encoding=3, text=text[:60])
    audio["TPE1"] = TPE1(encoding=3, text=f"Voice {speakername} using Seed={seed}")
    audio["TPUB"] = TPUB(encoding=3, text="Bark by Suno AI")
    # NOTE(review): the key "COMMENT" is not a standard ID3 frame id; kept
    # unchanged because readers of these files may rely on it - confirm.
    audio["COMMENT"] = COMM(encoding=3, text="Generated with Bark GUI - Text-Prompted Generative Audio Model. Visit https://github.com/C0untFloyd/bark-gui")
    audio.save()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
import re | ||
import xml.etree.ElementTree as ET | ||
from xml.sax import saxutils | ||
#import nltk | ||
|
||
# Chunked generation originally from https://github.com/serp-ai/bark-with-voice-clone | ||
def split_and_recombine_text(text, desired_length=100, max_length=150):
    # return nltk.sent_tokenize(text)

    # from https://github.com/neonbjb/tortoise-tts
    """Split text into chunks of a desired length, trying to keep sentences intact.

    Args:
        text: The text to split.
        desired_length: Once the current chunk is at least this long, it is
            emitted at the next detected sentence boundary.
        max_length: Chunk length at which a split is forced even when no
            sentence boundary has just been reached.

    Returns:
        A list of stripped chunks; empty chunks and chunks consisting only
        of whitespace or punctuation are dropped.
    """
    # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
    text = re.sub(r"\n\n+", "\n", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[“”]", '"', text)

    rv = []  # finished chunks
    in_quote = False  # toggled every time the cursor lands on a '"'
    current = ""  # chunk being accumulated
    split_pos = []  # cursor positions of sentence boundaries inside `current`
    pos = -1  # cursor index into `text`; -1 means nothing consumed yet
    end_pos = len(text) - 1  # index of the last character

    def seek(delta):
        # Move the cursor `delta` characters (negative = backwards), keeping
        # `current` in step with it, and return the character now under the
        # cursor.
        nonlocal pos, in_quote, current
        is_neg = delta < 0
        for _ in range(abs(delta)):
            if is_neg:
                pos -= 1
                current = current[:-1]
            else:
                pos += 1
                current += text[pos]
            # The quote state flips on the character landed on, in both
            # directions.
            if text[pos] == '"':
                in_quote = not in_quote
        return text[pos]

    def peek(delta):
        # Look at the character `delta` positions from the cursor without
        # moving. Returns "" when out of range. The upper bound is
        # `p < end_pos`, so the final character of `text` is never returned.
        # Because `"" in some_string` is always True, an out-of-range peek
        # satisfies the `peek(...) in "..."` tests used below.
        p = pos + delta
        return text[p] if p < end_pos and p >= 0 else ""

    def commit():
        # Emit the current chunk and reset the per-chunk state.
        nonlocal rv, current, split_pos
        rv.append(current)
        current = ""
        split_pos = []

    while pos < end_pos:
        c = seek(1)
        # do we need to force a split?
        if len(current) >= max_length:
            if len(split_pos) > 0 and len(current) > (desired_length / 2):
                # we have at least one sentence and we are over half the desired length, seek back to the last split
                d = pos - split_pos[-1]
                seek(-d)
            else:
                # no full sentences, seek back until we are not in the middle of a word and split there
                while c not in "!?.,\n " and pos > 0 and len(current) > desired_length:
                    c = seek(-1)
            commit()
        # check for sentence boundaries
        elif not in_quote and (c in "!?]\n" or (c == "." and peek(1) in "\n ")):
            # seek forward if we have consecutive boundary markers but still within the max length
            while (
                pos < len(text) - 1 and len(current) < max_length and peek(1) in "!?.]"
            ):
                c = seek(1)
            split_pos.append(pos)
            if len(current) >= desired_length:
                commit()
        # treat end of quote as a boundary if its followed by a space or newline
        elif in_quote and peek(1) == '"' and peek(2) in "\n ":
            seek(2)
            split_pos.append(pos)
    # whatever is left after the last commit becomes the final chunk
    rv.append(current)

    # clean up, remove lines with only whitespace or punctuation
    rv = [s.strip() for s in rv]
    rv = [s for s in rv if len(s) > 0 and not re.match(r"^[\s\.,;:!?]*$", s)]

    return rv
|
||
def is_ssml(value):
    """Report whether ``value`` parses as well-formed XML.

    Only well-formedness is checked; the document is not validated against
    the SSML schema.

    Returns:
        True when ``ET.fromstring`` accepts the input, False when it raises
        ``ET.ParseError``.
    """
    try:
        ET.fromstring(value)
    except ET.ParseError:
        well_formed = False
    else:
        well_formed = True
    return well_formed
|
||
def build_ssml(rawtext, selected_voice):
    """Wrap every non-empty line of ``rawtext`` in an SSML ``<voice>`` element.

    Args:
        rawtext: Plain text; it is split on newlines, each line is stripped,
            and empty lines are skipped.
        selected_voice: Voice name placed in the ``name`` attribute of every
            ``<voice>`` element.

    Returns:
        A complete ``<speak>`` document as a string.
    """
    # quoteattr escapes the value and adds the surrounding quotes, so a voice
    # name containing '"', '<' or '&' can no longer produce malformed XML.
    # str() keeps accepting non-string names the way the f-string did.
    voice_attr = saxutils.quoteattr(str(selected_voice))

    stripped_lines = (line.strip() for line in rawtext.split("\n"))
    voice_elements = "".join(
        f"\n<voice name={voice_attr}>{saxutils.escape(line)}</voice>"
        for line in stripped_lines
        if line
    )
    ssml = f"""<?xml version="1.0"?>
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.w3.org/2001/10/synthesis
http://www.w3.org/TR/speech-synthesis/synthesis.xsd"
xml:lang="en-US">
{voice_elements}
</speak>
"""
    return ssml
|
||
def create_clips_from_ssml(ssmlinput):
    """Turn an SSML document into a flat list of ``(voice_name, text)`` clips.

    Every ``<voice>`` element in the SSML synthesis namespace contributes the
    chunks that ``split_and_recombine_text`` produces from its leading text;
    chunks of one character or less are skipped.

    Args:
        ssmlinput: SSML document as a string.

    Returns:
        A list of ``(voice_name, chunk)`` tuples in document order.

    Raises:
        KeyError: if a ``<voice>`` element has no ``name`` attribute.
    """
    voice_tag = '{http://www.w3.org/2001/10/synthesis}voice'
    root = ET.fromstring(ssmlinput)

    clips = []
    for element in root.iter(voice_tag):
        speaker = element.attrib['name']
        spoken = (element.text or '').strip()
        if not spoken:
            continue
        clips.extend(
            (speaker, chunk)
            for chunk in split_and_recombine_text(spoken)
            if len(chunk) > 1
        )
    return clips
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import yaml | ||
|
||
class Settings:
    """Application settings backed by a YAML config file.

    Every setting is always present as an attribute after construction: a
    missing, unreadable, empty or malformed config file falls back to the
    defaults, so ``save()`` and attribute reads never hit an undefined
    attribute.
    """

    # (attribute name, YAML key, default value) for every persisted setting.
    _FIELDS = (
        ('selected_theme', 'selected_theme', "gstaff/xkcd"),
        ('server_name', 'server_name', ""),
        ('server_port', 'server_port', 0),
        ('server_share', 'server_share', False),
        ('input_text_desired_length', 'input_text_desired_length', 110),
        ('input_text_max_length', 'input_text_max_length', 170),
        ('silence_sentence', 'silence_between_sentences', 250),
        ('silence_speakers', 'silence_between_speakers', 500),
        ('output_folder_path', 'output_folder_path', 'outputs'),
    )

    def __init__(self, config_file):
        """Remember the config file path and load the settings from it."""
        self.config_file = config_file
        self.load()

    def load(self):
        """Populate all settings from the config file, using defaults for gaps."""
        # Start from the defaults so every attribute exists no matter what
        # happens while reading the file.
        for attr, _key, default in self._FIELDS:
            setattr(self, attr, default)

        try:
            with open(self.config_file, 'r') as f:
                # SECURITY NOTE: FullLoader can construct more than plain
                # data types; the config is assumed to be a trusted local
                # file. Consider yaml.safe_load if that ever changes.
                data = yaml.load(f, Loader=yaml.FullLoader)
        except (OSError, UnicodeError):
            # Missing or unreadable file: keep the defaults.
            return
        except yaml.YAMLError:
            # Malformed YAML: keep the defaults.
            return

        # An empty file yields None; anything but a mapping has no settings.
        if not isinstance(data, dict):
            return

        for attr, key, default in self._FIELDS:
            setattr(self, attr, data.get(key, default))

    def save(self):
        """Write the current settings to the config file as YAML."""
        data = {key: getattr(self, attr) for attr, key, _default in self._FIELDS}
        with open(self.config_file, 'w') as f:
            yaml.dump(data, f)
|
||
|
||
|