Skip to content

Commit

Permalink
Merge pull request #2 from liuhuapiaoyuan/feature/voice-provider
Browse files Browse the repository at this point in the history
Feature/voice provider
  • Loading branch information
YOYZHANG authored Oct 24, 2024
2 parents cf31a95 + d4b8e56 commit 1528868
Show file tree
Hide file tree
Showing 10 changed files with 235 additions and 31 deletions.
1 change: 1 addition & 0 deletions backend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ htmlcov
.cache
.venv
.env
tmp/cache/*
14 changes: 12 additions & 2 deletions backend/api/routes/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from fastapi.responses import StreamingResponse, JSONResponse
import json
from typing import Dict, Optional
from constants import SPEEKERS
from utils import combine_audio, generate_dialogue, generate_podcast_info, generate_podcast_summary, get_pdf_text

router = APIRouter()
Expand All @@ -25,6 +26,12 @@ async def generate_transcript(
def test():
return {"message": "Hello World"}


@router.get("/speekers")
def speeker():
    """Return the available TTS voices, grouped by provider ("fishaudio", "azure").

    NOTE(review): "speekers"/"speeker" is a misspelling of "speakers", but the
    route path and constant name are kept as-is because the frontend calls this
    exact URL — renaming would be a breaking API change.
    """
    return JSONResponse(content=SPEEKERS)


@router.post("/summarize")
async def get_summary(
textInput: str = Form(...),
Expand Down Expand Up @@ -67,12 +74,15 @@ async def get_pod_info(
async def audio(
    background_tasks: BackgroundTasks,
    text: str = Form(...),
    host_voice: str = Form(...),
    guest_voice: str = Form(...),
    language: str = Form(...),
    provider: str = Form(...)
):
    """Kick off asynchronous audio generation for a dialogue transcript.

    Schedules `combine_audio` as a FastAPI background task and immediately
    returns a task id the client can poll for completion status.
    """
    new_task_id = str(uuid.uuid4())
    task_status[new_task_id] = {"status": "processing"}

    background_tasks.add_task(
        combine_audio,
        task_status,
        new_task_id,
        text,
        language,
        provider,
        host_voice,
        guest_voice,
    )

    return JSONResponse(content={"task_id": new_task_id, "status": "processing"})

Expand Down
38 changes: 36 additions & 2 deletions backend/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,14 @@

SPEECH_KEY = os.getenv('SPEECH_KEY')
SPEECH_REGION = "japaneast"

FISHAUDIO_KEY = os.getenv('FISHAUDIO_KEY')

# Fireworks API-related constants
FIREWORKS_API_KEY = os.getenv('FIREWORKS_API_KEY')
FIREWORKS_BASE_URL = "https://api.fireworks.ai/inference/v1"
FIREWORKS_BASE_URL = os.getenv('FIREWORKS_BASE_URL',"https://api.fireworks.ai/inference/v1")
FIREWORKS_MAX_TOKENS = 16_384
FIREWORKS_MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct"
FIREWORKS_MODEL_ID = os.getenv('FIREWORKS_MODEL_ID',"accounts/fireworks/models/llama-v3p1-405b-instruct")
FIREWORKS_TEMPERATURE = 0.1
FIREWORKS_JSON_RETRY_ATTEMPTS = 3
# Suno related constants
Expand All @@ -47,3 +50,34 @@
"Turkish": "tr",
}


# Fish Audio voice models: each entry maps a Fish Audio TTS reference id to a
# human-readable (Chinese) display name for the voice picker.
# NOTE(review): several display names repeat (e.g. "雷军" appears three times
# with different ids) — presumably distinct community-uploaded models; verify
# against the Fish Audio catalog.
FISHAUDIO_SPEEKER = [
    { "id": "59cb5986671546eaa6ca8ae6f29f6d22", "name": "央视配音" },
    { "id": "738d0cc1a3e9430a9de2b544a466a7fc", "name": "雷军" },
    { "id": "54a5170264694bfc8e9ad98df7bd89c3", "name": "丁真" },
    { "id": "7f92f8afb8ec43bf81429cc1c9199cb1", "name": "AD学姐" },
    { "id": "0eb38bc974e1459facca38b359e13511", "name": "赛马娘" },
    { "id": "e80ea225770f42f79d50aa98be3cedfc", "name": "孙笑川258" },
    { "id": "e4642e5edccd4d9ab61a69e82d4f8a14", "name": "蔡徐坤" },
    { "id": "f7561ff309bd4040a59f1e600f4f4338", "name": "黑手" },
    { "id": "332941d1360c48949f1b4e0cabf912cd", "name": "丁真(锐刻五代版)" },
    { "id": "1aacaeb1b840436391b835fd5513f4c4", "name": "芙宁娜" },
    { "id": "3b55b3d84d2f453a98d8ca9bb24182d6", "name": "邓紫琪" },
    { "id": "7af4d620be1c4c6686132f21940d51c5", "name": "东雪莲" },
    { "id": "e1cfccf59a1c4492b5f51c7c62a8abd2", "name": "永雏塔菲" },
    { "id": "665e031efe27435780ebfa56cc7e0e0d", "name": "月半猫" },
    { "id": "aebaa2305aa2452fbdc8f41eec852a79", "name": "雷军" },
    { "id": "7c66db6e457c4d53b1fe428a8c547953", "name": "郭德纲" },
    { "id": "99503144194c45ed8fb998ceac181dcc", "name": "贝利亚" },
    { "id": "4462fa28f3824bff808a94a6075570e5", "name": "雷军" },
    { "id": "188c9b7c06654042be0e8a25781761e8", "name": "周杰伦" },
    { "id": "6ce7ea8ada884bf3889fa7c7fb206691", "name": "御女茉莉" }
]
# Voices exposed via GET /speekers, keyed by TTS provider name.
# NOTE(review): "SPEEKER" is a misspelling of "SPEAKER", kept because routes
# and other modules import this exact name.
SPEEKERS = {
    "fishaudio":FISHAUDIO_SPEEKER,
    "azure":[
        {"id":"zh-CN-YunxiNeural","name":"云希"},
        {"id":"zh-CN-YunzeNeural","name":"云哲"},
        {"id":"zh-CN-YunxuanNeural","name":"晓萱"},
    ]
}
33 changes: 33 additions & 0 deletions backend/fishaudio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from fish_audio_sdk import Session, TTSRequest, ReferenceAudio
from pydub import AudioSegment
import io

from constants import FISHAUDIO_KEY,FISHAUDIO_SPEEKER



import random

def get_adapter_speeker_id(speaker_name):
    """Map a dialogue role to a Fish Audio voice reference id.

    The host role ("主持人") always gets the first configured voice; every
    other role is assigned a random voice from the pool.
    """
    if speaker_name == "主持人":
        chosen = FISHAUDIO_SPEEKER[0]
    else:
        chosen = random.choice(FISHAUDIO_SPEEKER)
    return chosen["id"]

def fishaudio_tts(text, reference_id=None) -> AudioSegment:
    """Convert the given text to speech via Fish Audio and return an AudioSegment.

    :param text: the text to synthesize
    :param reference_id: optional Fish Audio voice model (reference) id
    :return: the synthesized speech, decoded from MP3 into a pydub AudioSegment

    NOTE(review): a new Session is created per call — presumably cheap, but
    confirm the SDK does not recommend reuse for connection pooling.
    """
    session = Session(FISHAUDIO_KEY)
    audio_buffer = io.BytesIO()
    # Stream the synthesized MP3 chunks into an in-memory buffer.
    for chunk in session.tts(TTSRequest(
        reference_id=reference_id,
        text=text
    )):
        audio_buffer.write(chunk)
    audio_buffer.seek(0)  # rewind so pydub reads from the start of the buffer
    return AudioSegment.from_file(audio_buffer, format="mp3")
1 change: 1 addition & 0 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ numpy==2.1.1
python-multipart==0.0.12
PyPDF2==3.0.1
azure-cognitiveservices-speech==1.41.1
fish_audio_sdk
41 changes: 31 additions & 10 deletions backend/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from typing import Any, Dict, Generator
import uuid
from openai import OpenAI
from fishaudio import fishaudio_tts
from prompts import LANGUAGE_MODIFIER, LENGTH_MODIFIERS, PODCAST_INFO_PROMPT, QUESTION_MODIFIER, SUMMARY_INFO_PROMPT, SYSTEM_PROMPT, TONE_MODIFIER
import json
from pydub import AudioSegment
Expand Down Expand Up @@ -49,10 +50,12 @@ def generate_dialogue(pdfFile, textInput, tone, duration, language) -> Generator

yield json.dumps({"type": "final", "content": full_response})

async def process_line(line, voice):
return await generate_podcast_audio(line['content'], voice)
async def process_line(line, voice, provider):
    """Synthesize one dialogue line with the selected TTS provider.

    :param line: dict with a 'content' key holding the text to speak
    :param voice: provider-specific voice id (Fish Audio reference id or Azure
        voice name)
    :param provider: 'fishaudio' selects Fish Audio; any other value falls
        back to Azure
    """
    if provider == 'fishaudio':
        # Call the fish helper directly, for symmetry with the Azure branch;
        # the generate_podcast_audio alias added an extra, misleading hop.
        return await generate_podcast_audio_by_fish(line['content'], voice)
    return await generate_podcast_audio_by_azure(line['content'], voice)

async def generate_podcast_audio(text: str, voice: str) -> str:
async def generate_podcast_audio_by_azure(text: str, voice: str) -> str:
try:
speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SPEECH_REGION)
speech_config.speech_synthesis_voice_name = voice
Expand Down Expand Up @@ -80,7 +83,27 @@ async def generate_podcast_audio(text: str, voice: str) -> str:
print(f"Error in generate_podcast_audio: {e}")
raise

async def combine_audio(task_status: Dict[str, Dict], task_id: str, text: str, language: str) -> Generator[str, None, None]:
async def generate_podcast_audio(text: str, voice: str) -> AudioSegment:
    """Synthesize speech for `text`; kept as an alias that now routes to Fish Audio.

    Return annotation fixed from `str`: this delegates to
    generate_podcast_audio_by_fish, which returns a pydub AudioSegment.
    """
    return await generate_podcast_audio_by_fish(text, voice)

async def generate_podcast_audio_by_fish(text: str, voice: str) -> AudioSegment:
    """Synthesize `text` with Fish Audio using the given voice reference id.

    Return annotation fixed from `str`: fishaudio_tts returns a pydub
    AudioSegment.

    :raises: re-raises any error from the Fish Audio SDK after logging it.
    """
    try:
        return fishaudio_tts(text=text, reference_id=voice)
    except Exception as e:
        # Report the actual function name (was misreported as
        # generate_podcast_audio) so failures are traceable to this provider.
        print(f"Error in generate_podcast_audio_by_fish: {e}")
        raise
async def process_lines_with_limit(lines, provider, host_voice, guest_voice, max_concurrency):
    """Synthesize all dialogue lines concurrently, capped at `max_concurrency`.

    Host lines (speaker '主持人' or 'Host') use `host_voice`; every other
    speaker uses `guest_voice`. Results come back in input order
    (asyncio.gather preserves ordering).

    :param lines: dicts with 'speaker' and 'content' keys
    :param provider: TTS provider name, forwarded to process_line
    :param max_concurrency: maximum number of in-flight TTS requests
    :return: list of synthesized audio segments, one per input line
    """
    semaphore = asyncio.Semaphore(max_concurrency)

    async def synth_one(line):
        async with semaphore:
            # Membership test replaces the chained '==' / 'or' comparison.
            voice = host_voice if line['speaker'] in ('主持人', 'Host') else guest_voice
            return await process_line(line, voice, provider)

    return await asyncio.gather(*(synth_one(line) for line in lines))
async def combine_audio(task_status: Dict[str, Dict], task_id: str, text: str, language: str , provider:str,host_voice: str , guest_voice:str) -> Generator[str, None, None]:
try:
dialogue_regex = r'\*\*([\s\S]*?)\*\*[::]\s*([\s\S]*?)(?=\*\*|$)'
matches = re.findall(dialogue_regex, text, re.DOTALL)
Expand All @@ -93,13 +116,11 @@ async def combine_audio(task_status: Dict[str, Dict], task_id: str, text: str, l
for match in matches
]

host_voice = "zh-CN-YunxiNeural"
guest_voice = "zh-CN-YunzeNeural"

print("Starting audio generation")
audio_segments = await asyncio.gather(
*[process_line(line, host_voice if line['speaker'] == '主持人' else guest_voice) for line in lines]
)
# audio_segments = await asyncio.gather(
# *[process_line(line, host_voice if line['speaker'] == '主持人' else guest_voice) for line in lines]
# )
audio_segments = await process_lines_with_limit(lines,provider, host_voice, guest_voice, 10 if provider=='azure' else 5)
print("Audio generation completed")

# 合并音频
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ function App() {
}
return (
<div className="h-screen flex flex-col overflow-hidden">
<main className="flex-grow flex bg-[rgb(245,245,245)]">
<main className="flex-grow flex bg-[rgb(245,245,245)] h-full">
<Menu
handleGenerate={handleGenerate}
isGenerating={isGenerating}
Expand Down
3 changes: 3 additions & 0 deletions frontend/src/components/content.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ export default function Content({
const audioFormData = new FormData();
audioFormData.append('text', transcriptFinalResult.content);
audioFormData.append('language', formData.get('language') as string);
audioFormData.append('host_voice', formData.get('hostVoice') as string);
audioFormData.append('guest_voice', formData.get('guestVoice') as string);
audioFormData.append('provider', formData.get('provider') as string);

generateAudio(audioFormData)
}
Expand Down
Loading

0 comments on commit 1528868

Please sign in to comment.