Skip to content

Commit

Permalink
Merge pull request #2 from liuhuapiaoyuan/feature/voice-provider
Browse files Browse the repository at this point in the history
Feature/voice provider
  • Loading branch information
YOYZHANG authored Oct 24, 2024
2 parents cf31a95 + d4b8e56 commit 1528868
Show file tree
Hide file tree
Showing 10 changed files with 235 additions and 31 deletions.
1 change: 1 addition & 0 deletions backend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ htmlcov
.cache
.venv
.env
tmp/cache/*
14 changes: 12 additions & 2 deletions backend/api/routes/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from fastapi.responses import StreamingResponse, JSONResponse
import json
from typing import Dict, Optional
from constants import SPEEKERS
from utils import combine_audio, generate_dialogue, generate_podcast_info, generate_podcast_summary, get_pdf_text

router = APIRouter()
Expand All @@ -25,6 +26,12 @@ async def generate_transcript(
def test():
return {"message": "Hello World"}


@router.get("/speekers")
def speeker():
    """Return the available TTS voices, grouped by provider ("fishaudio", "azure").

    NOTE(review): "speekers"/"speeker" is a misspelling of "speakers", but the
    route path and constant name are kept as-is because the frontend calls this
    exact URL — renaming would be a breaking API change.
    """
    return JSONResponse(content=SPEEKERS)


@router.post("/summarize")
async def get_summary(
textInput: str = Form(...),
Expand Down Expand Up @@ -67,12 +74,15 @@ async def get_pod_info(
async def audio(
    background_tasks: BackgroundTasks,
    text: str = Form(...),
    host_voice: str = Form(...),
    guest_voice: str = Form(...),
    language: str = Form(...),
    provider: str = Form(...)
):
    """Kick off asynchronous audio generation for a dialogue transcript.

    Schedules `combine_audio` as a FastAPI background task and immediately
    returns a task id the client can poll for completion status.
    """
    new_task_id = str(uuid.uuid4())
    task_status[new_task_id] = {"status": "processing"}

    background_tasks.add_task(
        combine_audio,
        task_status,
        new_task_id,
        text,
        language,
        provider,
        host_voice,
        guest_voice,
    )

    return JSONResponse(content={"task_id": new_task_id, "status": "processing"})

Expand Down
38 changes: 36 additions & 2 deletions backend/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,14 @@

SPEECH_KEY = os.getenv('SPEECH_KEY')
SPEECH_REGION = "japaneast"

FISHAUDIO_KEY = os.getenv('FISHAUDIO_KEY')

# Fireworks API-related constants
FIREWORKS_API_KEY = os.getenv('FIREWORKS_API_KEY')
FIREWORKS_BASE_URL = "https://api.fireworks.ai/inference/v1"
FIREWORKS_BASE_URL = os.getenv('FIREWORKS_BASE_URL',"https://api.fireworks.ai/inference/v1")
FIREWORKS_MAX_TOKENS = 16_384
FIREWORKS_MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct"
FIREWORKS_MODEL_ID = os.getenv('FIREWORKS_MODEL_ID',"accounts/fireworks/models/llama-v3p1-405b-instruct")
FIREWORKS_TEMPERATURE = 0.1
FIREWORKS_JSON_RETRY_ATTEMPTS = 3
# Suno related constants
Expand All @@ -47,3 +50,34 @@
"Turkish": "tr",
}


# Fish Audio voice models: each entry maps a Fish Audio TTS reference id to a
# human-readable (Chinese) display name for the voice picker.
# NOTE(review): several display names repeat (e.g. "雷军" appears three times
# with different ids) — presumably distinct community-uploaded models; verify
# against the Fish Audio catalog.
FISHAUDIO_SPEEKER = [
    { "id": "59cb5986671546eaa6ca8ae6f29f6d22", "name": "央视配音" },
    { "id": "738d0cc1a3e9430a9de2b544a466a7fc", "name": "雷军" },
    { "id": "54a5170264694bfc8e9ad98df7bd89c3", "name": "丁真" },
    { "id": "7f92f8afb8ec43bf81429cc1c9199cb1", "name": "AD学姐" },
    { "id": "0eb38bc974e1459facca38b359e13511", "name": "赛马娘" },
    { "id": "e80ea225770f42f79d50aa98be3cedfc", "name": "孙笑川258" },
    { "id": "e4642e5edccd4d9ab61a69e82d4f8a14", "name": "蔡徐坤" },
    { "id": "f7561ff309bd4040a59f1e600f4f4338", "name": "黑手" },
    { "id": "332941d1360c48949f1b4e0cabf912cd", "name": "丁真(锐刻五代版)" },
    { "id": "1aacaeb1b840436391b835fd5513f4c4", "name": "芙宁娜" },
    { "id": "3b55b3d84d2f453a98d8ca9bb24182d6", "name": "邓紫琪" },
    { "id": "7af4d620be1c4c6686132f21940d51c5", "name": "东雪莲" },
    { "id": "e1cfccf59a1c4492b5f51c7c62a8abd2", "name": "永雏塔菲" },
    { "id": "665e031efe27435780ebfa56cc7e0e0d", "name": "月半猫" },
    { "id": "aebaa2305aa2452fbdc8f41eec852a79", "name": "雷军" },
    { "id": "7c66db6e457c4d53b1fe428a8c547953", "name": "郭德纲" },
    { "id": "99503144194c45ed8fb998ceac181dcc", "name": "贝利亚" },
    { "id": "4462fa28f3824bff808a94a6075570e5", "name": "雷军" },
    { "id": "188c9b7c06654042be0e8a25781761e8", "name": "周杰伦" },
    { "id": "6ce7ea8ada884bf3889fa7c7fb206691", "name": "御女茉莉" }
]
# Voices exposed via GET /speekers, keyed by TTS provider name.
# NOTE(review): "SPEEKER" is a misspelling of "SPEAKER", kept because routes
# and other modules import this exact name.
SPEEKERS = {
    "fishaudio":FISHAUDIO_SPEEKER,
    "azure":[
        {"id":"zh-CN-YunxiNeural","name":"云希"},
        {"id":"zh-CN-YunzeNeural","name":"云哲"},
        {"id":"zh-CN-YunxuanNeural","name":"晓萱"},
    ]
}
33 changes: 33 additions & 0 deletions backend/fishaudio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from fish_audio_sdk import Session, TTSRequest, ReferenceAudio
from pydub import AudioSegment
import io

from constants import FISHAUDIO_KEY,FISHAUDIO_SPEEKER



import random

def get_adapter_speeker_id(speaker_name):
    """Map a dialogue role to a Fish Audio voice reference id.

    The host role ("主持人") always gets the first configured voice; every
    other role is assigned a random voice from the pool.
    """
    if speaker_name == "主持人":
        chosen = FISHAUDIO_SPEEKER[0]
    else:
        chosen = random.choice(FISHAUDIO_SPEEKER)
    return chosen["id"]

def fishaudio_tts(text, reference_id=None) -> AudioSegment:
    """Convert the given text to speech via Fish Audio and return an AudioSegment.

    :param text: the text to synthesize
    :param reference_id: optional Fish Audio voice model (reference) id
    :return: the synthesized speech, decoded from MP3 into a pydub AudioSegment

    NOTE(review): a new Session is created per call — presumably cheap, but
    confirm the SDK does not recommend reuse for connection pooling.
    """
    session = Session(FISHAUDIO_KEY)
    audio_buffer = io.BytesIO()
    # Stream the synthesized MP3 chunks into an in-memory buffer.
    for chunk in session.tts(TTSRequest(
        reference_id=reference_id,
        text=text
    )):
        audio_buffer.write(chunk)
    audio_buffer.seek(0)  # rewind so pydub reads from the start of the buffer
    return AudioSegment.from_file(audio_buffer, format="mp3")
1 change: 1 addition & 0 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ numpy==2.1.1
python-multipart==0.0.12
PyPDF2==3.0.1
azure-cognitiveservices-speech==1.41.1
fish_audio_sdk
41 changes: 31 additions & 10 deletions backend/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from typing import Any, Dict, Generator
import uuid
from openai import OpenAI
from fishaudio import fishaudio_tts
from prompts import LANGUAGE_MODIFIER, LENGTH_MODIFIERS, PODCAST_INFO_PROMPT, QUESTION_MODIFIER, SUMMARY_INFO_PROMPT, SYSTEM_PROMPT, TONE_MODIFIER
import json
from pydub import AudioSegment
Expand Down Expand Up @@ -49,10 +50,12 @@ def generate_dialogue(pdfFile, textInput, tone, duration, language) -> Generator

yield json.dumps({"type": "final", "content": full_response})

async def process_line(line, voice):
return await generate_podcast_audio(line['content'], voice)
async def process_line(line, voice, provider):
    """Synthesize one dialogue line with the selected TTS provider.

    :param line: dict with a 'content' key holding the text to speak
    :param voice: provider-specific voice id (Fish Audio reference id or Azure
        voice name)
    :param provider: 'fishaudio' selects Fish Audio; any other value falls
        back to Azure
    """
    if provider == 'fishaudio':
        # Call the fish helper directly, for symmetry with the Azure branch;
        # the generate_podcast_audio alias added an extra, misleading hop.
        return await generate_podcast_audio_by_fish(line['content'], voice)
    return await generate_podcast_audio_by_azure(line['content'], voice)

async def generate_podcast_audio(text: str, voice: str) -> str:
async def generate_podcast_audio_by_azure(text: str, voice: str) -> str:
try:
speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SPEECH_REGION)
speech_config.speech_synthesis_voice_name = voice
Expand Down Expand Up @@ -80,7 +83,27 @@ async def generate_podcast_audio(text: str, voice: str) -> str:
print(f"Error in generate_podcast_audio: {e}")
raise

async def combine_audio(task_status: Dict[str, Dict], task_id: str, text: str, language: str) -> Generator[str, None, None]:
async def generate_podcast_audio(text: str, voice: str) -> AudioSegment:
    """Synthesize speech for `text`; kept as an alias that now routes to Fish Audio.

    Return annotation fixed from `str`: this delegates to
    generate_podcast_audio_by_fish, which returns a pydub AudioSegment.
    """
    return await generate_podcast_audio_by_fish(text, voice)

async def generate_podcast_audio_by_fish(text: str, voice: str) -> AudioSegment:
    """Synthesize `text` with Fish Audio using the given voice reference id.

    Return annotation fixed from `str`: fishaudio_tts returns a pydub
    AudioSegment.

    :raises: re-raises any error from the Fish Audio SDK after logging it.
    """
    try:
        return fishaudio_tts(text=text, reference_id=voice)
    except Exception as e:
        # Report the actual function name (was misreported as
        # generate_podcast_audio) so failures are traceable to this provider.
        print(f"Error in generate_podcast_audio_by_fish: {e}")
        raise
async def process_lines_with_limit(lines, provider, host_voice, guest_voice, max_concurrency):
    """Synthesize all dialogue lines concurrently, capped at `max_concurrency`.

    Host lines (speaker '主持人' or 'Host') use `host_voice`; every other
    speaker uses `guest_voice`. Results come back in input order
    (asyncio.gather preserves ordering).

    :param lines: dicts with 'speaker' and 'content' keys
    :param provider: TTS provider name, forwarded to process_line
    :param max_concurrency: maximum number of in-flight TTS requests
    :return: list of synthesized audio segments, one per input line
    """
    semaphore = asyncio.Semaphore(max_concurrency)

    async def synth_one(line):
        async with semaphore:
            # Membership test replaces the chained '==' / 'or' comparison.
            voice = host_voice if line['speaker'] in ('主持人', 'Host') else guest_voice
            return await process_line(line, voice, provider)

    return await asyncio.gather(*(synth_one(line) for line in lines))
async def combine_audio(task_status: Dict[str, Dict], task_id: str, text: str, language: str , provider:str,host_voice: str , guest_voice:str) -> Generator[str, None, None]:
try:
dialogue_regex = r'\*\*([\s\S]*?)\*\*[::]\s*([\s\S]*?)(?=\*\*|$)'
matches = re.findall(dialogue_regex, text, re.DOTALL)
Expand All @@ -93,13 +116,11 @@ async def combine_audio(task_status: Dict[str, Dict], task_id: str, text: str, l
for match in matches
]

host_voice = "zh-CN-YunxiNeural"
guest_voice = "zh-CN-YunzeNeural"

print("Starting audio generation")
audio_segments = await asyncio.gather(
*[process_line(line, host_voice if line['speaker'] == '主持人' else guest_voice) for line in lines]
)
# audio_segments = await asyncio.gather(
# *[process_line(line, host_voice if line['speaker'] == '主持人' else guest_voice) for line in lines]
# )
audio_segments = await process_lines_with_limit(lines,provider, host_voice, guest_voice, 10 if provider=='azure' else 5)
print("Audio generation completed")

# 合并音频
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ function App() {
}
return (
<div className="h-screen flex flex-col overflow-hidden">
<main className="flex-grow flex bg-[rgb(245,245,245)]">
<main className="flex-grow flex bg-[rgb(245,245,245)] h-full">
<Menu
handleGenerate={handleGenerate}
isGenerating={isGenerating}
Expand Down
3 changes: 3 additions & 0 deletions frontend/src/components/content.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ export default function Content({
const audioFormData = new FormData();
audioFormData.append('text', transcriptFinalResult.content);
audioFormData.append('language', formData.get('language') as string);
audioFormData.append('host_voice', formData.get('hostVoice') as string);
audioFormData.append('guest_voice', formData.get('guestVoice') as string);
audioFormData.append('provider', formData.get('provider') as string);

generateAudio(audioFormData)
}
Expand Down
Loading

0 comments on commit 1528868

Please sign in to comment.