diff --git a/backend/.gitignore b/backend/.gitignore index 993dacd..e2c2b07 100644 --- a/backend/.gitignore +++ b/backend/.gitignore @@ -7,3 +7,4 @@ htmlcov .cache .venv .env +tmp/cache/* \ No newline at end of file diff --git a/backend/api/routes/chat.py b/backend/api/routes/chat.py index a5209ad..2a2dab5 100644 --- a/backend/api/routes/chat.py +++ b/backend/api/routes/chat.py @@ -3,6 +3,7 @@ from fastapi.responses import StreamingResponse, JSONResponse import json from typing import Dict, Optional +from constants import SPEEKERS from utils import combine_audio, generate_dialogue, generate_podcast_info, generate_podcast_summary, get_pdf_text router = APIRouter() @@ -25,6 +26,12 @@ async def generate_transcript( def test(): return {"message": "Hello World"} + +@router.get("/speekers") +def speeker(): + return JSONResponse(content=SPEEKERS) + + @router.post("/summarize") async def get_summary( textInput: str = Form(...), @@ -67,12 +74,15 @@ async def get_pod_info( async def audio( background_tasks: BackgroundTasks, text: str = Form(...), - language: str = Form(...) + host_voice: str = Form(...), + guest_voice: str = Form(...), + language: str = Form(...) , + provider: str = Form(...) ): task_id = str(uuid.uuid4()) task_status[task_id] = {"status": "processing"} - background_tasks.add_task(combine_audio, task_status, task_id, text, language) + background_tasks.add_task(combine_audio, task_status, task_id, text, language,provider , host_voice,guest_voice) return JSONResponse(content={"task_id": task_id, "status": "processing"}) diff --git a/backend/constants.py b/backend/constants.py index 93ae7f3..da87af2 100644 --- a/backend/constants.py +++ b/backend/constants.py @@ -23,11 +23,14 @@ SPEECH_KEY = os.getenv('SPEECH_KEY') SPEECH_REGION = "japaneast" + +FISHAUDIO_KEY = os.getenv('FISHAUDIO_KEY') + # Fireworks API-related constants FIREWORKS_API_KEY = os.getenv('FIREWORKS_API_KEY') -FIREWORKS_BASE_URL = "https://api.fireworks.ai/inference/v1" +FIREWORKS_BASE_URL = os.getenv('FIREWORKS_BASE_URL',"https://api.fireworks.ai/inference/v1") FIREWORKS_MAX_TOKENS = 16_384 -FIREWORKS_MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct" +FIREWORKS_MODEL_ID = os.getenv('FIREWORKS_MODEL_ID',"accounts/fireworks/models/llama-v3p1-405b-instruct") FIREWORKS_TEMPERATURE = 0.1 FIREWORKS_JSON_RETRY_ATTEMPTS = 3 # Suno related constants @@ -47,3 +50,34 @@ "Turkish": "tr", } + +FISHAUDIO_SPEEKER = [ + { "id": "59cb5986671546eaa6ca8ae6f29f6d22", "name": "央视配音" }, + { "id": "738d0cc1a3e9430a9de2b544a466a7fc", "name": "雷军" }, + { "id": "54a5170264694bfc8e9ad98df7bd89c3", "name": "丁真" }, + { "id": "7f92f8afb8ec43bf81429cc1c9199cb1", "name": "AD学姐" }, + { "id": "0eb38bc974e1459facca38b359e13511", "name": "赛马娘" }, + { "id": "e80ea225770f42f79d50aa98be3cedfc", "name": "孙笑川258" }, + { "id": "e4642e5edccd4d9ab61a69e82d4f8a14", "name": "蔡徐坤" }, + { "id": "f7561ff309bd4040a59f1e600f4f4338", "name": "黑手" }, + { "id": "332941d1360c48949f1b4e0cabf912cd", "name": "丁真(锐刻五代版)" }, + { "id": "1aacaeb1b840436391b835fd5513f4c4", "name": "芙宁娜" }, + { "id": "3b55b3d84d2f453a98d8ca9bb24182d6", "name": "邓紫琪" }, + { "id": "7af4d620be1c4c6686132f21940d51c5", "name": "东雪莲" }, + { "id": "e1cfccf59a1c4492b5f51c7c62a8abd2", "name": "永雏塔菲" }, + { "id": "665e031efe27435780ebfa56cc7e0e0d", "name": "月半猫" }, + { "id": "aebaa2305aa2452fbdc8f41eec852a79", "name": "雷军" }, + { "id": "7c66db6e457c4d53b1fe428a8c547953", "name": "郭德纲" }, + { "id": "99503144194c45ed8fb998ceac181dcc", "name": "贝利亚" }, + { "id": "4462fa28f3824bff808a94a6075570e5", "name": "雷军" }, + { "id": "188c9b7c06654042be0e8a25781761e8", "name": "周杰伦" }, + { "id": "6ce7ea8ada884bf3889fa7c7fb206691", "name": "御女茉莉" } +] +SPEEKERS = { + "fishaudio":FISHAUDIO_SPEEKER, + "azure":[ + {"id":"zh-CN-YunxiNeural","name":"云希"}, + {"id":"zh-CN-YunzeNeural","name":"云哲"}, + {"id":"zh-CN-YunxuanNeural","name":"晓萱"}, + ] +} diff --git a/backend/fishaudio.py b/backend/fishaudio.py new file mode 100644 index 0000000..a526057 --- /dev/null +++ b/backend/fishaudio.py @@ -0,0 +1,33 @@ +from fish_audio_sdk import Session, TTSRequest, ReferenceAudio +from pydub import AudioSegment +import io + +from constants import FISHAUDIO_KEY,FISHAUDIO_SPEEKER + + + +import random + +def get_adapter_speeker_id(speaker_name): + speeker = FISHAUDIO_SPEEKER[0] + if speaker_name != "主持人": + speeker = random.choice(FISHAUDIO_SPEEKER) + return speeker["id"] + +def fishaudio_tts(text, reference_id=None) -> AudioSegment: + """ + 将给定的文本转换为语音并返回AudioSegment对象。 + + :param text: 要转换的文本 + :param reference_id: 可选参数,使用的模型 ID + :return: 返回生成的语音的AudioSegment对象 + """ + session = Session(FISHAUDIO_KEY) + audio_buffer = io.BytesIO() + for chunk in session.tts(TTSRequest( + reference_id=reference_id, + text=text + )): + audio_buffer.write(chunk) + audio_buffer.seek(0) # 重置缓冲区的位置 + return AudioSegment.from_file(audio_buffer, format="mp3") diff --git a/backend/requirements.txt b/backend/requirements.txt index d6c5aa9..b3fb11b 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -18,3 +18,4 @@ numpy==2.1.1 python-multipart==0.0.12 PyPDF2==3.0.1 azure-cognitiveservices-speech==1.41.1 +fish_audio_sdk \ No newline at end of file diff --git a/backend/utils.py b/backend/utils.py index 3f08a8e..f50fafc 100644 --- a/backend/utils.py +++ b/backend/utils.py @@ -9,6 +9,7 @@ from typing import Any, Dict, Generator import uuid from openai import OpenAI +from fishaudio import fishaudio_tts from prompts import LANGUAGE_MODIFIER, LENGTH_MODIFIERS, PODCAST_INFO_PROMPT, QUESTION_MODIFIER, SUMMARY_INFO_PROMPT, SYSTEM_PROMPT, TONE_MODIFIER import json from pydub import AudioSegment @@ -49,10 +50,12 @@ def generate_dialogue(pdfFile, textInput, tone, duration, language) -> Generator yield json.dumps({"type": "final", "content": full_response}) -async def process_line(line, voice): - return await generate_podcast_audio(line['content'], voice) +async def process_line(line, voice,provider): + if provider == 'fishaudio': + return await generate_podcast_audio(line['content'], voice) + return await generate_podcast_audio_by_azure(line['content'], voice) -async def generate_podcast_audio(text: str, voice: str) -> str: +async def generate_podcast_audio_by_azure(text: str, voice: str) -> str: try: speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SPEECH_REGION) speech_config.speech_synthesis_voice_name = voice @@ -80,7 +83,27 @@ async def generate_podcast_audio(text: str, voice: str) -> str: print(f"Error in generate_podcast_audio: {e}") raise -async def combine_audio(task_status: Dict[str, Dict], task_id: str, text: str, language: str) -> Generator[str, None, None]: +async def generate_podcast_audio(text: str, voice: str) -> str: + return await generate_podcast_audio_by_fish(text,voice) + +async def generate_podcast_audio_by_fish(text: str, voice: str) -> str: + try: + return fishaudio_tts(text=text,reference_id=voice) + except Exception as e: + print(f"Error in generate_podcast_audio: {e}") + raise +async def process_lines_with_limit(lines, provider , host_voice, guest_voice, max_concurrency): + semaphore = asyncio.Semaphore(max_concurrency) + + async def limited_process_line(line): + async with semaphore: + voice = host_voice if (line['speaker'] == '主持人' or line['speaker'] == 'Host') else guest_voice + return await process_line(line, voice , provider) + + tasks = [limited_process_line(line) for line in lines] + results = await asyncio.gather(*tasks) + return results +async def combine_audio(task_status: Dict[str, Dict], task_id: str, text: str, language: str , provider:str,host_voice: str , guest_voice:str) -> Generator[str, None, None]: try: dialogue_regex = r'\*\*([\s\S]*?)\*\*[::]\s*([\s\S]*?)(?=\*\*|$)' matches = re.findall(dialogue_regex, text, re.DOTALL) @@ -93,13 +116,11 @@ async def combine_audio(task_status: Dict[str, Dict], task_id: str, text: str, l for match in matches ] - host_voice = "zh-CN-YunxiNeural" - guest_voice = "zh-CN-YunzeNeural" - print("Starting audio generation") - audio_segments = await asyncio.gather( - *[process_line(line, host_voice if line['speaker'] == '主持人' else guest_voice) for line in lines] - ) + # audio_segments = await asyncio.gather( + # *[process_line(line, host_voice if line['speaker'] == '主持人' else guest_voice) for line in lines] + # ) + audio_segments = await process_lines_with_limit(lines,provider, host_voice, guest_voice, 10 if provider=='azure' else 5) print("Audio generation completed") # 合并音频 diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index e5fac99..ad9483e 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -36,7 +36,7 @@ function App() { } return (
-
+
(null); + const speekerReq = useSpeeker() + + const handleFileChange = (event: React.ChangeEvent) => { const file = event.target.files?.[0]; @@ -53,21 +61,24 @@ export default function Menu({ handleGenerate, isGenerating }: { handleGenerate: formData.append('tone', tone); formData.append('duration', duration); formData.append('language', language); + formData.append('hostVoice', hostVoice); + formData.append('guestVoice', guestVoice); + formData.append('provider', provider); handleGenerate(formData); }; return (
-
+

上传 PDF *

-