From f0c0716a4f9bd0d07f34c097ccf413413c381758 Mon Sep 17 00:00:00 2001 From: liuhuapiaoyuan <278780765@qq.com> Date: Wed, 23 Oct 2024 15:42:30 +0800 Subject: [PATCH 1/4] fix: fix menu scrollbar --- frontend/src/App.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index e5fac99..ad9483e 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -36,7 +36,7 @@ function App() { } return (
-
+
Date: Wed, 23 Oct 2024 15:43:15 +0800 Subject: [PATCH 2/4] feat: add voice provider (adapter fishaudio) --- backend/.gitignore | 1 + backend/api/routes/chat.py | 14 +++- backend/constants.py | 38 ++++++++++- backend/fishaudio.py | 34 ++++++++++ backend/requirements.txt | 1 + backend/utils.py | 41 +++++++++--- frontend/src/components/content.tsx | 2 + frontend/src/components/menu.tsx | 100 +++++++++++++++++++++++----- frontend/src/hooks/useSpeeker.ts | 32 +++++++++ 9 files changed, 233 insertions(+), 30 deletions(-) create mode 100644 backend/fishaudio.py create mode 100644 frontend/src/hooks/useSpeeker.ts diff --git a/backend/.gitignore b/backend/.gitignore index 993dacd..e2c2b07 100644 --- a/backend/.gitignore +++ b/backend/.gitignore @@ -7,3 +7,4 @@ htmlcov .cache .venv .env +tmp/cache/* \ No newline at end of file diff --git a/backend/api/routes/chat.py b/backend/api/routes/chat.py index a5209ad..2a2dab5 100644 --- a/backend/api/routes/chat.py +++ b/backend/api/routes/chat.py @@ -3,6 +3,7 @@ from fastapi.responses import StreamingResponse, JSONResponse import json from typing import Dict, Optional +from constants import SPEEKERS from utils import combine_audio, generate_dialogue, generate_podcast_info, generate_podcast_summary, get_pdf_text router = APIRouter() @@ -25,6 +26,12 @@ async def generate_transcript( def test(): return {"message": "Hello World"} + +@router.get("/speekers") +def speeker(): + return JSONResponse(content=SPEEKERS) + + @router.post("/summarize") async def get_summary( textInput: str = Form(...), @@ -67,12 +74,15 @@ async def get_pod_info( async def audio( background_tasks: BackgroundTasks, text: str = Form(...), - language: str = Form(...) + host_voice: str = Form(...), + guest_voice: str = Form(...), + language: str = Form(...) , + provider: str = Form(...) ): task_id = str(uuid.uuid4()) task_status[task_id] = {"status": "processing"} - background_tasks.add_task(combine_audio, task_status, task_id, text, language) + background_tasks.add_task(combine_audio, task_status, task_id, text, language,provider , host_voice,guest_voice) return JSONResponse(content={"task_id": task_id, "status": "processing"}) diff --git a/backend/constants.py b/backend/constants.py index 93ae7f3..da87af2 100644 --- a/backend/constants.py +++ b/backend/constants.py @@ -23,11 +23,14 @@ SPEECH_KEY = os.getenv('SPEECH_KEY') SPEECH_REGION = "japaneast" + +FISHAUDIO_KEY = os.getenv('FISHAUDIO_KEY') + # Fireworks API-related constants FIREWORKS_API_KEY = os.getenv('FIREWORKS_API_KEY') -FIREWORKS_BASE_URL = "https://api.fireworks.ai/inference/v1" +FIREWORKS_BASE_URL = os.getenv('FIREWORKS_BASE_URL',"https://api.fireworks.ai/inference/v1") FIREWORKS_MAX_TOKENS = 16_384 -FIREWORKS_MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct" +FIREWORKS_MODEL_ID = os.getenv('FIREWORKS_MODEL_ID',"accounts/fireworks/models/llama-v3p1-405b-instruct") FIREWORKS_TEMPERATURE = 0.1 FIREWORKS_JSON_RETRY_ATTEMPTS = 3 # Suno related constants @@ -47,3 +50,34 @@ "Turkish": "tr", } + +FISHAUDIO_SPEEKER = [ + { "id": "59cb5986671546eaa6ca8ae6f29f6d22", "name": "央视配音" }, + { "id": "738d0cc1a3e9430a9de2b544a466a7fc", "name": "雷军" }, + { "id": "54a5170264694bfc8e9ad98df7bd89c3", "name": "丁真" }, + { "id": "7f92f8afb8ec43bf81429cc1c9199cb1", "name": "AD学姐" }, + { "id": "0eb38bc974e1459facca38b359e13511", "name": "赛马娘" }, + { "id": "e80ea225770f42f79d50aa98be3cedfc", "name": "孙笑川258" }, + { "id": "e4642e5edccd4d9ab61a69e82d4f8a14", "name": "蔡徐坤" }, + { "id": "f7561ff309bd4040a59f1e600f4f4338", "name": "黑手" }, + { "id": "332941d1360c48949f1b4e0cabf912cd", "name": "丁真(锐刻五代版)" }, + { "id": "1aacaeb1b840436391b835fd5513f4c4", "name": "芙宁娜" }, + { "id": "3b55b3d84d2f453a98d8ca9bb24182d6", "name": "邓紫琪" }, + { "id": "7af4d620be1c4c6686132f21940d51c5", "name": "东雪莲" }, + { "id": "e1cfccf59a1c4492b5f51c7c62a8abd2", "name": "永雏塔菲" }, + { "id": "665e031efe27435780ebfa56cc7e0e0d", "name": "月半猫" }, + { "id": "aebaa2305aa2452fbdc8f41eec852a79", "name": "雷军" }, + { "id": "7c66db6e457c4d53b1fe428a8c547953", "name": "郭德纲" }, + { "id": "99503144194c45ed8fb998ceac181dcc", "name": "贝利亚" }, + { "id": "4462fa28f3824bff808a94a6075570e5", "name": "雷军" }, + { "id": "188c9b7c06654042be0e8a25781761e8", "name": "周杰伦" }, + { "id": "6ce7ea8ada884bf3889fa7c7fb206691", "name": "御女茉莉" } +] +SPEEKERS = { + "fishaudio":FISHAUDIO_SPEEKER, + "azure":[ + {"id":"zh-CN-YunxiNeural","name":"云希"}, + {"id":"zh-CN-YunzeNeural","name":"云哲"}, + {"id":"zh-CN-YunxuanNeural","name":"晓萱"}, + ] +} diff --git a/backend/fishaudio.py b/backend/fishaudio.py new file mode 100644 index 0000000..bf20123 --- /dev/null +++ b/backend/fishaudio.py @@ -0,0 +1,34 @@ +from fish_audio_sdk import Session, TTSRequest, ReferenceAudio +from pydub import AudioSegment +import io + +from constants import FISHAUDIO_KEY,FISHAUDIO_SPEEKER + + + +import random + +def get_adapter_speeker_id(speaker_name): + speeker = FISHAUDIO_SPEEKER[0] + if speaker_name != "主持人": + speeker = random.choice(FISHAUDIO_SPEEKER) + return speeker["id"] + +def fishaudio_tts(text, reference_id=None) -> AudioSegment: + """ + 将给定的文本转换为语音并返回AudioSegment对象。 + + :param text: 要转换的文本 + :param reference_id: 可选参数,使用的模型 ID + :return: 返回生成的语音的AudioSegment对象 + """ + print("reference_id:", reference_id) + session = Session(FISHAUDIO_KEY) + audio_buffer = io.BytesIO() + for chunk in session.tts(TTSRequest( + reference_id=reference_id, + text=text + )): + audio_buffer.write(chunk) + audio_buffer.seek(0) # 重置缓冲区的位置 + return AudioSegment.from_file(audio_buffer, format="mp3") diff --git a/backend/requirements.txt b/backend/requirements.txt index d6c5aa9..b3fb11b 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -18,3 +18,4 @@ numpy==2.1.1 python-multipart==0.0.12 PyPDF2==3.0.1 azure-cognitiveservices-speech==1.41.1 +fish_audio_sdk \ No newline at end of file diff --git a/backend/utils.py b/backend/utils.py index 3f08a8e..5d0a718 100644 --- a/backend/utils.py +++ b/backend/utils.py @@ -9,6 +9,7 @@ from typing import Any, Dict, Generator import uuid from openai import OpenAI +from fishaudio import fishaudio_tts from prompts import LANGUAGE_MODIFIER, LENGTH_MODIFIERS, PODCAST_INFO_PROMPT, QUESTION_MODIFIER, SUMMARY_INFO_PROMPT, SYSTEM_PROMPT, TONE_MODIFIER import json from pydub import AudioSegment @@ -49,10 +50,12 @@ def generate_dialogue(pdfFile, textInput, tone, duration, language) -> Generator yield json.dumps({"type": "final", "content": full_response}) -async def process_line(line, voice): - return await generate_podcast_audio(line['content'], voice) +async def process_line(line, voice,provider): + if provider == 'fishaudio': + return await generate_podcast_audio(line['content'], voice) + return await generate_podcast_audio_by_azure(line['content'], voice) -async def generate_podcast_audio(text: str, voice: str) -> str: +async def generate_podcast_audio_by_azure(text: str, voice: str) -> str: try: speech_config = speechsdk.SpeechConfig(subscription=SPEECH_KEY, region=SPEECH_REGION) speech_config.speech_synthesis_voice_name = voice @@ -80,7 +83,27 @@ async def generate_podcast_audio(text: str, voice: str) -> str: print(f"Error in generate_podcast_audio: {e}") raise -async def combine_audio(task_status: Dict[str, Dict], task_id: str, text: str, language: str) -> Generator[str, None, None]: +async def generate_podcast_audio(text: str, voice: str) -> str: + return await generate_podcast_audio_by_fish(text,voice) + +async def generate_podcast_audio_by_fish(text: str, voice: str) -> str: + try: + return fishaudio_tts(text=text,reference_id=voice) + except Exception as e: + print(f"Error in generate_podcast_audio: {e}") + raise +async def process_lines_with_limit(lines, provider , host_voice, guest_voice, max_concurrency): + semaphore = asyncio.Semaphore(max_concurrency) + + async def limited_process_line(line): + async with semaphore: + voice = host_voice if (line['speaker'] == '主持人' or line['speaker'] == 'Host') else guest_voice + return await process_line(line, voice , provider) + + tasks = [limited_process_line(line) for line in lines] + results = await asyncio.gather(*tasks) + return results +async def combine_audio(task_status: Dict[str, Dict], task_id: str, text: str, language: str , provider:str,host_voice: str , guest_voice:str) -> Generator[str, None, None]: try: dialogue_regex = r'\*\*([\s\S]*?)\*\*[::]\s*([\s\S]*?)(?=\*\*|$)' matches = re.findall(dialogue_regex, text, re.DOTALL) @@ -93,13 +116,11 @@ async def combine_audio(task_status: Dict[str, Dict], task_id: str, text: str, l for match in matches ] - host_voice = "zh-CN-YunxiNeural" - guest_voice = "zh-CN-YunzeNeural" - print("Starting audio generation") - audio_segments = await asyncio.gather( - *[process_line(line, host_voice if line['speaker'] == '主持人' else guest_voice) for line in lines] - ) + # audio_segments = await asyncio.gather( + # *[process_line(line, host_voice if line['speaker'] == '主持人' else guest_voice) for line in lines] + # ) + audio_segments = await process_lines_with_limit(lines, host_voice, guest_voice, 10 if provider=='azure' else 5) print("Audio generation completed") # 合并音频 diff --git a/frontend/src/components/content.tsx b/frontend/src/components/content.tsx index 931a943..cb5b9d3 100644 --- a/frontend/src/components/content.tsx +++ b/frontend/src/components/content.tsx @@ -66,6 +66,8 @@ export default function Content({ const audioFormData = new FormData(); audioFormData.append('text', transcriptFinalResult.content); audioFormData.append('language', formData.get('language') as string); + audioFormData.append('host_voice', formData.get('hostVoice') as string); + audioFormData.append('guest_voice', formData.get('guestVoice') as string); generateAudio(audioFormData) } diff --git a/frontend/src/components/menu.tsx b/frontend/src/components/menu.tsx index a673df2..fe80a52 100644 --- a/frontend/src/components/menu.tsx +++ b/frontend/src/components/menu.tsx @@ -1,8 +1,10 @@ -import React, { useState } from 'react'; +import React, { useState } from 'react'; import { Button } from "@/components/ui/button"; import { Textarea } from "@/components/ui/textarea"; import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select"; import { Upload, FileText, Mic, Clock, Globe, Sparkles } from "lucide-react"; +import { Card, CardContent } from './ui/card'; +import { useSpeeker } from '@/hooks/useSpeeker'; const MAX_FILE_SIZE = 5 * 1024 * 1024; // 5MB in bytes const DEMO_PDF_URL = '/demo.pdf'; // 替换为你的演示 PDF 文件的实际路径 @@ -13,7 +15,13 @@ export default function Menu({ handleGenerate, isGenerating }: { handleGenerate: const [tone, setTone] = useState('neutral'); const [duration, setDuration] = useState('short'); const [language, setLanguage] = useState('Chinese'); + const [hostVoice, setHostVoice] = useState('zh-CN-YunxiNeural'); + const [guestVoice, setGuestVoice] = useState('zh-CN-YunzeNeural'); + const [provider, setProvider] = useState('azure'); const [fileError, setFileError] = useState(null); + const speekerReq = useSpeeker() + + const handleFileChange = (event: React.ChangeEvent) => { const file = event.target.files?.[0]; @@ -53,21 +61,23 @@ export default function Menu({ handleGenerate, isGenerating }: { handleGenerate: formData.append('tone', tone); formData.append('duration', duration); formData.append('language', language); + formData.append('hostVoice', hostVoice); + formData.append('guestVoice', guestVoice); handleGenerate(formData); }; return (
-
+

上传 PDF *

-