-
Notifications
You must be signed in to change notification settings - Fork 35
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
12 changed files
with
507 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,3 +6,4 @@ app.egg-info | |
htmlcov | ||
.cache | ||
.venv | ||
.env |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,24 @@ | ||
# Base image: FastAPI served by Uvicorn/Gunicorn on Python 3.12.
FROM tiangolo/uvicorn-gunicorn-fastapi:python3.12

# Set the working directory.
WORKDIR /app

# Install FFmpeg and remove the apt package lists in the same layer so they
# do not end up in the final image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Install project dependencies before copying the sources, so this layer is
# reused from cache until requirements.txt itself changes.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the contents of the build context into /app in the container.
COPY . /app

# FIREWORKS_API_KEY is intentionally NOT set here. Without a matching ARG,
# `ENV FIREWORKS_API_KEY=${FIREWORKS_API_KEY}` expands to an empty string at
# build time, and baking a secret into an image layer is unsafe anyway.
# Provide it when the container starts instead, e.g.:
#   docker run --env-file .env -p 8000:8000 <image>

# Expose port 8000 for the application.
EXPOSE 8000

# Run the application.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||
|
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from fastapi import APIRouter | ||
|
||
from api.routes import chat | ||
|
||
# Top-level router for the API; every chat endpoint is mounted under /chat.
api_router = APIRouter()
api_router.include_router(router=chat.router, prefix="/chat")
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
import os | ||
import random | ||
|
||
from fastapi import APIRouter, Query | ||
from constants import GRADIO_CACHE_DIR, MELO_TTS_LANGUAGE_MAPPING, SUNO_LANGUAGE_MAPPING | ||
from utils import generate_podcast_audio, generate_script | ||
from prompts import LANGUAGE_MODIFIER, LENGTH_MODIFIERS, QUESTION_MODIFIER, SYSTEM_PROMPT, TONE_MODIFIER | ||
from schema import ShortDialogue | ||
from loguru import logger | ||
from pydub import AudioSegment | ||
|
||
from tempfile import NamedTemporaryFile | ||
|
||
router = APIRouter() | ||
|
||
@router.get("/")
def generate(input: str = Query(..., description="Input string")):
    """Generate a short podcast episode and report where the mp3 was written.

    The handler builds a system prompt from SYSTEM_PROMPT plus the question,
    tone, length and language modifiers, asks `generate_script` for a
    `ShortDialogue`, synthesizes one audio clip per dialogue line with
    `generate_podcast_audio`, concatenates the clips and exports them as an
    mp3 file inside GRADIO_CACHE_DIR.

    Args:
        input: Required query-string value.
            NOTE(review): it is accepted but not used yet; the topic and the
            other settings below are hard-coded placeholders. Confirm how it
            should feed into the prompt before wiring it in.

    Returns:
        A dict with a single "message" key whose text contains the path of
        the exported mp3 file.

    Raises:
        HTTPException: status 500 when the generated script has no dialogue
            lines, so there is nothing to export.
    """
    # Imported here so the handler does not rely on the module-level import
    # list being extended.
    from fastapi import HTTPException

    random_voice_number = random.randint(1, 9)

    # Placeholder settings (see the NOTE on `input` in the docstring).
    question = "introduce chatgpt"
    tone = "funny"
    language = "English"
    length = "Short (1-2 min)"

    # Each modifier is appended to the base prompt, separated by a blank line.
    modified_system_prompt = "\n\n".join(
        [
            SYSTEM_PROMPT,
            f"{QUESTION_MODIFIER} {question}",
            f"{TONE_MODIFIER} {tone}.",
            LENGTH_MODIFIERS[length],
            f"{LANGUAGE_MODIFIER} {language}.",
        ]
    )

    llm_output = generate_script(modified_system_prompt, question, ShortDialogue)
    logger.info(f"Generated dialogue: {llm_output}")

    # The TTS language does not change between lines, so look it up once.
    language_for_tts = SUNO_LANGUAGE_MAPPING[language]

    audio_segments = []
    transcript_entries = []
    total_characters = 0

    for line in llm_output.dialogue:
        logger.info(f"Generating audio for {line.speaker}: {line.text}")

        # The host is shown as "Host"; every other speaker is shown under the
        # guest name produced by the model.
        if line.speaker == "Host (Jane)":
            display_name = "Host"
        else:
            display_name = llm_output.name_of_guest
        transcript_entries.append(f"**{display_name}**: {line.text}")
        total_characters += len(line.text)

        # generate_podcast_audio returns the path of the clip it produced.
        audio_file_path = generate_podcast_audio(
            line.text, line.speaker, language_for_tts, random_voice_number
        )
        audio_segments.append(AudioSegment.from_file(audio_file_path))

    if not audio_segments:
        # sum([]) would be the int 0, which has no .export(); fail explicitly.
        raise HTTPException(
            status_code=500,
            detail="The generated script contained no dialogue lines.",
        )

    transcript = "".join(f"{entry}\n\n" for entry in transcript_entries)

    # Concatenate all audio segments.
    combined_audio = sum(audio_segments)

    # Reserve a uniquely named .mp3 path in the cache directory. The handle is
    # closed straight away (delete=False keeps the file) so it is not leaked.
    os.makedirs(GRADIO_CACHE_DIR, exist_ok=True)
    with NamedTemporaryFile(
        dir=GRADIO_CACHE_DIR,
        delete=False,
        suffix=".mp3",
    ) as temporary_file:
        output_path = temporary_file.name

    # export() hands back the opened output file; close it once it is written.
    combined_audio.export(output_path, format="mp3").close()
    logger.info(f"Generated {total_characters} characters of audio")

    # TODO: remove .mp3 files in the cache directory that are older than
    # GRADIO_CLEAR_CACHE_OLDER_THAN.

    logger.info(f"Podcast audio written to {output_path}")
    logger.info(f"Transcript:\n{transcript}")

    # Report the file path, not the repr of the temporary-file wrapper object.
    return {"message": f"Hello World, input: {output_path}"}
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
""" | ||
constants.py | ||
""" | ||
|
||
import os | ||
|
||
from pathlib import Path | ||
|
||
# Key constants
APP_TITLE = "Open NotebookLM"
CHARACTER_LIMIT = 100_000  # quoted in ERROR_MESSAGE_TOO_LONG below

# Gradio-related constants
GRADIO_CACHE_DIR = "./gradio_cached_examples/tmp/"
GRADIO_CLEAR_CACHE_OLDER_THAN = 1 * 24 * 60 * 60  # 1 day

# Error messages-related constants
ERROR_MESSAGE_NO_INPUT = "Please provide at least one PDF file or a URL."
ERROR_MESSAGE_NOT_PDF = "The provided file is not a PDF. Please upload only PDF files."
ERROR_MESSAGE_NOT_SUPPORTED_IN_MELO_TTS = (
    "The selected language is not supported without advanced audio generation. "
    "Please enable advanced audio generation or choose a supported language."
)
ERROR_MESSAGE_READING_PDF = "Error reading the PDF file"
# Interpolated here so the message never shows a raw "{CHARACTER_LIMIT}"
# placeholder. Calling .format() on the result is still harmless because no
# braces remain in the text.
ERROR_MESSAGE_TOO_LONG = (
    "The total content is too long. Please ensure the combined text from PDFs "
    f"and URL is fewer than {CHARACTER_LIMIT:,} characters."
)
|
||
# Fireworks API-related constants
# os.getenv is a function: it must be called, not subscripted. It yields None
# when the variable is not set, so importing this module never fails because
# of a missing key.
FIREWORKS_API_KEY = os.getenv("FIREWORKS_API_KEY")
FIREWORKS_BASE_URL = "https://api.fireworks.ai/inference/v1"
FIREWORKS_MAX_TOKENS = 16_384
FIREWORKS_MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct"
FIREWORKS_TEMPERATURE = 0.1
FIREWORKS_JSON_RETRY_ATTEMPTS = 3
|
||
# MeloTTS
MELO_API_NAME = "/synthesize"
MELO_TTS_SPACES_ID = "mrfakename/MeloTTS"
MELO_RETRY_ATTEMPTS = 3
MELO_RETRY_DELAY = 5  # in seconds

# Two-letter language code -> identifier passed to MeloTTS.
# NOTE(review): "ZJ" for Chinese looks like a typo for "ZH" - confirm against
# the MeloTTS Space before changing it.
MELO_TTS_LANGUAGE_MAPPING = dict(
    en="EN",
    es="ES",
    fr="FR",
    zh="ZJ",
    ja="JP",
    ko="KR",
)

# Suno related constants
# Language display name -> two-letter language code.
SUNO_LANGUAGE_MAPPING = dict(
    English="en",
    Chinese="zh",
    French="fr",
    German="de",
    Hindi="hi",
    Italian="it",
    Japanese="ja",
    Korean="ko",
    Polish="pl",
    Portuguese="pt",
    Russian="ru",
    Spanish="es",
    Turkish="tr",
)

# General audio-related constants
# Display names of the languages whose code has no MeloTTS entry, listed in
# SUNO_LANGUAGE_MAPPING order.
NOT_SUPPORTED_IN_MELO_TTS = [
    language_name
    for language_name, language_code in SUNO_LANGUAGE_MAPPING.items()
    if language_code not in MELO_TTS_LANGUAGE_MAPPING
]
|
||
# Jina Reader-related constants
JINA_READER_URL: str = "https://r.jina.ai/"
JINA_RETRY_ATTEMPTS: int = 3
JINA_RETRY_DELAY: int = 5  # in seconds
|
||
# UI-related constants
# HTML snippet for the app description: a borderless two-cell table holding
# the project icon and a short explanation of what the app does.
UI_DESCRIPTION = """
<table style="border-collapse: collapse; border: none; padding: 20px;">
<tr style="border: none;">
<td style="border: none; vertical-align: top; padding-right: 30px; padding-left: 30px;">
<img src="https://raw.githubusercontent.com/gabrielchua/daily-ai-papers/main/_includes/icon.png" alt="Open NotebookLM" width="120" style="margin-bottom: 10px;">
</td>
<td style="border: none; vertical-align: top; padding: 10px;">
<p style="margin-bottom: 15px;">Convert your PDFs into podcasts with open-source AI models (<a href="https://huggingface.co/meta-llama/Llama-3.1-405B">Llama 3.1 405B</a> via <a href="https://fireworks.ai/">Fireworks AI</a>, <a href="https://huggingface.co/myshell-ai/MeloTTS-English">MeloTTS</a>, <a href="https://huggingface.co/suno/bark">Bark</a>).</p>
<p style="margin-top: 15px;">Note: Only the text content of the PDFs will be processed. Images and tables are not included. The total content should be no more than 100,000 characters due to the context length of Llama 3.1 405B.</p>
</td>
</tr>
</table>
"""
# Dict keys are already unique, so no set() round-trip is needed. Listing the
# keys directly also keeps the choices in the mapping's declaration order
# instead of an order that changes from one interpreter run to the next.
UI_AVAILABLE_LANGUAGES = list(SUNO_LANGUAGE_MAPPING)

# Settings for each input widget, keyed by input name.
UI_INPUTS = {
    "file_upload": {
        "label": "1. 📄 Upload your PDF(s)",
        "file_types": [".pdf"],
        "file_count": "multiple",
    },
    "url": {
        "label": "2. 🔗 Paste a URL (optional)",
        "placeholder": "Enter a URL to include its content",
    },
    "question": {
        "label": "3. 🤔 Do you have a specific question or topic in mind?",
        "placeholder": "Enter a question or topic",
    },
    "tone": {
        "label": "4. 🎭 Choose the tone",
        "choices": ["Fun", "Formal"],
        "value": "Fun",
    },
    "length": {
        "label": "5. ⏱️ Choose the length",
        "choices": ["Short (1-2 min)", "Medium (3-5 min)"],
        "value": "Medium (3-5 min)",
    },
    "language": {
        "label": "6. 🌐 Choose the language",
        "choices": UI_AVAILABLE_LANGUAGES,
        "value": "English",
    },
    "advanced_audio": {
        "label": "7. 🔄 Use advanced audio generation? (Experimental)",
        "value": True,
    },
}
# Settings for each output widget, keyed by output name.
UI_OUTPUTS = {
    "audio": {"label": "🔊 Podcast", "format": "mp3"},
    "transcript": {"label": "📜 Transcript"},
}

UI_API_NAME = "generate_podcast"
UI_ALLOW_FLAGGING = "never"
UI_CONCURRENCY_LIMIT = 3

# Each example row lists seven values, apparently in the same order as the
# UI inputs: PDF paths, URL, question, tone, length, language, advanced audio.
UI_EXAMPLES = [
    [
        [str(Path("examples/1310.4546v1.pdf"))],
        "",
        "Explain this paper to me like I'm 5 years old",
        "Fun", "Short (1-2 min)", "English", True,
    ],
    [
        [],
        "https://en.wikipedia.org/wiki/Hugging_Face",
        "How did Hugging Face become so successful?",
        "Fun", "Short (1-2 min)", "English", False,
    ],
    [
        [],
        "https://simple.wikipedia.org/wiki/Taylor_Swift",
        "Why is Taylor Swift so popular?",
        "Fun", "Short (1-2 min)", "English", False,
    ],
]

UI_CACHE_EXAMPLES = True
UI_SHOW_API = True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
""" | ||
prompts.py | ||
""" | ||
|
||
# Base instructions for the script-writing model. The question, tone, length
# and language modifiers are meant to be appended after this text.
SYSTEM_PROMPT = """
You are a world-class podcast producer tasked with transforming the provided input text into an engaging and informative podcast script. The input may be unstructured or messy, sourced from PDFs or web pages. Your goal is to extract the most interesting and insightful content for a compelling podcast discussion.
# Steps to Follow:
1. **Analyze the Input:**
Carefully examine the text, identifying key topics, points, and interesting facts or anecdotes that could drive an engaging podcast conversation. Disregard irrelevant information or formatting issues.
2. **Brainstorm Ideas:**
In the `<scratchpad>`, creatively brainstorm ways to present the key points engagingly. Consider:
- Analogies, storytelling techniques, or hypothetical scenarios to make content relatable
- Ways to make complex topics accessible to a general audience
- Thought-provoking questions to explore during the podcast
- Creative approaches to fill any gaps in the information
3. **Craft the Dialogue:**
Develop a natural, conversational flow between the host (Jane) and the guest speaker (the author or an expert on the topic). Incorporate:
- The best ideas from your brainstorming session
- Clear explanations of complex topics
- An engaging and lively tone to captivate listeners
- A balance of information and entertainment
Rules for the dialogue:
- The host (Jane) always initiates the conversation and interviews the guest
- Include thoughtful questions from the host to guide the discussion
- Incorporate natural speech patterns, including occasional verbal fillers (e.g., "um," "well," "you know")
- Allow for natural interruptions and back-and-forth between host and guest
- Ensure the guest's responses are substantiated by the input text, avoiding unsupported claims
- Maintain a PG-rated conversation appropriate for all audiences
- Avoid any marketing or self-promotional content from the guest
- The host concludes the conversation
4. **Summarize Key Insights:**
Naturally weave a summary of key points into the closing part of the dialogue. This should feel like a casual conversation rather than a formal recap, reinforcing the main takeaways before signing off.
5. **Maintain Authenticity:**
Throughout the script, strive for authenticity in the conversation. Include:
- Moments of genuine curiosity or surprise from the host
- Instances where the guest might briefly struggle to articulate a complex idea
- Light-hearted moments or humor when appropriate
- Brief personal anecdotes or examples that relate to the topic (within the bounds of the input text)
6. **Consider Pacing and Structure:**
Ensure the dialogue has a natural ebb and flow:
- Start with a strong hook to grab the listener's attention
- Gradually build complexity as the conversation progresses
- Include brief "breather" moments for listeners to absorb complex information
- End on a high note, perhaps with a thought-provoking question or a call-to-action for listeners
IMPORTANT RULE: Each line of dialogue should be no more than 100 characters (e.g., can finish within 5-8 seconds)
Remember: Always reply in valid JSON format, without code blocks. Begin directly with the JSON output.
"""
|
||
# Prefix placed before the user's question when it is added to the prompt.
QUESTION_MODIFIER = "PLEASE ANSWER THE FOLLOWING QN:"

# Prefix placed before the requested tone.
TONE_MODIFIER = "TONE: The tone of the podcast should be"

# Prefix placed before the requested output language (duplicated "The the"
# removed).
LANGUAGE_MODIFIER = "OUTPUT LANGUAGE <IMPORTANT>: The podcast should be"

# Length instruction for each length choice. The wording of each entry now
# matches the duration stated in its own key.
LENGTH_MODIFIERS = {
    "Short (1-2 min)": "Keep the podcast brief, around 1-2 minutes long.",
    "Medium (3-5 min)": "Aim for a moderate length, about 3-5 minutes.",
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.