Skip to content

Commit

Permalink
feat: add llm
Browse files Browse the repository at this point in the history
  • Loading branch information
YOYZHANG committed Oct 12, 2024
1 parent 8ccc623 commit 7e45062
Show file tree
Hide file tree
Showing 12 changed files with 507 additions and 7 deletions.
1 change: 1 addition & 0 deletions backend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ app.egg-info
htmlcov
.cache
.venv
.env
23 changes: 22 additions & 1 deletion backend/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,24 @@
FROM tiangolo/uvicorn-gunicorn-fastapi:python3.12

# Set the working directory (declared once; later relative paths resolve
# against it).
WORKDIR /app

# Install FFmpeg, which the audio code relies on for MP3 handling.
# Done in a single layer, before the source is copied, so the layer stays
# cached across code changes; the apt lists are removed to keep the image small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Install project dependencies first so this layer is only rebuilt when
# requirements.txt changes.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the project into /app in the container.
COPY . /app

# FIREWORKS_API_KEY is deliberately not set here. Without a matching ARG,
# `ENV FIREWORKS_API_KEY=${FIREWORKS_API_KEY}` expands to an empty string at
# build time, and baking a real key into the image would leak it through the
# image history. Supply it at runtime instead, e.g.
#   docker run -e FIREWORKS_API_KEY=... <image>
#   docker run --env-file .env <image>

# Expose port 8000 for the application.
EXPOSE 8000

# Run the application.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]


File renamed without changes.
7 changes: 7 additions & 0 deletions backend/api/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from fastapi import APIRouter

from api.routes import chat

# Top-level API router; feature routers are attached to it under their own prefix.
api_router = APIRouter()
# Chat routes are served under "/chat", relative to wherever api_router is mounted.
api_router.include_router(chat.router, prefix="/chat")

89 changes: 89 additions & 0 deletions backend/api/routes/chat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import os
import random

from fastapi import APIRouter, Query
from constants import GRADIO_CACHE_DIR, MELO_TTS_LANGUAGE_MAPPING, SUNO_LANGUAGE_MAPPING
from utils import generate_podcast_audio, generate_script
from prompts import LANGUAGE_MODIFIER, LENGTH_MODIFIERS, QUESTION_MODIFIER, SYSTEM_PROMPT, TONE_MODIFIER
from schema import ShortDialogue
from loguru import logger
from pydub import AudioSegment

from tempfile import NamedTemporaryFile

router = APIRouter()

@router.get("/")
def generate(input: str = Query(..., description="Input string")):
    """Generate a short podcast about ``input`` and save it as an MP3.

    The system prompt is extended with the topic, tone, length and language
    modifiers, a dialogue script is requested via ``generate_script``, every
    dialogue line is turned into audio via ``generate_podcast_audio``, and the
    clips are concatenated into one MP3 written under ``GRADIO_CACHE_DIR``.

    Args:
        input: Topic or text the podcast should cover (required query
            parameter). The name shadows the builtin but is kept because it
            is the public query-parameter name.

    Returns:
        A dict with ``message`` (text containing the path of the generated
        MP3), ``audio_file`` (that path) and ``transcript`` (the Markdown
        transcript of the dialogue).

    Raises:
        RuntimeError: If the generated script contains no dialogue lines.
    """
    random_voice_number = random.randint(1, 9)

    # Only the topic comes from the caller for now; the other settings are fixed.
    question = input
    tone = "funny"
    language = "English"
    length = "Short (1-2 min)"

    modified_system_prompt = SYSTEM_PROMPT
    if question:
        modified_system_prompt += f"\n\n{QUESTION_MODIFIER} {question}"
    if tone:
        modified_system_prompt += f"\n\n{TONE_MODIFIER} {tone}."
    if length:
        modified_system_prompt += f"\n\n{LENGTH_MODIFIERS[length]}"
    if language:
        modified_system_prompt += f"\n\n{LANGUAGE_MODIFIER} {language}."

    llm_output = generate_script(modified_system_prompt, input, ShortDialogue)
    logger.info(f"Generated dialogue: {llm_output}")

    # Without at least one line there is nothing to synthesise; fail with a
    # clear message instead of crashing later on `0.export(...)`.
    if not llm_output.dialogue:
        raise RuntimeError("The generated script contains no dialogue lines.")

    # The TTS language is the same for every line, so resolve it once.
    language_for_tts = SUNO_LANGUAGE_MAPPING[language]

    audio_segments = []
    transcript_parts = []
    total_characters = 0

    for line in llm_output.dialogue:
        logger.info(f"Generating audio for {line.speaker}: {line.text}")

        if line.speaker == "Host (Jane)":
            speaker_label = "Host"
        else:
            speaker_label = llm_output.name_of_guest
        transcript_parts.append(f"**{speaker_label}**: {line.text}")
        total_characters += len(line.text)

        # generate_podcast_audio is expected to return the path of the clip.
        audio_file_path = generate_podcast_audio(
            line.text, line.speaker, language_for_tts, random_voice_number
        )
        audio_segments.append(AudioSegment.from_file(audio_file_path))

    transcript = "".join(f"{part}\n\n" for part in transcript_parts)

    # Concatenate all audio segments (AudioSegment supports sum() from 0).
    combined_audio = sum(audio_segments)

    # Reserve a unique .mp3 path in the cache directory. The handle is closed
    # right away so the exporter can reopen the path on every platform.
    os.makedirs(GRADIO_CACHE_DIR, exist_ok=True)
    with NamedTemporaryFile(
        dir=GRADIO_CACHE_DIR,
        delete=False,
        suffix=".mp3",
    ) as temporary_file:
        output_path = temporary_file.name

    # export() returns the open output handle; close it to avoid leaking it.
    combined_audio.export(output_path, format="mp3").close()
    logger.info(f"Generated {total_characters} characters of audio")
    logger.info(f"Podcast audio written to {output_path}")

    # TODO: purge .mp3 files in GRADIO_CACHE_DIR that are older than
    # GRADIO_CLEAR_CACHE_OLDER_THAN (files are created with delete=False).

    return {
        "message": f"Hello World, input: {output_path}",
        "audio_file": output_path,
        "transcript": transcript,
    }

166 changes: 166 additions & 0 deletions backend/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
"""
constants.py
"""

import os

from pathlib import Path

# Application-wide basics
APP_TITLE = "Open NotebookLM"
CHARACTER_LIMIT = 100_000

# Cache location for generated audio and the age (in seconds) after which
# cached files count as stale.
GRADIO_CACHE_DIR = "./gradio_cached_examples/tmp/"
GRADIO_CLEAR_CACHE_OLDER_THAN = 24 * 60 * 60  # 1 day

# User-facing error messages
ERROR_MESSAGE_NO_INPUT = "Please provide at least one PDF file or a URL."
ERROR_MESSAGE_NOT_PDF = (
    "The provided file is not a PDF. "
    "Please upload only PDF files."
)
ERROR_MESSAGE_NOT_SUPPORTED_IN_MELO_TTS = (
    "The selected language is not supported without advanced audio generation. "
    "Please enable advanced audio generation or choose a supported language."
)
ERROR_MESSAGE_READING_PDF = "Error reading the PDF file"
# NOTE(review): this is a plain string, not an f-string, so the
# {CHARACTER_LIMIT} placeholder is only filled in if the caller formats it
# (e.g. .format(CHARACTER_LIMIT=CHARACTER_LIMIT)) — confirm against callers.
ERROR_MESSAGE_TOO_LONG = (
    "The total content is too long. "
    "Please ensure the combined text from PDFs and URL is fewer than "
    "{CHARACTER_LIMIT} characters."
)

# Fireworks API-related constants
# os.getenv is a function and must be called, not subscripted (subscripting
# it raises TypeError at import time). The value is None when the variable
# is not set in the environment.
FIREWORKS_API_KEY = os.getenv("FIREWORKS_API_KEY")
FIREWORKS_BASE_URL = "https://api.fireworks.ai/inference/v1"
FIREWORKS_MAX_TOKENS = 16_384
FIREWORKS_MODEL_ID = "accounts/fireworks/models/llama-v3p1-405b-instruct"
FIREWORKS_TEMPERATURE = 0.1
FIREWORKS_JSON_RETRY_ATTEMPTS = 3

# MeloTTS settings
MELO_API_NAME = "/synthesize"
MELO_TTS_SPACES_ID = "mrfakename/MeloTTS"
MELO_RETRY_ATTEMPTS = 3
MELO_RETRY_DELAY = 5  # in seconds

# Language code -> language identifier used for MeloTTS.
# NOTE(review): "ZJ" looks like a typo for "ZH" — confirm against the
# language list MeloTTS accepts before relying on Chinese output.
MELO_TTS_LANGUAGE_MAPPING = {
    "en": "EN",
    "es": "ES",
    "fr": "FR",
    "zh": "ZJ",
    "ja": "JP",
    "ko": "KR",
}

# Suno: language display name -> language code.
SUNO_LANGUAGE_MAPPING = {
    "English": "en",
    "Chinese": "zh",
    "French": "fr",
    "German": "de",
    "Hindi": "hi",
    "Italian": "it",
    "Japanese": "ja",
    "Korean": "ko",
    "Polish": "pl",
    "Portuguese": "pt",
    "Russian": "ru",
    "Spanish": "es",
    "Turkish": "tr",
}

# Display names of the languages whose code has no MeloTTS entry, listed in
# the order they appear in SUNO_LANGUAGE_MAPPING.
NOT_SUPPORTED_IN_MELO_TTS = [
    language_name
    for language_name, language_code in SUNO_LANGUAGE_MAPPING.items()
    if language_code not in MELO_TTS_LANGUAGE_MAPPING
]

# Jina Reader settings
JINA_READER_URL = "https://r.jina.ai/"
JINA_RETRY_ATTEMPTS = 3
JINA_RETRY_DELAY = 5  # in seconds

# UI-related constants
# HTML blurb describing the app; the string is kept verbatim (including its
# leading and trailing newline).
UI_DESCRIPTION = """
<table style="border-collapse: collapse; border: none; padding: 20px;">
<tr style="border: none;">
<td style="border: none; vertical-align: top; padding-right: 30px; padding-left: 30px;">
<img src="https://raw.githubusercontent.com/gabrielchua/daily-ai-papers/main/_includes/icon.png" alt="Open NotebookLM" width="120" style="margin-bottom: 10px;">
</td>
<td style="border: none; vertical-align: top; padding: 10px;">
<p style="margin-bottom: 15px;">Convert your PDFs into podcasts with open-source AI models (<a href="https://huggingface.co/meta-llama/Llama-3.1-405B">Llama 3.1 405B</a> via <a href="https://fireworks.ai/">Fireworks AI</a>, <a href="https://huggingface.co/myshell-ai/MeloTTS-English">MeloTTS</a>, <a href="https://huggingface.co/suno/bark">Bark</a>).</p>
<p style="margin-top: 15px;">Note: Only the text content of the PDFs will be processed. Images and tables are not included. The total content should be no more than 100,000 characters due to the context length of Llama 3.1 405B.</p>
</td>
</tr>
</table>
"""
# Language names offered in the UI, taken from SUNO_LANGUAGE_MAPPING.
# Going through set() means the resulting list order is not guaranteed.
UI_AVAILABLE_LANGUAGES = list(set(SUNO_LANGUAGE_MAPPING.keys()))
# Per-input settings (label, choices, default value, ...), keyed by input name.
UI_INPUTS = {
"file_upload": {
"label": "1. 📄 Upload your PDF(s)",
"file_types": [".pdf"],
"file_count": "multiple",
},
"url": {
"label": "2. 🔗 Paste a URL (optional)",
"placeholder": "Enter a URL to include its content",
},
"question": {
"label": "3. 🤔 Do you have a specific question or topic in mind?",
"placeholder": "Enter a question or topic",
},
"tone": {
"label": "4. 🎭 Choose the tone",
"choices": ["Fun", "Formal"],
"value": "Fun",
},
"length": {
"label": "5. ⏱️ Choose the length",
"choices": ["Short (1-2 min)", "Medium (3-5 min)"],
"value": "Medium (3-5 min)",
},
"language": {
"label": "6. 🌐 Choose the language",
"choices": UI_AVAILABLE_LANGUAGES,
"value": "English",
},
"advanced_audio": {
"label": "7. 🔄 Use advanced audio generation? (Experimental)",
"value": True,
},
}
# Per-output settings, keyed by output name.
UI_OUTPUTS = {
"audio": {"label": "🔊 Podcast", "format": "mp3"},
"transcript": {
"label": "📜 Transcript",
},
}
UI_API_NAME = "generate_podcast"
UI_ALLOW_FLAGGING = "never"
UI_CONCURRENCY_LIMIT = 3
# Example rows. Each row appears to list its values in the same order as the
# UI_INPUTS keys: files, url, question, tone, length, language, advanced_audio
# — TODO confirm against the code that consumes UI_EXAMPLES.
UI_EXAMPLES = [
[
[str(Path("examples/1310.4546v1.pdf"))],
"",
"Explain this paper to me like I'm 5 years old",
"Fun",
"Short (1-2 min)",
"English",
True,
],
[
[],
"https://en.wikipedia.org/wiki/Hugging_Face",
"How did Hugging Face become so successful?",
"Fun",
"Short (1-2 min)",
"English",
False,
],
[
[],
"https://simple.wikipedia.org/wiki/Taylor_Swift",
"Why is Taylor Swift so popular?",
"Fun",
"Short (1-2 min)",
"English",
False,
],
]
UI_CACHE_EXAMPLES = True
UI_SHOW_API = True
8 changes: 3 additions & 5 deletions backend/app/main.py → backend/main.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from api.main import api_router

# Application instance; middleware and routers are attached to it below.
# The earlier settings-based constructor call referenced `settings` and
# `custom_generate_unique_id`, neither of which is imported in this module,
# and its result was immediately overwritten, so it has been dropped.
app = FastAPI()

# Add CORS middleware
app.add_middleware(
Expand All @@ -16,3 +13,4 @@
allow_headers=["*"],
)

app.include_router(api_router, prefix="/api/v1")
68 changes: 68 additions & 0 deletions backend/prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""
prompts.py
"""

# Base system prompt for script generation. It instructs the model to turn
# the input text into a host/guest podcast dialogue and to reply in JSON.
# The modifier strings defined in this module are meant to be appended to it.
SYSTEM_PROMPT = """
You are a world-class podcast producer tasked with transforming the provided input text into an engaging and informative podcast script. The input may be unstructured or messy, sourced from PDFs or web pages. Your goal is to extract the most interesting and insightful content for a compelling podcast discussion.
# Steps to Follow:
1. **Analyze the Input:**
Carefully examine the text, identifying key topics, points, and interesting facts or anecdotes that could drive an engaging podcast conversation. Disregard irrelevant information or formatting issues.
2. **Brainstorm Ideas:**
In the `<scratchpad>`, creatively brainstorm ways to present the key points engagingly. Consider:
- Analogies, storytelling techniques, or hypothetical scenarios to make content relatable
- Ways to make complex topics accessible to a general audience
- Thought-provoking questions to explore during the podcast
- Creative approaches to fill any gaps in the information
3. **Craft the Dialogue:**
Develop a natural, conversational flow between the host (Jane) and the guest speaker (the author or an expert on the topic). Incorporate:
- The best ideas from your brainstorming session
- Clear explanations of complex topics
- An engaging and lively tone to captivate listeners
- A balance of information and entertainment
Rules for the dialogue:
- The host (Jane) always initiates the conversation and interviews the guest
- Include thoughtful questions from the host to guide the discussion
- Incorporate natural speech patterns, including occasional verbal fillers (e.g., "um," "well," "you know")
- Allow for natural interruptions and back-and-forth between host and guest
- Ensure the guest's responses are substantiated by the input text, avoiding unsupported claims
- Maintain a PG-rated conversation appropriate for all audiences
- Avoid any marketing or self-promotional content from the guest
- The host concludes the conversation
4. **Summarize Key Insights:**
Naturally weave a summary of key points into the closing part of the dialogue. This should feel like a casual conversation rather than a formal recap, reinforcing the main takeaways before signing off.
5. **Maintain Authenticity:**
Throughout the script, strive for authenticity in the conversation. Include:
- Moments of genuine curiosity or surprise from the host
- Instances where the guest might briefly struggle to articulate a complex idea
- Light-hearted moments or humor when appropriate
- Brief personal anecdotes or examples that relate to the topic (within the bounds of the input text)
6. **Consider Pacing and Structure:**
Ensure the dialogue has a natural ebb and flow:
- Start with a strong hook to grab the listener's attention
- Gradually build complexity as the conversation progresses
- Include brief "breather" moments for listeners to absorb complex information
- End on a high note, perhaps with a thought-provoking question or a call-to-action for listeners
IMPORTANT RULE: Each line of dialogue should be no more than 100 characters (e.g., can finish within 5-8 seconds)
Remember: Always reply in valid JSON format, without code blocks. Begin directly with the JSON output.
"""

# Prompt fragments appended to the system prompt. Each prefix is followed by
# a user-chosen value (question, tone, language) supplied by the caller.
QUESTION_MODIFIER = "PLEASE ANSWER THE FOLLOWING QN:"

TONE_MODIFIER = "TONE: The tone of the podcast should be"

# Fixed the duplicated word ("The the podcast") that was sent to the model.
LANGUAGE_MODIFIER = "OUTPUT LANGUAGE <IMPORTANT>: The podcast should be"

# Length instruction keyed by the length label offered to the user.
# NOTE(review): the "Short (1-2 min)" entry asks for ~5 seconds, which does
# not match its label — presumably a temporary value for quick testing;
# confirm before release.
LENGTH_MODIFIERS = {
    "Short (1-2 min)": "Keep the podcast brief, around 5s long.",
    "Medium (3-5 min)": "Aim for a moderate length, about 3-5 minutes.",
}
5 changes: 5 additions & 0 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,8 @@ sniffio==1.3.1
starlette==0.38.6
typing_extensions==4.12.2
uvicorn==0.31.1
openai==1.40.6
pydub==0.25.1
gradio_client==1.4.0
loguru==0.7.2
suno-bark @ git+https://github.com/suno-ai/bark.git@f4f32d4cd480dfec1c245d258174bc9bde3c2148
Loading

0 comments on commit 7e45062

Please sign in to comment.