Skip to content

Commit d22f5c7

Browse files
committed
Initial push
0 parents  commit d22f5c7

19 files changed

+1853
-0
lines changed

Dockerfile

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
FROM ubuntu:22.04

# Install system dependencies.
# --no-install-recommends keeps the image small; ca-certificates is listed
# explicitly because wget and git need it for HTTPS once recommended packages
# are no longer pulled in implicitly. Packages are sorted for easy diffing.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        ffmpeg \
        git \
        python3-dev \
        python3-pip \
        python3.10 \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Link python3 to python for convenience (optional).
# -f makes the step idempotent if the link already exists.
RUN ln -sf /usr/bin/python3 /usr/bin/python

# Create an unprivileged account for building and running the application,
# and hand it ownership of the (still empty) application tree so no large
# recursive chown layer is needed later.
RUN useradd --system --user-group --uid 10001 --create-home app \
    && mkdir -p /app/whisper_models \
    && chown -R app:app /app

# Set working directory
WORKDIR /app

# Install Python dependencies (as root, into the system site-packages).
# Only requirements.txt is copied here so this layer stays cached until the
# dependency list itself changes.
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# Everything below runs without root privileges.
USER app

# Download Whisper model.
# Done before the application code is copied so that the large download is
# cached across source-code changes.
RUN wget -nv -O /app/whisper_models/ggml-base.en.bin https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin

# Copy application code (including the whisper.cpp source subdirectory)
COPY --chown=app:app . .

# Build whisper.cpp from its subdirectory.
# CMAKE_BUILD_TYPE is set explicitly because single-config generators
# (Makefiles, Ninja) ignore the --config flag.
WORKDIR /app/whisper.cpp
RUN cmake -B build -DCMAKE_BUILD_TYPE=Release \
    && cmake --build build --config Release -j "$(nproc)"

# Switch back to /app for the app runtime
WORKDIR /app

# Set environment variables
ENV PYTHONPATH=/app
ENV WHISPER_MODEL_PATH=/app/whisper_models/ggml-base.en.bin
ENV WHISPER_CPP_PATH=/app/whisper.cpp/build/bin/whisper-cli

# Expose port
EXPOSE 8000

# Command to run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

Dockerfile.slim

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
FROM python:3.10-slim

# Install system dependencies.
# --no-install-recommends avoids pulling in optional packages; the list is
# sorted for easy diffing.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        ffmpeg \
        git \
    && rm -rf /var/lib/apt/lists/*

# Create an unprivileged runtime account and the directories it must be able
# to write to. --create-home gives the account a writable home directory.
RUN useradd --system --user-group --uid 10001 --create-home app \
    && mkdir -p /app/models \
    && chown -R app:app /app

WORKDIR /app

# Install Python dependencies (as root, into the image-wide site-packages).
# Only requirements.txt is copied here so this layer stays cached until the
# dependency list itself changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Use alternatives to avoid building whisper.cpp with vectorization issues
# Option 1: Use a different speech recognition engine
#   Add to requirements.txt: openai-whisper
#   And use the Python library for transcription instead of the C++ version
#
# No whisper.cpp executable is built in this image; the Python code is
# expected to work around its absence.

# Drop root privileges before adding the application code.
USER app

# Copy application code
COPY --chown=app:app . .

# Set environment variables
ENV PYTHONPATH=/app
ENV WHISPER_MODEL_PATH=/app/models

# Expose port
EXPOSE 8000

# Command to run the application
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

InterviewAgent.iml

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<module type="PYTHON_MODULE" version="4">
3+
<component name="NewModuleRootManager" inherit-compiler-output="true">
4+
<exclude-output />
5+
<content url="file://$MODULE_DIR$">
6+
<excludeFolder url="file://$MODULE_DIR$/.venv" />
7+
</content>
8+
<orderEntry type="jdk" jdkName="Python 3.13 (InterviewAgent)" jdkType="Python SDK" />
9+
<orderEntry type="sourceFolder" forTests="false" />
10+
</component>
11+
</module>

ProjectStructure.txt

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
InterviewAgent/
2+
├── app/
3+
│ ├── __init__.py
4+
│ ├── main.py # FastAPI application entry point
5+
│ ├── routes/
6+
│ │ ├── __init__.py
7+
│ │ ├── websocket.py # WebSocket routes for audio streaming
8+
│ │ └── api.py # REST API routes
9+
│ ├── services/
10+
│ │ ├── __init__.py
11+
│ │ ├── whisper_service.py # Whisper.cpp integration
12+
│ │ └── grok_service.py # Grok API integration
13+
│ └── models/
14+
│ ├── __init__.py
15+
│ └── conversation.py # Data models
16+
├── static/
17+
│ ├── css/
18+
│ │ └── style.css
19+
│ ├── js/
20+
│ │ ├── app.js # Main application logic
21+
│ │ └── webrtc.js # WebRTC handling
22+
│ └── index.html # Main page
23+
├── tests/
24+
│ └── test_whisper.py
25+
├── Dockerfile
26+
├── docker-compose.yml
27+
├── requirements.txt
28+
└── README.md

README.md

+101
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
# WebRTC Speech-to-Text with Grok API
2+
3+
This project provides a web application that uses WebRTC to capture audio from the user's microphone, processes it with whisper.cpp for speech-to-text conversion, and integrates with X.AI's Grok API for intelligent responses.
4+
5+
## Features
6+
7+
- Real-time audio capture using WebRTC
8+
- Server-side speech-to-text processing with whisper.cpp
9+
- Integration with X.AI's Grok API
10+
- WebSocket-based communication for real-time interactions
11+
- Simple and intuitive user interface
12+
13+
## Prerequisites
14+
15+
- Docker and Docker Compose (for containerized deployment)
16+
- X.AI (Grok) API key
17+
18+
## Setup and Installation
19+
20+
1. Clone this repository:
21+
```bash
22+
git clone https://github.com/yourusername/webrtc-whisper-grok.git
23+
cd webrtc-whisper-grok
24+
```
25+
26+
2. Create a `.env` file in the project root with your API key:
27+
```
28+
GROK_API_KEY=your_api_key_here
29+
```
30+
31+
3. Build and start the application with Docker Compose:
32+
```bash
33+
docker-compose up --build
34+
```
35+
36+
4. Access the application at http://localhost:8000
37+
38+
## Manual Setup (without Docker)
39+
40+
If you prefer to run the application without Docker:
41+
42+
1. Install system dependencies:
43+
- Python 3.10+
44+
- FFmpeg
45+
- Build tools (gcc, cmake, etc.)
46+
47+
2. Clone and build whisper.cpp:
48+
```bash
49+
git clone https://github.com/ggerganov/whisper.cpp.git
50+
cd whisper.cpp
51+
make
52+
bash ./models/download-ggml-model.sh base.en
53+
cd ..
54+
```
55+
56+
3. Install Python dependencies:
57+
```bash
58+
pip install -r requirements.txt
59+
```
60+
61+
4. Set environment variables:
62+
```bash
63+
export GROK_API_KEY=your_api_key_here
64+
export WHISPER_CPP_PATH=/path/to/whisper.cpp/main
65+
```
66+
67+
5. Run the application:
68+
```bash
69+
uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload
70+
```
71+
72+
## Architecture
73+
74+
- **Frontend**: HTML/CSS/JavaScript with WebRTC for audio capture
75+
- **Backend**: FastAPI Python application
76+
- **WebSockets**: For real-time audio streaming and response delivery
77+
- **Processing Pipeline**: Audio → whisper.cpp → Grok API → User Interface
78+
79+
## Development
80+
81+
The project structure follows a clean architecture approach:
82+
83+
- `/app`: Backend Python code
84+
- `/static`: Frontend assets
85+
- `/tests`: Test cases
86+
87+
## Security Considerations
88+
89+
- This application requires microphone access, which is a sensitive permission
90+
- HTTPS should be used in production to secure the WebRTC connection
91+
- API keys should be properly secured and not exposed in client-side code
92+
93+
## License
94+
95+
[MIT License](LICENSE)
96+
97+
## Acknowledgements
98+
99+
- [whisper.cpp](https://github.com/ggerganov/whisper.cpp) for high-performance speech recognition
100+
- X.AI for the Grok API
101+
- FastAPI for the web framework

app/main.py

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import sys
2+
3+
from fastapi import FastAPI
4+
from fastapi.staticfiles import StaticFiles
5+
from fastapi.middleware.cors import CORSMiddleware
6+
import uvicorn
7+
import logging
8+
9+
from app.routes.websocket import router as websocket_router
10+
from app.routes.api import router as api_router
11+
12+
# Configure logging
13+
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    # Log to stdout so output is captured by the container/runtime logs.
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

# Create FastAPI app
app = FastAPI(
    title="WebRTC Speech-to-Text with Grok",
    description="A web application that processes speech using WebRTC and whisper.cpp, then sends it to Grok API",
    version="0.1.0",
)

# Configure CORS
# Credentials are disabled while origins are wildcarded: combining
# allow_origins=["*"] with allow_credentials=True would let any website make
# credentialed cross-origin requests. To enable credentials, list the trusted
# origins explicitly instead of using "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # For development only, restrict in production
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Include routers
app.include_router(websocket_router)
app.include_router(api_router)

# Mount static files
# The "static" directory is resolved relative to the process working
# directory. The catch-all "/" mount is registered last so that the routers
# and the "/static" mount above are matched first.
app.mount("/static", StaticFiles(directory="static"), name="static")
app.mount("/", StaticFiles(directory="static", html=True), name="root")

if __name__ == "__main__":
    # Development entry point: run with auto-reload when executed directly.
    logger.info("Starting server")
    uvicorn.run("app.main:app", host="0.0.0.0", port=8000, reload=True)

app/models/conversation.py

+91
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
from pydantic import BaseModel
2+
from typing import List, Dict, Optional
3+
from datetime import datetime
4+
import uuid
5+
6+
class Message(BaseModel):
    """
    Represents a single message in a conversation.

    Attributes:
        id: Unique identifier; a random UUID4 string is generated when the
            caller omits it or passes None.
        role: The role of the sender ("system", "user", or "assistant").
        content: The message content.
        timestamp: Creation time; defaults to the current local time when
            the caller omits it or passes None.
    """
    # Declared Optional because None is the "not supplied" sentinel; after
    # construction both fields always hold a concrete value.
    id: Optional[str] = None
    role: str  # "system", "user", or "assistant"
    content: str
    timestamp: Optional[datetime] = None

    def __init__(self, **data):
        """
        Build a message, filling in `id` and `timestamp` when not supplied.

        Args:
            **data: Field values for the model.
        """
        # Fill defaults before validation so generated values are validated
        # like caller-supplied ones.
        if data.get("id") is None:
            data["id"] = str(uuid.uuid4())
        if data.get("timestamp") is None:
            data["timestamp"] = datetime.now()
        super().__init__(**data)
21+
22+
class Conversation(BaseModel):
    """
    Represents a conversation with message history.

    Attributes:
        id: Unique identifier; a random UUID4 string is generated when the
            caller omits it or passes None.
        messages: Ordered list of messages in the conversation.
        created_at: Creation time; defaults to the current local time.
        updated_at: Time of the last change; defaults to `created_at` and is
            refreshed by `add_message`.
        metadata: Free-form dictionary attached to the conversation.
    """
    # Declared Optional because None is the "not supplied" sentinel; after
    # construction these fields always hold a concrete value.
    id: Optional[str] = None
    messages: List[Message] = []
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None
    metadata: Dict = {}

    def __init__(self, **data):
        """
        Build a conversation, filling in `id`, `created_at` and `updated_at`
        when they are not supplied.

        Args:
            **data: Field values for the model.
        """
        # Fill defaults before validation so generated values are validated
        # like caller-supplied ones.
        if data.get("id") is None:
            data["id"] = str(uuid.uuid4())
        if data.get("created_at") is None:
            data["created_at"] = datetime.now()
        if data.get("updated_at") is None:
            # A new conversation was last updated when it was created.
            data["updated_at"] = data["created_at"]
        super().__init__(**data)

    def add_message(self, role: str, content: str) -> Message:
        """
        Add a new message to the conversation.

        Args:
            role: The role of the sender ("system", "user", or "assistant")
            content: The message content

        Returns:
            The newly created message
        """
        message = Message(role=role, content=content)
        self.messages.append(message)
        self.updated_at = datetime.now()
        return message

    def to_dict(self) -> Dict:
        """
        Convert the conversation to a dictionary.

        Messages are reduced to their role and content; message ids and
        timestamps are not included.

        Returns:
            Dictionary representation of the conversation
        """
        return {
            "id": self.id,
            "messages": self.to_api_messages(),
            "created_at": self.created_at.isoformat(),
            "updated_at": self.updated_at.isoformat(),
            "metadata": self.metadata,
        }

    def to_api_messages(self) -> List[Dict]:
        """
        Convert the conversation messages to a format suitable for the API.

        Returns:
            List of message dictionaries formatted for the API, each holding
            only the "role" and "content" keys
        """
        return [
            {"role": msg.role, "content": msg.content}
            for msg in self.messages
        ]

0 commit comments

Comments
 (0)