79 changes: 79 additions & 0 deletions products/llm_playground/Dockerfile
@@ -0,0 +1,79 @@
# products/llm_playground/Dockerfile
FROM python:3.10-slim AS builder
RUN apt-get update && apt-get install -y build-essential git
RUN pip3 install -U pip && pip3 install boto3 requests black isort
WORKDIR /app

COPY requirements.txt start.sh /tmp/
ARG OO
ARG EO
ARG BO
ENV OO=${OO}
ENV EO=${EO}
ENV BO=${BO}
ENV PYTHONDONTWRITEBYTECODE=1

RUN pip3 install -r /tmp/requirements.txt
COPY app .

RUN pip3 install --upgrade awscli

# Configure pip to use AWS CodeArtifact. The auth token is read from the BuildKit
# secret (id=catoken) so it is never baked into the image layers or history.
RUN --mount=type=secret,id=catoken \
    sh -c ' \
    export CODEARTIFACT_AUTH_TOKEN="$(cat /run/secrets/catoken)" \
    && pip install --no-cache-dir --no-deps "encryptor-obfuscator==0.1" "jsl-i==1.1" \
    --extra-index-url "https://aws:${CODEARTIFACT_AUTH_TOKEN}@johnsnowlabs-175460536396.d.codeartifact.us-east-2.amazonaws.com/pypi/protecto/simple/" \
    '

RUN sh /tmp/start.sh

FROM nvidia/cuda:12.0.1-cudnn8-devel-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update -y \
    && apt-get install -y \
    git \
    openjdk-8-jdk-headless \
    python3-pip \
    python3.10-dev

ARG EO
ARG BO
ENV HOME=/app \
    EO=${EO} \
    BO=${BO} \
    PYTHONDONTWRITEBYTECODE=1 \
    JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ \
    PATH=/usr/lib/jvm/java-8-openjdk-amd64/bin:/app/.local/bin:$PATH \
    TF_CPP_MIN_LOG_LEVEL=2 \
    PYTHONIOENCODING=utf-8 \
    CUDA_HOME=/usr/local/cuda \
    PYTHONPATH=${PYTHONPATH}:/ \
    PYSPARK_PYTHON=python3 \
    HF_HOME=/opt/.prop \
    PYSPARK_DRIVER_PYTHON=python3

COPY requirements-with-cli.txt /tmp/
# python package installation
WORKDIR /app
# The builder installs into site-packages; the Ubuntu-based runtime resolves
# locally installed packages from dist-packages, hence the renamed destination.
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/dist-packages
RUN pip3 install --no-cache-dir -r /tmp/requirements-with-cli.txt

# Cleanup
RUN rm -f /usr/bin/python3 \
    && rm -rf /tmp/* /var/tmp/* /var/lib/apt/lists/* /usr/share/man/* /usr/share/doc/* \
    && ln -s /usr/bin/python3.10 /usr/bin/python3 \
    && apt-get purge --remove linux-libc-dev git -y \
    && apt-get autoremove -y \
    && apt-get clean


COPY --from=builder /app /app
COPY --from=builder /opt/ /opt/
# chown after the copies so the application files themselves end up owned by nobody
RUN chown -R nobody:nogroup /app
#CMD ["sleep", "infinity"]
CMD ["python3", "-m", "uvicorn", "main:app", "--workers", "1", "--host", "0.0.0.0", "--port", "5000"]
# python3 -m uvicorn main:app --workers 1 --host 0.0.0.0 --port 5000 --reload
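
Note: a minimal sketch of how this image might be built and run, assuming BuildKit is enabled, the CodeArtifact token has been written to a local file (catoken.txt is a hypothetical name), and the OO/EO/BO build-arg values are supplied by the caller; the image tag and host port mapping are placeholders as well.

    DOCKER_BUILDKIT=1 docker build \
        --secret id=catoken,src=./catoken.txt \
        --build-arg OO="$OO" --build-arg EO="$EO" --build-arg BO="$BO" \
        -t llm-playground:dev products/llm_playground
    docker run --gpus all -p 5000:5000 llm-playground:dev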
19 changes: 19 additions & 0 deletions products/llm_playground/app/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import os

from loguru import logger
# Define custom format without file details
_f = "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level}</level> | <cyan>{message}</cyan>"

# Remove the default logger
logger.remove()

# Add a new console logger with the custom format
logger.add(lambda msg: print(msg, end=""), format=_f, level=os.environ.get("LOGGING_MODE", "INFO").upper())
try:
    from jsl_i import start as app_start

    app_start()
except ModuleNotFoundError:
    logger.error("jsl_i is not installed; app is not ready for production use, running in dev mode")


__all__ = ["logger"]
44 changes: 44 additions & 0 deletions products/llm_playground/app/api_info.json
@@ -0,0 +1,44 @@
{
    "status":
    {
        "method": "GET",
        "route": "http://localhost:__PORT__/status",
        "description": "Checks the status of the server and the current deployment"
    },
    "info":
    {
        "method": "GET",
        "route": "http://localhost:__PORT__/info",
        "description": "Gets the list of available JSL LLMs"
    },
    "deploy":
    {
        "method": "POST",
        "route": "http://localhost:__PORT__/deploy",
        "payload":
        {
            "llm_name": "jsl_meds_q4_v1",
            "force": true,
            "device": "auto"
        },
        "description": "Deploys the LLM specified in the payload"
    },
    "generate":
    {
        "method": "POST",
        "route": "http://localhost:__PORT__/generate",
        "payload":
        {
            "text": "### Template:\n{\n\"Drugs\": []\n}\n### Text:\nShe is consuming lipitor and metformin daily.",
            "max_new_tokens": 512,
            "temperature": 0.4
        },
        "description": "Runs inference on the given text using the currently deployed LLM"
    },
    "offload":
    {
        "method": "POST",
        "route": "http://localhost:__PORT__/offload",
        "description": "Offloads the deployed LLM from memory/GPU"
    }
}
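
Note: for illustration, the documented routes map onto calls like the following (a hypothetical sketch assuming the container is published on port 5000, as in the Dockerfile's CMD):

    curl http://localhost:5000/status
    curl http://localhost:5000/info
    curl -X POST http://localhost:5000/deploy \
        -H "Content-Type: application/json" \
        -d '{"llm_name": "jsl_meds_q4_v1", "force": true, "device": "auto"}'
    curl -X POST http://localhost:5000/generate \
        -H "Content-Type: application/json" \
        -d '{"text": "She is consuming lipitor and metformin daily.", "max_new_tokens": 512, "temperature": 0.4}'
    curl -X POST http://localhost:5000/offload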
29 changes: 29 additions & 0 deletions products/llm_playground/app/main.py
@@ -0,0 +1,29 @@
import json
import os
import warnings

warnings.filterwarnings("ignore", category=Warning)
from fastapi import FastAPI

import app  # imported for its side effects: logger configuration and optional jsl_i startup

app = FastAPI(
    docs_url="/docs",
    title="JSL LLM APIs",
    swagger_ui_parameters={"tryItOutEnabled": True},
)
PORT = os.getenv("CONTAINER_PORT", "5001")
BASE_URL = f"http://localhost:{PORT}"


@app.get("/", tags=["API Documentation"], include_in_schema=False)
@app.get("/help", tags=["API Documentation"], include_in_schema=False)
async def rest_api_info():
    with open("api_info.json", "r") as f:
        json_str = f.read().replace("__PORT__", f"{PORT}")
    return json.loads(json_str)


from app.src import api

app.include_router(api.route)
3 changes: 3 additions & 0 deletions products/llm_playground/app/src/__init__.py
@@ -0,0 +1,3 @@
"""
JSL Property
"""
215 changes: 215 additions & 0 deletions products/llm_playground/app/src/api.py
@@ -0,0 +1,215 @@
import os
import time
from datetime import datetime, timezone
from typing import Dict, Optional

import GPUtil
import torch
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request, status
from pydantic import BaseModel

from app import logger
from app.src.constants import BUSY_STATUS, Status
from app.src.jsl_llm import JslLlm
from app.src.utils import clear_opt

jsl_llm = JslLlm()

logger.info("********************* APP IS READY *********************")


class DeployRequest(BaseModel):
    force: Optional[bool] = False
    llm_name: str
    device: Optional[str] = "auto"
    hf_token: Optional[str] = None


class GenerateRequest(BaseModel):
    text: str
    max_new_tokens: Optional[int] = 128
    temperature: Optional[float] = 0.1


async def before_request(request: Request):
logger.debug(f"Received request: {request.method} {request.url}")
if os.getenv("MINIMAL_SETUP"):
return

try:
jsl_llm.beat_check()
except Exception as ex:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail=f"Issue in license: {ex}",
headers={"WWW-Authenticate": "Bearer"},
)

if not jsl_llm.is_inference_scope():
raise HTTPException(status_code=400, detail="invalid license type")


route = APIRouter(dependencies=[Depends(before_request)])

jobs: Dict[str, str] = {}


async def deployment_task(llm_name: str, device_id: str, force: bool, hf_token: Optional[str] = None):
    jobs.clear()
    jobs[llm_name] = Status.DEPLOYING.value
    jsl_llm.offload_model()
    if not jsl_llm.llms_info:
        await jsl_llm.setup_llms_info()
    if force:
        await clear_opt()
    extract_path, remote_name = None, None
    try:
        if llm_name not in jsl_llm.llms_info:
            remote_name = llm_name
        else:
            extract_path = await jsl_llm.download_from_s3(llm_name)
        jobs[llm_name] = Status.FINALIZING.value
        await jsl_llm.load_model(
            llm_name=llm_name,
            extract_path_or_remote_name=(extract_path or remote_name),
            device_id=device_id,
            hf_token=hf_token,
        )
    except Exception as ex:
        logger.error("Failed to complete deployment job.")
        logger.debug(f"Failed to complete deployment job; error={ex}")
        jobs[llm_name] = Status.FAILED.value
        return

    jobs[llm_name] = Status.DEPLOYED.value


async def current_server_info():
    server_info = {
        "system": {
            "torch_cpu_threads": torch.get_num_threads(),
            "is_cuda_available": torch.cuda.is_available(),
            "number_of_cuda_devices": torch.cuda.device_count(),
        },
        "deployment_status": {},
    }
    if gpus := GPUtil.getGPUs():
        gpus_info = []
        for gpu in gpus:
            gpus_info.append(
                {
                    "GPU ID": gpu.id,
                    "GPU Name": gpu.name,
                    "GPU Load": f"{gpu.load * 100} %",
                    "GPU Memory Free": f"{gpu.memoryFree} MB",
                    "GPU Memory Used": f"{gpu.memoryUsed} MB",
                    "GPU Memory Total": f"{gpu.memoryTotal} MB",
                    "GPU Temperature": f"{gpu.temperature} °C",
                    "GPU Driver": f"{gpu.driver}",
                }
            )
        server_info["system"]["gpus_info"] = gpus_info

    if jsl_llm.llm:
        server_info["deployment_status"] = {
            "model": jsl_llm.llm_name,
            "status": Status.DEPLOYED.value,
            "device": jsl_llm.device,
            "max_context_length": jsl_llm.model_max_len,
            "deployed_at": jsl_llm.deployed_at,
        }
    elif jobs:
        for llm_name, _status in jobs.items():
            server_info["deployment_status"] = {"model": llm_name, "status": _status}
            if _status in BUSY_STATUS:
                if _status == Status.FINALIZING.value:
                    server_info["deployment_status"]["progress"] = "98 %"
                else:
                    server_info["deployment_status"]["progress"] = (
                        await jsl_llm.get_progress(llm_name)
                    )
            break

    return server_info


@route.get("/status", tags=["System Checks"])
async def server_status():
    return await current_server_info()


@route.get(
"/info", tags=["System Checks"], summary="Lists available JSL LLMs and their info"
)
async def available_llms_info():
await jsl_llm.setup_llms_info()
return {
llm_name: {"size": info.get("size")}
for llm_name, info in jsl_llm.llms_info.items()
}


def check_if_busy():
    if current_status := list(jobs.values()):
        if current_status[0] in BUSY_STATUS:
            raise HTTPException(
                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                detail=Status.JOB_IN_PROGRESS.value,
                headers={"Retry-After": "30s"},
            )
    return False


@route.post("/deploy", tags=["Load/Offload"])
async def deploy_llm(
    data: DeployRequest, bk_task: BackgroundTasks, is_busy=Depends(check_if_busy)
):
    llm_name = data.llm_name
    jobs[llm_name] = Status.STARTING.value
    args = dict(
        llm_name=llm_name,
        device_id=data.device,
        force=data.force,
        hf_token=data.hf_token,
    )
    bk_task.add_task(deployment_task, **args)
    return {"message": Status.DEPLOYMENT_STARTED.value}


@route.post("/offload", tags=["Load/Offload"])
async def offload_llm(is_busy=Depends(check_if_busy)):
    msg = jsl_llm.offload_model()
    jobs.clear()
    return {"message": msg}


@route.post("/generate", description="Generate prediction", tags=["Generate/Query"])
async def generate_results(data: GenerateRequest, is_busy=Depends(check_if_busy)):
    generation_started = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z")
    start_time = time.perf_counter()
    result = await jsl_llm.get_prediction(
        data.text, data.max_new_tokens, data.temperature
    )
    if isinstance(result, str):
        return {"message": result}
    output, output_tokens_len = result
    end_time = time.perf_counter()
    time_taken = end_time - start_time
    return {
        "message": output,
        "generation_started": generation_started,
        "time_taken": (
            f"{time_taken * 1000:.3f} ms" if time_taken < 2 else f"{time_taken:.2f} sec"
        ),
        "new_token_generated": output_tokens_len,
        "input": {
            "max_new_tokens": data.max_new_tokens,
            "temperature": data.temperature,
            "prompt_tokens": await jsl_llm.get_input_token_len(data.text),
        },
        "deployment": {
            "deployed_at": jsl_llm.deployed_at,
            "model_name": jsl_llm.llm_name,
            "device": jsl_llm.device,
        },
    }
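
Note: /deploy only schedules a background task, so a client normally polls /status until the model is loaded; while a job is in progress, /deploy, /offload and /generate respond with 503 and a Retry-After: 30s header. A rough sketch of such a loop (hypothetical; assumes the service on port 5000, and relies on "deployed_at" only appearing in the status payload once a model is loaded — failure handling omitted):

    curl -s -X POST http://localhost:5000/deploy \
        -H "Content-Type: application/json" \
        -d '{"llm_name": "jsl_meds_q4_v1"}'
    until curl -s http://localhost:5000/status | grep -q '"deployed_at"'; do
        sleep 30  # mirrors the Retry-After hint returned while a job is running
    done
    curl -s -X POST http://localhost:5000/generate \
        -H "Content-Type: application/json" \
        -d '{"text": "She is consuming lipitor and metformin daily.", "max_new_tokens": 128}'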