79 changes: 79 additions & 0 deletions products/llm_playground/Dockerfile
@@ -0,0 +1,79 @@
# products/llm_playground/Dockerfile
FROM python:3.10-slim AS builder
RUN apt-get update && apt-get install -y build-essential git
RUN pip3 install -U pip && pip3 install boto3 requests black isort
WORKDIR /app

COPY requirements.txt start.sh /tmp/
ARG OO
ARG EO
ARG BO
ENV OO=${OO}
ENV EO=${EO}
ENV BO=${BO}
ENV PYTHONDONTWRITEBYTECODE=1

RUN pip3 install -r /tmp/requirements.txt
COPY app .

RUN pip3 install --upgrade awscli

# Configure pip to use AWS CodeArtifact. The auth token is read from the BuildKit
# secret (id=catoken) so it is never baked into the image layers or history.
RUN --mount=type=secret,id=catoken \
    sh -c ' \
    export CODEARTIFACT_AUTH_TOKEN="$(cat /run/secrets/catoken)" \
    && pip install --no-cache-dir --no-deps "encryptor-obfuscator==0.1" "jsl-i==1.1" \
    --extra-index-url "https://aws:${CODEARTIFACT_AUTH_TOKEN}@johnsnowlabs-175460536396.d.codeartifact.us-east-2.amazonaws.com/pypi/protecto/simple/" \
    '

RUN sh /tmp/start.sh

FROM nvidia/cuda:12.0.1-cudnn8-devel-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update -y \
    && apt-get install -y \
    git \
    openjdk-8-jdk-headless \
    python3-pip \
    python3.10-dev

ARG EO
ARG BO
ENV HOME=/app \
    EO=${EO} \
    BO=${BO} \
    PYTHONDONTWRITEBYTECODE=1 \
    JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/ \
    PATH=/usr/lib/jvm/java-8-openjdk-amd64/bin:/app/.local/bin:$PATH \
    TF_CPP_MIN_LOG_LEVEL=2 \
    PYTHONIOENCODING=utf-8 \
    CUDA_HOME=/usr/local/cuda \
    PYTHONPATH=${PYTHONPATH}:/ \
    PYSPARK_PYTHON=python3 \
    HF_HOME=/opt/.prop \
    PYSPARK_DRIVER_PYTHON=python3

COPY requirements-with-cli.txt /tmp/
# python package installation
WORKDIR /app
# The builder installs into site-packages; the Ubuntu-based runtime resolves
# locally installed packages from dist-packages, hence the renamed destination.
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/dist-packages
RUN pip3 install --no-cache-dir -r /tmp/requirements-with-cli.txt

# Cleanup
RUN rm -f /usr/bin/python3 \
    && rm -rf /tmp/* /var/tmp/* /var/lib/apt/lists/* /usr/share/man/* /usr/share/doc/* \
    && ln -s /usr/bin/python3.10 /usr/bin/python3 \
    && apt-get purge --remove linux-libc-dev git -y \
    && apt-get autoremove -y \
    && apt-get clean


COPY --from=builder /app /app
COPY --from=builder /opt/ /opt/
# chown after the copies so the application files themselves end up owned by nobody
RUN chown -R nobody:nogroup /app
#CMD ["sleep", "infinity"]
CMD ["python3", "-m", "uvicorn", "main:app", "--workers", "1", "--host", "0.0.0.0", "--port", "5000"]
# python3 -m uvicorn main:app --workers 1 --host 0.0.0.0 --port 5000 --reload
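
Note: a minimal sketch of how this image might be built and run, assuming BuildKit is enabled, the CodeArtifact token has been written to a local file (catoken.txt is a hypothetical name), and the OO/EO/BO build-arg values are supplied by the caller; the image tag and host port mapping are placeholders as well.

    DOCKER_BUILDKIT=1 docker build \
        --secret id=catoken,src=./catoken.txt \
        --build-arg OO="$OO" --build-arg EO="$EO" --build-arg BO="$BO" \
        -t llm-playground:dev products/llm_playground
    docker run --gpus all -p 5000:5000 llm-playground:dev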
19 changes: 19 additions & 0 deletions products/llm_playground/app/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import os

from loguru import logger
# Define custom format without file details
_f = "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level}</level> | <cyan>{message}</cyan>"

# Remove the default logger
logger.remove()

# Add a new console logger with the custom format
logger.add(lambda msg: print(msg, end=""), format=_f, level=os.environ.get("LOGGING_MODE", "INFO").upper())
try:
    from jsl_i import start as app_start

    app_start()
except ModuleNotFoundError:
    logger.error("jsl_i is not installed; app is not ready for production use, running in dev mode")


__all__ = ["logger"]
44 changes: 44 additions & 0 deletions products/llm_playground/app/api_info.json
@@ -0,0 +1,44 @@
{
    "status":
    {
        "method": "GET",
        "route": "http://localhost:__PORT__/status",
        "description": "Checks the status of the server and the current deployment"
    },
    "info":
    {
        "method": "GET",
        "route": "http://localhost:__PORT__/info",
        "description": "Gets the list of available JSL LLMs"
    },
    "deploy":
    {
        "method": "POST",
        "route": "http://localhost:__PORT__/deploy",
        "payload":
        {
            "llm_name": "jsl_meds_q4_v1",
            "force": true,
            "device": "auto"
        },
        "description": "Deploys the LLM specified in the payload"
    },
    "generate":
    {
        "method": "POST",
        "route": "http://localhost:__PORT__/generate",
        "payload":
        {
            "text": "### Template:\n{\n\"Drugs\": []\n}\n### Text:\nShe is consuming lipitor and metformin daily.",
            "max_new_tokens": 512,
            "temperature": 0.4
        },
        "description": "Runs inference on the given text using the currently deployed LLM"
    },
    "offload":
    {
        "method": "POST",
        "route": "http://localhost:__PORT__/offload",
        "description": "Offloads the deployed LLM from memory/GPU"
    }
}
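
Note: for illustration, the documented routes map onto calls like the following (a hypothetical sketch assuming the container is published on port 5000, as in the Dockerfile's CMD):

    curl http://localhost:5000/status
    curl http://localhost:5000/info
    curl -X POST http://localhost:5000/deploy \
        -H "Content-Type: application/json" \
        -d '{"llm_name": "jsl_meds_q4_v1", "force": true, "device": "auto"}'
    curl -X POST http://localhost:5000/generate \
        -H "Content-Type: application/json" \
        -d '{"text": "She is consuming lipitor and metformin daily.", "max_new_tokens": 512, "temperature": 0.4}'
    curl -X POST http://localhost:5000/offload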
29 changes: 29 additions & 0 deletions products/llm_playground/app/main.py
@@ -0,0 +1,29 @@
import json
import os
import warnings

warnings.filterwarnings("ignore", category=Warning)
from fastapi import FastAPI

import app  # imported for its side effects: logger configuration and optional jsl_i startup

app = FastAPI(
    docs_url="/docs",
    title="JSL LLM APIs",
    swagger_ui_parameters={"tryItOutEnabled": True},
)
PORT = os.getenv("CONTAINER_PORT", "5001")
BASE_URL = f"http://localhost:{PORT}"


@app.get("/", tags=["API Documentation"], include_in_schema=False)
@app.get("/help", tags=["API Documentation"], include_in_schema=False)
async def rest_api_info():
    with open("api_info.json", "r") as f:
        json_str = f.read().replace("__PORT__", f"{PORT}")
    return json.loads(json_str)


from app.src import api

app.include_router(api.route)
3 changes: 3 additions & 0 deletions products/llm_playground/app/src/__init__.py
@@ -0,0 +1,3 @@
"""
JSL Property
"""
215 changes: 215 additions & 0 deletions products/llm_playground/app/src/api.py
@@ -0,0 +1,215 @@
import os
import time
from datetime import datetime, timezone
from typing import Dict, Optional

import GPUtil
import torch
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request, status
from pydantic import BaseModel

from app import logger
from app.src.constants import BUSY_STATUS, Status
from app.src.jsl_llm import JslLlm
from app.src.utils import clear_opt

jsl_llm = JslLlm()

logger.info("********************* APP IS READY *********************")


class DeployRequest(BaseModel):
    force: Optional[bool] = False
    llm_name: str
    device: Optional[str] = "auto"
    hf_token: Optional[str] = None


class GenerateRequest(BaseModel):
    text: str
    max_new_tokens: Optional[int] = 128
    temperature: Optional[float] = 0.1


async def before_request(request: Request):
logger.debug(f"Received request: {request.method} {request.url}")
if os.getenv("MINIMAL_SETUP"):
return

try:
jsl_llm.beat_check()
except Exception as ex:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail=f"Issue in license: {ex}",
headers={"WWW-Authenticate": "Bearer"},
)

if not jsl_llm.is_inference_scope():
raise HTTPException(status_code=400, detail="invalid license type")


route = APIRouter(dependencies=[Depends(before_request)])

jobs: Dict[str, str] = {}


async def deployment_task(llm_name: str, device_id: str, force: bool, hf_token: Optional[str] = None):
    jobs.clear()
    jobs[llm_name] = Status.DEPLOYING.value
    jsl_llm.offload_model()
    if not jsl_llm.llms_info:
        await jsl_llm.setup_llms_info()
    if force:
        await clear_opt()
    extract_path, remote_name = None, None
    try:
        if llm_name not in jsl_llm.llms_info:
            remote_name = llm_name
        else:
            extract_path = await jsl_llm.download_from_s3(llm_name)
        jobs[llm_name] = Status.FINALIZING.value
        await jsl_llm.load_model(
            llm_name=llm_name,
            extract_path_or_remote_name=(extract_path or remote_name),
            device_id=device_id,
            hf_token=hf_token,
        )
    except Exception as ex:
        logger.error("Failed to complete deployment job.")
        logger.debug(f"Failed to complete deployment job; error={ex}")
        jobs[llm_name] = Status.FAILED.value
        return

    jobs[llm_name] = Status.DEPLOYED.value


async def current_server_info():
    server_info = {
        "system": {
            "torch_cpu_threads": torch.get_num_threads(),
            "is_cuda_available": torch.cuda.is_available(),
            "number_of_cuda_devices": torch.cuda.device_count(),
        },
        "deployment_status": {},
    }
    if gpus := GPUtil.getGPUs():
        gpus_info = []
        for gpu in gpus:
            gpus_info.append(
                {
                    "GPU ID": gpu.id,
                    "GPU Name": gpu.name,
                    "GPU Load": f"{gpu.load * 100} %",
                    "GPU Memory Free": f"{gpu.memoryFree} MB",
                    "GPU Memory Used": f"{gpu.memoryUsed} MB",
                    "GPU Memory Total": f"{gpu.memoryTotal} MB",
                    "GPU Temperature": f"{gpu.temperature} °C",
                    "GPU Driver": f"{gpu.driver}",
                }
            )
        server_info["system"]["gpus_info"] = gpus_info

    if jsl_llm.llm:
        server_info["deployment_status"] = {
            "model": jsl_llm.llm_name,
            "status": Status.DEPLOYED.value,
            "device": jsl_llm.device,
            "max_context_length": jsl_llm.model_max_len,
            "deployed_at": jsl_llm.deployed_at,
        }
    elif jobs:
        for llm_name, _status in jobs.items():
            server_info["deployment_status"] = {"model": llm_name, "status": _status}
            if _status in BUSY_STATUS:
                if _status == Status.FINALIZING.value:
                    server_info["deployment_status"]["progress"] = "98 %"
                else:
                    server_info["deployment_status"]["progress"] = (
                        await jsl_llm.get_progress(llm_name)
                    )
            break

    return server_info


@route.get("/status", tags=["System Checks"])
async def server_status():
    return await current_server_info()


@route.get(
"/info", tags=["System Checks"], summary="Lists available JSL LLMs and their info"
)
async def available_llms_info():
await jsl_llm.setup_llms_info()
return {
llm_name: {"size": info.get("size")}
for llm_name, info in jsl_llm.llms_info.items()
}


def check_if_busy():
    if current_status := list(jobs.values()):
        if current_status[0] in BUSY_STATUS:
            raise HTTPException(
                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                detail=Status.JOB_IN_PROGRESS.value,
                headers={"Retry-After": "30s"},
            )
    return False


@route.post("/deploy", tags=["Load/Offload"])
async def deploy_llm(
    data: DeployRequest, bk_task: BackgroundTasks, is_busy=Depends(check_if_busy)
):
    llm_name = data.llm_name
    jobs[llm_name] = Status.STARTING.value
    args = dict(
        llm_name=llm_name,
        device_id=data.device,
        force=data.force,
        hf_token=data.hf_token,
    )
    bk_task.add_task(deployment_task, **args)
    return {"message": Status.DEPLOYMENT_STARTED.value}


@route.post("/offload", tags=["Load/Offload"])
async def offload_llm(is_busy=Depends(check_if_busy)):
    msg = jsl_llm.offload_model()
    jobs.clear()
    return {"message": msg}


@route.post("/generate", description="Generate prediction", tags=["Generate/Query"])
async def generate_results(data: GenerateRequest, is_busy=Depends(check_if_busy)):
    generation_started = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z")
    start_time = time.perf_counter()
    result = await jsl_llm.get_prediction(
        data.text, data.max_new_tokens, data.temperature
    )
    if isinstance(result, str):
        return {"message": result}
    output, output_tokens_len = result
    end_time = time.perf_counter()
    time_taken = end_time - start_time
    return {
        "message": output,
        "generation_started": generation_started,
        "time_taken": (
            f"{time_taken * 1000:.3f} ms" if time_taken < 2 else f"{time_taken:.2f} sec"
        ),
        "new_token_generated": output_tokens_len,
        "input": {
            "max_new_tokens": data.max_new_tokens,
            "temperature": data.temperature,
            "prompt_tokens": await jsl_llm.get_input_token_len(data.text),
        },
        "deployment": {
            "deployed_at": jsl_llm.deployed_at,
            "model_name": jsl_llm.llm_name,
            "device": jsl_llm.device,
        },
    }
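
Note: /deploy only schedules a background task, so a client normally polls /status until the model is loaded; while a job is in progress, /deploy, /offload and /generate respond with 503 and a Retry-After: 30s header. A rough sketch of such a loop (hypothetical; assumes the service on port 5000, and relies on "deployed_at" only appearing in the status payload once a model is loaded — failure handling omitted):

    curl -s -X POST http://localhost:5000/deploy \
        -H "Content-Type: application/json" \
        -d '{"llm_name": "jsl_meds_q4_v1"}'
    until curl -s http://localhost:5000/status | grep -q '"deployed_at"'; do
        sleep 30  # mirrors the Retry-After hint returned while a job is running
    done
    curl -s -X POST http://localhost:5000/generate \
        -H "Content-Type: application/json" \
        -d '{"text": "She is consuming lipitor and metformin daily.", "max_new_tokens": 128}'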