File tree: transformerlab/plugins/llama_cpp_server
4 files changed, +32 -3 lines

File 1 of 4: shell script that launches the API server

@@ -68,6 +68,13 @@
 # echo "✅ Uvicorn is installed."
 fi
 
+# Check if NVIDIA GPU is available and add necessary paths
+if command -v nvidia-smi &> /dev/null; then
+    echo "✅ NVIDIA GPU detected, adding CUDA libraries to path"
+    # Add common NVIDIA library paths
+    export LD_LIBRARY_PATH=${ENV_DIR}/lib:$LD_LIBRARY_PATH
+fi
+
 echo "▶️ Starting the API server:"
 if [ "$RELOAD" = true ]; then
     echo "🔁 Reload the server on file changes"
File 2 of 4: plugin manifest (JSON), version bump

@@ -4,7 +4,7 @@
   "description": "Runs llama-cpp-python server that can run GGUF models that work well on CPU only machines.",
   "plugin-format": "python",
   "type": "loader",
-  "version": "0.1.6",
+  "version": "0.1.8",
   "model_architectures": ["GGUF"],
   "supported_hardware_architectures": [
     "cpu",
File 3 of 4: Python server module, import reordering

@@ -11,6 +11,7 @@
 
 Right now only generate_stream works -- need to do work to make generate work
 """
+import torch
 
 import argparse
 import asyncio
@@ -20,7 +21,6 @@
 from typing import List
 
 import llama_cpp
-import torch
 import uvicorn
 from fastapi import BackgroundTasks, FastAPI, Request
 from fastapi.concurrency import run_in_threadpool
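This change moves import torch ahead of import llama_cpp. A plausible reason, stated here as an assumption since the PR does not explain it, is that importing torch first loads the CUDA libraries bundled with the torch wheel into the process, so a CUDA-built llama_cpp can resolve them even without a system-wide CUDA toolkit. A quick probe of that behaviour in the plugin's environment, assuming CUDA builds of both packages, might look like:

# Hypothetical check (not in the PR): mirror the new import order and confirm
# llama_cpp imports cleanly after torch has loaded its bundled CUDA libraries.
python -c "
import torch        # loads torch's bundled CUDA runtime first
import llama_cpp    # should import without missing-library errors
print('torch CUDA available:', torch.cuda.is_available())
print('llama_cpp imported from:', llama_cpp.__file__)
"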
File 4 of 4: installation script, backend selection by platform

@@ -4,4 +4,26 @@
 # So we will install llama-cpp-python only and implement our
 # own server using FastAPI
 
-uv pip install llama-cpp-python==0.2.79 --upgrade --force-reinstall --no-cache-dir
+echo "Setting up llama-cpp-python..."
+
+# Detect OS
+if [[ "$(uname)" == "Darwin" ]]; then
+    # macOS - check for Metal support
+    if [[ "$(uname -m)" == "arm64" || "$(sysctl -n machdep.cpu.brand_string)" == *"Apple"* ]]; then
+        echo "Detected Mac with Apple Silicon - installing with Metal support"
+        CMAKE_ARGS="-DGGML_METAL=on" uv pip install llama-cpp-python --upgrade --force-reinstall --no-cache-dir
+    else
+        echo "Detected Mac with Intel CPU - installing with OpenBLAS support"
+        CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" uv pip install llama-cpp-python --upgrade --force-reinstall --no-cache-dir
+    fi
+elif command -v nvidia-smi &> /dev/null; then
+    # Linux/Other with CUDA detected
+    echo "CUDA GPU detected. Installing based on CUDA setup using GGML CUDA"
+    CMAKE_ARGS="-DGGML_CUDA=on" FORCE_CMAKE=1 uv pip install llama-cpp-python --force-reinstall --no-cache-dir
+else
+    # Linux/Other without CUDA - try using OpenBLAS
+    echo "No GPU detected - installing with OpenBLAS support"
+    CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" uv pip install llama-cpp-python --upgrade --no-cache-dir
+fi
+
+echo "llama-cpp-python installation complete."