add llm-katan to k8s

JaredforReal · JaredforReal · commit a136572976a1 · 2025-10-17T22:11:52.000+08:00
Signed-off-by: JaredforReal <w13431838023@gmail.com>
diff --git a/deploy/kubernetes/README.md b/deploy/kubernetes/README.md
@@ -2,6 +2,8 @@
 
 This directory contains Kubernetes manifests for deploying the Semantic Router using Kustomize.
 
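+By default, the base kustomization deploys a Pod with an `llm-katan` sidecar so that the default config (model `qwen3` at `127.0.0.1:8002`) works without a separate LLM backend, provided the Qwen3-0.6B model files are already present in the models PVC (the init container does not download them). To run without the sidecar, replace `deployment.with-llm-katan.yaml` with `deployment.yaml` in `kustomization.yaml` and point `vllm_endpoints` in `config.yaml` at your own backend.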
+
 ## Architecture
 
 The deployment consists of:
@@ -318,6 +320,7 @@ Edit the `resources` section in `deployment.yaml` accordingly.
 ### Kubernetes Manifests (`deploy/kubernetes/`)
 
 - `deployment.yaml` - Main application deployment with optimized resource settings
+- `deployment.with-llm-katan.yaml` - Variant referenced by the base `kustomization.yaml`; adds an llm-katan sidecar listening on port 8002 (matches the default config, which routes `qwen3` to 127.0.0.1:8002)
 - `service.yaml` - Services for gRPC, HTTP API, and metrics
 - `pvc.yaml` - Persistent volume claim for model storage
 - `namespace.yaml` - Dedicated namespace for the application
@@ -327,6 +330,21 @@ Edit the `resources` section in `deployment.yaml` accordingly.
 
 ### Development Tools
 
+## Optional: run with llm-katan sidecar
+
+To mimic the docker-compose default setup, you can deploy a variant that runs an `llm-katan` sidecar inside the same Pod. The provided `deployment.with-llm-katan.yaml` exposes llm-katan on `0.0.0.0:8002` and serves the model name `qwen3`.
+
+Notes:
+
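+- Ensure the Qwen model content is available under `Qwen/Qwen3-0.6B` in the `semantic-router-models` PVC. The volume is mounted at `/app/models`, so the sidecar reads it from `/app/models/Qwen/Qwen3-0.6B`. The init container only prints a warning when this directory is missing; it does not download the model. Pre-populate the PV or customize the init container to fetch the model from an internal source.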
+- The default Kubernetes `config.yaml` has been aligned to use `qwen3` and endpoint `127.0.0.1:8002`, so it will work out-of-the-box with this sidecar.
+
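+The base kustomization already includes this variant. To apply only the sidecar Deployment manifest (the `vllm-semantic-router-system` namespace, the `semantic-router-models` PVC, and a ConfigMap named `semantic-router-config` must already exist):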
+
+```bash
+kubectl apply -n vllm-semantic-router-system -f deploy/kubernetes/deployment.with-llm-katan.yaml
+```
+
 - `tools/kind/kind-config.yaml` - Kind cluster configuration for local development
 - `tools/make/kube.mk` - Make targets for Kubernetes operations
 - `Makefile` - Root makefile including all make targets
diff --git a/deploy/kubernetes/config.yaml b/deploy/kubernetes/config.yaml
@@ -1,15 +1,15 @@
 bert_model:
-  model_id: sentence-transformers/all-MiniLM-L12-v2
+  model_id: models/all-MiniLM-L12-v2
   threshold: 0.6
   use_cpu: true
 
 semantic_cache:
   enabled: true
-  backend_type: "memory"  # Options: "memory" or "milvus"
+  backend_type: "memory" # Options: "memory" or "milvus"
   similarity_threshold: 0.8
-  max_entries: 1000  # Only applies to memory backend
+  max_entries: 1000 # Only applies to memory backend
   ttl_seconds: 3600
-  eviction_policy: "fifo"  
+  eviction_policy: "fifo"
 
 tools:
   enabled: true
@@ -32,13 +32,13 @@ prompt_guard:
 # NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field)
 vllm_endpoints:
   - name: "endpoint1"
-    address: "127.0.0.1"  # IPv4 address - REQUIRED format
-    port: 8000
+    address: "127.0.0.1" # llm-katan sidecar or local backend
+    port: 8002
     weight: 1
 
 model_config:
-  "openai/gpt-oss-20b":
-    reasoning_family: "gpt-oss"  # This model uses GPT-OSS reasoning syntax
+  "qwen3":
+    reasoning_family: "qwen3" # Match docker-compose default model name
     preferred_endpoints: ["endpoint1"]
     pii_policy:
       allow_by_default: true
@@ -62,76 +62,76 @@ classifier:
 categories:
   - name: business
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.7
-        use_reasoning: false  # Business performs better without reasoning
+        use_reasoning: false # Business performs better without reasoning
   - name: law
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.4
         use_reasoning: false
   - name: psychology
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.6
         use_reasoning: false
   - name: biology
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.9
         use_reasoning: false
   - name: chemistry
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.6
-        use_reasoning: true  # Enable reasoning for complex chemistry
+        use_reasoning: true # Enable reasoning for complex chemistry
   - name: history
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.7
         use_reasoning: false
   - name: other
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.7
         use_reasoning: false
   - name: health
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.5
         use_reasoning: false
   - name: economics
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 1.0
         use_reasoning: false
   - name: math
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 1.0
-        use_reasoning: true  # Enable reasoning for complex math
+        use_reasoning: true # Enable reasoning for complex math
   - name: physics
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.7
-        use_reasoning: true  # Enable reasoning for physics
+        use_reasoning: true # Enable reasoning for physics
   - name: computer science
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.6
         use_reasoning: false
   - name: philosophy
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.5
         use_reasoning: false
   - name: engineering
     model_scores:
-      - model: openai/gpt-oss-20b
+      - model: qwen3
         score: 0.7
         use_reasoning: false
 
-default_model: openai/gpt-oss-20b
+default_model: qwen3
 
 # Reasoning family configurations
 reasoning_families:
@@ -164,5 +164,6 @@ api:
       detailed_goroutine_tracking: true
       high_resolution_timing: false
       sample_rate: 1.0
-      duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
+      duration_buckets:
+        [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
       size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]
diff --git a/deploy/kubernetes/deployment.with-llm-katan.yaml b/deploy/kubernetes/deployment.with-llm-katan.yaml
@@ -0,0 +1,169 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: semantic-router
+  namespace: vllm-semantic-router-system
+  labels:
+    app: semantic-router
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: semantic-router
+  template:
+    metadata:
+      labels:
+        app: semantic-router
+    spec:
+      initContainers:
+        - name: model-downloader
+          image: python:3.11-slim
+          securityContext:
+            runAsNonRoot: false
+            allowPrivilegeEscalation: false
+          command: ["/bin/bash", "-c"]
+          args:
+            - |
+              set -e
+              echo "Installing Hugging Face CLI..."
+              pip install --no-cache-dir huggingface_hub[cli]
+
+              echo "Downloading classifier models to persistent volume..."
+              cd /app/models
+
+              # Download category classifier model
+              if [ ! -d "category_classifier_modernbert-base_model" ]; then
+                echo "Downloading category classifier model..."
+                huggingface-cli download LLM-Semantic-Router/category_classifier_modernbert-base_model --local-dir category_classifier_modernbert-base_model
+              else
+                echo "Category classifier model already exists, skipping..."
+              fi
+
+              # Download PII classifier model
+              if [ ! -d "pii_classifier_modernbert-base_model" ]; then
+                echo "Downloading PII classifier model..."
+                huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_model --local-dir pii_classifier_modernbert-base_model
+              else
+                echo "PII classifier model already exists, skipping..."
+              fi
+
+              # Download jailbreak classifier model
+              if [ ! -d "jailbreak_classifier_modernbert-base_model" ]; then
+                echo "Downloading jailbreak classifier model..."
+                huggingface-cli download LLM-Semantic-Router/jailbreak_classifier_modernbert-base_model --local-dir jailbreak_classifier_modernbert-base_model
+              else
+                echo "Jailbreak classifier model already exists, skipping..."
+              fi
+
+              # Download PII token classifier model
+              if [ ! -d "pii_classifier_modernbert-base_presidio_token_model" ]; then
+                echo "Downloading PII token classifier model..."
+                huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_presidio_token_model --local-dir pii_classifier_modernbert-base_presidio_token_model
+              else
+                echo "PII token classifier model already exists, skipping..."
+              fi
+
+              # Optional: Prepare Qwen model directory for llm-katan sidecar
+              # NOTE: Provide the model content under /app/models/Qwen/Qwen3-0.6B via pre-populated PV
+              # or customize the following block to fetch from your internal artifact store.
+              if [ ! -d "Qwen/Qwen3-0.6B" ]; then
+                echo "Qwen3-0.6B directory not found. Please pre-populate /app/models/Qwen/Qwen3-0.6B in the PVC or customize init script to download it."
+              fi
+
+              echo "Model directory listing:" && ls -la /app/models/
+          env:
+            - name: HF_HUB_CACHE
+              value: /tmp/hf_cache
+          resources:
+            requests:
+              memory: "512Mi"
+              cpu: "250m"
+            limits:
+              memory: "1Gi"
+              cpu: "500m"
+          volumeMounts:
+            - name: models-volume
+              mountPath: /app/models
+      containers:
+        - name: semantic-router
+          image: ghcr.io/vllm-project/semantic-router/extproc:latest
+          args: ["--secure=true"]
+          securityContext:
+            runAsNonRoot: false
+            allowPrivilegeEscalation: false
+          ports:
+            - containerPort: 50051
+              name: grpc
+              protocol: TCP
+            - containerPort: 9190
+              name: metrics
+              protocol: TCP
+            - containerPort: 8080
+              name: classify-api
+              protocol: TCP
+          env:
+            - name: LD_LIBRARY_PATH
+              value: "/app/lib"
+          volumeMounts:
+            - name: config-volume
+              mountPath: /app/config
+              readOnly: true
+            - name: models-volume
+              mountPath: /app/models
+          livenessProbe:
+            tcpSocket:
+              port: 50051
+            initialDelaySeconds: 60
+            periodSeconds: 30
+            timeoutSeconds: 10
+            failureThreshold: 3
+          readinessProbe:
+            tcpSocket:
+              port: 50051
+            initialDelaySeconds: 90
+            periodSeconds: 30
+            timeoutSeconds: 10
+            failureThreshold: 3
+          resources:
+            requests:
+              memory: "3Gi"
+              cpu: "1"
+            limits:
+              memory: "6Gi"
+              cpu: "2"
+        - name: llm-katan
+          image: ghcr.io/vllm-project/semantic-router/llm-katan:latest
+          imagePullPolicy: IfNotPresent
+          args:
+            [
+              "llm-katan",
+              "--model",
+              "/app/models/Qwen/Qwen3-0.6B",
+              "--served-model-name",
+              "qwen3",
+              "--host",
+              "0.0.0.0",
+              "--port",
+              "8002",
+            ]
+          ports:
+            - containerPort: 8002
+              name: katan
+              protocol: TCP
+          volumeMounts:
+            - name: models-volume
+              mountPath: /app/models
+          resources:
+            requests:
+              memory: "1Gi"
+              cpu: "500m"
+            limits:
+              memory: "2Gi"
+              cpu: "1"
+      volumes:
+        - name: config-volume
+          configMap:
+            name: semantic-router-config
+        - name: models-volume
+          persistentVolumeClaim:
+            claimName: semantic-router-models
diff --git a/deploy/kubernetes/kustomization.yaml b/deploy/kubernetes/kustomization.yaml
@@ -5,21 +5,21 @@ metadata:
   name: semantic-router
 
 resources:
-- namespace.yaml
-- pvc.yaml
-- deployment.yaml
-- service.yaml
+  - namespace.yaml
+  - pvc.yaml
+  - deployment.with-llm-katan.yaml
+  - service.yaml
 
 # Generate ConfigMap
 configMapGenerator:
-- name: semantic-router-config
-  files:
-  - config.yaml
-  - tools_db.json
+  - name: semantic-router-config
+    files:
+      - config.yaml
+      - tools_db.json
 
 # Namespace for all resources
 namespace: vllm-semantic-router-system
 
 images:
-- name: ghcr.io/vllm-project/semantic-router/extproc
-  newTag: latest
+  - name: ghcr.io/vllm-project/semantic-router/extproc
+    newTag: latest