Skip to content

Commit a136572

Browse files
committed
add llm-katan to k8s
Signed-off-by: JaredforReal <[email protected]>
1 parent 2cde570 commit a136572

File tree

4 files changed

+226
-38
lines changed

4 files changed

+226
-38
lines changed

deploy/kubernetes/README.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
This directory contains Kubernetes manifests for deploying the Semantic Router using Kustomize.
44

5+
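By default, the base kustomization deploys a Pod with an `llm-katan` sidecar so that the default config (model `qwen3` served at 127.0.0.1:8002) works out of the box. If you prefer to run without the sidecar, replace `deployment.with-llm-katan.yaml` with `deployment.yaml` in the `resources` list of `kustomization.yaml`, and update `vllm_endpoints` in `config.yaml` to point at your own backend, since nothing will be listening on 127.0.0.1:8002 in that case.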
6+
57
## Architecture
68

79
The deployment consists of:
@@ -318,6 +320,7 @@ Edit the `resources` section in `deployment.yaml` accordingly.
318320
### Kubernetes Manifests (`deploy/kubernetes/`)
319321

320322
- `deployment.yaml` - Main application deployment with optimized resource settings
323+
- `deployment.with-llm-katan.yaml` - Deployment variant that adds an `llm-katan` sidecar listening on port 8002; this is the variant referenced by the default `kustomization.yaml`, and it matches the default config (model `qwen3` at 127.0.0.1:8002)
321324
- `service.yaml` - Services for gRPC, HTTP API, and metrics
322325
- `pvc.yaml` - Persistent volume claim for model storage
323326
- `namespace.yaml` - Dedicated namespace for the application
@@ -327,6 +330,21 @@ Edit the `resources` section in `deployment.yaml` accordingly.
327330

328331
### Development Tools
329332

333+
## Optional: run with llm-katan sidecar
334+
335+
To mimic the docker-compose default setup, you can deploy a variant that runs an `llm-katan` sidecar inside the same Pod. The provided `deployment.with-llm-katan.yaml` exposes llm-katan on `0.0.0.0:8002` and serves the model name `qwen3`.
336+
337+
Notes:
338+
339+
- Ensure the Qwen model content is available at `/app/models/Qwen/Qwen3-0.6B` in the PVC. You can pre-populate the PV or customize the init container to fetch from an internal source.
340+
- The default Kubernetes `config.yaml` has been aligned to use `qwen3` and endpoint `127.0.0.1:8002`, so it will work out-of-the-box with this sidecar.
341+
342+
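The base kustomization already includes this sidecar variant. To apply only the sidecar Deployment manifest directly, make sure the namespace, the `semantic-router-models` PVC, and the `semantic-router-config` ConfigMap it references already exist, then run: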
343+
344+
```bash
345+
kubectl apply -n vllm-semantic-router-system -f deploy/kubernetes/deployment.with-llm-katan.yaml
346+
```
347+
330348
- `tools/kind/kind-config.yaml` - Kind cluster configuration for local development
331349
- `tools/make/kube.mk` - Make targets for Kubernetes operations
332350
- `Makefile` - Root makefile including all make targets

deploy/kubernetes/config.yaml

Lines changed: 29 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
bert_model:
2-
model_id: sentence-transformers/all-MiniLM-L12-v2
2+
model_id: models/all-MiniLM-L12-v2
33
threshold: 0.6
44
use_cpu: true
55

66
semantic_cache:
77
enabled: true
8-
backend_type: "memory" # Options: "memory" or "milvus"
8+
backend_type: "memory" # Options: "memory" or "milvus"
99
similarity_threshold: 0.8
10-
max_entries: 1000 # Only applies to memory backend
10+
max_entries: 1000 # Only applies to memory backend
1111
ttl_seconds: 3600
12-
eviction_policy: "fifo"
12+
eviction_policy: "fifo"
1313

1414
tools:
1515
enabled: true
@@ -32,13 +32,13 @@ prompt_guard:
3232
# NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field)
3333
vllm_endpoints:
3434
- name: "endpoint1"
35-
address: "127.0.0.1" # IPv4 address - REQUIRED format
36-
port: 8000
35+
address: "127.0.0.1" # llm-katan sidecar or local backend
36+
port: 8002
3737
weight: 1
3838

3939
model_config:
40-
"openai/gpt-oss-20b":
41-
reasoning_family: "gpt-oss" # This model uses GPT-OSS reasoning syntax
40+
"qwen3":
41+
reasoning_family: "qwen3" # Match docker-compose default model name
4242
preferred_endpoints: ["endpoint1"]
4343
pii_policy:
4444
allow_by_default: true
@@ -62,76 +62,76 @@ classifier:
6262
categories:
6363
- name: business
6464
model_scores:
65-
- model: openai/gpt-oss-20b
65+
- model: qwen3
6666
score: 0.7
67-
use_reasoning: false # Business performs better without reasoning
67+
use_reasoning: false # Business performs better without reasoning
6868
- name: law
6969
model_scores:
70-
- model: openai/gpt-oss-20b
70+
- model: qwen3
7171
score: 0.4
7272
use_reasoning: false
7373
- name: psychology
7474
model_scores:
75-
- model: openai/gpt-oss-20b
75+
- model: qwen3
7676
score: 0.6
7777
use_reasoning: false
7878
- name: biology
7979
model_scores:
80-
- model: openai/gpt-oss-20b
80+
- model: qwen3
8181
score: 0.9
8282
use_reasoning: false
8383
- name: chemistry
8484
model_scores:
85-
- model: openai/gpt-oss-20b
85+
- model: qwen3
8686
score: 0.6
87-
use_reasoning: true # Enable reasoning for complex chemistry
87+
use_reasoning: true # Enable reasoning for complex chemistry
8888
- name: history
8989
model_scores:
90-
- model: openai/gpt-oss-20b
90+
- model: qwen3
9191
score: 0.7
9292
use_reasoning: false
9393
- name: other
9494
model_scores:
95-
- model: openai/gpt-oss-20b
95+
- model: qwen3
9696
score: 0.7
9797
use_reasoning: false
9898
- name: health
9999
model_scores:
100-
- model: openai/gpt-oss-20b
100+
- model: qwen3
101101
score: 0.5
102102
use_reasoning: false
103103
- name: economics
104104
model_scores:
105-
- model: openai/gpt-oss-20b
105+
- model: qwen3
106106
score: 1.0
107107
use_reasoning: false
108108
- name: math
109109
model_scores:
110-
- model: openai/gpt-oss-20b
110+
- model: qwen3
111111
score: 1.0
112-
use_reasoning: true # Enable reasoning for complex math
112+
use_reasoning: true # Enable reasoning for complex math
113113
- name: physics
114114
model_scores:
115-
- model: openai/gpt-oss-20b
115+
- model: qwen3
116116
score: 0.7
117-
use_reasoning: true # Enable reasoning for physics
117+
use_reasoning: true # Enable reasoning for physics
118118
- name: computer science
119119
model_scores:
120-
- model: openai/gpt-oss-20b
120+
- model: qwen3
121121
score: 0.6
122122
use_reasoning: false
123123
- name: philosophy
124124
model_scores:
125-
- model: openai/gpt-oss-20b
125+
- model: qwen3
126126
score: 0.5
127127
use_reasoning: false
128128
- name: engineering
129129
model_scores:
130-
- model: openai/gpt-oss-20b
130+
- model: qwen3
131131
score: 0.7
132132
use_reasoning: false
133133

134-
default_model: openai/gpt-oss-20b
134+
default_model: qwen3
135135

136136
# Reasoning family configurations
137137
reasoning_families:
@@ -164,5 +164,6 @@ api:
164164
detailed_goroutine_tracking: true
165165
high_resolution_timing: false
166166
sample_rate: 1.0
167-
duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
167+
duration_buckets:
168+
[0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
168169
size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
apiVersion: apps/v1
2+
kind: Deployment
3+
metadata:
4+
name: semantic-router
5+
namespace: vllm-semantic-router-system
6+
labels:
7+
app: semantic-router
8+
spec:
9+
replicas: 1
10+
selector:
11+
matchLabels:
12+
app: semantic-router
13+
template:
14+
metadata:
15+
labels:
16+
app: semantic-router
17+
spec:
18+
initContainers:
19+
- name: model-downloader
20+
image: python:3.11-slim
21+
securityContext:
22+
runAsNonRoot: false
23+
allowPrivilegeEscalation: false
24+
command: ["/bin/bash", "-c"]
25+
args:
26+
- |
27+
set -e
28+
echo "Installing Hugging Face CLI..."
29+
pip install --no-cache-dir huggingface_hub[cli]
30+
31+
echo "Downloading classifier models to persistent volume..."
32+
cd /app/models
33+
34+
# Download category classifier model
35+
if [ ! -d "category_classifier_modernbert-base_model" ]; then
36+
echo "Downloading category classifier model..."
37+
huggingface-cli download LLM-Semantic-Router/category_classifier_modernbert-base_model --local-dir category_classifier_modernbert-base_model
38+
else
39+
echo "Category classifier model already exists, skipping..."
40+
fi
41+
42+
# Download PII classifier model
43+
if [ ! -d "pii_classifier_modernbert-base_model" ]; then
44+
echo "Downloading PII classifier model..."
45+
huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_model --local-dir pii_classifier_modernbert-base_model
46+
else
47+
echo "PII classifier model already exists, skipping..."
48+
fi
49+
50+
# Download jailbreak classifier model
51+
if [ ! -d "jailbreak_classifier_modernbert-base_model" ]; then
52+
echo "Downloading jailbreak classifier model..."
53+
huggingface-cli download LLM-Semantic-Router/jailbreak_classifier_modernbert-base_model --local-dir jailbreak_classifier_modernbert-base_model
54+
else
55+
echo "Jailbreak classifier model already exists, skipping..."
56+
fi
57+
58+
# Download PII token classifier model
59+
if [ ! -d "pii_classifier_modernbert-base_presidio_token_model" ]; then
60+
echo "Downloading PII token classifier model..."
61+
huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_presidio_token_model --local-dir pii_classifier_modernbert-base_presidio_token_model
62+
else
63+
echo "PII token classifier model already exists, skipping..."
64+
fi
65+
66+
# Optional: Prepare Qwen model directory for llm-katan sidecar
67+
# NOTE: Provide the model content under /app/models/Qwen/Qwen3-0.6B via pre-populated PV
68+
# or customize the following block to fetch from your internal artifact store.
69+
if [ ! -d "Qwen/Qwen3-0.6B" ]; then
70+
echo "Qwen3-0.6B directory not found. Please pre-populate /app/models/Qwen/Qwen3-0.6B in the PVC or customize init script to download it."
71+
fi
72+
73+
echo "Model directory listing:" && ls -la /app/models/
74+
env:
75+
- name: HF_HUB_CACHE
76+
value: /tmp/hf_cache
77+
resources:
78+
requests:
79+
memory: "512Mi"
80+
cpu: "250m"
81+
limits:
82+
memory: "1Gi"
83+
cpu: "500m"
84+
volumeMounts:
85+
- name: models-volume
86+
mountPath: /app/models
87+
containers:
88+
- name: semantic-router
89+
image: ghcr.io/vllm-project/semantic-router/extproc:latest
90+
args: ["--secure=true"]
91+
securityContext:
92+
runAsNonRoot: false
93+
allowPrivilegeEscalation: false
94+
ports:
95+
- containerPort: 50051
96+
name: grpc
97+
protocol: TCP
98+
- containerPort: 9190
99+
name: metrics
100+
protocol: TCP
101+
- containerPort: 8080
102+
name: classify-api
103+
protocol: TCP
104+
env:
105+
- name: LD_LIBRARY_PATH
106+
value: "/app/lib"
107+
volumeMounts:
108+
- name: config-volume
109+
mountPath: /app/config
110+
readOnly: true
111+
- name: models-volume
112+
mountPath: /app/models
113+
livenessProbe:
114+
tcpSocket:
115+
port: 50051
116+
initialDelaySeconds: 60
117+
periodSeconds: 30
118+
timeoutSeconds: 10
119+
failureThreshold: 3
120+
readinessProbe:
121+
tcpSocket:
122+
port: 50051
123+
initialDelaySeconds: 90
124+
periodSeconds: 30
125+
timeoutSeconds: 10
126+
failureThreshold: 3
127+
resources:
128+
requests:
129+
memory: "3Gi"
130+
cpu: "1"
131+
limits:
132+
memory: "6Gi"
133+
cpu: "2"
134+
- name: llm-katan
135+
image: ghcr.io/vllm-project/semantic-router/llm-katan:latest
136+
imagePullPolicy: IfNotPresent
137+
args:
138+
[
139+
"llm-katan",
140+
"--model",
141+
"/app/models/Qwen/Qwen3-0.6B",
142+
"--served-model-name",
143+
"qwen3",
144+
"--host",
145+
"0.0.0.0",
146+
"--port",
147+
"8002",
148+
]
149+
ports:
150+
- containerPort: 8002
151+
name: katan
152+
protocol: TCP
153+
volumeMounts:
154+
- name: models-volume
155+
mountPath: /app/models
156+
resources:
157+
requests:
158+
memory: "1Gi"
159+
cpu: "500m"
160+
limits:
161+
memory: "2Gi"
162+
cpu: "1"
163+
volumes:
164+
- name: config-volume
165+
configMap:
166+
name: semantic-router-config
167+
- name: models-volume
168+
persistentVolumeClaim:
169+
claimName: semantic-router-models

deploy/kubernetes/kustomization.yaml

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,21 @@ metadata:
55
name: semantic-router
66

77
resources:
8-
- namespace.yaml
9-
- pvc.yaml
10-
- deployment.yaml
11-
- service.yaml
8+
- namespace.yaml
9+
- pvc.yaml
10+
- deployment.with-llm-katan.yaml
11+
- service.yaml
1212

1313
# Generate ConfigMap
1414
configMapGenerator:
15-
- name: semantic-router-config
16-
files:
17-
- config.yaml
18-
- tools_db.json
15+
- name: semantic-router-config
16+
files:
17+
- config.yaml
18+
- tools_db.json
1919

2020
# Namespace for all resources
2121
namespace: vllm-semantic-router-system
2222

2323
images:
24-
- name: ghcr.io/vllm-project/semantic-router/extproc
25-
newTag: latest
24+
- name: ghcr.io/vllm-project/semantic-router/extproc
25+
newTag: latest

0 commit comments

Comments
 (0)