Skip to content
This repository was archived by the owner on May 15, 2025. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions deploy/components/inference-gateway/deployments.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,10 @@ spec:
image: quay.io/vllm-d/gateway-api-inference-extension/epp:latest
imagePullPolicy: IfNotPresent
args:
- -refreshMetricsInterval
- "500ms"
- -poolName
- "vllm-llama3-8b-instruct"
- -v
- "4"
- "5"
- --zap-encoder
- "json"
- -grpcPort
Expand Down
2 changes: 2 additions & 0 deletions deploy/components/inference-gateway/httproutes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ spec:
kind: InferencePool
name: vllm-llama3-8b-instruct
port: 8000
timeouts:
request: 30s
3 changes: 3 additions & 0 deletions deploy/components/inference-gateway/inference-models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ spec:
criticality: Critical
poolRef:
name: vllm-llama3-8b-instruct
targetModels:
- name: food-review-1
weight: 100
1 change: 1 addition & 0 deletions deploy/components/inference-gateway/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ resources:
- destination-rules.yaml
- inference-pools.yaml
- inference-models.yaml
- services.yaml
- deployments.yaml
- gateways.yaml
- httproutes.yaml
Expand Down
9 changes: 1 addition & 8 deletions deploy/components/inference-gateway/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ rules:
- apiGroups:
- "inference.networking.x-k8s.io"
resources:
- "inferencepools"
- "inferencemodels"
verbs:
- "get"
Expand All @@ -19,14 +20,6 @@ rules:
- "get"
- "watch"
- "list"
- apiGroups:
- "inference.networking.x-k8s.io"
resources:
- "inferencepools"
verbs:
- "get"
- "watch"
- "list"
- apiGroups:
- "discovery.k8s.io"
resources:
Expand Down
13 changes: 13 additions & 0 deletions deploy/components/inference-gateway/services.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
apiVersion: v1
kind: Service
metadata:
name: endpoint-picker
spec:
selector:
app: endpoint-picker
ports:
- protocol: TCP
port: 9002
targetPort: 9002
appProtocol: http2
type: ClusterIP
1 change: 1 addition & 0 deletions deploy/components/istio-control-plane/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ resources:
- webhooks.yaml
- deployments.yaml
- hpa.yaml
- telemetry.yaml
10 changes: 10 additions & 0 deletions deploy/components/istio-control-plane/telemetry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Enables debug logging for Gateways
apiVersion: telemetry.istio.io/v1
kind: Telemetry
metadata:
name: mesh-default
namespace: istio-system
spec:
accessLogging:
- providers:
- name: envoy
9 changes: 6 additions & 3 deletions deploy/components/vllm-sim/deployments.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ spec:
args:
- "--port=8000"
- "--model=food-review"
# - "--lora=lora10,lora20,lora30"
# - "--time-to-first-token=500"
ports:
- containerPort: 8000
- name: http
containerPort: 8000
protocol: TCP
env:
- name: PORT
value: "8000"
3 changes: 2 additions & 1 deletion deploy/environments/dev/kind/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ resources:
- ../../../components/inference-gateway/

patches:
- path: gateway.yaml
- path: patch-deployments.yaml
- path: patch-gateways.yaml
22 changes: 22 additions & 0 deletions deploy/environments/dev/kind/patch-deployments.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: endpoint-picker
spec:
template:
spec:
containers:
- name: epp
args:
- -poolName
- "vllm-llama3-8b-instruct"
- -poolNamespace
- "default"
- -v
- "5"
- --zap-encoder
- "json"
- -grpcPort
- "9002"
- -grpcHealthPort
- "9003"
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ kind: Gateway
metadata:
name: inference-gateway
annotations:
networking.istio.io/service-type: NodePort
networking.istio.io/service-type: NodePort
8 changes: 3 additions & 5 deletions deploy/environments/dev/kubernetes/patch-deployments.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,12 @@ spec:
containers:
- name: epp
args:
- -poolNamespace
- ${NAMESPACE}
- -refreshMetricsInterval
- "500ms"
- -poolName
- "vllm-llama3-8b-instruct"
- -poolNamespace
- ${NAMESPACE}
- -v
- "4"
- "5"
- --zap-encoder
- "json"
- -grpcPort
Expand Down
5 changes: 5 additions & 0 deletions pkg/epp/handlers/request.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ func (s *StreamingServer) HandleRequestBody(

func (s *StreamingServer) HandleRequestHeaders(ctx context.Context, reqCtx *RequestContext, req *extProcPb.ProcessingRequest_RequestHeaders) error {
reqCtx.RequestReceivedTimestamp = time.Now()
logger := log.FromContext(ctx)
logger.V(logutil.TRACE).Info("Headers Handler", "request", req)

for _, header := range req.RequestHeaders.Headers.GetHeaders() {
value := string(header.RawValue)
Expand All @@ -159,5 +161,8 @@ func (s *StreamingServer) HandleRequestHeaders(ctx context.Context, reqCtx *Requ
endpoint := pod.Address + ":" + strconv.Itoa(int(pool.Spec.TargetPortNumber))
s.populateRequestHeaderResponse(reqCtx, endpoint, 0)
}

logger.V(logutil.TRACE).Info("Headers Handler", "handler", "complete")

return nil
}
3 changes: 3 additions & 0 deletions pkg/epp/handlers/response.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ func (s *StreamingServer) HandleResponseBody(
},
},
}

logger.V(logutil.TRACE).Info("Handle Response Body", "reqCtx", reqCtx)

return reqCtx, nil
}

Expand Down
9 changes: 9 additions & 0 deletions pkg/epp/handlers/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
// Order of requests matter in FULL_DUPLEX_STREAMING. For both request and response, the order of response sent back MUST be: Header->Body->Trailer, with trailer being optional.
func (r *RequestContext) updateStateAndSendIfNeeded(srv extProcPb.ExternalProcessor_ProcessServer, logger logr.Logger) error {
loggerTrace := logger.V(logutil.TRACE)
loggerTrace.Info("updateStateAndSendIfNeeded", "step", "started")
// No switch statement as we could send multiple responses in one pass.
if r.RequestState == RequestReceived && r.reqHeaderResp != nil {
loggerTrace.Info("Sending request header response", "obj", r.reqHeaderResp)
Expand All @@ -334,6 +335,7 @@ func (r *RequestContext) updateStateAndSendIfNeeded(srv extProcPb.ExternalProces
if r.RequestState == HeaderRequestResponseComplete && r.reqBodyResp != nil {
loggerTrace.Info("Sending request body response")
if err := srv.Send(r.reqBodyResp); err != nil {
loggerTrace.Info("failed to send response back to Envoy", "err", err)
return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
}
r.RequestState = BodyRequestResponsesComplete
Expand All @@ -345,15 +347,18 @@ func (r *RequestContext) updateStateAndSendIfNeeded(srv extProcPb.ExternalProces
if r.RequestState == BodyRequestResponsesComplete && r.reqTrailerResp != nil {
// Trailers in requests are not guaranteed
if err := srv.Send(r.reqTrailerResp); err != nil {
loggerTrace.Info("failed to send response back to Envoy", "err", err)
return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
}
loggerTrace.Info("sent reqTrailerResp back to Envoy", "reqTrailerResp", r.reqTrailerResp)
}
if r.RequestState == ResponseRecieved && r.respHeaderResp != nil {
loggerTrace.Info("Sending response header response", "obj", r.respHeaderResp)
if err := srv.Send(r.respHeaderResp); err != nil {
return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
}
r.RequestState = HeaderResponseResponseComplete
loggerTrace.Info("send respHeaderResp", "reqHeaderResp", r.reqHeaderResp)
}
if r.RequestState == HeaderResponseResponseComplete && r.respBodyResp != nil {
loggerTrace.Info("Sending response body response")
Expand All @@ -365,15 +370,19 @@ func (r *RequestContext) updateStateAndSendIfNeeded(srv extProcPb.ExternalProces
if body.ResponseBody.Response.GetBodyMutation().GetStreamedResponse().GetEndOfStream() {
r.RequestState = BodyResponseResponsesComplete
}
loggerTrace.Info("dumping the response so a new stream message can begin")
// Dump the response so a new stream message can begin
r.respBodyResp = nil
}
if r.RequestState == BodyResponseResponsesComplete && r.respTrailerResp != nil {
// Trailers in requests are not guaranteed
if err := srv.Send(r.respTrailerResp); err != nil {
loggerTrace.Info("failed to send response back to Envoy", "err", err)
return status.Errorf(codes.Unknown, "failed to send response back to Envoy: %v", err)
}
loggerTrace.Info("sent respTrailerResp", "respTrailerResp", r.respTrailerResp)
}
loggerTrace.Info("updateStateAndSendIfNeeded", "step", "complete")
return nil
}

Expand Down