Skip to content

Commit 268ba06

Browse files
committed
untested
Signed-off-by: Grant Linville <[email protected]>
1 parent bdca12e commit 268ba06

File tree

3 files changed

+177
-32
lines changed

3 files changed

+177
-32
lines changed

pkg/mcp/backend.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,12 @@ func (e *ErrNotSupportedByBackend) Error() string {
3333
}
3434

3535
// Sentinel errors returned while waiting for an MCP server's backing
// workload to become healthy. Callers compare against these (see
// findSpecialError in mcp.go) to produce user-facing messages.
var (
	// ErrHealthCheckTimeout indicates the server never became ready in time.
	ErrHealthCheckTimeout = errors.New("timed out waiting for MCP server to be ready")
	// ErrHealthCheckFailed indicates the server responded but reported unhealthy.
	ErrHealthCheckFailed = errors.New("MCP server is not healthy")
	// ErrPodCrashLoopBackOff indicates the pod's container keeps crashing on start.
	ErrPodCrashLoopBackOff = errors.New("pod is in CrashLoopBackOff state")
	// ErrImagePullFailed indicates the container image could not be pulled.
	ErrImagePullFailed = errors.New("failed to pull container image")
	// ErrPodSchedulingFailed indicates the pod could not be placed on a node.
	ErrPodSchedulingFailed = errors.New("pod could not be scheduled")
	// ErrPodConfigurationFailed indicates the pod spec is invalid.
	ErrPodConfigurationFailed = errors.New("pod configuration is invalid")
)
3943

4044
func ensureServerReady(ctx context.Context, url string, server ServerConfig) error {

pkg/mcp/kubernetes.go

Lines changed: 163 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"encoding/json"
66
"fmt"
77
"io"
8+
"log/slog"
89
"sort"
910
"strings"
1011
"time"
@@ -455,45 +456,177 @@ func (k *kubernetesBackend) k8sObjects(id string, server ServerConfig, serverDis
455456
return objs, nil
456457
}
457458

458-
func (k *kubernetesBackend) updatedMCPPodName(ctx context.Context, url, id string, server ServerConfig) (string, error) {
459-
// Wait for the deployment to be updated.
460-
_, err := wait.For(ctx, k.client, &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{Name: id, Namespace: k.mcpNamespace}}, func(dep *appsv1.Deployment) (bool, error) {
461-
return dep.Generation == dep.Status.ObservedGeneration && dep.Status.Replicas == 1 && dep.Status.UpdatedReplicas == 1 && dep.Status.ReadyReplicas == 1 && dep.Status.AvailableReplicas == 1, nil
462-
}, wait.Option{Timeout: time.Minute})
463-
if err != nil {
464-
return "", ErrHealthCheckTimeout
459+
// analyzePodStatus examines a pod's status to determine if we should retry waiting for it
460+
// or if we should fail immediately. Returns (shouldRetry, reason).
461+
func analyzePodStatus(pod *corev1.Pod) (bool, string) {
462+
// Check pod phase first
463+
switch pod.Status.Phase {
464+
case corev1.PodFailed:
465+
return false, fmt.Sprintf("pod is in Failed phase: %s", pod.Status.Message)
466+
case corev1.PodSucceeded:
467+
// This shouldn't happen for a long-running deployment, but if it does, it's an error
468+
return false, "pod succeeded and exited"
469+
case corev1.PodUnknown:
470+
return false, "pod is in Unknown phase"
471+
}
472+
473+
// Check pod conditions for scheduling issues
474+
for _, cond := range pod.Status.Conditions {
475+
if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse {
476+
// Pod can't be scheduled - check if it's a transient issue
477+
if cond.Reason == corev1.PodReasonUnschedulable {
478+
// Unschedulable could be transient (e.g., waiting for autoscaler)
479+
return true, fmt.Sprintf("pod unschedulable: %s", cond.Message)
480+
}
481+
}
465482
}
466483

467-
if err = ensureServerReady(ctx, url, server); err != nil {
468-
return "", fmt.Errorf("failed to ensure MCP server is ready: %w", err)
484+
// Check container statuses
485+
allContainerStatuses := append([]corev1.ContainerStatus{}, pod.Status.ContainerStatuses...)
486+
487+
for _, cs := range allContainerStatuses {
488+
// Check if container is waiting
489+
if cs.State.Waiting != nil {
490+
waiting := cs.State.Waiting
491+
switch waiting.Reason {
492+
// Transient/recoverable states - should retry
493+
case "ContainerCreating", "PodInitializing":
494+
return true, fmt.Sprintf("container %s is %s", cs.Name, waiting.Reason)
495+
496+
// Image pull states - need to check if it's temporary or permanent
497+
case "ImagePullBackOff", "ErrImagePull":
498+
// ImagePullBackOff can be transient (network issues) but also permanent (bad image)
499+
// We'll treat it as retryable for now, but it will eventually hit max retries
500+
return true, fmt.Sprintf("container %s: %s - %s", cs.Name, waiting.Reason, waiting.Message)
501+
502+
// Permanent failures - should not retry
503+
case "CrashLoopBackOff":
504+
return false, fmt.Sprintf("container %s is in CrashLoopBackOff: %s", cs.Name, waiting.Message)
505+
case "InvalidImageName":
506+
return false, fmt.Sprintf("container %s has invalid image name: %s", cs.Name, waiting.Message)
507+
case "CreateContainerConfigError", "CreateContainerError":
508+
return false, fmt.Sprintf("container %s failed to create: %s - %s", cs.Name, waiting.Reason, waiting.Message)
509+
case "RunContainerError":
510+
return false, fmt.Sprintf("container %s failed to run: %s", cs.Name, waiting.Message)
511+
}
512+
}
513+
514+
// Check if container terminated with errors and has high restart count
515+
if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 {
516+
if cs.RestartCount > 3 {
517+
return false, fmt.Sprintf("container %s repeatedly crashing (exit code %d, %d restarts): %s",
518+
cs.Name, cs.State.Terminated.ExitCode, cs.RestartCount, cs.State.Terminated.Reason)
519+
}
520+
}
469521
}
470522

471-
// Now get the pod name that is currently running
472-
var (
473-
pods corev1.PodList
474-
runningPodCount int
475-
podName string
476-
)
477-
if err = k.client.List(ctx, &pods, &kclient.ListOptions{
478-
Namespace: k.mcpNamespace,
479-
LabelSelector: labels.SelectorFromSet(map[string]string{
480-
"app": id,
481-
}),
482-
}); err != nil {
483-
return "", fmt.Errorf("failed to list MCP pods: %w", err)
523+
// Check if pod is being evicted
524+
if pod.Status.Reason == "Evicted" {
525+
return false, fmt.Sprintf("pod was evicted: %s", pod.Status.Message)
484526
}
485527

486-
for _, p := range pods.Items {
487-
if p.Status.Phase == corev1.PodRunning {
488-
podName = p.Name
489-
runningPodCount++
528+
// Default: pod is in Pending or Running but not ready yet - should retry
529+
return true, fmt.Sprintf("pod in phase %s, waiting for containers to be ready", pod.Status.Phase)
530+
}
531+
532+
func (k *kubernetesBackend) updatedMCPPodName(ctx context.Context, url, id string, server ServerConfig) (string, error) {
533+
const maxRetries = 5
534+
var lastReason string
535+
536+
// Retry loop with smart pod status checking
537+
for attempt := 0; attempt <= maxRetries; attempt++ {
538+
// Wait for the deployment to be updated.
539+
_, err := wait.For(ctx, k.client, &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{Name: id, Namespace: k.mcpNamespace}}, func(dep *appsv1.Deployment) (bool, error) {
540+
return dep.Generation == dep.Status.ObservedGeneration && dep.Status.Replicas == 1 && dep.Status.UpdatedReplicas == 1 && dep.Status.ReadyReplicas == 1 && dep.Status.AvailableReplicas == 1, nil
541+
}, wait.Option{Timeout: time.Minute})
542+
543+
if err == nil {
544+
// Deployment is ready, now ensure the server is ready
545+
if err = ensureServerReady(ctx, url, server); err != nil {
546+
return "", fmt.Errorf("failed to ensure MCP server is ready: %w", err)
547+
}
548+
549+
// Now get the pod name that is currently running
550+
var (
551+
pods corev1.PodList
552+
runningPodCount int
553+
podName string
554+
)
555+
if err = k.client.List(ctx, &pods, &kclient.ListOptions{
556+
Namespace: k.mcpNamespace,
557+
LabelSelector: labels.SelectorFromSet(map[string]string{
558+
"app": id,
559+
}),
560+
}); err != nil {
561+
return "", fmt.Errorf("failed to list MCP pods: %w", err)
562+
}
563+
564+
for _, p := range pods.Items {
565+
if p.Status.Phase == corev1.PodRunning {
566+
podName = p.Name
567+
runningPodCount++
568+
}
569+
}
570+
if runningPodCount == 1 {
571+
return podName, nil
572+
}
573+
574+
return "", ErrHealthCheckTimeout
490575
}
491-
}
492-
if runningPodCount == 1 {
493-
return podName, nil
576+
577+
// Deployment wait timed out, check pod status to decide if we should retry
578+
var pods corev1.PodList
579+
if listErr := k.client.List(ctx, &pods, &kclient.ListOptions{
580+
Namespace: k.mcpNamespace,
581+
LabelSelector: labels.SelectorFromSet(map[string]string{
582+
"app": id,
583+
}),
584+
}); listErr != nil {
585+
slog.Error("failed to list MCP pods for status check", "id", id, "error", listErr)
586+
return "", fmt.Errorf("failed to list MCP pods: %w", listErr)
587+
}
588+
589+
if len(pods.Items) == 0 {
590+
slog.Warn("no pods found for MCP server", "id", id, "attempt", attempt+1)
591+
lastReason = "no pods found"
592+
if attempt < maxRetries {
593+
continue
594+
}
595+
return "", fmt.Errorf("%w: %s", ErrHealthCheckTimeout, lastReason)
596+
}
597+
598+
// Analyze the first pod (there should only be one for a deployment with replicas=1)
599+
pod := &pods.Items[0]
600+
shouldRetry, reason := analyzePodStatus(pod)
601+
lastReason = reason
602+
603+
if !shouldRetry {
604+
// Permanent failure - return appropriate error based on reason
605+
slog.Error("pod in non-retryable state", "id", id, "reason", reason, "attempt", attempt+1)
606+
if strings.Contains(reason, "CrashLoopBackOff") {
607+
return "", fmt.Errorf("%w: %s", ErrPodCrashLoopBackOff, reason)
608+
} else if strings.Contains(reason, "ImagePull") || strings.Contains(reason, "InvalidImageName") {
609+
return "", fmt.Errorf("%w: %s", ErrImagePullFailed, reason)
610+
} else if strings.Contains(reason, "unschedulable") || strings.Contains(reason, "Evicted") {
611+
return "", fmt.Errorf("%w: %s", ErrPodSchedulingFailed, reason)
612+
} else if strings.Contains(reason, "CreateContainer") || strings.Contains(reason, "RunContainer") {
613+
return "", fmt.Errorf("%w: %s", ErrPodConfigurationFailed, reason)
614+
}
615+
return "", fmt.Errorf("%w: %s", ErrHealthCheckTimeout, reason)
616+
}
617+
618+
// Transient issue - retry if we haven't exceeded max retries
619+
if attempt < maxRetries {
620+
slog.Info("pod not ready, will retry", "id", id, "reason", reason, "attempt", attempt+1, "maxRetries", maxRetries)
621+
continue
622+
}
623+
624+
// Exceeded max retries
625+
slog.Error("exceeded max retries waiting for pod", "id", id, "lastReason", lastReason, "attempts", attempt+1)
626+
return "", fmt.Errorf("%w after %d retries: %s", ErrHealthCheckTimeout, maxRetries+1, lastReason)
494627
}
495628

496-
return "", ErrHealthCheckTimeout
629+
return "", fmt.Errorf("%w: %s", ErrHealthCheckTimeout, lastReason)
497630
}
498631

499632
func (k *kubernetesBackend) restartServer(ctx context.Context, id string) error {

pkg/mcp/mcp.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,14 @@ func findSpecialError(err error, mcpServerDisplayName string) (bool, error) {
128128
return true, fmt.Errorf("no response from MCP server %s, this is likely due to a configuration error", mcpServerDisplayName)
129129
case unwrappedErr == ErrHealthCheckFailed || unwrappedErr == ErrHealthCheckTimeout:
130130
return true, fmt.Errorf("MCP server %s is unhealthy", mcpServerDisplayName)
131+
case unwrappedErr == ErrPodCrashLoopBackOff:
132+
return true, fmt.Errorf("MCP server %s pod is crashing", mcpServerDisplayName)
133+
case unwrappedErr == ErrImagePullFailed:
134+
return true, fmt.Errorf("failed to pull image for MCP server %s", mcpServerDisplayName)
135+
case unwrappedErr == ErrPodSchedulingFailed:
136+
return true, fmt.Errorf("MCP server %s pod could not be scheduled", mcpServerDisplayName)
137+
case unwrappedErr == ErrPodConfigurationFailed:
138+
return true, fmt.Errorf("MCP server %s has invalid configuration", mcpServerDisplayName)
131139
default:
132140
switch e := unwrappedErr.(type) {
133141
case nmcp.AuthRequiredErr:

0 commit comments

Comments
 (0)