5 | 5 | "encoding/json" |
6 | 6 | "fmt" |
7 | 7 | "io" |
| 8 | + "log/slog" |
8 | 9 | "sort" |
9 | 10 | "strings" |
10 | 11 | "time" |
@@ -455,45 +456,177 @@ func (k *kubernetesBackend) k8sObjects(id string, server ServerConfig, serverDis |
455 | 456 | return objs, nil |
456 | 457 | } |
457 | 458 |
458 | | -func (k *kubernetesBackend) updatedMCPPodName(ctx context.Context, url, id string, server ServerConfig) (string, error) { |
459 | | - // Wait for the deployment to be updated. |
460 | | - _, err := wait.For(ctx, k.client, &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{Name: id, Namespace: k.mcpNamespace}}, func(dep *appsv1.Deployment) (bool, error) { |
461 | | - return dep.Generation == dep.Status.ObservedGeneration && dep.Status.Replicas == 1 && dep.Status.UpdatedReplicas == 1 && dep.Status.ReadyReplicas == 1 && dep.Status.AvailableReplicas == 1, nil |
462 | | - }, wait.Option{Timeout: time.Minute}) |
463 | | - if err != nil { |
464 | | - return "", ErrHealthCheckTimeout |
| 459 | +// analyzePodStatus examines a pod's status to determine if we should retry waiting for it |
| 460 | +// or if we should fail immediately. Returns (shouldRetry, reason). |
| 461 | +func analyzePodStatus(pod *corev1.Pod) (bool, string) { |
| 462 | + // Check pod phase first |
| 463 | + switch pod.Status.Phase { |
| 464 | + case corev1.PodFailed: |
| 465 | + return false, fmt.Sprintf("pod is in Failed phase: %s", pod.Status.Message) |
| 466 | + case corev1.PodSucceeded: |
| 467 | + // This shouldn't happen for a long-running deployment, but if it does, it's an error |
| 468 | + return false, "pod succeeded and exited" |
| 469 | + case corev1.PodUnknown: |
| 470 | + return false, "pod is in Unknown phase" |
| 471 | + } |
| 472 | + |
| 473 | + // Check pod conditions for scheduling issues |
| 474 | + for _, cond := range pod.Status.Conditions { |
| 475 | + if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse { |
| 476 | + // Pod can't be scheduled - check if it's a transient issue |
| 477 | + if cond.Reason == corev1.PodReasonUnschedulable { |
| 478 | + // Unschedulable could be transient (e.g., waiting for autoscaler) |
| 479 | + return true, fmt.Sprintf("pod unschedulable: %s", cond.Message) |
| 480 | + } |
| 481 | + } |
465 | 482 | } |
466 | 483 |
467 | | - if err = ensureServerReady(ctx, url, server); err != nil { |
468 | | - return "", fmt.Errorf("failed to ensure MCP server is ready: %w", err) |
| 484 | + // Check the container statuses reported by the kubelet; a waiting or
| 485 | + // repeatedly terminated container tells us whether the pod can still
| 486 | + // become ready.
| 487 | + for _, cs := range pod.Status.ContainerStatuses {
| 488 | + // Check if container is waiting |
| 489 | + if cs.State.Waiting != nil { |
| 490 | + waiting := cs.State.Waiting |
| 491 | + switch waiting.Reason { |
| 492 | + // Transient/recoverable states - should retry |
| 493 | + case "ContainerCreating", "PodInitializing": |
| 494 | + return true, fmt.Sprintf("container %s is %s", cs.Name, waiting.Reason) |
| 495 | + |
| 496 | + // Image pull states - need to check if it's temporary or permanent |
| 497 | + case "ImagePullBackOff", "ErrImagePull": |
| 498 | + // ImagePullBackOff can be transient (network issues) but also permanent (bad image) |
| 499 | + // We'll treat it as retryable for now, but it will eventually hit max retries |
| 500 | + return true, fmt.Sprintf("container %s: %s - %s", cs.Name, waiting.Reason, waiting.Message) |
| 501 | + |
| 502 | + // Permanent failures - should not retry |
| 503 | + case "CrashLoopBackOff": |
| 504 | + return false, fmt.Sprintf("container %s is in CrashLoopBackOff: %s", cs.Name, waiting.Message) |
| 505 | + case "InvalidImageName": |
| 506 | + return false, fmt.Sprintf("container %s has invalid image name: %s", cs.Name, waiting.Message) |
| 507 | + case "CreateContainerConfigError", "CreateContainerError": |
| 508 | + return false, fmt.Sprintf("container %s failed to create: %s - %s", cs.Name, waiting.Reason, waiting.Message) |
| 509 | + case "RunContainerError": |
| 510 | + return false, fmt.Sprintf("container %s failed to run: %s", cs.Name, waiting.Message) |
| 511 | + } |
| 512 | + } |
| 513 | + |
| 514 | + // Check if container terminated with errors and has high restart count |
| 515 | + if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 { |
| 516 | + if cs.RestartCount > 3 { |
| 517 | + return false, fmt.Sprintf("container %s repeatedly crashing (exit code %d, %d restarts): %s", |
| 518 | + cs.Name, cs.State.Terminated.ExitCode, cs.RestartCount, cs.State.Terminated.Reason) |
| 519 | + } |
| 520 | + } |
469 | 521 | } |
470 | 522 |
471 | | - // Now get the pod name that is currently running |
472 | | - var ( |
473 | | - pods corev1.PodList |
474 | | - runningPodCount int |
475 | | - podName string |
476 | | - ) |
477 | | - if err = k.client.List(ctx, &pods, &kclient.ListOptions{ |
478 | | - Namespace: k.mcpNamespace, |
479 | | - LabelSelector: labels.SelectorFromSet(map[string]string{ |
480 | | - "app": id, |
481 | | - }), |
482 | | - }); err != nil { |
483 | | - return "", fmt.Errorf("failed to list MCP pods: %w", err) |
| 523 | + // Check if pod is being evicted |
| 524 | + if pod.Status.Reason == "Evicted" { |
| 525 | + return false, fmt.Sprintf("pod was evicted: %s", pod.Status.Message) |
484 | 526 | } |
485 | 527 |
486 | | - for _, p := range pods.Items { |
487 | | - if p.Status.Phase == corev1.PodRunning { |
488 | | - podName = p.Name |
489 | | - runningPodCount++ |
| 528 | + // Default: pod is in Pending or Running but not ready yet - should retry |
| 529 | + return true, fmt.Sprintf("pod in phase %s, waiting for containers to be ready", pod.Status.Phase) |
| 530 | +} |
| 531 | + |
| 532 | +func (k *kubernetesBackend) updatedMCPPodName(ctx context.Context, url, id string, server ServerConfig) (string, error) { |
| 533 | + const maxRetries = 5 |
| 534 | + var lastReason string |
| 535 | + |
| 536 | + // Retry loop with smart pod status checking |
| 537 | + for attempt := 0; attempt <= maxRetries; attempt++ { |
| 538 | + // Wait for the deployment to be updated. |
| 539 | + _, err := wait.For(ctx, k.client, &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{Name: id, Namespace: k.mcpNamespace}}, func(dep *appsv1.Deployment) (bool, error) { |
| 540 | + return dep.Generation == dep.Status.ObservedGeneration && dep.Status.Replicas == 1 && dep.Status.UpdatedReplicas == 1 && dep.Status.ReadyReplicas == 1 && dep.Status.AvailableReplicas == 1, nil |
| 541 | + }, wait.Option{Timeout: time.Minute}) |
| 542 | + |
| 543 | + if err == nil { |
| 544 | + // The deployment is ready; now ensure the MCP server itself is ready.
| 545 | + if err = ensureServerReady(ctx, url, server); err != nil { |
| 546 | + return "", fmt.Errorf("failed to ensure MCP server is ready: %w", err) |
| 547 | + } |
| 548 | + |
| 549 | + // Now get the pod name that is currently running |
| 550 | + var ( |
| 551 | + pods corev1.PodList |
| 552 | + runningPodCount int |
| 553 | + podName string |
| 554 | + ) |
| 555 | + if err = k.client.List(ctx, &pods, &kclient.ListOptions{ |
| 556 | + Namespace: k.mcpNamespace, |
| 557 | + LabelSelector: labels.SelectorFromSet(map[string]string{ |
| 558 | + "app": id, |
| 559 | + }), |
| 560 | + }); err != nil { |
| 561 | + return "", fmt.Errorf("failed to list MCP pods: %w", err) |
| 562 | + } |
| 563 | + |
| 564 | + for _, p := range pods.Items { |
| 565 | + if p.Status.Phase == corev1.PodRunning { |
| 566 | + podName = p.Name |
| 567 | + runningPodCount++ |
| 568 | + } |
| 569 | + } |
| 570 | + if runningPodCount == 1 { |
| 571 | + return podName, nil |
| 572 | + } |
| 573 | + |
| 574 | + return "", ErrHealthCheckTimeout |
490 | 575 | } |
491 | | - } |
492 | | - if runningPodCount == 1 { |
493 | | - return podName, nil |
| 576 | + |
| 577 | + // Deployment wait timed out, check pod status to decide if we should retry |
| 578 | + var pods corev1.PodList |
| 579 | + if listErr := k.client.List(ctx, &pods, &kclient.ListOptions{ |
| 580 | + Namespace: k.mcpNamespace, |
| 581 | + LabelSelector: labels.SelectorFromSet(map[string]string{ |
| 582 | + "app": id, |
| 583 | + }), |
| 584 | + }); listErr != nil { |
| 585 | + slog.Error("failed to list MCP pods for status check", "id", id, "error", listErr) |
| 586 | + return "", fmt.Errorf("failed to list MCP pods: %w", listErr) |
| 587 | + } |
| 588 | + |
| 589 | + if len(pods.Items) == 0 { |
| 590 | + slog.Warn("no pods found for MCP server", "id", id, "attempt", attempt+1) |
| 591 | + lastReason = "no pods found" |
| 592 | + if attempt < maxRetries { |
| 593 | + continue |
| 594 | + } |
| 595 | + return "", fmt.Errorf("%w: %s", ErrHealthCheckTimeout, lastReason) |
| 596 | + } |
| 597 | + |
| 598 | + // Analyze the first pod (there should only be one for a deployment with replicas=1) |
| 599 | + pod := &pods.Items[0] |
| 600 | + shouldRetry, reason := analyzePodStatus(pod) |
| 601 | + lastReason = reason |
| 602 | + |
| 603 | + if !shouldRetry { |
| 604 | + // Permanent failure - return appropriate error based on reason |
| 605 | + slog.Error("pod in non-retryable state", "id", id, "reason", reason, "attempt", attempt+1) |
| 606 | + if strings.Contains(reason, "CrashLoopBackOff") { |
| 607 | + return "", fmt.Errorf("%w: %s", ErrPodCrashLoopBackOff, reason) |
| 608 | + } else if strings.Contains(reason, "ImagePull") || strings.Contains(reason, "InvalidImageName") { |
| 609 | + return "", fmt.Errorf("%w: %s", ErrImagePullFailed, reason) |
| 610 | + } else if strings.Contains(reason, "unschedulable") || strings.Contains(reason, "Evicted") { |
| 611 | + return "", fmt.Errorf("%w: %s", ErrPodSchedulingFailed, reason) |
| 612 | + } else if strings.Contains(reason, "CreateContainer") || strings.Contains(reason, "RunContainer") { |
| 613 | + return "", fmt.Errorf("%w: %s", ErrPodConfigurationFailed, reason) |
| 614 | + } |
| 615 | + return "", fmt.Errorf("%w: %s", ErrHealthCheckTimeout, reason) |
| 616 | + } |
| 617 | + |
| 618 | + // Transient issue - retry if we haven't exceeded max retries |
| 619 | + if attempt < maxRetries { |
| 620 | + slog.Info("pod not ready, will retry", "id", id, "reason", reason, "attempt", attempt+1, "maxRetries", maxRetries) |
| 621 | + continue |
| 622 | + } |
| 623 | + |
| 624 | + // Exceeded max retries |
| 625 | + slog.Error("exceeded max retries waiting for pod", "id", id, "lastReason", lastReason, "attempts", attempt+1) |
| 626 | + return "", fmt.Errorf("%w after %d retries: %s", ErrHealthCheckTimeout, maxRetries+1, lastReason) |
494 | 627 | } |
495 | 628 |
496 | | - return "", ErrHealthCheckTimeout |
| 629 | + return "", fmt.Errorf("%w: %s", ErrHealthCheckTimeout, lastReason) |
497 | 630 | } |
498 | 631 |
499 | 632 | func (k *kubernetesBackend) restartServer(ctx context.Context, id string) error { |
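Note on the wrapped sentinel errors: ErrPodCrashLoopBackOff, ErrImagePullFailed, ErrPodSchedulingFailed, and ErrPodConfigurationFailed are referenced by the retry loop but defined outside this hunk. A minimal sketch of how such sentinels are typically declared, assuming plain errors.New values (the package name and messages below are placeholders, not the ones in the PR):

package mcp // hypothetical package name; not shown in this hunk

import "errors"

// Assumed shape of the sentinel errors wrapped with %w above; their real
// definitions live elsewhere in this PR.
var (
    ErrPodCrashLoopBackOff    = errors.New("pod is crash looping")
    ErrImagePullFailed        = errors.New("failed to pull container image")
    ErrPodSchedulingFailed    = errors.New("pod could not be scheduled")
    ErrPodConfigurationFailed = errors.New("pod configuration is invalid")
)

Because updatedMCPPodName wraps these with %w, callers can classify a failure with errors.Is(err, ErrImagePullFailed) and surface an image-specific message instead of a generic health-check timeout.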
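To make the retry/fail split in analyzePodStatus concrete, here is a minimal test sketch; the pod fixture helper and the package clause are hypothetical, and only the analyzePodStatus signature comes from the diff:

package mcp // hypothetical; use the package that defines analyzePodStatus

import (
    "testing"

    corev1 "k8s.io/api/core/v1"
)

// waitingPod builds a pending pod whose single container is waiting for the given reason.
func waitingPod(reason string) *corev1.Pod {
    return &corev1.Pod{
        Status: corev1.PodStatus{
            Phase: corev1.PodPending,
            ContainerStatuses: []corev1.ContainerStatus{{
                Name:  "mcp",
                State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: reason}},
            }},
        },
    }
}

func TestAnalyzePodStatus(t *testing.T) {
    // CrashLoopBackOff is treated as terminal: no retry.
    if retry, reason := analyzePodStatus(waitingPod("CrashLoopBackOff")); retry {
        t.Fatalf("expected terminal failure, got retry=true (%s)", reason)
    }
    // ContainerCreating is transient: the caller keeps waiting.
    if retry, reason := analyzePodStatus(waitingPod("ContainerCreating")); !retry {
        t.Fatalf("expected retryable state, got retry=false (%s)", reason)
    }
}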