Skip to content

Commit ad68aa7

Browse files
authored
enhance: be smarter about timeouts when waiting for deployments (#4530)
Signed-off-by: Grant Linville <[email protected]>
1 parent 0b83b6c commit ad68aa7

File tree

3 files changed

+187
-31
lines changed

3 files changed

+187
-31
lines changed

pkg/mcp/backend.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,12 @@ func (e *ErrNotSupportedByBackend) Error() string {
3333
}
3434

3535
var (
36-
ErrHealthCheckTimeout = errors.New("timed out waiting for MCP server to be ready")
37-
ErrHealthCheckFailed = errors.New("MCP server is not healthy")
36+
ErrHealthCheckTimeout = errors.New("timed out waiting for MCP server to be ready")
37+
ErrHealthCheckFailed = errors.New("MCP server is not healthy")
38+
ErrPodCrashLoopBackOff = errors.New("pod is in CrashLoopBackOff state")
39+
ErrImagePullFailed = errors.New("failed to pull container image")
40+
ErrPodSchedulingFailed = errors.New("pod could not be scheduled")
41+
ErrPodConfigurationFailed = errors.New("pod configuration is invalid")
3842
)
3943

4044
func ensureServerReady(ctx context.Context, url string, server ServerConfig) error {

pkg/mcp/kubernetes.go

Lines changed: 173 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"github.com/obot-platform/nah/pkg/apply"
1414
"github.com/obot-platform/nah/pkg/name"
1515
"github.com/obot-platform/obot/apiclient/types"
16+
"github.com/obot-platform/obot/logger"
1617
"github.com/obot-platform/obot/pkg/wait"
1718
appsv1 "k8s.io/api/apps/v1"
1819
corev1 "k8s.io/api/core/v1"
@@ -27,6 +28,8 @@ import (
2728
kclient "sigs.k8s.io/controller-runtime/pkg/client"
2829
)
2930

31+
var olog = logger.Package()
32+
3033
type kubernetesBackend struct {
3134
clientset *kubernetes.Clientset
3235
client kclient.WithWatch
@@ -459,45 +462,186 @@ func (k *kubernetesBackend) k8sObjects(server ServerConfig, userID, serverDispla
459462
return objs, nil
460463
}
461464

462-
func (k *kubernetesBackend) updatedMCPPodName(ctx context.Context, url, id string, server ServerConfig) (string, error) {
463-
// Wait for the deployment to be updated.
464-
_, err := wait.For(ctx, k.client, &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{Name: id, Namespace: k.mcpNamespace}}, func(dep *appsv1.Deployment) (bool, error) {
465-
return dep.Generation == dep.Status.ObservedGeneration && dep.Status.Replicas == 1 && dep.Status.UpdatedReplicas == 1 && dep.Status.ReadyReplicas == 1 && dep.Status.AvailableReplicas == 1, nil
466-
}, wait.Option{Timeout: time.Minute})
467-
if err != nil {
468-
return "", ErrHealthCheckTimeout
465+
// getNewestPod finds and returns the most recently created pod from the list.
466+
func getNewestPod(pods []corev1.Pod) (*corev1.Pod, error) {
467+
if len(pods) == 0 {
468+
return nil, fmt.Errorf("no pods provided")
469469
}
470470

471-
if err = ensureServerReady(ctx, url, server); err != nil {
472-
return "", fmt.Errorf("failed to ensure MCP server is ready: %w", err)
471+
newest := &pods[0]
472+
for i := range pods {
473+
if pods[i].CreationTimestamp.After(newest.CreationTimestamp.Time) {
474+
newest = &pods[i]
475+
}
473476
}
474477

475-
// Now get the pod name that is currently running
476-
var (
477-
pods corev1.PodList
478-
runningPodCount int
479-
podName string
480-
)
481-
if err = k.client.List(ctx, &pods, &kclient.ListOptions{
482-
Namespace: k.mcpNamespace,
483-
LabelSelector: labels.SelectorFromSet(map[string]string{
484-
"app": id,
485-
}),
486-
}); err != nil {
487-
return "", fmt.Errorf("failed to list MCP pods: %w", err)
478+
return newest, nil
479+
}
480+
481+
// analyzePodStatus examines a pod's status to determine if we should retry waiting for it
482+
// or if we should fail immediately. Returns (shouldRetry, error).
483+
func analyzePodStatus(pod *corev1.Pod) (bool, error) {
484+
// Check pod phase first
485+
switch pod.Status.Phase {
486+
case corev1.PodFailed:
487+
return false, fmt.Errorf("%w: pod is in Failed phase: %s", ErrHealthCheckTimeout, pod.Status.Message)
488+
case corev1.PodSucceeded:
489+
// This shouldn't happen for a long-running deployment, but if it does, it's an error
490+
return false, fmt.Errorf("%w: pod succeeded and exited", ErrHealthCheckTimeout)
491+
case corev1.PodUnknown:
492+
return false, fmt.Errorf("%w: pod is in Unknown phase", ErrHealthCheckTimeout)
493+
}
494+
495+
// Check pod conditions for scheduling issues
496+
for _, cond := range pod.Status.Conditions {
497+
if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse {
498+
// Pod can't be scheduled - check if it's a transient issue
499+
if cond.Reason == corev1.PodReasonUnschedulable {
500+
// Unschedulable could be transient (e.g., waiting for autoscaler)
501+
return true, fmt.Errorf("%w: pod unschedulable: %s", ErrPodSchedulingFailed, cond.Message)
502+
}
503+
}
488504
}
489505

490-
for _, p := range pods.Items {
491-
if p.Status.Phase == corev1.PodRunning {
492-
podName = p.Name
493-
runningPodCount++
506+
for _, cs := range pod.Status.ContainerStatuses {
507+
// Check if container is waiting
508+
if cs.State.Waiting != nil {
509+
waiting := cs.State.Waiting
510+
switch waiting.Reason {
511+
// Transient/recoverable states - should retry
512+
case "ContainerCreating", "PodInitializing":
513+
return true, fmt.Errorf("container %s is %s", cs.Name, waiting.Reason)
514+
515+
// Image pull states - need to check if it's temporary or permanent
516+
case "ImagePullBackOff", "ErrImagePull":
517+
// ImagePullBackOff can be transient (network issues) but also permanent (bad image)
518+
// We'll treat it as retryable for now, but it will eventually hit max retries
519+
return true, fmt.Errorf("%w: container %s: %s - %s", ErrImagePullFailed, cs.Name, waiting.Reason, waiting.Message)
520+
521+
// Permanent failures - should not retry
522+
case "CrashLoopBackOff":
523+
return false, fmt.Errorf("%w: container %s is in CrashLoopBackOff: %s", ErrPodCrashLoopBackOff, cs.Name, waiting.Message)
524+
case "InvalidImageName":
525+
return false, fmt.Errorf("%w: container %s has invalid image name: %s", ErrImagePullFailed, cs.Name, waiting.Message)
526+
case "CreateContainerConfigError", "CreateContainerError":
527+
return false, fmt.Errorf("%w: container %s failed to create: %s - %s", ErrPodConfigurationFailed, cs.Name, waiting.Reason, waiting.Message)
528+
case "RunContainerError":
529+
return false, fmt.Errorf("%w: container %s failed to run: %s", ErrPodConfigurationFailed, cs.Name, waiting.Message)
530+
}
494531
}
532+
533+
// Check if container terminated with errors and has high restart count
534+
if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 {
535+
if cs.RestartCount > 3 {
536+
return false, fmt.Errorf("%w: container %s repeatedly crashing (exit code %d, %d restarts): %s",
537+
ErrPodCrashLoopBackOff, cs.Name, cs.State.Terminated.ExitCode, cs.RestartCount, cs.State.Terminated.Reason)
538+
}
539+
}
540+
}
541+
542+
// Check if pod is being evicted
543+
if pod.Status.Reason == "Evicted" {
544+
return false, fmt.Errorf("%w: pod was evicted: %s", ErrPodSchedulingFailed, pod.Status.Message)
495545
}
496-
if runningPodCount == 1 {
497-
return podName, nil
546+
547+
// Default: pod is in Pending or Running but not ready yet - should retry
548+
return true, fmt.Errorf("pod in phase %s, waiting for containers to be ready", pod.Status.Phase)
549+
}
550+
551+
func (k *kubernetesBackend) updatedMCPPodName(ctx context.Context, url, id string, server ServerConfig) (string, error) {
552+
const maxRetries = 5
553+
var lastErr error
554+
555+
// Retry loop with smart pod status checking
556+
for attempt := range maxRetries {
557+
// Wait for the deployment to be updated.
558+
_, err := wait.For(ctx, k.client, &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{Name: id, Namespace: k.mcpNamespace}}, func(dep *appsv1.Deployment) (bool, error) {
559+
return dep.Generation == dep.Status.ObservedGeneration && dep.Status.Replicas == 1 && dep.Status.UpdatedReplicas == 1 && dep.Status.ReadyReplicas == 1 && dep.Status.AvailableReplicas == 1, nil
560+
}, wait.Option{Timeout: time.Minute})
561+
if err == nil {
562+
// Deployment is ready, now ensure the server is ready
563+
if err = ensureServerReady(ctx, url, server); err != nil {
564+
return "", fmt.Errorf("failed to ensure MCP server is ready: %w", err)
565+
}
566+
567+
// Now get the pod name that is currently running
568+
var (
569+
pods corev1.PodList
570+
runningPodCount int
571+
podName string
572+
)
573+
if err = k.client.List(ctx, &pods, &kclient.ListOptions{
574+
Namespace: k.mcpNamespace,
575+
LabelSelector: labels.SelectorFromSet(map[string]string{
576+
"app": id,
577+
}),
578+
}); err != nil {
579+
return "", fmt.Errorf("failed to list MCP pods: %w", err)
580+
}
581+
582+
for _, p := range pods.Items {
583+
if p.Status.Phase == corev1.PodRunning {
584+
podName = p.Name
585+
runningPodCount++
586+
}
587+
}
588+
589+
// runningPodCount should always equal 1, if the deployment is ready, as it is by this point in the code.
590+
// However, we will check just to make sure, and retry if it isn't.
591+
if runningPodCount == 1 {
592+
return podName, nil
593+
} else if runningPodCount > 1 {
594+
lastErr = fmt.Errorf("more than one running pod found")
595+
} else {
596+
lastErr = fmt.Errorf("no pods found")
597+
}
598+
continue
599+
}
600+
601+
// Deployment wait timed out, check pod status to decide if we should retry
602+
var pods corev1.PodList
603+
if listErr := k.client.List(ctx, &pods, &kclient.ListOptions{
604+
Namespace: k.mcpNamespace,
605+
LabelSelector: labels.SelectorFromSet(map[string]string{
606+
"app": id,
607+
}),
608+
}); listErr != nil {
609+
olog.Debugf("failed to list MCP pods for status check: id=%s error=%v", id, listErr)
610+
return "", fmt.Errorf("failed to list MCP pods: %w", listErr)
611+
}
612+
613+
if len(pods.Items) == 0 {
614+
olog.Debugf("no pods found for MCP server: id=%s attempt=%d", id, attempt+1)
615+
lastErr = fmt.Errorf("no pods found")
616+
if attempt < maxRetries {
617+
continue
618+
}
619+
return "", fmt.Errorf("%w: %v", ErrHealthCheckTimeout, lastErr)
620+
}
621+
622+
// Get the newest pod and analyze its status
623+
newestPod, err := getNewestPod(pods.Items)
624+
if err != nil {
625+
olog.Debugf("failed to get newest pod: id=%s error=%v attempt=%d", id, err, attempt+1)
626+
lastErr = err
627+
if attempt < maxRetries {
628+
continue
629+
}
630+
return "", fmt.Errorf("%w: %v", ErrHealthCheckTimeout, lastErr)
631+
}
632+
633+
shouldRetry, podErr := analyzePodStatus(newestPod)
634+
lastErr = podErr
635+
636+
if !shouldRetry {
637+
// Permanent failure - return the error with the appropriate type already wrapped
638+
olog.Debugf("pod in non-retryable state: id=%s error=%v attempt=%d", id, podErr, attempt+1)
639+
return "", podErr
640+
}
498641
}
499642

500-
return "", ErrHealthCheckTimeout
643+
olog.Debugf("exceeded max retries waiting for pod: id=%s lastError=%v attempts=%d", id, lastErr, maxRetries)
644+
return "", fmt.Errorf("%w after %d retries: %v", ErrHealthCheckTimeout, maxRetries, lastErr)
501645
}
502646

503647
func (k *kubernetesBackend) restartServer(ctx context.Context, id string) error {

pkg/mcp/mcp.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,14 @@ func findSpecialError(err error, mcpServerDisplayName string) (bool, error) {
128128
return true, fmt.Errorf("no response from MCP server %s, this is likely due to a configuration error", mcpServerDisplayName)
129129
case unwrappedErr == ErrHealthCheckFailed || unwrappedErr == ErrHealthCheckTimeout:
130130
return true, fmt.Errorf("MCP server %s is unhealthy", mcpServerDisplayName)
131+
case unwrappedErr == ErrPodCrashLoopBackOff:
132+
return true, fmt.Errorf("MCP server %s pod is crashing", mcpServerDisplayName)
133+
case unwrappedErr == ErrImagePullFailed:
134+
return true, fmt.Errorf("failed to pull image for MCP server %s", mcpServerDisplayName)
135+
case unwrappedErr == ErrPodSchedulingFailed:
136+
return true, fmt.Errorf("MCP server %s pod could not be scheduled", mcpServerDisplayName)
137+
case unwrappedErr == ErrPodConfigurationFailed:
138+
return true, fmt.Errorf("MCP server %s has invalid configuration", mcpServerDisplayName)
131139
default:
132140
switch e := unwrappedErr.(type) {
133141
case nmcp.AuthRequiredErr:

0 commit comments

Comments (0)