@@ -13,6 +13,7 @@ import (
1313 "github.com/obot-platform/nah/pkg/apply"
1414 "github.com/obot-platform/nah/pkg/name"
1515 "github.com/obot-platform/obot/apiclient/types"
16+ "github.com/obot-platform/obot/logger"
1617 "github.com/obot-platform/obot/pkg/wait"
1718 appsv1 "k8s.io/api/apps/v1"
1819 corev1 "k8s.io/api/core/v1"
@@ -27,6 +28,8 @@ import (
2728 kclient "sigs.k8s.io/controller-runtime/pkg/client"
2829)
2930
// olog is the package-scoped logger used for debug output while waiting on
// MCP server pods in this backend.
var olog = logger.Package()
3033type kubernetesBackend struct {
3134 clientset * kubernetes.Clientset
3235 client kclient.WithWatch
@@ -459,45 +462,186 @@ func (k *kubernetesBackend) k8sObjects(server ServerConfig, userID, serverDispla
459462 return objs , nil
460463}
461464
462- func (k * kubernetesBackend ) updatedMCPPodName (ctx context.Context , url , id string , server ServerConfig ) (string , error ) {
463- // Wait for the deployment to be updated.
464- _ , err := wait .For (ctx , k .client , & appsv1.Deployment {ObjectMeta : metav1.ObjectMeta {Name : id , Namespace : k .mcpNamespace }}, func (dep * appsv1.Deployment ) (bool , error ) {
465- return dep .Generation == dep .Status .ObservedGeneration && dep .Status .Replicas == 1 && dep .Status .UpdatedReplicas == 1 && dep .Status .ReadyReplicas == 1 && dep .Status .AvailableReplicas == 1 , nil
466- }, wait.Option {Timeout : time .Minute })
467- if err != nil {
468- return "" , ErrHealthCheckTimeout
465+ // getNewestPod finds and returns the most recently created pod from the list.
466+ func getNewestPod (pods []corev1.Pod ) (* corev1.Pod , error ) {
467+ if len (pods ) == 0 {
468+ return nil , fmt .Errorf ("no pods provided" )
469469 }
470470
471- if err = ensureServerReady (ctx , url , server ); err != nil {
472- return "" , fmt .Errorf ("failed to ensure MCP server is ready: %w" , err )
471+ newest := & pods [0 ]
472+ for i := range pods {
473+ if pods [i ].CreationTimestamp .After (newest .CreationTimestamp .Time ) {
474+ newest = & pods [i ]
475+ }
473476 }
474477
475- // Now get the pod name that is currently running
476- var (
477- pods corev1.PodList
478- runningPodCount int
479- podName string
480- )
481- if err = k .client .List (ctx , & pods , & kclient.ListOptions {
482- Namespace : k .mcpNamespace ,
483- LabelSelector : labels .SelectorFromSet (map [string ]string {
484- "app" : id ,
485- }),
486- }); err != nil {
487- return "" , fmt .Errorf ("failed to list MCP pods: %w" , err )
478+ return newest , nil
479+ }
480+
481+ // analyzePodStatus examines a pod's status to determine if we should retry waiting for it
482+ // or if we should fail immediately. Returns (shouldRetry, error).
483+ func analyzePodStatus (pod * corev1.Pod ) (bool , error ) {
484+ // Check pod phase first
485+ switch pod .Status .Phase {
486+ case corev1 .PodFailed :
487+ return false , fmt .Errorf ("%w: pod is in Failed phase: %s" , ErrHealthCheckTimeout , pod .Status .Message )
488+ case corev1 .PodSucceeded :
489+ // This shouldn't happen for a long-running deployment, but if it does, it's an error
490+ return false , fmt .Errorf ("%w: pod succeeded and exited" , ErrHealthCheckTimeout )
491+ case corev1 .PodUnknown :
492+ return false , fmt .Errorf ("%w: pod is in Unknown phase" , ErrHealthCheckTimeout )
493+ }
494+
495+ // Check pod conditions for scheduling issues
496+ for _ , cond := range pod .Status .Conditions {
497+ if cond .Type == corev1 .PodScheduled && cond .Status == corev1 .ConditionFalse {
498+ // Pod can't be scheduled - check if it's a transient issue
499+ if cond .Reason == corev1 .PodReasonUnschedulable {
500+ // Unschedulable could be transient (e.g., waiting for autoscaler)
501+ return true , fmt .Errorf ("%w: pod unschedulable: %s" , ErrPodSchedulingFailed , cond .Message )
502+ }
503+ }
488504 }
489505
490- for _ , p := range pods .Items {
491- if p .Status .Phase == corev1 .PodRunning {
492- podName = p .Name
493- runningPodCount ++
506+ for _ , cs := range pod .Status .ContainerStatuses {
507+ // Check if container is waiting
508+ if cs .State .Waiting != nil {
509+ waiting := cs .State .Waiting
510+ switch waiting .Reason {
511+ // Transient/recoverable states - should retry
512+ case "ContainerCreating" , "PodInitializing" :
513+ return true , fmt .Errorf ("container %s is %s" , cs .Name , waiting .Reason )
514+
515+ // Image pull states - need to check if it's temporary or permanent
516+ case "ImagePullBackOff" , "ErrImagePull" :
517+ // ImagePullBackOff can be transient (network issues) but also permanent (bad image)
518+ // We'll treat it as retryable for now, but it will eventually hit max retries
519+ return true , fmt .Errorf ("%w: container %s: %s - %s" , ErrImagePullFailed , cs .Name , waiting .Reason , waiting .Message )
520+
521+ // Permanent failures - should not retry
522+ case "CrashLoopBackOff" :
523+ return false , fmt .Errorf ("%w: container %s is in CrashLoopBackOff: %s" , ErrPodCrashLoopBackOff , cs .Name , waiting .Message )
524+ case "InvalidImageName" :
525+ return false , fmt .Errorf ("%w: container %s has invalid image name: %s" , ErrImagePullFailed , cs .Name , waiting .Message )
526+ case "CreateContainerConfigError" , "CreateContainerError" :
527+ return false , fmt .Errorf ("%w: container %s failed to create: %s - %s" , ErrPodConfigurationFailed , cs .Name , waiting .Reason , waiting .Message )
528+ case "RunContainerError" :
529+ return false , fmt .Errorf ("%w: container %s failed to run: %s" , ErrPodConfigurationFailed , cs .Name , waiting .Message )
530+ }
494531 }
532+
533+ // Check if container terminated with errors and has high restart count
534+ if cs .State .Terminated != nil && cs .State .Terminated .ExitCode != 0 {
535+ if cs .RestartCount > 3 {
536+ return false , fmt .Errorf ("%w: container %s repeatedly crashing (exit code %d, %d restarts): %s" ,
537+ ErrPodCrashLoopBackOff , cs .Name , cs .State .Terminated .ExitCode , cs .RestartCount , cs .State .Terminated .Reason )
538+ }
539+ }
540+ }
541+
542+ // Check if pod is being evicted
543+ if pod .Status .Reason == "Evicted" {
544+ return false , fmt .Errorf ("%w: pod was evicted: %s" , ErrPodSchedulingFailed , pod .Status .Message )
495545 }
496- if runningPodCount == 1 {
497- return podName , nil
546+
547+ // Default: pod is in Pending or Running but not ready yet - should retry
548+ return true , fmt .Errorf ("pod in phase %s, waiting for containers to be ready" , pod .Status .Phase )
549+ }
550+
551+ func (k * kubernetesBackend ) updatedMCPPodName (ctx context.Context , url , id string , server ServerConfig ) (string , error ) {
552+ const maxRetries = 5
553+ var lastErr error
554+
555+ // Retry loop with smart pod status checking
556+ for attempt := range maxRetries {
557+ // Wait for the deployment to be updated.
558+ _ , err := wait .For (ctx , k .client , & appsv1.Deployment {ObjectMeta : metav1.ObjectMeta {Name : id , Namespace : k .mcpNamespace }}, func (dep * appsv1.Deployment ) (bool , error ) {
559+ return dep .Generation == dep .Status .ObservedGeneration && dep .Status .Replicas == 1 && dep .Status .UpdatedReplicas == 1 && dep .Status .ReadyReplicas == 1 && dep .Status .AvailableReplicas == 1 , nil
560+ }, wait.Option {Timeout : time .Minute })
561+ if err == nil {
562+ // Deployment is ready, now ensure the server is ready
563+ if err = ensureServerReady (ctx , url , server ); err != nil {
564+ return "" , fmt .Errorf ("failed to ensure MCP server is ready: %w" , err )
565+ }
566+
567+ // Now get the pod name that is currently running
568+ var (
569+ pods corev1.PodList
570+ runningPodCount int
571+ podName string
572+ )
573+ if err = k .client .List (ctx , & pods , & kclient.ListOptions {
574+ Namespace : k .mcpNamespace ,
575+ LabelSelector : labels .SelectorFromSet (map [string ]string {
576+ "app" : id ,
577+ }),
578+ }); err != nil {
579+ return "" , fmt .Errorf ("failed to list MCP pods: %w" , err )
580+ }
581+
582+ for _ , p := range pods .Items {
583+ if p .Status .Phase == corev1 .PodRunning {
584+ podName = p .Name
585+ runningPodCount ++
586+ }
587+ }
588+
589+ // runningPodCount should always equal 1, if the deployment is ready, as it is by this point in the code.
590+ // However, we will check just to make sure, and retry if it isn't.
591+ if runningPodCount == 1 {
592+ return podName , nil
593+ } else if runningPodCount > 1 {
594+ lastErr = fmt .Errorf ("more than one running pod found" )
595+ } else {
596+ lastErr = fmt .Errorf ("no pods found" )
597+ }
598+ continue
599+ }
600+
601+ // Deployment wait timed out, check pod status to decide if we should retry
602+ var pods corev1.PodList
603+ if listErr := k .client .List (ctx , & pods , & kclient.ListOptions {
604+ Namespace : k .mcpNamespace ,
605+ LabelSelector : labels .SelectorFromSet (map [string ]string {
606+ "app" : id ,
607+ }),
608+ }); listErr != nil {
609+ olog .Debugf ("failed to list MCP pods for status check: id=%s error=%v" , id , listErr )
610+ return "" , fmt .Errorf ("failed to list MCP pods: %w" , listErr )
611+ }
612+
613+ if len (pods .Items ) == 0 {
614+ olog .Debugf ("no pods found for MCP server: id=%s attempt=%d" , id , attempt + 1 )
615+ lastErr = fmt .Errorf ("no pods found" )
616+ if attempt < maxRetries {
617+ continue
618+ }
619+ return "" , fmt .Errorf ("%w: %v" , ErrHealthCheckTimeout , lastErr )
620+ }
621+
622+ // Get the newest pod and analyze its status
623+ newestPod , err := getNewestPod (pods .Items )
624+ if err != nil {
625+ olog .Debugf ("failed to get newest pod: id=%s error=%v attempt=%d" , id , err , attempt + 1 )
626+ lastErr = err
627+ if attempt < maxRetries {
628+ continue
629+ }
630+ return "" , fmt .Errorf ("%w: %v" , ErrHealthCheckTimeout , lastErr )
631+ }
632+
633+ shouldRetry , podErr := analyzePodStatus (newestPod )
634+ lastErr = podErr
635+
636+ if ! shouldRetry {
637+ // Permanent failure - return the error with the appropriate type already wrapped
638+ olog .Debugf ("pod in non-retryable state: id=%s error=%v attempt=%d" , id , podErr , attempt + 1 )
639+ return "" , podErr
640+ }
498641 }
499642
500- return "" , ErrHealthCheckTimeout
643+ olog .Debugf ("exceeded max retries waiting for pod: id=%s lastError=%v attempts=%d" , id , lastErr , maxRetries )
644+ return "" , fmt .Errorf ("%w after %d retries: %v" , ErrHealthCheckTimeout , maxRetries , lastErr )
501645}
502646
503647func (k * kubernetesBackend ) restartServer (ctx context.Context , id string ) error {
0 commit comments