5 | 5 | "encoding/json" |
6 | 6 | "fmt" |
7 | 7 | "io" |
| 8 | + "log/slog" |
8 | 9 | "sort" |
9 | 10 | "strings" |
10 | 11 | "time" |
@@ -455,45 +456,177 @@ func (k *kubernetesBackend) k8sObjects(id string, server ServerConfig, serverDis |
455 | 456 | return objs, nil |
456 | 457 | } |
457 | 458 |
458 | | -func (k *kubernetesBackend) updatedMCPPodName(ctx context.Context, url, id string, server ServerConfig) (string, error) { |
459 | | - // Wait for the deployment to be updated. |
460 | | - _, err := wait.For(ctx, k.client, &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{Name: id, Namespace: k.mcpNamespace}}, func(dep *appsv1.Deployment) (bool, error) { |
461 | | - return dep.Generation == dep.Status.ObservedGeneration && dep.Status.Replicas == 1 && dep.Status.UpdatedReplicas == 1 && dep.Status.ReadyReplicas == 1 && dep.Status.AvailableReplicas == 1, nil |
462 | | - }, wait.Option{Timeout: time.Minute}) |
463 | | - if err != nil { |
464 | | - return "", ErrHealthCheckTimeout |
| 459 | +// analyzePodStatus examines a pod's status to determine if we should retry waiting for it |
| 460 | +// or if we should fail immediately. Returns (shouldRetry, reason). |
| 461 | +func analyzePodStatus(pod *corev1.Pod) (bool, string) { |
| 462 | + // Check pod phase first |
| 463 | + switch pod.Status.Phase { |
| 464 | + case corev1.PodFailed: |
| 465 | + return false, fmt.Sprintf("pod is in Failed phase: %s", pod.Status.Message) |
| 466 | + case corev1.PodSucceeded: |
| 467 | + // This shouldn't happen for a long-running deployment, but if it does, it's an error |
| 468 | + return false, "pod succeeded and exited" |
| 469 | + case corev1.PodUnknown: |
| 470 | + return false, "pod is in Unknown phase" |
| 471 | + } |
| 472 | + |
| 473 | + // Check pod conditions for scheduling issues |
| 474 | + for _, cond := range pod.Status.Conditions { |
| 475 | + if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse { |
| 476 | + // Pod can't be scheduled - check if it's a transient issue |
| 477 | + if cond.Reason == corev1.PodReasonUnschedulable { |
| 478 | + // Unschedulable could be transient (e.g., waiting for autoscaler) |
| 479 | + return true, fmt.Sprintf("pod unschedulable: %s", cond.Message) |
| 480 | + } |
| 481 | + } |
465 | 482 | } |
466 | 483 |
467 | | - if err = ensureServerReady(ctx, url, server); err != nil { |
468 | | - return "", fmt.Errorf("failed to ensure MCP server is ready: %w", err) |
| 484 | + // Check the container statuses reported by the kubelet; a waiting or
| 485 | + // repeatedly terminated container tells us whether the pod can still
| 486 | + // become ready.
| 487 | + for _, cs := range pod.Status.ContainerStatuses {
| 488 | + // Check if container is waiting |
| 489 | + if cs.State.Waiting != nil { |
| 490 | + waiting := cs.State.Waiting |
| 491 | + switch waiting.Reason { |
| 492 | + // Transient/recoverable states - should retry |
| 493 | + case "ContainerCreating", "PodInitializing": |
| 494 | + return true, fmt.Sprintf("container %s is %s", cs.Name, waiting.Reason) |
| 495 | + |
| 496 | + // Image pull states - need to check if it's temporary or permanent |
| 497 | + case "ImagePullBackOff", "ErrImagePull": |
| 498 | + // ImagePullBackOff can be transient (network issues) but also permanent (bad image) |
| 499 | + // We'll treat it as retryable for now, but it will eventually hit max retries |
| 500 | + return true, fmt.Sprintf("container %s: %s - %s", cs.Name, waiting.Reason, waiting.Message) |
| 501 | + |
| 502 | + // Permanent failures - should not retry |
| 503 | + case "CrashLoopBackOff": |
| 504 | + return false, fmt.Sprintf("container %s is in CrashLoopBackOff: %s", cs.Name, waiting.Message) |
| 505 | + case "InvalidImageName": |
| 506 | + return false, fmt.Sprintf("container %s has invalid image name: %s", cs.Name, waiting.Message) |
| 507 | + case "CreateContainerConfigError", "CreateContainerError": |
| 508 | + return false, fmt.Sprintf("container %s failed to create: %s - %s", cs.Name, waiting.Reason, waiting.Message) |
| 509 | + case "RunContainerError": |
| 510 | + return false, fmt.Sprintf("container %s failed to run: %s", cs.Name, waiting.Message) |
| 511 | + } |
| 512 | + } |
| 513 | + |
| 514 | + // Check if container terminated with errors and has high restart count |
| 515 | + if cs.State.Terminated != nil && cs.State.Terminated.ExitCode != 0 { |
| 516 | + if cs.RestartCount > 3 { |
| 517 | + return false, fmt.Sprintf("container %s repeatedly crashing (exit code %d, %d restarts): %s", |
| 518 | + cs.Name, cs.State.Terminated.ExitCode, cs.RestartCount, cs.State.Terminated.Reason) |
| 519 | + } |
| 520 | + } |
469 | 521 | } |
470 | 522 |
471 | | - // Now get the pod name that is currently running |
472 | | - var ( |
473 | | - pods corev1.PodList |
474 | | - runningPodCount int |
475 | | - podName string |
476 | | - ) |
477 | | - if err = k.client.List(ctx, &pods, &kclient.ListOptions{ |
478 | | - Namespace: k.mcpNamespace, |
479 | | - LabelSelector: labels.SelectorFromSet(map[string]string{ |
480 | | - "app": id, |
481 | | - }), |
482 | | - }); err != nil { |
483 | | - return "", fmt.Errorf("failed to list MCP pods: %w", err) |
| 523 | + // Check if pod is being evicted |
| 524 | + if pod.Status.Reason == "Evicted" { |
| 525 | + return false, fmt.Sprintf("pod was evicted: %s", pod.Status.Message) |
484 | 526 | } |
485 | 527 |
486 | | - for _, p := range pods.Items { |
487 | | - if p.Status.Phase == corev1.PodRunning { |
488 | | - podName = p.Name |
489 | | - runningPodCount++ |
| 528 | + // Default: pod is in Pending or Running but not ready yet - should retry |
| 529 | + return true, fmt.Sprintf("pod in phase %s, waiting for containers to be ready", pod.Status.Phase) |
| 530 | +} |
| 531 | + |
| 532 | +func (k *kubernetesBackend) updatedMCPPodName(ctx context.Context, url, id string, server ServerConfig) (string, error) { |
| 533 | + const maxRetries = 5 |
| 534 | + var lastReason string |
| 535 | + |
| 536 | + // Retry loop with smart pod status checking |
| 537 | + for attempt := 0; attempt <= maxRetries; attempt++ { |
| 538 | + // Wait for the deployment to be updated. |
| 539 | + _, err := wait.For(ctx, k.client, &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{Name: id, Namespace: k.mcpNamespace}}, func(dep *appsv1.Deployment) (bool, error) { |
| 540 | + return dep.Generation == dep.Status.ObservedGeneration && dep.Status.Replicas == 1 && dep.Status.UpdatedReplicas == 1 && dep.Status.ReadyReplicas == 1 && dep.Status.AvailableReplicas == 1, nil |
| 541 | + }, wait.Option{Timeout: time.Minute}) |
| 542 | + |
| 543 | + if err == nil { |
| 544 | + // The deployment is ready; now ensure the MCP server itself is ready.
| 545 | + if err = ensureServerReady(ctx, url, server); err != nil { |
| 546 | + return "", fmt.Errorf("failed to ensure MCP server is ready: %w", err) |
| 547 | + } |
| 548 | + |
| 549 | + // Now get the pod name that is currently running |
| 550 | + var ( |
| 551 | + pods corev1.PodList |
| 552 | + runningPodCount int |
| 553 | + podName string |
| 554 | + ) |
| 555 | + if err = k.client.List(ctx, &pods, &kclient.ListOptions{ |
| 556 | + Namespace: k.mcpNamespace, |
| 557 | + LabelSelector: labels.SelectorFromSet(map[string]string{ |
| 558 | + "app": id, |
| 559 | + }), |
| 560 | + }); err != nil { |
| 561 | + return "", fmt.Errorf("failed to list MCP pods: %w", err) |
| 562 | + } |
| 563 | + |
| 564 | + for _, p := range pods.Items { |
| 565 | + if p.Status.Phase == corev1.PodRunning { |
| 566 | + podName = p.Name |
| 567 | + runningPodCount++ |
| 568 | + } |
| 569 | + } |
| 570 | + if runningPodCount == 1 { |
| 571 | + return podName, nil |
| 572 | + } |
| 573 | + |
| 574 | + return "", ErrHealthCheckTimeout |
490 | 575 | } |
491 | | - } |
492 | | - if runningPodCount == 1 { |
493 | | - return podName, nil |
| 576 | + |
| 577 | + // Deployment wait timed out, check pod status to decide if we should retry |
| 578 | + var pods corev1.PodList |
| 579 | + if listErr := k.client.List(ctx, &pods, &kclient.ListOptions{ |
| 580 | + Namespace: k.mcpNamespace, |
| 581 | + LabelSelector: labels.SelectorFromSet(map[string]string{ |
| 582 | + "app": id, |
| 583 | + }), |
| 584 | + }); listErr != nil { |
| 585 | + slog.Error("failed to list MCP pods for status check", "id", id, "error", listErr) |
| 586 | + return "", fmt.Errorf("failed to list MCP pods: %w", listErr) |
| 587 | + } |
| 588 | + |
| 589 | + if len(pods.Items) == 0 { |
| 590 | + slog.Warn("no pods found for MCP server", "id", id, "attempt", attempt+1) |
| 591 | + lastReason = "no pods found" |
| 592 | + if attempt < maxRetries { |
| 593 | + continue |
| 594 | + } |
| 595 | + return "", fmt.Errorf("%w: %s", ErrHealthCheckTimeout, lastReason) |
| 596 | + } |
| 597 | + |
| 598 | + // Analyze the first pod (there should only be one for a deployment with replicas=1) |
| 599 | + pod := &pods.Items[0] |
| 600 | + shouldRetry, reason := analyzePodStatus(pod) |
| 601 | + lastReason = reason |
| 602 | + |
| 603 | + if !shouldRetry { |
| 604 | + // Permanent failure - return appropriate error based on reason |
| 605 | + slog.Error("pod in non-retryable state", "id", id, "reason", reason, "attempt", attempt+1) |
| 606 | + if strings.Contains(reason, "CrashLoopBackOff") { |
| 607 | + return "", fmt.Errorf("%w: %s", ErrPodCrashLoopBackOff, reason) |
| 608 | + } else if strings.Contains(reason, "ImagePull") || strings.Contains(reason, "InvalidImageName") { |
| 609 | + return "", fmt.Errorf("%w: %s", ErrImagePullFailed, reason) |
| 610 | + } else if strings.Contains(reason, "unschedulable") || strings.Contains(reason, "Evicted") { |
| 611 | + return "", fmt.Errorf("%w: %s", ErrPodSchedulingFailed, reason) |
| 612 | + } else if strings.Contains(reason, "CreateContainer") || strings.Contains(reason, "RunContainer") { |
| 613 | + return "", fmt.Errorf("%w: %s", ErrPodConfigurationFailed, reason) |
| 614 | + } |
| 615 | + return "", fmt.Errorf("%w: %s", ErrHealthCheckTimeout, reason) |
| 616 | + } |
| 617 | + |
| 618 | + // Transient issue - retry if we haven't exceeded max retries |
| 619 | + if attempt < maxRetries { |
| 620 | + slog.Info("pod not ready, will retry", "id", id, "reason", reason, "attempt", attempt+1, "maxRetries", maxRetries) |
| 621 | + continue |
| 622 | + } |
| 623 | + |
| 624 | + // Exceeded max retries |
| 625 | + slog.Error("exceeded max retries waiting for pod", "id", id, "lastReason", lastReason, "attempts", attempt+1) |
| 626 | + return "", fmt.Errorf("%w after %d retries: %s", ErrHealthCheckTimeout, maxRetries+1, lastReason) |
494 | 627 | } |
495 | 628 |
496 | | - return "", ErrHealthCheckTimeout |
| 629 | + return "", fmt.Errorf("%w: %s", ErrHealthCheckTimeout, lastReason) |
497 | 630 | } |
498 | 631 |
499 | 632 | func (k *kubernetesBackend) restartServer(ctx context.Context, id string) error { |
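Note on the wrapped sentinel errors: ErrPodCrashLoopBackOff, ErrImagePullFailed, ErrPodSchedulingFailed, and ErrPodConfigurationFailed are referenced by the retry loop but defined outside this hunk. A minimal sketch of how such sentinels are typically declared, assuming plain errors.New values (the package name and messages below are placeholders, not the ones in the PR):

package mcp // hypothetical package name; not shown in this hunk

import "errors"

// Assumed shape of the sentinel errors wrapped with %w above; their real
// definitions live elsewhere in this PR.
var (
    ErrPodCrashLoopBackOff    = errors.New("pod is crash looping")
    ErrImagePullFailed        = errors.New("failed to pull container image")
    ErrPodSchedulingFailed    = errors.New("pod could not be scheduled")
    ErrPodConfigurationFailed = errors.New("pod configuration is invalid")
)

Because updatedMCPPodName wraps these with %w, callers can classify a failure with errors.Is(err, ErrImagePullFailed) and surface an image-specific message instead of a generic health-check timeout.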
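To make the retry/fail split in analyzePodStatus concrete, here is a minimal test sketch; the pod fixture helper and the package clause are hypothetical, and only the analyzePodStatus signature comes from the diff:

package mcp // hypothetical; use the package that defines analyzePodStatus

import (
    "testing"

    corev1 "k8s.io/api/core/v1"
)

// waitingPod builds a pending pod whose single container is waiting for the given reason.
func waitingPod(reason string) *corev1.Pod {
    return &corev1.Pod{
        Status: corev1.PodStatus{
            Phase: corev1.PodPending,
            ContainerStatuses: []corev1.ContainerStatus{{
                Name:  "mcp",
                State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: reason}},
            }},
        },
    }
}

func TestAnalyzePodStatus(t *testing.T) {
    // CrashLoopBackOff is treated as terminal: no retry.
    if retry, reason := analyzePodStatus(waitingPod("CrashLoopBackOff")); retry {
        t.Fatalf("expected terminal failure, got retry=true (%s)", reason)
    }
    // ContainerCreating is transient: the caller keeps waiting.
    if retry, reason := analyzePodStatus(waitingPod("ContainerCreating")); !retry {
        t.Fatalf("expected retryable state, got retry=false (%s)", reason)
    }
}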