Skip to content

Commit 81640fd

Browse files
committed
Migrate from ClusterPolicy to NVIDIADriver owned driver daemonsets
Signed-off-by: Christopher Desiniotis <[email protected]>
1 parent fdac9eb commit 81640fd

File tree

2 files changed

+8
-4
lines changed

2 files changed

+8
-4
lines changed

controllers/object_controls.go

+2 -2
Original file line number | Diff line number | Diff line change
@@ -3745,7 +3745,7 @@ func ocpHasDriverToolkitImageStream(n *ClusterPolicyController) (bool, error) {
37453745
return true, nil
37463746
}
37473747

3748-
func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) error {
3748+
func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context, deleteOptions *client.DeleteOptions) error {
37493749
// Get all DaemonSets owned by ClusterPolicy
37503750
//
37513751
// (cdesiniotis) There is a limitation with the controller-runtime client where only a single field selector
@@ -3762,7 +3762,7 @@ func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context)
37623762
// filter out DaemonSets which are not the NVIDIA driver/vgpu-manager
37633763
if strings.HasPrefix(ds.Name, commonDriverDaemonsetName) || strings.HasPrefix(ds.Name, commonVGPUManagerDaemonsetName) {
37643764
n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy", "Name", ds.Name)
3765-
err = n.client.Delete(ctx, &ds)
3765+
err = n.client.Delete(ctx, &ds, deleteOptions)
37663766
if err != nil {
37673767
return fmt.Errorf("error deleting NVIDIA driver daemonset: %w", err)
37683768
}

controllers/state_manager.go

+6 -2
Original file line number | Diff line number | Diff line change
@@ -952,8 +952,12 @@ func (n *ClusterPolicyController) step() (gpuv1.State, error) {
952952
n.singleton.Spec.Driver.UseNvdiaDriverCRDType() {
953953
n.logger.Info("NVIDIADriver CRD is enabled, cleaning up all NVIDIA driver daemonsets owned by ClusterPolicy")
954954
n.idx++
955-
// Cleanup all driver daemonsets owned by ClusterPolicy.
956-
err := n.cleanupAllDriverDaemonSets(n.ctx)
955+
// Cleanup all driver daemonsets owned by ClusterPolicy, but orphan the dependent pod objects.
956+
// This way, switching to the new NVIDIADriver API does not cause a cluster-wide disruption.
957+
// NVIDIA driver pods owned by ClusterPolicy daemonsets will remain running until the NVIDIADriver
958+
// controller migrates these pods to new ones owned by NVIDIADriver daemonsets.
959+
deletePropagationOrphan := metav1.DeletePropagationOrphan
960+
err := n.cleanupAllDriverDaemonSets(n.ctx, &client.DeleteOptions{PropagationPolicy: &deletePropagationOrphan})
957961
if err != nil {
958962
return gpuv1.NotReady, fmt.Errorf("failed to cleanup all NVIDIA driver daemonsets owned by ClusterPolicy: %w", err)
959963
}

0 commit comments

Comments
 (0)