From 00587fb38676041e92537c44ba8431cee6e8ca84 Mon Sep 17 00:00:00 2001 From: Sebastian Sch Date: Thu, 30 Jan 2025 14:08:48 +0200 Subject: [PATCH] Fix comments DNM before squash Signed-off-by: Sebastian Sch --- .github/workflows/test.yml | 9 +- cmd/sriov-network-config-daemon/start.go | 11 +- controllers/drain_controller.go | 2 +- controllers/drain_controller_helper.go | 8 +- .../sriovnetworknodepolicy_controller.go | 4 +- pkg/consts/constants.go | 5 +- pkg/daemon/config.go | 16 +- pkg/daemon/config_test.go | 4 +- pkg/daemon/daemon.go | 151 ++++++++++-------- pkg/daemon/daemon_test.go | 39 +++-- pkg/daemon/status.go | 30 ++-- pkg/systemd/systemd.go | 26 ++- pkg/utils/shutdown.go | 2 +- 13 files changed, 189 insertions(+), 118 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4c4928ab7..b73db9213 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,8 +40,11 @@ jobs: - name: Check out code into the Go module directory uses: actions/checkout@v2 - - name: test pkg - run: make test-pkg + - name: test pkg on kubernetes + run: CLUSTER_TYPE=kubernetes make test-pkg + + - name: test pkg on openshift + run: CLUSTER_TYPE=openshift make test-pkg - name: test cmd run: make test-cmd @@ -49,7 +52,7 @@ jobs: - name: test api run: make test-api - - name: test controllers on opensfhit + - name: test controllers on openshift run: CLUSTER_TYPE=openshift make test-controllers - name: test controllers on kubernetes diff --git a/cmd/sriov-network-config-daemon/start.go b/cmd/sriov-network-config-daemon/start.go index 9b0846d34..1e82b3271 100644 --- a/cmd/sriov-network-config-daemon/start.go +++ b/cmd/sriov-network-config-daemon/start.go @@ -145,7 +145,7 @@ func configGlobalVariables() error { return nil } -func UseKubeletKubeConfig() { +func useKubeletKubeConfig() { fnLogger := log.Log.WithName("sriov-network-config-daemon") kubeconfig, err := clientcmd.LoadFromFile("/host/etc/kubernetes/kubeconfig") @@ -174,7 +174,6 @@ func 
UseKubeletKubeConfig() { } func getOperatorConfig(kClient runtimeclient.Client) (*sriovnetworkv1.SriovOperatorConfig, error) { - // Init feature gates once to prevent race conditions. defaultConfig := &sriovnetworkv1.SriovOperatorConfig{} err := kClient.Get(context.Background(), types.NamespacedName{Namespace: vars.Namespace, Name: consts.DefaultConfigName}, defaultConfig) if err != nil { @@ -228,7 +227,7 @@ func runStartCmd(cmd *cobra.Command, args []string) error { // On openshift we use the kubeconfig from kubelet on the node where the daemon is running // this allow us to improve security as every daemon has access only to its own node if vars.ClusterType == consts.ClusterTypeOpenshift { - UseKubeletKubeConfig() + useKubeletKubeConfig() } kubeconfig := os.Getenv("KUBECONFIG") @@ -343,19 +342,19 @@ func runStartCmd(cmd *cobra.Command, args []string) error { startOpts.disabledPlugins) // Init Daemon configuration on the node - if err = dm.DaemonInitialization(); err != nil { + if err = dm.Init(); err != nil { setupLog.Error(err, "unable to initialize daemon") os.Exit(1) } // Setup reconcile loop with manager if err = dm.SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create setup daemon manager for SriovNetworkNodeState") + setupLog.Error(err, "unable to setup daemon with manager for SriovNetworkNodeState") os.Exit(1) } // Setup reconcile loop with manager - if err = daemon.NewOperatorConfigReconcile(kClient).SetupWithManager(mgr); err != nil { + if err = daemon.NewOperatorConfigNodeReconcile(kClient).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create setup daemon manager for OperatorConfig") os.Exit(1) } diff --git a/controllers/drain_controller.go b/controllers/drain_controller.go index 796e44dda..d8dec59d8 100644 --- a/controllers/drain_controller.go +++ b/controllers/drain_controller.go @@ -108,7 +108,7 @@ func (dr *DrainReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctrl // create the drain state 
annotation if it doesn't exist in the sriovNetworkNodeState object nodeStateDrainAnnotationCurrent, currentNodeStateExist, err := dr.ensureAnnotationExists(ctx, nodeNetworkState, constants.NodeStateDrainAnnotationCurrent) if err != nil { - reqLogger.Error(err, "failed to ensure nodeStateDrainAnnotation") + reqLogger.Error(err, "failed to ensure nodeStateDrainAnnotationCurrent") return ctrl.Result{}, err } _, desireNodeStateExist, err := dr.ensureAnnotationExists(ctx, nodeNetworkState, constants.NodeStateDrainAnnotation) diff --git a/controllers/drain_controller_helper.go b/controllers/drain_controller_helper.go index c9e6bf550..aa12a0019 100644 --- a/controllers/drain_controller_helper.go +++ b/controllers/drain_controller_helper.go @@ -3,10 +3,8 @@ package controllers import ( "context" "fmt" - "time" "github.com/go-logr/logr" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -44,7 +42,7 @@ func (dr *DrainReconcile) handleNodeIdleNodeStateDrainingOrCompleted(ctx context "DrainController", "node complete drain was not completed") // TODO: make this time configurable - return reconcile.Result{RequeueAfter: 5 * time.Second}, nil + return reconcile.Result{RequeueAfter: constants.DrainControllerRequeueTime}, nil } // move the node state back to idle @@ -106,7 +104,7 @@ func (dr *DrainReconcile) handleNodeDrainOrReboot(ctx context.Context, corev1.EventTypeWarning, "DrainController", "node drain operation was not completed") - return reconcile.Result{RequeueAfter: 5 * time.Second}, nil + return reconcile.Result{RequeueAfter: constants.DrainControllerRequeueTime}, nil } // if we manage to drain we label the node state with drain completed and finish @@ -180,7 +178,7 @@ func (dr *DrainReconcile) tryDrainNode(ctx context.Context, node *corev1.Node) ( // the node requested to be drained, but we are at the limit so we re-enqueue the request reqLogger.Info("MaxParallelNodeConfiguration limit reached for draining 
nodes re-enqueue the request") // TODO: make this time configurable - return &reconcile.Result{RequeueAfter: 5 * time.Second}, nil + return &reconcile.Result{RequeueAfter: constants.DrainControllerRequeueTime}, nil } if currentSnns == nil { diff --git a/controllers/sriovnetworknodepolicy_controller.go b/controllers/sriovnetworknodepolicy_controller.go index f8811ed97..0b334a94b 100644 --- a/controllers/sriovnetworknodepolicy_controller.go +++ b/controllers/sriovnetworknodepolicy_controller.go @@ -88,7 +88,7 @@ func (r *SriovNetworkNodePolicyReconciler) Reconcile(ctx context.Context, req ct if err := r.Get(ctx, types.NamespacedName{Namespace: vars.Namespace, Name: constants.DefaultConfigName}, defaultOpConf); err != nil { if errors.IsNotFound(err) { reqLogger.Info("default SriovOperatorConfig object not found, cannot reconcile SriovNetworkNodePolicies. Requeue.") - return reconcile.Result{RequeueAfter: 5 * time.Second}, nil + return reconcile.Result{RequeueAfter: constants.DrainControllerRequeueTime}, nil } return reconcile.Result{}, err } @@ -226,7 +226,7 @@ func (r *SriovNetworkNodePolicyReconciler) syncDevicePluginConfigMap(ctx context } configData[node.Name] = string(config) - if data.ResourceList == nil || len(data.ResourceList) == 0 { + if len(data.ResourceList) == 0 { // if we don't have policies we should add the disabled label for the device plugin err = utils.LabelNode(ctx, node.Name, constants.SriovDevicePluginLabel, constants.SriovDevicePluginLabelDisabled, r.Client) if err != nil { diff --git a/pkg/consts/constants.go b/pkg/consts/constants.go index 6aadef648..2c069c81e 100644 --- a/pkg/consts/constants.go +++ b/pkg/consts/constants.go @@ -14,7 +14,10 @@ const ( Chroot = "/host" Host = "/host" - ResyncPeriod = 5 * time.Minute + ResyncPeriod = 5 * time.Minute + DaemonRequeueTime = 30 * time.Second + DrainControllerRequeueTime = 5 * time.Second + DefaultConfigName = "default" ConfigDaemonPath = "./bindata/manifests/daemon" InjectorWebHookPath = 
"./bindata/manifests/webhook" diff --git a/pkg/daemon/config.go b/pkg/daemon/config.go index ddeba84db..bf05b3873 100644 --- a/pkg/daemon/config.go +++ b/pkg/daemon/config.go @@ -14,16 +14,19 @@ import ( "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars" ) -type OperatorConfigReconcile struct { +// OperatorConfigNodeReconcile represents the reconcile struct for the OperatorConfig. +type OperatorConfigNodeReconcile struct { client client.Client latestFeatureGates map[string]bool } -func NewOperatorConfigReconcile(client client.Client) *OperatorConfigReconcile { - return &OperatorConfigReconcile{client: client, latestFeatureGates: make(map[string]bool)} +// NewOperatorConfigNodeReconcile creates a new instance of OperatorConfigNodeReconcile with the given client. +func NewOperatorConfigNodeReconcile(client client.Client) *OperatorConfigNodeReconcile { + return &OperatorConfigNodeReconcile{client: client, latestFeatureGates: make(map[string]bool)} } -func (oc *OperatorConfigReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +// Reconcile reconciles the OperatorConfig resource. It updates log level and feature gates as necessary. 
+func (oc *OperatorConfigNodeReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { reqLogger := log.FromContext(ctx).WithName("Reconcile") operatorConfig := &sriovnetworkv1.SriovOperatorConfig{} err := oc.client.Get(ctx, client.ObjectKey{Namespace: req.Namespace, Name: req.Name}, operatorConfig) @@ -32,7 +35,7 @@ func (oc *OperatorConfigReconcile) Reconcile(ctx context.Context, req ctrl.Reque reqLogger.Info("OperatorConfig doesn't exist", "name", req.Name, "namespace", req.Namespace) return ctrl.Result{}, nil } - reqLogger.Error(err, "Failed to operatorConfig", "name", req.Name, "namespace", req.Namespace) + reqLogger.Error(err, "Failed to get OperatorConfig", "name", req.Name, "namespace", req.Namespace) return ctrl.Result{}, err } @@ -54,7 +57,8 @@ func (oc *OperatorConfigReconcile) Reconcile(ctx context.Context, req ctrl.Reque return ctrl.Result{}, nil } -func (oc *OperatorConfigReconcile) SetupWithManager(mgr ctrl.Manager) error { +// SetupWithManager sets up the reconciliation logic for this controller using the given manager. +func (oc *OperatorConfigNodeReconcile) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). For(&sriovnetworkv1.SriovOperatorConfig{}). 
Complete(oc) diff --git a/pkg/daemon/config_test.go b/pkg/daemon/config_test.go index ad84cd5bf..c6095ab91 100644 --- a/pkg/daemon/config_test.go +++ b/pkg/daemon/config_test.go @@ -31,7 +31,7 @@ var _ = Describe("Daemon OperatorConfig Controller", Ordered, func() { }) Expect(err).ToNot(HaveOccurred()) - configController := daemon.NewOperatorConfigReconcile(k8sClient) + configController := daemon.NewOperatorConfigNodeReconcile(k8sClient) err = configController.SetupWithManager(k8sManager) Expect(err).ToNot(HaveOccurred()) @@ -57,7 +57,7 @@ var _ = Describe("Daemon OperatorConfig Controller", Ordered, func() { Expect(err).ToNot(HaveOccurred()) }) - BeforeEach(func() { + AfterEach(func() { Expect(k8sClient.DeleteAllOf(context.Background(), &sriovnetworkv1.SriovOperatorConfig{}, client.InNamespace(testNamespace))).ToNot(HaveOccurred()) }) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 422621e00..ae0ea29d0 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -11,6 +11,7 @@ import ( "k8s.io/client-go/kubernetes" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/predicate" @@ -26,7 +27,10 @@ import ( "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/vars" ) -type DaemonReconcile struct { +// NodeReconciler struct holds various components necessary for reconciling an SR-IOV node. +// It includes a Kubernetes client, SR-IOV client, and other utility interfaces. +// The struct is designed to manage the lifecycle of an SR-IOV devices on a given node. +type NodeReconciler struct { client client.Client sriovClient snclientset.Interface @@ -49,6 +53,7 @@ type DaemonReconcile struct { disableDrain bool } +// New creates a new instance of NodeReconciler. 
func New( client client.Client, sriovClient snclientset.Interface, @@ -58,8 +63,8 @@ func New( er *EventRecorder, featureGates featuregate.FeatureGate, disabledPlugins []string, -) *DaemonReconcile { - return &DaemonReconcile{ +) *NodeReconciler { + return &NodeReconciler{ client: client, sriovClient: sriovClient, kubeClient: kubeClient, @@ -73,8 +78,10 @@ func New( } } -func (dn *DaemonReconcile) DaemonInitialization() error { - funcLog := log.Log.WithName("DaemonInitialization") +// Init initializes the Sriov Network Operator daemon. +// It enables kernel modules, prepares udev rules, and loads the host network state. +func (dn *NodeReconciler) Init() error { + funcLog := log.Log.WithName("Init") var err error if !vars.UsingSystemdMode { @@ -100,10 +107,9 @@ func (dn *DaemonReconcile) DaemonInitialization() error { funcLog.Error(err, "failed to prepare udev files to rename VF representors for requested VFs") } - ns := &sriovnetworkv1.SriovNetworkNodeState{} // init openstack info if vars.PlatformType == consts.VirtualOpenStack { - ns, err = dn.HostHelpers.GetCheckPointNodeState() + ns, err := dn.HostHelpers.GetCheckPointNodeState() if err != nil { return err } @@ -119,7 +125,8 @@ func (dn *DaemonReconcile) DaemonInitialization() error { } // get interfaces - err = dn.getHostNetworkStatus(ns) + ns := &sriovnetworkv1.SriovNetworkNodeState{} + err = dn.updateStatusFromHost(ns) if err != nil { funcLog.Error(err, "failed to get host network status on init") return err @@ -128,7 +135,7 @@ func (dn *DaemonReconcile) DaemonInitialization() error { // init vendor plugins dn.loadedPlugins, err = loadPlugins(ns, dn.HostHelpers, dn.disabledPlugins) if err != nil { - funcLog.Error(err, "failed to enable vendor plugins") + funcLog.Error(err, "failed to load vendor plugins") return err } @@ -140,7 +147,21 @@ func (dn *DaemonReconcile) DaemonInitialization() error { return nil } -func (dn *DaemonReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +// 
Reconcile reconciles the nodeState object by performing the following steps: +// 1. Retrieves the latest NodeState from the API server. +// 2. Checks if the object has the required drain controller annotations for the current generation. +// 3. Updates the nodeState Status object with the existing network state (interfaces, bridges, and RDMA status). +// 4. If running in systemd mode, checks the sriov result from the config-daemon that runs in systemd. +// 5. Compares the latest generation with the last applied generation to determine if a refresh on NICs is needed. +// 6. Checks for drift between the host state and the nodeState status. +// 7. Updates the sync state of the nodeState object as per the current requirements. +// 8. Determines if a drain is required based on the current state of the nodeState. +// 9. Handles the drain if necessary, ensuring that it does not conflict with other drain requests. +// 10. Applies the changes to the nodeState if there are no issues and updates the sync status accordingly. +// 11. If a reboot is required after applying the changes, returns a result to trigger a reboot. +// +// Returns a Result indicating whether or not the controller should requeue the request for further processing. 
+func (dn *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { reqLogger := log.FromContext(ctx).WithName("Reconcile") // Get the latest NodeState desiredNodeState := &sriovnetworkv1.SriovNetworkNodeState{} @@ -167,14 +188,14 @@ func (dn *DaemonReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctr reqLogger.V(0).Info("new generation", "generation", latest) // Update the nodeState Status object with the existing network state (interfaces bridges and rdma status) - err = dn.getHostNetworkStatus(desiredNodeState) + err = dn.updateStatusFromHost(desiredNodeState) if err != nil { reqLogger.Error(err, "failed to get host network status") return ctrl.Result{}, err } // if we are running in systemd mode we want to get the sriov result from the config-daemon that runs in systemd - sriovResult, exist, err := dn.checkSystemdStatus(ctx, desiredNodeState) + sriovResult, sriovResultExists, err := dn.checkSystemdStatus() //TODO: in the case we need to think what to do if we try to apply again or not if err != nil { reqLogger.Error(err, "failed to check systemd status unexpected error") @@ -189,15 +210,10 @@ func (dn *DaemonReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctr return ctrl.Result{}, err } - // if there are no node drifted changes, and we are on the latest applied policy + // if there are no host state drift changes, and we are on the latest applied policy // we check if we need to publish a new nodeState status if not we requeue if !isDrifted { - shouldUpdate, err := dn.shouldUpdateStatus(current, desiredNodeState) - if err != nil { - reqLogger.Error(err, "failed to check host state") - return ctrl.Result{}, err - } - + shouldUpdate := dn.shouldUpdateStatus(current, desiredNodeState) if shouldUpdate { reqLogger.Info("updating nodeState with new host status") err = dn.updateSyncState(ctx, desiredNodeState, desiredNodeState.Status.SyncStatus, desiredNodeState.Status.LastSyncError) @@ -207,7 +223,7 @@ func (dn 
*DaemonReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctr } } - return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + return ctrl.Result{RequeueAfter: consts.DaemonRequeueTime}, nil } } @@ -232,9 +248,9 @@ func (dn *DaemonReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctr reqLogger.Error(err, "failed to write systemd config file") return ctrl.Result{}, err } - reqDrain = reqDrain || systemdConfModified || !exist + reqDrain = reqDrain || systemdConfModified || !sriovResultExists // require reboot if drain needed for systemd mode - reqReboot = reqReboot || systemdConfModified || reqDrain || !exist + reqReboot = reqReboot || reqDrain } reqLogger.V(0).Info("aggregated daemon node state requirement", @@ -262,7 +278,10 @@ func (dn *DaemonReconcile) Reconcile(ctx context.Context, req ctrl.Request) (ctr return ctrl.Result{}, nil } -func (dn *DaemonReconcile) checkOnNodeStateChange(desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (bool, bool, error) { +// checkOnNodeStateChange checks the state change required for the node based on the desired SriovNetworkNodeState. +// The function iterates over all loaded plugins and calls their OnNodeStateChange method with the desired state. +// It returns two boolean values indicating whether a reboot or drain operation is required. +func (dn *NodeReconciler) checkOnNodeStateChange(desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (bool, bool, error) { funcLog := log.Log.WithName("checkOnNodeStateChange") reqReboot := false reqDrain := false @@ -285,7 +304,9 @@ func (dn *DaemonReconcile) checkOnNodeStateChange(desiredNodeState *sriovnetwork return reqReboot, reqDrain, nil } -func (dn *DaemonReconcile) checkSystemdStatus(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (*systemd.SriovResult, bool, error) { +// checkSystemdStatus Checks the status of systemd services on the host node. 
+// returns the sriovResult struct and a boolean indicating whether the result file exists on the node +func (dn *NodeReconciler) checkSystemdStatus() (*systemd.SriovResult, bool, error) { if !vars.UsingSystemdMode { return nil, false, nil } @@ -318,32 +339,18 @@ func (dn *DaemonReconcile) checkSystemdStatus(ctx context.Context, desiredNodeSt return nil, false, err } } - - //// only if something is not equal we apply if not we continue to check if something change on the node, - //// and we need to trigger a reconfiguration - //if desiredNodeState.Status.SyncStatus != sriovResult.SyncStatus || - // desiredNodeState.Status.LastSyncError != sriovResult.LastSyncError { - // err = dn.updateSyncState(ctx, desiredNodeState, sriovResult.SyncStatus, sriovResult.LastSyncError) - // if err != nil { - // funcLog.Error(err, "failed to update sync status") - // } - // return sriovResult, err - //} - - // TODO: check if we need this - //if sriovResult.LastSyncError != "" || sriovResult.SyncStatus == consts.SyncStatusFailed { - // funcLog.Info("sync failed systemd service error", "last-sync-error", sriovResult.LastSyncError) - // err = dn.updateSyncState(ctx, desiredNodeState, consts.SyncStatusFailed, sriovResult.LastSyncError) - // if err != nil { - // return nil, false, err - // } - // dn.lastAppliedGeneration = desiredNodeState.Generation - // return sriovResult, true, err - //} return sriovResult, exist, nil } -func (dn *DaemonReconcile) apply(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState, reqReboot bool, sriovResult *systemd.SriovResult) (ctrl.Result, error) { +// apply applies the desired state of the node by: +// 1. Applying vendor plugins that have been loaded. +// 2. Depending on whether a reboot is required or if the configuration is being done via systemd, it applies the generic or virtual plugin(s). +// 3. Rebooting the node if necessary and sending an event. +// 4. Restarting the device plugin pod on the node. +// 5. 
Requesting annotation updates for draining the idle state of the node. +// 6. Synchronizing with the host network status and updating the sync status of the node in the nodeState object. +// 7. Updating the lastAppliedGeneration to the current generation. +func (dn *NodeReconciler) apply(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState, reqReboot bool, sriovResult *systemd.SriovResult) (ctrl.Result, error) { reqLogger := log.FromContext(ctx).WithName("Apply") // apply the vendor plugins after we are done with drain if needed for k, p := range dn.loadedPlugins { @@ -394,7 +401,7 @@ func (dn *DaemonReconcile) apply(ctx context.Context, desiredNodeState *sriovnet return ctrl.Result{}, err } - _, err := dn.annotate(ctx, desiredNodeState, consts.DrainIdle) + err := dn.annotate(ctx, desiredNodeState, consts.DrainIdle) if err != nil { reqLogger.Error(err, "failed to request annotation update to idle") return ctrl.Result{}, err @@ -409,7 +416,7 @@ func (dn *DaemonReconcile) apply(ctx context.Context, desiredNodeState *sriovnet } // Update the nodeState Status object with the existing network interfaces - err = dn.getHostNetworkStatus(desiredNodeState) + err = dn.updateStatusFromHost(desiredNodeState) if err != nil { reqLogger.Error(err, "failed to get host network status") return ctrl.Result{}, err @@ -423,12 +430,12 @@ func (dn *DaemonReconcile) apply(ctx context.Context, desiredNodeState *sriovnet // update the lastAppliedGeneration dn.lastAppliedGeneration = desiredNodeState.Generation - return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + return ctrl.Result{RequeueAfter: consts.DaemonRequeueTime}, nil } // checkHostStateDrift returns true if the node state drifted from the nodeState policy // Check if there is a change in the host network interfaces that require a reconfiguration by the daemon -func (dn *DaemonReconcile) checkHostStateDrift(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) { +func 
(dn *NodeReconciler) checkHostStateDrift(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) { funcLog := log.Log.WithName("checkHostStateDrift()") // Skip when SriovNetworkNodeState object has just been created. @@ -467,7 +474,13 @@ func (dn *DaemonReconcile) checkHostStateDrift(ctx context.Context, desiredNodeS return false, nil } -func (dn *DaemonReconcile) writeSystemdConfigFile(desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) { +// writeSystemdConfigFile writes the systemd configuration file for the node +// and handles any necessary actions such as removing an existing result file and writing supported NIC IDs. +// +// The function first attempts to write the systemd configuration file based on the desired node state. +// If successful, it checks if the configuration file was modified. If so, it removes the existing result file (if present) to ensure that outdated results are not used. +// After writing the configuration file and potentially removing the old one, it writes a file containing supported NIC IDs. 
+func (dn *NodeReconciler) writeSystemdConfigFile(desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) { funcLog := log.Log.WithName("writeSystemdConfigFile()") funcLog.V(0).Info("writing systemd config file to host") systemdConfModified, err := systemd.WriteConfFile(desiredNodeState) @@ -498,7 +511,7 @@ func (dn *DaemonReconcile) writeSystemdConfigFile(desiredNodeState *sriovnetwork // handleDrain: adds the right annotation to the node and nodeState object // returns true if we need to finish the reconcile loop and wait for a new object -func (dn *DaemonReconcile) handleDrain(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState, reqReboot bool) (bool, error) { +func (dn *NodeReconciler) handleDrain(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState, reqReboot bool) (bool, error) { funcLog := log.Log.WithName("handleDrain") // done with the drain we can continue with the configuration if utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.DrainComplete) { @@ -523,10 +536,13 @@ func (dn *DaemonReconcile) handleDrain(ctx context.Context, desiredNodeState *sr if reqReboot { annotation = consts.RebootRequired } - return dn.annotate(ctx, desiredNodeState, annotation) + return true, dn.annotate(ctx, desiredNodeState, annotation) } -func (dn *DaemonReconcile) restartDevicePluginPod(ctx context.Context) error { +// restartDevicePluginPod restarts the device plugin pod on the specified node. +// +// The function checks if the pod exists, deletes it if found, and waits for it to be deleted successfully. 
+func (dn *NodeReconciler) restartDevicePluginPod(ctx context.Context) error { log.Log.V(2).Info("restartDevicePluginPod(): try to restart device plugin pod") pods, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{ @@ -583,7 +599,8 @@ func (dn *DaemonReconcile) restartDevicePluginPod(ctx context.Context) error { return nil } -func (dn *DaemonReconcile) rebootNode() error { +// rebootNode reboots the node by executing a systemd-run command +func (dn *NodeReconciler) rebootNode() error { funcLog := log.Log.WithName("rebootNode") funcLog.Info("trigger node reboot") exit, err := dn.HostHelpers.Chroot(consts.Host) @@ -609,7 +626,9 @@ func (dn *DaemonReconcile) rebootNode() error { return nil } -func (dn *DaemonReconcile) prepareNMUdevRule() error { +// prepareNMUdevRule prepares/validates the status of the config-daemon custom udev rules needed to control +// the virtual functions by the operator only. +func (dn *NodeReconciler) prepareNMUdevRule() error { // we need to remove the Red Hat Virtio network device from the udev rule configuration // if we don't remove it when running the config-daemon on a virtual node it will disconnect the node after a reboot // even that the operator should not be installed on virtual environments that are not openstack @@ -626,7 +645,7 @@ func (dn *DaemonReconcile) prepareNMUdevRule() error { } // isDrainCompleted returns true if the current-state annotation is drain completed -func (dn *DaemonReconcile) isDrainCompleted(reqDrain bool, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) bool { +func (dn *NodeReconciler) isDrainCompleted(reqDrain bool, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) bool { // if we need to drain check the drain status if reqDrain { return utils.ObjectHasAnnotation(desiredNodeState, consts.NodeStateDrainAnnotationCurrent, consts.DrainComplete) } @@ -641,35 +660,37 @@ func (dn *DaemonReconcile) isDrainCompleted(reqDrain bool, desiredNodeSta
return true } -func (dn *DaemonReconcile) annotate( +// annotate annotates the nodeState object with specified annotation. +func (dn *NodeReconciler) annotate( ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState, - annotationState string) (bool, error) { + annotationState string) error { funcLog := log.Log.WithName("annotate") funcLog.Info(fmt.Sprintf("apply '%s' annotation for node", annotationState)) err := utils.AnnotateNode(ctx, desiredNodeState.Name, consts.NodeDrainAnnotation, annotationState, dn.client) if err != nil { log.Log.Error(err, "Failed to annotate node") - return false, err + return err } funcLog.Info(fmt.Sprintf("apply '%s' annotation for nodeState", annotationState)) if err := utils.AnnotateObject(context.Background(), desiredNodeState, consts.NodeStateDrainAnnotation, annotationState, dn.client); err != nil { - return false, err + return err } // the node was annotated we need to wait for the operator to finish the drain - return true, nil + return nil } // SetupWithManager sets up the controller with the Manager. -func (dn *DaemonReconcile) SetupWithManager(mgr ctrl.Manager) error { +func (dn *NodeReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). For(&sriovnetworkv1.SriovNetworkNodeState{}). WithEventFilter(predicate.Or(predicate.AnnotationChangedPredicate{}, predicate.GenerationChangedPredicate{})). + WithOptions(controller.Options{MaxConcurrentReconciles: 1}). 
Complete(dn) } @@ -677,6 +698,6 @@ func (dn *DaemonReconcile) SetupWithManager(mgr ctrl.Manager) error { // ---- unit tests helper function ----- // ------------------------------------- -func (dn *DaemonReconcile) GetLastAppliedGeneration() int64 { +func (dn *NodeReconciler) GetLastAppliedGeneration() int64 { return dn.lastAppliedGeneration } diff --git a/pkg/daemon/daemon_test.go b/pkg/daemon/daemon_test.go index 0792026c4..c8658ca40 100644 --- a/pkg/daemon/daemon_test.go +++ b/pkg/daemon/daemon_test.go @@ -2,6 +2,7 @@ package daemon_test import ( "context" + "os" "sync" "time" @@ -25,6 +26,7 @@ import ( "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/featuregate" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper" mock_helper "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/helper/mock" + hostTypes "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/host/types" snolog "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/log" "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms" mock_platforms "github.com/k8snetworkplumbingwg/sriov-network-operator/pkg/platforms/mock" @@ -39,7 +41,7 @@ var ( kubeclient *kubernetes.Clientset eventRecorder *daemon.EventRecorder wg sync.WaitGroup - startDaemon func(dc *daemon.DaemonReconcile) + startDaemon func(dc *daemon.NodeReconciler) t FullGinkgoTInterface mockCtrl *gomock.Controller @@ -56,7 +58,7 @@ var _ = Describe("Daemon Controller", Ordered, func() { BeforeAll(func() { ctx, cancel = context.WithCancel(context.Background()) wg = sync.WaitGroup{} - startDaemon = func(dc *daemon.DaemonReconcile) { + startDaemon = func(dc *daemon.NodeReconciler) { By("start controller manager") wg.Add(1) go func() { @@ -88,12 +90,15 @@ var _ = Describe("Daemon Controller", Ordered, func() { }) snolog.SetLogLevel(2) - vars.ClusterType = constants.ClusterTypeOpenshift + // Check if the environment variable CLUSTER_TYPE is set + if clusterType, ok := 
os.LookupEnv("CLUSTER_TYPE"); ok && clusterType == "openshift" { + vars.ClusterType = constants.ClusterTypeOpenshift + } else { + vars.ClusterType = constants.ClusterTypeKubernetes + } }) BeforeEach(func() { - Expect(k8sClient.DeleteAllOf(context.Background(), &sriovnetworkv1.SriovNetworkNodeState{}, client.InNamespace(testNamespace))).ToNot(HaveOccurred()) - mockCtrl = gomock.NewController(t) hostHelper = mock_helper.NewMockHostHelpersInterface(mockCtrl) platformHelper = mock_platforms.NewMockInterface(mockCtrl) @@ -113,12 +118,26 @@ var _ = Describe("Daemon Controller", Ordered, func() { }) AfterEach(func() { + Expect(k8sClient.DeleteAllOf(context.Background(), &sriovnetworkv1.SriovNetworkNodeState{}, client.InNamespace(testNamespace))).ToNot(HaveOccurred()) + By("Shutdown controller manager") cancel() wg.Wait() }) - Context("Config Daemon", func() { + AfterAll(func() { + Expect(k8sClient.DeleteAllOf(context.Background(), &sriovnetworkv1.SriovOperatorConfig{}, client.InNamespace(testNamespace))).ToNot(HaveOccurred()) + }) + + Context("Config Daemon generic flow", func() { + BeforeEach(func() { + // k8s plugin for k8s cluster type + if vars.ClusterType == constants.ClusterTypeKubernetes { + hostHelper.EXPECT().ReadServiceManifestFile(gomock.Any()).Return(&hostTypes.Service{Name: "test"}, nil).AnyTimes() + hostHelper.EXPECT().ReadServiceInjectionManifestFile(gomock.Any()).Return(&hostTypes.Service{Name: "test"}, nil).AnyTimes() + } + }) + It("Should expose nodeState Status section", func() { By("Init mock functions") afterConfig := false @@ -175,7 +194,7 @@ var _ = Describe("Daemon Controller", Ordered, func() { featureGates := featuregate.New() featureGates.Init(map[string]bool{}) - dc := CreateDaemon(hostHelper, platformHelper, featureGates, []string{}) + dc := createDaemon(hostHelper, platformHelper, featureGates, []string{}) startDaemon(dc) _, nodeState := createNode("node1") @@ -288,11 +307,11 @@ func createNode(nodeName string) (*corev1.Node, 
*sriovnetworkv1.SriovNetworkNode return &node, &nodeState } -func CreateDaemon( +func createDaemon( hostHelper helper.HostHelpersInterface, platformHelper platforms.Interface, featureGates featuregate.FeatureGate, - disablePlugins []string) *daemon.DaemonReconcile { + disablePlugins []string) *daemon.NodeReconciler { kClient, err := client.New( cfg, client.Options{ @@ -306,7 +325,7 @@ func CreateDaemon( Expect(err).ToNot(HaveOccurred()) configController := daemon.New(kClient, snclient, kubeclient, hostHelper, platformHelper, eventRecorder, featureGates, disablePlugins) - err = configController.DaemonInitialization() + err = configController.Init() Expect(err).ToNot(HaveOccurred()) err = configController.SetupWithManager(k8sManager) Expect(err).ToNot(HaveOccurred()) diff --git a/pkg/daemon/status.go b/pkg/daemon/status.go index a95198fdf..87ad03b00 100644 --- a/pkg/daemon/status.go +++ b/pkg/daemon/status.go @@ -18,7 +18,7 @@ const ( Unknown = "Unknown" ) -func (dn *DaemonReconcile) updateSyncState(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState, status, failedMessage string) error { +func (dn *NodeReconciler) updateSyncState(ctx context.Context, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState, status, failedMessage string) error { funcLog := log.Log.WithName("updateSyncState") currentNodeState := &sriovnetworkv1.SriovNetworkNodeState{} desiredNodeState.Status.SyncStatus = status @@ -57,20 +57,20 @@ func (dn *DaemonReconcile) updateSyncState(ctx context.Context, desiredNodeState return nil } -func (dn *DaemonReconcile) shouldUpdateStatus(current, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) { +func (dn *NodeReconciler) shouldUpdateStatus(current, desiredNodeState *sriovnetworkv1.SriovNetworkNodeState) bool { // check number of interfaces are equal if len(current.Status.Interfaces) != len(desiredNodeState.Status.Interfaces) { - return true, nil + return true } // check for bridges if 
!reflect.DeepEqual(current.Status.Bridges, desiredNodeState.Status.Bridges) { - return true, nil + return true } // check for system if !reflect.DeepEqual(current.Status.System, desiredNodeState.Status.System) { - return true, nil + return true } // check for interfaces @@ -81,33 +81,33 @@ func (dn *DaemonReconcile) shouldUpdateStatus(current, desiredNodeState *sriovne for idx := range d { // check if it's a new device if d[idx].PciAddress != c[idx].PciAddress { - return true, nil + return true } // remove all the vfs d[idx].VFs = nil c[idx].VFs = nil if !reflect.DeepEqual(d[idx], c[idx]) { - return true, nil + return true } } - return false, nil + return false } -func (dn *DaemonReconcile) getHostNetworkStatus(nodeState *sriovnetworkv1.SriovNetworkNodeState) error { - log.Log.WithName("GetHostNetworkStatus").Info("Getting host network status") - var iface []sriovnetworkv1.InterfaceExt +func (dn *NodeReconciler) updateStatusFromHost(nodeState *sriovnetworkv1.SriovNetworkNodeState) error { + log.Log.WithName("updateStatusFromHost").Info("Getting host network status") + var ifaces []sriovnetworkv1.InterfaceExt var bridges sriovnetworkv1.Bridges var err error if vars.PlatformType == consts.VirtualOpenStack { - iface, err = dn.platformHelpers.DiscoverSriovDevicesVirtual() + ifaces, err = dn.platformHelpers.DiscoverSriovDevicesVirtual() if err != nil { return err } } else { - iface, err = dn.HostHelpers.DiscoverSriovDevices(dn.HostHelpers) + ifaces, err = dn.HostHelpers.DiscoverSriovDevices(dn.HostHelpers) if err != nil { return err } @@ -119,13 +119,13 @@ func (dn *DaemonReconcile) getHostNetworkStatus(nodeState *sriovnetworkv1.SriovN } } - nodeState.Status.Interfaces = iface + nodeState.Status.Interfaces = ifaces nodeState.Status.Bridges = bridges nodeState.Status.System.RdmaMode, err = dn.HostHelpers.DiscoverRDMASubsystem() return err } -func (dn *DaemonReconcile) recordStatusChangeEvent(ctx context.Context, oldStatus, newStatus, lastError string) { +func (dn 
*NodeReconciler) recordStatusChangeEvent(ctx context.Context, oldStatus, newStatus, lastError string) { if oldStatus != newStatus { if oldStatus == "" { oldStatus = Unknown diff --git a/pkg/systemd/systemd.go b/pkg/systemd/systemd.go index b89312f3f..28ff7acc8 100644 --- a/pkg/systemd/systemd.go +++ b/pkg/systemd/systemd.go @@ -42,6 +42,7 @@ const ( // TODO: move this to the host interface also +// SriovConfig: Contains the information we saved on the host for the sriov-config service running on the host type SriovConfig struct { Spec sriovnetworkv1.SriovNetworkNodeStateSpec `yaml:"spec"` UnsupportedNics bool `yaml:"unsupportedNics"` @@ -50,11 +51,14 @@ type SriovConfig struct { OVSDBSocketPath string `yaml:"ovsdbSocketPath"` } +// SriovResult: Contains the result from the sriov-config service trying to apply the requested policies type SriovResult struct { SyncStatus string `yaml:"syncStatus"` LastSyncError string `yaml:"lastSyncError"` } +// ReadConfFile reads the SR-IOV config file from the host +// Unmarshal YAML content into SriovConfig object func ReadConfFile() (spec *SriovConfig, err error) { rawConfig, err := os.ReadFile(utils.GetHostExtensionPath(SriovSystemdConfigPath)) if err != nil { @@ -66,6 +70,9 @@ func ReadConfFile() (spec *SriovConfig, err error) { return spec, err } +// WriteConfFile generates or updates a SriovNetwork configuration file based on the provided state. +// It creates the necessary directory structure if the file doesn't exist, +// reads the existing content to check for changes, and writes new content only when needed. func WriteConfFile(newState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) { newFile := false sriovConfig := &SriovConfig{ @@ -147,6 +154,8 @@ func WriteConfFile(newState *sriovnetworkv1.SriovNetworkNodeState) (bool, error) return true, nil } +// WriteSriovResult writes SR-IOV results to the host. 
+// It creates the file if it doesn't exist func WriteSriovResult(result *SriovResult) error { _, err := os.Stat(utils.GetHostExtensionPath(SriovSystemdResultPath)) if err != nil { @@ -180,11 +189,14 @@ func WriteSriovResult(result *SriovResult) error { return nil } +// ReadSriovResult reads and parses the sriov result file from the host. +// The function first checks if the result file exists. If it doesn't, it returns nil with a success flag of false and no error. +// If the file exists, it reads its contents and attempts to unmarshal the YAML data into the SriovResult struct. func ReadSriovResult() (*SriovResult, bool, error) { _, err := os.Stat(utils.GetHostExtensionPath(SriovSystemdResultPath)) if err != nil { if os.IsNotExist(err) { - log.Log.V(2).Info("ReadSriovResult(): file does not exist, return empty result") + log.Log.V(2).Info("ReadSriovResult(): file does not exist") return nil, false, nil } else { log.Log.Error(err, "ReadSriovResult(): failed to check sriov result file", "path", utils.GetHostExtensionPath(SriovSystemdResultPath)) @@ -207,6 +219,7 @@ func ReadSriovResult() (*SriovResult, bool, error) { return result, true, err } +// RemoveSriovResult: Removes the Sriov result file from the host. func RemoveSriovResult() error { err := os.Remove(utils.GetHostExtensionPath(SriovSystemdResultPath)) if err != nil { @@ -221,6 +234,9 @@ func RemoveSriovResult() error { return nil } +// WriteSriovSupportedNics() creates or updates a file containing the list of supported SR-IOV NIC IDs +// If the file does not exist, it will create it +// It reads from sriovnetworkv1.NicIDMap to gather the list of NIC identifiers func WriteSriovSupportedNics() error { _, err := os.Stat(utils.GetHostExtensionPath(sriovSystemdSupportedNicPath)) if err != nil { @@ -253,6 +269,10 @@ func WriteSriovSupportedNics() error { return nil } +// ReadSriovSupportedNics reads the list of SR-IOV supported network interface cards (NICs) from the host. 
+// It returns a slice of strings where each string represents a line from the file, +// with each line corresponding to an SR-IOV supported NIC. If the file does not exist, it returns nil and an error. +// If there is an error reading the file, it returns the error along with the file path for debugging purposes. func ReadSriovSupportedNics() ([]string, error) { _, err := os.Stat(utils.GetHostExtensionPath(sriovSystemdSupportedNicPath)) if err != nil { @@ -275,6 +295,10 @@ func ReadSriovSupportedNics() ([]string, error) { return lines, nil } +// CleanSriovFilesFromHost removes SR-IOV related configuration and service files from the host system. +// It deletes several systemd-related files including configuration paths, result paths, supported NICs path, +// and service binary path. If not in an OpenShift environment, it also removes the main SR-IOV +// service and post-networking service files. func CleanSriovFilesFromHost(isOpenShift bool) error { err := os.Remove(utils.GetHostExtensionPath(SriovSystemdConfigPath)) if err != nil && !os.IsNotExist(err) { diff --git a/pkg/utils/shutdown.go b/pkg/utils/shutdown.go index f8f9618d4..6f084aaaa 100644 --- a/pkg/utils/shutdown.go +++ b/pkg/utils/shutdown.go @@ -35,7 +35,7 @@ func updateFinalizers() { shutdownLog.Error(err, "Failed to list SriovNetworks") } else { for _, instance := range networkList.Items { - if instance.ObjectMeta.Finalizers == nil || len(instance.ObjectMeta.Finalizers) == 0 { + if len(instance.ObjectMeta.Finalizers) == 0 { continue } if err != nil {