From 084810a1b5afa144f7f208e12bde2dbc58c72086 Mon Sep 17 00:00:00 2001 From: Emilien Macchi Date: Wed, 11 Sep 2024 15:30:26 -0400 Subject: [PATCH 01/11] openstack: dynamically mount the config-drive When we want to use config-drive in immutable systems, very often the config-drive is only used at boot and then umounted (e.g. ignition does this). Later when we want to fetch Metadata from the config drive, we actually have to mount it. In this PR, I'm adding similar code than coreos/ignition where we dynamically mount the config-drive is the device was found with the right label (config-2 or CONFIG-2 as documented in OpenStack). If the device is found, we mount it, fetch the data and umount it. --- pkg/platforms/openstack/openstack.go | 115 +++++++++++++++++++++------ 1 file changed, 92 insertions(+), 23 deletions(-) diff --git a/pkg/platforms/openstack/openstack.go b/pkg/platforms/openstack/openstack.go index 94a9ae433..8968c96be 100644 --- a/pkg/platforms/openstack/openstack.go +++ b/pkg/platforms/openstack/openstack.go @@ -5,6 +5,8 @@ import ( "fmt" "io" "os" + "os/exec" + "path/filepath" "strconv" "strings" @@ -21,15 +23,18 @@ import ( ) const ( - ospHostMetaDataDir = "/host/var/config/openstack/2018-08-27" - ospMetaDataDir = "/var/config/openstack/2018-08-27" - ospMetaDataBaseURL = "http://169.254.169.254/openstack/2018-08-27" - ospNetworkDataJSON = "network_data.json" - ospMetaDataJSON = "meta_data.json" - ospHostNetworkDataFile = ospHostMetaDataDir + "/" + ospNetworkDataJSON - ospHostMetaDataFile = ospHostMetaDataDir + "/" + ospMetaDataJSON - ospNetworkDataURL = ospMetaDataBaseURL + "/" + ospNetworkDataJSON - ospMetaDataURL = ospMetaDataBaseURL + "/" + ospMetaDataJSON + varConfigPath = "/var/config" + ospMetaDataBaseDir = "/openstack/2018-08-27" + ospMetaDataDir = varConfigPath + ospMetaDataBaseDir + ospMetaDataBaseURL = "http://169.254.169.254" + ospMetaDataBaseDir + ospNetworkDataJSON = "network_data.json" + ospMetaDataJSON = "meta_data.json" + ospNetworkDataURL = ospMetaDataBaseURL + "/" + ospNetworkDataJSON + ospMetaDataURL = ospMetaDataBaseURL + "/" + ospMetaDataJSON + // Config drive is defined as an iso9660 or vfat (deprecated) drive + // with the "config-2" label. + //https://docs.openstack.org/nova/latest/user/config-drive.html + configDriveLabel = "config-2" ) var ( @@ -109,9 +114,10 @@ func New(hostManager host.HostManagerInterface) OpenstackInterface { } // GetOpenstackData gets the metadata and network_data -func getOpenstackData(useHostPath bool) (metaData *OSPMetaData, networkData *OSPNetworkData, err error) { - metaData, networkData, err = getOpenstackDataFromConfigDrive(useHostPath) +func getOpenstackData(mountConfigDrive bool) (metaData *OSPMetaData, networkData *OSPNetworkData, err error) { + metaData, networkData, err = getOpenstackDataFromConfigDrive(mountConfigDrive) if err != nil { + log.Log.Error(err, "GetOpenStackData(): non-fatal error getting OpenStack data from config drive") metaData, networkData, err = getOpenstackDataFromMetadataService() if err != nil { return metaData, networkData, fmt.Errorf("GetOpenStackData(): error getting OpenStack data: %w", err) @@ -153,46 +159,109 @@ func getOpenstackData(useHostPath bool) (metaData *OSPMetaData, networkData *OSP return metaData, networkData, err } +// getConfigDriveDevice returns the config drive device which was found +func getConfigDriveDevice() (string, error) { + dev := "/dev/disk/by-label/" + configDriveLabel + if _, err := os.Stat(dev); os.IsNotExist(err) { + out, err := exec.Command( + "blkid", "-l", + "-t", "LABEL="+configDriveLabel, + "-o", "device", + ).CombinedOutput() + if err != nil { + return "", fmt.Errorf("unable to run blkid: %v", err) + } + dev = strings.TrimSpace(string(out)) + } + log.Log.Info("found config drive device", "device", dev) + return dev, nil +} + +// mountConfigDriveDevice mounts the config drive and return the path +func mountConfigDriveDevice(device string) (string, error) { + if device == "" { + return "", fmt.Errorf("device is empty") + } + tmpDir, err := os.MkdirTemp("", "sriov-configdrive") + if err != nil { + return "", fmt.Errorf("error creating temp directory: %w", err) + } + cmd := exec.Command("mount", "-o", "ro", "-t", "auto", device, tmpDir) + if err := cmd.Run(); err != nil { + return "", fmt.Errorf("error mounting config drive: %w", err) + } + log.Log.V(2).Info("mounted config drive device", "device", device, "path", tmpDir) + return tmpDir, nil +} + +// ummountConfigDriveDevice ummounts the config drive device +func ummountConfigDriveDevice(path string) error { + if path == "" { + return fmt.Errorf("path is empty") + } + cmd := exec.Command("umount", path) + if err := cmd.Run(); err != nil { + return fmt.Errorf("error umounting config drive: %w", err) + } + log.Log.V(2).Info("umounted config drive", "path", path) + return nil +} + // getOpenstackDataFromConfigDrive reads the meta_data and network_data files -func getOpenstackDataFromConfigDrive(useHostPath bool) (metaData *OSPMetaData, networkData *OSPNetworkData, err error) { +func getOpenstackDataFromConfigDrive(mountConfigDrive bool) (metaData *OSPMetaData, networkData *OSPNetworkData, err error) { metaData = &OSPMetaData{} networkData = &OSPNetworkData{} + var configDrivePath string log.Log.Info("reading OpenStack meta_data from config-drive") var metadataf *os.File ospMetaDataFilePath := ospMetaDataFile - if useHostPath { - ospMetaDataFilePath = ospHostMetaDataFile + if mountConfigDrive { + configDriveDevice, err := getConfigDriveDevice() + if err != nil { + return metaData, networkData, fmt.Errorf("error finding config drive device: %w", err) + } + configDrivePath, err = mountConfigDriveDevice(configDriveDevice) + if err != nil { + return metaData, networkData, fmt.Errorf("error mounting config drive device: %w", err) + } + defer func() { + if e := ummountConfigDriveDevice(configDrivePath); err == nil && e != nil { + err = fmt.Errorf("error umounting config drive device: %w", e) + } + if e := os.Remove(configDrivePath); err == nil && e != nil { + err = fmt.Errorf("error removing temp directory %s: %w", configDrivePath, e) + } + }() + ospMetaDataFilePath = filepath.Join(configDrivePath, ospMetaDataBaseDir, ospMetaDataJSON) + ospNetworkDataFile = filepath.Join(configDrivePath, ospMetaDataBaseDir, ospNetworkDataJSON) } metadataf, err = os.Open(ospMetaDataFilePath) if err != nil { - return metaData, networkData, fmt.Errorf("error opening file %s: %w", ospHostMetaDataFile, err) + return metaData, networkData, fmt.Errorf("error opening file %s: %w", ospMetaDataFilePath, err) } defer func() { if e := metadataf.Close(); err == nil && e != nil { - err = fmt.Errorf("error closing file %s: %w", ospHostMetaDataFile, e) + err = fmt.Errorf("error closing file %s: %w", ospMetaDataFilePath, e) } }() if err = json.NewDecoder(metadataf).Decode(&metaData); err != nil { - return metaData, networkData, fmt.Errorf("error unmarshalling metadata from file %s: %w", ospHostMetaDataFile, err) + return metaData, networkData, fmt.Errorf("error unmarshalling metadata from file %s: %w", ospMetaDataFilePath, err) } log.Log.Info("reading OpenStack network_data from config-drive") var networkDataf *os.File ospNetworkDataFilePath := ospNetworkDataFile - if useHostPath { - ospNetworkDataFilePath = ospHostNetworkDataFile - } networkDataf, err = os.Open(ospNetworkDataFilePath) if err != nil { - return metaData, networkData, fmt.Errorf("error opening file %s: %w", ospHostNetworkDataFile, err) + return metaData, networkData, fmt.Errorf("error opening file %s: %w", ospNetworkDataFilePath, err) } defer func() { if e := networkDataf.Close(); err == nil && e != nil { - err = fmt.Errorf("error closing file %s: %w", ospHostNetworkDataFile, e) + err = fmt.Errorf("error closing file %s: %w", ospNetworkDataFilePath, e) } }() if err = json.NewDecoder(networkDataf).Decode(&networkData); err != nil { - return metaData, networkData, fmt.Errorf("error unmarshalling metadata from file %s: %w", ospHostNetworkDataFile, err) + return metaData, networkData, fmt.Errorf("error unmarshalling metadata from file %s: %w", ospNetworkDataFilePath, err) } return metaData, networkData, err } From ba21df035b79c907dd1cbc4898e83a7557109553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Andr=C3=A9?= Date: Mon, 23 Sep 2024 14:19:43 +0200 Subject: [PATCH 02/11] Enclose array expansions in double quote Fixes the following shellcheck error: SC2068 (error): Double quote array expansions to avoid re-splitting elements. https://www.shellcheck.net/wiki/SC2068 --- hack/deploy-setup.sh | 2 +- hack/vf-netns-switcher.sh | 8 ++++---- test/scripts/enable-kargs_test.sh | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/hack/deploy-setup.sh b/hack/deploy-setup.sh index 2c2fc7d8d..807479c38 100755 --- a/hack/deploy-setup.sh +++ b/hack/deploy-setup.sh @@ -22,7 +22,7 @@ load_manifest() { fi files="service_account.yaml role.yaml role_binding.yaml clusterrole.yaml clusterrolebinding.yaml configmap.yaml sriovoperatorconfig.yaml operator.yaml" for m in ${files}; do - if [ "$(echo ${EXCLUSIONS[@]} | grep -o ${m} | wc -w | xargs)" == "0" ] ; then + if [ "$(echo "${EXCLUSIONS[@]}" | grep -o ${m} | wc -w | xargs)" == "0" ] ; then envsubst< ${m} | ${OPERATOR_EXEC} apply ${namespace:-} --validate=false -f - fi done diff --git a/hack/vf-netns-switcher.sh b/hack/vf-netns-switcher.sh index de4e8041a..e842a8dc8 100755 --- a/hack/vf-netns-switcher.sh +++ b/hack/vf-netns-switcher.sh @@ -95,7 +95,7 @@ It must be of the form :,. This flag can be repeated to specify done return_interfaces_to_default_namespace(){ - for netns in ${netnses[@]};do + for netns in "${netnses[@]}";do for pf in ${pfs[$netns]};do return_interface_to_default_namespace "${netns}" "${pf}" done @@ -360,7 +360,7 @@ main(){ trap return_interfaces_to_default_namespace INT EXIT TERM while true;do - for netns in ${netnses[@]};do + for netns in "${netnses[@]}";do switch_pfs "$netns" "${pfs[$netns]}" sleep 2 switch_netns_vfs "$netns" @@ -388,7 +388,7 @@ if [[ "$status" != "0" ]];then exit $status fi -for netns in ${netnses[@]};do +for netns in "${netnses[@]}";do netns_create "$netns" let status=$status+$? if [[ "$status" != "0" ]];then @@ -397,7 +397,7 @@ for netns in ${netnses[@]};do fi done -for netns in ${netnses[@]};do +for netns in "${netnses[@]}";do get_pcis_from_pfs "$netns" "${pfs[$netns]}" get_pf_switch_dev_info "$netns" "${pfs[$netns]}" done diff --git a/test/scripts/enable-kargs_test.sh b/test/scripts/enable-kargs_test.sh index 615f3d2b2..40c2764be 100755 --- a/test/scripts/enable-kargs_test.sh +++ b/test/scripts/enable-kargs_test.sh @@ -46,7 +46,7 @@ setUp() { # Mock chroot calls to the temporary test folder export real_chroot=$(which chroot) chroot() { - $real_chroot $FAKE_HOST ${@:2} + $real_chroot $FAKE_HOST "${@:2}" } export -f chroot From 3d553bfd6985fbd7225f615577148ed1e6a42963 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Andr=C3=A9?= Date: Mon, 23 Sep 2024 14:22:26 +0200 Subject: [PATCH 03/11] Add missing shebang Fixes the following shellcheck error: SC2148 (error): Tips depend on target shell and yours is unknown. Add a shebang or a 'shell' directive. https://www.shellcheck.net/wiki/SC2148 --- hack/env.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hack/env.sh b/hack/env.sh index c49c399d8..64f79212d 100755 --- a/hack/env.sh +++ b/hack/env.sh @@ -1,3 +1,5 @@ +#!/bin/bash + if [ -z $SKIP_VAR_SET ]; then export SRIOV_CNI_IMAGE=${SRIOV_CNI_IMAGE:-ghcr.io/k8snetworkplumbingwg/sriov-cni} export SRIOV_INFINIBAND_CNI_IMAGE=${SRIOV_INFINIBAND_CNI_IMAGE:-ghcr.io/k8snetworkplumbingwg/ib-sriov-cni} From 63246d6918a155fc9cbe2aed057274a5dcc9503d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Andr=C3=A9?= Date: Mon, 23 Sep 2024 15:51:05 +0200 Subject: [PATCH 04/11] Explicitly expand array values Fixes the following shellcheck errors: SC2145 (error): Argument mixes string and array. Use * or separate argument. SC2199 (error): Arrays implicitly concatenate in [[ ]]. Use a loop (or explicit * instead of @). https://www.shellcheck.net/wiki/SC2145 https://www.shellcheck.net/wiki/SC2199 Also fixes a typo in SUPPORTED_INTERFACE_SWITCHER_MODES. --- hack/run-e2e-test-kind.sh | 6 +++--- hack/vf-netns-switcher.sh | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/hack/run-e2e-test-kind.sh b/hack/run-e2e-test-kind.sh index 5cb7750c7..3cc080d9c 100755 --- a/hack/run-e2e-test-kind.sh +++ b/hack/run-e2e-test-kind.sh @@ -6,7 +6,7 @@ export SRIOV_NETWORK_OPERATOR_IMAGE="${SRIOV_NETWORK_OPERATOR_IMAGE:-sriov-netwo export SRIOV_NETWORK_CONFIG_DAEMON_IMAGE="${SRIOV_NETWORK_CONFIG_DAEMON_IMAGE:-origin-sriov-network-config-daemon:e2e-test}" export KUBECONFIG="${KUBECONFIG:-${HOME}/.kube/config}" INTERFACES_SWITCHER="${INTERFACES_SWITCHER:-"test-suite"}" -SUPPORTED_INTERFACE_SWTICHER_MODES=("test-suite", "system-service") +SUPPORTED_INTERFACE_SWITCHER_MODES=("test-suite", "system-service") RETRY_MAX=10 INTERVAL=10 TIMEOUT=300 @@ -16,9 +16,9 @@ while test $# -gt 0; do case "$1" in --device-netns-switcher) INTERFACES_SWITCHER="$2" - if [[ ! "${SUPPORTED_INTERFACE_SWTICHER_MODES[@]}" =~ "${INTERFACES_SWITCHER}" ]]; then + if [[ ! "${SUPPORTED_INTERFACE_SWITCHER_MODES[*]}" =~ "${INTERFACES_SWITCHER}" ]]; then echo "Error: unsupported interface switching mode: ${INTERFACES_SWITCHER}!" - echo "Supported modes are: ${SUPPORTED_INTERFACE_SWTICHER_MODES[@]}" + echo "Supported modes are: ${SUPPORTED_INTERFACE_SWITCHER_MODES[*]}" exit 1 fi shift diff --git a/hack/vf-netns-switcher.sh b/hack/vf-netns-switcher.sh index e842a8dc8..69881da7b 100755 --- a/hack/vf-netns-switcher.sh +++ b/hack/vf-netns-switcher.sh @@ -348,7 +348,7 @@ variables_check(){ check_empty_var(){ local var_name="$1" - if [[ -z "${!var_name[@]}" ]];then + if [[ -z "${!var_name[*]}" ]];then echo "Error: $var_name is empty..." return 1 fi @@ -403,7 +403,7 @@ for netns in "${netnses[@]}";do done if [[ "${#pcis[@]}" == "0" ]];then - echo "Error: could not get pci addresses of interfaces ${pfs[@]}!!" + echo "Error: could not get pci addresses of interfaces ${pfs[*]}!!" exit 1 fi From 3529811b1d3a0833dacc2e7fc27425749562f769 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Andr=C3=A9?= Date: Mon, 23 Sep 2024 17:43:01 +0200 Subject: [PATCH 05/11] Iterate over globs. Fixes the following shellcheck error: SC2045 (error): Iterating over ls output is fragile. Use globs. https://www.shellcheck.net/wiki/SC2045 --- hack/vf-netns-switcher.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/hack/vf-netns-switcher.sh b/hack/vf-netns-switcher.sh index 69881da7b..c383b5d1e 100755 --- a/hack/vf-netns-switcher.sh +++ b/hack/vf-netns-switcher.sh @@ -277,19 +277,20 @@ switch_interface_vf_representors(){ return 0 fi - for interface in $(ls /sys/class/net);do - phys_switch_id=$(cat /sys/class/net/$interface/phys_switch_id) + for interface in /sys/class/net/*;do + phys_switch_id=$(cat $interface/phys_switch_id) if [[ "$phys_switch_id" != "${pf_switch_ids[$pf_name]}" ]]; then continue fi - phys_port_name=$(cat /sys/class/net/$interface/phys_port_name) + phys_port_name=$(cat $interface/phys_port_name) phys_port_name_pf_index=${phys_port_name%vf*} phys_port_name_pf_index=${phys_port_name_pf_index#pf} if [[ "$phys_port_name_pf_index" != "${pf_port_names[$pf_name]:1}" ]]; then continue fi - echo "Switching VF representor $interface of PF $pf_name to netns $worker_netns" - switch_vf $interface $worker_netns + interface_name=${interface##*/} + echo "Switching VF representor $interface_name of PF $pf_name to netns $worker_netns" + switch_vf $interface_name $worker_netns done } From a01a1392f384df0653e4baa7cbdcacdc58953a38 Mon Sep 17 00:00:00 2001 From: Andrea Panattoni Date: Mon, 7 Oct 2024 14:20:03 +0200 Subject: [PATCH 06/11] metrics: Fix `Metrics should have the correct labels` test `sriov_kubepoddevice` metric might end up in the Prometheus database after a while, as the default scrape interval is 30s. This leads to failures in the end-to-end lane like: ``` [sriov] Metrics Exporter When Prometheus operator is available [It] Metrics should have the correct labels /root/opr-ocp2-1/data/sriov-network-operator/sriov-network-operator/test/conformance/tests/test_exporter_metrics.go:132 [FAILED] no value for metric sriov_kubepoddevice ``` Put the metric assertion in an `Eventually` statement Signed-off-by: Andrea Panattoni --- .../tests/test_exporter_metrics.go | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/test/conformance/tests/test_exporter_metrics.go b/test/conformance/tests/test_exporter_metrics.go index 96bf792b5..f7bc82d3f 100644 --- a/test/conformance/tests/test_exporter_metrics.go +++ b/test/conformance/tests/test_exporter_metrics.go @@ -66,6 +66,8 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, ContinueOnFailure, func() Expect(err).ToNot(HaveOccurred()) waitForNetAttachDef("test-me-network", namespaces.Test) + WaitForSRIOVStable() + DeferCleanup(namespaces.Clean, operatorNamespace, namespaces.Test, clients, discovery.Enabled()) }) @@ -158,15 +160,17 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, ContinueOnFailure, func() }, "90s", "1s").Should(Succeed()) // sriov_kubepoddevice has a different sets of label than statistics metrics - samples := runPromQLQuery(fmt.Sprintf(`sriov_kubepoddevice{namespace="%s",pod="%s"}`, pod.Namespace, pod.Name)) - Expect(samples).ToNot(BeEmpty(), "no value for metric sriov_kubepoddevice") - Expect(samples[0].Metric).To(And( - HaveKey(model.LabelName("pciAddr")), - HaveKeyWithValue(model.LabelName("node"), model.LabelValue(pod.Spec.NodeName)), - HaveKeyWithValue(model.LabelName("dev_type"), model.LabelValue("openshift.io/metricsResource")), - HaveKeyWithValue(model.LabelName("namespace"), model.LabelValue(pod.Namespace)), - HaveKeyWithValue(model.LabelName("pod"), model.LabelValue(pod.Name)), - )) + Eventually(func(g Gomega) { + samples := runPromQLQuery(fmt.Sprintf(`sriov_kubepoddevice{namespace="%s",pod="%s"}`, pod.Namespace, pod.Name)) + g.Expect(samples).ToNot(BeEmpty(), "no value for metric sriov_kubepoddevice") + g.Expect(samples[0].Metric).To(And( + HaveKey(model.LabelName("pciAddr")), + HaveKeyWithValue(model.LabelName("node"), model.LabelValue(pod.Spec.NodeName)), + HaveKeyWithValue(model.LabelName("dev_type"), model.LabelValue("openshift.io/metricsResource")), + HaveKeyWithValue(model.LabelName("namespace"), model.LabelValue(pod.Namespace)), + HaveKeyWithValue(model.LabelName("pod"), model.LabelValue(pod.Name)), + )) + }, "60s", "1s").Should(Succeed()) }) }) }) From 6abdfe6d188344ab6bd6ad0e64f0895ab7aa414f Mon Sep 17 00:00:00 2001 From: Sebastian Sch Date: Tue, 8 Oct 2024 21:23:42 +0300 Subject: [PATCH 07/11] Fix NRI rbac Signed-off-by: Sebastian Sch --- bindata/manifests/webhook/002-rbac.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindata/manifests/webhook/002-rbac.yaml b/bindata/manifests/webhook/002-rbac.yaml index 77b2d95d7..32affca29 100644 --- a/bindata/manifests/webhook/002-rbac.yaml +++ b/bindata/manifests/webhook/002-rbac.yaml @@ -21,7 +21,7 @@ rules: - apiGroups: - "" resources: - - configmap + - configmaps verbs: - 'watch' - 'list' From fb193e80038325b4c9bc8d8012d809eca9bc46da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Andr=C3=A9?= Date: Thu, 26 Sep 2024 14:23:34 +0200 Subject: [PATCH 08/11] Use grep for matching args with sh Fixes the following shellcheck error: SC2081 (error): [ .. ] can't match globs. Use a case statement. https://www.shellcheck.net/wiki/SC2081 --- test/scripts/enable-kargs_test.sh | 1 + test/scripts/rpm-ostree_mock | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/test/scripts/enable-kargs_test.sh b/test/scripts/enable-kargs_test.sh index 40c2764be..93a985700 100755 --- a/test/scripts/enable-kargs_test.sh +++ b/test/scripts/enable-kargs_test.sh @@ -40,6 +40,7 @@ setUp() { cp $(which cat) ${FAKE_HOST}/usr/bin/ cp $(which test) ${FAKE_HOST}/usr/bin/ cp $(which sh) ${FAKE_HOST}/usr/bin/ + cp $(which grep) ${FAKE_HOST}/usr/bin/ cp "$SCRIPTPATH/rpm-ostree_mock" ${FAKE_HOST}/usr/bin/rpm-ostree } diff --git a/test/scripts/rpm-ostree_mock b/test/scripts/rpm-ostree_mock index 16e816cc9..db6f66040 100755 --- a/test/scripts/rpm-ostree_mock +++ b/test/scripts/rpm-ostree_mock @@ -5,7 +5,7 @@ # Write invocation with arguments to a file to allow making assertion. echo "$*" >> /rpm-ostree_calls -if [ "$*" != *"--append"* ] +if ! echo "$*" | grep -q "\--append" then # Caller is trying to read kernel arguments. cat /proc/cmdline From 5394d218f8c50ab7acf05558c1777491115fdbaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Andr=C3=A9?= Date: Mon, 23 Sep 2024 17:58:54 +0200 Subject: [PATCH 09/11] CI: Add a bash linter to pre-submits Warns about shellcheck issues with severity `error`. --- .github/workflows/test.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2fbe84c81..d59e52e47 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -100,6 +100,16 @@ jobs: # Required: the version of golangci-lint is required and must be specified without patch version: we always use the latest patch version. version: v1.55.2 + shellcheck: + name: Shellcheck + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Run ShellCheck + uses: ludeeus/action-shellcheck@master + with: + severity: error + test-coverage: name: test-coverage runs-on: ubuntu-latest From f286a04ad7c47216fece213bca47fddcc774f4d2 Mon Sep 17 00:00:00 2001 From: Andrea Panattoni Date: Fri, 4 Oct 2024 19:05:53 +0200 Subject: [PATCH 10/11] config-daemon: Restart all instances of device-plugin When the operator changes the device-plugin Spec (e.g. .Spec.NodeSelector), it may happen that there are two device plugin pods for a given node, one that is terminating, the other that is initializing. If the config-daemon executes `restartDevicePluginPod()` at the same time, it may kill the terminating pod, while the initializing one will run with the old dp configuration. This may cause one or more resources to not being advertised, until a manual device plugin restart occurs. Make the config-daemon restart all the device-plugin instances it founds for its own node. Signed-off-by: Andrea Panattoni --- pkg/daemon/daemon.go | 53 +++++++++++++++++----------------- pkg/daemon/daemon_test.go | 61 ++++++++++++++++++++++++++++++--------- 2 files changed, 74 insertions(+), 40 deletions(-) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 5ed31ff85..ff7f326dc 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -120,6 +120,7 @@ func New( eventRecorder: er, featureGate: featureGates, disabledPlugins: disabledPlugins, + mu: &sync.Mutex{}, } } @@ -159,7 +160,6 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error { var timeout int64 = 5 var metadataKey = "metadata.name" - dn.mu = &sync.Mutex{} informerFactory := sninformer.NewFilteredSharedInformerFactory(dn.sriovClient, time.Second*15, vars.Namespace, @@ -683,7 +683,6 @@ func (dn *Daemon) restartDevicePluginPod() error { defer dn.mu.Unlock() log.Log.V(2).Info("restartDevicePluginPod(): try to restart device plugin pod") - var podToDelete string pods, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{ LabelSelector: "app=sriov-device-plugin", FieldSelector: "spec.nodeName=" + vars.NodeName, @@ -702,35 +701,37 @@ func (dn *Daemon) restartDevicePluginPod() error { log.Log.Info("restartDevicePluginPod(): device plugin pod exited") return nil } - podToDelete = pods.Items[0].Name - log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete) - err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{}) - if errors.IsNotFound(err) { - log.Log.Info("restartDevicePluginPod(): pod to delete not found") - return nil - } - if err != nil { - log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying") - return err - } - - if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) { - _, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{}) + for _, pod := range pods.Items { + podToDelete := pod.Name + log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete) + err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{}) if errors.IsNotFound(err) { - log.Log.Info("restartDevicePluginPod(): device plugin pod exited") - return true, nil + log.Log.Info("restartDevicePluginPod(): pod to delete not found") + continue } - if err != nil { - log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying") - } else { - log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete) + log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying") + return err + } + + if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) { + _, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{}) + if errors.IsNotFound(err) { + log.Log.Info("restartDevicePluginPod(): device plugin pod exited") + return true, nil + } + + if err != nil { + log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying") + } else { + log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete) + } + return false, nil + }, dn.stopCh); err != nil { + log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion") + return err } - return false, nil - }, dn.stopCh); err != nil { - log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion") - return err } return nil diff --git a/pkg/daemon/daemon_test.go b/pkg/daemon/daemon_test.go index f1111810a..67a56633f 100644 --- a/pkg/daemon/daemon_test.go +++ b/pkg/daemon/daemon_test.go @@ -32,6 +32,8 @@ import ( "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem" ) +var SriovDevicePluginPod corev1.Pod + func TestConfigDaemon(t *testing.T) { RegisterFailHandler(Fail) RunSpecs(t, "Config Daemon Suite") @@ -107,19 +109,6 @@ var _ = Describe("Config Daemon", func() { }, } - SriovDevicePluginPod := corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "sriov-device-plugin-xxxx", - Namespace: vars.Namespace, - Labels: map[string]string{ - "app": "sriov-device-plugin", - }, - }, - Spec: corev1.PodSpec{ - NodeName: "test-node", - }, - } - err = sriovnetworkv1.AddToScheme(scheme.Scheme) Expect(err).ToNot(HaveOccurred()) kClient := kclient.NewClientBuilder().WithScheme(scheme.Scheme).WithRuntimeObjects(&corev1.Node{ @@ -130,7 +119,7 @@ var _ = Describe("Config Daemon", func() { Namespace: vars.Namespace, }}).Build() - kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs, &SriovDevicePluginPod) + kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs) snclient := snclientset.NewSimpleClientset() err = sriovnetworkv1.InitNicIDMapFromConfigMap(kubeClient, vars.Namespace) Expect(err).ToNot(HaveOccurred()) @@ -175,6 +164,22 @@ var _ = Describe("Config Daemon", func() { err := sut.Run(stopCh, exitCh) Expect(err).ToNot(HaveOccurred()) }() + + SriovDevicePluginPod = corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sriov-device-plugin-xxxx", + Namespace: vars.Namespace, + Labels: map[string]string{ + "app": "sriov-device-plugin", + }, + }, + Spec: corev1.PodSpec{ + NodeName: "test-node", + }, + } + _, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), &SriovDevicePluginPod, metav1.CreateOptions{}) + Expect(err).ToNot(HaveOccurred()) + }) AfterEach(func() { @@ -286,6 +291,34 @@ var _ = Describe("Config Daemon", func() { Expect(sut.desiredNodeState.GetGeneration()).To(BeNumerically("==", 777)) }) + + It("restart all the sriov-device-plugin pods present on the node", func() { + otherPod1 := SriovDevicePluginPod.DeepCopy() + otherPod1.Name = "sriov-device-plugin-xxxa" + _, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod1, metav1.CreateOptions{}) + Expect(err).ToNot(HaveOccurred()) + + otherPod2 := SriovDevicePluginPod.DeepCopy() + otherPod2.Name = "sriov-device-plugin-xxxz" + _, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod2, metav1.CreateOptions{}) + Expect(err).ToNot(HaveOccurred()) + + err = sut.restartDevicePluginPod() + Expect(err).ToNot(HaveOccurred()) + + Eventually(func() (int, error) { + podList, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{ + LabelSelector: "app=sriov-device-plugin", + FieldSelector: "spec.nodeName=test-node", + }) + + if err != nil { + return 0, err + } + + return len(podList.Items), nil + }, "1s").Should(BeZero()) + }) }) }) From f4ad3e7d03309e8faf33d1ff538d5b3df5c0895c Mon Sep 17 00:00:00 2001 From: Sebastian Scheinkman Date: Wed, 9 Oct 2024 23:50:49 +0000 Subject: [PATCH 11/11] d/s run make bundle --- .../manifests/sriov-network-operator.clusterserviceversion.yaml | 2 +- .../stable/sriov-network-operator.clusterserviceversion.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bundle/manifests/sriov-network-operator.clusterserviceversion.yaml b/bundle/manifests/sriov-network-operator.clusterserviceversion.yaml index b044f1994..3aa38950d 100644 --- a/bundle/manifests/sriov-network-operator.clusterserviceversion.yaml +++ b/bundle/manifests/sriov-network-operator.clusterserviceversion.yaml @@ -100,7 +100,7 @@ metadata: categories: Networking certified: "false" containerImage: quay.io/openshift/origin-sriov-network-operator:4.18 - createdAt: "2024-10-07T23:51:07Z" + createdAt: "2024-10-09T23:50:49Z" description: An operator for configuring SR-IOV components and initializing SRIOV network devices in Openshift cluster. features.operators.openshift.io/cnf: "false" diff --git a/manifests/stable/sriov-network-operator.clusterserviceversion.yaml b/manifests/stable/sriov-network-operator.clusterserviceversion.yaml index b044f1994..3aa38950d 100644 --- a/manifests/stable/sriov-network-operator.clusterserviceversion.yaml +++ b/manifests/stable/sriov-network-operator.clusterserviceversion.yaml @@ -100,7 +100,7 @@ metadata: categories: Networking certified: "false" containerImage: quay.io/openshift/origin-sriov-network-operator:4.18 - createdAt: "2024-10-07T23:51:07Z" + createdAt: "2024-10-09T23:50:49Z" description: An operator for configuring SR-IOV components and initializing SRIOV network devices in Openshift cluster. features.operators.openshift.io/cnf: "false"