Skip to content

Commit fb157cd

Browse files
authored
Instantiate Device Plugin (#2979)
* Instantiate device plugin
* Address PR comments
* Replace Sleep with Ticker

---------

Signed-off-by: aggarwal0009 <[email protected]>
1 parent c00a764 commit fb157cd

File tree

2 files changed

+148
-3
lines changed

2 files changed

+148
-3
lines changed

cns/service/main.go

+139-3
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"github.com/Azure/azure-container-networking/cns/cnireconciler"
2727
"github.com/Azure/azure-container-networking/cns/common"
2828
"github.com/Azure/azure-container-networking/cns/configuration"
29+
"github.com/Azure/azure-container-networking/cns/deviceplugin"
2930
"github.com/Azure/azure-container-networking/cns/endpointmanager"
3031
"github.com/Azure/azure-container-networking/cns/fsnotify"
3132
"github.com/Azure/azure-container-networking/cns/grpc"
@@ -65,6 +66,7 @@ import (
6566
"github.com/Azure/azure-container-networking/store"
6667
"github.com/Azure/azure-container-networking/telemetry"
6768
"github.com/avast/retry-go/v4"
69+
"github.com/google/go-cmp/cmp"
6870
"github.com/pkg/errors"
6971
"go.uber.org/zap"
7072
"go.uber.org/zap/zapcore"
@@ -105,9 +107,14 @@ const (
105107
// envVarEnableCNIConflistGeneration enables cni conflist generation if set (value doesn't matter)
106108
envVarEnableCNIConflistGeneration = "CNS_ENABLE_CNI_CONFLIST_GENERATION"
107109

108-
cnsReqTimeout = 15 * time.Second
109-
defaultLocalServerIP = "localhost"
110-
defaultLocalServerPort = "10090"
110+
cnsReqTimeout = 15 * time.Second
111+
defaultLocalServerIP = "localhost"
112+
defaultLocalServerPort = "10090"
113+
defaultDevicePluginRetryInterval = 2 * time.Second
114+
defaultNodeInfoCRDPollInterval = 5 * time.Second
115+
defaultDevicePluginMaxRetryCount = 5
116+
initialVnetNICCount = 0
117+
initialIBNICCount = 0
111118
)
112119

113120
type cniConflistScenario string
@@ -910,6 +917,50 @@ func main() {
910917
}
911918
}
912919

920+
if cnsconfig.EnableSwiftV2 && cnsconfig.EnableK8sDevicePlugin {
921+
// Create device plugin manager instance
922+
pluginManager := deviceplugin.NewPluginManager(z)
923+
pluginManager.AddPlugin(mtv1alpha1.DeviceTypeVnetNIC, initialVnetNICCount)
924+
pluginManager.AddPlugin(mtv1alpha1.DeviceTypeInfiniBandNIC, initialIBNICCount)
925+
926+
ctx, cancel := context.WithCancel(context.Background())
927+
defer cancel()
928+
929+
// Start device plugin manager in a separate goroutine
930+
go func() {
931+
retryCount := 0
932+
ticker := time.NewTicker(defaultDevicePluginRetryInterval)
933+
// Ensure the ticker is stopped on exit
934+
defer ticker.Stop()
935+
for {
936+
select {
937+
case <-ctx.Done():
938+
z.Info("Context canceled, stopping plugin manager")
939+
return
940+
case <-ticker.C:
941+
if pluginErr := pluginManager.Run(ctx); pluginErr != nil {
942+
z.Error("plugin manager exited with error", zap.Error(pluginErr))
943+
retryCount++
944+
// Implementing a basic circuit breaker
945+
if retryCount >= defaultDevicePluginMaxRetryCount {
946+
z.Error("Max retries reached, stopping plugin manager")
947+
return
948+
}
949+
} else {
950+
return
951+
}
952+
}
953+
}
954+
}()
955+
956+
// go routine to poll node info crd and update device counts
957+
go func() {
958+
if pollErr := pollNodeInfoCRDAndUpdatePlugin(ctx, z, pluginManager); pollErr != nil {
959+
z.Error("Error in pollNodeInfoCRDAndUpdatePlugin", zap.Error(pollErr))
960+
}
961+
}()
962+
}
963+
913964
// Conditionally initialize and start the gRPC server
914965
if cnsconfig.GRPCSettings.Enable {
915966
// Define gRPC server settings
@@ -1083,6 +1134,91 @@ func main() {
10831134
logger.Close()
10841135
}
10851136

1137+
// Poll CRD until it's set and update PluginManager
1138+
func pollNodeInfoCRDAndUpdatePlugin(ctx context.Context, zlog *zap.Logger, pluginManager *deviceplugin.PluginManager) error {
1139+
kubeConfig, err := ctrl.GetConfig()
1140+
if err != nil {
1141+
logger.Errorf("Failed to get kubeconfig for request controller: %v", err)
1142+
return errors.Wrap(err, "failed to get kubeconfig")
1143+
}
1144+
kubeConfig.UserAgent = "azure-cns-" + version
1145+
1146+
clientset, err := kubernetes.NewForConfig(kubeConfig)
1147+
if err != nil {
1148+
return errors.Wrap(err, "failed to build clientset")
1149+
}
1150+
1151+
nodeName, err := configuration.NodeName()
1152+
if err != nil {
1153+
return errors.Wrap(err, "failed to get NodeName")
1154+
}
1155+
1156+
node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
1157+
if err != nil {
1158+
return errors.Wrapf(err, "failed to get node %s", nodeName)
1159+
}
1160+
1161+
// check the Node labels for Swift V2
1162+
if _, ok := node.Labels[configuration.LabelNodeSwiftV2]; !ok {
1163+
zlog.Info("Node is not labeled for Swift V2, skipping polling nodeinfo crd")
1164+
return nil
1165+
}
1166+
1167+
directcli, err := client.New(kubeConfig, client.Options{Scheme: multitenancy.Scheme})
1168+
if err != nil {
1169+
return errors.Wrap(err, "failed to create ctrl client")
1170+
}
1171+
1172+
nodeInfoCli := multitenancy.NodeInfoClient{
1173+
Cli: directcli,
1174+
}
1175+
1176+
ticker := time.NewTicker(defaultNodeInfoCRDPollInterval)
1177+
defer ticker.Stop()
1178+
1179+
for {
1180+
select {
1181+
case <-ctx.Done():
1182+
zlog.Info("Polling context canceled, exiting")
1183+
return nil
1184+
case <-ticker.C:
1185+
// Fetch the CRD status
1186+
nodeInfo, err := nodeInfoCli.Get(ctx, node.Name)
1187+
if err != nil {
1188+
zlog.Error("Error fetching nodeinfo CRD", zap.Error(err))
1189+
return errors.Wrap(err, "failed to get nodeinfo crd")
1190+
}
1191+
1192+
// Check if the status is set
1193+
if !cmp.Equal(nodeInfo.Status, mtv1alpha1.NodeInfoStatus{}) && len(nodeInfo.Status.DeviceInfos) > 0 {
1194+
// Create a map to count devices by type
1195+
deviceCounts := map[mtv1alpha1.DeviceType]int{
1196+
mtv1alpha1.DeviceTypeVnetNIC: 0,
1197+
mtv1alpha1.DeviceTypeInfiniBandNIC: 0,
1198+
}
1199+
1200+
// Aggregate device counts from the CRD
1201+
for _, deviceInfo := range nodeInfo.Status.DeviceInfos {
1202+
switch deviceInfo.DeviceType {
1203+
case mtv1alpha1.DeviceTypeVnetNIC, mtv1alpha1.DeviceTypeInfiniBandNIC:
1204+
deviceCounts[deviceInfo.DeviceType]++
1205+
default:
1206+
zlog.Error("Unknown device type", zap.String("deviceType", string(deviceInfo.DeviceType)))
1207+
}
1208+
}
1209+
1210+
// Update the plugin manager with device counts
1211+
for deviceType, count := range deviceCounts {
1212+
pluginManager.TrackDevices(deviceType, count)
1213+
}
1214+
1215+
// Exit polling loop once the CRD status is successfully processed
1216+
return nil
1217+
}
1218+
}
1219+
}
1220+
}
1221+
10861222
func InitializeMultiTenantController(ctx context.Context, httpRestService cns.HTTPService, cnsconfig configuration.CNSConfig) error {
10871223
var multiTenantController multitenantcontroller.RequestController
10881224
kubeConfig, err := ctrl.GetConfig()

crd/multitenancy/client.go

+9
Original file line numberDiff line numberDiff line change
@@ -216,3 +216,12 @@ func (n *NodeInfoClient) CreateOrUpdate(ctx context.Context, nodeInfo *v1alpha1.
216216
}
217217
return nil
218218
}
219+
220+
// Get retrieves the NodeInfo CRD by name.
221+
func (n *NodeInfoClient) Get(ctx context.Context, name string) (*v1alpha1.NodeInfo, error) {
222+
var nodeInfo v1alpha1.NodeInfo
223+
if err := n.Cli.Get(ctx, client.ObjectKey{Name: name}, &nodeInfo); err != nil {
224+
return nil, errors.Wrap(err, "error getting nodeinfo crd")
225+
}
226+
return &nodeInfo, nil
227+
}

0 commit comments

Comments
 (0)