@@ -26,6 +26,7 @@ import (
26
26
"github.com/Azure/azure-container-networking/cns/cnireconciler"
27
27
"github.com/Azure/azure-container-networking/cns/common"
28
28
"github.com/Azure/azure-container-networking/cns/configuration"
29
+ "github.com/Azure/azure-container-networking/cns/deviceplugin"
29
30
"github.com/Azure/azure-container-networking/cns/endpointmanager"
30
31
"github.com/Azure/azure-container-networking/cns/fsnotify"
31
32
"github.com/Azure/azure-container-networking/cns/grpc"
@@ -65,6 +66,7 @@ import (
65
66
"github.com/Azure/azure-container-networking/store"
66
67
"github.com/Azure/azure-container-networking/telemetry"
67
68
"github.com/avast/retry-go/v4"
69
+ "github.com/google/go-cmp/cmp"
68
70
"github.com/pkg/errors"
69
71
"go.uber.org/zap"
70
72
"go.uber.org/zap/zapcore"
@@ -105,9 +107,14 @@ const (
105
107
// envVarEnableCNIConflistGeneration enables cni conflist generation if set (value doesn't matter)
106
108
envVarEnableCNIConflistGeneration = "CNS_ENABLE_CNI_CONFLIST_GENERATION"
107
109
108
- cnsReqTimeout = 15 * time .Second
109
- defaultLocalServerIP = "localhost"
110
- defaultLocalServerPort = "10090"
110
+ cnsReqTimeout = 15 * time .Second
111
+ defaultLocalServerIP = "localhost"
112
+ defaultLocalServerPort = "10090"
113
+ defaultDevicePluginRetryInterval = 2 * time .Second
114
+ defaultNodeInfoCRDPollInterval = 5 * time .Second
115
+ defaultDevicePluginMaxRetryCount = 5
116
+ initialVnetNICCount = 0
117
+ initialIBNICCount = 0
111
118
)
112
119
113
120
type cniConflistScenario string
@@ -910,6 +917,50 @@ func main() {
910
917
}
911
918
}
912
919
920
+ if cnsconfig .EnableSwiftV2 && cnsconfig .EnableK8sDevicePlugin {
921
+ // Create device plugin manager instance
922
+ pluginManager := deviceplugin .NewPluginManager (z )
923
+ pluginManager .AddPlugin (mtv1alpha1 .DeviceTypeVnetNIC , initialVnetNICCount )
924
+ pluginManager .AddPlugin (mtv1alpha1 .DeviceTypeInfiniBandNIC , initialIBNICCount )
925
+
926
+ ctx , cancel := context .WithCancel (context .Background ())
927
+ defer cancel ()
928
+
929
+ // Start device plugin manager in a separate goroutine
930
+ go func () {
931
+ retryCount := 0
932
+ ticker := time .NewTicker (defaultDevicePluginRetryInterval )
933
+ // Ensure the ticker is stopped on exit
934
+ defer ticker .Stop ()
935
+ for {
936
+ select {
937
+ case <- ctx .Done ():
938
+ z .Info ("Context canceled, stopping plugin manager" )
939
+ return
940
+ case <- ticker .C :
941
+ if pluginErr := pluginManager .Run (ctx ); pluginErr != nil {
942
+ z .Error ("plugin manager exited with error" , zap .Error (pluginErr ))
943
+ retryCount ++
944
+ // Implementing a basic circuit breaker
945
+ if retryCount >= defaultDevicePluginMaxRetryCount {
946
+ z .Error ("Max retries reached, stopping plugin manager" )
947
+ return
948
+ }
949
+ } else {
950
+ return
951
+ }
952
+ }
953
+ }
954
+ }()
955
+
956
+ // go routine to poll node info crd and update device counts
957
+ go func () {
958
+ if pollErr := pollNodeInfoCRDAndUpdatePlugin (ctx , z , pluginManager ); pollErr != nil {
959
+ z .Error ("Error in pollNodeInfoCRDAndUpdatePlugin" , zap .Error (pollErr ))
960
+ }
961
+ }()
962
+ }
963
+
913
964
// Conditionally initialize and start the gRPC server
914
965
if cnsconfig .GRPCSettings .Enable {
915
966
// Define gRPC server settings
@@ -1083,6 +1134,91 @@ func main() {
1083
1134
logger .Close ()
1084
1135
}
1085
1136
1137
+ // Poll CRD until it's set and update PluginManager
1138
+ func pollNodeInfoCRDAndUpdatePlugin (ctx context.Context , zlog * zap.Logger , pluginManager * deviceplugin.PluginManager ) error {
1139
+ kubeConfig , err := ctrl .GetConfig ()
1140
+ if err != nil {
1141
+ logger .Errorf ("Failed to get kubeconfig for request controller: %v" , err )
1142
+ return errors .Wrap (err , "failed to get kubeconfig" )
1143
+ }
1144
+ kubeConfig .UserAgent = "azure-cns-" + version
1145
+
1146
+ clientset , err := kubernetes .NewForConfig (kubeConfig )
1147
+ if err != nil {
1148
+ return errors .Wrap (err , "failed to build clientset" )
1149
+ }
1150
+
1151
+ nodeName , err := configuration .NodeName ()
1152
+ if err != nil {
1153
+ return errors .Wrap (err , "failed to get NodeName" )
1154
+ }
1155
+
1156
+ node , err := clientset .CoreV1 ().Nodes ().Get (ctx , nodeName , metav1.GetOptions {})
1157
+ if err != nil {
1158
+ return errors .Wrapf (err , "failed to get node %s" , nodeName )
1159
+ }
1160
+
1161
+ // check the Node labels for Swift V2
1162
+ if _ , ok := node .Labels [configuration .LabelNodeSwiftV2 ]; ! ok {
1163
+ zlog .Info ("Node is not labeled for Swift V2, skipping polling nodeinfo crd" )
1164
+ return nil
1165
+ }
1166
+
1167
+ directcli , err := client .New (kubeConfig , client.Options {Scheme : multitenancy .Scheme })
1168
+ if err != nil {
1169
+ return errors .Wrap (err , "failed to create ctrl client" )
1170
+ }
1171
+
1172
+ nodeInfoCli := multitenancy.NodeInfoClient {
1173
+ Cli : directcli ,
1174
+ }
1175
+
1176
+ ticker := time .NewTicker (defaultNodeInfoCRDPollInterval )
1177
+ defer ticker .Stop ()
1178
+
1179
+ for {
1180
+ select {
1181
+ case <- ctx .Done ():
1182
+ zlog .Info ("Polling context canceled, exiting" )
1183
+ return nil
1184
+ case <- ticker .C :
1185
+ // Fetch the CRD status
1186
+ nodeInfo , err := nodeInfoCli .Get (ctx , node .Name )
1187
+ if err != nil {
1188
+ zlog .Error ("Error fetching nodeinfo CRD" , zap .Error (err ))
1189
+ return errors .Wrap (err , "failed to get nodeinfo crd" )
1190
+ }
1191
+
1192
+ // Check if the status is set
1193
+ if ! cmp .Equal (nodeInfo .Status , mtv1alpha1.NodeInfoStatus {}) && len (nodeInfo .Status .DeviceInfos ) > 0 {
1194
+ // Create a map to count devices by type
1195
+ deviceCounts := map [mtv1alpha1.DeviceType ]int {
1196
+ mtv1alpha1 .DeviceTypeVnetNIC : 0 ,
1197
+ mtv1alpha1 .DeviceTypeInfiniBandNIC : 0 ,
1198
+ }
1199
+
1200
+ // Aggregate device counts from the CRD
1201
+ for _ , deviceInfo := range nodeInfo .Status .DeviceInfos {
1202
+ switch deviceInfo .DeviceType {
1203
+ case mtv1alpha1 .DeviceTypeVnetNIC , mtv1alpha1 .DeviceTypeInfiniBandNIC :
1204
+ deviceCounts [deviceInfo .DeviceType ]++
1205
+ default :
1206
+ zlog .Error ("Unknown device type" , zap .String ("deviceType" , string (deviceInfo .DeviceType )))
1207
+ }
1208
+ }
1209
+
1210
+ // Update the plugin manager with device counts
1211
+ for deviceType , count := range deviceCounts {
1212
+ pluginManager .TrackDevices (deviceType , count )
1213
+ }
1214
+
1215
+ // Exit polling loop once the CRD status is successfully processed
1216
+ return nil
1217
+ }
1218
+ }
1219
+ }
1220
+ }
1221
+
1086
1222
func InitializeMultiTenantController (ctx context.Context , httpRestService cns.HTTPService , cnsconfig configuration.CNSConfig ) error {
1087
1223
var multiTenantController multitenantcontroller.RequestController
1088
1224
kubeConfig , err := ctrl .GetConfig ()
0 commit comments