From 31beab4f1e0ea7ebf8afa93e96b728b09b667f15 Mon Sep 17 00:00:00 2001 From: Flavio Fernandes Date: Wed, 15 Jun 2022 01:54:23 +0000 Subject: [PATCH] egressip: add metrics Add metrics to observe core events related to EgressIP: - name: egress_ips_assign_latency_seconds -- Histogram ** desc: The latency of egress IP assignment to ovn nb database - name: egress_ips_unassign_latency_seconds -- Histogram ** desc: The latency of egress IP unassignment from ovn nb database - name: egress_ips_node_unreachable_total -- Counter desc: The total number of times assigned egress IP(s) were unreachable - name: egress_ips_rebalance_total -- Counter desc: The total number of times assigned egress IP(s) needed to be moved to a different node Add flags to explicitly enable the histogram metrics, since we only see value in having them when scale testing egress ips. The flag introduced here is: --metrics-enable-eip-scale Signed-off-by: Flavio Fernandes --- go-controller/pkg/config/config.go | 8 +++- go-controller/pkg/config/config_test.go | 3 ++ go-controller/pkg/metrics/master.go | 56 +++++++++++++++++++++++++ go-controller/pkg/ovn/egressip.go | 43 +++++++++++++++---- 4 files changed, 102 insertions(+), 8 deletions(-) diff --git a/go-controller/pkg/config/config.go b/go-controller/pkg/config/config.go index 3ce2b9b4d96..6b0b215e136 100644 --- a/go-controller/pkg/config/config.go +++ b/go-controller/pkg/config/config.go @@ -325,7 +325,8 @@ type MetricsConfig struct { NodeServerCert string `gcfg:"node-server-cert"` // EnableConfigDuration holds the boolean flag to enable OVN-Kubernetes master to monitor OVN-Kubernetes master // configuration duration and optionally, its application to all nodes - EnableConfigDuration bool `gcfg:"enable-config-duration"` + EnableConfigDuration bool `gcfg:"enable-config-duration"` + EnableEIPScaleMetrics bool `gcfg:"enable-eip-scale-metrics"` } // OVNKubernetesFeatureConfig holds OVN-Kubernetes feature enhancement config file parameters and command-line 
overrides @@ -1000,6 +1001,11 @@ var MetricsFlags = []cli.Flag{ Usage: "Enables monitoring OVN-Kubernetes master and OVN configuration duration", Destination: &cliConfig.Metrics.EnableConfigDuration, }, + &cli.BoolFlag{ + Name: "metrics-enable-eip-scale", + Usage: "Enables metrics related to Egress IP scaling", + Destination: &cliConfig.Metrics.EnableEIPScaleMetrics, + }, } // OvnNBFlags capture OVN northbound database options diff --git a/go-controller/pkg/config/config_test.go b/go-controller/pkg/config/config_test.go index cf47f733df5..9cd92c0350f 100644 --- a/go-controller/pkg/config/config_test.go +++ b/go-controller/pkg/config/config_test.go @@ -157,6 +157,7 @@ enable-pprof=true node-server-privkey=/path/to/node-metrics-private.key node-server-cert=/path/to/node-metrics.crt enable-config-duration=true +enable-eip-scale-metrics=true [logging] loglevel=5 @@ -574,6 +575,7 @@ var _ = Describe("Config Operations", func() { gomega.Expect(Metrics.NodeServerPrivKey).To(gomega.Equal("/path/to/node-metrics-private.key")) gomega.Expect(Metrics.NodeServerCert).To(gomega.Equal("/path/to/node-metrics.crt")) gomega.Expect(Metrics.EnableConfigDuration).To(gomega.Equal(true)) + gomega.Expect(Metrics.EnableEIPScaleMetrics).To(gomega.Equal(true)) gomega.Expect(OvnNorth.Scheme).To(gomega.Equal(OvnDBSchemeSSL)) gomega.Expect(OvnNorth.PrivKey).To(gomega.Equal("/path/to/nb-client-private.key")) @@ -657,6 +659,7 @@ var _ = Describe("Config Operations", func() { gomega.Expect(Metrics.NodeServerPrivKey).To(gomega.Equal("/tls/nodeprivkey")) gomega.Expect(Metrics.NodeServerCert).To(gomega.Equal("/tls/nodecert")) gomega.Expect(Metrics.EnableConfigDuration).To(gomega.Equal(true)) + gomega.Expect(Metrics.EnableEIPScaleMetrics).To(gomega.Equal(true)) gomega.Expect(OvnNorth.Scheme).To(gomega.Equal(OvnDBSchemeSSL)) gomega.Expect(OvnNorth.PrivKey).To(gomega.Equal("/client/privkey")) diff --git a/go-controller/pkg/metrics/master.go b/go-controller/pkg/metrics/master.go index 
13eec718851..2abd00a395d 100644 --- a/go-controller/pkg/metrics/master.go +++ b/go-controller/pkg/metrics/master.go @@ -189,6 +189,36 @@ var metricEgressIPCount = prometheus.NewGauge(prometheus.GaugeOpts{ Help: "The number of defined egress IP addresses", }) +var metricEgressIPAssignLatency = prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: MetricOvnkubeNamespace, + Subsystem: MetricOvnkubeSubsystemMaster, + Name: "egress_ips_assign_latency_seconds", + Help: "The latency of egress IP assignment to ovn nb database", + Buckets: prometheus.ExponentialBuckets(.001, 2, 15), +}) + +var metricEgressIPUnassignLatency = prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: MetricOvnkubeNamespace, + Subsystem: MetricOvnkubeSubsystemMaster, + Name: "egress_ips_unassign_latency_seconds", + Help: "The latency of egress IP unassignment from ovn nb database", + Buckets: prometheus.ExponentialBuckets(.001, 2, 15), +}) + +var metricEgressIPNodeUnreacheableCount = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: MetricOvnkubeNamespace, + Subsystem: MetricOvnkubeSubsystemMaster, + Name: "egress_ips_node_unreachable_total", + Help: "The total number of times assigned egress IP(s) were unreachable"}, +) + +var metricEgressIPRebalanceCount = prometheus.NewCounter(prometheus.CounterOpts{ + Namespace: MetricOvnkubeNamespace, + Subsystem: MetricOvnkubeSubsystemMaster, + Name: "egress_ips_rebalance_total", + Help: "The total number of times assigned egress IP(s) needed to be moved to a different node"}, +) + var metricEgressFirewallRuleCount = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: MetricOvnkubeNamespace, Subsystem: MetricOvnkubeSubsystemMaster, @@ -344,6 +374,12 @@ func RegisterMasterFunctional() { prometheus.MustRegister(metricV4AllocatedHostSubnetCount) prometheus.MustRegister(metricV6AllocatedHostSubnetCount) prometheus.MustRegister(metricEgressIPCount) + if config.Metrics.EnableEIPScaleMetrics { + 
prometheus.MustRegister(metricEgressIPAssignLatency) + prometheus.MustRegister(metricEgressIPUnassignLatency) + } + prometheus.MustRegister(metricEgressIPNodeUnreacheableCount) + prometheus.MustRegister(metricEgressIPRebalanceCount) prometheus.MustRegister(metricEgressFirewallRuleCount) prometheus.MustRegister(metricEgressFirewallCount) prometheus.MustRegister(metricEgressRoutingViaHost) @@ -433,6 +469,26 @@ func RecordEgressIPCount(count float64) { metricEgressIPCount.Set(count) } +// RecordEgressIPAssign records how long it took EgressIP to configure OVN. +func RecordEgressIPAssign(duration time.Duration) { + metricEgressIPAssignLatency.Observe(duration.Seconds()) +} + +// RecordEgressIPUnassign records how long it took EgressIP to unconfigure OVN. +func RecordEgressIPUnassign(duration time.Duration) { + metricEgressIPUnassignLatency.Observe(duration.Seconds()) +} + +// RecordEgressIPUnreachableNode records how many times EgressIP detected an unusable node. +func RecordEgressIPUnreachableNode() { + metricEgressIPNodeUnreacheableCount.Inc() +} + +// RecordEgressIPRebalance records how many EgressIPs had to move to a different egress node. +func RecordEgressIPRebalance(count int) { + metricEgressIPRebalanceCount.Add(float64(count)) +} + // UpdateEgressFirewallRuleCount records the number of Egress firewall rules. func UpdateEgressFirewallRuleCount(count float64) { metricEgressFirewallRuleCount.Add(count) diff --git a/go-controller/pkg/ovn/egressip.go b/go-controller/pkg/ovn/egressip.go index 72649fbcb30..f14e8a0d87c 100644 --- a/go-controller/pkg/ovn/egressip.go +++ b/go-controller/pkg/ovn/egressip.go @@ -138,6 +138,11 @@ func (oc *Controller) reconcileEgressIP(old, new *egressipv1.EgressIP) (err erro } } + invalidStatusLen := len(invalidStatus) + if invalidStatusLen > 0 { + metrics.RecordEgressIPRebalance(invalidStatusLen) + } + // Add only the diff between what is requested and valid and that which // isn't already assigned. 
ipsToAssign := validSpecIPs @@ -148,7 +153,7 @@ func (oc *Controller) reconcileEgressIP(old, new *egressipv1.EgressIP) (err erro statusToKeep = append(statusToKeep, status) ipsToAssign.Delete(status.EgressIP) } - statusToRemove := make([]egressipv1.EgressIPStatusItem, 0, len(invalidStatus)) + statusToRemove := make([]egressipv1.EgressIPStatusItem, 0, invalidStatusLen) for status := range invalidStatus { statusToRemove = append(statusToRemove, status) ipsToRemove.Insert(status.EgressIP) @@ -1225,6 +1230,7 @@ func (oc *Controller) syncEgressIPs(eIPs []interface{}) error { // - Egress IPs which have been deleted while ovnkube-master was down // - pods/namespaces which have stopped matching on egress IPs while // ovnkube-master was down + egressIPCache, err := oc.generateCacheForEgressIP(eIPs) if err != nil { return fmt.Errorf("syncEgressIPs unable to generate cache for egressip: %v", err) @@ -1922,13 +1928,24 @@ type egressIPController struct { // (routing pod traffic to the egress node) and NAT objects on the egress node // (SNAT-ing to the egress IP). 
func (e *egressIPController) addPodEgressIPAssignment(egressIPName string, status egressipv1.EgressIPStatusItem, pod *kapi.Pod, podIPs []*net.IPNet) (err error) { - if err := e.deletePerPodGRSNAT(pod, podIPs, status); err != nil { + if config.Metrics.EnableEIPScaleMetrics { + start := time.Now() + defer func() { + if err != nil { + return + } + duration := time.Since(start) + metrics.RecordEgressIPAssign(duration) + }() + } + if err = e.deletePerPodGRSNAT(pod, podIPs, status); err != nil { return err } - if err := e.handleEgressReroutePolicy(podIPs, status, egressIPName, e.createEgressReroutePolicy); err != nil { + if err = e.handleEgressReroutePolicy(podIPs, status, egressIPName, e.createEgressReroutePolicy); err != nil { return fmt.Errorf("unable to create logical router policy, err: %v", err) } - ops, err := createNATRuleOps(e.nbClient, podIPs, status, egressIPName) + var ops []ovsdb.Operation + ops, err = createNATRuleOps(e.nbClient, podIPs, status, egressIPName) if err != nil { return fmt.Errorf("unable to create NAT rule for status: %v, err: %v", status, err) } @@ -1938,14 +1955,25 @@ func (e *egressIPController) addPodEgressIPAssignment(egressIPName string, statu // deletePodEgressIPAssignment deletes the OVN programmed egress IP // configuration mentioned for addPodEgressIPAssignment. 
-func (e *egressIPController) deletePodEgressIPAssignment(egressIPName string, status egressipv1.EgressIPStatusItem, podIPs []*net.IPNet) error { - if err := e.handleEgressReroutePolicy(podIPs, status, egressIPName, e.deleteEgressReroutePolicy); errors.Is(err, libovsdbclient.ErrNotFound) { +func (e *egressIPController) deletePodEgressIPAssignment(egressIPName string, status egressipv1.EgressIPStatusItem, podIPs []*net.IPNet) (err error) { + if config.Metrics.EnableEIPScaleMetrics { + start := time.Now() + defer func() { + if err != nil { + return + } + duration := time.Since(start) + metrics.RecordEgressIPUnassign(duration) + }() + } + if err = e.handleEgressReroutePolicy(podIPs, status, egressIPName, e.deleteEgressReroutePolicy); errors.Is(err, libovsdbclient.ErrNotFound) { // if the gateway router join IP setup is already gone, then don't count it as error. klog.Warningf("Unable to delete logical router policy, err: %v", err) } else if err != nil { return fmt.Errorf("unable to delete logical router policy, err: %v", err) } - ops, err := deleteNATRuleOps(e.nbClient, []ovsdb.Operation{}, podIPs, status, egressIPName) + var ops []ovsdb.Operation + ops, err = deleteNATRuleOps(e.nbClient, []ovsdb.Operation{}, podIPs, status, egressIPName) if err != nil { return fmt.Errorf("unable to delete NAT rule for status: %v, err: %v", status, err) } @@ -2169,6 +2197,7 @@ func (oc *Controller) checkEgressNodesReachability() { oc.eIPC.allocator.Unlock() for nodeName, shouldDelete := range reAddOrDelete { if shouldDelete { + metrics.RecordEgressIPUnreachableNode() klog.Warningf("Node: %s is detected as unreachable, deleting it from egress assignment", nodeName) if err := oc.deleteEgressNode(nodeName); err != nil { klog.Errorf("Node: %s is detected as unreachable, but could not re-assign egress IPs, err: %v", nodeName, err)