Skip to content

Commit

Permalink
Merge pull request ovn-kubernetes#3058 from flavio-fernandes/egressip…
Browse files Browse the repository at this point in the history
…_metrics

egressip: add metrics
  • Loading branch information
trozet authored Aug 19, 2022
2 parents 8d83b81 + 31beab4 commit 550a7cb
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 8 deletions.
8 changes: 7 additions & 1 deletion go-controller/pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,8 @@ type MetricsConfig struct {
NodeServerCert string `gcfg:"node-server-cert"`
// EnableConfigDuration holds the boolean flag to enable OVN-Kubernetes master to monitor OVN-Kubernetes master
// configuration duration and optionally, its application to all nodes
EnableConfigDuration bool `gcfg:"enable-config-duration"`
EnableConfigDuration bool `gcfg:"enable-config-duration"`
EnableEIPScaleMetrics bool `gcfg:"enable-eip-scale-metrics"`
}

// OVNKubernetesFeatureConfig holds OVN-Kubernetes feature enhancement config file parameters and command-line overrides
Expand Down Expand Up @@ -1000,6 +1001,11 @@ var MetricsFlags = []cli.Flag{
Usage: "Enables monitoring OVN-Kubernetes master and OVN configuration duration",
Destination: &cliConfig.Metrics.EnableConfigDuration,
},
&cli.BoolFlag{
Name: "metrics-enable-eip-scale",
Usage: "Enables metrics related to Egress IP scaling",
Destination: &cliConfig.Metrics.EnableEIPScaleMetrics,
},
}

// OvnNBFlags capture OVN northbound database options
Expand Down
3 changes: 3 additions & 0 deletions go-controller/pkg/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ enable-pprof=true
node-server-privkey=/path/to/node-metrics-private.key
node-server-cert=/path/to/node-metrics.crt
enable-config-duration=true
enable-eip-scale-metrics=true
[logging]
loglevel=5
Expand Down Expand Up @@ -574,6 +575,7 @@ var _ = Describe("Config Operations", func() {
gomega.Expect(Metrics.NodeServerPrivKey).To(gomega.Equal("/path/to/node-metrics-private.key"))
gomega.Expect(Metrics.NodeServerCert).To(gomega.Equal("/path/to/node-metrics.crt"))
gomega.Expect(Metrics.EnableConfigDuration).To(gomega.Equal(true))
gomega.Expect(Metrics.EnableEIPScaleMetrics).To(gomega.Equal(true))

gomega.Expect(OvnNorth.Scheme).To(gomega.Equal(OvnDBSchemeSSL))
gomega.Expect(OvnNorth.PrivKey).To(gomega.Equal("/path/to/nb-client-private.key"))
Expand Down Expand Up @@ -657,6 +659,7 @@ var _ = Describe("Config Operations", func() {
gomega.Expect(Metrics.NodeServerPrivKey).To(gomega.Equal("/tls/nodeprivkey"))
gomega.Expect(Metrics.NodeServerCert).To(gomega.Equal("/tls/nodecert"))
gomega.Expect(Metrics.EnableConfigDuration).To(gomega.Equal(true))
gomega.Expect(Metrics.EnableEIPScaleMetrics).To(gomega.Equal(true))

gomega.Expect(OvnNorth.Scheme).To(gomega.Equal(OvnDBSchemeSSL))
gomega.Expect(OvnNorth.PrivKey).To(gomega.Equal("/client/privkey"))
Expand Down
56 changes: 56 additions & 0 deletions go-controller/pkg/metrics/master.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,36 @@ var metricEgressIPCount = prometheus.NewGauge(prometheus.GaugeOpts{
Help: "The number of defined egress IP addresses",
})

// metricEgressIPAssignLatency is a histogram of the time, in seconds, taken to
// assign an egress IP in the OVN northbound database. It uses 15 exponential
// buckets that start at 1ms and double at every step.
var metricEgressIPAssignLatency = prometheus.NewHistogram(prometheus.HistogramOpts{
	Namespace: MetricOvnkubeNamespace,
	Subsystem: MetricOvnkubeSubsystemMaster,
	Name:      "egress_ips_assign_latency_seconds",
	Help:      "The latency of egress IP assignment to ovn nb database",
	Buckets:   prometheus.ExponentialBuckets(0.001, 2, 15),
})

// metricEgressIPUnassignLatency is a histogram of the time, in seconds, taken
// to remove an egress IP assignment from the OVN northbound database. It uses
// 15 exponential buckets that start at 1ms and double at every step.
var metricEgressIPUnassignLatency = prometheus.NewHistogram(prometheus.HistogramOpts{
	Namespace: MetricOvnkubeNamespace,
	Subsystem: MetricOvnkubeSubsystemMaster,
	Name:      "egress_ips_unassign_latency_seconds",
	Help:      "The latency of egress IP unassignment from ovn nb database",
	Buckets:   prometheus.ExponentialBuckets(0.001, 2, 15),
})

// metricEgressIPNodeUnreacheableCount is a monotonically increasing counter of
// the times assigned egress IP(s) were found to be unreachable.
var metricEgressIPNodeUnreacheableCount = prometheus.NewCounter(prometheus.CounterOpts{
	Namespace: MetricOvnkubeNamespace,
	Subsystem: MetricOvnkubeSubsystemMaster,
	Name:      "egress_ips_node_unreachable_total",
	Help:      "The total number of times assigned egress IP(s) were unreachable",
})

// metricEgressIPRebalanceCount is a monotonically increasing counter of the
// times assigned egress IP(s) had to be moved to a different node.
var metricEgressIPRebalanceCount = prometheus.NewCounter(prometheus.CounterOpts{
	Namespace: MetricOvnkubeNamespace,
	Subsystem: MetricOvnkubeSubsystemMaster,
	Name:      "egress_ips_rebalance_total",
	Help:      "The total number of times assigned egress IP(s) needed to be moved to a different node",
})

var metricEgressFirewallRuleCount = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: MetricOvnkubeNamespace,
Subsystem: MetricOvnkubeSubsystemMaster,
Expand Down Expand Up @@ -344,6 +374,12 @@ func RegisterMasterFunctional() {
prometheus.MustRegister(metricV4AllocatedHostSubnetCount)
prometheus.MustRegister(metricV6AllocatedHostSubnetCount)
prometheus.MustRegister(metricEgressIPCount)
if config.Metrics.EnableEIPScaleMetrics {
prometheus.MustRegister(metricEgressIPAssignLatency)
prometheus.MustRegister(metricEgressIPUnassignLatency)
}
prometheus.MustRegister(metricEgressIPNodeUnreacheableCount)
prometheus.MustRegister(metricEgressIPRebalanceCount)
prometheus.MustRegister(metricEgressFirewallRuleCount)
prometheus.MustRegister(metricEgressFirewallCount)
prometheus.MustRegister(metricEgressRoutingViaHost)
Expand Down Expand Up @@ -433,6 +469,26 @@ func RecordEgressIPCount(count float64) {
metricEgressIPCount.Set(count)
}

// RecordEgressIPAssign adds one observation, expressed in seconds, to the
// egress IP assignment latency histogram. duration is the time it took to
// program the egress IP configuration into OVN.
func RecordEgressIPAssign(duration time.Duration) {
	seconds := duration.Seconds()
	metricEgressIPAssignLatency.Observe(seconds)
}

// RecordEgressIPUnassign adds one observation, expressed in seconds, to the
// egress IP unassignment latency histogram. duration is the time it took to
// remove the egress IP configuration from OVN.
func RecordEgressIPUnassign(duration time.Duration) {
	seconds := duration.Seconds()
	metricEgressIPUnassignLatency.Observe(seconds)
}

// RecordEgressIPUnreachableNode increments, by one, the counter of times
// EgressIP detected an unusable (unreachable) egress node.
func RecordEgressIPUnreachableNode() {
	metricEgressIPNodeUnreacheableCount.Inc()
}

// RecordEgressIPRebalance adds count to the running total of egress IPs that
// had to be moved to a different egress node. count is expected to be
// non-negative: Prometheus counters panic when asked to add a negative value.
func RecordEgressIPRebalance(count int) {
	moved := float64(count)
	metricEgressIPRebalanceCount.Add(moved)
}

// UpdateEgressFirewallRuleCount records the number of Egress firewall rules.
func UpdateEgressFirewallRuleCount(count float64) {
metricEgressFirewallRuleCount.Add(count)
Expand Down
43 changes: 36 additions & 7 deletions go-controller/pkg/ovn/egressip.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,11 @@ func (oc *Controller) reconcileEgressIP(old, new *egressipv1.EgressIP) (err erro
}
}

invalidStatusLen := len(invalidStatus)
if invalidStatusLen > 0 {
metrics.RecordEgressIPRebalance(invalidStatusLen)
}

// Add only the diff between what is requested and valid and that which
// isn't already assigned.
ipsToAssign := validSpecIPs
Expand All @@ -148,7 +153,7 @@ func (oc *Controller) reconcileEgressIP(old, new *egressipv1.EgressIP) (err erro
statusToKeep = append(statusToKeep, status)
ipsToAssign.Delete(status.EgressIP)
}
statusToRemove := make([]egressipv1.EgressIPStatusItem, 0, len(invalidStatus))
statusToRemove := make([]egressipv1.EgressIPStatusItem, 0, invalidStatusLen)
for status := range invalidStatus {
statusToRemove = append(statusToRemove, status)
ipsToRemove.Insert(status.EgressIP)
Expand Down Expand Up @@ -1225,6 +1230,7 @@ func (oc *Controller) syncEgressIPs(eIPs []interface{}) error {
// - Egress IPs which have been deleted while ovnkube-master was down
// - pods/namespaces which have stopped matching on egress IPs while
// ovnkube-master was down

egressIPCache, err := oc.generateCacheForEgressIP(eIPs)
if err != nil {
return fmt.Errorf("syncEgressIPs unable to generate cache for egressip: %v", err)
Expand Down Expand Up @@ -1922,13 +1928,24 @@ type egressIPController struct {
// (routing pod traffic to the egress node) and NAT objects on the egress node
// (SNAT-ing to the egress IP).
func (e *egressIPController) addPodEgressIPAssignment(egressIPName string, status egressipv1.EgressIPStatusItem, pod *kapi.Pod, podIPs []*net.IPNet) (err error) {
if err := e.deletePerPodGRSNAT(pod, podIPs, status); err != nil {
if config.Metrics.EnableEIPScaleMetrics {
start := time.Now()
defer func() {
if err != nil {
return
}
duration := time.Since(start)
metrics.RecordEgressIPAssign(duration)
}()
}
if err = e.deletePerPodGRSNAT(pod, podIPs, status); err != nil {
return err
}
if err := e.handleEgressReroutePolicy(podIPs, status, egressIPName, e.createEgressReroutePolicy); err != nil {
if err = e.handleEgressReroutePolicy(podIPs, status, egressIPName, e.createEgressReroutePolicy); err != nil {
return fmt.Errorf("unable to create logical router policy, err: %v", err)
}
ops, err := createNATRuleOps(e.nbClient, podIPs, status, egressIPName)
var ops []ovsdb.Operation
ops, err = createNATRuleOps(e.nbClient, podIPs, status, egressIPName)
if err != nil {
return fmt.Errorf("unable to create NAT rule for status: %v, err: %v", status, err)
}
Expand All @@ -1938,14 +1955,25 @@ func (e *egressIPController) addPodEgressIPAssignment(egressIPName string, statu

// deletePodEgressIPAssignment deletes the OVN programmed egress IP
// configuration mentioned for addPodEgressIPAssignment.
func (e *egressIPController) deletePodEgressIPAssignment(egressIPName string, status egressipv1.EgressIPStatusItem, podIPs []*net.IPNet) error {
if err := e.handleEgressReroutePolicy(podIPs, status, egressIPName, e.deleteEgressReroutePolicy); errors.Is(err, libovsdbclient.ErrNotFound) {
func (e *egressIPController) deletePodEgressIPAssignment(egressIPName string, status egressipv1.EgressIPStatusItem, podIPs []*net.IPNet) (err error) {
if config.Metrics.EnableEIPScaleMetrics {
start := time.Now()
defer func() {
if err != nil {
return
}
duration := time.Since(start)
metrics.RecordEgressIPUnassign(duration)
}()
}
if err = e.handleEgressReroutePolicy(podIPs, status, egressIPName, e.deleteEgressReroutePolicy); errors.Is(err, libovsdbclient.ErrNotFound) {
// if the gateway router join IP setup is already gone, then don't count it as error.
klog.Warningf("Unable to delete logical router policy, err: %v", err)
} else if err != nil {
return fmt.Errorf("unable to delete logical router policy, err: %v", err)
}
ops, err := deleteNATRuleOps(e.nbClient, []ovsdb.Operation{}, podIPs, status, egressIPName)
var ops []ovsdb.Operation
ops, err = deleteNATRuleOps(e.nbClient, []ovsdb.Operation{}, podIPs, status, egressIPName)
if err != nil {
return fmt.Errorf("unable to delete NAT rule for status: %v, err: %v", status, err)
}
Expand Down Expand Up @@ -2169,6 +2197,7 @@ func (oc *Controller) checkEgressNodesReachability() {
oc.eIPC.allocator.Unlock()
for nodeName, shouldDelete := range reAddOrDelete {
if shouldDelete {
metrics.RecordEgressIPUnreachableNode()
klog.Warningf("Node: %s is detected as unreachable, deleting it from egress assignment", nodeName)
if err := oc.deleteEgressNode(nodeName); err != nil {
klog.Errorf("Node: %s is detected as unreachable, but could not re-assign egress IPs, err: %v", nodeName, err)
Expand Down

0 comments on commit 550a7cb

Please sign in to comment.