Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ require (
github.com/cloudogu/retry-lib v0.1.0
github.com/dlclark/regexp2 v1.11.5
github.com/go-logr/logr v1.4.3
github.com/prometheus/client_golang v1.23.2
github.com/stretchr/testify v1.11.1
github.com/vmware-tanzu/velero v1.16.1
go.uber.org/zap v1.27.0
Expand Down Expand Up @@ -56,13 +57,13 @@ require (
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/onsi/gomega v1.38.2 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_golang v1.23.2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.66.1 // indirect
github.com/prometheus/procfs v0.17.0 // indirect
Expand Down
2 changes: 1 addition & 1 deletion k8s/helm/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ spec:
- args:
- operator
- --health-probe-bind-address=:8081
- --metrics-bind-address=127.0.0.1:8080
- --metrics-bind-address=:8080
env:
- name: LOG_LEVEL
value: {{ .Values.manager.env.logLevel | default "info" }}
Expand Down
12 changes: 12 additions & 0 deletions k8s/helm/templates/metrics-reader-rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "k8s-backup-operator.name" . }}-metrics-reader
labels:
app.kubernetes.io/component: rbac
{{- include "k8s-backup-operator.labels" . | nindent 4 }}
rules:
- nonResourceURLs:
- /metrics
verbs:
- get
16 changes: 16 additions & 0 deletions k8s/helm/templates/metrics-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
apiVersion: v1
kind: Service
metadata:
name: {{ include "k8s-backup-operator.name" . }}-controller-manager-metrics-service
labels:
app.kubernetes.io/component: kube-rbac-proxy
{{- include "k8s-backup-operator.labels" . | nindent 4 }}
spec:
type: ClusterIP
selector:
{{- include "k8s-backup-operator.selectorLabels" . | nindent 4 }}
ports:
- name: metrics
port: 8080
protocol: TCP
targetPort: 8080
19 changes: 19 additions & 0 deletions k8s/helm/templates/restore-dogu-modifier-configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: k8s-backup-operator-restore-dogu-modifier
data:
dogu-modifier: |
version: v1
resourceModifierRules:
- conditions:
groupResource: dogus.k8s.cloudogu.com
namespaces:
- {{ .Release.Namespace }}
patches:
- operation: add
path: "/metadata/annotations"
value: "{}"
- operation: add
path: "/metadata/annotations/wasRestored"
value: "yes"
20 changes: 20 additions & 0 deletions k8s/helm/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: {{ include "k8s-backup-operator.name" . }}-servicemonitor
labels:
release: k8s-prometheus
{{- include "k8s-backup-operator.labels" . | nindent 4 }}
spec:
selector:
matchLabels:
{{- include "k8s-backup-operator.selectorLabels" . | nindent 6 }}
endpoints:
- port: metrics
scheme: http
path: /metrics
interval: 30s
# tlsConfig:
# insecureSkipVerify: false
# caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
# bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
5 changes: 5 additions & 0 deletions k8s/helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,8 @@ initContainer:
schedule:
name: ces-schedule
cron: "00 02 * * *"
metrics:
serviceMonitor:
enabled: true
additionalLabels:
release: prometheus
4 changes: 4 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"os"
"time"

"github.com/cloudogu/k8s-backup-operator/pkg/metrics"
"github.com/cloudogu/k8s-backup-operator/pkg/ownerreference"
"github.com/cloudogu/k8s-backup-operator/pkg/provider"
blueprintv3 "github.com/cloudogu/k8s-blueprint-lib/v3/client"
Expand Down Expand Up @@ -84,6 +85,8 @@ func main() {

config.ConfigureLogger()

metrics.RegisterMetrics()

logger := log.FromContext(ctx).WithName("main")

if len(os.Args) < 2 {
Expand Down Expand Up @@ -234,6 +237,7 @@ func getK8sManagerOptions(flags *flag.FlagSet, args []string, operatorConfig *co
LeaderElectionID: "e3f6c1a7.cloudogu.com",
LeaseDuration: &leaseDuration,
RenewDeadline: &renewDeadline,
Metrics: server.Options{BindAddress: ":8080"},
}
controllerOpts = parseManagerFlags(flags, args, controllerOpts)

Expand Down
5 changes: 5 additions & 0 deletions pkg/backup/backupCreateManager.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (

v1 "github.com/cloudogu/k8s-backup-lib/api/v1"
annotationsPkg "github.com/cloudogu/k8s-backup-operator/pkg/annotations"
"github.com/cloudogu/k8s-backup-operator/pkg/metrics"
"github.com/cloudogu/k8s-backup-operator/pkg/provider"
blueprintv3 "github.com/cloudogu/k8s-blueprint-lib/v3/client"
"github.com/cloudogu/k8s-registry-lib/repository"
Expand Down Expand Up @@ -42,13 +43,15 @@ func newBackupCreateManager(k8sClient k8sClient, clientSet ecosystemInterface, b

func (bcm *backupCreateManager) create(ctx context.Context, backup *v1.Backup) error {
logger := log.FromContext(ctx)
metrics.InitBackupStatusMetrics(bcm.namespace, backup.Name)
bcm.recorder.Event(backup, corev1.EventTypeNormal, v1.CreateEventReason, "Start backup process")
backupClient := bcm.clientSet.EcosystemV1Alpha1().Backups(bcm.namespace)

backup, err := backupClient.UpdateStatusInProgress(ctx, backup)
if err != nil {
return fmt.Errorf("failed to set status [%s] in backup resource: %w", v1.BackupStatusInProgress, err)
}
metrics.UpdateBackupStatusMetrics(bcm.namespace, backup.Name, v1.BackupStatusInProgress)

backup.Status.StartTimestamp = metav1.Now()
backup, err = backupClient.UpdateStatus(ctx, backup, metav1.UpdateOptions{})
Expand Down Expand Up @@ -105,6 +108,7 @@ func (bcm *backupCreateManager) create(ctx context.Context, backup *v1.Backup) e
if updateStatusErr != nil {
err = errors.Join(err, fmt.Errorf("failed to update backups status to 'Failed': %w", updateStatusErr))
}
metrics.UpdateBackupStatusMetrics(bcm.namespace, backup.Name, v1.BackupStatusFailed)

return err
}
Expand All @@ -113,6 +117,7 @@ func (bcm *backupCreateManager) create(ctx context.Context, backup *v1.Backup) e
if err != nil {
return fmt.Errorf("failed to set status [%s] in backup resource: %w", v1.BackupStatusCompleted, err)
}
metrics.UpdateBackupStatusMetrics(bcm.namespace, backup.Name, v1.BackupStatusCompleted)

return nil
}
Expand Down
5 changes: 4 additions & 1 deletion pkg/backup/backupReconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ package backup
import (
"context"
"fmt"
"strings"

"github.com/cloudogu/k8s-backup-operator/pkg/metrics"
"github.com/cloudogu/k8s-backup-operator/pkg/requeue"
"sigs.k8s.io/controller-runtime/pkg/client"
"strings"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -47,6 +49,7 @@ func NewBackupReconciler(clientSet ecosystemInterface, recorder eventRecorder, n
// - https://pkg.go.dev/sigs.k8s.io/[email protected]/pkg/reconcile
func (r *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
logger := log.FromContext(ctx)
metrics.UpdateBackupReconcileTotalMetric()

backup, err := r.clientSet.EcosystemV1Alpha1().Backups(r.namespace).Get(ctx, req.Name, metav1.GetOptions{})
if err != nil {
Expand Down
88 changes: 88 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
package metrics

import (
v1 "github.com/cloudogu/k8s-backup-lib/api/v1"
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// Custom collectors exposed by the backup operator. They only become visible
// on the metrics endpoint after RegisterMetrics has been called.
var (
	// BackupReconcileTotal is the counter exported as "backup_reconcile_total".
	BackupReconcileTotal = prometheus.NewCounter(
		prometheus.CounterOpts{
			Name: "backup_reconcile_total",
			Help: "Total number of reconciles of the backup custom resource.",
		},
	)

	// RestoreReconcileTotal is the counter exported as "restore_reconcile_total".
	RestoreReconcileTotal = prometheus.NewCounter(
		prometheus.CounterOpts{
			Name: "restore_reconcile_total",
			Help: "Total number of reconciles of the restore custom resource.",
		},
	)

	// BackupStatusTransitionsTotal is the counter vector exported as
	// "backup_status_transitions_total". Label values must be supplied in the
	// order namespace, name, to.
	BackupStatusTransitionsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "backup_status_transitions_total",
			Help: "Number of backup status transitions labeled by 'to'.",
		},
		[]string{
			"namespace",
			"name",
			"to",
		},
	)

	// RestoreStatusTransitionsTotal is the counter vector exported as
	// "restore_status_transitions_total". Label values must be supplied in the
	// order namespace, name, to, backup_name.
	RestoreStatusTransitionsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "restore_status_transitions_total",
			Help: "Number of restore status transitions labeled by 'to'.",
		},
		[]string{
			"namespace",
			"name",
			"to",
			"backup_name",
		},
	)
)

// RegisterMetrics registers custom metrics with the global prometheus registry
func RegisterMetrics() {
metrics.Registry.MustRegister(BackupReconcileTotal, BackupStatusTransitionsTotal, RestoreReconcileTotal, RestoreStatusTransitionsTotal)
}

// ### Backup ###

// UpdateBackupStatusMetrics records that the backup identified by namespace and
// name has moved to newStatus by incrementing the matching transition counter.
func UpdateBackupStatusMetrics(namespace, name, newStatus string) {
	// Address the series by label name so the mapping to the vector's
	// labels does not depend on argument order.
	transition := BackupStatusTransitionsTotal.With(prometheus.Labels{
		"namespace": namespace,
		"name":      name,
		"to":        newStatus,
	})
	transition.Inc()
}

// InitBackupStatusMetrics prepares the transition counters of a single backup
// resource: every follow-up status gets a series (without changing its value)
// and the transition to the "new" status is counted once.
func InitBackupStatusMetrics(namespace, name string) {
	// Adding 0 makes sure a series exists for each follow-up status, so a later
	// increase can be observed from a known starting point.
	followUpStatuses := [...]string{
		v1.BackupStatusInProgress,
		v1.BackupStatusCompleted,
		v1.BackupStatusFailed,
		v1.BackupStatusDeleting,
	}
	for i := range followUpStatuses {
		series := BackupStatusTransitionsTotal.WithLabelValues(namespace, name, followUpStatuses[i])
		series.Add(0)
	}

	// Every initialised backup starts with one transition to the "new" status.
	BackupStatusTransitionsTotal.WithLabelValues(namespace, name, v1.BackupStatusNew).Inc()
}

// UpdateBackupReconcileTotalMetric counts one more reconcile of the backup
// custom resource.
func UpdateBackupReconcileTotalMetric() {
	const oneReconcile = 1

	BackupReconcileTotal.Add(oneReconcile)
}

// ### Restore ###

// UpdateRestoreStatusMetrics records that the restore identified by namespace
// and name, restoring from backupName, has moved to newStatus by incrementing
// the matching transition counter.
func UpdateRestoreStatusMetrics(namespace, name, backupName, newStatus string) {
	// The parameter order (backupName before newStatus) differs from the
	// vector's label order (to before backup_name); naming the labels keeps
	// the two from being mixed up.
	transition := RestoreStatusTransitionsTotal.With(prometheus.Labels{
		"namespace":   namespace,
		"name":        name,
		"to":          newStatus,
		"backup_name": backupName,
	})
	transition.Inc()
}

// InitRestoreStatusMetrics prepares the transition counters of a single restore
// resource: every follow-up status gets a series (without changing its value)
// and the transition to the "new" status is counted once.
func InitRestoreStatusMetrics(namespace, name, backupName string) {
	// Adding 0 makes sure a series exists for each follow-up status, so a later
	// increase can be observed from a known starting point.
	followUpStatuses := [...]string{
		v1.RestoreStatusInProgress,
		v1.RestoreStatusCompleted,
		v1.RestoreStatusFailed,
		v1.RestoreStatusDeleting,
	}
	for i := range followUpStatuses {
		series := RestoreStatusTransitionsTotal.WithLabelValues(namespace, name, followUpStatuses[i], backupName)
		series.Add(0)
	}

	// Every initialised restore starts with one transition to the "new" status.
	RestoreStatusTransitionsTotal.WithLabelValues(namespace, name, v1.RestoreStatusNew, backupName).Inc()
}

// UpdateRestoreReconcileTotalMetric counts one more reconcile of the restore
// custom resource.
func UpdateRestoreReconcileTotalMetric() {
	const oneReconcile = 1

	RestoreReconcileTotal.Add(oneReconcile)
}
103 changes: 103 additions & 0 deletions pkg/metrics/metrics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
package metrics

import (
"testing"

v1 "github.com/cloudogu/k8s-backup-lib/api/v1"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/assert"
)

// TestUpdateBackupReconcileTotalMetric verifies that one call raises the backup
// reconcile counter by exactly one, independent of its previous value.
func TestUpdateBackupReconcileTotalMetric(t *testing.T) {
	t.Run("should increment backup reconcile total metric", func(t *testing.T) {
		// given
		before := testutil.ToFloat64(BackupReconcileTotal)

		// when
		UpdateBackupReconcileTotalMetric()

		// then
		after := testutil.ToFloat64(BackupReconcileTotal)
		assert.Equal(t, before+1, after)
	})
}

// TestInitBackupStatusMetrics verifies that initialisation counts the "new"
// status once and leaves all follow-up statuses at zero.
func TestInitBackupStatusMetrics(t *testing.T) {
	t.Run("should initialize backup status metrics correctly", func(t *testing.T) {
		// given
		const (
			ns         = "test-ns"
			backupName = "test-backup"
		)
		// Start from an empty vector so samples from other tests cannot
		// influence the absolute values asserted below.
		BackupStatusTransitionsTotal.Reset()
		valueFor := func(status string) float64 {
			return testutil.ToFloat64(BackupStatusTransitionsTotal.WithLabelValues(ns, backupName, status))
		}

		// when
		InitBackupStatusMetrics(ns, backupName)

		// then
		assert.Equal(t, 1.0, valueFor(v1.BackupStatusNew), "expected status '%s' to be 1", v1.BackupStatusNew)

		untouched := []string{
			v1.BackupStatusInProgress,
			v1.BackupStatusCompleted,
			v1.BackupStatusFailed,
			v1.BackupStatusDeleting,
		}
		for _, status := range untouched {
			assert.Equal(t, 0.0, valueFor(status), "expected status '%s' to be initialized to 0", status)
		}
	})
}

// TestUpdateBackupStatusMetrics verifies that an update raises the counter of
// the addressed status series by exactly one.
func TestUpdateBackupStatusMetrics(t *testing.T) {
	t.Run("should increment specific backup status metric", func(t *testing.T) {
		// given
		const (
			ns         = "test-ns"
			backupName = "test-backup-update"
		)
		targetStatus := v1.BackupStatusFailed
		series := BackupStatusTransitionsTotal.WithLabelValues(ns, backupName, targetStatus)
		before := testutil.ToFloat64(series)

		// when
		UpdateBackupStatusMetrics(ns, backupName, targetStatus)

		// then
		after := testutil.ToFloat64(series)
		assert.Equal(t, before+1, after)
	})
}

// TestUpdateRestoreReconcileTotalMetric verifies that one call raises the
// restore reconcile counter by exactly one, independent of its previous value.
func TestUpdateRestoreReconcileTotalMetric(t *testing.T) {
	t.Run("should increment restore reconcile total metric", func(t *testing.T) {
		// given
		before := testutil.ToFloat64(RestoreReconcileTotal)

		// when
		UpdateRestoreReconcileTotalMetric()

		// then
		after := testutil.ToFloat64(RestoreReconcileTotal)
		assert.Equal(t, before+1, after)
	})
}

// TestInitRestoreStatusMetrics verifies that initialisation counts the "new"
// status once and leaves all follow-up statuses at zero.
func TestInitRestoreStatusMetrics(t *testing.T) {
	t.Run("should initialize restore status metrics correctly", func(t *testing.T) {
		// given
		const (
			ns           = "test-ns"
			restoreName  = "test-restore"
			sourceBackup = "source-backup"
		)
		// Start from an empty vector so samples from other tests cannot
		// influence the absolute values asserted below.
		RestoreStatusTransitionsTotal.Reset()
		valueFor := func(status string) float64 {
			return testutil.ToFloat64(RestoreStatusTransitionsTotal.WithLabelValues(ns, restoreName, status, sourceBackup))
		}

		// when
		InitRestoreStatusMetrics(ns, restoreName, sourceBackup)

		// then
		assert.Equal(t, 1.0, valueFor(v1.RestoreStatusNew), "expected status '%s' to be 1", v1.RestoreStatusNew)

		untouched := []string{
			v1.RestoreStatusInProgress,
			v1.RestoreStatusCompleted,
			v1.RestoreStatusFailed,
			v1.RestoreStatusDeleting,
		}
		for _, status := range untouched {
			assert.Equal(t, 0.0, valueFor(status), "expected status '%s' to be initialized to 0", status)
		}
	})
}

// TestUpdateRestoreStatusMetrics verifies that an update raises the counter of
// the addressed status series by exactly one.
func TestUpdateRestoreStatusMetrics(t *testing.T) {
	t.Run("should increment specific restore status metric", func(t *testing.T) {
		// given
		const (
			ns           = "test-ns"
			restoreName  = "test-restore-update"
			sourceBackup = "source-backup"
		)
		targetStatus := v1.RestoreStatusCompleted
		series := RestoreStatusTransitionsTotal.WithLabelValues(ns, restoreName, targetStatus, sourceBackup)
		before := testutil.ToFloat64(series)

		// when
		UpdateRestoreStatusMetrics(ns, restoreName, sourceBackup, targetStatus)

		// then
		after := testutil.ToFloat64(series)
		assert.Equal(t, before+1, after)
	})
}
Loading