Skip to content

Commit 9a049dc

Browse files
committed
changes to allow custom labels for ServiceMonitor
Signed-off-by: Saurabh Choudhary <[email protected]>
1 parent 751bf09 commit 9a049dc

File tree

5 files changed

+248
-107
lines changed

api/nvidia/v1/clusterpolicy_types.go

+43-38
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,40 @@ func (r Runtime) String() string {
124124
}
125125
}
126126

127+
// ServiceMonitorConfig defines configuration options for the ServiceMonitor
128+
// deployed for NVIDIA GPU Operator resources
129+
type ServiceMonitorConfig struct {
130+
// Enabled indicates if ServiceMonitor is deployed
131+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
132+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable deployment of ServiceMonitor"
133+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
134+
Enabled *bool `json:"enabled,omitempty"`
135+
136+
// Interval which metrics should be scraped from. If not specified Prometheus’ global scrape interval is used.
137+
// Supported units: y, w, d, h, m, s, ms
138+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
139+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Interval which metrics should be scraped from"
140+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
141+
Interval promv1.Duration `json:"interval,omitempty"`
142+
143+
// HonorLabels chooses the metric’s labels on collisions with target labels.
144+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
145+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Choose the metric's label on collisions with target labels"
146+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
147+
HonorLabels *bool `json:"honorLabels,omitempty"`
148+
149+
// AdditionalLabels to add to ServiceMonitor instance
150+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
151+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Additional labels to add to ServiceMonitor instance"
152+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
153+
AdditionalLabels map[string]string `json:"additionalLabels,omitempty"`
154+
155+
// Relabelings allows to rewrite labels on metric sets
156+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
157+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Relabelings allows to rewrite labels on metric sets"
158+
Relabelings []*promv1.RelabelConfig `json:"relabelings,omitempty"`
159+
}
160+
127161
// OperatorSpec describes configuration options for the operator
128162
type OperatorSpec struct {
129163
// +kubebuilder:validation:Enum=docker;crio;containerd
@@ -143,6 +177,11 @@ type OperatorSpec struct {
143177
// queryable and should be preserved when modifying objects.
144178
Annotations map[string]string `json:"annotations,omitempty"`
145179

180+
// Optional: ServiceMonitor configuration for NVIDIA GPU Operator
181+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
182+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ServiceMonitor configuration for NVIDIA GPU Operator"
183+
ServiceMonitor *ServiceMonitorConfig `json:"serviceMonitor,omitempty"`
184+
146185
// UseOpenShiftDriverToolkit indicates if DriverToolkit image should be used on OpenShift to build and install driver modules
147186
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
148187
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="On OpenShift, enable DriverToolkit image to build and install driver modules"
@@ -901,7 +940,7 @@ type DCGMExporterSpec struct {
901940
// Optional: ServiceMonitor configuration for NVIDIA DCGM Exporter
902941
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
903942
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ServiceMonitor configuration for NVIDIA DCGM Exporter"
904-
ServiceMonitor *DCGMExporterServiceMonitorConfig `json:"serviceMonitor,omitempty"`
943+
ServiceMonitor *ServiceMonitorConfig `json:"serviceMonitor,omitempty"`
905944
}
906945

907946
// DCGMExporterMetricsConfig defines metrics to be collected by NVIDIA DCGM Exporter
@@ -914,40 +953,6 @@ type DCGMExporterMetricsConfig struct {
914953
Name string `json:"name,omitempty"`
915954
}
916955

917-
// DCGMExporterServiceMonitorConfig defines configuration options for the ServiceMonitor
918-
// deployed for DCGM Exporter
919-
type DCGMExporterServiceMonitorConfig struct {
920-
// Enabled indicates if ServiceMonitor is deployed for NVIDIA DCGM Exporter
921-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
922-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable deployment of NVIDIA DCGM Exporter ServiceMonitor"
923-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
924-
Enabled *bool `json:"enabled,omitempty"`
925-
926-
// Interval which metrics should be scraped from NVIDIA DCGM Exporter. If not specified Prometheus’ global scrape interval is used.
927-
// Supported units: y, w, d, h, m, s, ms
928-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
929-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Interval which metrics should be scraped from NVDIA DCGM Exporter"
930-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
931-
Interval promv1.Duration `json:"interval,omitempty"`
932-
933-
// HonorLabels chooses the metric’s labels on collisions with target labels.
934-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
935-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Choose the metric's label on collisions with target labels"
936-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
937-
HonorLabels *bool `json:"honorLabels,omitempty"`
938-
939-
// AdditionalLabels to add to ServiceMonitor instance for NVIDIA DCGM Exporter
940-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
941-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Additional labels to add to ServiceMonitor instance for NVIDIA DCGM Exporter"
942-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
943-
AdditionalLabels map[string]string `json:"additionalLabels,omitempty"`
944-
945-
// Relabelings allows to rewrite labels on metric sets for NVIDIA DCGM Exporter
946-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
947-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Relabelings allows to rewrite labels on metric sets for NVIDIA DCGM Exporter"
948-
Relabelings []*promv1.RelabelConfig `json:"relabelings,omitempty"`
949-
}
950-
951956
// DCGMSpec defines the properties for NVIDIA DCGM deployment
952957
type DCGMSpec struct {
953958
// Enabled indicates if deployment of NVIDIA DCGM Hostengine as a separate pod is enabled.
@@ -2040,10 +2045,10 @@ func (dcgm *DCGMSpec) IsEnabled() bool {
20402045
return *dcgm.Enabled
20412046
}
20422047

2043-
// IsEnabled returns true if ServiceMonitor for DCGM Exporter is enabled through gpu-operator
2044-
func (sm *DCGMExporterServiceMonitorConfig) IsEnabled() bool {
2048+
// IsEnabled returns true if ServiceMonitor is enabled through gpu-operator
2049+
func (sm *ServiceMonitorConfig) IsEnabled() bool {
20452050
if sm.Enabled == nil {
2046-
// ServiceMonitor for DCGM Exporter is disabled by default
2051+
// ServiceMonitor is disabled by default
20472052
return false
20482053
}
20492054
return *sm.Enabled

api/nvidia/v1/zz_generated.deepcopy.go

+49-44
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/nvidia.com_clusterpolicies.yaml

+120-4
Original file line numberDiff line numberDiff line change
@@ -389,25 +389,23 @@ spec:
389389
additionalProperties:
390390
type: string
391391
description: AdditionalLabels to add to ServiceMonitor instance
392-
for NVIDIA DCGM Exporter
393392
type: object
394393
enabled:
395394
description: Enabled indicates if ServiceMonitor is deployed
396-
for NVIDIA DCGM Exporter
397395
type: boolean
398396
honorLabels:
399397
description: HonorLabels chooses the metric’s labels on collisions
400398
with target labels.
401399
type: boolean
402400
interval:
403401
description: |-
404-
Interval which metrics should be scraped from NVIDIA DCGM Exporter. If not specified Prometheus’ global scrape interval is used.
402+
Interval which metrics should be scraped from. If not specified Prometheus’ global scrape interval is used.
405403
Supported units: y, w, d, h, m, s, ms
406404
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
407405
type: string
408406
relabelings:
409407
description: Relabelings allows to rewrite labels on metric
410-
sets for NVIDIA DCGM Exporter
408+
sets
411409
items:
412410
description: |-
413411
RelabelConfig allows dynamic rewriting of the label set for targets, alerts,
@@ -1553,6 +1551,124 @@ spec:
15531551
runtimeClass:
15541552
default: nvidia
15551553
type: string
1554+
serviceMonitor:
1555+
description: 'Optional: ServiceMonitor configuration for NVIDIA
1556+
GPU Operator'
1557+
properties:
1558+
additionalLabels:
1559+
additionalProperties:
1560+
type: string
1561+
description: AdditionalLabels to add to ServiceMonitor instance
1562+
type: object
1563+
enabled:
1564+
description: Enabled indicates if ServiceMonitor is deployed
1565+
type: boolean
1566+
honorLabels:
1567+
description: HonorLabels chooses the metric’s labels on collisions
1568+
with target labels.
1569+
type: boolean
1570+
interval:
1571+
description: |-
1572+
Interval which metrics should be scraped from. If not specified Prometheus’ global scrape interval is used.
1573+
Supported units: y, w, d, h, m, s, ms
1574+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
1575+
type: string
1576+
relabelings:
1577+
description: Relabelings allows to rewrite labels on metric
1578+
sets
1579+
items:
1580+
description: |-
1581+
RelabelConfig allows dynamic rewriting of the label set for targets, alerts,
1582+
scraped samples and remote write samples.
1583+
1584+
1585+
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config
1586+
properties:
1587+
action:
1588+
default: replace
1589+
description: |-
1590+
Action to perform based on the regex matching.
1591+
1592+
1593+
`Uppercase` and `Lowercase` actions require Prometheus >= v2.36.0.
1594+
`DropEqual` and `KeepEqual` actions require Prometheus >= v2.41.0.
1595+
1596+
1597+
Default: "Replace"
1598+
enum:
1599+
- replace
1600+
- Replace
1601+
- keep
1602+
- Keep
1603+
- drop
1604+
- Drop
1605+
- hashmod
1606+
- HashMod
1607+
- labelmap
1608+
- LabelMap
1609+
- labeldrop
1610+
- LabelDrop
1611+
- labelkeep
1612+
- LabelKeep
1613+
- lowercase
1614+
- Lowercase
1615+
- uppercase
1616+
- Uppercase
1617+
- keepequal
1618+
- KeepEqual
1619+
- dropequal
1620+
- DropEqual
1621+
type: string
1622+
modulus:
1623+
description: |-
1624+
Modulus to take of the hash of the source label values.
1625+
1626+
1627+
Only applicable when the action is `HashMod`.
1628+
format: int64
1629+
type: integer
1630+
regex:
1631+
description: Regular expression against which the extracted
1632+
value is matched.
1633+
type: string
1634+
replacement:
1635+
description: |-
1636+
Replacement value against which a Replace action is performed if the
1637+
regular expression matches.
1638+
1639+
1640+
Regex capture groups are available.
1641+
type: string
1642+
separator:
1643+
description: Separator is the string between concatenated
1644+
SourceLabels.
1645+
type: string
1646+
sourceLabels:
1647+
description: |-
1648+
The source labels select values from existing labels. Their content is
1649+
concatenated using the configured Separator and matched against the
1650+
configured regular expression.
1651+
items:
1652+
description: |-
1653+
LabelName is a valid Prometheus label name which may only contain ASCII
1654+
letters, numbers, as well as underscores.
1655+
pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$
1656+
type: string
1657+
type: array
1658+
targetLabel:
1659+
description: |-
1660+
Label to which the resulting string is written in a replacement.
1661+
1662+
1663+
It is mandatory for `Replace`, `HashMod`, `Lowercase`, `Uppercase`,
1664+
`KeepEqual` and `DropEqual` actions.
1665+
1666+
1667+
Regex capture groups are available.
1668+
type: string
1669+
type: object
1670+
type: array
1671+
type: object
15561672
use_ocp_driver_toolkit:
15571673
description: UseOpenShiftDriverToolkit indicates if DriverToolkit
15581674
image should be used on OpenShift to build and install driver

0 commit comments

Comments
 (0)