Skip to content

Commit 8e8e5ec

Browse files
committed
changes to allow custom labels for ServiceMonitor
Signed-off-by: Saurabh Choudhary <[email protected]>
1 parent 951df73 commit 8e8e5ec

File tree

5 files changed

+248
-102
lines changed

5 files changed

+248
-102
lines changed

api/nvidia/v1/clusterpolicy_types.go

+43-38
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,40 @@ func (r Runtime) String() string {
124124
}
125125
}
126126

127+
// ServiceMonitorConfig defines configuration options for the ServiceMonitor
128+
// deployed for NVIDIA GPU Operator resources
129+
type ServiceMonitorConfig struct {
130+
// Enabled indicates if ServiceMonitor is deployed
131+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
132+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable deployment of ServiceMonitor"
133+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
134+
Enabled *bool `json:"enabled,omitempty"`
135+
136+
// Interval at which metrics should be scraped. If not specified, Prometheus’ global scrape interval is used.
137+
// Supported units: y, w, d, h, m, s, ms
138+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
139+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Interval which metrics should be scraped from"
140+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
141+
Interval promv1.Duration `json:"interval,omitempty"`
142+
143+
// HonorLabels chooses the metric’s labels on collisions with target labels.
144+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
145+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Choose the metric's label on collisions with target labels"
146+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
147+
HonorLabels *bool `json:"honorLabels,omitempty"`
148+
149+
// AdditionalLabels to add to ServiceMonitor instance
150+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
151+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Additional labels to add to ServiceMonitor instance"
152+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
153+
AdditionalLabels map[string]string `json:"additionalLabels,omitempty"`
154+
155+
// Relabelings allows rewriting labels on metric sets
156+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
157+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Relabelings allows to rewrite labels on metric sets"
158+
Relabelings []*promv1.RelabelConfig `json:"relabelings,omitempty"`
159+
}
160+
127161
// OperatorSpec describes configuration options for the operator
128162
type OperatorSpec struct {
129163
// +kubebuilder:validation:Enum=docker;crio;containerd
@@ -143,6 +177,11 @@ type OperatorSpec struct {
143177
// queryable and should be preserved when modifying objects.
144178
Annotations map[string]string `json:"annotations,omitempty"`
145179

180+
// Optional: ServiceMonitor configuration for NVIDIA GPU Operator
181+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
182+
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ServiceMonitor configuration for NVIDIA GPU Operator"
183+
ServiceMonitor *ServiceMonitorConfig `json:"serviceMonitor,omitempty"`
184+
146185
// UseOpenShiftDriverToolkit indicates if DriverToolkit image should be used on OpenShift to build and install driver modules
147186
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
148187
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="On OpenShift, enable DriverToolkit image to build and install driver modules"
@@ -901,7 +940,7 @@ type DCGMExporterSpec struct {
901940
// Optional: ServiceMonitor configuration for NVIDIA DCGM Exporter
902941
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
903942
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="ServiceMonitor configuration for NVIDIA DCGM Exporter"
904-
ServiceMonitor *DCGMExporterServiceMonitorConfig `json:"serviceMonitor,omitempty"`
943+
ServiceMonitor *ServiceMonitorConfig `json:"serviceMonitor,omitempty"`
905944
}
906945

907946
// DCGMExporterMetricsConfig defines metrics to be collected by NVIDIA DCGM Exporter
@@ -914,40 +953,6 @@ type DCGMExporterMetricsConfig struct {
914953
Name string `json:"name,omitempty"`
915954
}
916955

917-
// DCGMExporterServiceMonitorConfig defines configuration options for the ServiceMonitor
918-
// deployed for DCGM Exporter
919-
type DCGMExporterServiceMonitorConfig struct {
920-
// Enabled indicates if ServiceMonitor is deployed for NVIDIA DCGM Exporter
921-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
922-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable deployment of NVIDIA DCGM Exporter ServiceMonitor"
923-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
924-
Enabled *bool `json:"enabled,omitempty"`
925-
926-
// Interval which metrics should be scraped from NVIDIA DCGM Exporter. If not specified Prometheus’ global scrape interval is used.
927-
// Supported units: y, w, d, h, m, s, ms
928-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
929-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Interval which metrics should be scraped from NVDIA DCGM Exporter"
930-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
931-
Interval promv1.Duration `json:"interval,omitempty"`
932-
933-
// HonorLabels chooses the metric’s labels on collisions with target labels.
934-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
935-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Choose the metric's label on collisions with target labels"
936-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
937-
HonorLabels *bool `json:"honorLabels,omitempty"`
938-
939-
// AdditionalLabels to add to ServiceMonitor instance for NVIDIA DCGM Exporter
940-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
941-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Additional labels to add to ServiceMonitor instance for NVIDIA DCGM Exporter"
942-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
943-
AdditionalLabels map[string]string `json:"additionalLabels,omitempty"`
944-
945-
// Relabelings allows to rewrite labels on metric sets for NVIDIA DCGM Exporter
946-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
947-
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Relabelings allows to rewrite labels on metric sets for NVIDIA DCGM Exporter"
948-
Relabelings []*promv1.RelabelConfig `json:"relabelings,omitempty"`
949-
}
950-
951956
// DCGMSpec defines the properties for NVIDIA DCGM deployment
952957
type DCGMSpec struct {
953958
// Enabled indicates if deployment of NVIDIA DCGM Hostengine as a separate pod is enabled.
@@ -2040,10 +2045,10 @@ func (dcgm *DCGMSpec) IsEnabled() bool {
20402045
return *dcgm.Enabled
20412046
}
20422047

2043-
// IsEnabled returns true if ServiceMonitor for DCGM Exporter is enabled through gpu-operator
2044-
func (sm *DCGMExporterServiceMonitorConfig) IsEnabled() bool {
2048+
// IsEnabled returns true if ServiceMonitor is enabled through gpu-operator
2049+
func (sm *ServiceMonitorConfig) IsEnabled() bool {
20452050
if sm.Enabled == nil {
2046-
// ServiceMonitor for DCGM Exporter is disabled by default
2051+
// ServiceMonitor is disabled by default
20472052
return false
20482053
}
20492054
return *sm.Enabled

api/nvidia/v1/zz_generated.deepcopy.go

+49-44
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/nvidia.com_clusterpolicies.yaml

+120-4
Original file line numberDiff line numberDiff line change
@@ -389,25 +389,23 @@ spec:
389389
additionalProperties:
390390
type: string
391391
description: AdditionalLabels to add to ServiceMonitor instance
392-
for NVIDIA DCGM Exporter
393392
type: object
394393
enabled:
395394
description: Enabled indicates if ServiceMonitor is deployed
396-
for NVIDIA DCGM Exporter
397395
type: boolean
398396
honorLabels:
399397
description: HonorLabels chooses the metric’s labels on collisions
400398
with target labels.
401399
type: boolean
402400
interval:
403401
description: |-
404-
Interval which metrics should be scraped from NVIDIA DCGM Exporter. If not specified Prometheus’ global scrape interval is used.
402+
Interval which metrics should be scraped from. If not specified Prometheus’ global scrape interval is used.
405403
Supported units: y, w, d, h, m, s, ms
406404
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
407405
type: string
408406
relabelings:
409407
description: Relabelings allows to rewrite labels on metric
410-
sets for NVIDIA DCGM Exporter
408+
sets
411409
items:
412410
description: |-
413411
RelabelConfig allows dynamic rewriting of the label set for targets, alerts,
@@ -1560,6 +1558,124 @@ spec:
15601558
runtimeClass:
15611559
default: nvidia
15621560
type: string
1561+
serviceMonitor:
1562+
description: 'Optional: ServiceMonitor configuration for NVIDIA
1563+
GPU Operator'
1564+
properties:
1565+
additionalLabels:
1566+
additionalProperties:
1567+
type: string
1568+
description: AdditionalLabels to add to ServiceMonitor instance
1569+
type: object
1570+
enabled:
1571+
description: Enabled indicates if ServiceMonitor is deployed
1572+
type: boolean
1573+
honorLabels:
1574+
description: HonorLabels chooses the metric’s labels on collisions
1575+
with target labels.
1576+
type: boolean
1577+
interval:
1578+
description: |-
1579+
Interval which metrics should be scraped from. If not specified Prometheus’ global scrape interval is used.
1580+
Supported units: y, w, d, h, m, s, ms
1581+
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
1582+
type: string
1583+
relabelings:
1584+
description: Relabelings allows to rewrite labels on metric
1585+
sets
1586+
items:
1587+
description: |-
1588+
RelabelConfig allows dynamic rewriting of the label set for targets, alerts,
1589+
scraped samples and remote write samples.
1590+
1591+
1592+
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config
1593+
properties:
1594+
action:
1595+
default: replace
1596+
description: |-
1597+
Action to perform based on the regex matching.
1598+
1599+
1600+
`Uppercase` and `Lowercase` actions require Prometheus >= v2.36.0.
1601+
`DropEqual` and `KeepEqual` actions require Prometheus >= v2.41.0.
1602+
1603+
1604+
Default: "Replace"
1605+
enum:
1606+
- replace
1607+
- Replace
1608+
- keep
1609+
- Keep
1610+
- drop
1611+
- Drop
1612+
- hashmod
1613+
- HashMod
1614+
- labelmap
1615+
- LabelMap
1616+
- labeldrop
1617+
- LabelDrop
1618+
- labelkeep
1619+
- LabelKeep
1620+
- lowercase
1621+
- Lowercase
1622+
- uppercase
1623+
- Uppercase
1624+
- keepequal
1625+
- KeepEqual
1626+
- dropequal
1627+
- DropEqual
1628+
type: string
1629+
modulus:
1630+
description: |-
1631+
Modulus to take of the hash of the source label values.
1632+
1633+
1634+
Only applicable when the action is `HashMod`.
1635+
format: int64
1636+
type: integer
1637+
regex:
1638+
description: Regular expression against which the extracted
1639+
value is matched.
1640+
type: string
1641+
replacement:
1642+
description: |-
1643+
Replacement value against which a Replace action is performed if the
1644+
regular expression matches.
1645+
1646+
1647+
Regex capture groups are available.
1648+
type: string
1649+
separator:
1650+
description: Separator is the string between concatenated
1651+
SourceLabels.
1652+
type: string
1653+
sourceLabels:
1654+
description: |-
1655+
The source labels select values from existing labels. Their content is
1656+
concatenated using the configured Separator and matched against the
1657+
configured regular expression.
1658+
items:
1659+
description: |-
1660+
LabelName is a valid Prometheus label name which may only contain ASCII
1661+
letters, numbers, as well as underscores.
1662+
pattern: ^[a-zA-Z_][a-zA-Z0-9_]*$
1663+
type: string
1664+
type: array
1665+
targetLabel:
1666+
description: |-
1667+
Label to which the resulting string is written in a replacement.
1668+
1669+
1670+
It is mandatory for `Replace`, `HashMod`, `Lowercase`, `Uppercase`,
1671+
`KeepEqual` and `DropEqual` actions.
1672+
1673+
1674+
Regex capture groups are available.
1675+
type: string
1676+
type: object
1677+
type: array
1678+
type: object
15631679
use_ocp_driver_toolkit:
15641680
description: UseOpenShiftDriverToolkit indicates if DriverToolkit
15651681
image should be used on OpenShift to build and install driver

0 commit comments

Comments
 (0)