Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions karpenter/changelog.d/21819.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add support for Karpenter v1.8 metrics
15 changes: 15 additions & 0 deletions karpenter/datadog_checks/karpenter/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
'aws_sdk_go_request_duration_seconds': 'aws.sdk_go.request.duration_seconds',
'aws_sdk_go_request_attempt': 'aws.sdk_go.request_attempt',
'aws_sdk_go_request_attempt_duration_seconds': 'aws.sdk_go.request_attempt.duration_seconds',
'client_go_request': 'client_go_request',
'client_go_request_duration_seconds': 'client_go_request_duration_seconds',
'controller_runtime_conversion_webhook_panics': 'controller_runtime_conversion_webhook_panics',
'certwatcher_read_certificate': 'certwatcher.read.certificate',
'certwatcher_read_certificate_errors': 'certwatcher.read.certificate.errors',
'controller_runtime_active_workers': 'controller.runtime.active_workers',
Expand Down Expand Up @@ -101,6 +104,10 @@
'karpenter_nodepool_usage': 'nodepool_usage',
'karpenter_nodes_allocatable': 'nodes.allocatable',
'karpenter_nodes_created': 'nodes.created',
'karpenter_nodes_current_lifetime_seconds': 'nodes_current_lifetime_seconds',
'karpenter_nodes_drained': 'nodes_drained',
'karpenter_nodes_eviction_requests': 'nodes_eviction_requests',
'karpenter_nodes_lifetime_duration_seconds': 'nodes_lifetime_duration_seconds',
'karpenter_nodes_terminated': 'nodes.terminated',
'karpenter_nodes_leases_deleted': 'nodes.leases_deleted',
'karpenter_nodes_system_overhead': 'nodes.system_overhead',
Expand Down Expand Up @@ -156,20 +163,28 @@
'operator_ec2nodeclass_status_condition_transitions': 'operator.ec2nodeclass.status_condition.transitions',
'operator_ec2nodeclass_status_condition_current_status_seconds': 'operator.ec2nodeclass.status_condition.current_status.seconds',
'operator_ec2nodeclass_status_condition_count': 'operator.ec2nodeclass.status_condition_count',
'operator_ec2nodeclass_status_condition_transition_seconds': 'operator_ec2nodeclass_status_condition_transition_seconds',
'operator_ec2nodeclass_termination_current_time_seconds': 'operator_ec2nodeclass_termination_current_time_seconds',
'operator_ec2nodeclass_termination_duration_seconds': 'operator_ec2nodeclass_termination_duration_seconds',
'operator_node_event_count': 'operator.node.event_count',
'operator_node_status_condition_transitions': 'operator.node.status_condition.transitions',
'operator_node_status_condition_transition_seconds': 'operator.node.status_condition.transitions.seconds',
'operator_node_status_condition_current_status_seconds': 'operator.node.status_condition.current_status.seconds',
'operator_node_status_condition_count': 'operator.node.status_condition_count',
'operator_node_termination_duration_seconds': 'operator.node.termination.duration_seconds',
'operator_node_termination_current_time_seconds': 'operator_node_termination_current_time_seconds',
'operator_nodeclaim_status_condition_transitions': 'operator.nodeclaim.status_condition.transitions',
'operator_nodeclaim_status_condition_transition_seconds': 'operator.nodeclaim.status_condition.transitions.seconds',
'operator_nodeclaim_status_condition_current_status_seconds': 'operator.nodeclaim.status_condition.current_status.seconds',
'operator_nodeclaim_status_condition_count': 'operator.nodeclaim.status_condition_count',
'operator_nodeclaim_termination_duration_seconds': 'operator.nodeclaim.termination.duration_seconds',
'operator_nodeclaim_termination_current_time_seconds': 'operator_nodeclaim_termination_current_time_seconds',
'operator_nodepool_status_condition_transitions': 'operator.nodepool.status_condition.transitions',
'operator_nodepool_status_condition_current_status_seconds': 'operator.nodepool.status_condition.current_status.seconds',
'operator_nodepool_status_condition_count': 'operator.nodepool.status_condition_count',
'operator_nodepool_status_condition_transition_seconds': 'operator_nodepool_status_condition_transition_seconds',
'operator_nodepool_termination_current_time_seconds': 'operator_nodepool_termination_current_time_seconds',
'operator_nodepool_termination_duration_seconds': 'operator_nodepool_termination_duration_seconds',
}

RENAME_LABELS_MAP = {
Expand Down
29 changes: 28 additions & 1 deletion karpenter/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ karpenter.aws.sdk_go.request_attempt.duration_seconds.sum,count,,second,,Sum of
karpenter.build_info,gauge,,,,A metric with a constant '1' value labeled by version from which Karpenter was built.,0,karpenter,,,
karpenter.certwatcher.read.certificate.count,count,,read,,The count of certificate reads,0,karpenter,,,
karpenter.certwatcher.read.certificate.errors.count,count,,error,,The count of certificate read errors,0,karpenter,,,
karpenter.client_go_request.count,count,,request,,The count of client-go requests,0,karpenter,,,
karpenter.client_go_request_duration_seconds.bucket,count,,,,Histogram buckets for client-go request durations,0,karpenter,,,
karpenter.client_go_request_duration_seconds.count,count,,,,Count of client-go request durations,0,karpenter,,,
karpenter.client_go_request_duration_seconds.sum,count,,second,,Sum of client-go request durations,0,karpenter,,,
karpenter.cloudprovider.batcher.batch.time_seconds.bucket,count,,,,The count of observation in the batching window histogram by `upper_bound` buckets,0,karpenter,,,
karpenter.cloudprovider.batcher.batch.time_seconds.count,count,,,,The count of observation in the batching window histogram,0,karpenter,,,
karpenter.cloudprovider.batcher.batch.time_seconds.sum,count,,second,,The sum of the duration of the batching window per batcher,0,karpenter,,,
Expand Down Expand Up @@ -38,6 +42,7 @@ karpenter.controller.runtime.reconcile.time_seconds.sum,count,,second,,The sum o
karpenter.controller.runtime.reconcile_errors.count,count,,error,,The count of reconciliation errors per controller,0,karpenter,,,
karpenter.controller.runtime.reconcile_panics.count,count,,,,Total number of reconciliation panics per controller,0,karpenter,,,
karpenter.controller.runtime.terminal.reconcile.errors.count,count,,,,Total number of terminal reconciliation errors per controller,0,karpenter,,,
karpenter.controller_runtime_conversion_webhook_panics.count,count,,,,Total number of conversion webhook panics,0,karpenter,,,
karpenter.deprovisioning.actions_performed.count,count,,execution,,The count of deprovisioning actions performed. Labeled by deprovisioner (Deprecated in v1.0+),0,karpenter,,,
karpenter.deprovisioning.consolidation_timeouts,gauge,,timeout,,Number of times the Consolidation algorithm has reached a timeout. Labeled by consolidation type (Deprecated in v1.0+),0,karpenter,,,
karpenter.deprovisioning.eligible_machines,gauge,,,,Number of machines eligible for deprovisioning by Karpenter. Labeled by deprovisioner (Deprecated in v1.0+),0,karpenter,,,
Expand Down Expand Up @@ -133,6 +138,12 @@ karpenter.nodes.total.daemon_limits,gauge,,,,Total resources specified by Daemon
karpenter.nodes.total.daemon_requests,gauge,,,,Total resources requested by DaemonSet pods,0,karpenter,,,
karpenter.nodes.total.pod_limits,gauge,,,,Total pod resources specified by non-DaemonSet pod limits,0,karpenter,,,
karpenter.nodes.total.pod_requests,gauge,,,,Total pod resources requested by non-DaemonSet pods bound,0,karpenter,,,
karpenter.nodes_current_lifetime_seconds,gauge,,second,,Current lifetime of nodes in seconds,0,karpenter,,,
karpenter.nodes_drained.count,count,,node,,Count of nodes drained,0,karpenter,,,
karpenter.nodes_eviction_requests.count,count,,request,,Count of node eviction requests,0,karpenter,,,
karpenter.nodes_lifetime_duration_seconds.bucket,count,,,,Histogram buckets for node lifetime durations,0,karpenter,,,
karpenter.nodes_lifetime_duration_seconds.count,count,,,,Count of node lifetime durations,0,karpenter,,,
karpenter.nodes_lifetime_duration_seconds.sum,count,,second,,Sum of node lifetime durations,0,karpenter,,,
karpenter.operator.ec2nodeclass.status_condition.current_status.seconds,gauge,,second,,Time current status condition has been active for ec2nodeclass,0,karpenter,,,
karpenter.operator.ec2nodeclass.status_condition.transitions.count,count,,,,Count of status condition transitions for ec2nodeclass,0,karpenter,,,
karpenter.operator.ec2nodeclass.status_condition_count,gauge,,,,Number of conditions for ec2nodeclass,0,karpenter,,,
Expand All @@ -149,6 +160,22 @@ karpenter.operator.nodeclaim.termination.duration_seconds.bucket,count,,,,Histog
karpenter.operator.nodepool.status_condition.current_status.seconds,gauge,,second,,Time current status condition has been active for nodepool,0,karpenter,,,
karpenter.operator.nodepool.status_condition.transitions.count,count,,,,Count of status condition transitions for nodepool,0,karpenter,,,
karpenter.operator.nodepool.status_condition_count,gauge,,,,Number of conditions for nodepool,0,karpenter,,,
karpenter.operator_ec2nodeclass_status_condition_transition_seconds.bucket,count,,,,Histogram buckets for the time an ec2nodeclass condition spent in a given state before transitioning,0,karpenter,,,
karpenter.operator_ec2nodeclass_status_condition_transition_seconds.count,count,,,,Count of observations of the time an ec2nodeclass condition spent in a given state before transitioning,0,karpenter,,,
karpenter.operator_ec2nodeclass_status_condition_transition_seconds.sum,count,,second,,Sum of the time ec2nodeclass conditions spent in a given state before transitioning,0,karpenter,,,
karpenter.operator_ec2nodeclass_termination_current_time_seconds,gauge,,second,,The current amount of time in seconds that an ec2nodeclass has been in terminating state,0,karpenter,,,
karpenter.operator_ec2nodeclass_termination_duration_seconds.bucket,count,,,,Histogram buckets for the time taken by an ec2nodeclass to terminate completely,0,karpenter,,,
karpenter.operator_ec2nodeclass_termination_duration_seconds.count,count,,,,Count of observations of the time taken by an ec2nodeclass to terminate completely,0,karpenter,,,
karpenter.operator_ec2nodeclass_termination_duration_seconds.sum,count,,second,,Sum of the time taken by ec2nodeclasses to terminate completely,0,karpenter,,,
karpenter.operator_node_termination_current_time_seconds,gauge,,second,,The current amount of time in seconds that a node has been in terminating state,0,karpenter,,,
karpenter.operator_nodeclaim_termination_current_time_seconds,gauge,,second,,The current amount of time in seconds that a nodeclaim has been in terminating state,0,karpenter,,,
karpenter.operator_nodepool_status_condition_transition_seconds.bucket,count,,,,Histogram buckets for the time a nodepool condition spent in a given state before transitioning,0,karpenter,,,
karpenter.operator_nodepool_status_condition_transition_seconds.count,count,,,,Count of observations of the time a nodepool condition spent in a given state before transitioning,0,karpenter,,,
karpenter.operator_nodepool_status_condition_transition_seconds.sum,count,,second,,Sum of the time nodepool conditions spent in a given state before transitioning,0,karpenter,,,
karpenter.operator_nodepool_termination_current_time_seconds,gauge,,second,,The current amount of time in seconds that a nodepool has been in terminating state,0,karpenter,,,
karpenter.operator_nodepool_termination_duration_seconds.bucket,count,,,,Histogram buckets for the time taken by a nodepool to terminate completely,0,karpenter,,,
karpenter.operator_nodepool_termination_duration_seconds.count,count,,,,Count of observations of the time taken by a nodepool to terminate completely,0,karpenter,,,
karpenter.operator_nodepool_termination_duration_seconds.sum,count,,second,,Sum of the time taken by nodepools to terminate completely,0,karpenter,,,
karpenter.pods.startup.time_seconds.count,count,,,,The count of the observations in the pod startup summary,0,karpenter,,,
karpenter.pods.startup.time_seconds.quantile,gauge,,,,The time taken between pod creation and the pod being in a running state by `quantile`,0,karpenter,,,
karpenter.pods.startup.time_seconds.sum,count,,second,,The sum of the time from pod creation and the pod being in a running state,0,karpenter,,,
Expand Down Expand Up @@ -181,4 +208,4 @@ karpenter.workqueue.work.duration_seconds.count,count,,,,The count of observatio
karpenter.workqueue.work.duration_seconds.sum,count,,second,,The sum of the amount of seconds spent processing an item from workqueue takes,0,karpenter,,,
karpenter.workqueue_adds.count,count,,,,The count of adds handled by workqueue,0,karpenter,,,
karpenter.workqueue_depth,gauge,,,,Current depth of workqueue,0,karpenter,,,
karpenter.workqueue_retries.count,count,,attempt,,The count of retries handled by workqueue,0,karpenter,,,
karpenter.workqueue_retries.count,count,,attempt,,The count of retries handled by workqueue,0,karpenter,,,
27 changes: 27 additions & 0 deletions karpenter/tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ def get_fixture_path(filename):
'karpenter.build_info',
'karpenter.certwatcher.read.certificate.count',
'karpenter.certwatcher.read.certificate.errors.count',
'karpenter.client_go_request.count',
'karpenter.client_go_request_duration_seconds.bucket',
'karpenter.client_go_request_duration_seconds.count',
'karpenter.client_go_request_duration_seconds.sum',
'karpenter.cloudprovider.batcher.batch.time_seconds.bucket',
'karpenter.cloudprovider.batcher.batch.time_seconds.count',
'karpenter.cloudprovider.batcher.batch.time_seconds.sum',
Expand All @@ -46,6 +50,7 @@ def get_fixture_path(filename):
'karpenter.controller.runtime.reconcile.time_seconds.count',
'karpenter.controller.runtime.reconcile.time_seconds.sum',
'karpenter.controller.runtime.reconcile_errors.count',
'karpenter.controller_runtime_conversion_webhook_panics.count',
'karpenter.deprovisioning.actions_performed.count',
'karpenter.deprovisioning.eligible_machines',
'karpenter.deprovisioning.evaluation.duration_seconds.bucket',
Expand Down Expand Up @@ -108,8 +113,14 @@ def get_fixture_path(filename):
'karpenter.machines_registered.count',
'karpenter.machines_terminated.count',
'karpenter.nodes.allocatable',
'karpenter.nodes_current_lifetime_seconds',
'karpenter.nodes_drained.count',
'karpenter.nodes.eviction.queue_depth',
'karpenter.nodes_eviction_requests.count',
'karpenter.nodes.leases_deleted.count',
'karpenter.nodes_lifetime_duration_seconds.bucket',
'karpenter.nodes_lifetime_duration_seconds.count',
'karpenter.nodes_lifetime_duration_seconds.sum',
'karpenter.nodes.system_overhead',
'karpenter.nodes.terminated.count',
'karpenter.nodes.termination.time_seconds.count',
Expand Down Expand Up @@ -200,19 +211,35 @@ def get_fixture_path(filename):
'karpenter.operator.ec2nodeclass.status_condition.current_status.seconds',
'karpenter.operator.ec2nodeclass.status_condition.transitions.count',
'karpenter.operator.ec2nodeclass.status_condition_count',
'karpenter.operator_ec2nodeclass_status_condition_transition_seconds.bucket',
'karpenter.operator_ec2nodeclass_status_condition_transition_seconds.count',
'karpenter.operator_ec2nodeclass_status_condition_transition_seconds.sum',
'karpenter.operator_ec2nodeclass_termination_current_time_seconds',
'karpenter.operator_ec2nodeclass_termination_duration_seconds.bucket',
'karpenter.operator_ec2nodeclass_termination_duration_seconds.count',
'karpenter.operator_ec2nodeclass_termination_duration_seconds.sum',
'karpenter.operator.node.status_condition.current_status.seconds',
'karpenter.operator.node.status_condition.transitions.count',
'karpenter.operator.node.status_condition.transitions.seconds.bucket',
'karpenter.operator.node.status_condition_count',
'karpenter.operator.node.termination.duration_seconds.bucket',
'karpenter.operator_node_termination_current_time_seconds',
'karpenter.operator.nodeclaim.status_condition.current_status.seconds',
'karpenter.operator.nodeclaim.status_condition.transitions.count',
'karpenter.operator.nodeclaim.status_condition.transitions.seconds.bucket',
'karpenter.operator.nodeclaim.status_condition_count',
'karpenter.operator.nodeclaim.termination.duration_seconds.bucket',
'karpenter.operator_nodeclaim_termination_current_time_seconds',
'karpenter.operator.nodepool.status_condition.current_status.seconds',
'karpenter.operator.nodepool.status_condition.transitions.count',
'karpenter.operator.nodepool.status_condition_count',
'karpenter.operator_nodepool_status_condition_transition_seconds.bucket',
'karpenter.operator_nodepool_status_condition_transition_seconds.count',
'karpenter.operator_nodepool_status_condition_transition_seconds.sum',
'karpenter.operator_nodepool_termination_current_time_seconds',
'karpenter.operator_nodepool_termination_duration_seconds.bucket',
'karpenter.operator_nodepool_termination_duration_seconds.count',
'karpenter.operator_nodepool_termination_duration_seconds.sum',
]
RENAMED_LABELS = [
'go_version:go1.20.6',
Expand Down
30 changes: 29 additions & 1 deletion karpenter/tests/fixtures/karpenter_metrics.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2576,4 +2576,32 @@ karpenter_cluster_state_synced 1
karpenter_cluster_state_node_count 265
# HELP karpenter_disruption_consolidation_timeouts_total Number of times the Consolidation algorithm has reached a timeout. Labeled by consolidation type.
# TYPE karpenter_disruption_consolidation_timeouts_total counter
karpenter_disruption_consolidation_timeouts_total{consolidation_type="single"} 5
karpenter_disruption_consolidation_timeouts_total{consolidation_type="single"} 5
# HELP client_go_request_total Total number of HTTP requests to the Kubernetes API by HTTP method and response code.
# TYPE client_go_request_total counter
client_go_request_total{code="200",host="10.100.0.1:443",method="GET"} 100
client_go_request_total{code="200",host="10.100.0.1:443",method="PUT"} 50
# HELP client_go_request_duration_seconds Latency of HTTP requests to the Kubernetes API by HTTP method and response code.
# TYPE client_go_request_duration_seconds histogram
client_go_request_duration_seconds_bucket{host="10.100.0.1:443",verb="GET",le="0.005"} 10
client_go_request_duration_seconds_bucket{host="10.100.0.1:443",verb="GET",le="+Inf"} 100
client_go_request_duration_seconds_sum{host="10.100.0.1:443",verb="GET"} 5.5
client_go_request_duration_seconds_count{host="10.100.0.1:443",verb="GET"} 100
# HELP controller_runtime_conversion_webhook_panics_total Total number of conversion webhook panics
# TYPE controller_runtime_conversion_webhook_panics_total counter
controller_runtime_conversion_webhook_panics_total 0
# HELP karpenter_nodes_current_lifetime_seconds The current age of each node in seconds
# TYPE karpenter_nodes_current_lifetime_seconds gauge
karpenter_nodes_current_lifetime_seconds{nodepool="default"} 3600
# HELP karpenter_nodes_drained_total Total number of nodes drained
# TYPE karpenter_nodes_drained_total counter
karpenter_nodes_drained_total{nodepool="default"} 5
# HELP karpenter_nodes_eviction_requests_total Total number of eviction requests made
# TYPE karpenter_nodes_eviction_requests_total counter
karpenter_nodes_eviction_requests_total 10
# HELP karpenter_nodes_lifetime_duration_seconds The lifetime duration of nodes that have been deleted or replaced
# TYPE karpenter_nodes_lifetime_duration_seconds histogram
karpenter_nodes_lifetime_duration_seconds_bucket{nodepool="default",le="3600"} 5
karpenter_nodes_lifetime_duration_seconds_bucket{nodepool="default",le="+Inf"} 10
karpenter_nodes_lifetime_duration_seconds_sum{nodepool="default"} 36000
karpenter_nodes_lifetime_duration_seconds_count{nodepool="default"} 10
Loading
Loading