Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
* [FEATURE] Memberlist: Add `-memberlist.cluster-label` and `-memberlist.cluster-label-verification-disabled` to prevent accidental cross-cluster gossip joins and support rolling label rollout. #7385
* [FEATURE] Querier: Add timeout classification to classify query timeouts as 4XX (user error) or 5XX (system error) based on phase timing. When enabled, queries that spend most of their time in PromQL evaluation return `422 Unprocessable Entity` instead of `503 Service Unavailable`. #7374
* [FEATURE] Querier: Implement Resource Based Throttling in Querier. #7442
* [FEATURE] Querier: Add resource-based query eviction that automatically cancels the heaviest running query when CPU or heap utilization exceeds configured thresholds. #7488
* [ENHANCEMENT] Parquet Converter: Add a ring status page to expose the ring status. #7455
* [ENHANCEMENT] Ingester: Add WAL record metrics to help evaluate the effectiveness of WAL compression type (e.g. snappy, zstd): `cortex_ingester_tsdb_wal_record_part_writes_total`, `cortex_ingester_tsdb_wal_record_parts_bytes_written_total`, and `cortex_ingester_tsdb_wal_record_bytes_saved_total`. #7420
* [ENHANCEMENT] Distributor: Introduce dynamic `Symbols` slice capacity pooling. #7398 #7401
Expand Down
37 changes: 37 additions & 0 deletions docs/blocks-storage/querier.md
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,43 @@ querier:
# type. 0 to disable.
# CLI flag: -querier.query-protection.rejection.threshold.heap-utilization
[heap_utilization: <float> | default = 0]

eviction:
threshold:
# EXPERIMENTAL: Max CPU utilization that this instance can reach before
# evicting the heaviest running query (across all tenants) in
# percentage, between 0 and 1. monitored_resources config must include
# the resource type. 0 to disable.
# CLI flag: -querier.query-protection.eviction.threshold.cpu-utilization
[cpu_utilization: <float> | default = 0]

# EXPERIMENTAL: Max heap utilization that this instance can reach before
# evicting the heaviest running query (across all tenants) in
# percentage, between 0 and 1. monitored_resources config must include
# the resource type. 0 to disable.
# CLI flag: -querier.query-protection.eviction.threshold.heap-utilization
[heap_utilization: <float> | default = 0]

# EXPERIMENTAL: How frequently the evictor checks system resource
# utilization.
# CLI flag: -querier.query-protection.eviction.check-interval
[check_interval: <duration> | default = 1s]

# EXPERIMENTAL: Number of check intervals to wait after an eviction before
# evicting again.
# CLI flag: -querier.query-protection.eviction.cooldown-period
[cooldown_period: <int> | default = 3]

# EXPERIMENTAL: The query metric used to determine the heaviest query for
# eviction. Supported values: fetched_samples, fetched_series,
# fetched_chunks, fetched_chunk_bytes.
# CLI flag: -querier.query-protection.eviction.eviction-metric
[eviction_metric: <string> | default = "fetched_samples"]

# EXPERIMENTAL: Minimum time a query must be running before it becomes
# eligible for eviction. Queries younger than this are ignored.
# CLI flag: -querier.query-protection.eviction.min-query-age
[min_query_age: <duration> | default = 10s]
```

### `blocks_storage_config`
Expand Down
37 changes: 37 additions & 0 deletions docs/blocks-storage/store-gateway.md
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,43 @@ store_gateway:
# CLI flag: -store-gateway.query-protection.rejection.threshold.heap-utilization
[heap_utilization: <float> | default = 0]

eviction:
threshold:
# EXPERIMENTAL: Max CPU utilization that this instance can reach before
# evicting the heaviest running query (across all tenants) in
# percentage, between 0 and 1. monitored_resources config must include
# the resource type. 0 to disable.
# CLI flag: -store-gateway.query-protection.eviction.threshold.cpu-utilization
[cpu_utilization: <float> | default = 0]

# EXPERIMENTAL: Max heap utilization that this instance can reach before
# evicting the heaviest running query (across all tenants) in
# percentage, between 0 and 1. monitored_resources config must include
# the resource type. 0 to disable.
# CLI flag: -store-gateway.query-protection.eviction.threshold.heap-utilization
[heap_utilization: <float> | default = 0]

# EXPERIMENTAL: How frequently the evictor checks system resource
# utilization.
# CLI flag: -store-gateway.query-protection.eviction.check-interval
[check_interval: <duration> | default = 1s]

# EXPERIMENTAL: Number of check intervals to wait after an eviction before
# evicting again.
# CLI flag: -store-gateway.query-protection.eviction.cooldown-period
[cooldown_period: <int> | default = 3]

# EXPERIMENTAL: The query metric used to determine the heaviest query for
# eviction. Supported values: fetched_samples, fetched_series,
# fetched_chunks, fetched_chunk_bytes.
# CLI flag: -store-gateway.query-protection.eviction.eviction-metric
[eviction_metric: <string> | default = "fetched_samples"]

# EXPERIMENTAL: Minimum time a query must be running before it becomes
# eligible for eviction. Queries younger than this are ignored.
# CLI flag: -store-gateway.query-protection.eviction.min-query-age
[min_query_age: <duration> | default = 10s]

hedged_request:
# If true, hedged requests are applied to object store calls. It can help
# with reducing tail latency.
Expand Down
111 changes: 111 additions & 0 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -3872,6 +3872,43 @@ query_protection:
# disable.
# CLI flag: -ingester.query-protection.rejection.threshold.heap-utilization
[heap_utilization: <float> | default = 0]

eviction:
threshold:
# EXPERIMENTAL: Max CPU utilization that this instance can reach before
# evicting the heaviest running query (across all tenants) in percentage,
# between 0 and 1. monitored_resources config must include the resource
# type. 0 to disable.
# CLI flag: -ingester.query-protection.eviction.threshold.cpu-utilization
[cpu_utilization: <float> | default = 0]

# EXPERIMENTAL: Max heap utilization that this instance can reach before
# evicting the heaviest running query (across all tenants) in percentage,
# between 0 and 1. monitored_resources config must include the resource
# type. 0 to disable.
# CLI flag: -ingester.query-protection.eviction.threshold.heap-utilization
[heap_utilization: <float> | default = 0]

# EXPERIMENTAL: How frequently the evictor checks system resource
# utilization.
# CLI flag: -ingester.query-protection.eviction.check-interval
[check_interval: <duration> | default = 1s]

# EXPERIMENTAL: Number of check intervals to wait after an eviction before
# evicting again.
# CLI flag: -ingester.query-protection.eviction.cooldown-period
[cooldown_period: <int> | default = 3]

# EXPERIMENTAL: The query metric used to determine the heaviest query for
# eviction. Supported values: fetched_samples, fetched_series,
# fetched_chunks, fetched_chunk_bytes.
# CLI flag: -ingester.query-protection.eviction.eviction-metric
[eviction_metric: <string> | default = "fetched_samples"]

# EXPERIMENTAL: Minimum time a query must be running before it becomes
# eligible for eviction. Queries younger than this are ignored.
# CLI flag: -ingester.query-protection.eviction.min-query-age
[min_query_age: <duration> | default = 10s]
```

### `ingester_client_config`
Expand Down Expand Up @@ -5016,6 +5053,43 @@ query_protection:
# disable.
# CLI flag: -querier.query-protection.rejection.threshold.heap-utilization
[heap_utilization: <float> | default = 0]

eviction:
threshold:
# EXPERIMENTAL: Max CPU utilization that this instance can reach before
# evicting the heaviest running query (across all tenants) in percentage,
# between 0 and 1. monitored_resources config must include the resource
# type. 0 to disable.
# CLI flag: -querier.query-protection.eviction.threshold.cpu-utilization
[cpu_utilization: <float> | default = 0]

# EXPERIMENTAL: Max heap utilization that this instance can reach before
# evicting the heaviest running query (across all tenants) in percentage,
# between 0 and 1. monitored_resources config must include the resource
# type. 0 to disable.
# CLI flag: -querier.query-protection.eviction.threshold.heap-utilization
[heap_utilization: <float> | default = 0]

# EXPERIMENTAL: How frequently the evictor checks system resource
# utilization.
# CLI flag: -querier.query-protection.eviction.check-interval
[check_interval: <duration> | default = 1s]

# EXPERIMENTAL: Number of check intervals to wait after an eviction before
# evicting again.
# CLI flag: -querier.query-protection.eviction.cooldown-period
[cooldown_period: <int> | default = 3]

# EXPERIMENTAL: The query metric used to determine the heaviest query for
# eviction. Supported values: fetched_samples, fetched_series,
# fetched_chunks, fetched_chunk_bytes.
# CLI flag: -querier.query-protection.eviction.eviction-metric
[eviction_metric: <string> | default = "fetched_samples"]

# EXPERIMENTAL: Minimum time a query must be running before it becomes
# eligible for eviction. Queries younger than this are ignored.
# CLI flag: -querier.query-protection.eviction.min-query-age
[min_query_age: <duration> | default = 10s]
```

### `query_frontend_config`
Expand Down Expand Up @@ -6785,6 +6859,43 @@ query_protection:
# CLI flag: -store-gateway.query-protection.rejection.threshold.heap-utilization
[heap_utilization: <float> | default = 0]

eviction:
threshold:
# EXPERIMENTAL: Max CPU utilization that this instance can reach before
# evicting the heaviest running query (across all tenants) in percentage,
# between 0 and 1. monitored_resources config must include the resource
# type. 0 to disable.
# CLI flag: -store-gateway.query-protection.eviction.threshold.cpu-utilization
[cpu_utilization: <float> | default = 0]

# EXPERIMENTAL: Max heap utilization that this instance can reach before
# evicting the heaviest running query (across all tenants) in percentage,
# between 0 and 1. monitored_resources config must include the resource
# type. 0 to disable.
# CLI flag: -store-gateway.query-protection.eviction.threshold.heap-utilization
[heap_utilization: <float> | default = 0]

# EXPERIMENTAL: How frequently the evictor checks system resource
# utilization.
# CLI flag: -store-gateway.query-protection.eviction.check-interval
[check_interval: <duration> | default = 1s]

# EXPERIMENTAL: Number of check intervals to wait after an eviction before
# evicting again.
# CLI flag: -store-gateway.query-protection.eviction.cooldown-period
[cooldown_period: <int> | default = 3]

# EXPERIMENTAL: The query metric used to determine the heaviest query for
# eviction. Supported values: fetched_samples, fetched_series,
# fetched_chunks, fetched_chunk_bytes.
# CLI flag: -store-gateway.query-protection.eviction.eviction-metric
[eviction_metric: <string> | default = "fetched_samples"]

# EXPERIMENTAL: Minimum time a query must be running before it becomes
# eligible for eviction. Queries younger than this are ignored.
# CLI flag: -store-gateway.query-protection.eviction.min-query-age
[min_query_age: <duration> | default = 10s]

hedged_request:
# If true, hedged requests are applied to object store calls. It can help with
# reducing tail latency.
Expand Down
7 changes: 7 additions & 0 deletions docs/configuration/v1-guarantees.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,10 @@ Currently experimental features are:
- `-validation.max-label-cardinality-for-unoptimized-regex` (int) - maximum label cardinality
- `-validation.max-total-label-value-length-for-unoptimized-regex` (int) - maximum total length of all label values in bytes
- HATracker: `-distributor.ha-tracker.enable-startup-sync` (bool) - If enabled, fetches all tracked keys on startup to populate the local cache.
- Querier: Resource-based query eviction
- `-querier.query-protection.eviction.threshold.cpu-utilization` (float)
- `-querier.query-protection.eviction.threshold.heap-utilization` (float)
- `-querier.query-protection.eviction.check-interval` (duration)
- `-querier.query-protection.eviction.cooldown-period` (int)
- `-querier.query-protection.eviction.eviction-metric` (string)
- `-querier.query-protection.eviction.min-query-age` (duration)
84 changes: 76 additions & 8 deletions pkg/configs/query_protection.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,47 +3,115 @@ package configs
import (
"errors"
"flag"
"fmt"
"strings"
"time"

"github.com/cortexproject/cortex/pkg/util/flagext"
"github.com/cortexproject/cortex/pkg/util/resource"
)

// recognizedEvictionMetrics is the set of values accepted for the
// eviction_metric option; Validate rejects any name that is not a key here.
var recognizedEvictionMetrics = map[string]bool{
	"fetched_chunk_bytes": true,
	"fetched_chunks":      true,
	"fetched_samples":     true,
	"fetched_series":      true,
}

// QueryProtection groups the resource-based query protection settings:
// rejection of new query requests and eviction of running queries.
type QueryProtection struct {
	// The yaml tag is spelled out for consistency with Eviction; it matches
	// the key derived from the lowercased field name, so existing configs
	// keep working. The json tag is retained unchanged.
	Rejection rejection      `yaml:"rejection" json:"rejection"`
	Eviction  EvictionConfig `yaml:"eviction"`
}

// rejection holds the utilization thresholds used for query rejection.
type rejection struct {
	Threshold Threshold `yaml:"threshold"`
}

// Threshold holds CPU and heap utilization thresholds (0-1 range).
// A value of 0 disables the corresponding check.
type Threshold struct {
	CPUUtilization  float64 `yaml:"cpu_utilization"`
	HeapUtilization float64 `yaml:"heap_utilization"`
}

// EvictionConfig configures the resource-based query evictor.
type EvictionConfig struct {
	// Threshold is the utilization above which the heaviest running query
	// is evicted.
	Threshold Threshold `yaml:"threshold"`
	// CheckInterval is how frequently the evictor checks system resource
	// utilization.
	CheckInterval time.Duration `yaml:"check_interval"`
	// CooldownPeriod is the number of check intervals to wait after an
	// eviction before evicting again.
	CooldownPeriod int `yaml:"cooldown_period"`
	// EvictionMetric names the query metric used to pick the heaviest query.
	EvictionMetric string `yaml:"eviction_metric"`
	// MinQueryAge is the minimum time a query must be running before it
	// becomes eligible for eviction.
	MinQueryAge time.Duration `yaml:"min_query_age"`
}

// Enabled returns true when at least one eviction threshold is greater than 0.
func (c EvictionConfig) Enabled() bool {
	return c.Threshold.CPUUtilization > 0 || c.Threshold.HeapUtilization > 0
}

// RegisterFlagsWithPrefix registers the query-protection rejection and
// eviction flags on f. Every flag name is the given prefix followed by
// "query-protection.<...>", and the registered defaults are written into cfg.
func (cfg *QueryProtection) RegisterFlagsWithPrefix(f *flag.FlagSet, prefix string) {
	rejectAt := &cfg.Rejection.Threshold
	evict := &cfg.Eviction

	// Thresholds that cause new queries to be rejected.
	f.Float64Var(&rejectAt.CPUUtilization, prefix+"query-protection.rejection.threshold.cpu-utilization", 0, "EXPERIMENTAL: Max CPU utilization that this instance can reach before rejecting new query request (across all tenants) in percentage, between 0 and 1. monitored_resources config must include the resource type. 0 to disable.")
	f.Float64Var(&rejectAt.HeapUtilization, prefix+"query-protection.rejection.threshold.heap-utilization", 0, "EXPERIMENTAL: Max heap utilization that this instance can reach before rejecting new query request (across all tenants) in percentage, between 0 and 1. monitored_resources config must include the resource type. 0 to disable.")

	// Thresholds and tuning knobs for evicting running queries.
	f.Float64Var(&evict.Threshold.CPUUtilization, prefix+"query-protection.eviction.threshold.cpu-utilization", 0, "EXPERIMENTAL: Max CPU utilization that this instance can reach before evicting the heaviest running query (across all tenants) in percentage, between 0 and 1. monitored_resources config must include the resource type. 0 to disable.")
	f.Float64Var(&evict.Threshold.HeapUtilization, prefix+"query-protection.eviction.threshold.heap-utilization", 0, "EXPERIMENTAL: Max heap utilization that this instance can reach before evicting the heaviest running query (across all tenants) in percentage, between 0 and 1. monitored_resources config must include the resource type. 0 to disable.")
	f.DurationVar(&evict.CheckInterval, prefix+"query-protection.eviction.check-interval", 1*time.Second, "EXPERIMENTAL: How frequently the evictor checks system resource utilization.")
	f.IntVar(&evict.CooldownPeriod, prefix+"query-protection.eviction.cooldown-period", 3, "EXPERIMENTAL: Number of check intervals to wait after an eviction before evicting again.")
	f.StringVar(&evict.EvictionMetric, prefix+"query-protection.eviction.eviction-metric", "fetched_samples", "EXPERIMENTAL: The query metric used to determine the heaviest query for eviction. Supported values: fetched_samples, fetched_series, fetched_chunks, fetched_chunk_bytes.")
	f.DurationVar(&evict.MinQueryAge, prefix+"query-protection.eviction.min-query-age", 10*time.Second, "EXPERIMENTAL: Minimum time a query must be running before it becomes eligible for eviction. Queries younger than this are ignored.")
}

func (cfg *QueryProtection) Validate(monitoredResources flagext.StringSliceCSV) error {
thresholdCfg := cfg.Rejection.Threshold
if thresholdCfg.CPUUtilization > 1 || thresholdCfg.CPUUtilization < 0 {
// Validate rejection thresholds
rejThreshold := cfg.Rejection.Threshold
if rejThreshold.CPUUtilization > 1 || rejThreshold.CPUUtilization < 0 {
return errors.New("cpu_utilization must be between 0 and 1")
}

if thresholdCfg.CPUUtilization > 0 && !strings.Contains(monitoredResources.String(), string(resource.CPU)) {
if rejThreshold.CPUUtilization > 0 && !strings.Contains(monitoredResources.String(), string(resource.CPU)) {
return errors.New("monitored_resources config must include \"cpu\" as well")
}

if thresholdCfg.HeapUtilization > 1 || thresholdCfg.HeapUtilization < 0 {
if rejThreshold.HeapUtilization > 1 || rejThreshold.HeapUtilization < 0 {
return errors.New("heap_utilization must be between 0 and 1")
}

if thresholdCfg.HeapUtilization > 0 && !strings.Contains(monitoredResources.String(), string(resource.Heap)) {
if rejThreshold.HeapUtilization > 0 && !strings.Contains(monitoredResources.String(), string(resource.Heap)) {
return errors.New("monitored_resources config must include \"heap\" as well")
}

// Validate eviction thresholds
evThreshold := cfg.Eviction.Threshold
if evThreshold.CPUUtilization > 1 || evThreshold.CPUUtilization < 0 {
return errors.New("eviction cpu_utilization must be between 0 and 1")
}

if evThreshold.HeapUtilization > 1 || evThreshold.HeapUtilization < 0 {
return errors.New("eviction heap_utilization must be between 0 and 1")
}

if cfg.Eviction.Enabled() {
if cfg.Eviction.CheckInterval <= 0 {
return errors.New("eviction check_interval must be greater than 0 when eviction is enabled")
}

if cfg.Eviction.CooldownPeriod < 0 {
return errors.New("eviction cooldown_period must be >= 0")
}

if !recognizedEvictionMetrics[cfg.Eviction.EvictionMetric] {
return fmt.Errorf("unrecognized eviction_metric %q; supported values: fetched_samples, fetched_series, fetched_chunks, fetched_chunk_bytes", cfg.Eviction.EvictionMetric)
}

if evThreshold.CPUUtilization > 0 && !strings.Contains(monitoredResources.String(), string(resource.CPU)) {
return errors.New("monitored_resources config must include \"cpu\" when eviction cpu threshold is set")
}

if evThreshold.HeapUtilization > 0 && !strings.Contains(monitoredResources.String(), string(resource.Heap)) {
return errors.New("monitored_resources config must include \"heap\" when eviction heap threshold is set")
}
}

return nil
}
Loading
Loading