Skip to content

Commit 9a150b1

Browse files
authored
Merge pull request #430 from element-hq/bbz/configurable-probes
Make probe thresholds and frequencies configurable
2 parents 54873d7 + 32a7f34 commit 9a150b1

34 files changed

+5682
-183
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
{
2+
"type": "object",
3+
"properties": {
4+
"failureThreshold": {
5+
"type": [
6+
"integer",
7+
"null"
8+
],
9+
"minimum": 1
10+
},
11+
"initialDelaySeconds": {
12+
"type": [
13+
"integer",
14+
"null"
15+
],
16+
"minimum": 0
17+
},
18+
"periodSeconds": {
19+
"type": [
20+
"integer",
21+
"null"
22+
],
23+
"minimum": 1
24+
},
25+
"successThreshold": {
26+
"type": [
27+
"integer",
28+
"null"
29+
],
30+
"minimum": 1
31+
},
32+
"timeoutSeconds": {
33+
"type": [
34+
"integer",
35+
"null"
36+
],
37+
"minimum": 1
38+
}
39+
}
40+
}

charts/matrix-stack/source/common/sub_schema_values.yaml.j2

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,53 @@ labels: {}
276276
{{ (credential("PostgreSQL password", "password", commented=True) | indent(2)) }}
277277
{%- endmacro %}
278278

279+
{% macro probe(type, failureThreshold=none, initialDelaySeconds=none, periodSeconds=none, successThreshold=none, timeoutSeconds=none) %}
280+
## Configuration of the thresholds and frequencies of the {{ type }}Probe
281+
{%- if failureThreshold is none and initialDelaySeconds is none and periodSeconds is none and successThreshold is none and timeoutSeconds is none %}
282+
# {{ type }}Probe:
283+
{%- else %}
284+
{{ type }}Probe:
285+
{%- endif %}
286+
## How many consecutive failures for the probe to be considered failed
287+
{%- if failureThreshold is not none %}
288+
failureThreshold: {{ failureThreshold }}
289+
{%- else %}
290+
# failureThreshold: 3
291+
{%- endif %}
292+
293+
## Number of seconds after the container has started before the probe starts
294+
{%- if initialDelaySeconds is not none %}
295+
initialDelaySeconds: {{ initialDelaySeconds }}
296+
{%- else %}
297+
# initialDelaySeconds: 0
298+
{%- endif %}
299+
300+
## How often (in seconds) to perform the probe
301+
{%- if periodSeconds is not none %}
302+
periodSeconds: {{ periodSeconds }}
303+
{%- else %}
304+
# periodSeconds: 10
305+
{%- endif %}
306+
307+
## How many consecutive successes for the probe to be considered successful after having failed
308+
{%- if successThreshold is not none %}
309+
{%- if type in ["startup", "liveness"] %}
310+
successThreshold: 1
311+
{%- else %}
312+
successThreshold: {{ successThreshold }}
313+
{%- endif %}
314+
{%- else %}
315+
# successThreshold: 1
316+
{%- endif %}
317+
318+
## Number of seconds after which the probe times out
319+
{%- if timeoutSeconds is not none %}
320+
timeoutSeconds: {{ timeoutSeconds }}
321+
{%- else %}
322+
# timeoutSeconds: 1
323+
{%- endif %}
324+
{%- endmacro %}
325+
279326
{% macro resources(requests_memory, requests_cpu, limits_memory, key='resources') %}
280327
## Kubernetes resources to allocate to each instance.
281328
{{ key }}:

charts/matrix-stack/source/element-web.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,15 @@
5151
},
5252
"topologySpreadConstraints": {
5353
"$ref": "file://common/topologySpreadConstraints.json"
54+
},
55+
"livenessProbe": {
56+
"$ref": "file://common/probe.json"
57+
},
58+
"readinessProbe": {
59+
"$ref": "file://common/probe.json"
60+
},
61+
"startupProbe": {
62+
"$ref": "file://common/probe.json"
5463
}
5564
}
5665
}

charts/matrix-stack/source/element-web.yaml.j2

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,6 @@ replicas: 1
3030
{{- sub_schema_values.serviceAccount() -}}
3131
{{- sub_schema_values.tolerations() -}}
3232
{{- sub_schema_values.topologySpreadConstraints() }}
33+
{{- sub_schema_values.probe("liveness", failureThreshold=3, periodSeconds=10) }}
34+
{{- sub_schema_values.probe("readiness", failureThreshold=3, periodSeconds=3) }}
35+
{{- sub_schema_values.probe("startup", failureThreshold=3, initialDelaySeconds=2, periodSeconds=3) }}

charts/matrix-stack/source/haproxy.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,15 @@
4242
},
4343
"topologySpreadConstraints": {
4444
"$ref": "file://common/topologySpreadConstraints.json"
45+
},
46+
"livenessProbe": {
47+
"$ref": "file://common/probe.json"
48+
},
49+
"readinessProbe": {
50+
"$ref": "file://common/probe.json"
51+
},
52+
"startupProbe": {
53+
"$ref": "file://common/probe.json"
4554
}
4655
}
4756
}

charts/matrix-stack/source/haproxy.yaml.j2

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,8 @@ replicas: 1
1818
{{- sub_schema_values.serviceMonitors() }}
1919
{{- sub_schema_values.tolerations() }}
2020
{{- sub_schema_values.topologySpreadConstraints() }}
21+
{{- sub_schema_values.probe("liveness", initialDelaySeconds=10, timeoutSeconds=5) }}
22+
{{- sub_schema_values.probe("readiness", initialDelaySeconds=20, timeoutSeconds=5) }}
23+
# The failureThreshold here is tweaked towards Synapse being ready
24+
# If Synapse isn't being deployed, unsetting this or setting it to 3 may be more appropriate
25+
{{- sub_schema_values.probe("startup", failureThreshold=150, periodSeconds=2) }}

charts/matrix-stack/source/matrix-rtc.json

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,15 @@
8686
"topologySpreadConstraints": {
8787
"$ref": "file://common/topologySpreadConstraints.json"
8888
},
89+
"livenessProbe": {
90+
"$ref": "file://common/probe.json"
91+
},
92+
"readinessProbe": {
93+
"$ref": "file://common/probe.json"
94+
},
95+
"startupProbe": {
96+
"$ref": "file://common/probe.json"
97+
},
8998
"sfu": {
9099
"type": "object",
91100
"properties": {
@@ -173,6 +182,15 @@
173182
},
174183
"tolerations": {
175184
"$ref": "file://common/tolerations.json"
185+
},
186+
"livenessProbe": {
187+
"$ref": "file://common/probe.json"
188+
},
189+
"readinessProbe": {
190+
"$ref": "file://common/probe.json"
191+
},
192+
"startupProbe": {
193+
"$ref": "file://common/probe.json"
176194
}
177195
}
178196
}

charts/matrix-stack/source/matrix-rtc.yaml.j2

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ enabled: true
3030
{{- sub_schema_values.serviceMonitors() }}
3131
{{- sub_schema_values.serviceAccount() }}
3232
{{- sub_schema_values.tolerations() }}
33+
{{- sub_schema_values.probe("liveness") }}
34+
{{- sub_schema_values.probe("readiness") }}
35+
{{- sub_schema_values.probe("startup") }}
3336

3437
sfu:
3538
enabled: true
@@ -68,3 +71,6 @@ sfu:
6871
{{- sub_schema_values.serviceAccount() | indent(2) }}
6972
{{- sub_schema_values.serviceMonitors() | indent(2) }}
7073
{{- sub_schema_values.tolerations() | indent(2) }}
74+
{{- sub_schema_values.probe("liveness") | indent(2) }}
75+
{{- sub_schema_values.probe("readiness") | indent(2) }}
76+
{{- sub_schema_values.probe("startup") | indent(2) }}

charts/matrix-stack/source/matrixAuthenticationService.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,15 @@
8585
},
8686
"topologySpreadConstraints": {
8787
"$ref": "file://common/topologySpreadConstraints.json"
88+
},
89+
"livenessProbe": {
90+
"$ref": "file://common/probe.json"
91+
},
92+
"readinessProbe": {
93+
"$ref": "file://common/probe.json"
94+
},
95+
"startupProbe": {
96+
"$ref": "file://common/probe.json"
8897
}
8998
}
9099
}

charts/matrix-stack/source/matrixAuthenticationService.yaml.j2

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,6 @@ privateKeys:
4242
{{ sub_schema_values.workloadAnnotations() }}
4343
{{ sub_schema_values.serviceMonitors() }}
4444
{{ sub_schema_values.extraEnv() }}
45+
{{ sub_schema_values.probe("liveness", failureThreshold=3) }}
46+
{{ sub_schema_values.probe("readiness", failureThreshold=3) }}
47+
{{ sub_schema_values.probe("startup", initialDelaySeconds=5) }}

charts/matrix-stack/source/postgres.json

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,15 @@
2323
},
2424
"resources": {
2525
"$ref": "file://common/resources.json"
26+
},
27+
"livenessProbe": {
28+
"$ref": "file://common/probe.json"
29+
},
30+
"readinessProbe": {
31+
"$ref": "file://common/probe.json"
32+
},
33+
"startupProbe": {
34+
"$ref": "file://common/probe.json"
2635
}
2736
}
2837
},
@@ -73,6 +82,15 @@
7382
"topologySpreadConstraints": {
7483
"$ref": "file://common/topologySpreadConstraints.json"
7584
},
85+
"livenessProbe": {
86+
"$ref": "file://common/probe.json"
87+
},
88+
"readinessProbe": {
89+
"$ref": "file://common/probe.json"
90+
},
91+
"startupProbe": {
92+
"$ref": "file://common/probe.json"
93+
},
7694
"serviceMonitors": {
7795
"$ref": "file://common/serviceMonitors.json"
7896
}

charts/matrix-stack/source/postgres.yaml.j2

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ postgresExporter:
99
{{- sub_schema_values.image(registry='docker.io', repository='prometheuscommunity/postgres-exporter', tag='v0.17.0') | indent(2) }}
1010
{{- sub_schema_values.resources(requests_memory='10Mi', requests_cpu='10m', limits_memory='500Mi')| indent(2) }}
1111
{{- sub_schema_values.containersSecurityContext()| indent(2) }}
12+
{{- sub_schema_values.probe("liveness", periodSeconds=6, timeoutSeconds=2) | indent(2) }}
13+
{{- sub_schema_values.probe("readiness", periodSeconds=2, successThreshold=2, timeoutSeconds=2) | indent(2) }}
14+
{{- sub_schema_values.probe("startup", failureThreshold=20, periodSeconds=2) | indent(2) }}
1215

1316
{{- sub_schema_values.credential("Postgres Admin Password", "adminPassword", initIfAbsent=true) }}
1417

@@ -29,3 +32,6 @@ essPasswords:
2932
{{- sub_schema_values.serviceMonitors() }}
3033
{{- sub_schema_values.tolerations() }}
3134
{{- sub_schema_values.topologySpreadConstraints() }}
35+
{{- sub_schema_values.probe("liveness", initialDelaySeconds=45, timeoutSeconds=2) }}
36+
{{- sub_schema_values.probe("readiness", initialDelaySeconds=15, timeoutSeconds=2) }}
37+
{{- sub_schema_values.probe("startup") }}

charts/matrix-stack/source/synapse.json

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,15 @@
172172
"topologySpreadConstraints": {
173173
"$ref": "file://common/topologySpreadConstraints.json"
174174
},
175+
"livenessProbe": {
176+
"$ref": "file://common/probe.json"
177+
},
178+
"readinessProbe": {
179+
"$ref": "file://common/probe.json"
180+
},
181+
"startupProbe": {
182+
"$ref": "file://common/probe.json"
183+
},
175184
"workers": {
176185
"type": "object",
177186
"properties": {
@@ -269,6 +278,15 @@
269278
},
270279
"tolerations": {
271280
"$ref": "file://common/tolerations.json"
281+
},
282+
"livenessProbe": {
283+
"$ref": "file://common/probe.json"
284+
},
285+
"readinessProbe": {
286+
"$ref": "file://common/probe.json"
287+
},
288+
"startupProbe": {
289+
"$ref": "file://common/probe.json"
272290
}
273291
}
274292
}

charts/matrix-stack/source/synapse.yaml.j2

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,9 @@ logging:
9696
{{- sub_schema_values.serviceMonitors() }}
9797
{{- sub_schema_values.tolerations() }}
9898
{{- sub_schema_values.topologySpreadConstraints() }}
99+
{{- sub_schema_values.probe("liveness", failureThreshold=8, periodSeconds=6, timeoutSeconds=2) }}
100+
{{- sub_schema_values.probe("readiness", failureThreshold=8, periodSeconds=2, successThreshold=2, timeoutSeconds=2) }}
101+
{{- sub_schema_values.probe("startup", failureThreshold=54, periodSeconds=2) }}
99102

100103
## Extra command line arguments to provide to Synapse
101104
extraArgs: []
@@ -110,3 +113,6 @@ redis:
110113
{{- sub_schema_values.resources(requests_memory='50Mi', requests_cpu='50m', limits_memory='50Mi') | indent(2) }}
111114
{{- sub_schema_values.serviceAccount() | indent(2) }}
112115
{{- sub_schema_values.tolerations() | indent(2) }}
116+
{{- sub_schema_values.probe("liveness", initialDelaySeconds=15) | indent(2) }}
117+
{{- sub_schema_values.probe("readiness", initialDelaySeconds=5) | indent(2) }}
118+
{{- sub_schema_values.probe("startup") | indent(2) }}

charts/matrix-stack/source/synapse/scalable_worker.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,15 @@
1515
},
1616
"topologySpreadConstraints": {
1717
"$ref": "file://common/topologySpreadConstraints.json"
18+
},
19+
"livenessProbe": {
20+
"$ref": "file://common/probe.json"
21+
},
22+
"readinessProbe": {
23+
"$ref": "file://common/probe.json"
24+
},
25+
"startupProbe": {
26+
"$ref": "file://common/probe.json"
1827
}
1928
},
2029
"type": "object"

charts/matrix-stack/source/synapse/single_worker.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,15 @@
55
},
66
"resources": {
77
"$ref": "file://common/resources.json"
8+
},
9+
"livenessProbe": {
10+
"$ref": "file://common/probe.json"
11+
},
12+
"readinessProbe": {
13+
"$ref": "file://common/probe.json"
14+
},
15+
"startupProbe": {
16+
"$ref": "file://common/probe.json"
817
}
918
},
1019
"type": "object"

charts/matrix-stack/source/synapse/synapse_sub_schema_values.yaml.j2

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ Copyright 2024 New Vector Ltd
44
SPDX-License-Identifier: AGPL-3.0-only
55
#}
66

7+
{% import 'sub_schema_values.yaml.j2' as sub_schema_values -%}
8+
79
{% macro single_worker(workerType) %}
810
{{ workerType }}:
911
## Set to true to deploy this worker
@@ -12,6 +14,10 @@ SPDX-License-Identifier: AGPL-3.0-only
1214
## Resources for this worker.
1315
## If omitted the global Synapse resources are used
1416
# resources: {}
17+
18+
{{- sub_schema_values.probe("liveness", failureThreshold=8, periodSeconds=6, timeoutSeconds=2) | indent(2) }}
19+
{{- sub_schema_values.probe("readiness", failureThreshold=8, periodSeconds=2, successThreshold=2, timeoutSeconds=2) | indent(2) }}
20+
{{- sub_schema_values.probe("startup", failureThreshold=54, periodSeconds=2) | indent(2) }}
1521
{%- endmacro %}
1622

1723
{% macro scalable_worker(workerType) %}
@@ -25,4 +31,8 @@ SPDX-License-Identifier: AGPL-3.0-only
2531
## Resources for this worker.
2632
## If omitted the global Synapse resources are used
2733
# resources: {}
34+
35+
{{- sub_schema_values.probe("liveness", failureThreshold=3, periodSeconds=6, timeoutSeconds=2) | indent(2) }}
36+
{{- sub_schema_values.probe("readiness", failureThreshold=3, periodSeconds=2, successThreshold=2, timeoutSeconds=2) | indent(2) }}
37+
{{- sub_schema_values.probe("startup", failureThreshold=21, periodSeconds=2) | indent(2) }}
2838
{%- endmacro %}

0 commit comments

Comments
 (0)