
Commit 9e90ff5

workbench: make it new Nomad cluster compatible
1 parent 3726745 commit 9e90ff5

5 files changed: 128 additions, 57 deletions

nix/workbench/backend/nomad-job.nix

Lines changed: 36 additions & 17 deletions
@@ -91,9 +91,10 @@ let
       # The SUPERVISORD_CONFIG variable must be set
       [ -z "''${SUPERVISORD_CONFIG:-}" ] && echo "SUPERVISORD_CONFIG env var must be set -- aborting" && exit 1
-      # Create symlink to 'supervisor' Nix folder so we can call it from 'ssh'
-      # or 'nomad exec' without having to know the currently version running.
-      # First check if already exists to be able to restart containers.
+      # Create a symlink to 'supervisor' Nix Store folder so we can call it from
+      # 'ssh' or 'nomad exec' without having it in PATH or knowing the currently
+      # running version. But first check if it already exists to be able to
+      # restart containers without errors.
       if ! test -e "''${SUPERVISOR_NIX}"
       then
         ${coreutils}/bin/ln -s "${supervisor}" "''${SUPERVISOR_NIX}"
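The guard-then-link pattern above is idempotent, which is what lets restarted containers reuse the link instead of failing. A minimal standalone sketch in bash; both paths are hypothetical stand-ins for the values Nix interpolates into the real entrypoint:

# Hedged sketch of the idempotent symlink used by the entrypoint.
# Both paths below are assumptions, not values from this commit.
SUPERVISOR_NIX="/local/run/current/supervisor/nix-store"
supervisor_pkg="/nix/store/xxxxxxxx-python3.11-supervisor-4.2.4"
if ! test -e "${SUPERVISOR_NIX}"   # skip if a previous run already created it
then
  ln -s "${supervisor_pkg}" "${SUPERVISOR_NIX}"
fi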
@@ -163,7 +164,7 @@ let
     # namespace can be specified either with the flag -namespace or read from
     # the NOMAD_NAMESPACE environment variable."
     # https://developer.hashicorp.com/nomad/tutorials/manage-clusters/namespaces
-    namespace = "perf";
+    namespace = "perf"; # Default to "perf" to avoid errors where possible.

     # The region in which to execute the job.
     region = "global"; # SRE: They are actually using global.
@@ -270,10 +271,12 @@ let
     constraint = {
       attribute = "\${node.class}";
       operator = "=";
-      # For cloud benchmarking dedicated static machines in the "perf"
-      # class are used. We replicate that for local/test runs.
-      # Class "qa" nodes are also available but must be limited to short
-      # test and avoid using "infra" node class as HA jobs runs there.
+      # For cloud benchmarking, dedicated static machines in the "perf"
+      # class are used. We mimic that for local/test runs.
+      # This default is just a precaution, like the top level namespace:
+      # "qa" class nodes are also available but their usage must be limited
+      # to short tests, and "infra" class nodes run HA jobs and must be
+      # avoided entirely.
       value = "perf";
     };

@@ -389,8 +392,22 @@ let

     # Specifies environment variables that will be passed to the running
     # process.
-    # `null` because we are using a "template" (see below).
-    env = {};
+    env = {
+      # The "old Nomad" setup somehow included the CA certs inside the
+      # task namespace but apparently "new Nomad" does not. We are adding the
+      # necessary Nix package ("cacert") and making sure `wget` finds it.
+      #
+      # "All nix-docker images set environment variables which point to
+      # cacert.", see:
+      # - https://github.com/NixOS/nixpkgs/issues/48211#issuecomment-434102565
+      # - https://github.com/LnL7/nix-docker/blob/8dcfb3aff1f87cdafeecb0d27964b27c3fb8b1d2/default.nix#L70-L71
+      #
+      # The error when using `wget` to deploy the genesis tar file was:
+      #   ERROR: cannot verify iog-cardano-perf.s3.eu-central-1.amazonaws.com's certificate, issued by 'CN=Amazon RSA 2048 M01,O=Amazon,C=US':
+      #     Unable to locally verify the issuer's authority.
+      #   To connect to iog-cardano-perf.s3.eu-central-1.amazonaws.com insecurely, use `--no-check-certificate'.
+      SSL_CERT_FILE = "${containerSpecs.containerPkgs.cacert.nix-store-path}/etc/ssl/certs/ca-bundle.crt";
+    };

     # Sensible defaults to run cloud version of "default", "ci-test" and
     # "ci-bench" in cardano-world qa class Nomad nodes.
@@ -434,7 +451,7 @@ let
       # When using Cardano World (nomad.world.dev.cardano.org) "perf"
       # class nodes we use public IPs/routing; all the other cloud runs
       # are behind a VPC/firewall. Local runs just use 127.0.0.1.
-      if lib.strings.hasPrefix "cw-perf" profileData.profileName
+      if lib.strings.hasInfix "cw-perf" profileData.profileName
       then "\${attr.unique.platform.aws.public-ipv4}"
       else ""
       ;
@@ -1161,17 +1178,19 @@ let
       # Port string from
       ''--port ${toString nodeSpec.port}''
     ]
-    # On cloud deployments with cardano world, that uses AWS, the hosts at the
-    # Linux level aren't aware of the EIP public address they have, so the
-    # private IP is the only network interface address to bind to.
-    # An alternative is to bind to 0.0.0.0 but I prefer being more specific.
+    # On cloud deployments to SRE-managed Nomad, which uses AWS, hosts at the
+    # Linux level may not be aware of the EIP public address they have, so we
+    # can't bind to the public IP (which we can resolve to using templates).
+    # I prefer being more specific, but the "all-weather" alternative is to
+    # bind to 0.0.0.0 instead of the private IP, just in case the Nomad client
+    # was not started with the correct `-network-interface XX` parameter.
     [
       # Address string to
-      ''--host-addr {{ env "NOMAD_HOST_IP_${servicePortName}" }}''
+      ''--host-addr 0.0.0.0''
      # Alternatives (may not work):
+      #''--host-addr {{ env "NOMAD_HOST_IP_${servicePortName}" }}''
       #''--host-addr {{ env "NOMAD_IP_${servicePortName}" }}''
       #''--host-addr {{range nomadService "${servicePortName}"}}{{.Address}}{{end}}''
-      #''--host-addr 0.0.0.0''

       # Port string to
       ''--port {{ env "NOMAD_HOST_PORT_${servicePortName}" }}''
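The trade-off can be made concrete: Nomad publishes an IP/port env var pair per allocated port, and only the IP half is unreliable when the client detected the wrong interface. A small bash illustration; the port label "node0" and both values are hypothetical:

# Nomad-style env vars for a hypothetical port labelled "node0":
NOMAD_HOST_IP_node0="10.0.1.23"    # private IP; the AWS EIP is invisible here
NOMAD_HOST_PORT_node0="30000"
# Binding to the private IP requires the Nomad client to have picked the
# right interface; binding to 0.0.0.0 listens on all of them:
echo "--host-addr 0.0.0.0 --port ${NOMAD_HOST_PORT_node0}"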

nix/workbench/backend/nomad.sh

Lines changed: 17 additions & 4 deletions
@@ -223,11 +223,24 @@ backend_nomad() {

     allocate-run-nomad-job-patch-namespace )
       local usage="USAGE: wb backend $op RUN-DIR NAMESPACE"
-      local dir=${1:?$usage}; shift
-      local namespace=${1:?$usage}; shift
+      local dir=${1:?$usage}; shift
       local nomad_job_name=$(jq -r ". [\"job\"] | keys[0]" "${dir}"/nomad/nomad-job.json)
-      msg "Setting Nomad job namespace to \"${namespace}\""
-      jq ".job[\"${nomad_job_name}\"][\"namespace\"] = \"${namespace}\"" "${dir}"/nomad/nomad-job.json | sponge "${dir}"/nomad/nomad-job.json
+      if test $# -gt 0 && test -n "${1}"
+      then
+        local namespace
+        namespace="${1:?$usage}"; shift
+        msg "Setting Nomad job top level namespace to \"${namespace}\""
+        jq \
+          ".job[\"${nomad_job_name}\"][\"namespace\"] = \"${namespace}\"" \
+          "${dir}"/nomad/nomad-job.json \
+        | sponge "${dir}"/nomad/nomad-job.json
+      else
+        msg "Setting Nomad job top level namespace to null"
+        jq \
+          ".job[\"${nomad_job_name}\"][\"namespace\"] = null" \
+          "${dir}"/nomad/nomad-job.json \
+        | sponge "${dir}"/nomad/nomad-job.json
+      fi
     ;;

     allocate-run-nomad-job-patch-nix )
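Both branches reduce to a one-field jq update on the rendered job file. A quick sketch against a throwaway file; the job name "wb" is hypothetical, and `sponge` from moreutils is what writes the result back in place in the real code:

# Throwaway job file to exercise both branches of the patch above.
echo '{"job": {"wb": {"namespace": "default"}}}' > /tmp/nomad-job.json
nomad_job_name=$(jq -r '.job | keys[0]' /tmp/nomad-job.json)
# Namespace argument given: set it.
jq ".job[\"${nomad_job_name}\"][\"namespace\"] = \"perf\"" /tmp/nomad-job.json
# No argument: null it out so the cluster's default namespace applies.
jq ".job[\"${nomad_job_name}\"][\"namespace\"] = null" /tmp/nomad-job.json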

nix/workbench/backend/nomad/cloud.sh

Lines changed: 69 additions & 33 deletions
@@ -167,36 +167,51 @@ setenv-defaults-nomadcloud() {
     local profile_container_specs_file
     profile_container_specs_file="${backend_dir}"/container-specs.json

-    # If the most important `nomad` cli envars is present this is not a local
-    # test, I repeat, this is not a drill =)
-    if test -z "${NOMAD_ADDR:-}"
+    if test -z "${NOMAD_ADDR+set}"
     then
+      # The variable is not set, not even set but empty; just not set!
       msg $(yellow "WARNING: Nomad address \"NOMAD_ADDR\" envar is not set")
+      # TODO: New Nomad cluster: export NOMAD_ADDR=http://10.200.0.1:4646
       export NOMAD_ADDR="https://nomad.world.dev.cardano.org"
       msg $(blue "INFO: Setting \"NOMAD_ADDR\" to the SRE provided address for \"Performance and Tracing\" (\"${NOMAD_ADDR}\")")
     else
+      # The variable is set and maybe empty!
+      msg $(blue "INFO: Nomad address \"NOMAD_ADDR\" envar is \"${NOMAD_ADDR}\"")
       if test "${NOMAD_ADDR}" != "https://nomad.world.dev.cardano.org"
       then
         msg $(yellow "WARNING: Nomad address \"NOMAD_ADDR\" envar is not \"https://nomad.world.dev.cardano.org\"")
       fi
     fi
     # The absence of `NOMAD_NAMESPACE` or `NOMAD_TOKEN` needs confirmation
-    if test -z "${NOMAD_NAMESPACE:-}"
+    if test -z "${NOMAD_NAMESPACE+set}"
     then
+      # The variable is not set, not even set but empty; just not set!
       msg $(yellow "WARNING: Nomad namespace \"NOMAD_NAMESPACE\" envar is not set")
+      # TODO: New Nomad cluster: export NOMAD_NAMESPACE=""
       export NOMAD_NAMESPACE="perf"
       msg $(blue "INFO: Setting \"NOMAD_NAMESPACE\" to the SRE provided namespace for \"Performance and Tracing\" (\"${NOMAD_NAMESPACE}\")")
     else
+      # The variable is set and maybe empty!
+      msg $(blue "INFO: Nomad namespace \"NOMAD_NAMESPACE\" envar is \"${NOMAD_NAMESPACE}\"")
       if test "${NOMAD_NAMESPACE}" != "perf"
       then
         msg $(yellow "WARNING: Nomad namespace \"NOMAD_NAMESPACE\" envar is not \"perf\"")
       fi
     fi
-    if test -z "${NOMAD_TOKEN:-}"
+    if test -z "${NOMAD_TOKEN+set}"
     then
+      # The variable is not set, not even set but empty; just not set!
       msg $(yellow "WARNING: Nomad token \"NOMAD_TOKEN\" envar is not set")
-      msg $(blue "INFO: Fetching a \"NOMAD_TOKEN\" from SRE provided Vault for \"Performance and Tracing\"")
-      export NOMAD_TOKEN="$(wb_nomad vault world nomad-token)"
+      msg $(yellow "If you need to fetch a NOMAD_TOKEN for world.dev.cardano.org, provide an empty string")
+    else
+      # The variable is set and maybe empty!
+      if test -z "${NOMAD_TOKEN}"
+      then
+        msg $(blue "INFO: Fetching a \"NOMAD_TOKEN\" from SRE provided Vault for \"Performance and Tracing\"")
+        export NOMAD_TOKEN="$(wb_nomad vault world nomad-token)"
+      else
+        msg $(blue "INFO: Using provided Nomad token \"NOMAD_TOKEN\" envar")
+      fi
     fi
     # Check all the AWS S3 envars needed for the HTTP PUT request
     # Using same names as the AWS CLI
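The switch from "${VAR:-}" to "${VAR+set}" is the key to the three-way logic above: the new tests distinguish "unset" from "set but empty", which is what lets an empty NOMAD_TOKEN mean "fetch one from Vault". A short bash demonstration:

# "${VAR+set}" expands to "set" whenever VAR is set, even to "".
unset NOMAD_TOKEN
test -z "${NOMAD_TOKEN+set}" && echo "unset"            # prints "unset"
NOMAD_TOKEN=""
test -z "${NOMAD_TOKEN+set}" || echo "set but empty"    # prints "set but empty"
test -z "${NOMAD_TOKEN:-}"   && echo "looks unset too"  # ":-" can't tell them apart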
@@ -284,8 +299,16 @@ allocate-run-nomadcloud() {
       > "${dir}"/nomad/nomad-job.json
     fi
     # The job file is "slightly" modified (jq) to suit the running environment.
-    backend_nomad allocate-run-nomad-job-patch-namespace "${dir}" "${NOMAD_NAMESPACE}"
-    backend_nomad allocate-run-nomad-job-patch-nix "${dir}"
+    if test -n "${NOMAD_NAMESPACE:-}"
+    then
+      # This sets only the global namespace, the job level namespace. Not groups!
+      backend_nomad allocate-run-nomad-job-patch-namespace "${dir}" "${NOMAD_NAMESPACE}"
+    else
+      # Empty the global namespace
+      backend_nomad allocate-run-nomad-job-patch-namespace "${dir}"
+    fi
+    # Will set the flake URIs from ".installable" in container-specs.json
+    backend_nomad allocate-run-nomad-job-patch-nix "${dir}"

     # Set the placement info and resources accordingly
     local nomad_job_name
@@ -348,12 +371,12 @@ allocate-run-nomadcloud() {
     ########################################################################
     local group_constraints_array
     # "perf" class nodes are the default unless the profile name contains
-    # "cw-qa", we try to limit the usage of Nomad nodes that are not
-    # dedicated Perf team nodes.
-    # But also, we have to be careful that "perf" runs do not overlap. We
-    # are making "perf" class nodes runs can't clash because service names
-    # and resources definitions currently won't allow that to happen but
-    # still a new "perf" run may mess up a previously running cluster.
+    # "cw-qa"; we try to limit the usage of Nomad nodes that are not dedicated
+    # Perf team nodes.
+    # But also, we have to be careful that "perf" runs do not overlap. We are
+    # making sure "perf" class node runs can't clash because service names
+    # and resources definitions currently won't allow that to happen, but a
+    # new "perf" run may still mess up a previously running cluster.
     if echo "${WB_SHELL_PROFILE}" | grep --quiet "cw-qa"
     then
       # Using "qa" class distinct nodes. Only "short" tests allowed here.
@@ -366,29 +389,42 @@ allocate-run-nomadcloud() {
           }
         ]
       '
-    else
+    elif test -n "${NOMAD_NAMESPACE:-}"
+    then
       # Using Performance & Tracing exclusive "perf" class distinct nodes!
-      group_constraints_array='
-        [
-          {
-            "operator": "="
-          , "attribute": "${node.class}"
-          , "value": "perf"
-          }
-        ]
-      '
+      group_constraints_array=" \
+        [ \
+          { \
+            \"operator\": \"=\" \
+          , \"attribute\": \"\${node.class}\" \
+          , \"value\": \"${NOMAD_NAMESPACE}\" \
+          } \
+        ] \
+      "
+    fi
+    # Is there something to change related to group constraints?
+    # Sets or deletes all group level constraints.
+    if test -n "${group_constraints_array:-}"
+    then
+      # Adds it as a group level constraint to all groups.
+      jq \
+        --argjson group_constraints_array "${group_constraints_array}" \
+        ".[\"job\"][\"${nomad_job_name}\"][\"group\"] |= with_entries(.value.constraint = \$group_constraints_array)" \
+        "${dir}"/nomad/nomad-job.json \
+      | \
+        sponge "${dir}"/nomad/nomad-job.json
+    else
+      # Else, empties all group level constraints, like previous namespaces.
+      jq \
+        ".[\"job\"][\"${nomad_job_name}\"][\"group\"] |= with_entries(.value.constraint = null)" \
+        "${dir}"/nomad/nomad-job.json \
+      | \
+        sponge "${dir}"/nomad/nomad-job.json
     fi
-    # Adds it as a group level contraint.
-    jq \
-      --argjson group_constraints_array "${group_constraints_array}" \
-      ".[\"job\"][\"${nomad_job_name}\"][\"group\"] |= with_entries(.value.constraint = \$group_constraints_array)" \
-      "${dir}"/nomad/nomad-job.json \
-    | \
-      sponge "${dir}"/nomad/nomad-job.json
     ########################################################################
     # Memory/resources: ####################################################
     ########################################################################
-    # Set the resources, only for perf!
+    # Set the resources, only for "perf" exclusive cloud runs!
     # When not "perf", when "cw-qa", only "short" tests are allowed on
     # whatever resources we are given.
     if echo "${WB_SHELL_PROFILE}" | grep --quiet "cw-perf"

nix/workbench/backend/nomad/exec.sh

Lines changed: 4 additions & 2 deletions
@@ -162,8 +162,10 @@ allocate-run-nomadexec() {
       "${dir}"/container-specs.json \
   > "${dir}"/nomad/nomad-job.json
   # The job file is "slightly" modified (jq) to suit the running environment.
-  backend_nomad allocate-run-nomad-job-patch-namespace "${dir}" "default"
-  backend_nomad allocate-run-nomad-job-patch-nix "${dir}"
+  ## Empty the global namespace. Local runs ignore "${NOMAD_NAMESPACE:-}"
+  backend_nomad allocate-run-nomad-job-patch-namespace "${dir}"
+  # Will set the /nix/store paths from ".nix-store-path" in container-specs.json
+  backend_nomad allocate-run-nomad-job-patch-nix "${dir}"
 }

 deploy-genesis-nomadexec() {

nix/workbench/backend/nomad/podman.sh

Lines changed: 2 additions & 1 deletion
@@ -163,7 +163,8 @@ allocate-run-nomadpodman() {
       "${dir}"/container-specs.json \
   > "${dir}"/nomad/nomad-job.json
   # The job file is "slightly" modified (jq) to suit the running environment.
-  backend_nomad allocate-run-nomad-job-patch-namespace "${dir}" "default"
+  ## Empty the global namespace. Local runs ignore "${NOMAD_NAMESPACE:-}"
+  backend_nomad allocate-run-nomad-job-patch-namespace "${dir}"
   podman_create_image "${dir}"
   # Make sure the "genesis-volume" dir is present when the Nomad job is
   # started because with the podman task driver (always local, not used for
