Skip to content

Commit 2d0fde1

Browse files
authored
Merge pull request #5384 from input-output-hk/bench-nomad-ssh
Bench nomad ssh
2 parents 296ccba + 9e90ff5 commit 2d0fde1

File tree

18 files changed

+2908
-1595
lines changed

18 files changed

+2908
-1595
lines changed

nix/workbench/backend/backend.sh

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,21 +38,29 @@ backend() {
3838
local op=${1:-$(usage_backend)} # No need to shift -- backends will use the op.
3939

4040
case "${op}" in
41-
is-running ) backend_$WB_BACKEND "$@";;
41+
# Prepare functions
4242
setenv-defaults ) backend_$WB_BACKEND "$@";;
4343
allocate-run ) backend_$WB_BACKEND "$@";;
4444
describe-run ) backend_$WB_BACKEND "$@";;
45+
# Start functions
46+
is-running ) backend_$WB_BACKEND "$@";;
47+
start-cluster ) backend_$WB_BACKEND "$@";;
4548
deploy-genesis ) backend_$WB_BACKEND "$@";;
46-
start ) backend_$WB_BACKEND "$@";;
49+
# Scenario functions
50+
start-tracers ) backend_$WB_BACKEND "$@";;
4751
start-nodes ) backend_$WB_BACKEND "$@";;
52+
start-generator ) backend_$WB_BACKEND "$@";;
53+
start-healthchecks ) backend_$WB_BACKEND "$@";;
54+
# Fine grained
4855
start-node ) backend_$WB_BACKEND "$@";;
4956
stop-node ) backend_$WB_BACKEND "$@";;
5057
wait-node ) backend_$WB_BACKEND "$@";;
5158
wait-node-stopped ) backend_$WB_BACKEND "$@";;
5259
get-node-socket-path ) backend_$WB_BACKEND "$@";;
53-
start-generator ) backend_$WB_BACKEND "$@";;
54-
start-healthchecks ) backend_$WB_BACKEND "$@";;
5560
wait-pools-stopped ) backend_$WB_BACKEND "$@";;
61+
# Stop functions
62+
stop-all ) backend_$WB_BACKEND "$@";;
63+
fetch-logs ) backend_$WB_BACKEND "$@";;
5664
stop-cluster ) backend_$WB_BACKEND "$@";;
5765
cleanup-cluster ) backend_$WB_BACKEND "$@";;
5866

nix/workbench/backend/nomad-job.nix

Lines changed: 76 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
, execTaskDriver
1212
, generatorTaskName
1313
, oneTracerPerNode ? false
14+
, withSsh ? false
1415
}:
1516

1617
let
@@ -90,9 +91,10 @@ let
9091
# The SUPERVISORD_CONFIG variable must be set
9192
[ -z "''${SUPERVISORD_CONFIG:-}" ] && echo "SUPERVISORD_CONFIG env var must be set -- aborting" && exit 1
9293
93-
# Create symlink to 'supervisor' Nix folder so we can call it from 'ssh'
94-
# or 'nomad exec' without having to know the currently version running.
95-
# First check if already exists to be able to restart containers.
94+
# Create a symlink to 'supervisor' Nix Store folder so we can call it from
95+
# 'ssh' or 'nomad exec' without having it in PATH or knowing the currently
96+
# running version. But first check if it already exists to be able to
97+
# restart containers without errors.
9698
if ! test -e "''${SUPERVISOR_NIX}"
9799
then
98100
${coreutils}/bin/ln -s "${supervisor}" "''${SUPERVISOR_NIX}"
@@ -162,7 +164,7 @@ let
162164
# namespace can be specified either with the flag -namespace or read from
163165
# the NOMAD_NAMESPACE environment variable."
164166
# https://developer.hashicorp.com/nomad/tutorials/manage-clusters/namespaces
165-
namespace = "perf";
167+
namespace = "perf"; # Default to "perf" to avoid errors where possible.
166168

167169
# The region in which to execute the job.
168170
region = "global"; # SRE: They are actually using global.
@@ -269,10 +271,13 @@ let
269271
constraint = {
270272
attribute = "\${node.class}";
271273
operator = "=";
272-
# For testing we avoid using "infra" node class as HA jobs runs there
273-
# For benchmarking dedicated static machines in the "perf"
274-
# class are used and this value should be updated accordingly.
275-
value = "qa";
274+
# For cloud benchmarking, dedicated static machines in the "perf"
275+
# class are used. We mimic that for local/test runs.
276+
# This default is just a precaution, like the top level namespace,
277+
# because there are also available "qa" Class nodes but usage of these
278+
# must be limited to short tests, and "infra" Class nodes are used for
279+
# HA jobs and must be avoided entirely.
280+
value = "perf";
276281
};
277282

278283
# The network stanza specifies the networking requirements for the task
@@ -387,8 +392,22 @@ let
387392

388393
# Specifies environment variables that will be passed to the running
389394
# process.
390-
# `null` because we are using a "template" (see below).
391-
env = {};
395+
env = {
396+
# The "old Nomad" setup somehow included the CA certs inside the
397+
# task namespace but apparently not "new Nomad". We are adding the
398+
# necessary Nix package ("cacert") and making sure `wget` finds it.
399+
#
400+
# "All nix-docker images set environment variables which point to
401+
# cacert.", see:
402+
# - https://github.com/NixOS/nixpkgs/issues/48211#issuecomment-434102565
403+
# - https://github.com/LnL7/nix-docker/blob/8dcfb3aff1f87cdafeecb0d27964b27c3fb8b1d2/default.nix#L70-L71
404+
#
405+
# Error when using `wget` to deploy the genesis tar file was:
406+
# ERROR: cannot verify iog-cardano-perf.s3.eu-central-1.amazonaws.com's certificate, issued by 'CN=Amazon RSA 2048 M01,O=Amazon,C=US':
407+
# Unable to locally verify the issuer's authority.
408+
# To connect to iog-cardano-perf.s3.eu-central-1.amazonaws.com insecurely, use `--no-check-certificate'.
409+
SSL_CERT_FILE = "${containerSpecs.containerPkgs.cacert.nix-store-path}/etc/ssl/certs/ca-bundle.crt";
410+
};
392411

393412
# Sensible defaults to run cloud version of "default", "ci-test" and
394413
# "ci-bench" in cardano-world qa class Nomad nodes.
@@ -432,7 +451,7 @@ let
432451
# When using Cardano World (nomad.world.dev.cardano.org) "perf"
433452
# class nodes we use public IPs/routing, all the other cloud runs
434453
# are behind a VPC/firewall. Local runs just use 127.0.0.1.
435-
if lib.strings.hasPrefix "cw-perf" profileData.profileName
454+
if lib.strings.hasInfix "cw-perf" profileData.profileName
436455
then "\${attr.unique.platform.aws.public-ipv4}"
437456
else ""
438457
;
@@ -592,6 +611,7 @@ let
592611
withGenerator = taskName == generatorTaskName;
593612
# Only for the tracer task or also nodes if oneTracerPerNode
594613
withTracer = oneTracerPerNode || taskName == "tracer";
614+
inherit withSsh;
595615
# ''{{ env "NOMAD_TASK_DIR" }}/supervisor.sock''
596616
inherit unixHttpServerPort;
597617
};
@@ -728,6 +748,43 @@ let
728748
perms = "744"; # Only for every "start.sh" script. Default: "644"
729749
}
730750
]
751+
++
752+
# ssh
753+
(lib.optionals withSsh (
754+
let
755+
ssh-service = import
756+
../service/ssh.nix
757+
{
758+
inherit pkgs;
759+
bashInteractive = containerSpecs.containerPkgs.bashInteractive.nix-store-path;
760+
coreutils = containerSpecs.containerPkgs.coreutils.nix-store-path;
761+
openssh_hacks = containerSpecs.containerPkgs.openssh_hacks.nix-store-path;
762+
}
763+
;
764+
in [
765+
## ssh start.sh script.
766+
{
767+
env = false;
768+
destination = "${task_statedir}/ssh/start.sh";
769+
data = escapeTemplate ssh-service.start.value;
770+
change_mode = "noop";
771+
error_on_missing_key = true;
772+
perms = "744"; # Only for every "start.sh" script. Default: "644"
773+
}
774+
## ssh config file.
775+
{
776+
env = false;
777+
destination = "${task_statedir}/ssh/sshd_config";
778+
data = escapeTemplate ssh-service.config.value;
779+
change_mode = "noop";
780+
error_on_missing_key = true;
781+
perms = "744"; # NOTE(review): same perms as the "start.sh" scripts; a config file may only need the default "644" -- confirm.
782+
}
783+
# The deployer script must add the templates for the key files:
784+
# - ${task_statedir}/ssh/sshd.id_ed25519
785+
# - ${task_statedir}/ssh/nobody.id_ed25519.pub
786+
]
787+
))
731788
;
732789

733790
# Specifies logging configuration for the stdout and stderr of the
@@ -1121,17 +1178,19 @@ let
11211178
# Port string from
11221179
''--port ${toString nodeSpec.port}''
11231180
]
1124-
# On cloud deployments with cardano world, that uses AWS, the hosts at the
1125-
# Linux level aren't aware of the EIP public address they have, so the
1126-
# private IP is the only network interface address to bind to.
1127-
# An alternative is to bind to 0.0.0.0 but I prefer being more specific.
1181+
# On cloud deployments to SRE-managed Nomad, that uses AWS, the hosts at
1182+
# Linux level may not be aware of the EIP public address they have so we
1183+
# can't bind to the public IP (that we can resolve to using templates).
1184+
# I prefer being more specific but the "all-weather" alternative is to
1185+
# bind to 0.0.0.0 instead of the private IP, just in case the Nomad Client
1186+
# was not started with the correct `-network-interface XX` parameter.
11281187
[
11291188
# Address string to
1130-
''--host-addr {{ env "NOMAD_HOST_IP_${servicePortName}" }}''
1189+
''--host-addr 0.0.0.0''
11311190
# Alternatives (may not work):
1191+
#''--host-addr {{ env "NOMAD_HOST_IP_${servicePortName}" }}''
11321192
#''--host-addr {{ env "NOMAD_IP_${servicePortName}" }}''
11331193
#''--host-addr {{range nomadService "${servicePortName}"}}{{.Address}}{{end}}''
1134-
#''--host-addr 0.0.0.0''
11351194

11361195
# Port string to
11371196
''--port {{ env "NOMAD_HOST_PORT_${servicePortName}" }}''

0 commit comments

Comments
 (0)