|
11 | 11 | , execTaskDriver
|
12 | 12 | , generatorTaskName
|
13 | 13 | , oneTracerPerNode ? false
|
| 14 | +, withSsh ? false |
14 | 15 | }:
|
15 | 16 |
|
16 | 17 | let
|
|
90 | 91 | # The SUPERVISORD_CONFIG variable must be set
|
91 | 92 | [ -z "''${SUPERVISORD_CONFIG:-}" ] && echo "SUPERVISORD_CONFIG env var must be set -- aborting" && exit 1
|
92 | 93 |
|
93 |
| - # Create symlink to 'supervisor' Nix folder so we can call it from 'ssh' |
94 |
| - # or 'nomad exec' without having to know the currently version running. |
95 |
| - # First check if already exists to be able to restart containers. |
| 94 | + # Create a symlink to 'supervisor' Nix Store folder so we can call it from |
| 95 | + # 'ssh' or 'nomad exec' without having it in PATH or knowing the currently |
| 96 | + # running version. But first check if it already exists to be able to |
| 97 | + # restart containers without errors. |
96 | 98 | if ! test -e "''${SUPERVISOR_NIX}"
|
97 | 99 | then
|
98 | 100 | ${coreutils}/bin/ln -s "${supervisor}" "''${SUPERVISOR_NIX}"
|
|
162 | 164 | # namespace can be specified either with the flag -namespace or read from
|
163 | 165 | # the NOMAD_NAMESPACE environment variable."
|
164 | 166 | # https://developer.hashicorp.com/nomad/tutorials/manage-clusters/namespaces
|
165 |
| - namespace = "perf"; |
| 167 | + namespace = "perf"; # Default to "perf" to avoid errors where possible. |
166 | 168 |
|
167 | 169 | # The region in which to execute the job.
|
168 | 170 | region = "global"; # SRE: They are actually using global.
|
@@ -269,10 +271,13 @@ let
|
269 | 271 | constraint = {
|
270 | 272 | attribute = "\${node.class}";
|
271 | 273 | operator = "=";
|
272 |
| - # For testing we avoid using "infra" node class as HA jobs runs there |
273 |
| - # For benchmarking dedicated static machines in the "perf" |
274 |
| - # class are used and this value should be updated accordingly. |
275 |
| - value = "qa"; |
| 274 | + # For cloud benchmarking, dedicated static machines in the "perf" |
| 275 | + # class are used. We mimic that for local/test runs. |
| 276 | + # This default is just a precaution, like the top-level namespace: |
| 277 | + # "qa" class nodes are also available, but their usage must be |
| 278 | + # limited to short tests, and "infra" class nodes are used for |
| 279 | + # HA jobs and must be avoided entirely. |
| 280 | + value = "perf"; |
276 | 281 | };
|
277 | 282 |
|
278 | 283 | # The network stanza specifies the networking requirements for the task
|
|
387 | 392 |
|
388 | 393 | # Specifies environment variables that will be passed to the running
|
389 | 394 | # process.
|
390 |
| - # `null` because we are using a "template" (see below). |
391 |
| - env = {}; |
| 395 | + env = { |
| 396 | + # The "old Nomad" setup somehow included the CA certs inside the |
| 397 | + # task namespace but apparently not "new Nomad". We are adding the |
| 398 | + # necessary Nix package ("cacert") and making sure `wget` finds it. |
| 399 | + # |
| 400 | + # "All nix-docker images set environment variables which point to |
| 401 | + # cacert.", see: |
| 402 | + # - https://github.com/NixOS/nixpkgs/issues/48211#issuecomment-434102565 |
| 403 | + # - https://github.com/LnL7/nix-docker/blob/8dcfb3aff1f87cdafeecb0d27964b27c3fb8b1d2/default.nix#L70-L71 |
| 404 | + # |
| 405 | + # Error when using `wget` to deploy the genesis tar file was: |
| 406 | + # ERROR: cannot verify iog-cardano-perf.s3.eu-central-1.amazonaws.com's certificate, issued by 'CN=Amazon RSA 2048 M01,O=Amazon,C=US': |
| 407 | + # Unable to locally verify the issuer's authority. |
| 408 | + # To connect to iog-cardano-perf.s3.eu-central-1.amazonaws.com insecurely, use `--no-check-certificate'. |
| 409 | + SSL_CERT_FILE = "${containerSpecs.containerPkgs.cacert.nix-store-path}/etc/ssl/certs/ca-bundle.crt"; |
| 410 | + }; |
392 | 411 |
|
393 | 412 | # Sensible defaults to run cloud version of "default", "ci-test" and
|
394 | 413 | # "ci-bench" in cardano-world qa class Nomad nodes.
|
|
432 | 451 | # When using Cardano World (nomad.world.dev.cardano.org) "perf"
|
433 | 452 | # class nodes we use public IPs/routing, all the other cloud runs
|
434 | 453 | # are behind a VPC/firewall. Local runs just use 127.0.0.1.
|
435 |
| - if lib.strings.hasPrefix "cw-perf" profileData.profileName |
| 454 | + if lib.strings.hasInfix "cw-perf" profileData.profileName |
436 | 455 | then "\${attr.unique.platform.aws.public-ipv4}"
|
437 | 456 | else ""
|
438 | 457 | ;
|
|
592 | 611 | withGenerator = taskName == generatorTaskName;
|
593 | 612 | # Only for the tracer task or also nodes if oneTracerPerNode
|
594 | 613 | withTracer = oneTracerPerNode || taskName == "tracer";
|
| 614 | + inherit withSsh; |
595 | 615 | # ''{{ env "NOMAD_TASK_DIR" }}/supervisor.sock''
|
596 | 616 | inherit unixHttpServerPort;
|
597 | 617 | };
|
|
728 | 748 | perms = "744"; # Only for every "start.sh" script. Default: "644"
|
729 | 749 | }
|
730 | 750 | ]
|
| 751 | + ++ |
| 752 | + # ssh |
| 753 | + (lib.optionals withSsh ( |
| 754 | + let |
| 755 | + ssh-service = import |
| 756 | + ../service/ssh.nix |
| 757 | + { |
| 758 | + inherit pkgs; |
| 759 | + bashInteractive = containerSpecs.containerPkgs.bashInteractive.nix-store-path; |
| 760 | + coreutils = containerSpecs.containerPkgs.coreutils.nix-store-path; |
| 761 | + openssh_hacks = containerSpecs.containerPkgs.openssh_hacks.nix-store-path; |
| 762 | + } |
| 763 | + ; |
| 764 | + in [ |
| 765 | + ## ssh start.sh script. |
| 766 | + { |
| 767 | + env = false; |
| 768 | + destination = "${task_statedir}/ssh/start.sh"; |
| 769 | + data = escapeTemplate ssh-service.start.value; |
| 770 | + change_mode = "noop"; |
| 771 | + error_on_missing_key = true; |
| 772 | + perms = "744"; # Only for every "start.sh" script. Default: "644" |
| 773 | + } |
| 774 | + ## ssh config file. |
| 775 | + { |
| 776 | + env = false; |
| 777 | + destination = "${task_statedir}/ssh/sshd_config"; |
| 778 | + data = escapeTemplate ssh-service.config.value; |
| 779 | + change_mode = "noop"; |
| 780 | + error_on_missing_key = true; |
| 781 | + perms = "744"; # Same perms as the "start.sh" scripts. Default: "644" |
| 782 | + } |
| 783 | + # The deployer script must add the templates for the SSH key files: |
| 784 | + # - ${task_statedir}/ssh/sshd.id_ed25519 |
| 785 | + # - ${task_statedir}/ssh/nobody.id_ed25519.pub |
| 786 | + ] |
| 787 | + )) |
731 | 788 | ;
|
732 | 789 |
|
733 | 790 | # Specifies logging configuration for the stdout and stderr of the
|
@@ -1121,17 +1178,19 @@ let
|
1121 | 1178 | # Port string from
|
1122 | 1179 | ''--port ${toString nodeSpec.port}''
|
1123 | 1180 | ]
|
1124 |
| - # On cloud deployments with cardano world, that uses AWS, the hosts at the |
1125 |
| - # Linux level aren't aware of the EIP public address they have, so the |
1126 |
| - # private IP is the only network interface address to bind to. |
1127 |
| - # An alternative is to bind to 0.0.0.0 but I prefer being more specific. |
| 1181 | + # On cloud deployments to SRE-managed Nomad, which runs on AWS, the hosts |
| 1182 | + # at the Linux level may not be aware of the EIP public address they have, |
| 1183 | + # so we can't bind to the public IP (which we can resolve using templates). |
| 1184 | + # Binding to the private IP would be more specific, but the "all-weather" |
| 1185 | + # alternative is to bind to 0.0.0.0 instead, just in case the Nomad Client |
| 1186 | + # was not started with the correct `-network-interface XX` parameter. |
1128 | 1187 | [
|
1129 | 1188 | # Address string to
|
1130 |
| - ''--host-addr {{ env "NOMAD_HOST_IP_${servicePortName}" }}'' |
| 1189 | + ''--host-addr 0.0.0.0'' |
1131 | 1190 | # Alternatives (may not work):
|
| 1191 | + #''--host-addr {{ env "NOMAD_HOST_IP_${servicePortName}" }}'' |
1132 | 1192 | #''--host-addr {{ env "NOMAD_IP_${servicePortName}" }}''
|
1133 | 1193 | #''--host-addr {{range nomadService "${servicePortName}"}}{{.Address}}{{end}}''
|
1134 |
| - #''--host-addr 0.0.0.0'' |
1135 | 1194 |
|
1136 | 1195 | # Port string to
|
1137 | 1196 | ''--port {{ env "NOMAD_HOST_PORT_${servicePortName}" }}''
|
|
0 commit comments