Skip to content

Commit

Permalink
chore(QA): QA improvements: multi-user in DO and simulated delays per…
Browse files Browse the repository at this point in the history
…-node (#460)

QA improvements
  • Loading branch information
greg-szabo authored Oct 15, 2024
1 parent 3ec137b commit ce8edbe
Show file tree
Hide file tree
Showing 11 changed files with 189 additions and 121 deletions.
3 changes: 2 additions & 1 deletion qa/docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ RUN --mount=type=bind,from=code,target=/mnt cargo build --release --target-dir /
FROM debian:bookworm-slim
RUN apt-get update && apt-get install -y iproute2
COPY --from=builder /tmp/release/malachite-cli /usr/local/bin/malachite-cli
ENTRYPOINT ["malachite-cli"]
COPY --chmod=755 entrypoint.sh /usr/bin/entrypoint.sh
ENTRYPOINT ["/usr/bin/entrypoint.sh"]
17 changes: 17 additions & 0 deletions qa/docker/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/sh
# Container entrypoint: optionally shape the traffic of eth0 with a netem qdisc
# (simulated per-node network delay) and then hand over to malachite-cli.
# All arguments given to the container are passed to malachite-cli unchanged.

set -eu

## Get TC parameters (Example: "delay 100ms 10ms")
# A non-empty TC_PARAMS environment variable takes precedence; otherwise the
# value is read from a `tc_params=` line in the node's config.toml. The
# `|| echo ""` fallback keeps `set -e` from aborting if the lookup fails.
if [ -z "${TC_PARAMS:-}" ]; then
  TC_PARAMS="$( (grep "^tc_params=" /config/config/config.toml | cut -d= -f2 | tr -d \") || echo "")"
fi

# Add delay using TC. Make sure the image is running with --privileged or --cap-add=NET_ADMIN.
# /etc/tc_done is a marker file: once the qdisc has been added it is not added
# again when the same container is restarted.
if [ ! -f /etc/tc_done ] && [ -n "${TC_PARAMS:-}" ]; then
  # TC_PARAMS is intentionally unquoted so that it splits into separate tc arguments.
  #shellcheck disable=SC2086
  tc qdisc add dev eth0 root netem $TC_PARAMS
  touch /etc/tc_done
fi

# Replace the shell with malachite-cli instead of running it as a child process.
# The node then becomes PID 1 of the container and receives the signals sent by
# `docker stop` directly, and its exit status becomes the container's exit status.
exec /usr/local/bin/malachite-cli "$@"
2 changes: 1 addition & 1 deletion qa/terraform/cc.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ resource tls_private_key ssh {
}

resource digitalocean_ssh_key cc {
name = "autossh"
name = "autossh-project-${var.project_name}"
public_key = tls_private_key.ssh.public_key_openssh
}

Expand Down
1 change: 0 additions & 1 deletion qa/terraform/file-commands.tf
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ locals {
cc = local.cc
})
commands-sh = templatefile("templates/commands.tmpl", {
etc-hosts = local.etc-hosts,
path = abspath(path.root),
ips = local.ips,
nodes = local.nodes,
Expand Down
2 changes: 1 addition & 1 deletion qa/terraform/project.tf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
resource "digitalocean_project" "malachite-testnet" {
name = "malachite-testnet"
name = var.project_name
description = "A project to test the Malachite codebase."
resources = concat([
for node in concat(digitalocean_droplet.ams3, digitalocean_droplet.blr1, digitalocean_droplet.fra1, digitalocean_droplet.lon1, digitalocean_droplet.nyc1, digitalocean_droplet.nyc3, digitalocean_droplet.sfo2, digitalocean_droplet.sfo3, digitalocean_droplet.sgp1, digitalocean_droplet.syd1, digitalocean_droplet.tor1) :
Expand Down
167 changes: 96 additions & 71 deletions qa/terraform/templates/commands.tmpl
Original file line number Diff line number Diff line change
@@ -1,85 +1,123 @@
# Environment variables for the servers
# CANDC - the IP address of the command and control server (CC is used by compilers)
# NODEi - the ip address of the node server "i"
# D_N - the number of node servers in total
# PSSH_H - space-separated list of all the node server IP addresses for pssh input
# PSSH_P - the number of parallel processes to run with pssh
# MALACHITE_DIR - the path to the malachite repository directory
# IS_CC - 1 means we are on the CC server, 0 we are not. (Used to determine the docker -H parameter when run locally.)
# Environment variables for the servers:
# CANDC - the IP address of the command and control server (CC is used by compilers)
# NODEi - the ip address of the node server "i"
# D_N - the number of node servers in total
# PSSH_H - space-separated list of all the node server IP addresses for pssh input
# PSSH_P - the number of parallel processes to run with pssh
# MALACHITE_DIR - the path to the malachite repository directory
# IS_CC - 1 means we are on the CC server, 0 we are not. (Used to determine the docker -H parameter when run locally.)
##
# Aliases for easy manual access to the servers (don't use these in scripts)
# ssh-cc - ssh into the cc server
# ssh-(nodeX) - ssh into node server "X"
# ssh-cc - ssh into the cc server
# ssh-(nodeX) - ssh into node server "X"
##
# Additional functionality in shell functions (see README for more info)
# xssh - parallel ssh command to all servers. Change PSSH_H and PSSH_P for different behavior.
# get_ip - get the IP address of a node server for programmatic use (example: get_ip 0)
# ok_cc - check if the CC server has all services installed
# deploy_cc - build the local source code into a docker image on the cc server and push it to the cc registry
# setup_config - create configuration on the cc server
# done-pull - pull the node image on all the node servers. Accepts list of IDs or "all". (example: d_pull 0 1 2)
# d_run - run the application on a node server. Accepts list of IDs or "all". (example: d_run 0 1 2)
# d_log - get the logs of the application from a node server (example: d_log 0 -f)
# d_stop - stop the application on a node server. Accepts list of IDs or "all". (example: d_stop 0 2)
# d_rm - remove node container from server. Accepts list of IDs or "all". (example: d_rm 0 1 2)
# cheat_sheet - get some help on the order of commands to run
# fetch_dashboard - fetch the dashboard graphs from Grafana (example: fetch_dashboard now-30m now-15m)
# get_prometheus_data - create a compressed prometheus data file (and download it from the cc server)
# Node name translation:
# get_ip - Translate node ID numbers to IP addresses. 0 -> 1.2.3.4
# get_id - Translate host names to node IDs. nyc1-1 -> 2
# get_hostname - Translate node IDs to hostnames. 2 -> nyc1-1
# Server management:
# xssh - Parallel SSH wrapper that uses the custom-defined PSSH_* variables.
# ok_cc - Provide user feedback if the CC server finished building.
# deploy_cc - Build and push the binary to CC Hub either from local machine or from CC.
# setup_config - Create fresh default configuration for all nodes.
# Docker commands for all nodes at once:
# d_pull - pull the node image on all the node servers. Accepts list of IDs or "all". (example: d_pull 0 1 2)
# d_run - run the application on a node server. Accepts list of IDs or "all". (example: d_run 0 1 2)
# d_log - get the logs of the application from a node server (example: d_log 0 -f)
# d_stop - stop the application on a node server. Accepts list of IDs or "all". (example: d_stop 0 2)
# d_rm - remove node container from server. Accepts list of IDs or "all". (example: d_rm 0 1 2)
# Retrieve data:
# cheat_sheet - get some help on the order of commands to run
# fetch_dashboard - fetch the dashboard graphs from Grafana (example: fetch_dashboard now-30m now-15m)
# get_prometheus_data - create a compressed prometheus data file (and download it from the cc server)
# Undocumented commands that should be made available after some more testing:
# _change_one_config_entry
# _reset_prometheus_db
# _reset_elastic_db
# Undocumented commands that might become useful:
# _reset_prometheus_db_online
# _reset_elastic_db_online
# Undocumented commands used internally:
# _is_cc
# _keyscan_cc
# _compose_persistent_peers
# _change_config
# _create_hostname_links
# _parse_multiple_hosts
##

# Global variables and aliases rendered by Terraform's templatefile() function (${...} interpolations and %{...} directives).
export CANDC="${cc.ip}"
%{~ for i, n in nodes }
export NODE${i}="${n.ip}"
%{~ endfor }
export D_N="${length(nodes)}"
export PSSH_H="${join(" ",ips)}"
export PSSH_P="${length(nodes)}"
export PSSH_T=120
export PSSH_V="-v"
export MALACHITE_DIR="$(dirname "$(dirname "${path}")")"
export IS_CC=0

alias ssh-cc="ssh -o LogLevel=ERROR -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o GlobalKnownHostsFile=/dev/null root@${cc.ip}"
%{~ for i,n in nodes }
alias ssh-node${i}="ssh -o LogLevel=ERROR -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o GlobalKnownHostsFile=/dev/null root@${n.ip}"
%{~ endfor }
export __MALACHITE_TERRAFORM_DIR="${path}"

xssh() {
if _is_cc; then
pssh -i $PSSH_V -p $PSSH_P -t $PSSH_T -H "$PSSH_H" "$@"
else
pssh -l root -O LogLevel=ERROR -O StrictHostKeyChecking=no -O UserKnownHostsFile=/dev/null -O GlobalKnownHostsFile=/dev/null -i $PSSH_V -p $PSSH_P -t $PSSH_T -H "$PSSH_H" "$@"
fi
}
# Global variables that change when copied to the CC server using Terraform.
export MALACHITE_DIR="$(dirname "$(dirname "$__MALACHITE_TERRAFORM_DIR")")"
export IS_CC="0"

# More global variables and aliases that can be derived from previously available data.
alias ssh-cc="ssh -o LogLevel=ERROR -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o GlobalKnownHostsFile=/dev/null root@$${CANDC}"
alias ssh_cc="ssh -o LogLevel=ERROR -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o GlobalKnownHostsFile=/dev/null root@$${CANDC}"
export PSSH_P="$${D_N}"
export PSSH_T=120
export PSSH_V="-v"

# Translate node ID numbers to IP addresses. 0 -> 1.2.3.4
get_ip() {
I="$1"
case "$1" in
%{~ for i, n in nodes }
test "$I" -eq "${i}" && echo "${n.ip}" && return
${i}) echo "${n.ip}";;
%{~ endfor }
echo "IP for node $I not found" && return 1
*) echo "IP for node $1 not found" && return 1
esac
}

# Translate host names to node IDs. nyc1-1 -> 2
get_id() {
I="$1"
case "$1" in
%{~ for i, n in nodes }
test "$I" = "${n.name}" && echo "${i}" && return
${n.name}) echo "${i}";;
%{~ endfor }
echo "id for hostname $I not found" && return 1
*) echo "id for hostname $1 not found" && return 1
esac
}

# Translate node IDs to hostnames. 2 -> nyc1-1
get_hostname() {
I="$1"
case "$1" in
%{~ for i, n in nodes }
test "$I" -eq "${i}" && echo "${n.name}" && return
${i}) echo "${n.name}";;
%{~ endfor }
echo "hostname for node $I not found" && return 1
*) echo "hostname for node $1 not found" && return 1
esac
}

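## End of Terraform template code. Do not put Terraform template interpolations (${...}) or directives (%{...}) below this line.
## A literal shell "${...}" must still be escaped as "$${...}" because the whole file is rendered by templatefile().
## The code below could be broken out into a separate shell script.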

# Run a command on many servers at once through pssh.
# Behaviour is driven by the PSSH_* environment variables:
#   PSSH_H - host list, PSSH_P - parallelism, PSSH_T - timeout, PSSH_V - extra flag (e.g. -v).
# All arguments are forwarded to pssh unchanged; the exit status is the one pssh returns.
xssh() {
  # When _is_cc fails (not on the CC server): log in as root and disable
  # host key checking and known_hosts bookkeeping.
  if ! _is_cc; then
    pssh -l root \
      -O LogLevel=ERROR \
      -O StrictHostKeyChecking=no \
      -O UserKnownHostsFile=/dev/null \
      -O GlobalKnownHostsFile=/dev/null \
      -i $PSSH_V -p $PSSH_P -t $PSSH_T -H "$PSSH_H" "$@"
    return
  fi

  # When _is_cc succeeds (on the CC server): no login user or SSH options are passed.
  pssh -i $PSSH_V -p $PSSH_P -t $PSSH_T -H "$PSSH_H" "$@"
}

# Provide user feedback if the CC server finished building.
# Prints the content of /etc/cc on the CC server by running `cat /etc/cc` through xssh.
# PSSH_H and PSSH_P are overridden for this single call so that xssh targets
# only the CC server ($CANDC) with a parallelism of 1 instead of all node servers.
# NOTE(review): presumably /etc/cc is written by the CC provisioning once it is done - confirm.
ok_cc() {
PSSH_P=1 PSSH_H=$CANDC xssh "cat /etc/cc"
}

# Build and push the binary to CC Hub either from local machine or from CC.
deploy_cc() {
test -d "$MALACHITE_DIR/code" || (echo "Source code repository not found. Clone or copy manually." && return 1)
if _is_cc; then
Expand All @@ -90,6 +128,7 @@ deploy_cc() {
fi
}

# Create fresh default configuration for all nodes.
setup_config() {
if _is_cc; then
rm -r /data/*
Expand Down Expand Up @@ -168,24 +207,24 @@ _reset_prometheus_db() {
fi
}

# Delete the node_exporter and malachite series from the running Prometheus
# server on the CC host (port 9090) through its TSDB admin API, then clean the
# tombstones so the data is physically removed.
# Usage: _reset_prometheus_db_online [END]
#   END - value of the "end" parameter of delete_series (Unix timestamp in seconds).
#         Defaults to "now - 30 minutes".
# NOTE(review): the admin API has to be enabled on the Prometheus server
# (--web.enable-admin-api) - confirm for this deployment.
#
# Quoting note: "$${CANDC}" is rendered by Terraform as "${CANDC}" and must sit
# inside double quotes so that the shell expands it. The rest of each URL stays
# in single quotes to protect the brackets, braces and inner double quotes.
_reset_prometheus_db_online() {
  # Mark all node_exporter data for deletion
  #curl -s -X POST -g "http://$${CANDC}:9090"'/api/v1/admin/tsdb/delete_series?match[]={job="node_exporter"}'
  # Mark all malachite data for deletion
  #curl -s -X POST -g "http://$${CANDC}:9090"'/api/v1/admin/tsdb/delete_series?match[]={job="malachite"}'
  # Mark all data for deletion
  #curl -s -X POST -g "http://$${CANDC}:9090"'/api/v1/admin/tsdb/delete_series?match[]={__name__=~".+"}'
  # Set end of deletion frame to "now - 30 minutes".
  END="$1"
  if [ -z "$END" ]; then
    END="$(($(date +%s) - 60 * 30))"
  fi
  # Mark node_exporter data for deletion
  curl -s -X POST -g "http://$${CANDC}:9090"'/api/v1/admin/tsdb/delete_series?match[]={job="node_exporter"}&end='"$END"
  # Mark malachite data for deletion
  curl -s -X POST -g "http://$${CANDC}:9090"'/api/v1/admin/tsdb/delete_series?match[]={job="malachite"}&end='"$END"
  # Physically delete data
  curl -s -X POST -g "http://$${CANDC}:9090"'/api/v1/admin/tsdb/clean_tombstones'
}

_reset_elastic_db() {
Expand All @@ -198,17 +237,17 @@ _reset_elastic_db() {
fi
}

_reset_elastic_db2() {
_reset_elastic_db_online() {
if [ -f /root/docker-elk/.env ]; then
source /root/docker-elk/.env
fi
if [ -z "$ELASTIC_PASSWORD" ]; then
echo "Please set the ELASTIC_PASSWORD environment variable."
else
INDEX="$(curl -s -X GET --user "elastic:$ELASTIC_PASSWORD" http://${cc.ip}:9200/_cat/indices/*docker* | cut -d\ -f3)"
INDEX="$(curl -s -X GET --user "elastic:$ELASTIC_PASSWORD" http://$${CANDC}:9200/_cat/indices/*docker* | cut -d\ -f3)"
DS="$(echo $INDEX | sed -e 's/^.ds-logs-//' -e 's/-[^-]*$//')"
curl -X DELETE --user "elastic:$ELASTIC_PASSWORD" "http://${cc.ip}:9200/_data_stream/$DS"
curl -X DELETE --user "elastic:$ELASTIC_PASSWORD" "http://${cc.ip}:9200/$INDEX"
curl -X DELETE --user "elastic:$ELASTIC_PASSWORD" "http://$${CANDC}:9200/_data_stream/$DS"
curl -X DELETE --user "elastic:$ELASTIC_PASSWORD" "http://$${CANDC}:9200/$INDEX"
fi
}

Expand All @@ -221,14 +260,6 @@ _keyscan_cc() {
ssh-keyscan -t ed25519 "$CANDC" >> "$HOME/.ssh/known_hosts"
}

_keyscan_servers() {
_keyscan_cc 2> /dev/null
%{~ for n in nodes }
ssh-keygen -R "${n.ip}" > /dev/null
ssh-keyscan -t ed25519 "${n.ip}" >> "$HOME/.ssh/known_hosts" 2> /dev/null
%{~ endfor }
}

_compose_persistent_peers() {
skip=$${1:-10000}
port=$${2:-27000}
Expand Down Expand Up @@ -302,12 +333,6 @@ _parse_multiple_hosts() {
echo "$PSSH_X"
}

_print_hosts() {
cat <<EOF
${etc-hosts}
EOF
}

fetch_dashboard() {
FROM="$${1:-now-15m}"
TO="$${2:-now}"
Expand Down
6 changes: 6 additions & 0 deletions qa/terraform/user-data/user-data.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ network:
search:
- testnet
apt:
conf: |
Acquire::Retries "3";
Acquire::https::Timeout "5";
Acquire::http::Timeout "5";
APT::Get::Fix-Broken "true";
DPkg::Lock::Timeout "60";
sources:
source1:
source: "deb https://download.docker.com/linux/debian $RELEASE stable"
Expand Down
6 changes: 6 additions & 0 deletions qa/terraform/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ variable "ssh_keys" {
type = list(string)
}

# The project name in Digital Ocean.
# Referenced as var.project_name by the digitalocean_project resource and used
# as the suffix of the generated SSH key name ("autossh-project-<name>").
variable "project_name" {
  description = "Name of the DigitalOcean project; also used as the suffix of the generated SSH key name."
  type        = string
  default     = "malachite-testnet"
}

# Regions and number of servers to deploy there
# Regions list: https://docs.digitalocean.com/platform/regional-availability/
# ams3 - Amsterdam
Expand Down
2 changes: 1 addition & 1 deletion qa/viewer/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
FILE?=prometheus.tgz

extract:
extract: stop
rm -rf data-prometheus
mkdir data-prometheus
tar -xvzf $(FILE) -C data-prometheus
Expand Down
Loading

0 comments on commit ce8edbe

Please sign in to comment.