Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/regression.sh
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,34 @@ if [ "$(echo "$PWD"/.bin/*)" != "${PWD}/.bin/*" ]; then
echo
fi

# Background helper: append a snapshot of CPU, memory, and disk usage to
# monitor.log every 10 minutes until the process is killed.
monitor_system() {
  # Start each run with an empty log file.
  : > monitor.log

  while :; do
    {
      printf '===== %s =====\n' "$(date)"
      printf -- '--- CPU ---\n'
      top -b -n1 | head -5
      printf -- '--- MEM ---\n'
      free -h
      printf -- '--- DISK ---\n'
      df -h .
      printf '\n'
    } >> monitor.log

    # Wait 10 minutes between snapshots.
    sleep 600
  done
}

# start monitor in background
monitor_system &
MON_PID=$!

# ensure cleanup on ANY exit (success, error, Ctrl-C, set -e, etc.)
# shellcheck disable=SC2064
trap "echo 'Stopping monitor'; kill $MON_PID 2>/dev/null || true" EXIT

# Run tests and generate report

# shellcheck disable=SC2046,SC2119
Expand Down
24 changes: 17 additions & 7 deletions .github/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# DESELECT_FROM_FILE: path to file with tests to deselect
# CLUSTERS_COUNT: number of local testnet clusters to launch
# FORBID_RESTART: if set to 1, do not restart clusters between tests
# SESSION_TIMEOUT: overall timeout for the test session (e.g. 10800 for 3 hours)
#
# Notes:
# - If PYTEST_ARGS is provided, we disable cleanup and the initial "skip all" pass.
Expand Down Expand Up @@ -51,9 +52,15 @@ All targets respect the same env vars as the original Makefile.
EOF
}

pytest_w_echo() {
echo "Running: PYTEST_ADDOPTS='${PYTEST_ADDOPTS:-}' pytest $*"
pytest "$@"
# Run pytest, echoing the exact command (and PYTEST_ADDOPTS) first.
# When SESSION_TIMEOUT is set, wrap pytest in coreutils `timeout` so the
# whole test session is bounded.
run_pytest() {
  if [ -z "${SESSION_TIMEOUT:-}" ]; then
    echo "Running: PYTEST_ADDOPTS='${PYTEST_ADDOPTS:-}' pytest $*"
    pytest "$@"
    return
  fi

  # SIGINT lets pytest shut down and report gracefully; --kill-after=0
  # appears to disable the follow-up SIGKILL — confirm against coreutils docs.
  local -a timeout_opts=( "--signal=INT" "--kill-after=0" "$SESSION_TIMEOUT" )
  echo "Running: PYTEST_ADDOPTS='${PYTEST_ADDOPTS:-}' timeout ${timeout_opts[*]} pytest $*"
  timeout "${timeout_opts[@]}" pytest "$@"
}

ensure_dirs() {
Expand Down Expand Up @@ -133,7 +140,7 @@ initial_skip_pass() {
}

run_real_tests() {
pytest_w_echo \
run_pytest \
"$TESTS_DIR" \
"${MARKEXPR_ARR[@]}" \
"${DESELECT_FROM_FILE_ARR[@]}" \
Expand All @@ -157,41 +164,44 @@ ensure_markexpr_default() {
# Default target: full regression run with a 3-hour session timeout.
target_tests() {
# Make db-sync abort on panic unless the caller already set it.
export DbSyncAbortOnPanic="${DbSyncAbortOnPanic:-1}"
TEST_THREADS="${TEST_THREADS:-20}"
# Overall session timeout in seconds (10800 = 3 hours) unless overridden.
SESSION_TIMEOUT="${SESSION_TIMEOUT:-10800}"

ensure_dirs
set_common_env
compute_common_args
cleanup_previous_run
initial_skip_pass
# NOTE(review): both invocations below appear in this capture — they look like
# the pre-change and post-change lines of a diff; presumably only the
# --timeout variant is intended. Confirm against the actual script.
run_real_tests "$@"
run_real_tests --timeout=7200 "$@"
}

# PR target: smoke tests against several local clusters, 45-minute session cap.
target_testpr() {
export TESTPR=1
# Launch 5 local testnet clusters unless the caller overrides the count.
export CLUSTERS_COUNT="${CLUSTERS_COUNT:-5}"
TEST_THREADS="${TEST_THREADS:-20}"
# Overall session timeout in seconds (2700 = 45 minutes) unless overridden.
SESSION_TIMEOUT="${SESSION_TIMEOUT:-2700}"
# Default MARKEXPR to "smoke" when none was provided.
ensure_markexpr_default "smoke"

ensure_dirs
set_common_env
compute_common_args
cleanup_previous_run
initial_skip_pass
# NOTE(review): both invocations below appear in this capture — they look like
# the pre-change and post-change lines of a diff; presumably only the
# --timeout variant is intended. Confirm against the actual script.
run_real_tests "$@"
run_real_tests --timeout=1200 "$@"
}

# Testnets target: a single long-lived cluster that is never restarted,
# with a 20-hour session cap.
target_testnets() {
export CLUSTERS_COUNT=1
# Do not restart the cluster between tests.
export FORBID_RESTART=1
TEST_THREADS="${TEST_THREADS:-15}"
# Overall session timeout in seconds (72000 = 20 hours) unless overridden.
SESSION_TIMEOUT="${SESSION_TIMEOUT:-72000}"
# Default MARKEXPR to "testnets" when none was provided.
ensure_markexpr_default "testnets"

ensure_dirs
set_common_env
compute_common_args
cleanup_previous_run
initial_skip_pass
# NOTE(review): both invocations below appear in this capture — they look like
# the pre-change and post-change lines of a diff; presumably only the
# --timeout variant is intended. Confirm against the actual script.
run_real_tests "$@"
run_real_tests --timeout=7200 "$@"
}

# Dispatch
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/regression_reusable.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ jobs:
testrun-report.xml
deselected_tests.txt
requirements_coverage.json
monitor.log
- name: ↟ Upload CLI coverage
uses: actions/upload-artifact@v5
if: success() || failure()
Expand Down
69 changes: 61 additions & 8 deletions cardano_node_tests/cluster_management/cluster_getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,15 @@ def __init__(
self.pytest_tmp_dir = temptools.get_pytest_root_tmp()
self.cluster_lock = common.get_cluster_lock_file()

# Soft timeout (seconds): applies when no cluster is selected.
self.grace_period_soft = 3600
# Hard timeout (seconds): always applies, regardless of cluster selection.
self.grace_period_hard = 7200
# Time window (seconds) before deadline when stricter dead cluster checks apply.
self.strict_check_window = 1200
# Maximum allowed fraction of dead clusters during strict check window.
self.strict_dead_fraction = 0.51

self._cluster_instance_num = -1

@property
Expand Down Expand Up @@ -564,13 +573,42 @@ def _marked_select_instance(self, cget_status: _ClusterGetStatus) -> bool:
# If here, this will be the first test with the mark
return True

def _fail_on_all_dead(self) -> None:
"""Fail if all cluster instances are dead."""
dead_clusters = status_files.list_cluster_dead_files()
if len(dead_clusters) == self.num_of_instances:
msg = "All clusters are dead, cannot run."
def _check_dead_fraction(self, max_dead_fraction: float) -> None:
    """Fail if the fraction of dead cluster instances is too high.

    Raises `ValueError` when no instances are configured, and
    `RuntimeError` when the dead fraction reaches `max_dead_fraction`.
    """
    instances_total = self.num_of_instances
    if instances_total == 0:
        msg = "Number of cluster instances must be greater than 0."
        raise ValueError(msg)

    num_dead = len(status_files.list_cluster_dead_files())
    frac_dead = num_dead / instances_total

    # Below the allowed threshold — nothing to report.
    if frac_dead < max_dead_fraction:
        return

    if num_dead == instances_total:
        msg = "All cluster instances are dead."
    else:
        msg = (
            "Too many cluster instances are dead: "
            f"{num_dead} out of {instances_total} "
            f"({frac_dead:.0%} dead, "
            f"maximum allowed: {max_dead_fraction:.0%})."
        )
    raise RuntimeError(msg)

def _fail_on_dead_clusters(self, remaining_time_sec: float) -> None:
    """Fail based on how many cluster instances are dead and time left.

    Use a stricter failure threshold as we approach the deadline.
    If we've been waiting a long time and too many cluster instances are dead,
    it's better to fail than continue trying with too few usable instances.
    """
    # Outside the strict window only an all-dead state (fraction 1.0) is
    # fatal; inside it the configured stricter fraction applies.
    near_deadline = remaining_time_sec <= self.strict_check_window
    threshold = self.strict_dead_fraction if near_deadline else 1.0
    self._check_dead_fraction(threshold)

def _cleanup_dead_clusters(self, cget_status: _ClusterGetStatus) -> None:
"""Cleanup if the selected cluster instance failed to start."""
# Move on to other cluster instance
Expand Down Expand Up @@ -805,8 +843,24 @@ def get_cluster_instance( # noqa: C901

self.log(f"want to run test '{cget_status.current_test}'")

# Iterate until it is possible to start the test
# Iterate until it is possible to start the test. Timeout after grace period.
now = time.monotonic()
deadline_soft = now + self.grace_period_soft
deadline_hard = now + self.grace_period_hard
while True:
now = time.monotonic()
remaining_soft = deadline_soft - now
remaining_hard = deadline_hard - now

# Timeout after soft grace period if no cluster instance was selected yet
if cget_status.selected_instance == -1 and remaining_soft <= 0:
msg = "Timeout (soft) while waiting to obtain cluster instance."
raise TimeoutError(msg)
# Timeout after hard grace period even if cluster instance was already selected
if remaining_hard <= 0:
msg = "Timeout (hard) while waiting to obtain cluster instance."
raise TimeoutError(msg)

if cget_status.respin_ready:
self._respin(scriptsdir=scriptsdir)

Expand All @@ -819,8 +873,7 @@ def get_cluster_instance( # noqa: C901
if self._is_already_running():
return self.cluster_instance_num

# Fail if all cluster instances are dead
self._fail_on_all_dead()
self._fail_on_dead_clusters(remaining_time_sec=remaining_soft)

if mark:
# Check if tests with my mark are already locked to any cluster instance
Expand Down
18 changes: 16 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ PyYAML = "^6.0.2"
requests = "^2.32.4"
pytest-subtests = "^0.14.2"
cardonnay = "^0.2.8"
pytest-timeout = "^2.4.0"

[tool.poetry.group.dev]
optional = true
Expand Down
Loading