Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/regression.sh
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,34 @@ if [ "$(echo "$PWD"/.bin/*)" != "${PWD}/.bin/*" ]; then
echo
fi

# Background helper: append a snapshot of CPU, memory, and disk usage to
# monitor.log every 10 minutes until the process is killed.
monitor_system() {
  # Start each run with an empty log file.
  : > monitor.log

  while :; do
    {
      printf '===== %s =====\n' "$(date)"
      printf -- '--- CPU ---\n'
      top -b -n1 | head -5
      printf -- '--- MEM ---\n'
      free -h
      printf -- '--- DISK ---\n'
      df -h .
      printf '\n'
    } >> monitor.log

    # Wait 10 minutes between snapshots.
    sleep 600
  done
}

# start monitor in background
monitor_system &
MON_PID=$!

# ensure cleanup on ANY exit (success, error, Ctrl-C, set -e, etc.)
# shellcheck disable=SC2064
trap "echo 'Stopping monitor'; kill $MON_PID 2>/dev/null || true" EXIT

# Run tests and generate report

# shellcheck disable=SC2046,SC2119
Expand Down
24 changes: 17 additions & 7 deletions .github/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# DESELECT_FROM_FILE: path to file with tests to deselect
# CLUSTERS_COUNT: number of local testnet clusters to launch
# FORBID_RESTART: if set to 1, do not restart clusters between tests
# SESSION_TIMEOUT: overall timeout for the test session (e.g. 10800 for 3 hours)
#
# Notes:
# - If PYTEST_ARGS is provided, we disable cleanup and the initial "skip all" pass.
Expand Down Expand Up @@ -51,9 +52,15 @@ All targets respect the same env vars as the original Makefile.
EOF
}

pytest_w_echo() {
echo "Running: PYTEST_ADDOPTS='${PYTEST_ADDOPTS:-}' pytest $*"
pytest "$@"
# Run pytest, echoing the exact command (and PYTEST_ADDOPTS) first.
# When SESSION_TIMEOUT is set, wrap pytest in coreutils `timeout` so the
# whole test session is bounded.
run_pytest() {
  if [ -z "${SESSION_TIMEOUT:-}" ]; then
    echo "Running: PYTEST_ADDOPTS='${PYTEST_ADDOPTS:-}' pytest $*"
    pytest "$@"
    return
  fi

  # SIGINT lets pytest shut down and report gracefully; --kill-after=0
  # appears to disable the follow-up SIGKILL — confirm against coreutils docs.
  local -a timeout_opts=( "--signal=INT" "--kill-after=0" "$SESSION_TIMEOUT" )
  echo "Running: PYTEST_ADDOPTS='${PYTEST_ADDOPTS:-}' timeout ${timeout_opts[*]} pytest $*"
  timeout "${timeout_opts[@]}" pytest "$@"
}

ensure_dirs() {
Expand Down Expand Up @@ -133,7 +140,7 @@ initial_skip_pass() {
}

run_real_tests() {
pytest_w_echo \
run_pytest \
"$TESTS_DIR" \
"${MARKEXPR_ARR[@]}" \
"${DESELECT_FROM_FILE_ARR[@]}" \
Expand All @@ -157,41 +164,44 @@ ensure_markexpr_default() {
# Default target: full regression run with a 3-hour session timeout.
target_tests() {
# Make db-sync abort on panic unless the caller already set it.
export DbSyncAbortOnPanic="${DbSyncAbortOnPanic:-1}"
TEST_THREADS="${TEST_THREADS:-20}"
# Overall session timeout in seconds (10800 = 3 hours) unless overridden.
SESSION_TIMEOUT="${SESSION_TIMEOUT:-10800}"

ensure_dirs
set_common_env
compute_common_args
cleanup_previous_run
initial_skip_pass
# NOTE(review): both invocations below appear in this capture — they look like
# the pre-change and post-change lines of a diff; presumably only the
# --timeout variant is intended. Confirm against the actual script.
run_real_tests "$@"
run_real_tests --timeout=7200 "$@"
}

# PR target: smoke tests against several local clusters, 45-minute session cap.
target_testpr() {
export TESTPR=1
# Launch 5 local testnet clusters unless the caller overrides the count.
export CLUSTERS_COUNT="${CLUSTERS_COUNT:-5}"
TEST_THREADS="${TEST_THREADS:-20}"
# Overall session timeout in seconds (2700 = 45 minutes) unless overridden.
SESSION_TIMEOUT="${SESSION_TIMEOUT:-2700}"
# Default MARKEXPR to "smoke" when none was provided.
ensure_markexpr_default "smoke"

ensure_dirs
set_common_env
compute_common_args
cleanup_previous_run
initial_skip_pass
# NOTE(review): both invocations below appear in this capture — they look like
# the pre-change and post-change lines of a diff; presumably only the
# --timeout variant is intended. Confirm against the actual script.
run_real_tests "$@"
run_real_tests --timeout=1200 "$@"
}

# Testnets target: a single long-lived cluster that is never restarted,
# with a 20-hour session cap.
target_testnets() {
export CLUSTERS_COUNT=1
# Do not restart the cluster between tests.
export FORBID_RESTART=1
TEST_THREADS="${TEST_THREADS:-15}"
# Overall session timeout in seconds (72000 = 20 hours) unless overridden.
SESSION_TIMEOUT="${SESSION_TIMEOUT:-72000}"
# Default MARKEXPR to "testnets" when none was provided.
ensure_markexpr_default "testnets"

ensure_dirs
set_common_env
compute_common_args
cleanup_previous_run
initial_skip_pass
# NOTE(review): both invocations below appear in this capture — they look like
# the pre-change and post-change lines of a diff; presumably only the
# --timeout variant is intended. Confirm against the actual script.
run_real_tests "$@"
run_real_tests --timeout=7200 "$@"
}

# Dispatch
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/regression_reusable.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ jobs:
testrun-report.xml
deselected_tests.txt
requirements_coverage.json
monitor.log
- name: ↟ Upload CLI coverage
uses: actions/upload-artifact@v5
if: success() || failure()
Expand Down
69 changes: 61 additions & 8 deletions cardano_node_tests/cluster_management/cluster_getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,15 @@ def __init__(
self.pytest_tmp_dir = temptools.get_pytest_root_tmp()
self.cluster_lock = common.get_cluster_lock_file()

# Soft timeout (seconds): applies when no cluster is selected.
self.grace_period_soft = 3600
# Hard timeout (seconds): always applies, regardless of cluster selection.
self.grace_period_hard = 7200
# Time window (seconds) before deadline when stricter dead cluster checks apply.
self.strict_check_window = 1200
# Maximum allowed fraction of dead clusters during strict check window.
self.strict_dead_fraction = 0.51

self._cluster_instance_num = -1

@property
Expand Down Expand Up @@ -564,13 +573,42 @@ def _marked_select_instance(self, cget_status: _ClusterGetStatus) -> bool:
# If here, this will be the first test with the mark
return True

def _fail_on_all_dead(self) -> None:
"""Fail if all cluster instances are dead."""
dead_clusters = status_files.list_cluster_dead_files()
if len(dead_clusters) == self.num_of_instances:
msg = "All clusters are dead, cannot run."
def _check_dead_fraction(self, max_dead_fraction: float) -> None:
    """Fail if the fraction of dead cluster instances is too high.

    Raises `ValueError` when no instances are configured, and
    `RuntimeError` when the dead fraction reaches `max_dead_fraction`.
    """
    instances_total = self.num_of_instances
    if instances_total == 0:
        msg = "Number of cluster instances must be greater than 0."
        raise ValueError(msg)

    num_dead = len(status_files.list_cluster_dead_files())
    frac_dead = num_dead / instances_total

    # Below the allowed threshold — nothing to report.
    if frac_dead < max_dead_fraction:
        return

    if num_dead == instances_total:
        msg = "All cluster instances are dead."
    else:
        msg = (
            "Too many cluster instances are dead: "
            f"{num_dead} out of {instances_total} "
            f"({frac_dead:.0%} dead, "
            f"maximum allowed: {max_dead_fraction:.0%})."
        )
    raise RuntimeError(msg)

def _fail_on_dead_clusters(self, remaining_time_sec: float) -> None:
    """Fail based on how many cluster instances are dead and time left.

    Use a stricter failure threshold as we approach the deadline.
    If we've been waiting a long time and too many cluster instances are dead,
    it's better to fail than continue trying with too few usable instances.
    """
    # Outside the strict window only an all-dead state (fraction 1.0) is
    # fatal; inside it the configured stricter fraction applies.
    near_deadline = remaining_time_sec <= self.strict_check_window
    threshold = self.strict_dead_fraction if near_deadline else 1.0
    self._check_dead_fraction(threshold)

def _cleanup_dead_clusters(self, cget_status: _ClusterGetStatus) -> None:
"""Cleanup if the selected cluster instance failed to start."""
# Move on to other cluster instance
Expand Down Expand Up @@ -805,8 +843,24 @@ def get_cluster_instance( # noqa: C901

self.log(f"want to run test '{cget_status.current_test}'")

# Iterate until it is possible to start the test
# Iterate until it is possible to start the test. Timeout after grace period.
now = time.monotonic()
deadline_soft = now + self.grace_period_soft
deadline_hard = now + self.grace_period_hard
while True:
now = time.monotonic()
remaining_soft = deadline_soft - now
remaining_hard = deadline_hard - now

# Timeout after soft grace period if no cluster instance was selected yet
if cget_status.selected_instance == -1 and remaining_soft <= 0:
msg = "Timeout (soft) while waiting to obtain cluster instance."
raise TimeoutError(msg)
# Timeout after hard grace period even if cluster instance was already selected
if remaining_hard <= 0:
msg = "Timeout (hard) while waiting to obtain cluster instance."
raise TimeoutError(msg)

if cget_status.respin_ready:
self._respin(scriptsdir=scriptsdir)

Expand All @@ -819,8 +873,7 @@ def get_cluster_instance( # noqa: C901
if self._is_already_running():
return self.cluster_instance_num

# Fail if all cluster instances are dead
self._fail_on_all_dead()
self._fail_on_dead_clusters(remaining_time_sec=remaining_soft)

if mark:
# Check if tests with my mark are already locked to any cluster instance
Expand Down
18 changes: 16 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ PyYAML = "^6.0.2"
requests = "^2.32.4"
pytest-subtests = "^0.14.2"
cardonnay = "^0.2.8"
pytest-timeout = "^2.4.0"

[tool.poetry.group.dev]
optional = true
Expand Down
Loading