Merge pull request #3034 from IntersectMBO/add_MAX_TESTS_PER_CLUSTER

mkoura · web-flow · commit 20288a467b2e · 2025-05-08T14:27:18.000+02:00
feat(cluster): enforce max tests per cluster instance
diff --git a/.github/regression.sh b/.github/regression.sh
@@ -70,6 +70,12 @@ elif [ "$TX_ERA" = "default" ]; then
   export TX_ERA=""
 fi
 
+# Decrease the number of tests per cluster if we are using the "disk" (LMDB) UTxO backend to avoid
+# having too many concurrent readers.
+if [ -z "${MAX_TESTS_PER_CLUSTER:-""}" ] && [ "${UTXO_BACKEND:-""}" = "disk" ]; then
+  export MAX_TESTS_PER_CLUSTER=5
+fi
+
 if [ -n "${BOOTSTRAP_DIR:-""}" ]; then
   :  # don't touch `SCRIPTS_DIRNAME` when running on testnet
 elif [ "${CI_BYRON_CLUSTER:-"false"}" != "false" ]; then
diff --git a/README.md b/README.md
@@ -108,6 +108,7 @@ Test execution can be configured using environment variables.
 * `PYTEST_ARGS` – specifies additional arguments for pytest (default: unset).
 * `MARKEXPR` – specifies marker expression for pytest (default: unset).
 * `TEST_THREADS` – specifies the number of pytest workers (default: 20).
+* `MAX_TESTS_PER_CLUSTER` – specifies the maximum number of tests that can be run on a single cluster instance (default: 8).
 * `CLUSTERS_COUNT` – number of cluster instances that will be started (default: 9).
 * `CLUSTER_ERA` – cluster era for Cardano node – used for selecting the correct cluster start script (default: conway).
 * `COMMAND_ERA` – era for cardano-cli commands – can be used for creating Shelley-era (Allegra-era, ...) transactions (default: unset).
diff --git a/cardano_node_tests/cluster_management/cluster_getter.py b/cardano_node_tests/cluster_management/cluster_getter.py
@@ -541,7 +541,7 @@ def _marked_select_instance(self, cget_status: _ClusterGetStatus) -> bool:
         if cget_status.marked_running_my_anywhere:
             self.log(
                 f"c{cget_status.instance_num}: tests marked with my mark '{cget_status.mark}' "
-                "already running on other cluster instance, cannot run"
+                "already running on other cluster instance, cannot start"
             )
             return False
 
@@ -832,9 +832,7 @@ def get_cluster_instance(  # noqa: C901
                     cget_status.instance_dir.mkdir(exist_ok=True)
 
                     # Cleanup cluster instance where attempt to start cluster failed repeatedly
-                    if status_files.get_cluster_dead_file(
-                        instance_num=cget_status.instance_num
-                    ).exists():
+                    if status_files.get_cluster_dead_file(instance_num=instance_num).exists():
                         self._cleanup_dead_clusters(cget_status)
                         continue
 
@@ -845,19 +843,30 @@ def get_cluster_instance(  # noqa: C901
 
                     # Are there tests already running on this cluster instance?
                     cget_status.started_tests_sfiles = status_files.list_test_running_files(
-                        instance_num=cget_status.instance_num
+                        instance_num=instance_num
                     )
 
                     # "marked tests" = group of tests marked with my mark
                     cget_status.marked_ready_sfiles = status_files.list_curr_mark_files(
-                        instance_num=cget_status.instance_num, mark=mark
+                        instance_num=instance_num, mark=mark
                     )
 
                     # If marked tests are already running, update their status
                     self._update_marked_tests(
                         marked_tests_cache=marked_tests_cache, cget_status=cget_status
                     )
 
+                    # If there would be more tests running on this cluster instance than allowed,
+                    # we need to wait.
+                    if (
+                        self.num_of_instances > 1
+                        and (tnum := len(cget_status.started_tests_sfiles))
+                        >= configuration.MAX_TESTS_PER_CLUSTER
+                    ):
+                        cget_status.sleep_delay = 2
+                        self.log(f"c{instance_num}: {tnum} tests are already running, cannot start")
+                        continue
+
                     # Does the cluster instance needs respin to continue?
                     # Cache the result as the check itself can be expensive.
                     cget_status.cluster_needs_respin = self._cluster_needs_respin(instance_num)
diff --git a/cardano_node_tests/utils/configuration.py b/cardano_node_tests/utils/configuration.py
@@ -59,6 +59,7 @@
 
 CLUSTERS_COUNT = int(os.environ.get("CLUSTERS_COUNT") or 0)
 WORKERS_COUNT = int(os.environ.get("PYTEST_XDIST_WORKER_COUNT") or 1)
+MAX_TESTS_PER_CLUSTER = int(os.environ.get("MAX_TESTS_PER_CLUSTER") or 8)
 CLUSTERS_COUNT = int(CLUSTERS_COUNT or (min(WORKERS_COUNT, 9)))
 
 DEV_CLUSTER_RUNNING = bool(os.environ.get("DEV_CLUSTER_RUNNING"))