Commit

Merge branch 'develop' of https://github.com/Libensemble/libensemble into develop
jmlarson1 committed Feb 20, 2025
2 parents f1fd08f + 394638d commit 4c103d9
Showing 12 changed files with 221 additions and 150 deletions.
65 changes: 32 additions & 33 deletions .github/workflows/extra.yml
@@ -11,43 +11,43 @@ jobs:
matrix:
os: [ubuntu-latest]
mpi-version: [mpich]
python-version: ["3.10", "3.11", "3.12"]
pydantic-version: ["2.8.2"]
python-version: ['3.10', '3.11', '3.12', '3.13']
pydantic-version: ['2.8.2']
comms-type: [m, l]
include:
- os: macos-latest
python-version: 3.11
python-version: '3.13'
mpi-version: mpich
pydantic-version: "2.8.2"
pydantic-version: '2.8.2'
comms-type: m
- os: macos-latest
python-version: 3.11
python-version: '3.13'
mpi-version: mpich
pydantic-version: "2.8.2"
pydantic-version: '2.8.2'
comms-type: l
- os: ubuntu-latest
python-version: "3.10"
python-version: '3.12'
mpi-version: mpich
pydantic-version: "2.8.2"
pydantic-version: '2.8.2'
comms-type: t
- os: ubuntu-latest
mpi-version: "openmpi"
pydantic-version: "2.8.2"
python-version: "3.12"
mpi-version: 'openmpi'
pydantic-version: '2.8.2'
python-version: '3.12'
comms-type: l
- os: ubuntu-latest
mpi-version: mpich
python-version: "3.10"
pydantic-version: "1.10.17"
python-version: '3.12'
pydantic-version: '1.10.17'
comms-type: m
- os: ubuntu-latest
mpi-version: mpich
python-version: "3.10"
pydantic-version: "1.10.17"
python-version: '3.12'
pydantic-version: '1.10.17'
comms-type: l

env:
HYDRA_LAUNCHER: "fork"
HYDRA_LAUNCHER: 'fork'
TERM: xterm-256color
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

@@ -61,7 +61,7 @@ jobs:
uses: conda-incubator/setup-miniconda@v3
with:
activate-environment: condaenv
miniconda-version: "latest"
miniconda-version: 'latest'
python-version: ${{ matrix.python-version }}
channels: conda-forge
channel-priority: flexible
@@ -75,8 +75,8 @@ jobs:
- name: Install Ubuntu compilers
if: matrix.os == 'ubuntu-latest'
run: |
conda install gcc_linux-64
pip install nlopt==2.9.0
conda install gcc_linux-64
pip install nlopt==2.9.0
# Roundabout solution on macos for proper linking with mpicc
- name: Install macOS compilers
@@ -93,22 +93,22 @@
run: |
conda env update --file install/gen_deps_environment.yml
- name: Install gpcam
if: matrix.python-version <= '3.13'
- name: Install gpcam and octave # Neither yet supports 3.13
if: matrix.python-version <= '3.12'
run: |
pip install gpcam
conda install octave
- name: Install surmise
- name: Install surmise and Tasmanian
if: matrix.os == 'ubuntu-latest'
run: |
pip install --upgrade git+https://github.com/bandframework/surmise.git
pip install Tasmanian --user
- name: Install generator dependencies for Ubuntu tests
if: matrix.os == 'ubuntu-latest' && matrix.python-version != '3.12'
if: matrix.os == 'ubuntu-latest' && matrix.python-version <= '3.12'
run: |
sudo apt-get install bc
pip install -r install/ubuntu_no312.txt
pip install Tasmanian --user
pip install scikit-build packaging
- name: Install Balsam on Pydantic 1
if: matrix.pydantic-version == '1.10.17'
@@ -120,24 +120,23 @@
- name: Install other testing dependencies
run: |
conda install octave
pip install -r install/testing_requirements.txt
pip install -r install/misc_feature_requirements.txt
source install/install_ibcdfo.sh
conda install numpy scipy
- name: Install libEnsemble, flake8, lock environment
run: |
pip install pydantic==${{ matrix.pydantic-version }}
pip install -e .
flake8 libensemble
- name: Remove test for persistent Tasmanian, Surmise on Python 3.12
if: matrix.python-version >= '3.12'
- name: Remove test using octave, gpcam on Python 3.13
if: matrix.python-version >= '3.13'
run: |
rm ./libensemble/tests/regression_tests/test_persistent_tasmanian.py
rm ./libensemble/tests/regression_tests/test_persistent_tasmanian_async.py
rm ./libensemble/tests/regression_tests/test_persistent_surmise_calib.py
rm ./libensemble/tests/regression_tests/test_persistent_surmise_killsims.py
rm ./libensemble/tests/regression_tests/test_persistent_fd_param_finder.py # needs octave, which doesn't yet support 3.13
rm ./libensemble/tests/regression_tests/test_persistent_aposmm_external_localopt.py # needs octave, which doesn't yet support 3.13
rm ./libensemble/tests/regression_tests/test_gpCAM.py # needs gpcam, which doesn't build on 3.13
- name: Install redis/proxystore on Pydantic 2
if: matrix.pydantic-version == '2.8.2'
89 changes: 69 additions & 20 deletions docs/platforms/aurora.rst
@@ -12,10 +12,16 @@ nodes.
Configuring Python and Installation
-----------------------------------

To obtain Python use::
To obtain Python and create a virtual environment:

.. code-block:: console
module use /soft/modulefiles
module load frameworks
python -m venv /path/to-venv --system-site-packages
. /path/to-venv/bin/activate
where ``/path/to-venv`` can be anywhere you have write access. For future sessions,
just load the frameworks module and run the activate line.

To obtain libEnsemble::

@@ -31,7 +37,7 @@ To run the :doc:`forces_gpu<../tutorials/forces_gpu_tutorial>` tutorial on
Aurora.

To obtain the example you can git clone libEnsemble - although only
the forces sub-directory is needed::
the ``forces`` sub-directory is strictly needed::

git clone https://github.com/Libensemble/libensemble
cd libensemble/libensemble/tests/scaling_tests/forces/forces_app
@@ -44,40 +50,57 @@ Now go to forces_gpu directory::

cd ../forces_gpu

To make use of all available GPUs, open ``run_libe_forces.py`` and adjust
the exit_criteria to do more simulations. The following will do two
simulations for each worker::
To make use of all available GPUs, open **run_libe_forces.py** and adjust
the ``exit_criteria`` to perform more simulations. The following will run two
simulations for each worker:

.. code-block:: python
# Instruct libEnsemble to exit after this many simulations
ensemble.exit_criteria = ExitCriteria(sim_max=nsim_workers*2)
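For context, a minimal sketch of how these pieces might fit together in the script (the import and the ``- 1`` for the persistent generator worker are assumptions; the actual script may differ):

.. code-block:: python

    from libensemble.specs import ExitCriteria

    # Assumed: one worker is reserved for the persistent generator,
    # so the remaining workers run simulations
    nsim_workers = ensemble.nworkers - 1

    # Stop after two simulations per simulation worker
    ensemble.exit_criteria = ExitCriteria(sim_max=nsim_workers * 2)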
Now grab an interactive session on two nodes (or use the batch script at
``../submission_scripts/submit_pbs_aurora.sh``)::

qsub -A <myproject> -l select=2 -l walltime=15:00 -lfilesystems=home -q EarlyAppAccess -I
qsub -A <myproject> -l select=2 -l walltime=15:00 -lfilesystems=home:flare -q debug -I

Once in the interactive session, you may need to reload the frameworks module::

cd $PBS_O_WORKDIR
module use /soft/modulefiles
module load frameworks
. /path/to-venv/bin/activate

Then in the session run::

python run_libe_forces.py --comms local --nworkers 13
python run_libe_forces.py -n 13

This provides twelve workers for running simulations (one for each GPU across
two nodes). An extra worker is added to run the persistent generator. The
GPU settings for each worker simulation are printed.

Looking at ``libE_stats.txt`` will provide a summary of the runs.

Now try running::

./cleanup.sh
python run_libe_forces.py -n 7

You will see that two cores and two GPUs are used per worker. The **forces**
example automatically uses the GPUs available to each worker.

Live viewing GPU usage
----------------------

To see GPU usage, open another window, SSH into one of your compute nodes, and run::

module load xpu-smi
watch -n 0.1 xpu-smi dump -d -1 -m 0 -n 1

Using tiles as GPUs
-------------------

If you wish to treat each tile as its own GPU, then add the *libE_specs*
option ``use_tiles_as_gpus=True``, so the *libE_specs* block of
``run_libe_forces.py`` becomes:
To treat each tile as its own GPU, add the ``use_tiles_as_gpus=True`` option
to the ``libE_specs`` block in **run_libe_forces.py**:

.. code-block:: python
@@ -90,19 +113,45 @@ option ``use_tiles_as_gpus=True``, so the *libE_specs* block of
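The body of that block is collapsed in this diff view; a minimal sketch, assuming only the new flag is added and any options already set in the script are kept unchanged:

.. code-block:: python

    from libensemble.specs import LibeSpecs

    # Only the new flag is shown here; other options already present in
    # run_libe_forces.py would remain as they are
    ensemble.libE_specs = LibeSpecs(
        use_tiles_as_gpus=True,
    )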
Now you can run again but with twice the workers for running simulations (each
will use one GPU tile)::

python run_libe_forces.py --comms local --nworkers 25
python run_libe_forces.py -n 25


Running generator on the manager
--------------------------------

An alternative is to run the generator on a thread on the manager. The
number of workers can then be set to the number of simulation workers.

Change the ``libE_specs`` in **run_libe_forces.py** as follows:

.. code-block:: python
nsim_workers = ensemble.nworkers
# Persistent gen does not need resources
ensemble.libE_specs = LibeSpecs(
gen_on_manager=True,
then we can run with 12 (instead of 13) workers::
python run_libe_forces.py -n 12
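The ``LibeSpecs(...)`` call above is truncated in this diff view; a minimal completed sketch, assuming no other options are set:

.. code-block:: python

    from libensemble.specs import LibeSpecs

    nsim_workers = ensemble.nworkers

    # The persistent generator runs on a thread on the manager,
    # so no worker resources are reserved for it
    ensemble.libE_specs = LibeSpecs(
        gen_on_manager=True,
    )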
Dynamic resource assignment
---------------------------
Note that the *forces* example will automatically use the GPUs available to
each worker (with one MPI rank per GPU), so if fewer workers are provided,
more than one GPU will be used per simulation.
In the **forces** directory you will also find:
Also see ``forces_gpu_var_resources`` and ``forces_multi_app`` examples for
cases that use varying processor/GPU counts per simulation.
* ``forces_gpu_var_resources`` uses varying processor/GPU counts per simulation.
* ``forces_multi_app`` uses varying processor/GPU counts per simulation and also
uses two different user executables, one which is CPU-only and one which
uses GPUs. This allows highly efficient use of nodes for multi-application
ensembles.
Demonstration
-------------
Note that a video demonstration_ of the *forces_gpu* example on *Frontier*
Note that a video demonstration_ of the *forces_gpu* example on **Frontier**
is also available. The workflow is identical when running on Aurora, with the
exception of different compiler options and numbers of workers (because the
number of GPUs on a node differs).
4 changes: 0 additions & 4 deletions install/ubuntu_no312.txt

This file was deleted.

34 changes: 20 additions & 14 deletions libensemble/executors/mpi_executor.py
@@ -47,29 +47,32 @@ class MPIExecutor(Executor):
information using the ``custom_info`` argument. This takes
a dictionary of values.
The allowable fields are::
The allowable fields are:
'mpi_runner' [string]:
Select runner: 'mpich', 'openmpi', 'aprun', 'srun', 'jsrun', 'custom'
All except 'custom' relate to runner classes in libEnsemble.
.. parsed-literal::
**'mpi_runner'** [string]:
Select runner: `'mpich'`, `'openmpi'`, `'aprun'`, `'srun'`, `'jsrun'`, `'custom'`
All except `'custom'` relate to runner classes in libEnsemble.
Custom allows user to define their own run-lines but without parsing
arguments or making use of auto-resources.
'runner_name' [string]:
Runner name: Replaces run command if present. All runners have a default
except for 'custom'.
'subgroup_launch' [bool]:
**'runner_name'** [string]:
The literal string that appears at the front of the run command.
This is typically 'mpirun', 'srun', etc., and can be a full path.
Defaults exist for all runners except 'custom'.
**'subgroup_launch'** [bool]:
Whether MPI runs should be initiated in a new process group. This needs
to be correct for kills to work correctly. Use the standalone test at
libensemble/tests/standalone_tests/kill_test to determine correct value
`libensemble/tests/standalone_tests/kill_test` to determine correct value
for a system.
For example::
For example::
customizer = {'mpi_runner': 'mpich',
'runner_name': 'wrapper -x mpich'}
customizer = {'mpi_runner': 'mpich',
'runner_name': 'wrapper -x mpich'}
from libensemble.executors.mpi_executor import MPIExecutor
exctr = MPIExecutor(custom_info=customizer)
from libensemble.executors.mpi_executor import MPIExecutor
exctr = MPIExecutor(custom_info=customizer)
"""
@@ -336,6 +339,9 @@ def submit(
else:
mpi_runner_obj = self.mpi_runner_obj or self._create_mpi_runner_from_attr()

if env_script is None and mpi_runner_obj is None:
raise ExecutorException("No valid MPI runner was found")

mpi_specs = mpi_runner_obj.get_mpi_specs(
task,
num_procs,
12 changes: 7 additions & 5 deletions libensemble/executors/mpi_runner.py
@@ -21,11 +21,13 @@ def get_runner(mpi_runner_type, runner_name=None, platform_info=None):
"msmpi": MSMPI_MPIRunner,
"custom": MPIRunner,
}
mpi_runner = mpi_runners[mpi_runner_type]
if runner_name is not None:
runner = mpi_runner(run_command=runner_name, platform_info=platform_info)
else:
runner = mpi_runner(platform_info=platform_info)
runner = None
if mpi_runner_type is not None:
mpi_runner = mpi_runners[mpi_runner_type]
if runner_name is not None:
runner = mpi_runner(run_command=runner_name, platform_info=platform_info)
else:
runner = mpi_runner(platform_info=platform_info)
return runner

def __init__(self, run_command="mpiexec", platform_info=None):
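Taken together with the new check in ``mpi_executor.py`` above, the intended behavior is roughly the following (an illustrative sketch, not verbatim library code; the ``ExecutorException`` import location is an assumption):

from libensemble.executors.executor import ExecutorException

env_script = None       # hypothetical: no environment-setup script was supplied
mpi_runner_obj = None   # get_runner(None) now returns None instead of raising a KeyError

if env_script is None and mpi_runner_obj is None:
    raise ExecutorException("No valid MPI runner was found")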
2 changes: 1 addition & 1 deletion libensemble/gen_funcs/persistent_ax_multitask.py
@@ -305,7 +305,7 @@ def persistent_gp_mt_ax_gen_f(H, persis_info, gen_specs, libE_info):
# Increase iteration counter.
model_iteration += 1

return [], persis_info, FINISHED_PERSISTENT_GEN_TAG
return None, persis_info, FINISHED_PERSISTENT_GEN_TAG


class AxRunner(Runner):