
Commit 9f4ab8c

mayantaylor, ZwFink, and ritvikrao authored

CUDA Callback API (HAPI) (#282)

* ctypes and cython streams working
* cython prelim implementation working!
* change to support future
* hapi works via futures now
* cleanup and example
* make separate directory
* Make example numba
* Set hapiAddCudaCallback to charm
* flag for cuda build
* add hapi docs
* Update docs with build instructions
* fix flag
* add example command to readme
* fix readme

---------

Co-authored-by: Zane Fink <[email protected]>
Co-authored-by: Ritvik Rao <[email protected]>
1 parent fc4c3f5 commit 9f4ab8c

File tree

8 files changed: +189 -3 lines changed

- charm4py/charm.py
- charm4py/charmlib/ccharm.pxd
- charm4py/charmlib/charmlib_cython.pyx
- docs/gpus.rst
- docs/index.rst
- examples/cuda/hapi/README.md
- examples/cuda/hapi/hapi-cuda-callback.py
- setup.py

charm4py/charm.py (+5)

@@ -127,6 +127,7 @@ def __init__(self):
         self.CkChareSend = self.lib.CkChareSend
         self.CkGroupSend = self.lib.CkGroupSend
         self.CkArraySend = self.lib.CkArraySend
+        self.hapiAddCudaCallback = self.lib.hapiAddCudaCallback
         self.reducers = reduction.ReducerContainer(self)
         self.redMgr = reduction.ReductionManager(self, self.reducers)
         self.mainchareRegistered = False

@@ -933,6 +934,10 @@ def recordSendRecv(self, stats, size):
         stats[2] = max(size, stats[2])
         stats[3] += size
         stats[4] = size
+
+    # deposit value of one of the futures that was created on this PE
+    def _future_deposit_result(self, fid, result=None):
+        self.threadMgr.depositFuture(fid, result)

     def __printTable__(self, table, sep):
         col_width = [max(len(x) for x in col) for col in zip(*table)]

charm4py/charmlib/ccharm.pxd (+2)

@@ -70,6 +70,8 @@ cdef extern from "charm.h":
     void CkStartQDExt_SectionCallback(int sid_pe, int sid_cnt, int rootPE, int ep);
     void CcdCallFnAfter(void (*CcdVoidFn)(void *userParam,double curWallTime), void *arg, double msecs);

+    void CkHapiAddCallback(long stream, void (*cb)(void*, void*), int fid);
+
 cdef extern from "conv-header.h":
     ctypedef void (*CmiHandler)(void* )
     cdef const int CmiReservedHeaderSize

charm4py/charmlib/charmlib_cython.pyx (+9)

@@ -867,6 +867,15 @@ class CharmLib(object):
         cdef int replyLen = len(message_bytes)
         CcsSendReply(replyLen, <const void*>replyData)

+    def hapiAddCudaCallback(self, stream, future):
+        if not HAVE_CUDA_BUILD:
+            raise Charm4PyError("HAPI usage not allowed: Charm++ was not built with CUDA support")
+        id = future.fid
+        CkHapiAddCallback(<long> stream, depositFutureWithId, <int> id)
+
+cdef void depositFutureWithId(void *param, void* message) noexcept:
+    cdef int futureId = <int> param
+    charm._future_deposit_result(futureId, None)

 # first callback from Charm++ shared library
 cdef void registerMainModule() noexcept:
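Taken together, the three changes above wire a CUDA stream to a Charm4py future: `hapiAddCudaCallback` passes the future's integer id (`fid`) down to `CkHapiAddCallback`, and when the stream reaches the inserted callback, `depositFutureWithId` hands that id back to `charm._future_deposit_result`, which wakes whatever is blocked on the future. A rough pure-Python sketch of that flow (the `register_host_callback` helper is a hypothetical stand-in for the Charm++ HAPI registration, not a real API):

# Conceptual sketch only -- not the actual Cython implementation.
from charm4py import charm

# Hypothetical stand-in for CkHapiAddCallback: here it fires the callback
# immediately, as if all work queued on the stream had already completed.
def register_host_callback(stream_handle, callback):
    callback()

def hapi_add_cuda_callback(stream_handle, future):
    fid = future.fid                             # integer id of the future on this PE

    def on_stream_done():                        # runs once prior work on the stream completes
        charm._future_deposit_result(fid, None)  # deposits the future, unblocking future.get()

    register_host_callback(stream_handle, on_stream_done)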

docs/gpus.rst (new file, +91)

====
GPUs
====

.. .. contents::


GPUs are supported in Charm4py via the Charm++ HAPI (Hybrid API) interface.
Presently, this support allows asynchronous completion detection of GPU kernels via Charm4py futures,
using the function ``charm.hapiAddCudaCallback``.

The HAPI Charm4py API is:

.. code-block:: python

    def hapiAddCudaCallback(stream, future)

.. note::

    For now, ``charm.hapiAddCudaCallback`` only supports numba and torch streams as input. This function inserts a callback
    into the stream such that when the callback is reached, the corresponding Charm4py future is set.

Enabling HAPI
-------------

To build Charm4py with HAPI support, add "cuda" to the Charm++ build options and follow the steps to build Charm4py from source:

.. code-block:: shell

    export CHARM_EXTRA_BUILD_OPTS="cuda"
    pip install .

.. warning::

    To ensure that the underlying Charm++ build has CUDA enabled, remove any pre-existing builds in charm_src/charm before setting the CUDA option and running the install.
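As a quick sanity check before running the example below, numba can report whether a CUDA driver and device are visible (a minimal sketch, assuming numba is installed):

.. code-block:: python

    # Sanity check: is a CUDA driver/device visible to numba?
    import numba.cuda as cuda

    print("CUDA available:", cuda.is_available())
    if cuda.is_available():
        print("Devices:", [d.name for d in cuda.list_devices()])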
Examples
--------

.. code-block:: python

    from charm4py import charm
    import time
    import numba.cuda as cuda
    import numpy as np

    @cuda.jit
    def elementwise_sum_kernel(x_in, x_out):
        idx = cuda.grid(1)
        if idx < x_in.shape[0]:
            x_out[idx] = x_in[idx] + x_in[idx]

    def main(args):
        N = 1_000_000
        array_size = (N,)

        s = cuda.stream()
        stream_handle = s.handle.value

        A_host = np.arange(N, dtype=np.float32)

        A_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
        B_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
        A_gpu.copy_to_device(A_host, stream=s)

        threads_per_block = 128
        blocks_per_grid = (N + (threads_per_block - 1)) // threads_per_block

        print("Launching kernel and inserting callback...")
        start_time = time.perf_counter()
        elementwise_sum_kernel[blocks_per_grid, threads_per_block, s](A_gpu, B_gpu)

        return_fut = charm.Future()
        charm.hapiAddCudaCallback(stream_handle, return_fut)
        return_fut.get()
        kernel_done_time = time.perf_counter()
        print(f"Callback received, kernel finished in {kernel_done_time - start_time:.6f} seconds.")

        B_host = B_gpu.copy_to_host(stream=s)

        s.synchronize()

        sum_result = np.sum(B_host)
        print(f"Sum of result is {sum_result}")

        charm.exit()

    charm.start(main)


The above example demonstrates how to use the Charm4py HAPI interface to insert a callback into a CUDA stream and track
completion of a numba kernel launch.
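Since the note above also lists torch streams as supported input, a minimal sketch of the same pattern using a torch stream (assuming a CUDA-enabled PyTorch install; only the stream and kernel setup differ from the committed numba example) could look like:

.. code-block:: python

    # Sketch: the same callback pattern as above, using a torch CUDA stream.
    # Assumes a CUDA-enabled PyTorch installation.
    from charm4py import charm
    import torch

    def main(args):
        s = torch.cuda.Stream()
        stream_handle = s.cuda_stream                # raw CUDA stream handle for HAPI

        A = torch.ones(1_000_000, device='cuda')
        s.wait_stream(torch.cuda.current_stream())   # order s after A's initialization
        with torch.cuda.stream(s):
            B = A + A                                # queued asynchronously on stream s

        done = charm.Future()
        charm.hapiAddCudaCallback(stream_handle, done)
        done.get()                                   # resumes once work queued on s has finished

        print('Sum of result is', B.sum().item())
        charm.exit()

    charm.start(main)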

docs/index.rst (+1)

@@ -41,6 +41,7 @@ to the largest supercomputers.
    sections
    pool
    rules
+   gpus

 .. toctree::
    :maxdepth: 2

examples/cuda/hapi/README.md (new file, +27)

## Using Charm4py with CUDA

### HAPI CUDA Callback

Example overview

- The example in `hapi-cuda-callback.py` demonstrates usage of addCudaCallback from the Charm++ HAPI library
- addCudaCallback enables an asynchronous mechanism to wait for kernel completion via Charm4py futures
- The example is based around a simple numba kernel.

Usage

- hapiAddCudaCallback requires a CUDA stream handle and a future
- Access to the CUDA stream handle depends on the Python library being used. For example:
  - using torch: `stream_handle = torch.cuda.Stream().cuda_stream`
  - using numba: `stream_handle = numba.cuda.stream().handle.value`
- Currently, hapiAddCudaCallback is restricted to torch- and numba-based CUDA streams (see the stream-handle sketch below).

Running the example

- If running locally, use:

  `$ python3 -m charmrun.start +p<N> hapi-cuda-callback.py`

- If running on a cluster machine with Slurm, use:

  `$ srun -n <N> python3 hapi-cuda-callback.py`
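The two stream-handle one-liners from the Usage list above, assembled into a runnable sketch (assuming CUDA-enabled installs of both numba and torch):

```python
# Obtaining a raw CUDA stream handle to pass to charm.hapiAddCudaCallback.
import numba.cuda
import torch

numba_stream_handle = numba.cuda.stream().handle.value   # int handle from a numba stream
torch_stream_handle = torch.cuda.Stream().cuda_stream    # int handle from a torch stream

print(numba_stream_handle, torch_stream_handle)
```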
examples/cuda/hapi/hapi-cuda-callback.py (new file, +47)

from charm4py import charm
import time
import numba.cuda as cuda
import numpy as np

@cuda.jit
def elementwise_sum_kernel(x_in, x_out):
    idx = cuda.grid(1)
    if idx < x_in.shape[0]:
        x_out[idx] = x_in[idx] + x_in[idx]

def main(args):
    N = 1_000_000
    array_size = (N,)

    s = cuda.stream()
    stream_handle = s.handle.value

    A_host = np.arange(N, dtype=np.float32)

    A_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
    B_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
    A_gpu.copy_to_device(A_host, stream=s)

    threads_per_block = 128
    blocks_per_grid = (N + (threads_per_block - 1)) // threads_per_block

    print("Launching kernel and inserting callback...")
    start_time = time.perf_counter()
    elementwise_sum_kernel[blocks_per_grid, threads_per_block, s](A_gpu, B_gpu)

    return_fut = charm.Future()
    charm.hapiAddCudaCallback(stream_handle, return_fut)
    return_fut.get()
    kernel_done_time = time.perf_counter()
    print(f"Callback received, kernel finished in {kernel_done_time - start_time:.6f} seconds.")

    B_host = B_gpu.copy_to_host(stream=s)

    s.synchronize()

    sum_result = np.sum(B_host)
    print(f"Sum of result is {sum_result}")

    charm.exit()

charm.start(main)

setup.py (+7, -3)

@@ -325,15 +325,18 @@ def install(self):
     cobject_extra_args=["-Wl,-rpath,@loader_path/.libs"]
 else:
     cobject_extra_args=["-Wl,-rpath,$ORIGIN/.libs"]
-
+
+cudaBuild = os.environ.get('CHARM_EXTRA_BUILD_OPTS', '').find('cuda') != -1
+
 extensions.extend(cythonize(setuptools.Extension('charm4py.charmlib.charmlib_cython',
                   sources=['charm4py/charmlib/charmlib_cython.pyx'],
                   include_dirs=['charm_src/charm/include'] + my_include_dirs,
                   library_dirs=[os.path.join(os.getcwd(), 'charm4py', '.libs')],
                   libraries=["charm"],
                   extra_compile_args=[],
                   extra_link_args=extra_link_args,
-                  ), compile_time_env={'HAVE_NUMPY': haveNumpy}))
+                  ), compile_time_env={'HAVE_NUMPY': haveNumpy,
+                                       'HAVE_CUDA_BUILD': cudaBuild}))

 extensions.extend(cythonize(setuptools.Extension('charm4py.c_object_store',
                   sources=['charm4py/c_object_store.pyx'],

@@ -342,7 +345,8 @@ def install(self):
                   libraries=["charm"],
                   extra_compile_args=[],
                   extra_link_args=cobject_extra_args,
-                  ), compile_time_env={'HAVE_NUMPY': haveNumpy}))
+                  ), compile_time_env={'HAVE_NUMPY': haveNumpy,
+                                       'HAVE_CUDA_BUILD': cudaBuild}))


 additional_setup_keywords = {}
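For reference, the `cudaBuild` detection above only checks whether "cuda" appears in `CHARM_EXTRA_BUILD_OPTS`; an equivalent stand-alone sketch of the same test:

import os

# Same test setup.py performs: was a CUDA-enabled Charm++ build requested?
extra_opts = os.environ.get('CHARM_EXTRA_BUILD_OPTS', '')
print('HAVE_CUDA_BUILD will be set to:', 'cuda' in extra_opts)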
