
Commit 9f4ab8c

mayantaylor, ZwFink, and ritvikrao authored

CUDA Callback API (HAPI) (#282)

* ctypes and cython streams working
* cython prelim implementation working!
* change to support future
* hapi works via futures now
* cleanup and example
* make separate directory
* Make example numba
* Set hapiAddCudaCallback to charm
* flag for cuda build
* add hapi docs
* Update docs with build instructions
* fix flag
* add example command to readme
* fix readme

---------

Co-authored-by: Zane Fink <[email protected]>
Co-authored-by: Ritvik Rao <[email protected]>
1 parent fc4c3f5 commit 9f4ab8c

File tree

8 files changed: +189 -3 lines changed

- charm4py/charm.py
- charm4py/charmlib/ccharm.pxd
- charm4py/charmlib/charmlib_cython.pyx
- docs/gpus.rst
- docs/index.rst
- examples/cuda/hapi/README.md
- examples/cuda/hapi/hapi-cuda-callback.py
- setup.py

charm4py/charm.py (+5)

@@ -127,6 +127,7 @@ def __init__(self):
         self.CkChareSend = self.lib.CkChareSend
         self.CkGroupSend = self.lib.CkGroupSend
         self.CkArraySend = self.lib.CkArraySend
+        self.hapiAddCudaCallback = self.lib.hapiAddCudaCallback
         self.reducers = reduction.ReducerContainer(self)
         self.redMgr = reduction.ReductionManager(self, self.reducers)
         self.mainchareRegistered = False

@@ -933,6 +934,10 @@ def recordSendRecv(self, stats, size):
         stats[2] = max(size, stats[2])
         stats[3] += size
         stats[4] = size
+
+    # deposit value of one of the futures that was created on this PE
+    def _future_deposit_result(self, fid, result=None):
+        self.threadMgr.depositFuture(fid, result)

     def __printTable__(self, table, sep):
         col_width = [max(len(x) for x in col) for col in zip(*table)]

charm4py/charmlib/ccharm.pxd (+2)

@@ -70,6 +70,8 @@ cdef extern from "charm.h":
     void CkStartQDExt_SectionCallback(int sid_pe, int sid_cnt, int rootPE, int ep);
     void CcdCallFnAfter(void (*CcdVoidFn)(void *userParam,double curWallTime), void *arg, double msecs);

+    void CkHapiAddCallback(long stream, void (*cb)(void*, void*), int fid);
+
 cdef extern from "conv-header.h":
     ctypedef void (*CmiHandler)(void* )
     cdef const int CmiReservedHeaderSize

charm4py/charmlib/charmlib_cython.pyx (+9)

@@ -867,6 +867,15 @@ class CharmLib(object):
         cdef int replyLen = len(message_bytes)
         CcsSendReply(replyLen, <const void*>replyData)

+    def hapiAddCudaCallback(self, stream, future):
+        if not HAVE_CUDA_BUILD:
+            raise Charm4PyError("HAPI usage not allowed: Charm++ was not built with CUDA support")
+        id = future.fid
+        CkHapiAddCallback(<long> stream, depositFutureWithId, <int> id)
+
+cdef void depositFutureWithId(void *param, void* message) noexcept:
+    cdef int futureId = <int> param
+    charm._future_deposit_result(futureId, None)

 # first callback from Charm++ shared library
 cdef void registerMainModule() noexcept:
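Taken together, the three changes above wire a CUDA stream to a Charm4py future: `hapiAddCudaCallback` passes the future's integer id (`fid`) down to `CkHapiAddCallback`, and when the stream reaches the inserted callback, `depositFutureWithId` hands that id back to `charm._future_deposit_result`, which wakes whatever is blocked on the future. A rough pure-Python sketch of that flow (the `register_host_callback` helper is a hypothetical stand-in for the Charm++ HAPI registration, not a real API):

# Conceptual sketch only -- not the actual Cython implementation.
from charm4py import charm

# Hypothetical stand-in for CkHapiAddCallback: here it fires the callback
# immediately, as if all work queued on the stream had already completed.
def register_host_callback(stream_handle, callback):
    callback()

def hapi_add_cuda_callback(stream_handle, future):
    fid = future.fid                             # integer id of the future on this PE

    def on_stream_done():                        # runs once prior work on the stream completes
        charm._future_deposit_result(fid, None)  # deposits the future, unblocking future.get()

    register_host_callback(stream_handle, on_stream_done)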

docs/gpus.rst (new file, +91)

====
GPUs
====

.. .. contents::


GPUs are supported in Charm4py via the Charm++ HAPI (Hybrid API) interface.
Presently, this support allows asynchronous completion detection of GPU kernels via Charm4py futures,
using the function ``charm.hapiAddCudaCallback``.

The HAPI Charm4py API is:

.. code-block:: python

    def hapiAddCudaCallback(stream, future)

.. note::

    For now, ``charm.hapiAddCudaCallback`` only supports numba and torch streams as input. This function inserts a callback
    into the stream such that when the callback is reached, the corresponding Charm4py future is set.

Enabling HAPI
-------------

To build Charm4py with HAPI support, add "cuda" to the Charm++ build options and follow the steps to build Charm4py from source:

.. code-block:: shell

    export CHARM_EXTRA_BUILD_OPTS="cuda"
    pip install .

.. warning::

    To ensure that the underlying Charm++ build has CUDA enabled, remove any pre-existing builds in charm_src/charm before setting the CUDA option and running the install.
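As a quick sanity check before running the example below, numba can report whether a CUDA driver and device are visible (a minimal sketch, assuming numba is installed):

.. code-block:: python

    # Sanity check: is a CUDA driver/device visible to numba?
    import numba.cuda as cuda

    print("CUDA available:", cuda.is_available())
    if cuda.is_available():
        print("Devices:", [d.name for d in cuda.list_devices()])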
Examples
--------

.. code-block:: python

    from charm4py import charm
    import time
    import numba.cuda as cuda
    import numpy as np

    @cuda.jit
    def elementwise_sum_kernel(x_in, x_out):
        idx = cuda.grid(1)
        if idx < x_in.shape[0]:
            x_out[idx] = x_in[idx] + x_in[idx]

    def main(args):
        N = 1_000_000
        array_size = (N,)

        s = cuda.stream()
        stream_handle = s.handle.value

        A_host = np.arange(N, dtype=np.float32)

        A_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
        B_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
        A_gpu.copy_to_device(A_host, stream=s)

        threads_per_block = 128
        blocks_per_grid = (N + (threads_per_block - 1)) // threads_per_block

        print("Launching kernel and inserting callback...")
        start_time = time.perf_counter()
        elementwise_sum_kernel[blocks_per_grid, threads_per_block, s](A_gpu, B_gpu)

        return_fut = charm.Future()
        charm.hapiAddCudaCallback(stream_handle, return_fut)
        return_fut.get()
        kernel_done_time = time.perf_counter()
        print(f"Callback received, kernel finished in {kernel_done_time - start_time:.6f} seconds.")

        B_host = B_gpu.copy_to_host(stream=s)

        s.synchronize()

        sum_result = np.sum(B_host)
        print(f"Sum of result is {sum_result}")

        charm.exit()

    charm.start(main)


The above example demonstrates how to use the Charm4py HAPI interface to insert a callback into a CUDA stream and track
completion of a numba kernel launch.
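Since the note above also lists torch streams as supported input, a minimal sketch of the same pattern using a torch stream (assuming a CUDA-enabled PyTorch install; only the stream and kernel setup differ from the committed numba example) could look like:

.. code-block:: python

    # Sketch: the same callback pattern as above, using a torch CUDA stream.
    # Assumes a CUDA-enabled PyTorch installation.
    from charm4py import charm
    import torch

    def main(args):
        s = torch.cuda.Stream()
        stream_handle = s.cuda_stream                # raw CUDA stream handle for HAPI

        A = torch.ones(1_000_000, device='cuda')
        s.wait_stream(torch.cuda.current_stream())   # order s after A's initialization
        with torch.cuda.stream(s):
            B = A + A                                # queued asynchronously on stream s

        done = charm.Future()
        charm.hapiAddCudaCallback(stream_handle, done)
        done.get()                                   # resumes once work queued on s has finished

        print('Sum of result is', B.sum().item())
        charm.exit()

    charm.start(main)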

docs/index.rst (+1)

@@ -41,6 +41,7 @@ to the largest supercomputers.
    sections
    pool
    rules
+   gpus

 .. toctree::
    :maxdepth: 2

examples/cuda/hapi/README.md (new file, +27)

## Using Charm4py with CUDA

### HAPI CUDA Callback

Example overview

- The example in `hapi-cuda-callback.py` demonstrates usage of addCudaCallback from the Charm++ HAPI library
- addCudaCallback enables an asynchronous mechanism to wait for kernel completion via Charm4py futures
- The example is based around a simple numba kernel.

Usage

- hapiAddCudaCallback requires a CUDA stream handle and a future
- Access to the CUDA stream handle depends on the Python library being used. For example:
  - using torch: `stream_handle = torch.cuda.Stream().cuda_stream`
  - using numba: `stream_handle = numba.cuda.stream().handle.value`
- Currently, hapiAddCudaCallback is restricted to torch- and numba-based CUDA streams (see the stream-handle sketch below).

Running the example

- If running locally, use:

  `$ python3 -m charmrun.start +p<N> hapi-cuda-callback.py`

- If running on a cluster machine with Slurm, use:

  `$ srun -n <N> python3 hapi-cuda-callback.py`
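The two stream-handle one-liners from the Usage list above, assembled into a runnable sketch (assuming CUDA-enabled installs of both numba and torch):

```python
# Obtaining a raw CUDA stream handle to pass to charm.hapiAddCudaCallback.
import numba.cuda
import torch

numba_stream_handle = numba.cuda.stream().handle.value   # int handle from a numba stream
torch_stream_handle = torch.cuda.Stream().cuda_stream    # int handle from a torch stream

print(numba_stream_handle, torch_stream_handle)
```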
examples/cuda/hapi/hapi-cuda-callback.py (new file, +47)

from charm4py import charm
import time
import numba.cuda as cuda
import numpy as np

@cuda.jit
def elementwise_sum_kernel(x_in, x_out):
    idx = cuda.grid(1)
    if idx < x_in.shape[0]:
        x_out[idx] = x_in[idx] + x_in[idx]

def main(args):
    N = 1_000_000
    array_size = (N,)

    s = cuda.stream()
    stream_handle = s.handle.value

    A_host = np.arange(N, dtype=np.float32)

    A_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
    B_gpu = cuda.device_array(array_size, dtype=np.float32, stream=s)
    A_gpu.copy_to_device(A_host, stream=s)

    threads_per_block = 128
    blocks_per_grid = (N + (threads_per_block - 1)) // threads_per_block

    print("Launching kernel and inserting callback...")
    start_time = time.perf_counter()
    elementwise_sum_kernel[blocks_per_grid, threads_per_block, s](A_gpu, B_gpu)

    return_fut = charm.Future()
    charm.hapiAddCudaCallback(stream_handle, return_fut)
    return_fut.get()
    kernel_done_time = time.perf_counter()
    print(f"Callback received, kernel finished in {kernel_done_time - start_time:.6f} seconds.")

    B_host = B_gpu.copy_to_host(stream=s)

    s.synchronize()

    sum_result = np.sum(B_host)
    print(f"Sum of result is {sum_result}")

    charm.exit()

charm.start(main)

setup.py (+7, -3)

@@ -325,15 +325,18 @@ def install(self):
     cobject_extra_args=["-Wl,-rpath,@loader_path/.libs"]
 else:
     cobject_extra_args=["-Wl,-rpath,$ORIGIN/.libs"]
-
+
+cudaBuild = os.environ.get('CHARM_EXTRA_BUILD_OPTS', '').find('cuda') != -1
+
 extensions.extend(cythonize(setuptools.Extension('charm4py.charmlib.charmlib_cython',
                   sources=['charm4py/charmlib/charmlib_cython.pyx'],
                   include_dirs=['charm_src/charm/include'] + my_include_dirs,
                   library_dirs=[os.path.join(os.getcwd(), 'charm4py', '.libs')],
                   libraries=["charm"],
                   extra_compile_args=[],
                   extra_link_args=extra_link_args,
-                  ), compile_time_env={'HAVE_NUMPY': haveNumpy}))
+                  ), compile_time_env={'HAVE_NUMPY': haveNumpy,
+                                       'HAVE_CUDA_BUILD': cudaBuild}))

 extensions.extend(cythonize(setuptools.Extension('charm4py.c_object_store',
                   sources=['charm4py/c_object_store.pyx'],

@@ -342,7 +345,8 @@ def install(self):
                   libraries=["charm"],
                   extra_compile_args=[],
                   extra_link_args=cobject_extra_args,
-                  ), compile_time_env={'HAVE_NUMPY': haveNumpy}))
+                  ), compile_time_env={'HAVE_NUMPY': haveNumpy,
+                                       'HAVE_CUDA_BUILD': cudaBuild}))


 additional_setup_keywords = {}
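For reference, the `cudaBuild` detection above only checks whether "cuda" appears in `CHARM_EXTRA_BUILD_OPTS`; an equivalent stand-alone sketch of the same test:

import os

# Same test setup.py performs: was a CUDA-enabled Charm++ build requested?
extra_opts = os.environ.get('CHARM_EXTRA_BUILD_OPTS', '')
print('HAVE_CUDA_BUILD will be set to:', 'cuda' in extra_opts)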
