Skip to content

Commit 35315a8

Browse files
authored
[offload] Fix CUDA args size by subtracting tail padding (#172249)
This commit makes the cuLaunchKernel call pass the total size of the arguments without tail padding, since the CUDA driver rejects a buffer size that includes it.
1 parent 35b2317 commit 35315a8

File tree

7 files changed

+52
-5
lines changed

7 files changed

+52
-5
lines changed

offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
8181
DLWRAP(cuDevicePrimaryCtxRetain, 2)
8282
DLWRAP(cuModuleLoadDataEx, 5)
8383
DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
84+
DLWRAP(cuFuncGetParamInfo, 4)
8485

8586
DLWRAP(cuDeviceCanAccessPeer, 3)
8687
DLWRAP(cuCtxEnablePeerAccess, 2)

offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,5 +390,6 @@ CUresult cuMemGetAllocationGranularity(size_t *granularity,
390390
CUmemAllocationGranularity_flags option);
391391
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
392392
CUoccupancyB2DSize, size_t, int);
393+
CUresult cuFuncGetParamInfo(CUfunction, size_t, size_t *, size_t *);
393394

394395
#endif

offload/plugins-nextgen/cuda/src/rtl.cpp

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,8 @@ struct CUDAKernelTy : public GenericKernelTy {
149149
// The maximum number of threads cannot exceed the maximum of the kernel.
150150
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
151151

152-
return Plugin::success();
152+
// Retrieve the size of the arguments.
153+
return initArgsSize();
153154
}
154155

155156
/// Launch the CUDA kernel function.
@@ -173,11 +174,32 @@ struct CUDAKernelTy : public GenericKernelTy {
173174
}
174175

175176
private:
177+
/// Initialize the size of the arguments.
178+
Error initArgsSize() {
179+
CUresult Res;
180+
size_t ArgOffset, ArgSize;
181+
size_t Arg = 0;
182+
183+
ArgsSize = 0;
184+
185+
// Find the last argument to know the total size of the arguments.
186+
while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) ==
187+
CUDA_SUCCESS)
188+
ArgsSize = ArgOffset + ArgSize;
189+
190+
if (Res != CUDA_ERROR_INVALID_VALUE)
191+
return Plugin::check(Res, "error in cuFuncGetParamInfo: %s");
192+
return Plugin::success();
193+
}
194+
176195
/// The CUDA kernel function to execute.
177196
CUfunction Func;
178197
/// The maximum amount of dynamic shared memory per thread group. By default,
179198
/// this is set to 48 KB.
180199
mutable uint32_t MaxDynCGroupMemLimit = 49152;
200+
201+
/// The size of the kernel arguments.
202+
size_t ArgsSize;
181203
};
182204

183205
/// Class wrapping a CUDA stream reference. These are the objects handled by the
@@ -1430,16 +1452,23 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
14301452
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
14311453
CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
14321454

1455+
// The args size passed in LaunchParams may have tail padding, which is not
1456+
// accepted by the CUDA driver.
1457+
if (ArgsSize > LaunchParams.Size)
1458+
return Plugin::error(ErrorCode::INVALID_ARGUMENT,
1459+
"mismatch in kernel arguments");
1460+
14331461
CUstream Stream;
14341462
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
14351463
return Err;
14361464

14371465
uint32_t MaxDynCGroupMem =
14381466
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
14391467

1468+
size_t ConfigArgsSize = ArgsSize;
14401469
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
14411470
CU_LAUNCH_PARAM_BUFFER_SIZE,
1442-
reinterpret_cast<void *>(&LaunchParams.Size),
1471+
reinterpret_cast<void *>(&ConfigArgsSize),
14431472
CU_LAUNCH_PARAM_END};
14441473

14451474
// If we are running an RPC server we want to wake up the server thread

offload/test/offloading/CUDA/basic_launch_multi_arg.cu

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,6 @@
66
// clang-format on
77

88
// REQUIRES: gpu
9-
//
10-
// FIXME: https://github.com/llvm/llvm-project/issues/161265
11-
// UNSUPPORTED: gpu
129

1310
#include <stdio.h>
1411

offload/unittests/OffloadAPI/device_code/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ add_offload_test_device_code(foo.cpp foo)
22
add_offload_test_device_code(bar.cpp bar)
33
# Compile with optimizations to eliminate AMDGPU implicit arguments.
44
add_offload_test_device_code(noargs.cpp noargs -O3)
5+
add_offload_test_device_code(multiargs.cpp multiargs -O3)
56
add_offload_test_device_code(byte.cpp byte)
67
add_offload_test_device_code(localmem.cpp localmem)
78
add_offload_test_device_code(localmem_reduction.cpp localmem_reduction)
@@ -15,6 +16,7 @@ add_custom_target(offload_device_binaries DEPENDS
1516
foo.bin
1617
bar.bin
1718
noargs.bin
19+
multiargs.bin
1820
byte.bin
1921
localmem.bin
2022
localmem_reduction.bin
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#include <gpuintrin.h>
2+
3+
extern "C" __gpu_kernel void multiargs(char, int *, short) { (void)0; }

offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ struct LaunchSingleKernelTestBase : LaunchKernelTestBase {
5555

5656
KERNEL_TEST(Foo, foo)
5757
KERNEL_TEST(NoArgs, noargs)
58+
KERNEL_TEST(MultiArgs, multiargs)
5859
KERNEL_TEST(Byte, byte)
5960
KERNEL_TEST(LocalMem, localmem)
6061
KERNEL_TEST(LocalMemReduction, localmem_reduction)
@@ -135,6 +136,19 @@ TEST_P(olLaunchKernelNoArgsTest, Success) {
135136
ASSERT_SUCCESS(olSyncQueue(Queue));
136137
}
137138

139+
TEST_P(olLaunchKernelMultiArgsTest, Success) {
140+
struct {
141+
char A;
142+
int *B;
143+
short C;
144+
} Args{0, nullptr, 0};
145+
146+
ASSERT_SUCCESS(
147+
olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
148+
149+
ASSERT_SUCCESS(olSyncQueue(Queue));
150+
}
151+
138152
TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
139153
void *Mem;
140154
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,

0 commit comments

Comments
 (0)