Skip to content

Commit 6e3b496

Browse files
committed
[clang][CUDA] Avoid accounting for tail padding in LLVM offloading
1 parent cc0fecf commit 6e3b496

File tree

5 files changed

+33
-7
lines changed

5 files changed

+33
-7
lines changed

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -327,9 +327,10 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
327327
/// (void*, short, void*) is passed as {void **, short *, void **} to the launch
328328
/// function. For the LLVM/offload launch we flatten the arguments into the
329329
/// struct directly. In addition, we include the size of the arguments, thus
330-
/// pass {sizeof({void *, short, void *}), ptr to {void *, short, void *},
331-
/// nullptr}. The last nullptr needs to be initialized to an array of pointers
332-
/// pointing to the arguments if we want to offload to the host.
330+
/// pass {size of ({void *, short, void *}) without tail padding, ptr to {void
331+
/// *, short, void *}, nullptr}. The last nullptr needs to be initialized to an
332+
/// array of pointers pointing to the arguments if we want to offload to the
333+
/// host.
333334
Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
334335
FunctionArgList &Args) {
335336
SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes;
@@ -339,6 +340,7 @@ Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
339340

340341
auto *Int64Ty = CGF.Builder.getInt64Ty();
341342
KernelLaunchParamsTypes.push_back(Int64Ty);
343+
KernelLaunchParamsTypes.push_back(Int64Ty);
342344
KernelLaunchParamsTypes.push_back(PtrTy);
343345
KernelLaunchParamsTypes.push_back(PtrTy);
344346

@@ -351,12 +353,24 @@ Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
351353
"kernel_launch_params");
352354

353355
auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy);
356+
357+
// Avoid accounting the tail padding for CUDA.
358+
auto KernelArgsSizeNoTailPadding = llvm::TypeSize::getZero();
359+
if (auto N = KernelArgsTy->getNumElements()) {
360+
auto *SL = CGM.getDataLayout().getStructLayout(KernelArgsTy);
361+
KernelArgsSizeNoTailPadding = SL->getElementOffset(N - 1);
362+
KernelArgsSizeNoTailPadding += CGM.getDataLayout().getTypeAllocSize(
363+
KernelArgsTy->getElementType(N - 1));
364+
}
365+
354366
CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize),
355367
CGF.Builder.CreateStructGEP(KernelLaunchParams, 0));
356-
CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF),
368+
CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSizeNoTailPadding),
357369
CGF.Builder.CreateStructGEP(KernelLaunchParams, 1));
358-
CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy),
370+
CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF),
359371
CGF.Builder.CreateStructGEP(KernelLaunchParams, 2));
372+
CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy),
373+
CGF.Builder.CreateStructGEP(KernelLaunchParams, 3));
360374

361375
for (unsigned i = 0; i < Args.size(); ++i) {
362376
auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i]));

offload/include/Shared/APITypes.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,8 @@ static_assert(sizeof(KernelArgsTy) ==
121121
struct KernelLaunchParamsTy {
122122
/// Size of the Data array.
123123
size_t Size = 0;
124+
/// Size of the Data array without tail padding.
125+
size_t SizeNoTailPadding = 0;
124126
/// Flat array of kernel parameters.
125127
void *Data = nullptr;
126128
/// Ptrs to the Data entries. Only strictly required for the host plugin.

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -571,7 +571,9 @@ KernelLaunchParamsTy GenericKernelTy::prepareArgs(
571571
(void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
572572
Ptrs[I] = &Args[I];
573573
}
574-
return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]};
574+
575+
size_t ArgsSize = sizeof(void *) * NumArgs;
576+
return KernelLaunchParamsTy{ArgsSize, ArgsSize, &Args[0], &Ptrs[0]};
575577
}
576578

577579
uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,

offload/plugins-nextgen/cuda/src/rtl.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1414,7 +1414,7 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
14141414

14151415
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
14161416
CU_LAUNCH_PARAM_BUFFER_SIZE,
1417-
reinterpret_cast<void *>(&LaunchParams.Size),
1417+
reinterpret_cast<void *>(&LaunchParams.SizeNoTailPadding),
14181418
CU_LAUNCH_PARAM_END};
14191419

14201420
// If we are running an RPC server we want to wake up the server thread

offload/test/offloading/CUDA/basic_launch_multi_arg.cu

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ __global__ void square(int *Dst, short Q, int *Src, short P) {
2323
Src[1] = P;
2424
}
2525

26+
__global__ void accumulate(short Q, int *Dst, char P) {
27+
*Dst += Q + P;
28+
}
29+
2630
int main(int argc, char **argv) {
2731
int DevNo = 0;
2832
int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
@@ -39,5 +43,9 @@ int main(int argc, char **argv) {
3943
// CHECK: Ptr [[Ptr]], *Ptr: 42
4044
printf("Src: %i : %i\n", Src[0], Src[1]);
4145
// CHECK: Src: 3 : 4
46+
accumulate<<<1, 1>>>(3, Ptr, 7);
47+
printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
48+
// CHECK: Ptr [[Ptr]], *Ptr: 52
4249
llvm_omp_target_free_shared(Ptr, DevNo);
50+
llvm_omp_target_free_shared(Src, DevNo);
4351
}

0 commit comments

Comments
 (0)