@@ -327,9 +327,10 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
327327// / (void*, short, void*) is passed as {void **, short *, void **} to the launch
328328// / function. For the LLVM/offload launch we flatten the arguments into the
329329// / struct directly. In addition, we include the size of the arguments, thus
330- // / pass {sizeof({void *, short, void *}), ptr to {void *, short, void *},
331- // / nullptr}. The last nullptr needs to be initialized to an array of pointers
332- // / pointing to the arguments if we want to offload to the host.
330+ // / pass {size of ({void *, short, void *}) with tail padding, the same size
331+ // / without tail padding, ptr to {void *, short, void *}, nullptr}. The last
332+ // / nullptr needs to be initialized to an array of pointers pointing to the
333+ // / arguments if we want to offload to the host.
333334Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload (CodeGenFunction &CGF,
334335 FunctionArgList &Args) {
335336 SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes;
@@ -339,6 +340,7 @@ Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
339340
340341 auto *Int64Ty = CGF.Builder .getInt64Ty ();
341342 KernelLaunchParamsTypes.push_back (Int64Ty);
343+ KernelLaunchParamsTypes.push_back (Int64Ty);
342344 KernelLaunchParamsTypes.push_back (PtrTy);
343345 KernelLaunchParamsTypes.push_back (PtrTy);
344346
@@ -351,12 +353,24 @@ Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
351353 " kernel_launch_params" );
352354
353355 auto KernelArgsSize = CGM.getDataLayout ().getTypeAllocSize (KernelArgsTy);
356+
357+ // Avoid accounting the tail padding for CUDA.
358+ auto KernelArgsSizeNoTailPadding = llvm::TypeSize::getZero ();
359+ if (auto N = KernelArgsTy->getNumElements ()) {
360+ auto *SL = CGM.getDataLayout ().getStructLayout (KernelArgsTy);
361+ KernelArgsSizeNoTailPadding = SL->getElementOffset (N - 1 );
362+ KernelArgsSizeNoTailPadding += CGM.getDataLayout ().getTypeAllocSize (
363+ KernelArgsTy->getElementType (N - 1 ));
364+ }
365+
354366 CGF.Builder .CreateStore (llvm::ConstantInt::get (Int64Ty, KernelArgsSize),
355367 CGF.Builder .CreateStructGEP (KernelLaunchParams, 0 ));
356- CGF.Builder .CreateStore (KernelArgs. emitRawPointer (CGF ),
368+ CGF.Builder .CreateStore (llvm::ConstantInt::get (Int64Ty, KernelArgsSizeNoTailPadding ),
357369 CGF.Builder .CreateStructGEP (KernelLaunchParams, 1 ));
358- CGF.Builder .CreateStore (llvm::Constant::getNullValue (PtrTy ),
370+ CGF.Builder .CreateStore (KernelArgs. emitRawPointer (CGF ),
359371 CGF.Builder .CreateStructGEP (KernelLaunchParams, 2 ));
372+ CGF.Builder .CreateStore (llvm::Constant::getNullValue (PtrTy),
373+ CGF.Builder .CreateStructGEP (KernelLaunchParams, 3 ));
360374
361375 for (unsigned i = 0 ; i < Args.size (); ++i) {
362376 auto *ArgVal = CGF.Builder .CreateLoad (CGF.GetAddrOfLocalVar (Args[i]));
0 commit comments