Skip to content

Commit ca999f7

Browse files
committed
[OpenMP][Offloading] Use bitset to indicate execution mode instead of value
The execution mode of a kernel is stored in a global variable, whose value means: 0 - SPMD mode; 1 - generic mode; 2 - SPMD mode execution with generic mode semantics. We are going to add support for SIMD execution mode. It will come with another execution mode, such as SIMD-generic mode. As a result, this value-based indicator is not flexible enough. This patch changes to a bitset-based solution to encode the execution mode. Each bit position is: [0] - generic mode; [1] - SPMD mode; [2] - SIMD mode (will be added later). In this way, `0x1` is generic mode, `0x2` is SPMD mode, and `0x3` is SPMD mode execution with generic mode semantics. In the future, after we add support for SIMD mode, `0b1xx` will be in SIMD mode. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D110029
1 parent ec83114 commit ca999f7

14 files changed

+83
-79
lines changed

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

+6-5
Original file line numberDiff line numberDiff line change
@@ -1112,11 +1112,12 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
11121112
// warps participate in parallel work.
11131113
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
11141114
bool Mode) {
1115-
auto *GVMode =
1116-
new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1117-
llvm::GlobalValue::WeakAnyLinkage,
1118-
llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1119-
Twine(Name, "_exec_mode"));
1115+
auto *GVMode = new llvm::GlobalVariable(
1116+
CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1117+
llvm::GlobalValue::WeakAnyLinkage,
1118+
llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
1119+
: OMP_TGT_EXEC_MODE_GENERIC),
1120+
Twine(Name, "_exec_mode"));
11201121
CGM.addCompilerUsedGlobal(GVMode);
11211122
}
11221123

clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@
1616
#define HEADER
1717

1818
// Check that the execution mode of all 3 target regions on the gpu is set to SPMD Mode.
19-
// CHECK-DAG: {{@__omp_offloading_.+l29}}_exec_mode = weak constant i8 0
20-
// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 0
21-
// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0
19+
// CHECK-DAG: {{@__omp_offloading_.+l29}}_exec_mode = weak constant i8 2
20+
// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 2
21+
// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 2
2222

2323
template<typename tx>
2424
tx ftemplate(int n) {

clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp

+3-3
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212
// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = weak addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]
1313

1414
// Check that the execution mode of all 3 target regions is set to Spmd Mode.
15-
// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0
16-
// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0
17-
// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0
15+
// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 2
16+
// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 2
17+
// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 2
1818

1919
template<typename tx>
2020
tx ftemplate(int n) {

clang/test/OpenMP/nvptx_target_simd_codegen.cpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@
1616
#define HEADER
1717

1818
// Check that the execution mode of all 4 target regions on the gpu is set to SPMD Mode.
19-
// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0
20-
// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 0
21-
// CHECK-DAG: {{@__omp_offloading_.+l42}}_exec_mode = weak constant i8 0
22-
// CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak constant i8 0
19+
// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 2
20+
// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 2
21+
// CHECK-DAG: {{@__omp_offloading_.+l42}}_exec_mode = weak constant i8 2
22+
// CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak constant i8 2
2323

2424
#define N 1000
2525

clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp

+4-4
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@
1616
#define HEADER
1717

1818
// Check that the execution mode of all 4 target regions on the gpu is set to SPMD Mode.
19-
// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 0
20-
// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 0
21-
// CHECK-DAG: {{@__omp_offloading_.+l48}}_exec_mode = weak constant i8 0
22-
// CHECK-DAG: {{@__omp_offloading_.+l53}}_exec_mode = weak constant i8 0
19+
// CHECK-DAG: {{@__omp_offloading_.+l37}}_exec_mode = weak constant i8 2
20+
// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 2
21+
// CHECK-DAG: {{@__omp_offloading_.+l48}}_exec_mode = weak constant i8 2
22+
// CHECK-DAG: {{@__omp_offloading_.+l53}}_exec_mode = weak constant i8 2
2323

2424
#define N 1000
2525
#define M 10

llvm/include/llvm/Frontend/OpenMP/OMPConstants.h

+8
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,14 @@ enum class OMPScheduleType {
128128
LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierMask)
129129
};
130130

131+
enum OMPTgtExecModeFlags : int8_t {
132+
OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
133+
OMP_TGT_EXEC_MODE_SPMD = 1 << 1,
134+
OMP_TGT_EXEC_MODE_GENERIC_SPMD =
135+
OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD,
136+
LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ OMP_TGT_EXEC_MODE_GENERIC_SPMD)
137+
};
138+
131139
} // end namespace omp
132140

133141
} // end namespace llvm

llvm/lib/Transforms/IPO/OpenMPOpt.cpp

+10-7
Original file line numberDiff line numberDiff line change
@@ -3284,15 +3284,18 @@ struct AAKernelInfoFunction : AAKernelInfo {
32843284
GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
32853285
(Kernel->getName() + "_exec_mode").str());
32863286
assert(ExecMode && "Kernel without exec mode?");
3287-
assert(ExecMode->getInitializer() &&
3288-
ExecMode->getInitializer()->isOneValue() &&
3289-
"Initially non-SPMD kernel has SPMD exec mode!");
3287+
assert(ExecMode->getInitializer() && "ExecMode doesn't have initializer!");
32903288

32913289
// Set the global exec mode flag to indicate SPMD-Generic mode.
3292-
constexpr int SPMDGeneric = 2;
3293-
if (!ExecMode->getInitializer()->isZeroValue())
3294-
ExecMode->setInitializer(
3295-
ConstantInt::get(ExecMode->getInitializer()->getType(), SPMDGeneric));
3290+
assert(isa<ConstantInt>(ExecMode->getInitializer()) &&
3291+
"ExecMode is not an integer!");
3292+
const int8_t ExecModeVal =
3293+
cast<ConstantInt>(ExecMode->getInitializer())->getSExtValue();
3294+
assert(ExecModeVal == OMP_TGT_EXEC_MODE_GENERIC &&
3295+
"Initially non-SPMD kernel has SPMD exec mode!");
3296+
ExecMode->setInitializer(
3297+
ConstantInt::get(ExecMode->getInitializer()->getType(),
3298+
ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD));
32963299

32973300
// Next rewrite the init and deinit calls to indicate we use SPMD-mode now.
32983301
const int InitIsSPMDArgNo = 1;

llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ target triple = "nvptx64"
1111
; CHECK: @[[KERNEL0_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
1212
; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32
1313
; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
14-
; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
14+
; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
1515
; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
1616
; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
1717
;.

llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ target triple = "nvptx64"
1313

1414
;.
1515
; CHECK: @[[IS_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
16-
; CHECK: @[[WILL_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
16+
; CHECK: @[[WILL_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
1717
; CHECK: @[[NON_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
1818
; CHECK: @[[WILL_NOT_BE_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
1919
; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i8

llvm/test/Transforms/OpenMP/spmdization.ll

+8-8
Original file line numberDiff line numberDiff line change
@@ -91,21 +91,21 @@
9191
;.
9292
; AMDGPU: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
9393
; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
94-
; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
95-
; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
96-
; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
97-
; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
94+
; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
95+
; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
96+
; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
97+
; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
9898
; AMDGPU: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
9999
; AMDGPU: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata"
100100
; AMDGPU: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32
101101
; AMDGPU: @[[X_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32
102102
;.
103103
; NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
104104
; NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
105-
; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
106-
; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
107-
; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
108-
; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
105+
; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
106+
; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
107+
; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
108+
; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
109109
; NVPTX: @[[__OMP_OFFLOADING_14_A34CA11_DO_NOT_SPMDIZE_TARGET_L65_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1
110110
; NVPTX: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x i8*] [i8* @__omp_offloading_14_a34ca11_sequential_loop_l5_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_stack_var_l20_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_l35_exec_mode, i8* @__omp_offloading_14_a34ca11_sequential_loop_to_shared_var_guarded_l50_exec_mode, i8* @__omp_offloading_14_a34ca11_do_not_spmdize_target_l65_exec_mode], section "llvm.metadata"
111111
; NVPTX: @[[X:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 32

llvm/test/Transforms/OpenMP/spmdization_assumes.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ target triple = "nvptx64"
2323
;.
2424
; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
2525
; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
26-
; CHECK: @[[__OMP_OFFLOADING_FD02_404433C2_MAIN_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
26+
; CHECK: @[[__OMP_OFFLOADING_FD02_404433C2_MAIN_L5_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
2727
; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_fd02_404433c2_main_l5_exec_mode], section "llvm.metadata"
2828
;.
2929
define weak void @__omp_offloading_fd02_404433c2_main_l5(double* nonnull align 8 dereferenceable(8) %x) local_unnamed_addr #0 {

llvm/test/Transforms/OpenMP/spmdization_guarding.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ target triple = "nvptx64"
4545
;.
4646
; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
4747
; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
48-
; CHECK: @[[__OMP_OFFLOADING_2A_FBFA7A_SEQUENTIAL_LOOP_L6_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
48+
; CHECK: @[[__OMP_OFFLOADING_2A_FBFA7A_SEQUENTIAL_LOOP_L6_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3
4949
; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_2a_fbfa7a_sequential_loop_l6_exec_mode], section "llvm.metadata"
5050
;.
5151
; CHECK-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"

openmp/libomptarget/plugins/cuda/CMakeLists.txt

+4-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@ libomptarget_say("Building CUDA offloading plugin.")
2222
# Define the suffix for the runtime messaging dumps.
2323
add_definitions(-DTARGET_NAME=CUDA)
2424

25-
include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS})
25+
include_directories(
26+
${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}
27+
${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
28+
)
2629

2730
set(LIBOMPTARGET_DLOPEN_LIBCUDA OFF)
2831
option(LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA "Build with dlopened libcuda" ${LIBOMPTARGET_DLOPEN_LIBCUDA})

openmp/libomptarget/plugins/cuda/src/rtl.cpp

+29-40
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828

2929
#include "MemoryManager.h"
3030

31+
#include "llvm/Frontend/OpenMP/OMPConstants.h"
32+
3133
// Utility for retrieving and printing CUDA error string.
3234
#ifdef OMPTARGET_DEBUG
3335
#define CUDA_ERR_STRING(err) \
@@ -71,28 +73,17 @@ struct FuncOrGblEntryTy {
7173
std::vector<__tgt_offload_entry> Entries;
7274
};
7375

74-
enum ExecutionModeType {
75-
SPMD, // constructors, destructors,
76-
// combined constructs (`teams distribute parallel for [simd]`)
77-
GENERIC, // everything else
78-
SPMD_GENERIC, // Generic kernel with SPMD execution
79-
NONE
80-
};
81-
8276
/// Use a single entity to encode a kernel and a set of flags.
8377
struct KernelTy {
8478
CUfunction Func;
8579

8680
// execution mode of kernel
87-
// 0 - SPMD mode (without master warp)
88-
// 1 - Generic mode (with master warp)
89-
// 2 - SPMD mode execution with Generic mode semantics.
90-
int8_t ExecutionMode;
81+
llvm::omp::OMPTgtExecModeFlags ExecutionMode;
9182

9283
/// Maximal number of threads per block for this kernel.
9384
int MaxThreadsPerBlock = 0;
9485

95-
KernelTy(CUfunction _Func, int8_t _ExecutionMode)
86+
KernelTy(CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
9687
: Func(_Func), ExecutionMode(_ExecutionMode) {}
9788
};
9889

@@ -867,7 +858,7 @@ class DeviceRTLTy {
867858
DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
868859

869860
// default value GENERIC (in case symbol is missing from cubin file)
870-
int8_t ExecModeVal = ExecutionModeType::GENERIC;
861+
llvm::omp::OMPTgtExecModeFlags ExecModeVal;
871862
std::string ExecModeNameStr(E->name);
872863
ExecModeNameStr += "_exec_mode";
873864
const char *ExecModeName = ExecModeNameStr.c_str();
@@ -876,9 +867,9 @@ class DeviceRTLTy {
876867
size_t CUSize;
877868
Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
878869
if (Err == CUDA_SUCCESS) {
879-
if (CUSize != sizeof(int8_t)) {
870+
if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
880871
DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
881-
ExecModeName, CUSize, sizeof(int8_t));
872+
ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
882873
return nullptr;
883874
}
884875

@@ -890,12 +881,6 @@ class DeviceRTLTy {
890881
CUDA_ERR_STRING(Err);
891882
return nullptr;
892883
}
893-
894-
if (ExecModeVal < 0 || ExecModeVal > 2) {
895-
DP("Error wrong exec_mode value specified in cubin file: %d\n",
896-
ExecModeVal);
897-
return nullptr;
898-
}
899884
} else {
900885
DP("Loading global exec_mode '%s' - symbol missing, using default "
901886
"value GENERIC (1)\n",
@@ -1098,12 +1083,19 @@ class DeviceRTLTy {
10981083

10991084
KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
11001085

1086+
const bool IsSPMDGenericMode =
1087+
KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
1088+
const bool IsSPMDMode =
1089+
KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
1090+
const bool IsGenericMode =
1091+
KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
1092+
11011093
int CudaThreadsPerBlock;
11021094
if (ThreadLimit > 0) {
11031095
DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
11041096
CudaThreadsPerBlock = ThreadLimit;
11051097
// Add master warp if necessary
1106-
if (KernelInfo->ExecutionMode == GENERIC) {
1098+
if (IsGenericMode) {
11071099
DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
11081100
CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
11091101
}
@@ -1136,13 +1128,21 @@ class DeviceRTLTy {
11361128
unsigned int CudaBlocksPerGrid;
11371129
if (TeamNum <= 0) {
11381130
if (LoopTripCount > 0 && EnvNumTeams < 0) {
1139-
if (KernelInfo->ExecutionMode == SPMD) {
1131+
if (IsSPMDGenericMode) {
1132+
// If we reach this point, then we are executing a kernel that was
1133+
// transformed from Generic-mode to SPMD-mode. This kernel has
1134+
// SPMD-mode execution, but needs its blocks to be scheduled
1135+
// differently because the current loop trip count only applies to the
1136+
// `teams distribute` region and will create var too few blocks using
1137+
// the regular SPMD-mode method.
1138+
CudaBlocksPerGrid = LoopTripCount;
1139+
} else if (IsSPMDMode) {
11401140
// We have a combined construct, i.e. `target teams distribute
11411141
// parallel for [simd]`. We launch so many teams so that each thread
11421142
// will execute one iteration of the loop. round up to the nearest
11431143
// integer
11441144
CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
1145-
} else if (KernelInfo->ExecutionMode == GENERIC) {
1145+
} else if (IsGenericMode) {
11461146
// If we reach this point, then we have a non-combined construct, i.e.
11471147
// `teams distribute` with a nested `parallel for` and each team is
11481148
// assigned one iteration of the `distribute` loop. E.g.:
@@ -1156,16 +1156,9 @@ class DeviceRTLTy {
11561156
// Threads within a team will execute the iterations of the `parallel`
11571157
// loop.
11581158
CudaBlocksPerGrid = LoopTripCount;
1159-
} else if (KernelInfo->ExecutionMode == SPMD_GENERIC) {
1160-
// If we reach this point, then we are executing a kernel that was
1161-
// transformed from Generic-mode to SPMD-mode. This kernel has
1162-
// SPMD-mode execution, but needs its blocks to be scheduled
1163-
// differently because the current loop trip count only applies to the
1164-
// `teams distribute` region and will create var too few blocks using
1165-
// the regular SPMD-mode method.
1166-
CudaBlocksPerGrid = LoopTripCount;
11671159
} else {
1168-
REPORT("Unknown execution mode: %d\n", KernelInfo->ExecutionMode);
1160+
REPORT("Unknown execution mode: %d\n",
1161+
static_cast<int8_t>(KernelInfo->ExecutionMode));
11691162
return OFFLOAD_FAIL;
11701163
}
11711164
DP("Using %d teams due to loop trip count %" PRIu32
@@ -1185,16 +1178,12 @@ class DeviceRTLTy {
11851178
}
11861179

11871180
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
1188-
"Launching kernel %s with %d blocks and %d threads in %s "
1189-
"mode\n",
1181+
"Launching kernel %s with %d blocks and %d threads in %s mode\n",
11901182
(getOffloadEntry(DeviceId, TgtEntryPtr))
11911183
? getOffloadEntry(DeviceId, TgtEntryPtr)->name
11921184
: "(null)",
11931185
CudaBlocksPerGrid, CudaThreadsPerBlock,
1194-
(KernelInfo->ExecutionMode != SPMD
1195-
? (KernelInfo->ExecutionMode == GENERIC ? "Generic"
1196-
: "SPMD-Generic")
1197-
: "SPMD"));
1186+
(!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
11981187

11991188
CUstream Stream = getStream(DeviceId, AsyncInfo);
12001189
Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,

0 commit comments

Comments
 (0)