28
28
29
29
#include " MemoryManager.h"
30
30
31
+ #include " llvm/Frontend/OpenMP/OMPConstants.h"
32
+
31
33
// Utility for retrieving and printing CUDA error string.
32
34
#ifdef OMPTARGET_DEBUG
33
35
#define CUDA_ERR_STRING (err ) \
@@ -71,28 +73,17 @@ struct FuncOrGblEntryTy {
71
73
std::vector<__tgt_offload_entry> Entries;
72
74
};
73
75
74
- enum ExecutionModeType {
75
- SPMD, // constructors, destructors,
76
- // combined constructs (`teams distribute parallel for [simd]`)
77
- GENERIC, // everything else
78
- SPMD_GENERIC, // Generic kernel with SPMD execution
79
- NONE
80
- };
81
-
82
76
/// Use a single entity to encode a kernel and a set of flags.
83
77
struct KernelTy {
84
78
CUfunction Func;
85
79
86
80
// execution mode of kernel
87
- // 0 - SPMD mode (without master warp)
88
- // 1 - Generic mode (with master warp)
89
- // 2 - SPMD mode execution with Generic mode semantics.
90
- int8_t ExecutionMode;
81
+ llvm::omp::OMPTgtExecModeFlags ExecutionMode;
91
82
92
83
/// Maximal number of threads per block for this kernel.
93
84
int MaxThreadsPerBlock = 0 ;
94
85
95
- KernelTy (CUfunction _Func, int8_t _ExecutionMode)
86
+ KernelTy (CUfunction _Func, llvm::omp::OMPTgtExecModeFlags _ExecutionMode)
96
87
: Func(_Func), ExecutionMode(_ExecutionMode) {}
97
88
};
98
89
@@ -867,7 +858,7 @@ class DeviceRTLTy {
867
858
DPxPTR (E - HostBegin), E->name , DPxPTR (Func));
868
859
869
860
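// Intended default value is GENERIC (in case symbol is missing from cubin file).
// NOTE(review): the new declaration below has no initializer, so unless the
// value is assigned elsewhere the GENERIC default is not actually applied when
// the symbol is missing - confirm and initialize explicitly.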
870
- int8_t ExecModeVal = ExecutionModeType::GENERIC ;
861
+ llvm::omp::OMPTgtExecModeFlags ExecModeVal ;
871
862
std::string ExecModeNameStr (E->name );
872
863
ExecModeNameStr += " _exec_mode" ;
873
864
const char *ExecModeName = ExecModeNameStr.c_str ();
@@ -876,9 +867,9 @@ class DeviceRTLTy {
876
867
size_t CUSize;
877
868
Err = cuModuleGetGlobal (&ExecModePtr, &CUSize, Module, ExecModeName);
878
869
if (Err == CUDA_SUCCESS) {
879
- if (CUSize != sizeof (int8_t )) {
870
+ if (CUSize != sizeof (llvm::omp::OMPTgtExecModeFlags )) {
880
871
DP (" Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n " ,
881
- ExecModeName, CUSize, sizeof (int8_t ));
872
+ ExecModeName, CUSize, sizeof (llvm::omp::OMPTgtExecModeFlags ));
882
873
return nullptr ;
883
874
}
884
875
@@ -890,12 +881,6 @@ class DeviceRTLTy {
890
881
CUDA_ERR_STRING (Err);
891
882
return nullptr ;
892
883
}
893
-
894
- if (ExecModeVal < 0 || ExecModeVal > 2 ) {
895
- DP (" Error wrong exec_mode value specified in cubin file: %d\n " ,
896
- ExecModeVal);
897
- return nullptr ;
898
- }
899
884
} else {
900
885
DP (" Loading global exec_mode '%s' - symbol missing, using default "
901
886
" value GENERIC (1)\n " ,
@@ -1098,12 +1083,19 @@ class DeviceRTLTy {
1098
1083
1099
1084
KernelTy *KernelInfo = reinterpret_cast <KernelTy *>(TgtEntryPtr);
1100
1085
1086
+ const bool IsSPMDGenericMode =
1087
+ KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
1088
+ const bool IsSPMDMode =
1089
+ KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
1090
+ const bool IsGenericMode =
1091
+ KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
1092
+
1101
1093
int CudaThreadsPerBlock;
1102
1094
if (ThreadLimit > 0 ) {
1103
1095
DP (" Setting CUDA threads per block to requested %d\n " , ThreadLimit);
1104
1096
CudaThreadsPerBlock = ThreadLimit;
1105
1097
// Add master warp if necessary
1106
- if (KernelInfo-> ExecutionMode == GENERIC ) {
1098
+ if (IsGenericMode ) {
1107
1099
DP (" Adding master warp: +%d threads\n " , DeviceData[DeviceId].WarpSize );
1108
1100
CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize ;
1109
1101
}
@@ -1136,13 +1128,21 @@ class DeviceRTLTy {
1136
1128
unsigned int CudaBlocksPerGrid;
1137
1129
if (TeamNum <= 0 ) {
1138
1130
if (LoopTripCount > 0 && EnvNumTeams < 0 ) {
1139
- if (KernelInfo->ExecutionMode == SPMD) {
1131
+ if (IsSPMDGenericMode) {
1132
+ // If we reach this point, then we are executing a kernel that was
1133
+ // transformed from Generic-mode to SPMD-mode. This kernel has
1134
+ // SPMD-mode execution, but needs its blocks to be scheduled
1135
+ // differently because the current loop trip count only applies to the
1136
+ // `teams distribute` region and will create var too few blocks using
1137
+ // the regular SPMD-mode method.
1138
+ CudaBlocksPerGrid = LoopTripCount;
1139
+ } else if (IsSPMDMode) {
1140
1140
// We have a combined construct, i.e. `target teams distribute
1141
1141
// parallel for [simd]`. We launch so many teams so that each thread
1142
1142
// will execute one iteration of the loop. Round up to the nearest
1143
1143
// integer
1144
1144
CudaBlocksPerGrid = ((LoopTripCount - 1 ) / CudaThreadsPerBlock) + 1 ;
1145
- } else if (KernelInfo-> ExecutionMode == GENERIC ) {
1145
+ } else if (IsGenericMode ) {
1146
1146
// If we reach this point, then we have a non-combined construct, i.e.
1147
1147
// `teams distribute` with a nested `parallel for` and each team is
1148
1148
// assigned one iteration of the `distribute` loop. E.g.:
@@ -1156,16 +1156,9 @@ class DeviceRTLTy {
1156
1156
// Threads within a team will execute the iterations of the `parallel`
1157
1157
// loop.
1158
1158
CudaBlocksPerGrid = LoopTripCount;
1159
- } else if (KernelInfo->ExecutionMode == SPMD_GENERIC) {
1160
- // If we reach this point, then we are executing a kernel that was
1161
- // transformed from Generic-mode to SPMD-mode. This kernel has
1162
- // SPMD-mode execution, but needs its blocks to be scheduled
1163
- // differently because the current loop trip count only applies to the
1164
- // `teams distribute` region and will create var too few blocks using
1165
- // the regular SPMD-mode method.
1166
- CudaBlocksPerGrid = LoopTripCount;
1167
1159
} else {
1168
- REPORT (" Unknown execution mode: %d\n " , KernelInfo->ExecutionMode );
1160
+ REPORT (" Unknown execution mode: %d\n " ,
1161
+ static_cast <int8_t >(KernelInfo->ExecutionMode ));
1169
1162
return OFFLOAD_FAIL;
1170
1163
}
1171
1164
DP (" Using %d teams due to loop trip count %" PRIu32
@@ -1185,16 +1178,12 @@ class DeviceRTLTy {
1185
1178
}
1186
1179
1187
1180
INFO (OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
1188
- " Launching kernel %s with %d blocks and %d threads in %s "
1189
- " mode\n " ,
1181
+ " Launching kernel %s with %d blocks and %d threads in %s mode\n " ,
1190
1182
(getOffloadEntry (DeviceId, TgtEntryPtr))
1191
1183
? getOffloadEntry (DeviceId, TgtEntryPtr)->name
1192
1184
: " (null)" ,
1193
1185
CudaBlocksPerGrid, CudaThreadsPerBlock,
1194
- (KernelInfo->ExecutionMode != SPMD
1195
- ? (KernelInfo->ExecutionMode == GENERIC ? " Generic"
1196
- : " SPMD-Generic" )
1197
- : " SPMD" ));
1186
+ (!IsSPMDMode ? (IsGenericMode ? " Generic" : " SPMD-Generic" ) : " SPMD" ));
1198
1187
1199
1188
CUstream Stream = getStream (DeviceId, AsyncInfo);
1200
1189
Err = cuLaunchKernel (KernelInfo->Func , CudaBlocksPerGrid, /* gridDimY */ 1 ,
0 commit comments