Skip to content

Commit e95d757

Browse files
jfedorovmaaswani
andauthored
Fixes overlapping of kernels on device, fixes few data races in zeCollector (intel#57)
* initial rev: GetHostTimer changes * fixes device kernel timestamps for none-immediate command list Signed-off-by: jfedorov <[email protected]> * improves dpc_gemm_multithreaded test Signed-off-by: jfedorov <[email protected]> * removes irrelevent "result" parameter at OnEnter.. call-backs Signed-off-by: jfedorov <[email protected]> * adds command_queue to its info map Signed-off-by: jfedorov <[email protected]> * completes the fix for ensuring no kernels overlaps on same device, finishes test for it, cleans up of ze collector and dpc_gemm_threaded test Signed-off-by: jfedorov <[email protected]> * protects with shared_mutext command_list_map_, dev_uuid_map_ in zeCollector, Signed-off-by: jfedorov <[email protected]> --------- Signed-off-by: jfedorov <[email protected]> Co-authored-by: Aswani, Mahesh <[email protected]>
1 parent 1f6e151 commit e95d757

File tree

6 files changed

+670
-220
lines changed

6 files changed

+670
-220
lines changed

sdk/samples/dpc_gemm_threaded/main.cc

+56-32
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
#include <string.h>
1212

13-
#include <CL/sycl.hpp>
13+
#include <sycl/sycl.hpp>
1414
#include <cstdlib>
1515
#include <memory>
1616
#include <thread>
@@ -24,6 +24,8 @@
2424
#define B_VALUE 0.256f
2525
#define MAX_EPS 1.0e-4f
2626

27+
static bool verbose = false;
28+
2729
static float Check(const std::vector<float>& a, float value) {
2830
assert(value > MAX_EPS);
2931

@@ -86,7 +88,9 @@ static float RunAndCheck(sycl::queue queue, const std::vector<float>& a,
8688
throw;
8789
}
8890

89-
std::cout << "\tMatrix multiplication time: " << time << " sec" << std::endl;
91+
if (verbose) {
92+
std::cout << "\tMatrix multiplication time: " << time << " sec" << std::endl;
93+
}
9094

9195
return Check(c, expected_result);
9296
}
@@ -97,8 +101,10 @@ static void Compute(sycl::queue queue, const std::vector<float>& a,
97101
float expected_result) {
98102
for (unsigned i = 0; i < repeat_count; ++i) {
99103
float eps = RunAndCheck(queue, a, b, c, size, expected_result);
100-
std::cout << "Results are " << ((eps < MAX_EPS) ? "" : "IN")
104+
if (verbose) {
105+
std::cout << "Results are " << ((eps < MAX_EPS) ? "" : "IN")
101106
<< "CORRECT with accuracy: " << eps << std::endl;
107+
}
102108
}
103109
}
104110

@@ -122,45 +128,61 @@ const unsigned max_thread_count = 64;
122128
const unsigned max_size = 8192;
123129
const unsigned min_size = 32;
124130

131+
const unsigned default_size = 1024;
132+
const unsigned default_thread_count = 2;
133+
const unsigned default_repetition_per_thread = 4;
134+
125135
void Usage(const char* name) {
126136

127-
std::cout << " Calculating floating point matrix multiply on gpu, submitting the work from many CPU threads\n";
128-
std::cout << name << " [ [number of threads, default=2, max=" << max_thread_count
129-
<< "], [matrix size, default=1024, max=" << max_size << "], [repetition count, default=4]] \n";
137+
std::cout << " Calculating floating point matrix multiply on gpu, submitting the work from many CPU threads\n"
138+
<< " Usage " << name << " [ options ]" << std::endl;
139+
std::cout <<
140+
"--threads [-t] integer " <<
141+
"Threads number, default: " << default_thread_count << std::endl;
142+
std::cout <<
143+
"--size [-s] integer " <<
144+
"Matrix size, default: " << default_size << std::endl;
145+
std::cout <<
146+
"--repeat [-r] integer " <<
147+
"Repetition number per thread, default: " << default_repetition_per_thread << std::endl;
148+
std::cout <<
149+
"--verbose [-v] " <<
150+
"Enable verbose mode to report the app progress, default: off" << std::endl;
130151
}
131152

132153
int main(int argc, char* argv[]) {
133154

134155
int exit_code = EXIT_SUCCESS;
135-
unsigned thread_count = 2;
136-
unsigned repeat_count = 4;
137-
unsigned size = 1024;
138-
139-
if (argc == 2 &&
140-
( strcmp(argv[1], "-?") == 0 or strcmp(argv[1], "-h") == 0 or strcmp(argv[1], "--help" ) == 0) ){
141-
Usage(argv[0]);
142-
return EXIT_SUCCESS;
143-
}
156+
unsigned thread_count = default_thread_count;
157+
unsigned repeat_count = default_repetition_per_thread;
158+
unsigned size = default_size;
144159

145160
try {
146161
unsigned temp;
147-
if (argc > 1) {
148-
temp = std::stoul(argv[1]);
149-
thread_count = (temp < 1) ? 1 :
150-
(temp > max_thread_count) ? max_thread_count : temp;
151-
}
152-
if (argc > 2) {
153-
temp = std::stoul(argv[2]);
154-
size = (temp < min_size) ? min_size :
155-
(temp > max_size) ? max_size : temp;
156-
}
157-
158-
if (argc > 3) {
159-
temp = std::stoul(argv[3]);
160-
repeat_count = (temp < 1) ? 1 : temp;
162+
for (uint32_t i=1; i < argc; i++) {
163+
if (strcmp(argv[i], "-s" ) == 0 || strcmp(argv[i], "--size") == 0 ){
164+
i++;
165+
temp = std::stoul(argv[i]);
166+
size = (temp < min_size) ? min_size : (temp > max_size) ? max_size : temp;
167+
} else if (strcmp(argv[i], "-t" ) == 0 || strcmp(argv[i], "--threads") == 0 ){
168+
i++;
169+
temp = std::stoul(argv[i]);
170+
thread_count = (temp < 1) ? 1 : (temp > max_thread_count) ? max_thread_count : temp;
171+
} else if (strcmp(argv[i], "-r" ) == 0 || strcmp(argv[i], "--repeat") == 0 ){
172+
i++;
173+
temp = std::stoul(argv[i]);
174+
repeat_count = (temp < 1) ? 1 : temp;
175+
} else if (strcmp(argv[i], "-v" ) == 0 || strcmp(argv[i], "--verbose") == 0 ){
176+
// verbosity off makes minimal the sample self output -
177+
// so profiling output won't be intermixed with the sample output
178+
// and could be analyzed by tests
179+
verbose = true;
180+
} else {
181+
Usage(argv[0]);
182+
return EXIT_SUCCESS;
183+
}
161184
}
162185
}
163-
164186
catch(...) {
165187
Usage(argv[0]);
166188
return EXIT_FAILURE;
@@ -328,15 +350,17 @@ int main(int argc, char* argv[]) {
328350
auto end = std::chrono::steady_clock::now();
329351
std::chrono::duration<float> time = end - start;
330352

331-
std::cout << "\t-- Total execution time: " << time.count() << " sec" << std::endl;
353+
if (verbose) {
354+
std::cout << "\t-- Total execution time: " << time.count() << " sec" << std::endl;
355+
}
332356
};
333357

334358
std::cout << "DPC++ Matrix Multiplication (CPU threads: " << thread_count << ", matrix size: " << size << " x "
335359
<< size << ", repeats: " << repeat_count << " times)" << std::endl;
336360
std::cout << "Target device: "
337361
<< queue.get_info<sycl::info::queue::device>()
338362
.get_info<sycl::info::device::name>()
339-
<< std::endl;
363+
<< std::endl << std::flush;
340364

341365
std::vector<std::thread> the_threads;
342366
for (unsigned i=0; i<thread_count; i++) {

sdk/src/levelzero/gen_tracing_callbacks.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -366,15 +366,15 @@ def gen_enter_callback(f, func, command_list_func_list, command_queue_func_list,
366366
if (cb != ""):
367367
f.write(" if (collector->options_.kernel_tracing) { \n")
368368
if (func in synchronize_func_list):
369-
f.write(" " + cb + "(params, result, global_data, instance_user_data, &kids); \n")
369+
f.write(" " + cb + "(params, global_data, instance_user_data, &kids); \n")
370370
f.write(" if (kids.size() != 0) {\n")
371371
f.write(" ze_instance_data.kid = kids[0];\n") # pass kid to the exit callback
372372
f.write(" }\n")
373373
f.write(" else {\n")
374374
f.write(" ze_instance_data.kid = (uint64_t)(-1);\n")
375375
f.write(" }\n")
376376
else:
377-
f.write(" " + cb + "(params, result, global_data, instance_user_data); \n")
377+
f.write(" " + cb + "(params, global_data, instance_user_data); \n")
378378
f.write(" }\n")
379379
f.write("\n")
380380
f.write("\n")

0 commit comments

Comments
 (0)