27
27
#include < map>
28
28
#include < memory>
29
29
#include < mutex>
30
- #include < shared_mutex>
31
30
#include < set>
31
+ #include < shared_mutex>
32
32
#include < sstream>
33
33
#include < string>
34
34
#include < vector>
@@ -216,8 +216,8 @@ struct ZeMetricQueryPools {
216
216
217
217
struct ZeInstanceData {
218
218
uint64_t start_time_host;
219
- uint64_t timestamp_host; // in ns
220
- uint64_t timestamp_device; // in ticks
219
+ uint64_t timestamp_host; // in ns
220
+ uint64_t timestamp_device; // in ticks
221
221
uint64_t end_time_host;
222
222
uint64_t kid; // passing kid from enter callback to exit callback
223
223
};
@@ -300,8 +300,8 @@ struct ZeKernelCommand {
300
300
ze_command_list_handle_t command_list = nullptr ;
301
301
ze_command_queue_handle_t queue = nullptr ;
302
302
ze_fence_handle_t fence;
303
- uint64_t submit_time = 0 ; // in ns
304
- uint64_t submit_time_device_ = 0 ; // in ticks
303
+ uint64_t submit_time = 0 ; // in ns
304
+ uint64_t submit_time_device_ = 0 ; // in ticks
305
305
uint64_t tid = 0 ;
306
306
uint64_t sycl_node_id_ = 0 ;
307
307
uint32_t sycl_invocation_id_ = 0 ;
@@ -1420,8 +1420,8 @@ overhead::Init();
1420
1420
1421
1421
// Converts a device timer tick count into nanoseconds.
//   cycles - number of device timer ticks
//   freq   - device timer frequency (presumably ticks per second, given the
//            NSEC_IN_SEC scaling); must be non-zero
// Whole multiples of freq and the remaining ticks are scaled separately so the
// intermediate product does not wrap around 64 bits for large tick counts, as a
// direct `cycles * NSEC_IN_SEC` can. The value returned equals
// cycles * NSEC_IN_SEC / freq whenever that direct product fits in 64 bits.
// NOTE(review): the remainder term still assumes freq * NSEC_IN_SEC fits in
// 64 bits - confirm for the timer frequencies in use.
uint64_t GetDeviceTimeNs(uint64_t cycles, uint64_t freq) {
  const uint64_t whole_periods = cycles / freq;
  const uint64_t leftover_ticks = cycles % freq;
  return whole_periods * NSEC_IN_SEC + leftover_ticks * NSEC_IN_SEC / freq;
}
1422
1422
1423
- inline void GetHostTime (const ZeKernelCommand * command, const ze_kernel_timestamp_result_t & ts,
1424
- uint64_t & start, uint64_t & end) {
1423
+ inline void GetHostTime (const ZeKernelCommand* command, const ze_kernel_timestamp_result_t & ts,
1424
+ uint64_t & start, uint64_t & end) {
1425
1425
uint64_t device_freq = command->device_timer_frequency_ ;
1426
1426
uint64_t device_mask = command->device_timer_mask_ ;
1427
1427
@@ -1441,7 +1441,8 @@ overhead::Init();
1441
1441
// - at Enter to CommandListAppendLaunch<...> time for an Immediate Command List
1442
1442
// - at Enter to CommandQueueExecuteCommandLists for not Immediate CommandLists
1443
1443
1444
- // GPU time mask applied to the GPU time to remove some spiritous bits (in case they made there)
1444
+ // GPU time mask applied to the GPU time to remove some spurious bits (in case they made
1445
+ // it there)
1445
1446
uint64_t device_submit_time = (command->submit_time_device_ & device_mask);
1446
1447
1447
1448
// time_shift calculated in GPU scale between sync point and GPU command start,
@@ -1450,10 +1451,10 @@ overhead::Init();
1450
1451
1451
1452
if (device_start > device_submit_time) {
1452
1453
time_shift = (device_start - device_submit_time) * NSEC_IN_SEC / device_freq;
1453
- }
1454
- else {
1454
+ } else {
1455
1455
// overflow
1456
- time_shift = (device_mask - device_submit_time + 1 + device_start) * NSEC_IN_SEC / device_freq;
1456
+ time_shift =
1457
+ (device_mask - device_submit_time + 1 + device_start) * NSEC_IN_SEC / device_freq;
1457
1458
}
1458
1459
1459
1460
// GPU command duration recalculated to CPU time scale units
@@ -1841,8 +1842,8 @@ overhead::Init();
1841
1842
1842
1843
PTI_ASSERT (device_descriptors_.count (device) != 0 );
1843
1844
1844
- command_list_map_[command_list] = {
1845
- std::vector<ZeKernelCommand*>(), context, device, immediate, oi_pair};
1845
+ command_list_map_[command_list] = {std::vector<ZeKernelCommand*>(), context, device, immediate,
1846
+ oi_pair};
1846
1847
command_list_map_mutex_.unlock ();
1847
1848
1848
1849
if (immediate) {
@@ -1893,19 +1894,20 @@ overhead::Init();
1893
1894
}
1894
1895
#endif /* 0 */
1895
1896
1896
- void PrepareToExecuteCommandLists (ze_command_list_handle_t * command_lists, uint32_t command_list_count,
1897
- ze_command_queue_handle_t queue, ze_fence_handle_t fence) {
1897
+ void PrepareToExecuteCommandLists (ze_command_list_handle_t * command_lists,
1898
+ uint32_t command_list_count, ze_command_queue_handle_t queue,
1899
+ ze_fence_handle_t fence) {
1898
1900
const std::lock_guard<std::mutex> lock (lock_);
1899
1901
uint64_t host_time_sync = 0 ;
1900
1902
uint64_t device_time_sync = 0 ;
1901
1903
auto it = command_queues_.find (queue);
1902
1904
PTI_ASSERT (it != command_queues_.end ());
1903
- ze_device_handle_t device = it->second .device_ ; // this should be only one device, as queue created on specific device
1905
+ ze_device_handle_t device =
1906
+ it->second .device_ ; // this should be only one device, as queue created on specific device
1904
1907
PTI_ASSERT (nullptr != device);
1905
1908
ze_result_t status = zeDeviceGetGlobalTimestamps (device, &host_time_sync, &device_time_sync);
1906
1909
PTI_ASSERT (status == ZE_RESULT_SUCCESS);
1907
1910
1908
-
1909
1911
for (uint32_t i = 0 ; i < command_list_count; ++i) {
1910
1912
ze_command_list_handle_t clist = command_lists[i];
1911
1913
PTI_ASSERT (clist != nullptr );
@@ -1918,23 +1920,22 @@ overhead::Init();
1918
1920
for (ZeKernelCommand* command : info.kernel_commands ) {
1919
1921
if (!command->tid ) command->tid = utils::GetTid ();
1920
1922
command->queue = queue;
1921
- // command->submit_time = host_sync;
1923
+ // command->submit_time = host_sync;
1922
1924
command->submit_time = host_time_sync;
1923
1925
command->submit_time_device_ = device_time_sync;
1924
1926
1925
1927
PTI_ASSERT (command->append_time <= command->submit_time );
1926
1928
command->fence = fence;
1927
1929
1928
- // if (queue_ordinal_index_map_.count(queue) != 0) {
1929
- // std::pair<uint32_t, uint32_t> oi = queue_ordinal_index_map_[queue];
1930
- // }
1931
-
1930
+ // if (queue_ordinal_index_map_.count(queue) != 0) {
1931
+ // std::pair<uint32_t, uint32_t> oi = queue_ordinal_index_map_[queue];
1932
+ // }
1932
1933
}
1933
1934
}
1934
1935
}
1935
1936
1936
- void PostSubmitKernelCommands (ze_command_list_handle_t * command_lists, uint32_t command_list_count,
1937
- std::vector<uint64_t >* kids) {
1937
+ void PostSubmitKernelCommands (ze_command_list_handle_t * command_lists,
1938
+ uint32_t command_list_count, std::vector<uint64_t >* kids) {
1938
1939
const std::lock_guard<std::mutex> lock (lock_);
1939
1940
1940
1941
for (uint32_t i = 0 ; i < command_list_count; ++i) {
@@ -2048,9 +2049,8 @@ overhead::Init();
2048
2049
}
2049
2050
}
2050
2051
2051
- static void OnEnterEventDestroy (ze_event_destroy_params_t * params,
2052
- void * global_data, void ** /* instance_data*/ ,
2053
- std::vector<uint64_t >* kids) {
2052
+ static void OnEnterEventDestroy (ze_event_destroy_params_t * params, void * global_data,
2053
+ void ** /* instance_data*/ , std::vector<uint64_t >* kids) {
2054
2054
if (*(params->phEvent ) != nullptr ) {
2055
2055
ZeCollector* collector = reinterpret_cast <ZeCollector*>(global_data);
2056
2056
std::vector<ZeKernelCommandExecutionRecord> kcexec;
@@ -2067,9 +2067,8 @@ overhead::Init();
2067
2067
}
2068
2068
}
2069
2069
2070
- static void OnEnterEventHostReset (ze_event_host_reset_params_t * params,
2071
- void * global_data, void ** /* instance_data*/ ,
2072
- std::vector<uint64_t >* kids) {
2070
+ static void OnEnterEventHostReset (ze_event_host_reset_params_t * params, void * global_data,
2071
+ void ** /* instance_data*/ , std::vector<uint64_t >* kids) {
2073
2072
if (*(params->phEvent ) != nullptr ) {
2074
2073
ZeCollector* collector = reinterpret_cast <ZeCollector*>(global_data);
2075
2074
std::vector<ZeKernelCommandExecutionRecord> kcexec;
@@ -2209,7 +2208,6 @@ overhead::Init();
2209
2208
2210
2209
zet_metric_query_handle_t query = nullptr ;
2211
2210
if (collector->options_ .metric_query && iskernel) {
2212
-
2213
2211
const auto it = collector->FindCommandListInfo (command_list);
2214
2212
PTI_ASSERT (it != collector->command_list_map_ .end ());
2215
2213
@@ -2229,7 +2227,7 @@ overhead::Init();
2229
2227
PTI_ASSERT (status == ZE_RESULT_SUCCESS);
2230
2228
}
2231
2229
uint64_t host_timestamp;
2232
- uint64_t device_timestamp; // in ticks
2230
+ uint64_t device_timestamp; // in ticks
2233
2231
2234
2232
ze_result_t status = zeDeviceGetGlobalTimestamps (device, &host_timestamp, &device_timestamp);
2235
2233
PTI_ASSERT (status == ZE_RESULT_SUCCESS);
@@ -2331,7 +2329,8 @@ overhead::Init();
2331
2329
if (command_list_info.immediate ) {
2332
2330
// command->tid = utils::GetTid();
2333
2331
command->submit_time = command->append_time ;
2334
- command->submit_time_device_ = ze_instance_data.timestamp_device ; // append time and submit time are the same
2332
+ command->submit_time_device_ =
2333
+ ze_instance_data.timestamp_device ; // append time and submit time are the same
2335
2334
command->queue = reinterpret_cast <ze_command_queue_handle_t >(command_list);
2336
2335
kernel_command_list_.push_back (command);
2337
2336
kids->push_back (command->kernel_id );
@@ -2424,8 +2423,7 @@ overhead::Init();
2424
2423
// in below GetTransferProperties => so twice for src and dst.
2425
2424
// this should be avoided
2426
2425
if (dst != nullptr ) {
2427
- ze_result_t status =
2428
- zeMemGetAllocProperties (context, dst, &mem_props, &dst_device);
2426
+ ze_result_t status = zeMemGetAllocProperties (context, dst, &mem_props, &dst_device);
2429
2427
PTI_ASSERT (status == ZE_RESULT_SUCCESS);
2430
2428
if (dst_device) {
2431
2429
ze_result_t status = zeDeviceGetProperties (dst_device, &dev_props);
@@ -2434,8 +2432,7 @@ overhead::Init();
2434
2432
}
2435
2433
}
2436
2434
if (src != nullptr ) {
2437
- ze_result_t status =
2438
- zeMemGetAllocProperties (context, src, &mem_props, &src_device);
2435
+ ze_result_t status = zeMemGetAllocProperties (context, src, &mem_props, &src_device);
2439
2436
PTI_ASSERT (status == ZE_RESULT_SUCCESS);
2440
2437
if (src_device) {
2441
2438
ze_result_t status = zeDeviceGetProperties (src_device, &dev_props);
@@ -2651,7 +2648,7 @@ overhead::Init();
2651
2648
props.value_size = pattern_size;
2652
2649
props.type = KERNEL_COMMAND_TYPE_MEMORY;
2653
2650
props.src_device = hSrcDevice;
2654
- props.src_device = hDstDevice;
2651
+ props.dst_device = hDstDevice;
2655
2652
return props;
2656
2653
}
2657
2654
@@ -2693,8 +2690,8 @@ overhead::Init();
2693
2690
}
2694
2691
2695
2692
static void OnEnterCommandListAppendLaunchCooperativeKernel (
2696
- ze_command_list_append_launch_cooperative_kernel_params_t * params,
2697
- void * global_data, void * * instance_data) {
2693
+ ze_command_list_append_launch_cooperative_kernel_params_t * params, void * global_data,
2694
+ void ** instance_data) {
2698
2695
if (UniController::IsCollectionEnabled ()) {
2699
2696
ZeCollector* collector = reinterpret_cast <ZeCollector*>(global_data);
2700
2697
zet_metric_query_handle_t query = PrepareToAppendKernelCommand (
@@ -2721,8 +2718,8 @@ overhead::Init();
2721
2718
}
2722
2719
2723
2720
static void OnEnterCommandListAppendLaunchKernelIndirect (
2724
- ze_command_list_append_launch_kernel_indirect_params_t * params,
2725
- void * global_data, void * * instance_data) {
2721
+ ze_command_list_append_launch_kernel_indirect_params_t * params, void * global_data,
2722
+ void ** instance_data) {
2726
2723
if (UniController::IsCollectionEnabled ()) {
2727
2724
ZeCollector* collector = reinterpret_cast <ZeCollector*>(global_data);
2728
2725
zet_metric_query_handle_t query = PrepareToAppendKernelCommand (
@@ -2833,8 +2830,8 @@ overhead::Init();
2833
2830
}
2834
2831
2835
2832
static void OnEnterCommandListAppendMemoryRangesBarrier (
2836
- ze_command_list_append_memory_ranges_barrier_params_t * params,
2837
- void * global_data, void * * instance_data) {
2833
+ ze_command_list_append_memory_ranges_barrier_params_t * params, void * global_data,
2834
+ void ** instance_data) {
2838
2835
if (UniController::IsCollectionEnabled ()) {
2839
2836
ZeCollector* collector = reinterpret_cast <ZeCollector*>(global_data);
2840
2837
zet_metric_query_handle_t query = PrepareToAppendKernelCommand (
@@ -2861,8 +2858,8 @@ overhead::Init();
2861
2858
}
2862
2859
2863
2860
static void OnEnterCommandListAppendMemoryCopyRegion (
2864
- ze_command_list_append_memory_copy_region_params_t * params,
2865
- void * global_data, void * * instance_data) {
2861
+ ze_command_list_append_memory_copy_region_params_t * params, void * global_data,
2862
+ void ** instance_data) {
2866
2863
if (UniController::IsCollectionEnabled ()) {
2867
2864
ZeCollector* collector = reinterpret_cast <ZeCollector*>(global_data);
2868
2865
zet_metric_query_handle_t query = PrepareToAppendKernelCommand (
@@ -2900,8 +2897,8 @@ overhead::Init();
2900
2897
}
2901
2898
2902
2899
static void OnEnterCommandListAppendMemoryCopyFromContext (
2903
- ze_command_list_append_memory_copy_from_context_params_t * params,
2904
- void * global_data, void * * instance_data) {
2900
+ ze_command_list_append_memory_copy_from_context_params_t * params, void * global_data,
2901
+ void ** instance_data) {
2905
2902
if (UniController::IsCollectionEnabled ()) {
2906
2903
ZeCollector* collector = reinterpret_cast <ZeCollector*>(global_data);
2907
2904
zet_metric_query_handle_t query = PrepareToAppendKernelCommand (
@@ -2958,8 +2955,8 @@ overhead::Init();
2958
2955
}
2959
2956
2960
2957
static void OnEnterCommandListAppendImageCopyRegion (
2961
- ze_command_list_append_image_copy_region_params_t * params,
2962
- void * global_data, void * * instance_data) {
2958
+ ze_command_list_append_image_copy_region_params_t * params, void * global_data,
2959
+ void ** instance_data) {
2963
2960
if (UniController::IsCollectionEnabled ()) {
2964
2961
ZeCollector* collector = reinterpret_cast <ZeCollector*>(global_data);
2965
2962
zet_metric_query_handle_t query = PrepareToAppendKernelCommand (
@@ -2986,8 +2983,8 @@ overhead::Init();
2986
2983
}
2987
2984
2988
2985
static void OnEnterCommandListAppendImageCopyToMemory (
2989
- ze_command_list_append_image_copy_to_memory_params_t * params,
2990
- void * global_data, void * * instance_data) {
2986
+ ze_command_list_append_image_copy_to_memory_params_t * params, void * global_data,
2987
+ void ** instance_data) {
2991
2988
if (UniController::IsCollectionEnabled ()) {
2992
2989
ZeCollector* collector = reinterpret_cast <ZeCollector*>(global_data);
2993
2990
zet_metric_query_handle_t query = PrepareToAppendKernelCommand (
@@ -3015,8 +3012,8 @@ overhead::Init();
3015
3012
}
3016
3013
3017
3014
static void OnEnterCommandListAppendImageCopyFromMemory (
3018
- ze_command_list_append_image_copy_from_memory_params_t * params,
3019
- void * global_data, void * * instance_data) {
3015
+ ze_command_list_append_image_copy_from_memory_params_t * params, void * global_data,
3016
+ void ** instance_data) {
3020
3017
if (UniController::IsCollectionEnabled ()) {
3021
3018
ZeCollector* collector = reinterpret_cast <ZeCollector*>(global_data);
3022
3019
zet_metric_query_handle_t query = PrepareToAppendKernelCommand (
@@ -3136,8 +3133,8 @@ overhead::Init();
3136
3133
}
3137
3134
3138
3135
static void OnEnterCommandQueueExecuteCommandLists (
3139
- ze_command_queue_execute_command_lists_params_t * params,
3140
- void * global_data, void * * /* instance_data*/ ) {
3136
+ ze_command_queue_execute_command_lists_params_t * params, void * global_data,
3137
+ void ** /* instance_data*/ ) {
3141
3138
ZeCollector* collector = reinterpret_cast <ZeCollector*>(global_data);
3142
3139
3143
3140
if (UniController::IsCollectionEnabled ()) {
@@ -3152,7 +3149,7 @@ overhead::Init();
3152
3149
}
3153
3150
3154
3151
collector->PrepareToExecuteCommandLists (command_lists, command_list_count,
3155
- *(params->phCommandQueue ), *(params->phFence ));
3152
+ *(params->phCommandQueue ), *(params->phFence ));
3156
3153
}
3157
3154
}
3158
3155
static void OnExitCommandQueueExecuteCommandLists (
@@ -3177,7 +3174,6 @@ overhead::Init();
3177
3174
}
3178
3175
}
3179
3176
3180
-
3181
3177
static void OnExitCommandQueueSynchronize (ze_command_queue_synchronize_params_t * /* params*/ ,
3182
3178
ze_result_t result, void * global_data,
3183
3179
void ** /* instance_data*/ , std::vector<uint64_t >* kids) {
@@ -3225,7 +3221,7 @@ overhead::Init();
3225
3221
desc.context_ = *(params->phContext );
3226
3222
desc.device_ = *device;
3227
3223
desc.engine_ordinal_ = queue_desc->ordinal ;
3228
- desc.engine_index_ = queue_desc->index ;;
3224
+ desc.engine_index_ = queue_desc->index ;
3229
3225
3230
3226
collector->command_queues_ .erase (*command_queue);
3231
3227
collector->command_queues_ .insert ({*command_queue, std::move (desc)});
0 commit comments