Skip to content

Commit a034dff

Browse files
add batch queue workarounds and test skips
1 parent c79c8ea commit a034dff

File tree

10 files changed

+147
-121
lines changed

10 files changed

+147
-121
lines changed

unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,31 +19,31 @@
1919

2020
thread_local std::vector<ze_event_handle_t> waitList;
2121

22-
/*
23-
The wait_list_view is a wrapper for eventsWaitLists, which:
24-
- enables passing a ze_event_handle_t buffer created from events as an
25-
argument for the driver API;
26-
- handles enqueueing operations associated with given events if these
27-
operations have not already been set for execution.
28-
29-
Previously, it only stored the waitlist and the corresponding event count in a
30-
single container. Currently, the constructor also ensures that all associated
31-
operations will eventually be executed, which is required for batched queues in
32-
L0v2.
33-
34-
Wait events might have been created in batched queues, which use regular
35-
command lists (batches). Since regular command lists are not executed
36-
immediately, but only after enqueueing on immediate lists, it is necessary to
37-
enqueue the regular command list associated with the given event. Otherwise, the
38-
event would never be signalled. The enqueueing is performed in onWaitListView().
39-
40-
In the case of batched queues, the function onWaitListView() is not called if
41-
the current queue created the given event. The operation associated with the
42-
given wait_list_view is added to the current batch of the queue. The entire
43-
batch is then enqueued for execution, i.e., as part of queueFinish or
44-
queueFlush. For the same queue, events from the given eventsWaitList are
45-
enqueued before the associated operation is executed.
46-
*/
22+
// The wait_list_view is a wrapper for eventsWaitLists, which:
23+
// - enables passing a ze_event_handle_t buffer created from events as an
24+
// argument for the driver API;
25+
// - handles enqueueing operations associated with given events if these
26+
// operations have not already been set for execution.
27+
//
28+
// Previously, it only stored the waitlist and the corresponding event count in
29+
// a single container. Currently, the constructor also ensures that all
30+
// associated operations will eventually be executed, which is required for
31+
// batched queues in L0v2.
32+
//
33+
// Wait events might have been created in batched queues, which use regular
34+
// command lists (batches). Since regular command lists are not executed
35+
// immediately, but only after enqueueing on immediate lists, it is necessary to
36+
// enqueue the regular command list associated with the given event. Otherwise,
37+
// the event would never be signalled. The enqueueing is performed in
38+
// onWaitListView().
39+
//
40+
// In the case of batched queues, the function onWaitListView() is not called if
41+
// the current queue created the given event. The operation associated with the
42+
// given wait_list_view is added to the current batch of the queue. The entire
43+
// batch is then enqueued for execution, i.e., as part of queueFinish or
44+
// queueFlush. For the same queue, events from the given eventsWaitList are
45+
// enqueued before the associated operation is executed.
46+
4747
template <bool HasBatchedQueue>
4848
void getZeHandlesBuffer(const ur_event_handle_t *phWaitEvents,
4949
uint32_t numWaitEvents,

unified-runtime/source/adapters/level_zero/v2/event.hpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,13 @@ struct ur_event_handle_t_ : ur_object {
7070

7171
// Set the queue and command that this event is associated with
7272
void setQueue(ur_queue_t_ *hQueue);
73-
void setBatch(ur_event_generation_t batch_generation);
7473
void setCommandType(ur_command_t commandType);
74+
75+
// For batched queues
76+
// Set the batch that this event is associated with
77+
void setBatch(ur_event_generation_t batch_generation);
78+
// Ensure that the batch associated with this event is submitted for
79+
// execution, otherwise the event will never be signalled
7580
void onWaitListUse();
7681

7782
void reset();

unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -898,44 +898,42 @@ ur_result_t ur_queue_batched_t::bindlessImagesSignalExternalSemaphoreExp(
898898
lockedBatch->getCurrentGeneration()));
899899
}
900900

901-
/*
902-
In case of queues with batched submissions, which use regular command lists
903-
(similarly to command buffers), the start timestamp would be recorded as the
904-
operation is submitted (event.recordStartTimestamp() in
905-
appendTimestampRecordingExp does not use the queue but directly the device), but
906-
the end timestamp would wait for the submission of the given regular command
907-
list. The difference between the start and end timestamps would reflect the
908-
delay in the batch submission, the difference between end timestamps would
909-
reflect the actual time of execution.
910-
911-
TODO
912-
The version of timestampRecording for batched queues should be adjusted in order
913-
to reflect the idea behind the original function
914-
*/
901+
// In case of queues with batched submissions, which use regular command lists
902+
// (similarly to command buffers), the start timestamp would be recorded as the
903+
// operation is submitted (event.recordStartTimestamp() in
904+
// appendTimestampRecordingExp does not use the queue but directly the device),
905+
// but the end timestamp would wait for the submission of the given regular
906+
// command list. The difference between the start and end timestamps would
907+
// reflect the delay in the batch submission, the difference between end
908+
// timestamps would reflect the actual time of execution.
909+
//
910+
// TODO
911+
// The version of timestampRecording for batched queues should be adjusted in
912+
// order to reflect the idea behind the original function
915913

916914
ur_result_t ur_queue_batched_t::enqueueTimestampRecordingExp(
917915
bool /* blocking */, uint32_t /* numEventsInWaitList */,
918916
const ur_event_handle_t * /* phEventWaitList */,
919917
ur_event_handle_t * /* phEvent */) {
920918

921919
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
922-
/* wait_list_view waitListView =
923-
wait_list_view(phEventWaitList, numEventsInWaitList, this);
920+
// wait_list_view waitListView =
921+
// wait_list_view(phEventWaitList, numEventsInWaitList, this);
924922

925-
auto lockedBatch = currentCmdLists.lock();
923+
// auto lockedBatch = currentCmdLists.lock();
926924

927-
lockedBatch->markIssuedCommand();
925+
// lockedBatch->markIssuedCommand();
928926

929-
UR_CALL(lockedBatch->getActiveBatch().appendTimestampRecordingExp(
930-
false, waitListView,
931-
createEventIfRequestedRegular(phEvent,
932-
lockedBatch->getCurrentGeneration())));
927+
// UR_CALL(lockedBatch->getActiveBatch().appendTimestampRecordingExp(
928+
// false, waitListView,
929+
// createEventIfRequestedRegular(phEvent,
930+
// lockedBatch->getCurrentGeneration())));
933931

934-
if (blocking) {
935-
UR_CALL(queueFinishUnlocked(lockedBatch));
936-
}
932+
// if (blocking) {
933+
// UR_CALL(queueFinishUnlocked(lockedBatch));
934+
// }
937935

938-
return UR_RESULT_SUCCESS; */
936+
// return UR_RESULT_SUCCESS;
939937
}
940938

941939
ur_result_t ur_queue_batched_t::enqueueCommandBufferExp(

unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp

Lines changed: 43 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -30,58 +30,53 @@
3030
#include "ur_api.h"
3131
#include "ze_api.h"
3232

33-
/* Batched queues enable submission of operations to the driver in batches,
34-
* therefore reducing the overhead of submitting every single operation
35-
* individually. Similarly to command buffers in L0v2, they use regular command
36-
* lists (later referenced as 'batches'). Operations enqueued on regular command
37-
* lists are not executed immediately, but only after enqueueing the regular
38-
* command list on an immediate command list. However, in contrast to command
39-
* buffers, batched queues also handle submission of batches (regular command
40-
* lists) instead of only collecting enqueued operations, by using an internal
41-
* immediate command list. Command lists are managed by a batch_manager inside a
42-
* batched queue.
43-
*
44-
* Batched queues can be enabled by setting UR_QUEUE_FLAG_SUBMISSION_BATCHED in
45-
* ur_queue_flags_t or globally, through the environment variable
46-
* UR_L0_FORCE_BATCHED=1.
47-
*/
33+
// Batched queues enable submission of operations to the driver in batches,
34+
// therefore reducing the overhead of submitting every single operation
35+
// individually. Similarly to command buffers in L0v2, they use regular command
36+
// lists (later referenced as 'batches'). Operations enqueued on regular command
37+
// lists are not executed immediately, but only after enqueueing the regular
38+
// command list on an immediate command list. However, in contrast to command
39+
// buffers, batched queues also handle submission of batches (regular command
40+
// lists) instead of only collecting enqueued operations, by using an internal
41+
// immediate command list. Command lists are managed by a batch_manager inside a
42+
// batched queue.
43+
//
44+
// Batched queues can be enabled by setting UR_QUEUE_FLAG_SUBMISSION_BATCHED in
45+
// ur_queue_flags_t or globally, through the environment variable
46+
// UR_L0_FORCE_BATCHED=1.
4847

4948
namespace v2 {
5049

5150
struct batch_manager {
5251
private:
53-
/* The currently active regular command list, which may be replaced in the
54-
* command list manager, submitted for execution on the immediate command list
55-
* and stored in the vector of submitted batches while awaiting execution
56-
* completion
57-
*/
52+
// The currently active regular command list, which may be replaced in the
53+
// command list manager, submitted for execution on the immediate command list
54+
// and stored in the vector of submitted batches while awaiting execution
55+
// completion
5856
ur_command_list_manager activeBatch;
5957
// An immediate command list for submission of batches
6058
ur_command_list_manager immediateList;
61-
/* Submitted batches (regular command lists), stored for the completion of
62-
* their execution. After queueFinish(), the vector is cleared - at this
63-
* point, the destructor of command_list_handle adds the given command list to
64-
* the command list cache, to the stack assigned to the description of the
65-
* command list. When a new regular command list is requested after
66-
* queueFinish(), it is popped from the available stack rather than retrieved
67-
* through a driver call, which improves performance.
68-
*/
59+
// Submitted batches (regular command lists), stored for the completion of
60+
// their execution. After queueFinish(), the vector is cleared - at this
61+
// point, the destructor of command_list_handle adds the given command list to
62+
// the command list cache, to the stack assigned to the description of the
63+
// command list. When a new regular command list is requested after
64+
// queueFinish(), it is popped from the available stack rather than retrieved
65+
// through a driver call, which improves performance.
6966
std::vector<v2::raii::command_list_unique_handle> runBatches;
70-
/* The generation number of the current batch, assigned to events associated
71-
* with operations enqueued on the given batch. It is incremented during every
72-
* replacement of the current batch. When an event created by a batched queue
73-
* appears in an eventWaitList, the batch assigned to the given event might
74-
* not have been executed yet and the event might never be signalled.
75-
* Comparing generation numbers enables determining whether the current batch
76-
* should be submitted for execution. If the generation number of the current
77-
* batch is higher than the number assigned to the given event, the batch
78-
* associated with the event has already been submitted for execution and
79-
* additional submission of the current batch is not needed.
80-
*/
67+
// The generation number of the current batch, assigned to events associated
68+
// with operations enqueued on the given batch. It is incremented during every
69+
// replacement of the current batch. When an event created by a batched queue
70+
// appears in an eventWaitList, the batch assigned to the given event might
71+
// not have been executed yet and the event might never be signalled.
72+
// Comparing generation numbers enables determining whether the current batch
73+
// should be submitted for execution. If the generation number of the current
74+
// batch is higher than the number assigned to the given event, the batch
75+
// associated with the event has already been submitted for execution and
76+
// additional submission of the current batch is not needed.
8177
ur_event_generation_t regularGenerationNumber;
82-
/* The limit of regular command lists stored for execution; if exceeded, the
83-
* vector is cleared as part of queueFinish and slots are renewed.
84-
*/
78+
// The limit of regular command lists stored for execution; if exceeded, the
79+
// vector is cleared as part of queueFinish and slots are renewed.
8580
static constexpr uint64_t initialSlotsForBatches = 10;
8681
// Whether any operation has been enqueued on the current batch
8782
bool isEmpty = true;
@@ -148,12 +143,12 @@ struct ur_queue_batched_t : ur_object, ur_queue_t_ {
148143

149144
ur_queue_flags_t flags;
150145

151-
/* Regular command lists use the regular pool cache type, whereas immediate
152-
* command lists use the immediate pool cache type. Since user-requested
153-
* operations are enqueued on regular command lists and immediate command
154-
* lists are only used internally by the batched queue implementation, events
155-
* are not created for immediate command lists.
156-
*/
146+
// Regular command lists use the regular pool cache type, whereas immediate
147+
// command lists use the immediate pool cache type. Since user-requested
148+
// operations are enqueued on regular command lists and immediate command
149+
// lists are only used internally by the batched queue implementation, events
150+
// are not created for immediate command lists.
151+
157152
v2::raii::cache_borrowed_event_pool eventPoolRegular;
158153

159154
v2::raii::command_list_unique_handle getNewRegularCmdList() {

unified-runtime/source/adapters/level_zero/v2/queue_extensions.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@
99
#include "ur_api.h"
1010

1111
struct ur_queue_extensions {
12-
/* Non-batched queues don't need to perform any action
13-
14-
This function is intended to be called by the event. If the event has been
15-
created by the given queue and is associated with the current batch, this
16-
batch should be enqueued for execution. Otherwise, the event would never be
17-
signalled */
12+
// Non-batched queues don't need to perform any action
13+
//
14+
// This function is intended to be called by the event. If the event has been
15+
// created by the given queue and is associated with the current batch, this
16+
// batch should be enqueued for execution. Otherwise, the event would never be
17+
// signalled
1818
virtual ur_result_t
1919
onEventWaitListUse([[maybe_unused]] int64_t batch_generation) {
2020
return UR_RESULT_SUCCESS;

unified-runtime/source/adapters/native_cpu/queue.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue,
3333
case UR_QUEUE_INFO_EMPTY:
3434
return ReturnValue(hQueue->isEmpty());
3535
case UR_QUEUE_INFO_FLAGS:
36-
/*
37-
Support for UR_QUEUE_INFO_FLAGS in urQueueGetInfo is required by the
38-
enqueueTimestampRecording tests after introducing batched queues, since
39-
batched queues do not support enqueueTimestampRecording.
40-
*/
36+
// Support for UR_QUEUE_INFO_FLAGS in urQueueGetInfo is required by the
37+
// enqueueTimestampRecording tests after introducing batched queues, since
38+
// batched queues do not support enqueueTimestampRecording.
4139
if (!hQueue->isInOrder()) {
4240
flags |= UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
4341
}

unified-runtime/test/adapters/level_zero/v2/command_list_cache_test.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include "uur/fixtures.h"
1717
#include "uur/raii.h"
18+
#include "uur/utils.h"
1819

1920
#include <gtest/gtest.h>
2021
#include <map>
@@ -186,6 +187,7 @@ TEST_P(CommandListCacheTest, ImmediateCommandListsHaveProperAttributes) {
186187
TEST_P(CommandListCacheTest, CommandListsAreReusedByQueues) {
187188
static constexpr int NumQueuesPerType = 5;
188189
size_t NumUniqueQueueTypes = 0;
190+
bool isBatched = false;
189191

190192
for (int I = 0; I < NumQueuesPerType; I++) {
191193
NumUniqueQueueTypes = 0;
@@ -216,6 +218,8 @@ TEST_P(CommandListCacheTest, CommandListsAreReusedByQueues) {
216218
ASSERT_EQ(urQueueCreate(context, device, &QueueProps, Queue.ptr()),
217219
UR_RESULT_SUCCESS);
218220

221+
ASSERT_NO_FATAL_FAILURE(uur::isQueueBatched(Queue, &isBatched));
222+
219223
Queues.emplace_back(Queue);
220224
}
221225
}
@@ -227,7 +231,13 @@ TEST_P(CommandListCacheTest, CommandListsAreReusedByQueues) {
227231

228232
ASSERT_EQ(context->getCommandListCache().getNumImmediateCommandLists(),
229233
NumUniqueQueueTypes);
230-
ASSERT_EQ(context->getCommandListCache().getNumRegularCommandLists(), 0);
234+
235+
if (isBatched) {
236+
ASSERT_EQ(context->getCommandListCache().getNumRegularCommandLists(),
237+
NumUniqueQueueTypes);
238+
} else {
239+
ASSERT_EQ(context->getCommandListCache().getNumRegularCommandLists(), 0);
240+
}
231241
}
232242
}
233243

unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "event_provider_counter.hpp"
2525
#include "event_provider_normal.hpp"
2626
#include "queue_handle.hpp"
27+
#include "uur/checks.h"
2728
#include "uur/fixtures.h"
2829
#include "ze_api.h"
2930

@@ -277,6 +278,7 @@ TEST_P(EventPoolTestWithQueue, WithTimestamp) {
277278
GTEST_SKIP() << "Profiling needs to be enabled";
278279
}
279280

281+
SKIP_IF_BATCHED_QUEUE(queue);
280282
auto zeEvent = createZeEvent(context, device);
281283

282284
ur_event_handle_t hEvent;

unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,16 @@
44
//
55
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66

7+
#include "uur/checks.h"
8+
#include <gtest/gtest.h>
79
#include <uur/fixtures.h>
810
#include <uur/known_failure.h>
911

1012
struct urEnqueueTimestampRecordingExpTest : uur::urQueueTest {
1113
void SetUp() override {
1214
UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp());
1315

14-
ur_queue_flags_t queueFlags{};
15-
ASSERT_SUCCESS(urQueueGetInfo(queue, UR_QUEUE_INFO_FLAGS,
16-
sizeof(ur_queue_flags_t), &queueFlags,
17-
nullptr));
18-
19-
if (queueFlags & UR_QUEUE_FLAG_SUBMISSION_BATCHED) {
20-
UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{});
21-
}
22-
16+
SKIP_IF_BATCHED_QUEUE(queue);
2317
bool timestamp_recording_support = false;
2418
ASSERT_SUCCESS(
2519
uur::GetTimestampRecordingSupport(device, timestamp_recording_support));

0 commit comments

Comments (0)