Skip to content

Commit a034dff

Browse files
add batch queue workarounds and test skips
1 parent c79c8ea commit a034dff

File tree

10 files changed

+147
-121
lines changed

10 files changed

+147
-121
lines changed

unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -19,31 +19,31 @@
1919

2020
thread_local std::vector<ze_event_handle_t> waitList;
2121

22-
/*
23-
The wait_list_view is a wrapper for eventsWaitLists, which:
24-
- enables passing a ze_event_handle_t buffer created from events as an
25-
argument for the driver API;
26-
- handles enqueueing operations associated with given events if these
27-
operations have not already been set for execution.
28-
29-
Previously, it only stored the waitlist and the corresponding event count in a
30-
single container. Currently, the constructor also ensures that all associated
31-
operations will eventually be executed, which is required for batched queues in
32-
L0v2.
33-
34-
Wait events might have been created in batched queues, which use regular
35-
command lists (batches). Since regular command lists are not executed
36-
immediately, but only after enqueueing on immediate lists, it is necessary to
37-
enqueue the regular command list associated with the given event. Otherwise, the
38-
event would never be signalled. The enqueueing is performed in onWaitListView().
39-
40-
In the case of batched queues, the function onWaitListView() is not called if
41-
the current queue created the given event. The operation associated with the
42-
given wait_list_view is added to the current batch of the queue. The entire
43-
batch is then enqueued for execution, i.e., as part of queueFinish or
44-
queueFlush. For the same queue, events from the given eventsWaitList are
45-
enqueued before the associated operation is executed.
46-
*/
22+
// The wait_list_view is a wrapper for eventsWaitLists, which:
23+
// - enables passing a ze_event_handle_t buffer created from events as an
24+
// argument for the driver API;
25+
// - handles enqueueing operations associated with given events if these
26+
// operations have not already been set for execution.
27+
//
28+
// Previously, it only stored the waitlist and the corresponding event count in
29+
// a single container. Currently, the constructor also ensures that all
30+
// associated operations will eventually be executed, which is required for
31+
// batched queues in L0v2.
32+
//
33+
// Wait events might have been created in batched queues, which use regular
34+
// command lists (batches). Since regular command lists are not executed
35+
// immediately, but only after enqueueing on immediate lists, it is necessary to
36+
// enqueue the regular command list associated with the given event. Otherwise,
37+
// the event would never be signalled. The enqueueing is performed in
38+
// onWaitListView().
39+
//
40+
// In the case of batched queues, the function onWaitListView() is not called if
41+
// the current queue created the given event. The operation associated with the
42+
// given wait_list_view is added to the current batch of the queue. The entire
43+
// batch is then enqueued for execution, i.e., as part of queueFinish or
44+
// queueFlush. For the same queue, events from the given eventsWaitList are
45+
// enqueued before the associated operation is executed.
46+
4747
template <bool HasBatchedQueue>
4848
void getZeHandlesBuffer(const ur_event_handle_t *phWaitEvents,
4949
uint32_t numWaitEvents,

unified-runtime/source/adapters/level_zero/v2/event.hpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,13 @@ struct ur_event_handle_t_ : ur_object {
7070

7171
// Set the queue and command that this event is associated with
7272
void setQueue(ur_queue_t_ *hQueue);
73-
void setBatch(ur_event_generation_t batch_generation);
7473
void setCommandType(ur_command_t commandType);
74+
75+
// For batched queues
76+
// Set the batch that this event is associated with
77+
void setBatch(ur_event_generation_t batch_generation);
78+
// Ensure that the batch associated with this event is submitted for
79+
// execution, otherwise the event will never be signalled
7580
void onWaitListUse();
7681

7782
void reset();

unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp

Lines changed: 24 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -898,44 +898,42 @@ ur_result_t ur_queue_batched_t::bindlessImagesSignalExternalSemaphoreExp(
898898
lockedBatch->getCurrentGeneration()));
899899
}
900900

901-
/*
902-
In case of queues with batched submissions, which use regular command lists
903-
(similarly to command buffers), the start timestamp would be recorded as the
904-
operation is submitted (event.recordStartTimestamp() in
905-
appendTimestampRecordingExp does not use the queue but directly the device), but
906-
the end timestamp would wait for the submission of the given regular command
907-
list. The difference between the start and end timestamps would reflect the
908-
delay in the batch submission, the difference between end timestamps would
909-
reflect the actual time of execution.
910-
911-
TODO
912-
The version of timestampRecording for batched queues should be adjusted in order
913-
to reflect the idea behind the original function
914-
*/
901+
// In case of queues with batched submissions, which use regular command lists
902+
// (similarly to command buffers), the start timestamp would be recorded as the
903+
// operation is submitted (event.recordStartTimestamp() in
904+
// appendTimestampRecordingExp does not use the queue but directly the device),
905+
// but the end timestamp would wait for the submission of the given regular
906+
// command list. The difference between the start and end timestamps would
907+
// reflect the delay in the batch submission, the difference between end
908+
// timestamps would reflect the actual time of execution.
909+
//
910+
// TODO
911+
// The version of timestampRecording for batched queues should be adjusted in
912+
// order to reflect the idea behind the original function
915913

916914
ur_result_t ur_queue_batched_t::enqueueTimestampRecordingExp(
917915
bool /* blocking */, uint32_t /* numEventsInWaitList */,
918916
const ur_event_handle_t * /* phEventWaitList */,
919917
ur_event_handle_t * /* phEvent */) {
920918

921919
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
922-
/* wait_list_view waitListView =
923-
wait_list_view(phEventWaitList, numEventsInWaitList, this);
920+
// wait_list_view waitListView =
921+
// wait_list_view(phEventWaitList, numEventsInWaitList, this);
924922

925-
auto lockedBatch = currentCmdLists.lock();
923+
// auto lockedBatch = currentCmdLists.lock();
926924

927-
lockedBatch->markIssuedCommand();
925+
// lockedBatch->markIssuedCommand();
928926

929-
UR_CALL(lockedBatch->getActiveBatch().appendTimestampRecordingExp(
930-
false, waitListView,
931-
createEventIfRequestedRegular(phEvent,
932-
lockedBatch->getCurrentGeneration())));
927+
// UR_CALL(lockedBatch->getActiveBatch().appendTimestampRecordingExp(
928+
// false, waitListView,
929+
// createEventIfRequestedRegular(phEvent,
930+
// lockedBatch->getCurrentGeneration())));
933931

934-
if (blocking) {
935-
UR_CALL(queueFinishUnlocked(lockedBatch));
936-
}
932+
// if (blocking) {
933+
// UR_CALL(queueFinishUnlocked(lockedBatch));
934+
// }
937935

938-
return UR_RESULT_SUCCESS; */
936+
// return UR_RESULT_SUCCESS;
939937
}
940938

941939
ur_result_t ur_queue_batched_t::enqueueCommandBufferExp(

unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp

Lines changed: 43 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -30,58 +30,53 @@
3030
#include "ur_api.h"
3131
#include "ze_api.h"
3232

33-
/* Batched queues enable submission of operations to the driver in batches,
34-
* therefore reducing the overhead of submitting every single operation
35-
* individually. Similarly to command buffers in L0v2, they use regular command
36-
* lists (later referenced as 'batches'). Operations enqueued on regular command
37-
* lists are not executed immediately, but only after enqueueing the regular
38-
* command list on an immediate command list. However, in contrast to command
39-
* buffers, batched queues also handle submission of batches (regular command
40-
* lists) instead of only collecting enqueued operations, by using an internal
41-
* immediate command list. Command lists are managed by a batch_manager inside a
42-
* batched queue.
43-
*
44-
* Batched queues can be enabled by setting UR_QUEUE_FLAG_SUBMISSION_BATCHED in
45-
* ur_queue_flags_t or globally, through the environment variable
46-
* UR_L0_FORCE_BATCHED=1.
47-
*/
33+
// Batched queues enable submission of operations to the driver in batches,
34+
// therefore reducing the overhead of submitting every single operation
35+
// individually. Similarly to command buffers in L0v2, they use regular command
36+
// lists (later referenced as 'batches'). Operations enqueued on regular command
37+
// lists are not executed immediately, but only after enqueueing the regular
38+
// command list on an immediate command list. However, in contrast to command
39+
// buffers, batched queues also handle submission of batches (regular command
40+
// lists) instead of only collecting enqueued operations, by using an internal
41+
// immediate command list. Command lists are managed by a batch_manager inside a
42+
// batched queue.
43+
//
44+
// Batched queues can be enabled by setting UR_QUEUE_FLAG_SUBMISSION_BATCHED in
45+
// ur_queue_flags_t or globally, through the environment variable
46+
// UR_L0_FORCE_BATCHED=1.
4847

4948
namespace v2 {
5049

5150
struct batch_manager {
5251
private:
53-
/* The currently active regular command list, which may be replaced in the
54-
* command list manager, submitted for execution on the immediate command list
55-
* and stored in the vector of submitted batches while awaiting execution
56-
* completion
57-
*/
52+
// The currently active regular command list, which may be replaced in the
53+
// command list manager, submitted for execution on the immediate command list
54+
// and stored in the vector of submitted batches while awaiting execution
55+
// completion
5856
ur_command_list_manager activeBatch;
5957
// An immediate command list for submission of batches
6058
ur_command_list_manager immediateList;
61-
/* Submitted batches (regular command lists), stored for the completion of
62-
* their execution. After queueFinish(), the vector is cleared - at this
63-
* point, the destructor of command_list_handle adds the given command list to
64-
* the command list cache, to the stack assigned to the description of the
65-
* command list. When a new regular command list is requested after
66-
* queueFinish(), it is popped from the available stack rather than retrieved
67-
* through a driver call, which improves performance.
68-
*/
59+
// Submitted batches (regular command lists), stored for the completion of
60+
// their execution. After queueFinish(), the vector is cleared - at this
61+
// point, the destructor of command_list_handle adds the given command list to
62+
// the command list cache, to the stack assigned to the description of the
63+
// command list. When a new regular command list is requested after
64+
// queueFinish(), it is popped from the available stack rather than retrieved
65+
// through a driver call, which improves performance.
6966
std::vector<v2::raii::command_list_unique_handle> runBatches;
70-
/* The generation number of the current batch, assigned to events associated
71-
* with operations enqueued on the given batch. It is incremented during every
72-
* replacement of the current batch. When an event created by a batched queue
73-
* appears in an eventWaitList, the batch assigned to the given event might
74-
* not have been executed yet and the event might never be signalled.
75-
* Comparing generation numbers enables determining whether the current batch
76-
* should be submitted for execution. If the generation number of the current
77-
* batch is higher than the number assigned to the given event, the batch
78-
* associated with the event has already been submitted for execution and
79-
* additional submission of the current batch is not needed.
80-
*/
67+
// The generation number of the current batch, assigned to events associated
68+
// with operations enqueued on the given batch. It is incremented during every
69+
// replacement of the current batch. When an event created by a batched queue
70+
// appears in an eventWaitList, the batch assigned to the given event might
71+
// not have been executed yet and the event might never be signalled.
72+
// Comparing generation numbers enables determining whether the current batch
73+
// should be submitted for execution. If the generation number of the current
74+
// batch is higher than the number assigned to the given event, the batch
75+
// associated with the event has already been submitted for execution and
76+
// additional submission of the current batch is not needed.
8177
ur_event_generation_t regularGenerationNumber;
82-
/* The limit of regular command lists stored for execution; if exceeded, the
83-
* vector is cleared as part of queueFinish and slots are renewed.
84-
*/
78+
// The limit of regular command lists stored for execution; if exceeded, the
79+
// vector is cleared as part of queueFinish and slots are renewed.
8580
static constexpr uint64_t initialSlotsForBatches = 10;
8681
// Whether any operation has been enqueued on the current batch
8782
bool isEmpty = true;
@@ -148,12 +143,12 @@ struct ur_queue_batched_t : ur_object, ur_queue_t_ {
148143

149144
ur_queue_flags_t flags;
150145

151-
/* Regular command lists use the regular pool cache type, whereas immediate
152-
* command lists use the immediate pool cache type. Since user-requested
153-
* operations are enqueued on regular command lists and immediate command
154-
* lists are only used internally by the batched queue implementation, events
155-
* are not created for immediate command lists.
156-
*/
146+
// Regular command lists use the regular pool cache type, whereas immediate
147+
// command lists use the immediate pool cache type. Since user-requested
148+
// operations are enqueued on regular command lists and immediate command
149+
// lists are only used internally by the batched queue implementation, events
150+
// are not created for immediate command lists.
151+
157152
v2::raii::cache_borrowed_event_pool eventPoolRegular;
158153

159154
v2::raii::command_list_unique_handle getNewRegularCmdList() {

unified-runtime/source/adapters/level_zero/v2/queue_extensions.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@
99
#include "ur_api.h"
1010

1111
struct ur_queue_extensions {
12-
/* Non-batched queues don't need to perform any action
13-
14-
This function is intended to be called by the event. If the event has been
15-
created by the given queue and is associated with the current batch, this
16-
batch should be enqueued for execution. Otherwise, the event would never be
17-
signalled */
12+
// Non-batched queues don't need to perform any action
13+
//
14+
// This function is intended to be called by the event. If the event has been
15+
// created by the given queue and is associated with the current batch, this
16+
// batch should be enqueued for execution. Otherwise, the event would never be
17+
// signalled
1818
virtual ur_result_t
1919
onEventWaitListUse([[maybe_unused]] int64_t batch_generation) {
2020
return UR_RESULT_SUCCESS;

unified-runtime/source/adapters/native_cpu/queue.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue,
3333
case UR_QUEUE_INFO_EMPTY:
3434
return ReturnValue(hQueue->isEmpty());
3535
case UR_QUEUE_INFO_FLAGS:
36-
/*
37-
Support for UR_QUEUE_INFO_FLAGS in urQueueGetInfo is required by the
38-
enqueueTimestampRecording tests after introducing batched queues, since
39-
batched queues do not support enqueueTimestampRecording.
40-
*/
36+
// Support for UR_QUEUE_INFO_FLAGS in urQueueGetInfo is required by the
37+
// enqueueTimestampRecording tests after introducing batched queues, since
38+
// batched queues do not support enqueueTimestampRecording.
4139
if (!hQueue->isInOrder()) {
4240
flags |= UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
4341
}

unified-runtime/test/adapters/level_zero/v2/command_list_cache_test.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include "uur/fixtures.h"
1717
#include "uur/raii.h"
18+
#include "uur/utils.h"
1819

1920
#include <gtest/gtest.h>
2021
#include <map>
@@ -186,6 +187,7 @@ TEST_P(CommandListCacheTest, ImmediateCommandListsHaveProperAttributes) {
186187
TEST_P(CommandListCacheTest, CommandListsAreReusedByQueues) {
187188
static constexpr int NumQueuesPerType = 5;
188189
size_t NumUniqueQueueTypes = 0;
190+
bool isBatched = false;
189191

190192
for (int I = 0; I < NumQueuesPerType; I++) {
191193
NumUniqueQueueTypes = 0;
@@ -216,6 +218,8 @@ TEST_P(CommandListCacheTest, CommandListsAreReusedByQueues) {
216218
ASSERT_EQ(urQueueCreate(context, device, &QueueProps, Queue.ptr()),
217219
UR_RESULT_SUCCESS);
218220

221+
ASSERT_NO_FATAL_FAILURE(uur::isQueueBatched(Queue, &isBatched));
222+
219223
Queues.emplace_back(Queue);
220224
}
221225
}
@@ -227,7 +231,13 @@ TEST_P(CommandListCacheTest, CommandListsAreReusedByQueues) {
227231

228232
ASSERT_EQ(context->getCommandListCache().getNumImmediateCommandLists(),
229233
NumUniqueQueueTypes);
230-
ASSERT_EQ(context->getCommandListCache().getNumRegularCommandLists(), 0);
234+
235+
if (isBatched) {
236+
ASSERT_EQ(context->getCommandListCache().getNumRegularCommandLists(),
237+
NumUniqueQueueTypes);
238+
} else {
239+
ASSERT_EQ(context->getCommandListCache().getNumRegularCommandLists(), 0);
240+
}
231241
}
232242
}
233243

unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "event_provider_counter.hpp"
2525
#include "event_provider_normal.hpp"
2626
#include "queue_handle.hpp"
27+
#include "uur/checks.h"
2728
#include "uur/fixtures.h"
2829
#include "ze_api.h"
2930

@@ -277,6 +278,7 @@ TEST_P(EventPoolTestWithQueue, WithTimestamp) {
277278
GTEST_SKIP() << "Profiling needs to be enabled";
278279
}
279280

281+
SKIP_IF_BATCHED_QUEUE(queue);
280282
auto zeEvent = createZeEvent(context, device);
281283

282284
ur_event_handle_t hEvent;

unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,16 @@
44
//
55
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66

7+
#include "uur/checks.h"
8+
#include <gtest/gtest.h>
79
#include <uur/fixtures.h>
810
#include <uur/known_failure.h>
911

1012
struct urEnqueueTimestampRecordingExpTest : uur::urQueueTest {
1113
void SetUp() override {
1214
UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp());
1315

14-
ur_queue_flags_t queueFlags{};
15-
ASSERT_SUCCESS(urQueueGetInfo(queue, UR_QUEUE_INFO_FLAGS,
16-
sizeof(ur_queue_flags_t), &queueFlags,
17-
nullptr));
18-
19-
if (queueFlags & UR_QUEUE_FLAG_SUBMISSION_BATCHED) {
20-
UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{});
21-
}
22-
16+
SKIP_IF_BATCHED_QUEUE(queue);
2317
bool timestamp_recording_support = false;
2418
ASSERT_SUCCESS(
2519
uur::GetTimestampRecordingSupport(device, timestamp_recording_support));

0 commit comments

Comments (0)