Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ def Aspectext_oneapi_clock_sub_group : Aspect<"ext_oneapi_clock_sub_group">;
def Aspectext_oneapi_clock_work_group : Aspect<"ext_oneapi_clock_work_group">;
def Aspectext_oneapi_clock_device : Aspect<"ext_oneapi_clock_device">;
def Aspectext_oneapi_is_integrated_gpu : Aspect<"ext_oneapi_is_integrated_gpu">;
def Aspectext_oneapi_device_wait : Aspect<"ext_oneapi_device_wait">;

// Deprecated aspects
def AspectInt64_base_atomics : Aspect<"int64_base_atomics">;
Expand Down Expand Up @@ -176,7 +177,8 @@ def : TargetInfo<"__TestAspectList",
Aspectext_oneapi_clock_sub_group,
Aspectext_oneapi_clock_work_group,
Aspectext_oneapi_clock_device,
Aspectext_oneapi_is_integrated_gpu],
Aspectext_oneapi_is_integrated_gpu,
Aspectext_oneapi_device_wait],
[]>;
// This definition serves the only purpose of testing whether the deprecated aspect list defined in here and in SYCL RT
// match.
Expand Down
15 changes: 15 additions & 0 deletions sycl/include/sycl/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,21 @@ class __SYCL_STANDALONE_DEBUG __SYCL_EXPORT device
return profile.c_str();
}

/// Synchronizes with all queues associated with the device.
void ext_oneapi_wait();

/// Dispatches all unconsumed asynchronous exceptions for all queues or
/// contexts associated with the queues.
void ext_oneapi_throw_asynchronous();

/// Synchronizes with all queues associated with the device, then dispatches
/// all unconsumed asynchronous exceptions for all queues or contexts
/// associated with the queues.
void ext_oneapi_wait_and_throw() {
ext_oneapi_wait();
ext_oneapi_throw_asynchronous();
}

// TODO: Remove this diagnostics when __SYCL_WARN_IMAGE_ASPECT is removed.
#if defined(__clang__)
#pragma clang diagnostic pop
Expand Down
1 change: 1 addition & 0 deletions sycl/include/sycl/info/aspects.def
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,4 @@ __SYCL_ASPECT(ext_oneapi_clock_sub_group, 91)
__SYCL_ASPECT(ext_oneapi_clock_work_group, 92)
__SYCL_ASPECT(ext_oneapi_clock_device, 93)
__SYCL_ASPECT(ext_oneapi_is_integrated_gpu, 94)
__SYCL_ASPECT(ext_oneapi_device_wait, 95)
19 changes: 19 additions & 0 deletions sycl/source/detail/device_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <detail/device_impl.hpp>
#include <detail/jit_compiler.hpp>
#include <detail/platform_impl.hpp>
#include <detail/scheduler/scheduler.hpp>
#include <detail/ur_info_code.hpp>
#include <sycl/detail/ur.hpp>
#include <sycl/device.hpp>
Expand Down Expand Up @@ -509,6 +510,24 @@ device_impl::getImmediateProgressGuarantee(
return forward_progress_guarantee::weakly_parallel;
}

void device_impl::wait() const {
// Firstly, all associated queues should be cleaned through of all
// not-yet-enqueued commands and host_task.
for (const std::weak_ptr<queue_impl> &WQueue : MQueues) {
std::shared_ptr<queue_impl> Queue = WQueue.lock();
assert(Queue && "Queue should never be dangling in the list of queues "
"associated with the device!");
Queue->waitForRuntimeLevelCmdsAndClear();
}

// Then we synchronize the entire device.
getAdapter().call<detail::UrApiKind::urDeviceWaitExp>(getHandleRef());
}

void device_impl::throwAsynchronous() {
Scheduler::getInstance().flushAsyncExceptions();
}

#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
#define EXPORT_GET_INFO(PARAM) \
template <> \
Expand Down
27 changes: 27 additions & 0 deletions sycl/source/detail/device_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1597,6 +1597,10 @@ class device_impl : public std::enable_shared_from_this<device_impl> {
get_info_impl_nocheck<UR_DEVICE_INFO_IS_INTEGRATED_GPU>().value_or(
0);
}
CASE(ext_oneapi_device_wait) {
return get_info_impl_nocheck<UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP>()
.value_or(0);
}
else {
return false; // This device aspect has not been implemented yet.
}
Expand Down Expand Up @@ -2267,6 +2271,22 @@ class device_impl : public std::enable_shared_from_this<device_impl> {
return {};
}

/// Synchronizes with all queues on the device.
void wait() const;

// Dispatch all unconsumed asynchronous exception to the appropriate handlers.
void throwAsynchronous();

void registerQueue(const std::weak_ptr<queue_impl> &Q) {
std::lock_guard<std::mutex> Lock(MQueuesMutex);
MQueues.insert(Q);
}

void unregisterQueue(const std::weak_ptr<queue_impl> &Q) {
std::lock_guard<std::mutex> Lock(MQueuesMutex);
MQueues.erase(Q);
}

private:
ur_device_handle_t MDevice = 0;
// This is used for getAdapter so should be above other properties.
Expand All @@ -2277,6 +2297,13 @@ class device_impl : public std::enable_shared_from_this<device_impl> {

const ur_device_handle_t MRootDevice;

// Devices track a list of active queues on it, to allow for synchronization
// with host_task and not-yet-enqueued commands.
std::mutex MQueuesMutex;
std::set<std::weak_ptr<queue_impl>,
std::owner_less<std::weak_ptr<queue_impl>>>
MQueues;

// Order of caches matters! UR must come before SYCL info descriptors (because
// get_info calls get_info_impl but the opposite never happens) and both
// should come before aspects.
Expand Down
3 changes: 2 additions & 1 deletion sycl/source/detail/event_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ class event_impl {
MWorkerQueue = std::move(WorkerQueue);
};

/// Sets original queue used for submission.
/// Sets original queue and device used for submission.
///
/// @return
void setSubmittedQueue(queue_impl *SubmittedQueue);
Expand Down Expand Up @@ -394,6 +394,7 @@ class event_impl {

std::weak_ptr<queue_impl> MWorkerQueue;
std::weak_ptr<queue_impl> MSubmittedQueue;
device_impl *MSubmittedDevice = nullptr;

/// Dependency events prepared for waiting by backend.
std::vector<EventImplPtr> MPreparedDepsEvents;
Expand Down
68 changes: 42 additions & 26 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -897,32 +897,7 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
LastEvent->wait();
}
} else if (!isInOrder()) {
std::vector<std::weak_ptr<event_impl>> WeakEvents;
{
std::lock_guard<std::mutex> Lock(MMutex);
WeakEvents.swap(MEventsWeak);
MMissedCleanupRequests.unset(
[&](MissedCleanupRequestsType &MissedCleanupRequests) {
for (auto &UpdatedGraph : MissedCleanupRequests)
doUnenqueuedCommandCleanup(UpdatedGraph);
MissedCleanupRequests.clear();
});
}

// Wait for unenqueued or host task events, starting
// from the latest submitted task in order to minimize total amount of
// calls, then handle the rest with urQueueFinish.
for (auto EventImplWeakPtrIt = WeakEvents.rbegin();
EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) {
if (std::shared_ptr<event_impl> EventImplSharedPtr =
EventImplWeakPtrIt->lock()) {
// A nullptr UR event indicates that urQueueFinish will not cover it,
// either because it's a host task event or an unenqueued one.
if (nullptr == EventImplSharedPtr->getHandle()) {
EventImplSharedPtr->wait();
}
}
}
waitForRuntimeLevelCmdsAndClear();
}

getAdapter().call<UrApiKind::urQueueFinish>(getHandleRef());
Expand Down Expand Up @@ -1135,6 +1110,47 @@ void queue_impl::verifyProps(const property_list &Props) const {
CheckPropertiesWithData);
}

void queue_impl::waitForRuntimeLevelCmdsAndClear() {
if (isInOrder() && !MNoLastEventMode.load(std::memory_order_relaxed)) {
// if MLastEvent is not null and has no associated handle, we need to wait
// for it. We do not clear it however.
EventImplPtr LastEvent;
{
std::lock_guard<std::mutex> Lock(MMutex);
LastEvent = MDefaultGraphDeps.LastEventPtr;
}
if (LastEvent && nullptr == LastEvent->getHandle())
LastEvent->wait();
} else if (!isInOrder()) {
std::vector<std::weak_ptr<event_impl>> WeakEvents;
{
std::lock_guard<std::mutex> Lock(MMutex);
WeakEvents.swap(MEventsWeak);
MMissedCleanupRequests.unset(
[&](MissedCleanupRequestsType &MissedCleanupRequests) {
for (auto &UpdatedGraph : MissedCleanupRequests)
doUnenqueuedCommandCleanup(UpdatedGraph);
MissedCleanupRequests.clear();
});
}

// Wait for unenqueued or host task events, starting
// from the latest submitted task in order to minimize total amount of
// calls, then handle the rest with urQueueFinish.
for (auto EventImplWeakPtrIt = WeakEvents.rbegin();
EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) {
if (std::shared_ptr<event_impl> EventImplSharedPtr =
EventImplWeakPtrIt->lock()) {
// A nullptr UR event indicates that urQueueFinish will not cover it,
// either because it's a host task event or an unenqueued one.
if (nullptr == EventImplSharedPtr->getHandle()) {
EventImplSharedPtr->wait();
}
}
}
}
}

} // namespace detail
} // namespace _V1
} // namespace sycl
12 changes: 10 additions & 2 deletions sycl/source/detail/queue_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,10 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
// `std::shared_ptr` allocations.
template <typename... Ts>
static std::shared_ptr<queue_impl> create(Ts &&...args) {
return std::make_shared<queue_impl>(std::forward<Ts>(args)...,
private_tag{});
auto ImplPtr =
std::make_shared<queue_impl>(std::forward<Ts>(args)..., private_tag{});
ImplPtr->getDeviceImpl().registerQueue(ImplPtr);
return ImplPtr;
}

~queue_impl() {
Expand Down Expand Up @@ -695,6 +697,12 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
return MAsyncHandler;
}

/// Waits for all not-yet-enqueued and host_task commands in the queue and
/// clears the events associated with the queue (if out-of-order.)
/// Note: This should only be called if the queue is guaranteed to be
/// synchronized by the caller.
void waitForRuntimeLevelCmdsAndClear();

protected:
template <typename HandlerType = handler>
EventImplPtr insertHelperBarrier(const HandlerType &Handler) {
Expand Down
1 change: 1 addition & 0 deletions sycl/source/detail/ur_device_info_ret_types.inc
Original file line number Diff line number Diff line change
Expand Up @@ -197,4 +197,5 @@ MAP(UR_DEVICE_INFO_CLOCK_SUB_GROUP_SUPPORT_EXP, ur_bool_t)
MAP(UR_DEVICE_INFO_CLOCK_WORK_GROUP_SUPPORT_EXP, ur_bool_t)
MAP(UR_DEVICE_INFO_CLOCK_DEVICE_SUPPORT_EXP, ur_bool_t)
MAP(UR_DEVICE_INFO_IS_INTEGRATED_GPU, ur_bool_t)
MAP(UR_DEVICE_INFO_DEVICE_WAIT_SUPPORT_EXP, ur_bool_t)
// clang-format on
10 changes: 10 additions & 0 deletions sycl/source/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -344,5 +344,15 @@ detail::string device::ext_oneapi_cl_profile_impl() const {
return detail::string{profile};
}

void device::ext_oneapi_wait() {
if (!has(aspect::ext_oneapi_device_wait))
throw sycl::exception(
make_error_code(errc::feature_not_supported),
"Device does not support aspect::ext_oneapi_device_wait.");
impl->wait();
}

void device::ext_oneapi_throw_asynchronous() { impl->throwAsynchronous(); }

} // namespace _V1
} // namespace sycl
50 changes: 50 additions & 0 deletions sycl/test-e2e/DeviceWait/basic.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// REQUIRES: aspect-ext_oneapi_device_wait

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include <sycl/detail/core.hpp>
#include <sycl/properties/all_properties.hpp>

#include <array>
#include <vector>

constexpr size_t NContexts = 2;
constexpr size_t NQueues = 6;

int main() {
sycl::device D;
std::array<sycl::context, NContexts> Contexts{sycl::context{D},
sycl::context{D}};
std::array<sycl::queue, NQueues> Queues{
sycl::queue{Contexts[0], D},
sycl::queue{Contexts[0], D, sycl::property::queue::in_order()},
sycl::queue{Contexts[0], D},
sycl::queue{Contexts[1], D, sycl::property::queue::in_order()},
sycl::queue{Contexts[1], D},
sycl::queue{Contexts[1], D, sycl::property::queue::in_order()}};

std::vector<sycl::event> Events;
Events.reserve(NQueues);
for (sycl::queue &Q : Queues) {
sycl::event E = Q.single_task([]() {
volatile int value = 1024 * 1024;
while (--value)
;
});
Events.push_back(std::move(E));
}

D.ext_oneapi_wait();

int Failed = 0;
for (size_t I = 0; I < Events.size(); ++I) {
sycl::info::event_command_status EventStatus =
Events[I].get_info<sycl::info::event::command_execution_status>();
if (EventStatus != sycl::info::event_command_status::complete) {
std::cout << "Unexpected event status for event at " << I << std::endl;
++Failed;
}
}
return Failed;
}
2 changes: 2 additions & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -3390,12 +3390,14 @@ _ZN4sycl3_V16detail9modf_implENS1_9half_impl4halfEPS3_
_ZN4sycl3_V16detail9modf_implEdPd
_ZN4sycl3_V16detail9modf_implEfPf
_ZN4sycl3_V16device11get_devicesENS0_4info11device_typeE
_ZN4sycl3_V16device15ext_oneapi_waitEv
_ZN4sycl3_V16device20ext_oneapi_can_buildENS0_3ext6oneapi12experimental15source_languageE
_ZN4sycl3_V16device22ext_oneapi_can_compileENS0_3ext6oneapi12experimental15source_languageE
_ZN4sycl3_V16device26ext_oneapi_architecture_isENS0_3ext6oneapi12experimental12architectureE
_ZN4sycl3_V16device26ext_oneapi_architecture_isENS0_3ext6oneapi12experimental13arch_categoryE
_ZN4sycl3_V16device26ext_oneapi_can_access_peerERKS1_NS0_3ext6oneapi11peer_accessE
_ZN4sycl3_V16device29ext_oneapi_enable_peer_accessERKS1_
_ZN4sycl3_V16device29ext_oneapi_throw_asynchronousEv
_ZN4sycl3_V16device30ext_oneapi_disable_peer_accessERKS1_
_ZN4sycl3_V16device32ext_oneapi_supports_cl_c_featureENS0_6detail11string_viewE
_ZN4sycl3_V16deviceC1EP13_cl_device_id
Expand Down
3 changes: 3 additions & 0 deletions sycl/test/abi/sycl_symbols_windows.dump
Original file line number Diff line number Diff line change
Expand Up @@ -4023,6 +4023,9 @@
?ext_oneapi_supports_cl_c_version@device@_V1@sycl@@QEBA_NAEBUcl_version@experimental@oneapi@ext@23@@Z
?ext_oneapi_supports_cl_extension@device@_V1@sycl@@AEBA_NVstring_view@detail@23@PEAUcl_version@experimental@oneapi@ext@23@@Z
?ext_oneapi_supports_cl_extension@device@_V1@sycl@@QEBA_NAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAUcl_version@experimental@oneapi@ext@23@@Z
?ext_oneapi_throw_asynchronous@device@_V1@sycl@@QEAAXXZ
?ext_oneapi_wait@device@_V1@sycl@@QEAAXXZ
?ext_oneapi_wait_and_throw@device@_V1@sycl@@QEAAXXZ
?ext_oneapi_wait_external_semaphore@handler@_V1@sycl@@QEAAXUexternal_semaphore@experimental@oneapi@ext@23@@Z
?ext_oneapi_wait_external_semaphore@handler@_V1@sycl@@QEAAXUexternal_semaphore@experimental@oneapi@ext@23@_K@Z
?ext_oneapi_wait_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uexternal_semaphore@experimental@oneapi@ext@23@AEBUcode_location@detail@23@@Z
Expand Down
Loading